{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 29669, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001685260709831811, "grad_norm": 20.571081161499023, "learning_rate": 1.6852039096730705e-09, "logits/chosen": -0.5615859031677246, "logits/rejected": -0.5738595724105835, "logps/chosen": -1.6699402332305908, "logps/rejected": -1.7010023593902588, "loss": 2.882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.699399948120117, "rewards/margins": 0.3106228709220886, "rewards/rejected": -17.01002311706543, "step": 5 }, { "epoch": 0.0003370521419663622, "grad_norm": 31.532499313354492, "learning_rate": 3.370407819346141e-09, "logits/chosen": -0.5780839323997498, "logits/rejected": -0.44632649421691895, "logps/chosen": -1.7983486652374268, "logps/rejected": -1.7575130462646484, "loss": 4.2936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.98348617553711, "rewards/margins": -0.4083539843559265, "rewards/rejected": -17.575130462646484, "step": 10 }, { "epoch": 0.0005055782129495433, "grad_norm": 30.432205200195312, "learning_rate": 5.055611729019211e-09, "logits/chosen": -0.3094201982021332, "logits/rejected": -0.33659106492996216, "logps/chosen": -1.6785993576049805, "logps/rejected": -1.8086090087890625, "loss": 2.1801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.785995483398438, "rewards/margins": 1.3000948429107666, "rewards/rejected": -18.086088180541992, "step": 15 }, { "epoch": 0.0006741042839327244, "grad_norm": 27.722911834716797, "learning_rate": 6.740815638692282e-09, "logits/chosen": -0.7963976860046387, "logits/rejected": -0.6953271627426147, "logps/chosen": -1.6158479452133179, "logps/rejected": -1.637351632118225, "loss": 2.9568, "rewards/accuracies": 0.5, "rewards/chosen": -16.15848159790039, "rewards/margins": 0.21503643691539764, "rewards/rejected": -16.373516082763672, "step": 20 }, { "epoch": 0.0008426303549159055, "grad_norm": 27.396446228027344, "learning_rate": 8.426019548365353e-09, "logits/chosen": -0.4122963547706604, "logits/rejected": -0.23325464129447937, "logps/chosen": -1.8151277303695679, "logps/rejected": -1.9228709936141968, "loss": 2.3935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.151275634765625, "rewards/margins": 1.077433466911316, "rewards/rejected": -19.228710174560547, "step": 25 }, { "epoch": 0.0010111564258990866, "grad_norm": 20.264379501342773, "learning_rate": 1.0111223458038422e-08, "logits/chosen": -0.4613228738307953, "logits/rejected": -0.47090595960617065, "logps/chosen": -1.5245769023895264, "logps/rejected": -1.5697605609893799, "loss": 2.7916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.245768547058105, "rewards/margins": 0.45183706283569336, "rewards/rejected": -15.697604179382324, "step": 30 }, { "epoch": 0.0011796824968822678, "grad_norm": 23.512556076049805, "learning_rate": 1.1796427367711492e-08, "logits/chosen": -0.30312925577163696, "logits/rejected": -0.5215615630149841, "logps/chosen": -2.0133779048919678, "logps/rejected": -2.0369296073913574, "loss": 3.9116, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.133779525756836, "rewards/margins": 0.23551683127880096, "rewards/rejected": -20.36929702758789, "step": 35 }, { "epoch": 0.0013482085678654487, "grad_norm": 17.596803665161133, "learning_rate": 1.3481631277384564e-08, "logits/chosen": -0.5437296628952026, "logits/rejected": -0.5092577934265137, "logps/chosen": -1.5528526306152344, "logps/rejected": -1.5792545080184937, "loss": 2.8937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.528528213500977, "rewards/margins": 0.264018714427948, "rewards/rejected": -15.7925443649292, "step": 40 }, { "epoch": 0.0015167346388486299, "grad_norm": 18.151222229003906, "learning_rate": 1.5166835187057634e-08, "logits/chosen": -0.4424339234828949, "logits/rejected": -0.5105506181716919, "logps/chosen": -1.7674528360366821, "logps/rejected": -1.7977365255355835, "loss": 2.8757, "rewards/accuracies": 0.5, "rewards/chosen": -17.674528121948242, "rewards/margins": 0.30283719301223755, "rewards/rejected": -17.977365493774414, "step": 45 }, { "epoch": 0.001685260709831811, "grad_norm": 17.36290168762207, "learning_rate": 1.6852039096730706e-08, "logits/chosen": -0.6893698573112488, "logits/rejected": -0.600393533706665, "logps/chosen": -1.8375508785247803, "logps/rejected": -1.839758276939392, "loss": 3.4694, "rewards/accuracies": 0.5, "rewards/chosen": -18.37550926208496, "rewards/margins": 0.02207345888018608, "rewards/rejected": -18.3975830078125, "step": 50 }, { "epoch": 0.0018537867808149922, "grad_norm": 35.55210876464844, "learning_rate": 1.8537243006403775e-08, "logits/chosen": -0.6072413921356201, "logits/rejected": -0.5612670183181763, "logps/chosen": -1.688486099243164, "logps/rejected": -1.655321717262268, "loss": 3.4563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.88486099243164, "rewards/margins": -0.33164510130882263, "rewards/rejected": -16.5532169342041, "step": 55 }, { "epoch": 0.0020223128517981733, "grad_norm": 9.634931564331055, "learning_rate": 2.0222446916076843e-08, "logits/chosen": 0.009985041804611683, "logits/rejected": 0.04499584436416626, "logps/chosen": -1.4743859767913818, "logps/rejected": -1.5558125972747803, "loss": 2.491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.743858337402344, "rewards/margins": 0.8142659068107605, "rewards/rejected": -15.558123588562012, "step": 60 }, { "epoch": 0.0021908389227813544, "grad_norm": 19.739643096923828, "learning_rate": 2.1907650825749915e-08, "logits/chosen": -0.5582388639450073, "logits/rejected": -0.7695599794387817, "logps/chosen": -1.4039520025253296, "logps/rejected": -1.3351157903671265, "loss": 3.7285, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.039520263671875, "rewards/margins": -0.688363254070282, "rewards/rejected": -13.351158142089844, "step": 65 }, { "epoch": 0.0023593649937645356, "grad_norm": 16.17816734313965, "learning_rate": 2.3592854735422984e-08, "logits/chosen": -0.7809473872184753, "logits/rejected": -0.7580649852752686, "logps/chosen": -1.7219566106796265, "logps/rejected": -1.7046865224838257, "loss": 3.4937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.21956443786621, "rewards/margins": -0.17269916832447052, "rewards/rejected": -17.046865463256836, "step": 70 }, { "epoch": 0.0025278910647477163, "grad_norm": 22.2804012298584, "learning_rate": 2.5278058645096056e-08, "logits/chosen": -0.7723641395568848, "logits/rejected": -0.8045178651809692, "logps/chosen": -1.4883219003677368, "logps/rejected": -1.4072678089141846, "loss": 3.8948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.883219718933105, "rewards/margins": -0.8105422854423523, "rewards/rejected": -14.072675704956055, "step": 75 }, { "epoch": 0.0026964171357308974, "grad_norm": 25.86888313293457, "learning_rate": 2.6963262554769128e-08, "logits/chosen": -0.08162397891283035, "logits/rejected": -0.007745756767690182, "logps/chosen": -1.797745943069458, "logps/rejected": -1.9569447040557861, "loss": 3.0899, "rewards/accuracies": 0.5, "rewards/chosen": -17.977460861206055, "rewards/margins": 1.5919866561889648, "rewards/rejected": -19.569446563720703, "step": 80 }, { "epoch": 0.0028649432067140786, "grad_norm": 23.099294662475586, "learning_rate": 2.8648466464442196e-08, "logits/chosen": -0.7632491588592529, "logits/rejected": -0.7715046405792236, "logps/chosen": -1.686475396156311, "logps/rejected": -1.7604153156280518, "loss": 2.6555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.864755630493164, "rewards/margins": 0.7393980026245117, "rewards/rejected": -17.60415267944336, "step": 85 }, { "epoch": 0.0030334692776972597, "grad_norm": 21.547155380249023, "learning_rate": 3.033367037411527e-08, "logits/chosen": -0.5401488542556763, "logits/rejected": -0.46405941247940063, "logps/chosen": -1.8458507061004639, "logps/rejected": -1.7931216955184937, "loss": 3.5862, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.458507537841797, "rewards/margins": -0.5272892117500305, "rewards/rejected": -17.931217193603516, "step": 90 }, { "epoch": 0.003201995348680441, "grad_norm": 30.385618209838867, "learning_rate": 3.2018874283788334e-08, "logits/chosen": -0.24752536416053772, "logits/rejected": -0.5087365508079529, "logps/chosen": -1.8774124383926392, "logps/rejected": -1.7933375835418701, "loss": 3.9459, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.774124145507812, "rewards/margins": -0.8407486081123352, "rewards/rejected": -17.93337631225586, "step": 95 }, { "epoch": 0.003370521419663622, "grad_norm": 34.13222885131836, "learning_rate": 3.370407819346141e-08, "logits/chosen": -0.3253437876701355, "logits/rejected": -0.32326677441596985, "logps/chosen": -1.559247612953186, "logps/rejected": -1.4792327880859375, "loss": 3.951, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.592477798461914, "rewards/margins": -0.8001473546028137, "rewards/rejected": -14.792327880859375, "step": 100 }, { "epoch": 0.003539047490646803, "grad_norm": 28.58753776550293, "learning_rate": 3.538928210313448e-08, "logits/chosen": -0.3705151379108429, "logits/rejected": -0.5448762774467468, "logps/chosen": -1.6743396520614624, "logps/rejected": -1.6100950241088867, "loss": 3.7262, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.743396759033203, "rewards/margins": -0.6424452662467957, "rewards/rejected": -16.100950241088867, "step": 105 }, { "epoch": 0.0037075735616299843, "grad_norm": 30.319379806518555, "learning_rate": 3.707448601280755e-08, "logits/chosen": -0.5734914541244507, "logits/rejected": -0.795892059803009, "logps/chosen": -1.7318475246429443, "logps/rejected": -1.8011115789413452, "loss": 2.4595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.3184757232666, "rewards/margins": 0.692640483379364, "rewards/rejected": -18.0111141204834, "step": 110 }, { "epoch": 0.0038760996326131654, "grad_norm": 24.694068908691406, "learning_rate": 3.8759689922480615e-08, "logits/chosen": -0.28614291548728943, "logits/rejected": -0.40578216314315796, "logps/chosen": -1.639915108680725, "logps/rejected": -1.6953051090240479, "loss": 2.6474, "rewards/accuracies": 0.5, "rewards/chosen": -16.399150848388672, "rewards/margins": 0.5539007186889648, "rewards/rejected": -16.953052520751953, "step": 115 }, { "epoch": 0.004044625703596347, "grad_norm": 27.61186981201172, "learning_rate": 4.044489383215369e-08, "logits/chosen": -0.3833938241004944, "logits/rejected": -0.3862255811691284, "logps/chosen": -1.5427688360214233, "logps/rejected": -1.6645368337631226, "loss": 2.6032, "rewards/accuracies": 0.5, "rewards/chosen": -15.427688598632812, "rewards/margins": 1.2176802158355713, "rewards/rejected": -16.645368576049805, "step": 120 }, { "epoch": 0.004213151774579527, "grad_norm": 21.950868606567383, "learning_rate": 4.213009774182676e-08, "logits/chosen": -0.3655502200126648, "logits/rejected": -0.46016925573349, "logps/chosen": -1.8143894672393799, "logps/rejected": -1.824389100074768, "loss": 2.9648, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.143896102905273, "rewards/margins": 0.09999465942382812, "rewards/rejected": -18.2438907623291, "step": 125 }, { "epoch": 0.004381677845562709, "grad_norm": 28.688859939575195, "learning_rate": 4.381530165149983e-08, "logits/chosen": -0.5313527584075928, "logits/rejected": -0.577104926109314, "logps/chosen": -1.7656917572021484, "logps/rejected": -1.750722885131836, "loss": 3.4855, "rewards/accuracies": 0.5, "rewards/chosen": -17.656917572021484, "rewards/margins": -0.1496877670288086, "rewards/rejected": -17.50722885131836, "step": 130 }, { "epoch": 0.00455020391654589, "grad_norm": 16.734312057495117, "learning_rate": 4.55005055611729e-08, "logits/chosen": -0.2671900689601898, "logits/rejected": -0.3611742854118347, "logps/chosen": -1.9485645294189453, "logps/rejected": -1.9769279956817627, "loss": 3.0009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.485645294189453, "rewards/margins": 0.2836341857910156, "rewards/rejected": -19.76927947998047, "step": 135 }, { "epoch": 0.004718729987529071, "grad_norm": 27.427196502685547, "learning_rate": 4.718570947084597e-08, "logits/chosen": -0.7098779082298279, "logits/rejected": -0.48461517691612244, "logps/chosen": -1.6501989364624023, "logps/rejected": -1.714129090309143, "loss": 2.602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.501989364624023, "rewards/margins": 0.6392990946769714, "rewards/rejected": -17.14128875732422, "step": 140 }, { "epoch": 0.004887256058512252, "grad_norm": 18.88107681274414, "learning_rate": 4.887091338051904e-08, "logits/chosen": -0.4832366406917572, "logits/rejected": -0.47723278403282166, "logps/chosen": -1.741558313369751, "logps/rejected": -1.8169167041778564, "loss": 3.0037, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.41558265686035, "rewards/margins": 0.753582775592804, "rewards/rejected": -18.169164657592773, "step": 145 }, { "epoch": 0.005055782129495433, "grad_norm": 29.208202362060547, "learning_rate": 5.055611729019211e-08, "logits/chosen": -0.66395503282547, "logits/rejected": -0.6029442548751831, "logps/chosen": -1.5494129657745361, "logps/rejected": -1.5750149488449097, "loss": 2.9043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.494128227233887, "rewards/margins": 0.25602206587791443, "rewards/rejected": -15.750149726867676, "step": 150 }, { "epoch": 0.005224308200478614, "grad_norm": 20.492918014526367, "learning_rate": 5.224132119986518e-08, "logits/chosen": -0.8075466156005859, "logits/rejected": -0.8982459306716919, "logps/chosen": -1.5187879800796509, "logps/rejected": -1.539340853691101, "loss": 2.9627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.18787956237793, "rewards/margins": 0.20552997291088104, "rewards/rejected": -15.393407821655273, "step": 155 }, { "epoch": 0.005392834271461795, "grad_norm": 14.183518409729004, "learning_rate": 5.3926525109538256e-08, "logits/chosen": -0.32353320717811584, "logits/rejected": -0.3807728588581085, "logps/chosen": -1.760858178138733, "logps/rejected": -1.8059089183807373, "loss": 2.8887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.60858154296875, "rewards/margins": 0.4505080282688141, "rewards/rejected": -18.059091567993164, "step": 160 }, { "epoch": 0.0055613603424449765, "grad_norm": 18.467872619628906, "learning_rate": 5.561172901921132e-08, "logits/chosen": -0.07653169333934784, "logits/rejected": -0.14054766297340393, "logps/chosen": -1.6342408657073975, "logps/rejected": -1.7179641723632812, "loss": 2.5857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.342411041259766, "rewards/margins": 0.8372310400009155, "rewards/rejected": -17.179641723632812, "step": 165 }, { "epoch": 0.005729886413428157, "grad_norm": 23.680160522460938, "learning_rate": 5.729693292888439e-08, "logits/chosen": -0.3570849597454071, "logits/rejected": -0.4266514778137207, "logps/chosen": -1.7811241149902344, "logps/rejected": -1.894479513168335, "loss": 2.3806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.811241149902344, "rewards/margins": 1.1335537433624268, "rewards/rejected": -18.944795608520508, "step": 170 }, { "epoch": 0.005898412484411339, "grad_norm": 14.393828392028809, "learning_rate": 5.898213683855746e-08, "logits/chosen": -0.5269268751144409, "logits/rejected": -0.5787986516952515, "logps/chosen": -1.867837905883789, "logps/rejected": -1.9020452499389648, "loss": 3.0536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.67837905883789, "rewards/margins": 0.34207430481910706, "rewards/rejected": -19.020450592041016, "step": 175 }, { "epoch": 0.0060669385553945195, "grad_norm": 28.581340789794922, "learning_rate": 6.066734074823054e-08, "logits/chosen": -0.4054934084415436, "logits/rejected": -0.3458530902862549, "logps/chosen": -1.9068748950958252, "logps/rejected": -2.2061736583709717, "loss": 2.2303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.06874656677246, "rewards/margins": 2.9929919242858887, "rewards/rejected": -22.06174087524414, "step": 180 }, { "epoch": 0.006235464626377701, "grad_norm": 26.398284912109375, "learning_rate": 6.235254465790361e-08, "logits/chosen": -0.7721191048622131, "logits/rejected": -0.6895912885665894, "logps/chosen": -1.4460922479629517, "logps/rejected": -1.4541490077972412, "loss": 3.0706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.46092414855957, "rewards/margins": 0.08056803047657013, "rewards/rejected": -14.541491508483887, "step": 185 }, { "epoch": 0.006403990697360882, "grad_norm": 14.64199161529541, "learning_rate": 6.403774856757667e-08, "logits/chosen": -0.3238942325115204, "logits/rejected": -0.24976284801959991, "logps/chosen": -1.8821359872817993, "logps/rejected": -2.101795196533203, "loss": 2.8468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.821361541748047, "rewards/margins": 2.1965911388397217, "rewards/rejected": -21.01795196533203, "step": 190 }, { "epoch": 0.0065725167683440625, "grad_norm": 27.205289840698242, "learning_rate": 6.572295247724974e-08, "logits/chosen": -0.41295117139816284, "logits/rejected": -0.22403642535209656, "logps/chosen": -1.789427399635315, "logps/rejected": -1.9406659603118896, "loss": 2.2107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.89427375793457, "rewards/margins": 1.5123885869979858, "rewards/rejected": -19.406661987304688, "step": 195 }, { "epoch": 0.006741042839327244, "grad_norm": 5.611418724060059, "learning_rate": 6.740815638692282e-08, "logits/chosen": -0.16343602538108826, "logits/rejected": -0.2591503858566284, "logps/chosen": -2.1357204914093018, "logps/rejected": -2.256701707839966, "loss": 2.5936, "rewards/accuracies": 0.5, "rewards/chosen": -21.35720443725586, "rewards/margins": 1.2098113298416138, "rewards/rejected": -22.5670166015625, "step": 200 }, { "epoch": 0.006909568910310425, "grad_norm": 19.420745849609375, "learning_rate": 6.90933602965959e-08, "logits/chosen": -0.7015420198440552, "logits/rejected": -0.8463886380195618, "logps/chosen": -1.6153056621551514, "logps/rejected": -1.5792958736419678, "loss": 3.856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.153057098388672, "rewards/margins": -0.360098659992218, "rewards/rejected": -15.792959213256836, "step": 205 }, { "epoch": 0.007078094981293606, "grad_norm": 23.551700592041016, "learning_rate": 7.077856420626896e-08, "logits/chosen": -0.5431650876998901, "logits/rejected": -0.5651072263717651, "logps/chosen": -1.4538341760635376, "logps/rejected": -1.4151197671890259, "loss": 3.446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.538342475891113, "rewards/margins": -0.38714489340782166, "rewards/rejected": -14.15119743347168, "step": 210 }, { "epoch": 0.007246621052276787, "grad_norm": 33.41309356689453, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.85772705078125, "logits/rejected": -0.8544243574142456, "logps/chosen": -1.7897136211395264, "logps/rejected": -1.7153469324111938, "loss": 3.7949, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.897136688232422, "rewards/margins": -0.7436662912368774, "rewards/rejected": -17.15346908569336, "step": 215 }, { "epoch": 0.007415147123259969, "grad_norm": 28.571849822998047, "learning_rate": 7.41489720256151e-08, "logits/chosen": -0.4796590805053711, "logits/rejected": -0.5393766164779663, "logps/chosen": -1.9003206491470337, "logps/rejected": -2.005420446395874, "loss": 2.9494, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.003206253051758, "rewards/margins": 1.0509979724884033, "rewards/rejected": -20.0542049407959, "step": 220 }, { "epoch": 0.007583673194243149, "grad_norm": 20.83780288696289, "learning_rate": 7.583417593528817e-08, "logits/chosen": -0.5321765542030334, "logits/rejected": -0.4122452735900879, "logps/chosen": -1.856885552406311, "logps/rejected": -1.857283592224121, "loss": 3.1423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.56885528564453, "rewards/margins": 0.003980970475822687, "rewards/rejected": -18.57283592224121, "step": 225 }, { "epoch": 0.007752199265226331, "grad_norm": 22.060514450073242, "learning_rate": 7.751937984496123e-08, "logits/chosen": -0.4617394506931305, "logits/rejected": -0.3590688109397888, "logps/chosen": -1.579564094543457, "logps/rejected": -1.6632616519927979, "loss": 3.1801, "rewards/accuracies": 0.5, "rewards/chosen": -15.79564094543457, "rewards/margins": 0.8369754552841187, "rewards/rejected": -16.63261604309082, "step": 230 }, { "epoch": 0.007920725336209512, "grad_norm": 22.34297752380371, "learning_rate": 7.92045837546343e-08, "logits/chosen": -0.5775288343429565, "logits/rejected": -0.5112261176109314, "logps/chosen": -1.5766397714614868, "logps/rejected": -1.7778068780899048, "loss": 1.8744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.766397476196289, "rewards/margins": 2.011671304702759, "rewards/rejected": -17.7780704498291, "step": 235 }, { "epoch": 0.008089251407192693, "grad_norm": 26.297338485717773, "learning_rate": 8.088978766430737e-08, "logits/chosen": -0.6276997923851013, "logits/rejected": -0.5408580899238586, "logps/chosen": -1.744314432144165, "logps/rejected": -1.8765428066253662, "loss": 2.1406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.443145751953125, "rewards/margins": 1.322283148765564, "rewards/rejected": -18.765426635742188, "step": 240 }, { "epoch": 0.008257777478175873, "grad_norm": 44.11431884765625, "learning_rate": 8.257499157398045e-08, "logits/chosen": -0.4730430543422699, "logits/rejected": -0.4535676836967468, "logps/chosen": -1.7764599323272705, "logps/rejected": -1.780321717262268, "loss": 3.0543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.764596939086914, "rewards/margins": 0.038619231432676315, "rewards/rejected": -17.8032169342041, "step": 245 }, { "epoch": 0.008426303549159055, "grad_norm": 22.68023109436035, "learning_rate": 8.426019548365352e-08, "logits/chosen": -0.4147162437438965, "logits/rejected": -0.35299405455589294, "logps/chosen": -1.7950069904327393, "logps/rejected": -1.8354160785675049, "loss": 2.7634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.950069427490234, "rewards/margins": 0.40409010648727417, "rewards/rejected": -18.35416030883789, "step": 250 }, { "epoch": 0.008594829620142236, "grad_norm": 23.5480899810791, "learning_rate": 8.594539939332659e-08, "logits/chosen": -0.6697706580162048, "logits/rejected": -0.6132184863090515, "logps/chosen": -1.8134784698486328, "logps/rejected": -1.8508756160736084, "loss": 2.7429, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.134784698486328, "rewards/margins": 0.373971551656723, "rewards/rejected": -18.50875473022461, "step": 255 }, { "epoch": 0.008763355691125418, "grad_norm": 19.03260040283203, "learning_rate": 8.763060330299966e-08, "logits/chosen": -0.20593388378620148, "logits/rejected": -0.37953242659568787, "logps/chosen": -2.1354403495788574, "logps/rejected": -2.0018649101257324, "loss": 4.4218, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.354402542114258, "rewards/margins": -1.3357551097869873, "rewards/rejected": -20.01865005493164, "step": 260 }, { "epoch": 0.008931881762108598, "grad_norm": 22.810266494750977, "learning_rate": 8.931580721267273e-08, "logits/chosen": -0.5022013783454895, "logits/rejected": -0.5145460367202759, "logps/chosen": -1.636365532875061, "logps/rejected": -1.6282808780670166, "loss": 3.1787, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.36365509033203, "rewards/margins": -0.08084668964147568, "rewards/rejected": -16.282808303833008, "step": 265 }, { "epoch": 0.00910040783309178, "grad_norm": 21.40003776550293, "learning_rate": 9.10010111223458e-08, "logits/chosen": -0.42804187536239624, "logits/rejected": -0.3837874233722687, "logps/chosen": -1.6583467721939087, "logps/rejected": -1.7007923126220703, "loss": 2.7244, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.58346939086914, "rewards/margins": 0.42445507645606995, "rewards/rejected": -17.007923126220703, "step": 270 }, { "epoch": 0.00926893390407496, "grad_norm": 0.5620936155319214, "learning_rate": 9.268621503201888e-08, "logits/chosen": -0.5316357016563416, "logits/rejected": -0.7304517030715942, "logps/chosen": -1.8833847045898438, "logps/rejected": -2.030012607574463, "loss": 3.6374, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.833847045898438, "rewards/margins": 1.4662790298461914, "rewards/rejected": -20.300125122070312, "step": 275 }, { "epoch": 0.009437459975058142, "grad_norm": 20.305036544799805, "learning_rate": 9.437141894169194e-08, "logits/chosen": -0.5274496674537659, "logits/rejected": -0.5468205809593201, "logps/chosen": -1.3872991800308228, "logps/rejected": -1.3830955028533936, "loss": 3.1062, "rewards/accuracies": 0.5, "rewards/chosen": -13.872991561889648, "rewards/margins": -0.042035769671201706, "rewards/rejected": -13.830957412719727, "step": 280 }, { "epoch": 0.009605986046041322, "grad_norm": 34.91652297973633, "learning_rate": 9.605662285136501e-08, "logits/chosen": -0.539037823677063, "logits/rejected": -0.3293539583683014, "logps/chosen": -1.6226667165756226, "logps/rejected": -1.6433026790618896, "loss": 2.9828, "rewards/accuracies": 0.5, "rewards/chosen": -16.226667404174805, "rewards/margins": 0.2063591033220291, "rewards/rejected": -16.433025360107422, "step": 285 }, { "epoch": 0.009774512117024504, "grad_norm": 26.9847469329834, "learning_rate": 9.774182676103808e-08, "logits/chosen": -0.36581525206565857, "logits/rejected": -0.33445021510124207, "logps/chosen": -1.572983980178833, "logps/rejected": -1.717125654220581, "loss": 2.8238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.729840278625488, "rewards/margins": 1.44141685962677, "rewards/rejected": -17.1712589263916, "step": 290 }, { "epoch": 0.009943038188007685, "grad_norm": 18.268667221069336, "learning_rate": 9.942703067071115e-08, "logits/chosen": -0.6619695425033569, "logits/rejected": -0.5615943670272827, "logps/chosen": -1.6950050592422485, "logps/rejected": -1.7060911655426025, "loss": 3.0675, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.950048446655273, "rewards/margins": 0.11086149513721466, "rewards/rejected": -17.060911178588867, "step": 295 }, { "epoch": 0.010111564258990865, "grad_norm": 24.732633590698242, "learning_rate": 1.0111223458038422e-07, "logits/chosen": -0.42776185274124146, "logits/rejected": -0.5683671236038208, "logps/chosen": -1.41172194480896, "logps/rejected": -1.4656214714050293, "loss": 2.6678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.117219924926758, "rewards/margins": 0.538996696472168, "rewards/rejected": -14.656216621398926, "step": 300 }, { "epoch": 0.010280090329974047, "grad_norm": 15.735332489013672, "learning_rate": 1.0279743849005728e-07, "logits/chosen": -1.0114845037460327, "logits/rejected": -1.0291179418563843, "logps/chosen": -1.759996771812439, "logps/rejected": -1.738663673400879, "loss": 3.2891, "rewards/accuracies": 0.5, "rewards/chosen": -17.599966049194336, "rewards/margins": -0.21333065629005432, "rewards/rejected": -17.38663673400879, "step": 305 }, { "epoch": 0.010448616400957228, "grad_norm": 31.485431671142578, "learning_rate": 1.0448264239973035e-07, "logits/chosen": -0.5081278085708618, "logits/rejected": -0.5943376421928406, "logps/chosen": -1.7860934734344482, "logps/rejected": -1.8374830484390259, "loss": 2.7103, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.86093521118164, "rewards/margins": 0.5138964653015137, "rewards/rejected": -18.374832153320312, "step": 310 }, { "epoch": 0.01061714247194041, "grad_norm": 24.96269416809082, "learning_rate": 1.0616784630940344e-07, "logits/chosen": -0.4897727370262146, "logits/rejected": -0.6077844500541687, "logps/chosen": -1.6169993877410889, "logps/rejected": -1.6388232707977295, "loss": 3.4165, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.16999626159668, "rewards/margins": 0.21823683381080627, "rewards/rejected": -16.388233184814453, "step": 315 }, { "epoch": 0.01078566854292359, "grad_norm": 20.947097778320312, "learning_rate": 1.0785305021907651e-07, "logits/chosen": -0.6524089574813843, "logits/rejected": -0.5542722940444946, "logps/chosen": -1.5016238689422607, "logps/rejected": -1.4815490245819092, "loss": 3.2804, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.01623821258545, "rewards/margins": -0.20074859261512756, "rewards/rejected": -14.81549072265625, "step": 320 }, { "epoch": 0.010954194613906771, "grad_norm": 57.642093658447266, "learning_rate": 1.0953825412874958e-07, "logits/chosen": -0.14283767342567444, "logits/rejected": -0.25287362933158875, "logps/chosen": -2.091562509536743, "logps/rejected": -2.088674545288086, "loss": 3.2121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.915624618530273, "rewards/margins": -0.028880024328827858, "rewards/rejected": -20.88674545288086, "step": 325 }, { "epoch": 0.011122720684889953, "grad_norm": 26.221973419189453, "learning_rate": 1.1122345803842264e-07, "logits/chosen": -0.8916441798210144, "logits/rejected": -0.8147870302200317, "logps/chosen": -1.7097644805908203, "logps/rejected": -1.969477891921997, "loss": 2.2724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.097644805908203, "rewards/margins": 2.597132444381714, "rewards/rejected": -19.69477653503418, "step": 330 }, { "epoch": 0.011291246755873133, "grad_norm": 22.461708068847656, "learning_rate": 1.1290866194809571e-07, "logits/chosen": -0.7663329243659973, "logits/rejected": -0.6704310178756714, "logps/chosen": -1.5523570775985718, "logps/rejected": -1.4849021434783936, "loss": 3.7766, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.52357006072998, "rewards/margins": -0.674548327922821, "rewards/rejected": -14.849021911621094, "step": 335 }, { "epoch": 0.011459772826856314, "grad_norm": 32.96992874145508, "learning_rate": 1.1459386585776879e-07, "logits/chosen": -0.3288322389125824, "logits/rejected": -0.2193385809659958, "logps/chosen": -2.2772393226623535, "logps/rejected": -2.1371006965637207, "loss": 4.5364, "rewards/accuracies": 0.5, "rewards/chosen": -22.77239418029785, "rewards/margins": -1.4013869762420654, "rewards/rejected": -21.37100601196289, "step": 340 }, { "epoch": 0.011628298897839496, "grad_norm": 27.60466194152832, "learning_rate": 1.1627906976744186e-07, "logits/chosen": -0.18993383646011353, "logits/rejected": -0.15279248356819153, "logps/chosen": -1.8032690286636353, "logps/rejected": -1.9014475345611572, "loss": 2.575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.032690048217773, "rewards/margins": 0.9817847013473511, "rewards/rejected": -19.014476776123047, "step": 345 }, { "epoch": 0.011796824968822677, "grad_norm": 20.22198486328125, "learning_rate": 1.1796427367711492e-07, "logits/chosen": -0.52468341588974, "logits/rejected": -0.5473569631576538, "logps/chosen": -1.686406135559082, "logps/rejected": -1.6621391773223877, "loss": 3.3808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.86406135559082, "rewards/margins": -0.242669016122818, "rewards/rejected": -16.62139320373535, "step": 350 }, { "epoch": 0.011965351039805857, "grad_norm": 14.638656616210938, "learning_rate": 1.19649477586788e-07, "logits/chosen": -0.061285682022571564, "logits/rejected": -0.03674466535449028, "logps/chosen": -1.6679449081420898, "logps/rejected": -1.850379228591919, "loss": 3.0775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.679447174072266, "rewards/margins": 1.8243420124053955, "rewards/rejected": -18.5037899017334, "step": 355 }, { "epoch": 0.012133877110789039, "grad_norm": 12.102712631225586, "learning_rate": 1.2133468149646107e-07, "logits/chosen": -0.5100473165512085, "logits/rejected": -0.4110667109489441, "logps/chosen": -1.6822515726089478, "logps/rejected": -1.6356151103973389, "loss": 3.7413, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.8225154876709, "rewards/margins": -0.46636518836021423, "rewards/rejected": -16.356151580810547, "step": 360 }, { "epoch": 0.01230240318177222, "grad_norm": 12.664548873901367, "learning_rate": 1.2301988540613412e-07, "logits/chosen": -0.5096290707588196, "logits/rejected": -0.5500877499580383, "logps/chosen": -1.790327787399292, "logps/rejected": -1.7157615423202515, "loss": 4.2013, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.903278350830078, "rewards/margins": -0.7456639409065247, "rewards/rejected": -17.157615661621094, "step": 365 }, { "epoch": 0.012470929252755402, "grad_norm": 24.88652229309082, "learning_rate": 1.2470508931580722e-07, "logits/chosen": -0.47424596548080444, "logits/rejected": -0.36643728613853455, "logps/chosen": -1.7648521661758423, "logps/rejected": -1.726318120956421, "loss": 3.5944, "rewards/accuracies": 0.5, "rewards/chosen": -17.64851951599121, "rewards/margins": -0.38534069061279297, "rewards/rejected": -17.263179779052734, "step": 370 }, { "epoch": 0.012639455323738582, "grad_norm": 14.096327781677246, "learning_rate": 1.263902932254803e-07, "logits/chosen": 0.02537798322737217, "logits/rejected": 0.0857175663113594, "logps/chosen": -2.335552930831909, "logps/rejected": -2.3750240802764893, "loss": 2.8665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.355531692504883, "rewards/margins": 0.3947105407714844, "rewards/rejected": -23.750240325927734, "step": 375 }, { "epoch": 0.012807981394721763, "grad_norm": 17.82650375366211, "learning_rate": 1.2807549713515333e-07, "logits/chosen": -0.6803150177001953, "logits/rejected": -0.4150848388671875, "logps/chosen": -1.7237411737442017, "logps/rejected": -1.6108821630477905, "loss": 4.1818, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.237409591674805, "rewards/margins": -1.1285897493362427, "rewards/rejected": -16.10881996154785, "step": 380 }, { "epoch": 0.012976507465704945, "grad_norm": 24.19916343688965, "learning_rate": 1.2976070104482643e-07, "logits/chosen": -0.6842484474182129, "logits/rejected": -0.41495975852012634, "logps/chosen": -1.7041406631469727, "logps/rejected": -1.7983328104019165, "loss": 2.4166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.041406631469727, "rewards/margins": 0.9419221878051758, "rewards/rejected": -17.983327865600586, "step": 385 }, { "epoch": 0.013145033536688125, "grad_norm": 32.509334564208984, "learning_rate": 1.3144590495449948e-07, "logits/chosen": -0.1651981621980667, "logits/rejected": -0.20989704132080078, "logps/chosen": -1.7264354228973389, "logps/rejected": -1.7077096700668335, "loss": 3.39, "rewards/accuracies": 0.5, "rewards/chosen": -17.264352798461914, "rewards/margins": -0.18725700676441193, "rewards/rejected": -17.077096939086914, "step": 390 }, { "epoch": 0.013313559607671306, "grad_norm": 37.62208938598633, "learning_rate": 1.3313110886417255e-07, "logits/chosen": -0.9938071966171265, "logits/rejected": -0.8165189027786255, "logps/chosen": -1.909253478050232, "logps/rejected": -1.8714126348495483, "loss": 3.4985, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.092533111572266, "rewards/margins": -0.37840786576271057, "rewards/rejected": -18.714126586914062, "step": 395 }, { "epoch": 0.013482085678654488, "grad_norm": 57.049564361572266, "learning_rate": 1.3481631277384565e-07, "logits/chosen": -0.3681066930294037, "logits/rejected": -0.5049809813499451, "logps/chosen": -1.8573124408721924, "logps/rejected": -1.8679901361465454, "loss": 3.2871, "rewards/accuracies": 0.5, "rewards/chosen": -18.573123931884766, "rewards/margins": 0.10677585750818253, "rewards/rejected": -18.679901123046875, "step": 400 }, { "epoch": 0.013482085678654488, "eval_logits/chosen": -0.700717031955719, "eval_logits/rejected": -0.7018821835517883, "eval_logps/chosen": -1.655368685722351, "eval_logps/rejected": -1.65134859085083, "eval_loss": 3.4378702640533447, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.553686141967773, "eval_rewards/margins": -0.04019847884774208, "eval_rewards/rejected": -16.513486862182617, "eval_runtime": 13.2029, "eval_samples_per_second": 7.574, "eval_steps_per_second": 1.894, "step": 400 }, { "epoch": 0.01365061174963767, "grad_norm": 32.858551025390625, "learning_rate": 1.365015166835187e-07, "logits/chosen": -0.4087739586830139, "logits/rejected": -0.32055968046188354, "logps/chosen": -1.6603796482086182, "logps/rejected": -1.7441883087158203, "loss": 2.4208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.603796005249023, "rewards/margins": 0.8380862474441528, "rewards/rejected": -17.441883087158203, "step": 405 }, { "epoch": 0.01381913782062085, "grad_norm": 12.036665916442871, "learning_rate": 1.381867205931918e-07, "logits/chosen": -0.3701552748680115, "logits/rejected": -0.3905239701271057, "logps/chosen": -1.765268325805664, "logps/rejected": -1.789777159690857, "loss": 2.888, "rewards/accuracies": 0.5, "rewards/chosen": -17.65268325805664, "rewards/margins": 0.24508953094482422, "rewards/rejected": -17.89777183532715, "step": 410 }, { "epoch": 0.013987663891604031, "grad_norm": 24.49802589416504, "learning_rate": 1.3987192450286484e-07, "logits/chosen": -0.40653306245803833, "logits/rejected": -0.4059394896030426, "logps/chosen": -1.9510490894317627, "logps/rejected": -2.0723910331726074, "loss": 2.9058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.510488510131836, "rewards/margins": 1.2134212255477905, "rewards/rejected": -20.72391128540039, "step": 415 }, { "epoch": 0.014156189962587213, "grad_norm": 21.609708786010742, "learning_rate": 1.415571284125379e-07, "logits/chosen": 0.07993341982364655, "logits/rejected": 0.046029143035411835, "logps/chosen": -1.7785285711288452, "logps/rejected": -1.9162448644638062, "loss": 3.0046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.78528594970703, "rewards/margins": 1.3771638870239258, "rewards/rejected": -19.16244888305664, "step": 420 }, { "epoch": 0.014324716033570392, "grad_norm": 22.334131240844727, "learning_rate": 1.4324233232221098e-07, "logits/chosen": -0.1741364747285843, "logits/rejected": -0.12129688262939453, "logps/chosen": -1.4081411361694336, "logps/rejected": -1.5430591106414795, "loss": 2.1877, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.08141040802002, "rewards/margins": 1.3491812944412231, "rewards/rejected": -15.430593490600586, "step": 425 }, { "epoch": 0.014493242104553574, "grad_norm": 29.417394638061523, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.5014611482620239, "logits/rejected": -0.4477139413356781, "logps/chosen": -1.5395524501800537, "logps/rejected": -1.6200401782989502, "loss": 2.4931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.395523071289062, "rewards/margins": 0.804876446723938, "rewards/rejected": -16.20039939880371, "step": 430 }, { "epoch": 0.014661768175536756, "grad_norm": 25.562877655029297, "learning_rate": 1.466127401415571e-07, "logits/chosen": -0.48289117217063904, "logits/rejected": -0.34240299463272095, "logps/chosen": -1.839874267578125, "logps/rejected": -1.9911397695541382, "loss": 2.1309, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.398744583129883, "rewards/margins": 1.51265549659729, "rewards/rejected": -19.91139793395996, "step": 435 }, { "epoch": 0.014830294246519937, "grad_norm": 42.604774475097656, "learning_rate": 1.482979440512302e-07, "logits/chosen": -0.23516055941581726, "logits/rejected": -0.2886582016944885, "logps/chosen": -1.9065258502960205, "logps/rejected": -1.8996076583862305, "loss": 3.1934, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.065256118774414, "rewards/margins": -0.0691794902086258, "rewards/rejected": -18.996078491210938, "step": 440 }, { "epoch": 0.014998820317503117, "grad_norm": 32.816471099853516, "learning_rate": 1.4998314796090324e-07, "logits/chosen": -0.46411070227622986, "logits/rejected": -0.30949535965919495, "logps/chosen": -1.802843689918518, "logps/rejected": -1.8105186223983765, "loss": 3.0283, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.0284366607666, "rewards/margins": 0.07674837112426758, "rewards/rejected": -18.10518455505371, "step": 445 }, { "epoch": 0.015167346388486299, "grad_norm": 39.4717903137207, "learning_rate": 1.5166835187057634e-07, "logits/chosen": -0.12686650454998016, "logits/rejected": -0.17431500554084778, "logps/chosen": -2.150648593902588, "logps/rejected": -2.0025336742401123, "loss": 4.5375, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.506488800048828, "rewards/margins": -1.4811471700668335, "rewards/rejected": -20.025339126586914, "step": 450 }, { "epoch": 0.01533587245946948, "grad_norm": 15.029488563537598, "learning_rate": 1.5335355578024941e-07, "logits/chosen": -0.18588756024837494, "logits/rejected": -0.25931158661842346, "logps/chosen": -1.945797324180603, "logps/rejected": -1.991434097290039, "loss": 3.2645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.45797348022461, "rewards/margins": 0.4563663601875305, "rewards/rejected": -19.91434097290039, "step": 455 }, { "epoch": 0.015504398530452662, "grad_norm": 17.134843826293945, "learning_rate": 1.5503875968992246e-07, "logits/chosen": -0.4510701298713684, "logits/rejected": -0.5152049660682678, "logps/chosen": -1.6451425552368164, "logps/rejected": -1.7289478778839111, "loss": 2.7484, "rewards/accuracies": 0.5, "rewards/chosen": -16.451427459716797, "rewards/margins": 0.8380520939826965, "rewards/rejected": -17.289478302001953, "step": 460 }, { "epoch": 0.015672924601435843, "grad_norm": 16.33306121826172, "learning_rate": 1.5672396359959556e-07, "logits/chosen": -0.2507234513759613, "logits/rejected": -0.3093252182006836, "logps/chosen": -1.4972885847091675, "logps/rejected": -1.6501014232635498, "loss": 2.4099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.972885131835938, "rewards/margins": 1.5281298160552979, "rewards/rejected": -16.501014709472656, "step": 465 }, { "epoch": 0.015841450672419023, "grad_norm": 24.52484703063965, "learning_rate": 1.584091675092686e-07, "logits/chosen": -0.47617292404174805, "logits/rejected": -0.5188449621200562, "logps/chosen": -1.6798864603042603, "logps/rejected": -1.6755599975585938, "loss": 3.1259, "rewards/accuracies": 0.5, "rewards/chosen": -16.798864364624023, "rewards/margins": -0.04326476901769638, "rewards/rejected": -16.75560188293457, "step": 470 }, { "epoch": 0.016009976743402203, "grad_norm": 30.124879837036133, "learning_rate": 1.600943714189417e-07, "logits/chosen": -0.6728774905204773, "logits/rejected": -0.6839405298233032, "logps/chosen": -1.6356920003890991, "logps/rejected": -1.6939365863800049, "loss": 2.659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.356922149658203, "rewards/margins": 0.5824446678161621, "rewards/rejected": -16.93936538696289, "step": 475 }, { "epoch": 0.016178502814385386, "grad_norm": 35.19898223876953, "learning_rate": 1.6177957532861475e-07, "logits/chosen": -0.3229294419288635, "logits/rejected": -0.1391395628452301, "logps/chosen": -1.695289969444275, "logps/rejected": -1.714638352394104, "loss": 2.9622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.952899932861328, "rewards/margins": 0.19348230957984924, "rewards/rejected": -17.14638328552246, "step": 480 }, { "epoch": 0.016347028885368566, "grad_norm": 38.96176528930664, "learning_rate": 1.6346477923828782e-07, "logits/chosen": -0.06022145599126816, "logits/rejected": -0.1673354208469391, "logps/chosen": -2.0568957328796387, "logps/rejected": -2.0894064903259277, "loss": 3.747, "rewards/accuracies": 0.5, "rewards/chosen": -20.568958282470703, "rewards/margins": 0.3251078724861145, "rewards/rejected": -20.894065856933594, "step": 485 }, { "epoch": 0.016515554956351746, "grad_norm": 30.77846908569336, "learning_rate": 1.651499831479609e-07, "logits/chosen": -0.6888018250465393, "logits/rejected": -0.6225077509880066, "logps/chosen": -1.6226558685302734, "logps/rejected": -1.5915305614471436, "loss": 3.6297, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.226558685302734, "rewards/margins": -0.31125250458717346, "rewards/rejected": -15.915306091308594, "step": 490 }, { "epoch": 0.01668408102733493, "grad_norm": 41.58315658569336, "learning_rate": 1.6683518705763396e-07, "logits/chosen": -0.5833055377006531, "logits/rejected": -0.5206926465034485, "logps/chosen": -2.156750202178955, "logps/rejected": -2.0980007648468018, "loss": 3.6524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.567501068115234, "rewards/margins": -0.5874932408332825, "rewards/rejected": -20.98000717163086, "step": 495 }, { "epoch": 0.01685260709831811, "grad_norm": 19.05613136291504, "learning_rate": 1.6852039096730703e-07, "logits/chosen": -0.1755112111568451, "logits/rejected": -0.277473121881485, "logps/chosen": -2.1205787658691406, "logps/rejected": -2.038501739501953, "loss": 3.9988, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.205785751342773, "rewards/margins": -0.820770263671875, "rewards/rejected": -20.3850154876709, "step": 500 }, { "epoch": 0.017021133169301293, "grad_norm": 32.78766632080078, "learning_rate": 1.702055948769801e-07, "logits/chosen": -0.697665274143219, "logits/rejected": -0.6786226034164429, "logps/chosen": -1.837233543395996, "logps/rejected": -1.7147718667984009, "loss": 4.4203, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.37233543395996, "rewards/margins": -1.2246164083480835, "rewards/rejected": -17.147716522216797, "step": 505 }, { "epoch": 0.017189659240284472, "grad_norm": 19.369592666625977, "learning_rate": 1.7189079878665318e-07, "logits/chosen": -0.19188065826892853, "logits/rejected": -0.16713164746761322, "logps/chosen": -1.770496129989624, "logps/rejected": -1.862823247909546, "loss": 2.5088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.7049617767334, "rewards/margins": 0.9232719540596008, "rewards/rejected": -18.628232955932617, "step": 510 }, { "epoch": 0.017358185311267652, "grad_norm": 25.27272605895996, "learning_rate": 1.7357600269632625e-07, "logits/chosen": -0.598182201385498, "logits/rejected": -0.4703841209411621, "logps/chosen": -1.418545126914978, "logps/rejected": -1.3824225664138794, "loss": 3.4124, "rewards/accuracies": 0.5, "rewards/chosen": -14.185450553894043, "rewards/margins": -0.36122599244117737, "rewards/rejected": -13.824226379394531, "step": 515 }, { "epoch": 0.017526711382250836, "grad_norm": 20.890714645385742, "learning_rate": 1.7526120660599932e-07, "logits/chosen": -0.2779539227485657, "logits/rejected": -0.33786919713020325, "logps/chosen": -1.8864319324493408, "logps/rejected": -1.8625271320343018, "loss": 4.4328, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.86431884765625, "rewards/margins": -0.23904666304588318, "rewards/rejected": -18.62527084350586, "step": 520 }, { "epoch": 0.017695237453234015, "grad_norm": 20.13123321533203, "learning_rate": 1.769464105156724e-07, "logits/chosen": -0.5535017848014832, "logits/rejected": -0.607734203338623, "logps/chosen": -1.6528222560882568, "logps/rejected": -1.7298600673675537, "loss": 2.7765, "rewards/accuracies": 0.5, "rewards/chosen": -16.528223037719727, "rewards/margins": 0.7703768610954285, "rewards/rejected": -17.298601150512695, "step": 525 }, { "epoch": 0.017863763524217195, "grad_norm": 19.041948318481445, "learning_rate": 1.7863161442534547e-07, "logits/chosen": -0.4681750237941742, "logits/rejected": -0.5170144438743591, "logps/chosen": -1.6290159225463867, "logps/rejected": -1.7050189971923828, "loss": 2.4945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.290157318115234, "rewards/margins": 0.7600309252738953, "rewards/rejected": -17.050189971923828, "step": 530 }, { "epoch": 0.01803228959520038, "grad_norm": 39.596923828125, "learning_rate": 1.803168183350185e-07, "logits/chosen": -0.44919759035110474, "logits/rejected": -0.4530429244041443, "logps/chosen": -2.0134143829345703, "logps/rejected": -2.075237989425659, "loss": 2.7457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.134143829345703, "rewards/margins": 0.6182364225387573, "rewards/rejected": -20.75238037109375, "step": 535 }, { "epoch": 0.01820081566618356, "grad_norm": 19.176101684570312, "learning_rate": 1.820020222446916e-07, "logits/chosen": -0.6088439226150513, "logits/rejected": -0.5793352127075195, "logps/chosen": -1.5337207317352295, "logps/rejected": -1.6069551706314087, "loss": 2.4939, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.33720588684082, "rewards/margins": 0.7323451042175293, "rewards/rejected": -16.06955337524414, "step": 540 }, { "epoch": 0.018369341737166738, "grad_norm": 21.026443481445312, "learning_rate": 1.8368722615436466e-07, "logits/chosen": -0.7524499893188477, "logits/rejected": -0.7342022657394409, "logps/chosen": -1.7819461822509766, "logps/rejected": -1.3424131870269775, "loss": 7.415, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -17.819461822509766, "rewards/margins": -4.395329475402832, "rewards/rejected": -13.42413330078125, "step": 545 }, { "epoch": 0.01853786780814992, "grad_norm": 27.13228988647461, "learning_rate": 1.8537243006403775e-07, "logits/chosen": -0.46805500984191895, "logits/rejected": -0.3271024823188782, "logps/chosen": -1.8898948431015015, "logps/rejected": -1.9199107885360718, "loss": 2.9731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.898948669433594, "rewards/margins": 0.30015745759010315, "rewards/rejected": -19.199106216430664, "step": 550 }, { "epoch": 0.0187063938791331, "grad_norm": 20.596628189086914, "learning_rate": 1.870576339737108e-07, "logits/chosen": -0.4603646397590637, "logits/rejected": -0.4007042944431305, "logps/chosen": -2.0334296226501465, "logps/rejected": -2.0893046855926514, "loss": 2.9646, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.33429527282715, "rewards/margins": 0.5587537884712219, "rewards/rejected": -20.893047332763672, "step": 555 }, { "epoch": 0.018874919950116285, "grad_norm": 29.820533752441406, "learning_rate": 1.8874283788338387e-07, "logits/chosen": -0.051391713321208954, "logits/rejected": -0.12424926459789276, "logps/chosen": -1.8017486333847046, "logps/rejected": -1.9026470184326172, "loss": 2.8019, "rewards/accuracies": 0.5, "rewards/chosen": -18.017486572265625, "rewards/margins": 1.0089836120605469, "rewards/rejected": -19.026470184326172, "step": 560 }, { "epoch": 0.019043446021099465, "grad_norm": 16.212018966674805, "learning_rate": 1.9042804179305697e-07, "logits/chosen": -0.1161131039261818, "logits/rejected": -0.12281863391399384, "logps/chosen": -1.7073466777801514, "logps/rejected": -1.6138560771942139, "loss": 4.0608, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.073467254638672, "rewards/margins": -0.9349047541618347, "rewards/rejected": -16.13856315612793, "step": 565 }, { "epoch": 0.019211972092082644, "grad_norm": 27.620121002197266, "learning_rate": 1.9211324570273002e-07, "logits/chosen": -0.34033095836639404, "logits/rejected": -0.3796425759792328, "logps/chosen": -1.9775043725967407, "logps/rejected": -1.8988311290740967, "loss": 3.9004, "rewards/accuracies": 0.5, "rewards/chosen": -19.775043487548828, "rewards/margins": -0.7867323756217957, "rewards/rejected": -18.98831558227539, "step": 570 }, { "epoch": 0.019380498163065828, "grad_norm": 20.538888931274414, "learning_rate": 1.9379844961240311e-07, "logits/chosen": -0.5360706448554993, "logits/rejected": -0.5650817155838013, "logps/chosen": -1.6927764415740967, "logps/rejected": -1.6992028951644897, "loss": 3.0786, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.927766799926758, "rewards/margins": 0.06426439434289932, "rewards/rejected": -16.99203109741211, "step": 575 }, { "epoch": 0.019549024234049008, "grad_norm": 22.677988052368164, "learning_rate": 1.9548365352207616e-07, "logits/chosen": -0.3704223334789276, "logits/rejected": -0.3610188961029053, "logps/chosen": -1.7271544933319092, "logps/rejected": -1.7471681833267212, "loss": 2.9617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.27154541015625, "rewards/margins": 0.20013637840747833, "rewards/rejected": -17.471683502197266, "step": 580 }, { "epoch": 0.019717550305032187, "grad_norm": 17.373239517211914, "learning_rate": 1.9716885743174923e-07, "logits/chosen": -0.3982377350330353, "logits/rejected": -0.4115076959133148, "logps/chosen": -1.8473840951919556, "logps/rejected": -1.8948602676391602, "loss": 2.8885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.473840713500977, "rewards/margins": 0.47476130723953247, "rewards/rejected": -18.9486026763916, "step": 585 }, { "epoch": 0.01988607637601537, "grad_norm": 10.618928909301758, "learning_rate": 1.988540613414223e-07, "logits/chosen": -0.3850679099559784, "logits/rejected": -0.33462971448898315, "logps/chosen": -1.780971884727478, "logps/rejected": -1.95230233669281, "loss": 1.987, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.80971908569336, "rewards/margins": 1.7133023738861084, "rewards/rejected": -19.523021697998047, "step": 590 }, { "epoch": 0.02005460244699855, "grad_norm": 23.18230628967285, "learning_rate": 2.0053926525109538e-07, "logits/chosen": -0.3498408794403076, "logits/rejected": -0.5002946257591248, "logps/chosen": -1.680807113647461, "logps/rejected": -1.6838855743408203, "loss": 3.3985, "rewards/accuracies": 0.5, "rewards/chosen": -16.80807113647461, "rewards/margins": 0.030785083770751953, "rewards/rejected": -16.838855743408203, "step": 595 }, { "epoch": 0.02022312851798173, "grad_norm": 31.54558563232422, "learning_rate": 2.0222446916076845e-07, "logits/chosen": -0.11058574914932251, "logits/rejected": -0.0773305743932724, "logps/chosen": -1.980094313621521, "logps/rejected": -2.0804390907287598, "loss": 3.2886, "rewards/accuracies": 0.5, "rewards/chosen": -19.800945281982422, "rewards/margins": 1.0034451484680176, "rewards/rejected": -20.80438804626465, "step": 600 }, { "epoch": 0.020391654588964914, "grad_norm": 12.026350975036621, "learning_rate": 2.0390967307044152e-07, "logits/chosen": -0.24122457206249237, "logits/rejected": -0.2786351144313812, "logps/chosen": -1.7854111194610596, "logps/rejected": -1.8579390048980713, "loss": 2.8281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.854110717773438, "rewards/margins": 0.7252796292304993, "rewards/rejected": -18.579389572143555, "step": 605 }, { "epoch": 0.020560180659948094, "grad_norm": 22.978633880615234, "learning_rate": 2.0559487698011456e-07, "logits/chosen": -0.18546470999717712, "logits/rejected": -0.1565871238708496, "logps/chosen": -1.7760133743286133, "logps/rejected": -1.8380794525146484, "loss": 3.3063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.760133743286133, "rewards/margins": 0.6206603050231934, "rewards/rejected": -18.38079261779785, "step": 610 }, { "epoch": 0.020728706730931273, "grad_norm": 38.687129974365234, "learning_rate": 2.0728008088978766e-07, "logits/chosen": -0.3419579863548279, "logits/rejected": -0.43344956636428833, "logps/chosen": -1.891229271888733, "logps/rejected": -1.7788646221160889, "loss": 4.1865, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.91229248046875, "rewards/margins": -1.1236467361450195, "rewards/rejected": -17.788646697998047, "step": 615 }, { "epoch": 0.020897232801914457, "grad_norm": 23.251344680786133, "learning_rate": 2.089652847994607e-07, "logits/chosen": -0.46370410919189453, "logits/rejected": -0.518548846244812, "logps/chosen": -1.6470798254013062, "logps/rejected": -1.640175461769104, "loss": 3.2648, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.47079849243164, "rewards/margins": -0.06904516369104385, "rewards/rejected": -16.401752471923828, "step": 620 }, { "epoch": 0.021065758872897637, "grad_norm": 8.000481605529785, "learning_rate": 2.106504887091338e-07, "logits/chosen": -0.19095095992088318, "logits/rejected": -0.2789108157157898, "logps/chosen": -1.883286476135254, "logps/rejected": -1.9219729900360107, "loss": 3.0147, "rewards/accuracies": 0.5, "rewards/chosen": -18.832866668701172, "rewards/margins": 0.38686609268188477, "rewards/rejected": -19.219730377197266, "step": 625 }, { "epoch": 0.02123428494388082, "grad_norm": 17.710752487182617, "learning_rate": 2.1233569261880688e-07, "logits/chosen": -0.6913865804672241, "logits/rejected": -0.6803420782089233, "logps/chosen": -1.4992603063583374, "logps/rejected": -1.6658436059951782, "loss": 2.1558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.99260425567627, "rewards/margins": 1.6658321619033813, "rewards/rejected": -16.658435821533203, "step": 630 }, { "epoch": 0.021402811014864, "grad_norm": 14.574746131896973, "learning_rate": 2.1402089652847992e-07, "logits/chosen": -0.7683529853820801, "logits/rejected": -0.8330462574958801, "logps/chosen": -1.4146801233291626, "logps/rejected": -1.6268253326416016, "loss": 2.1639, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.146801948547363, "rewards/margins": 2.1214513778686523, "rewards/rejected": -16.268253326416016, "step": 635 }, { "epoch": 0.02157133708584718, "grad_norm": 28.91573143005371, "learning_rate": 2.1570610043815302e-07, "logits/chosen": -0.5444644689559937, "logits/rejected": -0.41493433713912964, "logps/chosen": -1.7037174701690674, "logps/rejected": -1.7247931957244873, "loss": 3.0659, "rewards/accuracies": 0.5, "rewards/chosen": -17.03717613220215, "rewards/margins": 0.2107563018798828, "rewards/rejected": -17.2479305267334, "step": 640 }, { "epoch": 0.021739863156830363, "grad_norm": 24.06241798400879, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.527267575263977, "logits/rejected": -0.42481595277786255, "logps/chosen": -1.6632928848266602, "logps/rejected": -1.5876281261444092, "loss": 3.8596, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.6329288482666, "rewards/margins": -0.7566463351249695, "rewards/rejected": -15.87628173828125, "step": 645 }, { "epoch": 0.021908389227813543, "grad_norm": 33.575355529785156, "learning_rate": 2.1907650825749917e-07, "logits/chosen": -0.5085206627845764, "logits/rejected": -0.6477267146110535, "logps/chosen": -1.398726224899292, "logps/rejected": -1.6070148944854736, "loss": 2.3809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.987261772155762, "rewards/margins": 2.0828864574432373, "rewards/rejected": -16.070148468017578, "step": 650 }, { "epoch": 0.022076915298796723, "grad_norm": 21.62416648864746, "learning_rate": 2.207617121671722e-07, "logits/chosen": -1.0992919206619263, "logits/rejected": -1.0706102848052979, "logps/chosen": -1.7128111124038696, "logps/rejected": -1.6248939037322998, "loss": 3.942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.128108978271484, "rewards/margins": -0.8791699409484863, "rewards/rejected": -16.248939514160156, "step": 655 }, { "epoch": 0.022245441369779906, "grad_norm": 16.11137580871582, "learning_rate": 2.2244691607684528e-07, "logits/chosen": -0.6853328347206116, "logits/rejected": -0.6241937279701233, "logps/chosen": -2.0717978477478027, "logps/rejected": -2.090998888015747, "loss": 2.999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.71797752380371, "rewards/margins": 0.1920129805803299, "rewards/rejected": -20.909992218017578, "step": 660 }, { "epoch": 0.022413967440763086, "grad_norm": 22.234249114990234, "learning_rate": 2.2413211998651836e-07, "logits/chosen": -0.2508560121059418, "logits/rejected": -0.28406912088394165, "logps/chosen": -1.9362938404083252, "logps/rejected": -1.8561948537826538, "loss": 3.9404, "rewards/accuracies": 0.5, "rewards/chosen": -19.362937927246094, "rewards/margins": -0.8009899258613586, "rewards/rejected": -18.561946868896484, "step": 665 }, { "epoch": 0.022582493511746266, "grad_norm": 48.30265426635742, "learning_rate": 2.2581732389619143e-07, "logits/chosen": -0.8998171091079712, "logits/rejected": -0.8238614797592163, "logps/chosen": -1.8364791870117188, "logps/rejected": -1.8647804260253906, "loss": 2.9906, "rewards/accuracies": 0.5, "rewards/chosen": -18.36479377746582, "rewards/margins": 0.28301066160202026, "rewards/rejected": -18.647804260253906, "step": 670 }, { "epoch": 0.02275101958272945, "grad_norm": 24.514257431030273, "learning_rate": 2.2750252780586447e-07, "logits/chosen": -0.317034512758255, "logits/rejected": -0.35245281457901, "logps/chosen": -1.6648200750350952, "logps/rejected": -1.6886028051376343, "loss": 2.9444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.6481990814209, "rewards/margins": 0.23782816529273987, "rewards/rejected": -16.886028289794922, "step": 675 }, { "epoch": 0.02291954565371263, "grad_norm": 39.15069580078125, "learning_rate": 2.2918773171553757e-07, "logits/chosen": -0.4383459985256195, "logits/rejected": -0.3607695400714874, "logps/chosen": -1.6028152704238892, "logps/rejected": -1.6575685739517212, "loss": 2.7703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.028154373168945, "rewards/margins": 0.5475319623947144, "rewards/rejected": -16.575685501098633, "step": 680 }, { "epoch": 0.023088071724695812, "grad_norm": 41.797752380371094, "learning_rate": 2.3087293562521064e-07, "logits/chosen": -0.5070951581001282, "logits/rejected": -0.5197767019271851, "logps/chosen": -1.8675521612167358, "logps/rejected": -1.805352807044983, "loss": 3.7902, "rewards/accuracies": 0.5, "rewards/chosen": -18.675521850585938, "rewards/margins": -0.6219925880432129, "rewards/rejected": -18.053529739379883, "step": 685 }, { "epoch": 0.023256597795678992, "grad_norm": 38.495574951171875, "learning_rate": 2.3255813953488372e-07, "logits/chosen": -0.41856566071510315, "logits/rejected": -0.5050365328788757, "logps/chosen": -1.7272249460220337, "logps/rejected": -1.64120352268219, "loss": 4.1383, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.272247314453125, "rewards/margins": -0.8602131605148315, "rewards/rejected": -16.41203498840332, "step": 690 }, { "epoch": 0.02342512386666217, "grad_norm": 27.242050170898438, "learning_rate": 2.342433434445568e-07, "logits/chosen": -0.49871888756752014, "logits/rejected": -0.436892032623291, "logps/chosen": -1.7822635173797607, "logps/rejected": -1.804446816444397, "loss": 2.9828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.822635650634766, "rewards/margins": 0.22183457016944885, "rewards/rejected": -18.044469833374023, "step": 695 }, { "epoch": 0.023593649937645355, "grad_norm": 24.042818069458008, "learning_rate": 2.3592854735422983e-07, "logits/chosen": -0.28581100702285767, "logits/rejected": -0.38166743516921997, "logps/chosen": -1.5768885612487793, "logps/rejected": -1.607193946838379, "loss": 3.0823, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.768885612487793, "rewards/margins": 0.30305296182632446, "rewards/rejected": -16.07193946838379, "step": 700 }, { "epoch": 0.023762176008628535, "grad_norm": 32.76372146606445, "learning_rate": 2.3761375126390293e-07, "logits/chosen": -0.428058922290802, "logits/rejected": -0.39834824204444885, "logps/chosen": -1.963168740272522, "logps/rejected": -1.8921149969100952, "loss": 3.8068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.631689071655273, "rewards/margins": -0.7105369567871094, "rewards/rejected": -18.92115020751953, "step": 705 }, { "epoch": 0.023930702079611715, "grad_norm": 19.744258880615234, "learning_rate": 2.39298955173576e-07, "logits/chosen": -0.2931436002254486, "logits/rejected": -0.24489137530326843, "logps/chosen": -1.5167903900146484, "logps/rejected": -1.6117607355117798, "loss": 2.4292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.1679048538208, "rewards/margins": 0.9497024416923523, "rewards/rejected": -16.11760711669922, "step": 710 }, { "epoch": 0.024099228150594898, "grad_norm": 21.923982620239258, "learning_rate": 2.409841590832491e-07, "logits/chosen": -0.4506607949733734, "logits/rejected": -0.5256415605545044, "logps/chosen": -1.996930480003357, "logps/rejected": -1.7969143390655518, "loss": 5.0751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.96930503845215, "rewards/margins": -2.0001626014709473, "rewards/rejected": -17.96914291381836, "step": 715 }, { "epoch": 0.024267754221578078, "grad_norm": 23.204833984375, "learning_rate": 2.4266936299292215e-07, "logits/chosen": -0.43543902039527893, "logits/rejected": -0.37989646196365356, "logps/chosen": -1.9207398891448975, "logps/rejected": -1.9233181476593018, "loss": 3.4542, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.207401275634766, "rewards/margins": 0.02578134462237358, "rewards/rejected": -19.23318099975586, "step": 720 }, { "epoch": 0.024436280292561258, "grad_norm": 24.56550407409668, "learning_rate": 2.443545669025952e-07, "logits/chosen": -0.6575881838798523, "logits/rejected": -0.5963281989097595, "logps/chosen": -1.7525399923324585, "logps/rejected": -1.6743013858795166, "loss": 4.0134, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.525400161743164, "rewards/margins": -0.7823885679244995, "rewards/rejected": -16.743013381958008, "step": 725 }, { "epoch": 0.02460480636354444, "grad_norm": 19.73300552368164, "learning_rate": 2.4603977081226824e-07, "logits/chosen": -0.24325819313526154, "logits/rejected": -0.35052576661109924, "logps/chosen": -1.7987921237945557, "logps/rejected": -1.9659143686294556, "loss": 2.6491, "rewards/accuracies": 0.5, "rewards/chosen": -17.9879207611084, "rewards/margins": 1.6712220907211304, "rewards/rejected": -19.659143447875977, "step": 730 }, { "epoch": 0.02477333243452762, "grad_norm": 41.61115646362305, "learning_rate": 2.4772497472194136e-07, "logits/chosen": -0.7007697224617004, "logits/rejected": -0.6481397747993469, "logps/chosen": -1.6374019384384155, "logps/rejected": -1.7201652526855469, "loss": 2.6239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.374019622802734, "rewards/margins": 0.8276341557502747, "rewards/rejected": -17.2016544342041, "step": 735 }, { "epoch": 0.024941858505510804, "grad_norm": 14.423554420471191, "learning_rate": 2.4941017863161443e-07, "logits/chosen": -0.5714423656463623, "logits/rejected": -0.5921608209609985, "logps/chosen": -1.9647853374481201, "logps/rejected": -2.303321123123169, "loss": 2.7816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.64785385131836, "rewards/margins": 3.385359525680542, "rewards/rejected": -23.033212661743164, "step": 740 }, { "epoch": 0.025110384576493984, "grad_norm": 82.18790435791016, "learning_rate": 2.510953825412875e-07, "logits/chosen": -0.40886393189430237, "logits/rejected": -0.2524716258049011, "logps/chosen": -1.6332025527954102, "logps/rejected": -1.7254012823104858, "loss": 2.8306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.332027435302734, "rewards/margins": 0.9219877123832703, "rewards/rejected": -17.254013061523438, "step": 745 }, { "epoch": 0.025278910647477164, "grad_norm": 25.27329444885254, "learning_rate": 2.527805864509606e-07, "logits/chosen": -0.6217483878135681, "logits/rejected": -0.5722722411155701, "logps/chosen": -1.817664384841919, "logps/rejected": -1.8501255512237549, "loss": 2.7603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.17664337158203, "rewards/margins": 0.3246099352836609, "rewards/rejected": -18.50125503540039, "step": 750 }, { "epoch": 0.025447436718460347, "grad_norm": 38.237152099609375, "learning_rate": 2.5446579036063365e-07, "logits/chosen": -0.52363520860672, "logits/rejected": -0.5313522219657898, "logps/chosen": -1.905940294265747, "logps/rejected": -1.8410999774932861, "loss": 3.7816, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.059402465820312, "rewards/margins": -0.6484056711196899, "rewards/rejected": -18.41099739074707, "step": 755 }, { "epoch": 0.025615962789443527, "grad_norm": 37.8472785949707, "learning_rate": 2.5615099427030667e-07, "logits/chosen": -0.5234827399253845, "logits/rejected": -0.5807468891143799, "logps/chosen": -1.763085961341858, "logps/rejected": -1.7279046773910522, "loss": 3.4781, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.630859375, "rewards/margins": -0.35181283950805664, "rewards/rejected": -17.2790470123291, "step": 760 }, { "epoch": 0.025784488860426707, "grad_norm": 29.356088638305664, "learning_rate": 2.5783619817997974e-07, "logits/chosen": -0.5783195495605469, "logits/rejected": -0.5610502362251282, "logps/chosen": -1.7727210521697998, "logps/rejected": -1.6591606140136719, "loss": 4.158, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -17.72721290588379, "rewards/margins": -1.1356046199798584, "rewards/rejected": -16.59160804748535, "step": 765 }, { "epoch": 0.02595301493140989, "grad_norm": 22.93600845336914, "learning_rate": 2.5952140208965287e-07, "logits/chosen": -0.31732669472694397, "logits/rejected": -0.3070994019508362, "logps/chosen": -1.8145532608032227, "logps/rejected": -1.844628095626831, "loss": 2.8821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.14553451538086, "rewards/margins": 0.3007470965385437, "rewards/rejected": -18.446279525756836, "step": 770 }, { "epoch": 0.02612154100239307, "grad_norm": 36.81005859375, "learning_rate": 2.6120660599932594e-07, "logits/chosen": -0.6823596954345703, "logits/rejected": -0.8584939241409302, "logps/chosen": -1.685476303100586, "logps/rejected": -1.749103307723999, "loss": 2.5587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.85476303100586, "rewards/margins": 0.6362693905830383, "rewards/rejected": -17.491031646728516, "step": 775 }, { "epoch": 0.02629006707337625, "grad_norm": 11.87728214263916, "learning_rate": 2.6289180990899896e-07, "logits/chosen": -0.33666494488716125, "logits/rejected": -0.2485232651233673, "logps/chosen": -1.8858550786972046, "logps/rejected": -1.8398468494415283, "loss": 3.6989, "rewards/accuracies": 0.5, "rewards/chosen": -18.85854721069336, "rewards/margins": -0.46007975935935974, "rewards/rejected": -18.398468017578125, "step": 780 }, { "epoch": 0.026458593144359433, "grad_norm": 34.59413528442383, "learning_rate": 2.6457701381867203e-07, "logits/chosen": -0.678062915802002, "logits/rejected": -0.703747034072876, "logps/chosen": -1.4804130792617798, "logps/rejected": -1.481335997581482, "loss": 3.1245, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.804130554199219, "rewards/margins": 0.009229278191924095, "rewards/rejected": -14.813360214233398, "step": 785 }, { "epoch": 0.026627119215342613, "grad_norm": 13.038679122924805, "learning_rate": 2.662622177283451e-07, "logits/chosen": -0.43355339765548706, "logits/rejected": -0.26748722791671753, "logps/chosen": -1.9919134378433228, "logps/rejected": -2.074113368988037, "loss": 2.6707, "rewards/accuracies": 0.5, "rewards/chosen": -19.919132232666016, "rewards/margins": 0.8219999074935913, "rewards/rejected": -20.741134643554688, "step": 790 }, { "epoch": 0.026795645286325796, "grad_norm": 38.65684509277344, "learning_rate": 2.679474216380182e-07, "logits/chosen": -0.5652610063552856, "logits/rejected": -0.3666650950908661, "logps/chosen": -1.6639525890350342, "logps/rejected": -1.6299728155136108, "loss": 3.7574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.639524459838867, "rewards/margins": -0.33979684114456177, "rewards/rejected": -16.299728393554688, "step": 795 }, { "epoch": 0.026964171357308976, "grad_norm": 21.291852951049805, "learning_rate": 2.696326255476913e-07, "logits/chosen": -0.5117262005805969, "logits/rejected": -0.47947850823402405, "logps/chosen": -2.0055885314941406, "logps/rejected": -1.9679181575775146, "loss": 3.4746, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.055883407592773, "rewards/margins": -0.3767028748989105, "rewards/rejected": -19.679183959960938, "step": 800 }, { "epoch": 0.026964171357308976, "eval_logits/chosen": -0.6988369822502136, "eval_logits/rejected": -0.7001645565032959, "eval_logps/chosen": -1.655611276626587, "eval_logps/rejected": -1.6514620780944824, "eval_loss": 3.436960458755493, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.55611228942871, "eval_rewards/margins": -0.041492920368909836, "eval_rewards/rejected": -16.514619827270508, "eval_runtime": 12.9309, "eval_samples_per_second": 7.733, "eval_steps_per_second": 1.933, "step": 800 }, { "epoch": 0.027132697428292156, "grad_norm": 18.55634117126465, "learning_rate": 2.713178294573643e-07, "logits/chosen": -0.5753890872001648, "logits/rejected": -0.4462040364742279, "logps/chosen": -1.4152113199234009, "logps/rejected": -1.5044399499893188, "loss": 2.6021, "rewards/accuracies": 0.5, "rewards/chosen": -14.152114868164062, "rewards/margins": 0.8922847509384155, "rewards/rejected": -15.044398307800293, "step": 805 }, { "epoch": 0.02730122349927534, "grad_norm": 35.19956588745117, "learning_rate": 2.730030333670374e-07, "logits/chosen": -0.5496922731399536, "logits/rejected": -0.417860746383667, "logps/chosen": -1.7335258722305298, "logps/rejected": -1.7158492803573608, "loss": 3.4416, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.33525848388672, "rewards/margins": -0.176764577627182, "rewards/rejected": -17.158493041992188, "step": 810 }, { "epoch": 0.02746974957025852, "grad_norm": 21.941349029541016, "learning_rate": 2.7468823727671046e-07, "logits/chosen": -0.3967406749725342, "logits/rejected": -0.5216356515884399, "logps/chosen": -1.9088962078094482, "logps/rejected": -1.7400329113006592, "loss": 4.7099, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -19.08896255493164, "rewards/margins": -1.6886341571807861, "rewards/rejected": -17.400327682495117, "step": 815 }, { "epoch": 0.0276382756412417, "grad_norm": 17.868234634399414, "learning_rate": 2.763734411863836e-07, "logits/chosen": -0.37519901990890503, "logits/rejected": -0.4539732336997986, "logps/chosen": -1.8600950241088867, "logps/rejected": -1.8098268508911133, "loss": 3.6029, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.600950241088867, "rewards/margins": -0.5026828050613403, "rewards/rejected": -18.098268508911133, "step": 820 }, { "epoch": 0.027806801712224882, "grad_norm": 19.673343658447266, "learning_rate": 2.780586450960566e-07, "logits/chosen": -0.7934955954551697, "logits/rejected": -0.8090070486068726, "logps/chosen": -1.6202653646469116, "logps/rejected": -1.5587866306304932, "loss": 3.6798, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.202655792236328, "rewards/margins": -0.6147867441177368, "rewards/rejected": -15.587865829467773, "step": 825 }, { "epoch": 0.027975327783208062, "grad_norm": 30.37026023864746, "learning_rate": 2.797438490057297e-07, "logits/chosen": -0.5915518999099731, "logits/rejected": -0.6104450225830078, "logps/chosen": -1.6193252801895142, "logps/rejected": -1.656368613243103, "loss": 3.0309, "rewards/accuracies": 0.5, "rewards/chosen": -16.19325065612793, "rewards/margins": 0.3704357147216797, "rewards/rejected": -16.56368637084961, "step": 830 }, { "epoch": 0.028143853854191242, "grad_norm": 28.94577980041504, "learning_rate": 2.8142905291540275e-07, "logits/chosen": -0.3376855254173279, "logits/rejected": -0.36098232865333557, "logps/chosen": -1.6270509958267212, "logps/rejected": -1.7100646495819092, "loss": 2.4789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.270511627197266, "rewards/margins": 0.8301382064819336, "rewards/rejected": -17.10064697265625, "step": 835 }, { "epoch": 0.028312379925174425, "grad_norm": 23.16632652282715, "learning_rate": 2.831142568250758e-07, "logits/chosen": -0.10355281829833984, "logits/rejected": -0.10854457318782806, "logps/chosen": -1.6649757623672485, "logps/rejected": -2.3399498462677, "loss": 1.6261, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.649757385253906, "rewards/margins": 6.7497406005859375, "rewards/rejected": -23.39949607849121, "step": 840 }, { "epoch": 0.028480905996157605, "grad_norm": 31.516361236572266, "learning_rate": 2.8479946073474884e-07, "logits/chosen": -0.29107311367988586, "logits/rejected": -0.15227220952510834, "logps/chosen": -1.724825143814087, "logps/rejected": -1.6380412578582764, "loss": 4.0866, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.24825096130371, "rewards/margins": -0.8678376078605652, "rewards/rejected": -16.380413055419922, "step": 845 }, { "epoch": 0.028649432067140785, "grad_norm": 21.751916885375977, "learning_rate": 2.8648466464442196e-07, "logits/chosen": -0.3685319125652313, "logits/rejected": -0.47823366522789, "logps/chosen": -1.6271034479141235, "logps/rejected": -2.1975488662719727, "loss": 2.3549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.271034240722656, "rewards/margins": 5.7044548988342285, "rewards/rejected": -21.97549057006836, "step": 850 }, { "epoch": 0.02881795813812397, "grad_norm": 18.084596633911133, "learning_rate": 2.8816986855409504e-07, "logits/chosen": -0.28694620728492737, "logits/rejected": -0.3606267273426056, "logps/chosen": -1.6423877477645874, "logps/rejected": -1.6014961004257202, "loss": 3.7834, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.42387580871582, "rewards/margins": -0.4089130461215973, "rewards/rejected": -16.014963150024414, "step": 855 }, { "epoch": 0.028986484209107148, "grad_norm": 18.191057205200195, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.40083274245262146, "logits/rejected": -0.41338223218917847, "logps/chosen": -1.865255355834961, "logps/rejected": -2.0431108474731445, "loss": 1.8863, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.652551651000977, "rewards/margins": 1.7785552740097046, "rewards/rejected": -20.431108474731445, "step": 860 }, { "epoch": 0.02915501028009033, "grad_norm": 36.193058013916016, "learning_rate": 2.915402763734412e-07, "logits/chosen": -0.34367144107818604, "logits/rejected": -0.35139140486717224, "logps/chosen": -1.5019428730010986, "logps/rejected": -1.6563999652862549, "loss": 2.3754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.019430160522461, "rewards/margins": 1.544569969177246, "rewards/rejected": -16.56399917602539, "step": 865 }, { "epoch": 0.02932353635107351, "grad_norm": 29.971036911010742, "learning_rate": 2.932254802831142e-07, "logits/chosen": -0.4394436776638031, "logits/rejected": -0.489290326833725, "logps/chosen": -1.6216356754302979, "logps/rejected": -1.5399792194366455, "loss": 3.8889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.21635627746582, "rewards/margins": -0.8165642619132996, "rewards/rejected": -15.39979076385498, "step": 870 }, { "epoch": 0.02949206242205669, "grad_norm": 29.058055877685547, "learning_rate": 2.949106841927873e-07, "logits/chosen": -0.6171309351921082, "logits/rejected": -0.5241347551345825, "logps/chosen": -1.6407206058502197, "logps/rejected": -1.7146087884902954, "loss": 2.4908, "rewards/accuracies": 0.5, "rewards/chosen": -16.407207489013672, "rewards/margins": 0.7388814091682434, "rewards/rejected": -17.146087646484375, "step": 875 }, { "epoch": 0.029660588493039874, "grad_norm": 36.95437240600586, "learning_rate": 2.965958881024604e-07, "logits/chosen": -0.3627752661705017, "logits/rejected": -0.3334648907184601, "logps/chosen": -1.6400692462921143, "logps/rejected": -1.598304033279419, "loss": 3.5085, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.400691986083984, "rewards/margins": -0.417651891708374, "rewards/rejected": -15.983039855957031, "step": 880 }, { "epoch": 0.029829114564023054, "grad_norm": 30.360219955444336, "learning_rate": 2.9828109201213347e-07, "logits/chosen": -0.4603070616722107, "logits/rejected": -0.37161141633987427, "logps/chosen": -1.752171277999878, "logps/rejected": -1.8547074794769287, "loss": 2.3261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.521711349487305, "rewards/margins": 1.0253627300262451, "rewards/rejected": -18.547075271606445, "step": 885 }, { "epoch": 0.029997640635006234, "grad_norm": 24.124099731445312, "learning_rate": 2.999662959218065e-07, "logits/chosen": -0.32377538084983826, "logits/rejected": -0.4409562051296234, "logps/chosen": -1.6169246435165405, "logps/rejected": -1.5861228704452515, "loss": 3.446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.169246673583984, "rewards/margins": -0.30801552534103394, "rewards/rejected": -15.861230850219727, "step": 890 }, { "epoch": 0.030166166705989417, "grad_norm": 22.953449249267578, "learning_rate": 3.0165149983147956e-07, "logits/chosen": -0.6455360651016235, "logits/rejected": -0.6651984453201294, "logps/chosen": -1.7256028652191162, "logps/rejected": -1.8558244705200195, "loss": 2.8954, "rewards/accuracies": 0.5, "rewards/chosen": -17.256031036376953, "rewards/margins": 1.302215337753296, "rewards/rejected": -18.558244705200195, "step": 895 }, { "epoch": 0.030334692776972597, "grad_norm": 22.631338119506836, "learning_rate": 3.033367037411527e-07, "logits/chosen": -0.47631579637527466, "logits/rejected": -0.5697728395462036, "logps/chosen": -1.7963521480560303, "logps/rejected": -1.802026391029358, "loss": 3.335, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.963520050048828, "rewards/margins": 0.05674257129430771, "rewards/rejected": -18.020263671875, "step": 900 }, { "epoch": 0.030503218847955777, "grad_norm": 13.581130981445312, "learning_rate": 3.0502190765082576e-07, "logits/chosen": -0.5964738726615906, "logits/rejected": -0.6222713589668274, "logps/chosen": -1.6977436542510986, "logps/rejected": -1.6556882858276367, "loss": 3.6774, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.977436065673828, "rewards/margins": -0.42055463790893555, "rewards/rejected": -16.556880950927734, "step": 905 }, { "epoch": 0.03067174491893896, "grad_norm": 24.475252151489258, "learning_rate": 3.0670711156049883e-07, "logits/chosen": -0.2089192122220993, "logits/rejected": -0.14673969149589539, "logps/chosen": -1.7369495630264282, "logps/rejected": -1.7116025686264038, "loss": 3.3661, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.369495391845703, "rewards/margins": -0.253470242023468, "rewards/rejected": -17.116024017333984, "step": 910 }, { "epoch": 0.03084027098992214, "grad_norm": 15.064949989318848, "learning_rate": 3.0839231547017185e-07, "logits/chosen": -0.41629713773727417, "logits/rejected": -0.28385329246520996, "logps/chosen": -1.7120001316070557, "logps/rejected": -1.910131812095642, "loss": 2.2545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.1200008392334, "rewards/margins": 1.9813172817230225, "rewards/rejected": -19.101320266723633, "step": 915 }, { "epoch": 0.031008797060905324, "grad_norm": 18.73622703552246, "learning_rate": 3.100775193798449e-07, "logits/chosen": -0.5003215074539185, "logits/rejected": -0.4684736132621765, "logps/chosen": -1.7935521602630615, "logps/rejected": -2.2938485145568848, "loss": 2.5871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.935522079467773, "rewards/margins": 5.002964973449707, "rewards/rejected": -22.938486099243164, "step": 920 }, { "epoch": 0.031177323131888503, "grad_norm": 23.802066802978516, "learning_rate": 3.1176272328951804e-07, "logits/chosen": -0.6484017968177795, "logits/rejected": -0.8259338140487671, "logps/chosen": -1.4363877773284912, "logps/rejected": -1.5680042505264282, "loss": 2.2103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.36387825012207, "rewards/margins": 1.316164255142212, "rewards/rejected": -15.680041313171387, "step": 925 }, { "epoch": 0.03134584920287169, "grad_norm": 20.22801399230957, "learning_rate": 3.134479271991911e-07, "logits/chosen": -0.6938272714614868, "logits/rejected": -0.5847961902618408, "logps/chosen": -1.6226203441619873, "logps/rejected": -1.6188541650772095, "loss": 3.099, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.2262020111084, "rewards/margins": -0.03766317293047905, "rewards/rejected": -16.188541412353516, "step": 930 }, { "epoch": 0.03151437527385487, "grad_norm": 36.12754821777344, "learning_rate": 3.1513313110886413e-07, "logits/chosen": -0.3781919777393341, "logits/rejected": -0.21472349762916565, "logps/chosen": -1.8906118869781494, "logps/rejected": -1.7393276691436768, "loss": 4.7426, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.906116485595703, "rewards/margins": -1.512840986251831, "rewards/rejected": -17.39327621459961, "step": 935 }, { "epoch": 0.031682901344838046, "grad_norm": 169.67495727539062, "learning_rate": 3.168183350185372e-07, "logits/chosen": -0.3850443661212921, "logits/rejected": -0.31055304408073425, "logps/chosen": -2.1397993564605713, "logps/rejected": -2.0641467571258545, "loss": 4.6265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.397991180419922, "rewards/margins": -0.7565252184867859, "rewards/rejected": -20.641468048095703, "step": 940 }, { "epoch": 0.031851427415821226, "grad_norm": 51.70778274536133, "learning_rate": 3.185035389282103e-07, "logits/chosen": -0.5556879043579102, "logits/rejected": -0.6957587599754333, "logps/chosen": -1.6177966594696045, "logps/rejected": -1.445115327835083, "loss": 4.8937, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.177967071533203, "rewards/margins": -1.7268126010894775, "rewards/rejected": -14.451153755187988, "step": 945 }, { "epoch": 0.032019953486804406, "grad_norm": 41.51219940185547, "learning_rate": 3.201887428378834e-07, "logits/chosen": -0.553308367729187, "logits/rejected": -0.2694825530052185, "logps/chosen": -1.7990858554840088, "logps/rejected": -1.7950258255004883, "loss": 3.4404, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.990856170654297, "rewards/margins": -0.04059934616088867, "rewards/rejected": -17.950258255004883, "step": 950 }, { "epoch": 0.03218847955778759, "grad_norm": 39.060516357421875, "learning_rate": 3.218739467475564e-07, "logits/chosen": -0.21345441043376923, "logits/rejected": -0.2154117077589035, "logps/chosen": -2.0603363513946533, "logps/rejected": -1.9187877178192139, "loss": 4.559, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.603363037109375, "rewards/margins": -1.4154869318008423, "rewards/rejected": -19.187877655029297, "step": 955 }, { "epoch": 0.03235700562877077, "grad_norm": 31.9559268951416, "learning_rate": 3.235591506572295e-07, "logits/chosen": -0.5925837755203247, "logits/rejected": -0.4998060166835785, "logps/chosen": -1.7358916997909546, "logps/rejected": -1.7558130025863647, "loss": 3.0011, "rewards/accuracies": 0.5, "rewards/chosen": -17.358917236328125, "rewards/margins": 0.19921168684959412, "rewards/rejected": -17.558130264282227, "step": 960 }, { "epoch": 0.03252553169975395, "grad_norm": 22.82416534423828, "learning_rate": 3.2524435456690257e-07, "logits/chosen": -0.28516143560409546, "logits/rejected": -0.2573995888233185, "logps/chosen": -1.8022711277008057, "logps/rejected": -1.7835382223129272, "loss": 3.5213, "rewards/accuracies": 0.5, "rewards/chosen": -18.022708892822266, "rewards/margins": -0.18732872605323792, "rewards/rejected": -17.83538246154785, "step": 965 }, { "epoch": 0.03269405777073713, "grad_norm": 28.213504791259766, "learning_rate": 3.2692955847657564e-07, "logits/chosen": -0.5727102756500244, "logits/rejected": -0.5985099077224731, "logps/chosen": -1.98666512966156, "logps/rejected": -1.7860243320465088, "loss": 5.0562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.86665153503418, "rewards/margins": -2.00640869140625, "rewards/rejected": -17.86024284362793, "step": 970 }, { "epoch": 0.03286258384172031, "grad_norm": 20.811389923095703, "learning_rate": 3.2861476238624876e-07, "logits/chosen": -0.3460441827774048, "logits/rejected": -0.4190608561038971, "logps/chosen": -1.6150789260864258, "logps/rejected": -1.7487542629241943, "loss": 2.3329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.150787353515625, "rewards/margins": 1.3367559909820557, "rewards/rejected": -17.487545013427734, "step": 975 }, { "epoch": 0.03303110991270349, "grad_norm": 27.982589721679688, "learning_rate": 3.302999662959218e-07, "logits/chosen": -0.6669625043869019, "logits/rejected": -0.5034026503562927, "logps/chosen": -1.5034055709838867, "logps/rejected": -1.5552198886871338, "loss": 2.7122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.034055709838867, "rewards/margins": 0.518143892288208, "rewards/rejected": -15.55219841003418, "step": 980 }, { "epoch": 0.03319963598368668, "grad_norm": 19.364490509033203, "learning_rate": 3.3198517020559485e-07, "logits/chosen": -0.3645666539669037, "logits/rejected": -0.3036833703517914, "logps/chosen": -1.6654491424560547, "logps/rejected": -1.8591690063476562, "loss": 2.1034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.654491424560547, "rewards/margins": 1.9371992349624634, "rewards/rejected": -18.591690063476562, "step": 985 }, { "epoch": 0.03336816205466986, "grad_norm": 16.458690643310547, "learning_rate": 3.336703741152679e-07, "logits/chosen": -0.6402947306632996, "logits/rejected": -0.7181426286697388, "logps/chosen": -1.6192500591278076, "logps/rejected": -1.620171308517456, "loss": 3.1025, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.192501068115234, "rewards/margins": 0.009212017059326172, "rewards/rejected": -16.20171356201172, "step": 990 }, { "epoch": 0.03353668812565304, "grad_norm": 22.21959686279297, "learning_rate": 3.35355578024941e-07, "logits/chosen": -0.4840850830078125, "logits/rejected": -0.41178077459335327, "logps/chosen": -1.736339807510376, "logps/rejected": -1.763511300086975, "loss": 2.9488, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.3633975982666, "rewards/margins": 0.27171534299850464, "rewards/rejected": -17.635112762451172, "step": 995 }, { "epoch": 0.03370521419663622, "grad_norm": 23.295127868652344, "learning_rate": 3.3704078193461407e-07, "logits/chosen": -0.09787406027317047, "logits/rejected": -0.16445288062095642, "logps/chosen": -1.9367231130599976, "logps/rejected": -1.9507242441177368, "loss": 3.0651, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.367233276367188, "rewards/margins": 0.14000901579856873, "rewards/rejected": -19.50724220275879, "step": 1000 }, { "epoch": 0.0338737402676194, "grad_norm": 15.762697219848633, "learning_rate": 3.3872598584428714e-07, "logits/chosen": -0.35486823320388794, "logits/rejected": -0.24091534316539764, "logps/chosen": -1.7885253429412842, "logps/rejected": -1.9451637268066406, "loss": 2.5787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.885255813598633, "rewards/margins": 1.5663820505142212, "rewards/rejected": -19.451635360717773, "step": 1005 }, { "epoch": 0.034042266338602585, "grad_norm": 20.35264778137207, "learning_rate": 3.404111897539602e-07, "logits/chosen": -0.7031236886978149, "logits/rejected": -0.7049790620803833, "logps/chosen": -1.589521884918213, "logps/rejected": -1.5903050899505615, "loss": 3.4214, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.895219802856445, "rewards/margins": 0.007831478491425514, "rewards/rejected": -15.903048515319824, "step": 1010 }, { "epoch": 0.034210792409585765, "grad_norm": 26.860595703125, "learning_rate": 3.420963936636333e-07, "logits/chosen": -0.5545657873153687, "logits/rejected": -0.47453317046165466, "logps/chosen": -2.0342624187469482, "logps/rejected": -2.24369478225708, "loss": 2.2526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.34262466430664, "rewards/margins": 2.0943217277526855, "rewards/rejected": -22.436946868896484, "step": 1015 }, { "epoch": 0.034379318480568945, "grad_norm": 19.55118751525879, "learning_rate": 3.4378159757330636e-07, "logits/chosen": -0.237474724650383, "logits/rejected": -0.09069846570491791, "logps/chosen": -1.5036439895629883, "logps/rejected": -1.4696455001831055, "loss": 3.5962, "rewards/accuracies": 0.5, "rewards/chosen": -15.0364408493042, "rewards/margins": -0.3399861454963684, "rewards/rejected": -14.696454048156738, "step": 1020 }, { "epoch": 0.034547844551552125, "grad_norm": 24.427839279174805, "learning_rate": 3.4546680148297943e-07, "logits/chosen": -0.029159266501665115, "logits/rejected": 0.004416605923324823, "logps/chosen": -1.6479896306991577, "logps/rejected": -1.6794894933700562, "loss": 2.8159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.479896545410156, "rewards/margins": 0.3149985373020172, "rewards/rejected": -16.79489517211914, "step": 1025 }, { "epoch": 0.034716370622535304, "grad_norm": 35.8406867980957, "learning_rate": 3.471520053926525e-07, "logits/chosen": -0.5084778666496277, "logits/rejected": -0.47411975264549255, "logps/chosen": -1.8330333232879639, "logps/rejected": -1.8400707244873047, "loss": 3.1579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.330333709716797, "rewards/margins": 0.07037486881017685, "rewards/rejected": -18.400707244873047, "step": 1030 }, { "epoch": 0.034884896693518484, "grad_norm": 12.815260887145996, "learning_rate": 3.4883720930232557e-07, "logits/chosen": -0.44474729895591736, "logits/rejected": -0.5331543684005737, "logps/chosen": -1.9221336841583252, "logps/rejected": -2.0161516666412354, "loss": 2.7604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.221338272094727, "rewards/margins": 0.9401793479919434, "rewards/rejected": -20.161518096923828, "step": 1035 }, { "epoch": 0.03505342276450167, "grad_norm": 19.394817352294922, "learning_rate": 3.5052241321199864e-07, "logits/chosen": -0.2036978304386139, "logits/rejected": -0.1269582062959671, "logps/chosen": -2.039153814315796, "logps/rejected": -1.904292345046997, "loss": 4.7773, "rewards/accuracies": 0.5, "rewards/chosen": -20.391536712646484, "rewards/margins": -1.3486132621765137, "rewards/rejected": -19.042922973632812, "step": 1040 }, { "epoch": 0.03522194883548485, "grad_norm": 31.41037368774414, "learning_rate": 3.5220761712167166e-07, "logits/chosen": -0.42470335960388184, "logits/rejected": -0.5246855616569519, "logps/chosen": -2.123039722442627, "logps/rejected": -2.208981990814209, "loss": 3.0471, "rewards/accuracies": 0.5, "rewards/chosen": -21.230396270751953, "rewards/margins": 0.8594244122505188, "rewards/rejected": -22.089818954467773, "step": 1045 }, { "epoch": 0.03539047490646803, "grad_norm": 16.57583236694336, "learning_rate": 3.538928210313448e-07, "logits/chosen": -0.26511508226394653, "logits/rejected": -0.28311991691589355, "logps/chosen": -1.609574317932129, "logps/rejected": -1.6120364665985107, "loss": 3.1126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.09574317932129, "rewards/margins": 0.02462158165872097, "rewards/rejected": -16.120365142822266, "step": 1050 }, { "epoch": 0.03555900097745121, "grad_norm": 19.813968658447266, "learning_rate": 3.5557802494101786e-07, "logits/chosen": -0.7508059144020081, "logits/rejected": -0.8045485615730286, "logps/chosen": -1.5364524126052856, "logps/rejected": -1.5765130519866943, "loss": 3.1092, "rewards/accuracies": 0.5, "rewards/chosen": -15.364524841308594, "rewards/margins": 0.40060538053512573, "rewards/rejected": -15.765130996704102, "step": 1055 }, { "epoch": 0.03572752704843439, "grad_norm": 25.874378204345703, "learning_rate": 3.5726322885069093e-07, "logits/chosen": -0.4468226432800293, "logits/rejected": -0.39583808183670044, "logps/chosen": -1.6582987308502197, "logps/rejected": -1.787592887878418, "loss": 2.8136, "rewards/accuracies": 0.5, "rewards/chosen": -16.58298683166504, "rewards/margins": 1.2929418087005615, "rewards/rejected": -17.875926971435547, "step": 1060 }, { "epoch": 0.03589605311941758, "grad_norm": 23.56908416748047, "learning_rate": 3.5894843276036395e-07, "logits/chosen": -0.50593101978302, "logits/rejected": -0.5206912755966187, "logps/chosen": -1.942335844039917, "logps/rejected": -1.8451474905014038, "loss": 4.047, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.423358917236328, "rewards/margins": -0.9718831181526184, "rewards/rejected": -18.451473236083984, "step": 1065 }, { "epoch": 0.03606457919040076, "grad_norm": 25.075082778930664, "learning_rate": 3.60633636670037e-07, "logits/chosen": -0.34409254789352417, "logits/rejected": -0.5635167956352234, "logps/chosen": -1.6613903045654297, "logps/rejected": -1.6975667476654053, "loss": 2.8626, "rewards/accuracies": 0.5, "rewards/chosen": -16.613903045654297, "rewards/margins": 0.3617649972438812, "rewards/rejected": -16.97566795349121, "step": 1070 }, { "epoch": 0.03623310526138394, "grad_norm": 24.480600357055664, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5511472821235657, "logits/rejected": -0.5344496965408325, "logps/chosen": -1.5300863981246948, "logps/rejected": -1.600724458694458, "loss": 2.7084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.300865173339844, "rewards/margins": 0.7063789367675781, "rewards/rejected": -16.007244110107422, "step": 1075 }, { "epoch": 0.03640163133236712, "grad_norm": 26.33382797241211, "learning_rate": 3.640040444893832e-07, "logits/chosen": -0.22453565895557404, "logits/rejected": -0.2891274094581604, "logps/chosen": -1.901058554649353, "logps/rejected": -1.7239421606063843, "loss": 4.8905, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.01058578491211, "rewards/margins": -1.7711639404296875, "rewards/rejected": -17.239421844482422, "step": 1080 }, { "epoch": 0.0365701574033503, "grad_norm": 25.77227210998535, "learning_rate": 3.656892483990563e-07, "logits/chosen": -0.506012499332428, "logits/rejected": -0.5107889175415039, "logps/chosen": -1.7310975790023804, "logps/rejected": -1.6403017044067383, "loss": 3.967, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.310977935791016, "rewards/margins": -0.9079577326774597, "rewards/rejected": -16.403018951416016, "step": 1085 }, { "epoch": 0.036738683474333476, "grad_norm": 20.275495529174805, "learning_rate": 3.673744523087293e-07, "logits/chosen": -0.48249855637550354, "logits/rejected": -0.4290506839752197, "logps/chosen": -1.869666337966919, "logps/rejected": -1.8338606357574463, "loss": 3.5558, "rewards/accuracies": 0.5, "rewards/chosen": -18.696664810180664, "rewards/margins": -0.35805749893188477, "rewards/rejected": -18.338603973388672, "step": 1090 }, { "epoch": 0.03690720954531666, "grad_norm": 21.183048248291016, "learning_rate": 3.690596562184024e-07, "logits/chosen": -0.44681963324546814, "logits/rejected": -0.3580966293811798, "logps/chosen": -1.7726942300796509, "logps/rejected": -1.85599684715271, "loss": 2.7325, "rewards/accuracies": 0.5, "rewards/chosen": -17.72694206237793, "rewards/margins": 0.8330275416374207, "rewards/rejected": -18.55997085571289, "step": 1095 }, { "epoch": 0.03707573561629984, "grad_norm": 27.373397827148438, "learning_rate": 3.707448601280755e-07, "logits/chosen": -0.4769948422908783, "logits/rejected": -0.33051571249961853, "logps/chosen": -1.8934491872787476, "logps/rejected": -1.8802127838134766, "loss": 4.1084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.934494018554688, "rewards/margins": -0.13236340880393982, "rewards/rejected": -18.802127838134766, "step": 1100 }, { "epoch": 0.03724426168728302, "grad_norm": 33.653560638427734, "learning_rate": 3.724300640377486e-07, "logits/chosen": -0.5165948867797852, "logits/rejected": -0.4417875409126282, "logps/chosen": -1.9007021188735962, "logps/rejected": -1.9125080108642578, "loss": 3.0777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.007020950317383, "rewards/margins": 0.11805801093578339, "rewards/rejected": -19.125080108642578, "step": 1105 }, { "epoch": 0.0374127877582662, "grad_norm": 64.54642486572266, "learning_rate": 3.741152679474216e-07, "logits/chosen": -0.16624772548675537, "logits/rejected": -0.4547352194786072, "logps/chosen": -1.6494159698486328, "logps/rejected": -1.7214076519012451, "loss": 2.7371, "rewards/accuracies": 0.5, "rewards/chosen": -16.494159698486328, "rewards/margins": 0.7199157476425171, "rewards/rejected": -17.21407699584961, "step": 1110 }, { "epoch": 0.03758131382924938, "grad_norm": 24.6578369140625, "learning_rate": 3.7580047185709467e-07, "logits/chosen": -0.014136564917862415, "logits/rejected": -0.16067324578762054, "logps/chosen": -2.145573139190674, "logps/rejected": -2.0913286209106445, "loss": 3.6231, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.455732345581055, "rewards/margins": -0.5424462556838989, "rewards/rejected": -20.913288116455078, "step": 1115 }, { "epoch": 0.03774983990023257, "grad_norm": 23.080047607421875, "learning_rate": 3.7748567576676774e-07, "logits/chosen": -0.7539829015731812, "logits/rejected": -0.7695743441581726, "logps/chosen": -1.6319653987884521, "logps/rejected": -1.5968726873397827, "loss": 3.553, "rewards/accuracies": 0.5, "rewards/chosen": -16.31965446472168, "rewards/margins": -0.35092657804489136, "rewards/rejected": -15.968729019165039, "step": 1120 }, { "epoch": 0.03791836597121575, "grad_norm": 43.620174407958984, "learning_rate": 3.7917087967644087e-07, "logits/chosen": -0.5959217548370361, "logits/rejected": -0.5997222065925598, "logps/chosen": -1.7780771255493164, "logps/rejected": -1.747106909751892, "loss": 3.3741, "rewards/accuracies": 0.5, "rewards/chosen": -17.780773162841797, "rewards/margins": -0.30970388650894165, "rewards/rejected": -17.4710693359375, "step": 1125 }, { "epoch": 0.03808689204219893, "grad_norm": 28.987207412719727, "learning_rate": 3.8085608358611394e-07, "logits/chosen": -0.10104711353778839, "logits/rejected": -0.13577064871788025, "logps/chosen": -2.0339839458465576, "logps/rejected": -2.018089771270752, "loss": 3.2938, "rewards/accuracies": 0.5, "rewards/chosen": -20.339839935302734, "rewards/margins": -0.15894070267677307, "rewards/rejected": -20.180898666381836, "step": 1130 }, { "epoch": 0.03825541811318211, "grad_norm": 27.349302291870117, "learning_rate": 3.8254128749578696e-07, "logits/chosen": -0.4686538279056549, "logits/rejected": -0.33619728684425354, "logps/chosen": -1.8348045349121094, "logps/rejected": -1.895215630531311, "loss": 2.9007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.348045349121094, "rewards/margins": 0.6041110754013062, "rewards/rejected": -18.952157974243164, "step": 1135 }, { "epoch": 0.03842394418416529, "grad_norm": 49.29142761230469, "learning_rate": 3.8422649140546003e-07, "logits/chosen": -0.6437035799026489, "logits/rejected": -0.6373649835586548, "logps/chosen": -1.6333469152450562, "logps/rejected": -1.700933814048767, "loss": 2.5222, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.33346939086914, "rewards/margins": 0.6758671998977661, "rewards/rejected": -17.009336471557617, "step": 1140 }, { "epoch": 0.03859247025514847, "grad_norm": 27.445465087890625, "learning_rate": 3.859116953151331e-07, "logits/chosen": -0.03426782041788101, "logits/rejected": -0.028365587815642357, "logps/chosen": -2.383859395980835, "logps/rejected": -2.457791805267334, "loss": 3.2568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.838594436645508, "rewards/margins": 0.7393231391906738, "rewards/rejected": -24.577917098999023, "step": 1145 }, { "epoch": 0.038760996326131655, "grad_norm": 22.15361213684082, "learning_rate": 3.8759689922480623e-07, "logits/chosen": -0.561955988407135, "logits/rejected": -0.6235911250114441, "logps/chosen": -1.4886285066604614, "logps/rejected": -1.5125375986099243, "loss": 2.915, "rewards/accuracies": 0.5, "rewards/chosen": -14.886285781860352, "rewards/margins": 0.239091157913208, "rewards/rejected": -15.12537670135498, "step": 1150 }, { "epoch": 0.038929522397114835, "grad_norm": 19.871843338012695, "learning_rate": 3.8928210313447925e-07, "logits/chosen": -0.6531854867935181, "logits/rejected": -0.6476200819015503, "logps/chosen": -1.7495161294937134, "logps/rejected": -1.8894214630126953, "loss": 2.1886, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.495162963867188, "rewards/margins": 1.399052381515503, "rewards/rejected": -18.894214630126953, "step": 1155 }, { "epoch": 0.039098048468098015, "grad_norm": 38.3976936340332, "learning_rate": 3.909673070441523e-07, "logits/chosen": -0.5150366425514221, "logits/rejected": -0.512414813041687, "logps/chosen": -1.4856387376785278, "logps/rejected": -1.4582234621047974, "loss": 3.3862, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.8563871383667, "rewards/margins": -0.2741526663303375, "rewards/rejected": -14.582234382629395, "step": 1160 }, { "epoch": 0.039266574539081195, "grad_norm": 21.68328285217285, "learning_rate": 3.926525109538254e-07, "logits/chosen": -0.2962788939476013, "logits/rejected": -0.34172508120536804, "logps/chosen": -1.905922293663025, "logps/rejected": -1.9026237726211548, "loss": 3.358, "rewards/accuracies": 0.5, "rewards/chosen": -19.059223175048828, "rewards/margins": -0.03298645094037056, "rewards/rejected": -19.02623748779297, "step": 1165 }, { "epoch": 0.039435100610064375, "grad_norm": 24.6955623626709, "learning_rate": 3.9433771486349846e-07, "logits/chosen": -0.401714026927948, "logits/rejected": -0.4989330768585205, "logps/chosen": -1.7247121334075928, "logps/rejected": -1.6620609760284424, "loss": 3.7205, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.247121810913086, "rewards/margins": -0.6265131235122681, "rewards/rejected": -16.620609283447266, "step": 1170 }, { "epoch": 0.03960362668104756, "grad_norm": 22.365659713745117, "learning_rate": 3.9602291877317153e-07, "logits/chosen": -0.5538057088851929, "logits/rejected": -0.3844769597053528, "logps/chosen": -1.7564191818237305, "logps/rejected": -1.7836357355117798, "loss": 2.8724, "rewards/accuracies": 0.5, "rewards/chosen": -17.564189910888672, "rewards/margins": 0.2721673548221588, "rewards/rejected": -17.83635902404785, "step": 1175 }, { "epoch": 0.03977215275203074, "grad_norm": 19.269901275634766, "learning_rate": 3.977081226828446e-07, "logits/chosen": -0.432064950466156, "logits/rejected": -0.31601718068122864, "logps/chosen": -1.9622234106063843, "logps/rejected": -2.001495122909546, "loss": 3.1216, "rewards/accuracies": 0.5, "rewards/chosen": -19.622234344482422, "rewards/margins": 0.3927184045314789, "rewards/rejected": -20.01495361328125, "step": 1180 }, { "epoch": 0.03994067882301392, "grad_norm": 25.248796463012695, "learning_rate": 3.993933265925177e-07, "logits/chosen": -0.47508859634399414, "logits/rejected": -0.5017315149307251, "logps/chosen": -1.7752971649169922, "logps/rejected": -2.049936294555664, "loss": 3.234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.752971649169922, "rewards/margins": 2.7463924884796143, "rewards/rejected": -20.49936294555664, "step": 1185 }, { "epoch": 0.0401092048939971, "grad_norm": 20.06141471862793, "learning_rate": 4.0107853050219075e-07, "logits/chosen": -0.17705413699150085, "logits/rejected": -0.25445953011512756, "logps/chosen": -1.946390151977539, "logps/rejected": -1.9433777332305908, "loss": 3.1987, "rewards/accuracies": 0.5, "rewards/chosen": -19.46390151977539, "rewards/margins": -0.03012552298605442, "rewards/rejected": -19.43377685546875, "step": 1190 }, { "epoch": 0.04027773096498028, "grad_norm": 33.42091751098633, "learning_rate": 4.027637344118638e-07, "logits/chosen": -0.5977298021316528, "logits/rejected": -0.6200628280639648, "logps/chosen": -1.7825301885604858, "logps/rejected": -1.7205543518066406, "loss": 3.6795, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.825302124023438, "rewards/margins": -0.6197582483291626, "rewards/rejected": -17.20554542541504, "step": 1195 }, { "epoch": 0.04044625703596346, "grad_norm": 18.302885055541992, "learning_rate": 4.044489383215369e-07, "logits/chosen": -0.6540176868438721, "logits/rejected": -0.6137186884880066, "logps/chosen": -1.7022262811660767, "logps/rejected": -1.7463051080703735, "loss": 2.8856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.02226448059082, "rewards/margins": 0.4407869279384613, "rewards/rejected": -17.463048934936523, "step": 1200 }, { "epoch": 0.04044625703596346, "eval_logits/chosen": -0.6984472870826721, "eval_logits/rejected": -0.6997342109680176, "eval_logps/chosen": -1.656233549118042, "eval_logps/rejected": -1.6515949964523315, "eval_loss": 3.4398796558380127, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.562335968017578, "eval_rewards/margins": -0.04638513922691345, "eval_rewards/rejected": -16.51595115661621, "eval_runtime": 12.9174, "eval_samples_per_second": 7.741, "eval_steps_per_second": 1.935, "step": 1200 }, { "epoch": 0.04061478310694665, "grad_norm": 23.862524032592773, "learning_rate": 4.0613414223120997e-07, "logits/chosen": -0.5462231040000916, "logits/rejected": -0.5446901917457581, "logps/chosen": -1.6794078350067139, "logps/rejected": -1.7836143970489502, "loss": 2.4901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.794078826904297, "rewards/margins": 1.0420640707015991, "rewards/rejected": -17.83614158630371, "step": 1205 }, { "epoch": 0.04078330917792983, "grad_norm": 18.685237884521484, "learning_rate": 4.0781934614088304e-07, "logits/chosen": -0.36270827054977417, "logits/rejected": -0.520888090133667, "logps/chosen": -1.7202644348144531, "logps/rejected": -1.7153618335723877, "loss": 3.3016, "rewards/accuracies": 0.5, "rewards/chosen": -17.2026424407959, "rewards/margins": -0.049027156084775925, "rewards/rejected": -17.15361785888672, "step": 1210 }, { "epoch": 0.04095183524891301, "grad_norm": 27.324533462524414, "learning_rate": 4.095045500505561e-07, "logits/chosen": -0.521334707736969, "logits/rejected": -0.5759360194206238, "logps/chosen": -1.778804063796997, "logps/rejected": -1.7410389184951782, "loss": 3.4761, "rewards/accuracies": 0.5, "rewards/chosen": -17.788042068481445, "rewards/margins": -0.37765082716941833, "rewards/rejected": -17.410388946533203, "step": 1215 }, { "epoch": 0.04112036131989619, "grad_norm": 13.48217487335205, "learning_rate": 4.1118975396022913e-07, "logits/chosen": 0.02969430759549141, "logits/rejected": -0.014116739854216576, "logps/chosen": -1.8857414722442627, "logps/rejected": -1.967960000038147, "loss": 3.302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.8574161529541, "rewards/margins": 0.8221861720085144, "rewards/rejected": -19.679601669311523, "step": 1220 }, { "epoch": 0.04128888739087937, "grad_norm": 58.38434600830078, "learning_rate": 4.1287495786990225e-07, "logits/chosen": -0.4893341660499573, "logits/rejected": -0.4025818705558777, "logps/chosen": -1.5990569591522217, "logps/rejected": -1.6483962535858154, "loss": 2.7695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.990570068359375, "rewards/margins": 0.4933937191963196, "rewards/rejected": -16.483963012695312, "step": 1225 }, { "epoch": 0.04145741346186255, "grad_norm": 53.46418380737305, "learning_rate": 4.145601617795753e-07, "logits/chosen": -0.5396580696105957, "logits/rejected": -0.6176207661628723, "logps/chosen": -1.6050565242767334, "logps/rejected": -1.5374106168746948, "loss": 3.7522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.05056381225586, "rewards/margins": -0.6764588356018066, "rewards/rejected": -15.374105453491211, "step": 1230 }, { "epoch": 0.041625939532845734, "grad_norm": 23.273683547973633, "learning_rate": 4.162453656892484e-07, "logits/chosen": -0.517733097076416, "logits/rejected": -0.46530431509017944, "logps/chosen": -1.717034101486206, "logps/rejected": -1.7212120294570923, "loss": 3.0328, "rewards/accuracies": 0.5, "rewards/chosen": -17.17034149169922, "rewards/margins": 0.041781235486269, "rewards/rejected": -17.21212387084961, "step": 1235 }, { "epoch": 0.04179446560382891, "grad_norm": 32.8110237121582, "learning_rate": 4.179305695989214e-07, "logits/chosen": -0.5621334314346313, "logits/rejected": -0.4572630524635315, "logps/chosen": -1.8699407577514648, "logps/rejected": -1.951944351196289, "loss": 2.7422, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.699405670166016, "rewards/margins": 0.8200358152389526, "rewards/rejected": -19.519441604614258, "step": 1240 }, { "epoch": 0.04196299167481209, "grad_norm": 21.079713821411133, "learning_rate": 4.196157735085945e-07, "logits/chosen": -0.32299450039863586, "logits/rejected": -0.2742319703102112, "logps/chosen": -2.0296037197113037, "logps/rejected": -1.9701045751571655, "loss": 3.8158, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.296039581298828, "rewards/margins": -0.5949932932853699, "rewards/rejected": -19.701045989990234, "step": 1245 }, { "epoch": 0.04213151774579527, "grad_norm": 26.673555374145508, "learning_rate": 4.213009774182676e-07, "logits/chosen": -0.41463375091552734, "logits/rejected": -0.4377075135707855, "logps/chosen": -1.6575113534927368, "logps/rejected": -1.534987449645996, "loss": 4.3934, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.57511329650879, "rewards/margins": -1.2252376079559326, "rewards/rejected": -15.349874496459961, "step": 1250 }, { "epoch": 0.04230004381677845, "grad_norm": 17.89075469970703, "learning_rate": 4.229861813279407e-07, "logits/chosen": -0.6064554452896118, "logits/rejected": -0.6212650537490845, "logps/chosen": -1.851464867591858, "logps/rejected": -1.6988131999969482, "loss": 4.5567, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.514650344848633, "rewards/margins": -1.526517391204834, "rewards/rejected": -16.98813247680664, "step": 1255 }, { "epoch": 0.04246856988776164, "grad_norm": 33.90618896484375, "learning_rate": 4.2467138523761376e-07, "logits/chosen": -0.2495536506175995, "logits/rejected": -0.25900721549987793, "logps/chosen": -1.6897900104522705, "logps/rejected": -1.7224485874176025, "loss": 3.0004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.897899627685547, "rewards/margins": 0.3265857696533203, "rewards/rejected": -17.224485397338867, "step": 1260 }, { "epoch": 0.04263709595874482, "grad_norm": 15.698116302490234, "learning_rate": 4.263565891472868e-07, "logits/chosen": -0.5363945364952087, "logits/rejected": -0.5787938237190247, "logps/chosen": -1.9835312366485596, "logps/rejected": -1.9302680492401123, "loss": 3.981, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.835309982299805, "rewards/margins": -0.5326288342475891, "rewards/rejected": -19.302682876586914, "step": 1265 }, { "epoch": 0.042805622029728, "grad_norm": 25.389694213867188, "learning_rate": 4.2804179305695985e-07, "logits/chosen": -0.29593202471733093, "logits/rejected": -0.221342995762825, "logps/chosen": -1.9105651378631592, "logps/rejected": -1.9219735860824585, "loss": 3.4243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.105648040771484, "rewards/margins": 0.1140863448381424, "rewards/rejected": -19.21973419189453, "step": 1270 }, { "epoch": 0.04297414810071118, "grad_norm": 20.01345443725586, "learning_rate": 4.2972699696663297e-07, "logits/chosen": -0.6382346153259277, "logits/rejected": -0.5160123705863953, "logps/chosen": -1.6651241779327393, "logps/rejected": -1.7844934463500977, "loss": 2.8522, "rewards/accuracies": 0.5, "rewards/chosen": -16.651241302490234, "rewards/margins": 1.193691372871399, "rewards/rejected": -17.844934463500977, "step": 1275 }, { "epoch": 0.04314267417169436, "grad_norm": 30.08690071105957, "learning_rate": 4.3141220087630604e-07, "logits/chosen": -0.8173457980155945, "logits/rejected": -0.7372242212295532, "logps/chosen": -1.5411250591278076, "logps/rejected": -1.540610432624817, "loss": 3.3094, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.41125202178955, "rewards/margins": -0.005145835690200329, "rewards/rejected": -15.406105041503906, "step": 1280 }, { "epoch": 0.04331120024267754, "grad_norm": 24.03042221069336, "learning_rate": 4.3309740478597906e-07, "logits/chosen": 0.08485229313373566, "logits/rejected": 0.023478638380765915, "logps/chosen": -1.7684332132339478, "logps/rejected": -1.9061079025268555, "loss": 2.5292, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.6843318939209, "rewards/margins": 1.3767480850219727, "rewards/rejected": -19.061079025268555, "step": 1285 }, { "epoch": 0.043479726313660726, "grad_norm": 0.18441037833690643, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.6562118530273438, "logits/rejected": -0.7641373872756958, "logps/chosen": -1.5792770385742188, "logps/rejected": -1.6253446340560913, "loss": 3.1678, "rewards/accuracies": 0.5, "rewards/chosen": -15.792770385742188, "rewards/margins": 0.4606761932373047, "rewards/rejected": -16.25344467163086, "step": 1290 }, { "epoch": 0.043648252384643905, "grad_norm": 21.094350814819336, "learning_rate": 4.364678126053252e-07, "logits/chosen": -0.5818384289741516, "logits/rejected": -0.6034550666809082, "logps/chosen": -1.4294414520263672, "logps/rejected": -1.4413141012191772, "loss": 2.9525, "rewards/accuracies": 0.5, "rewards/chosen": -14.294413566589355, "rewards/margins": 0.11872673034667969, "rewards/rejected": -14.413141250610352, "step": 1295 }, { "epoch": 0.043816778455627085, "grad_norm": 9.890064239501953, "learning_rate": 4.3815301651499833e-07, "logits/chosen": -0.5558933019638062, "logits/rejected": -0.4949572682380676, "logps/chosen": -1.7029212713241577, "logps/rejected": -1.8398288488388062, "loss": 2.477, "rewards/accuracies": 0.5, "rewards/chosen": -17.02921485900879, "rewards/margins": 1.369077205657959, "rewards/rejected": -18.398290634155273, "step": 1300 }, { "epoch": 0.043985304526610265, "grad_norm": 24.280580520629883, "learning_rate": 4.398382204246714e-07, "logits/chosen": -0.4872204661369324, "logits/rejected": -0.4315074384212494, "logps/chosen": -1.6111892461776733, "logps/rejected": -1.5915868282318115, "loss": 3.3073, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.111894607543945, "rewards/margins": -0.19602584838867188, "rewards/rejected": -15.915868759155273, "step": 1305 }, { "epoch": 0.044153830597593445, "grad_norm": 25.741281509399414, "learning_rate": 4.415234243343444e-07, "logits/chosen": -0.5934125781059265, "logits/rejected": -0.7145117521286011, "logps/chosen": -1.6144893169403076, "logps/rejected": -1.635259985923767, "loss": 2.9124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.144893646240234, "rewards/margins": 0.20770521461963654, "rewards/rejected": -16.35260009765625, "step": 1310 }, { "epoch": 0.04432235666857663, "grad_norm": 38.52206039428711, "learning_rate": 4.432086282440175e-07, "logits/chosen": -0.493429958820343, "logits/rejected": -0.44628891348838806, "logps/chosen": -2.13350248336792, "logps/rejected": -2.1447107791900635, "loss": 3.1856, "rewards/accuracies": 0.5, "rewards/chosen": -21.33502769470215, "rewards/margins": 0.11208000034093857, "rewards/rejected": -21.44710922241211, "step": 1315 }, { "epoch": 0.04449088273955981, "grad_norm": 31.843582153320312, "learning_rate": 4.4489383215369057e-07, "logits/chosen": -0.5034832954406738, "logits/rejected": -0.6175475716590881, "logps/chosen": -1.6708362102508545, "logps/rejected": -1.7281715869903564, "loss": 2.6639, "rewards/accuracies": 0.5, "rewards/chosen": -16.708362579345703, "rewards/margins": 0.5733525156974792, "rewards/rejected": -17.281715393066406, "step": 1320 }, { "epoch": 0.04465940881054299, "grad_norm": 25.711498260498047, "learning_rate": 4.465790360633637e-07, "logits/chosen": -0.5433965921401978, "logits/rejected": -0.5722803473472595, "logps/chosen": -1.6870956420898438, "logps/rejected": -1.6297203302383423, "loss": 3.6554, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.870956420898438, "rewards/margins": -0.5737524032592773, "rewards/rejected": -16.297204971313477, "step": 1325 }, { "epoch": 0.04482793488152617, "grad_norm": 148.03579711914062, "learning_rate": 4.482642399730367e-07, "logits/chosen": -0.4677700996398926, "logits/rejected": -0.35929447412490845, "logps/chosen": -1.8165819644927979, "logps/rejected": -1.690076470375061, "loss": 4.6109, "rewards/accuracies": 0.5, "rewards/chosen": -18.16581916809082, "rewards/margins": -1.26505446434021, "rewards/rejected": -16.90076446533203, "step": 1330 }, { "epoch": 0.04499646095250935, "grad_norm": 14.833139419555664, "learning_rate": 4.499494438827098e-07, "logits/chosen": -0.06567313522100449, "logits/rejected": 0.02431054599583149, "logps/chosen": -1.7260477542877197, "logps/rejected": -1.953141450881958, "loss": 1.5616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.260478973388672, "rewards/margins": 2.270934581756592, "rewards/rejected": -19.531414031982422, "step": 1335 }, { "epoch": 0.04516498702349253, "grad_norm": 31.50690460205078, "learning_rate": 4.5163464779238286e-07, "logits/chosen": -0.36952149868011475, "logits/rejected": -0.3253156542778015, "logps/chosen": -2.0293240547180176, "logps/rejected": -1.9255653619766235, "loss": 4.171, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.293243408203125, "rewards/margins": -1.037587285041809, "rewards/rejected": -19.255653381347656, "step": 1340 }, { "epoch": 0.04533351309447572, "grad_norm": 25.02866554260254, "learning_rate": 4.5331985170205593e-07, "logits/chosen": -0.2764541804790497, "logits/rejected": -0.4271532893180847, "logps/chosen": -1.822595238685608, "logps/rejected": -1.819495439529419, "loss": 3.1072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.2259521484375, "rewards/margins": -0.03099794313311577, "rewards/rejected": -18.19495391845703, "step": 1345 }, { "epoch": 0.0455020391654589, "grad_norm": 73.92284393310547, "learning_rate": 4.5500505561172895e-07, "logits/chosen": -0.3259121775627136, "logits/rejected": -0.3415904641151428, "logps/chosen": -2.656127691268921, "logps/rejected": -2.435351848602295, "loss": 5.274, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.561279296875, "rewards/margins": -2.20776104927063, "rewards/rejected": -24.353517532348633, "step": 1350 }, { "epoch": 0.04567056523644208, "grad_norm": 21.903491973876953, "learning_rate": 4.5669025952140207e-07, "logits/chosen": -0.5971258878707886, "logits/rejected": -0.49006539583206177, "logps/chosen": -1.649746298789978, "logps/rejected": -1.5837621688842773, "loss": 3.7002, "rewards/accuracies": 0.5, "rewards/chosen": -16.497465133666992, "rewards/margins": -0.6598426699638367, "rewards/rejected": -15.837620735168457, "step": 1355 }, { "epoch": 0.04583909130742526, "grad_norm": 23.044069290161133, "learning_rate": 4.5837546343107514e-07, "logits/chosen": -0.43621835112571716, "logits/rejected": -0.32460182905197144, "logps/chosen": -1.6573737859725952, "logps/rejected": -1.6105083227157593, "loss": 3.5635, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.5737361907959, "rewards/margins": -0.46865320205688477, "rewards/rejected": -16.105083465576172, "step": 1360 }, { "epoch": 0.04600761737840844, "grad_norm": 26.895612716674805, "learning_rate": 4.600606673407482e-07, "logits/chosen": -0.6109335422515869, "logits/rejected": -0.5404512286186218, "logps/chosen": -1.8257896900177002, "logps/rejected": -1.864885687828064, "loss": 2.8298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.257898330688477, "rewards/margins": 0.39095717668533325, "rewards/rejected": -18.648855209350586, "step": 1365 }, { "epoch": 0.046176143449391624, "grad_norm": 29.45931625366211, "learning_rate": 4.617458712504213e-07, "logits/chosen": -0.6883367896080017, "logits/rejected": -0.4843037724494934, "logps/chosen": -1.4697327613830566, "logps/rejected": -1.4948680400848389, "loss": 3.034, "rewards/accuracies": 0.5, "rewards/chosen": -14.69732666015625, "rewards/margins": 0.25135332345962524, "rewards/rejected": -14.94867992401123, "step": 1370 }, { "epoch": 0.046344669520374804, "grad_norm": 26.856428146362305, "learning_rate": 4.634310751600943e-07, "logits/chosen": -0.1802286058664322, "logits/rejected": -0.13944736123085022, "logps/chosen": -1.8919188976287842, "logps/rejected": -2.034350872039795, "loss": 1.9954, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.919189453125, "rewards/margins": 1.4243175983428955, "rewards/rejected": -20.343509674072266, "step": 1375 }, { "epoch": 0.046513195591357984, "grad_norm": 28.34373664855957, "learning_rate": 4.6511627906976743e-07, "logits/chosen": -0.4101434350013733, "logits/rejected": -0.2598617672920227, "logps/chosen": -1.6901744604110718, "logps/rejected": -1.7066800594329834, "loss": 3.0128, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.901744842529297, "rewards/margins": 0.16505737602710724, "rewards/rejected": -17.066801071166992, "step": 1380 }, { "epoch": 0.046681721662341163, "grad_norm": 32.586944580078125, "learning_rate": 4.668014829794405e-07, "logits/chosen": -0.8744746446609497, "logits/rejected": -0.8278564214706421, "logps/chosen": -1.6448156833648682, "logps/rejected": -1.6291271448135376, "loss": 3.2346, "rewards/accuracies": 0.5, "rewards/chosen": -16.448158264160156, "rewards/margins": -0.1568845808506012, "rewards/rejected": -16.291271209716797, "step": 1385 }, { "epoch": 0.04685024773332434, "grad_norm": 57.97466278076172, "learning_rate": 4.684866868891136e-07, "logits/chosen": -0.4052852690219879, "logits/rejected": -0.2951774001121521, "logps/chosen": -1.8488044738769531, "logps/rejected": -1.7702134847640991, "loss": 3.8409, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.4880428314209, "rewards/margins": -0.7859078645706177, "rewards/rejected": -17.70213508605957, "step": 1390 }, { "epoch": 0.04701877380430752, "grad_norm": 25.0904483795166, "learning_rate": 4.701718907987866e-07, "logits/chosen": -0.4210020899772644, "logits/rejected": -0.41034144163131714, "logps/chosen": -1.842725396156311, "logps/rejected": -1.8793871402740479, "loss": 3.0764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.4272518157959, "rewards/margins": 0.3666171133518219, "rewards/rejected": -18.793869018554688, "step": 1395 }, { "epoch": 0.04718729987529071, "grad_norm": 11.349648475646973, "learning_rate": 4.7185709470845967e-07, "logits/chosen": -0.46164339780807495, "logits/rejected": -0.28323012590408325, "logps/chosen": -2.108422040939331, "logps/rejected": -2.2248573303222656, "loss": 3.1837, "rewards/accuracies": 0.5, "rewards/chosen": -21.084218978881836, "rewards/margins": 1.1643527746200562, "rewards/rejected": -22.248571395874023, "step": 1400 }, { "epoch": 0.04735582594627389, "grad_norm": 24.339231491088867, "learning_rate": 4.735422986181328e-07, "logits/chosen": -0.7064308524131775, "logits/rejected": -0.6853176951408386, "logps/chosen": -1.5951899290084839, "logps/rejected": -1.4471842050552368, "loss": 4.5243, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.951898574829102, "rewards/margins": -1.4800574779510498, "rewards/rejected": -14.471841812133789, "step": 1405 }, { "epoch": 0.04752435201725707, "grad_norm": 24.074268341064453, "learning_rate": 4.7522750252780586e-07, "logits/chosen": -0.5154116749763489, "logits/rejected": -0.473574161529541, "logps/chosen": -1.5804609060287476, "logps/rejected": -1.4201406240463257, "loss": 4.6391, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -15.804609298706055, "rewards/margins": -1.60320246219635, "rewards/rejected": -14.201406478881836, "step": 1410 }, { "epoch": 0.04769287808824025, "grad_norm": 21.47679328918457, "learning_rate": 4.769127064374789e-07, "logits/chosen": -0.3036695420742035, "logits/rejected": -0.40201884508132935, "logps/chosen": -1.8438358306884766, "logps/rejected": -1.8053996562957764, "loss": 3.4989, "rewards/accuracies": 0.5, "rewards/chosen": -18.438358306884766, "rewards/margins": -0.38436126708984375, "rewards/rejected": -18.053997039794922, "step": 1415 }, { "epoch": 0.04786140415922343, "grad_norm": 28.6302490234375, "learning_rate": 4.78597910347152e-07, "logits/chosen": -0.28868794441223145, "logits/rejected": -0.14128902554512024, "logps/chosen": -1.936479926109314, "logps/rejected": -1.8393840789794922, "loss": 4.0585, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.364797592163086, "rewards/margins": -0.9709571003913879, "rewards/rejected": -18.393840789794922, "step": 1420 }, { "epoch": 0.048029930230206616, "grad_norm": 30.24493980407715, "learning_rate": 4.802831142568251e-07, "logits/chosen": -0.2036716639995575, "logits/rejected": -0.17143428325653076, "logps/chosen": -1.869917631149292, "logps/rejected": -1.8757200241088867, "loss": 3.0552, "rewards/accuracies": 0.5, "rewards/chosen": -18.69917869567871, "rewards/margins": 0.058022309094667435, "rewards/rejected": -18.757200241088867, "step": 1425 }, { "epoch": 0.048198456301189796, "grad_norm": 30.839862823486328, "learning_rate": 4.819683181664982e-07, "logits/chosen": -0.43438920378685, "logits/rejected": -0.5123583078384399, "logps/chosen": -1.749098777770996, "logps/rejected": -1.6257498264312744, "loss": 4.7179, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.49098777770996, "rewards/margins": -1.2334905862808228, "rewards/rejected": -16.257495880126953, "step": 1430 }, { "epoch": 0.048366982372172976, "grad_norm": 8.07622241973877, "learning_rate": 4.836535220761712e-07, "logits/chosen": -0.3356507420539856, "logits/rejected": -0.2890998423099518, "logps/chosen": -1.912122130393982, "logps/rejected": -2.059704303741455, "loss": 2.745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.1212215423584, "rewards/margins": 1.4758189916610718, "rewards/rejected": -20.5970401763916, "step": 1435 }, { "epoch": 0.048535508443156156, "grad_norm": 20.30443000793457, "learning_rate": 4.853387259858443e-07, "logits/chosen": 0.08023197948932648, "logits/rejected": -0.002189111663028598, "logps/chosen": -1.799748420715332, "logps/rejected": -1.933651328086853, "loss": 2.7519, "rewards/accuracies": 0.5, "rewards/chosen": -17.99748420715332, "rewards/margins": 1.339029312133789, "rewards/rejected": -19.336511611938477, "step": 1440 }, { "epoch": 0.048704034514139335, "grad_norm": 30.887516021728516, "learning_rate": 4.870239298955174e-07, "logits/chosen": -0.39526060223579407, "logits/rejected": -0.2602062225341797, "logps/chosen": -1.5550906658172607, "logps/rejected": -1.580798864364624, "loss": 2.9367, "rewards/accuracies": 0.5, "rewards/chosen": -15.550908088684082, "rewards/margins": 0.2570803761482239, "rewards/rejected": -15.807989120483398, "step": 1445 }, { "epoch": 0.048872560585122515, "grad_norm": 17.38456153869629, "learning_rate": 4.887091338051904e-07, "logits/chosen": -0.3647865653038025, "logits/rejected": -0.30020421743392944, "logps/chosen": -1.8236277103424072, "logps/rejected": -1.9944959878921509, "loss": 2.3455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.236276626586914, "rewards/margins": 1.708682656288147, "rewards/rejected": -19.94495964050293, "step": 1450 }, { "epoch": 0.0490410866561057, "grad_norm": 128.50694274902344, "learning_rate": 4.903943377148635e-07, "logits/chosen": -0.3505763113498688, "logits/rejected": -0.4260808527469635, "logps/chosen": -2.0881869792938232, "logps/rejected": -1.8180824518203735, "loss": 5.8073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.88187026977539, "rewards/margins": -2.701045513153076, "rewards/rejected": -18.180824279785156, "step": 1455 }, { "epoch": 0.04920961272708888, "grad_norm": 31.025297164916992, "learning_rate": 4.920795416245365e-07, "logits/chosen": -0.5189875364303589, "logits/rejected": -0.5705165863037109, "logps/chosen": -1.635557770729065, "logps/rejected": -1.7688162326812744, "loss": 2.7251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.35557746887207, "rewards/margins": 1.3325845003128052, "rewards/rejected": -17.688159942626953, "step": 1460 }, { "epoch": 0.04937813879807206, "grad_norm": 24.771587371826172, "learning_rate": 4.937647455342097e-07, "logits/chosen": -0.3771246373653412, "logits/rejected": -0.35342922806739807, "logps/chosen": -1.6810089349746704, "logps/rejected": -1.7020902633666992, "loss": 3.2912, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.810089111328125, "rewards/margins": 0.2108127623796463, "rewards/rejected": -17.020902633666992, "step": 1465 }, { "epoch": 0.04954666486905524, "grad_norm": 12.7250337600708, "learning_rate": 4.954499494438827e-07, "logits/chosen": -0.4494267404079437, "logits/rejected": -0.5832349061965942, "logps/chosen": -1.6091959476470947, "logps/rejected": -1.7226126194000244, "loss": 3.0505, "rewards/accuracies": 0.5, "rewards/chosen": -16.09195899963379, "rewards/margins": 1.1341665983200073, "rewards/rejected": -17.226125717163086, "step": 1470 }, { "epoch": 0.04971519094003842, "grad_norm": 23.972063064575195, "learning_rate": 4.971351533535558e-07, "logits/chosen": -0.18479518592357635, "logits/rejected": -0.12936873733997345, "logps/chosen": -1.9910461902618408, "logps/rejected": -1.9244306087493896, "loss": 3.9929, "rewards/accuracies": 0.5, "rewards/chosen": -19.910459518432617, "rewards/margins": -0.6661556363105774, "rewards/rejected": -19.244304656982422, "step": 1475 }, { "epoch": 0.04988371701102161, "grad_norm": 23.053617477416992, "learning_rate": 4.988203572632289e-07, "logits/chosen": -0.29251593351364136, "logits/rejected": -0.23342540860176086, "logps/chosen": -1.8383159637451172, "logps/rejected": -1.888374924659729, "loss": 2.906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.383159637451172, "rewards/margins": 0.5005893707275391, "rewards/rejected": -18.88374900817871, "step": 1480 }, { "epoch": 0.05005224308200479, "grad_norm": 30.78038215637207, "learning_rate": 5.005055611729018e-07, "logits/chosen": -0.4969760477542877, "logits/rejected": -0.5444768667221069, "logps/chosen": -1.74346923828125, "logps/rejected": -1.8779300451278687, "loss": 2.1338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.4346923828125, "rewards/margins": 1.3446089029312134, "rewards/rejected": -18.7793025970459, "step": 1485 }, { "epoch": 0.05022076915298797, "grad_norm": 25.886037826538086, "learning_rate": 5.02190765082575e-07, "logits/chosen": -0.4277273118495941, "logits/rejected": -0.38578924536705017, "logps/chosen": -1.7436481714248657, "logps/rejected": -1.703674077987671, "loss": 3.8824, "rewards/accuracies": 0.5, "rewards/chosen": -17.436481475830078, "rewards/margins": -0.3997390866279602, "rewards/rejected": -17.036739349365234, "step": 1490 }, { "epoch": 0.05038929522397115, "grad_norm": 20.68552589416504, "learning_rate": 5.038759689922481e-07, "logits/chosen": -0.5360159873962402, "logits/rejected": -0.4824953079223633, "logps/chosen": -1.9363552331924438, "logps/rejected": -1.9381564855575562, "loss": 3.1114, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.36355209350586, "rewards/margins": 0.018012618646025658, "rewards/rejected": -19.38156509399414, "step": 1495 }, { "epoch": 0.05055782129495433, "grad_norm": 25.067201614379883, "learning_rate": 5.055611729019212e-07, "logits/chosen": -0.5183056592941284, "logits/rejected": -0.5793188810348511, "logps/chosen": -1.6892459392547607, "logps/rejected": -1.6137710809707642, "loss": 3.7882, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.8924617767334, "rewards/margins": -0.7547513842582703, "rewards/rejected": -16.13770866394043, "step": 1500 }, { "epoch": 0.05072634736593751, "grad_norm": 48.83390808105469, "learning_rate": 5.072463768115942e-07, "logits/chosen": -0.128324493765831, "logits/rejected": -0.15247969329357147, "logps/chosen": -1.816828966140747, "logps/rejected": -1.7542918920516968, "loss": 3.7735, "rewards/accuracies": 0.5, "rewards/chosen": -18.168289184570312, "rewards/margins": -0.625369668006897, "rewards/rejected": -17.542919158935547, "step": 1505 }, { "epoch": 0.050894873436920694, "grad_norm": 38.07307815551758, "learning_rate": 5.089315807212673e-07, "logits/chosen": -0.3632165193557739, "logits/rejected": -0.287231981754303, "logps/chosen": -1.9052129983901978, "logps/rejected": -1.9243885278701782, "loss": 2.9336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.0521297454834, "rewards/margins": 0.1917535811662674, "rewards/rejected": -19.243885040283203, "step": 1510 }, { "epoch": 0.051063399507903874, "grad_norm": 25.32839584350586, "learning_rate": 5.106167846309403e-07, "logits/chosen": -0.5327574014663696, "logits/rejected": -0.46464890241622925, "logps/chosen": -1.7534282207489014, "logps/rejected": -1.8104603290557861, "loss": 2.6527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.53428077697754, "rewards/margins": 0.5703199505805969, "rewards/rejected": -18.10460090637207, "step": 1515 }, { "epoch": 0.051231925578887054, "grad_norm": 24.18781852722168, "learning_rate": 5.123019885406133e-07, "logits/chosen": -0.4070916771888733, "logits/rejected": -0.3911053538322449, "logps/chosen": -1.787366509437561, "logps/rejected": -1.853560209274292, "loss": 2.4961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.873666763305664, "rewards/margins": 0.6619375944137573, "rewards/rejected": -18.53560447692871, "step": 1520 }, { "epoch": 0.051400451649870234, "grad_norm": 26.94209861755371, "learning_rate": 5.139871924502864e-07, "logits/chosen": -0.2895117402076721, "logits/rejected": -0.3377618193626404, "logps/chosen": -1.7159216403961182, "logps/rejected": -1.7107264995574951, "loss": 3.3023, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.159215927124023, "rewards/margins": -0.05195007473230362, "rewards/rejected": -17.10726547241211, "step": 1525 }, { "epoch": 0.051568977720853414, "grad_norm": 12.506376266479492, "learning_rate": 5.156723963599595e-07, "logits/chosen": -0.6083391308784485, "logits/rejected": -0.6645565032958984, "logps/chosen": -1.7214155197143555, "logps/rejected": -1.7124853134155273, "loss": 3.2891, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.214157104492188, "rewards/margins": -0.08930368721485138, "rewards/rejected": -17.12485122680664, "step": 1530 }, { "epoch": 0.0517375037918366, "grad_norm": 21.57737159729004, "learning_rate": 5.173576002696326e-07, "logits/chosen": -0.5799289345741272, "logits/rejected": -0.6960101127624512, "logps/chosen": -2.1023707389831543, "logps/rejected": -2.1636109352111816, "loss": 3.9, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.02370834350586, "rewards/margins": 0.6123997569084167, "rewards/rejected": -21.636106491088867, "step": 1535 }, { "epoch": 0.05190602986281978, "grad_norm": 6.66773796081543, "learning_rate": 5.190428041793057e-07, "logits/chosen": -0.2898945212364197, "logits/rejected": -0.2669333815574646, "logps/chosen": -1.9524368047714233, "logps/rejected": -2.0663256645202637, "loss": 2.6577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.52436637878418, "rewards/margins": 1.1388882398605347, "rewards/rejected": -20.663257598876953, "step": 1540 }, { "epoch": 0.05207455593380296, "grad_norm": 23.368228912353516, "learning_rate": 5.207280080889788e-07, "logits/chosen": -0.34807825088500977, "logits/rejected": -0.37616461515426636, "logps/chosen": -1.7619297504425049, "logps/rejected": -1.7687091827392578, "loss": 3.0278, "rewards/accuracies": 0.5, "rewards/chosen": -17.61929702758789, "rewards/margins": 0.06779269874095917, "rewards/rejected": -17.687091827392578, "step": 1545 }, { "epoch": 0.05224308200478614, "grad_norm": 29.27812957763672, "learning_rate": 5.224132119986519e-07, "logits/chosen": -0.4002048373222351, "logits/rejected": -0.21160908043384552, "logps/chosen": -1.8722518682479858, "logps/rejected": -1.9842402935028076, "loss": 2.6542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.722518920898438, "rewards/margins": 1.1198838949203491, "rewards/rejected": -19.842403411865234, "step": 1550 }, { "epoch": 0.05241160807576932, "grad_norm": 26.955427169799805, "learning_rate": 5.24098415908325e-07, "logits/chosen": -0.6399390697479248, "logits/rejected": -0.5415032505989075, "logps/chosen": -1.7122814655303955, "logps/rejected": -1.7477025985717773, "loss": 2.773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.122814178466797, "rewards/margins": 0.35421285033226013, "rewards/rejected": -17.477027893066406, "step": 1555 }, { "epoch": 0.0525801341467525, "grad_norm": 19.174209594726562, "learning_rate": 5.257836198179979e-07, "logits/chosen": -0.601922869682312, "logits/rejected": -0.47701844573020935, "logps/chosen": -1.6277345418930054, "logps/rejected": -1.6898586750030518, "loss": 2.5356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.277345657348633, "rewards/margins": 0.6212414503097534, "rewards/rejected": -16.89858627319336, "step": 1560 }, { "epoch": 0.052748660217735686, "grad_norm": 21.741432189941406, "learning_rate": 5.27468823727671e-07, "logits/chosen": -0.5087687969207764, "logits/rejected": -0.6106857061386108, "logps/chosen": -1.5809760093688965, "logps/rejected": -1.6826480627059937, "loss": 2.7595, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.809759140014648, "rewards/margins": 1.0167211294174194, "rewards/rejected": -16.826480865478516, "step": 1565 }, { "epoch": 0.052917186288718866, "grad_norm": 23.948646545410156, "learning_rate": 5.291540276373441e-07, "logits/chosen": -0.2296111136674881, "logits/rejected": -0.31996551156044006, "logps/chosen": -1.6906408071517944, "logps/rejected": -1.7543413639068604, "loss": 2.9589, "rewards/accuracies": 0.5, "rewards/chosen": -16.906408309936523, "rewards/margins": 0.6370050311088562, "rewards/rejected": -17.543415069580078, "step": 1570 }, { "epoch": 0.053085712359702046, "grad_norm": 29.292556762695312, "learning_rate": 5.308392315470171e-07, "logits/chosen": -0.6534808874130249, "logits/rejected": -0.7786849141120911, "logps/chosen": -1.802342414855957, "logps/rejected": -1.7604506015777588, "loss": 3.5204, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.02342414855957, "rewards/margins": -0.41891756653785706, "rewards/rejected": -17.604507446289062, "step": 1575 }, { "epoch": 0.053254238430685226, "grad_norm": 29.834030151367188, "learning_rate": 5.325244354566902e-07, "logits/chosen": -0.2010071724653244, "logits/rejected": -0.2582705020904541, "logps/chosen": -1.875382661819458, "logps/rejected": -1.7762857675552368, "loss": 4.0972, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.753826141357422, "rewards/margins": -0.9909681081771851, "rewards/rejected": -17.76285743713379, "step": 1580 }, { "epoch": 0.053422764501668406, "grad_norm": 24.455078125, "learning_rate": 5.342096393663633e-07, "logits/chosen": -0.4177281856536865, "logits/rejected": -0.2973349094390869, "logps/chosen": -1.7950809001922607, "logps/rejected": -1.8943901062011719, "loss": 2.5308, "rewards/accuracies": 0.5, "rewards/chosen": -17.950809478759766, "rewards/margins": 0.9930933117866516, "rewards/rejected": -18.94390296936035, "step": 1585 }, { "epoch": 0.05359129057265159, "grad_norm": 19.05913543701172, "learning_rate": 5.358948432760365e-07, "logits/chosen": -0.2933509945869446, "logits/rejected": -0.3594059348106384, "logps/chosen": -1.8748451471328735, "logps/rejected": -1.921567678451538, "loss": 2.712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.748451232910156, "rewards/margins": 0.46722546219825745, "rewards/rejected": -19.215679168701172, "step": 1590 }, { "epoch": 0.05375981664363477, "grad_norm": 16.711894989013672, "learning_rate": 5.375800471857095e-07, "logits/chosen": -0.42561930418014526, "logits/rejected": -0.3419100344181061, "logps/chosen": -1.541689157485962, "logps/rejected": -1.7792189121246338, "loss": 2.7164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.416891098022461, "rewards/margins": 2.375296115875244, "rewards/rejected": -17.79218864440918, "step": 1595 }, { "epoch": 0.05392834271461795, "grad_norm": 29.211706161499023, "learning_rate": 5.392652510953826e-07, "logits/chosen": -0.6977416276931763, "logits/rejected": -0.6029377579689026, "logps/chosen": -1.619699239730835, "logps/rejected": -1.546464204788208, "loss": 3.8819, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.19699478149414, "rewards/margins": -0.732352077960968, "rewards/rejected": -15.464642524719238, "step": 1600 }, { "epoch": 0.05392834271461795, "eval_logits/chosen": -0.699831485748291, "eval_logits/rejected": -0.7011949419975281, "eval_logps/chosen": -1.656389594078064, "eval_logps/rejected": -1.65248441696167, "eval_loss": 3.437371253967285, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.56389617919922, "eval_rewards/margins": -0.039052486419677734, "eval_rewards/rejected": -16.524843215942383, "eval_runtime": 12.9021, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 1600 }, { "epoch": 0.05409686878560113, "grad_norm": 48.776004791259766, "learning_rate": 5.409504550050556e-07, "logits/chosen": -0.21524472534656525, "logits/rejected": -0.1732257902622223, "logps/chosen": -2.1390466690063477, "logps/rejected": -2.1870763301849365, "loss": 2.7335, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.39046859741211, "rewards/margins": 0.4802955687046051, "rewards/rejected": -21.870765686035156, "step": 1605 }, { "epoch": 0.05426539485658431, "grad_norm": 39.24583053588867, "learning_rate": 5.426356589147286e-07, "logits/chosen": -0.5967484712600708, "logits/rejected": -0.7126034498214722, "logps/chosen": -1.4969217777252197, "logps/rejected": -1.545467734336853, "loss": 2.7097, "rewards/accuracies": 0.5, "rewards/chosen": -14.969217300415039, "rewards/margins": 0.4854598939418793, "rewards/rejected": -15.454675674438477, "step": 1610 }, { "epoch": 0.05443392092756749, "grad_norm": 30.210620880126953, "learning_rate": 5.443208628244017e-07, "logits/chosen": -0.4601069390773773, "logits/rejected": -0.41530901193618774, "logps/chosen": -2.0412802696228027, "logps/rejected": -2.0564990043640137, "loss": 3.1674, "rewards/accuracies": 0.5, "rewards/chosen": -20.41280174255371, "rewards/margins": 0.15218643844127655, "rewards/rejected": -20.56498908996582, "step": 1615 }, { "epoch": 0.05460244699855068, "grad_norm": 17.723539352416992, "learning_rate": 5.460060667340748e-07, "logits/chosen": -0.4109547734260559, "logits/rejected": -0.4085041880607605, "logps/chosen": -1.9298667907714844, "logps/rejected": -1.9639778137207031, "loss": 3.0299, "rewards/accuracies": 0.5, "rewards/chosen": -19.298669815063477, "rewards/margins": 0.341108500957489, "rewards/rejected": -19.6397762298584, "step": 1620 }, { "epoch": 0.05477097306953386, "grad_norm": 19.268251419067383, "learning_rate": 5.476912706437478e-07, "logits/chosen": -0.05240452289581299, "logits/rejected": -0.03201603889465332, "logps/chosen": -1.9599721431732178, "logps/rejected": -1.8682050704956055, "loss": 3.9744, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.599721908569336, "rewards/margins": -0.917669951915741, "rewards/rejected": -18.682048797607422, "step": 1625 }, { "epoch": 0.05493949914051704, "grad_norm": 28.24390411376953, "learning_rate": 5.493764745534209e-07, "logits/chosen": -0.46913594007492065, "logits/rejected": -0.4315093159675598, "logps/chosen": -1.75238037109375, "logps/rejected": -1.8159980773925781, "loss": 2.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.5238037109375, "rewards/margins": 0.6361768841743469, "rewards/rejected": -18.15998077392578, "step": 1630 }, { "epoch": 0.05510802521150022, "grad_norm": 22.484506607055664, "learning_rate": 5.51061678463094e-07, "logits/chosen": 0.1574796438217163, "logits/rejected": 0.19055981934070587, "logps/chosen": -2.093717098236084, "logps/rejected": -2.0308218002319336, "loss": 3.6801, "rewards/accuracies": 0.5, "rewards/chosen": -20.937170028686523, "rewards/margins": -0.6289529800415039, "rewards/rejected": -20.308218002319336, "step": 1635 }, { "epoch": 0.0552765512824834, "grad_norm": 24.651714324951172, "learning_rate": 5.527468823727672e-07, "logits/chosen": -0.4641755521297455, "logits/rejected": -0.5991306900978088, "logps/chosen": -1.6150715351104736, "logps/rejected": -1.6226387023925781, "loss": 3.1799, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.15071678161621, "rewards/margins": 0.07567119598388672, "rewards/rejected": -16.22638702392578, "step": 1640 }, { "epoch": 0.055445077353466585, "grad_norm": 30.90976333618164, "learning_rate": 5.544320862824402e-07, "logits/chosen": -0.3300759196281433, "logits/rejected": -0.39983344078063965, "logps/chosen": -1.6464049816131592, "logps/rejected": -1.761338472366333, "loss": 2.1055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.46405029296875, "rewards/margins": 1.1493333578109741, "rewards/rejected": -17.613384246826172, "step": 1645 }, { "epoch": 0.055613603424449765, "grad_norm": 26.457569122314453, "learning_rate": 5.561172901921132e-07, "logits/chosen": -0.38823699951171875, "logits/rejected": -0.365914523601532, "logps/chosen": -1.955287218093872, "logps/rejected": -1.883247971534729, "loss": 4.172, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.55286979675293, "rewards/margins": -0.7203909158706665, "rewards/rejected": -18.83247947692871, "step": 1650 }, { "epoch": 0.055782129495432944, "grad_norm": 15.721268653869629, "learning_rate": 5.578024941017863e-07, "logits/chosen": -0.48343658447265625, "logits/rejected": -0.4833255708217621, "logps/chosen": -1.5667378902435303, "logps/rejected": -1.7361778020858765, "loss": 2.0005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.667379379272461, "rewards/margins": 1.694397211074829, "rewards/rejected": -17.361778259277344, "step": 1655 }, { "epoch": 0.055950655566416124, "grad_norm": 36.59139633178711, "learning_rate": 5.594876980114594e-07, "logits/chosen": -0.18844670057296753, "logits/rejected": -0.28556400537490845, "logps/chosen": -1.668357491493225, "logps/rejected": -1.7187904119491577, "loss": 2.7758, "rewards/accuracies": 0.5, "rewards/chosen": -16.68357276916504, "rewards/margins": 0.5043299794197083, "rewards/rejected": -17.187904357910156, "step": 1660 }, { "epoch": 0.056119181637399304, "grad_norm": 18.01271629333496, "learning_rate": 5.611729019211324e-07, "logits/chosen": -0.5732772946357727, "logits/rejected": -0.5180662870407104, "logps/chosen": -1.6921463012695312, "logps/rejected": -1.879349708557129, "loss": 2.4687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.921463012695312, "rewards/margins": 1.872035264968872, "rewards/rejected": -18.793498992919922, "step": 1665 }, { "epoch": 0.056287707708382484, "grad_norm": 29.73081398010254, "learning_rate": 5.628581058308055e-07, "logits/chosen": -0.41807693243026733, "logits/rejected": -0.4663594663143158, "logps/chosen": -1.612532377243042, "logps/rejected": -1.6978442668914795, "loss": 2.4525, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.125324249267578, "rewards/margins": 0.8531206250190735, "rewards/rejected": -16.978443145751953, "step": 1670 }, { "epoch": 0.05645623377936567, "grad_norm": 17.574676513671875, "learning_rate": 5.645433097404786e-07, "logits/chosen": -0.6845382452011108, "logits/rejected": -0.6192032098770142, "logps/chosen": -1.545688271522522, "logps/rejected": -1.5205169916152954, "loss": 3.3514, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.456881523132324, "rewards/margins": -0.25171154737472534, "rewards/rejected": -15.205171585083008, "step": 1675 }, { "epoch": 0.05662475985034885, "grad_norm": 27.954343795776367, "learning_rate": 5.662285136501516e-07, "logits/chosen": -0.24558699131011963, "logits/rejected": -0.13401418924331665, "logps/chosen": -1.796062707901001, "logps/rejected": -1.9593982696533203, "loss": 2.1787, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.96062660217285, "rewards/margins": 1.6333551406860352, "rewards/rejected": -19.593982696533203, "step": 1680 }, { "epoch": 0.05679328592133203, "grad_norm": 19.399494171142578, "learning_rate": 5.679137175598247e-07, "logits/chosen": -0.742358386516571, "logits/rejected": -0.7818718552589417, "logps/chosen": -1.5913759469985962, "logps/rejected": -1.6343498229980469, "loss": 2.7957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.913759231567383, "rewards/margins": 0.42973804473876953, "rewards/rejected": -16.343496322631836, "step": 1685 }, { "epoch": 0.05696181199231521, "grad_norm": 26.784698486328125, "learning_rate": 5.695989214694977e-07, "logits/chosen": -0.04572455957531929, "logits/rejected": -0.2053622305393219, "logps/chosen": -1.710603952407837, "logps/rejected": -1.750200629234314, "loss": 2.8564, "rewards/accuracies": 0.5, "rewards/chosen": -17.106037139892578, "rewards/margins": 0.3959696888923645, "rewards/rejected": -17.50200843811035, "step": 1690 }, { "epoch": 0.05713033806329839, "grad_norm": 10.891070365905762, "learning_rate": 5.712841253791709e-07, "logits/chosen": -0.4375430941581726, "logits/rejected": -0.27901238203048706, "logps/chosen": -1.8508228063583374, "logps/rejected": -1.9515966176986694, "loss": 2.7353, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.508228302001953, "rewards/margins": 1.0077383518218994, "rewards/rejected": -19.515966415405273, "step": 1695 }, { "epoch": 0.05729886413428157, "grad_norm": 28.085920333862305, "learning_rate": 5.729693292888439e-07, "logits/chosen": -0.37988463044166565, "logits/rejected": -0.3089436888694763, "logps/chosen": -1.7150055170059204, "logps/rejected": -1.8164036273956299, "loss": 2.4916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.150054931640625, "rewards/margins": 1.0139801502227783, "rewards/rejected": -18.16403579711914, "step": 1700 }, { "epoch": 0.05746739020526476, "grad_norm": 27.796403884887695, "learning_rate": 5.74654533198517e-07, "logits/chosen": -0.14855363965034485, "logits/rejected": -0.055423758924007416, "logps/chosen": -1.8769832849502563, "logps/rejected": -1.7882697582244873, "loss": 3.9854, "rewards/accuracies": 0.5, "rewards/chosen": -18.769832611083984, "rewards/margins": -0.887133777141571, "rewards/rejected": -17.8826961517334, "step": 1705 }, { "epoch": 0.05763591627624794, "grad_norm": 19.106117248535156, "learning_rate": 5.763397371081901e-07, "logits/chosen": -0.6356021165847778, "logits/rejected": -0.5874723196029663, "logps/chosen": -1.445164680480957, "logps/rejected": -1.4936878681182861, "loss": 2.7015, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.451647758483887, "rewards/margins": 0.4852313995361328, "rewards/rejected": -14.936877250671387, "step": 1710 }, { "epoch": 0.057804442347231116, "grad_norm": 21.671457290649414, "learning_rate": 5.780249410178631e-07, "logits/chosen": -0.45297783613204956, "logits/rejected": -0.38992589712142944, "logps/chosen": -1.5569000244140625, "logps/rejected": -1.5796066522598267, "loss": 3.0231, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.569000244140625, "rewards/margins": 0.2270650863647461, "rewards/rejected": -15.796066284179688, "step": 1715 }, { "epoch": 0.057972968418214296, "grad_norm": 29.682052612304688, "learning_rate": 5.797101449275362e-07, "logits/chosen": -0.5061969757080078, "logits/rejected": -0.40872421860694885, "logps/chosen": -1.8883237838745117, "logps/rejected": -1.8492504358291626, "loss": 3.4416, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.883237838745117, "rewards/margins": -0.3907338082790375, "rewards/rejected": -18.492504119873047, "step": 1720 }, { "epoch": 0.058141494489197476, "grad_norm": 31.583568572998047, "learning_rate": 5.813953488372093e-07, "logits/chosen": -0.2520661950111389, "logits/rejected": -0.23302459716796875, "logps/chosen": -1.6508252620697021, "logps/rejected": -1.8004897832870483, "loss": 1.9262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.508251190185547, "rewards/margins": 1.4966458082199097, "rewards/rejected": -18.004898071289062, "step": 1725 }, { "epoch": 0.05831002056018066, "grad_norm": 18.796131134033203, "learning_rate": 5.830805527468824e-07, "logits/chosen": -0.3362746834754944, "logits/rejected": -0.3569917678833008, "logps/chosen": -1.7772108316421509, "logps/rejected": -1.8702561855316162, "loss": 2.8101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.772109985351562, "rewards/margins": 0.9304534792900085, "rewards/rejected": -18.70256233215332, "step": 1730 }, { "epoch": 0.05847854663116384, "grad_norm": 20.094972610473633, "learning_rate": 5.847657566565553e-07, "logits/chosen": -0.8289566040039062, "logits/rejected": -0.6430098414421082, "logps/chosen": -1.6072938442230225, "logps/rejected": -1.583827257156372, "loss": 3.3512, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.07293701171875, "rewards/margins": -0.23466500639915466, "rewards/rejected": -15.838272094726562, "step": 1735 }, { "epoch": 0.05864707270214702, "grad_norm": 25.6748104095459, "learning_rate": 5.864509605662284e-07, "logits/chosen": -0.17799155414104462, "logits/rejected": -0.09096328914165497, "logps/chosen": -1.725823998451233, "logps/rejected": -1.8445842266082764, "loss": 2.6412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.25823974609375, "rewards/margins": 1.1876022815704346, "rewards/rejected": -18.445842742919922, "step": 1740 }, { "epoch": 0.0588155987731302, "grad_norm": 43.991390228271484, "learning_rate": 5.881361644759016e-07, "logits/chosen": -0.5916566252708435, "logits/rejected": -0.5461825132369995, "logps/chosen": -1.7417824268341064, "logps/rejected": -2.0078158378601074, "loss": 2.34, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.417823791503906, "rewards/margins": 2.6603331565856934, "rewards/rejected": -20.078155517578125, "step": 1745 }, { "epoch": 0.05898412484411338, "grad_norm": 26.747943878173828, "learning_rate": 5.898213683855746e-07, "logits/chosen": -0.9465047717094421, "logits/rejected": -0.9426982998847961, "logps/chosen": -1.5433447360992432, "logps/rejected": -1.5770765542984009, "loss": 2.7903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.433446884155273, "rewards/margins": 0.33731889724731445, "rewards/rejected": -15.77076530456543, "step": 1750 }, { "epoch": 0.05915265091509656, "grad_norm": 16.108659744262695, "learning_rate": 5.915065722952477e-07, "logits/chosen": -0.7509289979934692, "logits/rejected": -0.6715911030769348, "logps/chosen": -1.8838880062103271, "logps/rejected": -1.9311984777450562, "loss": 2.7285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.838882446289062, "rewards/margins": 0.47310376167297363, "rewards/rejected": -19.31198501586914, "step": 1755 }, { "epoch": 0.05932117698607975, "grad_norm": 25.781511306762695, "learning_rate": 5.931917762049208e-07, "logits/chosen": 0.09684916585683823, "logits/rejected": -0.03794277831912041, "logps/chosen": -1.651545524597168, "logps/rejected": -1.5341824293136597, "loss": 4.218, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -16.515457153320312, "rewards/margins": -1.1736314296722412, "rewards/rejected": -15.341824531555176, "step": 1760 }, { "epoch": 0.05948970305706293, "grad_norm": 19.869956970214844, "learning_rate": 5.948769801145939e-07, "logits/chosen": -0.5614740252494812, "logits/rejected": -0.6934599876403809, "logps/chosen": -1.4802380800247192, "logps/rejected": -1.4893232583999634, "loss": 3.0454, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.80238151550293, "rewards/margins": 0.09085007011890411, "rewards/rejected": -14.893231391906738, "step": 1765 }, { "epoch": 0.05965822912804611, "grad_norm": 38.33526611328125, "learning_rate": 5.965621840242669e-07, "logits/chosen": -0.3522099554538727, "logits/rejected": -0.42708688974380493, "logps/chosen": -1.9067974090576172, "logps/rejected": -1.9417225122451782, "loss": 3.4045, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.067974090576172, "rewards/margins": 0.34925180673599243, "rewards/rejected": -19.417224884033203, "step": 1770 }, { "epoch": 0.05982675519902929, "grad_norm": 21.137075424194336, "learning_rate": 5.9824738793394e-07, "logits/chosen": -0.25959745049476624, "logits/rejected": -0.24761705100536346, "logps/chosen": -1.875977873802185, "logps/rejected": -1.9285907745361328, "loss": 3.4363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.759777069091797, "rewards/margins": 0.5261300206184387, "rewards/rejected": -19.28590965270996, "step": 1775 }, { "epoch": 0.05999528127001247, "grad_norm": 20.143699645996094, "learning_rate": 5.99932591843613e-07, "logits/chosen": -0.3771205544471741, "logits/rejected": -0.3303254246711731, "logps/chosen": -1.5552126169204712, "logps/rejected": -1.814523458480835, "loss": 2.1268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.552125930786133, "rewards/margins": 2.593108654022217, "rewards/rejected": -18.145235061645508, "step": 1780 }, { "epoch": 0.060163807340995655, "grad_norm": 48.40658950805664, "learning_rate": 6.01617795753286e-07, "logits/chosen": -0.14805591106414795, "logits/rejected": -0.24271002411842346, "logps/chosen": -1.7285377979278564, "logps/rejected": -1.9390876293182373, "loss": 2.9948, "rewards/accuracies": 0.5, "rewards/chosen": -17.285375595092773, "rewards/margins": 2.105499267578125, "rewards/rejected": -19.3908748626709, "step": 1785 }, { "epoch": 0.060332333411978835, "grad_norm": 25.84428596496582, "learning_rate": 6.033029996629591e-07, "logits/chosen": -0.013548873364925385, "logits/rejected": 0.018930787220597267, "logps/chosen": -1.738650918006897, "logps/rejected": -1.6590766906738281, "loss": 3.9597, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.38650894165039, "rewards/margins": -0.7957417368888855, "rewards/rejected": -16.59076690673828, "step": 1790 }, { "epoch": 0.060500859482962015, "grad_norm": 73.66495513916016, "learning_rate": 6.049882035726323e-07, "logits/chosen": -0.5159806609153748, "logits/rejected": -0.3327074646949768, "logps/chosen": -1.820635437965393, "logps/rejected": -1.9370348453521729, "loss": 2.6383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.20635414123535, "rewards/margins": 1.163994550704956, "rewards/rejected": -19.37034797668457, "step": 1795 }, { "epoch": 0.060669385553945195, "grad_norm": 22.028491973876953, "learning_rate": 6.066734074823054e-07, "logits/chosen": -0.2898898124694824, "logits/rejected": -0.2503661513328552, "logps/chosen": -1.6143338680267334, "logps/rejected": -1.5903599262237549, "loss": 3.4173, "rewards/accuracies": 0.5, "rewards/chosen": -16.143339157104492, "rewards/margins": -0.23974084854125977, "rewards/rejected": -15.903597831726074, "step": 1800 }, { "epoch": 0.060837911624928374, "grad_norm": 24.202577590942383, "learning_rate": 6.083586113919784e-07, "logits/chosen": 0.024017006158828735, "logits/rejected": 0.04643130302429199, "logps/chosen": -1.527374267578125, "logps/rejected": -1.5289933681488037, "loss": 3.619, "rewards/accuracies": 0.5, "rewards/chosen": -15.27374267578125, "rewards/margins": 0.01618986204266548, "rewards/rejected": -15.289934158325195, "step": 1805 }, { "epoch": 0.061006437695911554, "grad_norm": 89.72640991210938, "learning_rate": 6.100438153016515e-07, "logits/chosen": -0.5288889408111572, "logits/rejected": -0.5065113306045532, "logps/chosen": -1.7922258377075195, "logps/rejected": -1.803261399269104, "loss": 3.1556, "rewards/accuracies": 0.5, "rewards/chosen": -17.922256469726562, "rewards/margins": 0.11035575717687607, "rewards/rejected": -18.032611846923828, "step": 1810 }, { "epoch": 0.06117496376689474, "grad_norm": 26.971994400024414, "learning_rate": 6.117290192113246e-07, "logits/chosen": -0.7729194760322571, "logits/rejected": -0.8629820942878723, "logps/chosen": -1.9254817962646484, "logps/rejected": -1.95050048828125, "loss": 3.7236, "rewards/accuracies": 0.5, "rewards/chosen": -19.254819869995117, "rewards/margins": 0.25018566846847534, "rewards/rejected": -19.5050048828125, "step": 1815 }, { "epoch": 0.06134348983787792, "grad_norm": 23.856813430786133, "learning_rate": 6.134142231209977e-07, "logits/chosen": -0.33167606592178345, "logits/rejected": -0.3584614396095276, "logps/chosen": -1.486135721206665, "logps/rejected": -1.5292284488677979, "loss": 2.9592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.861356735229492, "rewards/margins": 0.43092602491378784, "rewards/rejected": -15.29228401184082, "step": 1820 }, { "epoch": 0.0615120159088611, "grad_norm": 21.96845245361328, "learning_rate": 6.150994270306706e-07, "logits/chosen": -0.30157405138015747, "logits/rejected": -0.32445019483566284, "logps/chosen": -1.9414207935333252, "logps/rejected": -1.929107904434204, "loss": 3.3149, "rewards/accuracies": 0.5, "rewards/chosen": -19.414207458496094, "rewards/margins": -0.12312869727611542, "rewards/rejected": -19.291080474853516, "step": 1825 }, { "epoch": 0.06168054197984428, "grad_norm": 42.928070068359375, "learning_rate": 6.167846309403437e-07, "logits/chosen": -0.10247864574193954, "logits/rejected": -0.16112163662910461, "logps/chosen": -1.736191987991333, "logps/rejected": -1.667532205581665, "loss": 3.8427, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.361919403076172, "rewards/margins": -0.6865975260734558, "rewards/rejected": -16.675321578979492, "step": 1830 }, { "epoch": 0.06184906805082746, "grad_norm": 23.188669204711914, "learning_rate": 6.184698348500168e-07, "logits/chosen": -0.5849705934524536, "logits/rejected": -0.3921450972557068, "logps/chosen": -1.8506797552108765, "logps/rejected": -1.881422758102417, "loss": 3.0305, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.506797790527344, "rewards/margins": 0.30743035674095154, "rewards/rejected": -18.814228057861328, "step": 1835 }, { "epoch": 0.06201759412181065, "grad_norm": 77.21211242675781, "learning_rate": 6.201550387596898e-07, "logits/chosen": -0.5807980298995972, "logits/rejected": -0.5246071815490723, "logps/chosen": -2.000711679458618, "logps/rejected": -1.9282617568969727, "loss": 3.775, "rewards/accuracies": 0.5, "rewards/chosen": -20.007116317749023, "rewards/margins": -0.7245005369186401, "rewards/rejected": -19.282617568969727, "step": 1840 }, { "epoch": 0.06218612019279383, "grad_norm": 22.97465705871582, "learning_rate": 6.21840242669363e-07, "logits/chosen": -0.5278009176254272, "logits/rejected": -0.48332375288009644, "logps/chosen": -1.8859955072402954, "logps/rejected": -1.8935880661010742, "loss": 3.1492, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.859954833984375, "rewards/margins": 0.07592477649450302, "rewards/rejected": -18.93587875366211, "step": 1845 }, { "epoch": 0.06235464626377701, "grad_norm": 21.464570999145508, "learning_rate": 6.235254465790361e-07, "logits/chosen": -0.4647518992424011, "logits/rejected": -0.35028940439224243, "logps/chosen": -1.6057485342025757, "logps/rejected": -1.7640788555145264, "loss": 2.3478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.057485580444336, "rewards/margins": 1.5833041667938232, "rewards/rejected": -17.640789031982422, "step": 1850 }, { "epoch": 0.0625231723347602, "grad_norm": 23.270641326904297, "learning_rate": 6.252106504887092e-07, "logits/chosen": -0.626205563545227, "logits/rejected": -0.5367187261581421, "logps/chosen": -1.6747545003890991, "logps/rejected": -1.808241605758667, "loss": 2.8049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.74754524230957, "rewards/margins": 1.3348705768585205, "rewards/rejected": -18.082416534423828, "step": 1855 }, { "epoch": 0.06269169840574337, "grad_norm": 20.01548194885254, "learning_rate": 6.268958543983822e-07, "logits/chosen": -0.6003610491752625, "logits/rejected": -0.46292710304260254, "logps/chosen": -2.0454294681549072, "logps/rejected": -2.074065685272217, "loss": 3.3159, "rewards/accuracies": 0.5, "rewards/chosen": -20.45429229736328, "rewards/margins": 0.286365807056427, "rewards/rejected": -20.740657806396484, "step": 1860 }, { "epoch": 0.06286022447672655, "grad_norm": 21.655704498291016, "learning_rate": 6.285810583080553e-07, "logits/chosen": -0.24826118350028992, "logits/rejected": -0.25482481718063354, "logps/chosen": -2.4187121391296387, "logps/rejected": -2.6595332622528076, "loss": 2.1698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.187122344970703, "rewards/margins": 2.4082117080688477, "rewards/rejected": -26.595333099365234, "step": 1865 }, { "epoch": 0.06302875054770973, "grad_norm": 12.449923515319824, "learning_rate": 6.302662622177283e-07, "logits/chosen": -0.27865132689476013, "logits/rejected": -0.1912352293729782, "logps/chosen": -1.7426397800445557, "logps/rejected": -1.803223967552185, "loss": 2.8243, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.4263973236084, "rewards/margins": 0.6058410406112671, "rewards/rejected": -18.032238006591797, "step": 1870 }, { "epoch": 0.06319727661869291, "grad_norm": 18.938154220581055, "learning_rate": 6.319514661274013e-07, "logits/chosen": -0.9526360630989075, "logits/rejected": -0.8011028170585632, "logps/chosen": -1.36483895778656, "logps/rejected": -1.439701795578003, "loss": 2.9105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.64838981628418, "rewards/margins": 0.7486263513565063, "rewards/rejected": -14.397016525268555, "step": 1875 }, { "epoch": 0.06336580268967609, "grad_norm": 13.122438430786133, "learning_rate": 6.336366700370744e-07, "logits/chosen": -0.7261112332344055, "logits/rejected": -0.6287399530410767, "logps/chosen": -1.5695728063583374, "logps/rejected": -1.7251754999160767, "loss": 2.6971, "rewards/accuracies": 0.5, "rewards/chosen": -15.695727348327637, "rewards/margins": 1.5560270547866821, "rewards/rejected": -17.25175666809082, "step": 1880 }, { "epoch": 0.06353432876065927, "grad_norm": 27.177547454833984, "learning_rate": 6.353218739467475e-07, "logits/chosen": -0.3480846583843231, "logits/rejected": -0.2502368092536926, "logps/chosen": -1.6377151012420654, "logps/rejected": -1.578667163848877, "loss": 3.8532, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.377151489257812, "rewards/margins": -0.5904794931411743, "rewards/rejected": -15.786672592163086, "step": 1885 }, { "epoch": 0.06370285483164245, "grad_norm": 32.058998107910156, "learning_rate": 6.370070778564206e-07, "logits/chosen": -0.15475639700889587, "logits/rejected": -0.08114627748727798, "logps/chosen": -1.704306960105896, "logps/rejected": -1.788220763206482, "loss": 3.0792, "rewards/accuracies": 0.5, "rewards/chosen": -17.04306983947754, "rewards/margins": 0.8391389846801758, "rewards/rejected": -17.8822078704834, "step": 1890 }, { "epoch": 0.06387138090262563, "grad_norm": 21.426652908325195, "learning_rate": 6.386922817660937e-07, "logits/chosen": -0.4754219949245453, "logits/rejected": -0.44864320755004883, "logps/chosen": -1.8361284732818604, "logps/rejected": -1.8722747564315796, "loss": 2.8451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.361284255981445, "rewards/margins": 0.3614630699157715, "rewards/rejected": -18.722747802734375, "step": 1895 }, { "epoch": 0.06403990697360881, "grad_norm": 24.273914337158203, "learning_rate": 6.403774856757668e-07, "logits/chosen": -0.02922775410115719, "logits/rejected": -0.1809547245502472, "logps/chosen": -1.8290389776229858, "logps/rejected": -1.9153051376342773, "loss": 2.7326, "rewards/accuracies": 0.5, "rewards/chosen": -18.290390014648438, "rewards/margins": 0.8626611828804016, "rewards/rejected": -19.15304946899414, "step": 1900 }, { "epoch": 0.06420843304459199, "grad_norm": 29.323482513427734, "learning_rate": 6.420626895854399e-07, "logits/chosen": -0.12777681648731232, "logits/rejected": -0.08850021660327911, "logps/chosen": -1.8227113485336304, "logps/rejected": -1.8100782632827759, "loss": 3.2956, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.227113723754883, "rewards/margins": -0.12633152306079865, "rewards/rejected": -18.10078239440918, "step": 1905 }, { "epoch": 0.06437695911557519, "grad_norm": 20.20551300048828, "learning_rate": 6.437478934951128e-07, "logits/chosen": -0.6324241757392883, "logits/rejected": -0.6633267998695374, "logps/chosen": -1.5772778987884521, "logps/rejected": -1.6632354259490967, "loss": 2.3262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.772778511047363, "rewards/margins": 0.8595759272575378, "rewards/rejected": -16.632354736328125, "step": 1910 }, { "epoch": 0.06454548518655837, "grad_norm": 27.31475830078125, "learning_rate": 6.454330974047859e-07, "logits/chosen": -0.5674937963485718, "logits/rejected": -0.3717323839664459, "logps/chosen": -1.6537988185882568, "logps/rejected": -1.6325750350952148, "loss": 3.3598, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.537988662719727, "rewards/margins": -0.2122385948896408, "rewards/rejected": -16.325748443603516, "step": 1915 }, { "epoch": 0.06471401125754155, "grad_norm": 21.887195587158203, "learning_rate": 6.47118301314459e-07, "logits/chosen": -0.030434776097536087, "logits/rejected": -0.04068700224161148, "logps/chosen": -1.930794358253479, "logps/rejected": -2.0131115913391113, "loss": 2.4377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.307941436767578, "rewards/margins": 0.8231736421585083, "rewards/rejected": -20.131114959716797, "step": 1920 }, { "epoch": 0.06488253732852473, "grad_norm": 47.787200927734375, "learning_rate": 6.488035052241321e-07, "logits/chosen": -0.6423458456993103, "logits/rejected": -0.5267582535743713, "logps/chosen": -1.8432353734970093, "logps/rejected": -1.9428443908691406, "loss": 2.2613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.432353973388672, "rewards/margins": 0.9960900545120239, "rewards/rejected": -19.428442001342773, "step": 1925 }, { "epoch": 0.0650510633995079, "grad_norm": 18.88721466064453, "learning_rate": 6.504887091338051e-07, "logits/chosen": -0.30531203746795654, "logits/rejected": -0.4015568196773529, "logps/chosen": -1.5693740844726562, "logps/rejected": -1.6434333324432373, "loss": 2.4761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.693740844726562, "rewards/margins": 0.7405935525894165, "rewards/rejected": -16.434335708618164, "step": 1930 }, { "epoch": 0.06521958947049109, "grad_norm": 30.243013381958008, "learning_rate": 6.521739130434782e-07, "logits/chosen": -0.2444632351398468, "logits/rejected": -0.2351723164319992, "logps/chosen": -1.8687137365341187, "logps/rejected": -2.091553211212158, "loss": 2.259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.687137603759766, "rewards/margins": 2.2283949851989746, "rewards/rejected": -20.9155330657959, "step": 1935 }, { "epoch": 0.06538811554147426, "grad_norm": 21.0958309173584, "learning_rate": 6.538591169531513e-07, "logits/chosen": -0.5276403427124023, "logits/rejected": -0.5364550352096558, "logps/chosen": -1.705955147743225, "logps/rejected": -1.6914780139923096, "loss": 3.2859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.059551239013672, "rewards/margins": -0.14477066695690155, "rewards/rejected": -16.914779663085938, "step": 1940 }, { "epoch": 0.06555664161245744, "grad_norm": 32.9719352722168, "learning_rate": 6.555443208628245e-07, "logits/chosen": -0.17250430583953857, "logits/rejected": -0.22453102469444275, "logps/chosen": -1.782339096069336, "logps/rejected": -1.7523667812347412, "loss": 3.4177, "rewards/accuracies": 0.5, "rewards/chosen": -17.82339096069336, "rewards/margins": -0.2997213304042816, "rewards/rejected": -17.52366828918457, "step": 1945 }, { "epoch": 0.06572516768344062, "grad_norm": 21.734193801879883, "learning_rate": 6.572295247724975e-07, "logits/chosen": 0.016134237870573997, "logits/rejected": -0.03876941278576851, "logps/chosen": -1.5835492610931396, "logps/rejected": -1.5337642431259155, "loss": 3.5641, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.835492134094238, "rewards/margins": -0.49785009026527405, "rewards/rejected": -15.337640762329102, "step": 1950 }, { "epoch": 0.0658936937544238, "grad_norm": 21.90297508239746, "learning_rate": 6.589147286821705e-07, "logits/chosen": -0.46315592527389526, "logits/rejected": -0.44253987073898315, "logps/chosen": -1.4891859292984009, "logps/rejected": -1.679639220237732, "loss": 2.9178, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -14.89185905456543, "rewards/margins": 1.904531478881836, "rewards/rejected": -16.796390533447266, "step": 1955 }, { "epoch": 0.06606221982540698, "grad_norm": 47.062469482421875, "learning_rate": 6.605999325918436e-07, "logits/chosen": -0.5072580575942993, "logits/rejected": -0.6388121843338013, "logps/chosen": -1.9043185710906982, "logps/rejected": -1.834246039390564, "loss": 3.8808, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.04318618774414, "rewards/margins": -0.7007244229316711, "rewards/rejected": -18.34246063232422, "step": 1960 }, { "epoch": 0.06623074589639018, "grad_norm": 30.098649978637695, "learning_rate": 6.622851365015166e-07, "logits/chosen": -0.48198550939559937, "logits/rejected": -0.4838237166404724, "logps/chosen": -1.7086031436920166, "logps/rejected": -1.7821567058563232, "loss": 2.8094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.086029052734375, "rewards/margins": 0.73553866147995, "rewards/rejected": -17.82156753540039, "step": 1965 }, { "epoch": 0.06639927196737336, "grad_norm": 0.15973490476608276, "learning_rate": 6.639703404111897e-07, "logits/chosen": -0.28459540009498596, "logits/rejected": -0.26555758714675903, "logps/chosen": -1.542386531829834, "logps/rejected": -1.799584150314331, "loss": 2.2961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.423864364624023, "rewards/margins": 2.57197642326355, "rewards/rejected": -17.9958438873291, "step": 1970 }, { "epoch": 0.06656779803835654, "grad_norm": 51.54502487182617, "learning_rate": 6.656555443208628e-07, "logits/chosen": -0.7996042966842651, "logits/rejected": -0.7479974031448364, "logps/chosen": -2.016369342803955, "logps/rejected": -1.9065616130828857, "loss": 4.158, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.163692474365234, "rewards/margins": -1.0980759859085083, "rewards/rejected": -19.065616607666016, "step": 1975 }, { "epoch": 0.06673632410933972, "grad_norm": 17.913270950317383, "learning_rate": 6.673407482305359e-07, "logits/chosen": -0.33906176686286926, "logits/rejected": -0.29040712118148804, "logps/chosen": -1.7747215032577515, "logps/rejected": -1.872317910194397, "loss": 2.4515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.74721336364746, "rewards/margins": 0.9759650230407715, "rewards/rejected": -18.723180770874023, "step": 1980 }, { "epoch": 0.0669048501803229, "grad_norm": 21.00700569152832, "learning_rate": 6.690259521402089e-07, "logits/chosen": -0.4829220771789551, "logits/rejected": -0.4446737766265869, "logps/chosen": -1.4653489589691162, "logps/rejected": -1.5274522304534912, "loss": 2.7453, "rewards/accuracies": 0.5, "rewards/chosen": -14.653491020202637, "rewards/margins": 0.6210311651229858, "rewards/rejected": -15.274523735046387, "step": 1985 }, { "epoch": 0.06707337625130608, "grad_norm": 40.25058364868164, "learning_rate": 6.70711156049882e-07, "logits/chosen": -0.11107297241687775, "logits/rejected": 0.057940077036619186, "logps/chosen": -1.9686638116836548, "logps/rejected": -2.0639851093292236, "loss": 2.3266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.6866397857666, "rewards/margins": 0.9532124400138855, "rewards/rejected": -20.639850616455078, "step": 1990 }, { "epoch": 0.06724190232228926, "grad_norm": 16.159786224365234, "learning_rate": 6.723963599595552e-07, "logits/chosen": -0.7763963937759399, "logits/rejected": -0.6344070434570312, "logps/chosen": -1.7498953342437744, "logps/rejected": -1.845721960067749, "loss": 2.9993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.498952865600586, "rewards/margins": 0.9582692980766296, "rewards/rejected": -18.457223892211914, "step": 1995 }, { "epoch": 0.06741042839327244, "grad_norm": 20.94877815246582, "learning_rate": 6.740815638692281e-07, "logits/chosen": -0.45790061354637146, "logits/rejected": -0.5162444710731506, "logps/chosen": -1.5169246196746826, "logps/rejected": -1.4949506521224976, "loss": 3.622, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.1692476272583, "rewards/margins": -0.2197399139404297, "rewards/rejected": -14.949508666992188, "step": 2000 }, { "epoch": 0.06741042839327244, "eval_logits/chosen": -0.7068748474121094, "eval_logits/rejected": -0.7089285254478455, "eval_logps/chosen": -1.6583834886550903, "eval_logps/rejected": -1.6555068492889404, "eval_loss": 3.4318957328796387, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.583833694458008, "eval_rewards/margins": -0.02876390889286995, "eval_rewards/rejected": -16.555068969726562, "eval_runtime": 12.8917, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 2000 }, { "epoch": 0.06757895446425562, "grad_norm": 40.18283462524414, "learning_rate": 6.757667677789012e-07, "logits/chosen": -0.33664292097091675, "logits/rejected": -0.08164303004741669, "logps/chosen": -1.5835665464401245, "logps/rejected": -1.6931190490722656, "loss": 2.8106, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.835665702819824, "rewards/margins": 1.0955229997634888, "rewards/rejected": -16.931188583374023, "step": 2005 }, { "epoch": 0.0677474805352388, "grad_norm": 26.831192016601562, "learning_rate": 6.774519716885743e-07, "logits/chosen": -0.3066862225532532, "logits/rejected": -0.5098311901092529, "logps/chosen": -1.628483772277832, "logps/rejected": -1.6336151361465454, "loss": 3.0836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.284839630126953, "rewards/margins": 0.05131196975708008, "rewards/rejected": -16.336151123046875, "step": 2010 }, { "epoch": 0.06791600660622198, "grad_norm": 16.26115608215332, "learning_rate": 6.791371755982474e-07, "logits/chosen": -0.723468005657196, "logits/rejected": -0.6189843416213989, "logps/chosen": -1.680509328842163, "logps/rejected": -1.7051823139190674, "loss": 2.9784, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.80509376525879, "rewards/margins": 0.2467300444841385, "rewards/rejected": -17.05182456970215, "step": 2015 }, { "epoch": 0.06808453267720517, "grad_norm": 29.91892433166504, "learning_rate": 6.808223795079204e-07, "logits/chosen": -0.7424842119216919, "logits/rejected": -0.704288125038147, "logps/chosen": -1.6226580142974854, "logps/rejected": -1.633644700050354, "loss": 3.1707, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.226581573486328, "rewards/margins": 0.10986528545618057, "rewards/rejected": -16.336444854736328, "step": 2020 }, { "epoch": 0.06825305874818835, "grad_norm": 28.981691360473633, "learning_rate": 6.825075834175935e-07, "logits/chosen": 0.017427653074264526, "logits/rejected": -0.05734679102897644, "logps/chosen": -2.1242425441741943, "logps/rejected": -1.984273910522461, "loss": 4.4866, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.2424259185791, "rewards/margins": -1.399685263633728, "rewards/rejected": -19.84273910522461, "step": 2025 }, { "epoch": 0.06842158481917153, "grad_norm": 56.0091552734375, "learning_rate": 6.841927873272666e-07, "logits/chosen": -0.20901520550251007, "logits/rejected": -0.23270806670188904, "logps/chosen": -2.094710111618042, "logps/rejected": -2.2162601947784424, "loss": 2.2359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.94710350036621, "rewards/margins": 1.215496301651001, "rewards/rejected": -22.162599563598633, "step": 2030 }, { "epoch": 0.06859011089015471, "grad_norm": 33.509132385253906, "learning_rate": 6.858779912369396e-07, "logits/chosen": -0.5838386416435242, "logits/rejected": -0.43091145157814026, "logps/chosen": -1.8723928928375244, "logps/rejected": -1.8803253173828125, "loss": 3.061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.723926544189453, "rewards/margins": 0.07932768017053604, "rewards/rejected": -18.80325698852539, "step": 2035 }, { "epoch": 0.06875863696113789, "grad_norm": 21.0834903717041, "learning_rate": 6.875631951466127e-07, "logits/chosen": -0.9346014857292175, "logits/rejected": -0.7744470834732056, "logps/chosen": -1.6558029651641846, "logps/rejected": -1.6898488998413086, "loss": 2.8263, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.55803108215332, "rewards/margins": 0.34045690298080444, "rewards/rejected": -16.898487091064453, "step": 2040 }, { "epoch": 0.06892716303212107, "grad_norm": 21.175643920898438, "learning_rate": 6.892483990562858e-07, "logits/chosen": -0.3706910312175751, "logits/rejected": -0.28410759568214417, "logps/chosen": -2.399585723876953, "logps/rejected": -1.9964946508407593, "loss": 7.2358, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.995859146118164, "rewards/margins": -4.030909538269043, "rewards/rejected": -19.964946746826172, "step": 2045 }, { "epoch": 0.06909568910310425, "grad_norm": 26.842979431152344, "learning_rate": 6.909336029659589e-07, "logits/chosen": -0.5333553552627563, "logits/rejected": -0.4201357960700989, "logps/chosen": -1.720887541770935, "logps/rejected": -1.797654390335083, "loss": 3.268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.208877563476562, "rewards/margins": 0.7676678895950317, "rewards/rejected": -17.976543426513672, "step": 2050 }, { "epoch": 0.06926421517408743, "grad_norm": 20.48428726196289, "learning_rate": 6.926188068756319e-07, "logits/chosen": -0.584562361240387, "logits/rejected": -0.6016994714736938, "logps/chosen": -1.502666711807251, "logps/rejected": -1.5560954809188843, "loss": 2.8615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.026666641235352, "rewards/margins": 0.534288227558136, "rewards/rejected": -15.560954093933105, "step": 2055 }, { "epoch": 0.06943274124507061, "grad_norm": 23.584716796875, "learning_rate": 6.94304010785305e-07, "logits/chosen": -0.6470240354537964, "logits/rejected": -0.6648738384246826, "logps/chosen": -1.733741044998169, "logps/rejected": -1.7135350704193115, "loss": 3.3818, "rewards/accuracies": 0.5, "rewards/chosen": -17.337411880493164, "rewards/margins": -0.20206137001514435, "rewards/rejected": -17.13534927368164, "step": 2060 }, { "epoch": 0.06960126731605379, "grad_norm": 32.5343132019043, "learning_rate": 6.959892146949781e-07, "logits/chosen": -0.13596948981285095, "logits/rejected": -0.12064089626073837, "logps/chosen": -1.509447693824768, "logps/rejected": -1.6260335445404053, "loss": 2.5401, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.094476699829102, "rewards/margins": 1.1658592224121094, "rewards/rejected": -16.26033592224121, "step": 2065 }, { "epoch": 0.06976979338703697, "grad_norm": 15.475911140441895, "learning_rate": 6.976744186046511e-07, "logits/chosen": -0.6770817041397095, "logits/rejected": -0.6954010725021362, "logps/chosen": -1.5978891849517822, "logps/rejected": -1.6195186376571655, "loss": 2.9897, "rewards/accuracies": 0.5, "rewards/chosen": -15.978894233703613, "rewards/margins": 0.21629361808300018, "rewards/rejected": -16.195186614990234, "step": 2070 }, { "epoch": 0.06993831945802016, "grad_norm": 28.1234130859375, "learning_rate": 6.993596225143242e-07, "logits/chosen": -0.5654903054237366, "logits/rejected": -0.6328141093254089, "logps/chosen": -1.7364234924316406, "logps/rejected": -1.6471540927886963, "loss": 4.1059, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.36423683166504, "rewards/margins": -0.892695426940918, "rewards/rejected": -16.471540451049805, "step": 2075 }, { "epoch": 0.07010684552900334, "grad_norm": 35.210113525390625, "learning_rate": 7.010448264239973e-07, "logits/chosen": -0.052232611924409866, "logits/rejected": -0.19137360155582428, "logps/chosen": -1.8225181102752686, "logps/rejected": -1.6726758480072021, "loss": 4.59, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.225181579589844, "rewards/margins": -1.49842369556427, "rewards/rejected": -16.726757049560547, "step": 2080 }, { "epoch": 0.07027537159998652, "grad_norm": 22.082977294921875, "learning_rate": 7.027300303336703e-07, "logits/chosen": -0.1036457046866417, "logits/rejected": -0.22812290489673615, "logps/chosen": -1.4879047870635986, "logps/rejected": -1.4071712493896484, "loss": 3.868, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.879046440124512, "rewards/margins": -0.8073347210884094, "rewards/rejected": -14.0717134475708, "step": 2085 }, { "epoch": 0.0704438976709697, "grad_norm": 18.9871883392334, "learning_rate": 7.044152342433433e-07, "logits/chosen": -0.5369516015052795, "logits/rejected": -0.5570724606513977, "logps/chosen": -1.5926361083984375, "logps/rejected": -1.8241342306137085, "loss": 3.1885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.926362991333008, "rewards/margins": 2.3149805068969727, "rewards/rejected": -18.241342544555664, "step": 2090 }, { "epoch": 0.07061242374195288, "grad_norm": 19.876121520996094, "learning_rate": 7.061004381530165e-07, "logits/chosen": -0.648863673210144, "logits/rejected": -0.6402491331100464, "logps/chosen": -1.9981971979141235, "logps/rejected": -1.8780739307403564, "loss": 4.3948, "rewards/accuracies": 0.5, "rewards/chosen": -19.98197364807129, "rewards/margins": -1.2012332677841187, "rewards/rejected": -18.780738830566406, "step": 2095 }, { "epoch": 0.07078094981293606, "grad_norm": 21.34756851196289, "learning_rate": 7.077856420626896e-07, "logits/chosen": -0.4101831316947937, "logits/rejected": -0.41583624482154846, "logps/chosen": -1.7545570135116577, "logps/rejected": -1.691300392150879, "loss": 3.6937, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.54557228088379, "rewards/margins": -0.6325671076774597, "rewards/rejected": -16.91300392150879, "step": 2100 }, { "epoch": 0.07094947588391924, "grad_norm": 30.001667022705078, "learning_rate": 7.094708459723626e-07, "logits/chosen": -0.3208427131175995, "logits/rejected": -0.16100385785102844, "logps/chosen": -1.696240782737732, "logps/rejected": -1.7646242380142212, "loss": 2.6131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.9624080657959, "rewards/margins": 0.6838338971138, "rewards/rejected": -17.646244049072266, "step": 2105 }, { "epoch": 0.07111800195490242, "grad_norm": 35.3121337890625, "learning_rate": 7.111560498820357e-07, "logits/chosen": -0.6093899607658386, "logits/rejected": -0.5890295505523682, "logps/chosen": -1.7065532207489014, "logps/rejected": -1.719430685043335, "loss": 3.4051, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.065532684326172, "rewards/margins": 0.12877540290355682, "rewards/rejected": -17.19430923461914, "step": 2110 }, { "epoch": 0.0712865280258856, "grad_norm": 22.868438720703125, "learning_rate": 7.128412537917088e-07, "logits/chosen": -0.7410895824432373, "logits/rejected": -0.7691564559936523, "logps/chosen": -1.4438047409057617, "logps/rejected": -1.487786054611206, "loss": 2.9583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.4380464553833, "rewards/margins": 0.4398147463798523, "rewards/rejected": -14.877861022949219, "step": 2115 }, { "epoch": 0.07145505409686878, "grad_norm": 30.30385971069336, "learning_rate": 7.145264577013819e-07, "logits/chosen": -0.4199215769767761, "logits/rejected": -0.35399970412254333, "logps/chosen": -1.6688213348388672, "logps/rejected": -1.6876872777938843, "loss": 3.0365, "rewards/accuracies": 0.5, "rewards/chosen": -16.688213348388672, "rewards/margins": 0.18865904211997986, "rewards/rejected": -16.876873016357422, "step": 2120 }, { "epoch": 0.07162358016785196, "grad_norm": 20.555082321166992, "learning_rate": 7.162116616110549e-07, "logits/chosen": -0.33508139848709106, "logits/rejected": -0.26377108693122864, "logps/chosen": -1.7805957794189453, "logps/rejected": -1.8432705402374268, "loss": 2.5923, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.805957794189453, "rewards/margins": 0.6267482042312622, "rewards/rejected": -18.43270492553711, "step": 2125 }, { "epoch": 0.07179210623883515, "grad_norm": 22.803489685058594, "learning_rate": 7.178968655207279e-07, "logits/chosen": -0.7927559018135071, "logits/rejected": -0.7616699934005737, "logps/chosen": -1.4907985925674438, "logps/rejected": -1.5001600980758667, "loss": 2.9985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.907986640930176, "rewards/margins": 0.09361562877893448, "rewards/rejected": -15.001602172851562, "step": 2130 }, { "epoch": 0.07196063230981833, "grad_norm": 31.1895751953125, "learning_rate": 7.19582069430401e-07, "logits/chosen": -0.5055627822875977, "logits/rejected": -0.5777538418769836, "logps/chosen": -2.0089962482452393, "logps/rejected": -2.064162492752075, "loss": 2.9047, "rewards/accuracies": 0.5, "rewards/chosen": -20.0899658203125, "rewards/margins": 0.5516608953475952, "rewards/rejected": -20.64162254333496, "step": 2135 }, { "epoch": 0.07212915838080151, "grad_norm": 21.80877113342285, "learning_rate": 7.21267273340074e-07, "logits/chosen": -0.8962277173995972, "logits/rejected": -1.062765121459961, "logps/chosen": -1.6682827472686768, "logps/rejected": -1.6122701168060303, "loss": 3.6308, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.68282699584961, "rewards/margins": -0.5601242184638977, "rewards/rejected": -16.12270164489746, "step": 2140 }, { "epoch": 0.0722976844517847, "grad_norm": 39.813602447509766, "learning_rate": 7.229524772497472e-07, "logits/chosen": -0.6865358352661133, "logits/rejected": -0.6879181265830994, "logps/chosen": -1.5924547910690308, "logps/rejected": -1.5645772218704224, "loss": 3.3997, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.924548149108887, "rewards/margins": -0.2787768244743347, "rewards/rejected": -15.645771980285645, "step": 2145 }, { "epoch": 0.07246621052276787, "grad_norm": 34.804351806640625, "learning_rate": 7.246376811594203e-07, "logits/chosen": -0.37989914417266846, "logits/rejected": -0.31740498542785645, "logps/chosen": -2.073552370071411, "logps/rejected": -2.0728116035461426, "loss": 3.5797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.735519409179688, "rewards/margins": -0.0074065206572413445, "rewards/rejected": -20.72811508178711, "step": 2150 }, { "epoch": 0.07263473659375105, "grad_norm": 32.999820709228516, "learning_rate": 7.263228850690934e-07, "logits/chosen": -0.4067623019218445, "logits/rejected": -0.2200227677822113, "logps/chosen": -1.6104068756103516, "logps/rejected": -1.7444422245025635, "loss": 2.4908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.104068756103516, "rewards/margins": 1.3403522968292236, "rewards/rejected": -17.444419860839844, "step": 2155 }, { "epoch": 0.07280326266473423, "grad_norm": 21.049962997436523, "learning_rate": 7.280080889787664e-07, "logits/chosen": -0.9572398066520691, "logits/rejected": -0.894806981086731, "logps/chosen": -1.5985901355743408, "logps/rejected": -1.6197038888931274, "loss": 2.984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.985898971557617, "rewards/margins": 0.2111394852399826, "rewards/rejected": -16.197040557861328, "step": 2160 }, { "epoch": 0.07297178873571741, "grad_norm": 19.7977294921875, "learning_rate": 7.296932928884395e-07, "logits/chosen": -0.7495409846305847, "logits/rejected": -0.7707471251487732, "logps/chosen": -1.5134512186050415, "logps/rejected": -1.604833960533142, "loss": 2.4129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.134511947631836, "rewards/margins": 0.9138285517692566, "rewards/rejected": -16.048341751098633, "step": 2165 }, { "epoch": 0.0731403148067006, "grad_norm": 26.598833084106445, "learning_rate": 7.313784967981126e-07, "logits/chosen": -0.4947318434715271, "logits/rejected": -0.4180319905281067, "logps/chosen": -1.8544307947158813, "logps/rejected": -1.9303770065307617, "loss": 2.6067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.5443058013916, "rewards/margins": 0.7594637870788574, "rewards/rejected": -19.303770065307617, "step": 2170 }, { "epoch": 0.07330884087768377, "grad_norm": 22.23002052307129, "learning_rate": 7.330637007077856e-07, "logits/chosen": -0.3323080837726593, "logits/rejected": -0.32338953018188477, "logps/chosen": -1.6461750268936157, "logps/rejected": -1.8945128917694092, "loss": 1.8916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.461750030517578, "rewards/margins": 2.483377695083618, "rewards/rejected": -18.945127487182617, "step": 2175 }, { "epoch": 0.07347736694866695, "grad_norm": 28.355215072631836, "learning_rate": 7.347489046174586e-07, "logits/chosen": -0.5796966552734375, "logits/rejected": -0.5349574089050293, "logps/chosen": -1.4876244068145752, "logps/rejected": -1.5375049114227295, "loss": 2.7552, "rewards/accuracies": 0.5, "rewards/chosen": -14.876243591308594, "rewards/margins": 0.4988061487674713, "rewards/rejected": -15.375048637390137, "step": 2180 }, { "epoch": 0.07364589301965015, "grad_norm": 27.98024559020996, "learning_rate": 7.364341085271317e-07, "logits/chosen": -0.6341951489448547, "logits/rejected": -0.6173728108406067, "logps/chosen": -1.6965017318725586, "logps/rejected": -1.7058916091918945, "loss": 3.2173, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.965015411376953, "rewards/margins": 0.09389963001012802, "rewards/rejected": -17.058916091918945, "step": 2185 }, { "epoch": 0.07381441909063333, "grad_norm": 17.731689453125, "learning_rate": 7.381193124368048e-07, "logits/chosen": -0.5357145667076111, "logits/rejected": -0.5386208295822144, "logps/chosen": -1.4506524801254272, "logps/rejected": -1.4157730340957642, "loss": 3.4912, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -14.506525039672852, "rewards/margins": -0.34879380464553833, "rewards/rejected": -14.157732009887695, "step": 2190 }, { "epoch": 0.0739829451616165, "grad_norm": 26.12714958190918, "learning_rate": 7.398045163464779e-07, "logits/chosen": -0.2705962061882019, "logits/rejected": -0.18878893554210663, "logps/chosen": -1.7146854400634766, "logps/rejected": -1.6282618045806885, "loss": 3.967, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.146854400634766, "rewards/margins": -0.8642366528511047, "rewards/rejected": -16.282617568969727, "step": 2195 }, { "epoch": 0.07415147123259969, "grad_norm": 21.842453002929688, "learning_rate": 7.41489720256151e-07, "logits/chosen": -0.1653115302324295, "logits/rejected": -0.073493592441082, "logps/chosen": -1.6875333786010742, "logps/rejected": -1.7379028797149658, "loss": 2.7714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.875333786010742, "rewards/margins": 0.5036935806274414, "rewards/rejected": -17.3790283203125, "step": 2200 }, { "epoch": 0.07431999730358287, "grad_norm": 17.958499908447266, "learning_rate": 7.431749241658241e-07, "logits/chosen": -0.4340514540672302, "logits/rejected": -0.4271577000617981, "logps/chosen": -1.8904931545257568, "logps/rejected": -2.2419497966766357, "loss": 1.6577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.90493392944336, "rewards/margins": 3.5145657062530518, "rewards/rejected": -22.419498443603516, "step": 2205 }, { "epoch": 0.07448852337456605, "grad_norm": 21.014781951904297, "learning_rate": 7.448601280754972e-07, "logits/chosen": -0.85997474193573, "logits/rejected": -0.6709158420562744, "logps/chosen": -1.753631591796875, "logps/rejected": -1.5993316173553467, "loss": 4.6344, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.53631591796875, "rewards/margins": -1.5429986715316772, "rewards/rejected": -15.993316650390625, "step": 2210 }, { "epoch": 0.07465704944554923, "grad_norm": 17.678613662719727, "learning_rate": 7.465453319851702e-07, "logits/chosen": -0.5139130353927612, "logits/rejected": -0.6146605610847473, "logps/chosen": -1.7065128087997437, "logps/rejected": -1.9963042736053467, "loss": 2.515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.06513023376465, "rewards/margins": 2.8979153633117676, "rewards/rejected": -19.963045120239258, "step": 2215 }, { "epoch": 0.0748255755165324, "grad_norm": 24.807268142700195, "learning_rate": 7.482305358948432e-07, "logits/chosen": -0.3597901463508606, "logits/rejected": -0.4648984372615814, "logps/chosen": -1.699566125869751, "logps/rejected": -1.7271579504013062, "loss": 2.9498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.99566078186035, "rewards/margins": 0.2759190499782562, "rewards/rejected": -17.27157974243164, "step": 2220 }, { "epoch": 0.07499410158751559, "grad_norm": 37.47063446044922, "learning_rate": 7.499157398045163e-07, "logits/chosen": 0.019718538969755173, "logits/rejected": 0.006109035108238459, "logps/chosen": -2.5828468799591064, "logps/rejected": -2.2230963706970215, "loss": 6.7239, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.828466415405273, "rewards/margins": -3.597503662109375, "rewards/rejected": -22.23096466064453, "step": 2225 }, { "epoch": 0.07516262765849877, "grad_norm": 32.3070182800293, "learning_rate": 7.516009437141893e-07, "logits/chosen": -0.6688810586929321, "logits/rejected": -0.5590790510177612, "logps/chosen": -1.5791361331939697, "logps/rejected": -1.5508191585540771, "loss": 3.4132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.791361808776855, "rewards/margins": -0.28317031264305115, "rewards/rejected": -15.508191108703613, "step": 2230 }, { "epoch": 0.07533115372948194, "grad_norm": 12.327166557312012, "learning_rate": 7.532861476238624e-07, "logits/chosen": -0.2609061598777771, "logits/rejected": -0.25732770562171936, "logps/chosen": -2.010161876678467, "logps/rejected": -1.8172937631607056, "loss": 5.3264, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.10161590576172, "rewards/margins": -1.9286785125732422, "rewards/rejected": -18.172937393188477, "step": 2235 }, { "epoch": 0.07549967980046514, "grad_norm": 17.71880340576172, "learning_rate": 7.549713515335355e-07, "logits/chosen": -0.7396351099014282, "logits/rejected": -0.7504865527153015, "logps/chosen": -1.7640094757080078, "logps/rejected": -1.7561099529266357, "loss": 3.2278, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.640094757080078, "rewards/margins": -0.07899437099695206, "rewards/rejected": -17.561100006103516, "step": 2240 }, { "epoch": 0.07566820587144832, "grad_norm": 12.440887451171875, "learning_rate": 7.566565554432086e-07, "logits/chosen": -0.15917307138442993, "logits/rejected": -0.1628536731004715, "logps/chosen": -2.0435426235198975, "logps/rejected": -2.197415590286255, "loss": 2.1454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.435426712036133, "rewards/margins": 1.5387285947799683, "rewards/rejected": -21.97415542602539, "step": 2245 }, { "epoch": 0.0758367319424315, "grad_norm": 24.053064346313477, "learning_rate": 7.583417593528817e-07, "logits/chosen": -0.79632967710495, "logits/rejected": -0.5563432574272156, "logps/chosen": -1.7878577709197998, "logps/rejected": -1.9376850128173828, "loss": 1.9539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.878576278686523, "rewards/margins": 1.498272180557251, "rewards/rejected": -19.376850128173828, "step": 2250 }, { "epoch": 0.07600525801341468, "grad_norm": 25.67947006225586, "learning_rate": 7.600269632625548e-07, "logits/chosen": -0.7355546951293945, "logits/rejected": -0.5501964688301086, "logps/chosen": -1.9016857147216797, "logps/rejected": -1.9401962757110596, "loss": 2.7661, "rewards/accuracies": 0.5, "rewards/chosen": -19.016857147216797, "rewards/margins": 0.3851049542427063, "rewards/rejected": -19.401962280273438, "step": 2255 }, { "epoch": 0.07617378408439786, "grad_norm": 48.06303405761719, "learning_rate": 7.617121671722279e-07, "logits/chosen": -0.40767064690589905, "logits/rejected": -0.3229612708091736, "logps/chosen": -2.4324240684509277, "logps/rejected": -2.4284210205078125, "loss": 3.1582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.324243545532227, "rewards/margins": -0.04003305360674858, "rewards/rejected": -24.284210205078125, "step": 2260 }, { "epoch": 0.07634231015538104, "grad_norm": 22.984275817871094, "learning_rate": 7.633973710819008e-07, "logits/chosen": -0.2127332240343094, "logits/rejected": -0.05123148113489151, "logps/chosen": -1.8632936477661133, "logps/rejected": -1.8484798669815063, "loss": 3.2724, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.632936477661133, "rewards/margins": -0.1481371819972992, "rewards/rejected": -18.484798431396484, "step": 2265 }, { "epoch": 0.07651083622636422, "grad_norm": 226.9107208251953, "learning_rate": 7.650825749915739e-07, "logits/chosen": -0.2728140652179718, "logits/rejected": -0.3592303395271301, "logps/chosen": -2.038437604904175, "logps/rejected": -2.0359554290771484, "loss": 5.825, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.38437843322754, "rewards/margins": -0.024822425097227097, "rewards/rejected": -20.359554290771484, "step": 2270 }, { "epoch": 0.0766793622973474, "grad_norm": 32.962181091308594, "learning_rate": 7.66767778901247e-07, "logits/chosen": -0.39705973863601685, "logits/rejected": -0.31574827432632446, "logps/chosen": -1.697000503540039, "logps/rejected": -1.711599349975586, "loss": 3.405, "rewards/accuracies": 0.5, "rewards/chosen": -16.97000503540039, "rewards/margins": 0.1459902822971344, "rewards/rejected": -17.11599349975586, "step": 2275 }, { "epoch": 0.07684788836833058, "grad_norm": 23.07267951965332, "learning_rate": 7.684529828109201e-07, "logits/chosen": -0.17579427361488342, "logits/rejected": -0.27825185656547546, "logps/chosen": -2.1279714107513428, "logps/rejected": -1.9978828430175781, "loss": 4.3433, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -21.279714584350586, "rewards/margins": -1.3008840084075928, "rewards/rejected": -19.978830337524414, "step": 2280 }, { "epoch": 0.07701641443931376, "grad_norm": 35.53074264526367, "learning_rate": 7.701381867205931e-07, "logits/chosen": -0.4603014588356018, "logits/rejected": -0.49287882447242737, "logps/chosen": -2.0541510581970215, "logps/rejected": -2.232800006866455, "loss": 3.401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.5415096282959, "rewards/margins": 1.7864891290664673, "rewards/rejected": -22.327999114990234, "step": 2285 }, { "epoch": 0.07718494051029694, "grad_norm": 27.62151527404785, "learning_rate": 7.718233906302662e-07, "logits/chosen": -0.2474125325679779, "logits/rejected": -0.15923914313316345, "logps/chosen": -1.755563735961914, "logps/rejected": -1.8253600597381592, "loss": 2.6478, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.555635452270508, "rewards/margins": 0.6979645490646362, "rewards/rejected": -18.253599166870117, "step": 2290 }, { "epoch": 0.07735346658128013, "grad_norm": 34.95461654663086, "learning_rate": 7.735085945399393e-07, "logits/chosen": -0.6580768823623657, "logits/rejected": -0.7368906140327454, "logps/chosen": -1.8308817148208618, "logps/rejected": -1.9212795495986938, "loss": 2.3617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.30881690979004, "rewards/margins": 0.90397709608078, "rewards/rejected": -19.212793350219727, "step": 2295 }, { "epoch": 0.07752199265226331, "grad_norm": 16.449514389038086, "learning_rate": 7.751937984496125e-07, "logits/chosen": -0.49866050481796265, "logits/rejected": -0.44423356652259827, "logps/chosen": -1.5588819980621338, "logps/rejected": -1.574965238571167, "loss": 3.1077, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.58881950378418, "rewards/margins": 0.16083398461341858, "rewards/rejected": -15.749651908874512, "step": 2300 }, { "epoch": 0.07769051872324649, "grad_norm": 33.75349807739258, "learning_rate": 7.768790023592854e-07, "logits/chosen": -0.35698094964027405, "logits/rejected": -0.31498563289642334, "logps/chosen": -1.8009259700775146, "logps/rejected": -1.8306411504745483, "loss": 3.5954, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.009258270263672, "rewards/margins": 0.29715123772621155, "rewards/rejected": -18.306411743164062, "step": 2305 }, { "epoch": 0.07785904479422967, "grad_norm": 27.460735321044922, "learning_rate": 7.785642062689585e-07, "logits/chosen": -0.5881357789039612, "logits/rejected": -0.5689517855644226, "logps/chosen": -1.458878755569458, "logps/rejected": -1.4286056756973267, "loss": 3.3968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.588786125183105, "rewards/margins": -0.30273109674453735, "rewards/rejected": -14.286054611206055, "step": 2310 }, { "epoch": 0.07802757086521285, "grad_norm": 9.67859172821045, "learning_rate": 7.802494101786316e-07, "logits/chosen": -0.3839682340621948, "logits/rejected": -0.4018523097038269, "logps/chosen": -1.3250752687454224, "logps/rejected": -1.469162940979004, "loss": 2.5192, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.250753402709961, "rewards/margins": 1.4408762454986572, "rewards/rejected": -14.691629409790039, "step": 2315 }, { "epoch": 0.07819609693619603, "grad_norm": 53.11182403564453, "learning_rate": 7.819346140883046e-07, "logits/chosen": -0.25247180461883545, "logits/rejected": -0.2038680762052536, "logps/chosen": -2.0797410011291504, "logps/rejected": -1.9495025873184204, "loss": 4.3848, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.79741096496582, "rewards/margins": -1.302383542060852, "rewards/rejected": -19.495027542114258, "step": 2320 }, { "epoch": 0.07836462300717921, "grad_norm": 14.444879531860352, "learning_rate": 7.836198179979777e-07, "logits/chosen": -0.5390850305557251, "logits/rejected": -0.5718969106674194, "logps/chosen": -2.016746759414673, "logps/rejected": -1.9924167394638062, "loss": 3.7065, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.16746711730957, "rewards/margins": -0.24329786002635956, "rewards/rejected": -19.92416763305664, "step": 2325 }, { "epoch": 0.07853314907816239, "grad_norm": 24.698184967041016, "learning_rate": 7.853050219076508e-07, "logits/chosen": -0.4092784523963928, "logits/rejected": -0.1485476940870285, "logps/chosen": -1.8159494400024414, "logps/rejected": -1.8311035633087158, "loss": 3.4358, "rewards/accuracies": 0.5, "rewards/chosen": -18.159494400024414, "rewards/margins": 0.15154066681861877, "rewards/rejected": -18.31103515625, "step": 2330 }, { "epoch": 0.07870167514914557, "grad_norm": 28.049543380737305, "learning_rate": 7.869902258173239e-07, "logits/chosen": -0.4887131154537201, "logits/rejected": -0.2594299614429474, "logps/chosen": -1.7859032154083252, "logps/rejected": -1.8176990747451782, "loss": 2.9338, "rewards/accuracies": 0.5, "rewards/chosen": -17.859033584594727, "rewards/margins": 0.3179585337638855, "rewards/rejected": -18.176990509033203, "step": 2335 }, { "epoch": 0.07887020122012875, "grad_norm": 24.492351531982422, "learning_rate": 7.886754297269969e-07, "logits/chosen": -0.21818354725837708, "logits/rejected": -0.2571013271808624, "logps/chosen": -1.9694633483886719, "logps/rejected": -2.051032781600952, "loss": 2.5502, "rewards/accuracies": 0.5, "rewards/chosen": -19.694631576538086, "rewards/margins": 0.8156954050064087, "rewards/rejected": -20.510326385498047, "step": 2340 }, { "epoch": 0.07903872729111193, "grad_norm": 26.16986656188965, "learning_rate": 7.9036063363667e-07, "logits/chosen": -0.6751791834831238, "logits/rejected": -0.6740162372589111, "logps/chosen": -1.5954325199127197, "logps/rejected": -1.6436774730682373, "loss": 2.6107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.954325675964355, "rewards/margins": 0.482450008392334, "rewards/rejected": -16.43677520751953, "step": 2345 }, { "epoch": 0.07920725336209512, "grad_norm": 26.028491973876953, "learning_rate": 7.920458375463431e-07, "logits/chosen": -0.6816641092300415, "logits/rejected": -0.6692745089530945, "logps/chosen": -1.7431175708770752, "logps/rejected": -1.6779407262802124, "loss": 3.7004, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.43117332458496, "rewards/margins": -0.6517672538757324, "rewards/rejected": -16.779407501220703, "step": 2350 }, { "epoch": 0.0793757794330783, "grad_norm": 17.893020629882812, "learning_rate": 7.937310414560161e-07, "logits/chosen": -0.3664388060569763, "logits/rejected": -0.5129767656326294, "logps/chosen": -1.8353245258331299, "logps/rejected": -1.8356059789657593, "loss": 3.1945, "rewards/accuracies": 0.5, "rewards/chosen": -18.35324478149414, "rewards/margins": 0.0028172493912279606, "rewards/rejected": -18.356060028076172, "step": 2355 }, { "epoch": 0.07954430550406148, "grad_norm": 22.861780166625977, "learning_rate": 7.954162453656892e-07, "logits/chosen": -0.3994244337081909, "logits/rejected": -0.5220701098442078, "logps/chosen": -1.808762550354004, "logps/rejected": -1.820603609085083, "loss": 3.1259, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.08762550354004, "rewards/margins": 0.11841030418872833, "rewards/rejected": -18.206035614013672, "step": 2360 }, { "epoch": 0.07971283157504466, "grad_norm": 31.54043960571289, "learning_rate": 7.971014492753623e-07, "logits/chosen": -0.059911616146564484, "logits/rejected": -0.0022819482255727053, "logps/chosen": -2.044661521911621, "logps/rejected": -2.0498645305633545, "loss": 3.2518, "rewards/accuracies": 0.5, "rewards/chosen": -20.44661521911621, "rewards/margins": 0.05202770233154297, "rewards/rejected": -20.498641967773438, "step": 2365 }, { "epoch": 0.07988135764602784, "grad_norm": 18.20899772644043, "learning_rate": 7.987866531850354e-07, "logits/chosen": -0.6337564587593079, "logits/rejected": -0.6478831171989441, "logps/chosen": -1.5746173858642578, "logps/rejected": -1.8139946460723877, "loss": 1.8241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.746172904968262, "rewards/margins": 2.393773317337036, "rewards/rejected": -18.13994789123535, "step": 2370 }, { "epoch": 0.08004988371701102, "grad_norm": 25.54546356201172, "learning_rate": 8.004718570947084e-07, "logits/chosen": -0.7622129321098328, "logits/rejected": -0.7510684728622437, "logps/chosen": -1.621252417564392, "logps/rejected": -1.6405149698257446, "loss": 3.0707, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.212522506713867, "rewards/margins": 0.19262532889842987, "rewards/rejected": -16.405147552490234, "step": 2375 }, { "epoch": 0.0802184097879942, "grad_norm": 22.381973266601562, "learning_rate": 8.021570610043815e-07, "logits/chosen": -0.29075971245765686, "logits/rejected": -0.2355223149061203, "logps/chosen": -1.7290035486221313, "logps/rejected": -1.7951444387435913, "loss": 2.585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.290037155151367, "rewards/margins": 0.6614087224006653, "rewards/rejected": -17.951444625854492, "step": 2380 }, { "epoch": 0.08038693585897738, "grad_norm": 15.249910354614258, "learning_rate": 8.038422649140546e-07, "logits/chosen": -0.6698895692825317, "logits/rejected": -0.5626107454299927, "logps/chosen": -1.3483240604400635, "logps/rejected": -1.4742333889007568, "loss": 2.2219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.483240127563477, "rewards/margins": 1.259093999862671, "rewards/rejected": -14.742334365844727, "step": 2385 }, { "epoch": 0.08055546192996056, "grad_norm": 9.537259101867676, "learning_rate": 8.055274688237276e-07, "logits/chosen": -0.38138988614082336, "logits/rejected": -0.39942893385887146, "logps/chosen": -2.068192958831787, "logps/rejected": -2.230304479598999, "loss": 2.7261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.681930541992188, "rewards/margins": 1.6211124658584595, "rewards/rejected": -22.30304527282715, "step": 2390 }, { "epoch": 0.08072398800094374, "grad_norm": 42.501583099365234, "learning_rate": 8.072126727334006e-07, "logits/chosen": -0.6858797669410706, "logits/rejected": -0.6615114808082581, "logps/chosen": -1.8533143997192383, "logps/rejected": -1.8078025579452515, "loss": 3.6226, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.533143997192383, "rewards/margins": -0.4551170766353607, "rewards/rejected": -18.078027725219727, "step": 2395 }, { "epoch": 0.08089251407192692, "grad_norm": 24.91890525817871, "learning_rate": 8.088978766430738e-07, "logits/chosen": -0.36171025037765503, "logits/rejected": -0.43480342626571655, "logps/chosen": -1.681675910949707, "logps/rejected": -1.6178693771362305, "loss": 3.6924, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.81675910949707, "rewards/margins": -0.6380661129951477, "rewards/rejected": -16.178691864013672, "step": 2400 }, { "epoch": 0.08089251407192692, "eval_logits/chosen": -0.7006931900978088, "eval_logits/rejected": -0.703230619430542, "eval_logps/chosen": -1.6610907316207886, "eval_logps/rejected": -1.6590111255645752, "eval_loss": 3.4272820949554443, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.61090850830078, "eval_rewards/margins": -0.020797576755285263, "eval_rewards/rejected": -16.590110778808594, "eval_runtime": 12.8899, "eval_samples_per_second": 7.758, "eval_steps_per_second": 1.939, "step": 2400 }, { "epoch": 0.0810610401429101, "grad_norm": 23.806867599487305, "learning_rate": 8.105830805527469e-07, "logits/chosen": -0.5742610692977905, "logits/rejected": -0.6479157209396362, "logps/chosen": -1.6960289478302002, "logps/rejected": -1.9552192687988281, "loss": 2.0351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.960285186767578, "rewards/margins": 2.591905355453491, "rewards/rejected": -19.55219268798828, "step": 2405 }, { "epoch": 0.0812295662138933, "grad_norm": 18.037389755249023, "learning_rate": 8.122682844624199e-07, "logits/chosen": -0.6523152589797974, "logits/rejected": -0.6970680356025696, "logps/chosen": -1.4654817581176758, "logps/rejected": -1.5350319147109985, "loss": 2.6353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.654818534851074, "rewards/margins": 0.6954997777938843, "rewards/rejected": -15.350318908691406, "step": 2410 }, { "epoch": 0.08139809228487647, "grad_norm": 16.91983413696289, "learning_rate": 8.13953488372093e-07, "logits/chosen": -0.35806578397750854, "logits/rejected": -0.34750303626060486, "logps/chosen": -1.6740829944610596, "logps/rejected": -1.7585744857788086, "loss": 2.3532, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.740829467773438, "rewards/margins": 0.8449147343635559, "rewards/rejected": -17.585744857788086, "step": 2415 }, { "epoch": 0.08156661835585965, "grad_norm": 40.20941162109375, "learning_rate": 8.156386922817661e-07, "logits/chosen": -0.37193426489830017, "logits/rejected": -0.3277415633201599, "logps/chosen": -1.804286003112793, "logps/rejected": -1.8441333770751953, "loss": 2.9108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.042861938476562, "rewards/margins": 0.398474782705307, "rewards/rejected": -18.441335678100586, "step": 2420 }, { "epoch": 0.08173514442684283, "grad_norm": 28.02414321899414, "learning_rate": 8.173238961914391e-07, "logits/chosen": -0.6131106615066528, "logits/rejected": -0.5054728984832764, "logps/chosen": -1.6783838272094727, "logps/rejected": -1.5606751441955566, "loss": 4.2124, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.78384017944336, "rewards/margins": -1.177086591720581, "rewards/rejected": -15.606752395629883, "step": 2425 }, { "epoch": 0.08190367049782601, "grad_norm": 29.80112648010254, "learning_rate": 8.190091001011122e-07, "logits/chosen": -0.5313662886619568, "logits/rejected": -0.38392385840415955, "logps/chosen": -1.7252247333526611, "logps/rejected": -1.7433083057403564, "loss": 3.2085, "rewards/accuracies": 0.5, "rewards/chosen": -17.252246856689453, "rewards/margins": 0.1808355301618576, "rewards/rejected": -17.43308448791504, "step": 2430 }, { "epoch": 0.0820721965688092, "grad_norm": 53.98869705200195, "learning_rate": 8.206943040107853e-07, "logits/chosen": -0.26283207535743713, "logits/rejected": -0.2831880748271942, "logps/chosen": -1.9631812572479248, "logps/rejected": -2.0056893825531006, "loss": 3.3863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.631813049316406, "rewards/margins": 0.4250810742378235, "rewards/rejected": -20.05689239501953, "step": 2435 }, { "epoch": 0.08224072263979237, "grad_norm": 50.21210861206055, "learning_rate": 8.223795079204583e-07, "logits/chosen": -0.20228877663612366, "logits/rejected": -0.15569323301315308, "logps/chosen": -1.7776590585708618, "logps/rejected": -1.6473630666732788, "loss": 4.4394, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.776588439941406, "rewards/margins": -1.3029576539993286, "rewards/rejected": -16.473630905151367, "step": 2440 }, { "epoch": 0.08240924871077555, "grad_norm": 21.28071403503418, "learning_rate": 8.240647118301313e-07, "logits/chosen": 0.09300395101308823, "logits/rejected": 0.14108344912528992, "logps/chosen": -2.510117530822754, "logps/rejected": -2.6373324394226074, "loss": 1.9585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.101173400878906, "rewards/margins": 1.2721501588821411, "rewards/rejected": -26.37332534790039, "step": 2445 }, { "epoch": 0.08257777478175873, "grad_norm": 31.054330825805664, "learning_rate": 8.257499157398045e-07, "logits/chosen": -0.8290077447891235, "logits/rejected": -0.6386555433273315, "logps/chosen": -1.528441071510315, "logps/rejected": -1.553753137588501, "loss": 2.8956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.284411430358887, "rewards/margins": 0.253119558095932, "rewards/rejected": -15.537530899047852, "step": 2450 }, { "epoch": 0.08274630085274191, "grad_norm": 21.822750091552734, "learning_rate": 8.274351196494776e-07, "logits/chosen": -0.4548490643501282, "logits/rejected": -0.5014796257019043, "logps/chosen": -1.5887835025787354, "logps/rejected": -1.537469506263733, "loss": 3.5735, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.887834548950195, "rewards/margins": -0.5131396055221558, "rewards/rejected": -15.37469482421875, "step": 2455 }, { "epoch": 0.0829148269237251, "grad_norm": 31.784954071044922, "learning_rate": 8.291203235591507e-07, "logits/chosen": -0.22424478828907013, "logits/rejected": -0.25032782554626465, "logps/chosen": -1.8386729955673218, "logps/rejected": -1.7788835763931274, "loss": 3.6937, "rewards/accuracies": 0.5, "rewards/chosen": -18.386730194091797, "rewards/margins": -0.5978950262069702, "rewards/rejected": -17.788835525512695, "step": 2460 }, { "epoch": 0.08308335299470829, "grad_norm": 16.605253219604492, "learning_rate": 8.308055274688237e-07, "logits/chosen": -0.3429441452026367, "logits/rejected": -0.3958896994590759, "logps/chosen": -1.5618867874145508, "logps/rejected": -1.6103063821792603, "loss": 2.7523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.618867874145508, "rewards/margins": 0.4841957986354828, "rewards/rejected": -16.103063583374023, "step": 2465 }, { "epoch": 0.08325187906569147, "grad_norm": 23.05854606628418, "learning_rate": 8.324907313784968e-07, "logits/chosen": -0.5046225786209106, "logits/rejected": -0.3628554344177246, "logps/chosen": -1.6668421030044556, "logps/rejected": -1.7868725061416626, "loss": 3.2138, "rewards/accuracies": 0.5, "rewards/chosen": -16.66842269897461, "rewards/margins": 1.2003029584884644, "rewards/rejected": -17.868724822998047, "step": 2470 }, { "epoch": 0.08342040513667465, "grad_norm": 20.510766983032227, "learning_rate": 8.341759352881699e-07, "logits/chosen": -0.028436947613954544, "logits/rejected": -0.0018680095672607422, "logps/chosen": -1.6618999242782593, "logps/rejected": -1.6381727457046509, "loss": 3.3843, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.618999481201172, "rewards/margins": -0.23727349936962128, "rewards/rejected": -16.38172721862793, "step": 2475 }, { "epoch": 0.08358893120765783, "grad_norm": 29.016969680786133, "learning_rate": 8.358611391978428e-07, "logits/chosen": -0.5356850624084473, "logits/rejected": -0.5231121182441711, "logps/chosen": -1.4784767627716064, "logps/rejected": -1.6118614673614502, "loss": 1.9429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.784765243530273, "rewards/margins": 1.333849310874939, "rewards/rejected": -16.118616104125977, "step": 2480 }, { "epoch": 0.083757457278641, "grad_norm": 24.966922760009766, "learning_rate": 8.375463431075159e-07, "logits/chosen": -0.5175787210464478, "logits/rejected": -0.5980731248855591, "logps/chosen": -1.721757173538208, "logps/rejected": -1.6184993982315063, "loss": 4.1047, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.217571258544922, "rewards/margins": -1.0325804948806763, "rewards/rejected": -16.184993743896484, "step": 2485 }, { "epoch": 0.08392598334962419, "grad_norm": 23.180864334106445, "learning_rate": 8.39231547017189e-07, "logits/chosen": -0.14530567824840546, "logits/rejected": -0.19354048371315002, "logps/chosen": -2.2405307292938232, "logps/rejected": -2.4060721397399902, "loss": 2.3143, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.40530776977539, "rewards/margins": 1.655413031578064, "rewards/rejected": -24.060718536376953, "step": 2490 }, { "epoch": 0.08409450942060737, "grad_norm": 10.979439735412598, "learning_rate": 8.40916750926862e-07, "logits/chosen": -0.44204673171043396, "logits/rejected": -0.440899521112442, "logps/chosen": -1.6555083990097046, "logps/rejected": -1.7864110469818115, "loss": 2.7175, "rewards/accuracies": 0.5, "rewards/chosen": -16.555084228515625, "rewards/margins": 1.309026837348938, "rewards/rejected": -17.864110946655273, "step": 2495 }, { "epoch": 0.08426303549159055, "grad_norm": 23.828073501586914, "learning_rate": 8.426019548365352e-07, "logits/chosen": -0.6088763475418091, "logits/rejected": -0.49291929602622986, "logps/chosen": -1.8682682514190674, "logps/rejected": -1.9699296951293945, "loss": 2.2446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.68268394470215, "rewards/margins": 1.01661217212677, "rewards/rejected": -19.699295043945312, "step": 2500 }, { "epoch": 0.08443156156257373, "grad_norm": 20.06534194946289, "learning_rate": 8.442871587462083e-07, "logits/chosen": -0.7816548347473145, "logits/rejected": -0.6558118462562561, "logps/chosen": -1.7117058038711548, "logps/rejected": -1.7417329549789429, "loss": 2.8461, "rewards/accuracies": 0.5, "rewards/chosen": -17.11705780029297, "rewards/margins": 0.30027294158935547, "rewards/rejected": -17.417329788208008, "step": 2505 }, { "epoch": 0.0846000876335569, "grad_norm": 14.00971794128418, "learning_rate": 8.459723626558814e-07, "logits/chosen": -0.3416903018951416, "logits/rejected": -0.3518048822879791, "logps/chosen": -1.8263881206512451, "logps/rejected": -2.0474984645843506, "loss": 2.6428, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.263883590698242, "rewards/margins": 2.2111012935638428, "rewards/rejected": -20.474987030029297, "step": 2510 }, { "epoch": 0.08476861370454009, "grad_norm": 28.76343536376953, "learning_rate": 8.476575665655544e-07, "logits/chosen": -0.5365201234817505, "logits/rejected": -0.3862677216529846, "logps/chosen": -1.7432008981704712, "logps/rejected": -1.638911485671997, "loss": 4.1144, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.432008743286133, "rewards/margins": -1.0428931713104248, "rewards/rejected": -16.389114379882812, "step": 2515 }, { "epoch": 0.08493713977552328, "grad_norm": 32.587223052978516, "learning_rate": 8.493427704752275e-07, "logits/chosen": -0.852696418762207, "logits/rejected": -0.8922135233879089, "logps/chosen": -1.7877833843231201, "logps/rejected": -1.7858728170394897, "loss": 3.1803, "rewards/accuracies": 0.5, "rewards/chosen": -17.87783432006836, "rewards/margins": -0.019106198102235794, "rewards/rejected": -17.85873031616211, "step": 2520 }, { "epoch": 0.08510566584650646, "grad_norm": 17.61363983154297, "learning_rate": 8.510279743849005e-07, "logits/chosen": -0.689334511756897, "logits/rejected": -0.60210782289505, "logps/chosen": -1.4714277982711792, "logps/rejected": -1.419995903968811, "loss": 3.6069, "rewards/accuracies": 0.5, "rewards/chosen": -14.714277267456055, "rewards/margins": -0.5143192410469055, "rewards/rejected": -14.199956893920898, "step": 2525 }, { "epoch": 0.08527419191748964, "grad_norm": 14.594451904296875, "learning_rate": 8.527131782945736e-07, "logits/chosen": -0.3180989623069763, "logits/rejected": -0.17765101790428162, "logps/chosen": -1.7926902770996094, "logps/rejected": -1.7263141870498657, "loss": 3.9356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.926902770996094, "rewards/margins": -0.6637606024742126, "rewards/rejected": -17.263141632080078, "step": 2530 }, { "epoch": 0.08544271798847282, "grad_norm": 34.54674530029297, "learning_rate": 8.543983822042466e-07, "logits/chosen": -0.6042408347129822, "logits/rejected": -0.649804413318634, "logps/chosen": -1.699530005455017, "logps/rejected": -1.7469911575317383, "loss": 2.9274, "rewards/accuracies": 0.5, "rewards/chosen": -16.99530029296875, "rewards/margins": 0.47461146116256714, "rewards/rejected": -17.469911575317383, "step": 2535 }, { "epoch": 0.085611244059456, "grad_norm": 27.885591506958008, "learning_rate": 8.560835861139197e-07, "logits/chosen": -0.45336025953292847, "logits/rejected": -0.5436762571334839, "logps/chosen": -1.7844282388687134, "logps/rejected": -1.680147409439087, "loss": 4.1847, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.844282150268555, "rewards/margins": -1.0428078174591064, "rewards/rejected": -16.80147361755371, "step": 2540 }, { "epoch": 0.08577977013043918, "grad_norm": 25.23313331604004, "learning_rate": 8.577687900235928e-07, "logits/chosen": -0.7203700542449951, "logits/rejected": -0.5824630260467529, "logps/chosen": -1.8401525020599365, "logps/rejected": -1.9509315490722656, "loss": 2.2659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.40152359008789, "rewards/margins": 1.1077923774719238, "rewards/rejected": -19.50931739807129, "step": 2545 }, { "epoch": 0.08594829620142236, "grad_norm": 44.29021072387695, "learning_rate": 8.594539939332659e-07, "logits/chosen": -0.1677735149860382, "logits/rejected": -0.12232518196105957, "logps/chosen": -2.243950605392456, "logps/rejected": -2.3047237396240234, "loss": 2.9511, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.439504623413086, "rewards/margins": 0.6077313423156738, "rewards/rejected": -23.047237396240234, "step": 2550 }, { "epoch": 0.08611682227240554, "grad_norm": 21.166263580322266, "learning_rate": 8.61139197842939e-07, "logits/chosen": -0.16569176316261292, "logits/rejected": -0.1610005795955658, "logps/chosen": -1.6874868869781494, "logps/rejected": -1.731498122215271, "loss": 2.7449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.87487030029297, "rewards/margins": 0.44011297821998596, "rewards/rejected": -17.314983367919922, "step": 2555 }, { "epoch": 0.08628534834338872, "grad_norm": 60.169769287109375, "learning_rate": 8.628244017526121e-07, "logits/chosen": -0.5680924654006958, "logits/rejected": -0.5643773078918457, "logps/chosen": -1.5531455278396606, "logps/rejected": -1.451002597808838, "loss": 4.0851, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -15.531454086303711, "rewards/margins": -1.0214288234710693, "rewards/rejected": -14.510026931762695, "step": 2560 }, { "epoch": 0.0864538744143719, "grad_norm": 37.528282165527344, "learning_rate": 8.645096056622852e-07, "logits/chosen": -0.3817165791988373, "logits/rejected": -0.1156226173043251, "logps/chosen": -1.930029273033142, "logps/rejected": -2.2233052253723145, "loss": 2.162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.30029296875, "rewards/margins": 2.9327609539031982, "rewards/rejected": -22.23305320739746, "step": 2565 }, { "epoch": 0.08662240048535508, "grad_norm": 21.0026798248291, "learning_rate": 8.661948095719581e-07, "logits/chosen": -0.400460422039032, "logits/rejected": -0.40357083082199097, "logps/chosen": -1.6324317455291748, "logps/rejected": -1.6684566736221313, "loss": 2.8871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.324317932128906, "rewards/margins": 0.3602485656738281, "rewards/rejected": -16.684566497802734, "step": 2570 }, { "epoch": 0.08679092655633827, "grad_norm": 25.365571975708008, "learning_rate": 8.678800134816312e-07, "logits/chosen": -0.7402045130729675, "logits/rejected": -0.7104997038841248, "logps/chosen": -1.5982532501220703, "logps/rejected": -1.785348653793335, "loss": 2.21, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.98253345489502, "rewards/margins": 1.8709545135498047, "rewards/rejected": -17.85348892211914, "step": 2575 }, { "epoch": 0.08695945262732145, "grad_norm": 16.19428253173828, "learning_rate": 8.695652173913043e-07, "logits/chosen": -0.4783341884613037, "logits/rejected": -0.4398323893547058, "logps/chosen": -1.4235103130340576, "logps/rejected": -1.4861732721328735, "loss": 2.5501, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.235102653503418, "rewards/margins": 0.6266301870346069, "rewards/rejected": -14.861734390258789, "step": 2580 }, { "epoch": 0.08712797869830463, "grad_norm": 13.780372619628906, "learning_rate": 8.712504213009773e-07, "logits/chosen": -0.5036159753799438, "logits/rejected": -0.4957659840583801, "logps/chosen": -1.6749789714813232, "logps/rejected": -1.7661399841308594, "loss": 2.2914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.74979019165039, "rewards/margins": 0.911608874797821, "rewards/rejected": -17.66139793395996, "step": 2585 }, { "epoch": 0.08729650476928781, "grad_norm": 25.284282684326172, "learning_rate": 8.729356252106504e-07, "logits/chosen": -0.19299769401550293, "logits/rejected": -0.13180485367774963, "logps/chosen": -1.554852843284607, "logps/rejected": -1.6819696426391602, "loss": 2.5059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.548528671264648, "rewards/margins": 1.271166443824768, "rewards/rejected": -16.8196964263916, "step": 2590 }, { "epoch": 0.08746503084027099, "grad_norm": 61.71205520629883, "learning_rate": 8.746208291203235e-07, "logits/chosen": -0.7792420387268066, "logits/rejected": -0.8336542248725891, "logps/chosen": -1.8205082416534424, "logps/rejected": -1.7581212520599365, "loss": 3.703, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.205081939697266, "rewards/margins": -0.623867392539978, "rewards/rejected": -17.581214904785156, "step": 2595 }, { "epoch": 0.08763355691125417, "grad_norm": 27.993122100830078, "learning_rate": 8.763060330299967e-07, "logits/chosen": -0.2036883383989334, "logits/rejected": -0.21279895305633545, "logps/chosen": -1.7165559530258179, "logps/rejected": -1.8176734447479248, "loss": 2.4624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.165559768676758, "rewards/margins": 1.0111745595932007, "rewards/rejected": -18.176733016967773, "step": 2600 }, { "epoch": 0.08780208298223735, "grad_norm": 16.836488723754883, "learning_rate": 8.779912369396697e-07, "logits/chosen": -0.32075661420822144, "logits/rejected": -0.32733437418937683, "logps/chosen": -1.8509960174560547, "logps/rejected": -1.8140252828598022, "loss": 3.6891, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.509960174560547, "rewards/margins": -0.3697075843811035, "rewards/rejected": -18.1402530670166, "step": 2605 }, { "epoch": 0.08797060905322053, "grad_norm": 29.089462280273438, "learning_rate": 8.796764408493428e-07, "logits/chosen": -0.4722573757171631, "logits/rejected": -0.3574286699295044, "logps/chosen": -1.879150629043579, "logps/rejected": -1.8658632040023804, "loss": 3.4009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.791505813598633, "rewards/margins": -0.13287362456321716, "rewards/rejected": -18.658634185791016, "step": 2610 }, { "epoch": 0.08813913512420371, "grad_norm": 22.84649658203125, "learning_rate": 8.813616447590158e-07, "logits/chosen": -0.1808585822582245, "logits/rejected": -0.3681219816207886, "logps/chosen": -1.8490365743637085, "logps/rejected": -1.804755449295044, "loss": 3.6758, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.490365982055664, "rewards/margins": -0.44281005859375, "rewards/rejected": -18.047555923461914, "step": 2615 }, { "epoch": 0.08830766119518689, "grad_norm": 29.813987731933594, "learning_rate": 8.830468486686888e-07, "logits/chosen": -0.5773710012435913, "logits/rejected": -0.556043267250061, "logps/chosen": -1.7120014429092407, "logps/rejected": -1.75547194480896, "loss": 3.0912, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.120014190673828, "rewards/margins": 0.43470603227615356, "rewards/rejected": -17.554719924926758, "step": 2620 }, { "epoch": 0.08847618726617007, "grad_norm": 23.193584442138672, "learning_rate": 8.847320525783619e-07, "logits/chosen": -0.6107980012893677, "logits/rejected": -0.587156355381012, "logps/chosen": -1.7202411890029907, "logps/rejected": -1.7343101501464844, "loss": 3.0331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.202411651611328, "rewards/margins": 0.14069156348705292, "rewards/rejected": -17.343101501464844, "step": 2625 }, { "epoch": 0.08864471333715326, "grad_norm": 22.517454147338867, "learning_rate": 8.86417256488035e-07, "logits/chosen": -0.5417401790618896, "logits/rejected": -0.4637575149536133, "logps/chosen": -1.7523746490478516, "logps/rejected": -1.8010832071304321, "loss": 2.9477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.523746490478516, "rewards/margins": 0.48708638548851013, "rewards/rejected": -18.010833740234375, "step": 2630 }, { "epoch": 0.08881323940813644, "grad_norm": 22.720060348510742, "learning_rate": 8.881024603977081e-07, "logits/chosen": -0.11410139501094818, "logits/rejected": -0.21287024021148682, "logps/chosen": -1.6555503606796265, "logps/rejected": -1.6463634967803955, "loss": 3.539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.555505752563477, "rewards/margins": -0.09186983108520508, "rewards/rejected": -16.463634490966797, "step": 2635 }, { "epoch": 0.08898176547911962, "grad_norm": 12.026870727539062, "learning_rate": 8.897876643073811e-07, "logits/chosen": -0.39031511545181274, "logits/rejected": -0.45363712310791016, "logps/chosen": -1.5173307657241821, "logps/rejected": -1.8569914102554321, "loss": 2.4449, "rewards/accuracies": 0.5, "rewards/chosen": -15.173307418823242, "rewards/margins": 3.3966071605682373, "rewards/rejected": -18.56991195678711, "step": 2640 }, { "epoch": 0.0891502915501028, "grad_norm": 19.74846649169922, "learning_rate": 8.914728682170542e-07, "logits/chosen": -0.796288788318634, "logits/rejected": -0.7678640484809875, "logps/chosen": -1.6136983633041382, "logps/rejected": -1.503281831741333, "loss": 4.334, "rewards/accuracies": 0.5, "rewards/chosen": -16.136981964111328, "rewards/margins": -1.1041651964187622, "rewards/rejected": -15.032818794250488, "step": 2645 }, { "epoch": 0.08931881762108598, "grad_norm": 10.176506042480469, "learning_rate": 8.931580721267274e-07, "logits/chosen": -0.4079221189022064, "logits/rejected": -0.4823921322822571, "logps/chosen": -1.8552573919296265, "logps/rejected": -1.789438247680664, "loss": 3.9795, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.552574157714844, "rewards/margins": -0.6581940650939941, "rewards/rejected": -17.89438247680664, "step": 2650 }, { "epoch": 0.08948734369206916, "grad_norm": 21.36243438720703, "learning_rate": 8.948432760364005e-07, "logits/chosen": -0.15099112689495087, "logits/rejected": -0.23671016097068787, "logps/chosen": -1.9283297061920166, "logps/rejected": -1.9521408081054688, "loss": 3.5779, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.283296585083008, "rewards/margins": 0.23811152577400208, "rewards/rejected": -19.521408081054688, "step": 2655 }, { "epoch": 0.08965586976305234, "grad_norm": 29.926528930664062, "learning_rate": 8.965284799460734e-07, "logits/chosen": -0.3068148195743561, "logits/rejected": -0.235337495803833, "logps/chosen": -1.8573440313339233, "logps/rejected": -1.7485700845718384, "loss": 4.1967, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.573440551757812, "rewards/margins": -1.0877423286437988, "rewards/rejected": -17.485698699951172, "step": 2660 }, { "epoch": 0.08982439583403552, "grad_norm": 43.94549560546875, "learning_rate": 8.982136838557465e-07, "logits/chosen": -0.5529682040214539, "logits/rejected": -0.49506306648254395, "logps/chosen": -1.6662704944610596, "logps/rejected": -1.6164556741714478, "loss": 3.7571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.662704467773438, "rewards/margins": -0.498150497674942, "rewards/rejected": -16.164554595947266, "step": 2665 }, { "epoch": 0.0899929219050187, "grad_norm": 22.449846267700195, "learning_rate": 8.998988877654196e-07, "logits/chosen": -0.536016583442688, "logits/rejected": -0.5582794547080994, "logps/chosen": -1.6905359029769897, "logps/rejected": -1.8122714757919312, "loss": 2.5696, "rewards/accuracies": 0.5, "rewards/chosen": -16.905357360839844, "rewards/margins": 1.2173573970794678, "rewards/rejected": -18.12271499633789, "step": 2670 }, { "epoch": 0.09016144797600188, "grad_norm": 24.964977264404297, "learning_rate": 9.015840916750926e-07, "logits/chosen": -0.44670265913009644, "logits/rejected": -0.3518041670322418, "logps/chosen": -1.6536505222320557, "logps/rejected": -1.6681900024414062, "loss": 3.0175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.5365047454834, "rewards/margins": 0.14539547264575958, "rewards/rejected": -16.681900024414062, "step": 2675 }, { "epoch": 0.09032997404698506, "grad_norm": 21.91883087158203, "learning_rate": 9.032692955847657e-07, "logits/chosen": -0.5816112756729126, "logits/rejected": -0.5414116978645325, "logps/chosen": -1.976906180381775, "logps/rejected": -1.8742444515228271, "loss": 4.4431, "rewards/accuracies": 0.5, "rewards/chosen": -19.769062042236328, "rewards/margins": -1.0266164541244507, "rewards/rejected": -18.74244499206543, "step": 2680 }, { "epoch": 0.09049850011796826, "grad_norm": 26.940380096435547, "learning_rate": 9.049544994944388e-07, "logits/chosen": -0.17546026408672333, "logits/rejected": -0.1966692954301834, "logps/chosen": -1.948767066001892, "logps/rejected": -2.047886610031128, "loss": 2.2436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.4876708984375, "rewards/margins": 0.9911966323852539, "rewards/rejected": -20.478870391845703, "step": 2685 }, { "epoch": 0.09066702618895144, "grad_norm": 42.87982940673828, "learning_rate": 9.066397034041119e-07, "logits/chosen": -0.42629021406173706, "logits/rejected": -0.31089669466018677, "logps/chosen": -1.7324479818344116, "logps/rejected": -1.7612009048461914, "loss": 3.0846, "rewards/accuracies": 0.5, "rewards/chosen": -17.324478149414062, "rewards/margins": 0.2875285744667053, "rewards/rejected": -17.612009048461914, "step": 2690 }, { "epoch": 0.09083555225993462, "grad_norm": 26.106229782104492, "learning_rate": 9.083249073137849e-07, "logits/chosen": -0.6283289194107056, "logits/rejected": -0.5349553823471069, "logps/chosen": -1.7475484609603882, "logps/rejected": -1.8252407312393188, "loss": 2.7924, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.47548484802246, "rewards/margins": 0.77692049741745, "rewards/rejected": -18.25240707397461, "step": 2695 }, { "epoch": 0.0910040783309178, "grad_norm": 16.301986694335938, "learning_rate": 9.100101112234579e-07, "logits/chosen": -0.37398990988731384, "logits/rejected": -0.3726533353328705, "logps/chosen": -1.613258719444275, "logps/rejected": -1.5538930892944336, "loss": 3.9193, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.132587432861328, "rewards/margins": -0.5936561822891235, "rewards/rejected": -15.538930892944336, "step": 2700 }, { "epoch": 0.09117260440190098, "grad_norm": 25.090686798095703, "learning_rate": 9.116953151331311e-07, "logits/chosen": -0.5178991556167603, "logits/rejected": -0.3978433907032013, "logps/chosen": -1.6178112030029297, "logps/rejected": -1.6006628274917603, "loss": 3.3633, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.17811393737793, "rewards/margins": -0.17148585617542267, "rewards/rejected": -16.006628036499023, "step": 2705 }, { "epoch": 0.09134113047288415, "grad_norm": 49.08612060546875, "learning_rate": 9.133805190428041e-07, "logits/chosen": -0.1910863071680069, "logits/rejected": -0.22724132239818573, "logps/chosen": -2.229879140853882, "logps/rejected": -2.327415943145752, "loss": 2.6912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.29879379272461, "rewards/margins": 0.9753679037094116, "rewards/rejected": -23.274160385131836, "step": 2710 }, { "epoch": 0.09150965654386733, "grad_norm": 11.361995697021484, "learning_rate": 9.150657229524772e-07, "logits/chosen": -0.43085527420043945, "logits/rejected": -0.2916302978992462, "logps/chosen": -1.3857858180999756, "logps/rejected": -1.6563870906829834, "loss": 2.1313, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.857858657836914, "rewards/margins": 2.7060129642486572, "rewards/rejected": -16.563871383666992, "step": 2715 }, { "epoch": 0.09167818261485051, "grad_norm": 23.483665466308594, "learning_rate": 9.167509268621503e-07, "logits/chosen": -0.5284978151321411, "logits/rejected": -0.5280933380126953, "logps/chosen": -1.9413951635360718, "logps/rejected": -2.0041792392730713, "loss": 3.5537, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.413949966430664, "rewards/margins": 0.6278434991836548, "rewards/rejected": -20.041791915893555, "step": 2720 }, { "epoch": 0.0918467086858337, "grad_norm": 21.85501480102539, "learning_rate": 9.184361307718234e-07, "logits/chosen": -0.0768141895532608, "logits/rejected": -0.00690958509221673, "logps/chosen": -1.6710220575332642, "logps/rejected": -1.7174562215805054, "loss": 2.6894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.710220336914062, "rewards/margins": 0.4643425941467285, "rewards/rejected": -17.174564361572266, "step": 2725 }, { "epoch": 0.09201523475681687, "grad_norm": 19.08591651916504, "learning_rate": 9.201213346814964e-07, "logits/chosen": -0.6823151707649231, "logits/rejected": -0.5807539224624634, "logps/chosen": -1.6391655206680298, "logps/rejected": -1.7414779663085938, "loss": 2.4353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.39165687561035, "rewards/margins": 1.0231223106384277, "rewards/rejected": -17.414779663085938, "step": 2730 }, { "epoch": 0.09218376082780005, "grad_norm": 17.085020065307617, "learning_rate": 9.218065385911695e-07, "logits/chosen": -0.1736678183078766, "logits/rejected": -0.0709773451089859, "logps/chosen": -1.9339990615844727, "logps/rejected": -2.009704113006592, "loss": 2.4459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.339990615844727, "rewards/margins": 0.7570520639419556, "rewards/rejected": -20.097042083740234, "step": 2735 }, { "epoch": 0.09235228689878325, "grad_norm": 21.93718147277832, "learning_rate": 9.234917425008426e-07, "logits/chosen": -0.708516001701355, "logits/rejected": -0.536871612071991, "logps/chosen": -1.5185356140136719, "logps/rejected": -1.6370971202850342, "loss": 3.115, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.185358047485352, "rewards/margins": 1.185613989830017, "rewards/rejected": -16.3709716796875, "step": 2740 }, { "epoch": 0.09252081296976643, "grad_norm": 48.83224868774414, "learning_rate": 9.251769464105155e-07, "logits/chosen": -0.39072203636169434, "logits/rejected": -0.432451069355011, "logps/chosen": -1.755963921546936, "logps/rejected": -1.6103007793426514, "loss": 4.5056, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.559640884399414, "rewards/margins": -1.4566314220428467, "rewards/rejected": -16.103008270263672, "step": 2745 }, { "epoch": 0.09268933904074961, "grad_norm": 23.23576545715332, "learning_rate": 9.268621503201886e-07, "logits/chosen": 0.031300973147153854, "logits/rejected": -0.01834370568394661, "logps/chosen": -2.059108257293701, "logps/rejected": -2.0446953773498535, "loss": 3.2485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.591083526611328, "rewards/margins": -0.14412669837474823, "rewards/rejected": -20.44695472717285, "step": 2750 }, { "epoch": 0.09285786511173279, "grad_norm": 11.359932899475098, "learning_rate": 9.285473542298618e-07, "logits/chosen": -0.04536425322294235, "logits/rejected": -0.004798299167305231, "logps/chosen": -1.8216663599014282, "logps/rejected": -1.8901094198226929, "loss": 3.0188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.216663360595703, "rewards/margins": 0.684432327747345, "rewards/rejected": -18.90109634399414, "step": 2755 }, { "epoch": 0.09302639118271597, "grad_norm": 22.040935516357422, "learning_rate": 9.302325581395349e-07, "logits/chosen": -0.3122491240501404, "logits/rejected": -0.3268435597419739, "logps/chosen": -1.8650197982788086, "logps/rejected": -1.7798147201538086, "loss": 3.9723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.650197982788086, "rewards/margins": -0.852049708366394, "rewards/rejected": -17.798147201538086, "step": 2760 }, { "epoch": 0.09319491725369915, "grad_norm": 22.49321746826172, "learning_rate": 9.319177620492079e-07, "logits/chosen": -0.507947564125061, "logits/rejected": -0.5531247854232788, "logps/chosen": -1.662655234336853, "logps/rejected": -1.6844278573989868, "loss": 3.5314, "rewards/accuracies": 0.5, "rewards/chosen": -16.62655258178711, "rewards/margins": 0.21772536635398865, "rewards/rejected": -16.84427833557129, "step": 2765 }, { "epoch": 0.09336344332468233, "grad_norm": 16.84610366821289, "learning_rate": 9.33602965958881e-07, "logits/chosen": -0.6405504941940308, "logits/rejected": -0.5383267402648926, "logps/chosen": -1.588544487953186, "logps/rejected": -1.6625516414642334, "loss": 2.5218, "rewards/accuracies": 0.5, "rewards/chosen": -15.885442733764648, "rewards/margins": 0.7400724291801453, "rewards/rejected": -16.625516891479492, "step": 2770 }, { "epoch": 0.0935319693956655, "grad_norm": 29.287565231323242, "learning_rate": 9.352881698685541e-07, "logits/chosen": -0.10549436509609222, "logits/rejected": -0.18245017528533936, "logps/chosen": -1.8676893711090088, "logps/rejected": -1.814923882484436, "loss": 3.5887, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.676895141601562, "rewards/margins": -0.5276561975479126, "rewards/rejected": -18.14923667907715, "step": 2775 }, { "epoch": 0.09370049546664869, "grad_norm": 35.53804397583008, "learning_rate": 9.369733737782271e-07, "logits/chosen": -0.9606320261955261, "logits/rejected": -0.9500244855880737, "logps/chosen": -1.4875165224075317, "logps/rejected": -1.488067865371704, "loss": 3.1073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.875162124633789, "rewards/margins": 0.00551605224609375, "rewards/rejected": -14.8806791305542, "step": 2780 }, { "epoch": 0.09386902153763187, "grad_norm": 24.10531997680664, "learning_rate": 9.386585776879002e-07, "logits/chosen": -0.43831610679626465, "logits/rejected": -0.43352681398391724, "logps/chosen": -1.8498563766479492, "logps/rejected": -1.868779182434082, "loss": 3.2661, "rewards/accuracies": 0.5, "rewards/chosen": -18.498563766479492, "rewards/margins": 0.1892286241054535, "rewards/rejected": -18.687793731689453, "step": 2785 }, { "epoch": 0.09403754760861505, "grad_norm": 25.974620819091797, "learning_rate": 9.403437815975732e-07, "logits/chosen": -0.5580755472183228, "logits/rejected": -0.46703624725341797, "logps/chosen": -1.6518490314483643, "logps/rejected": -1.5979589223861694, "loss": 3.5922, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.518489837646484, "rewards/margins": -0.5388990640640259, "rewards/rejected": -15.979589462280273, "step": 2790 }, { "epoch": 0.09420607367959824, "grad_norm": 38.141876220703125, "learning_rate": 9.420289855072463e-07, "logits/chosen": -0.687545657157898, "logits/rejected": -0.7488195896148682, "logps/chosen": -1.834449052810669, "logps/rejected": -1.693974256515503, "loss": 4.435, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.344486236572266, "rewards/margins": -1.4047452211380005, "rewards/rejected": -16.939743041992188, "step": 2795 }, { "epoch": 0.09437459975058142, "grad_norm": 33.917057037353516, "learning_rate": 9.437141894169193e-07, "logits/chosen": -0.48264870047569275, "logits/rejected": -0.4806605279445648, "logps/chosen": -1.8548141717910767, "logps/rejected": -1.8633720874786377, "loss": 3.0591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.548139572143555, "rewards/margins": 0.08558006584644318, "rewards/rejected": -18.633718490600586, "step": 2800 }, { "epoch": 0.09437459975058142, "eval_logits/chosen": -0.725313127040863, "eval_logits/rejected": -0.7294741272926331, "eval_logps/chosen": -1.668626070022583, "eval_logps/rejected": -1.6697936058044434, "eval_loss": 3.416093349456787, "eval_rewards/accuracies": 0.46000000834465027, "eval_rewards/chosen": -16.686260223388672, "eval_rewards/margins": 0.01167456153780222, "eval_rewards/rejected": -16.69793701171875, "eval_runtime": 12.897, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.938, "step": 2800 }, { "epoch": 0.0945431258215646, "grad_norm": 25.618377685546875, "learning_rate": 9.453993933265925e-07, "logits/chosen": -0.6152405738830566, "logits/rejected": -0.6342421174049377, "logps/chosen": -1.9061603546142578, "logps/rejected": -2.1016743183135986, "loss": 1.5141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.061603546142578, "rewards/margins": 1.9551403522491455, "rewards/rejected": -21.016742706298828, "step": 2805 }, { "epoch": 0.09471165189254778, "grad_norm": 20.979740142822266, "learning_rate": 9.470845972362656e-07, "logits/chosen": -0.17542439699172974, "logits/rejected": -0.19594234228134155, "logps/chosen": -1.8021949529647827, "logps/rejected": -1.939035415649414, "loss": 2.3183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.02195167541504, "rewards/margins": 1.3684017658233643, "rewards/rejected": -19.390352249145508, "step": 2810 }, { "epoch": 0.09488017796353096, "grad_norm": 37.135013580322266, "learning_rate": 9.487698011459387e-07, "logits/chosen": -0.8760625123977661, "logits/rejected": -0.9745880365371704, "logps/chosen": -1.827471137046814, "logps/rejected": -1.8833777904510498, "loss": 2.6297, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.27471160888672, "rewards/margins": 0.5590680837631226, "rewards/rejected": -18.83378028869629, "step": 2815 }, { "epoch": 0.09504870403451414, "grad_norm": 16.353891372680664, "learning_rate": 9.504550050556117e-07, "logits/chosen": -0.4122091233730316, "logits/rejected": -0.5152057409286499, "logps/chosen": -1.6709582805633545, "logps/rejected": -1.8144867420196533, "loss": 2.1599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.709585189819336, "rewards/margins": 1.4352830648422241, "rewards/rejected": -18.144866943359375, "step": 2820 }, { "epoch": 0.09521723010549732, "grad_norm": 36.70476531982422, "learning_rate": 9.521402089652848e-07, "logits/chosen": -0.32984694838523865, "logits/rejected": -0.23229286074638367, "logps/chosen": -2.0945968627929688, "logps/rejected": -2.1234512329101562, "loss": 3.7133, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.945966720581055, "rewards/margins": 0.2885432243347168, "rewards/rejected": -21.234512329101562, "step": 2825 }, { "epoch": 0.0953857561764805, "grad_norm": 33.7508544921875, "learning_rate": 9.538254128749579e-07, "logits/chosen": -0.28869864344596863, "logits/rejected": -0.32969799637794495, "logps/chosen": -1.8106577396392822, "logps/rejected": -1.9228843450546265, "loss": 2.9404, "rewards/accuracies": 0.5, "rewards/chosen": -18.106576919555664, "rewards/margins": 1.1222679615020752, "rewards/rejected": -19.228845596313477, "step": 2830 }, { "epoch": 0.09555428224746368, "grad_norm": 23.599029541015625, "learning_rate": 9.55510616784631e-07, "logits/chosen": -0.3256445527076721, "logits/rejected": -0.3391716778278351, "logps/chosen": -1.7188100814819336, "logps/rejected": -1.7335550785064697, "loss": 3.0343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.188098907470703, "rewards/margins": 0.14744806289672852, "rewards/rejected": -17.335548400878906, "step": 2835 }, { "epoch": 0.09572280831844686, "grad_norm": 38.509002685546875, "learning_rate": 9.57195820694304e-07, "logits/chosen": -0.38658618927001953, "logits/rejected": -0.15633238852024078, "logps/chosen": -2.1311075687408447, "logps/rejected": -2.3841655254364014, "loss": 3.1981, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.31107521057129, "rewards/margins": 2.530580997467041, "rewards/rejected": -23.841655731201172, "step": 2840 }, { "epoch": 0.09589133438943004, "grad_norm": 27.05527114868164, "learning_rate": 9.58881024603977e-07, "logits/chosen": -0.818171501159668, "logits/rejected": -0.6529834866523743, "logps/chosen": -1.4707438945770264, "logps/rejected": -1.6013038158416748, "loss": 3.1145, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.707438468933105, "rewards/margins": 1.3056000471115112, "rewards/rejected": -16.01304054260254, "step": 2845 }, { "epoch": 0.09605986046041323, "grad_norm": 24.46522331237793, "learning_rate": 9.605662285136502e-07, "logits/chosen": -0.5285354852676392, "logits/rejected": -0.6483272314071655, "logps/chosen": -1.71415114402771, "logps/rejected": -1.738390326499939, "loss": 2.9818, "rewards/accuracies": 0.5, "rewards/chosen": -17.141511917114258, "rewards/margins": 0.24239292740821838, "rewards/rejected": -17.3839054107666, "step": 2850 }, { "epoch": 0.09622838653139641, "grad_norm": 26.20121955871582, "learning_rate": 9.622514324233232e-07, "logits/chosen": -0.48643478751182556, "logits/rejected": -0.45796099305152893, "logps/chosen": -1.6284525394439697, "logps/rejected": -1.654547929763794, "loss": 3.0792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.284526824951172, "rewards/margins": 0.2609531283378601, "rewards/rejected": -16.54547691345215, "step": 2855 }, { "epoch": 0.09639691260237959, "grad_norm": 38.97121047973633, "learning_rate": 9.639366363329963e-07, "logits/chosen": -0.18868876993656158, "logits/rejected": -0.110798180103302, "logps/chosen": -2.010000705718994, "logps/rejected": -1.9542725086212158, "loss": 3.6253, "rewards/accuracies": 0.5, "rewards/chosen": -20.100006103515625, "rewards/margins": -0.5572818517684937, "rewards/rejected": -19.542726516723633, "step": 2860 }, { "epoch": 0.09656543867336277, "grad_norm": 24.709421157836914, "learning_rate": 9.656218402426694e-07, "logits/chosen": -0.7445005178451538, "logits/rejected": -0.7172940373420715, "logps/chosen": -1.8106021881103516, "logps/rejected": -1.9134094715118408, "loss": 2.2706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.106021881103516, "rewards/margins": 1.0280708074569702, "rewards/rejected": -19.134092330932617, "step": 2865 }, { "epoch": 0.09673396474434595, "grad_norm": 13.824490547180176, "learning_rate": 9.673070441523424e-07, "logits/chosen": -0.7275068163871765, "logits/rejected": -0.7243115305900574, "logps/chosen": -1.5358262062072754, "logps/rejected": -1.5569114685058594, "loss": 2.9848, "rewards/accuracies": 0.5, "rewards/chosen": -15.35826301574707, "rewards/margins": 0.21085242927074432, "rewards/rejected": -15.569114685058594, "step": 2870 }, { "epoch": 0.09690249081532913, "grad_norm": 22.991600036621094, "learning_rate": 9.689922480620153e-07, "logits/chosen": -0.4551324248313904, "logits/rejected": -0.4542100429534912, "logps/chosen": -1.6309089660644531, "logps/rejected": -1.6141412258148193, "loss": 3.4348, "rewards/accuracies": 0.5, "rewards/chosen": -16.30908966064453, "rewards/margins": -0.16767773032188416, "rewards/rejected": -16.14141273498535, "step": 2875 }, { "epoch": 0.09707101688631231, "grad_norm": 40.251258850097656, "learning_rate": 9.706774519716886e-07, "logits/chosen": -0.30819210410118103, "logits/rejected": -0.15154561400413513, "logps/chosen": -1.7786144018173218, "logps/rejected": -1.8767932653427124, "loss": 2.6402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.786144256591797, "rewards/margins": 0.9817888140678406, "rewards/rejected": -18.767932891845703, "step": 2880 }, { "epoch": 0.09723954295729549, "grad_norm": 16.79170799255371, "learning_rate": 9.723626558813617e-07, "logits/chosen": -0.3624119758605957, "logits/rejected": -0.5596259832382202, "logps/chosen": -1.7782881259918213, "logps/rejected": -1.7136462926864624, "loss": 3.736, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.782880783081055, "rewards/margins": -0.6464160680770874, "rewards/rejected": -17.136465072631836, "step": 2885 }, { "epoch": 0.09740806902827867, "grad_norm": 53.098182678222656, "learning_rate": 9.740478597910347e-07, "logits/chosen": -0.5692394375801086, "logits/rejected": -0.6051616072654724, "logps/chosen": -2.1128344535827637, "logps/rejected": -2.038572311401367, "loss": 3.8688, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.128345489501953, "rewards/margins": -0.742621123790741, "rewards/rejected": -20.385725021362305, "step": 2890 }, { "epoch": 0.09757659509926185, "grad_norm": 2.4433932304382324, "learning_rate": 9.757330637007078e-07, "logits/chosen": -0.10106615722179413, "logits/rejected": -0.12207716703414917, "logps/chosen": -2.044761896133423, "logps/rejected": -2.2555930614471436, "loss": 2.3954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.447616577148438, "rewards/margins": 2.1083126068115234, "rewards/rejected": -22.55592918395996, "step": 2895 }, { "epoch": 0.09774512117024503, "grad_norm": 43.901123046875, "learning_rate": 9.774182676103809e-07, "logits/chosen": -0.17931941151618958, "logits/rejected": -0.30949029326438904, "logps/chosen": -2.0980095863342285, "logps/rejected": -2.07295823097229, "loss": 3.3587, "rewards/accuracies": 0.5, "rewards/chosen": -20.980093002319336, "rewards/margins": -0.2505127787590027, "rewards/rejected": -20.72957992553711, "step": 2900 }, { "epoch": 0.09791364724122822, "grad_norm": 14.931760787963867, "learning_rate": 9.79103471520054e-07, "logits/chosen": -0.5947299003601074, "logits/rejected": -0.7893961668014526, "logps/chosen": -1.727439522743225, "logps/rejected": -1.5623410940170288, "loss": 4.8803, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.27439308166504, "rewards/margins": -1.6509826183319092, "rewards/rejected": -15.623411178588867, "step": 2905 }, { "epoch": 0.0980821733122114, "grad_norm": 19.381126403808594, "learning_rate": 9.80788675429727e-07, "logits/chosen": -0.5605853796005249, "logits/rejected": -0.49684804677963257, "logps/chosen": -1.5796529054641724, "logps/rejected": -1.7275539636611938, "loss": 2.1057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.796528816223145, "rewards/margins": 1.4790114164352417, "rewards/rejected": -17.27553939819336, "step": 2910 }, { "epoch": 0.09825069938319458, "grad_norm": 28.500883102416992, "learning_rate": 9.824738793394e-07, "logits/chosen": -0.6147949695587158, "logits/rejected": -0.8111165165901184, "logps/chosen": -1.7637847661972046, "logps/rejected": -1.633888840675354, "loss": 4.3361, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -17.637847900390625, "rewards/margins": -1.2989604473114014, "rewards/rejected": -16.338886260986328, "step": 2915 }, { "epoch": 0.09841922545417776, "grad_norm": 20.428529739379883, "learning_rate": 9.84159083249073e-07, "logits/chosen": -0.2776259779930115, "logits/rejected": -0.18497855961322784, "logps/chosen": -2.1334004402160645, "logps/rejected": -2.049607992172241, "loss": 4.0409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.334003448486328, "rewards/margins": -0.8379223942756653, "rewards/rejected": -20.49608039855957, "step": 2920 }, { "epoch": 0.09858775152516094, "grad_norm": 26.86744499206543, "learning_rate": 9.85844287158746e-07, "logits/chosen": -0.6018735766410828, "logits/rejected": -0.670646071434021, "logps/chosen": -1.6858047246932983, "logps/rejected": -1.7887957096099854, "loss": 2.9202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.858049392700195, "rewards/margins": 1.0299079418182373, "rewards/rejected": -17.887958526611328, "step": 2925 }, { "epoch": 0.09875627759614412, "grad_norm": 52.686256408691406, "learning_rate": 9.875294910684193e-07, "logits/chosen": -0.314828097820282, "logits/rejected": -0.24025221168994904, "logps/chosen": -1.9489824771881104, "logps/rejected": -2.161886215209961, "loss": 2.3532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.489826202392578, "rewards/margins": 2.1290364265441895, "rewards/rejected": -21.61886215209961, "step": 2930 }, { "epoch": 0.0989248036671273, "grad_norm": 25.790864944458008, "learning_rate": 9.892146949780924e-07, "logits/chosen": -0.5112024545669556, "logits/rejected": -0.5521891713142395, "logps/chosen": -1.707360029220581, "logps/rejected": -1.6799137592315674, "loss": 3.4054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.073598861694336, "rewards/margins": -0.27446356415748596, "rewards/rejected": -16.799137115478516, "step": 2935 }, { "epoch": 0.09909332973811048, "grad_norm": 35.845943450927734, "learning_rate": 9.908998988877655e-07, "logits/chosen": -0.46860605478286743, "logits/rejected": -0.6842837929725647, "logps/chosen": -1.6268116235733032, "logps/rejected": -1.5220801830291748, "loss": 4.1136, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.268115997314453, "rewards/margins": -1.047314167022705, "rewards/rejected": -15.220802307128906, "step": 2940 }, { "epoch": 0.09926185580909366, "grad_norm": 19.779817581176758, "learning_rate": 9.925851027974385e-07, "logits/chosen": -0.53152996301651, "logits/rejected": -0.4204404950141907, "logps/chosen": -1.6282031536102295, "logps/rejected": -1.6569904088974, "loss": 3.1771, "rewards/accuracies": 0.5, "rewards/chosen": -16.282032012939453, "rewards/margins": 0.28787460923194885, "rewards/rejected": -16.569904327392578, "step": 2945 }, { "epoch": 0.09943038188007684, "grad_norm": 23.035404205322266, "learning_rate": 9.942703067071116e-07, "logits/chosen": -0.6475009322166443, "logits/rejected": -0.4998777508735657, "logps/chosen": -2.263774871826172, "logps/rejected": -2.4253833293914795, "loss": 2.4489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.63774871826172, "rewards/margins": 1.6160846948623657, "rewards/rejected": -24.253833770751953, "step": 2950 }, { "epoch": 0.09959890795106002, "grad_norm": 15.458049774169922, "learning_rate": 9.959555106167847e-07, "logits/chosen": -0.44105878472328186, "logits/rejected": -0.5367931127548218, "logps/chosen": -1.7834441661834717, "logps/rejected": -1.888159990310669, "loss": 2.3057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.834440231323242, "rewards/margins": 1.04715895652771, "rewards/rejected": -18.8815975189209, "step": 2955 }, { "epoch": 0.09976743402204322, "grad_norm": 30.391836166381836, "learning_rate": 9.976407145264577e-07, "logits/chosen": -0.3933469355106354, "logits/rejected": -0.35893380641937256, "logps/chosen": -1.749251365661621, "logps/rejected": -1.7169334888458252, "loss": 3.4411, "rewards/accuracies": 0.5, "rewards/chosen": -17.49251365661621, "rewards/margins": -0.32318076491355896, "rewards/rejected": -17.16933250427246, "step": 2960 }, { "epoch": 0.0999359600930264, "grad_norm": 26.183565139770508, "learning_rate": 9.993259184361306e-07, "logits/chosen": -0.5088056325912476, "logits/rejected": -0.49937066435813904, "logps/chosen": -1.6994832754135132, "logps/rejected": -1.613231897354126, "loss": 4.2055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.99483299255371, "rewards/margins": -0.8625134229660034, "rewards/rejected": -16.132320404052734, "step": 2965 }, { "epoch": 0.10010448616400958, "grad_norm": 32.271846771240234, "learning_rate": 9.999999688545453e-07, "logits/chosen": -0.33978405594825745, "logits/rejected": -0.1545068919658661, "logps/chosen": -1.9441852569580078, "logps/rejected": -1.9940448999404907, "loss": 2.7681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.441852569580078, "rewards/margins": 0.4985966682434082, "rewards/rejected": -19.940448760986328, "step": 2970 }, { "epoch": 0.10027301223499276, "grad_norm": 24.9840145111084, "learning_rate": 9.99999778521225e-07, "logits/chosen": -1.09787118434906, "logits/rejected": -1.0583521127700806, "logps/chosen": -1.7992699146270752, "logps/rejected": -1.7420806884765625, "loss": 3.9549, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.992698669433594, "rewards/margins": -0.5718933939933777, "rewards/rejected": -17.420804977416992, "step": 2975 }, { "epoch": 0.10044153830597594, "grad_norm": 14.769031524658203, "learning_rate": 9.999994151576805e-07, "logits/chosen": -1.1787911653518677, "logits/rejected": -1.0617015361785889, "logps/chosen": -1.6846939325332642, "logps/rejected": -1.791347861289978, "loss": 2.3642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.84693717956543, "rewards/margins": 1.0665404796600342, "rewards/rejected": -17.91347885131836, "step": 2980 }, { "epoch": 0.10061006437695912, "grad_norm": 31.43277359008789, "learning_rate": 9.999988787640376e-07, "logits/chosen": -0.07004846632480621, "logits/rejected": -0.003652901854366064, "logps/chosen": -1.5951316356658936, "logps/rejected": -1.7138687372207642, "loss": 2.2606, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.951314926147461, "rewards/margins": 1.1873712539672852, "rewards/rejected": -17.138687133789062, "step": 2985 }, { "epoch": 0.1007785904479423, "grad_norm": 17.785526275634766, "learning_rate": 9.99998169340482e-07, "logits/chosen": 0.04601895064115524, "logits/rejected": -0.05084504559636116, "logps/chosen": -1.6214570999145508, "logps/rejected": -1.5676156282424927, "loss": 3.6376, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.21457290649414, "rewards/margins": -0.5384153127670288, "rewards/rejected": -15.676156997680664, "step": 2990 }, { "epoch": 0.10094711651892548, "grad_norm": 24.750946044921875, "learning_rate": 9.99997286887259e-07, "logits/chosen": -0.40296635031700134, "logits/rejected": -0.32598623633384705, "logps/chosen": -1.5744359493255615, "logps/rejected": -1.5301647186279297, "loss": 3.6044, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.744359970092773, "rewards/margins": -0.44271326065063477, "rewards/rejected": -15.30164623260498, "step": 2995 }, { "epoch": 0.10111564258990866, "grad_norm": 22.85517692565918, "learning_rate": 9.999962314046742e-07, "logits/chosen": -0.6072363257408142, "logits/rejected": -0.6490095853805542, "logps/chosen": -1.840370535850525, "logps/rejected": -1.7861398458480835, "loss": 3.6313, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.403705596923828, "rewards/margins": -0.5423071980476379, "rewards/rejected": -17.861400604248047, "step": 3000 }, { "epoch": 0.10128416866089184, "grad_norm": 16.391801834106445, "learning_rate": 9.999950028930927e-07, "logits/chosen": -0.6177361011505127, "logits/rejected": -0.44297394156455994, "logps/chosen": -1.480201005935669, "logps/rejected": -1.643811821937561, "loss": 3.065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.802009582519531, "rewards/margins": 1.6361064910888672, "rewards/rejected": -16.43811798095703, "step": 3005 }, { "epoch": 0.10145269473187501, "grad_norm": 7.250625133514404, "learning_rate": 9.9999360135294e-07, "logits/chosen": -0.35616278648376465, "logits/rejected": -0.4137846529483795, "logps/chosen": -2.107487678527832, "logps/rejected": -2.161532163619995, "loss": 3.3089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.074878692626953, "rewards/margins": 0.5404449701309204, "rewards/rejected": -21.61532211303711, "step": 3010 }, { "epoch": 0.10162122080285821, "grad_norm": 25.29311752319336, "learning_rate": 9.999920267847007e-07, "logits/chosen": -0.0703842043876648, "logits/rejected": -0.06832405179738998, "logps/chosen": -2.167907238006592, "logps/rejected": -1.8903629779815674, "loss": 5.8812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.679073333740234, "rewards/margins": -2.775441884994507, "rewards/rejected": -18.90363121032715, "step": 3015 }, { "epoch": 0.10178974687384139, "grad_norm": 17.77655792236328, "learning_rate": 9.999902791889196e-07, "logits/chosen": -0.3733082711696625, "logits/rejected": -0.39694738388061523, "logps/chosen": -1.7270221710205078, "logps/rejected": -1.724534273147583, "loss": 3.1331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.270221710205078, "rewards/margins": -0.02487936057150364, "rewards/rejected": -17.245342254638672, "step": 3020 }, { "epoch": 0.10195827294482457, "grad_norm": 15.479154586791992, "learning_rate": 9.999883585662018e-07, "logits/chosen": -0.6984944343566895, "logits/rejected": -0.6338313817977905, "logps/chosen": -1.5714912414550781, "logps/rejected": -1.6366016864776611, "loss": 2.7082, "rewards/accuracies": 0.5, "rewards/chosen": -15.714914321899414, "rewards/margins": 0.6511033177375793, "rewards/rejected": -16.366016387939453, "step": 3025 }, { "epoch": 0.10212679901580775, "grad_norm": 20.5145263671875, "learning_rate": 9.99986264917212e-07, "logits/chosen": -0.4067594110965729, "logits/rejected": -0.3656831383705139, "logps/chosen": -1.598494529724121, "logps/rejected": -1.6863136291503906, "loss": 2.3492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.984945297241211, "rewards/margins": 0.878189206123352, "rewards/rejected": -16.863134384155273, "step": 3030 }, { "epoch": 0.10229532508679093, "grad_norm": 22.588560104370117, "learning_rate": 9.999839982426744e-07, "logits/chosen": -0.7191510796546936, "logits/rejected": -0.6838647127151489, "logps/chosen": -1.9661766290664673, "logps/rejected": -2.049471616744995, "loss": 2.5507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.661766052246094, "rewards/margins": 0.832950234413147, "rewards/rejected": -20.49471664428711, "step": 3035 }, { "epoch": 0.10246385115777411, "grad_norm": 19.967512130737305, "learning_rate": 9.99981558543374e-07, "logits/chosen": -0.6296879053115845, "logits/rejected": -0.5854636430740356, "logps/chosen": -2.0275909900665283, "logps/rejected": -1.940290093421936, "loss": 3.9449, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.275909423828125, "rewards/margins": -0.8730102777481079, "rewards/rejected": -19.40289878845215, "step": 3040 }, { "epoch": 0.10263237722875729, "grad_norm": 27.079069137573242, "learning_rate": 9.999789458201542e-07, "logits/chosen": -0.19687362015247345, "logits/rejected": -0.28982409834861755, "logps/chosen": -1.9375187158584595, "logps/rejected": -1.863250494003296, "loss": 3.814, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.375186920166016, "rewards/margins": -0.7426818013191223, "rewards/rejected": -18.632503509521484, "step": 3045 }, { "epoch": 0.10280090329974047, "grad_norm": 89.50406646728516, "learning_rate": 9.999761600739198e-07, "logits/chosen": -0.19124934077262878, "logits/rejected": -0.32821229100227356, "logps/chosen": -2.2497265338897705, "logps/rejected": -2.3240628242492676, "loss": 2.759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.49726676940918, "rewards/margins": 0.743362545967102, "rewards/rejected": -23.24062728881836, "step": 3050 }, { "epoch": 0.10296942937072365, "grad_norm": 35.753177642822266, "learning_rate": 9.999732013056347e-07, "logits/chosen": -0.2083957940340042, "logits/rejected": -0.24470682442188263, "logps/chosen": -1.7198501825332642, "logps/rejected": -1.783496618270874, "loss": 2.8293, "rewards/accuracies": 0.5, "rewards/chosen": -17.198501586914062, "rewards/margins": 0.6364647746086121, "rewards/rejected": -17.8349666595459, "step": 3055 }, { "epoch": 0.10313795544170683, "grad_norm": 24.368762969970703, "learning_rate": 9.999700695163228e-07, "logits/chosen": -0.6474219560623169, "logits/rejected": -0.7258163690567017, "logps/chosen": -1.804152488708496, "logps/rejected": -1.6962175369262695, "loss": 4.1681, "rewards/accuracies": 0.5, "rewards/chosen": -18.04152488708496, "rewards/margins": -1.0793492794036865, "rewards/rejected": -16.962177276611328, "step": 3060 }, { "epoch": 0.10330648151269001, "grad_norm": 23.3294677734375, "learning_rate": 9.999667647070678e-07, "logits/chosen": -0.4061599671840668, "logits/rejected": -0.3327089846134186, "logps/chosen": -1.9971929788589478, "logps/rejected": -1.8557159900665283, "loss": 4.6338, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.971927642822266, "rewards/margins": -1.4147707223892212, "rewards/rejected": -18.557159423828125, "step": 3065 }, { "epoch": 0.1034750075836732, "grad_norm": 15.371095657348633, "learning_rate": 9.999632868790135e-07, "logits/chosen": -0.7014227509498596, "logits/rejected": -0.7013593316078186, "logps/chosen": -1.5081361532211304, "logps/rejected": -1.695508360862732, "loss": 2.0241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.0813627243042, "rewards/margins": 1.8737218379974365, "rewards/rejected": -16.9550838470459, "step": 3070 }, { "epoch": 0.10364353365465638, "grad_norm": 24.77459144592285, "learning_rate": 9.999596360333634e-07, "logits/chosen": -0.39553630352020264, "logits/rejected": -0.31712251901626587, "logps/chosen": -1.7771854400634766, "logps/rejected": -1.7988437414169312, "loss": 2.9031, "rewards/accuracies": 0.5, "rewards/chosen": -17.7718563079834, "rewards/margins": 0.21658353507518768, "rewards/rejected": -17.988439559936523, "step": 3075 }, { "epoch": 0.10381205972563956, "grad_norm": 22.489713668823242, "learning_rate": 9.99955812171381e-07, "logits/chosen": -0.37975651025772095, "logits/rejected": -0.3504462242126465, "logps/chosen": -1.5920625925064087, "logps/rejected": -1.608381986618042, "loss": 3.0138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.920623779296875, "rewards/margins": 0.16319552063941956, "rewards/rejected": -16.083820343017578, "step": 3080 }, { "epoch": 0.10398058579662274, "grad_norm": 21.4431095123291, "learning_rate": 9.999518152943892e-07, "logits/chosen": -0.3899223804473877, "logits/rejected": -0.5776330232620239, "logps/chosen": -1.7637507915496826, "logps/rejected": -1.8772176504135132, "loss": 2.3654, "rewards/accuracies": 0.5, "rewards/chosen": -17.637508392333984, "rewards/margins": 1.1346690654754639, "rewards/rejected": -18.772174835205078, "step": 3085 }, { "epoch": 0.10414911186760592, "grad_norm": 35.68021774291992, "learning_rate": 9.999476454037716e-07, "logits/chosen": -0.23362183570861816, "logits/rejected": -0.2220069169998169, "logps/chosen": -1.9831234216690063, "logps/rejected": -1.9658180475234985, "loss": 3.2693, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.83123207092285, "rewards/margins": -0.17305298149585724, "rewards/rejected": -19.658180236816406, "step": 3090 }, { "epoch": 0.1043176379385891, "grad_norm": 13.99560546875, "learning_rate": 9.99943302500971e-07, "logits/chosen": -0.1623246967792511, "logits/rejected": -0.0977829098701477, "logps/chosen": -1.9267008304595947, "logps/rejected": -2.038973093032837, "loss": 2.4041, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.26700782775879, "rewards/margins": 1.1227233409881592, "rewards/rejected": -20.389732360839844, "step": 3095 }, { "epoch": 0.10448616400957228, "grad_norm": 34.67253494262695, "learning_rate": 9.999387865874904e-07, "logits/chosen": -0.31366387009620667, "logits/rejected": -0.18402309715747833, "logps/chosen": -1.8098551034927368, "logps/rejected": -1.8108694553375244, "loss": 3.2052, "rewards/accuracies": 0.5, "rewards/chosen": -18.09855079650879, "rewards/margins": 0.010143804363906384, "rewards/rejected": -18.10869598388672, "step": 3100 }, { "epoch": 0.10465469008055546, "grad_norm": 24.195720672607422, "learning_rate": 9.999340976648928e-07, "logits/chosen": -0.8281705975532532, "logits/rejected": -0.8121271133422852, "logps/chosen": -1.3327033519744873, "logps/rejected": -1.316989541053772, "loss": 3.2535, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.327033996582031, "rewards/margins": -0.1571389138698578, "rewards/rejected": -13.169894218444824, "step": 3105 }, { "epoch": 0.10482321615153864, "grad_norm": 6.956542491912842, "learning_rate": 9.999292357348005e-07, "logits/chosen": -0.477001428604126, "logits/rejected": -0.32476919889450073, "logps/chosen": -1.6057878732681274, "logps/rejected": -1.7301912307739258, "loss": 2.7141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.057878494262695, "rewards/margins": 1.2440357208251953, "rewards/rejected": -17.30191421508789, "step": 3110 }, { "epoch": 0.10499174222252182, "grad_norm": 15.775235176086426, "learning_rate": 9.99924200798896e-07, "logits/chosen": -0.29664766788482666, "logits/rejected": -0.24033299088478088, "logps/chosen": -1.8023223876953125, "logps/rejected": -1.7808622121810913, "loss": 3.3402, "rewards/accuracies": 0.5, "rewards/chosen": -18.023223876953125, "rewards/margins": -0.21460160613059998, "rewards/rejected": -17.808622360229492, "step": 3115 }, { "epoch": 0.105160268293505, "grad_norm": 27.72269058227539, "learning_rate": 9.999189928589217e-07, "logits/chosen": -0.45769166946411133, "logits/rejected": -0.4425771236419678, "logps/chosen": -1.8008079528808594, "logps/rejected": -1.8088890314102173, "loss": 3.0318, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.008079528808594, "rewards/margins": 0.0808115005493164, "rewards/rejected": -18.08888816833496, "step": 3120 }, { "epoch": 0.10532879436448819, "grad_norm": 32.695072174072266, "learning_rate": 9.999136119166803e-07, "logits/chosen": -0.2397170513868332, "logits/rejected": -0.1902095526456833, "logps/chosen": -1.861707091331482, "logps/rejected": -1.9670108556747437, "loss": 2.7405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.617069244384766, "rewards/margins": 1.0530370473861694, "rewards/rejected": -19.670108795166016, "step": 3125 }, { "epoch": 0.10549732043547137, "grad_norm": 32.09098815917969, "learning_rate": 9.999080579740335e-07, "logits/chosen": -0.2794944643974304, "logits/rejected": -0.15733040869235992, "logps/chosen": -1.518586277961731, "logps/rejected": -1.4439319372177124, "loss": 3.7903, "rewards/accuracies": 0.5, "rewards/chosen": -15.18586254119873, "rewards/margins": -0.7465431094169617, "rewards/rejected": -14.43932056427002, "step": 3130 }, { "epoch": 0.10566584650645455, "grad_norm": 35.430084228515625, "learning_rate": 9.999023310329032e-07, "logits/chosen": -0.39965033531188965, "logits/rejected": -0.32974153757095337, "logps/chosen": -1.5488475561141968, "logps/rejected": -1.5709179639816284, "loss": 2.9906, "rewards/accuracies": 0.5, "rewards/chosen": -15.48847484588623, "rewards/margins": 0.22070512175559998, "rewards/rejected": -15.709179878234863, "step": 3135 }, { "epoch": 0.10583437257743773, "grad_norm": 19.31629753112793, "learning_rate": 9.99896431095272e-07, "logits/chosen": -0.6975013613700867, "logits/rejected": -0.6075304746627808, "logps/chosen": -1.9307944774627686, "logps/rejected": -1.9646923542022705, "loss": 2.9467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.307945251464844, "rewards/margins": 0.3389785885810852, "rewards/rejected": -19.646923065185547, "step": 3140 }, { "epoch": 0.10600289864842091, "grad_norm": 23.149045944213867, "learning_rate": 9.998903581631808e-07, "logits/chosen": -0.17336881160736084, "logits/rejected": -0.12885281443595886, "logps/chosen": -1.6452945470809937, "logps/rejected": -1.8046413660049438, "loss": 2.4754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.452945709228516, "rewards/margins": 1.5934689044952393, "rewards/rejected": -18.04641342163086, "step": 3145 }, { "epoch": 0.10617142471940409, "grad_norm": 33.11212158203125, "learning_rate": 9.998841122387315e-07, "logits/chosen": -0.3954317569732666, "logits/rejected": -0.5562535524368286, "logps/chosen": -1.827657699584961, "logps/rejected": -1.8005775213241577, "loss": 3.4283, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.276575088500977, "rewards/margins": -0.27080029249191284, "rewards/rejected": -18.00577735900879, "step": 3150 }, { "epoch": 0.10633995079038727, "grad_norm": 24.7961368560791, "learning_rate": 9.998776933240858e-07, "logits/chosen": -0.6464129686355591, "logits/rejected": -0.6087260842323303, "logps/chosen": -1.6729981899261475, "logps/rejected": -1.562254786491394, "loss": 4.1478, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -16.729984283447266, "rewards/margins": -1.1074340343475342, "rewards/rejected": -15.62254810333252, "step": 3155 }, { "epoch": 0.10650847686137045, "grad_norm": 49.34171676635742, "learning_rate": 9.998711014214648e-07, "logits/chosen": -0.1933070868253708, "logits/rejected": -0.15686890482902527, "logps/chosen": -1.7761684656143188, "logps/rejected": -2.0039217472076416, "loss": 2.2653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.761682510375977, "rewards/margins": 2.277535915374756, "rewards/rejected": -20.03921890258789, "step": 3160 }, { "epoch": 0.10667700293235363, "grad_norm": 22.142282485961914, "learning_rate": 9.998643365331496e-07, "logits/chosen": -0.3458858132362366, "logits/rejected": -0.3292901813983917, "logps/chosen": -1.9010028839111328, "logps/rejected": -2.0131144523620605, "loss": 2.3761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.010028839111328, "rewards/margins": 1.1211156845092773, "rewards/rejected": -20.131145477294922, "step": 3165 }, { "epoch": 0.10684552900333681, "grad_norm": 20.39383316040039, "learning_rate": 9.998573986614815e-07, "logits/chosen": -0.5390158891677856, "logits/rejected": -0.5976846218109131, "logps/chosen": -1.7556896209716797, "logps/rejected": -1.7425487041473389, "loss": 3.2772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.556896209716797, "rewards/margins": -0.13141116499900818, "rewards/rejected": -17.425487518310547, "step": 3170 }, { "epoch": 0.10701405507431999, "grad_norm": 27.439123153686523, "learning_rate": 9.998502878088613e-07, "logits/chosen": -0.18560490012168884, "logits/rejected": -0.11901037395000458, "logps/chosen": -1.8787009716033936, "logps/rejected": -1.9589424133300781, "loss": 2.5655, "rewards/accuracies": 0.5, "rewards/chosen": -18.787012100219727, "rewards/margins": 0.8024119138717651, "rewards/rejected": -19.58942413330078, "step": 3175 }, { "epoch": 0.10718258114530319, "grad_norm": 47.90141296386719, "learning_rate": 9.998430039777496e-07, "logits/chosen": -0.8242633938789368, "logits/rejected": -0.5837022662162781, "logps/chosen": -1.5141726732254028, "logps/rejected": -1.659218430519104, "loss": 2.7919, "rewards/accuracies": 0.5, "rewards/chosen": -15.14172649383545, "rewards/margins": 1.4504566192626953, "rewards/rejected": -16.59218406677246, "step": 3180 }, { "epoch": 0.10735110721628636, "grad_norm": 11.335456848144531, "learning_rate": 9.998355471706676e-07, "logits/chosen": -0.09614237397909164, "logits/rejected": -0.008962017484009266, "logps/chosen": -1.9023325443267822, "logps/rejected": -2.03184175491333, "loss": 3.2201, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.023326873779297, "rewards/margins": 1.2950923442840576, "rewards/rejected": -20.318416595458984, "step": 3185 }, { "epoch": 0.10751963328726954, "grad_norm": 26.510276794433594, "learning_rate": 9.998279173901951e-07, "logits/chosen": -0.27121829986572266, "logits/rejected": -0.274336576461792, "logps/chosen": -1.76497483253479, "logps/rejected": -1.8142725229263306, "loss": 3.2245, "rewards/accuracies": 0.5, "rewards/chosen": -17.64974594116211, "rewards/margins": 0.492978572845459, "rewards/rejected": -18.14272689819336, "step": 3190 }, { "epoch": 0.10768815935825272, "grad_norm": 36.92559051513672, "learning_rate": 9.998201146389731e-07, "logits/chosen": -0.08655179291963577, "logits/rejected": -0.14043311774730682, "logps/chosen": -2.083996534347534, "logps/rejected": -2.0735764503479004, "loss": 3.3045, "rewards/accuracies": 0.5, "rewards/chosen": -20.8399658203125, "rewards/margins": -0.1042022705078125, "rewards/rejected": -20.735763549804688, "step": 3195 }, { "epoch": 0.1078566854292359, "grad_norm": 24.98342514038086, "learning_rate": 9.998121389197015e-07, "logits/chosen": -0.5542726516723633, "logits/rejected": -0.5774582624435425, "logps/chosen": -1.83783757686615, "logps/rejected": -1.8029735088348389, "loss": 3.4937, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.378376007080078, "rewards/margins": -0.34863871335983276, "rewards/rejected": -18.029735565185547, "step": 3200 }, { "epoch": 0.1078566854292359, "eval_logits/chosen": -0.7412148118019104, "eval_logits/rejected": -0.7483264207839966, "eval_logps/chosen": -1.679823637008667, "eval_logps/rejected": -1.6859045028686523, "eval_loss": 3.401301383972168, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.798236846923828, "eval_rewards/margins": 0.06080753356218338, "eval_rewards/rejected": -16.859045028686523, "eval_runtime": 12.9013, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 3200 }, { "epoch": 0.10802521150021908, "grad_norm": 22.075878143310547, "learning_rate": 9.998039902351404e-07, "logits/chosen": -0.5936040878295898, "logits/rejected": -0.6071931719779968, "logps/chosen": -2.6149399280548096, "logps/rejected": -2.601722002029419, "loss": 3.6117, "rewards/accuracies": 0.5, "rewards/chosen": -26.149398803710938, "rewards/margins": -0.13217754662036896, "rewards/rejected": -26.017221450805664, "step": 3205 }, { "epoch": 0.10819373757120226, "grad_norm": 14.17717170715332, "learning_rate": 9.997956685881097e-07, "logits/chosen": -0.8582628965377808, "logits/rejected": -0.9205999374389648, "logps/chosen": -1.6408593654632568, "logps/rejected": -1.5776455402374268, "loss": 3.7241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.408592224121094, "rewards/margins": -0.6321369409561157, "rewards/rejected": -15.776455879211426, "step": 3210 }, { "epoch": 0.10836226364218544, "grad_norm": 41.10568618774414, "learning_rate": 9.997871739814894e-07, "logits/chosen": -0.6507139801979065, "logits/rejected": -0.6784830689430237, "logps/chosen": -1.9885571002960205, "logps/rejected": -1.9365136623382568, "loss": 3.6387, "rewards/accuracies": 0.5, "rewards/chosen": -19.885570526123047, "rewards/margins": -0.5204324722290039, "rewards/rejected": -19.365137100219727, "step": 3215 }, { "epoch": 0.10853078971316862, "grad_norm": 28.58173942565918, "learning_rate": 9.99778506418219e-07, "logits/chosen": -0.4071117043495178, "logits/rejected": -0.44211989641189575, "logps/chosen": -1.6532456874847412, "logps/rejected": -1.737221121788025, "loss": 3.0016, "rewards/accuracies": 0.5, "rewards/chosen": -16.532459259033203, "rewards/margins": 0.8397525548934937, "rewards/rejected": -17.372211456298828, "step": 3220 }, { "epoch": 0.1086993157841518, "grad_norm": 19.28890609741211, "learning_rate": 9.99769665901298e-07, "logits/chosen": -0.320314884185791, "logits/rejected": -0.27917996048927307, "logps/chosen": -1.9284579753875732, "logps/rejected": -2.031317949295044, "loss": 2.9486, "rewards/accuracies": 0.5, "rewards/chosen": -19.28458023071289, "rewards/margins": 1.0285985469818115, "rewards/rejected": -20.313180923461914, "step": 3225 }, { "epoch": 0.10886784185513498, "grad_norm": 22.410377502441406, "learning_rate": 9.997606524337856e-07, "logits/chosen": -0.5617813467979431, "logits/rejected": -0.7193830013275146, "logps/chosen": -1.606142282485962, "logps/rejected": -1.7297741174697876, "loss": 2.3398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.06142234802246, "rewards/margins": 1.23631751537323, "rewards/rejected": -17.297740936279297, "step": 3230 }, { "epoch": 0.10903636792611818, "grad_norm": 16.280595779418945, "learning_rate": 9.997514660188012e-07, "logits/chosen": -0.2999853193759918, "logits/rejected": -0.3209920823574066, "logps/chosen": -2.043691635131836, "logps/rejected": -1.9882628917694092, "loss": 3.8689, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.43691635131836, "rewards/margins": -0.5542882084846497, "rewards/rejected": -19.88262939453125, "step": 3235 }, { "epoch": 0.10920489399710136, "grad_norm": 47.62866973876953, "learning_rate": 9.997421066595242e-07, "logits/chosen": -0.27927201986312866, "logits/rejected": -0.3500203490257263, "logps/chosen": -1.8980588912963867, "logps/rejected": -1.8838756084442139, "loss": 3.2512, "rewards/accuracies": 0.5, "rewards/chosen": -18.980587005615234, "rewards/margins": -0.1418312042951584, "rewards/rejected": -18.838756561279297, "step": 3240 }, { "epoch": 0.10937342006808454, "grad_norm": 23.319583892822266, "learning_rate": 9.997325743591927e-07, "logits/chosen": -0.4203563332557678, "logits/rejected": -0.24051852524280548, "logps/chosen": -1.784014344215393, "logps/rejected": -1.7746086120605469, "loss": 3.2181, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.840145111083984, "rewards/margins": -0.09405937045812607, "rewards/rejected": -17.746084213256836, "step": 3245 }, { "epoch": 0.10954194613906772, "grad_norm": 40.517086029052734, "learning_rate": 9.997228691211062e-07, "logits/chosen": -0.6908475160598755, "logits/rejected": -0.6859616041183472, "logps/chosen": -1.6905533075332642, "logps/rejected": -1.7253916263580322, "loss": 2.771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.905534744262695, "rewards/margins": 0.3483811318874359, "rewards/rejected": -17.253915786743164, "step": 3250 }, { "epoch": 0.1097104722100509, "grad_norm": 20.207984924316406, "learning_rate": 9.997129909486227e-07, "logits/chosen": 0.0059540183283388615, "logits/rejected": 0.033363353461027145, "logps/chosen": -2.2267398834228516, "logps/rejected": -2.2308731079101562, "loss": 3.2782, "rewards/accuracies": 0.5, "rewards/chosen": -22.267398834228516, "rewards/margins": 0.04133415222167969, "rewards/rejected": -22.308731079101562, "step": 3255 }, { "epoch": 0.10987899828103408, "grad_norm": 21.74619483947754, "learning_rate": 9.997029398451613e-07, "logits/chosen": -0.6611747741699219, "logits/rejected": -0.828205406665802, "logps/chosen": -1.661211371421814, "logps/rejected": -1.7203048467636108, "loss": 2.9658, "rewards/accuracies": 0.5, "rewards/chosen": -16.61211585998535, "rewards/margins": 0.5909349322319031, "rewards/rejected": -17.20305061340332, "step": 3260 }, { "epoch": 0.11004752435201726, "grad_norm": 22.0983829498291, "learning_rate": 9.996927158141997e-07, "logits/chosen": -0.2499655932188034, "logits/rejected": -0.25793296098709106, "logps/chosen": -1.9067351818084717, "logps/rejected": -1.9466044902801514, "loss": 2.8391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.067352294921875, "rewards/margins": 0.3986912667751312, "rewards/rejected": -19.466045379638672, "step": 3265 }, { "epoch": 0.11021605042300044, "grad_norm": 30.887603759765625, "learning_rate": 9.996823188592761e-07, "logits/chosen": -0.48815393447875977, "logits/rejected": -0.4621101915836334, "logps/chosen": -1.9688169956207275, "logps/rejected": -2.039149761199951, "loss": 3.3813, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.688167572021484, "rewards/margins": 0.7033309936523438, "rewards/rejected": -20.39150047302246, "step": 3270 }, { "epoch": 0.11038457649398362, "grad_norm": 18.655275344848633, "learning_rate": 9.99671748983989e-07, "logits/chosen": -0.6428505778312683, "logits/rejected": -0.6691970229148865, "logps/chosen": -1.804451584815979, "logps/rejected": -1.76565682888031, "loss": 3.4545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.04451560974121, "rewards/margins": -0.3879469037055969, "rewards/rejected": -17.65656852722168, "step": 3275 }, { "epoch": 0.1105531025649668, "grad_norm": 35.781402587890625, "learning_rate": 9.996610061919956e-07, "logits/chosen": -0.07583383470773697, "logits/rejected": 0.12737944722175598, "logps/chosen": -2.226592540740967, "logps/rejected": -2.279959201812744, "loss": 2.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.26592445373535, "rewards/margins": 0.533666729927063, "rewards/rejected": -22.799591064453125, "step": 3280 }, { "epoch": 0.11072162863594998, "grad_norm": 30.42848777770996, "learning_rate": 9.99650090487014e-07, "logits/chosen": -0.32865971326828003, "logits/rejected": -0.2108878195285797, "logps/chosen": -1.6149228811264038, "logps/rejected": -1.7221686840057373, "loss": 2.7874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.149229049682617, "rewards/margins": 1.0724586248397827, "rewards/rejected": -17.221689224243164, "step": 3285 }, { "epoch": 0.11089015470693317, "grad_norm": 42.69295883178711, "learning_rate": 9.996390018728216e-07, "logits/chosen": -0.1308925449848175, "logits/rejected": -0.10833205282688141, "logps/chosen": -1.7586466073989868, "logps/rejected": -1.8226900100708008, "loss": 2.7789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.58646583557129, "rewards/margins": 0.6404353380203247, "rewards/rejected": -18.226900100708008, "step": 3290 }, { "epoch": 0.11105868077791635, "grad_norm": 32.807212829589844, "learning_rate": 9.996277403532553e-07, "logits/chosen": -0.46557608246803284, "logits/rejected": -0.3887875974178314, "logps/chosen": -1.9230026006698608, "logps/rejected": -1.7927436828613281, "loss": 4.3985, "rewards/accuracies": 0.5, "rewards/chosen": -19.230026245117188, "rewards/margins": -1.302587628364563, "rewards/rejected": -17.927440643310547, "step": 3295 }, { "epoch": 0.11122720684889953, "grad_norm": 20.832653045654297, "learning_rate": 9.996163059322128e-07, "logits/chosen": -0.4223001003265381, "logits/rejected": -0.36276328563690186, "logps/chosen": -1.6754577159881592, "logps/rejected": -1.7316392660140991, "loss": 2.6598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.75457763671875, "rewards/margins": 0.5618153810501099, "rewards/rejected": -17.31639289855957, "step": 3300 }, { "epoch": 0.11139573291988271, "grad_norm": 16.177337646484375, "learning_rate": 9.996046986136508e-07, "logits/chosen": -0.5084297060966492, "logits/rejected": -0.6135154962539673, "logps/chosen": -1.4946218729019165, "logps/rejected": -1.5074679851531982, "loss": 3.029, "rewards/accuracies": 0.5, "rewards/chosen": -14.946218490600586, "rewards/margins": 0.12846069037914276, "rewards/rejected": -15.074679374694824, "step": 3305 }, { "epoch": 0.11156425899086589, "grad_norm": 25.017414093017578, "learning_rate": 9.995929184015864e-07, "logits/chosen": -0.3279469311237335, "logits/rejected": -0.25481417775154114, "logps/chosen": -2.0133399963378906, "logps/rejected": -2.0342857837677, "loss": 3.1221, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.133398056030273, "rewards/margins": 0.2094583511352539, "rewards/rejected": -20.342859268188477, "step": 3310 }, { "epoch": 0.11173278506184907, "grad_norm": 17.220426559448242, "learning_rate": 9.99580965300096e-07, "logits/chosen": -0.7247230410575867, "logits/rejected": -0.6492441296577454, "logps/chosen": -1.6587985754013062, "logps/rejected": -1.6609020233154297, "loss": 3.0976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.58798599243164, "rewards/margins": 0.021033382043242455, "rewards/rejected": -16.609020233154297, "step": 3315 }, { "epoch": 0.11190131113283225, "grad_norm": 18.606319427490234, "learning_rate": 9.995688393133163e-07, "logits/chosen": -0.3558014929294586, "logits/rejected": -0.3948605954647064, "logps/chosen": -2.1518683433532715, "logps/rejected": -2.4243855476379395, "loss": 2.6067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.5186824798584, "rewards/margins": 2.725175142288208, "rewards/rejected": -24.24385643005371, "step": 3320 }, { "epoch": 0.11206983720381543, "grad_norm": 23.35149383544922, "learning_rate": 9.995565404454436e-07, "logits/chosen": -0.3345261216163635, "logits/rejected": -0.4478190541267395, "logps/chosen": -1.6862157583236694, "logps/rejected": -1.6972332000732422, "loss": 3.1233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.862157821655273, "rewards/margins": 0.11017484962940216, "rewards/rejected": -16.972332000732422, "step": 3325 }, { "epoch": 0.11223836327479861, "grad_norm": 21.12904167175293, "learning_rate": 9.99544068700734e-07, "logits/chosen": -0.47232285141944885, "logits/rejected": -0.43678373098373413, "logps/chosen": -1.9338699579238892, "logps/rejected": -1.812021255493164, "loss": 4.6974, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.338699340820312, "rewards/margins": -1.218487024307251, "rewards/rejected": -18.120214462280273, "step": 3330 }, { "epoch": 0.11240688934578179, "grad_norm": 47.94718551635742, "learning_rate": 9.995314240835032e-07, "logits/chosen": -0.37584561109542847, "logits/rejected": -0.2552175521850586, "logps/chosen": -1.8831770420074463, "logps/rejected": -1.8070863485336304, "loss": 3.9093, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.831768035888672, "rewards/margins": -0.7609058618545532, "rewards/rejected": -18.070865631103516, "step": 3335 }, { "epoch": 0.11257541541676497, "grad_norm": 6.487846374511719, "learning_rate": 9.995186065981275e-07, "logits/chosen": -0.3062947392463684, "logits/rejected": -0.21386337280273438, "logps/chosen": -2.0181260108947754, "logps/rejected": -2.0927810668945312, "loss": 3.1407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.181259155273438, "rewards/margins": 0.7465518712997437, "rewards/rejected": -20.927810668945312, "step": 3340 }, { "epoch": 0.11274394148774815, "grad_norm": 13.073860168457031, "learning_rate": 9.995056162490423e-07, "logits/chosen": -0.64457768201828, "logits/rejected": -0.6320878267288208, "logps/chosen": -1.70615553855896, "logps/rejected": -1.741113305091858, "loss": 3.1045, "rewards/accuracies": 0.5, "rewards/chosen": -17.061555862426758, "rewards/margins": 0.3495769500732422, "rewards/rejected": -17.4111328125, "step": 3345 }, { "epoch": 0.11291246755873134, "grad_norm": 22.207111358642578, "learning_rate": 9.994924530407429e-07, "logits/chosen": -0.6318604946136475, "logits/rejected": -0.6339292526245117, "logps/chosen": -1.6218618154525757, "logps/rejected": -1.507359266281128, "loss": 4.2157, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.218618392944336, "rewards/margins": -1.145025610923767, "rewards/rejected": -15.073593139648438, "step": 3350 }, { "epoch": 0.11308099362971452, "grad_norm": 53.56328582763672, "learning_rate": 9.99479116977785e-07, "logits/chosen": -0.2964858412742615, "logits/rejected": -0.2377942055463791, "logps/chosen": -2.008469581604004, "logps/rejected": -2.0366320610046387, "loss": 2.9689, "rewards/accuracies": 0.5, "rewards/chosen": -20.08469581604004, "rewards/margins": 0.28162336349487305, "rewards/rejected": -20.36631965637207, "step": 3355 }, { "epoch": 0.1132495197006977, "grad_norm": 21.440587997436523, "learning_rate": 9.994656080647833e-07, "logits/chosen": -0.11374900490045547, "logits/rejected": -0.1913887858390808, "logps/chosen": -2.431771755218506, "logps/rejected": -2.641162872314453, "loss": 2.6706, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.317718505859375, "rewards/margins": 2.0939087867736816, "rewards/rejected": -26.4116268157959, "step": 3360 }, { "epoch": 0.11341804577168088, "grad_norm": 47.10682678222656, "learning_rate": 9.994519263064125e-07, "logits/chosen": -0.5051501393318176, "logits/rejected": -0.5939575433731079, "logps/chosen": -1.783546805381775, "logps/rejected": -1.7367546558380127, "loss": 3.549, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.835468292236328, "rewards/margins": -0.4679209589958191, "rewards/rejected": -17.3675479888916, "step": 3365 }, { "epoch": 0.11358657184266406, "grad_norm": 22.752422332763672, "learning_rate": 9.99438071707408e-07, "logits/chosen": -0.7217915654182434, "logits/rejected": -0.6456397771835327, "logps/chosen": -1.6618263721466064, "logps/rejected": -1.5785537958145142, "loss": 3.8696, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -16.61826515197754, "rewards/margins": -0.8327254056930542, "rewards/rejected": -15.785539627075195, "step": 3370 }, { "epoch": 0.11375509791364724, "grad_norm": 28.850156784057617, "learning_rate": 9.994240442725639e-07, "logits/chosen": -0.28035932779312134, "logits/rejected": -0.38660159707069397, "logps/chosen": -1.8262609243392944, "logps/rejected": -1.949873685836792, "loss": 2.7329, "rewards/accuracies": 0.5, "rewards/chosen": -18.262609481811523, "rewards/margins": 1.2361291646957397, "rewards/rejected": -19.498737335205078, "step": 3375 }, { "epoch": 0.11392362398463042, "grad_norm": 20.6746883392334, "learning_rate": 9.994098440067344e-07, "logits/chosen": -0.3793238699436188, "logits/rejected": -0.5065209269523621, "logps/chosen": -1.8463551998138428, "logps/rejected": -1.7899481058120728, "loss": 3.791, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.463550567626953, "rewards/margins": -0.5640712976455688, "rewards/rejected": -17.899478912353516, "step": 3380 }, { "epoch": 0.1140921500556136, "grad_norm": 24.47591781616211, "learning_rate": 9.99395470914834e-07, "logits/chosen": -0.5719932317733765, "logits/rejected": -0.38576334714889526, "logps/chosen": -2.1512346267700195, "logps/rejected": -2.1791083812713623, "loss": 3.1521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.512344360351562, "rewards/margins": 0.2787378430366516, "rewards/rejected": -21.79108238220215, "step": 3385 }, { "epoch": 0.11426067612659678, "grad_norm": 27.42087745666504, "learning_rate": 9.993809250018364e-07, "logits/chosen": -0.687567412853241, "logits/rejected": -0.7501705884933472, "logps/chosen": -1.8461263179779053, "logps/rejected": -1.9768394231796265, "loss": 3.1181, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.461265563964844, "rewards/margins": 1.3071308135986328, "rewards/rejected": -19.768394470214844, "step": 3390 }, { "epoch": 0.11442920219757996, "grad_norm": 16.828157424926758, "learning_rate": 9.993662062727757e-07, "logits/chosen": -0.6422561407089233, "logits/rejected": -0.6002863645553589, "logps/chosen": -1.6547744274139404, "logps/rejected": -1.6610130071640015, "loss": 3.0731, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.547744750976562, "rewards/margins": 0.06238384172320366, "rewards/rejected": -16.61012840270996, "step": 3395 }, { "epoch": 0.11459772826856314, "grad_norm": 26.950260162353516, "learning_rate": 9.99351314732745e-07, "logits/chosen": -0.11768583953380585, "logits/rejected": -0.1222616657614708, "logps/chosen": -2.5058677196502686, "logps/rejected": -2.431462287902832, "loss": 4.0213, "rewards/accuracies": 0.5, "rewards/chosen": -25.058677673339844, "rewards/margins": -0.7440546154975891, "rewards/rejected": -24.31462287902832, "step": 3400 }, { "epoch": 0.11476625433954633, "grad_norm": 25.076194763183594, "learning_rate": 9.99336250386898e-07, "logits/chosen": -0.4412030577659607, "logits/rejected": -0.3538280725479126, "logps/chosen": -2.0328128337860107, "logps/rejected": -2.137967586517334, "loss": 2.6655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.3281307220459, "rewards/margins": 1.051546335220337, "rewards/rejected": -21.379674911499023, "step": 3405 }, { "epoch": 0.11493478041052951, "grad_norm": 15.957403182983398, "learning_rate": 9.993210132404479e-07, "logits/chosen": -0.4786578118801117, "logits/rejected": -0.5396562814712524, "logps/chosen": -2.0582754611968994, "logps/rejected": -2.204559087753296, "loss": 2.1681, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.582754135131836, "rewards/margins": 1.4628350734710693, "rewards/rejected": -22.045589447021484, "step": 3410 }, { "epoch": 0.1151033064815127, "grad_norm": 43.187171936035156, "learning_rate": 9.993056032986676e-07, "logits/chosen": -0.6285707950592041, "logits/rejected": -0.4946421682834625, "logps/chosen": -1.5473029613494873, "logps/rejected": -1.574300765991211, "loss": 2.8268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.473031997680664, "rewards/margins": 0.269977867603302, "rewards/rejected": -15.743008613586426, "step": 3415 }, { "epoch": 0.11527183255249587, "grad_norm": 16.885690689086914, "learning_rate": 9.992900205668896e-07, "logits/chosen": -0.3438575267791748, "logits/rejected": -0.3555828928947449, "logps/chosen": -1.5282278060913086, "logps/rejected": -1.643481969833374, "loss": 2.6082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.28227710723877, "rewards/margins": 1.1525436639785767, "rewards/rejected": -16.43482208251953, "step": 3420 }, { "epoch": 0.11544035862347905, "grad_norm": 101.62150573730469, "learning_rate": 9.992742650505071e-07, "logits/chosen": -0.37929654121398926, "logits/rejected": -0.45227640867233276, "logps/chosen": -2.4064865112304688, "logps/rejected": -2.1600139141082764, "loss": 6.0675, "rewards/accuracies": 0.5, "rewards/chosen": -24.064863204956055, "rewards/margins": -2.464722156524658, "rewards/rejected": -21.600141525268555, "step": 3425 }, { "epoch": 0.11560888469446223, "grad_norm": 24.92697525024414, "learning_rate": 9.992583367549719e-07, "logits/chosen": -0.5041292905807495, "logits/rejected": -0.6150269508361816, "logps/chosen": -1.5577831268310547, "logps/rejected": -1.5323947668075562, "loss": 3.3247, "rewards/accuracies": 0.5, "rewards/chosen": -15.577832221984863, "rewards/margins": -0.2538827955722809, "rewards/rejected": -15.323948860168457, "step": 3430 }, { "epoch": 0.11577741076544541, "grad_norm": 26.776927947998047, "learning_rate": 9.992422356857963e-07, "logits/chosen": -0.6362167596817017, "logits/rejected": -0.6052538752555847, "logps/chosen": -1.5442123413085938, "logps/rejected": -1.5520304441452026, "loss": 3.1438, "rewards/accuracies": 0.5, "rewards/chosen": -15.442123413085938, "rewards/margins": 0.07818098366260529, "rewards/rejected": -15.520304679870605, "step": 3435 }, { "epoch": 0.11594593683642859, "grad_norm": 23.41965675354004, "learning_rate": 9.992259618485523e-07, "logits/chosen": 0.030453210696578026, "logits/rejected": 0.03214035555720329, "logps/chosen": -2.0576891899108887, "logps/rejected": -2.1275794506073, "loss": 2.723, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.576892852783203, "rewards/margins": 0.6989046335220337, "rewards/rejected": -21.27579689025879, "step": 3440 }, { "epoch": 0.11611446290741177, "grad_norm": 23.502748489379883, "learning_rate": 9.992095152488718e-07, "logits/chosen": -0.40521278977394104, "logits/rejected": -0.27406761050224304, "logps/chosen": -1.9019501209259033, "logps/rejected": -2.0432791709899902, "loss": 2.512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.019500732421875, "rewards/margins": 1.4132897853851318, "rewards/rejected": -20.432790756225586, "step": 3445 }, { "epoch": 0.11628298897839495, "grad_norm": 21.05483627319336, "learning_rate": 9.991928958924458e-07, "logits/chosen": -0.36097151041030884, "logits/rejected": -0.35222867131233215, "logps/chosen": -1.7462047338485718, "logps/rejected": -1.74808669090271, "loss": 3.1059, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.462047576904297, "rewards/margins": 0.01881999894976616, "rewards/rejected": -17.48086929321289, "step": 3450 }, { "epoch": 0.11645151504937813, "grad_norm": 19.621593475341797, "learning_rate": 9.991761037850262e-07, "logits/chosen": -0.4645184874534607, "logits/rejected": -0.47621220350265503, "logps/chosen": -1.7795826196670532, "logps/rejected": -1.776755928993225, "loss": 3.2258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.795825958251953, "rewards/margins": -0.028265666216611862, "rewards/rejected": -17.767559051513672, "step": 3455 }, { "epoch": 0.11662004112036133, "grad_norm": 30.127208709716797, "learning_rate": 9.99159138932424e-07, "logits/chosen": -0.4851096570491791, "logits/rejected": -0.47763925790786743, "logps/chosen": -1.8681427240371704, "logps/rejected": -1.7204406261444092, "loss": 4.5361, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.681425094604492, "rewards/margins": -1.4770203828811646, "rewards/rejected": -17.20440673828125, "step": 3460 }, { "epoch": 0.1167885671913445, "grad_norm": 27.600656509399414, "learning_rate": 9.991420013405095e-07, "logits/chosen": -0.4824017584323883, "logits/rejected": -0.43301159143447876, "logps/chosen": -1.8536741733551025, "logps/rejected": -1.9033809900283813, "loss": 2.9276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.536741256713867, "rewards/margins": 0.49706658720970154, "rewards/rejected": -19.033809661865234, "step": 3465 }, { "epoch": 0.11695709326232769, "grad_norm": 15.61844253540039, "learning_rate": 9.99124691015214e-07, "logits/chosen": -0.7837399840354919, "logits/rejected": -0.675243079662323, "logps/chosen": -1.6042782068252563, "logps/rejected": -1.7261016368865967, "loss": 2.5794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.042781829833984, "rewards/margins": 1.2182351350784302, "rewards/rejected": -17.261016845703125, "step": 3470 }, { "epoch": 0.11712561933331087, "grad_norm": 18.784343719482422, "learning_rate": 9.991072079625275e-07, "logits/chosen": -0.08538699895143509, "logits/rejected": -0.10807951539754868, "logps/chosen": -1.6151269674301147, "logps/rejected": -1.8042793273925781, "loss": 2.1694, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.151269912719727, "rewards/margins": 1.8915237188339233, "rewards/rejected": -18.04279327392578, "step": 3475 }, { "epoch": 0.11729414540429405, "grad_norm": 24.512720108032227, "learning_rate": 9.990895521885005e-07, "logits/chosen": 0.02145923301577568, "logits/rejected": -0.07628260552883148, "logps/chosen": -1.8797547817230225, "logps/rejected": -1.8971583843231201, "loss": 3.3072, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.797550201416016, "rewards/margins": 0.17403507232666016, "rewards/rejected": -18.97158432006836, "step": 3480 }, { "epoch": 0.11746267147527722, "grad_norm": 13.824844360351562, "learning_rate": 9.990717236992428e-07, "logits/chosen": -0.40924936532974243, "logits/rejected": -0.3904082477092743, "logps/chosen": -1.5942176580429077, "logps/rejected": -1.5907622575759888, "loss": 3.1973, "rewards/accuracies": 0.5, "rewards/chosen": -15.942178726196289, "rewards/margins": -0.03455467149615288, "rewards/rejected": -15.907621383666992, "step": 3485 }, { "epoch": 0.1176311975462604, "grad_norm": 21.68479347229004, "learning_rate": 9.990537225009242e-07, "logits/chosen": -0.3777514100074768, "logits/rejected": -0.34054070711135864, "logps/chosen": -1.5628550052642822, "logps/rejected": -1.6658351421356201, "loss": 2.6825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.62855052947998, "rewards/margins": 1.0298006534576416, "rewards/rejected": -16.65835189819336, "step": 3490 }, { "epoch": 0.11779972361724358, "grad_norm": 40.326507568359375, "learning_rate": 9.99035548599774e-07, "logits/chosen": -0.5707781910896301, "logits/rejected": -0.5124248266220093, "logps/chosen": -1.8460489511489868, "logps/rejected": -1.740540862083435, "loss": 4.0993, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.46048927307129, "rewards/margins": -1.0550801753997803, "rewards/rejected": -17.405406951904297, "step": 3495 }, { "epoch": 0.11796824968822676, "grad_norm": 76.25070190429688, "learning_rate": 9.990172020020818e-07, "logits/chosen": -0.30215927958488464, "logits/rejected": -0.23703379929065704, "logps/chosen": -1.8280613422393799, "logps/rejected": -1.7260147333145142, "loss": 4.21, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.280614852905273, "rewards/margins": -1.020465612411499, "rewards/rejected": -17.260149002075195, "step": 3500 }, { "epoch": 0.11813677575920994, "grad_norm": 26.626041412353516, "learning_rate": 9.989986827141963e-07, "logits/chosen": -0.4228217601776123, "logits/rejected": -0.4215395450592041, "logps/chosen": -1.7155015468597412, "logps/rejected": -1.7580169439315796, "loss": 2.7408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.15501594543457, "rewards/margins": 0.4251536428928375, "rewards/rejected": -17.580167770385742, "step": 3505 }, { "epoch": 0.11830530183019312, "grad_norm": 25.78866195678711, "learning_rate": 9.989799907425268e-07, "logits/chosen": -0.3247675597667694, "logits/rejected": -0.3234170079231262, "logps/chosen": -1.9856287240982056, "logps/rejected": -2.078611373901367, "loss": 3.0682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.856287002563477, "rewards/margins": 0.9298248291015625, "rewards/rejected": -20.786113739013672, "step": 3510 }, { "epoch": 0.11847382790117632, "grad_norm": 27.890932083129883, "learning_rate": 9.989611260935413e-07, "logits/chosen": -0.3991270959377289, "logits/rejected": -0.38465866446495056, "logps/chosen": -2.2957253456115723, "logps/rejected": -2.220700740814209, "loss": 3.9668, "rewards/accuracies": 0.5, "rewards/chosen": -22.957252502441406, "rewards/margins": -0.750244140625, "rewards/rejected": -22.207008361816406, "step": 3515 }, { "epoch": 0.1186423539721595, "grad_norm": 62.70400619506836, "learning_rate": 9.989420887737683e-07, "logits/chosen": -0.32912951707839966, "logits/rejected": -0.4093703329563141, "logps/chosen": -2.0264673233032227, "logps/rejected": -1.9524818658828735, "loss": 3.9945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.264673233032227, "rewards/margins": -0.7398567199707031, "rewards/rejected": -19.52481460571289, "step": 3520 }, { "epoch": 0.11881088004314268, "grad_norm": 31.33793067932129, "learning_rate": 9.98922878789796e-07, "logits/chosen": -0.12958894670009613, "logits/rejected": -0.21700029075145721, "logps/chosen": -2.0517940521240234, "logps/rejected": -2.1143057346343994, "loss": 2.6628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.517940521240234, "rewards/margins": 0.6251155734062195, "rewards/rejected": -21.143054962158203, "step": 3525 }, { "epoch": 0.11897940611412586, "grad_norm": 32.016998291015625, "learning_rate": 9.989034961482721e-07, "logits/chosen": -0.3710765242576599, "logits/rejected": -0.35672563314437866, "logps/chosen": -2.2033329010009766, "logps/rejected": -2.3549094200134277, "loss": 2.7372, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.0333309173584, "rewards/margins": 1.5157625675201416, "rewards/rejected": -23.54909324645996, "step": 3530 }, { "epoch": 0.11914793218510904, "grad_norm": 31.52229881286621, "learning_rate": 9.988839408559044e-07, "logits/chosen": -0.5221918225288391, "logits/rejected": -0.5441256761550903, "logps/chosen": -1.7516603469848633, "logps/rejected": -1.7036716938018799, "loss": 3.6209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.5166015625, "rewards/margins": -0.4798874855041504, "rewards/rejected": -17.03671646118164, "step": 3535 }, { "epoch": 0.11931645825609222, "grad_norm": 23.19977569580078, "learning_rate": 9.988642129194598e-07, "logits/chosen": -0.23060889542102814, "logits/rejected": -0.27095380425453186, "logps/chosen": -1.9323209524154663, "logps/rejected": -1.8174877166748047, "loss": 4.1969, "rewards/accuracies": 0.5, "rewards/chosen": -19.32320785522461, "rewards/margins": -1.1483336687088013, "rewards/rejected": -18.174877166748047, "step": 3540 }, { "epoch": 0.1194849843270754, "grad_norm": 25.622182846069336, "learning_rate": 9.988443123457655e-07, "logits/chosen": -0.320268452167511, "logits/rejected": -0.28582894802093506, "logps/chosen": -1.6403043270111084, "logps/rejected": -1.7142482995986938, "loss": 2.5097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.403045654296875, "rewards/margins": 0.7394381761550903, "rewards/rejected": -17.14248275756836, "step": 3545 }, { "epoch": 0.11965351039805858, "grad_norm": 30.17853355407715, "learning_rate": 9.988242391417086e-07, "logits/chosen": -0.7527714967727661, "logits/rejected": -0.7969571352005005, "logps/chosen": -1.7517030239105225, "logps/rejected": -1.8085277080535889, "loss": 3.1445, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.517030715942383, "rewards/margins": 0.5682462453842163, "rewards/rejected": -18.085277557373047, "step": 3550 }, { "epoch": 0.11982203646904176, "grad_norm": 34.78050994873047, "learning_rate": 9.988039933142353e-07, "logits/chosen": -0.6721317768096924, "logits/rejected": -0.6552478075027466, "logps/chosen": -1.6741306781768799, "logps/rejected": -1.7111762762069702, "loss": 2.8275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.74130630493164, "rewards/margins": 0.37045803666114807, "rewards/rejected": -17.11176300048828, "step": 3555 }, { "epoch": 0.11999056254002494, "grad_norm": 39.94550323486328, "learning_rate": 9.98783574870352e-07, "logits/chosen": -0.21335110068321228, "logits/rejected": -0.20708951354026794, "logps/chosen": -1.988205909729004, "logps/rejected": -2.133162021636963, "loss": 3.0632, "rewards/accuracies": 0.5, "rewards/chosen": -19.882061004638672, "rewards/margins": 1.4495601654052734, "rewards/rejected": -21.331623077392578, "step": 3560 }, { "epoch": 0.12015908861100812, "grad_norm": 88.8822021484375, "learning_rate": 9.987629838171248e-07, "logits/chosen": -0.6824513673782349, "logits/rejected": -0.7000759840011597, "logps/chosen": -1.8915393352508545, "logps/rejected": -1.930729866027832, "loss": 2.9006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.915393829345703, "rewards/margins": 0.39190226793289185, "rewards/rejected": -19.307296752929688, "step": 3565 }, { "epoch": 0.12032761468199131, "grad_norm": 24.55368995666504, "learning_rate": 9.987422201616792e-07, "logits/chosen": -0.585277259349823, "logits/rejected": -0.6046692132949829, "logps/chosen": -2.3041303157806396, "logps/rejected": -2.2990269660949707, "loss": 3.2232, "rewards/accuracies": 0.5, "rewards/chosen": -23.041301727294922, "rewards/margins": -0.05103177949786186, "rewards/rejected": -22.990270614624023, "step": 3570 }, { "epoch": 0.12049614075297449, "grad_norm": 18.27791976928711, "learning_rate": 9.98721283911201e-07, "logits/chosen": -0.28167563676834106, "logits/rejected": -0.320262610912323, "logps/chosen": -1.744283676147461, "logps/rejected": -1.791398286819458, "loss": 2.734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.442834854125977, "rewards/margins": 0.4711475372314453, "rewards/rejected": -17.913982391357422, "step": 3575 }, { "epoch": 0.12066466682395767, "grad_norm": 16.623876571655273, "learning_rate": 9.987001750729354e-07, "logits/chosen": -0.7061062455177307, "logits/rejected": -0.8444076776504517, "logps/chosen": -1.6471954584121704, "logps/rejected": -1.7329185009002686, "loss": 2.4762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.471954345703125, "rewards/margins": 0.8572310209274292, "rewards/rejected": -17.329185485839844, "step": 3580 }, { "epoch": 0.12083319289494085, "grad_norm": 57.20680236816406, "learning_rate": 9.98678893654187e-07, "logits/chosen": -0.08321056514978409, "logits/rejected": -0.08359815180301666, "logps/chosen": -2.018667221069336, "logps/rejected": -1.961265206336975, "loss": 3.642, "rewards/accuracies": 0.5, "rewards/chosen": -20.18667221069336, "rewards/margins": -0.5740194320678711, "rewards/rejected": -19.612651824951172, "step": 3585 }, { "epoch": 0.12100171896592403, "grad_norm": 17.29876708984375, "learning_rate": 9.986574396623208e-07, "logits/chosen": -0.3871995806694031, "logits/rejected": -0.45271673798561096, "logps/chosen": -2.271118640899658, "logps/rejected": -2.2195260524749756, "loss": 5.4421, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.711185455322266, "rewards/margins": -0.5159248113632202, "rewards/rejected": -22.19525909423828, "step": 3590 }, { "epoch": 0.12117024503690721, "grad_norm": 22.723844528198242, "learning_rate": 9.986358131047609e-07, "logits/chosen": -0.4757024645805359, "logits/rejected": -0.35431593656539917, "logps/chosen": -1.4331119060516357, "logps/rejected": -1.6594613790512085, "loss": 1.8193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.3311185836792, "rewards/margins": 2.2634944915771484, "rewards/rejected": -16.594614028930664, "step": 3595 }, { "epoch": 0.12133877110789039, "grad_norm": 77.45389556884766, "learning_rate": 9.986140139889916e-07, "logits/chosen": -0.04966864734888077, "logits/rejected": 0.07870599627494812, "logps/chosen": -2.0236928462982178, "logps/rejected": -2.0264344215393066, "loss": 3.1565, "rewards/accuracies": 0.5, "rewards/chosen": -20.236928939819336, "rewards/margins": 0.027413081377744675, "rewards/rejected": -20.26434326171875, "step": 3600 }, { "epoch": 0.12133877110789039, "eval_logits/chosen": -0.7526273131370544, "eval_logits/rejected": -0.7618316411972046, "eval_logps/chosen": -1.6854208707809448, "eval_logps/rejected": -1.6938505172729492, "eval_loss": 3.3851799964904785, "eval_rewards/accuracies": 0.4699999988079071, "eval_rewards/chosen": -16.854209899902344, "eval_rewards/margins": 0.08429720997810364, "eval_rewards/rejected": -16.938507080078125, "eval_runtime": 12.8944, "eval_samples_per_second": 7.755, "eval_steps_per_second": 1.939, "step": 3600 }, { "epoch": 0.12150729717887357, "grad_norm": 22.01042938232422, "learning_rate": 9.98592042322557e-07, "logits/chosen": -0.6339259147644043, "logits/rejected": -0.670158326625824, "logps/chosen": -1.7541553974151611, "logps/rejected": -1.723435640335083, "loss": 3.4452, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.541553497314453, "rewards/margins": -0.3071993887424469, "rewards/rejected": -17.23435401916504, "step": 3605 }, { "epoch": 0.12167582324985675, "grad_norm": 87.37193298339844, "learning_rate": 9.9856989811306e-07, "logits/chosen": -0.28349608182907104, "logits/rejected": -0.31893137097358704, "logps/chosen": -2.0025875568389893, "logps/rejected": -1.8682279586791992, "loss": 4.5711, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.025875091552734, "rewards/margins": -1.3435958623886108, "rewards/rejected": -18.682279586791992, "step": 3610 }, { "epoch": 0.12184434932083993, "grad_norm": 22.39496421813965, "learning_rate": 9.985475813681639e-07, "logits/chosen": -0.6315996050834656, "logits/rejected": -0.7218677401542664, "logps/chosen": -1.6861488819122314, "logps/rejected": -1.6557300090789795, "loss": 3.4551, "rewards/accuracies": 0.5, "rewards/chosen": -16.86149024963379, "rewards/margins": -0.3041892945766449, "rewards/rejected": -16.557300567626953, "step": 3615 }, { "epoch": 0.12201287539182311, "grad_norm": 24.289531707763672, "learning_rate": 9.985250920955921e-07, "logits/chosen": -0.590740978717804, "logits/rejected": -0.4106348156929016, "logps/chosen": -1.9113991260528564, "logps/rejected": -1.8914527893066406, "loss": 3.4557, "rewards/accuracies": 0.5, "rewards/chosen": -19.113990783691406, "rewards/margins": -0.19946375489234924, "rewards/rejected": -18.914525985717773, "step": 3620 }, { "epoch": 0.1221814014628063, "grad_norm": 31.299972534179688, "learning_rate": 9.98502430303127e-07, "logits/chosen": -0.2667599022388458, "logits/rejected": -0.2786843180656433, "logps/chosen": -2.008376359939575, "logps/rejected": -2.3521313667297363, "loss": 2.2606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.08376121520996, "rewards/margins": 3.437549591064453, "rewards/rejected": -23.521310806274414, "step": 3625 }, { "epoch": 0.12234992753378948, "grad_norm": 31.534088134765625, "learning_rate": 9.984795959986112e-07, "logits/chosen": 0.22582361102104187, "logits/rejected": 0.14064531028270721, "logps/chosen": -2.0859715938568115, "logps/rejected": -1.983786940574646, "loss": 4.2662, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.859716415405273, "rewards/margins": -1.0218474864959717, "rewards/rejected": -19.83786964416504, "step": 3630 }, { "epoch": 0.12251845360477266, "grad_norm": 32.42240905761719, "learning_rate": 9.984565891899463e-07, "logits/chosen": -0.1923142522573471, "logits/rejected": -0.25374284386634827, "logps/chosen": -1.7470464706420898, "logps/rejected": -1.6832809448242188, "loss": 3.7209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.47046661376953, "rewards/margins": -0.6376568675041199, "rewards/rejected": -16.832809448242188, "step": 3635 }, { "epoch": 0.12268697967575584, "grad_norm": 35.28072738647461, "learning_rate": 9.984334098850944e-07, "logits/chosen": -0.3545475900173187, "logits/rejected": -0.23502996563911438, "logps/chosen": -2.015934944152832, "logps/rejected": -1.9783185720443726, "loss": 3.5999, "rewards/accuracies": 0.5, "rewards/chosen": -20.159351348876953, "rewards/margins": -0.37616491317749023, "rewards/rejected": -19.783187866210938, "step": 3640 }, { "epoch": 0.12285550574673902, "grad_norm": 27.003463745117188, "learning_rate": 9.984100580920768e-07, "logits/chosen": -0.27759090065956116, "logits/rejected": -0.2839242219924927, "logps/chosen": -1.5591684579849243, "logps/rejected": -1.6653550863265991, "loss": 3.5842, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.591684341430664, "rewards/margins": 1.0618667602539062, "rewards/rejected": -16.65355110168457, "step": 3645 }, { "epoch": 0.1230240318177222, "grad_norm": 27.013710021972656, "learning_rate": 9.983865338189746e-07, "logits/chosen": -0.5425965189933777, "logits/rejected": -0.5991086959838867, "logps/chosen": -1.6972814798355103, "logps/rejected": -1.698444128036499, "loss": 3.2068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.97281265258789, "rewards/margins": 0.011627817526459694, "rewards/rejected": -16.984439849853516, "step": 3650 }, { "epoch": 0.12319255788870538, "grad_norm": 12.66113567352295, "learning_rate": 9.983628370739288e-07, "logits/chosen": -0.6093885898590088, "logits/rejected": -0.48226800560951233, "logps/chosen": -1.6346813440322876, "logps/rejected": -1.6976230144500732, "loss": 3.1563, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.346811294555664, "rewards/margins": 0.6294196844100952, "rewards/rejected": -16.976232528686523, "step": 3655 }, { "epoch": 0.12336108395968856, "grad_norm": 66.68254089355469, "learning_rate": 9.983389678651398e-07, "logits/chosen": -0.32299280166625977, "logits/rejected": -0.2723495662212372, "logps/chosen": -2.3520588874816895, "logps/rejected": -2.3130524158477783, "loss": 4.0511, "rewards/accuracies": 0.5, "rewards/chosen": -23.520587921142578, "rewards/margins": -0.39006367325782776, "rewards/rejected": -23.130521774291992, "step": 3660 }, { "epoch": 0.12352961003067174, "grad_norm": 18.499338150024414, "learning_rate": 9.983149262008677e-07, "logits/chosen": -0.7030686736106873, "logits/rejected": -0.6500356793403625, "logps/chosen": -1.80410635471344, "logps/rejected": -1.8138717412948608, "loss": 2.9992, "rewards/accuracies": 0.5, "rewards/chosen": -18.041065216064453, "rewards/margins": 0.09765472263097763, "rewards/rejected": -18.138717651367188, "step": 3665 }, { "epoch": 0.12369813610165492, "grad_norm": 22.921619415283203, "learning_rate": 9.982907120894325e-07, "logits/chosen": -0.4772499203681946, "logits/rejected": -0.6954213380813599, "logps/chosen": -1.6093018054962158, "logps/rejected": -1.5932279825210571, "loss": 3.2344, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.093017578125, "rewards/margins": -0.16073980927467346, "rewards/rejected": -15.932279586791992, "step": 3670 }, { "epoch": 0.1238666621726381, "grad_norm": 21.458614349365234, "learning_rate": 9.982663255392137e-07, "logits/chosen": -0.34770479798316956, "logits/rejected": -0.39969635009765625, "logps/chosen": -1.571874976158142, "logps/rejected": -1.5835822820663452, "loss": 3.1312, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.71875, "rewards/margins": 0.11707315593957901, "rewards/rejected": -15.835823059082031, "step": 3675 }, { "epoch": 0.1240351882436213, "grad_norm": 29.32381820678711, "learning_rate": 9.982417665586508e-07, "logits/chosen": -0.4228406846523285, "logits/rejected": -0.45245176553726196, "logps/chosen": -2.108222007751465, "logps/rejected": -1.8850529193878174, "loss": 6.1308, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.08222007751465, "rewards/margins": -2.2316925525665283, "rewards/rejected": -18.850528717041016, "step": 3680 }, { "epoch": 0.12420371431460447, "grad_norm": 25.706727981567383, "learning_rate": 9.98217035156242e-07, "logits/chosen": -0.32080286741256714, "logits/rejected": -0.32200899720191956, "logps/chosen": -2.022761583328247, "logps/rejected": -2.0280632972717285, "loss": 3.2838, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.227615356445312, "rewards/margins": 0.05301866680383682, "rewards/rejected": -20.2806339263916, "step": 3685 }, { "epoch": 0.12437224038558765, "grad_norm": 37.784912109375, "learning_rate": 9.981921313405464e-07, "logits/chosen": -0.3293726444244385, "logits/rejected": -0.42785710096359253, "logps/chosen": -1.616093635559082, "logps/rejected": -1.6466785669326782, "loss": 2.855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.160934448242188, "rewards/margins": 0.3058490753173828, "rewards/rejected": -16.466785430908203, "step": 3690 }, { "epoch": 0.12454076645657083, "grad_norm": 22.215789794921875, "learning_rate": 9.981670551201824e-07, "logits/chosen": -0.7644900679588318, "logits/rejected": -0.6727944016456604, "logps/chosen": -1.6866724491119385, "logps/rejected": -1.7234609127044678, "loss": 2.886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.866724014282227, "rewards/margins": 0.3678857684135437, "rewards/rejected": -17.234609603881836, "step": 3695 }, { "epoch": 0.12470929252755401, "grad_norm": 32.08245849609375, "learning_rate": 9.981418065038273e-07, "logits/chosen": -0.30399376153945923, "logits/rejected": -0.19136790931224823, "logps/chosen": -1.7521512508392334, "logps/rejected": -1.924407958984375, "loss": 1.9513, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.521514892578125, "rewards/margins": 1.7225669622421265, "rewards/rejected": -19.24407958984375, "step": 3700 }, { "epoch": 0.1248778185985372, "grad_norm": 35.04029846191406, "learning_rate": 9.981163855002192e-07, "logits/chosen": -0.35477086901664734, "logits/rejected": -0.43373337388038635, "logps/chosen": -1.8812839984893799, "logps/rejected": -1.8338260650634766, "loss": 3.5299, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.812841415405273, "rewards/margins": -0.4745791554450989, "rewards/rejected": -18.338260650634766, "step": 3705 }, { "epoch": 0.1250463446695204, "grad_norm": 18.448402404785156, "learning_rate": 9.98090792118155e-07, "logits/chosen": -0.2764646112918854, "logits/rejected": -0.3987608850002289, "logps/chosen": -1.8961387872695923, "logps/rejected": -2.0002424716949463, "loss": 2.5696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.961389541625977, "rewards/margins": 1.041035532951355, "rewards/rejected": -20.002424240112305, "step": 3710 }, { "epoch": 0.12521487074050355, "grad_norm": 20.717815399169922, "learning_rate": 9.980650263664917e-07, "logits/chosen": -0.2851320803165436, "logits/rejected": -0.3245907723903656, "logps/chosen": -1.7553514242172241, "logps/rejected": -1.8028484582901, "loss": 2.8536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.553516387939453, "rewards/margins": 0.4749675691127777, "rewards/rejected": -18.02848243713379, "step": 3715 }, { "epoch": 0.12538339681148675, "grad_norm": 17.02474021911621, "learning_rate": 9.980390882541456e-07, "logits/chosen": -0.7261825799942017, "logits/rejected": -0.675274670124054, "logps/chosen": -1.6618716716766357, "logps/rejected": -1.7756210565567017, "loss": 2.3753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.618717193603516, "rewards/margins": 1.137491226196289, "rewards/rejected": -17.756208419799805, "step": 3720 }, { "epoch": 0.1255519228824699, "grad_norm": 22.09320640563965, "learning_rate": 9.980129777900932e-07, "logits/chosen": -0.6453119516372681, "logits/rejected": -0.6601329445838928, "logps/chosen": -1.3661413192749023, "logps/rejected": -1.289612889289856, "loss": 3.8121, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -13.661413192749023, "rewards/margins": -0.7652846574783325, "rewards/rejected": -12.896130561828613, "step": 3725 }, { "epoch": 0.1257204489534531, "grad_norm": 19.057912826538086, "learning_rate": 9.9798669498337e-07, "logits/chosen": -0.4333661198616028, "logits/rejected": -0.4165084958076477, "logps/chosen": -1.9831759929656982, "logps/rejected": -2.0759761333465576, "loss": 2.2843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.83176040649414, "rewards/margins": 0.9280030131340027, "rewards/rejected": -20.759763717651367, "step": 3730 }, { "epoch": 0.12588897502443627, "grad_norm": 25.03388786315918, "learning_rate": 9.979602398430715e-07, "logits/chosen": -0.6505326628684998, "logits/rejected": -0.5560685396194458, "logps/chosen": -1.5252478122711182, "logps/rejected": -1.583319067955017, "loss": 2.7461, "rewards/accuracies": 0.5, "rewards/chosen": -15.252477645874023, "rewards/margins": 0.5807129144668579, "rewards/rejected": -15.83319091796875, "step": 3735 }, { "epoch": 0.12605750109541947, "grad_norm": 23.37642478942871, "learning_rate": 9.97933612378353e-07, "logits/chosen": -0.3261653780937195, "logits/rejected": -0.32249951362609863, "logps/chosen": -1.7047618627548218, "logps/rejected": -1.8726396560668945, "loss": 2.5351, "rewards/accuracies": 0.5, "rewards/chosen": -17.04762077331543, "rewards/margins": 1.6787769794464111, "rewards/rejected": -18.726398468017578, "step": 3740 }, { "epoch": 0.12622602716640263, "grad_norm": 21.604785919189453, "learning_rate": 9.97906812598429e-07, "logits/chosen": -0.5837305784225464, "logits/rejected": -0.6095893979072571, "logps/chosen": -1.9071247577667236, "logps/rejected": -1.909767746925354, "loss": 3.1111, "rewards/accuracies": 0.5, "rewards/chosen": -19.071247100830078, "rewards/margins": 0.026428794488310814, "rewards/rejected": -19.09767723083496, "step": 3745 }, { "epoch": 0.12639455323738583, "grad_norm": 31.987625122070312, "learning_rate": 9.978798405125739e-07, "logits/chosen": -0.37430623173713684, "logits/rejected": -0.5127814412117004, "logps/chosen": -1.7623825073242188, "logps/rejected": -1.9457428455352783, "loss": 2.3911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.623825073242188, "rewards/margins": 1.8336032629013062, "rewards/rejected": -19.457427978515625, "step": 3750 }, { "epoch": 0.126563079308369, "grad_norm": 22.882980346679688, "learning_rate": 9.978526961301218e-07, "logits/chosen": -0.28497645258903503, "logits/rejected": -0.4219549298286438, "logps/chosen": -1.6226459741592407, "logps/rejected": -1.6232162714004517, "loss": 3.2443, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.226459503173828, "rewards/margins": 0.0057045938447117805, "rewards/rejected": -16.23216438293457, "step": 3755 }, { "epoch": 0.12673160537935219, "grad_norm": 21.146074295043945, "learning_rate": 9.978253794604658e-07, "logits/chosen": -0.5370095372200012, "logits/rejected": -0.7515683174133301, "logps/chosen": -1.5996499061584473, "logps/rejected": -1.6215238571166992, "loss": 2.9973, "rewards/accuracies": 0.5, "rewards/chosen": -15.996500015258789, "rewards/margins": 0.218739315867424, "rewards/rejected": -16.21523666381836, "step": 3760 }, { "epoch": 0.12690013145033538, "grad_norm": 21.315244674682617, "learning_rate": 9.9779789051306e-07, "logits/chosen": -0.44618409872055054, "logits/rejected": -0.40568017959594727, "logps/chosen": -1.8859344720840454, "logps/rejected": -1.7330694198608398, "loss": 4.5439, "rewards/accuracies": 0.0, "rewards/chosen": -18.859344482421875, "rewards/margins": -1.5286482572555542, "rewards/rejected": -17.3306941986084, "step": 3765 }, { "epoch": 0.12706865752131855, "grad_norm": 98.00025939941406, "learning_rate": 9.977702292974165e-07, "logits/chosen": -0.5471475720405579, "logits/rejected": -0.5172755122184753, "logps/chosen": -1.957866907119751, "logps/rejected": -1.828743577003479, "loss": 4.3705, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.578670501708984, "rewards/margins": -1.291232705116272, "rewards/rejected": -18.28743553161621, "step": 3770 }, { "epoch": 0.12723718359230174, "grad_norm": 22.819908142089844, "learning_rate": 9.97742395823108e-07, "logits/chosen": -0.5692895650863647, "logits/rejected": -0.4863424301147461, "logps/chosen": -1.811985731124878, "logps/rejected": -1.6893869638442993, "loss": 4.2602, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.119857788085938, "rewards/margins": -1.2259877920150757, "rewards/rejected": -16.893869400024414, "step": 3775 }, { "epoch": 0.1274057096632849, "grad_norm": 28.10711097717285, "learning_rate": 9.977143900997664e-07, "logits/chosen": -0.550839900970459, "logits/rejected": -0.6493675112724304, "logps/chosen": -2.099008560180664, "logps/rejected": -2.022428512573242, "loss": 3.8717, "rewards/accuracies": 0.5, "rewards/chosen": -20.99008560180664, "rewards/margins": -0.7658005952835083, "rewards/rejected": -20.224285125732422, "step": 3780 }, { "epoch": 0.1275742357342681, "grad_norm": 57.06027603149414, "learning_rate": 9.976862121370838e-07, "logits/chosen": -0.37889954447746277, "logits/rejected": -0.48400768637657166, "logps/chosen": -1.8434131145477295, "logps/rejected": -1.9132566452026367, "loss": 2.4397, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.434131622314453, "rewards/margins": 0.6984332203865051, "rewards/rejected": -19.132564544677734, "step": 3785 }, { "epoch": 0.12774276180525126, "grad_norm": 29.508501052856445, "learning_rate": 9.976578619448112e-07, "logits/chosen": -0.6355900764465332, "logits/rejected": -0.6713854074478149, "logps/chosen": -1.7098877429962158, "logps/rejected": -1.7345527410507202, "loss": 2.9836, "rewards/accuracies": 0.5, "rewards/chosen": -17.098876953125, "rewards/margins": 0.24665145576000214, "rewards/rejected": -17.34552764892578, "step": 3790 }, { "epoch": 0.12791128787623446, "grad_norm": 23.302797317504883, "learning_rate": 9.976293395327596e-07, "logits/chosen": -0.6434440016746521, "logits/rejected": -0.4357782304286957, "logps/chosen": -1.9776248931884766, "logps/rejected": -2.018247604370117, "loss": 3.053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.776248931884766, "rewards/margins": 0.40622633695602417, "rewards/rejected": -20.18247413635254, "step": 3795 }, { "epoch": 0.12807981394721762, "grad_norm": 76.14665985107422, "learning_rate": 9.976006449107993e-07, "logits/chosen": -0.30134084820747375, "logits/rejected": -0.23310093581676483, "logps/chosen": -2.2058663368225098, "logps/rejected": -2.109875440597534, "loss": 4.0036, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.05866241455078, "rewards/margins": -0.9599083065986633, "rewards/rejected": -21.098752975463867, "step": 3800 }, { "epoch": 0.12824834001820082, "grad_norm": 23.990066528320312, "learning_rate": 9.975717780888602e-07, "logits/chosen": -0.4945443272590637, "logits/rejected": -0.5459513068199158, "logps/chosen": -1.6550929546356201, "logps/rejected": -1.7307243347167969, "loss": 2.461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.55093002319336, "rewards/margins": 0.7563133239746094, "rewards/rejected": -17.307241439819336, "step": 3805 }, { "epoch": 0.12841686608918398, "grad_norm": 23.518524169921875, "learning_rate": 9.975427390769327e-07, "logits/chosen": -0.7974092364311218, "logits/rejected": -0.8891481161117554, "logps/chosen": -1.5475536584854126, "logps/rejected": -1.5537341833114624, "loss": 3.0766, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.475537300109863, "rewards/margins": 0.061804674565792084, "rewards/rejected": -15.53734302520752, "step": 3810 }, { "epoch": 0.12858539216016718, "grad_norm": 25.759145736694336, "learning_rate": 9.975135278850652e-07, "logits/chosen": -0.46052369475364685, "logits/rejected": -0.4665374159812927, "logps/chosen": -2.109952449798584, "logps/rejected": -2.2452311515808105, "loss": 2.0272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.099523544311523, "rewards/margins": 1.352790355682373, "rewards/rejected": -22.452314376831055, "step": 3815 }, { "epoch": 0.12875391823115037, "grad_norm": 22.549362182617188, "learning_rate": 9.974841445233673e-07, "logits/chosen": -0.767123818397522, "logits/rejected": -0.804367184638977, "logps/chosen": -1.5822185277938843, "logps/rejected": -1.6508678197860718, "loss": 2.7948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.822186470031738, "rewards/margins": 0.6864920854568481, "rewards/rejected": -16.508676528930664, "step": 3820 }, { "epoch": 0.12892244430213354, "grad_norm": 19.02248764038086, "learning_rate": 9.97454589002007e-07, "logits/chosen": -0.48755064606666565, "logits/rejected": -0.7069557309150696, "logps/chosen": -1.649071455001831, "logps/rejected": -1.7350966930389404, "loss": 2.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.49071502685547, "rewards/margins": 0.8602536916732788, "rewards/rejected": -17.350969314575195, "step": 3825 }, { "epoch": 0.12909097037311673, "grad_norm": 17.452266693115234, "learning_rate": 9.974248613312122e-07, "logits/chosen": -0.5228408575057983, "logits/rejected": -0.5327178239822388, "logps/chosen": -1.8645496368408203, "logps/rejected": -1.6770515441894531, "loss": 4.9626, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.64549446105957, "rewards/margins": -1.874979019165039, "rewards/rejected": -16.77051544189453, "step": 3830 }, { "epoch": 0.1292594964440999, "grad_norm": 20.079130172729492, "learning_rate": 9.973949615212709e-07, "logits/chosen": -0.11785700172185898, "logits/rejected": -0.11308972537517548, "logps/chosen": -2.2112364768981934, "logps/rejected": -2.1767048835754395, "loss": 3.9099, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.112361907958984, "rewards/margins": -0.34531623125076294, "rewards/rejected": -21.767047882080078, "step": 3835 }, { "epoch": 0.1294280225150831, "grad_norm": 29.872507095336914, "learning_rate": 9.973648895825297e-07, "logits/chosen": -0.5825292468070984, "logits/rejected": -0.6343661546707153, "logps/chosen": -1.552022933959961, "logps/rejected": -1.5541592836380005, "loss": 3.1794, "rewards/accuracies": 0.5, "rewards/chosen": -15.520230293273926, "rewards/margins": 0.021363258361816406, "rewards/rejected": -15.541592597961426, "step": 3840 }, { "epoch": 0.12959654858606626, "grad_norm": 28.075502395629883, "learning_rate": 9.973346455253959e-07, "logits/chosen": -0.5383496284484863, "logits/rejected": -0.4658065736293793, "logps/chosen": -1.51207435131073, "logps/rejected": -1.427404522895813, "loss": 3.9488, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.120744705200195, "rewards/margins": -0.8466998934745789, "rewards/rejected": -14.27404499053955, "step": 3845 }, { "epoch": 0.12976507465704945, "grad_norm": 33.11229705810547, "learning_rate": 9.973042293603354e-07, "logits/chosen": -0.24757274985313416, "logits/rejected": -0.4024467468261719, "logps/chosen": -1.6025587320327759, "logps/rejected": -1.7867257595062256, "loss": 2.643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.025588989257812, "rewards/margins": 1.8416721820831299, "rewards/rejected": -17.867259979248047, "step": 3850 }, { "epoch": 0.12993360072803262, "grad_norm": 13.463517189025879, "learning_rate": 9.97273641097874e-07, "logits/chosen": -0.21165132522583008, "logits/rejected": -0.36822745203971863, "logps/chosen": -1.5910065174102783, "logps/rejected": -1.629421591758728, "loss": 2.9384, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.910066604614258, "rewards/margins": 0.38414907455444336, "rewards/rejected": -16.29421615600586, "step": 3855 }, { "epoch": 0.1301021267990158, "grad_norm": 17.587718963623047, "learning_rate": 9.972428807485972e-07, "logits/chosen": -0.4675527513027191, "logits/rejected": -0.412848562002182, "logps/chosen": -1.6668990850448608, "logps/rejected": -1.8437143564224243, "loss": 1.9073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.668991088867188, "rewards/margins": 1.7681528329849243, "rewards/rejected": -18.437143325805664, "step": 3860 }, { "epoch": 0.13027065286999898, "grad_norm": 23.6627197265625, "learning_rate": 9.972119483231502e-07, "logits/chosen": -0.6713553667068481, "logits/rejected": -0.6638098955154419, "logps/chosen": -1.6672265529632568, "logps/rejected": -1.875038504600525, "loss": 2.3677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.67226791381836, "rewards/margins": 2.0781185626983643, "rewards/rejected": -18.750385284423828, "step": 3865 }, { "epoch": 0.13043917894098217, "grad_norm": 27.069753646850586, "learning_rate": 9.97180843832237e-07, "logits/chosen": -0.6440941095352173, "logits/rejected": -0.5108510851860046, "logps/chosen": -1.7878713607788086, "logps/rejected": -1.7104899883270264, "loss": 3.883, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.878713607788086, "rewards/margins": -0.773812472820282, "rewards/rejected": -17.104900360107422, "step": 3870 }, { "epoch": 0.13060770501196536, "grad_norm": 19.128957748413086, "learning_rate": 9.97149567286622e-07, "logits/chosen": -0.5943403840065002, "logits/rejected": -0.6030054688453674, "logps/chosen": -1.5599461793899536, "logps/rejected": -1.617856740951538, "loss": 2.6472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.599462509155273, "rewards/margins": 0.5791065096855164, "rewards/rejected": -16.178569793701172, "step": 3875 }, { "epoch": 0.13077623108294853, "grad_norm": 20.993228912353516, "learning_rate": 9.97118118697129e-07, "logits/chosen": -0.6433027982711792, "logits/rejected": -0.5695537328720093, "logps/chosen": -1.9984300136566162, "logps/rejected": -2.169884204864502, "loss": 2.2335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.98430061340332, "rewards/margins": 1.7145410776138306, "rewards/rejected": -21.698841094970703, "step": 3880 }, { "epoch": 0.13094475715393172, "grad_norm": 23.70119285583496, "learning_rate": 9.970864980746402e-07, "logits/chosen": -0.7846357226371765, "logits/rejected": -0.6926722526550293, "logps/chosen": -1.5518462657928467, "logps/rejected": -1.6256211996078491, "loss": 2.7388, "rewards/accuracies": 0.5, "rewards/chosen": -15.518463134765625, "rewards/margins": 0.7377495765686035, "rewards/rejected": -16.25621223449707, "step": 3885 }, { "epoch": 0.1311132832249149, "grad_norm": 20.39211654663086, "learning_rate": 9.970547054300993e-07, "logits/chosen": -0.6103144884109497, "logits/rejected": -0.4634561538696289, "logps/chosen": -1.693368911743164, "logps/rejected": -1.6804195642471313, "loss": 3.4093, "rewards/accuracies": 0.5, "rewards/chosen": -16.93368911743164, "rewards/margins": -0.12949447333812714, "rewards/rejected": -16.8041934967041, "step": 3890 }, { "epoch": 0.13128180929589808, "grad_norm": 23.615053176879883, "learning_rate": 9.970227407745077e-07, "logits/chosen": -0.7008322477340698, "logits/rejected": -0.63578861951828, "logps/chosen": -1.8769195079803467, "logps/rejected": -1.9181302785873413, "loss": 3.0259, "rewards/accuracies": 0.5, "rewards/chosen": -18.769195556640625, "rewards/margins": 0.41210755705833435, "rewards/rejected": -19.181303024291992, "step": 3895 }, { "epoch": 0.13145033536688125, "grad_norm": 20.815834045410156, "learning_rate": 9.969906041189276e-07, "logits/chosen": -0.6005369424819946, "logits/rejected": -0.4755684733390808, "logps/chosen": -1.6404476165771484, "logps/rejected": -1.680262804031372, "loss": 2.9446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.404476165771484, "rewards/margins": 0.3981497883796692, "rewards/rejected": -16.80262565612793, "step": 3900 }, { "epoch": 0.13161886143786444, "grad_norm": 23.32740592956543, "learning_rate": 9.969582954744799e-07, "logits/chosen": -0.4026781916618347, "logits/rejected": -0.37207791209220886, "logps/chosen": -1.8272647857666016, "logps/rejected": -1.9074573516845703, "loss": 2.4612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.27264976501465, "rewards/margins": 0.8019250631332397, "rewards/rejected": -19.074573516845703, "step": 3905 }, { "epoch": 0.1317873875088476, "grad_norm": 37.29978561401367, "learning_rate": 9.96925814852346e-07, "logits/chosen": -0.4581456184387207, "logits/rejected": -0.5759280920028687, "logps/chosen": -1.7276592254638672, "logps/rejected": -1.7996619939804077, "loss": 2.7312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.27659034729004, "rewards/margins": 0.720028817653656, "rewards/rejected": -17.996620178222656, "step": 3910 }, { "epoch": 0.1319559135798308, "grad_norm": 29.896190643310547, "learning_rate": 9.968931622637651e-07, "logits/chosen": -0.23583588004112244, "logits/rejected": -0.2508379817008972, "logps/chosen": -2.083098888397217, "logps/rejected": -2.0896639823913574, "loss": 3.1457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.830989837646484, "rewards/margins": 0.06565189361572266, "rewards/rejected": -20.89664077758789, "step": 3915 }, { "epoch": 0.13212443965081397, "grad_norm": 18.37214469909668, "learning_rate": 9.968603377200377e-07, "logits/chosen": -0.54749995470047, "logits/rejected": -0.3493199348449707, "logps/chosen": -1.6087677478790283, "logps/rejected": -1.9329464435577393, "loss": 1.8455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.087677001953125, "rewards/margins": 3.241788387298584, "rewards/rejected": -19.329465866088867, "step": 3920 }, { "epoch": 0.13229296572179716, "grad_norm": 25.379499435424805, "learning_rate": 9.96827341232523e-07, "logits/chosen": -0.473417192697525, "logits/rejected": -0.4509311616420746, "logps/chosen": -1.8252029418945312, "logps/rejected": -1.9195716381072998, "loss": 2.9775, "rewards/accuracies": 0.5, "rewards/chosen": -18.252029418945312, "rewards/margins": 0.9436872601509094, "rewards/rejected": -19.195714950561523, "step": 3925 }, { "epoch": 0.13246149179278036, "grad_norm": 23.761089324951172, "learning_rate": 9.967941728126398e-07, "logits/chosen": -0.5376905202865601, "logits/rejected": -0.43829545378685, "logps/chosen": -1.7135887145996094, "logps/rejected": -1.6739921569824219, "loss": 3.5778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.135887145996094, "rewards/margins": -0.39596787095069885, "rewards/rejected": -16.739919662475586, "step": 3930 }, { "epoch": 0.13263001786376352, "grad_norm": 38.03128433227539, "learning_rate": 9.967608324718661e-07, "logits/chosen": -0.5036236047744751, "logits/rejected": -0.6714465618133545, "logps/chosen": -1.942854642868042, "logps/rejected": -2.249541759490967, "loss": 2.6183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.428546905517578, "rewards/margins": 3.0668723583221436, "rewards/rejected": -22.495418548583984, "step": 3935 }, { "epoch": 0.13279854393474672, "grad_norm": 15.746185302734375, "learning_rate": 9.9672732022174e-07, "logits/chosen": -0.12133710086345673, "logits/rejected": -0.2733135521411896, "logps/chosen": -1.9632943868637085, "logps/rejected": -2.0086188316345215, "loss": 3.6416, "rewards/accuracies": 0.5, "rewards/chosen": -19.632946014404297, "rewards/margins": 0.45324498414993286, "rewards/rejected": -20.08618927001953, "step": 3940 }, { "epoch": 0.13296707000572988, "grad_norm": 26.305767059326172, "learning_rate": 9.966936360738586e-07, "logits/chosen": -0.7777701616287231, "logits/rejected": -0.9092004895210266, "logps/chosen": -1.8936526775360107, "logps/rejected": -1.915820837020874, "loss": 3.3031, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.936527252197266, "rewards/margins": 0.2216823548078537, "rewards/rejected": -19.1582088470459, "step": 3945 }, { "epoch": 0.13313559607671308, "grad_norm": 21.544748306274414, "learning_rate": 9.966597800398789e-07, "logits/chosen": -0.5004564523696899, "logits/rejected": -0.49012812972068787, "logps/chosen": -1.747496247291565, "logps/rejected": -1.7987169027328491, "loss": 2.7442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.474964141845703, "rewards/margins": 0.5122075080871582, "rewards/rejected": -17.987171173095703, "step": 3950 }, { "epoch": 0.13330412214769624, "grad_norm": 17.286357879638672, "learning_rate": 9.966257521315166e-07, "logits/chosen": -0.20877251029014587, "logits/rejected": -0.23092889785766602, "logps/chosen": -2.1308019161224365, "logps/rejected": -2.1815826892852783, "loss": 2.6274, "rewards/accuracies": 0.5, "rewards/chosen": -21.308019638061523, "rewards/margins": 0.5078089833259583, "rewards/rejected": -21.815828323364258, "step": 3955 }, { "epoch": 0.13347264821867943, "grad_norm": 18.512609481811523, "learning_rate": 9.965915523605482e-07, "logits/chosen": -0.5298896431922913, "logits/rejected": -0.37859925627708435, "logps/chosen": -1.8345773220062256, "logps/rejected": -1.9792572259902954, "loss": 2.3234, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.34577178955078, "rewards/margins": 1.446800947189331, "rewards/rejected": -19.792572021484375, "step": 3960 }, { "epoch": 0.1336411742896626, "grad_norm": 19.06651496887207, "learning_rate": 9.965571807388082e-07, "logits/chosen": -0.41732341051101685, "logits/rejected": -0.3108731508255005, "logps/chosen": -1.5117712020874023, "logps/rejected": -1.74689519405365, "loss": 2.5905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.117711067199707, "rewards/margins": 2.3512415885925293, "rewards/rejected": -17.468952178955078, "step": 3965 }, { "epoch": 0.1338097003606458, "grad_norm": 21.32099723815918, "learning_rate": 9.965226372781914e-07, "logits/chosen": -0.7179542779922485, "logits/rejected": -0.7836523056030273, "logps/chosen": -1.6507011651992798, "logps/rejected": -1.714941382408142, "loss": 2.5683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.50701332092285, "rewards/margins": 0.6424006223678589, "rewards/rejected": -17.1494140625, "step": 3970 }, { "epoch": 0.13397822643162896, "grad_norm": 15.45632266998291, "learning_rate": 9.96487921990652e-07, "logits/chosen": -0.6785871386528015, "logits/rejected": -0.824084460735321, "logps/chosen": -1.446620225906372, "logps/rejected": -1.482995629310608, "loss": 2.9628, "rewards/accuracies": 0.5, "rewards/chosen": -14.466203689575195, "rewards/margins": 0.36375388503074646, "rewards/rejected": -14.8299560546875, "step": 3975 }, { "epoch": 0.13414675250261215, "grad_norm": 18.907337188720703, "learning_rate": 9.96453034888204e-07, "logits/chosen": -0.6268816590309143, "logits/rejected": -0.5451667904853821, "logps/chosen": -1.8741906881332397, "logps/rejected": -1.7744128704071045, "loss": 4.0882, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.741907119750977, "rewards/margins": -0.9977784156799316, "rewards/rejected": -17.744129180908203, "step": 3980 }, { "epoch": 0.13431527857359535, "grad_norm": 29.47572135925293, "learning_rate": 9.964179759829199e-07, "logits/chosen": -0.13526254892349243, "logits/rejected": -0.24333901703357697, "logps/chosen": -1.850494146347046, "logps/rejected": -1.8943946361541748, "loss": 3.1622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.504940032958984, "rewards/margins": 0.4390055537223816, "rewards/rejected": -18.943946838378906, "step": 3985 }, { "epoch": 0.13448380464457851, "grad_norm": 45.685768127441406, "learning_rate": 9.963827452869325e-07, "logits/chosen": -0.32695698738098145, "logits/rejected": -0.3549268841743469, "logps/chosen": -1.690030813217163, "logps/rejected": -1.7497482299804688, "loss": 3.099, "rewards/accuracies": 0.5, "rewards/chosen": -16.90030860900879, "rewards/margins": 0.5971736907958984, "rewards/rejected": -17.497482299804688, "step": 3990 }, { "epoch": 0.1346523307155617, "grad_norm": 38.33502960205078, "learning_rate": 9.963473428124334e-07, "logits/chosen": -0.4510404169559479, "logits/rejected": -0.5716635584831238, "logps/chosen": -1.9731696844100952, "logps/rejected": -1.9337685108184814, "loss": 3.5215, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.731698989868164, "rewards/margins": -0.39401358366012573, "rewards/rejected": -19.337682723999023, "step": 3995 }, { "epoch": 0.13482085678654487, "grad_norm": 18.996416091918945, "learning_rate": 9.963117685716744e-07, "logits/chosen": -0.7336264848709106, "logits/rejected": -0.6646067500114441, "logps/chosen": -1.6362497806549072, "logps/rejected": -1.7025432586669922, "loss": 2.7504, "rewards/accuracies": 0.5, "rewards/chosen": -16.362497329711914, "rewards/margins": 0.6629348993301392, "rewards/rejected": -17.025432586669922, "step": 4000 }, { "epoch": 0.13482085678654487, "eval_logits/chosen": -0.7574084997177124, "eval_logits/rejected": -0.7684286236763, "eval_logps/chosen": -1.6912825107574463, "eval_logps/rejected": -1.7017545700073242, "eval_loss": 3.3711276054382324, "eval_rewards/accuracies": 0.47999998927116394, "eval_rewards/chosen": -16.912826538085938, "eval_rewards/margins": 0.10472100228071213, "eval_rewards/rejected": -17.017545700073242, "eval_runtime": 12.8943, "eval_samples_per_second": 7.755, "eval_steps_per_second": 1.939, "step": 4000 }, { "epoch": 0.13498938285752807, "grad_norm": 44.0849723815918, "learning_rate": 9.962760225769664e-07, "logits/chosen": -0.2051403522491455, "logits/rejected": -0.19655892252922058, "logps/chosen": -2.0785844326019287, "logps/rejected": -2.2632524967193604, "loss": 1.9564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.785846710205078, "rewards/margins": 1.8466781377792358, "rewards/rejected": -22.632522583007812, "step": 4005 }, { "epoch": 0.13515790892851123, "grad_norm": 29.675466537475586, "learning_rate": 9.962401048406792e-07, "logits/chosen": -0.1515626162290573, "logits/rejected": -0.11530622094869614, "logps/chosen": -2.379957675933838, "logps/rejected": -2.8324062824249268, "loss": 3.4024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.799577713012695, "rewards/margins": 4.524485111236572, "rewards/rejected": -28.32406234741211, "step": 4010 }, { "epoch": 0.13532643499949443, "grad_norm": 13.641871452331543, "learning_rate": 9.96204015375243e-07, "logits/chosen": -0.8857007026672363, "logits/rejected": -0.8562878370285034, "logps/chosen": -1.5614138841629028, "logps/rejected": -1.7495664358139038, "loss": 1.9699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.614137649536133, "rewards/margins": 1.881527304649353, "rewards/rejected": -17.495664596557617, "step": 4015 }, { "epoch": 0.1354949610704776, "grad_norm": 22.9551944732666, "learning_rate": 9.961677541931466e-07, "logits/chosen": -0.5161920785903931, "logits/rejected": -0.43604689836502075, "logps/chosen": -1.6171886920928955, "logps/rejected": -1.686789870262146, "loss": 2.6197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.171884536743164, "rewards/margins": 0.6960126757621765, "rewards/rejected": -16.86789894104004, "step": 4020 }, { "epoch": 0.1356634871414608, "grad_norm": 21.6877384185791, "learning_rate": 9.961313213069386e-07, "logits/chosen": -0.6385021209716797, "logits/rejected": -0.441770076751709, "logps/chosen": -1.7623170614242554, "logps/rejected": -1.9857242107391357, "loss": 1.9666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.623170852661133, "rewards/margins": 2.2340712547302246, "rewards/rejected": -19.857242584228516, "step": 4025 }, { "epoch": 0.13583201321244395, "grad_norm": 33.36929702758789, "learning_rate": 9.960947167292274e-07, "logits/chosen": -0.7597072720527649, "logits/rejected": -0.5987949371337891, "logps/chosen": -1.963201880455017, "logps/rejected": -2.146829605102539, "loss": 2.8322, "rewards/accuracies": 0.5, "rewards/chosen": -19.632022857666016, "rewards/margins": 1.8362756967544556, "rewards/rejected": -21.46829605102539, "step": 4030 }, { "epoch": 0.13600053928342715, "grad_norm": 22.11895751953125, "learning_rate": 9.960579404726797e-07, "logits/chosen": -0.36422428488731384, "logits/rejected": -0.222096249461174, "logps/chosen": -1.9910913705825806, "logps/rejected": -2.0189363956451416, "loss": 3.3419, "rewards/accuracies": 0.5, "rewards/chosen": -19.910913467407227, "rewards/margins": 0.2784494459629059, "rewards/rejected": -20.189363479614258, "step": 4035 }, { "epoch": 0.13616906535441034, "grad_norm": 31.01476287841797, "learning_rate": 9.96020992550023e-07, "logits/chosen": -0.3307796120643616, "logits/rejected": -0.5669467449188232, "logps/chosen": -1.5970932245254517, "logps/rejected": -1.6248077154159546, "loss": 2.8625, "rewards/accuracies": 0.5, "rewards/chosen": -15.970930099487305, "rewards/margins": 0.27714595198631287, "rewards/rejected": -16.248077392578125, "step": 4040 }, { "epoch": 0.1363375914253935, "grad_norm": 24.931251525878906, "learning_rate": 9.95983872974043e-07, "logits/chosen": -0.6774837970733643, "logits/rejected": -0.595493495464325, "logps/chosen": -1.5050591230392456, "logps/rejected": -1.4872527122497559, "loss": 3.2411, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.050592422485352, "rewards/margins": -0.17806454002857208, "rewards/rejected": -14.872526168823242, "step": 4045 }, { "epoch": 0.1365061174963767, "grad_norm": 33.974796295166016, "learning_rate": 9.959465817575858e-07, "logits/chosen": -0.5587460398674011, "logits/rejected": -0.32978394627571106, "logps/chosen": -1.6928952932357788, "logps/rejected": -1.8271658420562744, "loss": 2.4119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.928955078125, "rewards/margins": 1.3427047729492188, "rewards/rejected": -18.271657943725586, "step": 4050 }, { "epoch": 0.13667464356735987, "grad_norm": 16.821455001831055, "learning_rate": 9.95909118913556e-07, "logits/chosen": -0.8507031202316284, "logits/rejected": -0.7523888349533081, "logps/chosen": -1.645453691482544, "logps/rejected": -1.8627036809921265, "loss": 1.3962, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.45453453063965, "rewards/margins": 2.1725001335144043, "rewards/rejected": -18.62703514099121, "step": 4055 }, { "epoch": 0.13684316963834306, "grad_norm": 23.652524948120117, "learning_rate": 9.958714844549183e-07, "logits/chosen": -0.6848211288452148, "logits/rejected": -0.7481715083122253, "logps/chosen": -1.671247124671936, "logps/rejected": -1.6934289932250977, "loss": 3.0554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.712472915649414, "rewards/margins": 0.22181835770606995, "rewards/rejected": -16.934289932250977, "step": 4060 }, { "epoch": 0.13701169570932623, "grad_norm": 8.807697296142578, "learning_rate": 9.958336783946964e-07, "logits/chosen": -0.10141198337078094, "logits/rejected": -0.18145883083343506, "logps/chosen": -1.7855380773544312, "logps/rejected": -1.9229755401611328, "loss": 2.2009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.85538101196289, "rewards/margins": 1.3743743896484375, "rewards/rejected": -19.229755401611328, "step": 4065 }, { "epoch": 0.13718022178030942, "grad_norm": 10.74984359741211, "learning_rate": 9.957957007459734e-07, "logits/chosen": -0.5784136056900024, "logits/rejected": -0.5899661779403687, "logps/chosen": -1.6685832738876343, "logps/rejected": -1.7646796703338623, "loss": 2.4155, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.685832977294922, "rewards/margins": 0.9609651565551758, "rewards/rejected": -17.64679527282715, "step": 4070 }, { "epoch": 0.13734874785129259, "grad_norm": 15.270707130432129, "learning_rate": 9.957575515218919e-07, "logits/chosen": -0.7281454205513, "logits/rejected": -0.6709175109863281, "logps/chosen": -1.7470428943634033, "logps/rejected": -1.9406992197036743, "loss": 3.2565, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.470428466796875, "rewards/margins": 1.9365642070770264, "rewards/rejected": -19.406991958618164, "step": 4075 }, { "epoch": 0.13751727392227578, "grad_norm": 39.64847183227539, "learning_rate": 9.95719230735654e-07, "logits/chosen": -0.1662284880876541, "logits/rejected": -0.023158108815550804, "logps/chosen": -1.5711307525634766, "logps/rejected": -1.640824556350708, "loss": 2.6556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.71130657196045, "rewards/margins": 0.6969379186630249, "rewards/rejected": -16.408245086669922, "step": 4080 }, { "epoch": 0.13768579999325894, "grad_norm": 55.75159454345703, "learning_rate": 9.956807384005209e-07, "logits/chosen": -0.5080665946006775, "logits/rejected": -0.5409480333328247, "logps/chosen": -1.77492356300354, "logps/rejected": -1.7267024517059326, "loss": 3.8746, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.749237060546875, "rewards/margins": -0.4822106957435608, "rewards/rejected": -17.267024993896484, "step": 4085 }, { "epoch": 0.13785432606424214, "grad_norm": 44.708839416503906, "learning_rate": 9.956420745298132e-07, "logits/chosen": -0.5287455320358276, "logits/rejected": -0.45763248205184937, "logps/chosen": -1.798195242881775, "logps/rejected": -1.77925705909729, "loss": 3.2948, "rewards/accuracies": 0.5, "rewards/chosen": -17.981952667236328, "rewards/margins": -0.18938274681568146, "rewards/rejected": -17.792570114135742, "step": 4090 }, { "epoch": 0.13802285213522533, "grad_norm": 25.557100296020508, "learning_rate": 9.956032391369109e-07, "logits/chosen": -0.2953604757785797, "logits/rejected": -0.36451655626296997, "logps/chosen": -2.143887758255005, "logps/rejected": -1.942565679550171, "loss": 5.0524, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.43887710571289, "rewards/margins": -2.0132193565368652, "rewards/rejected": -19.425655364990234, "step": 4095 }, { "epoch": 0.1381913782062085, "grad_norm": 26.615631103515625, "learning_rate": 9.955642322352538e-07, "logits/chosen": -0.5584558844566345, "logits/rejected": -0.5644673705101013, "logps/chosen": -1.7165279388427734, "logps/rejected": -1.680544137954712, "loss": 3.7856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.165279388427734, "rewards/margins": -0.35983896255493164, "rewards/rejected": -16.80544090270996, "step": 4100 }, { "epoch": 0.1383599042771917, "grad_norm": 17.09918975830078, "learning_rate": 9.955250538383402e-07, "logits/chosen": -0.4320560097694397, "logits/rejected": -0.5020492076873779, "logps/chosen": -1.728809118270874, "logps/rejected": -1.7111473083496094, "loss": 3.5286, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.2880916595459, "rewards/margins": -0.1766192466020584, "rewards/rejected": -17.111474990844727, "step": 4105 }, { "epoch": 0.13852843034817486, "grad_norm": 32.29576110839844, "learning_rate": 9.954857039597285e-07, "logits/chosen": -0.5975824594497681, "logits/rejected": -0.6915780305862427, "logps/chosen": -1.9237430095672607, "logps/rejected": -1.9363324642181396, "loss": 3.0509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.237430572509766, "rewards/margins": 0.12589678168296814, "rewards/rejected": -19.363325119018555, "step": 4110 }, { "epoch": 0.13869695641915805, "grad_norm": 15.100337028503418, "learning_rate": 9.95446182613036e-07, "logits/chosen": -0.8075372576713562, "logits/rejected": -0.6522815227508545, "logps/chosen": -1.5407874584197998, "logps/rejected": -1.5490734577178955, "loss": 3.0917, "rewards/accuracies": 0.5, "rewards/chosen": -15.407875061035156, "rewards/margins": 0.08286003768444061, "rewards/rejected": -15.490735054016113, "step": 4115 }, { "epoch": 0.13886548249014122, "grad_norm": 12.08738899230957, "learning_rate": 9.954064898119393e-07, "logits/chosen": -0.5792916417121887, "logits/rejected": -0.7377229332923889, "logps/chosen": -1.6446812152862549, "logps/rejected": -1.7030906677246094, "loss": 2.9207, "rewards/accuracies": 0.5, "rewards/chosen": -16.44681167602539, "rewards/margins": 0.5840937495231628, "rewards/rejected": -17.03090476989746, "step": 4120 }, { "epoch": 0.1390340085611244, "grad_norm": 20.199708938598633, "learning_rate": 9.953666255701747e-07, "logits/chosen": -0.4421865940093994, "logits/rejected": -0.34469595551490784, "logps/chosen": -2.231015682220459, "logps/rejected": -1.954992651939392, "loss": 7.4907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.31015968322754, "rewards/margins": -2.7602322101593018, "rewards/rejected": -19.5499267578125, "step": 4125 }, { "epoch": 0.13920253463210758, "grad_norm": 24.32143211364746, "learning_rate": 9.953265899015378e-07, "logits/chosen": -0.45347872376441956, "logits/rejected": -0.5372802019119263, "logps/chosen": -2.0228564739227295, "logps/rejected": -2.0596730709075928, "loss": 3.7307, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.228565216064453, "rewards/margins": 0.36816471815109253, "rewards/rejected": -20.596731185913086, "step": 4130 }, { "epoch": 0.13937106070309077, "grad_norm": 18.27474594116211, "learning_rate": 9.952863828198832e-07, "logits/chosen": -0.5868837833404541, "logits/rejected": -0.8388587236404419, "logps/chosen": -1.9159198999404907, "logps/rejected": -1.8486074209213257, "loss": 3.7772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.159198760986328, "rewards/margins": -0.6731247901916504, "rewards/rejected": -18.486074447631836, "step": 4135 }, { "epoch": 0.13953958677407394, "grad_norm": 29.658910751342773, "learning_rate": 9.952460043391251e-07, "logits/chosen": -0.4052742123603821, "logits/rejected": -0.30263233184814453, "logps/chosen": -1.8732227087020874, "logps/rejected": -1.9779338836669922, "loss": 2.7132, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.732229232788086, "rewards/margins": 1.0471103191375732, "rewards/rejected": -19.779338836669922, "step": 4140 }, { "epoch": 0.13970811284505713, "grad_norm": 18.792110443115234, "learning_rate": 9.952054544732366e-07, "logits/chosen": -0.7875449657440186, "logits/rejected": -0.8418458700180054, "logps/chosen": -1.5930900573730469, "logps/rejected": -1.4902503490447998, "loss": 4.2111, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.930900573730469, "rewards/margins": -1.0283968448638916, "rewards/rejected": -14.902502059936523, "step": 4145 }, { "epoch": 0.13987663891604032, "grad_norm": 15.064722061157227, "learning_rate": 9.95164733236251e-07, "logits/chosen": -0.45020851492881775, "logits/rejected": -0.5293843746185303, "logps/chosen": -1.3041831254959106, "logps/rejected": -1.4093153476715088, "loss": 2.4878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.041831970214844, "rewards/margins": 1.0513209104537964, "rewards/rejected": -14.093152046203613, "step": 4150 }, { "epoch": 0.1400451649870235, "grad_norm": 21.02431869506836, "learning_rate": 9.951238406422594e-07, "logits/chosen": -0.4772763252258301, "logits/rejected": -0.358822226524353, "logps/chosen": -1.6318118572235107, "logps/rejected": -1.6670395135879517, "loss": 2.9819, "rewards/accuracies": 0.5, "rewards/chosen": -16.318119049072266, "rewards/margins": 0.35227519273757935, "rewards/rejected": -16.670392990112305, "step": 4155 }, { "epoch": 0.14021369105800668, "grad_norm": 12.805978775024414, "learning_rate": 9.950827767054141e-07, "logits/chosen": -0.5042682886123657, "logits/rejected": -0.36263027787208557, "logps/chosen": -1.785143494606018, "logps/rejected": -1.8810393810272217, "loss": 2.3962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.851436614990234, "rewards/margins": 0.9589592814445496, "rewards/rejected": -18.810396194458008, "step": 4160 }, { "epoch": 0.14038221712898985, "grad_norm": 24.619632720947266, "learning_rate": 9.950415414399252e-07, "logits/chosen": 0.04073786735534668, "logits/rejected": -0.05679405480623245, "logps/chosen": -2.54649019241333, "logps/rejected": -3.144075632095337, "loss": 3.1133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.464900970458984, "rewards/margins": 5.975854873657227, "rewards/rejected": -31.44075584411621, "step": 4165 }, { "epoch": 0.14055074319997304, "grad_norm": 15.252907752990723, "learning_rate": 9.950001348600625e-07, "logits/chosen": -0.5035009384155273, "logits/rejected": -0.5782333016395569, "logps/chosen": -1.7510089874267578, "logps/rejected": -1.777282953262329, "loss": 2.9862, "rewards/accuracies": 0.5, "rewards/chosen": -17.510089874267578, "rewards/margins": 0.2627398371696472, "rewards/rejected": -17.772830963134766, "step": 4170 }, { "epoch": 0.1407192692709562, "grad_norm": 19.329147338867188, "learning_rate": 9.949585569801554e-07, "logits/chosen": -0.24519672989845276, "logits/rejected": -0.30350321531295776, "logps/chosen": -1.5690886974334717, "logps/rejected": -1.8500550985336304, "loss": 2.8123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.690889358520508, "rewards/margins": 2.809664249420166, "rewards/rejected": -18.500551223754883, "step": 4175 }, { "epoch": 0.1408877953419394, "grad_norm": 18.852506637573242, "learning_rate": 9.949168078145925e-07, "logits/chosen": -0.7113882899284363, "logits/rejected": -0.7007073163986206, "logps/chosen": -1.8825536966323853, "logps/rejected": -1.8443183898925781, "loss": 3.4646, "rewards/accuracies": 0.5, "rewards/chosen": -18.82553482055664, "rewards/margins": -0.38235053420066833, "rewards/rejected": -18.44318389892578, "step": 4180 }, { "epoch": 0.14105632141292257, "grad_norm": 22.75662612915039, "learning_rate": 9.948748873778212e-07, "logits/chosen": -0.22483432292938232, "logits/rejected": -0.10999743640422821, "logps/chosen": -1.646648645401001, "logps/rejected": -1.7723115682601929, "loss": 3.2349, "rewards/accuracies": 0.5, "rewards/chosen": -16.46648597717285, "rewards/margins": 1.2566298246383667, "rewards/rejected": -17.723115921020508, "step": 4185 }, { "epoch": 0.14122484748390576, "grad_norm": 25.911178588867188, "learning_rate": 9.948327956843487e-07, "logits/chosen": -0.24501433968544006, "logits/rejected": -0.2834986746311188, "logps/chosen": -1.7771613597869873, "logps/rejected": -1.839350700378418, "loss": 2.9625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.7716121673584, "rewards/margins": 0.6218937635421753, "rewards/rejected": -18.39350700378418, "step": 4190 }, { "epoch": 0.14139337355488893, "grad_norm": 66.31412506103516, "learning_rate": 9.94790532748741e-07, "logits/chosen": -0.2734326720237732, "logits/rejected": -0.2607944905757904, "logps/chosen": -2.068535566329956, "logps/rejected": -2.099501848220825, "loss": 2.8103, "rewards/accuracies": 0.5, "rewards/chosen": -20.685354232788086, "rewards/margins": 0.3096626400947571, "rewards/rejected": -20.995018005371094, "step": 4195 }, { "epoch": 0.14156189962587212, "grad_norm": 33.704647064208984, "learning_rate": 9.947480985856241e-07, "logits/chosen": -0.107094407081604, "logits/rejected": -0.1890457272529602, "logps/chosen": -2.6533761024475098, "logps/rejected": -2.707219123840332, "loss": 2.9024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.533761978149414, "rewards/margins": 0.5384283065795898, "rewards/rejected": -27.072193145751953, "step": 4200 }, { "epoch": 0.14173042569685532, "grad_norm": 26.672903060913086, "learning_rate": 9.947054932096827e-07, "logits/chosen": -0.48258861899375916, "logits/rejected": -0.7191343903541565, "logps/chosen": -1.6313819885253906, "logps/rejected": -1.6558310985565186, "loss": 2.8408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.31382179260254, "rewards/margins": 0.24449129402637482, "rewards/rejected": -16.558313369750977, "step": 4205 }, { "epoch": 0.14189895176783848, "grad_norm": 52.22471618652344, "learning_rate": 9.946627166356608e-07, "logits/chosen": -0.7462440729141235, "logits/rejected": -0.7692978978157043, "logps/chosen": -1.5429750680923462, "logps/rejected": -1.4933751821517944, "loss": 3.5855, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.429750442504883, "rewards/margins": -0.4959982931613922, "rewards/rejected": -14.933751106262207, "step": 4210 }, { "epoch": 0.14206747783882168, "grad_norm": 16.94681167602539, "learning_rate": 9.946197688783612e-07, "logits/chosen": -0.8591279983520508, "logits/rejected": -0.7224575877189636, "logps/chosen": -1.5940550565719604, "logps/rejected": -1.6403049230575562, "loss": 3.1158, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.94054889678955, "rewards/margins": 0.4624989926815033, "rewards/rejected": -16.40304946899414, "step": 4215 }, { "epoch": 0.14223600390980484, "grad_norm": 35.62173080444336, "learning_rate": 9.945766499526472e-07, "logits/chosen": -0.22328560054302216, "logits/rejected": -0.25591030716896057, "logps/chosen": -1.9984073638916016, "logps/rejected": -2.045516014099121, "loss": 2.7188, "rewards/accuracies": 0.5, "rewards/chosen": -19.984073638916016, "rewards/margins": 0.47108450531959534, "rewards/rejected": -20.455158233642578, "step": 4220 }, { "epoch": 0.14240452998078804, "grad_norm": 37.213661193847656, "learning_rate": 9.9453335987344e-07, "logits/chosen": -0.6606870889663696, "logits/rejected": -0.7392102479934692, "logps/chosen": -2.145911693572998, "logps/rejected": -2.1802453994750977, "loss": 3.5793, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.459117889404297, "rewards/margins": 0.3433364927768707, "rewards/rejected": -21.802453994750977, "step": 4225 }, { "epoch": 0.1425730560517712, "grad_norm": 23.552959442138672, "learning_rate": 9.944898986557208e-07, "logits/chosen": -0.5075179934501648, "logits/rejected": -0.5485433340072632, "logps/chosen": -1.5870262384414673, "logps/rejected": -1.6476974487304688, "loss": 2.639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.870260238647461, "rewards/margins": 0.6067121624946594, "rewards/rejected": -16.476974487304688, "step": 4230 }, { "epoch": 0.1427415821227544, "grad_norm": 19.936710357666016, "learning_rate": 9.944462663145299e-07, "logits/chosen": -0.6237483024597168, "logits/rejected": -0.7083162069320679, "logps/chosen": -1.8480708599090576, "logps/rejected": -1.78692626953125, "loss": 3.6742, "rewards/accuracies": 0.5, "rewards/chosen": -18.4807071685791, "rewards/margins": -0.6114450693130493, "rewards/rejected": -17.8692626953125, "step": 4235 }, { "epoch": 0.14291010819373756, "grad_norm": 40.25701904296875, "learning_rate": 9.944024628649665e-07, "logits/chosen": -0.4969102740287781, "logits/rejected": -0.6931605339050293, "logps/chosen": -1.6680597066879272, "logps/rejected": -1.658609390258789, "loss": 3.2996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.68059730529785, "rewards/margins": -0.09450320899486542, "rewards/rejected": -16.58609390258789, "step": 4240 }, { "epoch": 0.14307863426472076, "grad_norm": 30.906673431396484, "learning_rate": 9.943584883221897e-07, "logits/chosen": -0.3676472306251526, "logits/rejected": -0.4649665355682373, "logps/chosen": -1.7316913604736328, "logps/rejected": -1.6490224599838257, "loss": 3.8978, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.31691551208496, "rewards/margins": -0.8266892433166504, "rewards/rejected": -16.490224838256836, "step": 4245 }, { "epoch": 0.14324716033570392, "grad_norm": 22.4572696685791, "learning_rate": 9.943143427014166e-07, "logits/chosen": -0.330500066280365, "logits/rejected": -0.4424312710762024, "logps/chosen": -1.9481405019760132, "logps/rejected": -1.9094657897949219, "loss": 3.5611, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.481403350830078, "rewards/margins": -0.3867475390434265, "rewards/rejected": -19.094655990600586, "step": 4250 }, { "epoch": 0.14341568640668712, "grad_norm": 27.16339683532715, "learning_rate": 9.942700260179248e-07, "logits/chosen": -0.7029052376747131, "logits/rejected": -0.5835430026054382, "logps/chosen": -1.7974525690078735, "logps/rejected": -1.9083143472671509, "loss": 3.1116, "rewards/accuracies": 0.5, "rewards/chosen": -17.974523544311523, "rewards/margins": 1.1086170673370361, "rewards/rejected": -19.083141326904297, "step": 4255 }, { "epoch": 0.1435842124776703, "grad_norm": 20.374530792236328, "learning_rate": 9.942255382870506e-07, "logits/chosen": -0.6019352674484253, "logits/rejected": -0.5517061352729797, "logps/chosen": -1.4612500667572021, "logps/rejected": -1.5348877906799316, "loss": 2.6918, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.612500190734863, "rewards/margins": 0.736376941204071, "rewards/rejected": -15.348876953125, "step": 4260 }, { "epoch": 0.14375273854865347, "grad_norm": 36.474327087402344, "learning_rate": 9.941808795241892e-07, "logits/chosen": -0.4818621575832367, "logits/rejected": -0.39904358983039856, "logps/chosen": -1.5083070993423462, "logps/rejected": -1.5565204620361328, "loss": 2.742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.08306884765625, "rewards/margins": 0.48213452100753784, "rewards/rejected": -15.565203666687012, "step": 4265 }, { "epoch": 0.14392126461963667, "grad_norm": 19.787578582763672, "learning_rate": 9.941360497447954e-07, "logits/chosen": -0.3252061605453491, "logits/rejected": -0.22330248355865479, "logps/chosen": -1.7723124027252197, "logps/rejected": -1.6830450296401978, "loss": 4.0528, "rewards/accuracies": 0.5, "rewards/chosen": -17.723125457763672, "rewards/margins": -0.8926737904548645, "rewards/rejected": -16.830448150634766, "step": 4270 }, { "epoch": 0.14408979069061983, "grad_norm": 16.231342315673828, "learning_rate": 9.94091048964383e-07, "logits/chosen": -0.7457118034362793, "logits/rejected": -0.5876340866088867, "logps/chosen": -2.191019058227539, "logps/rejected": -2.25704026222229, "loss": 3.2538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.91019058227539, "rewards/margins": 0.6602120399475098, "rewards/rejected": -22.570402145385742, "step": 4275 }, { "epoch": 0.14425831676160303, "grad_norm": 18.32788848876953, "learning_rate": 9.94045877198525e-07, "logits/chosen": -0.18542389571666718, "logits/rejected": -0.06351794302463531, "logps/chosen": -1.9895687103271484, "logps/rejected": -2.033998966217041, "loss": 2.8715, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.895687103271484, "rewards/margins": 0.44430312514305115, "rewards/rejected": -20.339988708496094, "step": 4280 }, { "epoch": 0.1444268428325862, "grad_norm": 39.58668518066406, "learning_rate": 9.940005344628535e-07, "logits/chosen": -0.24566316604614258, "logits/rejected": -0.20698890089988708, "logps/chosen": -1.7219035625457764, "logps/rejected": -1.6719424724578857, "loss": 3.6078, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.219036102294922, "rewards/margins": -0.49961042404174805, "rewards/rejected": -16.719425201416016, "step": 4285 }, { "epoch": 0.1445953689035694, "grad_norm": 21.891944885253906, "learning_rate": 9.9395502077306e-07, "logits/chosen": -0.48740309476852417, "logits/rejected": -0.5501845479011536, "logps/chosen": -1.685595154762268, "logps/rejected": -1.7272504568099976, "loss": 2.7276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.8559513092041, "rewards/margins": 0.4165545403957367, "rewards/rejected": -17.272504806518555, "step": 4290 }, { "epoch": 0.14476389497455255, "grad_norm": 30.494169235229492, "learning_rate": 9.939093361448944e-07, "logits/chosen": -0.3153546452522278, "logits/rejected": -0.32003530859947205, "logps/chosen": -1.6743957996368408, "logps/rejected": -1.6838239431381226, "loss": 3.0992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.743959426879883, "rewards/margins": 0.09428195655345917, "rewards/rejected": -16.838239669799805, "step": 4295 }, { "epoch": 0.14493242104553575, "grad_norm": 22.064924240112305, "learning_rate": 9.938634805941671e-07, "logits/chosen": -0.9115715026855469, "logits/rejected": -0.8669688105583191, "logps/chosen": -1.8068641424179077, "logps/rejected": -1.7367687225341797, "loss": 3.8418, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.068639755249023, "rewards/margins": -0.7009520530700684, "rewards/rejected": -17.367687225341797, "step": 4300 }, { "epoch": 0.1451009471165189, "grad_norm": 21.508249282836914, "learning_rate": 9.938174541367466e-07, "logits/chosen": -0.44647008180618286, "logits/rejected": -0.6526331901550293, "logps/chosen": -1.685974359512329, "logps/rejected": -1.668164849281311, "loss": 3.2786, "rewards/accuracies": 0.5, "rewards/chosen": -16.8597412109375, "rewards/margins": -0.17809438705444336, "rewards/rejected": -16.6816463470459, "step": 4305 }, { "epoch": 0.1452694731875021, "grad_norm": 31.063007354736328, "learning_rate": 9.937712567885608e-07, "logits/chosen": -0.49208030104637146, "logits/rejected": -0.5488861203193665, "logps/chosen": -2.050914764404297, "logps/rejected": -1.9329230785369873, "loss": 4.2608, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.509145736694336, "rewards/margins": -1.1799161434173584, "rewards/rejected": -19.3292293548584, "step": 4310 }, { "epoch": 0.1454379992584853, "grad_norm": 29.91889762878418, "learning_rate": 9.93724888565597e-07, "logits/chosen": -0.3852444887161255, "logits/rejected": -0.3011249005794525, "logps/chosen": -2.0186514854431152, "logps/rejected": -2.0107169151306152, "loss": 3.1377, "rewards/accuracies": 0.5, "rewards/chosen": -20.18651580810547, "rewards/margins": -0.07934770733118057, "rewards/rejected": -20.107166290283203, "step": 4315 }, { "epoch": 0.14560652532946847, "grad_norm": 16.69972801208496, "learning_rate": 9.93678349483901e-07, "logits/chosen": -0.5411044955253601, "logits/rejected": -0.6111071705818176, "logps/chosen": -1.8235629796981812, "logps/rejected": -1.8869975805282593, "loss": 2.7772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.23563003540039, "rewards/margins": 0.6343483924865723, "rewards/rejected": -18.869977951049805, "step": 4320 }, { "epoch": 0.14577505140045166, "grad_norm": 24.001726150512695, "learning_rate": 9.936316395595788e-07, "logits/chosen": -0.2619974613189697, "logits/rejected": -0.3185933530330658, "logps/chosen": -2.2614998817443848, "logps/rejected": -2.4379124641418457, "loss": 2.6044, "rewards/accuracies": 0.5, "rewards/chosen": -22.615001678466797, "rewards/margins": 1.7641239166259766, "rewards/rejected": -24.379125595092773, "step": 4325 }, { "epoch": 0.14594357747143483, "grad_norm": 33.08540725708008, "learning_rate": 9.935847588087942e-07, "logits/chosen": -0.2971573770046234, "logits/rejected": -0.37692004442214966, "logps/chosen": -1.8515352010726929, "logps/rejected": -1.8141686916351318, "loss": 3.5533, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.51535415649414, "rewards/margins": -0.373665988445282, "rewards/rejected": -18.141687393188477, "step": 4330 }, { "epoch": 0.14611210354241802, "grad_norm": 7.031495094299316, "learning_rate": 9.935377072477709e-07, "logits/chosen": -0.24837207794189453, "logits/rejected": -0.22563381493091583, "logps/chosen": -2.002490997314453, "logps/rejected": -2.1387100219726562, "loss": 2.0619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.024911880493164, "rewards/margins": 1.3621878623962402, "rewards/rejected": -21.387096405029297, "step": 4335 }, { "epoch": 0.1462806296134012, "grad_norm": 23.234272003173828, "learning_rate": 9.934904848927919e-07, "logits/chosen": -0.35107535123825073, "logits/rejected": -0.28245970606803894, "logps/chosen": -1.78196120262146, "logps/rejected": -1.920299768447876, "loss": 2.5926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.81961441040039, "rewards/margins": 1.3833829164505005, "rewards/rejected": -19.2029972076416, "step": 4340 }, { "epoch": 0.14644915568438438, "grad_norm": 20.092247009277344, "learning_rate": 9.934430917601988e-07, "logits/chosen": -0.9359011650085449, "logits/rejected": -0.9503633379936218, "logps/chosen": -1.5813624858856201, "logps/rejected": -1.5645041465759277, "loss": 3.2902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.813626289367676, "rewards/margins": -0.16858339309692383, "rewards/rejected": -15.645042419433594, "step": 4345 }, { "epoch": 0.14661768175536755, "grad_norm": 28.749696731567383, "learning_rate": 9.933955278663926e-07, "logits/chosen": -0.5206044912338257, "logits/rejected": -0.6252120733261108, "logps/chosen": -1.46475088596344, "logps/rejected": -1.4322856664657593, "loss": 3.5735, "rewards/accuracies": 0.5, "rewards/chosen": -14.647509574890137, "rewards/margins": -0.32465142011642456, "rewards/rejected": -14.322857856750488, "step": 4350 }, { "epoch": 0.14678620782635074, "grad_norm": 35.41548538208008, "learning_rate": 9.933477932278331e-07, "logits/chosen": -0.2737705707550049, "logits/rejected": -0.11229648441076279, "logps/chosen": -1.6251589059829712, "logps/rejected": -1.7368186712265015, "loss": 3.4879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.2515869140625, "rewards/margins": 1.1165990829467773, "rewards/rejected": -17.368188858032227, "step": 4355 }, { "epoch": 0.1469547338973339, "grad_norm": 16.393056869506836, "learning_rate": 9.932998878610395e-07, "logits/chosen": -0.6516093015670776, "logits/rejected": -0.7561215758323669, "logps/chosen": -1.7601429224014282, "logps/rejected": -1.7828855514526367, "loss": 3.2031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.601428985595703, "rewards/margins": 0.2274264395236969, "rewards/rejected": -17.828855514526367, "step": 4360 }, { "epoch": 0.1471232599683171, "grad_norm": 26.61617660522461, "learning_rate": 9.9325181178259e-07, "logits/chosen": -0.7604036331176758, "logits/rejected": -0.7403522729873657, "logps/chosen": -1.8788295984268188, "logps/rejected": -1.847124695777893, "loss": 3.4314, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.78829574584961, "rewards/margins": -0.3170498013496399, "rewards/rejected": -18.47124671936035, "step": 4365 }, { "epoch": 0.1472917860393003, "grad_norm": 26.9691162109375, "learning_rate": 9.932035650091217e-07, "logits/chosen": -0.3129528760910034, "logits/rejected": -0.03679082915186882, "logps/chosen": -2.2325453758239746, "logps/rejected": -2.3919172286987305, "loss": 2.0888, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.325454711914062, "rewards/margins": 1.5937201976776123, "rewards/rejected": -23.919174194335938, "step": 4370 }, { "epoch": 0.14746031211028346, "grad_norm": 12.038168907165527, "learning_rate": 9.93155147557331e-07, "logits/chosen": -0.3520078659057617, "logits/rejected": -0.44000229239463806, "logps/chosen": -1.4542829990386963, "logps/rejected": -1.5576039552688599, "loss": 2.5207, "rewards/accuracies": 0.5, "rewards/chosen": -14.542831420898438, "rewards/margins": 1.0332090854644775, "rewards/rejected": -15.57603931427002, "step": 4375 }, { "epoch": 0.14762883818126665, "grad_norm": 15.351378440856934, "learning_rate": 9.931065594439734e-07, "logits/chosen": -0.7100510001182556, "logits/rejected": -0.6339886784553528, "logps/chosen": -1.542508840560913, "logps/rejected": -1.4648298025131226, "loss": 3.8282, "rewards/accuracies": 0.5, "rewards/chosen": -15.425088882446289, "rewards/margins": -0.7767902612686157, "rewards/rejected": -14.648298263549805, "step": 4380 }, { "epoch": 0.14779736425224982, "grad_norm": 26.496545791625977, "learning_rate": 9.930578006858632e-07, "logits/chosen": -0.7247037887573242, "logits/rejected": -0.8406316041946411, "logps/chosen": -1.6559617519378662, "logps/rejected": -1.729272484779358, "loss": 2.5924, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.559616088867188, "rewards/margins": 0.7331069707870483, "rewards/rejected": -17.292724609375, "step": 4385 }, { "epoch": 0.147965890323233, "grad_norm": 32.76643371582031, "learning_rate": 9.930088712998738e-07, "logits/chosen": -0.6651844382286072, "logits/rejected": -0.4605945646762848, "logps/chosen": -2.0005507469177246, "logps/rejected": -2.0000672340393066, "loss": 3.2982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.00550651550293, "rewards/margins": -0.004834270570427179, "rewards/rejected": -20.000673294067383, "step": 4390 }, { "epoch": 0.14813441639421618, "grad_norm": 32.53934097290039, "learning_rate": 9.929597713029379e-07, "logits/chosen": -0.44316577911376953, "logits/rejected": -0.5748022198677063, "logps/chosen": -2.264150619506836, "logps/rejected": -2.81333327293396, "loss": 2.6056, "rewards/accuracies": 0.5, "rewards/chosen": -22.64150619506836, "rewards/margins": 5.491827011108398, "rewards/rejected": -28.13333511352539, "step": 4395 }, { "epoch": 0.14830294246519937, "grad_norm": 32.589378356933594, "learning_rate": 9.929105007120468e-07, "logits/chosen": -0.6730566024780273, "logits/rejected": -0.6370115280151367, "logps/chosen": -1.741729497909546, "logps/rejected": -1.756996512413025, "loss": 3.0312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.417295455932617, "rewards/margins": 0.15266962349414825, "rewards/rejected": -17.569965362548828, "step": 4400 }, { "epoch": 0.14830294246519937, "eval_logits/chosen": -0.7628591060638428, "eval_logits/rejected": -0.7754251956939697, "eval_logps/chosen": -1.6971991062164307, "eval_logps/rejected": -1.709100604057312, "eval_loss": 3.360646963119507, "eval_rewards/accuracies": 0.49000000953674316, "eval_rewards/chosen": -16.97199249267578, "eval_rewards/margins": 0.11901436001062393, "eval_rewards/rejected": -17.091007232666016, "eval_runtime": 12.8869, "eval_samples_per_second": 7.76, "eval_steps_per_second": 1.94, "step": 4400 }, { "epoch": 0.14847146853618254, "grad_norm": 24.610149383544922, "learning_rate": 9.928610595442514e-07, "logits/chosen": -0.3652550280094147, "logits/rejected": -0.32519787549972534, "logps/chosen": -2.136669635772705, "logps/rejected": -2.183389902114868, "loss": 3.8469, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.366697311401367, "rewards/margins": 0.4672008454799652, "rewards/rejected": -21.833898544311523, "step": 4405 }, { "epoch": 0.14863999460716573, "grad_norm": 27.064762115478516, "learning_rate": 9.928114478166613e-07, "logits/chosen": -0.3917112648487091, "logits/rejected": -0.38873490691185, "logps/chosen": -1.883427381515503, "logps/rejected": -1.9321670532226562, "loss": 2.6834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.834274291992188, "rewards/margins": 0.48739489912986755, "rewards/rejected": -19.321670532226562, "step": 4410 }, { "epoch": 0.1488085206781489, "grad_norm": 17.143781661987305, "learning_rate": 9.92761665546445e-07, "logits/chosen": -0.6784085035324097, "logits/rejected": -0.777459979057312, "logps/chosen": -1.6531413793563843, "logps/rejected": -1.7106910943984985, "loss": 2.9767, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.531414031982422, "rewards/margins": 0.5754953622817993, "rewards/rejected": -17.106908798217773, "step": 4415 }, { "epoch": 0.1489770467491321, "grad_norm": 20.776880264282227, "learning_rate": 9.927117127508305e-07, "logits/chosen": -0.9443651437759399, "logits/rejected": -0.7738694548606873, "logps/chosen": -1.633569359779358, "logps/rejected": -1.733229398727417, "loss": 2.2787, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.335697174072266, "rewards/margins": 0.996599018573761, "rewards/rejected": -17.332294464111328, "step": 4420 }, { "epoch": 0.14914557282011529, "grad_norm": 76.14692687988281, "learning_rate": 9.926615894471042e-07, "logits/chosen": -0.2984052896499634, "logits/rejected": -0.23642554879188538, "logps/chosen": -1.9602696895599365, "logps/rejected": -1.781272530555725, "loss": 4.897, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.60269546508789, "rewards/margins": -1.7899707555770874, "rewards/rejected": -17.812725067138672, "step": 4425 }, { "epoch": 0.14931409889109845, "grad_norm": 22.077974319458008, "learning_rate": 9.926112956526118e-07, "logits/chosen": -0.7455543279647827, "logits/rejected": -0.7593054175376892, "logps/chosen": -1.776155710220337, "logps/rejected": -1.761704683303833, "loss": 3.2266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.76155662536621, "rewards/margins": -0.14451150596141815, "rewards/rejected": -17.617046356201172, "step": 4430 }, { "epoch": 0.14948262496208164, "grad_norm": 30.44013214111328, "learning_rate": 9.92560831384758e-07, "logits/chosen": -0.28689366579055786, "logits/rejected": -0.458076536655426, "logps/chosen": -2.103567361831665, "logps/rejected": -2.0760843753814697, "loss": 3.5442, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.03567123413086, "rewards/margins": -0.27482956647872925, "rewards/rejected": -20.760845184326172, "step": 4435 }, { "epoch": 0.1496511510330648, "grad_norm": 41.20957565307617, "learning_rate": 9.925101966610067e-07, "logits/chosen": -0.2810547947883606, "logits/rejected": -0.3384184241294861, "logps/chosen": -2.0202267169952393, "logps/rejected": -2.081272602081299, "loss": 2.7238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.202266693115234, "rewards/margins": 0.610458254814148, "rewards/rejected": -20.812725067138672, "step": 4440 }, { "epoch": 0.149819677104048, "grad_norm": 41.3414421081543, "learning_rate": 9.924593914988806e-07, "logits/chosen": -0.29100021719932556, "logits/rejected": -0.4603959918022156, "logps/chosen": -1.6702888011932373, "logps/rejected": -1.7332197427749634, "loss": 2.5224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.70288848876953, "rewards/margins": 0.6293088793754578, "rewards/rejected": -17.332195281982422, "step": 4445 }, { "epoch": 0.14998820317503117, "grad_norm": 13.796420097351074, "learning_rate": 9.924084159159608e-07, "logits/chosen": -0.5338420271873474, "logits/rejected": -0.6065437197685242, "logps/chosen": -1.4859281778335571, "logps/rejected": -1.7795900106430054, "loss": 1.2324, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.859281539916992, "rewards/margins": 2.9366185665130615, "rewards/rejected": -17.795900344848633, "step": 4450 }, { "epoch": 0.15015672924601436, "grad_norm": 20.278915405273438, "learning_rate": 9.923572699298888e-07, "logits/chosen": -0.6260601282119751, "logits/rejected": -0.5569266676902771, "logps/chosen": -1.9350582361221313, "logps/rejected": -1.8638197183609009, "loss": 3.845, "rewards/accuracies": 0.5, "rewards/chosen": -19.350582122802734, "rewards/margins": -0.7123873233795166, "rewards/rejected": -18.638195037841797, "step": 4455 }, { "epoch": 0.15032525531699753, "grad_norm": 32.74846267700195, "learning_rate": 9.923059535583636e-07, "logits/chosen": -0.3299594223499298, "logits/rejected": -0.18386907875537872, "logps/chosen": -1.8987400531768799, "logps/rejected": -1.8763782978057861, "loss": 3.5642, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.98740005493164, "rewards/margins": -0.22361735999584198, "rewards/rejected": -18.763782501220703, "step": 4460 }, { "epoch": 0.15049378138798072, "grad_norm": 36.09501266479492, "learning_rate": 9.92254466819144e-07, "logits/chosen": -0.3556022644042969, "logits/rejected": -0.31961601972579956, "logps/chosen": -1.8345237970352173, "logps/rejected": -1.9695651531219482, "loss": 1.9562, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.345237731933594, "rewards/margins": 1.3504129648208618, "rewards/rejected": -19.69565200805664, "step": 4465 }, { "epoch": 0.1506623074589639, "grad_norm": 16.155614852905273, "learning_rate": 9.922028097300475e-07, "logits/chosen": -0.503342866897583, "logits/rejected": -0.38056522607803345, "logps/chosen": -1.684818983078003, "logps/rejected": -1.8295650482177734, "loss": 2.0349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.848190307617188, "rewards/margins": 1.4474587440490723, "rewards/rejected": -18.295650482177734, "step": 4470 }, { "epoch": 0.15083083352994708, "grad_norm": 25.365110397338867, "learning_rate": 9.921509823089505e-07, "logits/chosen": -0.8112923502922058, "logits/rejected": -0.8188997507095337, "logps/chosen": -1.9288915395736694, "logps/rejected": -1.8551056385040283, "loss": 3.8075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.28891944885254, "rewards/margins": -0.7378617525100708, "rewards/rejected": -18.55105209350586, "step": 4475 }, { "epoch": 0.15099935960093028, "grad_norm": 31.704790115356445, "learning_rate": 9.920989845737885e-07, "logits/chosen": -0.4111596941947937, "logits/rejected": -0.4327424466609955, "logps/chosen": -1.8056867122650146, "logps/rejected": -1.8885425329208374, "loss": 2.938, "rewards/accuracies": 0.5, "rewards/chosen": -18.056869506835938, "rewards/margins": 0.8285573124885559, "rewards/rejected": -18.885425567626953, "step": 4480 }, { "epoch": 0.15116788567191344, "grad_norm": 66.93142700195312, "learning_rate": 9.92046816542556e-07, "logits/chosen": -0.4983634054660797, "logits/rejected": -0.44021081924438477, "logps/chosen": -1.9229497909545898, "logps/rejected": -1.9702413082122803, "loss": 3.5794, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.2294979095459, "rewards/margins": 0.47291526198387146, "rewards/rejected": -19.70241355895996, "step": 4485 }, { "epoch": 0.15133641174289664, "grad_norm": 27.996192932128906, "learning_rate": 9.91994478233306e-07, "logits/chosen": -0.21563634276390076, "logits/rejected": -0.08831771463155746, "logps/chosen": -1.7543987035751343, "logps/rejected": -1.7821855545043945, "loss": 3.1206, "rewards/accuracies": 0.5, "rewards/chosen": -17.543987274169922, "rewards/margins": 0.27786731719970703, "rewards/rejected": -17.821855545043945, "step": 4490 }, { "epoch": 0.1515049378138798, "grad_norm": 31.757884979248047, "learning_rate": 9.919419696641512e-07, "logits/chosen": -0.6939610838890076, "logits/rejected": -0.7058770060539246, "logps/chosen": -1.8848850727081299, "logps/rejected": -1.9081329107284546, "loss": 2.8662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.84885025024414, "rewards/margins": 0.23247957229614258, "rewards/rejected": -19.081329345703125, "step": 4495 }, { "epoch": 0.151673463884863, "grad_norm": 18.024682998657227, "learning_rate": 9.918892908532621e-07, "logits/chosen": -0.6620336174964905, "logits/rejected": -0.6601449847221375, "logps/chosen": -2.105888605117798, "logps/rejected": -2.041553020477295, "loss": 3.884, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.058883666992188, "rewards/margins": -0.6433547735214233, "rewards/rejected": -20.415531158447266, "step": 4500 }, { "epoch": 0.15184198995584616, "grad_norm": 21.89647674560547, "learning_rate": 9.918364418188692e-07, "logits/chosen": -0.20931684970855713, "logits/rejected": -0.26949331164360046, "logps/chosen": -2.057158946990967, "logps/rejected": -2.0671579837799072, "loss": 3.5455, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.571590423583984, "rewards/margins": 0.0999903678894043, "rewards/rejected": -20.671581268310547, "step": 4505 }, { "epoch": 0.15201051602682936, "grad_norm": 22.97340965270996, "learning_rate": 9.917834225792615e-07, "logits/chosen": -0.25889870524406433, "logits/rejected": -0.5221267938613892, "logps/chosen": -1.8550924062728882, "logps/rejected": -1.8294522762298584, "loss": 3.3505, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.55092430114746, "rewards/margins": -0.2563992440700531, "rewards/rejected": -18.294525146484375, "step": 4510 }, { "epoch": 0.15217904209781252, "grad_norm": 28.872962951660156, "learning_rate": 9.917302331527864e-07, "logits/chosen": -0.4469106197357178, "logits/rejected": -0.43801528215408325, "logps/chosen": -1.8834269046783447, "logps/rejected": -1.8372375965118408, "loss": 3.8107, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.83426856994629, "rewards/margins": -0.461892694234848, "rewards/rejected": -18.37237548828125, "step": 4515 }, { "epoch": 0.15234756816879572, "grad_norm": 22.910249710083008, "learning_rate": 9.916768735578513e-07, "logits/chosen": -0.40119099617004395, "logits/rejected": -0.46237850189208984, "logps/chosen": -1.7076635360717773, "logps/rejected": -1.8835132122039795, "loss": 2.4283, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.076635360717773, "rewards/margins": 1.7584972381591797, "rewards/rejected": -18.835132598876953, "step": 4520 }, { "epoch": 0.15251609423977888, "grad_norm": 16.087495803833008, "learning_rate": 9.916233438129213e-07, "logits/chosen": -0.5997047424316406, "logits/rejected": -0.5393815040588379, "logps/chosen": -1.5532910823822021, "logps/rejected": -1.5561127662658691, "loss": 3.1255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.53291130065918, "rewards/margins": 0.0282150749117136, "rewards/rejected": -15.561126708984375, "step": 4525 }, { "epoch": 0.15268462031076208, "grad_norm": 19.14484405517578, "learning_rate": 9.915696439365216e-07, "logits/chosen": -0.553629457950592, "logits/rejected": -0.4729720950126648, "logps/chosen": -1.8226730823516846, "logps/rejected": -1.6980514526367188, "loss": 4.3086, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.226730346679688, "rewards/margins": -1.2462149858474731, "rewards/rejected": -16.980514526367188, "step": 4530 }, { "epoch": 0.15285314638174527, "grad_norm": 33.58027648925781, "learning_rate": 9.91515773947235e-07, "logits/chosen": -0.2996135354042053, "logits/rejected": -0.30325740575790405, "logps/chosen": -1.7898555994033813, "logps/rejected": -1.7090318202972412, "loss": 3.8958, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.898555755615234, "rewards/margins": -0.8082362413406372, "rewards/rejected": -17.090320587158203, "step": 4535 }, { "epoch": 0.15302167245272844, "grad_norm": 41.30805587768555, "learning_rate": 9.914617338637038e-07, "logits/chosen": -0.2837643027305603, "logits/rejected": -0.28739625215530396, "logps/chosen": -1.7426464557647705, "logps/rejected": -1.8160514831542969, "loss": 2.6695, "rewards/accuracies": 0.5, "rewards/chosen": -17.426464080810547, "rewards/margins": 0.7340496778488159, "rewards/rejected": -18.16051483154297, "step": 4540 }, { "epoch": 0.15319019852371163, "grad_norm": 169.09719848632812, "learning_rate": 9.914075237046296e-07, "logits/chosen": -0.5721148252487183, "logits/rejected": -0.7339398860931396, "logps/chosen": -1.804178237915039, "logps/rejected": -1.5889647006988525, "loss": 5.3903, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.04178237915039, "rewards/margins": -2.1521360874176025, "rewards/rejected": -15.88964557647705, "step": 4545 }, { "epoch": 0.1533587245946948, "grad_norm": 20.485097885131836, "learning_rate": 9.913531434887718e-07, "logits/chosen": -0.6761281490325928, "logits/rejected": -0.627028226852417, "logps/chosen": -1.6126229763031006, "logps/rejected": -1.644484281539917, "loss": 2.8009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.126230239868164, "rewards/margins": 0.31861066818237305, "rewards/rejected": -16.444841384887695, "step": 4550 }, { "epoch": 0.153527250665678, "grad_norm": 14.091863632202148, "learning_rate": 9.912985932349498e-07, "logits/chosen": -0.5259329080581665, "logits/rejected": -0.3877810835838318, "logps/chosen": -1.4369596242904663, "logps/rejected": -1.5198132991790771, "loss": 2.5432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.369596481323242, "rewards/margins": 0.8285359144210815, "rewards/rejected": -15.198132514953613, "step": 4555 }, { "epoch": 0.15369577673666115, "grad_norm": 25.25543975830078, "learning_rate": 9.912438729620412e-07, "logits/chosen": -0.4410991072654724, "logits/rejected": -0.6500669121742249, "logps/chosen": -1.6194860935211182, "logps/rejected": -1.7692973613739014, "loss": 2.1913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.19485855102539, "rewards/margins": 1.4981143474578857, "rewards/rejected": -17.692974090576172, "step": 4560 }, { "epoch": 0.15386430280764435, "grad_norm": 12.284860610961914, "learning_rate": 9.911889826889823e-07, "logits/chosen": -0.3081240952014923, "logits/rejected": -0.3586021959781647, "logps/chosen": -1.7863963842391968, "logps/rejected": -2.0202338695526123, "loss": 2.0076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.863964080810547, "rewards/margins": 2.3383731842041016, "rewards/rejected": -20.20233726501465, "step": 4565 }, { "epoch": 0.15403282887862751, "grad_norm": 28.293598175048828, "learning_rate": 9.911339224347684e-07, "logits/chosen": -0.48678913712501526, "logits/rejected": -0.541092038154602, "logps/chosen": -1.9596703052520752, "logps/rejected": -1.844909429550171, "loss": 4.2267, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.59670066833496, "rewards/margins": -1.147606372833252, "rewards/rejected": -18.4490966796875, "step": 4570 }, { "epoch": 0.1542013549496107, "grad_norm": 19.478225708007812, "learning_rate": 9.91078692218454e-07, "logits/chosen": -0.8977234959602356, "logits/rejected": -0.8771381378173828, "logps/chosen": -1.830026626586914, "logps/rejected": -1.9383594989776611, "loss": 2.5465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.30026626586914, "rewards/margins": 1.083329677581787, "rewards/rejected": -19.383596420288086, "step": 4575 }, { "epoch": 0.15436988102059387, "grad_norm": 25.026344299316406, "learning_rate": 9.910232920591518e-07, "logits/chosen": -0.702392578125, "logits/rejected": -0.49518975615501404, "logps/chosen": -1.558858871459961, "logps/rejected": -1.669965386390686, "loss": 2.4797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.588589668273926, "rewards/margins": 1.1110646724700928, "rewards/rejected": -16.69965362548828, "step": 4580 }, { "epoch": 0.15453840709157707, "grad_norm": 20.98350715637207, "learning_rate": 9.90967721976034e-07, "logits/chosen": -0.4868387281894684, "logits/rejected": -0.3116268813610077, "logps/chosen": -1.7529996633529663, "logps/rejected": -1.7481448650360107, "loss": 3.2614, "rewards/accuracies": 0.5, "rewards/chosen": -17.529998779296875, "rewards/margins": -0.04854869842529297, "rewards/rejected": -17.481449127197266, "step": 4585 }, { "epoch": 0.15470693316256026, "grad_norm": 24.198535919189453, "learning_rate": 9.90911981988331e-07, "logits/chosen": -0.8397296667098999, "logits/rejected": -0.8432148098945618, "logps/chosen": -1.7114646434783936, "logps/rejected": -1.6712682247161865, "loss": 3.5441, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.11464500427246, "rewards/margins": -0.40196236968040466, "rewards/rejected": -16.71268081665039, "step": 4590 }, { "epoch": 0.15487545923354343, "grad_norm": 13.411236763000488, "learning_rate": 9.90856072115332e-07, "logits/chosen": -0.35893386602401733, "logits/rejected": -0.3036018908023834, "logps/chosen": -1.6867424249649048, "logps/rejected": -1.7620487213134766, "loss": 2.993, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.86742401123047, "rewards/margins": 0.7530642747879028, "rewards/rejected": -17.620487213134766, "step": 4595 }, { "epoch": 0.15504398530452662, "grad_norm": 46.425350189208984, "learning_rate": 9.907999923763855e-07, "logits/chosen": -0.5545027852058411, "logits/rejected": -0.5339478254318237, "logps/chosen": -1.9723618030548096, "logps/rejected": -1.987330436706543, "loss": 3.1193, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.723617553710938, "rewards/margins": 0.1496877670288086, "rewards/rejected": -19.873302459716797, "step": 4600 }, { "epoch": 0.1552125113755098, "grad_norm": 20.02104377746582, "learning_rate": 9.907437427908983e-07, "logits/chosen": -0.4415665566921234, "logits/rejected": -0.5307387113571167, "logps/chosen": -1.5795601606369019, "logps/rejected": -1.679488182067871, "loss": 2.3314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.795602798461914, "rewards/margins": 0.9992785453796387, "rewards/rejected": -16.794879913330078, "step": 4605 }, { "epoch": 0.15538103744649298, "grad_norm": 26.12117576599121, "learning_rate": 9.906873233783363e-07, "logits/chosen": -0.814510703086853, "logits/rejected": -0.5907190442085266, "logps/chosen": -1.5405075550079346, "logps/rejected": -1.7465384006500244, "loss": 2.3861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.405075073242188, "rewards/margins": 2.0603084564208984, "rewards/rejected": -17.465383529663086, "step": 4610 }, { "epoch": 0.15554956351747615, "grad_norm": 19.746633529663086, "learning_rate": 9.90630734158224e-07, "logits/chosen": -0.43650826811790466, "logits/rejected": -0.686555027961731, "logps/chosen": -1.5947755575180054, "logps/rejected": -1.760087251663208, "loss": 2.3027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.947755813598633, "rewards/margins": 1.6531174182891846, "rewards/rejected": -17.600873947143555, "step": 4615 }, { "epoch": 0.15571808958845934, "grad_norm": 17.813255310058594, "learning_rate": 9.905739751501447e-07, "logits/chosen": -0.5361579656600952, "logits/rejected": -0.4771784842014313, "logps/chosen": -1.6195627450942993, "logps/rejected": -1.671136498451233, "loss": 2.7585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.195627212524414, "rewards/margins": 0.5157370567321777, "rewards/rejected": -16.71136474609375, "step": 4620 }, { "epoch": 0.1558866156594425, "grad_norm": 27.102527618408203, "learning_rate": 9.905170463737405e-07, "logits/chosen": -0.6031167507171631, "logits/rejected": -0.5590722560882568, "logps/chosen": -1.8172991275787354, "logps/rejected": -1.7973215579986572, "loss": 3.2828, "rewards/accuracies": 0.5, "rewards/chosen": -18.172992706298828, "rewards/margins": -0.19977673888206482, "rewards/rejected": -17.973215103149414, "step": 4625 }, { "epoch": 0.1560551417304257, "grad_norm": 69.08274841308594, "learning_rate": 9.904599478487121e-07, "logits/chosen": -0.30473047494888306, "logits/rejected": -0.28297197818756104, "logps/chosen": -2.3783857822418213, "logps/rejected": -2.160964012145996, "loss": 5.4019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.783857345581055, "rewards/margins": -2.1742167472839355, "rewards/rejected": -21.609642028808594, "step": 4630 }, { "epoch": 0.15622366780140887, "grad_norm": 19.284488677978516, "learning_rate": 9.90402679594819e-07, "logits/chosen": -0.8089929819107056, "logits/rejected": -0.8168126344680786, "logps/chosen": -1.6827844381332397, "logps/rejected": -1.726898193359375, "loss": 2.802, "rewards/accuracies": 0.5, "rewards/chosen": -16.827844619750977, "rewards/margins": 0.4411369264125824, "rewards/rejected": -17.26898193359375, "step": 4635 }, { "epoch": 0.15639219387239206, "grad_norm": 25.2092342376709, "learning_rate": 9.903452416318796e-07, "logits/chosen": -0.4793424606323242, "logits/rejected": -0.42350155115127563, "logps/chosen": -1.631096601486206, "logps/rejected": -1.5376965999603271, "loss": 3.9991, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.31096649169922, "rewards/margins": -0.934001088142395, "rewards/rejected": -15.376965522766113, "step": 4640 }, { "epoch": 0.15656071994337525, "grad_norm": 25.0135498046875, "learning_rate": 9.90287633979771e-07, "logits/chosen": -0.27238425612449646, "logits/rejected": -0.3240829408168793, "logps/chosen": -2.0996146202087402, "logps/rejected": -2.365563154220581, "loss": 1.3909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.99614906311035, "rewards/margins": 2.6594841480255127, "rewards/rejected": -23.6556339263916, "step": 4645 }, { "epoch": 0.15672924601435842, "grad_norm": 22.960716247558594, "learning_rate": 9.90229856658429e-07, "logits/chosen": -0.21965241432189941, "logits/rejected": -0.4097241461277008, "logps/chosen": -1.7522590160369873, "logps/rejected": -1.758040189743042, "loss": 3.5186, "rewards/accuracies": 0.5, "rewards/chosen": -17.5225887298584, "rewards/margins": 0.057814598083496094, "rewards/rejected": -17.580402374267578, "step": 4650 }, { "epoch": 0.1568977720853416, "grad_norm": 25.19063377380371, "learning_rate": 9.901719096878476e-07, "logits/chosen": -0.6338127255439758, "logits/rejected": -0.5322138071060181, "logps/chosen": -1.6390702724456787, "logps/rejected": -1.6156508922576904, "loss": 3.5505, "rewards/accuracies": 0.5, "rewards/chosen": -16.390703201293945, "rewards/margins": -0.23419399559497833, "rewards/rejected": -16.156509399414062, "step": 4655 }, { "epoch": 0.15706629815632478, "grad_norm": 21.21099090576172, "learning_rate": 9.901137930880802e-07, "logits/chosen": -0.6699775457382202, "logits/rejected": -0.7210027575492859, "logps/chosen": -1.574340581893921, "logps/rejected": -1.5693788528442383, "loss": 3.1195, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.743408203125, "rewards/margins": -0.04962043836712837, "rewards/rejected": -15.69378662109375, "step": 4660 }, { "epoch": 0.15723482422730797, "grad_norm": 15.227494239807129, "learning_rate": 9.90055506879239e-07, "logits/chosen": -0.5711018443107605, "logits/rejected": -0.4353240430355072, "logps/chosen": -1.8625409603118896, "logps/rejected": -2.0394296646118164, "loss": 1.9554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.625408172607422, "rewards/margins": 1.7688881158828735, "rewards/rejected": -20.394298553466797, "step": 4665 }, { "epoch": 0.15740335029829114, "grad_norm": 22.745201110839844, "learning_rate": 9.899970510814941e-07, "logits/chosen": -0.5177310705184937, "logits/rejected": -0.5119687914848328, "logps/chosen": -1.8094947338104248, "logps/rejected": -1.8350107669830322, "loss": 3.8939, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.094947814941406, "rewards/margins": 0.25515851378440857, "rewards/rejected": -18.350107192993164, "step": 4670 }, { "epoch": 0.15757187636927433, "grad_norm": 30.20716667175293, "learning_rate": 9.899384257150752e-07, "logits/chosen": -0.2176884412765503, "logits/rejected": -0.2588900625705719, "logps/chosen": -1.8750667572021484, "logps/rejected": -2.0222976207733154, "loss": 2.3665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.750667572021484, "rewards/margins": 1.4723093509674072, "rewards/rejected": -20.22297477722168, "step": 4675 }, { "epoch": 0.1577404024402575, "grad_norm": 44.010440826416016, "learning_rate": 9.898796308002698e-07, "logits/chosen": -0.7645952701568604, "logits/rejected": -0.5875598192214966, "logps/chosen": -1.7068369388580322, "logps/rejected": -1.7132556438446045, "loss": 3.0981, "rewards/accuracies": 0.5, "rewards/chosen": -17.068368911743164, "rewards/margins": 0.06418828666210175, "rewards/rejected": -17.132556915283203, "step": 4680 }, { "epoch": 0.1579089285112407, "grad_norm": 37.04304885864258, "learning_rate": 9.898206663574244e-07, "logits/chosen": -0.6298079490661621, "logits/rejected": -0.5319895148277283, "logps/chosen": -1.7374365329742432, "logps/rejected": -1.611778974533081, "loss": 4.3514, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.374366760253906, "rewards/margins": -1.2565762996673584, "rewards/rejected": -16.11779022216797, "step": 4685 }, { "epoch": 0.15807745458222386, "grad_norm": 16.909269332885742, "learning_rate": 9.897615324069447e-07, "logits/chosen": -0.4190613627433777, "logits/rejected": -0.3520964980125427, "logps/chosen": -1.878212332725525, "logps/rejected": -2.0033230781555176, "loss": 2.2772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.782123565673828, "rewards/margins": 1.2511035203933716, "rewards/rejected": -20.03322982788086, "step": 4690 }, { "epoch": 0.15824598065320705, "grad_norm": 28.17768096923828, "learning_rate": 9.897022289692946e-07, "logits/chosen": -0.217799574136734, "logits/rejected": -0.20938460528850555, "logps/chosen": -2.18269681930542, "logps/rejected": -2.183464527130127, "loss": 3.3206, "rewards/accuracies": 0.5, "rewards/chosen": -21.826969146728516, "rewards/margins": 0.007677173707634211, "rewards/rejected": -21.834646224975586, "step": 4695 }, { "epoch": 0.15841450672419025, "grad_norm": 20.799894332885742, "learning_rate": 9.896427560649965e-07, "logits/chosen": -0.2771221399307251, "logits/rejected": -0.3630429804325104, "logps/chosen": -1.9276962280273438, "logps/rejected": -1.8691962957382202, "loss": 3.7554, "rewards/accuracies": 0.5, "rewards/chosen": -19.276962280273438, "rewards/margins": -0.5849997401237488, "rewards/rejected": -18.69196128845215, "step": 4700 }, { "epoch": 0.1585830327951734, "grad_norm": 26.0824031829834, "learning_rate": 9.895831137146318e-07, "logits/chosen": -0.4591868817806244, "logits/rejected": -0.49390801787376404, "logps/chosen": -1.8923221826553345, "logps/rejected": -1.9390461444854736, "loss": 2.8115, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.923221588134766, "rewards/margins": 0.4672381281852722, "rewards/rejected": -19.390460968017578, "step": 4705 }, { "epoch": 0.1587515588661566, "grad_norm": 17.884014129638672, "learning_rate": 9.8952330193884e-07, "logits/chosen": -0.7502976059913635, "logits/rejected": -0.45593467354774475, "logps/chosen": -1.6914688348770142, "logps/rejected": -1.630499243736267, "loss": 3.7795, "rewards/accuracies": 0.5, "rewards/chosen": -16.914688110351562, "rewards/margins": -0.609697163105011, "rewards/rejected": -16.304988861083984, "step": 4710 }, { "epoch": 0.15892008493713977, "grad_norm": 21.209936141967773, "learning_rate": 9.894633207583202e-07, "logits/chosen": -0.7454978227615356, "logits/rejected": -0.7061313390731812, "logps/chosen": -1.8046905994415283, "logps/rejected": -1.8066291809082031, "loss": 3.2366, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.046905517578125, "rewards/margins": 0.019386673346161842, "rewards/rejected": -18.066293716430664, "step": 4715 }, { "epoch": 0.15908861100812297, "grad_norm": 23.273921966552734, "learning_rate": 9.894031701938287e-07, "logits/chosen": -0.47708067297935486, "logits/rejected": -0.5105594396591187, "logps/chosen": -1.8260581493377686, "logps/rejected": -1.7911970615386963, "loss": 4.0011, "rewards/accuracies": 0.5, "rewards/chosen": -18.260583877563477, "rewards/margins": -0.34861230850219727, "rewards/rejected": -17.911970138549805, "step": 4720 }, { "epoch": 0.15925713707910613, "grad_norm": 31.692359924316406, "learning_rate": 9.89342850266182e-07, "logits/chosen": -0.05979665368795395, "logits/rejected": -0.1203019842505455, "logps/chosen": -2.0891809463500977, "logps/rejected": -2.055260181427002, "loss": 3.5457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.89181137084961, "rewards/margins": -0.3392105996608734, "rewards/rejected": -20.552600860595703, "step": 4725 }, { "epoch": 0.15942566315008933, "grad_norm": 29.511369705200195, "learning_rate": 9.892823609962543e-07, "logits/chosen": -0.5264648199081421, "logits/rejected": -0.5226877331733704, "logps/chosen": -1.7901853322982788, "logps/rejected": -1.7577362060546875, "loss": 3.4421, "rewards/accuracies": 0.5, "rewards/chosen": -17.90185546875, "rewards/margins": -0.32449159026145935, "rewards/rejected": -17.577362060546875, "step": 4730 }, { "epoch": 0.1595941892210725, "grad_norm": 69.38639068603516, "learning_rate": 9.89221702404978e-07, "logits/chosen": -0.338792622089386, "logits/rejected": -0.20237763226032257, "logps/chosen": -1.7657020092010498, "logps/rejected": -1.7486886978149414, "loss": 3.3845, "rewards/accuracies": 0.5, "rewards/chosen": -17.65702247619629, "rewards/margins": -0.1701340675354004, "rewards/rejected": -17.486886978149414, "step": 4735 }, { "epoch": 0.15976271529205568, "grad_norm": 18.09201431274414, "learning_rate": 9.891608745133453e-07, "logits/chosen": -0.45816025137901306, "logits/rejected": -0.4524189829826355, "logps/chosen": -1.8241355419158936, "logps/rejected": -1.8072960376739502, "loss": 3.4098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.241355895996094, "rewards/margins": -0.16839809715747833, "rewards/rejected": -18.07295799255371, "step": 4740 }, { "epoch": 0.15993124136303885, "grad_norm": 12.629457473754883, "learning_rate": 9.890998773424061e-07, "logits/chosen": -0.7667060494422913, "logits/rejected": -0.8384987711906433, "logps/chosen": -1.699568510055542, "logps/rejected": -1.9147104024887085, "loss": 1.3524, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.995685577392578, "rewards/margins": 2.1514182090759277, "rewards/rejected": -19.147104263305664, "step": 4745 }, { "epoch": 0.16009976743402204, "grad_norm": 25.00735092163086, "learning_rate": 9.890387109132692e-07, "logits/chosen": -0.6868584752082825, "logits/rejected": -0.7392014265060425, "logps/chosen": -1.9946911334991455, "logps/rejected": -2.007953643798828, "loss": 3.0638, "rewards/accuracies": 0.5, "rewards/chosen": -19.946910858154297, "rewards/margins": 0.13262434303760529, "rewards/rejected": -20.07953453063965, "step": 4750 }, { "epoch": 0.1602682935050052, "grad_norm": 26.735124588012695, "learning_rate": 9.889773752471017e-07, "logits/chosen": -0.16993948817253113, "logits/rejected": -0.21659204363822937, "logps/chosen": -1.8886897563934326, "logps/rejected": -1.9271786212921143, "loss": 3.3185, "rewards/accuracies": 0.5, "rewards/chosen": -18.886898040771484, "rewards/margins": 0.3848879933357239, "rewards/rejected": -19.271785736083984, "step": 4755 }, { "epoch": 0.1604368195759884, "grad_norm": 21.052892684936523, "learning_rate": 9.889158703651296e-07, "logits/chosen": -0.460991233587265, "logits/rejected": -0.5180394649505615, "logps/chosen": -1.4519951343536377, "logps/rejected": -1.507206678390503, "loss": 2.5809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.519950866699219, "rewards/margins": 0.552115797996521, "rewards/rejected": -15.072067260742188, "step": 4760 }, { "epoch": 0.1606053456469716, "grad_norm": 38.17951965332031, "learning_rate": 9.888541962886371e-07, "logits/chosen": -0.2759491503238678, "logits/rejected": -0.2980247735977173, "logps/chosen": -1.7075881958007812, "logps/rejected": -1.8610565662384033, "loss": 2.0582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.075881958007812, "rewards/margins": 1.5346832275390625, "rewards/rejected": -18.610565185546875, "step": 4765 }, { "epoch": 0.16077387171795476, "grad_norm": 27.016674041748047, "learning_rate": 9.887923530389676e-07, "logits/chosen": -0.5326557159423828, "logits/rejected": -0.6520851850509644, "logps/chosen": -2.017056941986084, "logps/rejected": -1.8724479675292969, "loss": 4.602, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.17057228088379, "rewards/margins": -1.4460914134979248, "rewards/rejected": -18.7244815826416, "step": 4770 }, { "epoch": 0.16094239778893796, "grad_norm": 21.01801300048828, "learning_rate": 9.887303406375224e-07, "logits/chosen": -0.3734387159347534, "logits/rejected": -0.3648655116558075, "logps/chosen": -1.9372440576553345, "logps/rejected": -1.8856014013290405, "loss": 3.5991, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.3724422454834, "rewards/margins": -0.5164254903793335, "rewards/rejected": -18.856014251708984, "step": 4775 }, { "epoch": 0.16111092385992112, "grad_norm": 21.466033935546875, "learning_rate": 9.886681591057613e-07, "logits/chosen": -0.06261344254016876, "logits/rejected": -0.1325574368238449, "logps/chosen": -2.264781951904297, "logps/rejected": -2.4676618576049805, "loss": 1.8788, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.647817611694336, "rewards/margins": 2.028799533843994, "rewards/rejected": -24.676618576049805, "step": 4780 }, { "epoch": 0.16127944993090432, "grad_norm": 24.781084060668945, "learning_rate": 9.886058084652032e-07, "logits/chosen": -0.6147192716598511, "logits/rejected": -0.6076455116271973, "logps/chosen": -1.492680549621582, "logps/rejected": -1.502992868423462, "loss": 3.0557, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -14.92680549621582, "rewards/margins": 0.10312385857105255, "rewards/rejected": -15.029928207397461, "step": 4785 }, { "epoch": 0.16144797600188748, "grad_norm": 276.31402587890625, "learning_rate": 9.885432887374252e-07, "logits/chosen": -0.6703779101371765, "logits/rejected": -0.6189125180244446, "logps/chosen": -2.240828275680542, "logps/rejected": -2.2110419273376465, "loss": 3.3497, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.408281326293945, "rewards/margins": -0.2978610098361969, "rewards/rejected": -22.110422134399414, "step": 4790 }, { "epoch": 0.16161650207287068, "grad_norm": 24.567428588867188, "learning_rate": 9.884805999440627e-07, "logits/chosen": -0.40758800506591797, "logits/rejected": -0.35104599595069885, "logps/chosen": -2.059055805206299, "logps/rejected": -1.9487988948822021, "loss": 4.1751, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.59055519104004, "rewards/margins": -1.1025663614273071, "rewards/rejected": -19.487987518310547, "step": 4795 }, { "epoch": 0.16178502814385384, "grad_norm": 36.26649856567383, "learning_rate": 9.8841774210681e-07, "logits/chosen": -0.3545742928981781, "logits/rejected": -0.39434343576431274, "logps/chosen": -1.829400658607483, "logps/rejected": -1.7184016704559326, "loss": 4.145, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.294008255004883, "rewards/margins": -1.1099907159805298, "rewards/rejected": -17.184017181396484, "step": 4800 }, { "epoch": 0.16178502814385384, "eval_logits/chosen": -0.7746235728263855, "eval_logits/rejected": -0.7902061939239502, "eval_logps/chosen": -1.708162784576416, "eval_logps/rejected": -1.7237504720687866, "eval_loss": 3.340670585632324, "eval_rewards/accuracies": 0.5099999904632568, "eval_rewards/chosen": -17.081628799438477, "eval_rewards/margins": 0.15587686002254486, "eval_rewards/rejected": -17.237504959106445, "eval_runtime": 12.8967, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.938, "step": 4800 }, { "epoch": 0.16195355421483704, "grad_norm": 27.378616333007812, "learning_rate": 9.883547152474195e-07, "logits/chosen": -0.25638216733932495, "logits/rejected": -0.1468086540699005, "logps/chosen": -1.7783082723617554, "logps/rejected": -1.8178768157958984, "loss": 3.2298, "rewards/accuracies": 0.5, "rewards/chosen": -17.783084869384766, "rewards/margins": 0.3956846594810486, "rewards/rejected": -18.178768157958984, "step": 4805 }, { "epoch": 0.1621220802858202, "grad_norm": 20.546716690063477, "learning_rate": 9.882915193877024e-07, "logits/chosen": -0.921650767326355, "logits/rejected": -0.7422033548355103, "logps/chosen": -1.7827374935150146, "logps/rejected": -1.8927743434906006, "loss": 2.4601, "rewards/accuracies": 0.5, "rewards/chosen": -17.827375411987305, "rewards/margins": 1.1003668308258057, "rewards/rejected": -18.927743911743164, "step": 4810 }, { "epoch": 0.1622906063568034, "grad_norm": 27.94754409790039, "learning_rate": 9.882281545495285e-07, "logits/chosen": -0.07513687759637833, "logits/rejected": -0.22319336235523224, "logps/chosen": -1.5725983381271362, "logps/rejected": -1.6626056432724, "loss": 2.6133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.725984573364258, "rewards/margins": 0.9000707864761353, "rewards/rejected": -16.626056671142578, "step": 4815 }, { "epoch": 0.1624591324277866, "grad_norm": 28.80483055114746, "learning_rate": 9.881646207548257e-07, "logits/chosen": -0.7495092153549194, "logits/rejected": -0.6132189631462097, "logps/chosen": -1.7083728313446045, "logps/rejected": -1.913587212562561, "loss": 3.0995, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.083728790283203, "rewards/margins": 2.052143096923828, "rewards/rejected": -19.1358699798584, "step": 4820 }, { "epoch": 0.16262765849876976, "grad_norm": 20.03661346435547, "learning_rate": 9.881009180255807e-07, "logits/chosen": -0.4534785747528076, "logits/rejected": -0.32503554224967957, "logps/chosen": -1.5937612056732178, "logps/rejected": -1.6601520776748657, "loss": 2.6911, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.937612533569336, "rewards/margins": 0.6639073491096497, "rewards/rejected": -16.601520538330078, "step": 4825 }, { "epoch": 0.16279618456975295, "grad_norm": 26.125856399536133, "learning_rate": 9.88037046383838e-07, "logits/chosen": -0.3081539571285248, "logits/rejected": -0.33038654923439026, "logps/chosen": -1.4090408086776733, "logps/rejected": -1.528576135635376, "loss": 2.7093, "rewards/accuracies": 0.5, "rewards/chosen": -14.090408325195312, "rewards/margins": 1.1953526735305786, "rewards/rejected": -15.285760879516602, "step": 4830 }, { "epoch": 0.16296471064073612, "grad_norm": 28.715280532836914, "learning_rate": 9.879730058517017e-07, "logits/chosen": -0.3715924322605133, "logits/rejected": -0.35126742720603943, "logps/chosen": -1.8122129440307617, "logps/rejected": -1.9514148235321045, "loss": 2.1753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.122127532958984, "rewards/margins": 1.3920185565948486, "rewards/rejected": -19.514148712158203, "step": 4835 }, { "epoch": 0.1631332367117193, "grad_norm": 34.09306716918945, "learning_rate": 9.879087964513335e-07, "logits/chosen": -0.27698105573654175, "logits/rejected": -0.3473663926124573, "logps/chosen": -2.0653817653656006, "logps/rejected": -2.1154792308807373, "loss": 2.7881, "rewards/accuracies": 0.5, "rewards/chosen": -20.653818130493164, "rewards/margins": 0.5009748339653015, "rewards/rejected": -21.15479278564453, "step": 4840 }, { "epoch": 0.16330176278270248, "grad_norm": 24.596439361572266, "learning_rate": 9.878444182049537e-07, "logits/chosen": -0.7519720792770386, "logits/rejected": -0.7053220868110657, "logps/chosen": -1.832969069480896, "logps/rejected": -1.7468926906585693, "loss": 3.9439, "rewards/accuracies": 0.5, "rewards/chosen": -18.32969093322754, "rewards/margins": -0.8607624173164368, "rewards/rejected": -17.468929290771484, "step": 4845 }, { "epoch": 0.16347028885368567, "grad_norm": 21.327348709106445, "learning_rate": 9.87779871134841e-07, "logits/chosen": -0.3140432834625244, "logits/rejected": -0.38714686036109924, "logps/chosen": -1.9222043752670288, "logps/rejected": -1.9501903057098389, "loss": 3.0331, "rewards/accuracies": 0.5, "rewards/chosen": -19.222043991088867, "rewards/margins": 0.27986058592796326, "rewards/rejected": -19.501903533935547, "step": 4850 }, { "epoch": 0.16363881492466884, "grad_norm": 17.42647933959961, "learning_rate": 9.877151552633327e-07, "logits/chosen": -0.388538658618927, "logits/rejected": -0.17918941378593445, "logps/chosen": -1.743137001991272, "logps/rejected": -1.9893146753311157, "loss": 3.62, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.43136978149414, "rewards/margins": 2.4617762565612793, "rewards/rejected": -19.893146514892578, "step": 4855 }, { "epoch": 0.16380734099565203, "grad_norm": 51.683448791503906, "learning_rate": 9.876502706128242e-07, "logits/chosen": -0.4844183325767517, "logits/rejected": -0.7024241089820862, "logps/chosen": -1.7798906564712524, "logps/rejected": -1.8352922201156616, "loss": 3.1281, "rewards/accuracies": 0.5, "rewards/chosen": -17.798908233642578, "rewards/margins": 0.5540148019790649, "rewards/rejected": -18.352920532226562, "step": 4860 }, { "epoch": 0.1639758670666352, "grad_norm": 24.889793395996094, "learning_rate": 9.875852172057699e-07, "logits/chosen": -0.8120881915092468, "logits/rejected": -0.7175842523574829, "logps/chosen": -1.6519649028778076, "logps/rejected": -1.7745163440704346, "loss": 2.6253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.519649505615234, "rewards/margins": 1.2255139350891113, "rewards/rejected": -17.745162963867188, "step": 4865 }, { "epoch": 0.1641443931376184, "grad_norm": 37.9599609375, "learning_rate": 9.87519995064682e-07, "logits/chosen": -0.24003906548023224, "logits/rejected": -0.4899943470954895, "logps/chosen": -2.0363316535949707, "logps/rejected": -1.8726288080215454, "loss": 4.9414, "rewards/accuracies": 0.5, "rewards/chosen": -20.36331558227539, "rewards/margins": -1.637028455734253, "rewards/rejected": -18.726289749145508, "step": 4870 }, { "epoch": 0.16431291920860158, "grad_norm": 14.678876876831055, "learning_rate": 9.874546042121313e-07, "logits/chosen": -0.6595714688301086, "logits/rejected": -0.593315601348877, "logps/chosen": -1.868060827255249, "logps/rejected": -1.9109785556793213, "loss": 3.0881, "rewards/accuracies": 0.5, "rewards/chosen": -18.68060874938965, "rewards/margins": 0.429177850484848, "rewards/rejected": -19.109785079956055, "step": 4875 }, { "epoch": 0.16448144527958475, "grad_norm": 36.223548889160156, "learning_rate": 9.873890446707469e-07, "logits/chosen": -0.40664395689964294, "logits/rejected": -0.4028412699699402, "logps/chosen": -1.7218236923217773, "logps/rejected": -1.8740679025650024, "loss": 2.015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.218236923217773, "rewards/margins": 1.5224418640136719, "rewards/rejected": -18.740680694580078, "step": 4880 }, { "epoch": 0.16464997135056794, "grad_norm": 29.82371711730957, "learning_rate": 9.873233164632166e-07, "logits/chosen": -0.4903503954410553, "logits/rejected": -0.4249725341796875, "logps/chosen": -1.972914457321167, "logps/rejected": -2.142824649810791, "loss": 2.1319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.729143142700195, "rewards/margins": 1.6991031169891357, "rewards/rejected": -21.428245544433594, "step": 4885 }, { "epoch": 0.1648184974215511, "grad_norm": 18.373544692993164, "learning_rate": 9.872574196122863e-07, "logits/chosen": -0.42714181542396545, "logits/rejected": -0.5435231924057007, "logps/chosen": -1.7797390222549438, "logps/rejected": -1.795607328414917, "loss": 3.3615, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.79738998413086, "rewards/margins": 0.1586824357509613, "rewards/rejected": -17.956073760986328, "step": 4890 }, { "epoch": 0.1649870234925343, "grad_norm": 25.699377059936523, "learning_rate": 9.871913541407602e-07, "logits/chosen": -0.8359493017196655, "logits/rejected": -1.0766808986663818, "logps/chosen": -1.8503162860870361, "logps/rejected": -1.7836980819702148, "loss": 3.7392, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.503162384033203, "rewards/margins": -0.6661826968193054, "rewards/rejected": -17.83698272705078, "step": 4895 }, { "epoch": 0.16515554956351747, "grad_norm": 24.661052703857422, "learning_rate": 9.87125120071501e-07, "logits/chosen": -0.6496170163154602, "logits/rejected": -0.6843874454498291, "logps/chosen": -1.588417410850525, "logps/rejected": -1.6372764110565186, "loss": 2.6524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.884173393249512, "rewards/margins": 0.48858919739723206, "rewards/rejected": -16.37276268005371, "step": 4900 }, { "epoch": 0.16532407563450066, "grad_norm": 32.23655319213867, "learning_rate": 9.870587174274297e-07, "logits/chosen": -0.5077248811721802, "logits/rejected": -0.4583125114440918, "logps/chosen": -1.8581647872924805, "logps/rejected": -1.850717544555664, "loss": 3.3088, "rewards/accuracies": 0.5, "rewards/chosen": -18.581645965576172, "rewards/margins": -0.07447147369384766, "rewards/rejected": -18.50717544555664, "step": 4905 }, { "epoch": 0.16549260170548383, "grad_norm": 20.224699020385742, "learning_rate": 9.869921462315256e-07, "logits/chosen": -0.49347972869873047, "logits/rejected": -0.45034274458885193, "logps/chosen": -1.4449265003204346, "logps/rejected": -1.6118764877319336, "loss": 1.9775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.449264526367188, "rewards/margins": 1.6694999933242798, "rewards/rejected": -16.118764877319336, "step": 4910 }, { "epoch": 0.16566112777646702, "grad_norm": 20.77910041809082, "learning_rate": 9.869254065068265e-07, "logits/chosen": -0.5645862817764282, "logits/rejected": -0.7167826294898987, "logps/chosen": -1.7465883493423462, "logps/rejected": -1.704671859741211, "loss": 3.5355, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.465885162353516, "rewards/margins": -0.41916388273239136, "rewards/rejected": -17.04671859741211, "step": 4915 }, { "epoch": 0.1658296538474502, "grad_norm": 33.12343978881836, "learning_rate": 9.868584982764282e-07, "logits/chosen": -0.4147886335849762, "logits/rejected": -0.5667712092399597, "logps/chosen": -1.6357110738754272, "logps/rejected": -1.6382662057876587, "loss": 3.2923, "rewards/accuracies": 0.5, "rewards/chosen": -16.35711097717285, "rewards/margins": 0.025551462545990944, "rewards/rejected": -16.38266372680664, "step": 4920 }, { "epoch": 0.16599817991843338, "grad_norm": 99.80087280273438, "learning_rate": 9.867914215634852e-07, "logits/chosen": -0.45811495184898376, "logits/rejected": -0.39101505279541016, "logps/chosen": -2.020596981048584, "logps/rejected": -2.055602550506592, "loss": 2.8389, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.20596694946289, "rewards/margins": 0.35005730390548706, "rewards/rejected": -20.5560245513916, "step": 4925 }, { "epoch": 0.16616670598941657, "grad_norm": 25.863933563232422, "learning_rate": 9.867241763912098e-07, "logits/chosen": -0.879712700843811, "logits/rejected": -0.911721408367157, "logps/chosen": -1.6381375789642334, "logps/rejected": -1.5870459079742432, "loss": 3.6379, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.38137435913086, "rewards/margins": -0.5109177827835083, "rewards/rejected": -15.870458602905273, "step": 4930 }, { "epoch": 0.16633523206039974, "grad_norm": 78.96031188964844, "learning_rate": 9.866567627828735e-07, "logits/chosen": -0.7218554615974426, "logits/rejected": -0.7632007598876953, "logps/chosen": -2.043527841567993, "logps/rejected": -1.9857170581817627, "loss": 3.6717, "rewards/accuracies": 0.5, "rewards/chosen": -20.43527603149414, "rewards/margins": -0.5781074166297913, "rewards/rejected": -19.85717010498047, "step": 4935 }, { "epoch": 0.16650375813138293, "grad_norm": 21.739521026611328, "learning_rate": 9.865891807618048e-07, "logits/chosen": -0.5816560983657837, "logits/rejected": -0.528420090675354, "logps/chosen": -1.5282984972000122, "logps/rejected": -1.672467589378357, "loss": 1.9666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.282984733581543, "rewards/margins": 1.4416911602020264, "rewards/rejected": -16.72467613220215, "step": 4940 }, { "epoch": 0.1666722842023661, "grad_norm": 36.299537658691406, "learning_rate": 9.865214303513916e-07, "logits/chosen": -0.32008111476898193, "logits/rejected": -0.10507240146398544, "logps/chosen": -2.123836040496826, "logps/rejected": -2.2200088500976562, "loss": 2.7891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.238357543945312, "rewards/margins": 0.9617301821708679, "rewards/rejected": -22.200088500976562, "step": 4945 }, { "epoch": 0.1668408102733493, "grad_norm": 56.05241012573242, "learning_rate": 9.864535115750795e-07, "logits/chosen": -0.21717092394828796, "logits/rejected": -0.2897348403930664, "logps/chosen": -1.9129555225372314, "logps/rejected": -1.926944375038147, "loss": 3.1466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.12955665588379, "rewards/margins": 0.13988880813121796, "rewards/rejected": -19.26944351196289, "step": 4950 }, { "epoch": 0.16700933634433246, "grad_norm": 21.449539184570312, "learning_rate": 9.863854244563725e-07, "logits/chosen": -0.43917790055274963, "logits/rejected": -0.44615238904953003, "logps/chosen": -1.9873645305633545, "logps/rejected": -2.0402169227600098, "loss": 2.9082, "rewards/accuracies": 0.5, "rewards/chosen": -19.873645782470703, "rewards/margins": 0.5285249948501587, "rewards/rejected": -20.402172088623047, "step": 4955 }, { "epoch": 0.16717786241531565, "grad_norm": 37.11687469482422, "learning_rate": 9.86317169018833e-07, "logits/chosen": -0.5457051396369934, "logits/rejected": -0.5516785979270935, "logps/chosen": -1.803820252418518, "logps/rejected": -1.8196312189102173, "loss": 3.1856, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.038204193115234, "rewards/margins": 0.1581093817949295, "rewards/rejected": -18.19631004333496, "step": 4960 }, { "epoch": 0.16734638848629882, "grad_norm": 25.299577713012695, "learning_rate": 9.862487452860814e-07, "logits/chosen": -0.3777625262737274, "logits/rejected": -0.3700116276741028, "logps/chosen": -1.6833274364471436, "logps/rejected": -1.6934551000595093, "loss": 3.1463, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.833276748657227, "rewards/margins": 0.10127668082714081, "rewards/rejected": -16.934551239013672, "step": 4965 }, { "epoch": 0.167514914557282, "grad_norm": 25.97972297668457, "learning_rate": 9.861801532817965e-07, "logits/chosen": -0.7144454717636108, "logits/rejected": -0.647177517414093, "logps/chosen": -1.6258128881454468, "logps/rejected": -1.7660753726959229, "loss": 2.6313, "rewards/accuracies": 0.5, "rewards/chosen": -16.258129119873047, "rewards/margins": 1.4026236534118652, "rewards/rejected": -17.660751342773438, "step": 4970 }, { "epoch": 0.16768344062826518, "grad_norm": 16.2631778717041, "learning_rate": 9.861113930297155e-07, "logits/chosen": -0.34379979968070984, "logits/rejected": -0.5184779763221741, "logps/chosen": -1.8947486877441406, "logps/rejected": -2.0445353984832764, "loss": 3.0566, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.947486877441406, "rewards/margins": 1.4978677034378052, "rewards/rejected": -20.445354461669922, "step": 4975 }, { "epoch": 0.16785196669924837, "grad_norm": 15.502371788024902, "learning_rate": 9.86042464553633e-07, "logits/chosen": -0.37498247623443604, "logits/rejected": -0.42046207189559937, "logps/chosen": -1.5052076578140259, "logps/rejected": -1.6121822595596313, "loss": 2.1299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.05207633972168, "rewards/margins": 1.0697453022003174, "rewards/rejected": -16.121822357177734, "step": 4980 }, { "epoch": 0.16802049277023157, "grad_norm": 22.429950714111328, "learning_rate": 9.859733678774031e-07, "logits/chosen": -0.5181079506874084, "logits/rejected": -0.4065426290035248, "logps/chosen": -1.9371169805526733, "logps/rejected": -2.3126235008239746, "loss": 1.6999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.371170043945312, "rewards/margins": 3.7550644874572754, "rewards/rejected": -23.12623405456543, "step": 4985 }, { "epoch": 0.16818901884121473, "grad_norm": 36.0225944519043, "learning_rate": 9.859041030249372e-07, "logits/chosen": -0.22574977576732635, "logits/rejected": -0.2761825621128082, "logps/chosen": -2.0685524940490723, "logps/rejected": -1.9672033786773682, "loss": 4.0854, "rewards/accuracies": 0.5, "rewards/chosen": -20.685522079467773, "rewards/margins": -1.0134881734848022, "rewards/rejected": -19.672035217285156, "step": 4990 }, { "epoch": 0.16835754491219793, "grad_norm": 78.74285125732422, "learning_rate": 9.858346700202048e-07, "logits/chosen": -0.24602890014648438, "logits/rejected": -0.36132198572158813, "logps/chosen": -2.4286651611328125, "logps/rejected": -2.3484768867492676, "loss": 3.8898, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.28664779663086, "rewards/margins": -0.8018797636032104, "rewards/rejected": -23.48476791381836, "step": 4995 }, { "epoch": 0.1685260709831811, "grad_norm": 29.308990478515625, "learning_rate": 9.857650688872345e-07, "logits/chosen": -0.6732075214385986, "logits/rejected": -0.6380990147590637, "logps/chosen": -1.8012161254882812, "logps/rejected": -1.7454330921173096, "loss": 3.6902, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.012157440185547, "rewards/margins": -0.5578286051750183, "rewards/rejected": -17.454328536987305, "step": 5000 }, { "epoch": 0.16869459705416429, "grad_norm": 19.303850173950195, "learning_rate": 9.856952996501121e-07, "logits/chosen": -0.5144712924957275, "logits/rejected": -0.6592981219291687, "logps/chosen": -1.752929925918579, "logps/rejected": -1.8932254314422607, "loss": 3.3654, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.529298782348633, "rewards/margins": 1.402956247329712, "rewards/rejected": -18.932254791259766, "step": 5005 }, { "epoch": 0.16886312312514745, "grad_norm": 21.755300521850586, "learning_rate": 9.856253623329822e-07, "logits/chosen": -0.38438963890075684, "logits/rejected": -0.384308397769928, "logps/chosen": -1.6376289129257202, "logps/rejected": -1.8839448690414429, "loss": 1.7633, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.37628936767578, "rewards/margins": 2.4631593227386475, "rewards/rejected": -18.83945083618164, "step": 5010 }, { "epoch": 0.16903164919613065, "grad_norm": 29.190107345581055, "learning_rate": 9.855552569600473e-07, "logits/chosen": -0.35856691002845764, "logits/rejected": -0.40881720185279846, "logps/chosen": -1.7899494171142578, "logps/rejected": -1.7577186822891235, "loss": 3.5591, "rewards/accuracies": 0.5, "rewards/chosen": -17.899494171142578, "rewards/margins": -0.3223080635070801, "rewards/rejected": -17.577186584472656, "step": 5015 }, { "epoch": 0.1692001752671138, "grad_norm": 10.551178932189941, "learning_rate": 9.85484983555568e-07, "logits/chosen": -0.6378435492515564, "logits/rejected": -0.5291840434074402, "logps/chosen": -1.5740575790405273, "logps/rejected": -1.7817277908325195, "loss": 1.8232, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.740574836730957, "rewards/margins": 2.076702833175659, "rewards/rejected": -17.817277908325195, "step": 5020 }, { "epoch": 0.169368701338097, "grad_norm": 39.82679748535156, "learning_rate": 9.854145421438634e-07, "logits/chosen": -0.4193429946899414, "logits/rejected": -0.6050506830215454, "logps/chosen": -1.467621088027954, "logps/rejected": -1.443719506263733, "loss": 3.3306, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -14.676210403442383, "rewards/margins": -0.2390153855085373, "rewards/rejected": -14.43719482421875, "step": 5025 }, { "epoch": 0.16953722740908017, "grad_norm": 37.75446701049805, "learning_rate": 9.853439327493102e-07, "logits/chosen": -0.5865007638931274, "logits/rejected": -0.5626325607299805, "logps/chosen": -1.9477989673614502, "logps/rejected": -2.027604818344116, "loss": 2.6758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.477991104125977, "rewards/margins": 0.7980586290359497, "rewards/rejected": -20.276050567626953, "step": 5030 }, { "epoch": 0.16970575348006336, "grad_norm": 18.804014205932617, "learning_rate": 9.852731553963435e-07, "logits/chosen": -0.3155723512172699, "logits/rejected": -0.3093792200088501, "logps/chosen": -2.113027572631836, "logps/rejected": -1.9566431045532227, "loss": 4.6724, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.130273818969727, "rewards/margins": -1.5638428926467896, "rewards/rejected": -19.56643295288086, "step": 5035 }, { "epoch": 0.16987427955104656, "grad_norm": 47.76856231689453, "learning_rate": 9.85202210109457e-07, "logits/chosen": -0.4034988284111023, "logits/rejected": -0.4541633725166321, "logps/chosen": -1.8395111560821533, "logps/rejected": -1.839477300643921, "loss": 3.3137, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.395111083984375, "rewards/margins": -0.00033893584623001516, "rewards/rejected": -18.394771575927734, "step": 5040 }, { "epoch": 0.17004280562202972, "grad_norm": 27.453750610351562, "learning_rate": 9.851310969132017e-07, "logits/chosen": -0.2835858464241028, "logits/rejected": -0.3324558734893799, "logps/chosen": -1.7421255111694336, "logps/rejected": -1.6985328197479248, "loss": 3.5587, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.421253204345703, "rewards/margins": -0.4359270930290222, "rewards/rejected": -16.985326766967773, "step": 5045 }, { "epoch": 0.17021133169301292, "grad_norm": 24.93321418762207, "learning_rate": 9.850598158321871e-07, "logits/chosen": -0.6215084195137024, "logits/rejected": -0.5685603618621826, "logps/chosen": -1.6707217693328857, "logps/rejected": -1.7257225513458252, "loss": 2.6856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.707218170166016, "rewards/margins": 0.5500091314315796, "rewards/rejected": -17.257226943969727, "step": 5050 }, { "epoch": 0.17037985776399608, "grad_norm": 11.612754821777344, "learning_rate": 9.849883668910808e-07, "logits/chosen": -0.5886046290397644, "logits/rejected": -0.5103198289871216, "logps/chosen": -1.879009485244751, "logps/rejected": -1.9458684921264648, "loss": 2.9753, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.790096282958984, "rewards/margins": 0.6685900688171387, "rewards/rejected": -19.45868492126465, "step": 5055 }, { "epoch": 0.17054838383497928, "grad_norm": 37.67578887939453, "learning_rate": 9.849167501146087e-07, "logits/chosen": -0.46718257665634155, "logits/rejected": -0.6271299123764038, "logps/chosen": -1.9356199502944946, "logps/rejected": -1.833788514137268, "loss": 4.194, "rewards/accuracies": 0.5, "rewards/chosen": -19.356197357177734, "rewards/margins": -1.0183136463165283, "rewards/rejected": -18.3378849029541, "step": 5060 }, { "epoch": 0.17071690990596244, "grad_norm": 90.10015106201172, "learning_rate": 9.848449655275542e-07, "logits/chosen": -0.508590817451477, "logits/rejected": -0.489675909280777, "logps/chosen": -2.6385538578033447, "logps/rejected": -2.5385284423828125, "loss": 4.5381, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.385540008544922, "rewards/margins": -1.0002546310424805, "rewards/rejected": -25.385284423828125, "step": 5065 }, { "epoch": 0.17088543597694564, "grad_norm": 18.061155319213867, "learning_rate": 9.847730131547592e-07, "logits/chosen": -0.7201946973800659, "logits/rejected": -0.6724601984024048, "logps/chosen": -1.8912174701690674, "logps/rejected": -1.9988510608673096, "loss": 2.908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.912174224853516, "rewards/margins": 1.0763366222381592, "rewards/rejected": -19.988510131835938, "step": 5070 }, { "epoch": 0.1710539620479288, "grad_norm": 18.397972106933594, "learning_rate": 9.847008930211238e-07, "logits/chosen": -0.6509224772453308, "logits/rejected": -0.5626755952835083, "logps/chosen": -1.779077172279358, "logps/rejected": -1.9971240758895874, "loss": 2.4914, "rewards/accuracies": 0.5, "rewards/chosen": -17.790771484375, "rewards/margins": 2.180471420288086, "rewards/rejected": -19.971242904663086, "step": 5075 }, { "epoch": 0.171222488118912, "grad_norm": 43.98146057128906, "learning_rate": 9.846286051516055e-07, "logits/chosen": -0.7050510048866272, "logits/rejected": -0.6167488098144531, "logps/chosen": -1.776510238647461, "logps/rejected": -1.702623724937439, "loss": 3.8085, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.765100479125977, "rewards/margins": -0.7388646006584167, "rewards/rejected": -17.02623748779297, "step": 5080 }, { "epoch": 0.17139101418989516, "grad_norm": 32.1152458190918, "learning_rate": 9.84556149571221e-07, "logits/chosen": -0.5005149841308594, "logits/rejected": -0.6876319050788879, "logps/chosen": -1.667649507522583, "logps/rejected": -1.7641193866729736, "loss": 2.361, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.676494598388672, "rewards/margins": 0.9646992683410645, "rewards/rejected": -17.641193389892578, "step": 5085 }, { "epoch": 0.17155954026087836, "grad_norm": 58.48193359375, "learning_rate": 9.844835263050435e-07, "logits/chosen": -0.5897291898727417, "logits/rejected": -0.6116207242012024, "logps/chosen": -1.7926311492919922, "logps/rejected": -1.6217567920684814, "loss": 4.739, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.926311492919922, "rewards/margins": -1.7087459564208984, "rewards/rejected": -16.217565536499023, "step": 5090 }, { "epoch": 0.17172806633186155, "grad_norm": 87.55854797363281, "learning_rate": 9.844107353782054e-07, "logits/chosen": -0.3844572901725769, "logits/rejected": -0.7152290940284729, "logps/chosen": -1.977447509765625, "logps/rejected": -1.8635094165802002, "loss": 4.4243, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.77447509765625, "rewards/margins": -1.139381766319275, "rewards/rejected": -18.63509178161621, "step": 5095 }, { "epoch": 0.17189659240284472, "grad_norm": 30.744888305664062, "learning_rate": 9.843377768158971e-07, "logits/chosen": -0.12349516153335571, "logits/rejected": -0.04538143798708916, "logps/chosen": -2.1029043197631836, "logps/rejected": -2.2611899375915527, "loss": 2.5362, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.029043197631836, "rewards/margins": 1.5828584432601929, "rewards/rejected": -22.611900329589844, "step": 5100 }, { "epoch": 0.1720651184738279, "grad_norm": 16.7409610748291, "learning_rate": 9.842646506433663e-07, "logits/chosen": -0.7018550634384155, "logits/rejected": -0.5086437463760376, "logps/chosen": -1.8820219039916992, "logps/rejected": -1.940272331237793, "loss": 2.9036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.820220947265625, "rewards/margins": 0.5825031995773315, "rewards/rejected": -19.402721405029297, "step": 5105 }, { "epoch": 0.17223364454481108, "grad_norm": 24.791820526123047, "learning_rate": 9.84191356885919e-07, "logits/chosen": -0.44258204102516174, "logits/rejected": -0.49366721510887146, "logps/chosen": -1.5188804864883423, "logps/rejected": -1.4613652229309082, "loss": 3.6738, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.188802719116211, "rewards/margins": -0.5751511454582214, "rewards/rejected": -14.613652229309082, "step": 5110 }, { "epoch": 0.17240217061579427, "grad_norm": 38.88006591796875, "learning_rate": 9.841178955689197e-07, "logits/chosen": -0.4519910216331482, "logits/rejected": -0.45148682594299316, "logps/chosen": -1.992713212966919, "logps/rejected": -2.0434980392456055, "loss": 3.3871, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.92713165283203, "rewards/margins": 0.5078484416007996, "rewards/rejected": -20.434978485107422, "step": 5115 }, { "epoch": 0.17257069668677744, "grad_norm": 20.674013137817383, "learning_rate": 9.840442667177902e-07, "logits/chosen": -0.4431692957878113, "logits/rejected": -0.42282018065452576, "logps/chosen": -1.7422406673431396, "logps/rejected": -1.968392014503479, "loss": 1.8286, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.422407150268555, "rewards/margins": 2.2615127563476562, "rewards/rejected": -19.68391990661621, "step": 5120 }, { "epoch": 0.17273922275776063, "grad_norm": 22.77712631225586, "learning_rate": 9.839704703580104e-07, "logits/chosen": -0.5449277758598328, "logits/rejected": -0.4195954203605652, "logps/chosen": -1.7473876476287842, "logps/rejected": -1.7471768856048584, "loss": 3.135, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.473878860473633, "rewards/margins": -0.002109623048454523, "rewards/rejected": -17.47176742553711, "step": 5125 }, { "epoch": 0.1729077488287438, "grad_norm": 27.39888572692871, "learning_rate": 9.838965065151185e-07, "logits/chosen": -0.5343486070632935, "logits/rejected": -0.5076795816421509, "logps/chosen": -1.7901744842529297, "logps/rejected": -1.8608520030975342, "loss": 2.9082, "rewards/accuracies": 0.5, "rewards/chosen": -17.901744842529297, "rewards/margins": 0.7067748308181763, "rewards/rejected": -18.608518600463867, "step": 5130 }, { "epoch": 0.173076274899727, "grad_norm": 17.52597427368164, "learning_rate": 9.838223752147105e-07, "logits/chosen": -0.1843159943819046, "logits/rejected": -0.2545137405395508, "logps/chosen": -1.8627382516860962, "logps/rejected": -1.9470560550689697, "loss": 2.4678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.627382278442383, "rewards/margins": 0.8431784510612488, "rewards/rejected": -19.47056007385254, "step": 5135 }, { "epoch": 0.17324480097071016, "grad_norm": 35.71249008178711, "learning_rate": 9.837480764824404e-07, "logits/chosen": -0.557886004447937, "logits/rejected": -0.3560100197792053, "logps/chosen": -1.9558770656585693, "logps/rejected": -1.8917385339736938, "loss": 3.773, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.55877113342285, "rewards/margins": -0.641386091709137, "rewards/rejected": -18.91738510131836, "step": 5140 }, { "epoch": 0.17341332704169335, "grad_norm": 7.512803077697754, "learning_rate": 9.836736103440199e-07, "logits/chosen": -0.22889885306358337, "logits/rejected": -0.15423983335494995, "logps/chosen": -2.0584750175476074, "logps/rejected": -2.3373608589172363, "loss": 2.4755, "rewards/accuracies": 0.5, "rewards/chosen": -20.584747314453125, "rewards/margins": 2.788860559463501, "rewards/rejected": -23.373611450195312, "step": 5145 }, { "epoch": 0.17358185311267654, "grad_norm": 26.877891540527344, "learning_rate": 9.835989768252188e-07, "logits/chosen": -0.718030571937561, "logits/rejected": -0.7842522859573364, "logps/chosen": -1.863952875137329, "logps/rejected": -1.868159532546997, "loss": 3.3168, "rewards/accuracies": 0.5, "rewards/chosen": -18.639530181884766, "rewards/margins": 0.04206543043255806, "rewards/rejected": -18.68159294128418, "step": 5150 }, { "epoch": 0.1737503791836597, "grad_norm": 13.796395301818848, "learning_rate": 9.835241759518648e-07, "logits/chosen": -0.6250889301300049, "logits/rejected": -0.5135878324508667, "logps/chosen": -2.1139869689941406, "logps/rejected": -2.2198023796081543, "loss": 2.6164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.139867782592773, "rewards/margins": 1.0581531524658203, "rewards/rejected": -22.198020935058594, "step": 5155 }, { "epoch": 0.1739189052546429, "grad_norm": 32.31558609008789, "learning_rate": 9.834492077498438e-07, "logits/chosen": -0.4362711012363434, "logits/rejected": -0.2519915997982025, "logps/chosen": -2.14894437789917, "logps/rejected": -2.1211888790130615, "loss": 3.382, "rewards/accuracies": 0.5, "rewards/chosen": -21.489444732666016, "rewards/margins": -0.27755576372146606, "rewards/rejected": -21.21188735961914, "step": 5160 }, { "epoch": 0.17408743132562607, "grad_norm": 39.9125862121582, "learning_rate": 9.833740722450989e-07, "logits/chosen": -0.18216952681541443, "logits/rejected": -0.32020363211631775, "logps/chosen": -1.8249847888946533, "logps/rejected": -2.1250081062316895, "loss": 2.6139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.249849319458008, "rewards/margins": 3.0002331733703613, "rewards/rejected": -21.250080108642578, "step": 5165 }, { "epoch": 0.17425595739660926, "grad_norm": 61.76107406616211, "learning_rate": 9.832987694636318e-07, "logits/chosen": -0.7317668795585632, "logits/rejected": -0.8237543106079102, "logps/chosen": -1.6339073181152344, "logps/rejected": -1.7707321643829346, "loss": 2.3452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.339075088500977, "rewards/margins": 1.3682477474212646, "rewards/rejected": -17.707321166992188, "step": 5170 }, { "epoch": 0.17442448346759243, "grad_norm": 15.688084602355957, "learning_rate": 9.83223299431502e-07, "logits/chosen": -0.45724186301231384, "logits/rejected": -0.6557691693305969, "logps/chosen": -1.8950207233428955, "logps/rejected": -1.9983152151107788, "loss": 2.846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.950206756591797, "rewards/margins": 1.032945990562439, "rewards/rejected": -19.983150482177734, "step": 5175 }, { "epoch": 0.17459300953857562, "grad_norm": 163.08804321289062, "learning_rate": 9.831476621748262e-07, "logits/chosen": -0.27674490213394165, "logits/rejected": -0.11143366992473602, "logps/chosen": -2.5093579292297363, "logps/rejected": -2.5167133808135986, "loss": 3.8225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.093576431274414, "rewards/margins": 0.07355757057666779, "rewards/rejected": -25.16713523864746, "step": 5180 }, { "epoch": 0.1747615356095588, "grad_norm": 39.82172393798828, "learning_rate": 9.8307185771978e-07, "logits/chosen": -0.4417055547237396, "logits/rejected": -0.5631288886070251, "logps/chosen": -1.964228630065918, "logps/rejected": -1.9234275817871094, "loss": 3.6418, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.642284393310547, "rewards/margins": -0.408011257648468, "rewards/rejected": -19.23427391052246, "step": 5185 }, { "epoch": 0.17493006168054198, "grad_norm": 16.453245162963867, "learning_rate": 9.82995886092596e-07, "logits/chosen": -0.4573501944541931, "logits/rejected": -0.3422870934009552, "logps/chosen": -2.0120418071746826, "logps/rejected": -2.1436915397644043, "loss": 2.3553, "rewards/accuracies": 0.5, "rewards/chosen": -20.120418548583984, "rewards/margins": 1.316498041152954, "rewards/rejected": -21.43691635131836, "step": 5190 }, { "epoch": 0.17509858775152515, "grad_norm": 18.16073989868164, "learning_rate": 9.829197473195653e-07, "logits/chosen": -0.5349117517471313, "logits/rejected": -0.5821327567100525, "logps/chosen": -1.5197559595108032, "logps/rejected": -1.6109358072280884, "loss": 2.6405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.197558403015137, "rewards/margins": 0.9117996096611023, "rewards/rejected": -16.109357833862305, "step": 5195 }, { "epoch": 0.17526711382250834, "grad_norm": 20.79674530029297, "learning_rate": 9.828434414270362e-07, "logits/chosen": -0.48446908593177795, "logits/rejected": -0.4432446360588074, "logps/chosen": -2.0608952045440674, "logps/rejected": -1.973127007484436, "loss": 3.9514, "rewards/accuracies": 0.5, "rewards/chosen": -20.608951568603516, "rewards/margins": -0.8776818513870239, "rewards/rejected": -19.73126983642578, "step": 5200 }, { "epoch": 0.17526711382250834, "eval_logits/chosen": -0.8001312017440796, "eval_logits/rejected": -0.8201078772544861, "eval_logps/chosen": -1.7195191383361816, "eval_logps/rejected": -1.7392371892929077, "eval_loss": 3.3126471042633057, "eval_rewards/accuracies": 0.5099999904632568, "eval_rewards/chosen": -17.195192337036133, "eval_rewards/margins": 0.1971810758113861, "eval_rewards/rejected": -17.392372131347656, "eval_runtime": 12.9075, "eval_samples_per_second": 7.747, "eval_steps_per_second": 1.937, "step": 5200 }, { "epoch": 0.17543563989349154, "grad_norm": 46.21132278442383, "learning_rate": 9.827669684414153e-07, "logits/chosen": -0.9259397387504578, "logits/rejected": -0.6422527432441711, "logps/chosen": -1.5217444896697998, "logps/rejected": -1.620764970779419, "loss": 2.2411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.217447280883789, "rewards/margins": 0.9902023077011108, "rewards/rejected": -16.2076473236084, "step": 5205 }, { "epoch": 0.1756041659644747, "grad_norm": 50.1036491394043, "learning_rate": 9.826903283891667e-07, "logits/chosen": -0.6379965543746948, "logits/rejected": -0.6837188005447388, "logps/chosen": -1.9009840488433838, "logps/rejected": -1.8843845129013062, "loss": 3.2654, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.009838104248047, "rewards/margins": -0.16599377989768982, "rewards/rejected": -18.84384536743164, "step": 5210 }, { "epoch": 0.1757726920354579, "grad_norm": 32.01786804199219, "learning_rate": 9.82613521296813e-07, "logits/chosen": -0.43797287344932556, "logits/rejected": -0.4878421723842621, "logps/chosen": -1.6981594562530518, "logps/rejected": -1.7703397274017334, "loss": 2.5659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.98159408569336, "rewards/margins": 0.7218042612075806, "rewards/rejected": -17.703399658203125, "step": 5215 }, { "epoch": 0.17594121810644106, "grad_norm": 24.32670021057129, "learning_rate": 9.825365471909337e-07, "logits/chosen": -0.22852332890033722, "logits/rejected": -0.220924973487854, "logps/chosen": -1.610447883605957, "logps/rejected": -1.6706794500350952, "loss": 2.923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.104480743408203, "rewards/margins": 0.6023159027099609, "rewards/rejected": -16.7067928314209, "step": 5220 }, { "epoch": 0.17610974417742425, "grad_norm": 19.045787811279297, "learning_rate": 9.824594060981665e-07, "logits/chosen": -0.2582782208919525, "logits/rejected": -0.3515141010284424, "logps/chosen": -1.7661092281341553, "logps/rejected": -1.8709022998809814, "loss": 2.6033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.66109275817871, "rewards/margins": 1.0479302406311035, "rewards/rejected": -18.70902442932129, "step": 5225 }, { "epoch": 0.17627827024840742, "grad_norm": 26.933544158935547, "learning_rate": 9.823820980452072e-07, "logits/chosen": -0.20558574795722961, "logits/rejected": -0.15755276381969452, "logps/chosen": -1.7019073963165283, "logps/rejected": -1.833353042602539, "loss": 2.3367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.01907730102539, "rewards/margins": 1.3144561052322388, "rewards/rejected": -18.33353042602539, "step": 5230 }, { "epoch": 0.17644679631939061, "grad_norm": 21.980003356933594, "learning_rate": 9.823046230588085e-07, "logits/chosen": -0.3728945851325989, "logits/rejected": -0.17971554398536682, "logps/chosen": -2.131380558013916, "logps/rejected": -2.4322307109832764, "loss": 1.9342, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.313804626464844, "rewards/margins": 3.00850248336792, "rewards/rejected": -24.322307586669922, "step": 5235 }, { "epoch": 0.17661532239037378, "grad_norm": 22.891450881958008, "learning_rate": 9.82226981165782e-07, "logits/chosen": -0.4645145833492279, "logits/rejected": -0.3906251788139343, "logps/chosen": -2.064272165298462, "logps/rejected": -2.2810354232788086, "loss": 3.3253, "rewards/accuracies": 0.5, "rewards/chosen": -20.64272117614746, "rewards/margins": 2.1676318645477295, "rewards/rejected": -22.810354232788086, "step": 5240 }, { "epoch": 0.17678384846135697, "grad_norm": 34.193443298339844, "learning_rate": 9.821491723929963e-07, "logits/chosen": 0.022101493552327156, "logits/rejected": -0.01094393152743578, "logps/chosen": -1.9077723026275635, "logps/rejected": -1.9567878246307373, "loss": 3.0994, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.077722549438477, "rewards/margins": 0.49015599489212036, "rewards/rejected": -19.567880630493164, "step": 5245 }, { "epoch": 0.17695237453234014, "grad_norm": 19.244543075561523, "learning_rate": 9.82071196767378e-07, "logits/chosen": -0.7040097117424011, "logits/rejected": -0.5882058143615723, "logps/chosen": -1.7580782175064087, "logps/rejected": -1.6740825176239014, "loss": 3.8844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.580781936645508, "rewards/margins": -0.8399547338485718, "rewards/rejected": -16.740825653076172, "step": 5250 }, { "epoch": 0.17712090060332333, "grad_norm": 23.584217071533203, "learning_rate": 9.819930543159112e-07, "logits/chosen": -0.47853463888168335, "logits/rejected": -0.4111636281013489, "logps/chosen": -1.689819097518921, "logps/rejected": -1.7779957056045532, "loss": 2.4064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.898193359375, "rewards/margins": 0.8817658424377441, "rewards/rejected": -17.779958724975586, "step": 5255 }, { "epoch": 0.17728942667430653, "grad_norm": 60.129798889160156, "learning_rate": 9.819147450656382e-07, "logits/chosen": -0.3128248155117035, "logits/rejected": -0.3781060576438904, "logps/chosen": -1.551443338394165, "logps/rejected": -1.5790199041366577, "loss": 2.8277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.514434814453125, "rewards/margins": 0.27576375007629395, "rewards/rejected": -15.790199279785156, "step": 5260 }, { "epoch": 0.1774579527452897, "grad_norm": 26.094388961791992, "learning_rate": 9.818362690436586e-07, "logits/chosen": -0.7319404482841492, "logits/rejected": -0.6763112545013428, "logps/chosen": -1.6520391702651978, "logps/rejected": -1.6662318706512451, "loss": 2.9878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.5203914642334, "rewards/margins": 0.14192715287208557, "rewards/rejected": -16.66231918334961, "step": 5265 }, { "epoch": 0.1776264788162729, "grad_norm": 82.54247283935547, "learning_rate": 9.817576262771298e-07, "logits/chosen": -0.11548449099063873, "logits/rejected": -0.031751085072755814, "logps/chosen": -2.201295852661133, "logps/rejected": -2.252549648284912, "loss": 2.7633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.012958526611328, "rewards/margins": 0.5125373601913452, "rewards/rejected": -22.525497436523438, "step": 5270 }, { "epoch": 0.17779500488725605, "grad_norm": 44.67779541015625, "learning_rate": 9.816788167932672e-07, "logits/chosen": -0.6120609045028687, "logits/rejected": -0.6190296411514282, "logps/chosen": -2.088254451751709, "logps/rejected": -2.0191900730133057, "loss": 4.0392, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.882543563842773, "rewards/margins": -0.6906436681747437, "rewards/rejected": -20.1919002532959, "step": 5275 }, { "epoch": 0.17796353095823925, "grad_norm": 31.414321899414062, "learning_rate": 9.815998406193436e-07, "logits/chosen": -0.32458242774009705, "logits/rejected": -0.31162434816360474, "logps/chosen": -2.0117292404174805, "logps/rejected": -1.9058940410614014, "loss": 4.1566, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.117290496826172, "rewards/margins": -1.058351755142212, "rewards/rejected": -19.058940887451172, "step": 5280 }, { "epoch": 0.1781320570292224, "grad_norm": 47.64741134643555, "learning_rate": 9.81520697782689e-07, "logits/chosen": -0.3747056722640991, "logits/rejected": -0.3660200238227844, "logps/chosen": -2.0151686668395996, "logps/rejected": -2.171931505203247, "loss": 2.8612, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.15168571472168, "rewards/margins": 1.5676277875900269, "rewards/rejected": -21.719314575195312, "step": 5285 }, { "epoch": 0.1783005831002056, "grad_norm": 37.78845977783203, "learning_rate": 9.814413883106924e-07, "logits/chosen": -0.2810427248477936, "logits/rejected": -0.30334943532943726, "logps/chosen": -2.2405996322631836, "logps/rejected": -2.317270517349243, "loss": 2.7636, "rewards/accuracies": 0.5, "rewards/chosen": -22.405996322631836, "rewards/margins": 0.7667078971862793, "rewards/rejected": -23.172704696655273, "step": 5290 }, { "epoch": 0.17846910917118877, "grad_norm": 24.239185333251953, "learning_rate": 9.813619122307993e-07, "logits/chosen": -0.2296961098909378, "logits/rejected": -0.23331353068351746, "logps/chosen": -1.9587428569793701, "logps/rejected": -1.9045612812042236, "loss": 3.8117, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.587427139282227, "rewards/margins": -0.5418151617050171, "rewards/rejected": -19.045612335205078, "step": 5295 }, { "epoch": 0.17863763524217197, "grad_norm": 116.19218444824219, "learning_rate": 9.81282269570513e-07, "logits/chosen": -0.6433163285255432, "logits/rejected": -0.4264778196811676, "logps/chosen": -2.0715627670288086, "logps/rejected": -2.044890880584717, "loss": 3.5245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.715627670288086, "rewards/margins": -0.266719251871109, "rewards/rejected": -20.448909759521484, "step": 5300 }, { "epoch": 0.17880616131315513, "grad_norm": 21.32752227783203, "learning_rate": 9.812024603573954e-07, "logits/chosen": -0.4090822637081146, "logits/rejected": -0.46003809571266174, "logps/chosen": -1.727638840675354, "logps/rejected": -1.7646032571792603, "loss": 2.8879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.27638816833496, "rewards/margins": 0.36964479088783264, "rewards/rejected": -17.646032333374023, "step": 5305 }, { "epoch": 0.17897468738413833, "grad_norm": 23.98275375366211, "learning_rate": 9.811224846190647e-07, "logits/chosen": -0.5765715837478638, "logits/rejected": -0.6701818704605103, "logps/chosen": -1.867069959640503, "logps/rejected": -1.9343897104263306, "loss": 2.684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.67070198059082, "rewards/margins": 0.6731952428817749, "rewards/rejected": -19.343896865844727, "step": 5310 }, { "epoch": 0.17914321345512152, "grad_norm": 15.5569429397583, "learning_rate": 9.810423423831974e-07, "logits/chosen": -0.8625133633613586, "logits/rejected": -0.7863910794258118, "logps/chosen": -1.8105857372283936, "logps/rejected": -1.9400856494903564, "loss": 2.0151, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.105857849121094, "rewards/margins": 1.2949975728988647, "rewards/rejected": -19.400856018066406, "step": 5315 }, { "epoch": 0.17931173952610469, "grad_norm": 32.97955322265625, "learning_rate": 9.80962033677528e-07, "logits/chosen": -0.32209211587905884, "logits/rejected": -0.5163687467575073, "logps/chosen": -1.8706210851669312, "logps/rejected": -1.6956441402435303, "loss": 4.881, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.70621109008789, "rewards/margins": -1.7497684955596924, "rewards/rejected": -16.95644187927246, "step": 5320 }, { "epoch": 0.17948026559708788, "grad_norm": 24.574478149414062, "learning_rate": 9.808815585298475e-07, "logits/chosen": -0.28291743993759155, "logits/rejected": -0.28594347834587097, "logps/chosen": -1.8679603338241577, "logps/rejected": -1.9361158609390259, "loss": 2.8133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.679601669311523, "rewards/margins": 0.6815553903579712, "rewards/rejected": -19.361156463623047, "step": 5325 }, { "epoch": 0.17964879166807105, "grad_norm": 14.93967342376709, "learning_rate": 9.80800916968006e-07, "logits/chosen": -0.755969762802124, "logits/rejected": -0.7597614526748657, "logps/chosen": -1.9039316177368164, "logps/rejected": -2.2054519653320312, "loss": 2.0603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.039318084716797, "rewards/margins": 3.015202045440674, "rewards/rejected": -22.05451774597168, "step": 5330 }, { "epoch": 0.17981731773905424, "grad_norm": 15.54341983795166, "learning_rate": 9.807201090199095e-07, "logits/chosen": -0.4750305116176605, "logits/rejected": -0.5239988565444946, "logps/chosen": -1.9736168384552002, "logps/rejected": -1.8567231893539429, "loss": 4.2195, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.736167907714844, "rewards/margins": -1.1689354181289673, "rewards/rejected": -18.567232131958008, "step": 5335 }, { "epoch": 0.1799858438100374, "grad_norm": 26.10828399658203, "learning_rate": 9.806391347135233e-07, "logits/chosen": -1.0066381692886353, "logits/rejected": -1.0506072044372559, "logps/chosen": -1.6646168231964111, "logps/rejected": -1.6354296207427979, "loss": 3.5164, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.646167755126953, "rewards/margins": -0.29187268018722534, "rewards/rejected": -16.354293823242188, "step": 5340 }, { "epoch": 0.1801543698810206, "grad_norm": 22.388622283935547, "learning_rate": 9.805579940768687e-07, "logits/chosen": -0.6220000982284546, "logits/rejected": -0.43516239523887634, "logps/chosen": -1.761366605758667, "logps/rejected": -1.8666290044784546, "loss": 2.7263, "rewards/accuracies": 0.5, "rewards/chosen": -17.613666534423828, "rewards/margins": 1.0526221990585327, "rewards/rejected": -18.66628646850586, "step": 5345 }, { "epoch": 0.18032289595200376, "grad_norm": 25.53797721862793, "learning_rate": 9.804766871380257e-07, "logits/chosen": -0.4475626051425934, "logits/rejected": -0.5037604570388794, "logps/chosen": -1.771932601928711, "logps/rejected": -1.826242446899414, "loss": 2.6043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.719324111938477, "rewards/margins": 0.543099045753479, "rewards/rejected": -18.26242446899414, "step": 5350 }, { "epoch": 0.18049142202298696, "grad_norm": 22.603837966918945, "learning_rate": 9.803952139251311e-07, "logits/chosen": -1.0026981830596924, "logits/rejected": -1.0315004587173462, "logps/chosen": -1.718327522277832, "logps/rejected": -1.6823968887329102, "loss": 3.4208, "rewards/accuracies": 0.5, "rewards/chosen": -17.183277130126953, "rewards/margins": -0.35930928587913513, "rewards/rejected": -16.8239688873291, "step": 5355 }, { "epoch": 0.18065994809397012, "grad_norm": 18.36005401611328, "learning_rate": 9.803135744663802e-07, "logits/chosen": -0.5656792521476746, "logits/rejected": -0.49610671401023865, "logps/chosen": -1.738930344581604, "logps/rejected": -1.7509796619415283, "loss": 3.1428, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.38930320739746, "rewards/margins": 0.12049512565135956, "rewards/rejected": -17.509796142578125, "step": 5360 }, { "epoch": 0.18082847416495332, "grad_norm": 25.456018447875977, "learning_rate": 9.802317687900247e-07, "logits/chosen": -0.6400080323219299, "logits/rejected": -0.5408387184143066, "logps/chosen": -1.99357008934021, "logps/rejected": -1.9206161499023438, "loss": 3.8065, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.935699462890625, "rewards/margins": -0.72953861951828, "rewards/rejected": -19.206161499023438, "step": 5365 }, { "epoch": 0.1809970002359365, "grad_norm": 26.897117614746094, "learning_rate": 9.80149796924374e-07, "logits/chosen": -0.41049957275390625, "logits/rejected": -0.5050762295722961, "logps/chosen": -2.407325267791748, "logps/rejected": -2.060502529144287, "loss": 6.8161, "rewards/accuracies": 0.5, "rewards/chosen": -24.073257446289062, "rewards/margins": -3.4682304859161377, "rewards/rejected": -20.605026245117188, "step": 5370 }, { "epoch": 0.18116552630691968, "grad_norm": 22.910654067993164, "learning_rate": 9.80067658897796e-07, "logits/chosen": -0.7714192271232605, "logits/rejected": -0.8598917126655579, "logps/chosen": -1.7116810083389282, "logps/rejected": -1.7139278650283813, "loss": 3.0925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.11680793762207, "rewards/margins": 0.02247037924826145, "rewards/rejected": -17.139278411865234, "step": 5375 }, { "epoch": 0.18133405237790287, "grad_norm": 24.104690551757812, "learning_rate": 9.799853547387152e-07, "logits/chosen": -0.5268293023109436, "logits/rejected": -0.4755741059780121, "logps/chosen": -2.100019931793213, "logps/rejected": -2.20682430267334, "loss": 2.3812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.000202178955078, "rewards/margins": 1.068042516708374, "rewards/rejected": -22.0682430267334, "step": 5380 }, { "epoch": 0.18150257844888604, "grad_norm": 24.671052932739258, "learning_rate": 9.799028844756137e-07, "logits/chosen": -0.4852059781551361, "logits/rejected": -0.4492795467376709, "logps/chosen": -2.029736042022705, "logps/rejected": -2.0513997077941895, "loss": 2.9847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.297359466552734, "rewards/margins": 0.21663923561573029, "rewards/rejected": -20.513999938964844, "step": 5385 }, { "epoch": 0.18167110451986923, "grad_norm": 29.028913497924805, "learning_rate": 9.798202481370314e-07, "logits/chosen": -0.779220700263977, "logits/rejected": -0.7618386149406433, "logps/chosen": -1.896601676940918, "logps/rejected": -1.668404221534729, "loss": 5.3947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.966014862060547, "rewards/margins": -2.281973361968994, "rewards/rejected": -16.68404197692871, "step": 5390 }, { "epoch": 0.1818396305908524, "grad_norm": 28.980819702148438, "learning_rate": 9.797374457515652e-07, "logits/chosen": -0.45707112550735474, "logits/rejected": -0.532502293586731, "logps/chosen": -1.9134578704833984, "logps/rejected": -1.8541061878204346, "loss": 3.8084, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -19.134578704833984, "rewards/margins": -0.5935171842575073, "rewards/rejected": -18.541059494018555, "step": 5395 }, { "epoch": 0.1820081566618356, "grad_norm": 39.4348030090332, "learning_rate": 9.796544773478701e-07, "logits/chosen": -0.3773210644721985, "logits/rejected": -0.3709460198879242, "logps/chosen": -2.3887171745300293, "logps/rejected": -2.5245351791381836, "loss": 1.9783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.88717269897461, "rewards/margins": 1.3581793308258057, "rewards/rejected": -25.245349884033203, "step": 5400 }, { "epoch": 0.18217668273281876, "grad_norm": 25.09897232055664, "learning_rate": 9.79571342954658e-07, "logits/chosen": -0.42378631234169006, "logits/rejected": -0.3182446360588074, "logps/chosen": -2.159898281097412, "logps/rejected": -2.1635546684265137, "loss": 4.122, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.598981857299805, "rewards/margins": 0.03656425327062607, "rewards/rejected": -21.63554573059082, "step": 5405 }, { "epoch": 0.18234520880380195, "grad_norm": 40.908599853515625, "learning_rate": 9.794880426006983e-07, "logits/chosen": -0.7844399809837341, "logits/rejected": -0.6861599087715149, "logps/chosen": -1.6490141153335571, "logps/rejected": -1.6898380517959595, "loss": 2.8132, "rewards/accuracies": 0.5, "rewards/chosen": -16.49013900756836, "rewards/margins": 0.40823841094970703, "rewards/rejected": -16.898380279541016, "step": 5410 }, { "epoch": 0.18251373487478512, "grad_norm": 39.26100540161133, "learning_rate": 9.794045763148184e-07, "logits/chosen": -0.6198582053184509, "logits/rejected": -0.6737528443336487, "logps/chosen": -1.8701423406600952, "logps/rejected": -1.860345482826233, "loss": 3.2316, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.70142364501953, "rewards/margins": -0.0979672446846962, "rewards/rejected": -18.603458404541016, "step": 5415 }, { "epoch": 0.1826822609457683, "grad_norm": 32.086490631103516, "learning_rate": 9.793209441259022e-07, "logits/chosen": -0.5078805685043335, "logits/rejected": -0.5676628947257996, "logps/chosen": -1.7885487079620361, "logps/rejected": -1.8656467199325562, "loss": 2.5302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.885486602783203, "rewards/margins": 0.7709810137748718, "rewards/rejected": -18.65646743774414, "step": 5420 }, { "epoch": 0.1828507870167515, "grad_norm": 19.564838409423828, "learning_rate": 9.79237146062892e-07, "logits/chosen": -0.26721253991127014, "logits/rejected": -0.35675540566444397, "logps/chosen": -1.4722105264663696, "logps/rejected": -1.6488120555877686, "loss": 2.0667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.7221040725708, "rewards/margins": 1.7660157680511475, "rewards/rejected": -16.488121032714844, "step": 5425 }, { "epoch": 0.18301931308773467, "grad_norm": 34.31207275390625, "learning_rate": 9.791531821547865e-07, "logits/chosen": -0.23346984386444092, "logits/rejected": -0.258556604385376, "logps/chosen": -2.2937192916870117, "logps/rejected": -2.3578433990478516, "loss": 3.7761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.937192916870117, "rewards/margins": 0.6412402391433716, "rewards/rejected": -23.578433990478516, "step": 5430 }, { "epoch": 0.18318783915871786, "grad_norm": 25.44965934753418, "learning_rate": 9.790690524306426e-07, "logits/chosen": -0.47591328620910645, "logits/rejected": -0.679408609867096, "logps/chosen": -1.8982775211334229, "logps/rejected": -1.9024616479873657, "loss": 3.0692, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.98277473449707, "rewards/margins": 0.0418427474796772, "rewards/rejected": -19.024616241455078, "step": 5435 }, { "epoch": 0.18335636522970103, "grad_norm": 28.910072326660156, "learning_rate": 9.789847569195742e-07, "logits/chosen": -0.5068162083625793, "logits/rejected": -0.3690851330757141, "logps/chosen": -2.17146635055542, "logps/rejected": -2.3016185760498047, "loss": 1.9669, "rewards/accuracies": 1.0, "rewards/chosen": -21.714664459228516, "rewards/margins": 1.3015224933624268, "rewards/rejected": -23.016185760498047, "step": 5440 }, { "epoch": 0.18352489130068422, "grad_norm": 27.728517532348633, "learning_rate": 9.789002956507525e-07, "logits/chosen": -0.4048040509223938, "logits/rejected": -0.4588392376899719, "logps/chosen": -1.8538213968276978, "logps/rejected": -2.2220959663391113, "loss": 2.2989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.5382137298584, "rewards/margins": 3.6827454566955566, "rewards/rejected": -22.220958709716797, "step": 5445 }, { "epoch": 0.1836934173716674, "grad_norm": 60.68367385864258, "learning_rate": 9.788156686534069e-07, "logits/chosen": -0.3275999426841736, "logits/rejected": -0.19864805042743683, "logps/chosen": -2.4291739463806152, "logps/rejected": -2.6376471519470215, "loss": 2.9337, "rewards/accuracies": 0.5, "rewards/chosen": -24.29174041748047, "rewards/margins": 2.084731101989746, "rewards/rejected": -26.3764705657959, "step": 5450 }, { "epoch": 0.18386194344265058, "grad_norm": 24.850337982177734, "learning_rate": 9.787308759568225e-07, "logits/chosen": -0.6782561540603638, "logits/rejected": -0.5187098979949951, "logps/chosen": -1.7515907287597656, "logps/rejected": -2.032395839691162, "loss": 1.6244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.515907287597656, "rewards/margins": 2.8080544471740723, "rewards/rejected": -20.32396125793457, "step": 5455 }, { "epoch": 0.18403046951363375, "grad_norm": 44.323890686035156, "learning_rate": 9.786459175903433e-07, "logits/chosen": -0.6140015125274658, "logits/rejected": -0.4879150986671448, "logps/chosen": -2.3758678436279297, "logps/rejected": -2.6171932220458984, "loss": 2.4341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.758678436279297, "rewards/margins": 2.4132542610168457, "rewards/rejected": -26.17193031311035, "step": 5460 }, { "epoch": 0.18419899558461694, "grad_norm": 20.042804718017578, "learning_rate": 9.7856079358337e-07, "logits/chosen": -0.587514340877533, "logits/rejected": -0.5617462396621704, "logps/chosen": -1.873822808265686, "logps/rejected": -1.8781769275665283, "loss": 3.4026, "rewards/accuracies": 0.5, "rewards/chosen": -18.73822784423828, "rewards/margins": 0.04354066774249077, "rewards/rejected": -18.781768798828125, "step": 5465 }, { "epoch": 0.1843675216556001, "grad_norm": 16.114381790161133, "learning_rate": 9.784755039653605e-07, "logits/chosen": -0.44803792238235474, "logits/rejected": -0.6248019337654114, "logps/chosen": -1.5381447076797485, "logps/rejected": -1.7500909566879272, "loss": 2.2383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.381446838378906, "rewards/margins": 2.1194636821746826, "rewards/rejected": -17.50090980529785, "step": 5470 }, { "epoch": 0.1845360477265833, "grad_norm": 184.44888305664062, "learning_rate": 9.783900487658304e-07, "logits/chosen": -0.37028566002845764, "logits/rejected": -0.23744535446166992, "logps/chosen": -2.110156536102295, "logps/rejected": -1.7765562534332275, "loss": 7.1675, "rewards/accuracies": 0.5, "rewards/chosen": -21.1015682220459, "rewards/margins": -3.3360047340393066, "rewards/rejected": -17.76556396484375, "step": 5475 }, { "epoch": 0.1847045737975665, "grad_norm": 22.66653060913086, "learning_rate": 9.78304428014352e-07, "logits/chosen": -0.8866994976997375, "logits/rejected": -0.7490772008895874, "logps/chosen": -1.6607614755630493, "logps/rejected": -1.740256905555725, "loss": 2.4509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.60761260986328, "rewards/margins": 0.7949555516242981, "rewards/rejected": -17.402568817138672, "step": 5480 }, { "epoch": 0.18487309986854966, "grad_norm": 29.36440658569336, "learning_rate": 9.782186417405556e-07, "logits/chosen": -0.21898670494556427, "logits/rejected": -0.17396847903728485, "logps/chosen": -2.279313564300537, "logps/rejected": -2.278048515319824, "loss": 3.2844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.793132781982422, "rewards/margins": -0.012647914700210094, "rewards/rejected": -22.78048324584961, "step": 5485 }, { "epoch": 0.18504162593953286, "grad_norm": 25.005599975585938, "learning_rate": 9.781326899741284e-07, "logits/chosen": -0.6242612600326538, "logits/rejected": -0.6651891469955444, "logps/chosen": -1.8814970254898071, "logps/rejected": -1.8976047039031982, "loss": 3.0298, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.814970016479492, "rewards/margins": 0.16107892990112305, "rewards/rejected": -18.976049423217773, "step": 5490 }, { "epoch": 0.18521015201051602, "grad_norm": 32.897987365722656, "learning_rate": 9.780465727448149e-07, "logits/chosen": -0.5068541169166565, "logits/rejected": -0.54652339220047, "logps/chosen": -1.840574860572815, "logps/rejected": -1.9388887882232666, "loss": 3.0219, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.405750274658203, "rewards/margins": 0.983138918876648, "rewards/rejected": -19.388887405395508, "step": 5495 }, { "epoch": 0.18537867808149922, "grad_norm": 19.555273056030273, "learning_rate": 9.779602900824167e-07, "logits/chosen": -0.5434045791625977, "logits/rejected": -0.7178775668144226, "logps/chosen": -1.7472089529037476, "logps/rejected": -1.7612006664276123, "loss": 2.9711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.472087860107422, "rewards/margins": 0.13991737365722656, "rewards/rejected": -17.61200714111328, "step": 5500 }, { "epoch": 0.18554720415248238, "grad_norm": 41.695682525634766, "learning_rate": 9.77873842016793e-07, "logits/chosen": -0.681435227394104, "logits/rejected": -0.6236013174057007, "logps/chosen": -1.890181541442871, "logps/rejected": -1.9118419885635376, "loss": 3.0724, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.901817321777344, "rewards/margins": 0.21660356223583221, "rewards/rejected": -19.118419647216797, "step": 5505 }, { "epoch": 0.18571573022346557, "grad_norm": 21.719018936157227, "learning_rate": 9.777872285778603e-07, "logits/chosen": -0.6559665203094482, "logits/rejected": -0.5704790949821472, "logps/chosen": -1.7349132299423218, "logps/rejected": -1.7577717304229736, "loss": 2.9269, "rewards/accuracies": 0.5, "rewards/chosen": -17.349132537841797, "rewards/margins": 0.22858504951000214, "rewards/rejected": -17.577716827392578, "step": 5510 }, { "epoch": 0.18588425629444874, "grad_norm": 30.6837215423584, "learning_rate": 9.777004497955918e-07, "logits/chosen": -0.9211034774780273, "logits/rejected": -0.9266023635864258, "logps/chosen": -1.7248684167861938, "logps/rejected": -1.8038078546524048, "loss": 2.5282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.24868392944336, "rewards/margins": 0.7893939018249512, "rewards/rejected": -18.03807830810547, "step": 5515 }, { "epoch": 0.18605278236543193, "grad_norm": 30.61154556274414, "learning_rate": 9.77613505700018e-07, "logits/chosen": -0.4484230875968933, "logits/rejected": -0.28866034746170044, "logps/chosen": -1.8092586994171143, "logps/rejected": -1.8665987253189087, "loss": 3.0392, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.092586517333984, "rewards/margins": 0.5734013319015503, "rewards/rejected": -18.66598892211914, "step": 5520 }, { "epoch": 0.1862213084364151, "grad_norm": 24.71278190612793, "learning_rate": 9.775263963212275e-07, "logits/chosen": -0.9451042413711548, "logits/rejected": -0.9575036764144897, "logps/chosen": -1.615065336227417, "logps/rejected": -1.6316133737564087, "loss": 3.1126, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.150653839111328, "rewards/margins": 0.16548070311546326, "rewards/rejected": -16.31613540649414, "step": 5525 }, { "epoch": 0.1863898345073983, "grad_norm": 18.097076416015625, "learning_rate": 9.774391216893646e-07, "logits/chosen": -0.41714709997177124, "logits/rejected": -0.2718644142150879, "logps/chosen": -1.9782493114471436, "logps/rejected": -2.1051318645477295, "loss": 2.3445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.78249168395996, "rewards/margins": 1.2688255310058594, "rewards/rejected": -21.051319122314453, "step": 5530 }, { "epoch": 0.1865583605783815, "grad_norm": 20.818796157836914, "learning_rate": 9.773516818346323e-07, "logits/chosen": -0.8004224896430969, "logits/rejected": -0.694173276424408, "logps/chosen": -1.8741105794906616, "logps/rejected": -1.9920237064361572, "loss": 2.3865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.741107940673828, "rewards/margins": 1.1791307926177979, "rewards/rejected": -19.920238494873047, "step": 5535 }, { "epoch": 0.18672688664936465, "grad_norm": 23.69890594482422, "learning_rate": 9.772640767872899e-07, "logits/chosen": -0.4144554138183594, "logits/rejected": -0.4145297110080719, "logps/chosen": -1.9912002086639404, "logps/rejected": -1.9969688653945923, "loss": 4.3266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.912002563476562, "rewards/margins": 0.05768384784460068, "rewards/rejected": -19.96968650817871, "step": 5540 }, { "epoch": 0.18689541272034785, "grad_norm": 24.523786544799805, "learning_rate": 9.771763065776538e-07, "logits/chosen": -0.5034445524215698, "logits/rejected": -0.6185353994369507, "logps/chosen": -1.7700822353363037, "logps/rejected": -1.7892067432403564, "loss": 2.9114, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.700820922851562, "rewards/margins": 0.1912446916103363, "rewards/rejected": -17.892066955566406, "step": 5545 }, { "epoch": 0.187063938791331, "grad_norm": 18.63873863220215, "learning_rate": 9.77088371236098e-07, "logits/chosen": -0.5159907937049866, "logits/rejected": -0.5340949296951294, "logps/chosen": -1.7579313516616821, "logps/rejected": -1.73773992061615, "loss": 3.3439, "rewards/accuracies": 0.5, "rewards/chosen": -17.57931137084961, "rewards/margins": -0.2019151747226715, "rewards/rejected": -17.377399444580078, "step": 5550 }, { "epoch": 0.1872324648623142, "grad_norm": 41.46942138671875, "learning_rate": 9.770002707930535e-07, "logits/chosen": -0.37359291315078735, "logits/rejected": -0.4222946763038635, "logps/chosen": -1.71608567237854, "logps/rejected": -1.8431408405303955, "loss": 2.3919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.160858154296875, "rewards/margins": 1.2705483436584473, "rewards/rejected": -18.431407928466797, "step": 5555 }, { "epoch": 0.18740099093329737, "grad_norm": 30.87368392944336, "learning_rate": 9.769120052790084e-07, "logits/chosen": -0.4397002160549164, "logits/rejected": -0.3921523094177246, "logps/chosen": -1.8612377643585205, "logps/rejected": -1.8991447687149048, "loss": 2.8729, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.612377166748047, "rewards/margins": 0.3790697157382965, "rewards/rejected": -18.99144744873047, "step": 5560 }, { "epoch": 0.18756951700428057, "grad_norm": 23.553661346435547, "learning_rate": 9.768235747245078e-07, "logits/chosen": -0.25023049116134644, "logits/rejected": -0.2950226366519928, "logps/chosen": -1.9554675817489624, "logps/rejected": -2.0229415893554688, "loss": 2.6882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.554676055908203, "rewards/margins": 0.6747380495071411, "rewards/rejected": -20.229412078857422, "step": 5565 }, { "epoch": 0.18773804307526373, "grad_norm": 24.0847110748291, "learning_rate": 9.767349791601539e-07, "logits/chosen": -0.7085575461387634, "logits/rejected": -0.7902520895004272, "logps/chosen": -1.752450942993164, "logps/rejected": -1.6067931652069092, "loss": 4.5458, "rewards/accuracies": 0.5, "rewards/chosen": -17.52450942993164, "rewards/margins": -1.456578016281128, "rewards/rejected": -16.06793212890625, "step": 5570 }, { "epoch": 0.18790656914624693, "grad_norm": 25.039995193481445, "learning_rate": 9.766462186166064e-07, "logits/chosen": -0.3297646939754486, "logits/rejected": -0.5008463263511658, "logps/chosen": -1.6626918315887451, "logps/rejected": -1.8057029247283936, "loss": 2.5971, "rewards/accuracies": 0.5, "rewards/chosen": -16.62691879272461, "rewards/margins": 1.430110216140747, "rewards/rejected": -18.057031631469727, "step": 5575 }, { "epoch": 0.1880750952172301, "grad_norm": 36.930419921875, "learning_rate": 9.76557293124582e-07, "logits/chosen": -0.6175158023834229, "logits/rejected": -0.5426197648048401, "logps/chosen": -1.818953275680542, "logps/rejected": -1.803902268409729, "loss": 3.4941, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.189531326293945, "rewards/margins": -0.15051087737083435, "rewards/rejected": -18.039020538330078, "step": 5580 }, { "epoch": 0.1882436212882133, "grad_norm": 17.54623794555664, "learning_rate": 9.764682027148538e-07, "logits/chosen": -0.9951989054679871, "logits/rejected": -0.990329921245575, "logps/chosen": -1.5430244207382202, "logps/rejected": -1.5838948488235474, "loss": 2.7648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.430246353149414, "rewards/margins": 0.4087028503417969, "rewards/rejected": -15.838948249816895, "step": 5585 }, { "epoch": 0.18841214735919648, "grad_norm": 22.05076026916504, "learning_rate": 9.763789474182529e-07, "logits/chosen": -0.5766128301620483, "logits/rejected": -0.6728538870811462, "logps/chosen": -1.7131710052490234, "logps/rejected": -1.6873975992202759, "loss": 3.4466, "rewards/accuracies": 0.5, "rewards/chosen": -17.131710052490234, "rewards/margins": -0.2577347755432129, "rewards/rejected": -16.87397575378418, "step": 5590 }, { "epoch": 0.18858067343017965, "grad_norm": 0.013031297363340855, "learning_rate": 9.762895272656667e-07, "logits/chosen": -0.33102065324783325, "logits/rejected": -0.30809807777404785, "logps/chosen": -2.0100135803222656, "logps/rejected": -2.431529998779297, "loss": 2.0247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.10013771057129, "rewards/margins": 4.215163230895996, "rewards/rejected": -24.3153018951416, "step": 5595 }, { "epoch": 0.18874919950116284, "grad_norm": 18.23565101623535, "learning_rate": 9.761999422880402e-07, "logits/chosen": -0.45090073347091675, "logits/rejected": -0.39279770851135254, "logps/chosen": -1.7720234394073486, "logps/rejected": -1.8577537536621094, "loss": 2.4942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.720233917236328, "rewards/margins": 0.8573040962219238, "rewards/rejected": -18.577539443969727, "step": 5600 }, { "epoch": 0.18874919950116284, "eval_logits/chosen": -0.7959946990013123, "eval_logits/rejected": -0.818673312664032, "eval_logps/chosen": -1.7273130416870117, "eval_logps/rejected": -1.749546766281128, "eval_loss": 3.2864151000976562, "eval_rewards/accuracies": 0.5099999904632568, "eval_rewards/chosen": -17.273130416870117, "eval_rewards/margins": 0.22233757376670837, "eval_rewards/rejected": -17.495468139648438, "eval_runtime": 12.912, "eval_samples_per_second": 7.745, "eval_steps_per_second": 1.936, "step": 5600 }, { "epoch": 0.188917725572146, "grad_norm": 26.624677658081055, "learning_rate": 9.761101925163752e-07, "logits/chosen": -0.595131516456604, "logits/rejected": -0.687487781047821, "logps/chosen": -1.9345438480377197, "logps/rejected": -2.0004289150238037, "loss": 2.9438, "rewards/accuracies": 0.5, "rewards/chosen": -19.34543800354004, "rewards/margins": 0.6588513255119324, "rewards/rejected": -20.004289627075195, "step": 5605 }, { "epoch": 0.1890862516431292, "grad_norm": 32.13505172729492, "learning_rate": 9.76020277981731e-07, "logits/chosen": -0.4904160499572754, "logits/rejected": -0.33541202545166016, "logps/chosen": -1.7449373006820679, "logps/rejected": -1.8215818405151367, "loss": 2.8797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.44937515258789, "rewards/margins": 0.7664445042610168, "rewards/rejected": -18.215818405151367, "step": 5610 }, { "epoch": 0.18925477771411237, "grad_norm": 9.597542762756348, "learning_rate": 9.759301987152225e-07, "logits/chosen": -0.3985956609249115, "logits/rejected": -0.4843437075614929, "logps/chosen": -1.9301046133041382, "logps/rejected": -1.9960343837738037, "loss": 2.5725, "rewards/accuracies": 0.5, "rewards/chosen": -19.30104637145996, "rewards/margins": 0.6592990756034851, "rewards/rejected": -19.960346221923828, "step": 5615 }, { "epoch": 0.18942330378509556, "grad_norm": 22.30307388305664, "learning_rate": 9.758399547480232e-07, "logits/chosen": -0.4911147654056549, "logits/rejected": -0.5110594034194946, "logps/chosen": -1.7245635986328125, "logps/rejected": -1.575623869895935, "loss": 4.65, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -17.245634078979492, "rewards/margins": -1.4893945455551147, "rewards/rejected": -15.75623893737793, "step": 5620 }, { "epoch": 0.18959182985607873, "grad_norm": 18.47562599182129, "learning_rate": 9.757495461113632e-07, "logits/chosen": -0.432157427072525, "logits/rejected": -0.41354498267173767, "logps/chosen": -1.7506507635116577, "logps/rejected": -1.8003524541854858, "loss": 2.9609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.506507873535156, "rewards/margins": 0.49701786041259766, "rewards/rejected": -18.00352668762207, "step": 5625 }, { "epoch": 0.18976035592706192, "grad_norm": 27.529369354248047, "learning_rate": 9.756589728365288e-07, "logits/chosen": -0.569477379322052, "logits/rejected": -0.5368244051933289, "logps/chosen": -1.5696738958358765, "logps/rejected": -1.6201614141464233, "loss": 2.7874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.696739196777344, "rewards/margins": 0.5048743486404419, "rewards/rejected": -16.201614379882812, "step": 5630 }, { "epoch": 0.18992888199804508, "grad_norm": 25.479154586791992, "learning_rate": 9.755682349548643e-07, "logits/chosen": -0.36692845821380615, "logits/rejected": -0.32827451825141907, "logps/chosen": -1.7392432689666748, "logps/rejected": -1.812835693359375, "loss": 2.8218, "rewards/accuracies": 0.5, "rewards/chosen": -17.39243507385254, "rewards/margins": 0.7359241247177124, "rewards/rejected": -18.12835693359375, "step": 5635 }, { "epoch": 0.19009740806902828, "grad_norm": 18.921920776367188, "learning_rate": 9.7547733249777e-07, "logits/chosen": -0.7113291025161743, "logits/rejected": -0.6769397258758545, "logps/chosen": -1.5992504358291626, "logps/rejected": -1.6348832845687866, "loss": 3.048, "rewards/accuracies": 0.5, "rewards/chosen": -15.992505073547363, "rewards/margins": 0.356327623128891, "rewards/rejected": -16.348833084106445, "step": 5640 }, { "epoch": 0.19026593414001147, "grad_norm": 24.759885787963867, "learning_rate": 9.753862654967044e-07, "logits/chosen": -0.6169870495796204, "logits/rejected": -0.9548047184944153, "logps/chosen": -1.5832029581069946, "logps/rejected": -1.6658849716186523, "loss": 2.3219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.83202838897705, "rewards/margins": 0.8268213272094727, "rewards/rejected": -16.658849716186523, "step": 5645 }, { "epoch": 0.19043446021099464, "grad_norm": 23.502939224243164, "learning_rate": 9.752950339831815e-07, "logits/chosen": -0.6806584000587463, "logits/rejected": -0.6301476955413818, "logps/chosen": -1.8754730224609375, "logps/rejected": -1.8923496007919312, "loss": 3.2034, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.754730224609375, "rewards/margins": 0.1687663048505783, "rewards/rejected": -18.92349624633789, "step": 5650 }, { "epoch": 0.19060298628197783, "grad_norm": 41.726806640625, "learning_rate": 9.752036379887733e-07, "logits/chosen": -0.5170332789421082, "logits/rejected": -0.577129065990448, "logps/chosen": -1.943185806274414, "logps/rejected": -1.9628463983535767, "loss": 4.2989, "rewards/accuracies": 0.5, "rewards/chosen": -19.43185806274414, "rewards/margins": 0.1966075897216797, "rewards/rejected": -19.628467559814453, "step": 5655 }, { "epoch": 0.190771512352961, "grad_norm": 16.944015502929688, "learning_rate": 9.751120775451083e-07, "logits/chosen": -0.3677743077278137, "logits/rejected": -0.34167230129241943, "logps/chosen": -2.053145170211792, "logps/rejected": -2.0245869159698486, "loss": 3.7197, "rewards/accuracies": 0.5, "rewards/chosen": -20.531452178955078, "rewards/margins": -0.2855833172798157, "rewards/rejected": -20.245868682861328, "step": 5660 }, { "epoch": 0.1909400384239442, "grad_norm": 31.120407104492188, "learning_rate": 9.750203526838719e-07, "logits/chosen": -0.3285277783870697, "logits/rejected": -0.3090980052947998, "logps/chosen": -1.9437472820281982, "logps/rejected": -1.775830626487732, "loss": 4.7381, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.437475204467773, "rewards/margins": -1.6791677474975586, "rewards/rejected": -17.7583065032959, "step": 5665 }, { "epoch": 0.19110856449492736, "grad_norm": 27.374696731567383, "learning_rate": 9.749284634368064e-07, "logits/chosen": -0.2821030020713806, "logits/rejected": -0.4119594097137451, "logps/chosen": -1.6939541101455688, "logps/rejected": -1.643109917640686, "loss": 3.578, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.93954086303711, "rewards/margins": -0.5084422826766968, "rewards/rejected": -16.43109893798828, "step": 5670 }, { "epoch": 0.19127709056591055, "grad_norm": 27.388362884521484, "learning_rate": 9.748364098357113e-07, "logits/chosen": -0.5865864157676697, "logits/rejected": -0.6384282112121582, "logps/chosen": -1.9883333444595337, "logps/rejected": -1.9986881017684937, "loss": 3.3969, "rewards/accuracies": 0.5, "rewards/chosen": -19.88333511352539, "rewards/margins": 0.10354681313037872, "rewards/rejected": -19.986881256103516, "step": 5675 }, { "epoch": 0.19144561663689372, "grad_norm": 28.64921760559082, "learning_rate": 9.747441919124426e-07, "logits/chosen": -0.3893504738807678, "logits/rejected": -0.5681343078613281, "logps/chosen": -2.015651226043701, "logps/rejected": -2.0036251544952393, "loss": 3.2282, "rewards/accuracies": 0.5, "rewards/chosen": -20.156513214111328, "rewards/margins": -0.12026166915893555, "rewards/rejected": -20.036251068115234, "step": 5680 }, { "epoch": 0.1916141427078769, "grad_norm": 39.19678497314453, "learning_rate": 9.74651809698913e-07, "logits/chosen": -0.34206151962280273, "logits/rejected": -0.2204916775226593, "logps/chosen": -1.8198713064193726, "logps/rejected": -1.842043161392212, "loss": 2.984, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.198715209960938, "rewards/margins": 0.22171735763549805, "rewards/rejected": -18.42043113708496, "step": 5685 }, { "epoch": 0.19178266877886008, "grad_norm": 52.62766647338867, "learning_rate": 9.74559263227093e-07, "logits/chosen": -0.25368112325668335, "logits/rejected": -0.28468552231788635, "logps/chosen": -1.864861249923706, "logps/rejected": -1.778679609298706, "loss": 3.9123, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.648611068725586, "rewards/margins": -0.8618162870407104, "rewards/rejected": -17.78679656982422, "step": 5690 }, { "epoch": 0.19195119484984327, "grad_norm": 20.276491165161133, "learning_rate": 9.744665525290087e-07, "logits/chosen": -0.9554246068000793, "logits/rejected": -0.8531731367111206, "logps/chosen": -1.6874557733535767, "logps/rejected": -1.726912260055542, "loss": 2.9321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.874557495117188, "rewards/margins": 0.3945651054382324, "rewards/rejected": -17.269123077392578, "step": 5695 }, { "epoch": 0.19211972092082646, "grad_norm": 16.296485900878906, "learning_rate": 9.743736776367435e-07, "logits/chosen": -0.747841477394104, "logits/rejected": -0.6805245876312256, "logps/chosen": -1.3950637578964233, "logps/rejected": -1.4789271354675293, "loss": 2.6315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -13.950637817382812, "rewards/margins": 0.8386334180831909, "rewards/rejected": -14.789271354675293, "step": 5700 }, { "epoch": 0.19228824699180963, "grad_norm": 29.865983963012695, "learning_rate": 9.742806385824383e-07, "logits/chosen": -0.7948347330093384, "logits/rejected": -0.76251620054245, "logps/chosen": -1.8513984680175781, "logps/rejected": -1.9355262517929077, "loss": 2.8024, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.513986587524414, "rewards/margins": 0.8412765264511108, "rewards/rejected": -19.355260848999023, "step": 5705 }, { "epoch": 0.19245677306279282, "grad_norm": 38.51240921020508, "learning_rate": 9.7418743539829e-07, "logits/chosen": -0.20339004695415497, "logits/rejected": -0.27671149373054504, "logps/chosen": -1.9387633800506592, "logps/rejected": -1.8538455963134766, "loss": 3.995, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.38763427734375, "rewards/margins": -0.8491789698600769, "rewards/rejected": -18.538455963134766, "step": 5710 }, { "epoch": 0.192625299133776, "grad_norm": 26.83456039428711, "learning_rate": 9.740940681165526e-07, "logits/chosen": -0.6951231956481934, "logits/rejected": -0.5157309770584106, "logps/chosen": -1.8345329761505127, "logps/rejected": -1.8946685791015625, "loss": 2.8783, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.345333099365234, "rewards/margins": 0.6013542413711548, "rewards/rejected": -18.946683883666992, "step": 5715 }, { "epoch": 0.19279382520475918, "grad_norm": 11.871299743652344, "learning_rate": 9.740005367695368e-07, "logits/chosen": -0.7048253417015076, "logits/rejected": -0.7177507281303406, "logps/chosen": -1.4334383010864258, "logps/rejected": -1.6110057830810547, "loss": 1.9898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.334383964538574, "rewards/margins": 1.7756729125976562, "rewards/rejected": -16.110057830810547, "step": 5720 }, { "epoch": 0.19296235127574235, "grad_norm": 22.618762969970703, "learning_rate": 9.739068413896098e-07, "logits/chosen": -0.7570010423660278, "logits/rejected": -0.6863754987716675, "logps/chosen": -1.5401397943496704, "logps/rejected": -1.59748113155365, "loss": 2.7378, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.401399612426758, "rewards/margins": 0.5734131932258606, "rewards/rejected": -15.974810600280762, "step": 5725 }, { "epoch": 0.19313087734672554, "grad_norm": 16.779813766479492, "learning_rate": 9.738129820091964e-07, "logits/chosen": -0.5011672973632812, "logits/rejected": -0.5614322423934937, "logps/chosen": -1.8755191564559937, "logps/rejected": -1.8646976947784424, "loss": 3.2224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.755191802978516, "rewards/margins": -0.10821404308080673, "rewards/rejected": -18.646976470947266, "step": 5730 }, { "epoch": 0.1932994034177087, "grad_norm": 30.153682708740234, "learning_rate": 9.737189586607774e-07, "logits/chosen": -0.40961700677871704, "logits/rejected": -0.6002104878425598, "logps/chosen": -2.0768322944641113, "logps/rejected": -2.105672836303711, "loss": 3.5266, "rewards/accuracies": 0.5, "rewards/chosen": -20.768325805664062, "rewards/margins": 0.2884038984775543, "rewards/rejected": -21.05672836303711, "step": 5735 }, { "epoch": 0.1934679294886919, "grad_norm": 27.910282135009766, "learning_rate": 9.736247713768908e-07, "logits/chosen": -0.23275843262672424, "logits/rejected": -0.2791540026664734, "logps/chosen": -1.5457642078399658, "logps/rejected": -1.623756766319275, "loss": 2.9391, "rewards/accuracies": 0.5, "rewards/chosen": -15.4576416015625, "rewards/margins": 0.7799254655838013, "rewards/rejected": -16.237567901611328, "step": 5740 }, { "epoch": 0.19363645555967507, "grad_norm": 19.33202362060547, "learning_rate": 9.735304201901306e-07, "logits/chosen": -0.877076268196106, "logits/rejected": -0.9000272750854492, "logps/chosen": -1.7524656057357788, "logps/rejected": -1.6089779138565063, "loss": 4.5017, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.524654388427734, "rewards/margins": -1.4348783493041992, "rewards/rejected": -16.089778900146484, "step": 5745 }, { "epoch": 0.19380498163065826, "grad_norm": 25.68280601501465, "learning_rate": 9.734359051331485e-07, "logits/chosen": -0.2692585587501526, "logits/rejected": -0.3588291108608246, "logps/chosen": -1.8761327266693115, "logps/rejected": -1.8713629245758057, "loss": 3.3386, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.76132583618164, "rewards/margins": -0.047698307782411575, "rewards/rejected": -18.713626861572266, "step": 5750 }, { "epoch": 0.19397350770164146, "grad_norm": 32.249664306640625, "learning_rate": 9.73341226238652e-07, "logits/chosen": -0.250613272190094, "logits/rejected": -0.2793341875076294, "logps/chosen": -1.8432165384292603, "logps/rejected": -1.846566915512085, "loss": 3.6394, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.43216323852539, "rewards/margins": 0.03350649029016495, "rewards/rejected": -18.46567153930664, "step": 5755 }, { "epoch": 0.19414203377262462, "grad_norm": 44.116703033447266, "learning_rate": 9.732463835394063e-07, "logits/chosen": -0.4042670726776123, "logits/rejected": -0.23007135093212128, "logps/chosen": -1.9827651977539062, "logps/rejected": -1.97702157497406, "loss": 3.736, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.827651977539062, "rewards/margins": -0.05743579939007759, "rewards/rejected": -19.770214080810547, "step": 5760 }, { "epoch": 0.19431055984360782, "grad_norm": 28.332536697387695, "learning_rate": 9.731513770682323e-07, "logits/chosen": -0.5446762442588806, "logits/rejected": -0.5711333155632019, "logps/chosen": -1.74869704246521, "logps/rejected": -1.832109808921814, "loss": 2.508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.48697280883789, "rewards/margins": 0.8341274261474609, "rewards/rejected": -18.32110023498535, "step": 5765 }, { "epoch": 0.19447908591459098, "grad_norm": 6.460046291351318, "learning_rate": 9.730562068580082e-07, "logits/chosen": -0.8367756009101868, "logits/rejected": -0.9457358121871948, "logps/chosen": -2.0421996116638184, "logps/rejected": -2.199319362640381, "loss": 2.4971, "rewards/accuracies": 0.5, "rewards/chosen": -20.421993255615234, "rewards/margins": 1.5711987018585205, "rewards/rejected": -21.993194580078125, "step": 5770 }, { "epoch": 0.19464761198557418, "grad_norm": 28.444000244140625, "learning_rate": 9.729608729416685e-07, "logits/chosen": -0.5505703687667847, "logits/rejected": -0.7946529388427734, "logps/chosen": -1.8401174545288086, "logps/rejected": -1.7754510641098022, "loss": 3.7309, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.401174545288086, "rewards/margins": -0.6466630697250366, "rewards/rejected": -17.754512786865234, "step": 5775 }, { "epoch": 0.19481613805655734, "grad_norm": 37.49628448486328, "learning_rate": 9.728653753522045e-07, "logits/chosen": -0.5423851013183594, "logits/rejected": -0.5040590167045593, "logps/chosen": -1.6697317361831665, "logps/rejected": -1.6786043643951416, "loss": 3.2312, "rewards/accuracies": 0.5, "rewards/chosen": -16.697317123413086, "rewards/margins": 0.08872584998607635, "rewards/rejected": -16.786041259765625, "step": 5780 }, { "epoch": 0.19498466412754054, "grad_norm": 35.8685188293457, "learning_rate": 9.727697141226644e-07, "logits/chosen": -0.5237818360328674, "logits/rejected": -0.6760590672492981, "logps/chosen": -1.9653745889663696, "logps/rejected": -1.8444054126739502, "loss": 4.2802, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.653745651245117, "rewards/margins": -1.209693193435669, "rewards/rejected": -18.444053649902344, "step": 5785 }, { "epoch": 0.1951531901985237, "grad_norm": 34.6473388671875, "learning_rate": 9.726738892861526e-07, "logits/chosen": -0.37371161580085754, "logits/rejected": -0.44602465629577637, "logps/chosen": -1.9422938823699951, "logps/rejected": -2.0096755027770996, "loss": 3.0281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.42293930053711, "rewards/margins": 0.67381751537323, "rewards/rejected": -20.096755981445312, "step": 5790 }, { "epoch": 0.1953217162695069, "grad_norm": 25.640531539916992, "learning_rate": 9.725779008758303e-07, "logits/chosen": -0.4616813659667969, "logits/rejected": -0.475913941860199, "logps/chosen": -1.3816555738449097, "logps/rejected": -1.4525748491287231, "loss": 3.0528, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -13.816555976867676, "rewards/margins": 0.7091928720474243, "rewards/rejected": -14.525749206542969, "step": 5795 }, { "epoch": 0.19549024234049006, "grad_norm": 18.854049682617188, "learning_rate": 9.724817489249154e-07, "logits/chosen": -0.24698737263679504, "logits/rejected": -0.237580806016922, "logps/chosen": -1.6802804470062256, "logps/rejected": -1.8639495372772217, "loss": 2.637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.802804946899414, "rewards/margins": 1.8366916179656982, "rewards/rejected": -18.639495849609375, "step": 5800 }, { "epoch": 0.19565876841147326, "grad_norm": 139.14752197265625, "learning_rate": 9.72385433466682e-07, "logits/chosen": -0.6270781755447388, "logits/rejected": -0.7254256010055542, "logps/chosen": -2.0985748767852783, "logps/rejected": -1.8484370708465576, "loss": 5.6863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.985748291015625, "rewards/margins": -2.5013768672943115, "rewards/rejected": -18.484371185302734, "step": 5805 }, { "epoch": 0.19582729448245645, "grad_norm": 19.971731185913086, "learning_rate": 9.722889545344614e-07, "logits/chosen": -0.4281018376350403, "logits/rejected": -0.37164705991744995, "logps/chosen": -1.734190583229065, "logps/rejected": -1.7541513442993164, "loss": 3.3521, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.34190559387207, "rewards/margins": 0.19960804283618927, "rewards/rejected": -17.541513442993164, "step": 5810 }, { "epoch": 0.19599582055343961, "grad_norm": 60.791259765625, "learning_rate": 9.721923121616413e-07, "logits/chosen": -0.07752398401498795, "logits/rejected": -0.12650911509990692, "logps/chosen": -1.9717010259628296, "logps/rejected": -2.07768177986145, "loss": 3.2981, "rewards/accuracies": 0.5, "rewards/chosen": -19.71701431274414, "rewards/margins": 1.0598057508468628, "rewards/rejected": -20.776817321777344, "step": 5815 }, { "epoch": 0.1961643466244228, "grad_norm": 30.976852416992188, "learning_rate": 9.720955063816654e-07, "logits/chosen": -0.6050688624382019, "logits/rejected": -0.6339203119277954, "logps/chosen": -1.7570356130599976, "logps/rejected": -1.7321546077728271, "loss": 3.3348, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.570356369018555, "rewards/margins": -0.24881133437156677, "rewards/rejected": -17.32154655456543, "step": 5820 }, { "epoch": 0.19633287269540597, "grad_norm": 20.5854434967041, "learning_rate": 9.719985372280347e-07, "logits/chosen": -0.4038739800453186, "logits/rejected": -0.3884919583797455, "logps/chosen": -2.019855499267578, "logps/rejected": -2.1005451679229736, "loss": 2.6202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.19855308532715, "rewards/margins": 0.8068978190422058, "rewards/rejected": -21.005451202392578, "step": 5825 }, { "epoch": 0.19650139876638917, "grad_norm": 18.702695846557617, "learning_rate": 9.71901404734306e-07, "logits/chosen": -0.8625004887580872, "logits/rejected": -0.6800850033760071, "logps/chosen": -1.7070497274398804, "logps/rejected": -1.8050239086151123, "loss": 2.3985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.070499420166016, "rewards/margins": 0.9797409176826477, "rewards/rejected": -18.05023956298828, "step": 5830 }, { "epoch": 0.19666992483737233, "grad_norm": 16.789480209350586, "learning_rate": 9.718041089340936e-07, "logits/chosen": -0.6353279948234558, "logits/rejected": -0.6146889328956604, "logps/chosen": -1.8607136011123657, "logps/rejected": -1.9272617101669312, "loss": 2.6298, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.60713768005371, "rewards/margins": 0.665479302406311, "rewards/rejected": -19.27261734008789, "step": 5835 }, { "epoch": 0.19683845090835553, "grad_norm": 37.87785720825195, "learning_rate": 9.717066498610673e-07, "logits/chosen": -0.5720896124839783, "logits/rejected": -0.7081347107887268, "logps/chosen": -1.609519600868225, "logps/rejected": -1.6527904272079468, "loss": 2.8625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.095195770263672, "rewards/margins": 0.43270936608314514, "rewards/rejected": -16.527904510498047, "step": 5840 }, { "epoch": 0.1970069769793387, "grad_norm": 10.778923034667969, "learning_rate": 9.71609027548954e-07, "logits/chosen": -0.35430818796157837, "logits/rejected": -0.2933744192123413, "logps/chosen": -1.6635167598724365, "logps/rejected": -1.7969443798065186, "loss": 2.5076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.635168075561523, "rewards/margins": 1.3342747688293457, "rewards/rejected": -17.96944236755371, "step": 5845 }, { "epoch": 0.1971755030503219, "grad_norm": 23.068889617919922, "learning_rate": 9.715112420315368e-07, "logits/chosen": -1.0444772243499756, "logits/rejected": -0.9749389886856079, "logps/chosen": -1.5760352611541748, "logps/rejected": -1.6423890590667725, "loss": 2.6107, "rewards/accuracies": 0.5, "rewards/chosen": -15.760353088378906, "rewards/margins": 0.66353839635849, "rewards/rejected": -16.423892974853516, "step": 5850 }, { "epoch": 0.19734402912130505, "grad_norm": 30.91828727722168, "learning_rate": 9.714132933426557e-07, "logits/chosen": -0.43545252084732056, "logits/rejected": -0.40154963731765747, "logps/chosen": -1.805863618850708, "logps/rejected": -1.7854446172714233, "loss": 3.3247, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.058637619018555, "rewards/margins": -0.2041909247636795, "rewards/rejected": -17.85444450378418, "step": 5855 }, { "epoch": 0.19751255519228825, "grad_norm": 31.091777801513672, "learning_rate": 9.713151815162067e-07, "logits/chosen": -0.4769212603569031, "logits/rejected": -0.5193304419517517, "logps/chosen": -1.664804220199585, "logps/rejected": -1.700951337814331, "loss": 2.8362, "rewards/accuracies": 0.5, "rewards/chosen": -16.648040771484375, "rewards/margins": 0.36147135496139526, "rewards/rejected": -17.00951385498047, "step": 5860 }, { "epoch": 0.19768108126327144, "grad_norm": 25.882474899291992, "learning_rate": 9.712169065861424e-07, "logits/chosen": -0.49053654074668884, "logits/rejected": -0.2957160472869873, "logps/chosen": -2.1436679363250732, "logps/rejected": -2.3108649253845215, "loss": 2.587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.43667984008789, "rewards/margins": 1.6719691753387451, "rewards/rejected": -23.10865020751953, "step": 5865 }, { "epoch": 0.1978496073342546, "grad_norm": 20.540668487548828, "learning_rate": 9.71118468586472e-07, "logits/chosen": -0.26937225461006165, "logits/rejected": -0.18302848935127258, "logps/chosen": -1.7931854724884033, "logps/rejected": -1.7600908279418945, "loss": 3.406, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.931856155395508, "rewards/margins": -0.33094778656959534, "rewards/rejected": -17.600908279418945, "step": 5870 }, { "epoch": 0.1980181334052378, "grad_norm": 49.5175666809082, "learning_rate": 9.710198675512608e-07, "logits/chosen": -0.4052053391933441, "logits/rejected": -0.4798244535923004, "logps/chosen": -1.8708423376083374, "logps/rejected": -1.916273832321167, "loss": 2.8076, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.708423614501953, "rewards/margins": 0.45431557297706604, "rewards/rejected": -19.162738800048828, "step": 5875 }, { "epoch": 0.19818665947622097, "grad_norm": 29.265352249145508, "learning_rate": 9.70921103514631e-07, "logits/chosen": -0.641699492931366, "logits/rejected": -0.7690142393112183, "logps/chosen": -1.9759972095489502, "logps/rejected": -1.9683644771575928, "loss": 3.2972, "rewards/accuracies": 0.5, "rewards/chosen": -19.759973526000977, "rewards/margins": -0.07632637023925781, "rewards/rejected": -19.683645248413086, "step": 5880 }, { "epoch": 0.19835518554720416, "grad_norm": 35.96839904785156, "learning_rate": 9.708221765107607e-07, "logits/chosen": -0.45869994163513184, "logits/rejected": -0.4570987820625305, "logps/chosen": -2.0931758880615234, "logps/rejected": -2.1789519786834717, "loss": 2.7377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.931758880615234, "rewards/margins": 0.8577610850334167, "rewards/rejected": -21.789520263671875, "step": 5885 }, { "epoch": 0.19852371161818733, "grad_norm": 23.893787384033203, "learning_rate": 9.70723086573885e-07, "logits/chosen": -0.6110178828239441, "logits/rejected": -0.6918951869010925, "logps/chosen": -1.456732988357544, "logps/rejected": -1.639822244644165, "loss": 2.0513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.567329406738281, "rewards/margins": 1.8308935165405273, "rewards/rejected": -16.398223876953125, "step": 5890 }, { "epoch": 0.19869223768917052, "grad_norm": 15.295425415039062, "learning_rate": 9.706238337382947e-07, "logits/chosen": -0.45297136902809143, "logits/rejected": -0.5232487916946411, "logps/chosen": -1.600191354751587, "logps/rejected": -1.6955150365829468, "loss": 2.8533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.00191307067871, "rewards/margins": 0.9532366991043091, "rewards/rejected": -16.955150604248047, "step": 5895 }, { "epoch": 0.19886076376015369, "grad_norm": 24.45920753479004, "learning_rate": 9.705244180383373e-07, "logits/chosen": -0.3062538504600525, "logits/rejected": -0.4098590910434723, "logps/chosen": -1.7645599842071533, "logps/rejected": -1.7392189502716064, "loss": 3.3747, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.645599365234375, "rewards/margins": -0.2534084916114807, "rewards/rejected": -17.39219093322754, "step": 5900 }, { "epoch": 0.19902928983113688, "grad_norm": 19.98879623413086, "learning_rate": 9.704248395084168e-07, "logits/chosen": -0.4781894087791443, "logits/rejected": -0.3514857888221741, "logps/chosen": -2.0645909309387207, "logps/rejected": -2.069446325302124, "loss": 3.1028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.64590835571289, "rewards/margins": 0.048557378351688385, "rewards/rejected": -20.6944637298584, "step": 5905 }, { "epoch": 0.19919781590212005, "grad_norm": 28.52830696105957, "learning_rate": 9.703250981829932e-07, "logits/chosen": -0.09460530430078506, "logits/rejected": -0.139235720038414, "logps/chosen": -2.3103229999542236, "logps/rejected": -2.6174914836883545, "loss": 1.5726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.103229522705078, "rewards/margins": 3.0716869831085205, "rewards/rejected": -26.174917221069336, "step": 5910 }, { "epoch": 0.19936634197310324, "grad_norm": 49.13142395019531, "learning_rate": 9.702251940965833e-07, "logits/chosen": -0.5760871171951294, "logits/rejected": -0.335256963968277, "logps/chosen": -1.875337839126587, "logps/rejected": -1.816159963607788, "loss": 3.7141, "rewards/accuracies": 0.5, "rewards/chosen": -18.75337791442871, "rewards/margins": -0.5917772054672241, "rewards/rejected": -18.16160011291504, "step": 5915 }, { "epoch": 0.19953486804408643, "grad_norm": 23.02393341064453, "learning_rate": 9.701251272837599e-07, "logits/chosen": -0.46277540922164917, "logits/rejected": -0.30417174100875854, "logps/chosen": -1.7341015338897705, "logps/rejected": -1.7499128580093384, "loss": 3.1858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.341014862060547, "rewards/margins": 0.15811166167259216, "rewards/rejected": -17.499130249023438, "step": 5920 }, { "epoch": 0.1997033941150696, "grad_norm": 43.57001495361328, "learning_rate": 9.700248977791522e-07, "logits/chosen": -0.6107692122459412, "logits/rejected": -0.6478714346885681, "logps/chosen": -1.9078069925308228, "logps/rejected": -2.158576488494873, "loss": 2.1498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.07806968688965, "rewards/margins": 2.5076966285705566, "rewards/rejected": -21.585765838623047, "step": 5925 }, { "epoch": 0.1998719201860528, "grad_norm": 25.969892501831055, "learning_rate": 9.699245056174454e-07, "logits/chosen": -0.5042437314987183, "logits/rejected": -0.5279260873794556, "logps/chosen": -2.0870537757873535, "logps/rejected": -2.097421169281006, "loss": 3.393, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.87053680419922, "rewards/margins": 0.10367707908153534, "rewards/rejected": -20.974214553833008, "step": 5930 }, { "epoch": 0.20004044625703596, "grad_norm": 26.384300231933594, "learning_rate": 9.698239508333816e-07, "logits/chosen": -0.7063272595405579, "logits/rejected": -0.7647172808647156, "logps/chosen": -1.8058459758758545, "logps/rejected": -1.8751310110092163, "loss": 2.4807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.058460235595703, "rewards/margins": 0.6928521990776062, "rewards/rejected": -18.751310348510742, "step": 5935 }, { "epoch": 0.20020897232801915, "grad_norm": 34.54125213623047, "learning_rate": 9.697232334617589e-07, "logits/chosen": -0.6371638774871826, "logits/rejected": -0.6679813861846924, "logps/chosen": -1.9524176120758057, "logps/rejected": -2.2252604961395264, "loss": 2.9359, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.5241756439209, "rewards/margins": 2.7284293174743652, "rewards/rejected": -22.252605438232422, "step": 5940 }, { "epoch": 0.20037749839900232, "grad_norm": 115.66785430908203, "learning_rate": 9.696223535374313e-07, "logits/chosen": -0.2789526879787445, "logits/rejected": -0.33569225668907166, "logps/chosen": -2.0028605461120605, "logps/rejected": -2.0199637413024902, "loss": 3.5454, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.02860450744629, "rewards/margins": 0.1710345298051834, "rewards/rejected": -20.19964027404785, "step": 5945 }, { "epoch": 0.2005460244699855, "grad_norm": 20.133207321166992, "learning_rate": 9.695213110953095e-07, "logits/chosen": -0.24441027641296387, "logits/rejected": -0.00073289277497679, "logps/chosen": -1.8092113733291626, "logps/rejected": -2.1856415271759033, "loss": 1.4174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.092111587524414, "rewards/margins": 3.764303684234619, "rewards/rejected": -21.856416702270508, "step": 5950 }, { "epoch": 0.20071455054096868, "grad_norm": 31.918554306030273, "learning_rate": 9.694201061703604e-07, "logits/chosen": -0.16023916006088257, "logits/rejected": -0.14087925851345062, "logps/chosen": -2.2564046382904053, "logps/rejected": -2.180935859680176, "loss": 3.9491, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.564044952392578, "rewards/margins": -0.7546874284744263, "rewards/rejected": -21.80936050415039, "step": 5955 }, { "epoch": 0.20088307661195187, "grad_norm": 23.612600326538086, "learning_rate": 9.693187387976069e-07, "logits/chosen": -0.7719990015029907, "logits/rejected": -0.7834844589233398, "logps/chosen": -1.8583654165267944, "logps/rejected": -1.797662377357483, "loss": 3.6691, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.583656311035156, "rewards/margins": -0.6070324778556824, "rewards/rejected": -17.97662353515625, "step": 5960 }, { "epoch": 0.20105160268293504, "grad_norm": 22.211774826049805, "learning_rate": 9.692172090121283e-07, "logits/chosen": -0.6716977953910828, "logits/rejected": -0.650750994682312, "logps/chosen": -2.168424606323242, "logps/rejected": -2.2529549598693848, "loss": 2.8945, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.684246063232422, "rewards/margins": 0.8453027009963989, "rewards/rejected": -22.52954864501953, "step": 5965 }, { "epoch": 0.20122012875391823, "grad_norm": 26.79001235961914, "learning_rate": 9.6911551684906e-07, "logits/chosen": -0.2786385416984558, "logits/rejected": -0.3720299303531647, "logps/chosen": -1.8238327503204346, "logps/rejected": -1.905925989151001, "loss": 2.5022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.238325119018555, "rewards/margins": 0.8209331631660461, "rewards/rejected": -19.05925941467285, "step": 5970 }, { "epoch": 0.20138865482490143, "grad_norm": 33.01688766479492, "learning_rate": 9.69013662343594e-07, "logits/chosen": -0.5326055288314819, "logits/rejected": -0.5187439322471619, "logps/chosen": -1.7838671207427979, "logps/rejected": -1.8846409320831299, "loss": 2.8234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.83867073059082, "rewards/margins": 1.0077372789382935, "rewards/rejected": -18.84640884399414, "step": 5975 }, { "epoch": 0.2015571808958846, "grad_norm": 52.05866241455078, "learning_rate": 9.689116455309778e-07, "logits/chosen": 0.0032115548383444548, "logits/rejected": -0.10644103586673737, "logps/chosen": -1.8666794300079346, "logps/rejected": -2.0400755405426025, "loss": 2.6665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.666791915893555, "rewards/margins": 1.7339622974395752, "rewards/rejected": -20.400753021240234, "step": 5980 }, { "epoch": 0.20172570696686778, "grad_norm": 26.484529495239258, "learning_rate": 9.688094664465153e-07, "logits/chosen": -0.3309114873409271, "logits/rejected": -0.4025183320045471, "logps/chosen": -1.8796402215957642, "logps/rejected": -1.9859651327133179, "loss": 3.4244, "rewards/accuracies": 0.5, "rewards/chosen": -18.79640007019043, "rewards/margins": 1.0632522106170654, "rewards/rejected": -19.85965347290039, "step": 5985 }, { "epoch": 0.20189423303785095, "grad_norm": 21.167022705078125, "learning_rate": 9.68707125125567e-07, "logits/chosen": -0.1683267056941986, "logits/rejected": -0.21480047702789307, "logps/chosen": -1.8829946517944336, "logps/rejected": -1.7001540660858154, "loss": 5.0036, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.829946517944336, "rewards/margins": -1.8284060955047607, "rewards/rejected": -17.001541137695312, "step": 5990 }, { "epoch": 0.20206275910883414, "grad_norm": 33.96486282348633, "learning_rate": 9.68604621603549e-07, "logits/chosen": -0.4873688220977783, "logits/rejected": -0.45059436559677124, "logps/chosen": -1.7515493631362915, "logps/rejected": -1.759034514427185, "loss": 3.4861, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.515491485595703, "rewards/margins": 0.0748540386557579, "rewards/rejected": -17.59034538269043, "step": 5995 }, { "epoch": 0.2022312851798173, "grad_norm": 14.7745943069458, "learning_rate": 9.685019559159335e-07, "logits/chosen": -0.7040256261825562, "logits/rejected": -0.678537130355835, "logps/chosen": -1.5471678972244263, "logps/rejected": -1.618430495262146, "loss": 2.6757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.4716796875, "rewards/margins": 0.7126253247261047, "rewards/rejected": -16.18430519104004, "step": 6000 }, { "epoch": 0.2022312851798173, "eval_logits/chosen": -0.7735000848770142, "eval_logits/rejected": -0.7976916432380676, "eval_logps/chosen": -1.7360255718231201, "eval_logps/rejected": -1.7606263160705566, "eval_loss": 3.2614545822143555, "eval_rewards/accuracies": 0.5199999809265137, "eval_rewards/chosen": -17.36025619506836, "eval_rewards/margins": 0.24600711464881897, "eval_rewards/rejected": -17.60626220703125, "eval_runtime": 12.9219, "eval_samples_per_second": 7.739, "eval_steps_per_second": 1.935, "step": 6000 }, { "epoch": 0.2023998112508005, "grad_norm": 24.49705696105957, "learning_rate": 9.683991280982496e-07, "logits/chosen": -0.6119563579559326, "logits/rejected": -0.612609326839447, "logps/chosen": -2.0236449241638184, "logps/rejected": -2.0016684532165527, "loss": 3.3791, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.2364501953125, "rewards/margins": -0.21976280212402344, "rewards/rejected": -20.016687393188477, "step": 6005 }, { "epoch": 0.20256833732178367, "grad_norm": 24.686538696289062, "learning_rate": 9.682961381860816e-07, "logits/chosen": -0.5340906381607056, "logits/rejected": -0.5442155599594116, "logps/chosen": -1.488397479057312, "logps/rejected": -1.5215156078338623, "loss": 2.8946, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.883974075317383, "rewards/margins": 0.33118313550949097, "rewards/rejected": -15.215158462524414, "step": 6010 }, { "epoch": 0.20273686339276686, "grad_norm": 11.82690715789795, "learning_rate": 9.681929862150702e-07, "logits/chosen": -0.6329524517059326, "logits/rejected": -0.5246933698654175, "logps/chosen": -2.3101534843444824, "logps/rejected": -2.4467110633850098, "loss": 2.6406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.10153579711914, "rewards/margins": 1.3655731678009033, "rewards/rejected": -24.46710968017578, "step": 6015 }, { "epoch": 0.20290538946375003, "grad_norm": 29.439022064208984, "learning_rate": 9.680896722209122e-07, "logits/chosen": -0.5506235361099243, "logits/rejected": -0.4831174910068512, "logps/chosen": -1.7477543354034424, "logps/rejected": -1.8734235763549805, "loss": 2.246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.477542877197266, "rewards/margins": 1.2566947937011719, "rewards/rejected": -18.734233856201172, "step": 6020 }, { "epoch": 0.20307391553473322, "grad_norm": 15.599130630493164, "learning_rate": 9.67986196239361e-07, "logits/chosen": -0.0006256193155422807, "logits/rejected": 0.09851661324501038, "logps/chosen": -2.0748696327209473, "logps/rejected": -2.4163742065429688, "loss": 2.5649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.748693466186523, "rewards/margins": 3.4150452613830566, "rewards/rejected": -24.163740158081055, "step": 6025 }, { "epoch": 0.20324244160571642, "grad_norm": 23.84718894958496, "learning_rate": 9.67882558306225e-07, "logits/chosen": -0.45076194405555725, "logits/rejected": -0.6076852679252625, "logps/chosen": -1.6174421310424805, "logps/rejected": -1.6184570789337158, "loss": 3.1546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.174421310424805, "rewards/margins": 0.010150337591767311, "rewards/rejected": -16.184572219848633, "step": 6030 }, { "epoch": 0.20341096767669958, "grad_norm": 24.911104202270508, "learning_rate": 9.677787584573693e-07, "logits/chosen": -0.6882834434509277, "logits/rejected": -0.7800687551498413, "logps/chosen": -1.8254387378692627, "logps/rejected": -1.8409912586212158, "loss": 3.1078, "rewards/accuracies": 0.5, "rewards/chosen": -18.254384994506836, "rewards/margins": 0.1555270254611969, "rewards/rejected": -18.409914016723633, "step": 6035 }, { "epoch": 0.20357949374768278, "grad_norm": 19.878774642944336, "learning_rate": 9.676747967287153e-07, "logits/chosen": -0.21769729256629944, "logits/rejected": -0.21601931750774384, "logps/chosen": -2.1479928493499756, "logps/rejected": -2.5670371055603027, "loss": 2.5418, "rewards/accuracies": 0.5, "rewards/chosen": -21.479928970336914, "rewards/margins": 4.190441131591797, "rewards/rejected": -25.67037010192871, "step": 6040 }, { "epoch": 0.20374801981866594, "grad_norm": 26.25444793701172, "learning_rate": 9.675706731562395e-07, "logits/chosen": -0.12167356163263321, "logits/rejected": -0.1530594676733017, "logps/chosen": -2.234086275100708, "logps/rejected": -2.3034989833831787, "loss": 2.8796, "rewards/accuracies": 0.5, "rewards/chosen": -22.340862274169922, "rewards/margins": 0.6941286325454712, "rewards/rejected": -23.034990310668945, "step": 6045 }, { "epoch": 0.20391654588964914, "grad_norm": 33.240116119384766, "learning_rate": 9.674663877759758e-07, "logits/chosen": -0.15784066915512085, "logits/rejected": -0.27910029888153076, "logps/chosen": -1.6888351440429688, "logps/rejected": -1.777195692062378, "loss": 2.5451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.888351440429688, "rewards/margins": 0.8836051225662231, "rewards/rejected": -17.771955490112305, "step": 6050 }, { "epoch": 0.2040850719606323, "grad_norm": 37.929264068603516, "learning_rate": 9.673619406240122e-07, "logits/chosen": -0.8239002227783203, "logits/rejected": -0.8135004043579102, "logps/chosen": -1.9584920406341553, "logps/rejected": -1.9400832653045654, "loss": 3.2595, "rewards/accuracies": 0.5, "rewards/chosen": -19.58492088317871, "rewards/margins": -0.1840866059064865, "rewards/rejected": -19.400833129882812, "step": 6055 }, { "epoch": 0.2042535980316155, "grad_norm": 15.64293098449707, "learning_rate": 9.672573317364945e-07, "logits/chosen": -0.4086344838142395, "logits/rejected": -0.49510034918785095, "logps/chosen": -1.9705301523208618, "logps/rejected": -1.976130485534668, "loss": 3.5653, "rewards/accuracies": 0.5, "rewards/chosen": -19.705303192138672, "rewards/margins": 0.056003473699092865, "rewards/rejected": -19.76130485534668, "step": 6060 }, { "epoch": 0.20442212410259866, "grad_norm": 99.170654296875, "learning_rate": 9.671525611496235e-07, "logits/chosen": -0.6581277847290039, "logits/rejected": -0.6759839653968811, "logps/chosen": -2.240769147872925, "logps/rejected": -2.044039011001587, "loss": 5.0291, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.407690048217773, "rewards/margins": -1.9673011302947998, "rewards/rejected": -20.440387725830078, "step": 6065 }, { "epoch": 0.20459065017358186, "grad_norm": 28.03131675720215, "learning_rate": 9.67047628899656e-07, "logits/chosen": -0.5428565740585327, "logits/rejected": -0.6084557175636292, "logps/chosen": -1.6421184539794922, "logps/rejected": -1.7033920288085938, "loss": 2.8575, "rewards/accuracies": 0.5, "rewards/chosen": -16.421184539794922, "rewards/margins": 0.6127360463142395, "rewards/rejected": -17.033920288085938, "step": 6070 }, { "epoch": 0.20475917624456502, "grad_norm": 18.674060821533203, "learning_rate": 9.66942535022905e-07, "logits/chosen": -0.46286773681640625, "logits/rejected": -0.45752209424972534, "logps/chosen": -1.7388484477996826, "logps/rejected": -1.9831173419952393, "loss": 2.4786, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.388484954833984, "rewards/margins": 2.4426865577697754, "rewards/rejected": -19.8311710357666, "step": 6075 }, { "epoch": 0.20492770231554822, "grad_norm": 19.101802825927734, "learning_rate": 9.668372795557398e-07, "logits/chosen": -0.734752357006073, "logits/rejected": -0.9037183523178101, "logps/chosen": -1.6903842687606812, "logps/rejected": -1.6715500354766846, "loss": 3.2694, "rewards/accuracies": 0.5, "rewards/chosen": -16.90384292602539, "rewards/margins": -0.1883423775434494, "rewards/rejected": -16.71550178527832, "step": 6080 }, { "epoch": 0.2050962283865314, "grad_norm": 38.23634719848633, "learning_rate": 9.667318625345847e-07, "logits/chosen": -0.4373010993003845, "logits/rejected": -0.30279842019081116, "logps/chosen": -1.723619818687439, "logps/rejected": -1.8654381036758423, "loss": 2.4254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.2362003326416, "rewards/margins": 1.4181811809539795, "rewards/rejected": -18.65437889099121, "step": 6085 }, { "epoch": 0.20526475445751458, "grad_norm": 15.726313591003418, "learning_rate": 9.666262839959203e-07, "logits/chosen": -0.5284382700920105, "logits/rejected": -0.5790424346923828, "logps/chosen": -1.8899242877960205, "logps/rejected": -2.023179531097412, "loss": 2.2101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.899242401123047, "rewards/margins": 1.3325533866882324, "rewards/rejected": -20.231792449951172, "step": 6090 }, { "epoch": 0.20543328052849777, "grad_norm": 25.096664428710938, "learning_rate": 9.665205439762833e-07, "logits/chosen": -0.3775356709957123, "logits/rejected": -0.22368212044239044, "logps/chosen": -2.0895133018493652, "logps/rejected": -2.1160058975219727, "loss": 3.4497, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.89513397216797, "rewards/margins": 0.2649245262145996, "rewards/rejected": -21.160058975219727, "step": 6095 }, { "epoch": 0.20560180659948094, "grad_norm": 86.66022491455078, "learning_rate": 9.664146425122664e-07, "logits/chosen": -0.624174952507019, "logits/rejected": -0.5643637776374817, "logps/chosen": -2.0391335487365723, "logps/rejected": -2.046374797821045, "loss": 3.3601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.391334533691406, "rewards/margins": 0.0724119171500206, "rewards/rejected": -20.4637451171875, "step": 6100 }, { "epoch": 0.20577033267046413, "grad_norm": 29.608823776245117, "learning_rate": 9.663085796405177e-07, "logits/chosen": -0.8779303431510925, "logits/rejected": -0.7524505257606506, "logps/chosen": -1.580993413925171, "logps/rejected": -1.547828197479248, "loss": 3.3839, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.809934616088867, "rewards/margins": -0.33165159821510315, "rewards/rejected": -15.47828197479248, "step": 6105 }, { "epoch": 0.2059388587414473, "grad_norm": 27.97531509399414, "learning_rate": 9.662023553977414e-07, "logits/chosen": -0.4506549835205078, "logits/rejected": -0.5821264386177063, "logps/chosen": -1.843197226524353, "logps/rejected": -1.8881915807724, "loss": 3.2041, "rewards/accuracies": 0.5, "rewards/chosen": -18.431970596313477, "rewards/margins": 0.449946790933609, "rewards/rejected": -18.88191795349121, "step": 6110 }, { "epoch": 0.2061073848124305, "grad_norm": 36.354248046875, "learning_rate": 9.660959698206977e-07, "logits/chosen": -0.21553261578083038, "logits/rejected": -0.3534262776374817, "logps/chosen": -2.1483542919158936, "logps/rejected": -2.0875895023345947, "loss": 3.7702, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.48354148864746, "rewards/margins": -0.6076488494873047, "rewards/rejected": -20.87589454650879, "step": 6115 }, { "epoch": 0.20627591088341365, "grad_norm": 26.448593139648438, "learning_rate": 9.65989422946202e-07, "logits/chosen": -0.507719099521637, "logits/rejected": -0.4437492787837982, "logps/chosen": -1.8520978689193726, "logps/rejected": -2.002349853515625, "loss": 3.574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.520977020263672, "rewards/margins": 1.5025217533111572, "rewards/rejected": -20.02349853515625, "step": 6120 }, { "epoch": 0.20644443695439685, "grad_norm": 18.775527954101562, "learning_rate": 9.658827148111263e-07, "logits/chosen": -0.5065479278564453, "logits/rejected": -0.434520959854126, "logps/chosen": -2.0218491554260254, "logps/rejected": -2.132096767425537, "loss": 2.2438, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.218490600585938, "rewards/margins": 1.102478265762329, "rewards/rejected": -21.320964813232422, "step": 6125 }, { "epoch": 0.20661296302538001, "grad_norm": 23.01225471496582, "learning_rate": 9.657758454523983e-07, "logits/chosen": -0.2993611991405487, "logits/rejected": -0.33458638191223145, "logps/chosen": -1.8482387065887451, "logps/rejected": -1.9364979267120361, "loss": 2.8185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.48238754272461, "rewards/margins": 0.8825904726982117, "rewards/rejected": -19.364978790283203, "step": 6130 }, { "epoch": 0.2067814890963632, "grad_norm": 35.34732437133789, "learning_rate": 9.656688149070006e-07, "logits/chosen": -0.7763724327087402, "logits/rejected": -0.8895975947380066, "logps/chosen": -1.8610140085220337, "logps/rejected": -1.8075506687164307, "loss": 3.7522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.610139846801758, "rewards/margins": -0.5346325635910034, "rewards/rejected": -18.075504302978516, "step": 6135 }, { "epoch": 0.2069500151673464, "grad_norm": 33.043312072753906, "learning_rate": 9.65561623211973e-07, "logits/chosen": -0.7509564757347107, "logits/rejected": -0.6484511494636536, "logps/chosen": -1.8732191324234009, "logps/rejected": -1.766880989074707, "loss": 4.2271, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.732189178466797, "rewards/margins": -1.063381552696228, "rewards/rejected": -17.66880989074707, "step": 6140 }, { "epoch": 0.20711854123832957, "grad_norm": 21.4063663482666, "learning_rate": 9.6545427040441e-07, "logits/chosen": -0.6801129579544067, "logits/rejected": -0.6815675497055054, "logps/chosen": -2.0237112045288086, "logps/rejected": -1.9940065145492554, "loss": 3.3582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.237112045288086, "rewards/margins": -0.2970461845397949, "rewards/rejected": -19.940067291259766, "step": 6145 }, { "epoch": 0.20728706730931276, "grad_norm": 29.270660400390625, "learning_rate": 9.653467565214622e-07, "logits/chosen": -0.7824691534042358, "logits/rejected": -0.8962273597717285, "logps/chosen": -1.485214114189148, "logps/rejected": -1.6048316955566406, "loss": 2.268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.852142333984375, "rewards/margins": 1.1961743831634521, "rewards/rejected": -16.048315048217773, "step": 6150 }, { "epoch": 0.20745559338029593, "grad_norm": 19.40476417541504, "learning_rate": 9.652390816003357e-07, "logits/chosen": -0.6924790143966675, "logits/rejected": -0.7991618514060974, "logps/chosen": -1.395509123802185, "logps/rejected": -1.5696979761123657, "loss": 1.7101, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.95509147644043, "rewards/margins": 1.741887092590332, "rewards/rejected": -15.696978569030762, "step": 6155 }, { "epoch": 0.20762411945127912, "grad_norm": 15.866957664489746, "learning_rate": 9.65131245678293e-07, "logits/chosen": -0.7179878354072571, "logits/rejected": -0.5750179290771484, "logps/chosen": -1.5896713733673096, "logps/rejected": -1.6341173648834229, "loss": 3.1135, "rewards/accuracies": 0.5, "rewards/chosen": -15.89671516418457, "rewards/margins": 0.4444583058357239, "rewards/rejected": -16.341171264648438, "step": 6160 }, { "epoch": 0.2077926455222623, "grad_norm": 19.111637115478516, "learning_rate": 9.650232487926514e-07, "logits/chosen": -0.7711466550827026, "logits/rejected": -0.8609915971755981, "logps/chosen": -1.733947515487671, "logps/rejected": -1.926256775856018, "loss": 1.9845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.3394775390625, "rewards/margins": 1.9230899810791016, "rewards/rejected": -19.262563705444336, "step": 6165 }, { "epoch": 0.20796117159324548, "grad_norm": 20.010040283203125, "learning_rate": 9.649150909807847e-07, "logits/chosen": -0.8053653836250305, "logits/rejected": -0.7153798341751099, "logps/chosen": -2.1477370262145996, "logps/rejected": -2.236959934234619, "loss": 2.813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.477371215820312, "rewards/margins": 0.892225444316864, "rewards/rejected": -22.369596481323242, "step": 6170 }, { "epoch": 0.20812969766422865, "grad_norm": 14.432764053344727, "learning_rate": 9.64806772280122e-07, "logits/chosen": -0.39327144622802734, "logits/rejected": -0.4320458769798279, "logps/chosen": -1.6436790227890015, "logps/rejected": -1.7568355798721313, "loss": 2.5333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.43678855895996, "rewards/margins": 1.1315654516220093, "rewards/rejected": -17.568355560302734, "step": 6175 }, { "epoch": 0.20829822373521184, "grad_norm": 30.609670639038086, "learning_rate": 9.646982927281479e-07, "logits/chosen": -0.6090951561927795, "logits/rejected": -0.634852945804596, "logps/chosen": -1.7177422046661377, "logps/rejected": -1.7539911270141602, "loss": 3.2011, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.177419662475586, "rewards/margins": 0.36249056458473206, "rewards/rejected": -17.5399112701416, "step": 6180 }, { "epoch": 0.208466749806195, "grad_norm": 29.538747787475586, "learning_rate": 9.64589652362403e-07, "logits/chosen": -0.7170549631118774, "logits/rejected": -0.6587635278701782, "logps/chosen": -1.4641923904418945, "logps/rejected": -1.5876529216766357, "loss": 2.7722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.641921997070312, "rewards/margins": 1.2346062660217285, "rewards/rejected": -15.8765287399292, "step": 6185 }, { "epoch": 0.2086352758771782, "grad_norm": 37.199344635009766, "learning_rate": 9.644808512204837e-07, "logits/chosen": -0.5983039140701294, "logits/rejected": -0.7115954756736755, "logps/chosen": -1.6693763732910156, "logps/rejected": -1.6038440465927124, "loss": 3.7817, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.693761825561523, "rewards/margins": -0.6553219556808472, "rewards/rejected": -16.038440704345703, "step": 6190 }, { "epoch": 0.2088038019481614, "grad_norm": 21.11469841003418, "learning_rate": 9.643718893400416e-07, "logits/chosen": -0.6305486559867859, "logits/rejected": -0.46515828371047974, "logps/chosen": -1.977230429649353, "logps/rejected": -2.1082019805908203, "loss": 2.644, "rewards/accuracies": 0.5, "rewards/chosen": -19.77230453491211, "rewards/margins": 1.3097164630889893, "rewards/rejected": -21.082019805908203, "step": 6195 }, { "epoch": 0.20897232801914456, "grad_norm": 25.193111419677734, "learning_rate": 9.642627667587842e-07, "logits/chosen": -0.38711774349212646, "logits/rejected": -0.2904582619667053, "logps/chosen": -1.7007389068603516, "logps/rejected": -1.7201976776123047, "loss": 3.6035, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.007389068603516, "rewards/margins": 0.19458922743797302, "rewards/rejected": -17.201976776123047, "step": 6200 }, { "epoch": 0.20914085409012775, "grad_norm": 22.27320098876953, "learning_rate": 9.641534835144742e-07, "logits/chosen": -0.6792038679122925, "logits/rejected": -0.6867347955703735, "logps/chosen": -1.9890989065170288, "logps/rejected": -1.9915831089019775, "loss": 3.3895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.890989303588867, "rewards/margins": 0.024839973077178, "rewards/rejected": -19.915828704833984, "step": 6205 }, { "epoch": 0.20930938016111092, "grad_norm": 18.266559600830078, "learning_rate": 9.640440396449304e-07, "logits/chosen": -0.7111789584159851, "logits/rejected": -0.691353440284729, "logps/chosen": -1.796891212463379, "logps/rejected": -1.7614777088165283, "loss": 3.5483, "rewards/accuracies": 0.5, "rewards/chosen": -17.968910217285156, "rewards/margins": -0.35413503646850586, "rewards/rejected": -17.614776611328125, "step": 6210 }, { "epoch": 0.2094779062320941, "grad_norm": 29.486312866210938, "learning_rate": 9.639344351880276e-07, "logits/chosen": -0.19421645998954773, "logits/rejected": -0.12176599353551865, "logps/chosen": -1.9066638946533203, "logps/rejected": -2.329479694366455, "loss": 2.2476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.066638946533203, "rewards/margins": 4.228161334991455, "rewards/rejected": -23.294797897338867, "step": 6215 }, { "epoch": 0.20964643230307728, "grad_norm": 14.711030006408691, "learning_rate": 9.638246701816946e-07, "logits/chosen": -0.7750159502029419, "logits/rejected": -0.7665907740592957, "logps/chosen": -1.6569950580596924, "logps/rejected": -1.7536866664886475, "loss": 2.292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.569950103759766, "rewards/margins": 0.966915488243103, "rewards/rejected": -17.536867141723633, "step": 6220 }, { "epoch": 0.20981495837406047, "grad_norm": 19.998607635498047, "learning_rate": 9.637147446639172e-07, "logits/chosen": -0.7628619074821472, "logits/rejected": -0.639403223991394, "logps/chosen": -1.9863265752792358, "logps/rejected": -2.026005268096924, "loss": 2.8586, "rewards/accuracies": 0.5, "rewards/chosen": -19.863265991210938, "rewards/margins": 0.3967866003513336, "rewards/rejected": -20.260051727294922, "step": 6225 }, { "epoch": 0.20998348444504364, "grad_norm": 23.68602752685547, "learning_rate": 9.636046586727366e-07, "logits/chosen": -0.6806201934814453, "logits/rejected": -0.46249714493751526, "logps/chosen": -1.8867038488388062, "logps/rejected": -1.9447044134140015, "loss": 3.3294, "rewards/accuracies": 0.5, "rewards/chosen": -18.86703872680664, "rewards/margins": 0.5800049901008606, "rewards/rejected": -19.44704246520996, "step": 6230 }, { "epoch": 0.21015201051602683, "grad_norm": 18.60115623474121, "learning_rate": 9.63494412246249e-07, "logits/chosen": -0.2512991428375244, "logits/rejected": -0.20353391766548157, "logps/chosen": -2.148930549621582, "logps/rejected": -2.354224920272827, "loss": 1.8338, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.489307403564453, "rewards/margins": 2.052943706512451, "rewards/rejected": -23.542251586914062, "step": 6235 }, { "epoch": 0.21032053658701, "grad_norm": 21.855871200561523, "learning_rate": 9.63384005422606e-07, "logits/chosen": -0.365181028842926, "logits/rejected": -0.33115094900131226, "logps/chosen": -2.059521436691284, "logps/rejected": -2.308326005935669, "loss": 2.1597, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.595216751098633, "rewards/margins": 2.4880428314208984, "rewards/rejected": -23.0832576751709, "step": 6240 }, { "epoch": 0.2104890626579932, "grad_norm": 43.752532958984375, "learning_rate": 9.632734382400154e-07, "logits/chosen": -0.21305397152900696, "logits/rejected": -0.3376748859882355, "logps/chosen": -1.9535162448883057, "logps/rejected": -1.8234503269195557, "loss": 4.429, "rewards/accuracies": 0.5, "rewards/chosen": -19.5351619720459, "rewards/margins": -1.300659418106079, "rewards/rejected": -18.234500885009766, "step": 6245 }, { "epoch": 0.21065758872897639, "grad_norm": 18.581083297729492, "learning_rate": 9.6316271073674e-07, "logits/chosen": -0.9776862263679504, "logits/rejected": -1.0578795671463013, "logps/chosen": -1.5479357242584229, "logps/rejected": -1.4699079990386963, "loss": 3.9844, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.479357719421387, "rewards/margins": -0.7802785634994507, "rewards/rejected": -14.699081420898438, "step": 6250 }, { "epoch": 0.21082611479995955, "grad_norm": 42.386260986328125, "learning_rate": 9.630518229510984e-07, "logits/chosen": -0.4895743727684021, "logits/rejected": -0.546970784664154, "logps/chosen": -1.7075220346450806, "logps/rejected": -1.6165168285369873, "loss": 4.2254, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.075220108032227, "rewards/margins": -0.9100503921508789, "rewards/rejected": -16.16516876220703, "step": 6255 }, { "epoch": 0.21099464087094275, "grad_norm": 30.77920913696289, "learning_rate": 9.629407749214643e-07, "logits/chosen": -0.13082917034626007, "logits/rejected": -0.14675593376159668, "logps/chosen": -2.299105167388916, "logps/rejected": -2.578866958618164, "loss": 2.8197, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.991052627563477, "rewards/margins": 2.797618865966797, "rewards/rejected": -25.788671493530273, "step": 6260 }, { "epoch": 0.2111631669419259, "grad_norm": 26.253183364868164, "learning_rate": 9.628295666862672e-07, "logits/chosen": -0.48517459630966187, "logits/rejected": -0.4192652702331543, "logps/chosen": -1.9712200164794922, "logps/rejected": -1.9250516891479492, "loss": 3.5636, "rewards/accuracies": 0.5, "rewards/chosen": -19.712200164794922, "rewards/margins": -0.4616851806640625, "rewards/rejected": -19.25051498413086, "step": 6265 }, { "epoch": 0.2113316930129091, "grad_norm": 45.756656646728516, "learning_rate": 9.627181982839918e-07, "logits/chosen": -0.760116696357727, "logits/rejected": -0.814267635345459, "logps/chosen": -1.9276320934295654, "logps/rejected": -2.051849842071533, "loss": 2.378, "rewards/accuracies": 0.5, "rewards/chosen": -19.276317596435547, "rewards/margins": 1.242180347442627, "rewards/rejected": -20.51849937438965, "step": 6270 }, { "epoch": 0.21150021908389227, "grad_norm": 21.75693702697754, "learning_rate": 9.626066697531784e-07, "logits/chosen": -0.4948394298553467, "logits/rejected": -0.5552079081535339, "logps/chosen": -1.6278167963027954, "logps/rejected": -1.7882741689682007, "loss": 2.2152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.278165817260742, "rewards/margins": 1.6045728921890259, "rewards/rejected": -17.882740020751953, "step": 6275 }, { "epoch": 0.21166874515487547, "grad_norm": 28.453824996948242, "learning_rate": 9.624949811324226e-07, "logits/chosen": -0.07232952117919922, "logits/rejected": -0.025306105613708496, "logps/chosen": -2.205857515335083, "logps/rejected": -2.5525448322296143, "loss": 2.6162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.058574676513672, "rewards/margins": 3.4668731689453125, "rewards/rejected": -25.525447845458984, "step": 6280 }, { "epoch": 0.21183727122585863, "grad_norm": 21.348684310913086, "learning_rate": 9.623831324603752e-07, "logits/chosen": -0.3630000054836273, "logits/rejected": -0.35038089752197266, "logps/chosen": -1.8361036777496338, "logps/rejected": -1.7641780376434326, "loss": 3.8496, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.361034393310547, "rewards/margins": -0.7192561030387878, "rewards/rejected": -17.641780853271484, "step": 6285 }, { "epoch": 0.21200579729684182, "grad_norm": 47.256561279296875, "learning_rate": 9.62271123775743e-07, "logits/chosen": -0.39444655179977417, "logits/rejected": -0.3994792401790619, "logps/chosen": -2.297093152999878, "logps/rejected": -2.3859825134277344, "loss": 2.4176, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.970930099487305, "rewards/margins": 0.8888934850692749, "rewards/rejected": -23.859825134277344, "step": 6290 }, { "epoch": 0.212174323367825, "grad_norm": 37.113338470458984, "learning_rate": 9.621589551172875e-07, "logits/chosen": -0.5275182723999023, "logits/rejected": -0.49484142661094666, "logps/chosen": -1.9655338525772095, "logps/rejected": -1.872588872909546, "loss": 4.0064, "rewards/accuracies": 0.5, "rewards/chosen": -19.655338287353516, "rewards/margins": -0.9294483065605164, "rewards/rejected": -18.725889205932617, "step": 6295 }, { "epoch": 0.21234284943880818, "grad_norm": 33.60499572753906, "learning_rate": 9.620466265238261e-07, "logits/chosen": -0.40655916929244995, "logits/rejected": -0.3757340908050537, "logps/chosen": -1.8657829761505127, "logps/rejected": -1.9118738174438477, "loss": 2.9708, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.65782928466797, "rewards/margins": 0.4609087109565735, "rewards/rejected": -19.118738174438477, "step": 6300 }, { "epoch": 0.21251137550979138, "grad_norm": 19.811283111572266, "learning_rate": 9.619341380342312e-07, "logits/chosen": -0.7586280107498169, "logits/rejected": -0.7591967582702637, "logps/chosen": -1.701525092124939, "logps/rejected": -1.781437635421753, "loss": 2.4697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.0152530670166, "rewards/margins": 0.7991257905960083, "rewards/rejected": -17.814376831054688, "step": 6305 }, { "epoch": 0.21267990158077454, "grad_norm": 38.60699462890625, "learning_rate": 9.618214896874305e-07, "logits/chosen": -0.4628763794898987, "logits/rejected": -0.6883228421211243, "logps/chosen": -1.6286872625350952, "logps/rejected": -1.568355917930603, "loss": 3.7782, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.2868709564209, "rewards/margins": -0.6033118367195129, "rewards/rejected": -15.683561325073242, "step": 6310 }, { "epoch": 0.21284842765175774, "grad_norm": 42.60224151611328, "learning_rate": 9.617086815224072e-07, "logits/chosen": -0.17860253155231476, "logits/rejected": -0.10489163547754288, "logps/chosen": -2.4953255653381348, "logps/rejected": -2.519314765930176, "loss": 4.2209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.953256607055664, "rewards/margins": 0.23989124596118927, "rewards/rejected": -25.19314956665039, "step": 6315 }, { "epoch": 0.2130169537227409, "grad_norm": 39.443023681640625, "learning_rate": 9.615957135782e-07, "logits/chosen": -0.7516977190971375, "logits/rejected": -0.752848744392395, "logps/chosen": -1.9006750583648682, "logps/rejected": -1.9128259420394897, "loss": 3.3723, "rewards/accuracies": 0.5, "rewards/chosen": -19.00674819946289, "rewards/margins": 0.12151069939136505, "rewards/rejected": -19.12826156616211, "step": 6320 }, { "epoch": 0.2131854797937241, "grad_norm": 111.87989044189453, "learning_rate": 9.614825858939023e-07, "logits/chosen": -0.3990008533000946, "logits/rejected": -0.5169636607170105, "logps/chosen": -1.9904636144638062, "logps/rejected": -2.0598185062408447, "loss": 2.5614, "rewards/accuracies": 0.5, "rewards/chosen": -19.90463638305664, "rewards/margins": 0.6935473680496216, "rewards/rejected": -20.59818458557129, "step": 6325 }, { "epoch": 0.21335400586470726, "grad_norm": 56.99147415161133, "learning_rate": 9.613692985086634e-07, "logits/chosen": -0.044111065566539764, "logits/rejected": -0.09141120314598083, "logps/chosen": -2.312798023223877, "logps/rejected": -2.367541790008545, "loss": 2.8679, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.127979278564453, "rewards/margins": 0.5474358797073364, "rewards/rejected": -23.6754150390625, "step": 6330 }, { "epoch": 0.21352253193569046, "grad_norm": 17.781930923461914, "learning_rate": 9.612558514616874e-07, "logits/chosen": -0.6025907397270203, "logits/rejected": -0.6184767484664917, "logps/chosen": -2.0852742195129395, "logps/rejected": -2.31107497215271, "loss": 2.4546, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.85274314880371, "rewards/margins": 2.258007049560547, "rewards/rejected": -23.110750198364258, "step": 6335 }, { "epoch": 0.21369105800667362, "grad_norm": 38.529624938964844, "learning_rate": 9.61142244792234e-07, "logits/chosen": -0.5689278841018677, "logits/rejected": -0.6688274145126343, "logps/chosen": -1.6294472217559814, "logps/rejected": -1.7487682104110718, "loss": 2.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.29447364807129, "rewards/margins": 1.1932106018066406, "rewards/rejected": -17.487682342529297, "step": 6340 }, { "epoch": 0.21385958407765682, "grad_norm": 13.591431617736816, "learning_rate": 9.610284785396182e-07, "logits/chosen": -0.7865332365036011, "logits/rejected": -0.7709859609603882, "logps/chosen": -1.7249629497528076, "logps/rejected": -1.8565632104873657, "loss": 2.1548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.249629974365234, "rewards/margins": 1.3160030841827393, "rewards/rejected": -18.56563377380371, "step": 6345 }, { "epoch": 0.21402811014863998, "grad_norm": 25.202449798583984, "learning_rate": 9.609145527432096e-07, "logits/chosen": -0.5416828393936157, "logits/rejected": -0.5098170042037964, "logps/chosen": -1.8249015808105469, "logps/rejected": -1.9837112426757812, "loss": 2.8382, "rewards/accuracies": 0.5, "rewards/chosen": -18.249013900756836, "rewards/margins": 1.5880978107452393, "rewards/rejected": -19.837112426757812, "step": 6350 }, { "epoch": 0.21419663621962318, "grad_norm": 32.652381896972656, "learning_rate": 9.608004674424336e-07, "logits/chosen": -0.4759892523288727, "logits/rejected": -0.26175767183303833, "logps/chosen": -1.7728564739227295, "logps/rejected": -1.8391892910003662, "loss": 2.9791, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.728567123413086, "rewards/margins": 0.6633265614509583, "rewards/rejected": -18.391895294189453, "step": 6355 }, { "epoch": 0.21436516229060637, "grad_norm": 16.489791870117188, "learning_rate": 9.606862226767706e-07, "logits/chosen": -0.32876458764076233, "logits/rejected": -0.36927470564842224, "logps/chosen": -2.0482099056243896, "logps/rejected": -2.105473279953003, "loss": 2.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.482101440429688, "rewards/margins": 0.5726326107978821, "rewards/rejected": -21.05473518371582, "step": 6360 }, { "epoch": 0.21453368836158954, "grad_norm": 23.093629837036133, "learning_rate": 9.605718184857563e-07, "logits/chosen": -0.4692727029323578, "logits/rejected": -0.586158275604248, "logps/chosen": -1.8002586364746094, "logps/rejected": -1.9701658487319946, "loss": 1.7385, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.002586364746094, "rewards/margins": 1.6990737915039062, "rewards/rejected": -19.70166015625, "step": 6365 }, { "epoch": 0.21470221443257273, "grad_norm": 34.142677307128906, "learning_rate": 9.604572549089812e-07, "logits/chosen": -0.4840649664402008, "logits/rejected": -0.4249550700187683, "logps/chosen": -1.9199146032333374, "logps/rejected": -2.041658878326416, "loss": 3.2644, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.19914436340332, "rewards/margins": 1.2174437046051025, "rewards/rejected": -20.416587829589844, "step": 6370 }, { "epoch": 0.2148707405035559, "grad_norm": 28.089157104492188, "learning_rate": 9.603425319860918e-07, "logits/chosen": -0.504118800163269, "logits/rejected": -0.5168687105178833, "logps/chosen": -2.0993189811706543, "logps/rejected": -1.6906509399414062, "loss": 7.128, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.993188858032227, "rewards/margins": -4.086681365966797, "rewards/rejected": -16.90650749206543, "step": 6375 }, { "epoch": 0.2150392665745391, "grad_norm": 27.385347366333008, "learning_rate": 9.602276497567887e-07, "logits/chosen": -0.053630925714969635, "logits/rejected": -0.29481998085975647, "logps/chosen": -1.8843891620635986, "logps/rejected": -2.11942982673645, "loss": 2.0263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.84389305114746, "rewards/margins": 2.35040545463562, "rewards/rejected": -21.19429588317871, "step": 6380 }, { "epoch": 0.21520779264552226, "grad_norm": 22.69534683227539, "learning_rate": 9.601126082608285e-07, "logits/chosen": -0.4440391957759857, "logits/rejected": -0.5650304555892944, "logps/chosen": -1.7570676803588867, "logps/rejected": -1.7968946695327759, "loss": 3.0257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.570674896240234, "rewards/margins": 0.39827051758766174, "rewards/rejected": -17.968944549560547, "step": 6385 }, { "epoch": 0.21537631871650545, "grad_norm": 28.62639617919922, "learning_rate": 9.59997407538022e-07, "logits/chosen": -0.6663795113563538, "logits/rejected": -0.7694646120071411, "logps/chosen": -1.6902217864990234, "logps/rejected": -1.7215359210968018, "loss": 2.8422, "rewards/accuracies": 0.5, "rewards/chosen": -16.902217864990234, "rewards/margins": 0.3131416440010071, "rewards/rejected": -17.21535873413086, "step": 6390 }, { "epoch": 0.21554484478748862, "grad_norm": 23.405054092407227, "learning_rate": 9.59882047628236e-07, "logits/chosen": -0.22101497650146484, "logits/rejected": -0.30498817563056946, "logps/chosen": -2.058715581893921, "logps/rejected": -2.188988447189331, "loss": 2.2117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.587156295776367, "rewards/margins": 1.3027280569076538, "rewards/rejected": -21.889883041381836, "step": 6395 }, { "epoch": 0.2157133708584718, "grad_norm": 20.8488712310791, "learning_rate": 9.59766528571392e-07, "logits/chosen": -0.7016115188598633, "logits/rejected": -0.8231936693191528, "logps/chosen": -1.582484483718872, "logps/rejected": -1.6375694274902344, "loss": 2.8576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.824844360351562, "rewards/margins": 0.5508493185043335, "rewards/rejected": -16.375694274902344, "step": 6400 }, { "epoch": 0.2157133708584718, "eval_logits/chosen": -0.8260197639465332, "eval_logits/rejected": -0.8562415838241577, "eval_logps/chosen": -1.7505967617034912, "eval_logps/rejected": -1.7813204526901245, "eval_loss": 3.2381811141967773, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": -17.505966186523438, "eval_rewards/margins": 0.3072388768196106, "eval_rewards/rejected": -17.813203811645508, "eval_runtime": 12.902, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 6400 }, { "epoch": 0.21588189692945497, "grad_norm": 49.400997161865234, "learning_rate": 9.596508504074664e-07, "logits/chosen": -0.20146696269512177, "logits/rejected": -0.20826086401939392, "logps/chosen": -2.2696011066436768, "logps/rejected": -2.6526265144348145, "loss": 2.9121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.696012496948242, "rewards/margins": 3.8302536010742188, "rewards/rejected": -26.52626609802246, "step": 6405 }, { "epoch": 0.21605042300043817, "grad_norm": 24.17193603515625, "learning_rate": 9.595350131764911e-07, "logits/chosen": -0.8560646176338196, "logits/rejected": -0.9106782674789429, "logps/chosen": -1.775774359703064, "logps/rejected": -1.9019801616668701, "loss": 2.6313, "rewards/accuracies": 0.5, "rewards/chosen": -17.75774383544922, "rewards/margins": 1.262058973312378, "rewards/rejected": -19.01980209350586, "step": 6410 }, { "epoch": 0.21621894907142136, "grad_norm": 22.19492530822754, "learning_rate": 9.594190169185528e-07, "logits/chosen": -0.9066619873046875, "logits/rejected": -0.865528404712677, "logps/chosen": -1.8104702234268188, "logps/rejected": -1.7503440380096436, "loss": 3.6651, "rewards/accuracies": 0.5, "rewards/chosen": -18.104700088500977, "rewards/margins": -0.6012603640556335, "rewards/rejected": -17.503442764282227, "step": 6415 }, { "epoch": 0.21638747514240453, "grad_norm": 14.114916801452637, "learning_rate": 9.593028616737929e-07, "logits/chosen": -0.731952965259552, "logits/rejected": -0.7744450569152832, "logps/chosen": -1.712438941001892, "logps/rejected": -1.7795616388320923, "loss": 2.6212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.1243896484375, "rewards/margins": 0.6712266206741333, "rewards/rejected": -17.795616149902344, "step": 6420 }, { "epoch": 0.21655600121338772, "grad_norm": 19.506546020507812, "learning_rate": 9.591865474824084e-07, "logits/chosen": -0.6614322662353516, "logits/rejected": -0.6236995458602905, "logps/chosen": -1.8650777339935303, "logps/rejected": -1.9538198709487915, "loss": 2.4345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.650775909423828, "rewards/margins": 0.887421727180481, "rewards/rejected": -19.538198471069336, "step": 6425 }, { "epoch": 0.2167245272843709, "grad_norm": 21.893081665039062, "learning_rate": 9.590700743846511e-07, "logits/chosen": -0.4665060043334961, "logits/rejected": -0.4667798578739166, "logps/chosen": -1.7320483922958374, "logps/rejected": -1.7750571966171265, "loss": 2.7378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.320484161376953, "rewards/margins": 0.4300875663757324, "rewards/rejected": -17.75057029724121, "step": 6430 }, { "epoch": 0.21689305335535408, "grad_norm": 33.71809005737305, "learning_rate": 9.58953442420828e-07, "logits/chosen": -0.6123358607292175, "logits/rejected": -0.5208547711372375, "logps/chosen": -2.1607580184936523, "logps/rejected": -2.6098697185516357, "loss": 3.0716, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.60757827758789, "rewards/margins": 4.491117477416992, "rewards/rejected": -26.098697662353516, "step": 6435 }, { "epoch": 0.21706157942633725, "grad_norm": 29.654769897460938, "learning_rate": 9.588366516313001e-07, "logits/chosen": -0.4644528329372406, "logits/rejected": -0.5957885980606079, "logps/chosen": -1.9361892938613892, "logps/rejected": -2.192762613296509, "loss": 2.9682, "rewards/accuracies": 0.5, "rewards/chosen": -19.361894607543945, "rewards/margins": 2.5657315254211426, "rewards/rejected": -21.92762565612793, "step": 6440 }, { "epoch": 0.21723010549732044, "grad_norm": 33.55850601196289, "learning_rate": 9.587197020564847e-07, "logits/chosen": -0.3282471299171448, "logits/rejected": -0.34332892298698425, "logps/chosen": -1.893535852432251, "logps/rejected": -1.8809051513671875, "loss": 3.262, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.93535804748535, "rewards/margins": -0.12630634009838104, "rewards/rejected": -18.809051513671875, "step": 6445 }, { "epoch": 0.2173986315683036, "grad_norm": 18.890377044677734, "learning_rate": 9.586025937368532e-07, "logits/chosen": -0.27933672070503235, "logits/rejected": -0.37898606061935425, "logps/chosen": -1.7535426616668701, "logps/rejected": -1.7873185873031616, "loss": 3.133, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.53542709350586, "rewards/margins": 0.3377595841884613, "rewards/rejected": -17.873186111450195, "step": 6450 }, { "epoch": 0.2175671576392868, "grad_norm": 23.26448631286621, "learning_rate": 9.584853267129323e-07, "logits/chosen": -0.8444086313247681, "logits/rejected": -0.8294426202774048, "logps/chosen": -1.8335950374603271, "logps/rejected": -1.8612003326416016, "loss": 2.8761, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.335948944091797, "rewards/margins": 0.2760535180568695, "rewards/rejected": -18.612003326416016, "step": 6455 }, { "epoch": 0.21773568371026997, "grad_norm": 34.03776168823242, "learning_rate": 9.583679010253033e-07, "logits/chosen": -0.5135722160339355, "logits/rejected": -0.32302290201187134, "logps/chosen": -2.0652201175689697, "logps/rejected": -2.1298694610595703, "loss": 2.619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.652198791503906, "rewards/margins": 0.6464970707893372, "rewards/rejected": -21.298694610595703, "step": 6460 }, { "epoch": 0.21790420978125316, "grad_norm": 32.695228576660156, "learning_rate": 9.582503167146027e-07, "logits/chosen": -0.5547333359718323, "logits/rejected": -0.47261205315589905, "logps/chosen": -1.9371652603149414, "logps/rejected": -1.691992998123169, "loss": 5.5003, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -19.371652603149414, "rewards/margins": -2.451723098754883, "rewards/rejected": -16.91992950439453, "step": 6465 }, { "epoch": 0.21807273585223635, "grad_norm": 19.64496421813965, "learning_rate": 9.58132573821522e-07, "logits/chosen": -0.7506909370422363, "logits/rejected": -0.6311949491500854, "logps/chosen": -1.596892237663269, "logps/rejected": -1.7580207586288452, "loss": 2.1849, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.96892261505127, "rewards/margins": 1.6112867593765259, "rewards/rejected": -17.580209732055664, "step": 6470 }, { "epoch": 0.21824126192321952, "grad_norm": 25.59090805053711, "learning_rate": 9.580146723868072e-07, "logits/chosen": -0.5404247045516968, "logits/rejected": -0.5816585421562195, "logps/chosen": -1.8674030303955078, "logps/rejected": -1.8501733541488647, "loss": 3.4942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.67403221130371, "rewards/margins": -0.1722976714372635, "rewards/rejected": -18.501733779907227, "step": 6475 }, { "epoch": 0.21840978799420271, "grad_norm": 20.649513244628906, "learning_rate": 9.578966124512593e-07, "logits/chosen": -0.2291760891675949, "logits/rejected": -0.20900988578796387, "logps/chosen": -2.7134552001953125, "logps/rejected": -2.463433027267456, "loss": 5.9579, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.134552001953125, "rewards/margins": -2.500222682952881, "rewards/rejected": -24.63433074951172, "step": 6480 }, { "epoch": 0.21857831406518588, "grad_norm": 24.989622116088867, "learning_rate": 9.577783940557343e-07, "logits/chosen": -0.5035347938537598, "logits/rejected": -0.5523974299430847, "logps/chosen": -1.8505939245224, "logps/rejected": -2.091245412826538, "loss": 2.3623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.505939483642578, "rewards/margins": 2.406515598297119, "rewards/rejected": -20.91245460510254, "step": 6485 }, { "epoch": 0.21874684013616907, "grad_norm": 25.5734920501709, "learning_rate": 9.576600172411427e-07, "logits/chosen": -0.12298359721899033, "logits/rejected": -0.15590530633926392, "logps/chosen": -2.0219905376434326, "logps/rejected": -2.2334604263305664, "loss": 2.2344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.219905853271484, "rewards/margins": 2.1146976947784424, "rewards/rejected": -22.334606170654297, "step": 6490 }, { "epoch": 0.21891536620715224, "grad_norm": 17.484527587890625, "learning_rate": 9.575414820484504e-07, "logits/chosen": -0.9196332097053528, "logits/rejected": -1.0346037149429321, "logps/chosen": -1.782769799232483, "logps/rejected": -1.728493094444275, "loss": 3.6021, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.827695846557617, "rewards/margins": -0.5427675247192383, "rewards/rejected": -17.284927368164062, "step": 6495 }, { "epoch": 0.21908389227813543, "grad_norm": 18.694116592407227, "learning_rate": 9.574227885186775e-07, "logits/chosen": -0.8279164433479309, "logits/rejected": -0.8473072052001953, "logps/chosen": -1.5809705257415771, "logps/rejected": -1.5579888820648193, "loss": 3.3854, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.80970573425293, "rewards/margins": -0.22981634736061096, "rewards/rejected": -15.579889297485352, "step": 6500 }, { "epoch": 0.2192524183491186, "grad_norm": 20.692136764526367, "learning_rate": 9.57303936692899e-07, "logits/chosen": -0.5701172351837158, "logits/rejected": -0.6228208541870117, "logps/chosen": -1.9674314260482788, "logps/rejected": -1.919097900390625, "loss": 3.6805, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.674312591552734, "rewards/margins": -0.48333635926246643, "rewards/rejected": -19.190977096557617, "step": 6505 }, { "epoch": 0.2194209444201018, "grad_norm": 26.013273239135742, "learning_rate": 9.571849266122454e-07, "logits/chosen": -0.7226217985153198, "logits/rejected": -0.8986842036247253, "logps/chosen": -1.7132012844085693, "logps/rejected": -1.7211544513702393, "loss": 3.2907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.13201332092285, "rewards/margins": 0.07953214645385742, "rewards/rejected": -17.211544036865234, "step": 6510 }, { "epoch": 0.21958947049108496, "grad_norm": 58.09033203125, "learning_rate": 9.57065758317901e-07, "logits/chosen": -0.2748204171657562, "logits/rejected": -0.2713713049888611, "logps/chosen": -1.971434235572815, "logps/rejected": -1.9171030521392822, "loss": 3.6719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.714340209960938, "rewards/margins": -0.5433112978935242, "rewards/rejected": -19.171030044555664, "step": 6515 }, { "epoch": 0.21975799656206815, "grad_norm": 18.163524627685547, "learning_rate": 9.569464318511051e-07, "logits/chosen": -0.5663856863975525, "logits/rejected": -0.741828203201294, "logps/chosen": -1.770307183265686, "logps/rejected": -1.7896112203598022, "loss": 3.1223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.70307159423828, "rewards/margins": 0.1930416077375412, "rewards/rejected": -17.8961124420166, "step": 6520 }, { "epoch": 0.21992652263305135, "grad_norm": 25.333982467651367, "learning_rate": 9.568269472531524e-07, "logits/chosen": -0.7503092288970947, "logits/rejected": -0.8162568211555481, "logps/chosen": -1.926618218421936, "logps/rejected": -2.043292760848999, "loss": 3.1717, "rewards/accuracies": 0.5, "rewards/chosen": -19.266183853149414, "rewards/margins": 1.166744589805603, "rewards/rejected": -20.432926177978516, "step": 6525 }, { "epoch": 0.2200950487040345, "grad_norm": 20.86461067199707, "learning_rate": 9.567073045653914e-07, "logits/chosen": -0.6699740290641785, "logits/rejected": -0.5963459610939026, "logps/chosen": -1.864882469177246, "logps/rejected": -2.050961494445801, "loss": 2.8193, "rewards/accuracies": 0.5, "rewards/chosen": -18.64882469177246, "rewards/margins": 1.8607914447784424, "rewards/rejected": -20.50961685180664, "step": 6530 }, { "epoch": 0.2202635747750177, "grad_norm": 25.337141036987305, "learning_rate": 9.565875038292257e-07, "logits/chosen": -0.5631424784660339, "logits/rejected": -0.6421005129814148, "logps/chosen": -1.7564456462860107, "logps/rejected": -1.802941083908081, "loss": 2.7187, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.564455032348633, "rewards/margins": 0.4649575352668762, "rewards/rejected": -18.0294132232666, "step": 6535 }, { "epoch": 0.22043210084600087, "grad_norm": 29.222047805786133, "learning_rate": 9.56467545086114e-07, "logits/chosen": -0.6572110652923584, "logits/rejected": -0.6429082155227661, "logps/chosen": -1.6367639303207397, "logps/rejected": -1.780846357345581, "loss": 1.9368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.367639541625977, "rewards/margins": 1.440824031829834, "rewards/rejected": -17.808462142944336, "step": 6540 }, { "epoch": 0.22060062691698407, "grad_norm": 28.995386123657227, "learning_rate": 9.56347428377569e-07, "logits/chosen": -0.42840996384620667, "logits/rejected": -0.5102885365486145, "logps/chosen": -1.6626837253570557, "logps/rejected": -1.6840680837631226, "loss": 3.0719, "rewards/accuracies": 0.5, "rewards/chosen": -16.626834869384766, "rewards/margins": 0.2138429582118988, "rewards/rejected": -16.840679168701172, "step": 6545 }, { "epoch": 0.22076915298796723, "grad_norm": 35.35552978515625, "learning_rate": 9.562271537451584e-07, "logits/chosen": -1.028755784034729, "logits/rejected": -0.9808349609375, "logps/chosen": -1.923678994178772, "logps/rejected": -1.8900012969970703, "loss": 3.4336, "rewards/accuracies": 0.5, "rewards/chosen": -19.23678970336914, "rewards/margins": -0.33677586913108826, "rewards/rejected": -18.900012969970703, "step": 6550 }, { "epoch": 0.22093767905895043, "grad_norm": 26.18325424194336, "learning_rate": 9.561067212305043e-07, "logits/chosen": -0.445654958486557, "logits/rejected": -0.5217684507369995, "logps/chosen": -1.6793758869171143, "logps/rejected": -1.6027400493621826, "loss": 3.9224, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.793758392333984, "rewards/margins": -0.7663576006889343, "rewards/rejected": -16.027400970458984, "step": 6555 }, { "epoch": 0.2211062051299336, "grad_norm": 23.301103591918945, "learning_rate": 9.559861308752842e-07, "logits/chosen": -0.7921528816223145, "logits/rejected": -0.7277683019638062, "logps/chosen": -1.7608333826065063, "logps/rejected": -1.758272409439087, "loss": 3.2054, "rewards/accuracies": 0.5, "rewards/chosen": -17.608333587646484, "rewards/margins": -0.025607967749238014, "rewards/rejected": -17.58272361755371, "step": 6560 }, { "epoch": 0.22127473120091679, "grad_norm": 24.503135681152344, "learning_rate": 9.55865382721229e-07, "logits/chosen": -0.4001065194606781, "logits/rejected": -0.45027488470077515, "logps/chosen": -1.9991649389266968, "logps/rejected": -2.029761791229248, "loss": 3.766, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.991649627685547, "rewards/margins": 0.3059673309326172, "rewards/rejected": -20.297616958618164, "step": 6565 }, { "epoch": 0.22144325727189995, "grad_norm": 22.448015213012695, "learning_rate": 9.557444768101254e-07, "logits/chosen": -0.19503983855247498, "logits/rejected": -0.15284790098667145, "logps/chosen": -1.850487470626831, "logps/rejected": -1.883581519126892, "loss": 2.8386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.50487518310547, "rewards/margins": 0.3309392035007477, "rewards/rejected": -18.835817337036133, "step": 6570 }, { "epoch": 0.22161178334288315, "grad_norm": 29.75132179260254, "learning_rate": 9.556234131838141e-07, "logits/chosen": -0.5109794735908508, "logits/rejected": -0.4620528221130371, "logps/chosen": -1.8948596715927124, "logps/rejected": -1.8566793203353882, "loss": 3.462, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.948598861694336, "rewards/margins": -0.3818041682243347, "rewards/rejected": -18.56679344177246, "step": 6575 }, { "epoch": 0.22178030941386634, "grad_norm": 21.997732162475586, "learning_rate": 9.555021918841902e-07, "logits/chosen": -0.2893935441970825, "logits/rejected": -0.3516172766685486, "logps/chosen": -2.042057991027832, "logps/rejected": -2.02874755859375, "loss": 3.2381, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.420581817626953, "rewards/margins": -0.1331055611371994, "rewards/rejected": -20.287473678588867, "step": 6580 }, { "epoch": 0.2219488354848495, "grad_norm": 42.73481750488281, "learning_rate": 9.553808129532037e-07, "logits/chosen": -0.6087731122970581, "logits/rejected": -0.4572841227054596, "logps/chosen": -1.7725818157196045, "logps/rejected": -1.8231289386749268, "loss": 2.8295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.725818634033203, "rewards/margins": 0.5054678320884705, "rewards/rejected": -18.231287002563477, "step": 6585 }, { "epoch": 0.2221173615558327, "grad_norm": 25.13774299621582, "learning_rate": 9.552592764328593e-07, "logits/chosen": -0.6387229561805725, "logits/rejected": -0.5615711808204651, "logps/chosen": -1.724822759628296, "logps/rejected": -1.750415563583374, "loss": 2.9788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.248228073120117, "rewards/margins": 0.255929172039032, "rewards/rejected": -17.5041561126709, "step": 6590 }, { "epoch": 0.22228588762681586, "grad_norm": 22.05751609802246, "learning_rate": 9.551375823652158e-07, "logits/chosen": -0.3565082848072052, "logits/rejected": -0.34892910718917847, "logps/chosen": -1.891668677330017, "logps/rejected": -1.9782425165176392, "loss": 2.4094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.91668701171875, "rewards/margins": 0.8657382130622864, "rewards/rejected": -19.782424926757812, "step": 6595 }, { "epoch": 0.22245441369779906, "grad_norm": 13.150750160217285, "learning_rate": 9.550157307923865e-07, "logits/chosen": -0.4826990067958832, "logits/rejected": -0.4953466057777405, "logps/chosen": -1.9391247034072876, "logps/rejected": -2.0218353271484375, "loss": 2.6082, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.391246795654297, "rewards/margins": 0.8271061778068542, "rewards/rejected": -20.218353271484375, "step": 6600 }, { "epoch": 0.22262293976878222, "grad_norm": 71.07939910888672, "learning_rate": 9.5489372175654e-07, "logits/chosen": -0.11356012523174286, "logits/rejected": -0.12396962940692902, "logps/chosen": -2.3725666999816895, "logps/rejected": -2.422454833984375, "loss": 2.8288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.72566795349121, "rewards/margins": 0.498879998922348, "rewards/rejected": -24.22454833984375, "step": 6605 }, { "epoch": 0.22279146583976542, "grad_norm": 29.017131805419922, "learning_rate": 9.547715552998984e-07, "logits/chosen": -0.6171203851699829, "logits/rejected": -0.587684690952301, "logps/chosen": -2.0196220874786377, "logps/rejected": -1.9217265844345093, "loss": 4.0861, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.19622230529785, "rewards/margins": -0.9789560437202454, "rewards/rejected": -19.217266082763672, "step": 6610 }, { "epoch": 0.22295999191074858, "grad_norm": 25.13987159729004, "learning_rate": 9.546492314647387e-07, "logits/chosen": -0.8641460537910461, "logits/rejected": -0.7396403551101685, "logps/chosen": -1.8659296035766602, "logps/rejected": -2.2930824756622314, "loss": 1.7524, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.6592960357666, "rewards/margins": 4.271526336669922, "rewards/rejected": -22.930822372436523, "step": 6615 }, { "epoch": 0.22312851798173178, "grad_norm": 15.366714477539062, "learning_rate": 9.545267502933925e-07, "logits/chosen": -0.7649020552635193, "logits/rejected": -0.9529761075973511, "logps/chosen": -1.4214767217636108, "logps/rejected": -1.4711424112319946, "loss": 2.9393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.214767456054688, "rewards/margins": 0.4966561198234558, "rewards/rejected": -14.71142292022705, "step": 6620 }, { "epoch": 0.22329704405271494, "grad_norm": 28.1103458404541, "learning_rate": 9.544041118282457e-07, "logits/chosen": -0.8209296464920044, "logits/rejected": -0.8028782606124878, "logps/chosen": -1.7018072605133057, "logps/rejected": -1.8131166696548462, "loss": 2.1291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.0180721282959, "rewards/margins": 1.1130938529968262, "rewards/rejected": -18.131168365478516, "step": 6625 }, { "epoch": 0.22346557012369814, "grad_norm": 121.37602233886719, "learning_rate": 9.542813161117384e-07, "logits/chosen": -0.11982444673776627, "logits/rejected": -0.2019883692264557, "logps/chosen": -2.4919440746307373, "logps/rejected": -2.4208712577819824, "loss": 3.8158, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.9194393157959, "rewards/margins": -0.7107278108596802, "rewards/rejected": -24.20871353149414, "step": 6630 }, { "epoch": 0.22363409619468133, "grad_norm": 15.064230918884277, "learning_rate": 9.541583631863658e-07, "logits/chosen": -0.36297905445098877, "logits/rejected": -0.26934993267059326, "logps/chosen": -1.9586620330810547, "logps/rejected": -2.617410659790039, "loss": 1.8796, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.586620330810547, "rewards/margins": 6.587483882904053, "rewards/rejected": -26.174102783203125, "step": 6635 }, { "epoch": 0.2238026222656645, "grad_norm": 26.91686248779297, "learning_rate": 9.540352530946769e-07, "logits/chosen": -0.43593844771385193, "logits/rejected": -0.3484894633293152, "logps/chosen": -2.200504779815674, "logps/rejected": -2.132821559906006, "loss": 3.8118, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.005050659179688, "rewards/margins": -0.6768323183059692, "rewards/rejected": -21.328218460083008, "step": 6640 }, { "epoch": 0.2239711483366477, "grad_norm": 16.408367156982422, "learning_rate": 9.53911985879275e-07, "logits/chosen": -0.34586840867996216, "logits/rejected": -0.5225354433059692, "logps/chosen": -1.722537636756897, "logps/rejected": -1.7929697036743164, "loss": 3.1201, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.22537612915039, "rewards/margins": 0.7043231129646301, "rewards/rejected": -17.929698944091797, "step": 6645 }, { "epoch": 0.22413967440763086, "grad_norm": 22.123409271240234, "learning_rate": 9.537885615828184e-07, "logits/chosen": -0.6946117281913757, "logits/rejected": -0.698863685131073, "logps/chosen": -1.8962112665176392, "logps/rejected": -1.6883169412612915, "loss": 5.1022, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -18.962108612060547, "rewards/margins": -2.0789437294006348, "rewards/rejected": -16.883167266845703, "step": 6650 }, { "epoch": 0.22430820047861405, "grad_norm": 21.01542854309082, "learning_rate": 9.536649802480189e-07, "logits/chosen": -0.6982828974723816, "logits/rejected": -0.6379950046539307, "logps/chosen": -1.8179126977920532, "logps/rejected": -1.7372678518295288, "loss": 3.8856, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.179126739501953, "rewards/margins": -0.8064476251602173, "rewards/rejected": -17.3726806640625, "step": 6655 }, { "epoch": 0.22447672654959722, "grad_norm": 14.440115928649902, "learning_rate": 9.535412419176436e-07, "logits/chosen": -0.5513002276420593, "logits/rejected": -0.43792086839675903, "logps/chosen": -2.1794581413269043, "logps/rejected": -2.4261536598205566, "loss": 2.6097, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.79458236694336, "rewards/margins": 2.4669549465179443, "rewards/rejected": -24.261539459228516, "step": 6660 }, { "epoch": 0.2246452526205804, "grad_norm": 26.166810989379883, "learning_rate": 9.534173466345132e-07, "logits/chosen": -0.4692150950431824, "logits/rejected": -0.33260539174079895, "logps/chosen": -1.8202335834503174, "logps/rejected": -1.7992265224456787, "loss": 4.2495, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.202335357666016, "rewards/margins": -0.21007022261619568, "rewards/rejected": -17.992265701293945, "step": 6665 }, { "epoch": 0.22481377869156358, "grad_norm": 65.70748901367188, "learning_rate": 9.532932944415031e-07, "logits/chosen": -0.6528714299201965, "logits/rejected": -0.49211350083351135, "logps/chosen": -2.224884033203125, "logps/rejected": -2.3405163288116455, "loss": 3.6286, "rewards/accuracies": 0.5, "rewards/chosen": -22.248838424682617, "rewards/margins": 1.1563222408294678, "rewards/rejected": -23.405162811279297, "step": 6670 }, { "epoch": 0.22498230476254677, "grad_norm": 25.11475944519043, "learning_rate": 9.531690853815428e-07, "logits/chosen": -0.7747845649719238, "logits/rejected": -0.8052785992622375, "logps/chosen": -1.8111671209335327, "logps/rejected": -1.9030259847640991, "loss": 2.2895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.111669540405273, "rewards/margins": 0.9185881614685059, "rewards/rejected": -19.03026008605957, "step": 6675 }, { "epoch": 0.22515083083352994, "grad_norm": 25.83588409423828, "learning_rate": 9.530447194976163e-07, "logits/chosen": -0.6666157841682434, "logits/rejected": -0.648045539855957, "logps/chosen": -1.7558119297027588, "logps/rejected": -1.7752193212509155, "loss": 3.098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.558116912841797, "rewards/margins": 0.1940757781267166, "rewards/rejected": -17.752193450927734, "step": 6680 }, { "epoch": 0.22531935690451313, "grad_norm": 16.22815704345703, "learning_rate": 9.529201968327616e-07, "logits/chosen": -0.42409926652908325, "logits/rejected": -0.47118502855300903, "logps/chosen": -2.16229248046875, "logps/rejected": -2.263599395751953, "loss": 2.5547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.6229248046875, "rewards/margins": 1.0130705833435059, "rewards/rejected": -22.635995864868164, "step": 6685 }, { "epoch": 0.2254878829754963, "grad_norm": 211.99465942382812, "learning_rate": 9.527955174300711e-07, "logits/chosen": -0.7075196504592896, "logits/rejected": -0.8274857401847839, "logps/chosen": -2.0796940326690674, "logps/rejected": -2.0456149578094482, "loss": 3.6199, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.796939849853516, "rewards/margins": -0.34078893065452576, "rewards/rejected": -20.45615005493164, "step": 6690 }, { "epoch": 0.2256564090464795, "grad_norm": 26.797080993652344, "learning_rate": 9.526706813326914e-07, "logits/chosen": -0.2986915409564972, "logits/rejected": -0.4110233187675476, "logps/chosen": -2.0323266983032227, "logps/rejected": -2.0355358123779297, "loss": 3.17, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.323266983032227, "rewards/margins": 0.03209175914525986, "rewards/rejected": -20.355358123779297, "step": 6695 }, { "epoch": 0.22582493511746268, "grad_norm": 37.4174690246582, "learning_rate": 9.525456885838234e-07, "logits/chosen": -0.497040331363678, "logits/rejected": -0.5556106567382812, "logps/chosen": -2.1126370429992676, "logps/rejected": -1.7659200429916382, "loss": 6.574, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.12636947631836, "rewards/margins": -3.4671692848205566, "rewards/rejected": -17.65920066833496, "step": 6700 }, { "epoch": 0.22599346118844585, "grad_norm": 31.655685424804688, "learning_rate": 9.524205392267223e-07, "logits/chosen": -0.6221505999565125, "logits/rejected": -0.524019181728363, "logps/chosen": -1.5004098415374756, "logps/rejected": -1.6298938989639282, "loss": 2.1264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.004098892211914, "rewards/margins": 1.2948402166366577, "rewards/rejected": -16.298938751220703, "step": 6705 }, { "epoch": 0.22616198725942904, "grad_norm": 25.590959548950195, "learning_rate": 9.522952333046972e-07, "logits/chosen": -0.6376134157180786, "logits/rejected": -0.6879085302352905, "logps/chosen": -1.544236660003662, "logps/rejected": -1.590477705001831, "loss": 2.8925, "rewards/accuracies": 0.5, "rewards/chosen": -15.442367553710938, "rewards/margins": 0.462411105632782, "rewards/rejected": -15.904777526855469, "step": 6710 }, { "epoch": 0.2263305133304122, "grad_norm": 32.62403106689453, "learning_rate": 9.521697708611114e-07, "logits/chosen": -0.38372719287872314, "logits/rejected": -0.29491403698921204, "logps/chosen": -2.097902774810791, "logps/rejected": -1.9758249521255493, "loss": 4.3138, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.979028701782227, "rewards/margins": -1.2207807302474976, "rewards/rejected": -19.75824737548828, "step": 6715 }, { "epoch": 0.2264990394013954, "grad_norm": 14.913753509521484, "learning_rate": 9.52044151939383e-07, "logits/chosen": -0.3649575710296631, "logits/rejected": -0.3394049108028412, "logps/chosen": -2.1916496753692627, "logps/rejected": -2.3644070625305176, "loss": 1.9326, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.9164981842041, "rewards/margins": 1.7275749444961548, "rewards/rejected": -23.64406967163086, "step": 6720 }, { "epoch": 0.22666756547237857, "grad_norm": 29.087324142456055, "learning_rate": 9.519183765829831e-07, "logits/chosen": -0.71019047498703, "logits/rejected": -0.7044280171394348, "logps/chosen": -1.7740542888641357, "logps/rejected": -1.9014533758163452, "loss": 2.9555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.74054527282715, "rewards/margins": 1.2739917039871216, "rewards/rejected": -19.014535903930664, "step": 6725 }, { "epoch": 0.22683609154336176, "grad_norm": 30.07399559020996, "learning_rate": 9.517924448354381e-07, "logits/chosen": -0.474402517080307, "logits/rejected": -0.5710434913635254, "logps/chosen": -1.7302747964859009, "logps/rejected": -1.8411529064178467, "loss": 2.2038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.302745819091797, "rewards/margins": 1.1087833642959595, "rewards/rejected": -18.411531448364258, "step": 6730 }, { "epoch": 0.22700461761434493, "grad_norm": 27.725431442260742, "learning_rate": 9.516663567403278e-07, "logits/chosen": -0.6341021060943604, "logits/rejected": -0.7254477739334106, "logps/chosen": -1.5502922534942627, "logps/rejected": -1.6045587062835693, "loss": 2.736, "rewards/accuracies": 0.5, "rewards/chosen": -15.502920150756836, "rewards/margins": 0.5426663160324097, "rewards/rejected": -16.045589447021484, "step": 6735 }, { "epoch": 0.22717314368532812, "grad_norm": 28.309858322143555, "learning_rate": 9.515401123412865e-07, "logits/chosen": -0.5554046630859375, "logits/rejected": -0.5442392826080322, "logps/chosen": -1.5706939697265625, "logps/rejected": -1.4859893321990967, "loss": 3.9188, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.706939697265625, "rewards/margins": -0.8470472097396851, "rewards/rejected": -14.859891891479492, "step": 6740 }, { "epoch": 0.2273416697563113, "grad_norm": 23.21711540222168, "learning_rate": 9.514137116820022e-07, "logits/chosen": -0.4454229772090912, "logits/rejected": -0.4550401568412781, "logps/chosen": -1.8525142669677734, "logps/rejected": -1.8814477920532227, "loss": 2.8648, "rewards/accuracies": 0.5, "rewards/chosen": -18.525142669677734, "rewards/margins": 0.2893357276916504, "rewards/rejected": -18.81447982788086, "step": 6745 }, { "epoch": 0.22751019582729448, "grad_norm": 41.267127990722656, "learning_rate": 9.512871548062173e-07, "logits/chosen": -0.43118929862976074, "logits/rejected": -0.45497363805770874, "logps/chosen": -1.9234278202056885, "logps/rejected": -2.0509819984436035, "loss": 2.7819, "rewards/accuracies": 0.5, "rewards/chosen": -19.234275817871094, "rewards/margins": 1.2755451202392578, "rewards/rejected": -20.509822845458984, "step": 6750 }, { "epoch": 0.22767872189827768, "grad_norm": 42.087406158447266, "learning_rate": 9.51160441757728e-07, "logits/chosen": -0.42311835289001465, "logits/rejected": -0.4611433148384094, "logps/chosen": -1.8028514385223389, "logps/rejected": -1.7913545370101929, "loss": 3.3435, "rewards/accuracies": 0.5, "rewards/chosen": -18.028514862060547, "rewards/margins": -0.11496935039758682, "rewards/rejected": -17.91354751586914, "step": 6755 }, { "epoch": 0.22784724796926084, "grad_norm": 17.753387451171875, "learning_rate": 9.51033572580385e-07, "logits/chosen": -0.8366772532463074, "logits/rejected": -0.8449716567993164, "logps/chosen": -1.6344677209854126, "logps/rejected": -1.6162865161895752, "loss": 3.3434, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.344676971435547, "rewards/margins": -0.1818103790283203, "rewards/rejected": -16.162866592407227, "step": 6760 }, { "epoch": 0.22801577404024403, "grad_norm": 15.194600105285645, "learning_rate": 9.509065473180924e-07, "logits/chosen": -0.6022413372993469, "logits/rejected": -0.6520088911056519, "logps/chosen": -1.9417146444320679, "logps/rejected": -2.4450767040252686, "loss": 2.5999, "rewards/accuracies": 0.5, "rewards/chosen": -19.417146682739258, "rewards/margins": 5.033621311187744, "rewards/rejected": -24.450769424438477, "step": 6765 }, { "epoch": 0.2281843001112272, "grad_norm": 16.05148696899414, "learning_rate": 9.507793660148089e-07, "logits/chosen": -0.5362441539764404, "logits/rejected": -0.5728567242622375, "logps/chosen": -2.0258445739746094, "logps/rejected": -2.310606002807617, "loss": 1.6216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.25844383239746, "rewards/margins": 2.8476147651672363, "rewards/rejected": -23.106060028076172, "step": 6770 }, { "epoch": 0.2283528261822104, "grad_norm": 42.07566452026367, "learning_rate": 9.506520287145467e-07, "logits/chosen": -0.5904275178909302, "logits/rejected": -0.6621605157852173, "logps/chosen": -2.118563175201416, "logps/rejected": -2.208606004714966, "loss": 3.1772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.18562889099121, "rewards/margins": 0.9004287719726562, "rewards/rejected": -22.086057662963867, "step": 6775 }, { "epoch": 0.22852135225319356, "grad_norm": 14.864330291748047, "learning_rate": 9.505245354613725e-07, "logits/chosen": -0.4866722524166107, "logits/rejected": -0.5212317705154419, "logps/chosen": -1.8210203647613525, "logps/rejected": -2.2158994674682617, "loss": 1.2505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.210201263427734, "rewards/margins": 3.948791980743408, "rewards/rejected": -22.158994674682617, "step": 6780 }, { "epoch": 0.22868987832417675, "grad_norm": 23.20281410217285, "learning_rate": 9.503968862994065e-07, "logits/chosen": -0.5525738000869751, "logits/rejected": -0.7386514544487, "logps/chosen": -2.000377655029297, "logps/rejected": -2.2927498817443848, "loss": 1.8016, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.0037784576416, "rewards/margins": 2.923720598220825, "rewards/rejected": -22.9274959564209, "step": 6785 }, { "epoch": 0.22885840439515992, "grad_norm": 11.40327262878418, "learning_rate": 9.502690812728229e-07, "logits/chosen": -0.6232748627662659, "logits/rejected": -0.5543674230575562, "logps/chosen": -1.6506311893463135, "logps/rejected": -1.6903736591339111, "loss": 3.0261, "rewards/accuracies": 0.5, "rewards/chosen": -16.506309509277344, "rewards/margins": 0.3974243104457855, "rewards/rejected": -16.903736114501953, "step": 6790 }, { "epoch": 0.2290269304661431, "grad_norm": 22.759078979492188, "learning_rate": 9.501411204258504e-07, "logits/chosen": -0.48999086022377014, "logits/rejected": -0.45051321387290955, "logps/chosen": -1.6978733539581299, "logps/rejected": -1.5963754653930664, "loss": 4.0642, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -16.97873306274414, "rewards/margins": -1.0149786472320557, "rewards/rejected": -15.96375560760498, "step": 6795 }, { "epoch": 0.22919545653712628, "grad_norm": 29.907974243164062, "learning_rate": 9.500130038027709e-07, "logits/chosen": -0.6095082759857178, "logits/rejected": -0.4460016191005707, "logps/chosen": -2.117158889770508, "logps/rejected": -2.062587261199951, "loss": 3.7483, "rewards/accuracies": 0.5, "rewards/chosen": -21.171588897705078, "rewards/margins": -0.545714259147644, "rewards/rejected": -20.625873565673828, "step": 6800 }, { "epoch": 0.22919545653712628, "eval_logits/chosen": -0.8406579494476318, "eval_logits/rejected": -0.8750758171081543, "eval_logps/chosen": -1.7596479654312134, "eval_logps/rejected": -1.7937562465667725, "eval_loss": 3.213974714279175, "eval_rewards/accuracies": 0.5699999928474426, "eval_rewards/chosen": -17.596479415893555, "eval_rewards/margins": 0.3410845100879669, "eval_rewards/rejected": -17.937564849853516, "eval_runtime": 12.9165, "eval_samples_per_second": 7.742, "eval_steps_per_second": 1.936, "step": 6800 }, { "epoch": 0.22936398260810947, "grad_norm": 21.820558547973633, "learning_rate": 9.498847314479205e-07, "logits/chosen": -0.6138108968734741, "logits/rejected": -0.6479529738426208, "logps/chosen": -2.047436475753784, "logps/rejected": -1.9390960931777954, "loss": 4.1537, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.474365234375, "rewards/margins": -1.0834046602249146, "rewards/rejected": -19.390958786010742, "step": 6805 }, { "epoch": 0.22953250867909267, "grad_norm": 12.025269508361816, "learning_rate": 9.497563034056894e-07, "logits/chosen": -0.5252547264099121, "logits/rejected": -0.572632908821106, "logps/chosen": -2.4980757236480713, "logps/rejected": -2.691744565963745, "loss": 2.5268, "rewards/accuracies": 0.5, "rewards/chosen": -24.980758666992188, "rewards/margins": 1.9366881847381592, "rewards/rejected": -26.917444229125977, "step": 6810 }, { "epoch": 0.22970103475007583, "grad_norm": 23.271806716918945, "learning_rate": 9.496277197205213e-07, "logits/chosen": -0.867204487323761, "logits/rejected": -0.9149467349052429, "logps/chosen": -1.4222182035446167, "logps/rejected": -1.5333744287490845, "loss": 2.2384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.222180366516113, "rewards/margins": 1.1115612983703613, "rewards/rejected": -15.33374309539795, "step": 6815 }, { "epoch": 0.22986956082105903, "grad_norm": 42.697227478027344, "learning_rate": 9.49498980436914e-07, "logits/chosen": -0.2986542582511902, "logits/rejected": -0.32819774746894836, "logps/chosen": -1.8275251388549805, "logps/rejected": -1.9412891864776611, "loss": 2.1117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.275253295898438, "rewards/margins": 1.137637734413147, "rewards/rejected": -19.412891387939453, "step": 6820 }, { "epoch": 0.2300380868920422, "grad_norm": 17.321014404296875, "learning_rate": 9.493700855994194e-07, "logits/chosen": -0.7194541692733765, "logits/rejected": -0.7003971338272095, "logps/chosen": -1.5345981121063232, "logps/rejected": -1.6274287700653076, "loss": 2.4023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.345980644226074, "rewards/margins": 0.9283071756362915, "rewards/rejected": -16.274288177490234, "step": 6825 }, { "epoch": 0.2302066129630254, "grad_norm": 15.599539756774902, "learning_rate": 9.492410352526423e-07, "logits/chosen": -0.49592137336730957, "logits/rejected": -0.650071918964386, "logps/chosen": -2.09785795211792, "logps/rejected": -1.9914875030517578, "loss": 4.4761, "rewards/accuracies": 0.5, "rewards/chosen": -20.978580474853516, "rewards/margins": -1.063704252243042, "rewards/rejected": -19.914875030517578, "step": 6830 }, { "epoch": 0.23037513903400855, "grad_norm": 22.581743240356445, "learning_rate": 9.491118294412423e-07, "logits/chosen": -0.7487185597419739, "logits/rejected": -0.6236433982849121, "logps/chosen": -1.9754031896591187, "logps/rejected": -2.0959200859069824, "loss": 3.6634, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.754032135009766, "rewards/margins": 1.2051713466644287, "rewards/rejected": -20.959203720092773, "step": 6835 }, { "epoch": 0.23054366510499175, "grad_norm": 22.873687744140625, "learning_rate": 9.489824682099327e-07, "logits/chosen": -0.5404466390609741, "logits/rejected": -0.8129202127456665, "logps/chosen": -2.1067965030670166, "logps/rejected": -2.1464054584503174, "loss": 3.3768, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.067964553833008, "rewards/margins": 0.3960878252983093, "rewards/rejected": -21.464054107666016, "step": 6840 }, { "epoch": 0.2307121911759749, "grad_norm": 23.130495071411133, "learning_rate": 9.488529516034799e-07, "logits/chosen": -0.5961964130401611, "logits/rejected": -0.6887432336807251, "logps/chosen": -1.951284408569336, "logps/rejected": -2.1935133934020996, "loss": 2.2539, "rewards/accuracies": 0.5, "rewards/chosen": -19.51284408569336, "rewards/margins": 2.4222893714904785, "rewards/rejected": -21.935134887695312, "step": 6845 }, { "epoch": 0.2308807172469581, "grad_norm": 25.29384422302246, "learning_rate": 9.487232796667046e-07, "logits/chosen": -0.7350292205810547, "logits/rejected": -0.6262162327766418, "logps/chosen": -2.2591564655303955, "logps/rejected": -2.370372772216797, "loss": 2.8626, "rewards/accuracies": 0.5, "rewards/chosen": -22.591564178466797, "rewards/margins": 1.112162709236145, "rewards/rejected": -23.703725814819336, "step": 6850 }, { "epoch": 0.23104924331794127, "grad_norm": 54.073036193847656, "learning_rate": 9.485934524444814e-07, "logits/chosen": -0.3840603828430176, "logits/rejected": -0.47770124673843384, "logps/chosen": -2.075129270553589, "logps/rejected": -2.033393383026123, "loss": 3.5247, "rewards/accuracies": 0.5, "rewards/chosen": -20.751293182373047, "rewards/margins": -0.41735896468162537, "rewards/rejected": -20.333934783935547, "step": 6855 }, { "epoch": 0.23121776938892447, "grad_norm": 15.75258731842041, "learning_rate": 9.48463469981738e-07, "logits/chosen": -0.636978268623352, "logits/rejected": -0.5728309750556946, "logps/chosen": -1.620154619216919, "logps/rejected": -1.6520893573760986, "loss": 3.1537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.201547622680664, "rewards/margins": 0.31934672594070435, "rewards/rejected": -16.520893096923828, "step": 6860 }, { "epoch": 0.23138629545990766, "grad_norm": 20.73732566833496, "learning_rate": 9.483333323234564e-07, "logits/chosen": -0.29280218482017517, "logits/rejected": -0.3819066882133484, "logps/chosen": -2.0080299377441406, "logps/rejected": -2.111577272415161, "loss": 2.4741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.08030128479004, "rewards/margins": 1.0354706048965454, "rewards/rejected": -21.11577033996582, "step": 6865 }, { "epoch": 0.23155482153089083, "grad_norm": 39.25248336791992, "learning_rate": 9.482030395146721e-07, "logits/chosen": -0.1386869251728058, "logits/rejected": -0.34750866889953613, "logps/chosen": -1.9895210266113281, "logps/rejected": -1.9648908376693726, "loss": 3.6539, "rewards/accuracies": 0.5, "rewards/chosen": -19.89521026611328, "rewards/margins": -0.24630098044872284, "rewards/rejected": -19.648908615112305, "step": 6870 }, { "epoch": 0.23172334760187402, "grad_norm": 39.787757873535156, "learning_rate": 9.480725916004744e-07, "logits/chosen": -0.5389952659606934, "logits/rejected": -0.626449465751648, "logps/chosen": -1.7958948612213135, "logps/rejected": -1.9659671783447266, "loss": 2.1301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.958948135375977, "rewards/margins": 1.7007251977920532, "rewards/rejected": -19.659671783447266, "step": 6875 }, { "epoch": 0.23189187367285718, "grad_norm": 27.43910789489746, "learning_rate": 9.479419886260062e-07, "logits/chosen": -0.7904404997825623, "logits/rejected": -0.6668750047683716, "logps/chosen": -1.8657734394073486, "logps/rejected": -2.2539117336273193, "loss": 1.8542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.65773582458496, "rewards/margins": 3.8813834190368652, "rewards/rejected": -22.53911781311035, "step": 6880 }, { "epoch": 0.23206039974384038, "grad_norm": 27.898033142089844, "learning_rate": 9.478112306364639e-07, "logits/chosen": -0.7889149785041809, "logits/rejected": -0.6062323451042175, "logps/chosen": -2.0559146404266357, "logps/rejected": -2.083693265914917, "loss": 4.3504, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.559146881103516, "rewards/margins": 0.2777865529060364, "rewards/rejected": -20.836933135986328, "step": 6885 }, { "epoch": 0.23222892581482354, "grad_norm": 28.48827362060547, "learning_rate": 9.476803176770975e-07, "logits/chosen": -0.7070311307907104, "logits/rejected": -0.6959258913993835, "logps/chosen": -1.712421178817749, "logps/rejected": -1.7417049407958984, "loss": 2.8571, "rewards/accuracies": 0.5, "rewards/chosen": -17.12421226501465, "rewards/margins": 0.2928358018398285, "rewards/rejected": -17.417049407958984, "step": 6890 }, { "epoch": 0.23239745188580674, "grad_norm": 43.162017822265625, "learning_rate": 9.475492497932113e-07, "logits/chosen": -0.1779576987028122, "logits/rejected": -0.1989670693874359, "logps/chosen": -2.1419949531555176, "logps/rejected": -2.124372959136963, "loss": 4.1412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.41994857788086, "rewards/margins": -0.1762198507785797, "rewards/rejected": -21.243728637695312, "step": 6895 }, { "epoch": 0.2325659779567899, "grad_norm": 31.642684936523438, "learning_rate": 9.474180270301624e-07, "logits/chosen": -0.42405325174331665, "logits/rejected": -0.5832849740982056, "logps/chosen": -1.8670570850372314, "logps/rejected": -2.023833990097046, "loss": 1.9224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.67057228088379, "rewards/margins": 1.56776762008667, "rewards/rejected": -20.238338470458984, "step": 6900 }, { "epoch": 0.2327345040277731, "grad_norm": 30.18916130065918, "learning_rate": 9.472866494333618e-07, "logits/chosen": -0.18663057684898376, "logits/rejected": -0.2984737455844879, "logps/chosen": -2.2856342792510986, "logps/rejected": -2.303093910217285, "loss": 3.2669, "rewards/accuracies": 0.5, "rewards/chosen": -22.856340408325195, "rewards/margins": 0.17459754645824432, "rewards/rejected": -23.03093719482422, "step": 6905 }, { "epoch": 0.23290303009875626, "grad_norm": 19.05219078063965, "learning_rate": 9.471551170482744e-07, "logits/chosen": -0.4543988108634949, "logits/rejected": -0.5632656812667847, "logps/chosen": -1.767507791519165, "logps/rejected": -2.026975393295288, "loss": 2.1611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.67507553100586, "rewards/margins": 2.5946767330169678, "rewards/rejected": -20.26975440979004, "step": 6910 }, { "epoch": 0.23307155616973946, "grad_norm": 22.81329345703125, "learning_rate": 9.47023429920418e-07, "logits/chosen": -0.5807046890258789, "logits/rejected": -0.6248041987419128, "logps/chosen": -1.697127103805542, "logps/rejected": -1.7438815832138062, "loss": 2.669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.97127342224121, "rewards/margins": 0.46754246950149536, "rewards/rejected": -17.43881607055664, "step": 6915 }, { "epoch": 0.23324008224072265, "grad_norm": 30.798070907592773, "learning_rate": 9.468915880953648e-07, "logits/chosen": -0.5077857971191406, "logits/rejected": -0.5084593296051025, "logps/chosen": -1.7491766214370728, "logps/rejected": -1.8958772420883179, "loss": 1.9738, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.49176597595215, "rewards/margins": 1.4670063257217407, "rewards/rejected": -18.958772659301758, "step": 6920 }, { "epoch": 0.23340860831170582, "grad_norm": 24.726884841918945, "learning_rate": 9.467595916187396e-07, "logits/chosen": -0.27916693687438965, "logits/rejected": -0.2978662848472595, "logps/chosen": -2.0224666595458984, "logps/rejected": -2.0680315494537354, "loss": 2.9168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.224666595458984, "rewards/margins": 0.4556505084037781, "rewards/rejected": -20.68031883239746, "step": 6925 }, { "epoch": 0.233577134382689, "grad_norm": 18.920076370239258, "learning_rate": 9.466274405362214e-07, "logits/chosen": -0.8002731204032898, "logits/rejected": -0.8154155015945435, "logps/chosen": -1.664515733718872, "logps/rejected": -1.9041354656219482, "loss": 2.3318, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.645156860351562, "rewards/margins": 2.396198272705078, "rewards/rejected": -19.04135513305664, "step": 6930 }, { "epoch": 0.23374566045367218, "grad_norm": 25.981199264526367, "learning_rate": 9.464951348935424e-07, "logits/chosen": -0.7253482937812805, "logits/rejected": -0.8367172479629517, "logps/chosen": -1.6983642578125, "logps/rejected": -1.812461495399475, "loss": 2.4775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.983642578125, "rewards/margins": 1.1409722566604614, "rewards/rejected": -18.124614715576172, "step": 6935 }, { "epoch": 0.23391418652465537, "grad_norm": 28.854867935180664, "learning_rate": 9.463626747364886e-07, "logits/chosen": -0.6369872689247131, "logits/rejected": -0.6354336142539978, "logps/chosen": -1.8397117853164673, "logps/rejected": -2.0742554664611816, "loss": 1.956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.397119522094727, "rewards/margins": 2.345435619354248, "rewards/rejected": -20.7425537109375, "step": 6940 }, { "epoch": 0.23408271259563854, "grad_norm": 16.387676239013672, "learning_rate": 9.462300601108988e-07, "logits/chosen": -0.6813743114471436, "logits/rejected": -0.682715117931366, "logps/chosen": -1.463041067123413, "logps/rejected": -1.5665823221206665, "loss": 2.3658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.630411148071289, "rewards/margins": 1.0354135036468506, "rewards/rejected": -15.665822982788086, "step": 6945 }, { "epoch": 0.23425123866662173, "grad_norm": 20.10076141357422, "learning_rate": 9.460972910626661e-07, "logits/chosen": -0.474844366312027, "logits/rejected": -0.5014239549636841, "logps/chosen": -1.6983531713485718, "logps/rejected": -2.114022731781006, "loss": 2.2609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.983531951904297, "rewards/margins": 4.15669584274292, "rewards/rejected": -21.140228271484375, "step": 6950 }, { "epoch": 0.2344197647376049, "grad_norm": 53.66566467285156, "learning_rate": 9.459643676377364e-07, "logits/chosen": 0.0645643025636673, "logits/rejected": -0.021223559975624084, "logps/chosen": -2.0207462310791016, "logps/rejected": -1.9168494939804077, "loss": 4.0967, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.207462310791016, "rewards/margins": -1.0389666557312012, "rewards/rejected": -19.168495178222656, "step": 6955 }, { "epoch": 0.2345882908085881, "grad_norm": 15.13321304321289, "learning_rate": 9.458312898821095e-07, "logits/chosen": -0.8532525897026062, "logits/rejected": -0.8107415437698364, "logps/chosen": -1.4715559482574463, "logps/rejected": -1.6997382640838623, "loss": 1.9906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.715560913085938, "rewards/margins": 2.281822681427002, "rewards/rejected": -16.99738121032715, "step": 6960 }, { "epoch": 0.23475681687957126, "grad_norm": 20.282747268676758, "learning_rate": 9.456980578418384e-07, "logits/chosen": -0.2857280969619751, "logits/rejected": -0.3203180730342865, "logps/chosen": -1.5712473392486572, "logps/rejected": -1.6090400218963623, "loss": 2.907, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.71247386932373, "rewards/margins": 0.3779268264770508, "rewards/rejected": -16.09040069580078, "step": 6965 }, { "epoch": 0.23492534295055445, "grad_norm": 29.665863037109375, "learning_rate": 9.455646715630289e-07, "logits/chosen": -0.34003663063049316, "logits/rejected": -0.46588826179504395, "logps/chosen": -1.9911762475967407, "logps/rejected": -1.999943494796753, "loss": 3.2851, "rewards/accuracies": 0.5, "rewards/chosen": -19.911760330200195, "rewards/margins": 0.08767547458410263, "rewards/rejected": -19.999435424804688, "step": 6970 }, { "epoch": 0.23509386902153764, "grad_norm": 15.373470306396484, "learning_rate": 9.454311310918413e-07, "logits/chosen": -0.715580403804779, "logits/rejected": -0.6359925270080566, "logps/chosen": -1.8380359411239624, "logps/rejected": -1.8280082941055298, "loss": 3.3255, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.38035774230957, "rewards/margins": -0.10027551651000977, "rewards/rejected": -18.28008460998535, "step": 6975 }, { "epoch": 0.2352623950925208, "grad_norm": 39.4725341796875, "learning_rate": 9.452974364744884e-07, "logits/chosen": -0.8144010305404663, "logits/rejected": -0.9141764640808105, "logps/chosen": -1.7286930084228516, "logps/rejected": -1.7226985692977905, "loss": 3.2424, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.286928176879883, "rewards/margins": -0.059942055493593216, "rewards/rejected": -17.226985931396484, "step": 6980 }, { "epoch": 0.235430921163504, "grad_norm": 204.15028381347656, "learning_rate": 9.451635877572368e-07, "logits/chosen": -0.797519326210022, "logits/rejected": -0.7118474841117859, "logps/chosen": -2.037544012069702, "logps/rejected": -2.163386344909668, "loss": 2.3279, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.37544059753418, "rewards/margins": 1.2584235668182373, "rewards/rejected": -21.63386344909668, "step": 6985 }, { "epoch": 0.23559944723448717, "grad_norm": 15.090885162353516, "learning_rate": 9.450295849864063e-07, "logits/chosen": -0.5462337136268616, "logits/rejected": -0.5176796317100525, "logps/chosen": -1.770716667175293, "logps/rejected": -1.8121637105941772, "loss": 2.8383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.70716667175293, "rewards/margins": 0.4144694209098816, "rewards/rejected": -18.12163734436035, "step": 6990 }, { "epoch": 0.23576797330547036, "grad_norm": 21.92974090576172, "learning_rate": 9.448954282083699e-07, "logits/chosen": -0.7090522050857544, "logits/rejected": -0.6703694462776184, "logps/chosen": -1.772658109664917, "logps/rejected": -1.7615941762924194, "loss": 3.6834, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.726581573486328, "rewards/margins": -0.11064042896032333, "rewards/rejected": -17.615942001342773, "step": 6995 }, { "epoch": 0.23593649937645353, "grad_norm": 29.28874397277832, "learning_rate": 9.44761117469554e-07, "logits/chosen": -0.44356757402420044, "logits/rejected": -0.4245205819606781, "logps/chosen": -1.6671860218048096, "logps/rejected": -1.8381659984588623, "loss": 2.5147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.671859741210938, "rewards/margins": 1.7098020315170288, "rewards/rejected": -18.381662368774414, "step": 7000 }, { "epoch": 0.23610502544743672, "grad_norm": 22.89272117614746, "learning_rate": 9.446266528164382e-07, "logits/chosen": -0.7267704010009766, "logits/rejected": -0.8729580640792847, "logps/chosen": -1.7048250436782837, "logps/rejected": -1.7571985721588135, "loss": 2.688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.048248291015625, "rewards/margins": 0.5237363576889038, "rewards/rejected": -17.571985244750977, "step": 7005 }, { "epoch": 0.2362735515184199, "grad_norm": 12.386322975158691, "learning_rate": 9.444920342955553e-07, "logits/chosen": -0.6881308555603027, "logits/rejected": -0.7025818824768066, "logps/chosen": -2.3889026641845703, "logps/rejected": -2.4722933769226074, "loss": 3.3052, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.889026641845703, "rewards/margins": 0.8339089155197144, "rewards/rejected": -24.72293472290039, "step": 7010 }, { "epoch": 0.23644207758940308, "grad_norm": 29.82985496520996, "learning_rate": 9.443572619534917e-07, "logits/chosen": -0.46417030692100525, "logits/rejected": -0.5751253366470337, "logps/chosen": -1.8959985971450806, "logps/rejected": -2.1164956092834473, "loss": 2.7809, "rewards/accuracies": 0.5, "rewards/chosen": -18.959985733032227, "rewards/margins": 2.204969882965088, "rewards/rejected": -21.16495704650879, "step": 7015 }, { "epoch": 0.23661060366038625, "grad_norm": 28.779356002807617, "learning_rate": 9.442223358368868e-07, "logits/chosen": -0.9354928135871887, "logits/rejected": -0.8245365023612976, "logps/chosen": -1.950887680053711, "logps/rejected": -2.124640703201294, "loss": 2.5568, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.50887680053711, "rewards/margins": 1.7375301122665405, "rewards/rejected": -21.24640655517578, "step": 7020 }, { "epoch": 0.23677912973136944, "grad_norm": 22.374792098999023, "learning_rate": 9.440872559924331e-07, "logits/chosen": -0.19462314248085022, "logits/rejected": -0.1797332465648651, "logps/chosen": -2.0091001987457275, "logps/rejected": -1.94009530544281, "loss": 3.8656, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.090999603271484, "rewards/margins": -0.6900471448898315, "rewards/rejected": -19.40095329284668, "step": 7025 }, { "epoch": 0.23694765580235264, "grad_norm": 79.91705322265625, "learning_rate": 9.439520224668764e-07, "logits/chosen": -0.597054660320282, "logits/rejected": -0.6568640470504761, "logps/chosen": -1.9609358310699463, "logps/rejected": -2.1057090759277344, "loss": 2.5551, "rewards/accuracies": 0.5, "rewards/chosen": -19.609355926513672, "rewards/margins": 1.4477331638336182, "rewards/rejected": -21.057090759277344, "step": 7030 }, { "epoch": 0.2371161818733358, "grad_norm": 20.779041290283203, "learning_rate": 9.438166353070158e-07, "logits/chosen": -0.8121329545974731, "logits/rejected": -0.8105417490005493, "logps/chosen": -1.6662849187850952, "logps/rejected": -1.689305305480957, "loss": 2.9107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.6628475189209, "rewards/margins": 0.23020382225513458, "rewards/rejected": -16.893051147460938, "step": 7035 }, { "epoch": 0.237284707944319, "grad_norm": 30.07358741760254, "learning_rate": 9.436810945597034e-07, "logits/chosen": -0.8572260141372681, "logits/rejected": -0.9229756593704224, "logps/chosen": -1.8019100427627563, "logps/rejected": -1.8127281665802002, "loss": 3.3009, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.019100189208984, "rewards/margins": 0.10818271338939667, "rewards/rejected": -18.127283096313477, "step": 7040 }, { "epoch": 0.23745323401530216, "grad_norm": 18.86367416381836, "learning_rate": 9.435454002718444e-07, "logits/chosen": -0.6191602349281311, "logits/rejected": -0.5740264654159546, "logps/chosen": -1.707728624343872, "logps/rejected": -1.9487249851226807, "loss": 2.1382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.077285766601562, "rewards/margins": 2.4099647998809814, "rewards/rejected": -19.48724937438965, "step": 7045 }, { "epoch": 0.23762176008628536, "grad_norm": 17.491254806518555, "learning_rate": 9.434095524903974e-07, "logits/chosen": -0.5991761088371277, "logits/rejected": -0.7022507786750793, "logps/chosen": -1.9639873504638672, "logps/rejected": -1.9689788818359375, "loss": 3.1378, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.639873504638672, "rewards/margins": 0.04991502687335014, "rewards/rejected": -19.689788818359375, "step": 7050 }, { "epoch": 0.23779028615726852, "grad_norm": 36.60987854003906, "learning_rate": 9.43273551262374e-07, "logits/chosen": -0.8516243696212769, "logits/rejected": -0.8240333795547485, "logps/chosen": -1.865692138671875, "logps/rejected": -1.9466756582260132, "loss": 3.1045, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.656917572021484, "rewards/margins": 0.8098365068435669, "rewards/rejected": -19.46675682067871, "step": 7055 }, { "epoch": 0.23795881222825171, "grad_norm": 16.094783782958984, "learning_rate": 9.431373966348387e-07, "logits/chosen": -0.38345012068748474, "logits/rejected": -0.4076191484928131, "logps/chosen": -2.2534797191619873, "logps/rejected": -2.212144613265991, "loss": 3.5651, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.53479766845703, "rewards/margins": -0.413351833820343, "rewards/rejected": -22.121448516845703, "step": 7060 }, { "epoch": 0.23812733829923488, "grad_norm": 26.482032775878906, "learning_rate": 9.430010886549094e-07, "logits/chosen": -0.7364832162857056, "logits/rejected": -0.6639753580093384, "logps/chosen": -2.0670037269592285, "logps/rejected": -1.7953437566757202, "loss": 5.8885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.670034408569336, "rewards/margins": -2.7165987491607666, "rewards/rejected": -17.95343589782715, "step": 7065 }, { "epoch": 0.23829586437021807, "grad_norm": 23.36323356628418, "learning_rate": 9.428646273697568e-07, "logits/chosen": -0.4462354779243469, "logits/rejected": -0.5697998404502869, "logps/chosen": -2.318197727203369, "logps/rejected": -2.4804301261901855, "loss": 2.7402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.181976318359375, "rewards/margins": 1.6223284006118774, "rewards/rejected": -24.804302215576172, "step": 7070 }, { "epoch": 0.23846439044120124, "grad_norm": 29.695451736450195, "learning_rate": 9.427280128266049e-07, "logits/chosen": -0.09391313791275024, "logits/rejected": -0.21296796202659607, "logps/chosen": -1.7834405899047852, "logps/rejected": -1.8914865255355835, "loss": 2.8719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.83440589904785, "rewards/margins": 1.0804593563079834, "rewards/rejected": -18.91486358642578, "step": 7075 }, { "epoch": 0.23863291651218443, "grad_norm": 18.212629318237305, "learning_rate": 9.425912450727305e-07, "logits/chosen": -0.32436639070510864, "logits/rejected": -0.4176466464996338, "logps/chosen": -1.747227668762207, "logps/rejected": -2.081019639968872, "loss": 1.5811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.472274780273438, "rewards/margins": 3.3379178047180176, "rewards/rejected": -20.810192108154297, "step": 7080 }, { "epoch": 0.23880144258316763, "grad_norm": 21.501232147216797, "learning_rate": 9.424543241554637e-07, "logits/chosen": -0.5840815901756287, "logits/rejected": -0.45451974868774414, "logps/chosen": -2.1316933631896973, "logps/rejected": -2.056612014770508, "loss": 3.951, "rewards/accuracies": 0.5, "rewards/chosen": -21.316936492919922, "rewards/margins": -0.7508147954940796, "rewards/rejected": -20.566120147705078, "step": 7085 }, { "epoch": 0.2389699686541508, "grad_norm": 41.77083969116211, "learning_rate": 9.423172501221872e-07, "logits/chosen": -0.40182381868362427, "logits/rejected": -0.4287623465061188, "logps/chosen": -1.449648141860962, "logps/rejected": -1.610764741897583, "loss": 2.4708, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.496480941772461, "rewards/margins": 1.6111669540405273, "rewards/rejected": -16.107648849487305, "step": 7090 }, { "epoch": 0.239138494725134, "grad_norm": 21.541324615478516, "learning_rate": 9.42180023020337e-07, "logits/chosen": -0.6269460320472717, "logits/rejected": -0.5908786058425903, "logps/chosen": -1.9286348819732666, "logps/rejected": -1.9429540634155273, "loss": 3.1871, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.286346435546875, "rewards/margins": 0.14319276809692383, "rewards/rejected": -19.429540634155273, "step": 7095 }, { "epoch": 0.23930702079611715, "grad_norm": 40.71235275268555, "learning_rate": 9.420426428974021e-07, "logits/chosen": -0.45769214630126953, "logits/rejected": -0.7178536653518677, "logps/chosen": -2.998274326324463, "logps/rejected": -2.8907580375671387, "loss": 5.4313, "rewards/accuracies": 0.5, "rewards/chosen": -29.982742309570312, "rewards/margins": -1.0751609802246094, "rewards/rejected": -28.907581329345703, "step": 7100 }, { "epoch": 0.23947554686710035, "grad_norm": 21.48967933654785, "learning_rate": 9.419051098009243e-07, "logits/chosen": -0.5916844606399536, "logits/rejected": -0.6824017763137817, "logps/chosen": -1.5144556760787964, "logps/rejected": -1.598915457725525, "loss": 2.7017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.144556999206543, "rewards/margins": 0.844598650932312, "rewards/rejected": -15.989156723022461, "step": 7105 }, { "epoch": 0.2396440729380835, "grad_norm": 25.29039764404297, "learning_rate": 9.417674237784983e-07, "logits/chosen": -0.4818636476993561, "logits/rejected": -0.5085574984550476, "logps/chosen": -1.6022964715957642, "logps/rejected": -1.549843668937683, "loss": 3.662, "rewards/accuracies": 0.5, "rewards/chosen": -16.02296257019043, "rewards/margins": -0.5245256423950195, "rewards/rejected": -15.498437881469727, "step": 7110 }, { "epoch": 0.2398125990090667, "grad_norm": 23.890718460083008, "learning_rate": 9.416295848777718e-07, "logits/chosen": -0.8053399920463562, "logits/rejected": -0.7373215556144714, "logps/chosen": -1.9440422058105469, "logps/rejected": -1.8284790515899658, "loss": 4.4745, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.440420150756836, "rewards/margins": -1.1556302309036255, "rewards/rejected": -18.2847900390625, "step": 7115 }, { "epoch": 0.23998112508004987, "grad_norm": 26.433320999145508, "learning_rate": 9.414915931464456e-07, "logits/chosen": -0.8600558042526245, "logits/rejected": -0.8631598353385925, "logps/chosen": -1.7601732015609741, "logps/rejected": -1.8001441955566406, "loss": 2.8108, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.601734161376953, "rewards/margins": 0.3997109532356262, "rewards/rejected": -18.00144386291504, "step": 7120 }, { "epoch": 0.24014965115103307, "grad_norm": 26.806915283203125, "learning_rate": 9.413534486322732e-07, "logits/chosen": -0.5686289072036743, "logits/rejected": -0.6157848834991455, "logps/chosen": -1.79929518699646, "logps/rejected": -1.975182294845581, "loss": 2.1733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.992952346801758, "rewards/margins": 1.7588691711425781, "rewards/rejected": -19.751821517944336, "step": 7125 }, { "epoch": 0.24031817722201623, "grad_norm": 41.423851013183594, "learning_rate": 9.412151513830606e-07, "logits/chosen": -0.6805993914604187, "logits/rejected": -0.9111586809158325, "logps/chosen": -2.1881282329559326, "logps/rejected": -2.279981851577759, "loss": 2.4858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.881282806396484, "rewards/margins": 0.9185339212417603, "rewards/rejected": -22.799816131591797, "step": 7130 }, { "epoch": 0.24048670329299943, "grad_norm": 23.80078887939453, "learning_rate": 9.410767014466675e-07, "logits/chosen": -0.514999508857727, "logits/rejected": -0.4333207607269287, "logps/chosen": -1.9776651859283447, "logps/rejected": -2.026716709136963, "loss": 2.8135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.77665138244629, "rewards/margins": 0.490518182516098, "rewards/rejected": -20.267169952392578, "step": 7135 }, { "epoch": 0.24065522936398262, "grad_norm": 22.437442779541016, "learning_rate": 9.409380988710057e-07, "logits/chosen": -0.6936476230621338, "logits/rejected": -0.7516869306564331, "logps/chosen": -1.657705545425415, "logps/rejected": -1.6317991018295288, "loss": 3.6723, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.577054977416992, "rewards/margins": -0.25906410813331604, "rewards/rejected": -16.317991256713867, "step": 7140 }, { "epoch": 0.24082375543496579, "grad_norm": 29.402507781982422, "learning_rate": 9.4079934370404e-07, "logits/chosen": -0.44884634017944336, "logits/rejected": -0.40613657236099243, "logps/chosen": -2.073878049850464, "logps/rejected": -2.1649553775787354, "loss": 2.7853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.738779067993164, "rewards/margins": 0.9107732772827148, "rewards/rejected": -21.649551391601562, "step": 7145 }, { "epoch": 0.24099228150594898, "grad_norm": 25.87431526184082, "learning_rate": 9.406604359937884e-07, "logits/chosen": -0.9141045808792114, "logits/rejected": -0.8093926310539246, "logps/chosen": -1.623300552368164, "logps/rejected": -1.6307185888290405, "loss": 3.0879, "rewards/accuracies": 0.5, "rewards/chosen": -16.23300552368164, "rewards/margins": 0.07417931407690048, "rewards/rejected": -16.307186126708984, "step": 7150 }, { "epoch": 0.24116080757693215, "grad_norm": 17.400171279907227, "learning_rate": 9.405213757883212e-07, "logits/chosen": -0.9052945375442505, "logits/rejected": -0.8418493270874023, "logps/chosen": -1.7425527572631836, "logps/rejected": -2.0263311862945557, "loss": 2.0622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.425527572631836, "rewards/margins": 2.837782621383667, "rewards/rejected": -20.2633113861084, "step": 7155 }, { "epoch": 0.24132933364791534, "grad_norm": 18.482860565185547, "learning_rate": 9.403821631357618e-07, "logits/chosen": -0.7080804109573364, "logits/rejected": -0.7618001103401184, "logps/chosen": -1.8975080251693726, "logps/rejected": -2.0444533824920654, "loss": 2.5719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.975080490112305, "rewards/margins": 1.4694522619247437, "rewards/rejected": -20.444534301757812, "step": 7160 }, { "epoch": 0.2414978597188985, "grad_norm": 16.333810806274414, "learning_rate": 9.40242798084286e-07, "logits/chosen": -1.0082132816314697, "logits/rejected": -0.8503907322883606, "logps/chosen": -1.8380146026611328, "logps/rejected": -1.8924537897109985, "loss": 3.4038, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.380146026611328, "rewards/margins": 0.5443927049636841, "rewards/rejected": -18.924537658691406, "step": 7165 }, { "epoch": 0.2416663857898817, "grad_norm": 25.957456588745117, "learning_rate": 9.401032806821227e-07, "logits/chosen": -0.6452234387397766, "logits/rejected": -0.5962679982185364, "logps/chosen": -1.7202945947647095, "logps/rejected": -1.86736261844635, "loss": 2.2991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.20294761657715, "rewards/margins": 1.4706783294677734, "rewards/rejected": -18.67362403869629, "step": 7170 }, { "epoch": 0.24183491186086487, "grad_norm": 27.938039779663086, "learning_rate": 9.399636109775531e-07, "logits/chosen": -0.5970734357833862, "logits/rejected": -0.6353007555007935, "logps/chosen": -1.7174265384674072, "logps/rejected": -1.934348702430725, "loss": 1.799, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.174264907836914, "rewards/margins": 2.169222354888916, "rewards/rejected": -19.343486785888672, "step": 7175 }, { "epoch": 0.24200343793184806, "grad_norm": 32.18524932861328, "learning_rate": 9.398237890189119e-07, "logits/chosen": -0.6640039682388306, "logits/rejected": -0.6827796697616577, "logps/chosen": -1.8866550922393799, "logps/rejected": -1.8781245946884155, "loss": 3.2777, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.866552352905273, "rewards/margins": -0.08530483394861221, "rewards/rejected": -18.781246185302734, "step": 7180 }, { "epoch": 0.24217196400283122, "grad_norm": 17.34172821044922, "learning_rate": 9.396838148545855e-07, "logits/chosen": -0.5536460876464844, "logits/rejected": -0.5025007724761963, "logps/chosen": -2.2877159118652344, "logps/rejected": -2.4163801670074463, "loss": 2.5845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.877161026000977, "rewards/margins": 1.2866411209106445, "rewards/rejected": -24.163801193237305, "step": 7185 }, { "epoch": 0.24234049007381442, "grad_norm": 28.588186264038086, "learning_rate": 9.395436885330138e-07, "logits/chosen": -0.38710886240005493, "logits/rejected": -0.3867741525173187, "logps/chosen": -2.279740810394287, "logps/rejected": -2.194554090499878, "loss": 4.1465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.797407150268555, "rewards/margins": -0.8518630862236023, "rewards/rejected": -21.945545196533203, "step": 7190 }, { "epoch": 0.2425090161447976, "grad_norm": 20.487680435180664, "learning_rate": 9.394034101026887e-07, "logits/chosen": -0.5616172552108765, "logits/rejected": -0.6295533776283264, "logps/chosen": -1.7545499801635742, "logps/rejected": -2.3229641914367676, "loss": 2.2103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.54549789428711, "rewards/margins": 5.684144020080566, "rewards/rejected": -23.22964096069336, "step": 7195 }, { "epoch": 0.24267754221578078, "grad_norm": 25.60300064086914, "learning_rate": 9.392629796121552e-07, "logits/chosen": -0.5492648482322693, "logits/rejected": -0.5753589272499084, "logps/chosen": -1.7553179264068604, "logps/rejected": -1.741642951965332, "loss": 3.5349, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.553180694580078, "rewards/margins": -0.13674993813037872, "rewards/rejected": -17.41642951965332, "step": 7200 }, { "epoch": 0.24267754221578078, "eval_logits/chosen": -0.8416627645492554, "eval_logits/rejected": -0.8779700994491577, "eval_logps/chosen": -1.766628384590149, "eval_logps/rejected": -1.8019263744354248, "eval_loss": 3.2034659385681152, "eval_rewards/accuracies": 0.5799999833106995, "eval_rewards/chosen": -17.666284561157227, "eval_rewards/margins": 0.35297882556915283, "eval_rewards/rejected": -18.019264221191406, "eval_runtime": 12.9006, "eval_samples_per_second": 7.752, "eval_steps_per_second": 1.938, "step": 7200 }, { "epoch": 0.24284606828676397, "grad_norm": 25.423349380493164, "learning_rate": 9.391223971100108e-07, "logits/chosen": -0.3729293942451477, "logits/rejected": -0.44401878118515015, "logps/chosen": -1.965648889541626, "logps/rejected": -2.294408082962036, "loss": 2.6272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.6564884185791, "rewards/margins": 3.287592649459839, "rewards/rejected": -22.944080352783203, "step": 7205 }, { "epoch": 0.24301459435774714, "grad_norm": 15.547274589538574, "learning_rate": 9.389816626449054e-07, "logits/chosen": -0.8222540020942688, "logits/rejected": -0.9664362072944641, "logps/chosen": -1.8262207508087158, "logps/rejected": -1.713092565536499, "loss": 4.6239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.26220703125, "rewards/margins": -1.1312811374664307, "rewards/rejected": -17.13092613220215, "step": 7210 }, { "epoch": 0.24318312042873033, "grad_norm": 28.496145248413086, "learning_rate": 9.388407762655418e-07, "logits/chosen": -0.5452624559402466, "logits/rejected": -0.4920075833797455, "logps/chosen": -1.8025963306427002, "logps/rejected": -1.8232835531234741, "loss": 3.112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.025964736938477, "rewards/margins": 0.20687207579612732, "rewards/rejected": -18.23283576965332, "step": 7215 }, { "epoch": 0.2433516464997135, "grad_norm": 29.377939224243164, "learning_rate": 9.386997380206751e-07, "logits/chosen": -0.7790455222129822, "logits/rejected": -0.8049925565719604, "logps/chosen": -1.6588413715362549, "logps/rejected": -1.706597089767456, "loss": 2.6962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.58841323852539, "rewards/margins": 0.4775591790676117, "rewards/rejected": -17.06597328186035, "step": 7220 }, { "epoch": 0.2435201725706967, "grad_norm": 24.955659866333008, "learning_rate": 9.385585479591133e-07, "logits/chosen": -0.3855653405189514, "logits/rejected": -0.3951939046382904, "logps/chosen": -1.7210830450057983, "logps/rejected": -1.7640886306762695, "loss": 2.962, "rewards/accuracies": 0.5, "rewards/chosen": -17.210830688476562, "rewards/margins": 0.4300549626350403, "rewards/rejected": -17.640888214111328, "step": 7225 }, { "epoch": 0.24368869864167986, "grad_norm": 24.871198654174805, "learning_rate": 9.384172061297165e-07, "logits/chosen": -1.087990403175354, "logits/rejected": -1.0538965463638306, "logps/chosen": -1.6797294616699219, "logps/rejected": -1.7016398906707764, "loss": 2.9103, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.79729652404785, "rewards/margins": 0.21910543739795685, "rewards/rejected": -17.016399383544922, "step": 7230 }, { "epoch": 0.24385722471266305, "grad_norm": 47.272247314453125, "learning_rate": 9.382757125813975e-07, "logits/chosen": -0.5247252583503723, "logits/rejected": -0.6660190224647522, "logps/chosen": -1.898813247680664, "logps/rejected": -1.8705341815948486, "loss": 3.3691, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.98813247680664, "rewards/margins": -0.28279104828834534, "rewards/rejected": -18.705341339111328, "step": 7235 }, { "epoch": 0.24402575078364622, "grad_norm": 28.096302032470703, "learning_rate": 9.381340673631217e-07, "logits/chosen": -0.7041982412338257, "logits/rejected": -0.887669563293457, "logps/chosen": -1.9297678470611572, "logps/rejected": -1.8985267877578735, "loss": 3.6096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.297677993774414, "rewards/margins": -0.312410831451416, "rewards/rejected": -18.985267639160156, "step": 7240 }, { "epoch": 0.2441942768546294, "grad_norm": 29.46830940246582, "learning_rate": 9.379922705239072e-07, "logits/chosen": -0.7096506357192993, "logits/rejected": -0.776728630065918, "logps/chosen": -1.73779296875, "logps/rejected": -1.6395982503890991, "loss": 4.1988, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.3779296875, "rewards/margins": -0.9819458723068237, "rewards/rejected": -16.395984649658203, "step": 7245 }, { "epoch": 0.2443628029256126, "grad_norm": 34.96768569946289, "learning_rate": 9.37850322112824e-07, "logits/chosen": -0.23200814425945282, "logits/rejected": -0.40445584058761597, "logps/chosen": -1.738954782485962, "logps/rejected": -1.8703186511993408, "loss": 2.2738, "rewards/accuracies": 0.5, "rewards/chosen": -17.38954734802246, "rewards/margins": 1.313637137413025, "rewards/rejected": -18.703184127807617, "step": 7250 }, { "epoch": 0.24453132899659577, "grad_norm": 16.488719940185547, "learning_rate": 9.377082221789949e-07, "logits/chosen": -0.9548704028129578, "logits/rejected": -1.0706945657730103, "logps/chosen": -1.6227245330810547, "logps/rejected": -1.7374531030654907, "loss": 2.6642, "rewards/accuracies": 0.5, "rewards/chosen": -16.227245330810547, "rewards/margins": 1.1472835540771484, "rewards/rejected": -17.374530792236328, "step": 7255 }, { "epoch": 0.24469985506757896, "grad_norm": 19.77877426147461, "learning_rate": 9.375659707715951e-07, "logits/chosen": -0.20007792115211487, "logits/rejected": -0.41266050934791565, "logps/chosen": -2.046832323074341, "logps/rejected": -2.0882670879364014, "loss": 3.0815, "rewards/accuracies": 0.5, "rewards/chosen": -20.46832275390625, "rewards/margins": 0.4143500328063965, "rewards/rejected": -20.882671356201172, "step": 7260 }, { "epoch": 0.24486838113856213, "grad_norm": 18.866714477539062, "learning_rate": 9.374235679398524e-07, "logits/chosen": -0.24752330780029297, "logits/rejected": -0.22193972766399384, "logps/chosen": -2.1437008380889893, "logps/rejected": -2.275897264480591, "loss": 2.8594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.437007904052734, "rewards/margins": 1.321963906288147, "rewards/rejected": -22.75897216796875, "step": 7265 }, { "epoch": 0.24503690720954532, "grad_norm": 16.798664093017578, "learning_rate": 9.372810137330464e-07, "logits/chosen": -0.7686780691146851, "logits/rejected": -0.9035130739212036, "logps/chosen": -1.7028411626815796, "logps/rejected": -2.389897108078003, "loss": 1.6245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.028413772583008, "rewards/margins": 6.8705573081970215, "rewards/rejected": -23.898971557617188, "step": 7270 }, { "epoch": 0.2452054332805285, "grad_norm": 24.184377670288086, "learning_rate": 9.371383082005098e-07, "logits/chosen": -0.8139625787734985, "logits/rejected": -0.7616298794746399, "logps/chosen": -1.6962127685546875, "logps/rejected": -1.8044872283935547, "loss": 2.5592, "rewards/accuracies": 0.5, "rewards/chosen": -16.962127685546875, "rewards/margins": 1.0827454328536987, "rewards/rejected": -18.044872283935547, "step": 7275 }, { "epoch": 0.24537395935151168, "grad_norm": 24.054651260375977, "learning_rate": 9.369954513916273e-07, "logits/chosen": -0.4652763903141022, "logits/rejected": -0.42502039670944214, "logps/chosen": -2.0171265602111816, "logps/rejected": -1.992876410484314, "loss": 3.6302, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.1712646484375, "rewards/margins": -0.24250011146068573, "rewards/rejected": -19.92876625061035, "step": 7280 }, { "epoch": 0.24554248542249485, "grad_norm": 19.809432983398438, "learning_rate": 9.36852443355836e-07, "logits/chosen": -0.7408286333084106, "logits/rejected": -0.7723020911216736, "logps/chosen": -1.9411401748657227, "logps/rejected": -2.0283544063568115, "loss": 2.6034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.41140365600586, "rewards/margins": 0.8721408843994141, "rewards/rejected": -20.283544540405273, "step": 7285 }, { "epoch": 0.24571101149347804, "grad_norm": 26.219873428344727, "learning_rate": 9.367092841426254e-07, "logits/chosen": -0.5848680138587952, "logits/rejected": -0.6235638856887817, "logps/chosen": -1.9324791431427002, "logps/rejected": -1.8378928899765015, "loss": 4.3249, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.324790954589844, "rewards/margins": -0.94586181640625, "rewards/rejected": -18.37892723083496, "step": 7290 }, { "epoch": 0.2458795375644612, "grad_norm": 17.715272903442383, "learning_rate": 9.365659738015372e-07, "logits/chosen": -0.855682373046875, "logits/rejected": -0.9267571568489075, "logps/chosen": -1.8638238906860352, "logps/rejected": -1.7686694860458374, "loss": 4.0473, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.63823890686035, "rewards/margins": -0.9515427350997925, "rewards/rejected": -17.686695098876953, "step": 7295 }, { "epoch": 0.2460480636354444, "grad_norm": 43.57025146484375, "learning_rate": 9.364225123821655e-07, "logits/chosen": -0.5702222585678101, "logits/rejected": -0.4446120262145996, "logps/chosen": -1.705815076828003, "logps/rejected": -1.676786184310913, "loss": 3.5881, "rewards/accuracies": 0.5, "rewards/chosen": -17.058151245117188, "rewards/margins": -0.29028749465942383, "rewards/rejected": -16.767864227294922, "step": 7300 }, { "epoch": 0.2462165897064276, "grad_norm": 81.14810180664062, "learning_rate": 9.362788999341567e-07, "logits/chosen": -0.5665368437767029, "logits/rejected": -0.5875300168991089, "logps/chosen": -1.9316835403442383, "logps/rejected": -1.9984298944473267, "loss": 2.5229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.31683349609375, "rewards/margins": 0.66746586561203, "rewards/rejected": -19.984302520751953, "step": 7305 }, { "epoch": 0.24638511577741076, "grad_norm": 24.982755661010742, "learning_rate": 9.36135136507209e-07, "logits/chosen": -0.6017649173736572, "logits/rejected": -0.5996742248535156, "logps/chosen": -2.1955275535583496, "logps/rejected": -2.2533159255981445, "loss": 3.0001, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.955278396606445, "rewards/margins": 0.5778809785842896, "rewards/rejected": -22.533157348632812, "step": 7310 }, { "epoch": 0.24655364184839396, "grad_norm": 36.10542678833008, "learning_rate": 9.35991222151074e-07, "logits/chosen": -0.430186927318573, "logits/rejected": -0.6433888673782349, "logps/chosen": -1.7533395290374756, "logps/rejected": -1.7290500402450562, "loss": 3.5198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.533395767211914, "rewards/margins": -0.242896169424057, "rewards/rejected": -17.29050064086914, "step": 7315 }, { "epoch": 0.24672216791937712, "grad_norm": 25.019153594970703, "learning_rate": 9.358471569155542e-07, "logits/chosen": -0.6420100331306458, "logits/rejected": -0.5819183588027954, "logps/chosen": -1.7961498498916626, "logps/rejected": -1.8753328323364258, "loss": 2.7737, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.961498260498047, "rewards/margins": 0.7918294072151184, "rewards/rejected": -18.753326416015625, "step": 7320 }, { "epoch": 0.24689069399036032, "grad_norm": 24.0214900970459, "learning_rate": 9.35702940850505e-07, "logits/chosen": -0.6717097163200378, "logits/rejected": -0.6161051392555237, "logps/chosen": -1.6413822174072266, "logps/rejected": -1.6599630117416382, "loss": 2.8914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.413822174072266, "rewards/margins": 0.18580922484397888, "rewards/rejected": -16.599628448486328, "step": 7325 }, { "epoch": 0.24705922006134348, "grad_norm": 16.337671279907227, "learning_rate": 9.355585740058341e-07, "logits/chosen": -0.6781617999076843, "logits/rejected": -0.8230530023574829, "logps/chosen": -1.4555952548980713, "logps/rejected": -1.6636173725128174, "loss": 1.8262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.555951118469238, "rewards/margins": 2.0802226066589355, "rewards/rejected": -16.636173248291016, "step": 7330 }, { "epoch": 0.24722774613232668, "grad_norm": 43.3533935546875, "learning_rate": 9.354140564315011e-07, "logits/chosen": -0.8407415151596069, "logits/rejected": -0.846244215965271, "logps/chosen": -1.916285514831543, "logps/rejected": -1.8819057941436768, "loss": 3.4865, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.162857055664062, "rewards/margins": -0.34379902482032776, "rewards/rejected": -18.81905746459961, "step": 7335 }, { "epoch": 0.24739627220330984, "grad_norm": 19.098432540893555, "learning_rate": 9.352693881775178e-07, "logits/chosen": -0.7666581273078918, "logits/rejected": -0.7992604970932007, "logps/chosen": -1.715882658958435, "logps/rejected": -1.7032171487808228, "loss": 3.3118, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.15882682800293, "rewards/margins": -0.1266559660434723, "rewards/rejected": -17.03217124938965, "step": 7340 }, { "epoch": 0.24756479827429304, "grad_norm": 28.776378631591797, "learning_rate": 9.35124569293948e-07, "logits/chosen": -0.25755545496940613, "logits/rejected": -0.4848417341709137, "logps/chosen": -1.7341407537460327, "logps/rejected": -1.7520534992218018, "loss": 2.9939, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.34140396118164, "rewards/margins": 0.179127499461174, "rewards/rejected": -17.52053451538086, "step": 7345 }, { "epoch": 0.2477333243452762, "grad_norm": 45.91712951660156, "learning_rate": 9.349795998309081e-07, "logits/chosen": -0.44464340806007385, "logits/rejected": -0.4558509290218353, "logps/chosen": -1.9467509984970093, "logps/rejected": -1.8426557779312134, "loss": 4.5344, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.46750831604004, "rewards/margins": -1.0409516096115112, "rewards/rejected": -18.426557540893555, "step": 7350 }, { "epoch": 0.2479018504162594, "grad_norm": 42.15274429321289, "learning_rate": 9.348344798385662e-07, "logits/chosen": -0.3427899479866028, "logits/rejected": -0.3019963800907135, "logps/chosen": -2.4352622032165527, "logps/rejected": -2.2316198348999023, "loss": 5.0965, "rewards/accuracies": 0.5, "rewards/chosen": -24.35262107849121, "rewards/margins": -2.036423921585083, "rewards/rejected": -22.31619644165039, "step": 7355 }, { "epoch": 0.2480703764872426, "grad_norm": 71.61480712890625, "learning_rate": 9.346892093671427e-07, "logits/chosen": -0.32045572996139526, "logits/rejected": -0.4350900650024414, "logps/chosen": -2.1125540733337402, "logps/rejected": -2.0484910011291504, "loss": 4.0104, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.125539779663086, "rewards/margins": -0.6406329274177551, "rewards/rejected": -20.484909057617188, "step": 7360 }, { "epoch": 0.24823890255822575, "grad_norm": 78.812255859375, "learning_rate": 9.345437884669098e-07, "logits/chosen": -0.6491799354553223, "logits/rejected": -0.7181687355041504, "logps/chosen": -2.0276429653167725, "logps/rejected": -2.0618278980255127, "loss": 2.8536, "rewards/accuracies": 0.5, "rewards/chosen": -20.276430130004883, "rewards/margins": 0.341848760843277, "rewards/rejected": -20.61827850341797, "step": 7365 }, { "epoch": 0.24840742862920895, "grad_norm": 25.033784866333008, "learning_rate": 9.343982171881921e-07, "logits/chosen": -0.6499834060668945, "logits/rejected": -0.6727867126464844, "logps/chosen": -2.126075029373169, "logps/rejected": -2.216430425643921, "loss": 2.5102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.26074981689453, "rewards/margins": 0.903557300567627, "rewards/rejected": -22.164308547973633, "step": 7370 }, { "epoch": 0.24857595470019211, "grad_norm": 16.618209838867188, "learning_rate": 9.342524955813661e-07, "logits/chosen": -0.6488254070281982, "logits/rejected": -0.5306954979896545, "logps/chosen": -1.7417656183242798, "logps/rejected": -1.8435453176498413, "loss": 2.5985, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.41765785217285, "rewards/margins": 1.0177967548370361, "rewards/rejected": -18.43545150756836, "step": 7375 }, { "epoch": 0.2487444807711753, "grad_norm": 25.25537872314453, "learning_rate": 9.341066236968602e-07, "logits/chosen": -0.6297262907028198, "logits/rejected": -0.6211252212524414, "logps/chosen": -1.7245155572891235, "logps/rejected": -1.6116546392440796, "loss": 4.2853, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.24515724182129, "rewards/margins": -1.128609299659729, "rewards/rejected": -16.116546630859375, "step": 7380 }, { "epoch": 0.24891300684215847, "grad_norm": 36.48454284667969, "learning_rate": 9.339606015851549e-07, "logits/chosen": -0.4730660319328308, "logits/rejected": -0.5225784778594971, "logps/chosen": -1.9237180948257446, "logps/rejected": -2.076869249343872, "loss": 3.0223, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.237178802490234, "rewards/margins": 1.5315120220184326, "rewards/rejected": -20.768692016601562, "step": 7385 }, { "epoch": 0.24908153291314167, "grad_norm": 33.16250228881836, "learning_rate": 9.338144292967829e-07, "logits/chosen": -0.3939630091190338, "logits/rejected": -0.3019777834415436, "logps/chosen": -1.8183307647705078, "logps/rejected": -1.838861107826233, "loss": 3.0173, "rewards/accuracies": 0.5, "rewards/chosen": -18.18330955505371, "rewards/margins": 0.2053002417087555, "rewards/rejected": -18.38861083984375, "step": 7390 }, { "epoch": 0.24925005898412483, "grad_norm": 36.6594123840332, "learning_rate": 9.336681068823284e-07, "logits/chosen": -0.38314300775527954, "logits/rejected": -0.47642940282821655, "logps/chosen": -2.0626749992370605, "logps/rejected": -2.094438076019287, "loss": 3.0801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.626750946044922, "rewards/margins": 0.31763094663619995, "rewards/rejected": -20.944381713867188, "step": 7395 }, { "epoch": 0.24941858505510803, "grad_norm": 25.774776458740234, "learning_rate": 9.335216343924279e-07, "logits/chosen": -0.3303954005241394, "logits/rejected": -0.4347568154335022, "logps/chosen": -1.9623749256134033, "logps/rejected": -2.0461456775665283, "loss": 2.6242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.623748779296875, "rewards/margins": 0.8377087712287903, "rewards/rejected": -20.461456298828125, "step": 7400 }, { "epoch": 0.2495871111260912, "grad_norm": 25.02663230895996, "learning_rate": 9.333750118777699e-07, "logits/chosen": -0.5133193731307983, "logits/rejected": -0.3330710828304291, "logps/chosen": -1.7310879230499268, "logps/rejected": -1.8869367837905884, "loss": 2.0172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.31087875366211, "rewards/margins": 1.558487057685852, "rewards/rejected": -18.869367599487305, "step": 7405 }, { "epoch": 0.2497556371970744, "grad_norm": 24.98771095275879, "learning_rate": 9.332282393890946e-07, "logits/chosen": -0.007177996449172497, "logits/rejected": -0.031101590022444725, "logps/chosen": -2.019897937774658, "logps/rejected": -2.3106188774108887, "loss": 1.9254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.1989803314209, "rewards/margins": 2.907209873199463, "rewards/rejected": -23.106189727783203, "step": 7410 }, { "epoch": 0.24992416326805758, "grad_norm": 30.926401138305664, "learning_rate": 9.330813169771941e-07, "logits/chosen": -0.8511411547660828, "logits/rejected": -0.7760568857192993, "logps/chosen": -1.6532011032104492, "logps/rejected": -1.6798311471939087, "loss": 3.0094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.532011032104492, "rewards/margins": 0.2663000524044037, "rewards/rejected": -16.79831314086914, "step": 7415 }, { "epoch": 0.2500926893390408, "grad_norm": 16.393407821655273, "learning_rate": 9.329342446929125e-07, "logits/chosen": -0.7259313464164734, "logits/rejected": -0.7014585137367249, "logps/chosen": -1.7854305505752563, "logps/rejected": -1.9603378772735596, "loss": 1.9886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.854305267333984, "rewards/margins": 1.749071478843689, "rewards/rejected": -19.603378295898438, "step": 7420 }, { "epoch": 0.25026121541002394, "grad_norm": 34.89096450805664, "learning_rate": 9.327870225871458e-07, "logits/chosen": -0.8024528622627258, "logits/rejected": -0.690179705619812, "logps/chosen": -1.7187427282333374, "logps/rejected": -1.8699325323104858, "loss": 2.1882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.187427520751953, "rewards/margins": 1.511897087097168, "rewards/rejected": -18.699325561523438, "step": 7425 }, { "epoch": 0.2504297414810071, "grad_norm": 23.38494873046875, "learning_rate": 9.326396507108417e-07, "logits/chosen": -0.7363360524177551, "logits/rejected": -0.7402480840682983, "logps/chosen": -1.6879494190216064, "logps/rejected": -1.7465381622314453, "loss": 2.6665, "rewards/accuracies": 0.5, "rewards/chosen": -16.87949562072754, "rewards/margins": 0.5858873128890991, "rewards/rejected": -17.465381622314453, "step": 7430 }, { "epoch": 0.2505982675519903, "grad_norm": 29.959455490112305, "learning_rate": 9.324921291149999e-07, "logits/chosen": -0.7818613648414612, "logits/rejected": -0.8807746171951294, "logps/chosen": -1.4602100849151611, "logps/rejected": -1.6767206192016602, "loss": 1.5994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.60210132598877, "rewards/margins": 2.1651058197021484, "rewards/rejected": -16.7672061920166, "step": 7435 }, { "epoch": 0.2507667936229735, "grad_norm": 33.429073333740234, "learning_rate": 9.323444578506716e-07, "logits/chosen": -0.5882720947265625, "logits/rejected": -0.756925642490387, "logps/chosen": -1.9938703775405884, "logps/rejected": -2.0285696983337402, "loss": 2.953, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.938703536987305, "rewards/margins": 0.3469921946525574, "rewards/rejected": -20.285696029663086, "step": 7440 }, { "epoch": 0.25093531969395666, "grad_norm": 35.908668518066406, "learning_rate": 9.3219663696896e-07, "logits/chosen": -0.32356154918670654, "logits/rejected": -0.2778133749961853, "logps/chosen": -1.959058403968811, "logps/rejected": -1.9171994924545288, "loss": 3.755, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.5905818939209, "rewards/margins": -0.4185875952243805, "rewards/rejected": -19.171995162963867, "step": 7445 }, { "epoch": 0.2511038457649398, "grad_norm": 19.67608642578125, "learning_rate": 9.320486665210204e-07, "logits/chosen": -0.3477417528629303, "logits/rejected": -0.430941641330719, "logps/chosen": -1.936028242111206, "logps/rejected": -2.15446400642395, "loss": 2.2592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.360280990600586, "rewards/margins": 2.1843581199645996, "rewards/rejected": -21.544641494750977, "step": 7450 }, { "epoch": 0.251272371835923, "grad_norm": 17.752864837646484, "learning_rate": 9.319005465580594e-07, "logits/chosen": -0.6723566651344299, "logits/rejected": -0.7199611067771912, "logps/chosen": -1.8637107610702515, "logps/rejected": -2.097921371459961, "loss": 1.4462, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.637107849121094, "rewards/margins": 2.3421072959899902, "rewards/rejected": -20.979215621948242, "step": 7455 }, { "epoch": 0.2514408979069062, "grad_norm": 29.509733200073242, "learning_rate": 9.317522771313353e-07, "logits/chosen": -0.3042893409729004, "logits/rejected": -0.39669641852378845, "logps/chosen": -1.7221676111221313, "logps/rejected": -1.6911754608154297, "loss": 3.5054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.221675872802734, "rewards/margins": -0.30992332100868225, "rewards/rejected": -16.911754608154297, "step": 7460 }, { "epoch": 0.2516094239778894, "grad_norm": 90.46784973144531, "learning_rate": 9.316038582921586e-07, "logits/chosen": -0.5147503614425659, "logits/rejected": -0.5856298208236694, "logps/chosen": -2.3417937755584717, "logps/rejected": -2.436811923980713, "loss": 2.7598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.41794204711914, "rewards/margins": 0.950177788734436, "rewards/rejected": -24.368114471435547, "step": 7465 }, { "epoch": 0.25177795004887255, "grad_norm": 36.988059997558594, "learning_rate": 9.314552900918908e-07, "logits/chosen": -0.6642917394638062, "logits/rejected": -0.7818363308906555, "logps/chosen": -1.3997938632965088, "logps/rejected": -1.5536857843399048, "loss": 2.2515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -13.99793815612793, "rewards/margins": 1.5389198064804077, "rewards/rejected": -15.536859512329102, "step": 7470 }, { "epoch": 0.25194647611985577, "grad_norm": 24.968826293945312, "learning_rate": 9.31306572581946e-07, "logits/chosen": -0.3435518741607666, "logits/rejected": -0.3793199360370636, "logps/chosen": -1.8419605493545532, "logps/rejected": -1.8113027811050415, "loss": 3.7557, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.419605255126953, "rewards/margins": -0.30657798051834106, "rewards/rejected": -18.113027572631836, "step": 7475 }, { "epoch": 0.25211500219083893, "grad_norm": 22.026336669921875, "learning_rate": 9.311577058137892e-07, "logits/chosen": -0.9103565216064453, "logits/rejected": -0.6073547601699829, "logps/chosen": -1.4468992948532104, "logps/rejected": -1.6920430660247803, "loss": 1.7871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.468992233276367, "rewards/margins": 2.451439380645752, "rewards/rejected": -16.92043113708496, "step": 7480 }, { "epoch": 0.2522835282618221, "grad_norm": 30.010156631469727, "learning_rate": 9.310086898389374e-07, "logits/chosen": -0.632883608341217, "logits/rejected": -0.7742358446121216, "logps/chosen": -1.770172357559204, "logps/rejected": -1.7545216083526611, "loss": 3.3847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.701723098754883, "rewards/margins": -0.1565081626176834, "rewards/rejected": -17.545215606689453, "step": 7485 }, { "epoch": 0.25245205433280526, "grad_norm": 9.875432968139648, "learning_rate": 9.30859524708959e-07, "logits/chosen": -0.6751956343650818, "logits/rejected": -0.7236267328262329, "logps/chosen": -1.6921708583831787, "logps/rejected": -2.151487350463867, "loss": 1.9065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.921709060668945, "rewards/margins": 4.593166351318359, "rewards/rejected": -21.514873504638672, "step": 7490 }, { "epoch": 0.2526205804037885, "grad_norm": 8.505745887756348, "learning_rate": 9.307102104754742e-07, "logits/chosen": -0.5317801237106323, "logits/rejected": -0.3818480968475342, "logps/chosen": -1.8155962228775024, "logps/rejected": -2.12412166595459, "loss": 2.6481, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.155963897705078, "rewards/margins": 3.0852532386779785, "rewards/rejected": -21.2412166595459, "step": 7495 }, { "epoch": 0.25278910647477165, "grad_norm": 29.946495056152344, "learning_rate": 9.30560747190155e-07, "logits/chosen": -0.6961748003959656, "logits/rejected": -0.7425335049629211, "logps/chosen": -2.139946460723877, "logps/rejected": -1.9574038982391357, "loss": 4.8539, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.399463653564453, "rewards/margins": -1.8254258632659912, "rewards/rejected": -19.574039459228516, "step": 7500 }, { "epoch": 0.2529576325457548, "grad_norm": 8.821472147246823e-05, "learning_rate": 9.304111349047245e-07, "logits/chosen": -0.3789060115814209, "logits/rejected": -0.48966988921165466, "logps/chosen": -2.3770015239715576, "logps/rejected": -2.9544246196746826, "loss": 2.4364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.770015716552734, "rewards/margins": 5.774231433868408, "rewards/rejected": -29.54425048828125, "step": 7505 }, { "epoch": 0.253126158616738, "grad_norm": 23.108131408691406, "learning_rate": 9.30261373670958e-07, "logits/chosen": -0.3439405858516693, "logits/rejected": -0.3222549259662628, "logps/chosen": -1.766324758529663, "logps/rejected": -1.7195783853530884, "loss": 3.6566, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.66324806213379, "rewards/margins": -0.46746331453323364, "rewards/rejected": -17.195783615112305, "step": 7510 }, { "epoch": 0.2532946846877212, "grad_norm": 32.8663444519043, "learning_rate": 9.301114635406813e-07, "logits/chosen": -0.43090614676475525, "logits/rejected": -0.40088552236557007, "logps/chosen": -1.8684027194976807, "logps/rejected": -2.1092865467071533, "loss": 3.1633, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.684024810791016, "rewards/margins": 2.4088387489318848, "rewards/rejected": -21.09286880493164, "step": 7515 }, { "epoch": 0.25346321075870437, "grad_norm": 3.117332935333252, "learning_rate": 9.299614045657731e-07, "logits/chosen": -0.3704325258731842, "logits/rejected": -0.44101667404174805, "logps/chosen": -1.7551378011703491, "logps/rejected": -1.8985016345977783, "loss": 2.3468, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.55137825012207, "rewards/margins": 1.4336389303207397, "rewards/rejected": -18.985015869140625, "step": 7520 }, { "epoch": 0.25363173682968754, "grad_norm": 30.372228622436523, "learning_rate": 9.298111967981625e-07, "logits/chosen": -0.24413923919200897, "logits/rejected": -0.28805142641067505, "logps/chosen": -2.1313605308532715, "logps/rejected": -2.2983944416046143, "loss": 2.7782, "rewards/accuracies": 0.5, "rewards/chosen": -21.313602447509766, "rewards/margins": 1.67034113407135, "rewards/rejected": -22.983945846557617, "step": 7525 }, { "epoch": 0.25380026290067076, "grad_norm": 32.955474853515625, "learning_rate": 9.296608402898305e-07, "logits/chosen": -0.6186766624450684, "logits/rejected": -0.6145849823951721, "logps/chosen": -1.6524741649627686, "logps/rejected": -1.6749063730239868, "loss": 3.0369, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.524742126464844, "rewards/margins": 0.2243211716413498, "rewards/rejected": -16.74906349182129, "step": 7530 }, { "epoch": 0.2539687889716539, "grad_norm": 28.929431915283203, "learning_rate": 9.295103350928098e-07, "logits/chosen": -0.7780159115791321, "logits/rejected": -0.6910878419876099, "logps/chosen": -1.8065674304962158, "logps/rejected": -1.903062105178833, "loss": 2.889, "rewards/accuracies": 0.5, "rewards/chosen": -18.065673828125, "rewards/margins": 0.9649454951286316, "rewards/rejected": -19.030620574951172, "step": 7535 }, { "epoch": 0.2541373150426371, "grad_norm": 18.927898406982422, "learning_rate": 9.293596812591839e-07, "logits/chosen": -0.23738765716552734, "logits/rejected": -0.3145070970058441, "logps/chosen": -2.188699722290039, "logps/rejected": -2.463549852371216, "loss": 3.5589, "rewards/accuracies": 0.5, "rewards/chosen": -21.886999130249023, "rewards/margins": 2.7484989166259766, "rewards/rejected": -24.635496139526367, "step": 7540 }, { "epoch": 0.25430584111362026, "grad_norm": 11.518166542053223, "learning_rate": 9.292088788410885e-07, "logits/chosen": -0.27805063128471375, "logits/rejected": -0.3325250744819641, "logps/chosen": -2.21008563041687, "logps/rejected": -2.4034438133239746, "loss": 3.4056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.10085678100586, "rewards/margins": 1.9335815906524658, "rewards/rejected": -24.034439086914062, "step": 7545 }, { "epoch": 0.2544743671846035, "grad_norm": 32.053321838378906, "learning_rate": 9.290579278907104e-07, "logits/chosen": -0.6328898072242737, "logits/rejected": -0.5089461207389832, "logps/chosen": -1.902991533279419, "logps/rejected": -2.017669200897217, "loss": 2.6123, "rewards/accuracies": 0.5, "rewards/chosen": -19.02991485595703, "rewards/margins": 1.1467796564102173, "rewards/rejected": -20.176692962646484, "step": 7550 }, { "epoch": 0.25464289325558664, "grad_norm": 42.185794830322266, "learning_rate": 9.289068284602877e-07, "logits/chosen": -0.43494969606399536, "logits/rejected": -0.5104081034660339, "logps/chosen": -2.2199254035949707, "logps/rejected": -2.1992757320404053, "loss": 3.3121, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.199254989624023, "rewards/margins": -0.20649738609790802, "rewards/rejected": -21.992755889892578, "step": 7555 }, { "epoch": 0.2548114193265698, "grad_norm": 27.1168270111084, "learning_rate": 9.287555806021097e-07, "logits/chosen": -0.9154708981513977, "logits/rejected": -0.902691662311554, "logps/chosen": -1.6451492309570312, "logps/rejected": -1.5507086515426636, "loss": 4.0632, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.451494216918945, "rewards/margins": -0.9444063901901245, "rewards/rejected": -15.507085800170898, "step": 7560 }, { "epoch": 0.254979945397553, "grad_norm": 10.494613647460938, "learning_rate": 9.286041843685177e-07, "logits/chosen": -0.3919174075126648, "logits/rejected": -0.5906190872192383, "logps/chosen": -2.21108341217041, "logps/rejected": -2.351998805999756, "loss": 3.1334, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.11083221435547, "rewards/margins": 1.409155011177063, "rewards/rejected": -23.519987106323242, "step": 7565 }, { "epoch": 0.2551484714685362, "grad_norm": 15.32865047454834, "learning_rate": 9.284526398119038e-07, "logits/chosen": -0.23844066262245178, "logits/rejected": -0.24300916492938995, "logps/chosen": -1.9033092260360718, "logps/rejected": -2.0004773139953613, "loss": 2.6859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.033092498779297, "rewards/margins": 0.9716783761978149, "rewards/rejected": -20.004772186279297, "step": 7570 }, { "epoch": 0.25531699753951936, "grad_norm": 18.072689056396484, "learning_rate": 9.283009469847116e-07, "logits/chosen": -0.6887549161911011, "logits/rejected": -0.7601548433303833, "logps/chosen": -1.7088873386383057, "logps/rejected": -1.8237041234970093, "loss": 2.1205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.0888729095459, "rewards/margins": 1.1481688022613525, "rewards/rejected": -18.237041473388672, "step": 7575 }, { "epoch": 0.25548552361050253, "grad_norm": 31.30813980102539, "learning_rate": 9.281491059394361e-07, "logits/chosen": -0.7471412420272827, "logits/rejected": -0.6275310516357422, "logps/chosen": -1.9791702032089233, "logps/rejected": -2.046593189239502, "loss": 2.6766, "rewards/accuracies": 0.5, "rewards/chosen": -19.791702270507812, "rewards/margins": 0.6742300987243652, "rewards/rejected": -20.465932846069336, "step": 7580 }, { "epoch": 0.25565404968148575, "grad_norm": 23.66309356689453, "learning_rate": 9.279971167286233e-07, "logits/chosen": -0.44159945845603943, "logits/rejected": -0.5053187608718872, "logps/chosen": -2.107053518295288, "logps/rejected": -2.204468250274658, "loss": 2.3342, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.070533752441406, "rewards/margins": 0.9741487503051758, "rewards/rejected": -22.0446834564209, "step": 7585 }, { "epoch": 0.2558225757524689, "grad_norm": 40.30325698852539, "learning_rate": 9.27844979404871e-07, "logits/chosen": 0.027735818177461624, "logits/rejected": -0.025748800486326218, "logps/chosen": -2.3189496994018555, "logps/rejected": -2.194973945617676, "loss": 4.4813, "rewards/accuracies": 0.5, "rewards/chosen": -23.189496994018555, "rewards/margins": -1.239757776260376, "rewards/rejected": -21.949739456176758, "step": 7590 }, { "epoch": 0.2559911018234521, "grad_norm": 15.760374069213867, "learning_rate": 9.276926940208276e-07, "logits/chosen": -0.43298858404159546, "logits/rejected": -0.38209548592567444, "logps/chosen": -1.8992574214935303, "logps/rejected": -1.923172950744629, "loss": 4.0977, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.992572784423828, "rewards/margins": 0.23915652930736542, "rewards/rejected": -19.231731414794922, "step": 7595 }, { "epoch": 0.25615962789443525, "grad_norm": 14.925925254821777, "learning_rate": 9.275402606291933e-07, "logits/chosen": -0.8036720156669617, "logits/rejected": -0.803006649017334, "logps/chosen": -1.5346969366073608, "logps/rejected": -1.8176238536834717, "loss": 2.0604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.346969604492188, "rewards/margins": 2.829270839691162, "rewards/rejected": -18.176240921020508, "step": 7600 }, { "epoch": 0.25615962789443525, "eval_logits/chosen": -0.8601851463317871, "eval_logits/rejected": -0.9017350077629089, "eval_logps/chosen": -1.7739288806915283, "eval_logps/rejected": -1.8104459047317505, "eval_loss": 3.192545175552368, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -17.739290237426758, "eval_rewards/margins": 0.3651680648326874, "eval_rewards/rejected": -18.10445785522461, "eval_runtime": 12.9145, "eval_samples_per_second": 7.743, "eval_steps_per_second": 1.936, "step": 7600 }, { "epoch": 0.25632815396541847, "grad_norm": 26.527164459228516, "learning_rate": 9.273876792827192e-07, "logits/chosen": -0.717302143573761, "logits/rejected": -0.7854380011558533, "logps/chosen": -1.8394664525985718, "logps/rejected": -1.8153263330459595, "loss": 3.3416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.394664764404297, "rewards/margins": -0.24140223860740662, "rewards/rejected": -18.153263092041016, "step": 7605 }, { "epoch": 0.25649668003640164, "grad_norm": 17.837858200073242, "learning_rate": 9.272349500342076e-07, "logits/chosen": -0.6437766551971436, "logits/rejected": -0.7148224115371704, "logps/chosen": -1.7915229797363281, "logps/rejected": -2.0384459495544434, "loss": 2.5635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.91522789001465, "rewards/margins": 2.4692318439483643, "rewards/rejected": -20.384458541870117, "step": 7610 }, { "epoch": 0.2566652061073848, "grad_norm": 9.811715126037598, "learning_rate": 9.270820729365123e-07, "logits/chosen": -0.4629322588443756, "logits/rejected": -0.4842115044593811, "logps/chosen": -2.0855185985565186, "logps/rejected": -2.273648977279663, "loss": 2.1479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.855188369750977, "rewards/margins": 1.8813034296035767, "rewards/rejected": -22.736492156982422, "step": 7615 }, { "epoch": 0.25683373217836797, "grad_norm": 20.732784271240234, "learning_rate": 9.269290480425378e-07, "logits/chosen": -0.7600752115249634, "logits/rejected": -0.8454673886299133, "logps/chosen": -1.9267241954803467, "logps/rejected": -2.1829326152801514, "loss": 2.5112, "rewards/accuracies": 0.5, "rewards/chosen": -19.267242431640625, "rewards/margins": 2.562086582183838, "rewards/rejected": -21.829326629638672, "step": 7620 }, { "epoch": 0.2570022582493512, "grad_norm": 27.150493621826172, "learning_rate": 9.267758754052402e-07, "logits/chosen": -0.5359091758728027, "logits/rejected": -0.4138420522212982, "logps/chosen": -2.203834056854248, "logps/rejected": -2.302135467529297, "loss": 2.6782, "rewards/accuracies": 0.5, "rewards/chosen": -22.038341522216797, "rewards/margins": 0.9830153584480286, "rewards/rejected": -23.02135467529297, "step": 7625 }, { "epoch": 0.25717078432033436, "grad_norm": 31.828556060791016, "learning_rate": 9.266225550776265e-07, "logits/chosen": -0.3831477761268616, "logits/rejected": -0.4036986827850342, "logps/chosen": -1.6559679508209229, "logps/rejected": -1.6691372394561768, "loss": 3.626, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.55967903137207, "rewards/margins": 0.13169364631175995, "rewards/rejected": -16.69137191772461, "step": 7630 }, { "epoch": 0.2573393103913175, "grad_norm": 40.279781341552734, "learning_rate": 9.264690871127545e-07, "logits/chosen": -0.5447670817375183, "logits/rejected": -0.37773579359054565, "logps/chosen": -1.839734673500061, "logps/rejected": -1.8037055730819702, "loss": 3.8751, "rewards/accuracies": 0.5, "rewards/chosen": -18.3973445892334, "rewards/margins": -0.3602902889251709, "rewards/rejected": -18.03705596923828, "step": 7635 }, { "epoch": 0.25750783646230074, "grad_norm": 30.85480499267578, "learning_rate": 9.263154715637339e-07, "logits/chosen": -0.8868627548217773, "logits/rejected": -0.7809014916419983, "logps/chosen": -1.909232497215271, "logps/rejected": -2.0275964736938477, "loss": 2.5388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.092327117919922, "rewards/margins": 1.1836379766464233, "rewards/rejected": -20.275962829589844, "step": 7640 }, { "epoch": 0.2576763625332839, "grad_norm": 36.895179748535156, "learning_rate": 9.261617084837247e-07, "logits/chosen": -0.2057121992111206, "logits/rejected": -0.4458581805229187, "logps/chosen": -2.2100377082824707, "logps/rejected": -2.46073842048645, "loss": 2.4705, "rewards/accuracies": 0.5, "rewards/chosen": -22.100372314453125, "rewards/margins": 2.5070078372955322, "rewards/rejected": -24.60738182067871, "step": 7645 }, { "epoch": 0.2578448886042671, "grad_norm": 0.7452271580696106, "learning_rate": 9.260077979259382e-07, "logits/chosen": -0.33283504843711853, "logits/rejected": -0.3346695303916931, "logps/chosen": -2.141392469406128, "logps/rejected": -2.394256114959717, "loss": 2.3332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.413925170898438, "rewards/margins": 2.528637170791626, "rewards/rejected": -23.942562103271484, "step": 7650 }, { "epoch": 0.25801341467525024, "grad_norm": 32.01772689819336, "learning_rate": 9.25853739943637e-07, "logits/chosen": -0.7218912243843079, "logits/rejected": -0.799573540687561, "logps/chosen": -1.6060640811920166, "logps/rejected": -1.6765620708465576, "loss": 2.5264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.06064224243164, "rewards/margins": 0.7049804925918579, "rewards/rejected": -16.765621185302734, "step": 7655 }, { "epoch": 0.25818194074623346, "grad_norm": 37.131343841552734, "learning_rate": 9.256995345901342e-07, "logits/chosen": -0.6877612471580505, "logits/rejected": -0.8829323649406433, "logps/chosen": -1.8432331085205078, "logps/rejected": -1.828413724899292, "loss": 3.4766, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.432331085205078, "rewards/margins": -0.1481925994157791, "rewards/rejected": -18.284137725830078, "step": 7660 }, { "epoch": 0.25835046681721663, "grad_norm": 18.985092163085938, "learning_rate": 9.255451819187945e-07, "logits/chosen": -0.45237255096435547, "logits/rejected": -0.4141133427619934, "logps/chosen": -1.8477073907852173, "logps/rejected": -1.9039417505264282, "loss": 2.9308, "rewards/accuracies": 0.5, "rewards/chosen": -18.47707176208496, "rewards/margins": 0.5623448491096497, "rewards/rejected": -19.039417266845703, "step": 7665 }, { "epoch": 0.2585189928881998, "grad_norm": 28.19369125366211, "learning_rate": 9.25390681983033e-07, "logits/chosen": -1.010534644126892, "logits/rejected": -0.9251688718795776, "logps/chosen": -1.7543470859527588, "logps/rejected": -1.7675797939300537, "loss": 3.1437, "rewards/accuracies": 0.5, "rewards/chosen": -17.54347038269043, "rewards/margins": 0.13232669234275818, "rewards/rejected": -17.675798416137695, "step": 7670 }, { "epoch": 0.25868751895918296, "grad_norm": 23.96495246887207, "learning_rate": 9.252360348363164e-07, "logits/chosen": -0.4807816445827484, "logits/rejected": -0.5013648271560669, "logps/chosen": -2.0067787170410156, "logps/rejected": -2.1580376625061035, "loss": 2.3734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.067790985107422, "rewards/margins": 1.5125861167907715, "rewards/rejected": -21.58037757873535, "step": 7675 }, { "epoch": 0.2588560450301662, "grad_norm": 20.593505859375, "learning_rate": 9.250812405321618e-07, "logits/chosen": -0.423076868057251, "logits/rejected": -0.5166851878166199, "logps/chosen": -1.464730978012085, "logps/rejected": -1.5135142803192139, "loss": 3.1644, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -14.647310256958008, "rewards/margins": 0.4878327250480652, "rewards/rejected": -15.135144233703613, "step": 7680 }, { "epoch": 0.25902457110114935, "grad_norm": 20.095569610595703, "learning_rate": 9.249262991241372e-07, "logits/chosen": -0.059383898973464966, "logits/rejected": -0.39212357997894287, "logps/chosen": -2.0744400024414062, "logps/rejected": -2.1283066272735596, "loss": 3.3333, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.744401931762695, "rewards/margins": 0.5386648178100586, "rewards/rejected": -21.283065795898438, "step": 7685 }, { "epoch": 0.2591930971721325, "grad_norm": 41.47001266479492, "learning_rate": 9.247712106658619e-07, "logits/chosen": -0.4025161862373352, "logits/rejected": -0.42906612157821655, "logps/chosen": -2.040505886077881, "logps/rejected": -2.0720152854919434, "loss": 2.847, "rewards/accuracies": 0.5, "rewards/chosen": -20.405057907104492, "rewards/margins": 0.3150941729545593, "rewards/rejected": -20.72015380859375, "step": 7690 }, { "epoch": 0.25936162324311574, "grad_norm": 6.932709693908691, "learning_rate": 9.246159752110061e-07, "logits/chosen": -0.5451598167419434, "logits/rejected": -0.5768376588821411, "logps/chosen": -2.0141749382019043, "logps/rejected": -2.4219484329223633, "loss": 2.8068, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.141748428344727, "rewards/margins": 4.077737808227539, "rewards/rejected": -24.219484329223633, "step": 7695 }, { "epoch": 0.2595301493140989, "grad_norm": 6.739269733428955, "learning_rate": 9.244605928132902e-07, "logits/chosen": -0.8408918380737305, "logits/rejected": -0.8135308027267456, "logps/chosen": -1.4772506952285767, "logps/rejected": -1.5503901243209839, "loss": 2.8249, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -14.772506713867188, "rewards/margins": 0.7313947677612305, "rewards/rejected": -15.503901481628418, "step": 7700 }, { "epoch": 0.25969867538508207, "grad_norm": 13.87769603729248, "learning_rate": 9.243050635264864e-07, "logits/chosen": -0.9550352096557617, "logits/rejected": -1.0311752557754517, "logps/chosen": -1.6748530864715576, "logps/rejected": -1.7567179203033447, "loss": 3.0043, "rewards/accuracies": 0.5, "rewards/chosen": -16.748531341552734, "rewards/margins": 0.8186489939689636, "rewards/rejected": -17.56717872619629, "step": 7705 }, { "epoch": 0.25986720145606523, "grad_norm": 26.09083366394043, "learning_rate": 9.24149387404417e-07, "logits/chosen": -0.7877703905105591, "logits/rejected": -0.8280628323554993, "logps/chosen": -1.6516129970550537, "logps/rejected": -1.7163331508636475, "loss": 3.0976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.516132354736328, "rewards/margins": 0.6472002267837524, "rewards/rejected": -17.163331985473633, "step": 7710 }, { "epoch": 0.26003572752704845, "grad_norm": 16.435352325439453, "learning_rate": 9.239935645009555e-07, "logits/chosen": -0.7832959890365601, "logits/rejected": -0.8045759201049805, "logps/chosen": -1.9323409795761108, "logps/rejected": -2.1384482383728027, "loss": 1.8541, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.323410034179688, "rewards/margins": 2.0610733032226562, "rewards/rejected": -21.384485244750977, "step": 7715 }, { "epoch": 0.2602042535980316, "grad_norm": 40.79289245605469, "learning_rate": 9.238375948700261e-07, "logits/chosen": -0.6069063544273376, "logits/rejected": -0.6482208967208862, "logps/chosen": -1.9219573736190796, "logps/rejected": -2.137608289718628, "loss": 1.7378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.219573974609375, "rewards/margins": 2.156508207321167, "rewards/rejected": -21.376083374023438, "step": 7720 }, { "epoch": 0.2603727796690148, "grad_norm": 34.74724197387695, "learning_rate": 9.236814785656035e-07, "logits/chosen": 0.003743249224498868, "logits/rejected": -0.06489133089780807, "logps/chosen": -2.0553712844848633, "logps/rejected": -2.0764975547790527, "loss": 3.0784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.553712844848633, "rewards/margins": 0.21126461029052734, "rewards/rejected": -20.764976501464844, "step": 7725 }, { "epoch": 0.26054130573999795, "grad_norm": 50.483341217041016, "learning_rate": 9.235252156417134e-07, "logits/chosen": -0.6431624293327332, "logits/rejected": -0.7084658741950989, "logps/chosen": -1.9164985418319702, "logps/rejected": -1.9143717288970947, "loss": 3.113, "rewards/accuracies": 0.5, "rewards/chosen": -19.16498565673828, "rewards/margins": -0.021271228790283203, "rewards/rejected": -19.14371681213379, "step": 7730 }, { "epoch": 0.2607098318109812, "grad_norm": 72.169677734375, "learning_rate": 9.233688061524327e-07, "logits/chosen": 0.04161912947893143, "logits/rejected": -0.1001749038696289, "logps/chosen": -1.9186267852783203, "logps/rejected": -1.9041011333465576, "loss": 3.2578, "rewards/accuracies": 0.5, "rewards/chosen": -19.186267852783203, "rewards/margins": -0.14525547623634338, "rewards/rejected": -19.041011810302734, "step": 7735 }, { "epoch": 0.26087835788196434, "grad_norm": 50.393035888671875, "learning_rate": 9.232122501518882e-07, "logits/chosen": -0.4775795042514801, "logits/rejected": -0.5555458068847656, "logps/chosen": -2.081469774246216, "logps/rejected": -2.207404375076294, "loss": 1.9899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.814697265625, "rewards/margins": 1.2593481540679932, "rewards/rejected": -22.074045181274414, "step": 7740 }, { "epoch": 0.2610468839529475, "grad_norm": 24.568172454833984, "learning_rate": 9.230555476942576e-07, "logits/chosen": -0.5699299573898315, "logits/rejected": -0.5548152923583984, "logps/chosen": -1.547564148902893, "logps/rejected": -1.6397268772125244, "loss": 2.4438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.475641250610352, "rewards/margins": 0.9216279983520508, "rewards/rejected": -16.397268295288086, "step": 7745 }, { "epoch": 0.2612154100239307, "grad_norm": 40.764381408691406, "learning_rate": 9.228986988337699e-07, "logits/chosen": -0.6876112222671509, "logits/rejected": -0.6586848497390747, "logps/chosen": -1.8404690027236938, "logps/rejected": -1.828752875328064, "loss": 3.371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.40468978881836, "rewards/margins": -0.11716127395629883, "rewards/rejected": -18.28752899169922, "step": 7750 }, { "epoch": 0.2613839360949139, "grad_norm": 13.969160079956055, "learning_rate": 9.22741703624704e-07, "logits/chosen": -0.6685362458229065, "logits/rejected": -0.5314013361930847, "logps/chosen": -1.958852767944336, "logps/rejected": -2.212700366973877, "loss": 2.2484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.58852767944336, "rewards/margins": 2.5384745597839355, "rewards/rejected": -22.127002716064453, "step": 7755 }, { "epoch": 0.26155246216589706, "grad_norm": 23.888172149658203, "learning_rate": 9.225845621213897e-07, "logits/chosen": -0.657477855682373, "logits/rejected": -0.6711796522140503, "logps/chosen": -1.9453691244125366, "logps/rejected": -2.1049752235412598, "loss": 2.3569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.453693389892578, "rewards/margins": 1.5960614681243896, "rewards/rejected": -21.049753189086914, "step": 7760 }, { "epoch": 0.2617209882368802, "grad_norm": 28.694900512695312, "learning_rate": 9.224272743782078e-07, "logits/chosen": -0.7226378321647644, "logits/rejected": -0.7690739631652832, "logps/chosen": -2.0600342750549316, "logps/rejected": -1.965648889541626, "loss": 4.342, "rewards/accuracies": 0.5, "rewards/chosen": -20.600341796875, "rewards/margins": -0.9438526034355164, "rewards/rejected": -19.6564884185791, "step": 7765 }, { "epoch": 0.26188951430786345, "grad_norm": 20.68834114074707, "learning_rate": 9.222698404495892e-07, "logits/chosen": -0.5737382173538208, "logits/rejected": -0.554766833782196, "logps/chosen": -1.6142475605010986, "logps/rejected": -1.7195327281951904, "loss": 2.5591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.142475128173828, "rewards/margins": 1.052852749824524, "rewards/rejected": -17.195327758789062, "step": 7770 }, { "epoch": 0.2620580403788466, "grad_norm": 57.733306884765625, "learning_rate": 9.221122603900155e-07, "logits/chosen": -0.7775768041610718, "logits/rejected": -0.6437594294548035, "logps/chosen": -2.050947427749634, "logps/rejected": -2.2061009407043457, "loss": 2.7095, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.50947380065918, "rewards/margins": 1.5515353679656982, "rewards/rejected": -22.06100845336914, "step": 7775 }, { "epoch": 0.2622265664498298, "grad_norm": 16.664154052734375, "learning_rate": 9.219545342540191e-07, "logits/chosen": -0.2868829369544983, "logits/rejected": -0.33900654315948486, "logps/chosen": -1.7464549541473389, "logps/rejected": -1.9309101104736328, "loss": 1.904, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.464550018310547, "rewards/margins": 1.8445497751235962, "rewards/rejected": -19.309101104736328, "step": 7780 }, { "epoch": 0.26239509252081294, "grad_norm": 16.679758071899414, "learning_rate": 9.217966620961828e-07, "logits/chosen": -0.6830729842185974, "logits/rejected": -0.7631603479385376, "logps/chosen": -1.570847988128662, "logps/rejected": -1.589646577835083, "loss": 3.0564, "rewards/accuracies": 0.5, "rewards/chosen": -15.708480834960938, "rewards/margins": 0.18798570334911346, "rewards/rejected": -15.896464347839355, "step": 7785 }, { "epoch": 0.26256361859179617, "grad_norm": 23.36970329284668, "learning_rate": 9.216386439711397e-07, "logits/chosen": -0.6877197027206421, "logits/rejected": -0.5568257570266724, "logps/chosen": -1.8986759185791016, "logps/rejected": -1.9676154851913452, "loss": 2.9447, "rewards/accuracies": 0.5, "rewards/chosen": -18.986759185791016, "rewards/margins": 0.689396858215332, "rewards/rejected": -19.676156997680664, "step": 7790 }, { "epoch": 0.26273214466277933, "grad_norm": 17.951892852783203, "learning_rate": 9.214804799335739e-07, "logits/chosen": -0.6987979412078857, "logits/rejected": -0.8650094866752625, "logps/chosen": -1.6295093297958374, "logps/rejected": -1.909968614578247, "loss": 1.5949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.295093536376953, "rewards/margins": 2.804591417312622, "rewards/rejected": -19.09968376159668, "step": 7795 }, { "epoch": 0.2629006707337625, "grad_norm": 17.47931480407715, "learning_rate": 9.213221700382196e-07, "logits/chosen": -0.49302539229393005, "logits/rejected": -0.5437559485435486, "logps/chosen": -1.695380449295044, "logps/rejected": -1.804037094116211, "loss": 2.9106, "rewards/accuracies": 0.5, "rewards/chosen": -16.95380401611328, "rewards/margins": 1.0865659713745117, "rewards/rejected": -18.04037094116211, "step": 7800 }, { "epoch": 0.2630691968047457, "grad_norm": 40.34727478027344, "learning_rate": 9.211637143398619e-07, "logits/chosen": -0.4341478943824768, "logits/rejected": -0.4721830487251282, "logps/chosen": -1.685331106185913, "logps/rejected": -1.7352886199951172, "loss": 3.0491, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.85331153869629, "rewards/margins": 0.49957332015037537, "rewards/rejected": -17.352886199951172, "step": 7805 }, { "epoch": 0.2632377228757289, "grad_norm": 17.324604034423828, "learning_rate": 9.210051128933356e-07, "logits/chosen": -0.8900884389877319, "logits/rejected": -0.9262346029281616, "logps/chosen": -2.093942403793335, "logps/rejected": -2.1410953998565674, "loss": 2.9911, "rewards/accuracies": 0.5, "rewards/chosen": -20.939424514770508, "rewards/margins": 0.4715285301208496, "rewards/rejected": -21.410953521728516, "step": 7810 }, { "epoch": 0.26340624894671205, "grad_norm": 9.978482246398926, "learning_rate": 9.208463657535268e-07, "logits/chosen": -0.5291486382484436, "logits/rejected": -0.6421756148338318, "logps/chosen": -1.926701307296753, "logps/rejected": -1.9434372186660767, "loss": 3.3581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.26701545715332, "rewards/margins": 0.1673574447631836, "rewards/rejected": -19.434371948242188, "step": 7815 }, { "epoch": 0.2635747750176952, "grad_norm": 43.69354248046875, "learning_rate": 9.206874729753716e-07, "logits/chosen": -0.21648378670215607, "logits/rejected": -0.14429767429828644, "logps/chosen": -2.1134257316589355, "logps/rejected": -1.7602497339248657, "loss": 6.6086, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.134258270263672, "rewards/margins": -3.5317604541778564, "rewards/rejected": -17.60249900817871, "step": 7820 }, { "epoch": 0.26374330108867844, "grad_norm": 20.53978157043457, "learning_rate": 9.205284346138562e-07, "logits/chosen": -0.4210183024406433, "logits/rejected": -0.4092690944671631, "logps/chosen": -1.7413660287857056, "logps/rejected": -1.9055869579315186, "loss": 2.033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.413660049438477, "rewards/margins": 1.6422086954116821, "rewards/rejected": -19.05586814880371, "step": 7825 }, { "epoch": 0.2639118271596616, "grad_norm": 19.872541427612305, "learning_rate": 9.203692507240179e-07, "logits/chosen": -0.39160478115081787, "logits/rejected": -0.48597049713134766, "logps/chosen": -2.0400238037109375, "logps/rejected": -2.3305420875549316, "loss": 2.3703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.40023422241211, "rewards/margins": 2.9051859378814697, "rewards/rejected": -23.305421829223633, "step": 7830 }, { "epoch": 0.26408035323064477, "grad_norm": 44.66335678100586, "learning_rate": 9.202099213609437e-07, "logits/chosen": -0.7690407037734985, "logits/rejected": -0.6451536417007446, "logps/chosen": -1.8670456409454346, "logps/rejected": -1.8136584758758545, "loss": 3.6153, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.670455932617188, "rewards/margins": -0.5338695645332336, "rewards/rejected": -18.136585235595703, "step": 7835 }, { "epoch": 0.26424887930162794, "grad_norm": 41.984840393066406, "learning_rate": 9.200504465797714e-07, "logits/chosen": -0.5631991624832153, "logits/rejected": -0.6159490346908569, "logps/chosen": -2.0934665203094482, "logps/rejected": -1.8932098150253296, "loss": 5.0476, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.93466567993164, "rewards/margins": -2.002568006515503, "rewards/rejected": -18.932098388671875, "step": 7840 }, { "epoch": 0.26441740537261116, "grad_norm": 33.509124755859375, "learning_rate": 9.198908264356888e-07, "logits/chosen": -0.4450080394744873, "logits/rejected": -0.5047804713249207, "logps/chosen": -1.6830122470855713, "logps/rejected": -1.7845993041992188, "loss": 3.0891, "rewards/accuracies": 0.5, "rewards/chosen": -16.830120086669922, "rewards/margins": 1.0158723592758179, "rewards/rejected": -17.845993041992188, "step": 7845 }, { "epoch": 0.2645859314435943, "grad_norm": 23.490671157836914, "learning_rate": 9.197310609839343e-07, "logits/chosen": -0.8265358209609985, "logits/rejected": -0.6917954683303833, "logps/chosen": -1.649942398071289, "logps/rejected": -1.774391770362854, "loss": 2.2419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.49942398071289, "rewards/margins": 1.2444937229156494, "rewards/rejected": -17.74391746520996, "step": 7850 }, { "epoch": 0.2647544575145775, "grad_norm": 20.921653747558594, "learning_rate": 9.195711502797963e-07, "logits/chosen": -0.2501029074192047, "logits/rejected": -0.3494180142879486, "logps/chosen": -2.1978306770324707, "logps/rejected": -2.097285509109497, "loss": 4.5838, "rewards/accuracies": 0.5, "rewards/chosen": -21.97830581665039, "rewards/margins": -1.0054512023925781, "rewards/rejected": -20.972856521606445, "step": 7855 }, { "epoch": 0.2649229835855607, "grad_norm": 28.807878494262695, "learning_rate": 9.194110943786135e-07, "logits/chosen": -0.7602871656417847, "logits/rejected": -0.9272792935371399, "logps/chosen": -1.496128797531128, "logps/rejected": -1.7382854223251343, "loss": 2.3124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.96129035949707, "rewards/margins": 2.4215664863586426, "rewards/rejected": -17.382854461669922, "step": 7860 }, { "epoch": 0.2650915096565439, "grad_norm": 21.33911895751953, "learning_rate": 9.192508933357752e-07, "logits/chosen": -0.4943917393684387, "logits/rejected": -0.4705166816711426, "logps/chosen": -1.9024235010147095, "logps/rejected": -2.0955557823181152, "loss": 2.4284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.024234771728516, "rewards/margins": 1.9313228130340576, "rewards/rejected": -20.955556869506836, "step": 7865 }, { "epoch": 0.26526003572752704, "grad_norm": 20.33489990234375, "learning_rate": 9.190905472067205e-07, "logits/chosen": -0.8497546911239624, "logits/rejected": -0.8793678283691406, "logps/chosen": -1.8554210662841797, "logps/rejected": -1.9822323322296143, "loss": 2.3817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.554210662841797, "rewards/margins": 1.268110990524292, "rewards/rejected": -19.82232093811035, "step": 7870 }, { "epoch": 0.2654285617985102, "grad_norm": 26.243898391723633, "learning_rate": 9.18930056046939e-07, "logits/chosen": -0.8190891146659851, "logits/rejected": -0.6624525785446167, "logps/chosen": -2.0691378116607666, "logps/rejected": -2.048070192337036, "loss": 3.4637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.69137954711914, "rewards/margins": -0.21067848801612854, "rewards/rejected": -20.480701446533203, "step": 7875 }, { "epoch": 0.26559708786949343, "grad_norm": 19.79037094116211, "learning_rate": 9.187694199119703e-07, "logits/chosen": -0.8644890785217285, "logits/rejected": -0.8535453677177429, "logps/chosen": -1.686231017112732, "logps/rejected": -1.907679557800293, "loss": 2.1817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.862308502197266, "rewards/margins": 2.2144851684570312, "rewards/rejected": -19.076793670654297, "step": 7880 }, { "epoch": 0.2657656139404766, "grad_norm": 20.234148025512695, "learning_rate": 9.186086388574041e-07, "logits/chosen": -0.870225727558136, "logits/rejected": -0.7963360548019409, "logps/chosen": -1.719887375831604, "logps/rejected": -1.9151662588119507, "loss": 2.0666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.19887351989746, "rewards/margins": 1.952789068222046, "rewards/rejected": -19.151662826538086, "step": 7885 }, { "epoch": 0.26593414001145976, "grad_norm": 14.168811798095703, "learning_rate": 9.184477129388807e-07, "logits/chosen": -0.7177601456642151, "logits/rejected": -0.7062331438064575, "logps/chosen": -1.7758731842041016, "logps/rejected": -1.938987374305725, "loss": 2.1635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.758731842041016, "rewards/margins": 1.6311410665512085, "rewards/rejected": -19.389873504638672, "step": 7890 }, { "epoch": 0.26610266608244293, "grad_norm": 14.62055778503418, "learning_rate": 9.182866422120898e-07, "logits/chosen": -0.8328273892402649, "logits/rejected": -1.0201135873794556, "logps/chosen": -1.7414734363555908, "logps/rejected": -1.9945809841156006, "loss": 2.0646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.414735794067383, "rewards/margins": 2.531074047088623, "rewards/rejected": -19.945810317993164, "step": 7895 }, { "epoch": 0.26627119215342615, "grad_norm": 29.412940979003906, "learning_rate": 9.181254267327721e-07, "logits/chosen": -0.6478652358055115, "logits/rejected": -0.6217209100723267, "logps/chosen": -2.1247923374176025, "logps/rejected": -2.0956790447235107, "loss": 3.4741, "rewards/accuracies": 0.5, "rewards/chosen": -21.247920989990234, "rewards/margins": -0.29112958908081055, "rewards/rejected": -20.9567928314209, "step": 7900 }, { "epoch": 0.2664397182244093, "grad_norm": 15.633121490478516, "learning_rate": 9.179640665567175e-07, "logits/chosen": -0.42172449827194214, "logits/rejected": -0.4099927544593811, "logps/chosen": -2.567204236984253, "logps/rejected": -2.4554595947265625, "loss": 4.6659, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.672039031982422, "rewards/margins": -1.117445945739746, "rewards/rejected": -24.554594039916992, "step": 7905 }, { "epoch": 0.2666082442953925, "grad_norm": 37.013118743896484, "learning_rate": 9.178025617397667e-07, "logits/chosen": -0.572817325592041, "logits/rejected": -0.5794605016708374, "logps/chosen": -2.0265133380889893, "logps/rejected": -2.1147801876068115, "loss": 2.3479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.2651309967041, "rewards/margins": 0.8826696276664734, "rewards/rejected": -21.147802352905273, "step": 7910 }, { "epoch": 0.2667767703663757, "grad_norm": 0.1122620701789856, "learning_rate": 9.1764091233781e-07, "logits/chosen": -0.15012314915657043, "logits/rejected": -0.27648302912712097, "logps/chosen": -1.822546362876892, "logps/rejected": -2.272954225540161, "loss": 1.6367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.2254638671875, "rewards/margins": 4.504077911376953, "rewards/rejected": -22.729541778564453, "step": 7915 }, { "epoch": 0.26694529643735887, "grad_norm": 43.50534439086914, "learning_rate": 9.174791184067881e-07, "logits/chosen": -0.7817455530166626, "logits/rejected": -0.9152389764785767, "logps/chosen": -2.034762144088745, "logps/rejected": -2.0368587970733643, "loss": 3.5936, "rewards/accuracies": 0.5, "rewards/chosen": -20.347620010375977, "rewards/margins": 0.020967865362763405, "rewards/rejected": -20.368587493896484, "step": 7920 }, { "epoch": 0.26711382250834204, "grad_norm": 19.68915367126465, "learning_rate": 9.173171800026911e-07, "logits/chosen": -0.7624861001968384, "logits/rejected": -0.6545786261558533, "logps/chosen": -2.075883388519287, "logps/rejected": -2.0948750972747803, "loss": 3.1003, "rewards/accuracies": 0.5, "rewards/chosen": -20.758832931518555, "rewards/margins": 0.18991927802562714, "rewards/rejected": -20.94875144958496, "step": 7925 }, { "epoch": 0.2672823485793252, "grad_norm": 19.582759857177734, "learning_rate": 9.171550971815599e-07, "logits/chosen": -0.6269463300704956, "logits/rejected": -0.5425857305526733, "logps/chosen": -1.6861941814422607, "logps/rejected": -1.9470901489257812, "loss": 2.2631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.861942291259766, "rewards/margins": 2.6089584827423096, "rewards/rejected": -19.470901489257812, "step": 7930 }, { "epoch": 0.2674508746503084, "grad_norm": 67.3069839477539, "learning_rate": 9.169928699994846e-07, "logits/chosen": -0.4286138117313385, "logits/rejected": -0.6139780282974243, "logps/chosen": -1.865778923034668, "logps/rejected": -1.8274714946746826, "loss": 3.497, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.65778923034668, "rewards/margins": -0.38307541608810425, "rewards/rejected": -18.274715423583984, "step": 7935 }, { "epoch": 0.2676194007212916, "grad_norm": 24.0504093170166, "learning_rate": 9.168304985126061e-07, "logits/chosen": -0.5239379405975342, "logits/rejected": -0.7206992506980896, "logps/chosen": -2.006237745285034, "logps/rejected": -2.1122491359710693, "loss": 3.175, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.062381744384766, "rewards/margins": 1.060112714767456, "rewards/rejected": -21.12249183654785, "step": 7940 }, { "epoch": 0.26778792679227476, "grad_norm": 24.669153213500977, "learning_rate": 9.166679827771145e-07, "logits/chosen": -0.5793864130973816, "logits/rejected": -0.6483038663864136, "logps/chosen": -2.009474754333496, "logps/rejected": -1.9184348583221436, "loss": 4.2305, "rewards/accuracies": 0.5, "rewards/chosen": -20.09474754333496, "rewards/margins": -0.9103986024856567, "rewards/rejected": -19.184350967407227, "step": 7945 }, { "epoch": 0.2679564528632579, "grad_norm": 53.88877487182617, "learning_rate": 9.165053228492499e-07, "logits/chosen": -0.7628545165061951, "logits/rejected": -0.7562915086746216, "logps/chosen": -2.2420852184295654, "logps/rejected": -2.3469042778015137, "loss": 2.4504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.42085075378418, "rewards/margins": 1.0481895208358765, "rewards/rejected": -23.46904182434082, "step": 7950 }, { "epoch": 0.26812497893424114, "grad_norm": 15.133487701416016, "learning_rate": 9.163425187853029e-07, "logits/chosen": -0.5153997540473938, "logits/rejected": -0.5927127599716187, "logps/chosen": -1.7608623504638672, "logps/rejected": -1.9573513269424438, "loss": 2.3498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.608623504638672, "rewards/margins": 1.964890718460083, "rewards/rejected": -19.57351303100586, "step": 7955 }, { "epoch": 0.2682935050052243, "grad_norm": 31.434738159179688, "learning_rate": 9.161795706416133e-07, "logits/chosen": -0.4737616181373596, "logits/rejected": -0.5601701140403748, "logps/chosen": -1.8284581899642944, "logps/rejected": -1.917937994003296, "loss": 3.3058, "rewards/accuracies": 0.5, "rewards/chosen": -18.284582138061523, "rewards/margins": 0.894799530506134, "rewards/rejected": -19.179380416870117, "step": 7960 }, { "epoch": 0.2684620310762075, "grad_norm": 21.656169891357422, "learning_rate": 9.160164784745713e-07, "logits/chosen": -1.0133836269378662, "logits/rejected": -0.9854210615158081, "logps/chosen": -1.6828429698944092, "logps/rejected": -1.8976166248321533, "loss": 2.4476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.828426361083984, "rewards/margins": 2.147737503051758, "rewards/rejected": -18.976165771484375, "step": 7965 }, { "epoch": 0.2686305571471907, "grad_norm": 19.28142547607422, "learning_rate": 9.158532423406164e-07, "logits/chosen": -0.8909885287284851, "logits/rejected": -0.9013813138008118, "logps/chosen": -1.8956714868545532, "logps/rejected": -2.063211679458618, "loss": 2.8542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.956714630126953, "rewards/margins": 1.6754035949707031, "rewards/rejected": -20.632118225097656, "step": 7970 }, { "epoch": 0.26879908321817386, "grad_norm": 18.054901123046875, "learning_rate": 9.156898622962383e-07, "logits/chosen": -0.4297906756401062, "logits/rejected": -0.3117820620536804, "logps/chosen": -1.7339808940887451, "logps/rejected": -2.3593122959136963, "loss": 2.0759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.339807510375977, "rewards/margins": 6.253314018249512, "rewards/rejected": -23.593122482299805, "step": 7975 }, { "epoch": 0.26896760928915703, "grad_norm": 16.134092330932617, "learning_rate": 9.155263383979763e-07, "logits/chosen": -1.0646306276321411, "logits/rejected": -1.2803940773010254, "logps/chosen": -1.7984859943389893, "logps/rejected": -1.9528639316558838, "loss": 1.8858, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.984859466552734, "rewards/margins": 1.5437793731689453, "rewards/rejected": -19.528636932373047, "step": 7980 }, { "epoch": 0.2691361353601402, "grad_norm": 12.436636924743652, "learning_rate": 9.153626707024197e-07, "logits/chosen": -0.7420053482055664, "logits/rejected": -0.8438242077827454, "logps/chosen": -1.503025770187378, "logps/rejected": -1.6121635437011719, "loss": 2.7156, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.030258178710938, "rewards/margins": 1.0913773775100708, "rewards/rejected": -16.12163543701172, "step": 7985 }, { "epoch": 0.2693046614311234, "grad_norm": 4.8486328125, "learning_rate": 9.151988592662075e-07, "logits/chosen": -0.7318106293678284, "logits/rejected": -0.8289157152175903, "logps/chosen": -2.7210354804992676, "logps/rejected": -2.8618569374084473, "loss": 2.2072, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.21035385131836, "rewards/margins": 1.4082170724868774, "rewards/rejected": -28.618572235107422, "step": 7990 }, { "epoch": 0.2694731875021066, "grad_norm": 11.79561996459961, "learning_rate": 9.150349041460282e-07, "logits/chosen": -0.6596136093139648, "logits/rejected": -0.6863908767700195, "logps/chosen": -1.6224896907806396, "logps/rejected": -1.7435400485992432, "loss": 3.6967, "rewards/accuracies": 0.5, "rewards/chosen": -16.224897384643555, "rewards/margins": 1.2105019092559814, "rewards/rejected": -17.435400009155273, "step": 7995 }, { "epoch": 0.26964171357308975, "grad_norm": 18.109264373779297, "learning_rate": 9.148708053986203e-07, "logits/chosen": -0.6359624862670898, "logits/rejected": -0.4724443554878235, "logps/chosen": -2.2945563793182373, "logps/rejected": -2.0365214347839355, "loss": 5.7031, "rewards/accuracies": 0.5, "rewards/chosen": -22.945566177368164, "rewards/margins": -2.580352783203125, "rewards/rejected": -20.36521339416504, "step": 8000 }, { "epoch": 0.26964171357308975, "eval_logits/chosen": -0.9466900825500488, "eval_logits/rejected": -0.9982383847236633, "eval_logps/chosen": -1.8017518520355225, "eval_logps/rejected": -1.8493553400039673, "eval_loss": 3.167187452316284, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.01751708984375, "eval_rewards/margins": 0.4760337769985199, "eval_rewards/rejected": -18.493553161621094, "eval_runtime": 12.8998, "eval_samples_per_second": 7.752, "eval_steps_per_second": 1.938, "step": 8000 }, { "epoch": 0.2698102396440729, "grad_norm": 21.768617630004883, "learning_rate": 9.14706563080772e-07, "logits/chosen": -0.5055543184280396, "logits/rejected": -0.4700976014137268, "logps/chosen": -1.7952054738998413, "logps/rejected": -1.823185682296753, "loss": 3.8035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.95205307006836, "rewards/margins": 0.2798027992248535, "rewards/rejected": -18.231857299804688, "step": 8005 }, { "epoch": 0.26997876571505613, "grad_norm": 6.784413814544678, "learning_rate": 9.14542177249321e-07, "logits/chosen": -0.5981645584106445, "logits/rejected": -0.6344643831253052, "logps/chosen": -2.039539098739624, "logps/rejected": -2.182983875274658, "loss": 2.6472, "rewards/accuracies": 0.5, "rewards/chosen": -20.395395278930664, "rewards/margins": 1.4344440698623657, "rewards/rejected": -21.829837799072266, "step": 8010 }, { "epoch": 0.2701472917860393, "grad_norm": 108.63782501220703, "learning_rate": 9.143776479611544e-07, "logits/chosen": -0.3831273913383484, "logits/rejected": -0.5212526917457581, "logps/chosen": -2.725125551223755, "logps/rejected": -2.3707118034362793, "loss": 6.8674, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.25125503540039, "rewards/margins": -3.5441346168518066, "rewards/rejected": -23.70711898803711, "step": 8015 }, { "epoch": 0.27031581785702247, "grad_norm": 69.21826934814453, "learning_rate": 9.142129752732101e-07, "logits/chosen": -0.6095398664474487, "logits/rejected": -0.7510106563568115, "logps/chosen": -1.9469678401947021, "logps/rejected": -2.0188910961151123, "loss": 2.5247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.46967887878418, "rewards/margins": 0.7192336320877075, "rewards/rejected": -20.188913345336914, "step": 8020 }, { "epoch": 0.2704843439280057, "grad_norm": 14.318962097167969, "learning_rate": 9.140481592424742e-07, "logits/chosen": -0.5337401628494263, "logits/rejected": -0.4288802146911621, "logps/chosen": -2.1296792030334473, "logps/rejected": -2.3022007942199707, "loss": 2.3135, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.29679298400879, "rewards/margins": 1.7252171039581299, "rewards/rejected": -23.022008895874023, "step": 8025 }, { "epoch": 0.27065286999898885, "grad_norm": 13.43238639831543, "learning_rate": 9.138831999259833e-07, "logits/chosen": -0.7914996147155762, "logits/rejected": -0.7920703291893005, "logps/chosen": -1.9322484731674194, "logps/rejected": -2.026156425476074, "loss": 2.7437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.322486877441406, "rewards/margins": 0.9390776753425598, "rewards/rejected": -20.26156234741211, "step": 8030 }, { "epoch": 0.270821396069972, "grad_norm": 34.501399993896484, "learning_rate": 9.137180973808233e-07, "logits/chosen": -0.5773485898971558, "logits/rejected": -0.7042641639709473, "logps/chosen": -2.2767586708068848, "logps/rejected": -2.022218942642212, "loss": 5.6654, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.767587661743164, "rewards/margins": -2.545400619506836, "rewards/rejected": -20.222187042236328, "step": 8035 }, { "epoch": 0.2709899221409552, "grad_norm": 44.062347412109375, "learning_rate": 9.135528516641295e-07, "logits/chosen": -0.6731947660446167, "logits/rejected": -0.8957898020744324, "logps/chosen": -1.9581935405731201, "logps/rejected": -2.2106902599334717, "loss": 2.4807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.58193588256836, "rewards/margins": 2.524967908859253, "rewards/rejected": -22.106903076171875, "step": 8040 }, { "epoch": 0.2711584482119384, "grad_norm": 21.43016242980957, "learning_rate": 9.133874628330874e-07, "logits/chosen": -0.6238225102424622, "logits/rejected": -0.711872935295105, "logps/chosen": -2.185377597808838, "logps/rejected": -2.101410388946533, "loss": 4.0274, "rewards/accuracies": 0.5, "rewards/chosen": -21.853775024414062, "rewards/margins": -0.8396707773208618, "rewards/rejected": -21.01410484313965, "step": 8045 }, { "epoch": 0.2713269742829216, "grad_norm": 23.42485809326172, "learning_rate": 9.132219309449307e-07, "logits/chosen": -0.6938878297805786, "logits/rejected": -0.846452534198761, "logps/chosen": -1.8809760808944702, "logps/rejected": -2.4321799278259277, "loss": 1.8875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.80975914001465, "rewards/margins": 5.512040615081787, "rewards/rejected": -24.321800231933594, "step": 8050 }, { "epoch": 0.27149550035390474, "grad_norm": 28.485620498657227, "learning_rate": 9.130562560569444e-07, "logits/chosen": -0.7439653873443604, "logits/rejected": -0.7892839312553406, "logps/chosen": -1.7448148727416992, "logps/rejected": -1.8274883031845093, "loss": 2.3475, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.44814682006836, "rewards/margins": 0.8267344236373901, "rewards/rejected": -18.274883270263672, "step": 8055 }, { "epoch": 0.2716640264248879, "grad_norm": 12.987263679504395, "learning_rate": 9.128904382264615e-07, "logits/chosen": -0.2975326478481293, "logits/rejected": -0.33172541856765747, "logps/chosen": -2.0645947456359863, "logps/rejected": -2.379493474960327, "loss": 1.9203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.645946502685547, "rewards/margins": 3.148991823196411, "rewards/rejected": -23.79493522644043, "step": 8060 }, { "epoch": 0.2718325524958711, "grad_norm": 7.247758388519287, "learning_rate": 9.127244775108652e-07, "logits/chosen": -0.9111455082893372, "logits/rejected": -1.0679799318313599, "logps/chosen": -1.9295141696929932, "logps/rejected": -2.3214521408081055, "loss": 0.8627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.295141220092773, "rewards/margins": 3.9193801879882812, "rewards/rejected": -23.214523315429688, "step": 8065 }, { "epoch": 0.2720010785668543, "grad_norm": 31.42644500732422, "learning_rate": 9.125583739675879e-07, "logits/chosen": -1.1115130186080933, "logits/rejected": -1.1000540256500244, "logps/chosen": -1.689842939376831, "logps/rejected": -1.5708680152893066, "loss": 4.2923, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.8984317779541, "rewards/margins": -1.1897509098052979, "rewards/rejected": -15.708681106567383, "step": 8070 }, { "epoch": 0.27216960463783746, "grad_norm": 32.55793380737305, "learning_rate": 9.123921276541115e-07, "logits/chosen": -0.6790528893470764, "logits/rejected": -0.4730163514614105, "logps/chosen": -2.0601210594177246, "logps/rejected": -2.2196249961853027, "loss": 2.9702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.601207733154297, "rewards/margins": 1.5950422286987305, "rewards/rejected": -22.196250915527344, "step": 8075 }, { "epoch": 0.2723381307088207, "grad_norm": 19.02665138244629, "learning_rate": 9.122257386279675e-07, "logits/chosen": -0.8409290313720703, "logits/rejected": -0.8434259295463562, "logps/chosen": -1.7004365921020508, "logps/rejected": -1.6659228801727295, "loss": 3.484, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.004365921020508, "rewards/margins": -0.34513577818870544, "rewards/rejected": -16.659229278564453, "step": 8080 }, { "epoch": 0.27250665677980385, "grad_norm": 39.294979095458984, "learning_rate": 9.120592069467361e-07, "logits/chosen": -0.4286056458950043, "logits/rejected": -0.5703016519546509, "logps/chosen": -1.7742544412612915, "logps/rejected": -1.78805673122406, "loss": 3.1235, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.74254608154297, "rewards/margins": 0.13802233338356018, "rewards/rejected": -17.88056755065918, "step": 8085 }, { "epoch": 0.272675182850787, "grad_norm": 27.521507263183594, "learning_rate": 9.118925326680479e-07, "logits/chosen": -0.5573610663414001, "logits/rejected": -0.5069311857223511, "logps/chosen": -2.0201663970947266, "logps/rejected": -1.9709510803222656, "loss": 3.749, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.201663970947266, "rewards/margins": -0.4921538233757019, "rewards/rejected": -19.709510803222656, "step": 8090 }, { "epoch": 0.2728437089217702, "grad_norm": 108.85688018798828, "learning_rate": 9.117257158495819e-07, "logits/chosen": -0.11293216049671173, "logits/rejected": -0.2181539237499237, "logps/chosen": -2.149608612060547, "logps/rejected": -2.3219380378723145, "loss": 2.284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.496084213256836, "rewards/margins": 1.7232919931411743, "rewards/rejected": -23.219379425048828, "step": 8095 }, { "epoch": 0.2730122349927534, "grad_norm": 22.91443634033203, "learning_rate": 9.115587565490672e-07, "logits/chosen": -0.36285096406936646, "logits/rejected": -0.35819101333618164, "logps/chosen": -2.12048602104187, "logps/rejected": -2.111513614654541, "loss": 3.5268, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.20486068725586, "rewards/margins": -0.08972187340259552, "rewards/rejected": -21.11513900756836, "step": 8100 }, { "epoch": 0.27318076106373657, "grad_norm": 20.997974395751953, "learning_rate": 9.113916548242815e-07, "logits/chosen": -0.5678842067718506, "logits/rejected": -0.507403552532196, "logps/chosen": -1.812657117843628, "logps/rejected": -2.1203980445861816, "loss": 2.1092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.126571655273438, "rewards/margins": 3.0774083137512207, "rewards/rejected": -21.2039794921875, "step": 8105 }, { "epoch": 0.27334928713471973, "grad_norm": 22.05033302307129, "learning_rate": 9.112244107330523e-07, "logits/chosen": -0.9710659980773926, "logits/rejected": -0.8690118789672852, "logps/chosen": -1.7730506658554077, "logps/rejected": -1.861966848373413, "loss": 3.4844, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.730506896972656, "rewards/margins": 0.8891617655754089, "rewards/rejected": -18.619670867919922, "step": 8110 }, { "epoch": 0.2735178132057029, "grad_norm": 18.96816635131836, "learning_rate": 9.11057024333256e-07, "logits/chosen": -0.9982341527938843, "logits/rejected": -1.0075794458389282, "logps/chosen": -1.9207165241241455, "logps/rejected": -1.8805665969848633, "loss": 3.9492, "rewards/accuracies": 0.5, "rewards/chosen": -19.207164764404297, "rewards/margins": -0.4014988839626312, "rewards/rejected": -18.805665969848633, "step": 8115 }, { "epoch": 0.2736863392766861, "grad_norm": 16.318857192993164, "learning_rate": 9.108894956828187e-07, "logits/chosen": -0.7426223754882812, "logits/rejected": -0.9075484275817871, "logps/chosen": -1.800121545791626, "logps/rejected": -1.9071872234344482, "loss": 2.8309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.0012149810791, "rewards/margins": 1.070657730102539, "rewards/rejected": -19.07187271118164, "step": 8120 }, { "epoch": 0.2738548653476693, "grad_norm": 28.79190444946289, "learning_rate": 9.107218248397153e-07, "logits/chosen": -0.6248952150344849, "logits/rejected": -0.6566001772880554, "logps/chosen": -1.7045423984527588, "logps/rejected": -1.808816909790039, "loss": 2.3869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.045421600341797, "rewards/margins": 1.042743444442749, "rewards/rejected": -18.088167190551758, "step": 8125 }, { "epoch": 0.27402339141865245, "grad_norm": 25.37258529663086, "learning_rate": 9.105540118619701e-07, "logits/chosen": -0.9384697079658508, "logits/rejected": -0.9719937443733215, "logps/chosen": -1.7226083278656006, "logps/rejected": -1.7185615301132202, "loss": 3.3347, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.22608184814453, "rewards/margins": -0.04046592861413956, "rewards/rejected": -17.18561553955078, "step": 8130 }, { "epoch": 0.2741919174896357, "grad_norm": 30.92608070373535, "learning_rate": 9.103860568076566e-07, "logits/chosen": -0.8808475732803345, "logits/rejected": -0.9752508997917175, "logps/chosen": -1.8235105276107788, "logps/rejected": -1.8683273792266846, "loss": 3.0119, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.235103607177734, "rewards/margins": 0.4481666684150696, "rewards/rejected": -18.683271408081055, "step": 8135 }, { "epoch": 0.27436044356061884, "grad_norm": 30.845779418945312, "learning_rate": 9.102179597348974e-07, "logits/chosen": -0.5595682859420776, "logits/rejected": -0.828567385673523, "logps/chosen": -1.7682873010635376, "logps/rejected": -1.7338556051254272, "loss": 3.4405, "rewards/accuracies": 0.5, "rewards/chosen": -17.682872772216797, "rewards/margins": -0.3443172574043274, "rewards/rejected": -17.33855628967285, "step": 8140 }, { "epoch": 0.274528969631602, "grad_norm": 119.611328125, "learning_rate": 9.100497207018643e-07, "logits/chosen": -0.4314725995063782, "logits/rejected": -0.2558334767818451, "logps/chosen": -1.8698337078094482, "logps/rejected": -1.995448112487793, "loss": 2.6223, "rewards/accuracies": 0.5, "rewards/chosen": -18.69833755493164, "rewards/margins": 1.2561436891555786, "rewards/rejected": -19.954483032226562, "step": 8145 }, { "epoch": 0.27469749570258517, "grad_norm": 22.87830924987793, "learning_rate": 9.098813397667782e-07, "logits/chosen": -0.7702856063842773, "logits/rejected": -0.7954924702644348, "logps/chosen": -1.871756911277771, "logps/rejected": -2.1178066730499268, "loss": 2.0098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.71756935119629, "rewards/margins": 2.4604969024658203, "rewards/rejected": -21.17806625366211, "step": 8150 }, { "epoch": 0.2748660217735684, "grad_norm": 21.176847457885742, "learning_rate": 9.097128169879091e-07, "logits/chosen": -0.6883500814437866, "logits/rejected": -0.6603747010231018, "logps/chosen": -1.5661739110946655, "logps/rejected": -1.6865043640136719, "loss": 2.4503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.661738395690918, "rewards/margins": 1.2033051252365112, "rewards/rejected": -16.86504364013672, "step": 8155 }, { "epoch": 0.27503454784455156, "grad_norm": 29.86301612854004, "learning_rate": 9.095441524235761e-07, "logits/chosen": -0.6998110413551331, "logits/rejected": -0.7533560991287231, "logps/chosen": -2.0183091163635254, "logps/rejected": -1.887945532798767, "loss": 4.3634, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -20.183090209960938, "rewards/margins": -1.3036348819732666, "rewards/rejected": -18.879453659057617, "step": 8160 }, { "epoch": 0.2752030739155347, "grad_norm": 27.845413208007812, "learning_rate": 9.093753461321472e-07, "logits/chosen": -0.7750387787818909, "logits/rejected": -0.7488040924072266, "logps/chosen": -2.1097846031188965, "logps/rejected": -2.3540987968444824, "loss": 2.8014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.09784507751465, "rewards/margins": 2.4431440830230713, "rewards/rejected": -23.54098892211914, "step": 8165 }, { "epoch": 0.2753715999865179, "grad_norm": 14.149901390075684, "learning_rate": 9.092063981720398e-07, "logits/chosen": -0.6205824017524719, "logits/rejected": -0.5347954630851746, "logps/chosen": -1.62399423122406, "logps/rejected": -1.7922455072402954, "loss": 2.2484, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.239940643310547, "rewards/margins": 1.682511568069458, "rewards/rejected": -17.922454833984375, "step": 8170 }, { "epoch": 0.2755401260575011, "grad_norm": 22.775360107421875, "learning_rate": 9.090373086017202e-07, "logits/chosen": -0.35927271842956543, "logits/rejected": -0.5841717720031738, "logps/chosen": -2.026437759399414, "logps/rejected": -2.075422763824463, "loss": 2.9589, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.26437759399414, "rewards/margins": 0.4898509979248047, "rewards/rejected": -20.754228591918945, "step": 8175 }, { "epoch": 0.2757086521284843, "grad_norm": 42.60033416748047, "learning_rate": 9.088680774797033e-07, "logits/chosen": -0.6615483164787292, "logits/rejected": -0.6774601340293884, "logps/chosen": -1.9017183780670166, "logps/rejected": -2.0788064002990723, "loss": 1.8902, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.01718521118164, "rewards/margins": 1.7708823680877686, "rewards/rejected": -20.788066864013672, "step": 8180 }, { "epoch": 0.27587717819946744, "grad_norm": 16.84940528869629, "learning_rate": 9.086987048645538e-07, "logits/chosen": -0.7287185788154602, "logits/rejected": -0.7646733522415161, "logps/chosen": -2.029722213745117, "logps/rejected": -2.120607852935791, "loss": 2.6955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.297222137451172, "rewards/margins": 0.908857524394989, "rewards/rejected": -21.206079483032227, "step": 8185 }, { "epoch": 0.27604570427045066, "grad_norm": 78.0438232421875, "learning_rate": 9.085291908148844e-07, "logits/chosen": -0.566503643989563, "logits/rejected": -0.5320937037467957, "logps/chosen": -2.180643081665039, "logps/rejected": -2.3785202503204346, "loss": 2.7018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.806428909301758, "rewards/margins": 1.9787718057632446, "rewards/rejected": -23.785200119018555, "step": 8190 }, { "epoch": 0.27621423034143383, "grad_norm": 40.340904235839844, "learning_rate": 9.083595353893576e-07, "logits/chosen": -0.5297061800956726, "logits/rejected": -0.6772249937057495, "logps/chosen": -1.8808778524398804, "logps/rejected": -2.027780055999756, "loss": 4.0922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.808780670166016, "rewards/margins": 1.4690202474594116, "rewards/rejected": -20.27779769897461, "step": 8195 }, { "epoch": 0.276382756412417, "grad_norm": 16.951213836669922, "learning_rate": 9.081897386466843e-07, "logits/chosen": -0.7171338200569153, "logits/rejected": -0.7746341824531555, "logps/chosen": -1.764413833618164, "logps/rejected": -1.7617387771606445, "loss": 3.2047, "rewards/accuracies": 0.5, "rewards/chosen": -17.64413833618164, "rewards/margins": -0.026749800890684128, "rewards/rejected": -17.617389678955078, "step": 8200 }, { "epoch": 0.27655128248340016, "grad_norm": 43.04359817504883, "learning_rate": 9.080198006456246e-07, "logits/chosen": -0.5838262438774109, "logits/rejected": -0.6805842518806458, "logps/chosen": -1.8306487798690796, "logps/rejected": -1.8248965740203857, "loss": 3.1635, "rewards/accuracies": 0.5, "rewards/chosen": -18.306488037109375, "rewards/margins": -0.057521723210811615, "rewards/rejected": -18.248966217041016, "step": 8205 }, { "epoch": 0.2767198085543834, "grad_norm": 19.069856643676758, "learning_rate": 9.078497214449869e-07, "logits/chosen": -0.8335992693901062, "logits/rejected": -0.9426866769790649, "logps/chosen": -1.739052176475525, "logps/rejected": -1.7812341451644897, "loss": 3.1077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.390522003173828, "rewards/margins": 0.4218207001686096, "rewards/rejected": -17.81234359741211, "step": 8210 }, { "epoch": 0.27688833462536655, "grad_norm": 33.81036376953125, "learning_rate": 9.076795011036296e-07, "logits/chosen": -0.4566499590873718, "logits/rejected": -0.7345161437988281, "logps/chosen": -1.6366084814071655, "logps/rejected": -1.8738971948623657, "loss": 2.7152, "rewards/accuracies": 0.5, "rewards/chosen": -16.366085052490234, "rewards/margins": 2.372885227203369, "rewards/rejected": -18.738971710205078, "step": 8215 }, { "epoch": 0.2770568606963497, "grad_norm": 38.866764068603516, "learning_rate": 9.075091396804587e-07, "logits/chosen": -0.29251110553741455, "logits/rejected": -0.1948651373386383, "logps/chosen": -2.141052722930908, "logps/rejected": -2.3842930793762207, "loss": 2.8919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.410524368286133, "rewards/margins": 2.432407855987549, "rewards/rejected": -23.842931747436523, "step": 8220 }, { "epoch": 0.2772253867673329, "grad_norm": 22.157766342163086, "learning_rate": 9.073386372344299e-07, "logits/chosen": -0.616371750831604, "logits/rejected": -0.7910966277122498, "logps/chosen": -1.8958488702774048, "logps/rejected": -2.1439311504364014, "loss": 1.7345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.95848846435547, "rewards/margins": 2.4808223247528076, "rewards/rejected": -21.439311981201172, "step": 8225 }, { "epoch": 0.2773939128383161, "grad_norm": 49.546382904052734, "learning_rate": 9.071679938245471e-07, "logits/chosen": -0.6186679005622864, "logits/rejected": -0.7885586023330688, "logps/chosen": -2.076205015182495, "logps/rejected": -2.2669005393981934, "loss": 2.3493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.76205062866211, "rewards/margins": 1.9069541692733765, "rewards/rejected": -22.66900634765625, "step": 8230 }, { "epoch": 0.27756243890929927, "grad_norm": 15.524943351745605, "learning_rate": 9.069972095098635e-07, "logits/chosen": -0.5857125520706177, "logits/rejected": -0.4242025911808014, "logps/chosen": -1.6623932123184204, "logps/rejected": -2.17319917678833, "loss": 2.5676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.623931884765625, "rewards/margins": 5.108059406280518, "rewards/rejected": -21.731990814208984, "step": 8235 }, { "epoch": 0.27773096498028244, "grad_norm": 15.611465454101562, "learning_rate": 9.068262843494808e-07, "logits/chosen": -0.5052198767662048, "logits/rejected": -0.632614016532898, "logps/chosen": -1.7403032779693604, "logps/rejected": -1.9178276062011719, "loss": 1.8667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.403034210205078, "rewards/margins": 1.7752418518066406, "rewards/rejected": -19.178274154663086, "step": 8240 }, { "epoch": 0.27789949105126566, "grad_norm": 48.774776458740234, "learning_rate": 9.066552184025493e-07, "logits/chosen": -0.7690288424491882, "logits/rejected": -0.7475972175598145, "logps/chosen": -1.7967151403427124, "logps/rejected": -1.9638046026229858, "loss": 1.8465, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.967151641845703, "rewards/margins": 1.670892357826233, "rewards/rejected": -19.638042449951172, "step": 8245 }, { "epoch": 0.2780680171222488, "grad_norm": 16.428930282592773, "learning_rate": 9.064840117282684e-07, "logits/chosen": 0.04140068218111992, "logits/rejected": 0.00973301101475954, "logps/chosen": -2.419886350631714, "logps/rejected": -3.2392563819885254, "loss": 1.9439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.198863983154297, "rewards/margins": 8.193696975708008, "rewards/rejected": -32.39256286621094, "step": 8250 }, { "epoch": 0.278236543193232, "grad_norm": 19.519775390625, "learning_rate": 9.063126643858859e-07, "logits/chosen": -0.4937060475349426, "logits/rejected": -0.558300793170929, "logps/chosen": -1.9107048511505127, "logps/rejected": -2.1082000732421875, "loss": 1.8862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.107044219970703, "rewards/margins": 1.9749542474746704, "rewards/rejected": -21.082000732421875, "step": 8255 }, { "epoch": 0.27840506926421515, "grad_norm": 48.95004653930664, "learning_rate": 9.061411764346982e-07, "logits/chosen": -0.8905462026596069, "logits/rejected": -0.8621233105659485, "logps/chosen": -1.711071252822876, "logps/rejected": -1.7398639917373657, "loss": 2.9241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.1107120513916, "rewards/margins": 0.2879270613193512, "rewards/rejected": -17.398639678955078, "step": 8260 }, { "epoch": 0.2785735953351984, "grad_norm": 19.999202728271484, "learning_rate": 9.059695479340507e-07, "logits/chosen": -0.5921692252159119, "logits/rejected": -0.7616919279098511, "logps/chosen": -1.5218065977096558, "logps/rejected": -1.737052321434021, "loss": 2.2126, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.218066215515137, "rewards/margins": 2.1524569988250732, "rewards/rejected": -17.370525360107422, "step": 8265 }, { "epoch": 0.27874212140618154, "grad_norm": 17.218103408813477, "learning_rate": 9.057977789433372e-07, "logits/chosen": -0.5395227670669556, "logits/rejected": -0.4856560230255127, "logps/chosen": -2.105602502822876, "logps/rejected": -2.3897926807403564, "loss": 2.2021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.056026458740234, "rewards/margins": 2.841902732849121, "rewards/rejected": -23.89792823791504, "step": 8270 }, { "epoch": 0.2789106474771647, "grad_norm": 16.424165725708008, "learning_rate": 9.056258695220002e-07, "logits/chosen": -0.8902799487113953, "logits/rejected": -0.8830171823501587, "logps/chosen": -1.8373076915740967, "logps/rejected": -2.084036350250244, "loss": 2.2416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.373077392578125, "rewards/margins": 2.4672863483428955, "rewards/rejected": -20.840364456176758, "step": 8275 }, { "epoch": 0.2790791735481479, "grad_norm": 39.2321662902832, "learning_rate": 9.05453819729531e-07, "logits/chosen": -0.8157766461372375, "logits/rejected": -0.8567003011703491, "logps/chosen": -1.738638162612915, "logps/rejected": -1.8060699701309204, "loss": 3.035, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.386381149291992, "rewards/margins": 0.6743199229240417, "rewards/rejected": -18.060701370239258, "step": 8280 }, { "epoch": 0.2792476996191311, "grad_norm": 0.01929272711277008, "learning_rate": 9.052816296254687e-07, "logits/chosen": -0.6749808192253113, "logits/rejected": -0.5758073925971985, "logps/chosen": -1.931583046913147, "logps/rejected": -2.3056092262268066, "loss": 2.0042, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.315832138061523, "rewards/margins": 3.740262985229492, "rewards/rejected": -23.056095123291016, "step": 8285 }, { "epoch": 0.27941622569011426, "grad_norm": 17.415260314941406, "learning_rate": 9.051092992694021e-07, "logits/chosen": -0.6588484048843384, "logits/rejected": -0.6199295520782471, "logps/chosen": -1.7081390619277954, "logps/rejected": -1.96379816532135, "loss": 2.4417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.081390380859375, "rewards/margins": 2.5565922260284424, "rewards/rejected": -19.637985229492188, "step": 8290 }, { "epoch": 0.2795847517610974, "grad_norm": 21.187219619750977, "learning_rate": 9.049368287209675e-07, "logits/chosen": -0.38697052001953125, "logits/rejected": -0.5481768250465393, "logps/chosen": -2.061035633087158, "logps/rejected": -2.188666820526123, "loss": 2.4915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.6103572845459, "rewards/margins": 1.2763102054595947, "rewards/rejected": -21.886669158935547, "step": 8295 }, { "epoch": 0.27975327783208065, "grad_norm": 17.981689453125, "learning_rate": 9.047642180398505e-07, "logits/chosen": -0.8721598386764526, "logits/rejected": -0.9671875238418579, "logps/chosen": -1.6943117380142212, "logps/rejected": -1.8319746255874634, "loss": 2.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.943117141723633, "rewards/margins": 1.376629114151001, "rewards/rejected": -18.319746017456055, "step": 8300 }, { "epoch": 0.2799218039030638, "grad_norm": 19.146921157836914, "learning_rate": 9.045914672857846e-07, "logits/chosen": -0.874261200428009, "logits/rejected": -0.9750015139579773, "logps/chosen": -1.7600091695785522, "logps/rejected": -1.7663971185684204, "loss": 3.6973, "rewards/accuracies": 0.5, "rewards/chosen": -17.6000919342041, "rewards/margins": 0.06387872993946075, "rewards/rejected": -17.663970947265625, "step": 8305 }, { "epoch": 0.280090329974047, "grad_norm": 58.41259002685547, "learning_rate": 9.044185765185521e-07, "logits/chosen": -0.6827090382575989, "logits/rejected": -0.5883959531784058, "logps/chosen": -2.0557332038879395, "logps/rejected": -2.0246777534484863, "loss": 3.4819, "rewards/accuracies": 0.5, "rewards/chosen": -20.557331085205078, "rewards/margins": -0.3105539381504059, "rewards/rejected": -20.246776580810547, "step": 8310 }, { "epoch": 0.28025885604503015, "grad_norm": 30.77134895324707, "learning_rate": 9.042455457979838e-07, "logits/chosen": -1.1021406650543213, "logits/rejected": -0.9775190353393555, "logps/chosen": -1.8120008707046509, "logps/rejected": -1.7186940908432007, "loss": 4.0694, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.12000846862793, "rewards/margins": -0.9330673217773438, "rewards/rejected": -17.186941146850586, "step": 8315 }, { "epoch": 0.28042738211601337, "grad_norm": 21.78514862060547, "learning_rate": 9.040723751839587e-07, "logits/chosen": -0.830939769744873, "logits/rejected": -0.9109653234481812, "logps/chosen": -1.8959213495254517, "logps/rejected": -2.0493202209472656, "loss": 2.7228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.959213256835938, "rewards/margins": 1.5339890718460083, "rewards/rejected": -20.493200302124023, "step": 8320 }, { "epoch": 0.28059590818699653, "grad_norm": 19.907915115356445, "learning_rate": 9.038990647364045e-07, "logits/chosen": -0.5402109622955322, "logits/rejected": -0.5689557790756226, "logps/chosen": -1.5069522857666016, "logps/rejected": -1.602616548538208, "loss": 2.2936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.0695219039917, "rewards/margins": 0.9566418528556824, "rewards/rejected": -16.026165008544922, "step": 8325 }, { "epoch": 0.2807644342579797, "grad_norm": 33.659568786621094, "learning_rate": 9.037256145152969e-07, "logits/chosen": -0.530125617980957, "logits/rejected": -0.769835352897644, "logps/chosen": -2.094918727874756, "logps/rejected": -1.9430668354034424, "loss": 4.7906, "rewards/accuracies": 0.5, "rewards/chosen": -20.949188232421875, "rewards/margins": -1.5185197591781616, "rewards/rejected": -19.430667877197266, "step": 8330 }, { "epoch": 0.28093296032896287, "grad_norm": 42.06294250488281, "learning_rate": 9.035520245806603e-07, "logits/chosen": -0.5174092650413513, "logits/rejected": -0.6184954643249512, "logps/chosen": -2.0975887775421143, "logps/rejected": -2.228912830352783, "loss": 2.7851, "rewards/accuracies": 0.5, "rewards/chosen": -20.975887298583984, "rewards/margins": 1.3132425546646118, "rewards/rejected": -22.28913116455078, "step": 8335 }, { "epoch": 0.2811014863999461, "grad_norm": 23.34773063659668, "learning_rate": 9.033782949925672e-07, "logits/chosen": -0.9857513308525085, "logits/rejected": -1.022640585899353, "logps/chosen": -1.6867862939834595, "logps/rejected": -1.6855357885360718, "loss": 3.1505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.867862701416016, "rewards/margins": -0.012502431869506836, "rewards/rejected": -16.85536003112793, "step": 8340 }, { "epoch": 0.28127001247092925, "grad_norm": 35.15296936035156, "learning_rate": 9.032044258111389e-07, "logits/chosen": -0.4666837751865387, "logits/rejected": -0.5388206243515015, "logps/chosen": -1.867395043373108, "logps/rejected": -1.9092090129852295, "loss": 3.6015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.673952102661133, "rewards/margins": 0.4181400239467621, "rewards/rejected": -19.092092514038086, "step": 8345 }, { "epoch": 0.2814385385419124, "grad_norm": 29.71284294128418, "learning_rate": 9.030304170965442e-07, "logits/chosen": -1.0103025436401367, "logits/rejected": -1.0144057273864746, "logps/chosen": -1.6153638362884521, "logps/rejected": -1.7078787088394165, "loss": 2.2607, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.15363883972168, "rewards/margins": 0.9251474142074585, "rewards/rejected": -17.078784942626953, "step": 8350 }, { "epoch": 0.28160706461289564, "grad_norm": 44.523921966552734, "learning_rate": 9.02856268909001e-07, "logits/chosen": -0.970793604850769, "logits/rejected": -1.0736042261123657, "logps/chosen": -2.353757619857788, "logps/rejected": -2.1389412879943848, "loss": 5.235, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.537578582763672, "rewards/margins": -2.148162603378296, "rewards/rejected": -21.389413833618164, "step": 8355 }, { "epoch": 0.2817755906838788, "grad_norm": 19.944168090820312, "learning_rate": 9.026819813087751e-07, "logits/chosen": -0.6036643385887146, "logits/rejected": -0.5563797354698181, "logps/chosen": -1.8941113948822021, "logps/rejected": -2.0597589015960693, "loss": 3.5323, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.941116333007812, "rewards/margins": 1.6564744710922241, "rewards/rejected": -20.597591400146484, "step": 8360 }, { "epoch": 0.281944116754862, "grad_norm": 31.779321670532227, "learning_rate": 9.025075543561804e-07, "logits/chosen": -0.2636929154396057, "logits/rejected": -0.3036728501319885, "logps/chosen": -2.436537981033325, "logps/rejected": -2.284217119216919, "loss": 5.2804, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.365379333496094, "rewards/margins": -1.5232096910476685, "rewards/rejected": -22.842172622680664, "step": 8365 }, { "epoch": 0.28211264282584514, "grad_norm": 61.62702178955078, "learning_rate": 9.023329881115793e-07, "logits/chosen": -0.6866958737373352, "logits/rejected": -0.7670364379882812, "logps/chosen": -2.037566661834717, "logps/rejected": -2.0160598754882812, "loss": 3.31, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.375667572021484, "rewards/margins": -0.21506690979003906, "rewards/rejected": -20.160600662231445, "step": 8370 }, { "epoch": 0.28228116889682836, "grad_norm": 13.388713836669922, "learning_rate": 9.021582826353824e-07, "logits/chosen": -0.3811209499835968, "logits/rejected": -0.28033262491226196, "logps/chosen": -2.2232699394226074, "logps/rejected": -2.5159478187561035, "loss": 2.3813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.232698440551758, "rewards/margins": 2.9267804622650146, "rewards/rejected": -25.15947914123535, "step": 8375 }, { "epoch": 0.2824496949678115, "grad_norm": 71.75244140625, "learning_rate": 9.019834379880482e-07, "logits/chosen": -0.7619329690933228, "logits/rejected": -0.7269800901412964, "logps/chosen": -2.1262454986572266, "logps/rejected": -1.9746198654174805, "loss": 4.6105, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.262454986572266, "rewards/margins": -1.5162568092346191, "rewards/rejected": -19.746196746826172, "step": 8380 }, { "epoch": 0.2826182210387947, "grad_norm": 47.1468505859375, "learning_rate": 9.018084542300836e-07, "logits/chosen": -0.6928752660751343, "logits/rejected": -0.7898589372634888, "logps/chosen": -1.8955409526824951, "logps/rejected": -1.9372844696044922, "loss": 3.0123, "rewards/accuracies": 0.5, "rewards/chosen": -18.95541000366211, "rewards/margins": 0.4174327850341797, "rewards/rejected": -19.372844696044922, "step": 8385 }, { "epoch": 0.28278674710977786, "grad_norm": 85.1002197265625, "learning_rate": 9.016333314220437e-07, "logits/chosen": -0.5826826691627502, "logits/rejected": -0.6400490999221802, "logps/chosen": -2.334876775741577, "logps/rejected": -2.1809592247009277, "loss": 4.7307, "rewards/accuracies": 0.5, "rewards/chosen": -23.348770141601562, "rewards/margins": -1.5391769409179688, "rewards/rejected": -21.809593200683594, "step": 8390 }, { "epoch": 0.2829552731807611, "grad_norm": 12.693016052246094, "learning_rate": 9.014580696245315e-07, "logits/chosen": -0.6832539439201355, "logits/rejected": -0.8097559809684753, "logps/chosen": -1.5608079433441162, "logps/rejected": -2.0265145301818848, "loss": 2.1457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.60807991027832, "rewards/margins": 4.657065391540527, "rewards/rejected": -20.265146255493164, "step": 8395 }, { "epoch": 0.28312379925174425, "grad_norm": 25.155517578125, "learning_rate": 9.012826688981983e-07, "logits/chosen": -0.7850767970085144, "logits/rejected": -0.847511887550354, "logps/chosen": -1.9985544681549072, "logps/rejected": -2.099179744720459, "loss": 2.6005, "rewards/accuracies": 0.5, "rewards/chosen": -19.985544204711914, "rewards/margins": 1.0062540769577026, "rewards/rejected": -20.991796493530273, "step": 8400 }, { "epoch": 0.28312379925174425, "eval_logits/chosen": -1.0161163806915283, "eval_logits/rejected": -1.0731902122497559, "eval_logps/chosen": -1.8116202354431152, "eval_logps/rejected": -1.8628262281417847, "eval_loss": 3.1475327014923096, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.116201400756836, "eval_rewards/margins": 0.512061357498169, "eval_rewards/rejected": -18.628263473510742, "eval_runtime": 12.9025, "eval_samples_per_second": 7.75, "eval_steps_per_second": 1.938, "step": 8400 }, { "epoch": 0.2832923253227274, "grad_norm": 24.380800247192383, "learning_rate": 9.011071293037431e-07, "logits/chosen": -0.6405395269393921, "logits/rejected": -0.7416598796844482, "logps/chosen": -1.801579236984253, "logps/rejected": -1.8673263788223267, "loss": 2.5371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.015789031982422, "rewards/margins": 0.6574710607528687, "rewards/rejected": -18.673263549804688, "step": 8405 }, { "epoch": 0.28346085139371063, "grad_norm": 42.704307556152344, "learning_rate": 9.009314509019136e-07, "logits/chosen": -0.5548028349876404, "logits/rejected": -0.3818223178386688, "logps/chosen": -2.210094451904297, "logps/rejected": -2.4619388580322266, "loss": 2.1946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.1009464263916, "rewards/margins": 2.5184431076049805, "rewards/rejected": -24.619388580322266, "step": 8410 }, { "epoch": 0.2836293774646938, "grad_norm": 19.076499938964844, "learning_rate": 9.00755633753505e-07, "logits/chosen": -0.3302133083343506, "logits/rejected": -0.5427986979484558, "logps/chosen": -1.8025833368301392, "logps/rejected": -1.8589093685150146, "loss": 2.9957, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.025833129882812, "rewards/margins": 0.5632610321044922, "rewards/rejected": -18.589094161987305, "step": 8415 }, { "epoch": 0.28379790353567697, "grad_norm": 44.3154296875, "learning_rate": 9.005796779193606e-07, "logits/chosen": -0.6408971548080444, "logits/rejected": -0.7122513055801392, "logps/chosen": -1.502657175064087, "logps/rejected": -1.5073373317718506, "loss": 3.3831, "rewards/accuracies": 0.5, "rewards/chosen": -15.026571273803711, "rewards/margins": 0.046801090240478516, "rewards/rejected": -15.073373794555664, "step": 8420 }, { "epoch": 0.28396642960666013, "grad_norm": 26.614643096923828, "learning_rate": 9.004035834603718e-07, "logits/chosen": -0.7186806797981262, "logits/rejected": -0.5692561864852905, "logps/chosen": -1.7262153625488281, "logps/rejected": -1.7688636779785156, "loss": 2.8119, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.26215362548828, "rewards/margins": 0.42648276686668396, "rewards/rejected": -17.688634872436523, "step": 8425 }, { "epoch": 0.28413495567764335, "grad_norm": 21.044157028198242, "learning_rate": 9.002273504374782e-07, "logits/chosen": -0.5849789381027222, "logits/rejected": -0.7305509448051453, "logps/chosen": -1.8201496601104736, "logps/rejected": -1.939295768737793, "loss": 2.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.20149803161621, "rewards/margins": 1.1914616823196411, "rewards/rejected": -19.392959594726562, "step": 8430 }, { "epoch": 0.2843034817486265, "grad_norm": 25.133769989013672, "learning_rate": 9.000509789116671e-07, "logits/chosen": -0.8678895831108093, "logits/rejected": -0.9390038251876831, "logps/chosen": -2.053654909133911, "logps/rejected": -1.8398149013519287, "loss": 5.6367, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.536548614501953, "rewards/margins": -2.138397693634033, "rewards/rejected": -18.398149490356445, "step": 8435 }, { "epoch": 0.2844720078196097, "grad_norm": 17.994701385498047, "learning_rate": 8.998744689439732e-07, "logits/chosen": -0.610783040523529, "logits/rejected": -0.5971266627311707, "logps/chosen": -1.8423837423324585, "logps/rejected": -1.8631229400634766, "loss": 4.3969, "rewards/accuracies": 0.5, "rewards/chosen": -18.423837661743164, "rewards/margins": 0.20739364624023438, "rewards/rejected": -18.631229400634766, "step": 8440 }, { "epoch": 0.28464053389059285, "grad_norm": 18.32319450378418, "learning_rate": 8.996978205954802e-07, "logits/chosen": -0.5855869054794312, "logits/rejected": -0.6118898391723633, "logps/chosen": -1.760204553604126, "logps/rejected": -1.9514306783676147, "loss": 2.2484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.6020450592041, "rewards/margins": 1.9122610092163086, "rewards/rejected": -19.514307022094727, "step": 8445 }, { "epoch": 0.28480905996157607, "grad_norm": 84.37977600097656, "learning_rate": 8.995210339273192e-07, "logits/chosen": -0.46147075295448303, "logits/rejected": -0.48088377714157104, "logps/chosen": -2.0352015495300293, "logps/rejected": -2.0512282848358154, "loss": 3.5981, "rewards/accuracies": 0.5, "rewards/chosen": -20.35201644897461, "rewards/margins": 0.16026830673217773, "rewards/rejected": -20.512283325195312, "step": 8450 }, { "epoch": 0.28497758603255924, "grad_norm": 18.52257537841797, "learning_rate": 8.993441090006684e-07, "logits/chosen": -0.5983000993728638, "logits/rejected": -0.6808794736862183, "logps/chosen": -2.0714492797851562, "logps/rejected": -2.1377134323120117, "loss": 2.9054, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.714492797851562, "rewards/margins": 0.6626425981521606, "rewards/rejected": -21.377134323120117, "step": 8455 }, { "epoch": 0.2851461121035424, "grad_norm": 37.097259521484375, "learning_rate": 8.991670458767553e-07, "logits/chosen": -1.0555012226104736, "logits/rejected": -0.8454039692878723, "logps/chosen": -2.041461944580078, "logps/rejected": -1.99785578250885, "loss": 3.5113, "rewards/accuracies": 0.5, "rewards/chosen": -20.41461944580078, "rewards/margins": -0.43606311082839966, "rewards/rejected": -19.97855567932129, "step": 8460 }, { "epoch": 0.2853146381745256, "grad_norm": 102.19123077392578, "learning_rate": 8.989898446168541e-07, "logits/chosen": -0.4955880045890808, "logits/rejected": -0.45627278089523315, "logps/chosen": -1.9363250732421875, "logps/rejected": -2.056997776031494, "loss": 2.476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.363250732421875, "rewards/margins": 1.206725835800171, "rewards/rejected": -20.569976806640625, "step": 8465 }, { "epoch": 0.2854831642455088, "grad_norm": 30.65717315673828, "learning_rate": 8.988125052822872e-07, "logits/chosen": -0.3843476176261902, "logits/rejected": -0.24830541014671326, "logps/chosen": -1.781465768814087, "logps/rejected": -1.8322114944458008, "loss": 2.7073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.81465721130371, "rewards/margins": 0.5074566602706909, "rewards/rejected": -18.322113037109375, "step": 8470 }, { "epoch": 0.28565169031649196, "grad_norm": 14.486468315124512, "learning_rate": 8.98635027934425e-07, "logits/chosen": -0.3739572763442993, "logits/rejected": -0.5670051574707031, "logps/chosen": -2.1897037029266357, "logps/rejected": -2.430849552154541, "loss": 3.5944, "rewards/accuracies": 0.5, "rewards/chosen": -21.897037506103516, "rewards/margins": 2.411459445953369, "rewards/rejected": -24.308496475219727, "step": 8475 }, { "epoch": 0.2858202163874751, "grad_norm": 10.307511329650879, "learning_rate": 8.984574126346851e-07, "logits/chosen": -0.6641756296157837, "logits/rejected": -0.633230984210968, "logps/chosen": -1.8256546258926392, "logps/rejected": -2.082393169403076, "loss": 1.5023, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.25654411315918, "rewards/margins": 2.567385196685791, "rewards/rejected": -20.823930740356445, "step": 8480 }, { "epoch": 0.28598874245845834, "grad_norm": 15.988293647766113, "learning_rate": 8.982796594445332e-07, "logits/chosen": -0.8362739682197571, "logits/rejected": -0.9717584848403931, "logps/chosen": -1.600237250328064, "logps/rejected": -1.7573835849761963, "loss": 2.2579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.00237464904785, "rewards/margins": 1.5714609622955322, "rewards/rejected": -17.573835372924805, "step": 8485 }, { "epoch": 0.2861572685294415, "grad_norm": 33.097137451171875, "learning_rate": 8.981017684254828e-07, "logits/chosen": -0.42199596762657166, "logits/rejected": -0.43986397981643677, "logps/chosen": -1.9197683334350586, "logps/rejected": -1.9568519592285156, "loss": 3.3897, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.197681427001953, "rewards/margins": 0.3708377778530121, "rewards/rejected": -19.56852149963379, "step": 8490 }, { "epoch": 0.2863257946004247, "grad_norm": 22.73672866821289, "learning_rate": 8.979237396390951e-07, "logits/chosen": -1.106762409210205, "logits/rejected": -0.9645527601242065, "logps/chosen": -1.9312461614608765, "logps/rejected": -1.8773243427276611, "loss": 3.5936, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.312461853027344, "rewards/margins": -0.5392182469367981, "rewards/rejected": -18.773242950439453, "step": 8495 }, { "epoch": 0.28649432067140784, "grad_norm": 33.734256744384766, "learning_rate": 8.977455731469786e-07, "logits/chosen": -0.6991892457008362, "logits/rejected": -0.5479412078857422, "logps/chosen": -1.7963107824325562, "logps/rejected": -1.7532427310943604, "loss": 3.5209, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.963109970092773, "rewards/margins": -0.4306826591491699, "rewards/rejected": -17.532426834106445, "step": 8500 }, { "epoch": 0.28666284674239106, "grad_norm": 23.050457000732422, "learning_rate": 8.975672690107896e-07, "logits/chosen": -0.6764280200004578, "logits/rejected": -0.5734367370605469, "logps/chosen": -2.08373761177063, "logps/rejected": -2.144780397415161, "loss": 2.9743, "rewards/accuracies": 0.5, "rewards/chosen": -20.83737564086914, "rewards/margins": 0.6104259490966797, "rewards/rejected": -21.447803497314453, "step": 8505 }, { "epoch": 0.28683137281337423, "grad_norm": 30.37391471862793, "learning_rate": 8.973888272922325e-07, "logits/chosen": -0.7625263929367065, "logits/rejected": -0.8862202763557434, "logps/chosen": -2.0527968406677246, "logps/rejected": -2.016758441925049, "loss": 4.0696, "rewards/accuracies": 0.5, "rewards/chosen": -20.527965545654297, "rewards/margins": -0.3603835999965668, "rewards/rejected": -20.16758155822754, "step": 8510 }, { "epoch": 0.2869998988843574, "grad_norm": 26.156654357910156, "learning_rate": 8.972102480530586e-07, "logits/chosen": -0.8353249430656433, "logits/rejected": -0.942841649055481, "logps/chosen": -2.201547861099243, "logps/rejected": -2.3174633979797363, "loss": 3.7065, "rewards/accuracies": 0.5, "rewards/chosen": -22.015478134155273, "rewards/margins": 1.159156322479248, "rewards/rejected": -23.174633026123047, "step": 8515 }, { "epoch": 0.2871684249553406, "grad_norm": 13.300694465637207, "learning_rate": 8.970315313550676e-07, "logits/chosen": -1.0247485637664795, "logits/rejected": -0.9068101048469543, "logps/chosen": -1.7162628173828125, "logps/rejected": -1.7405004501342773, "loss": 3.0056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.162628173828125, "rewards/margins": 0.24237728118896484, "rewards/rejected": -17.405006408691406, "step": 8520 }, { "epoch": 0.2873369510263238, "grad_norm": 61.11632537841797, "learning_rate": 8.968526772601057e-07, "logits/chosen": -0.758571445941925, "logits/rejected": -0.6705536246299744, "logps/chosen": -2.1682193279266357, "logps/rejected": -2.228081703186035, "loss": 2.8205, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.682193756103516, "rewards/margins": 0.5986258387565613, "rewards/rejected": -22.280818939208984, "step": 8525 }, { "epoch": 0.28750547709730695, "grad_norm": 23.13623046875, "learning_rate": 8.966736858300678e-07, "logits/chosen": -0.735704779624939, "logits/rejected": -0.6865785717964172, "logps/chosen": -1.7772157192230225, "logps/rejected": -1.7628653049468994, "loss": 3.7543, "rewards/accuracies": 0.5, "rewards/chosen": -17.77215576171875, "rewards/margins": -0.1435052901506424, "rewards/rejected": -17.628652572631836, "step": 8530 }, { "epoch": 0.2876740031682901, "grad_norm": 18.340370178222656, "learning_rate": 8.964945571268953e-07, "logits/chosen": -0.6413258910179138, "logits/rejected": -0.5849345922470093, "logps/chosen": -1.7927738428115845, "logps/rejected": -2.061357021331787, "loss": 2.2013, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.927738189697266, "rewards/margins": 2.6858315467834473, "rewards/rejected": -20.613569259643555, "step": 8535 }, { "epoch": 0.28784252923927334, "grad_norm": 17.296632766723633, "learning_rate": 8.963152912125783e-07, "logits/chosen": -0.7454960942268372, "logits/rejected": -0.6678019762039185, "logps/chosen": -2.1356372833251953, "logps/rejected": -2.2526679039001465, "loss": 2.5295, "rewards/accuracies": 0.5, "rewards/chosen": -21.356372833251953, "rewards/margins": 1.1703062057495117, "rewards/rejected": -22.52667808532715, "step": 8540 }, { "epoch": 0.2880110553102565, "grad_norm": 50.47782516479492, "learning_rate": 8.961358881491528e-07, "logits/chosen": -0.777079164981842, "logits/rejected": -0.8767390251159668, "logps/chosen": -1.8275363445281982, "logps/rejected": -1.8506664037704468, "loss": 3.4138, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.27536392211914, "rewards/margins": 0.2313002645969391, "rewards/rejected": -18.506664276123047, "step": 8545 }, { "epoch": 0.28817958138123967, "grad_norm": 35.43614959716797, "learning_rate": 8.959563479987035e-07, "logits/chosen": -0.8436108827590942, "logits/rejected": -0.870860755443573, "logps/chosen": -1.8714988231658936, "logps/rejected": -1.936994194984436, "loss": 2.837, "rewards/accuracies": 0.5, "rewards/chosen": -18.714990615844727, "rewards/margins": 0.6549515724182129, "rewards/rejected": -19.36993980407715, "step": 8550 }, { "epoch": 0.28834810745222283, "grad_norm": 17.51824188232422, "learning_rate": 8.957766708233625e-07, "logits/chosen": -1.1280739307403564, "logits/rejected": -1.240715742111206, "logps/chosen": -1.6478168964385986, "logps/rejected": -1.7780081033706665, "loss": 2.7749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.478168487548828, "rewards/margins": 1.3019111156463623, "rewards/rejected": -17.780078887939453, "step": 8555 }, { "epoch": 0.28851663352320606, "grad_norm": 18.251188278198242, "learning_rate": 8.955968566853086e-07, "logits/chosen": -0.8560983538627625, "logits/rejected": -0.7794613242149353, "logps/chosen": -1.753149390220642, "logps/rejected": -1.9621002674102783, "loss": 2.089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.531494140625, "rewards/margins": 2.089508056640625, "rewards/rejected": -19.621002197265625, "step": 8560 }, { "epoch": 0.2886851595941892, "grad_norm": 32.65653610229492, "learning_rate": 8.954169056467684e-07, "logits/chosen": -0.7626281380653381, "logits/rejected": -0.7990679740905762, "logps/chosen": -1.5963528156280518, "logps/rejected": -1.5640228986740112, "loss": 3.388, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.963528633117676, "rewards/margins": -0.3232990801334381, "rewards/rejected": -15.640230178833008, "step": 8565 }, { "epoch": 0.2888536856651724, "grad_norm": 7.690257549285889, "learning_rate": 8.95236817770016e-07, "logits/chosen": -0.9705495834350586, "logits/rejected": -0.8679699897766113, "logps/chosen": -1.9340327978134155, "logps/rejected": -2.223405122756958, "loss": 1.6523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.340328216552734, "rewards/margins": 2.8937225341796875, "rewards/rejected": -22.234050750732422, "step": 8570 }, { "epoch": 0.2890222117361556, "grad_norm": 33.865028381347656, "learning_rate": 8.950565931173728e-07, "logits/chosen": -0.6105413436889648, "logits/rejected": -0.6946656107902527, "logps/chosen": -2.086103916168213, "logps/rejected": -2.1015257835388184, "loss": 3.2424, "rewards/accuracies": 0.5, "rewards/chosen": -20.861034393310547, "rewards/margins": 0.1542188674211502, "rewards/rejected": -21.015254974365234, "step": 8575 }, { "epoch": 0.2891907378071388, "grad_norm": 15.458596229553223, "learning_rate": 8.948762317512073e-07, "logits/chosen": -0.504424512386322, "logits/rejected": -0.6665661931037903, "logps/chosen": -1.8025707006454468, "logps/rejected": -1.9153648614883423, "loss": 2.3811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.025705337524414, "rewards/margins": 1.1279430389404297, "rewards/rejected": -19.15364646911621, "step": 8580 }, { "epoch": 0.28935926387812194, "grad_norm": 21.86128044128418, "learning_rate": 8.946957337339354e-07, "logits/chosen": -0.5030733346939087, "logits/rejected": -0.4260841906070709, "logps/chosen": -1.9623653888702393, "logps/rejected": -2.001736879348755, "loss": 3.0308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.623653411865234, "rewards/margins": 0.39371395111083984, "rewards/rejected": -20.01736831665039, "step": 8585 }, { "epoch": 0.2895277899491051, "grad_norm": 30.289661407470703, "learning_rate": 8.945150991280205e-07, "logits/chosen": -0.7599642872810364, "logits/rejected": -0.7587991952896118, "logps/chosen": -1.845049262046814, "logps/rejected": -1.9118587970733643, "loss": 2.5643, "rewards/accuracies": 0.5, "rewards/chosen": -18.45049285888672, "rewards/margins": 0.6680957078933716, "rewards/rejected": -19.118587493896484, "step": 8590 }, { "epoch": 0.28969631602008833, "grad_norm": 34.044654846191406, "learning_rate": 8.94334327995973e-07, "logits/chosen": -0.6147949695587158, "logits/rejected": -0.6777374744415283, "logps/chosen": -1.9771358966827393, "logps/rejected": -2.1955230236053467, "loss": 1.9538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.7713565826416, "rewards/margins": 2.1838738918304443, "rewards/rejected": -21.955232620239258, "step": 8595 }, { "epoch": 0.2898648420910715, "grad_norm": 25.871217727661133, "learning_rate": 8.941534204003509e-07, "logits/chosen": -0.9726818799972534, "logits/rejected": -1.0019123554229736, "logps/chosen": -1.7473583221435547, "logps/rejected": -1.9080984592437744, "loss": 2.3828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.473581314086914, "rewards/margins": 1.6074016094207764, "rewards/rejected": -19.080984115600586, "step": 8600 }, { "epoch": 0.29003336816205466, "grad_norm": 25.060718536376953, "learning_rate": 8.939723764037588e-07, "logits/chosen": -0.9149066805839539, "logits/rejected": -0.8806743621826172, "logps/chosen": -2.042038679122925, "logps/rejected": -2.366466999053955, "loss": 2.2901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.420385360717773, "rewards/margins": 3.24428129196167, "rewards/rejected": -23.6646671295166, "step": 8605 }, { "epoch": 0.2902018942330378, "grad_norm": 87.30838012695312, "learning_rate": 8.937911960688493e-07, "logits/chosen": -0.8675652742385864, "logits/rejected": -1.1229223012924194, "logps/chosen": -2.310178279876709, "logps/rejected": -2.154825448989868, "loss": 4.7692, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.101781845092773, "rewards/margins": -1.55352783203125, "rewards/rejected": -21.548254013061523, "step": 8610 }, { "epoch": 0.29037042030402105, "grad_norm": 18.950347900390625, "learning_rate": 8.936098794583215e-07, "logits/chosen": -0.554070770740509, "logits/rejected": -0.8680068850517273, "logps/chosen": -1.8547636270523071, "logps/rejected": -2.017946720123291, "loss": 2.1449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.54763412475586, "rewards/margins": 1.631829857826233, "rewards/rejected": -20.17946434020996, "step": 8615 }, { "epoch": 0.2905389463750042, "grad_norm": 20.925893783569336, "learning_rate": 8.934284266349221e-07, "logits/chosen": -0.45976686477661133, "logits/rejected": -0.5486811399459839, "logps/chosen": -1.9799697399139404, "logps/rejected": -2.120398998260498, "loss": 2.1415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.799697875976562, "rewards/margins": 1.4042949676513672, "rewards/rejected": -21.20399284362793, "step": 8620 }, { "epoch": 0.2907074724459874, "grad_norm": 24.982271194458008, "learning_rate": 8.932468376614446e-07, "logits/chosen": -1.134254813194275, "logits/rejected": -1.2710667848587036, "logps/chosen": -1.7824151515960693, "logps/rejected": -2.1663620471954346, "loss": 2.336, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.82415199279785, "rewards/margins": 3.839470386505127, "rewards/rejected": -21.663619995117188, "step": 8625 }, { "epoch": 0.2908759985169706, "grad_norm": 50.53211975097656, "learning_rate": 8.9306511260073e-07, "logits/chosen": -0.9046827554702759, "logits/rejected": -1.054692268371582, "logps/chosen": -1.7211036682128906, "logps/rejected": -1.7873337268829346, "loss": 2.6263, "rewards/accuracies": 0.5, "rewards/chosen": -17.211036682128906, "rewards/margins": 0.6622999906539917, "rewards/rejected": -17.873336791992188, "step": 8630 }, { "epoch": 0.29104452458795377, "grad_norm": 49.24147033691406, "learning_rate": 8.92883251515666e-07, "logits/chosen": -0.8043139576911926, "logits/rejected": -0.834795355796814, "logps/chosen": -1.8925583362579346, "logps/rejected": -1.9303007125854492, "loss": 2.8508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.925580978393555, "rewards/margins": 0.37742501497268677, "rewards/rejected": -19.30300521850586, "step": 8635 }, { "epoch": 0.29121305065893693, "grad_norm": 8.035228729248047, "learning_rate": 8.927012544691877e-07, "logits/chosen": -0.7065514326095581, "logits/rejected": -0.7449557781219482, "logps/chosen": -1.6677768230438232, "logps/rejected": -2.148942470550537, "loss": 1.9738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.67776870727539, "rewards/margins": 4.811653137207031, "rewards/rejected": -21.489421844482422, "step": 8640 }, { "epoch": 0.2913815767299201, "grad_norm": 19.720861434936523, "learning_rate": 8.925191215242769e-07, "logits/chosen": -0.6974073648452759, "logits/rejected": -0.8620367050170898, "logps/chosen": -1.8329426050186157, "logps/rejected": -2.1030852794647217, "loss": 1.5808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.329425811767578, "rewards/margins": 2.701427936553955, "rewards/rejected": -21.030853271484375, "step": 8645 }, { "epoch": 0.2915501028009033, "grad_norm": 96.2696533203125, "learning_rate": 8.92336852743963e-07, "logits/chosen": -0.535683274269104, "logits/rejected": -0.6992945671081543, "logps/chosen": -2.1274380683898926, "logps/rejected": -2.254375696182251, "loss": 3.5109, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.27437973022461, "rewards/margins": 1.2693758010864258, "rewards/rejected": -22.54375648498535, "step": 8650 }, { "epoch": 0.2917186288718865, "grad_norm": 21.258485794067383, "learning_rate": 8.921544481913217e-07, "logits/chosen": -0.9834004640579224, "logits/rejected": -1.0161277055740356, "logps/chosen": -1.942801833152771, "logps/rejected": -1.9483106136322021, "loss": 4.0447, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.42801856994629, "rewards/margins": 0.05508852005004883, "rewards/rejected": -19.483104705810547, "step": 8655 }, { "epoch": 0.29188715494286965, "grad_norm": 40.73654556274414, "learning_rate": 8.919719079294761e-07, "logits/chosen": -0.8954814672470093, "logits/rejected": -0.9347285032272339, "logps/chosen": -1.9151986837387085, "logps/rejected": -2.066523313522339, "loss": 2.6765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.151987075805664, "rewards/margins": 1.5132462978363037, "rewards/rejected": -20.665233612060547, "step": 8660 }, { "epoch": 0.2920556810138528, "grad_norm": 28.10308265686035, "learning_rate": 8.917892320215963e-07, "logits/chosen": -0.5667335987091064, "logits/rejected": -0.6796129941940308, "logps/chosen": -2.036616563796997, "logps/rejected": -2.078256607055664, "loss": 3.0412, "rewards/accuracies": 0.5, "rewards/chosen": -20.36616325378418, "rewards/margins": 0.41640299558639526, "rewards/rejected": -20.782567977905273, "step": 8665 }, { "epoch": 0.29222420708483604, "grad_norm": 11.453448295593262, "learning_rate": 8.91606420530899e-07, "logits/chosen": -0.4996485710144043, "logits/rejected": -0.6699423789978027, "logps/chosen": -2.383340358734131, "logps/rejected": -2.6928350925445557, "loss": 1.4457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.833402633666992, "rewards/margins": 3.0949490070343018, "rewards/rejected": -26.9283504486084, "step": 8670 }, { "epoch": 0.2923927331558192, "grad_norm": 21.407791137695312, "learning_rate": 8.914234735206485e-07, "logits/chosen": -1.0669759511947632, "logits/rejected": -1.0711863040924072, "logps/chosen": -1.4989492893218994, "logps/rejected": -1.5646870136260986, "loss": 2.4623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.98949146270752, "rewards/margins": 0.6573789715766907, "rewards/rejected": -15.646868705749512, "step": 8675 }, { "epoch": 0.2925612592268024, "grad_norm": 40.112220764160156, "learning_rate": 8.912403910541552e-07, "logits/chosen": -1.0745022296905518, "logits/rejected": -0.9828779101371765, "logps/chosen": -1.8560600280761719, "logps/rejected": -1.7852586507797241, "loss": 3.7889, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.56060028076172, "rewards/margins": -0.7080144882202148, "rewards/rejected": -17.85258674621582, "step": 8680 }, { "epoch": 0.2927297852977856, "grad_norm": 21.407392501831055, "learning_rate": 8.910571731947767e-07, "logits/chosen": -0.4315149784088135, "logits/rejected": -0.5765363574028015, "logps/chosen": -1.6076252460479736, "logps/rejected": -1.6742461919784546, "loss": 3.3513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.07625389099121, "rewards/margins": 0.6662089228630066, "rewards/rejected": -16.742462158203125, "step": 8685 }, { "epoch": 0.29289831136876876, "grad_norm": 24.01103973388672, "learning_rate": 8.908738200059178e-07, "logits/chosen": -1.1414722204208374, "logits/rejected": -1.230464220046997, "logps/chosen": -1.833152413368225, "logps/rejected": -1.9002851247787476, "loss": 2.791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.331523895263672, "rewards/margins": 0.6713264584541321, "rewards/rejected": -19.002851486206055, "step": 8690 }, { "epoch": 0.2930668374397519, "grad_norm": 22.16900634765625, "learning_rate": 8.906903315510294e-07, "logits/chosen": -0.7110240459442139, "logits/rejected": -0.8365713953971863, "logps/chosen": -1.6791126728057861, "logps/rejected": -1.6718724966049194, "loss": 3.5858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.791126251220703, "rewards/margins": -0.0724002867937088, "rewards/rejected": -16.71872329711914, "step": 8695 }, { "epoch": 0.2932353635107351, "grad_norm": 40.03135681152344, "learning_rate": 8.9050670789361e-07, "logits/chosen": -0.6402872204780579, "logits/rejected": -0.814786434173584, "logps/chosen": -1.8824033737182617, "logps/rejected": -1.8985248804092407, "loss": 3.2084, "rewards/accuracies": 0.5, "rewards/chosen": -18.824033737182617, "rewards/margins": 0.1612163484096527, "rewards/rejected": -18.985248565673828, "step": 8700 }, { "epoch": 0.2934038895817183, "grad_norm": 19.727540969848633, "learning_rate": 8.903229490972042e-07, "logits/chosen": -0.3936053514480591, "logits/rejected": -0.47466689348220825, "logps/chosen": -2.7040302753448486, "logps/rejected": -2.9196763038635254, "loss": 2.9614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.040302276611328, "rewards/margins": 2.156456708908081, "rewards/rejected": -29.196758270263672, "step": 8705 }, { "epoch": 0.2935724156527015, "grad_norm": 35.2152099609375, "learning_rate": 8.90139055225404e-07, "logits/chosen": -0.3665235936641693, "logits/rejected": -0.44224101305007935, "logps/chosen": -2.0081772804260254, "logps/rejected": -2.328625202178955, "loss": 2.1435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.081769943237305, "rewards/margins": 3.2044830322265625, "rewards/rejected": -23.286252975463867, "step": 8710 }, { "epoch": 0.29374094172368465, "grad_norm": 26.51418685913086, "learning_rate": 8.899550263418475e-07, "logits/chosen": -0.670992374420166, "logits/rejected": -0.7386754155158997, "logps/chosen": -1.8449127674102783, "logps/rejected": -1.9972244501113892, "loss": 1.7944, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.449127197265625, "rewards/margins": 1.5231168270111084, "rewards/rejected": -19.97224235534668, "step": 8715 }, { "epoch": 0.2939094677946678, "grad_norm": 22.333667755126953, "learning_rate": 8.8977086251022e-07, "logits/chosen": -0.81922447681427, "logits/rejected": -0.7545638084411621, "logps/chosen": -2.3857390880584717, "logps/rejected": -2.6660449504852295, "loss": 3.0081, "rewards/accuracies": 0.5, "rewards/chosen": -23.857393264770508, "rewards/margins": 2.803058385848999, "rewards/rejected": -26.660449981689453, "step": 8720 }, { "epoch": 0.29407799386565103, "grad_norm": 18.825889587402344, "learning_rate": 8.895865637942535e-07, "logits/chosen": -1.0190632343292236, "logits/rejected": -1.0829589366912842, "logps/chosen": -1.6110626459121704, "logps/rejected": -1.6284345388412476, "loss": 3.3191, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.110624313354492, "rewards/margins": 0.1737208366394043, "rewards/rejected": -16.284343719482422, "step": 8725 }, { "epoch": 0.2942465199366342, "grad_norm": 49.59651184082031, "learning_rate": 8.894021302577263e-07, "logits/chosen": -0.7957747578620911, "logits/rejected": -0.8983270525932312, "logps/chosen": -2.2004895210266113, "logps/rejected": -2.2305634021759033, "loss": 3.6281, "rewards/accuracies": 0.5, "rewards/chosen": -22.00489616394043, "rewards/margins": 0.30073651671409607, "rewards/rejected": -22.305633544921875, "step": 8730 }, { "epoch": 0.29441504600761736, "grad_norm": 62.021484375, "learning_rate": 8.892175619644635e-07, "logits/chosen": -0.9021312594413757, "logits/rejected": -0.9147001504898071, "logps/chosen": -2.173342227935791, "logps/rejected": -2.4811558723449707, "loss": 2.1825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.733421325683594, "rewards/margins": 3.0781381130218506, "rewards/rejected": -24.811559677124023, "step": 8735 }, { "epoch": 0.2945835720786006, "grad_norm": 11.684818267822266, "learning_rate": 8.890328589783373e-07, "logits/chosen": -1.1126207113265991, "logits/rejected": -0.9916723966598511, "logps/chosen": -2.0568923950195312, "logps/rejected": -3.0319085121154785, "loss": 1.1855, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.568927764892578, "rewards/margins": 9.750160217285156, "rewards/rejected": -30.3190860748291, "step": 8740 }, { "epoch": 0.29475209814958375, "grad_norm": 17.79829216003418, "learning_rate": 8.888480213632656e-07, "logits/chosen": -1.0284509658813477, "logits/rejected": -0.9819244146347046, "logps/chosen": -1.696070671081543, "logps/rejected": -1.8273723125457764, "loss": 2.7422, "rewards/accuracies": 0.5, "rewards/chosen": -16.960704803466797, "rewards/margins": 1.3130178451538086, "rewards/rejected": -18.273723602294922, "step": 8745 }, { "epoch": 0.2949206242205669, "grad_norm": 20.419384002685547, "learning_rate": 8.88663049183214e-07, "logits/chosen": -0.7981722950935364, "logits/rejected": -0.8218636512756348, "logps/chosen": -2.0747475624084473, "logps/rejected": -1.9626662731170654, "loss": 4.1684, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.74747657775879, "rewards/margins": -1.1208148002624512, "rewards/rejected": -19.62666130065918, "step": 8750 }, { "epoch": 0.2950891502915501, "grad_norm": 23.614099502563477, "learning_rate": 8.884779425021936e-07, "logits/chosen": -0.9454744458198547, "logits/rejected": -1.0266331434249878, "logps/chosen": -1.5546374320983887, "logps/rejected": -1.7335621118545532, "loss": 1.8324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.546374320983887, "rewards/margins": 1.7892467975616455, "rewards/rejected": -17.335620880126953, "step": 8755 }, { "epoch": 0.2952576763625333, "grad_norm": 30.09710121154785, "learning_rate": 8.882927013842628e-07, "logits/chosen": -1.1429169178009033, "logits/rejected": -1.2618019580841064, "logps/chosen": -1.8678550720214844, "logps/rejected": -2.0443532466888428, "loss": 2.179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.678550720214844, "rewards/margins": 1.7649834156036377, "rewards/rejected": -20.44353485107422, "step": 8760 }, { "epoch": 0.29542620243351647, "grad_norm": 23.52537727355957, "learning_rate": 8.881073258935262e-07, "logits/chosen": -0.8685327768325806, "logits/rejected": -0.7556779384613037, "logps/chosen": -1.8107166290283203, "logps/rejected": -1.8469966650009155, "loss": 2.8891, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.107166290283203, "rewards/margins": 0.362802118062973, "rewards/rejected": -18.469966888427734, "step": 8765 }, { "epoch": 0.29559472850449964, "grad_norm": 23.21879005432129, "learning_rate": 8.879218160941348e-07, "logits/chosen": -1.1114360094070435, "logits/rejected": -1.1394498348236084, "logps/chosen": -1.627173662185669, "logps/rejected": -1.7318960428237915, "loss": 2.4583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.27173614501953, "rewards/margins": 1.0472228527069092, "rewards/rejected": -17.318958282470703, "step": 8770 }, { "epoch": 0.2957632545754828, "grad_norm": 17.88161849975586, "learning_rate": 8.877361720502865e-07, "logits/chosen": -0.787623941898346, "logits/rejected": -0.8592535257339478, "logps/chosen": -1.8772789239883423, "logps/rejected": -2.0536484718322754, "loss": 2.2936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.772790908813477, "rewards/margins": 1.7636915445327759, "rewards/rejected": -20.536481857299805, "step": 8775 }, { "epoch": 0.295931780646466, "grad_norm": 27.691944122314453, "learning_rate": 8.875503938262252e-07, "logits/chosen": -0.6448401808738708, "logits/rejected": -0.6703423261642456, "logps/chosen": -2.0140814781188965, "logps/rejected": -1.8916003704071045, "loss": 5.0234, "rewards/accuracies": 0.5, "rewards/chosen": -20.14081382751465, "rewards/margins": -1.22481107711792, "rewards/rejected": -18.91600227355957, "step": 8780 }, { "epoch": 0.2961003067174492, "grad_norm": 27.60009002685547, "learning_rate": 8.873644814862416e-07, "logits/chosen": -0.45917612314224243, "logits/rejected": -0.7504684925079346, "logps/chosen": -2.220410108566284, "logps/rejected": -2.457399845123291, "loss": 4.0908, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.2041015625, "rewards/margins": 2.3698973655700684, "rewards/rejected": -24.573999404907227, "step": 8785 }, { "epoch": 0.29626883278843236, "grad_norm": 18.960968017578125, "learning_rate": 8.871784350946723e-07, "logits/chosen": -1.1864864826202393, "logits/rejected": -1.3105027675628662, "logps/chosen": -1.658342719078064, "logps/rejected": -1.9156605005264282, "loss": 1.7573, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.58342933654785, "rewards/margins": 2.5731749534606934, "rewards/rejected": -19.156604766845703, "step": 8790 }, { "epoch": 0.2964373588594156, "grad_norm": 18.231367111206055, "learning_rate": 8.869922547159009e-07, "logits/chosen": -0.4016094207763672, "logits/rejected": -0.5172806978225708, "logps/chosen": -2.007896900177002, "logps/rejected": -2.092552661895752, "loss": 2.8234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.078968048095703, "rewards/margins": 0.8465590476989746, "rewards/rejected": -20.925525665283203, "step": 8795 }, { "epoch": 0.29660588493039874, "grad_norm": 21.657655715942383, "learning_rate": 8.868059404143571e-07, "logits/chosen": -0.8904246091842651, "logits/rejected": -0.9135071039199829, "logps/chosen": -1.8589338064193726, "logps/rejected": -2.0225725173950195, "loss": 1.9787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.589340209960938, "rewards/margins": 1.6363856792449951, "rewards/rejected": -20.225723266601562, "step": 8800 }, { "epoch": 0.29660588493039874, "eval_logits/chosen": -1.1061619520187378, "eval_logits/rejected": -1.1691018342971802, "eval_logps/chosen": -1.8326040506362915, "eval_logps/rejected": -1.8919801712036133, "eval_loss": 3.122637987136841, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.32604217529297, "eval_rewards/margins": 0.5937601923942566, "eval_rewards/rejected": -18.919801712036133, "eval_runtime": 12.9076, "eval_samples_per_second": 7.747, "eval_steps_per_second": 1.937, "step": 8800 }, { "epoch": 0.2967744110013819, "grad_norm": 14.822147369384766, "learning_rate": 8.866194922545167e-07, "logits/chosen": -0.9613476991653442, "logits/rejected": -0.8759132623672485, "logps/chosen": -2.081514835357666, "logps/rejected": -2.1302268505096436, "loss": 3.6599, "rewards/accuracies": 0.5, "rewards/chosen": -20.815149307250977, "rewards/margins": 0.4871188998222351, "rewards/rejected": -21.302268981933594, "step": 8805 }, { "epoch": 0.2969429370723651, "grad_norm": 58.875946044921875, "learning_rate": 8.864329103009025e-07, "logits/chosen": -0.7224435210227966, "logits/rejected": -0.6182790994644165, "logps/chosen": -2.059049129486084, "logps/rejected": -2.1806259155273438, "loss": 3.2944, "rewards/accuracies": 0.5, "rewards/chosen": -20.590492248535156, "rewards/margins": 1.2157670259475708, "rewards/rejected": -21.806259155273438, "step": 8810 }, { "epoch": 0.2971114631433483, "grad_norm": 25.18557357788086, "learning_rate": 8.862461946180826e-07, "logits/chosen": -0.873163104057312, "logits/rejected": -0.8473867177963257, "logps/chosen": -2.5954298973083496, "logps/rejected": -2.170555591583252, "loss": 7.2803, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -25.954299926757812, "rewards/margins": -4.248744010925293, "rewards/rejected": -21.705556869506836, "step": 8815 }, { "epoch": 0.29727998921433146, "grad_norm": 10.100975036621094, "learning_rate": 8.860593452706724e-07, "logits/chosen": -0.07160119712352753, "logits/rejected": -0.18310347199440002, "logps/chosen": -1.888536810874939, "logps/rejected": -2.037360429763794, "loss": 2.3866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.88536834716797, "rewards/margins": 1.4882361888885498, "rewards/rejected": -20.373605728149414, "step": 8820 }, { "epoch": 0.29744851528531463, "grad_norm": 85.96381378173828, "learning_rate": 8.858723623233329e-07, "logits/chosen": -0.6304708123207092, "logits/rejected": -0.7319132685661316, "logps/chosen": -2.1153464317321777, "logps/rejected": -2.3846354484558105, "loss": 2.1868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.153467178344727, "rewards/margins": 2.6928863525390625, "rewards/rejected": -23.846351623535156, "step": 8825 }, { "epoch": 0.2976170413562978, "grad_norm": 27.138891220092773, "learning_rate": 8.856852458407716e-07, "logits/chosen": -0.7233554720878601, "logits/rejected": -0.8463021516799927, "logps/chosen": -1.82294499874115, "logps/rejected": -1.8261346817016602, "loss": 3.2798, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.229450225830078, "rewards/margins": 0.031897544860839844, "rewards/rejected": -18.2613468170166, "step": 8830 }, { "epoch": 0.297785567427281, "grad_norm": 36.38093566894531, "learning_rate": 8.854979958877421e-07, "logits/chosen": -0.2902129590511322, "logits/rejected": -0.41110771894454956, "logps/chosen": -2.67460298538208, "logps/rejected": -2.487720012664795, "loss": 5.4603, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.74602699279785, "rewards/margins": -1.8688280582427979, "rewards/rejected": -24.877201080322266, "step": 8835 }, { "epoch": 0.2979540934982642, "grad_norm": 27.867528915405273, "learning_rate": 8.853106125290442e-07, "logits/chosen": -0.8619322776794434, "logits/rejected": -0.8854168057441711, "logps/chosen": -1.8924545049667358, "logps/rejected": -1.848910927772522, "loss": 3.5393, "rewards/accuracies": 0.5, "rewards/chosen": -18.924543380737305, "rewards/margins": -0.43543368577957153, "rewards/rejected": -18.48910903930664, "step": 8840 }, { "epoch": 0.29812261956924735, "grad_norm": 31.812618255615234, "learning_rate": 8.85123095829524e-07, "logits/chosen": -1.0855190753936768, "logits/rejected": -0.99981290102005, "logps/chosen": -2.0037198066711426, "logps/rejected": -2.1378657817840576, "loss": 2.3159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.03719711303711, "rewards/margins": 1.3414623737335205, "rewards/rejected": -21.378658294677734, "step": 8845 }, { "epoch": 0.29829114564023057, "grad_norm": 20.495399475097656, "learning_rate": 8.849354458540734e-07, "logits/chosen": -0.6814324855804443, "logits/rejected": -0.8289278745651245, "logps/chosen": -1.9179836511611938, "logps/rejected": -1.9481815099716187, "loss": 3.3473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.17983627319336, "rewards/margins": 0.3019787669181824, "rewards/rejected": -19.481815338134766, "step": 8850 }, { "epoch": 0.29845967171121374, "grad_norm": 10.193831443786621, "learning_rate": 8.84747662667631e-07, "logits/chosen": -0.7345752716064453, "logits/rejected": -0.8806995153427124, "logps/chosen": -1.7043479681015015, "logps/rejected": -1.8931198120117188, "loss": 1.7613, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.04347801208496, "rewards/margins": 1.887718915939331, "rewards/rejected": -18.931198120117188, "step": 8855 }, { "epoch": 0.2986281977821969, "grad_norm": 19.934276580810547, "learning_rate": 8.845597463351811e-07, "logits/chosen": -0.8036659955978394, "logits/rejected": -0.7760517001152039, "logps/chosen": -1.3290200233459473, "logps/rejected": -1.3339701890945435, "loss": 3.0572, "rewards/accuracies": 0.5, "rewards/chosen": -13.290201187133789, "rewards/margins": 0.04950146749615669, "rewards/rejected": -13.339701652526855, "step": 8860 }, { "epoch": 0.29879672385318007, "grad_norm": 16.844425201416016, "learning_rate": 8.843716969217538e-07, "logits/chosen": -0.7272054553031921, "logits/rejected": -0.9063571691513062, "logps/chosen": -1.646959900856018, "logps/rejected": -1.6387298107147217, "loss": 3.6634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.469600677490234, "rewards/margins": -0.08229970932006836, "rewards/rejected": -16.387298583984375, "step": 8865 }, { "epoch": 0.2989652499241633, "grad_norm": 24.358808517456055, "learning_rate": 8.84183514492426e-07, "logits/chosen": -0.8251449465751648, "logits/rejected": -1.1270743608474731, "logps/chosen": -2.065498113632202, "logps/rejected": -2.5702450275421143, "loss": 3.3374, "rewards/accuracies": 0.5, "rewards/chosen": -20.654979705810547, "rewards/margins": 5.047468662261963, "rewards/rejected": -25.70244789123535, "step": 8870 }, { "epoch": 0.29913377599514646, "grad_norm": 42.65480041503906, "learning_rate": 8.8399519911232e-07, "logits/chosen": -0.8278477787971497, "logits/rejected": -0.7521225214004517, "logps/chosen": -2.142841100692749, "logps/rejected": -2.3546812534332275, "loss": 2.1474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.42841148376465, "rewards/margins": 2.1184000968933105, "rewards/rejected": -23.546810150146484, "step": 8875 }, { "epoch": 0.2993023020661296, "grad_norm": 14.598514556884766, "learning_rate": 8.838067508466044e-07, "logits/chosen": -0.8302278518676758, "logits/rejected": -1.1150890588760376, "logps/chosen": -1.5165884494781494, "logps/rejected": -1.7599170207977295, "loss": 1.9003, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.165884017944336, "rewards/margins": 2.43328595161438, "rewards/rejected": -17.599170684814453, "step": 8880 }, { "epoch": 0.2994708281371128, "grad_norm": 18.158178329467773, "learning_rate": 8.836181697604937e-07, "logits/chosen": -0.7643877863883972, "logits/rejected": -0.7783080339431763, "logps/chosen": -2.0373730659484863, "logps/rejected": -2.215522289276123, "loss": 2.5642, "rewards/accuracies": 0.5, "rewards/chosen": -20.37373161315918, "rewards/margins": 1.781489372253418, "rewards/rejected": -22.155221939086914, "step": 8885 }, { "epoch": 0.299639354208096, "grad_norm": 22.55415153503418, "learning_rate": 8.834294559192483e-07, "logits/chosen": -0.9910376667976379, "logits/rejected": -1.1309711933135986, "logps/chosen": -1.6215696334838867, "logps/rejected": -1.7703845500946045, "loss": 2.8646, "rewards/accuracies": 0.5, "rewards/chosen": -16.215696334838867, "rewards/margins": 1.4881494045257568, "rewards/rejected": -17.703845977783203, "step": 8890 }, { "epoch": 0.2998078802790792, "grad_norm": 28.628612518310547, "learning_rate": 8.832406093881749e-07, "logits/chosen": -0.7218562364578247, "logits/rejected": -0.8594409823417664, "logps/chosen": -2.144453763961792, "logps/rejected": -2.02648663520813, "loss": 4.5639, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.444538116455078, "rewards/margins": -1.1796700954437256, "rewards/rejected": -20.264867782592773, "step": 8895 }, { "epoch": 0.29997640635006234, "grad_norm": 23.925495147705078, "learning_rate": 8.830516302326257e-07, "logits/chosen": -0.8188495635986328, "logits/rejected": -0.9341527223587036, "logps/chosen": -1.8960866928100586, "logps/rejected": -1.8661048412322998, "loss": 3.4802, "rewards/accuracies": 0.5, "rewards/chosen": -18.960866928100586, "rewards/margins": -0.2998184263706207, "rewards/rejected": -18.661048889160156, "step": 8900 }, { "epoch": 0.30014493242104556, "grad_norm": 16.280384063720703, "learning_rate": 8.828625185179988e-07, "logits/chosen": -0.48790502548217773, "logits/rejected": -0.6944053769111633, "logps/chosen": -1.6479403972625732, "logps/rejected": -1.7591689825057983, "loss": 2.2314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.47940444946289, "rewards/margins": 1.1122846603393555, "rewards/rejected": -17.591690063476562, "step": 8905 }, { "epoch": 0.30031345849202873, "grad_norm": 22.790857315063477, "learning_rate": 8.826732743097385e-07, "logits/chosen": -0.9420550465583801, "logits/rejected": -1.0960060358047485, "logps/chosen": -1.7473942041397095, "logps/rejected": -1.692716360092163, "loss": 3.9335, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.473941802978516, "rewards/margins": -0.5467766523361206, "rewards/rejected": -16.927165985107422, "step": 8910 }, { "epoch": 0.3004819845630119, "grad_norm": 24.296518325805664, "learning_rate": 8.824838976733345e-07, "logits/chosen": -0.6912819147109985, "logits/rejected": -1.0317418575286865, "logps/chosen": -2.033799409866333, "logps/rejected": -2.1355299949645996, "loss": 2.7434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.337993621826172, "rewards/margins": 1.0173046588897705, "rewards/rejected": -21.355297088623047, "step": 8915 }, { "epoch": 0.30065051063399506, "grad_norm": 10.289129257202148, "learning_rate": 8.822943886743229e-07, "logits/chosen": -1.1432678699493408, "logits/rejected": -1.317195177078247, "logps/chosen": -1.761526107788086, "logps/rejected": -2.0710625648498535, "loss": 1.9022, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.61526107788086, "rewards/margins": 3.0953640937805176, "rewards/rejected": -20.710622787475586, "step": 8920 }, { "epoch": 0.3008190367049783, "grad_norm": 27.289064407348633, "learning_rate": 8.821047473782852e-07, "logits/chosen": -0.5983952879905701, "logits/rejected": -0.35027188062667847, "logps/chosen": -2.5783469676971436, "logps/rejected": -2.4220213890075684, "loss": 5.0843, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.783472061157227, "rewards/margins": -1.563256025314331, "rewards/rejected": -24.22021484375, "step": 8925 }, { "epoch": 0.30098756277596145, "grad_norm": 73.90116882324219, "learning_rate": 8.819149738508488e-07, "logits/chosen": -0.7550094127655029, "logits/rejected": -0.8008283376693726, "logps/chosen": -2.168519973754883, "logps/rejected": -2.512540102005005, "loss": 2.612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.685199737548828, "rewards/margins": 3.4402008056640625, "rewards/rejected": -25.12540054321289, "step": 8930 }, { "epoch": 0.3011560888469446, "grad_norm": 20.581180572509766, "learning_rate": 8.817250681576867e-07, "logits/chosen": -1.3998351097106934, "logits/rejected": -1.3207590579986572, "logps/chosen": -1.6048024892807007, "logps/rejected": -1.691149353981018, "loss": 2.514, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.048025131225586, "rewards/margins": 0.8634673357009888, "rewards/rejected": -16.9114933013916, "step": 8935 }, { "epoch": 0.3013246149179278, "grad_norm": 12.386482238769531, "learning_rate": 8.815350303645179e-07, "logits/chosen": -0.7268382906913757, "logits/rejected": -0.9646919369697571, "logps/chosen": -1.9220527410507202, "logps/rejected": -2.1499226093292236, "loss": 2.1157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.22052574157715, "rewards/margins": 2.278700113296509, "rewards/rejected": -21.49922752380371, "step": 8940 }, { "epoch": 0.301493140988911, "grad_norm": 48.431854248046875, "learning_rate": 8.81344860537107e-07, "logits/chosen": -0.8490222692489624, "logits/rejected": -0.8382167816162109, "logps/chosen": -2.010575532913208, "logps/rejected": -2.0359785556793213, "loss": 3.4625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.10575294494629, "rewards/margins": 0.2540292739868164, "rewards/rejected": -20.359783172607422, "step": 8945 }, { "epoch": 0.30166166705989417, "grad_norm": 26.83896255493164, "learning_rate": 8.811545587412643e-07, "logits/chosen": -0.7713761329650879, "logits/rejected": -0.9512740969657898, "logps/chosen": -1.7185767889022827, "logps/rejected": -1.8028638362884521, "loss": 3.4062, "rewards/accuracies": 0.5, "rewards/chosen": -17.185766220092773, "rewards/margins": 0.8428729176521301, "rewards/rejected": -18.028640747070312, "step": 8950 }, { "epoch": 0.30183019313087733, "grad_norm": 22.699316024780273, "learning_rate": 8.809641250428457e-07, "logits/chosen": -0.7982644438743591, "logits/rejected": -0.800157368183136, "logps/chosen": -1.7635858058929443, "logps/rejected": -1.7425504922866821, "loss": 3.3254, "rewards/accuracies": 0.5, "rewards/chosen": -17.6358585357666, "rewards/margins": -0.21035441756248474, "rewards/rejected": -17.425504684448242, "step": 8955 }, { "epoch": 0.30199871920186055, "grad_norm": 16.492294311523438, "learning_rate": 8.807735595077526e-07, "logits/chosen": -0.8586235046386719, "logits/rejected": -0.8732374310493469, "logps/chosen": -2.2647202014923096, "logps/rejected": -2.4003055095672607, "loss": 2.5696, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.647199630737305, "rewards/margins": 1.3558542728424072, "rewards/rejected": -24.003055572509766, "step": 8960 }, { "epoch": 0.3021672452728437, "grad_norm": 32.511531829833984, "learning_rate": 8.805828622019326e-07, "logits/chosen": -0.8290532827377319, "logits/rejected": -0.8825815320014954, "logps/chosen": -1.861469030380249, "logps/rejected": -1.8529653549194336, "loss": 3.225, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.61469078063965, "rewards/margins": -0.08503618091344833, "rewards/rejected": -18.529653549194336, "step": 8965 }, { "epoch": 0.3023357713438269, "grad_norm": 72.5047378540039, "learning_rate": 8.803920331913785e-07, "logits/chosen": -0.8660562634468079, "logits/rejected": -0.984213650226593, "logps/chosen": -2.007239580154419, "logps/rejected": -2.131221055984497, "loss": 2.8056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.07239532470703, "rewards/margins": 1.2398135662078857, "rewards/rejected": -21.312210083007812, "step": 8970 }, { "epoch": 0.30250429741481005, "grad_norm": 26.04867172241211, "learning_rate": 8.802010725421283e-07, "logits/chosen": -0.2826697826385498, "logits/rejected": -0.4146658778190613, "logps/chosen": -1.6923812627792358, "logps/rejected": -1.7552837133407593, "loss": 2.7193, "rewards/accuracies": 0.5, "rewards/chosen": -16.923812866210938, "rewards/margins": 0.6290236711502075, "rewards/rejected": -17.552837371826172, "step": 8975 }, { "epoch": 0.3026728234857933, "grad_norm": 58.66663360595703, "learning_rate": 8.800099803202663e-07, "logits/chosen": -0.7005506753921509, "logits/rejected": -0.7806217670440674, "logps/chosen": -1.9371159076690674, "logps/rejected": -2.365093946456909, "loss": 1.8148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.371158599853516, "rewards/margins": 4.279781818389893, "rewards/rejected": -23.65093994140625, "step": 8980 }, { "epoch": 0.30284134955677644, "grad_norm": 32.66402816772461, "learning_rate": 8.79818756591922e-07, "logits/chosen": -0.546657919883728, "logits/rejected": -0.6938589811325073, "logps/chosen": -1.9407997131347656, "logps/rejected": -2.485011577606201, "loss": 2.8969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.40799903869629, "rewards/margins": 5.4421186447143555, "rewards/rejected": -24.850116729736328, "step": 8985 }, { "epoch": 0.3030098756277596, "grad_norm": 25.154067993164062, "learning_rate": 8.796274014232703e-07, "logits/chosen": -1.053919792175293, "logits/rejected": -1.0685478448867798, "logps/chosen": -1.861169457435608, "logps/rejected": -1.8835694789886475, "loss": 3.0481, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.611690521240234, "rewards/margins": 0.22400188446044922, "rewards/rejected": -18.835693359375, "step": 8990 }, { "epoch": 0.30317840169874277, "grad_norm": 42.444305419921875, "learning_rate": 8.794359148805316e-07, "logits/chosen": -0.8252202868461609, "logits/rejected": -0.9097492098808289, "logps/chosen": -1.8112564086914062, "logps/rejected": -1.9504534006118774, "loss": 2.1677, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.112564086914062, "rewards/margins": 1.391969084739685, "rewards/rejected": -19.504533767700195, "step": 8995 }, { "epoch": 0.303346927769726, "grad_norm": 21.111835479736328, "learning_rate": 8.79244297029972e-07, "logits/chosen": -1.1233201026916504, "logits/rejected": -1.2216030359268188, "logps/chosen": -1.5746749639511108, "logps/rejected": -1.7738206386566162, "loss": 1.8468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.746749877929688, "rewards/margins": 1.9914575815200806, "rewards/rejected": -17.738208770751953, "step": 9000 }, { "epoch": 0.30351545384070916, "grad_norm": 41.230464935302734, "learning_rate": 8.790525479379027e-07, "logits/chosen": -0.6682386994361877, "logits/rejected": -0.846937358379364, "logps/chosen": -1.9128681421279907, "logps/rejected": -1.9116519689559937, "loss": 3.1896, "rewards/accuracies": 0.5, "rewards/chosen": -19.128681182861328, "rewards/margins": -0.0121612548828125, "rewards/rejected": -19.11652183532715, "step": 9005 }, { "epoch": 0.3036839799116923, "grad_norm": 21.60496711730957, "learning_rate": 8.788606676706808e-07, "logits/chosen": -0.7037097215652466, "logits/rejected": -0.7241389751434326, "logps/chosen": -1.754116415977478, "logps/rejected": -1.8600555658340454, "loss": 2.4375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.54116439819336, "rewards/margins": 1.0593905448913574, "rewards/rejected": -18.600553512573242, "step": 9010 }, { "epoch": 0.30385250598267555, "grad_norm": 29.23500633239746, "learning_rate": 8.786686562947083e-07, "logits/chosen": -0.9285395741462708, "logits/rejected": -0.8670550584793091, "logps/chosen": -1.9415439367294312, "logps/rejected": -2.041487216949463, "loss": 2.8543, "rewards/accuracies": 0.5, "rewards/chosen": -19.41543960571289, "rewards/margins": 0.9994330406188965, "rewards/rejected": -20.414873123168945, "step": 9015 }, { "epoch": 0.3040210320536587, "grad_norm": 107.34830474853516, "learning_rate": 8.784765138764327e-07, "logits/chosen": -0.4858947694301605, "logits/rejected": -0.3548693060874939, "logps/chosen": -2.049286365509033, "logps/rejected": -2.0892035961151123, "loss": 3.5124, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.492860794067383, "rewards/margins": 0.3991745114326477, "rewards/rejected": -20.89203453063965, "step": 9020 }, { "epoch": 0.3041895581246419, "grad_norm": 39.668636322021484, "learning_rate": 8.782842404823472e-07, "logits/chosen": -0.9904440641403198, "logits/rejected": -0.9966761469841003, "logps/chosen": -2.305406332015991, "logps/rejected": -2.2075486183166504, "loss": 4.0946, "rewards/accuracies": 0.5, "rewards/chosen": -23.05406379699707, "rewards/margins": -0.9785760045051575, "rewards/rejected": -22.075489044189453, "step": 9025 }, { "epoch": 0.30435808419562504, "grad_norm": 14.98969554901123, "learning_rate": 8.780918361789897e-07, "logits/chosen": -0.7242355942726135, "logits/rejected": -0.6773864030838013, "logps/chosen": -2.184032440185547, "logps/rejected": -2.7087905406951904, "loss": 2.0529, "rewards/accuracies": 0.5, "rewards/chosen": -21.8403263092041, "rewards/margins": 5.24758243560791, "rewards/rejected": -27.087909698486328, "step": 9030 }, { "epoch": 0.30452661026660827, "grad_norm": 18.60861587524414, "learning_rate": 8.778993010329441e-07, "logits/chosen": -0.8874000310897827, "logits/rejected": -1.0219591856002808, "logps/chosen": -2.139561176300049, "logps/rejected": -2.437269926071167, "loss": 2.2432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.395610809326172, "rewards/margins": 2.977090358734131, "rewards/rejected": -24.37270164489746, "step": 9035 }, { "epoch": 0.30469513633759143, "grad_norm": 16.233108520507812, "learning_rate": 8.777066351108388e-07, "logits/chosen": -0.7690008878707886, "logits/rejected": -1.0270230770111084, "logps/chosen": -1.7827821969985962, "logps/rejected": -1.8636226654052734, "loss": 2.5837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.827821731567383, "rewards/margins": 0.8084052205085754, "rewards/rejected": -18.636226654052734, "step": 9040 }, { "epoch": 0.3048636624085746, "grad_norm": 42.710716247558594, "learning_rate": 8.775138384793483e-07, "logits/chosen": -0.7992104887962341, "logits/rejected": -0.7307273149490356, "logps/chosen": -1.7060863971710205, "logps/rejected": -1.6231950521469116, "loss": 4.0294, "rewards/accuracies": 0.5, "rewards/chosen": -17.060863494873047, "rewards/margins": -0.8289132118225098, "rewards/rejected": -16.231948852539062, "step": 9045 }, { "epoch": 0.30503218847955776, "grad_norm": 22.898052215576172, "learning_rate": 8.773209112051918e-07, "logits/chosen": -1.1620280742645264, "logits/rejected": -1.0759422779083252, "logps/chosen": -1.6986467838287354, "logps/rejected": -1.8278793096542358, "loss": 2.1036, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.986465454101562, "rewards/margins": 1.2923271656036377, "rewards/rejected": -18.27879524230957, "step": 9050 }, { "epoch": 0.305200714550541, "grad_norm": 30.709197998046875, "learning_rate": 8.771278533551338e-07, "logits/chosen": -0.6249741315841675, "logits/rejected": -0.6515600085258484, "logps/chosen": -1.6547349691390991, "logps/rejected": -1.5639965534210205, "loss": 4.2284, "rewards/accuracies": 0.5, "rewards/chosen": -16.547351837158203, "rewards/margins": -0.9073851704597473, "rewards/rejected": -15.639966011047363, "step": 9055 }, { "epoch": 0.30536924062152415, "grad_norm": 21.73700714111328, "learning_rate": 8.769346649959839e-07, "logits/chosen": -0.7830342650413513, "logits/rejected": -0.6824518442153931, "logps/chosen": -1.5401135683059692, "logps/rejected": -1.622176170349121, "loss": 2.4441, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.401135444641113, "rewards/margins": 0.820626437664032, "rewards/rejected": -16.22176170349121, "step": 9060 }, { "epoch": 0.3055377666925073, "grad_norm": 29.96711540222168, "learning_rate": 8.76741346194597e-07, "logits/chosen": -1.2152740955352783, "logits/rejected": -1.201468586921692, "logps/chosen": -1.8606659173965454, "logps/rejected": -1.9004096984863281, "loss": 2.8449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.606660842895508, "rewards/margins": 0.39743900299072266, "rewards/rejected": -19.004098892211914, "step": 9065 }, { "epoch": 0.30570629276349054, "grad_norm": 38.4486083984375, "learning_rate": 8.765478970178733e-07, "logits/chosen": -0.9709786176681519, "logits/rejected": -1.066765546798706, "logps/chosen": -1.6975494623184204, "logps/rejected": -1.7299559116363525, "loss": 2.8402, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.975492477416992, "rewards/margins": 0.3240653872489929, "rewards/rejected": -17.299556732177734, "step": 9070 }, { "epoch": 0.3058748188344737, "grad_norm": 0.006397194694727659, "learning_rate": 8.763543175327579e-07, "logits/chosen": -0.5968400239944458, "logits/rejected": -0.7210376858711243, "logps/chosen": -1.7113231420516968, "logps/rejected": -1.9671258926391602, "loss": 2.5714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.113231658935547, "rewards/margins": 2.55802583694458, "rewards/rejected": -19.6712589263916, "step": 9075 }, { "epoch": 0.30604334490545687, "grad_norm": 34.90524673461914, "learning_rate": 8.761606078062409e-07, "logits/chosen": -0.5262940526008606, "logits/rejected": -0.9337191581726074, "logps/chosen": -1.9339056015014648, "logps/rejected": -2.1920578479766846, "loss": 2.1257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.33905601501465, "rewards/margins": 2.5815205574035645, "rewards/rejected": -21.920576095581055, "step": 9080 }, { "epoch": 0.30621187097644004, "grad_norm": 20.485763549804688, "learning_rate": 8.759667679053576e-07, "logits/chosen": -0.9563091397285461, "logits/rejected": -1.1899265050888062, "logps/chosen": -1.9684795141220093, "logps/rejected": -2.0848593711853027, "loss": 2.2998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.684795379638672, "rewards/margins": 1.1637986898422241, "rewards/rejected": -20.848594665527344, "step": 9085 }, { "epoch": 0.30638039704742326, "grad_norm": 18.3884220123291, "learning_rate": 8.757727978971885e-07, "logits/chosen": -0.585478663444519, "logits/rejected": -0.7393139600753784, "logps/chosen": -1.8395030498504639, "logps/rejected": -1.911931037902832, "loss": 2.6764, "rewards/accuracies": 0.5, "rewards/chosen": -18.395029067993164, "rewards/margins": 0.7242798805236816, "rewards/rejected": -19.11931037902832, "step": 9090 }, { "epoch": 0.3065489231184064, "grad_norm": 23.539100646972656, "learning_rate": 8.755786978488589e-07, "logits/chosen": -0.5933682918548584, "logits/rejected": -0.8263611793518066, "logps/chosen": -1.4465597867965698, "logps/rejected": -1.3904832601547241, "loss": 3.7162, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -14.465597152709961, "rewards/margins": -0.5607655644416809, "rewards/rejected": -13.904830932617188, "step": 9095 }, { "epoch": 0.3067174491893896, "grad_norm": 20.3573055267334, "learning_rate": 8.753844678275392e-07, "logits/chosen": -1.2448726892471313, "logits/rejected": -1.4484832286834717, "logps/chosen": -1.6800048351287842, "logps/rejected": -1.8209593296051025, "loss": 3.3054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.800048828125, "rewards/margins": 1.4095432758331299, "rewards/rejected": -18.209592819213867, "step": 9100 }, { "epoch": 0.30688597526037276, "grad_norm": 4.400402545928955, "learning_rate": 8.751901079004447e-07, "logits/chosen": -0.7810710072517395, "logits/rejected": -0.879501223564148, "logps/chosen": -1.8006995916366577, "logps/rejected": -2.0738296508789062, "loss": 1.9631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.006996154785156, "rewards/margins": 2.7313015460968018, "rewards/rejected": -20.738298416137695, "step": 9105 }, { "epoch": 0.307054501331356, "grad_norm": 51.79671096801758, "learning_rate": 8.749956181348359e-07, "logits/chosen": -0.6710806488990784, "logits/rejected": -0.8282930254936218, "logps/chosen": -2.1655375957489014, "logps/rejected": -2.0684962272644043, "loss": 4.0727, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.655378341674805, "rewards/margins": -0.970413327217102, "rewards/rejected": -20.68496322631836, "step": 9110 }, { "epoch": 0.30722302740233914, "grad_norm": 15.096877098083496, "learning_rate": 8.748009985980177e-07, "logits/chosen": -1.044159173965454, "logits/rejected": -1.160940408706665, "logps/chosen": -1.7486212253570557, "logps/rejected": -1.8441343307495117, "loss": 3.0722, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.4862117767334, "rewards/margins": 0.9551332592964172, "rewards/rejected": -18.44134521484375, "step": 9115 }, { "epoch": 0.3073915534733223, "grad_norm": 12.892077445983887, "learning_rate": 8.746062493573405e-07, "logits/chosen": -0.6979146599769592, "logits/rejected": -0.6973038911819458, "logps/chosen": -1.865674614906311, "logps/rejected": -2.052708625793457, "loss": 1.7233, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.65674591064453, "rewards/margins": 1.8703396320343018, "rewards/rejected": -20.527084350585938, "step": 9120 }, { "epoch": 0.30756007954430553, "grad_norm": 18.94183349609375, "learning_rate": 8.744113704801994e-07, "logits/chosen": -0.970826268196106, "logits/rejected": -1.3497257232666016, "logps/chosen": -1.6942193508148193, "logps/rejected": -1.9070132970809937, "loss": 2.5195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.94219398498535, "rewards/margins": 2.127938747406006, "rewards/rejected": -19.070133209228516, "step": 9125 }, { "epoch": 0.3077286056152887, "grad_norm": 76.88709259033203, "learning_rate": 8.742163620340342e-07, "logits/chosen": -0.3977760374546051, "logits/rejected": -0.5327490568161011, "logps/chosen": -2.3799424171447754, "logps/rejected": -2.4491336345672607, "loss": 3.1359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.799423217773438, "rewards/margins": 0.6919113397598267, "rewards/rejected": -24.4913330078125, "step": 9130 }, { "epoch": 0.30789713168627186, "grad_norm": 17.748449325561523, "learning_rate": 8.740212240863295e-07, "logits/chosen": -1.0044649839401245, "logits/rejected": -1.0892714262008667, "logps/chosen": -1.5947548151016235, "logps/rejected": -1.6854908466339111, "loss": 3.3036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.947547912597656, "rewards/margins": 0.9073607325553894, "rewards/rejected": -16.854907989501953, "step": 9135 }, { "epoch": 0.30806565775725503, "grad_norm": 20.412609100341797, "learning_rate": 8.738259567046151e-07, "logits/chosen": -0.45204153656959534, "logits/rejected": -0.5009742975234985, "logps/chosen": -1.8742802143096924, "logps/rejected": -1.9355961084365845, "loss": 2.7849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.742801666259766, "rewards/margins": 0.6131598353385925, "rewards/rejected": -19.355960845947266, "step": 9140 }, { "epoch": 0.30823418382823825, "grad_norm": 16.282869338989258, "learning_rate": 8.736305599564652e-07, "logits/chosen": -0.6326123476028442, "logits/rejected": -0.582168459892273, "logps/chosen": -1.7905687093734741, "logps/rejected": -2.0051612854003906, "loss": 1.6708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.90568733215332, "rewards/margins": 2.1459248065948486, "rewards/rejected": -20.051612854003906, "step": 9145 }, { "epoch": 0.3084027098992214, "grad_norm": 22.959726333618164, "learning_rate": 8.734350339094989e-07, "logits/chosen": -0.7373453378677368, "logits/rejected": -0.9396425485610962, "logps/chosen": -1.7323005199432373, "logps/rejected": -1.8902448415756226, "loss": 1.8278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.3230037689209, "rewards/margins": 1.579443097114563, "rewards/rejected": -18.902446746826172, "step": 9150 }, { "epoch": 0.3085712359702046, "grad_norm": 26.068431854248047, "learning_rate": 8.732393786313803e-07, "logits/chosen": -0.8269859552383423, "logits/rejected": -0.8830499649047852, "logps/chosen": -1.9429603815078735, "logps/rejected": -1.9990386962890625, "loss": 3.1581, "rewards/accuracies": 0.5, "rewards/chosen": -19.42960548400879, "rewards/margins": 0.5607836842536926, "rewards/rejected": -19.990388870239258, "step": 9155 }, { "epoch": 0.30873976204118775, "grad_norm": 14.898977279663086, "learning_rate": 8.730435941898175e-07, "logits/chosen": -1.2051137685775757, "logits/rejected": -1.2535730600357056, "logps/chosen": -2.182587146759033, "logps/rejected": -2.2140278816223145, "loss": 3.71, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.825870513916016, "rewards/margins": 0.31440791487693787, "rewards/rejected": -22.140277862548828, "step": 9160 }, { "epoch": 0.30890828811217097, "grad_norm": 32.99822998046875, "learning_rate": 8.728476806525644e-07, "logits/chosen": -0.8135523796081543, "logits/rejected": -0.8619807362556458, "logps/chosen": -1.740747094154358, "logps/rejected": -1.7622601985931396, "loss": 2.9138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.407470703125, "rewards/margins": 0.21513119339942932, "rewards/rejected": -17.622602462768555, "step": 9165 }, { "epoch": 0.30907681418315414, "grad_norm": 29.103960037231445, "learning_rate": 8.726516380874185e-07, "logits/chosen": -0.764143705368042, "logits/rejected": -0.7308832406997681, "logps/chosen": -2.05553936958313, "logps/rejected": -1.9418214559555054, "loss": 4.1841, "rewards/accuracies": 0.5, "rewards/chosen": -20.55539321899414, "rewards/margins": -1.1371777057647705, "rewards/rejected": -19.418216705322266, "step": 9170 }, { "epoch": 0.3092453402541373, "grad_norm": 28.63543128967285, "learning_rate": 8.724554665622226e-07, "logits/chosen": -0.6835757493972778, "logits/rejected": -0.8575420379638672, "logps/chosen": -1.7750861644744873, "logps/rejected": -1.815028429031372, "loss": 3.2284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.750864028930664, "rewards/margins": 0.3994219899177551, "rewards/rejected": -18.150283813476562, "step": 9175 }, { "epoch": 0.3094138663251205, "grad_norm": 0.2285619080066681, "learning_rate": 8.722591661448637e-07, "logits/chosen": -0.49821311235427856, "logits/rejected": -0.590837836265564, "logps/chosen": -1.7573721408843994, "logps/rejected": -2.0172266960144043, "loss": 2.2316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.573719024658203, "rewards/margins": 2.5985465049743652, "rewards/rejected": -20.17226791381836, "step": 9180 }, { "epoch": 0.3095823923961037, "grad_norm": 41.559940338134766, "learning_rate": 8.72062736903274e-07, "logits/chosen": -0.7645959854125977, "logits/rejected": -0.7926516532897949, "logps/chosen": -1.973306655883789, "logps/rejected": -2.1764540672302246, "loss": 2.8765, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.73306655883789, "rewards/margins": 2.0314762592315674, "rewards/rejected": -21.764545440673828, "step": 9185 }, { "epoch": 0.30975091846708686, "grad_norm": 18.116069793701172, "learning_rate": 8.718661789054297e-07, "logits/chosen": -0.525948703289032, "logits/rejected": -0.5813354253768921, "logps/chosen": -1.958701491355896, "logps/rejected": -2.065213203430176, "loss": 2.2873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.587017059326172, "rewards/margins": 1.0651161670684814, "rewards/rejected": -20.65213394165039, "step": 9190 }, { "epoch": 0.30991944453807, "grad_norm": 27.396312713623047, "learning_rate": 8.716694922193517e-07, "logits/chosen": -1.0580114126205444, "logits/rejected": -1.1286802291870117, "logps/chosen": -1.9167063236236572, "logps/rejected": -1.9518272876739502, "loss": 3.0418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.167064666748047, "rewards/margins": 0.35120925307273865, "rewards/rejected": -19.518274307250977, "step": 9195 }, { "epoch": 0.31008797060905324, "grad_norm": 26.48879623413086, "learning_rate": 8.714726769131058e-07, "logits/chosen": -0.952828049659729, "logits/rejected": -1.0840023756027222, "logps/chosen": -1.8382370471954346, "logps/rejected": -2.317875385284424, "loss": 2.8347, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.382369995117188, "rewards/margins": 4.796382904052734, "rewards/rejected": -23.178752899169922, "step": 9200 }, { "epoch": 0.31008797060905324, "eval_logits/chosen": -1.1909996271133423, "eval_logits/rejected": -1.25919508934021, "eval_logps/chosen": -1.8463243246078491, "eval_logps/rejected": -1.9093353748321533, "eval_loss": 3.115633010864258, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.46324348449707, "eval_rewards/margins": 0.6301097273826599, "eval_rewards/rejected": -19.093353271484375, "eval_runtime": 12.9029, "eval_samples_per_second": 7.75, "eval_steps_per_second": 1.938, "step": 9200 }, { "epoch": 0.3102564966800364, "grad_norm": 1.0767076015472412, "learning_rate": 8.71275733054802e-07, "logits/chosen": -0.5575070381164551, "logits/rejected": -0.8997724652290344, "logps/chosen": -2.1544461250305176, "logps/rejected": -2.622394561767578, "loss": 1.0911, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.54446029663086, "rewards/margins": 4.679484844207764, "rewards/rejected": -26.22394371032715, "step": 9205 }, { "epoch": 0.3104250227510196, "grad_norm": 21.940515518188477, "learning_rate": 8.710786607125944e-07, "logits/chosen": -0.535169780254364, "logits/rejected": -0.6544603109359741, "logps/chosen": -2.1633615493774414, "logps/rejected": -2.1183650493621826, "loss": 3.796, "rewards/accuracies": 0.5, "rewards/chosen": -21.633617401123047, "rewards/margins": -0.44996652007102966, "rewards/rejected": -21.18364906311035, "step": 9210 }, { "epoch": 0.31059354882200274, "grad_norm": 20.01294708251953, "learning_rate": 8.708814599546823e-07, "logits/chosen": -0.9173457026481628, "logits/rejected": -0.8900350332260132, "logps/chosen": -1.5918538570404053, "logps/rejected": -1.5473552942276, "loss": 3.5273, "rewards/accuracies": 0.5, "rewards/chosen": -15.918538093566895, "rewards/margins": -0.4449850916862488, "rewards/rejected": -15.473551750183105, "step": 9215 }, { "epoch": 0.31076207489298596, "grad_norm": 31.564699172973633, "learning_rate": 8.706841308493091e-07, "logits/chosen": -0.7942282557487488, "logits/rejected": -0.8029106855392456, "logps/chosen": -2.029221773147583, "logps/rejected": -2.0451905727386475, "loss": 3.3308, "rewards/accuracies": 0.5, "rewards/chosen": -20.292217254638672, "rewards/margins": 0.1596868485212326, "rewards/rejected": -20.451906204223633, "step": 9220 }, { "epoch": 0.31093060096396913, "grad_norm": 22.814817428588867, "learning_rate": 8.704866734647624e-07, "logits/chosen": -1.0109196901321411, "logits/rejected": -0.7166153788566589, "logps/chosen": -1.6865415573120117, "logps/rejected": -1.6409651041030884, "loss": 3.6558, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.865415573120117, "rewards/margins": -0.45576468110084534, "rewards/rejected": -16.409652709960938, "step": 9225 }, { "epoch": 0.3110991270349523, "grad_norm": 21.77043342590332, "learning_rate": 8.702890878693749e-07, "logits/chosen": -0.66042160987854, "logits/rejected": -0.6365646123886108, "logps/chosen": -1.6145178079605103, "logps/rejected": -1.8034298419952393, "loss": 2.3972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.145177841186523, "rewards/margins": 1.8891197443008423, "rewards/rejected": -18.034297943115234, "step": 9230 }, { "epoch": 0.3112676531059355, "grad_norm": 43.32010269165039, "learning_rate": 8.700913741315228e-07, "logits/chosen": -0.5597115755081177, "logits/rejected": -0.6904144883155823, "logps/chosen": -2.2811083793640137, "logps/rejected": -2.2438998222351074, "loss": 3.6817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.811084747314453, "rewards/margins": -0.37208643555641174, "rewards/rejected": -22.438995361328125, "step": 9235 }, { "epoch": 0.3114361791769187, "grad_norm": 28.244930267333984, "learning_rate": 8.69893532319627e-07, "logits/chosen": -0.62388676404953, "logits/rejected": -0.6791104078292847, "logps/chosen": -1.8690904378890991, "logps/rejected": -1.922142744064331, "loss": 2.6813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.690906524658203, "rewards/margins": 0.5305234789848328, "rewards/rejected": -19.22142791748047, "step": 9240 }, { "epoch": 0.31160470524790185, "grad_norm": 18.90542984008789, "learning_rate": 8.696955625021531e-07, "logits/chosen": -0.8645426034927368, "logits/rejected": -1.0093305110931396, "logps/chosen": -1.7112518548965454, "logps/rejected": -1.8172565698623657, "loss": 3.6309, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.112520217895508, "rewards/margins": 1.060046672821045, "rewards/rejected": -18.172565460205078, "step": 9245 }, { "epoch": 0.311773231318885, "grad_norm": 22.444063186645508, "learning_rate": 8.694974647476103e-07, "logits/chosen": -0.9879194498062134, "logits/rejected": -1.0465924739837646, "logps/chosen": -1.9798351526260376, "logps/rejected": -2.0943007469177246, "loss": 2.4644, "rewards/accuracies": 0.5, "rewards/chosen": -19.798351287841797, "rewards/margins": 1.1446588039398193, "rewards/rejected": -20.943008422851562, "step": 9250 }, { "epoch": 0.31194175738986823, "grad_norm": 19.90350341796875, "learning_rate": 8.692992391245526e-07, "logits/chosen": -0.7082827091217041, "logits/rejected": -0.8607162237167358, "logps/chosen": -1.9293746948242188, "logps/rejected": -2.0852489471435547, "loss": 3.0405, "rewards/accuracies": 0.5, "rewards/chosen": -19.293745040893555, "rewards/margins": 1.5587440729141235, "rewards/rejected": -20.852489471435547, "step": 9255 }, { "epoch": 0.3121102834608514, "grad_norm": 40.98049545288086, "learning_rate": 8.69100885701578e-07, "logits/chosen": -0.8298488855361938, "logits/rejected": -0.9287115931510925, "logps/chosen": -2.2203915119171143, "logps/rejected": -2.493387460708618, "loss": 1.8449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.20391082763672, "rewards/margins": 2.7299604415893555, "rewards/rejected": -24.93387222290039, "step": 9260 }, { "epoch": 0.31227880953183457, "grad_norm": 45.79103088378906, "learning_rate": 8.689024045473289e-07, "logits/chosen": -0.9122930765151978, "logits/rejected": -0.9798024296760559, "logps/chosen": -1.878292441368103, "logps/rejected": -1.9426196813583374, "loss": 3.3604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.78292465209961, "rewards/margins": 0.643271803855896, "rewards/rejected": -19.426197052001953, "step": 9265 }, { "epoch": 0.31244733560281773, "grad_norm": 98.69233703613281, "learning_rate": 8.687037957304916e-07, "logits/chosen": -0.6158634424209595, "logits/rejected": -0.8452926874160767, "logps/chosen": -2.1620185375213623, "logps/rejected": -2.1422224044799805, "loss": 4.0814, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.620187759399414, "rewards/margins": -0.19796237349510193, "rewards/rejected": -21.422225952148438, "step": 9270 }, { "epoch": 0.31261586167380095, "grad_norm": 27.064868927001953, "learning_rate": 8.685050593197974e-07, "logits/chosen": -0.3698921203613281, "logits/rejected": -0.5817358493804932, "logps/chosen": -1.7380319833755493, "logps/rejected": -1.8976764678955078, "loss": 2.137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.380319595336914, "rewards/margins": 1.596444010734558, "rewards/rejected": -18.976764678955078, "step": 9275 }, { "epoch": 0.3127843877447841, "grad_norm": 46.59791564941406, "learning_rate": 8.683061953840203e-07, "logits/chosen": -1.0455105304718018, "logits/rejected": -1.0464041233062744, "logps/chosen": -1.9930102825164795, "logps/rejected": -1.9345165491104126, "loss": 3.6931, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.930103302001953, "rewards/margins": -0.5849382281303406, "rewards/rejected": -19.345165252685547, "step": 9280 }, { "epoch": 0.3129529138157673, "grad_norm": 17.088973999023438, "learning_rate": 8.681072039919797e-07, "logits/chosen": -1.1266381740570068, "logits/rejected": -0.9821329116821289, "logps/chosen": -2.0811820030212402, "logps/rejected": -2.1210179328918457, "loss": 3.321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.811819076538086, "rewards/margins": 0.39835816621780396, "rewards/rejected": -21.210176467895508, "step": 9285 }, { "epoch": 0.3131214398867505, "grad_norm": 27.47848129272461, "learning_rate": 8.679080852125388e-07, "logits/chosen": -0.7783876657485962, "logits/rejected": -0.9243799448013306, "logps/chosen": -2.048316240310669, "logps/rejected": -2.2565720081329346, "loss": 2.8541, "rewards/accuracies": 0.5, "rewards/chosen": -20.48316192626953, "rewards/margins": 2.08255934715271, "rewards/rejected": -22.565723419189453, "step": 9290 }, { "epoch": 0.3132899659577337, "grad_norm": 36.77070617675781, "learning_rate": 8.677088391146045e-07, "logits/chosen": -0.9123884439468384, "logits/rejected": -1.1769945621490479, "logps/chosen": -1.8474833965301514, "logps/rejected": -1.9363610744476318, "loss": 2.6286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.474834442138672, "rewards/margins": 0.8887761831283569, "rewards/rejected": -19.363611221313477, "step": 9295 }, { "epoch": 0.31345849202871684, "grad_norm": 20.16952896118164, "learning_rate": 8.675094657671281e-07, "logits/chosen": -0.8153167963027954, "logits/rejected": -0.8752968907356262, "logps/chosen": -2.131622791290283, "logps/rejected": -2.1669955253601074, "loss": 3.2583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.316226959228516, "rewards/margins": 0.3537294268608093, "rewards/rejected": -21.66995620727539, "step": 9300 }, { "epoch": 0.3136270180997, "grad_norm": 73.12217712402344, "learning_rate": 8.673099652391049e-07, "logits/chosen": -0.43089723587036133, "logits/rejected": -0.40491873025894165, "logps/chosen": -2.151824474334717, "logps/rejected": -2.3143529891967773, "loss": 2.2401, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.51824378967285, "rewards/margins": 1.6252834796905518, "rewards/rejected": -23.143529891967773, "step": 9305 }, { "epoch": 0.3137955441706832, "grad_norm": 27.147113800048828, "learning_rate": 8.671103375995743e-07, "logits/chosen": -0.7571858167648315, "logits/rejected": -0.8392313122749329, "logps/chosen": -1.9569320678710938, "logps/rejected": -1.8534408807754517, "loss": 4.0802, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.569320678710938, "rewards/margins": -1.0349119901657104, "rewards/rejected": -18.534408569335938, "step": 9310 }, { "epoch": 0.3139640702416664, "grad_norm": 29.20941734313965, "learning_rate": 8.669105829176193e-07, "logits/chosen": -1.1783753633499146, "logits/rejected": -1.204984188079834, "logps/chosen": -1.939965844154358, "logps/rejected": -2.06992506980896, "loss": 2.4982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.399660110473633, "rewards/margins": 1.2995918989181519, "rewards/rejected": -20.69925308227539, "step": 9315 }, { "epoch": 0.31413259631264956, "grad_norm": 24.12032699584961, "learning_rate": 8.667107012623674e-07, "logits/chosen": -0.9502077102661133, "logits/rejected": -1.0443466901779175, "logps/chosen": -2.350555896759033, "logps/rejected": -2.3637421131134033, "loss": 3.0449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.50555992126465, "rewards/margins": 0.1318587362766266, "rewards/rejected": -23.637420654296875, "step": 9320 }, { "epoch": 0.3143011223836327, "grad_norm": 31.85713768005371, "learning_rate": 8.665106927029894e-07, "logits/chosen": -0.6978214383125305, "logits/rejected": -0.8441342115402222, "logps/chosen": -1.7820132970809937, "logps/rejected": -2.2420763969421387, "loss": 2.1663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.820133209228516, "rewards/margins": 4.6006340980529785, "rewards/rejected": -22.420766830444336, "step": 9325 }, { "epoch": 0.31446964845461595, "grad_norm": 1.8817801475524902, "learning_rate": 8.663105573087007e-07, "logits/chosen": -0.695899486541748, "logits/rejected": -0.8433302044868469, "logps/chosen": -1.9872623682022095, "logps/rejected": -2.4130704402923584, "loss": 1.7234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.872623443603516, "rewards/margins": 4.258082866668701, "rewards/rejected": -24.130704879760742, "step": 9330 }, { "epoch": 0.3146381745255991, "grad_norm": 15.969996452331543, "learning_rate": 8.661102951487601e-07, "logits/chosen": -1.011237382888794, "logits/rejected": -1.1954885721206665, "logps/chosen": -1.8208835124969482, "logps/rejected": -1.9458087682724, "loss": 2.7697, "rewards/accuracies": 0.5, "rewards/chosen": -18.208837509155273, "rewards/margins": 1.2492512464523315, "rewards/rejected": -19.458087921142578, "step": 9335 }, { "epoch": 0.3148067005965823, "grad_norm": 44.60512161254883, "learning_rate": 8.659099062924706e-07, "logits/chosen": -0.6627892851829529, "logits/rejected": -0.7953187227249146, "logps/chosen": -1.8654075860977173, "logps/rejected": -1.9347187280654907, "loss": 2.8071, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.65407371520996, "rewards/margins": 0.6931111216545105, "rewards/rejected": -19.347187042236328, "step": 9340 }, { "epoch": 0.3149752266675655, "grad_norm": 82.3418197631836, "learning_rate": 8.657093908091788e-07, "logits/chosen": -1.0475388765335083, "logits/rejected": -1.3520863056182861, "logps/chosen": -1.7849271297454834, "logps/rejected": -1.921607255935669, "loss": 2.2732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.84926986694336, "rewards/margins": 1.3668019771575928, "rewards/rejected": -19.216073989868164, "step": 9345 }, { "epoch": 0.31514375273854867, "grad_norm": 42.31816864013672, "learning_rate": 8.655087487682753e-07, "logits/chosen": -0.971020519733429, "logits/rejected": -0.9450328946113586, "logps/chosen": -1.6906359195709229, "logps/rejected": -1.7991920709609985, "loss": 2.3246, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.906360626220703, "rewards/margins": 1.0855603218078613, "rewards/rejected": -17.991918563842773, "step": 9350 }, { "epoch": 0.31531227880953183, "grad_norm": 25.02703857421875, "learning_rate": 8.653079802391943e-07, "logits/chosen": -1.0409469604492188, "logits/rejected": -1.2706401348114014, "logps/chosen": -1.9263372421264648, "logps/rejected": -1.8309767246246338, "loss": 4.0889, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.26337242126465, "rewards/margins": -0.9536054730415344, "rewards/rejected": -18.30976676940918, "step": 9355 }, { "epoch": 0.315480804880515, "grad_norm": 31.79754066467285, "learning_rate": 8.651070852914137e-07, "logits/chosen": -0.7451499104499817, "logits/rejected": -0.7770187258720398, "logps/chosen": -1.7138770818710327, "logps/rejected": -1.7701056003570557, "loss": 2.5513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.13877296447754, "rewards/margins": 0.5622828602790833, "rewards/rejected": -17.7010555267334, "step": 9360 }, { "epoch": 0.3156493309514982, "grad_norm": 12.896461486816406, "learning_rate": 8.649060639944557e-07, "logits/chosen": -0.5657856464385986, "logits/rejected": -0.6135199069976807, "logps/chosen": -2.1283740997314453, "logps/rejected": -2.3975729942321777, "loss": 1.6793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.283740997314453, "rewards/margins": 2.691988945007324, "rewards/rejected": -23.97572898864746, "step": 9365 }, { "epoch": 0.3158178570224814, "grad_norm": 27.075576782226562, "learning_rate": 8.647049164178857e-07, "logits/chosen": -1.006801724433899, "logits/rejected": -1.1020275354385376, "logps/chosen": -1.8209720849990845, "logps/rejected": -1.8261454105377197, "loss": 3.1815, "rewards/accuracies": 0.5, "rewards/chosen": -18.209720611572266, "rewards/margins": 0.05173378065228462, "rewards/rejected": -18.26145362854004, "step": 9370 }, { "epoch": 0.31598638309346455, "grad_norm": 40.327659606933594, "learning_rate": 8.645036426313128e-07, "logits/chosen": -1.0180786848068237, "logits/rejected": -1.0732612609863281, "logps/chosen": -2.100003719329834, "logps/rejected": -2.201460599899292, "loss": 2.8745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.000036239624023, "rewards/margins": 1.0145716667175293, "rewards/rejected": -22.014606475830078, "step": 9375 }, { "epoch": 0.3161549091644477, "grad_norm": 33.58378219604492, "learning_rate": 8.643022427043901e-07, "logits/chosen": -0.5249465703964233, "logits/rejected": -0.6569808125495911, "logps/chosen": -2.062403440475464, "logps/rejected": -2.0120742321014404, "loss": 3.6936, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.624034881591797, "rewards/margins": -0.5032904744148254, "rewards/rejected": -20.120744705200195, "step": 9380 }, { "epoch": 0.31632343523543094, "grad_norm": 22.796710968017578, "learning_rate": 8.641007167068141e-07, "logits/chosen": -0.9827069044113159, "logits/rejected": -1.0087189674377441, "logps/chosen": -2.1850533485412598, "logps/rejected": -2.171093225479126, "loss": 4.1346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.85053062438965, "rewards/margins": -0.1395985633134842, "rewards/rejected": -21.7109317779541, "step": 9385 }, { "epoch": 0.3164919613064141, "grad_norm": 16.15544319152832, "learning_rate": 8.638990647083252e-07, "logits/chosen": -0.7128480672836304, "logits/rejected": -0.888879120349884, "logps/chosen": -2.0813021659851074, "logps/rejected": -2.189892530441284, "loss": 2.7087, "rewards/accuracies": 0.5, "rewards/chosen": -20.81302261352539, "rewards/margins": 1.085901141166687, "rewards/rejected": -21.89892578125, "step": 9390 }, { "epoch": 0.31666048737739727, "grad_norm": 20.231306076049805, "learning_rate": 8.636972867787069e-07, "logits/chosen": -0.823818027973175, "logits/rejected": -0.8593767881393433, "logps/chosen": -2.1330342292785645, "logps/rejected": -2.3957901000976562, "loss": 1.9103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.330341339111328, "rewards/margins": 2.627556324005127, "rewards/rejected": -23.957897186279297, "step": 9395 }, { "epoch": 0.3168290134483805, "grad_norm": 18.477651596069336, "learning_rate": 8.634953829877869e-07, "logits/chosen": -0.855760395526886, "logits/rejected": -0.9541324377059937, "logps/chosen": -2.114394187927246, "logps/rejected": -2.291926860809326, "loss": 2.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.143939971923828, "rewards/margins": 1.7753299474716187, "rewards/rejected": -22.919269561767578, "step": 9400 }, { "epoch": 0.31699753951936366, "grad_norm": 38.60859298706055, "learning_rate": 8.632933534054359e-07, "logits/chosen": -0.571567177772522, "logits/rejected": -0.5827969312667847, "logps/chosen": -1.69004225730896, "logps/rejected": -1.6473404169082642, "loss": 3.5833, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.90042495727539, "rewards/margins": -0.4270210862159729, "rewards/rejected": -16.473403930664062, "step": 9405 }, { "epoch": 0.3171660655903468, "grad_norm": 39.37350082397461, "learning_rate": 8.630911981015683e-07, "logits/chosen": -0.3582506775856018, "logits/rejected": -0.49621373414993286, "logps/chosen": -2.3286869525909424, "logps/rejected": -2.4957258701324463, "loss": 2.3083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.286869049072266, "rewards/margins": 1.670389175415039, "rewards/rejected": -24.957258224487305, "step": 9410 }, { "epoch": 0.31733459166133, "grad_norm": 34.105995178222656, "learning_rate": 8.628889171461426e-07, "logits/chosen": -1.1902964115142822, "logits/rejected": -1.1787660121917725, "logps/chosen": -1.9384952783584595, "logps/rejected": -1.8969110250473022, "loss": 4.4437, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.384952545166016, "rewards/margins": -0.4158410131931305, "rewards/rejected": -18.9691104888916, "step": 9415 }, { "epoch": 0.3175031177323132, "grad_norm": 29.657711029052734, "learning_rate": 8.626865106091596e-07, "logits/chosen": -1.00395929813385, "logits/rejected": -1.1669594049453735, "logps/chosen": -1.771984338760376, "logps/rejected": -1.697410225868225, "loss": 3.8024, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.7198429107666, "rewards/margins": -0.7457417249679565, "rewards/rejected": -16.974102020263672, "step": 9420 }, { "epoch": 0.3176716438032964, "grad_norm": 22.412580490112305, "learning_rate": 8.624839785606648e-07, "logits/chosen": -1.3314557075500488, "logits/rejected": -1.3555439710617065, "logps/chosen": -1.5902462005615234, "logps/rejected": -1.7207386493682861, "loss": 2.1892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.902461051940918, "rewards/margins": 1.3049246072769165, "rewards/rejected": -17.207386016845703, "step": 9425 }, { "epoch": 0.31784016987427954, "grad_norm": 121.15837097167969, "learning_rate": 8.622813210707463e-07, "logits/chosen": -0.8635732531547546, "logits/rejected": -0.7936872243881226, "logps/chosen": -2.18469500541687, "logps/rejected": -2.245011568069458, "loss": 3.3173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.84695053100586, "rewards/margins": 0.6031640768051147, "rewards/rejected": -22.450115203857422, "step": 9430 }, { "epoch": 0.3180086959452627, "grad_norm": 12.949443817138672, "learning_rate": 8.620785382095357e-07, "logits/chosen": -0.571534276008606, "logits/rejected": -0.7806876301765442, "logps/chosen": -2.23927640914917, "logps/rejected": -2.2879207134246826, "loss": 4.3462, "rewards/accuracies": 0.5, "rewards/chosen": -22.392765045166016, "rewards/margins": 0.4864432215690613, "rewards/rejected": -22.879207611083984, "step": 9435 }, { "epoch": 0.31817722201624593, "grad_norm": 53.13159942626953, "learning_rate": 8.618756300472085e-07, "logits/chosen": -0.9639241099357605, "logits/rejected": -1.171112060546875, "logps/chosen": -1.884033441543579, "logps/rejected": -1.9627354145050049, "loss": 2.5174, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.840335845947266, "rewards/margins": 0.7870213389396667, "rewards/rejected": -19.627357482910156, "step": 9440 }, { "epoch": 0.3183457480872291, "grad_norm": 29.785985946655273, "learning_rate": 8.616725966539831e-07, "logits/chosen": -0.48238930106163025, "logits/rejected": -0.6622364521026611, "logps/chosen": -1.9177591800689697, "logps/rejected": -2.15956449508667, "loss": 2.3485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.17759132385254, "rewards/margins": 2.4180550575256348, "rewards/rejected": -21.595645904541016, "step": 9445 }, { "epoch": 0.31851427415821226, "grad_norm": 40.153404235839844, "learning_rate": 8.614694381001213e-07, "logits/chosen": -0.8885830044746399, "logits/rejected": -0.7523598670959473, "logps/chosen": -1.8252578973770142, "logps/rejected": -1.7716503143310547, "loss": 3.7513, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.252578735351562, "rewards/margins": -0.5360754132270813, "rewards/rejected": -17.716503143310547, "step": 9450 }, { "epoch": 0.3186828002291955, "grad_norm": 28.715364456176758, "learning_rate": 8.612661544559284e-07, "logits/chosen": -1.2380956411361694, "logits/rejected": -1.2025935649871826, "logps/chosen": -1.6656516790390015, "logps/rejected": -1.7289981842041016, "loss": 2.9496, "rewards/accuracies": 0.5, "rewards/chosen": -16.656518936157227, "rewards/margins": 0.633465588092804, "rewards/rejected": -17.289981842041016, "step": 9455 }, { "epoch": 0.31885132630017865, "grad_norm": 35.392513275146484, "learning_rate": 8.610627457917526e-07, "logits/chosen": -0.8869184255599976, "logits/rejected": -0.8394335508346558, "logps/chosen": -2.1466517448425293, "logps/rejected": -2.15181827545166, "loss": 3.3505, "rewards/accuracies": 0.5, "rewards/chosen": -21.46651840209961, "rewards/margins": 0.05166482925415039, "rewards/rejected": -21.5181827545166, "step": 9460 }, { "epoch": 0.3190198523711618, "grad_norm": 24.403085708618164, "learning_rate": 8.608592121779856e-07, "logits/chosen": -0.5069986581802368, "logits/rejected": -0.3841266632080078, "logps/chosen": -2.139840602874756, "logps/rejected": -2.1221654415130615, "loss": 3.2549, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.398408889770508, "rewards/margins": -0.17675228416919708, "rewards/rejected": -21.221654891967773, "step": 9465 }, { "epoch": 0.319188378442145, "grad_norm": 34.315101623535156, "learning_rate": 8.606555536850628e-07, "logits/chosen": -0.9683068990707397, "logits/rejected": -0.9776903986930847, "logps/chosen": -2.0443432331085205, "logps/rejected": -2.006239414215088, "loss": 3.6543, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.443431854248047, "rewards/margins": -0.3810390532016754, "rewards/rejected": -20.062393188476562, "step": 9470 }, { "epoch": 0.3193569045131282, "grad_norm": 18.212900161743164, "learning_rate": 8.604517703834622e-07, "logits/chosen": -0.9584037661552429, "logits/rejected": -0.8274585008621216, "logps/chosen": -1.603839635848999, "logps/rejected": -1.6559810638427734, "loss": 2.941, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.03839683532715, "rewards/margins": 0.5214144587516785, "rewards/rejected": -16.559810638427734, "step": 9475 }, { "epoch": 0.31952543058411137, "grad_norm": 31.893938064575195, "learning_rate": 8.60247862343705e-07, "logits/chosen": -0.7184762954711914, "logits/rejected": -0.7728734016418457, "logps/chosen": -1.8432371616363525, "logps/rejected": -1.8536771535873413, "loss": 3.3165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.432369232177734, "rewards/margins": 0.10440144687891006, "rewards/rejected": -18.536771774291992, "step": 9480 }, { "epoch": 0.31969395665509454, "grad_norm": 25.503551483154297, "learning_rate": 8.600438296363559e-07, "logits/chosen": -1.151992917060852, "logits/rejected": -1.1667964458465576, "logps/chosen": -1.5828298330307007, "logps/rejected": -1.6267452239990234, "loss": 2.7009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.828298568725586, "rewards/margins": 0.43915247917175293, "rewards/rejected": -16.267452239990234, "step": 9485 }, { "epoch": 0.3198624827260777, "grad_norm": 31.356788635253906, "learning_rate": 8.598396723320224e-07, "logits/chosen": -0.9881758689880371, "logits/rejected": -1.132152795791626, "logps/chosen": -1.8627265691757202, "logps/rejected": -1.8474178314208984, "loss": 3.6102, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.62726402282715, "rewards/margins": -0.15308618545532227, "rewards/rejected": -18.474178314208984, "step": 9490 }, { "epoch": 0.3200310087970609, "grad_norm": 119.1594467163086, "learning_rate": 8.596353905013556e-07, "logits/chosen": -0.33203741908073425, "logits/rejected": -0.49453800916671753, "logps/chosen": -2.566382884979248, "logps/rejected": -2.5018601417541504, "loss": 3.7362, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.663827896118164, "rewards/margins": -0.6452277302742004, "rewards/rejected": -25.018598556518555, "step": 9495 }, { "epoch": 0.3201995348680441, "grad_norm": 100.49848937988281, "learning_rate": 8.594309842150491e-07, "logits/chosen": -1.0113314390182495, "logits/rejected": -0.9509127736091614, "logps/chosen": -1.7100799083709717, "logps/rejected": -1.7621523141860962, "loss": 2.6916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.100797653198242, "rewards/margins": 0.5207257866859436, "rewards/rejected": -17.621524810791016, "step": 9500 }, { "epoch": 0.32036806093902725, "grad_norm": 28.509531021118164, "learning_rate": 8.592264535438399e-07, "logits/chosen": -0.9975595474243164, "logits/rejected": -1.1303162574768066, "logps/chosen": -1.623504400253296, "logps/rejected": -1.6738468408584595, "loss": 2.7432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.235044479370117, "rewards/margins": 0.5034238696098328, "rewards/rejected": -16.738468170166016, "step": 9505 }, { "epoch": 0.3205365870100104, "grad_norm": 17.786806106567383, "learning_rate": 8.590217985585083e-07, "logits/chosen": -0.43555235862731934, "logits/rejected": -0.549434244632721, "logps/chosen": -2.0390753746032715, "logps/rejected": -2.358064889907837, "loss": 2.2292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.39075469970703, "rewards/margins": 3.1898932456970215, "rewards/rejected": -23.58064842224121, "step": 9510 }, { "epoch": 0.32070511308099364, "grad_norm": 69.22200012207031, "learning_rate": 8.588170193298769e-07, "logits/chosen": -0.5065039396286011, "logits/rejected": -0.5373364686965942, "logps/chosen": -2.0895285606384277, "logps/rejected": -2.2371814250946045, "loss": 2.788, "rewards/accuracies": 0.5, "rewards/chosen": -20.895288467407227, "rewards/margins": 1.4765275716781616, "rewards/rejected": -22.371814727783203, "step": 9515 }, { "epoch": 0.3208736391519768, "grad_norm": 14.257323265075684, "learning_rate": 8.58612115928812e-07, "logits/chosen": -1.1542446613311768, "logits/rejected": -1.0949870347976685, "logps/chosen": -1.9531142711639404, "logps/rejected": -2.2141025066375732, "loss": 2.0862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.531143188476562, "rewards/margins": 2.609881639480591, "rewards/rejected": -22.14102554321289, "step": 9520 }, { "epoch": 0.32104216522296, "grad_norm": 37.42106628417969, "learning_rate": 8.584070884262225e-07, "logits/chosen": -1.0090105533599854, "logits/rejected": -1.0935182571411133, "logps/chosen": -1.9097763299942017, "logps/rejected": -2.0177340507507324, "loss": 2.1412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.09776496887207, "rewards/margins": 1.0795772075653076, "rewards/rejected": -20.17734146118164, "step": 9525 }, { "epoch": 0.3212106912939432, "grad_norm": 24.182695388793945, "learning_rate": 8.582019368930605e-07, "logits/chosen": -1.008709192276001, "logits/rejected": -1.2963372468948364, "logps/chosen": -1.8214937448501587, "logps/rejected": -2.3255906105041504, "loss": 2.9947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.21493911743164, "rewards/margins": 5.0409698486328125, "rewards/rejected": -23.25590705871582, "step": 9530 }, { "epoch": 0.32137921736492636, "grad_norm": 48.76929473876953, "learning_rate": 8.579966614003206e-07, "logits/chosen": -0.8978897929191589, "logits/rejected": -0.8626217842102051, "logps/chosen": -1.9985179901123047, "logps/rejected": -2.1461033821105957, "loss": 2.7831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.985179901123047, "rewards/margins": 1.4758514165878296, "rewards/rejected": -21.461029052734375, "step": 9535 }, { "epoch": 0.3215477434359095, "grad_norm": 17.92789649963379, "learning_rate": 8.577912620190408e-07, "logits/chosen": -1.0495909452438354, "logits/rejected": -1.0890016555786133, "logps/chosen": -1.8421592712402344, "logps/rejected": -2.0374155044555664, "loss": 2.5219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.42159080505371, "rewards/margins": 1.9525625705718994, "rewards/rejected": -20.374156951904297, "step": 9540 }, { "epoch": 0.3217162695068927, "grad_norm": 19.91221046447754, "learning_rate": 8.575857388203016e-07, "logits/chosen": -0.7675440907478333, "logits/rejected": -0.9447044134140015, "logps/chosen": -1.7784792184829712, "logps/rejected": -2.116765260696411, "loss": 2.1139, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.784793853759766, "rewards/margins": 3.382856845855713, "rewards/rejected": -21.16765022277832, "step": 9545 }, { "epoch": 0.3218847955778759, "grad_norm": 17.915637969970703, "learning_rate": 8.573800918752266e-07, "logits/chosen": -1.1638829708099365, "logits/rejected": -1.052834391593933, "logps/chosen": -1.5051783323287964, "logps/rejected": -1.4815104007720947, "loss": 3.3848, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.051783561706543, "rewards/margins": -0.23667888343334198, "rewards/rejected": -14.815104484558105, "step": 9550 }, { "epoch": 0.3220533216488591, "grad_norm": 22.767057418823242, "learning_rate": 8.571743212549817e-07, "logits/chosen": -0.7604719400405884, "logits/rejected": -1.0411133766174316, "logps/chosen": -1.8910396099090576, "logps/rejected": -1.7851155996322632, "loss": 4.1678, "rewards/accuracies": 0.5, "rewards/chosen": -18.910396575927734, "rewards/margins": -1.0592386722564697, "rewards/rejected": -17.85115623474121, "step": 9555 }, { "epoch": 0.32222184771984225, "grad_norm": 199.3408966064453, "learning_rate": 8.569684270307767e-07, "logits/chosen": -0.7701305150985718, "logits/rejected": -0.8080227971076965, "logps/chosen": -2.486893892288208, "logps/rejected": -2.411651134490967, "loss": 4.5925, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.868938446044922, "rewards/margins": -0.7524264454841614, "rewards/rejected": -24.11651039123535, "step": 9560 }, { "epoch": 0.3223903737908254, "grad_norm": 28.15599822998047, "learning_rate": 8.567624092738629e-07, "logits/chosen": -0.9503934979438782, "logits/rejected": -1.0396947860717773, "logps/chosen": -1.709754228591919, "logps/rejected": -1.7016417980194092, "loss": 3.2697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.09754180908203, "rewards/margins": -0.08112458884716034, "rewards/rejected": -17.01641845703125, "step": 9565 }, { "epoch": 0.32255889986180863, "grad_norm": 25.845806121826172, "learning_rate": 8.565562680555351e-07, "logits/chosen": -0.9343695640563965, "logits/rejected": -0.6249425411224365, "logps/chosen": -1.81558096408844, "logps/rejected": -1.869520902633667, "loss": 3.1009, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.155807495117188, "rewards/margins": 0.5394006967544556, "rewards/rejected": -18.695209503173828, "step": 9570 }, { "epoch": 0.3227274259327918, "grad_norm": 30.309011459350586, "learning_rate": 8.563500034471308e-07, "logits/chosen": -0.9814979434013367, "logits/rejected": -1.0041558742523193, "logps/chosen": -1.9443080425262451, "logps/rejected": -1.9487826824188232, "loss": 3.2169, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.44308090209961, "rewards/margins": 0.04474801942706108, "rewards/rejected": -19.48782730102539, "step": 9575 }, { "epoch": 0.32289595200377497, "grad_norm": 27.450937271118164, "learning_rate": 8.561436155200299e-07, "logits/chosen": -0.8652593493461609, "logits/rejected": -0.8776264190673828, "logps/chosen": -2.157226800918579, "logps/rejected": -2.30631947517395, "loss": 2.236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.572269439697266, "rewards/margins": 1.4909271001815796, "rewards/rejected": -23.063194274902344, "step": 9580 }, { "epoch": 0.3230644780747582, "grad_norm": 25.80462646484375, "learning_rate": 8.559371043456551e-07, "logits/chosen": -0.7589584589004517, "logits/rejected": -0.8819445371627808, "logps/chosen": -2.0037195682525635, "logps/rejected": -2.218597173690796, "loss": 1.9919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.037195205688477, "rewards/margins": 2.1487772464752197, "rewards/rejected": -22.185970306396484, "step": 9585 }, { "epoch": 0.32323300414574135, "grad_norm": 23.218080520629883, "learning_rate": 8.55730469995472e-07, "logits/chosen": -0.8827985525131226, "logits/rejected": -0.8832064867019653, "logps/chosen": -2.133117198944092, "logps/rejected": -2.1974258422851562, "loss": 2.6685, "rewards/accuracies": 0.5, "rewards/chosen": -21.331172943115234, "rewards/margins": 0.6430840492248535, "rewards/rejected": -21.974258422851562, "step": 9590 }, { "epoch": 0.3234015302167245, "grad_norm": 28.35801887512207, "learning_rate": 8.555237125409882e-07, "logits/chosen": -0.9255178570747375, "logits/rejected": -0.8860748410224915, "logps/chosen": -2.0652050971984863, "logps/rejected": -2.2760536670684814, "loss": 2.1149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.652048110961914, "rewards/margins": 2.1084868907928467, "rewards/rejected": -22.76053810119629, "step": 9595 }, { "epoch": 0.3235700562877077, "grad_norm": 14.770458221435547, "learning_rate": 8.553168320537547e-07, "logits/chosen": -1.5179839134216309, "logits/rejected": -1.4965096712112427, "logps/chosen": -2.1413462162017822, "logps/rejected": -2.4134342670440674, "loss": 2.701, "rewards/accuracies": 0.5, "rewards/chosen": -21.413461685180664, "rewards/margins": 2.7208831310272217, "rewards/rejected": -24.13434410095215, "step": 9600 }, { "epoch": 0.3235700562877077, "eval_logits/chosen": -1.2073123455047607, "eval_logits/rejected": -1.278466820716858, "eval_logps/chosen": -1.8508267402648926, "eval_logps/rejected": -1.9134628772735596, "eval_loss": 3.102193593978882, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.50826644897461, "eval_rewards/margins": 0.6263617873191833, "eval_rewards/rejected": -19.134628295898438, "eval_runtime": 12.9203, "eval_samples_per_second": 7.74, "eval_steps_per_second": 1.935, "step": 9600 }, { "epoch": 0.3237385823586909, "grad_norm": 40.8096923828125, "learning_rate": 8.551098286053647e-07, "logits/chosen": -1.0502384901046753, "logits/rejected": -0.9606598019599915, "logps/chosen": -1.8660523891448975, "logps/rejected": -1.9286441802978516, "loss": 2.6769, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.660526275634766, "rewards/margins": 0.6259174346923828, "rewards/rejected": -19.286441802978516, "step": 9605 }, { "epoch": 0.3239071084296741, "grad_norm": 31.944400787353516, "learning_rate": 8.549027022674536e-07, "logits/chosen": -1.1471023559570312, "logits/rejected": -1.282149314880371, "logps/chosen": -1.9713191986083984, "logps/rejected": -2.035529851913452, "loss": 2.892, "rewards/accuracies": 0.5, "rewards/chosen": -19.713191986083984, "rewards/margins": 0.642108142375946, "rewards/rejected": -20.35529899597168, "step": 9610 }, { "epoch": 0.32407563450065724, "grad_norm": 12.34630012512207, "learning_rate": 8.546954531116999e-07, "logits/chosen": -0.7726519703865051, "logits/rejected": -1.0031630992889404, "logps/chosen": -1.6736605167388916, "logps/rejected": -1.8040440082550049, "loss": 2.2425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.73660659790039, "rewards/margins": 1.3038337230682373, "rewards/rejected": -18.04043960571289, "step": 9615 }, { "epoch": 0.3242441605716404, "grad_norm": 47.13421630859375, "learning_rate": 8.544880812098242e-07, "logits/chosen": -1.16587495803833, "logits/rejected": -0.9911531209945679, "logps/chosen": -1.9475829601287842, "logps/rejected": -1.936532974243164, "loss": 3.2225, "rewards/accuracies": 0.5, "rewards/chosen": -19.475828170776367, "rewards/margins": -0.11049928516149521, "rewards/rejected": -19.36532974243164, "step": 9620 }, { "epoch": 0.3244126866426236, "grad_norm": 21.326196670532227, "learning_rate": 8.542805866335902e-07, "logits/chosen": -0.9639849662780762, "logits/rejected": -1.0168709754943848, "logps/chosen": -1.6442312002182007, "logps/rejected": -1.623727560043335, "loss": 3.3254, "rewards/accuracies": 0.5, "rewards/chosen": -16.442312240600586, "rewards/margins": -0.20503464341163635, "rewards/rejected": -16.23727798461914, "step": 9625 }, { "epoch": 0.3245812127136068, "grad_norm": 22.18870735168457, "learning_rate": 8.54072969454803e-07, "logits/chosen": -0.5071839094161987, "logits/rejected": -0.3684050142765045, "logps/chosen": -2.0935750007629395, "logps/rejected": -2.424656391143799, "loss": 2.7616, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.935749053955078, "rewards/margins": 3.3108131885528564, "rewards/rejected": -24.246562957763672, "step": 9630 }, { "epoch": 0.32474973878458996, "grad_norm": 21.057998657226562, "learning_rate": 8.53865229745311e-07, "logits/chosen": -0.9777098894119263, "logits/rejected": -1.1507208347320557, "logps/chosen": -1.7663494348526, "logps/rejected": -1.8777844905853271, "loss": 3.0791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.663494110107422, "rewards/margins": 1.114350438117981, "rewards/rejected": -18.777843475341797, "step": 9635 }, { "epoch": 0.3249182648555732, "grad_norm": 26.480030059814453, "learning_rate": 8.536573675770048e-07, "logits/chosen": -0.8730767965316772, "logits/rejected": -0.8443295359611511, "logps/chosen": -1.9790958166122437, "logps/rejected": -2.0051825046539307, "loss": 3.1889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.790958404541016, "rewards/margins": 0.2608667314052582, "rewards/rejected": -20.05182456970215, "step": 9640 }, { "epoch": 0.32508679092655635, "grad_norm": 27.223636627197266, "learning_rate": 8.534493830218173e-07, "logits/chosen": -0.8448840379714966, "logits/rejected": -0.9053970575332642, "logps/chosen": -1.7872326374053955, "logps/rejected": -1.7945051193237305, "loss": 3.2698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.872325897216797, "rewards/margins": 0.07272644340991974, "rewards/rejected": -17.945053100585938, "step": 9645 }, { "epoch": 0.3252553169975395, "grad_norm": 22.850114822387695, "learning_rate": 8.532412761517236e-07, "logits/chosen": -0.45379573106765747, "logits/rejected": -0.5692591071128845, "logps/chosen": -1.8840221166610718, "logps/rejected": -1.8490978479385376, "loss": 3.6867, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.840221405029297, "rewards/margins": -0.3492446839809418, "rewards/rejected": -18.490978240966797, "step": 9650 }, { "epoch": 0.3254238430685227, "grad_norm": 30.58220672607422, "learning_rate": 8.530330470387412e-07, "logits/chosen": -1.219278335571289, "logits/rejected": -1.2407810688018799, "logps/chosen": -1.8869062662124634, "logps/rejected": -1.8954986333847046, "loss": 3.1748, "rewards/accuracies": 0.5, "rewards/chosen": -18.869060516357422, "rewards/margins": 0.08592300117015839, "rewards/rejected": -18.954986572265625, "step": 9655 }, { "epoch": 0.3255923691395059, "grad_norm": 23.84304428100586, "learning_rate": 8.528246957549303e-07, "logits/chosen": -1.1590244770050049, "logits/rejected": -1.387596607208252, "logps/chosen": -1.9695241451263428, "logps/rejected": -2.313225269317627, "loss": 2.8357, "rewards/accuracies": 0.5, "rewards/chosen": -19.69524383544922, "rewards/margins": 3.4370105266571045, "rewards/rejected": -23.132251739501953, "step": 9660 }, { "epoch": 0.32576089521048907, "grad_norm": 25.04132080078125, "learning_rate": 8.52616222372393e-07, "logits/chosen": -0.4334254264831543, "logits/rejected": -0.39630335569381714, "logps/chosen": -1.6619259119033813, "logps/rejected": -1.7265924215316772, "loss": 2.4997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.619258880615234, "rewards/margins": 0.6466663479804993, "rewards/rejected": -17.265926361083984, "step": 9665 }, { "epoch": 0.32592942128147223, "grad_norm": 34.72787857055664, "learning_rate": 8.524076269632736e-07, "logits/chosen": -0.741960346698761, "logits/rejected": -0.7212079763412476, "logps/chosen": -1.7507514953613281, "logps/rejected": -1.8867127895355225, "loss": 2.0226, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.50751304626465, "rewards/margins": 1.3596150875091553, "rewards/rejected": -18.867128372192383, "step": 9670 }, { "epoch": 0.3260979473524554, "grad_norm": 41.89925003051758, "learning_rate": 8.521989095997589e-07, "logits/chosen": -0.8195021748542786, "logits/rejected": -0.7949432730674744, "logps/chosen": -1.8670480251312256, "logps/rejected": -2.0466132164001465, "loss": 2.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.670482635498047, "rewards/margins": 1.7956523895263672, "rewards/rejected": -20.46613311767578, "step": 9675 }, { "epoch": 0.3262664734234386, "grad_norm": 21.182729721069336, "learning_rate": 8.519900703540776e-07, "logits/chosen": -1.038873553276062, "logits/rejected": -0.9915347099304199, "logps/chosen": -1.9196319580078125, "logps/rejected": -2.232133150100708, "loss": 2.1505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.196319580078125, "rewards/margins": 3.1250100135803223, "rewards/rejected": -22.32132911682129, "step": 9680 }, { "epoch": 0.3264349994944218, "grad_norm": 23.9052677154541, "learning_rate": 8.517811092985008e-07, "logits/chosen": -0.7712268829345703, "logits/rejected": -0.8451194763183594, "logps/chosen": -2.0270209312438965, "logps/rejected": -2.146897792816162, "loss": 2.9616, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.27021026611328, "rewards/margins": 1.1987684965133667, "rewards/rejected": -21.468978881835938, "step": 9685 }, { "epoch": 0.32660352556540495, "grad_norm": 30.760866165161133, "learning_rate": 8.515720265053416e-07, "logits/chosen": -0.8517478704452515, "logits/rejected": -0.6468242406845093, "logps/chosen": -1.8079086542129517, "logps/rejected": -1.719451665878296, "loss": 3.981, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.079084396362305, "rewards/margins": -0.8845663070678711, "rewards/rejected": -17.19451904296875, "step": 9690 }, { "epoch": 0.32677205163638817, "grad_norm": 2.7800254821777344, "learning_rate": 8.513628220469556e-07, "logits/chosen": -0.8500580787658691, "logits/rejected": -0.9404166340827942, "logps/chosen": -1.8277915716171265, "logps/rejected": -2.126081705093384, "loss": 1.8373, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.277915954589844, "rewards/margins": 2.982900619506836, "rewards/rejected": -21.26081657409668, "step": 9695 }, { "epoch": 0.32694057770737134, "grad_norm": 104.6100845336914, "learning_rate": 8.5115349599574e-07, "logits/chosen": -0.7318485975265503, "logits/rejected": -0.7622694373130798, "logps/chosen": -2.2147345542907715, "logps/rejected": -2.025134801864624, "loss": 5.1087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.1473445892334, "rewards/margins": -1.895997405052185, "rewards/rejected": -20.2513484954834, "step": 9700 }, { "epoch": 0.3271091037783545, "grad_norm": 17.52537727355957, "learning_rate": 8.509440484241342e-07, "logits/chosen": -1.0226285457611084, "logits/rejected": -1.0693570375442505, "logps/chosen": -2.1967062950134277, "logps/rejected": -2.1093969345092773, "loss": 4.3936, "rewards/accuracies": 0.5, "rewards/chosen": -21.967063903808594, "rewards/margins": -0.8730939030647278, "rewards/rejected": -21.09396743774414, "step": 9705 }, { "epoch": 0.32727762984933767, "grad_norm": 19.541439056396484, "learning_rate": 8.507344794046201e-07, "logits/chosen": -1.3186284303665161, "logits/rejected": -1.2661542892456055, "logps/chosen": -1.856690764427185, "logps/rejected": -1.9217170476913452, "loss": 2.7054, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.566905975341797, "rewards/margins": 0.6502623558044434, "rewards/rejected": -19.21717071533203, "step": 9710 }, { "epoch": 0.3274461559203209, "grad_norm": 22.625598907470703, "learning_rate": 8.505247890097208e-07, "logits/chosen": -0.7529505491256714, "logits/rejected": -0.941428005695343, "logps/chosen": -1.9435609579086304, "logps/rejected": -2.5068881511688232, "loss": 1.0568, "rewards/accuracies": 1.0, "rewards/chosen": -19.435611724853516, "rewards/margins": 5.633272647857666, "rewards/rejected": -25.06888198852539, "step": 9715 }, { "epoch": 0.32761468199130406, "grad_norm": 16.12258529663086, "learning_rate": 8.503149773120023e-07, "logits/chosen": -0.8368858098983765, "logits/rejected": -0.9379183053970337, "logps/chosen": -2.060410976409912, "logps/rejected": -2.178067445755005, "loss": 3.0949, "rewards/accuracies": 0.5, "rewards/chosen": -20.604108810424805, "rewards/margins": 1.1765660047531128, "rewards/rejected": -21.78067398071289, "step": 9720 }, { "epoch": 0.3277832080622872, "grad_norm": 23.239463806152344, "learning_rate": 8.501050443840721e-07, "logits/chosen": -1.0896296501159668, "logits/rejected": -1.2426843643188477, "logps/chosen": -1.9288963079452515, "logps/rejected": -2.2791318893432617, "loss": 1.6746, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.288963317871094, "rewards/margins": 3.502357006072998, "rewards/rejected": -22.791318893432617, "step": 9725 }, { "epoch": 0.3279517341332704, "grad_norm": 29.32204246520996, "learning_rate": 8.498949902985795e-07, "logits/chosen": -0.5991629362106323, "logits/rejected": -0.5655861496925354, "logps/chosen": -2.132098436355591, "logps/rejected": -2.0694327354431152, "loss": 4.0641, "rewards/accuracies": 0.5, "rewards/chosen": -21.32098388671875, "rewards/margins": -0.6266528367996216, "rewards/rejected": -20.6943302154541, "step": 9730 }, { "epoch": 0.3281202602042536, "grad_norm": 43.981815338134766, "learning_rate": 8.49684815128216e-07, "logits/chosen": -0.7278081178665161, "logits/rejected": -0.7373084425926208, "logps/chosen": -2.299593448638916, "logps/rejected": -2.4935460090637207, "loss": 1.894, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.99593734741211, "rewards/margins": 1.9395256042480469, "rewards/rejected": -24.935461044311523, "step": 9735 }, { "epoch": 0.3282887862752368, "grad_norm": 18.560821533203125, "learning_rate": 8.494745189457151e-07, "logits/chosen": -1.152789831161499, "logits/rejected": -1.0454437732696533, "logps/chosen": -1.897719144821167, "logps/rejected": -2.0982089042663574, "loss": 3.2824, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.977191925048828, "rewards/margins": 2.0048956871032715, "rewards/rejected": -20.98208999633789, "step": 9740 }, { "epoch": 0.32845731234621994, "grad_norm": 114.25135040283203, "learning_rate": 8.49264101823852e-07, "logits/chosen": -0.6349064111709595, "logits/rejected": -0.6722576022148132, "logps/chosen": -2.2356173992156982, "logps/rejected": -2.3247010707855225, "loss": 2.9115, "rewards/accuracies": 0.5, "rewards/chosen": -22.35617446899414, "rewards/margins": 0.8908360600471497, "rewards/rejected": -23.24700927734375, "step": 9745 }, { "epoch": 0.32862583841720316, "grad_norm": 45.583343505859375, "learning_rate": 8.490535638354436e-07, "logits/chosen": -0.9422609210014343, "logits/rejected": -0.7428984045982361, "logps/chosen": -1.7766081094741821, "logps/rejected": -1.7448434829711914, "loss": 3.4315, "rewards/accuracies": 0.5, "rewards/chosen": -17.766080856323242, "rewards/margins": -0.31764650344848633, "rewards/rejected": -17.448434829711914, "step": 9750 }, { "epoch": 0.32879436448818633, "grad_norm": 26.040782928466797, "learning_rate": 8.48842905053349e-07, "logits/chosen": -0.9188686609268188, "logits/rejected": -1.0052945613861084, "logps/chosen": -2.036522626876831, "logps/rejected": -2.0704965591430664, "loss": 2.982, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.36522674560547, "rewards/margins": 0.33973854780197144, "rewards/rejected": -20.70496368408203, "step": 9755 }, { "epoch": 0.3289628905591695, "grad_norm": 27.022668838500977, "learning_rate": 8.486321255504687e-07, "logits/chosen": -1.0087183713912964, "logits/rejected": -1.115375280380249, "logps/chosen": -1.6923017501831055, "logps/rejected": -1.6801421642303467, "loss": 3.1842, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.923015594482422, "rewards/margins": -0.12159526348114014, "rewards/rejected": -16.801422119140625, "step": 9760 }, { "epoch": 0.32913141663015266, "grad_norm": 40.464080810546875, "learning_rate": 8.484212253997455e-07, "logits/chosen": -0.6966105103492737, "logits/rejected": -0.9393747448921204, "logps/chosen": -2.2360169887542725, "logps/rejected": -1.9587417840957642, "loss": 5.9142, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.360172271728516, "rewards/margins": -2.772754430770874, "rewards/rejected": -19.587417602539062, "step": 9765 }, { "epoch": 0.3292999427011359, "grad_norm": 45.77811050415039, "learning_rate": 8.482102046741633e-07, "logits/chosen": -0.8145803213119507, "logits/rejected": -0.793364405632019, "logps/chosen": -2.1728711128234863, "logps/rejected": -2.2482926845550537, "loss": 3.1594, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.728710174560547, "rewards/margins": 0.7542173266410828, "rewards/rejected": -22.482927322387695, "step": 9770 }, { "epoch": 0.32946846877211905, "grad_norm": 53.0908203125, "learning_rate": 8.479990634467482e-07, "logits/chosen": -0.9707155227661133, "logits/rejected": -0.947056770324707, "logps/chosen": -2.244295120239258, "logps/rejected": -2.3716988563537598, "loss": 2.0424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.442951202392578, "rewards/margins": 1.2740370035171509, "rewards/rejected": -23.71698760986328, "step": 9775 }, { "epoch": 0.3296369948431022, "grad_norm": 23.31206512451172, "learning_rate": 8.47787801790568e-07, "logits/chosen": -0.6974349021911621, "logits/rejected": -0.9545847177505493, "logps/chosen": -1.9938684701919556, "logps/rejected": -2.0778064727783203, "loss": 2.9787, "rewards/accuracies": 0.5, "rewards/chosen": -19.93868637084961, "rewards/margins": 0.8393799662590027, "rewards/rejected": -20.778064727783203, "step": 9780 }, { "epoch": 0.3298055209140854, "grad_norm": 23.06899070739746, "learning_rate": 8.475764197787317e-07, "logits/chosen": -1.0806128978729248, "logits/rejected": -0.8885553479194641, "logps/chosen": -1.8322607278823853, "logps/rejected": -1.838618516921997, "loss": 3.6789, "rewards/accuracies": 0.5, "rewards/chosen": -18.322607040405273, "rewards/margins": 0.0635797530412674, "rewards/rejected": -18.386188507080078, "step": 9785 }, { "epoch": 0.3299740469850686, "grad_norm": 22.257450103759766, "learning_rate": 8.473649174843906e-07, "logits/chosen": -1.033005952835083, "logits/rejected": -1.2247979640960693, "logps/chosen": -1.7999452352523804, "logps/rejected": -1.8376827239990234, "loss": 3.2559, "rewards/accuracies": 0.5, "rewards/chosen": -17.999454498291016, "rewards/margins": 0.37737494707107544, "rewards/rejected": -18.376827239990234, "step": 9790 }, { "epoch": 0.33014257305605177, "grad_norm": 38.79618835449219, "learning_rate": 8.471532949807372e-07, "logits/chosen": -0.8862309455871582, "logits/rejected": -0.8179659843444824, "logps/chosen": -1.777960181236267, "logps/rejected": -1.7753187417984009, "loss": 3.6499, "rewards/accuracies": 0.5, "rewards/chosen": -17.779598236083984, "rewards/margins": -0.026413727551698685, "rewards/rejected": -17.75318717956543, "step": 9795 }, { "epoch": 0.33031109912703494, "grad_norm": 15.938300132751465, "learning_rate": 8.469415523410056e-07, "logits/chosen": -0.6858940124511719, "logits/rejected": -0.6337902545928955, "logps/chosen": -1.9199140071868896, "logps/rejected": -2.024050235748291, "loss": 2.3887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.199140548706055, "rewards/margins": 1.0413639545440674, "rewards/rejected": -20.240503311157227, "step": 9800 }, { "epoch": 0.33047962519801816, "grad_norm": 27.558385848999023, "learning_rate": 8.467296896384717e-07, "logits/chosen": -0.9122729301452637, "logits/rejected": -0.9424211382865906, "logps/chosen": -1.8392369747161865, "logps/rejected": -2.293322801589966, "loss": 2.3289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.39236831665039, "rewards/margins": 4.540858268737793, "rewards/rejected": -22.9332275390625, "step": 9805 }, { "epoch": 0.3306481512690013, "grad_norm": 17.154170989990234, "learning_rate": 8.465177069464528e-07, "logits/chosen": -0.7819756269454956, "logits/rejected": -0.9240609407424927, "logps/chosen": -1.9444135427474976, "logps/rejected": -2.0653367042541504, "loss": 2.3107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.444137573242188, "rewards/margins": 1.209228754043579, "rewards/rejected": -20.653366088867188, "step": 9810 }, { "epoch": 0.3308166773399845, "grad_norm": 19.14472198486328, "learning_rate": 8.463056043383079e-07, "logits/chosen": -0.9145916700363159, "logits/rejected": -0.8885605931282043, "logps/chosen": -2.2163727283477783, "logps/rejected": -2.3537745475769043, "loss": 3.1827, "rewards/accuracies": 0.5, "rewards/chosen": -22.163726806640625, "rewards/margins": 1.3740198612213135, "rewards/rejected": -23.53774642944336, "step": 9815 }, { "epoch": 0.33098520341096765, "grad_norm": 22.862253189086914, "learning_rate": 8.460933818874372e-07, "logits/chosen": -1.0414626598358154, "logits/rejected": -1.1654443740844727, "logps/chosen": -1.8751583099365234, "logps/rejected": -1.9087566137313843, "loss": 2.9526, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.751583099365234, "rewards/margins": 0.3359828591346741, "rewards/rejected": -19.08756446838379, "step": 9820 }, { "epoch": 0.3311537294819509, "grad_norm": 18.878591537475586, "learning_rate": 8.458810396672827e-07, "logits/chosen": -0.9540130496025085, "logits/rejected": -1.0682528018951416, "logps/chosen": -2.067431688308716, "logps/rejected": -2.05672025680542, "loss": 3.8316, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.674314498901367, "rewards/margins": -0.10711526870727539, "rewards/rejected": -20.56719970703125, "step": 9825 }, { "epoch": 0.33132225555293404, "grad_norm": 20.707855224609375, "learning_rate": 8.456685777513273e-07, "logits/chosen": -1.0735938549041748, "logits/rejected": -1.0637882947921753, "logps/chosen": -1.8889278173446655, "logps/rejected": -2.131657600402832, "loss": 2.0185, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.8892765045166, "rewards/margins": 2.4273009300231934, "rewards/rejected": -21.316577911376953, "step": 9830 }, { "epoch": 0.3314907816239172, "grad_norm": 50.769004821777344, "learning_rate": 8.45455996213096e-07, "logits/chosen": -1.1742569208145142, "logits/rejected": -1.2606443166732788, "logps/chosen": -1.8508479595184326, "logps/rejected": -1.8263981342315674, "loss": 3.3138, "rewards/accuracies": 0.5, "rewards/chosen": -18.508480072021484, "rewards/margins": -0.24450120329856873, "rewards/rejected": -18.263980865478516, "step": 9835 }, { "epoch": 0.3316593076949004, "grad_norm": 19.067001342773438, "learning_rate": 8.452432951261548e-07, "logits/chosen": -0.9642957448959351, "logits/rejected": -1.1399331092834473, "logps/chosen": -2.040045738220215, "logps/rejected": -2.4999804496765137, "loss": 3.3077, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.400455474853516, "rewards/margins": 4.599349021911621, "rewards/rejected": -24.99980354309082, "step": 9840 }, { "epoch": 0.3318278337658836, "grad_norm": 26.52146339416504, "learning_rate": 8.450304745641112e-07, "logits/chosen": -1.0799713134765625, "logits/rejected": -0.9734416007995605, "logps/chosen": -1.9313831329345703, "logps/rejected": -1.8945175409317017, "loss": 3.4691, "rewards/accuracies": 0.5, "rewards/chosen": -19.313831329345703, "rewards/margins": -0.3686564564704895, "rewards/rejected": -18.945175170898438, "step": 9845 }, { "epoch": 0.33199635983686676, "grad_norm": 19.426136016845703, "learning_rate": 8.448175346006141e-07, "logits/chosen": -0.675517737865448, "logits/rejected": -0.7951253056526184, "logps/chosen": -1.6109631061553955, "logps/rejected": -1.7086893320083618, "loss": 2.5614, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.109630584716797, "rewards/margins": 0.9772618412971497, "rewards/rejected": -17.08689308166504, "step": 9850 }, { "epoch": 0.3321648859078499, "grad_norm": 29.161775588989258, "learning_rate": 8.446044753093535e-07, "logits/chosen": -0.7901461720466614, "logits/rejected": -0.745838463306427, "logps/chosen": -2.0752806663513184, "logps/rejected": -2.0568737983703613, "loss": 3.6689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.752811431884766, "rewards/margins": -0.1840725839138031, "rewards/rejected": -20.568737030029297, "step": 9855 }, { "epoch": 0.33233341197883315, "grad_norm": 2.2476565837860107, "learning_rate": 8.44391296764061e-07, "logits/chosen": -0.9421189427375793, "logits/rejected": -1.1054116487503052, "logps/chosen": -2.5878076553344727, "logps/rejected": -2.8351638317108154, "loss": 2.177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.878076553344727, "rewards/margins": 2.4735639095306396, "rewards/rejected": -28.351642608642578, "step": 9860 }, { "epoch": 0.3325019380498163, "grad_norm": 29.23499870300293, "learning_rate": 8.441779990385089e-07, "logits/chosen": -0.7637979388237, "logits/rejected": -0.8095385432243347, "logps/chosen": -2.1008362770080566, "logps/rejected": -2.135150909423828, "loss": 3.0109, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.008363723754883, "rewards/margins": 0.34314537048339844, "rewards/rejected": -21.35150909423828, "step": 9865 }, { "epoch": 0.3326704641207995, "grad_norm": 39.963294982910156, "learning_rate": 8.439645822065115e-07, "logits/chosen": -0.7813414335250854, "logits/rejected": -0.8605804443359375, "logps/chosen": -2.0555100440979004, "logps/rejected": -2.060969829559326, "loss": 3.9695, "rewards/accuracies": 0.5, "rewards/chosen": -20.555099487304688, "rewards/margins": 0.05459785461425781, "rewards/rejected": -20.609697341918945, "step": 9870 }, { "epoch": 0.33283899019178265, "grad_norm": 59.06498336791992, "learning_rate": 8.43751046341924e-07, "logits/chosen": -0.9045795202255249, "logits/rejected": -1.0206382274627686, "logps/chosen": -2.1519813537597656, "logps/rejected": -2.279313564300537, "loss": 2.3572, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.519811630249023, "rewards/margins": 1.2733227014541626, "rewards/rejected": -22.793134689331055, "step": 9875 }, { "epoch": 0.33300751626276587, "grad_norm": 65.2740249633789, "learning_rate": 8.435373915186426e-07, "logits/chosen": -0.8387653231620789, "logits/rejected": -0.8890382647514343, "logps/chosen": -2.2679905891418457, "logps/rejected": -2.2393336296081543, "loss": 3.4295, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.67991065979004, "rewards/margins": -0.2865728437900543, "rewards/rejected": -22.393335342407227, "step": 9880 }, { "epoch": 0.33317604233374903, "grad_norm": 24.272924423217773, "learning_rate": 8.433236178106047e-07, "logits/chosen": -0.8641761541366577, "logits/rejected": -0.8543815612792969, "logps/chosen": -2.308032512664795, "logps/rejected": -2.3229072093963623, "loss": 3.3651, "rewards/accuracies": 0.5, "rewards/chosen": -23.080322265625, "rewards/margins": 0.14874887466430664, "rewards/rejected": -23.22907257080078, "step": 9885 }, { "epoch": 0.3333445684047322, "grad_norm": 25.492050170898438, "learning_rate": 8.43109725291789e-07, "logits/chosen": -0.8823873400688171, "logits/rejected": -0.9587462544441223, "logps/chosen": -2.126710891723633, "logps/rejected": -2.0421059131622314, "loss": 4.2964, "rewards/accuracies": 0.5, "rewards/chosen": -21.267108917236328, "rewards/margins": -0.8460475206375122, "rewards/rejected": -20.42106056213379, "step": 9890 }, { "epoch": 0.33351309447571537, "grad_norm": 33.2651252746582, "learning_rate": 8.428957140362157e-07, "logits/chosen": -1.146289587020874, "logits/rejected": -1.347427487373352, "logps/chosen": -1.861358404159546, "logps/rejected": -1.9180434942245483, "loss": 2.7072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.613584518432617, "rewards/margins": 0.5668505430221558, "rewards/rejected": -19.180435180664062, "step": 9895 }, { "epoch": 0.3336816205466986, "grad_norm": 20.580039978027344, "learning_rate": 8.426815841179451e-07, "logits/chosen": -0.49518918991088867, "logits/rejected": -0.4970271587371826, "logps/chosen": -2.2172446250915527, "logps/rejected": -2.412372589111328, "loss": 3.1679, "rewards/accuracies": 0.5, "rewards/chosen": -22.172447204589844, "rewards/margins": 1.9512779712677002, "rewards/rejected": -24.12372589111328, "step": 9900 }, { "epoch": 0.33385014661768175, "grad_norm": 55.29900360107422, "learning_rate": 8.424673356110792e-07, "logits/chosen": -1.1283732652664185, "logits/rejected": -1.0785002708435059, "logps/chosen": -1.8419973850250244, "logps/rejected": -1.9012470245361328, "loss": 2.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.419971466064453, "rewards/margins": 0.5924980044364929, "rewards/rejected": -19.012470245361328, "step": 9905 }, { "epoch": 0.3340186726886649, "grad_norm": 22.117595672607422, "learning_rate": 8.422529685897614e-07, "logits/chosen": -0.9310518503189087, "logits/rejected": -0.8916823267936707, "logps/chosen": -2.5464320182800293, "logps/rejected": -2.720090389251709, "loss": 1.6402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.46432113647461, "rewards/margins": 1.7365844249725342, "rewards/rejected": -27.20090675354004, "step": 9910 }, { "epoch": 0.33418719875964814, "grad_norm": 21.287263870239258, "learning_rate": 8.420384831281752e-07, "logits/chosen": -1.1691551208496094, "logits/rejected": -1.24275541305542, "logps/chosen": -2.466435670852661, "logps/rejected": -2.7765564918518066, "loss": 3.6018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.664356231689453, "rewards/margins": 3.1012115478515625, "rewards/rejected": -27.765567779541016, "step": 9915 }, { "epoch": 0.3343557248306313, "grad_norm": 48.62921905517578, "learning_rate": 8.418238793005459e-07, "logits/chosen": -1.2796119451522827, "logits/rejected": -1.1867414712905884, "logps/chosen": -1.770689606666565, "logps/rejected": -1.7434495687484741, "loss": 3.3813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.706897735595703, "rewards/margins": -0.27240076661109924, "rewards/rejected": -17.43449592590332, "step": 9920 }, { "epoch": 0.3345242509016145, "grad_norm": 28.96337127685547, "learning_rate": 8.416091571811393e-07, "logits/chosen": -1.2204170227050781, "logits/rejected": -1.3906219005584717, "logps/chosen": -1.966515302658081, "logps/rejected": -1.9123204946517944, "loss": 4.1011, "rewards/accuracies": 0.5, "rewards/chosen": -19.665151596069336, "rewards/margins": -0.5419479608535767, "rewards/rejected": -19.123207092285156, "step": 9925 }, { "epoch": 0.33469277697259764, "grad_norm": 40.10865783691406, "learning_rate": 8.413943168442621e-07, "logits/chosen": -0.6399518251419067, "logits/rejected": -0.7417412996292114, "logps/chosen": -1.9978796243667603, "logps/rejected": -2.1743671894073486, "loss": 1.7883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.97879409790039, "rewards/margins": 1.7648769617080688, "rewards/rejected": -21.74367332458496, "step": 9930 }, { "epoch": 0.33486130304358086, "grad_norm": 12.934712409973145, "learning_rate": 8.411793583642625e-07, "logits/chosen": -0.8698463439941406, "logits/rejected": -1.0204404592514038, "logps/chosen": -1.9658260345458984, "logps/rejected": -2.275739908218384, "loss": 3.0921, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.658262252807617, "rewards/margins": 3.0991365909576416, "rewards/rejected": -22.757396697998047, "step": 9935 }, { "epoch": 0.335029829114564, "grad_norm": 28.698816299438477, "learning_rate": 8.409642818155287e-07, "logits/chosen": -0.9460894465446472, "logits/rejected": -0.9732720255851746, "logps/chosen": -1.5314778089523315, "logps/rejected": -1.5256506204605103, "loss": 3.1376, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.314778327941895, "rewards/margins": -0.05827188491821289, "rewards/rejected": -15.256505012512207, "step": 9940 }, { "epoch": 0.3351983551855472, "grad_norm": 24.51972198486328, "learning_rate": 8.407490872724905e-07, "logits/chosen": -1.241775393486023, "logits/rejected": -1.3540207147598267, "logps/chosen": -1.8327884674072266, "logps/rejected": -1.8153759241104126, "loss": 3.6465, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.327884674072266, "rewards/margins": -0.17412586510181427, "rewards/rejected": -18.153759002685547, "step": 9945 }, { "epoch": 0.33536688125653036, "grad_norm": 38.081138610839844, "learning_rate": 8.405337748096182e-07, "logits/chosen": -0.9224987030029297, "logits/rejected": -1.4343881607055664, "logps/chosen": -1.85904860496521, "logps/rejected": -2.256868839263916, "loss": 1.9235, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.590484619140625, "rewards/margins": 3.978203535079956, "rewards/rejected": -22.56869125366211, "step": 9950 }, { "epoch": 0.3355354073275136, "grad_norm": 12.562773704528809, "learning_rate": 8.403183445014228e-07, "logits/chosen": -0.7454361319541931, "logits/rejected": -0.9337083101272583, "logps/chosen": -2.4363226890563965, "logps/rejected": -2.676544666290283, "loss": 2.0228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.36322784423828, "rewards/margins": 2.402218818664551, "rewards/rejected": -26.765445709228516, "step": 9955 }, { "epoch": 0.33570393339849675, "grad_norm": 24.086362838745117, "learning_rate": 8.401027964224565e-07, "logits/chosen": -0.8464582562446594, "logits/rejected": -0.9272781610488892, "logps/chosen": -1.9356921911239624, "logps/rejected": -1.8562870025634766, "loss": 4.2157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.356922149658203, "rewards/margins": -0.7940529584884644, "rewards/rejected": -18.562870025634766, "step": 9960 }, { "epoch": 0.3358724594694799, "grad_norm": 26.327152252197266, "learning_rate": 8.398871306473118e-07, "logits/chosen": -0.979174017906189, "logits/rejected": -0.9731283187866211, "logps/chosen": -2.057375192642212, "logps/rejected": -2.0879650115966797, "loss": 3.4316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.573749542236328, "rewards/margins": 0.30589810013771057, "rewards/rejected": -20.879650115966797, "step": 9965 }, { "epoch": 0.33604098554046313, "grad_norm": 20.45331382751465, "learning_rate": 8.396713472506222e-07, "logits/chosen": -0.8873946070671082, "logits/rejected": -0.948246955871582, "logps/chosen": -1.953375220298767, "logps/rejected": -2.0850324630737305, "loss": 2.0067, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.53375244140625, "rewards/margins": 1.3165717124938965, "rewards/rejected": -20.850322723388672, "step": 9970 }, { "epoch": 0.3362095116114463, "grad_norm": 27.921266555786133, "learning_rate": 8.394554463070619e-07, "logits/chosen": -0.7784280776977539, "logits/rejected": -0.8730652928352356, "logps/chosen": -2.3976194858551025, "logps/rejected": -2.5944883823394775, "loss": 2.3079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.9761962890625, "rewards/margins": 1.9686893224716187, "rewards/rejected": -25.94488525390625, "step": 9975 }, { "epoch": 0.33637803768242946, "grad_norm": 12.803922653198242, "learning_rate": 8.392394278913456e-07, "logits/chosen": -0.8921037912368774, "logits/rejected": -1.1823089122772217, "logps/chosen": -2.6166586875915527, "logps/rejected": -2.4037137031555176, "loss": 6.1695, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.166589736938477, "rewards/margins": -2.1294519901275635, "rewards/rejected": -24.03713607788086, "step": 9980 }, { "epoch": 0.33654656375341263, "grad_norm": 44.06391143798828, "learning_rate": 8.390232920782287e-07, "logits/chosen": -1.0536354780197144, "logits/rejected": -1.010083556175232, "logps/chosen": -1.964163064956665, "logps/rejected": -2.0906193256378174, "loss": 2.6179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.641630172729492, "rewards/margins": 1.264561414718628, "rewards/rejected": -20.906192779541016, "step": 9985 }, { "epoch": 0.33671508982439585, "grad_norm": 39.75322723388672, "learning_rate": 8.388070389425077e-07, "logits/chosen": -1.345232605934143, "logits/rejected": -1.3481756448745728, "logps/chosen": -1.900757074356079, "logps/rejected": -1.9350669384002686, "loss": 3.1176, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.007572174072266, "rewards/margins": 0.3430987298488617, "rewards/rejected": -19.350669860839844, "step": 9990 }, { "epoch": 0.336883615895379, "grad_norm": 16.268253326416016, "learning_rate": 8.385906685590187e-07, "logits/chosen": -0.9428138732910156, "logits/rejected": -0.8706213235855103, "logps/chosen": -1.9050085544586182, "logps/rejected": -2.324378728866577, "loss": 2.3956, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.050085067749023, "rewards/margins": 4.193702220916748, "rewards/rejected": -23.243785858154297, "step": 9995 }, { "epoch": 0.3370521419663622, "grad_norm": 28.86384391784668, "learning_rate": 8.383741810026395e-07, "logits/chosen": -1.1897058486938477, "logits/rejected": -1.244680643081665, "logps/chosen": -1.8175055980682373, "logps/rejected": -1.7724707126617432, "loss": 3.772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.175058364868164, "rewards/margins": -0.45034971833229065, "rewards/rejected": -17.724706649780273, "step": 10000 }, { "epoch": 0.3370521419663622, "eval_logits/chosen": -1.2586745023727417, "eval_logits/rejected": -1.3345402479171753, "eval_logps/chosen": -1.8584271669387817, "eval_logps/rejected": -1.9249132871627808, "eval_loss": 3.0772247314453125, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.584270477294922, "eval_rewards/margins": 0.6648635268211365, "eval_rewards/rejected": -19.249135971069336, "eval_runtime": 12.9088, "eval_samples_per_second": 7.747, "eval_steps_per_second": 1.937, "step": 10000 }, { "epoch": 0.33722066803734535, "grad_norm": 37.60383605957031, "learning_rate": 8.381575763482875e-07, "logits/chosen": -0.7130342721939087, "logits/rejected": -0.5373210906982422, "logps/chosen": -2.3927738666534424, "logps/rejected": -2.6733829975128174, "loss": 2.5409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.927738189697266, "rewards/margins": 2.8060927391052246, "rewards/rejected": -26.733829498291016, "step": 10005 }, { "epoch": 0.33738919410832857, "grad_norm": 18.94394874572754, "learning_rate": 8.379408546709212e-07, "logits/chosen": -1.180837631225586, "logits/rejected": -1.2834656238555908, "logps/chosen": -1.5497967004776, "logps/rejected": -1.8647041320800781, "loss": 2.0521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.497967720031738, "rewards/margins": 3.1490752696990967, "rewards/rejected": -18.647043228149414, "step": 10010 }, { "epoch": 0.33755772017931174, "grad_norm": 27.073068618774414, "learning_rate": 8.377240160455395e-07, "logits/chosen": -0.5144739151000977, "logits/rejected": -0.6853980422019958, "logps/chosen": -2.464611053466797, "logps/rejected": -2.574533224105835, "loss": 2.9053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.64611053466797, "rewards/margins": 1.099221110343933, "rewards/rejected": -25.745330810546875, "step": 10015 }, { "epoch": 0.3377262462502949, "grad_norm": 22.88643455505371, "learning_rate": 8.375070605471815e-07, "logits/chosen": -0.9767011404037476, "logits/rejected": -0.978759765625, "logps/chosen": -1.664846658706665, "logps/rejected": -1.8161695003509521, "loss": 2.3565, "rewards/accuracies": 0.5, "rewards/chosen": -16.648466110229492, "rewards/margins": 1.5132266283035278, "rewards/rejected": -18.161693572998047, "step": 10020 }, { "epoch": 0.3378947723212781, "grad_norm": 37.77303695678711, "learning_rate": 8.372899882509273e-07, "logits/chosen": -0.8196894526481628, "logits/rejected": -0.847124457359314, "logps/chosen": -2.192286968231201, "logps/rejected": -2.450854778289795, "loss": 2.3799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.922870635986328, "rewards/margins": 2.5856778621673584, "rewards/rejected": -24.508548736572266, "step": 10025 }, { "epoch": 0.3380632983922613, "grad_norm": 75.76078033447266, "learning_rate": 8.370727992318967e-07, "logits/chosen": -1.073168396949768, "logits/rejected": -1.253846526145935, "logps/chosen": -2.0942280292510986, "logps/rejected": -2.1867103576660156, "loss": 2.9323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.942279815673828, "rewards/margins": 0.924824059009552, "rewards/rejected": -21.867103576660156, "step": 10030 }, { "epoch": 0.33823182446324446, "grad_norm": 35.56973648071289, "learning_rate": 8.368554935652503e-07, "logits/chosen": -0.6315957903862, "logits/rejected": -0.7099935412406921, "logps/chosen": -2.129772663116455, "logps/rejected": -2.2371087074279785, "loss": 2.8981, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.297725677490234, "rewards/margins": 1.0733586549758911, "rewards/rejected": -22.371084213256836, "step": 10035 }, { "epoch": 0.3384003505342276, "grad_norm": 41.625614166259766, "learning_rate": 8.366380713261894e-07, "logits/chosen": -1.1311523914337158, "logits/rejected": -1.1663687229156494, "logps/chosen": -2.106232166290283, "logps/rejected": -2.3760056495666504, "loss": 2.496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.062320709228516, "rewards/margins": 2.69773006439209, "rewards/rejected": -23.760051727294922, "step": 10040 }, { "epoch": 0.33856887660521084, "grad_norm": 56.50038146972656, "learning_rate": 8.364205325899549e-07, "logits/chosen": -1.2110131978988647, "logits/rejected": -1.2723443508148193, "logps/chosen": -2.363647937774658, "logps/rejected": -2.497826337814331, "loss": 2.4119, "rewards/accuracies": 0.5, "rewards/chosen": -23.63648223876953, "rewards/margins": 1.341783881187439, "rewards/rejected": -24.978261947631836, "step": 10045 }, { "epoch": 0.338737402676194, "grad_norm": 22.00423240661621, "learning_rate": 8.362028774318286e-07, "logits/chosen": -0.8191970586776733, "logits/rejected": -0.9766160845756531, "logps/chosen": -1.750830888748169, "logps/rejected": -1.8567674160003662, "loss": 2.2368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.50830841064453, "rewards/margins": 1.0593667030334473, "rewards/rejected": -18.567676544189453, "step": 10050 }, { "epoch": 0.3389059287471772, "grad_norm": 29.267391204833984, "learning_rate": 8.359851059271323e-07, "logits/chosen": -0.760384738445282, "logits/rejected": -0.8759638667106628, "logps/chosen": -2.0324978828430176, "logps/rejected": -2.2271060943603516, "loss": 2.3612, "rewards/accuracies": 0.5, "rewards/chosen": -20.324981689453125, "rewards/margins": 1.9460808038711548, "rewards/rejected": -22.271060943603516, "step": 10055 }, { "epoch": 0.33907445481816034, "grad_norm": 24.558929443359375, "learning_rate": 8.357672181512281e-07, "logits/chosen": -1.1694022417068481, "logits/rejected": -1.303847312927246, "logps/chosen": -1.7414251565933228, "logps/rejected": -1.8605835437774658, "loss": 2.5338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.41425132751465, "rewards/margins": 1.1915841102600098, "rewards/rejected": -18.6058349609375, "step": 10060 }, { "epoch": 0.33924298088914356, "grad_norm": 21.290231704711914, "learning_rate": 8.355492141795184e-07, "logits/chosen": -0.7563185691833496, "logits/rejected": -0.895641028881073, "logps/chosen": -1.799481749534607, "logps/rejected": -2.1289761066436768, "loss": 1.8406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.994815826416016, "rewards/margins": 3.2949440479278564, "rewards/rejected": -21.289762496948242, "step": 10065 }, { "epoch": 0.33941150696012673, "grad_norm": 19.94627571105957, "learning_rate": 8.353310940874457e-07, "logits/chosen": -0.45717424154281616, "logits/rejected": -0.6762049794197083, "logps/chosen": -1.7784864902496338, "logps/rejected": -2.0194575786590576, "loss": 2.3624, "rewards/accuracies": 0.5, "rewards/chosen": -17.784862518310547, "rewards/margins": 2.4097115993499756, "rewards/rejected": -20.194576263427734, "step": 10070 }, { "epoch": 0.3395800330311099, "grad_norm": 12.628251075744629, "learning_rate": 8.351128579504929e-07, "logits/chosen": -1.2598450183868408, "logits/rejected": -1.4129188060760498, "logps/chosen": -2.335139036178589, "logps/rejected": -2.369058609008789, "loss": 3.2621, "rewards/accuracies": 0.5, "rewards/chosen": -23.351390838623047, "rewards/margins": 0.3391936421394348, "rewards/rejected": -23.690584182739258, "step": 10075 }, { "epoch": 0.3397485591020931, "grad_norm": 25.803693771362305, "learning_rate": 8.34894505844183e-07, "logits/chosen": -0.8495704531669617, "logits/rejected": -0.9760378003120422, "logps/chosen": -1.7889350652694702, "logps/rejected": -1.8187023401260376, "loss": 3.0082, "rewards/accuracies": 0.5, "rewards/chosen": -17.88934898376465, "rewards/margins": 0.2976733148097992, "rewards/rejected": -18.187023162841797, "step": 10080 }, { "epoch": 0.3399170851730763, "grad_norm": 17.753204345703125, "learning_rate": 8.346760378440787e-07, "logits/chosen": -1.2412251234054565, "logits/rejected": -1.4337639808654785, "logps/chosen": -1.5392677783966064, "logps/rejected": -1.7446925640106201, "loss": 1.9918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.392675399780273, "rewards/margins": 2.054250955581665, "rewards/rejected": -17.44692611694336, "step": 10085 }, { "epoch": 0.34008561124405945, "grad_norm": 24.789297103881836, "learning_rate": 8.344574540257836e-07, "logits/chosen": -1.016185998916626, "logits/rejected": -0.9818147420883179, "logps/chosen": -1.6750415563583374, "logps/rejected": -1.9193319082260132, "loss": 3.0797, "rewards/accuracies": 0.5, "rewards/chosen": -16.750415802001953, "rewards/margins": 2.442903995513916, "rewards/rejected": -19.19331932067871, "step": 10090 }, { "epoch": 0.3402541373150426, "grad_norm": 5.1002421379089355, "learning_rate": 8.342387544649407e-07, "logits/chosen": -0.885150134563446, "logits/rejected": -1.1269336938858032, "logps/chosen": -1.7850160598754883, "logps/rejected": -2.032360792160034, "loss": 4.2058, "rewards/accuracies": 0.5, "rewards/chosen": -17.850162506103516, "rewards/margins": 2.4734485149383545, "rewards/rejected": -20.3236083984375, "step": 10095 }, { "epoch": 0.34042266338602584, "grad_norm": 33.8403205871582, "learning_rate": 8.340199392372334e-07, "logits/chosen": -0.7171911001205444, "logits/rejected": -0.8975852727890015, "logps/chosen": -2.01006817817688, "logps/rejected": -2.1210741996765137, "loss": 2.4229, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.100679397583008, "rewards/margins": 1.110063910484314, "rewards/rejected": -21.210742950439453, "step": 10100 }, { "epoch": 0.340591189457009, "grad_norm": 26.41214370727539, "learning_rate": 8.338010084183848e-07, "logits/chosen": -0.6185327768325806, "logits/rejected": -0.8199454545974731, "logps/chosen": -2.1546084880828857, "logps/rejected": -2.167876720428467, "loss": 3.3486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.546085357666016, "rewards/margins": 0.13268089294433594, "rewards/rejected": -21.67876625061035, "step": 10105 }, { "epoch": 0.34075971552799217, "grad_norm": 92.04045104980469, "learning_rate": 8.335819620841588e-07, "logits/chosen": -1.1275532245635986, "logits/rejected": -0.9211037755012512, "logps/chosen": -2.427985668182373, "logps/rejected": -2.0251243114471436, "loss": 7.0349, "rewards/accuracies": 0.0, "rewards/chosen": -24.279855728149414, "rewards/margins": -4.0286126136779785, "rewards/rejected": -20.251245498657227, "step": 10110 }, { "epoch": 0.34092824159897533, "grad_norm": 55.04663848876953, "learning_rate": 8.33362800310358e-07, "logits/chosen": -0.7637117505073547, "logits/rejected": -0.8645876049995422, "logps/chosen": -1.7918641567230225, "logps/rejected": -1.942285180091858, "loss": 2.2968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.918643951416016, "rewards/margins": 1.5042095184326172, "rewards/rejected": -19.4228515625, "step": 10115 }, { "epoch": 0.34109676766995856, "grad_norm": 57.4473991394043, "learning_rate": 8.331435231728261e-07, "logits/chosen": -0.8943387269973755, "logits/rejected": -1.140529751777649, "logps/chosen": -1.9503597021102905, "logps/rejected": -1.9956696033477783, "loss": 3.2865, "rewards/accuracies": 0.5, "rewards/chosen": -19.503597259521484, "rewards/margins": 0.4531001150608063, "rewards/rejected": -19.956695556640625, "step": 10120 }, { "epoch": 0.3412652937409417, "grad_norm": 30.372905731201172, "learning_rate": 8.329241307474462e-07, "logits/chosen": -0.9090366363525391, "logits/rejected": -0.9714096784591675, "logps/chosen": -1.5740294456481934, "logps/rejected": -1.6546818017959595, "loss": 2.545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.740293502807617, "rewards/margins": 0.8065251111984253, "rewards/rejected": -16.546817779541016, "step": 10125 }, { "epoch": 0.3414338198119249, "grad_norm": 35.968505859375, "learning_rate": 8.327046231101413e-07, "logits/chosen": -0.9563786387443542, "logits/rejected": -1.0514451265335083, "logps/chosen": -2.135432720184326, "logps/rejected": -2.0912060737609863, "loss": 3.5679, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.354328155517578, "rewards/margins": -0.4422665536403656, "rewards/rejected": -20.91206169128418, "step": 10130 }, { "epoch": 0.3416023458829081, "grad_norm": 27.77577018737793, "learning_rate": 8.324850003368744e-07, "logits/chosen": -1.0580577850341797, "logits/rejected": -1.0771617889404297, "logps/chosen": -2.0225207805633545, "logps/rejected": -1.999882698059082, "loss": 3.5039, "rewards/accuracies": 0.5, "rewards/chosen": -20.225208282470703, "rewards/margins": -0.22638091444969177, "rewards/rejected": -19.998825073242188, "step": 10135 }, { "epoch": 0.3417708719538913, "grad_norm": 76.16617584228516, "learning_rate": 8.322652625036482e-07, "logits/chosen": -0.9272229075431824, "logits/rejected": -0.9870445132255554, "logps/chosen": -1.9856401681900024, "logps/rejected": -2.3070406913757324, "loss": 1.934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.856403350830078, "rewards/margins": 3.2140049934387207, "rewards/rejected": -23.07040786743164, "step": 10140 }, { "epoch": 0.34193939802487444, "grad_norm": 23.884069442749023, "learning_rate": 8.320454096865054e-07, "logits/chosen": -0.785285472869873, "logits/rejected": -0.854631245136261, "logps/chosen": -1.846387505531311, "logps/rejected": -1.9219926595687866, "loss": 2.7894, "rewards/accuracies": 0.5, "rewards/chosen": -18.46387481689453, "rewards/margins": 0.7560516595840454, "rewards/rejected": -19.219926834106445, "step": 10145 }, { "epoch": 0.3421079240958576, "grad_norm": 32.95330047607422, "learning_rate": 8.318254419615283e-07, "logits/chosen": -0.9430086016654968, "logits/rejected": -1.0663788318634033, "logps/chosen": -1.7976980209350586, "logps/rejected": -2.0233747959136963, "loss": 1.4286, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.976980209350586, "rewards/margins": 2.2567665576934814, "rewards/rejected": -20.233745574951172, "step": 10150 }, { "epoch": 0.34227645016684083, "grad_norm": 25.350078582763672, "learning_rate": 8.316053594048394e-07, "logits/chosen": -0.994820237159729, "logits/rejected": -1.3681151866912842, "logps/chosen": -2.111388683319092, "logps/rejected": -2.4443917274475098, "loss": 1.6322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.1138858795166, "rewards/margins": 3.3300297260284424, "rewards/rejected": -24.44391632080078, "step": 10155 }, { "epoch": 0.342444976237824, "grad_norm": 19.32894515991211, "learning_rate": 8.313851620926e-07, "logits/chosen": -0.9013049006462097, "logits/rejected": -0.9923794865608215, "logps/chosen": -1.902090072631836, "logps/rejected": -1.9696989059448242, "loss": 2.9091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.020898818969727, "rewards/margins": 0.6760891079902649, "rewards/rejected": -19.696989059448242, "step": 10160 }, { "epoch": 0.34261350230880716, "grad_norm": 24.06064224243164, "learning_rate": 8.311648501010122e-07, "logits/chosen": -1.1217981576919556, "logits/rejected": -1.0748631954193115, "logps/chosen": -2.2157540321350098, "logps/rejected": -2.2380218505859375, "loss": 3.0986, "rewards/accuracies": 0.5, "rewards/chosen": -22.15753936767578, "rewards/margins": 0.22267866134643555, "rewards/rejected": -22.380218505859375, "step": 10165 }, { "epoch": 0.3427820283797903, "grad_norm": 26.289478302001953, "learning_rate": 8.309444235063172e-07, "logits/chosen": -0.9130669832229614, "logits/rejected": -0.7826474905014038, "logps/chosen": -1.8189846277236938, "logps/rejected": -1.9442589282989502, "loss": 2.3744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.18984603881836, "rewards/margins": 1.2527434825897217, "rewards/rejected": -19.442588806152344, "step": 10170 }, { "epoch": 0.34295055445077355, "grad_norm": 28.468158721923828, "learning_rate": 8.307238823847959e-07, "logits/chosen": -0.6284016370773315, "logits/rejected": -0.5836378931999207, "logps/chosen": -1.7535285949707031, "logps/rejected": -1.9830653667449951, "loss": 1.7572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.53528594970703, "rewards/margins": 2.295367956161499, "rewards/rejected": -19.83065414428711, "step": 10175 }, { "epoch": 0.3431190805217567, "grad_norm": 14.106034278869629, "learning_rate": 8.30503226812769e-07, "logits/chosen": -0.8923094868659973, "logits/rejected": -0.9873468279838562, "logps/chosen": -2.0170390605926514, "logps/rejected": -2.203902006149292, "loss": 2.0167, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.17038917541504, "rewards/margins": 1.8686323165893555, "rewards/rejected": -22.03902244567871, "step": 10180 }, { "epoch": 0.3432876065927399, "grad_norm": 32.9178352355957, "learning_rate": 8.302824568665965e-07, "logits/chosen": -1.0593280792236328, "logits/rejected": -1.00070321559906, "logps/chosen": -2.128171920776367, "logps/rejected": -1.895475149154663, "loss": 5.4176, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.281719207763672, "rewards/margins": -2.326968193054199, "rewards/rejected": -18.95475196838379, "step": 10185 }, { "epoch": 0.3434561326637231, "grad_norm": 26.191192626953125, "learning_rate": 8.300615726226783e-07, "logits/chosen": -1.1325823068618774, "logits/rejected": -0.9920172691345215, "logps/chosen": -1.9885280132293701, "logps/rejected": -2.0667898654937744, "loss": 3.2633, "rewards/accuracies": 0.5, "rewards/chosen": -19.88528060913086, "rewards/margins": 0.7826164960861206, "rewards/rejected": -20.667896270751953, "step": 10190 }, { "epoch": 0.34362465873470627, "grad_norm": 18.76409149169922, "learning_rate": 8.298405741574537e-07, "logits/chosen": -1.216739296913147, "logits/rejected": -1.285988211631775, "logps/chosen": -1.7787593603134155, "logps/rejected": -1.7824738025665283, "loss": 3.2125, "rewards/accuracies": 0.5, "rewards/chosen": -17.787593841552734, "rewards/margins": 0.037142276763916016, "rewards/rejected": -17.824735641479492, "step": 10195 }, { "epoch": 0.34379318480568943, "grad_norm": 37.59312057495117, "learning_rate": 8.296194615474014e-07, "logits/chosen": -1.0172761678695679, "logits/rejected": -1.1041208505630493, "logps/chosen": -2.025296926498413, "logps/rejected": -2.2218832969665527, "loss": 2.0428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.252967834472656, "rewards/margins": 1.9658634662628174, "rewards/rejected": -22.218830108642578, "step": 10200 }, { "epoch": 0.3439617108766726, "grad_norm": 26.07461929321289, "learning_rate": 8.293982348690402e-07, "logits/chosen": -1.3663597106933594, "logits/rejected": -1.3732165098190308, "logps/chosen": -1.9943389892578125, "logps/rejected": -1.8807373046875, "loss": 4.3672, "rewards/accuracies": 0.5, "rewards/chosen": -19.943389892578125, "rewards/margins": -1.1360156536102295, "rewards/rejected": -18.807373046875, "step": 10205 }, { "epoch": 0.3441302369476558, "grad_norm": 16.750768661499023, "learning_rate": 8.291768941989277e-07, "logits/chosen": -1.353293776512146, "logits/rejected": -1.2948862314224243, "logps/chosen": -1.8148410320281982, "logps/rejected": -1.8855838775634766, "loss": 2.4624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.14841079711914, "rewards/margins": 0.7074286341667175, "rewards/rejected": -18.855838775634766, "step": 10210 }, { "epoch": 0.344298763018639, "grad_norm": 50.30863571166992, "learning_rate": 8.289554396136611e-07, "logits/chosen": -0.7885207533836365, "logits/rejected": -0.789328932762146, "logps/chosen": -2.4931399822235107, "logps/rejected": -2.1422553062438965, "loss": 6.6146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.9314022064209, "rewards/margins": -3.508845567703247, "rewards/rejected": -21.422555923461914, "step": 10215 }, { "epoch": 0.34446728908962215, "grad_norm": 38.200897216796875, "learning_rate": 8.287338711898771e-07, "logits/chosen": -0.6832276582717896, "logits/rejected": -0.6595412492752075, "logps/chosen": -2.279621124267578, "logps/rejected": -2.7458064556121826, "loss": 3.0489, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.79621124267578, "rewards/margins": 4.661856651306152, "rewards/rejected": -27.45806884765625, "step": 10220 }, { "epoch": 0.3446358151606053, "grad_norm": 24.22657585144043, "learning_rate": 8.28512189004252e-07, "logits/chosen": -0.6188753843307495, "logits/rejected": -0.6037112474441528, "logps/chosen": -1.9316953420639038, "logps/rejected": -2.0164954662323, "loss": 2.4917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.316951751708984, "rewards/margins": 0.8480027318000793, "rewards/rejected": -20.16495704650879, "step": 10225 }, { "epoch": 0.34480434123158854, "grad_norm": 46.43810272216797, "learning_rate": 8.28290393133501e-07, "logits/chosen": -0.9024465680122375, "logits/rejected": -1.0171552896499634, "logps/chosen": -2.0227932929992676, "logps/rejected": -2.0649027824401855, "loss": 3.14, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.22793197631836, "rewards/margins": 0.4210955500602722, "rewards/rejected": -20.649028778076172, "step": 10230 }, { "epoch": 0.3449728673025717, "grad_norm": 22.429914474487305, "learning_rate": 8.280684836543793e-07, "logits/chosen": -1.074040412902832, "logits/rejected": -1.2552762031555176, "logps/chosen": -1.6524499654769897, "logps/rejected": -1.741180419921875, "loss": 2.6131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.524497985839844, "rewards/margins": 0.8873060941696167, "rewards/rejected": -17.41180419921875, "step": 10235 }, { "epoch": 0.3451413933735549, "grad_norm": 75.95999908447266, "learning_rate": 8.278464606436807e-07, "logits/chosen": -0.4927440285682678, "logits/rejected": -0.4743828773498535, "logps/chosen": -2.1160855293273926, "logps/rejected": -2.2202095985412598, "loss": 2.5375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.16085433959961, "rewards/margins": 1.0412417650222778, "rewards/rejected": -22.20209503173828, "step": 10240 }, { "epoch": 0.3453099194445381, "grad_norm": 30.984689712524414, "learning_rate": 8.276243241782386e-07, "logits/chosen": -0.9812358617782593, "logits/rejected": -1.0138499736785889, "logps/chosen": -1.5530610084533691, "logps/rejected": -1.5287425518035889, "loss": 3.5052, "rewards/accuracies": 0.5, "rewards/chosen": -15.530611038208008, "rewards/margins": -0.24318504333496094, "rewards/rejected": -15.28742504119873, "step": 10245 }, { "epoch": 0.34547844551552126, "grad_norm": 55.070709228515625, "learning_rate": 8.27402074334926e-07, "logits/chosen": -0.8449915051460266, "logits/rejected": -0.9934304356575012, "logps/chosen": -1.7992223501205444, "logps/rejected": -1.8660894632339478, "loss": 2.9444, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.992223739624023, "rewards/margins": 0.6686684489250183, "rewards/rejected": -18.660892486572266, "step": 10250 }, { "epoch": 0.3456469715865044, "grad_norm": 34.87515640258789, "learning_rate": 8.271797111906542e-07, "logits/chosen": -0.8565098643302917, "logits/rejected": -0.9601860046386719, "logps/chosen": -2.121169328689575, "logps/rejected": -2.3623504638671875, "loss": 1.9052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.21169090270996, "rewards/margins": 2.411813974380493, "rewards/rejected": -23.62350845336914, "step": 10255 }, { "epoch": 0.3458154976574876, "grad_norm": 24.82640838623047, "learning_rate": 8.26957234822375e-07, "logits/chosen": -0.8380298614501953, "logits/rejected": -1.002062439918518, "logps/chosen": -1.8750407695770264, "logps/rejected": -2.041203022003174, "loss": 1.8186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.750408172607422, "rewards/margins": 1.6616241931915283, "rewards/rejected": -20.412031173706055, "step": 10260 }, { "epoch": 0.3459840237284708, "grad_norm": 25.13413429260254, "learning_rate": 8.267346453070785e-07, "logits/chosen": -1.0303947925567627, "logits/rejected": -0.9507439732551575, "logps/chosen": -1.8449833393096924, "logps/rejected": -1.8964245319366455, "loss": 3.5502, "rewards/accuracies": 0.5, "rewards/chosen": -18.449832916259766, "rewards/margins": 0.5144118070602417, "rewards/rejected": -18.964242935180664, "step": 10265 }, { "epoch": 0.346152549799454, "grad_norm": 33.180355072021484, "learning_rate": 8.265119427217939e-07, "logits/chosen": -1.1469266414642334, "logits/rejected": -1.0809723138809204, "logps/chosen": -1.9312407970428467, "logps/rejected": -1.9858585596084595, "loss": 3.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.312410354614258, "rewards/margins": 0.5461755990982056, "rewards/rejected": -19.858585357666016, "step": 10270 }, { "epoch": 0.34632107587043715, "grad_norm": 27.01608657836914, "learning_rate": 8.262891271435901e-07, "logits/chosen": -0.7008494138717651, "logits/rejected": -0.694682240486145, "logps/chosen": -1.4841798543930054, "logps/rejected": -1.623469591140747, "loss": 2.1888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.841798782348633, "rewards/margins": 1.3928959369659424, "rewards/rejected": -16.234695434570312, "step": 10275 }, { "epoch": 0.3464896019414203, "grad_norm": 27.898277282714844, "learning_rate": 8.260661986495748e-07, "logits/chosen": -0.9827602505683899, "logits/rejected": -1.0116498470306396, "logps/chosen": -1.5244865417480469, "logps/rejected": -1.416467308998108, "loss": 4.1803, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.244865417480469, "rewards/margins": -1.0801918506622314, "rewards/rejected": -14.1646728515625, "step": 10280 }, { "epoch": 0.34665812801240353, "grad_norm": 16.065555572509766, "learning_rate": 8.258431573168944e-07, "logits/chosen": -0.8114410638809204, "logits/rejected": -0.6855028867721558, "logps/chosen": -2.205500841140747, "logps/rejected": -2.2870655059814453, "loss": 3.2703, "rewards/accuracies": 0.5, "rewards/chosen": -22.055007934570312, "rewards/margins": 0.8156498670578003, "rewards/rejected": -22.870656967163086, "step": 10285 }, { "epoch": 0.3468266540833867, "grad_norm": 29.678882598876953, "learning_rate": 8.25620003222735e-07, "logits/chosen": -1.0882136821746826, "logits/rejected": -1.0546444654464722, "logps/chosen": -1.7505064010620117, "logps/rejected": -1.8048797845840454, "loss": 2.6905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.505064010620117, "rewards/margins": 0.5437334179878235, "rewards/rejected": -18.048799514770508, "step": 10290 }, { "epoch": 0.34699518015436986, "grad_norm": 17.689411163330078, "learning_rate": 8.253967364443214e-07, "logits/chosen": -0.49934762716293335, "logits/rejected": -0.5486842393875122, "logps/chosen": -2.119231939315796, "logps/rejected": -2.325207233428955, "loss": 2.5515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.192317962646484, "rewards/margins": 2.0597527027130127, "rewards/rejected": -23.252071380615234, "step": 10295 }, { "epoch": 0.3471637062253531, "grad_norm": 30.368303298950195, "learning_rate": 8.251733570589176e-07, "logits/chosen": -0.9879854321479797, "logits/rejected": -1.0501902103424072, "logps/chosen": -2.1748950481414795, "logps/rejected": -2.3603451251983643, "loss": 2.5822, "rewards/accuracies": 0.5, "rewards/chosen": -21.748950958251953, "rewards/margins": 1.8544994592666626, "rewards/rejected": -23.603452682495117, "step": 10300 }, { "epoch": 0.34733223229633625, "grad_norm": 62.947410583496094, "learning_rate": 8.249498651438261e-07, "logits/chosen": -1.065384864807129, "logits/rejected": -1.0049973726272583, "logps/chosen": -2.1836864948272705, "logps/rejected": -2.3289408683776855, "loss": 2.5892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.836864471435547, "rewards/margins": 1.452542781829834, "rewards/rejected": -23.289409637451172, "step": 10305 }, { "epoch": 0.3475007583673194, "grad_norm": 17.733478546142578, "learning_rate": 8.247262607763887e-07, "logits/chosen": -1.1448280811309814, "logits/rejected": -1.433410882949829, "logps/chosen": -1.7068363428115845, "logps/rejected": -1.804264783859253, "loss": 2.2979, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.068363189697266, "rewards/margins": 0.9742859601974487, "rewards/rejected": -18.042648315429688, "step": 10310 }, { "epoch": 0.3476692844383026, "grad_norm": 24.782636642456055, "learning_rate": 8.245025440339864e-07, "logits/chosen": -0.9603859186172485, "logits/rejected": -0.9498102068901062, "logps/chosen": -2.2697396278381348, "logps/rejected": -2.155381441116333, "loss": 4.5072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.69739532470703, "rewards/margins": -1.1435810327529907, "rewards/rejected": -21.553813934326172, "step": 10315 }, { "epoch": 0.3478378105092858, "grad_norm": 27.691980361938477, "learning_rate": 8.242787149940382e-07, "logits/chosen": -1.1349749565124512, "logits/rejected": -1.1333445310592651, "logps/chosen": -2.019230365753174, "logps/rejected": -1.935373067855835, "loss": 3.9671, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.192302703857422, "rewards/margins": -0.8385698199272156, "rewards/rejected": -19.35373306274414, "step": 10320 }, { "epoch": 0.34800633658026897, "grad_norm": 15.423532485961914, "learning_rate": 8.24054773734003e-07, "logits/chosen": -0.9372714757919312, "logits/rejected": -0.927697479724884, "logps/chosen": -2.0240514278411865, "logps/rejected": -2.0457513332366943, "loss": 3.1761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.240514755249023, "rewards/margins": 0.21699972450733185, "rewards/rejected": -20.4575138092041, "step": 10325 }, { "epoch": 0.34817486265125214, "grad_norm": 23.208126068115234, "learning_rate": 8.238307203313779e-07, "logits/chosen": -1.2076337337493896, "logits/rejected": -1.4667056798934937, "logps/chosen": -1.6836729049682617, "logps/rejected": -1.6917756795883179, "loss": 3.1361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.836727142333984, "rewards/margins": 0.08102655410766602, "rewards/rejected": -16.91775894165039, "step": 10330 }, { "epoch": 0.3483433887222353, "grad_norm": 40.44734191894531, "learning_rate": 8.236065548636987e-07, "logits/chosen": -0.9224785566329956, "logits/rejected": -1.0374819040298462, "logps/chosen": -2.0995991230010986, "logps/rejected": -2.18255877494812, "loss": 2.4162, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.995990753173828, "rewards/margins": 0.8295975923538208, "rewards/rejected": -21.82558822631836, "step": 10335 }, { "epoch": 0.3485119147932185, "grad_norm": 45.3128662109375, "learning_rate": 8.233822774085406e-07, "logits/chosen": -1.0633924007415771, "logits/rejected": -1.2170779705047607, "logps/chosen": -1.716073751449585, "logps/rejected": -1.9018266201019287, "loss": 2.2667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.16073989868164, "rewards/margins": 1.8575260639190674, "rewards/rejected": -19.018264770507812, "step": 10340 }, { "epoch": 0.3486804408642017, "grad_norm": 25.652551651000977, "learning_rate": 8.231578880435172e-07, "logits/chosen": -0.7540196180343628, "logits/rejected": -1.001956582069397, "logps/chosen": -1.909753441810608, "logps/rejected": -1.8614799976348877, "loss": 3.7268, "rewards/accuracies": 0.5, "rewards/chosen": -19.0975341796875, "rewards/margins": -0.4827335476875305, "rewards/rejected": -18.61480140686035, "step": 10345 }, { "epoch": 0.34884896693518486, "grad_norm": 19.225238800048828, "learning_rate": 8.229333868462804e-07, "logits/chosen": -0.4429520070552826, "logits/rejected": -0.4661738872528076, "logps/chosen": -1.8548179864883423, "logps/rejected": -2.0106232166290283, "loss": 2.276, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.548179626464844, "rewards/margins": 1.5580523014068604, "rewards/rejected": -20.106231689453125, "step": 10350 }, { "epoch": 0.3490174930061681, "grad_norm": 40.73187255859375, "learning_rate": 8.227087738945216e-07, "logits/chosen": -0.9383414387702942, "logits/rejected": -1.0112953186035156, "logps/chosen": -2.1667134761810303, "logps/rejected": -2.1280548572540283, "loss": 3.9253, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.66713523864746, "rewards/margins": -0.3865896165370941, "rewards/rejected": -21.280548095703125, "step": 10355 }, { "epoch": 0.34918601907715124, "grad_norm": 25.78534698486328, "learning_rate": 8.224840492659704e-07, "logits/chosen": -1.1125036478042603, "logits/rejected": -0.9585012197494507, "logps/chosen": -2.114788770675659, "logps/rejected": -2.162379503250122, "loss": 3.0587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.14788818359375, "rewards/margins": 0.4759071469306946, "rewards/rejected": -21.623794555664062, "step": 10360 }, { "epoch": 0.3493545451481344, "grad_norm": 33.07319259643555, "learning_rate": 8.22259213038395e-07, "logits/chosen": -1.2760889530181885, "logits/rejected": -1.3612782955169678, "logps/chosen": -1.783482551574707, "logps/rejected": -1.891667366027832, "loss": 2.4565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.834823608398438, "rewards/margins": 1.0818490982055664, "rewards/rejected": -18.91667366027832, "step": 10365 }, { "epoch": 0.3495230712191176, "grad_norm": 60.872222900390625, "learning_rate": 8.220342652896026e-07, "logits/chosen": -1.4350082874298096, "logits/rejected": -1.29402756690979, "logps/chosen": -2.09328031539917, "logps/rejected": -2.333141803741455, "loss": 2.3236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.932804107666016, "rewards/margins": 2.398613691329956, "rewards/rejected": -23.331417083740234, "step": 10370 }, { "epoch": 0.3496915972901008, "grad_norm": 27.59153175354004, "learning_rate": 8.218092060974385e-07, "logits/chosen": -1.290093183517456, "logits/rejected": -0.9906972050666809, "logps/chosen": -2.023043394088745, "logps/rejected": -2.148740768432617, "loss": 2.9719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.23043441772461, "rewards/margins": 1.256973385810852, "rewards/rejected": -21.487407684326172, "step": 10375 }, { "epoch": 0.34986012336108396, "grad_norm": 34.23496627807617, "learning_rate": 8.215840355397871e-07, "logits/chosen": -0.91291743516922, "logits/rejected": -1.5867294073104858, "logps/chosen": -1.6097930669784546, "logps/rejected": -2.0119924545288086, "loss": 2.237, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.097929000854492, "rewards/margins": 4.021994113922119, "rewards/rejected": -20.119922637939453, "step": 10380 }, { "epoch": 0.35002864943206713, "grad_norm": 23.3467960357666, "learning_rate": 8.213587536945708e-07, "logits/chosen": -0.9239813685417175, "logits/rejected": -0.8876129984855652, "logps/chosen": -1.8807910680770874, "logps/rejected": -2.263803243637085, "loss": 2.5773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.80790901184082, "rewards/margins": 3.8301239013671875, "rewards/rejected": -22.638032913208008, "step": 10385 }, { "epoch": 0.3501971755030503, "grad_norm": 21.498138427734375, "learning_rate": 8.211333606397508e-07, "logits/chosen": -1.2303471565246582, "logits/rejected": -1.2403428554534912, "logps/chosen": -1.7818397283554077, "logps/rejected": -1.784547209739685, "loss": 3.0743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.81839942932129, "rewards/margins": 0.027074432000517845, "rewards/rejected": -17.845470428466797, "step": 10390 }, { "epoch": 0.3503657015740335, "grad_norm": 33.57012939453125, "learning_rate": 8.209078564533269e-07, "logits/chosen": -0.9230928421020508, "logits/rejected": -1.0170118808746338, "logps/chosen": -1.7714271545410156, "logps/rejected": -1.8767017126083374, "loss": 2.7308, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.71427345275879, "rewards/margins": 1.0527466535568237, "rewards/rejected": -18.767019271850586, "step": 10395 }, { "epoch": 0.3505342276450167, "grad_norm": 25.153078079223633, "learning_rate": 8.206822412133372e-07, "logits/chosen": -0.9156519174575806, "logits/rejected": -1.0805760622024536, "logps/chosen": -1.826174020767212, "logps/rejected": -1.9233871698379517, "loss": 2.7414, "rewards/accuracies": 0.5, "rewards/chosen": -18.261737823486328, "rewards/margins": 0.9721338152885437, "rewards/rejected": -19.233871459960938, "step": 10400 }, { "epoch": 0.3505342276450167, "eval_logits/chosen": -1.300429105758667, "eval_logits/rejected": -1.3824095726013184, "eval_logps/chosen": -1.8830461502075195, "eval_logps/rejected": -1.9594608545303345, "eval_loss": 3.055062770843506, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -18.830459594726562, "eval_rewards/margins": 0.7641494870185852, "eval_rewards/rejected": -19.594608306884766, "eval_runtime": 12.9087, "eval_samples_per_second": 7.747, "eval_steps_per_second": 1.937, "step": 10400 }, { "epoch": 0.35070275371599985, "grad_norm": 25.646238327026367, "learning_rate": 8.204565149978582e-07, "logits/chosen": -0.9649211168289185, "logits/rejected": -1.0713859796524048, "logps/chosen": -2.2169041633605957, "logps/rejected": -2.3779962062835693, "loss": 3.2058, "rewards/accuracies": 0.5, "rewards/chosen": -22.16904067993164, "rewards/margins": 1.61092209815979, "rewards/rejected": -23.779964447021484, "step": 10405 }, { "epoch": 0.35087127978698307, "grad_norm": 26.233304977416992, "learning_rate": 8.202306778850048e-07, "logits/chosen": -0.3807659447193146, "logits/rejected": -0.40060725808143616, "logps/chosen": -2.0837881565093994, "logps/rejected": -2.195038080215454, "loss": 3.0738, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.837881088256836, "rewards/margins": 1.1124993562698364, "rewards/rejected": -21.950382232666016, "step": 10410 }, { "epoch": 0.35103980585796624, "grad_norm": 54.93354034423828, "learning_rate": 8.200047299529305e-07, "logits/chosen": -0.7190951108932495, "logits/rejected": -0.6860246658325195, "logps/chosen": -2.145395278930664, "logps/rejected": -2.186631679534912, "loss": 3.2817, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.45395278930664, "rewards/margins": 0.41236335039138794, "rewards/rejected": -21.866313934326172, "step": 10415 }, { "epoch": 0.3512083319289494, "grad_norm": 23.91092872619629, "learning_rate": 8.197786712798265e-07, "logits/chosen": -0.7775014638900757, "logits/rejected": -0.8118532299995422, "logps/chosen": -1.7293260097503662, "logps/rejected": -1.9149547815322876, "loss": 2.2801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.293262481689453, "rewards/margins": 1.8562860488891602, "rewards/rejected": -19.149547576904297, "step": 10420 }, { "epoch": 0.35137685799993257, "grad_norm": 16.439292907714844, "learning_rate": 8.195525019439236e-07, "logits/chosen": -1.0097054243087769, "logits/rejected": -1.0496337413787842, "logps/chosen": -1.8205091953277588, "logps/rejected": -1.7729756832122803, "loss": 3.6208, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.205089569091797, "rewards/margins": -0.4753352999687195, "rewards/rejected": -17.729755401611328, "step": 10425 }, { "epoch": 0.3515453840709158, "grad_norm": 98.25023651123047, "learning_rate": 8.193262220234894e-07, "logits/chosen": -1.0923794507980347, "logits/rejected": -0.9958206415176392, "logps/chosen": -2.3235385417938232, "logps/rejected": -2.1468660831451416, "loss": 4.7969, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -23.23538589477539, "rewards/margins": -1.7667248249053955, "rewards/rejected": -21.46866226196289, "step": 10430 }, { "epoch": 0.35171391014189896, "grad_norm": 16.97013282775879, "learning_rate": 8.190998315968306e-07, "logits/chosen": -1.2971888780593872, "logits/rejected": -1.257934808731079, "logps/chosen": -1.5942026376724243, "logps/rejected": -1.7968189716339111, "loss": 1.7453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.942026138305664, "rewards/margins": 2.026162624359131, "rewards/rejected": -17.968189239501953, "step": 10435 }, { "epoch": 0.3518824362128821, "grad_norm": 31.861385345458984, "learning_rate": 8.188733307422923e-07, "logits/chosen": -1.2741501331329346, "logits/rejected": -1.0171701908111572, "logps/chosen": -2.095390796661377, "logps/rejected": -2.1119773387908936, "loss": 3.6509, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.953907012939453, "rewards/margins": 0.16586685180664062, "rewards/rejected": -21.11977195739746, "step": 10440 }, { "epoch": 0.3520509622838653, "grad_norm": 46.652584075927734, "learning_rate": 8.186467195382572e-07, "logits/chosen": -1.0171384811401367, "logits/rejected": -1.2291090488433838, "logps/chosen": -1.9784443378448486, "logps/rejected": -2.19016695022583, "loss": 2.1259, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.784442901611328, "rewards/margins": 2.1172266006469727, "rewards/rejected": -21.901668548583984, "step": 10445 }, { "epoch": 0.3522194883548485, "grad_norm": 24.156171798706055, "learning_rate": 8.184199980631467e-07, "logits/chosen": -0.7162508964538574, "logits/rejected": -0.8358560800552368, "logps/chosen": -2.1988096237182617, "logps/rejected": -2.6243538856506348, "loss": 1.7779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.988094329833984, "rewards/margins": 4.255441188812256, "rewards/rejected": -26.2435359954834, "step": 10450 }, { "epoch": 0.3523880144258317, "grad_norm": 32.38210678100586, "learning_rate": 8.181931663954201e-07, "logits/chosen": -0.7578636407852173, "logits/rejected": -0.8377211689949036, "logps/chosen": -1.8838396072387695, "logps/rejected": -2.02470064163208, "loss": 1.9536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.838396072387695, "rewards/margins": 1.4086089134216309, "rewards/rejected": -20.24700355529785, "step": 10455 }, { "epoch": 0.35255654049681484, "grad_norm": 31.619720458984375, "learning_rate": 8.17966224613575e-07, "logits/chosen": -0.8775178790092468, "logits/rejected": -1.0011556148529053, "logps/chosen": -1.9142124652862549, "logps/rejected": -2.068910598754883, "loss": 2.953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.142126083374023, "rewards/margins": 1.5469805002212524, "rewards/rejected": -20.689105987548828, "step": 10460 }, { "epoch": 0.35272506656779806, "grad_norm": 13.446866989135742, "learning_rate": 8.177391727961469e-07, "logits/chosen": -0.8765958547592163, "logits/rejected": -1.1049778461456299, "logps/chosen": -2.0977909564971924, "logps/rejected": -2.3150532245635986, "loss": 2.0321, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.977909088134766, "rewards/margins": 2.172624349594116, "rewards/rejected": -23.150531768798828, "step": 10465 }, { "epoch": 0.35289359263878123, "grad_norm": 21.661466598510742, "learning_rate": 8.175120110217095e-07, "logits/chosen": -0.9677863121032715, "logits/rejected": -1.0532230138778687, "logps/chosen": -2.0553691387176514, "logps/rejected": -2.2560195922851562, "loss": 2.1423, "rewards/accuracies": 0.5, "rewards/chosen": -20.553691864013672, "rewards/margins": 2.0065035820007324, "rewards/rejected": -22.560195922851562, "step": 10470 }, { "epoch": 0.3530621187097644, "grad_norm": 10.768431663513184, "learning_rate": 8.172847393688747e-07, "logits/chosen": -0.6174628734588623, "logits/rejected": -0.8380719423294067, "logps/chosen": -1.5090444087982178, "logps/rejected": -1.721604347229004, "loss": 3.0297, "rewards/accuracies": 0.5, "rewards/chosen": -15.090444564819336, "rewards/margins": 2.1255993843078613, "rewards/rejected": -17.216045379638672, "step": 10475 }, { "epoch": 0.35323064478074756, "grad_norm": 27.460527420043945, "learning_rate": 8.170573579162918e-07, "logits/chosen": -0.5143523812294006, "logits/rejected": -0.6044338941574097, "logps/chosen": -1.9238542318344116, "logps/rejected": -2.391577959060669, "loss": 1.76, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.238540649414062, "rewards/margins": 4.6772356033325195, "rewards/rejected": -23.9157772064209, "step": 10480 }, { "epoch": 0.3533991708517308, "grad_norm": 17.713415145874023, "learning_rate": 8.168298667426492e-07, "logits/chosen": -0.8852537870407104, "logits/rejected": -0.9979242086410522, "logps/chosen": -2.6422157287597656, "logps/rejected": -2.943582534790039, "loss": 2.0315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.42215919494629, "rewards/margins": 3.0136685371398926, "rewards/rejected": -29.435827255249023, "step": 10485 }, { "epoch": 0.35356769692271395, "grad_norm": 39.909637451171875, "learning_rate": 8.166022659266722e-07, "logits/chosen": -1.2096792459487915, "logits/rejected": -1.0006765127182007, "logps/chosen": -1.7474887371063232, "logps/rejected": -1.59381103515625, "loss": 5.1208, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.47488784790039, "rewards/margins": -1.5367769002914429, "rewards/rejected": -15.9381103515625, "step": 10490 }, { "epoch": 0.3537362229936971, "grad_norm": 13.468647956848145, "learning_rate": 8.163745555471246e-07, "logits/chosen": -1.0280694961547852, "logits/rejected": -1.1094509363174438, "logps/chosen": -1.7760365009307861, "logps/rejected": -2.0079100131988525, "loss": 1.5435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.760364532470703, "rewards/margins": 2.3187355995178223, "rewards/rejected": -20.079099655151367, "step": 10495 }, { "epoch": 0.3539047490646803, "grad_norm": 33.24937438964844, "learning_rate": 8.161467356828079e-07, "logits/chosen": -0.4880562424659729, "logits/rejected": -0.7934033870697021, "logps/chosen": -2.284480571746826, "logps/rejected": -2.696446180343628, "loss": 1.5306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.844804763793945, "rewards/margins": 4.119656562805176, "rewards/rejected": -26.964462280273438, "step": 10500 }, { "epoch": 0.3540732751356635, "grad_norm": 27.55710792541504, "learning_rate": 8.159188064125617e-07, "logits/chosen": -0.588653028011322, "logits/rejected": -0.5736607313156128, "logps/chosen": -1.9094724655151367, "logps/rejected": -1.8087133169174194, "loss": 4.103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.094724655151367, "rewards/margins": -1.007592797279358, "rewards/rejected": -18.087133407592773, "step": 10505 }, { "epoch": 0.35424180120664667, "grad_norm": 31.66336441040039, "learning_rate": 8.156907678152633e-07, "logits/chosen": -0.8797151446342468, "logits/rejected": -1.1634795665740967, "logps/chosen": -1.987494707107544, "logps/rejected": -2.1331000328063965, "loss": 2.3538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.87494468688965, "rewards/margins": 1.4560561180114746, "rewards/rejected": -21.33099937438965, "step": 10510 }, { "epoch": 0.35441032727762983, "grad_norm": 24.7913875579834, "learning_rate": 8.15462619969828e-07, "logits/chosen": -1.3399163484573364, "logits/rejected": -1.513481616973877, "logps/chosen": -1.6611442565917969, "logps/rejected": -1.8118454217910767, "loss": 2.4241, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.61144256591797, "rewards/margins": 1.507012128829956, "rewards/rejected": -18.11845588684082, "step": 10515 }, { "epoch": 0.35457885334861305, "grad_norm": 28.988832473754883, "learning_rate": 8.152343629552086e-07, "logits/chosen": -1.3251217603683472, "logits/rejected": -1.4001457691192627, "logps/chosen": -2.0002996921539307, "logps/rejected": -2.1876602172851562, "loss": 3.1388, "rewards/accuracies": 0.5, "rewards/chosen": -20.002994537353516, "rewards/margins": 1.873605728149414, "rewards/rejected": -21.876602172851562, "step": 10520 }, { "epoch": 0.3547473794195962, "grad_norm": 23.213520050048828, "learning_rate": 8.15005996850396e-07, "logits/chosen": -0.7930396795272827, "logits/rejected": -0.8237592577934265, "logps/chosen": -2.3455262184143066, "logps/rejected": -2.8883988857269287, "loss": 2.1257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.455265045166016, "rewards/margins": 5.428727149963379, "rewards/rejected": -28.883987426757812, "step": 10525 }, { "epoch": 0.3549159054905794, "grad_norm": 18.53113555908203, "learning_rate": 8.147775217344183e-07, "logits/chosen": -0.570213258266449, "logits/rejected": -0.6492749452590942, "logps/chosen": -1.9311374425888062, "logps/rejected": -1.984156847000122, "loss": 2.9684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.31137466430664, "rewards/margins": 0.5301931500434875, "rewards/rejected": -19.841569900512695, "step": 10530 }, { "epoch": 0.35508443156156255, "grad_norm": 18.7517147064209, "learning_rate": 8.145489376863424e-07, "logits/chosen": -1.0552548170089722, "logits/rejected": -1.1530416011810303, "logps/chosen": -2.1529173851013184, "logps/rejected": -2.5861310958862305, "loss": 1.6314, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.529176712036133, "rewards/margins": 4.332135200500488, "rewards/rejected": -25.861309051513672, "step": 10535 }, { "epoch": 0.3552529576325458, "grad_norm": 19.308006286621094, "learning_rate": 8.143202447852718e-07, "logits/chosen": -0.7028933763504028, "logits/rejected": -0.7312060594558716, "logps/chosen": -1.9514005184173584, "logps/rejected": -1.9244524240493774, "loss": 3.4887, "rewards/accuracies": 0.5, "rewards/chosen": -19.514005661010742, "rewards/margins": -0.26948195695877075, "rewards/rejected": -19.244525909423828, "step": 10540 }, { "epoch": 0.35542148370352894, "grad_norm": 18.863069534301758, "learning_rate": 8.140914431103482e-07, "logits/chosen": -1.2888405323028564, "logits/rejected": -1.218519926071167, "logps/chosen": -1.8028770685195923, "logps/rejected": -1.8393287658691406, "loss": 2.804, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.02876853942871, "rewards/margins": 0.36451882123947144, "rewards/rejected": -18.39328956604004, "step": 10545 }, { "epoch": 0.3555900097745121, "grad_norm": 95.41343688964844, "learning_rate": 8.138625327407509e-07, "logits/chosen": -1.0824968814849854, "logits/rejected": -1.0274932384490967, "logps/chosen": -2.0919995307922363, "logps/rejected": -2.3005242347717285, "loss": 2.4623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.919994354248047, "rewards/margins": 2.085249185562134, "rewards/rejected": -23.005245208740234, "step": 10550 }, { "epoch": 0.35575853584549527, "grad_norm": 39.427024841308594, "learning_rate": 8.136335137556967e-07, "logits/chosen": -1.059185266494751, "logits/rejected": -1.1593902111053467, "logps/chosen": -2.3896114826202393, "logps/rejected": -2.362490653991699, "loss": 3.4528, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.896114349365234, "rewards/margins": -0.2712062895298004, "rewards/rejected": -23.624908447265625, "step": 10555 }, { "epoch": 0.3559270619164785, "grad_norm": 28.08155059814453, "learning_rate": 8.134043862344399e-07, "logits/chosen": -0.863645076751709, "logits/rejected": -0.8519765138626099, "logps/chosen": -1.8194077014923096, "logps/rejected": -1.9381601810455322, "loss": 2.887, "rewards/accuracies": 0.5, "rewards/chosen": -18.194076538085938, "rewards/margins": 1.1875252723693848, "rewards/rejected": -19.381603240966797, "step": 10560 }, { "epoch": 0.35609558798746166, "grad_norm": 35.566463470458984, "learning_rate": 8.13175150256273e-07, "logits/chosen": -0.351532518863678, "logits/rejected": -0.5769934058189392, "logps/chosen": -1.795641303062439, "logps/rejected": -2.0936331748962402, "loss": 2.3132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.9564151763916, "rewards/margins": 2.97991681098938, "rewards/rejected": -20.936330795288086, "step": 10565 }, { "epoch": 0.3562641140584448, "grad_norm": 18.616701126098633, "learning_rate": 8.129458059005249e-07, "logits/chosen": -0.9681928753852844, "logits/rejected": -1.2117507457733154, "logps/chosen": -2.4990522861480713, "logps/rejected": -2.418123722076416, "loss": 4.0166, "rewards/accuracies": 0.5, "rewards/chosen": -24.990520477294922, "rewards/margins": -0.8092843294143677, "rewards/rejected": -24.181236267089844, "step": 10570 }, { "epoch": 0.35643264012942805, "grad_norm": 13.77556037902832, "learning_rate": 8.127163532465629e-07, "logits/chosen": -0.9583713412284851, "logits/rejected": -0.7233438491821289, "logps/chosen": -2.600524425506592, "logps/rejected": -2.3895583152770996, "loss": 5.9725, "rewards/accuracies": 0.5, "rewards/chosen": -26.0052433013916, "rewards/margins": -2.1096599102020264, "rewards/rejected": -23.895580291748047, "step": 10575 }, { "epoch": 0.3566011662004112, "grad_norm": 22.821674346923828, "learning_rate": 8.124867923737918e-07, "logits/chosen": -0.8695154190063477, "logits/rejected": -0.8962182998657227, "logps/chosen": -2.6580350399017334, "logps/rejected": -2.571289300918579, "loss": 4.253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.580352783203125, "rewards/margins": -0.8674640655517578, "rewards/rejected": -25.712890625, "step": 10580 }, { "epoch": 0.3567696922713944, "grad_norm": 37.85147476196289, "learning_rate": 8.122571233616531e-07, "logits/chosen": -0.7850193977355957, "logits/rejected": -0.8626729846000671, "logps/chosen": -1.9868669509887695, "logps/rejected": -2.2017486095428467, "loss": 3.1236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.868671417236328, "rewards/margins": 2.1488170623779297, "rewards/rejected": -22.017486572265625, "step": 10585 }, { "epoch": 0.35693821834237754, "grad_norm": 25.867671966552734, "learning_rate": 8.120273462896267e-07, "logits/chosen": -0.9501420259475708, "logits/rejected": -1.3043444156646729, "logps/chosen": -1.6017944812774658, "logps/rejected": -1.914813756942749, "loss": 2.0398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.0179443359375, "rewards/margins": 3.1301941871643066, "rewards/rejected": -19.14813804626465, "step": 10590 }, { "epoch": 0.35710674441336077, "grad_norm": 26.94976806640625, "learning_rate": 8.11797461237229e-07, "logits/chosen": -0.9019731283187866, "logits/rejected": -1.0319669246673584, "logps/chosen": -2.006082057952881, "logps/rejected": -2.0699872970581055, "loss": 2.8598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.060821533203125, "rewards/margins": 0.6390496492385864, "rewards/rejected": -20.699871063232422, "step": 10595 }, { "epoch": 0.35727527048434393, "grad_norm": 61.693450927734375, "learning_rate": 8.115674682840143e-07, "logits/chosen": -0.7386834025382996, "logits/rejected": -0.798917293548584, "logps/chosen": -2.1820013523101807, "logps/rejected": -2.5649349689483643, "loss": 1.9702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.820011138916016, "rewards/margins": 3.8293373584747314, "rewards/rejected": -25.64935302734375, "step": 10600 }, { "epoch": 0.3574437965553271, "grad_norm": 35.622066497802734, "learning_rate": 8.113373675095743e-07, "logits/chosen": -0.8943923711776733, "logits/rejected": -0.7858074903488159, "logps/chosen": -2.3475801944732666, "logps/rejected": -2.061145305633545, "loss": 6.0813, "rewards/accuracies": 0.5, "rewards/chosen": -23.475801467895508, "rewards/margins": -2.864348888397217, "rewards/rejected": -20.611452102661133, "step": 10605 }, { "epoch": 0.35761232262631026, "grad_norm": 21.826406478881836, "learning_rate": 8.111071589935374e-07, "logits/chosen": -0.8130962252616882, "logits/rejected": -1.0828945636749268, "logps/chosen": -1.6612212657928467, "logps/rejected": -2.0095667839050293, "loss": 1.5615, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.612215042114258, "rewards/margins": 3.4834537506103516, "rewards/rejected": -20.09566879272461, "step": 10610 }, { "epoch": 0.3577808486972935, "grad_norm": 23.42716407775879, "learning_rate": 8.108768428155699e-07, "logits/chosen": -1.146475911140442, "logits/rejected": -1.3742198944091797, "logps/chosen": -1.9793484210968018, "logps/rejected": -2.097473382949829, "loss": 2.7614, "rewards/accuracies": 0.5, "rewards/chosen": -19.79348373413086, "rewards/margins": 1.181250810623169, "rewards/rejected": -20.974733352661133, "step": 10615 }, { "epoch": 0.35794937476827665, "grad_norm": 19.213890075683594, "learning_rate": 8.106464190553753e-07, "logits/chosen": -0.6341241002082825, "logits/rejected": -0.7922481298446655, "logps/chosen": -2.2231078147888184, "logps/rejected": -1.9456430673599243, "loss": 6.9201, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.231077194213867, "rewards/margins": -2.7746474742889404, "rewards/rejected": -19.45642852783203, "step": 10620 }, { "epoch": 0.3581179008392598, "grad_norm": 20.00115203857422, "learning_rate": 8.104158877926939e-07, "logits/chosen": -0.8993018865585327, "logits/rejected": -1.1128456592559814, "logps/chosen": -2.005645751953125, "logps/rejected": -2.5759527683258057, "loss": 2.2756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.056455612182617, "rewards/margins": 5.70306921005249, "rewards/rejected": -25.7595272064209, "step": 10625 }, { "epoch": 0.35828642691024304, "grad_norm": 84.15482330322266, "learning_rate": 8.101852491073036e-07, "logits/chosen": -1.1985838413238525, "logits/rejected": -1.105023980140686, "logps/chosen": -2.0047457218170166, "logps/rejected": -2.3635776042938232, "loss": 1.906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.047454833984375, "rewards/margins": 3.588322401046753, "rewards/rejected": -23.635778427124023, "step": 10630 }, { "epoch": 0.3584549529812262, "grad_norm": 23.04952621459961, "learning_rate": 8.099545030790196e-07, "logits/chosen": -0.5161810517311096, "logits/rejected": -0.5834816694259644, "logps/chosen": -1.7309818267822266, "logps/rejected": -1.7283589839935303, "loss": 3.5396, "rewards/accuracies": 0.5, "rewards/chosen": -17.3098201751709, "rewards/margins": -0.02622966840863228, "rewards/rejected": -17.28359031677246, "step": 10635 }, { "epoch": 0.35862347905220937, "grad_norm": 17.653745651245117, "learning_rate": 8.097236497876936e-07, "logits/chosen": -0.992956817150116, "logits/rejected": -1.181114912033081, "logps/chosen": -2.4386329650878906, "logps/rejected": -2.3491318225860596, "loss": 4.2635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.386327743530273, "rewards/margins": -0.8950119018554688, "rewards/rejected": -23.491315841674805, "step": 10640 }, { "epoch": 0.35879200512319254, "grad_norm": 19.865116119384766, "learning_rate": 8.094926893132151e-07, "logits/chosen": -1.1021289825439453, "logits/rejected": -1.2707788944244385, "logps/chosen": -1.718862771987915, "logps/rejected": -1.920997977256775, "loss": 2.7385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.188629150390625, "rewards/margins": 2.0213510990142822, "rewards/rejected": -19.209980010986328, "step": 10645 }, { "epoch": 0.35896053119417576, "grad_norm": 26.011329650878906, "learning_rate": 8.092616217355104e-07, "logits/chosen": -0.8622426986694336, "logits/rejected": -0.7457947731018066, "logps/chosen": -2.4235494136810303, "logps/rejected": -2.853537082672119, "loss": 2.9544, "rewards/accuracies": 0.5, "rewards/chosen": -24.23549461364746, "rewards/margins": 4.2998762130737305, "rewards/rejected": -28.535369873046875, "step": 10650 }, { "epoch": 0.3591290572651589, "grad_norm": 39.413116455078125, "learning_rate": 8.090304471345428e-07, "logits/chosen": -0.4998112618923187, "logits/rejected": -0.43951162695884705, "logps/chosen": -2.3564367294311523, "logps/rejected": -2.374368190765381, "loss": 2.935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.56436538696289, "rewards/margins": 0.1793135702610016, "rewards/rejected": -23.743680953979492, "step": 10655 }, { "epoch": 0.3592975833361421, "grad_norm": 23.182138442993164, "learning_rate": 8.087991655903129e-07, "logits/chosen": -0.576167106628418, "logits/rejected": -0.7032621502876282, "logps/chosen": -2.637622833251953, "logps/rejected": -2.7747979164123535, "loss": 3.8412, "rewards/accuracies": 0.5, "rewards/chosen": -26.376230239868164, "rewards/margins": 1.3717488050460815, "rewards/rejected": -27.74798011779785, "step": 10660 }, { "epoch": 0.35946610940712526, "grad_norm": 15.814130783081055, "learning_rate": 8.085677771828577e-07, "logits/chosen": -0.6864426136016846, "logits/rejected": -0.8167473077774048, "logps/chosen": -1.9061853885650635, "logps/rejected": -2.3845667839050293, "loss": 1.5243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.061853408813477, "rewards/margins": 4.783814430236816, "rewards/rejected": -23.84566879272461, "step": 10665 }, { "epoch": 0.3596346354781085, "grad_norm": 14.93944263458252, "learning_rate": 8.083362819922521e-07, "logits/chosen": -0.6831316351890564, "logits/rejected": -0.7957452535629272, "logps/chosen": -2.8043792247772217, "logps/rejected": -3.365492582321167, "loss": 1.2971, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.043792724609375, "rewards/margins": 5.611135005950928, "rewards/rejected": -33.65492630004883, "step": 10670 }, { "epoch": 0.35980316154909164, "grad_norm": 21.99008560180664, "learning_rate": 8.081046800986072e-07, "logits/chosen": -0.8613970875740051, "logits/rejected": -0.8172779083251953, "logps/chosen": -2.1606273651123047, "logps/rejected": -1.9760665893554688, "loss": 5.3347, "rewards/accuracies": 0.5, "rewards/chosen": -21.606273651123047, "rewards/margins": -1.8456090688705444, "rewards/rejected": -19.760665893554688, "step": 10675 }, { "epoch": 0.3599716876200748, "grad_norm": 2.4829165935516357, "learning_rate": 8.078729715820713e-07, "logits/chosen": -1.1459004878997803, "logits/rejected": -1.173221230506897, "logps/chosen": -1.861358880996704, "logps/rejected": -2.1315152645111084, "loss": 2.0619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.613590240478516, "rewards/margins": 2.7015633583068848, "rewards/rejected": -21.315153121948242, "step": 10680 }, { "epoch": 0.36014021369105803, "grad_norm": 22.07672119140625, "learning_rate": 8.076411565228298e-07, "logits/chosen": -1.134795904159546, "logits/rejected": -1.1395275592803955, "logps/chosen": -1.8481004238128662, "logps/rejected": -2.072533130645752, "loss": 2.1095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.481006622314453, "rewards/margins": 2.2443251609802246, "rewards/rejected": -20.725330352783203, "step": 10685 }, { "epoch": 0.3603087397620412, "grad_norm": 34.374366760253906, "learning_rate": 8.074092350011046e-07, "logits/chosen": -1.1068211793899536, "logits/rejected": -1.222532033920288, "logps/chosen": -1.869600534439087, "logps/rejected": -1.9473581314086914, "loss": 2.7607, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.69600486755371, "rewards/margins": 0.7775775790214539, "rewards/rejected": -19.473583221435547, "step": 10690 }, { "epoch": 0.36047726583302436, "grad_norm": 149.23831176757812, "learning_rate": 8.071772070971546e-07, "logits/chosen": -0.8026615977287292, "logits/rejected": -0.8282175064086914, "logps/chosen": -2.3600168228149414, "logps/rejected": -2.2999606132507324, "loss": 4.2298, "rewards/accuracies": 0.5, "rewards/chosen": -23.600168228149414, "rewards/margins": -0.6005603671073914, "rewards/rejected": -22.99960708618164, "step": 10695 }, { "epoch": 0.36064579190400753, "grad_norm": 19.745399475097656, "learning_rate": 8.069450728912753e-07, "logits/chosen": -0.7332836389541626, "logits/rejected": -0.6940861940383911, "logps/chosen": -2.3434970378875732, "logps/rejected": -2.887530565261841, "loss": 3.0107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.43497085571289, "rewards/margins": 5.440334320068359, "rewards/rejected": -28.87530517578125, "step": 10700 }, { "epoch": 0.36081431797499075, "grad_norm": 28.678794860839844, "learning_rate": 8.067128324637997e-07, "logits/chosen": -1.017822504043579, "logits/rejected": -1.1551496982574463, "logps/chosen": -2.1272358894348145, "logps/rejected": -2.23545503616333, "loss": 2.3345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.272357940673828, "rewards/margins": 1.0821908712387085, "rewards/rejected": -22.354549407958984, "step": 10705 }, { "epoch": 0.3609828440459739, "grad_norm": 17.994518280029297, "learning_rate": 8.064804858950966e-07, "logits/chosen": -1.330047845840454, "logits/rejected": -1.540175199508667, "logps/chosen": -1.7342472076416016, "logps/rejected": -1.9005409479141235, "loss": 2.1613, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.342472076416016, "rewards/margins": 1.6629356145858765, "rewards/rejected": -19.005409240722656, "step": 10710 }, { "epoch": 0.3611513701169571, "grad_norm": 21.73023223876953, "learning_rate": 8.062480332655722e-07, "logits/chosen": -0.590639591217041, "logits/rejected": -0.5785341858863831, "logps/chosen": -2.293099880218506, "logps/rejected": -2.547441005706787, "loss": 1.9951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.93099594116211, "rewards/margins": 2.5434110164642334, "rewards/rejected": -25.474407196044922, "step": 10715 }, { "epoch": 0.36131989618794025, "grad_norm": 44.752376556396484, "learning_rate": 8.060154746556694e-07, "logits/chosen": -1.0894025564193726, "logits/rejected": -0.9643278121948242, "logps/chosen": -2.7998452186584473, "logps/rejected": -2.6930184364318848, "loss": 4.1691, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.99845314025879, "rewards/margins": -1.0682717561721802, "rewards/rejected": -26.9301815032959, "step": 10720 }, { "epoch": 0.36148842225892347, "grad_norm": 22.641786575317383, "learning_rate": 8.05782810145867e-07, "logits/chosen": -0.69547039270401, "logits/rejected": -0.9306710958480835, "logps/chosen": -1.8364391326904297, "logps/rejected": -2.280531167984009, "loss": 2.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.364391326904297, "rewards/margins": 4.440920829772949, "rewards/rejected": -22.805309295654297, "step": 10725 }, { "epoch": 0.36165694832990664, "grad_norm": 19.32879066467285, "learning_rate": 8.055500398166816e-07, "logits/chosen": -0.8459224700927734, "logits/rejected": -0.7579227685928345, "logps/chosen": -2.598602056503296, "logps/rejected": -2.1134042739868164, "loss": 7.9173, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -25.986019134521484, "rewards/margins": -4.851977348327637, "rewards/rejected": -21.134042739868164, "step": 10730 }, { "epoch": 0.3618254744008898, "grad_norm": 27.421846389770508, "learning_rate": 8.053171637486656e-07, "logits/chosen": -0.9135136604309082, "logits/rejected": -0.8715440630912781, "logps/chosen": -2.0788276195526123, "logps/rejected": -2.096433162689209, "loss": 3.4743, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.78827667236328, "rewards/margins": 0.1760571449995041, "rewards/rejected": -20.96433448791504, "step": 10735 }, { "epoch": 0.361994000471873, "grad_norm": 16.79775047302246, "learning_rate": 8.050841820224081e-07, "logits/chosen": -0.8370095491409302, "logits/rejected": -0.9754121899604797, "logps/chosen": -1.9215948581695557, "logps/rejected": -2.382934331893921, "loss": 2.0463, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.2159481048584, "rewards/margins": 4.613394737243652, "rewards/rejected": -23.829343795776367, "step": 10740 }, { "epoch": 0.3621625265428562, "grad_norm": 9.726906776428223, "learning_rate": 8.048510947185353e-07, "logits/chosen": -1.0751714706420898, "logits/rejected": -1.2355901002883911, "logps/chosen": -1.956291913986206, "logps/rejected": -2.3752856254577637, "loss": 1.4613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.56292152404785, "rewards/margins": 4.189937591552734, "rewards/rejected": -23.752857208251953, "step": 10745 }, { "epoch": 0.36233105261383936, "grad_norm": 16.294376373291016, "learning_rate": 8.046179019177091e-07, "logits/chosen": -0.8132011294364929, "logits/rejected": -0.9101356267929077, "logps/chosen": -2.291696786880493, "logps/rejected": -2.310281753540039, "loss": 3.8084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.91696548461914, "rewards/margins": 0.1858508139848709, "rewards/rejected": -23.102819442749023, "step": 10750 }, { "epoch": 0.3624995786848225, "grad_norm": 43.041324615478516, "learning_rate": 8.043846037006285e-07, "logits/chosen": -0.9655378460884094, "logits/rejected": -0.837437629699707, "logps/chosen": -2.0782408714294434, "logps/rejected": -2.021411180496216, "loss": 3.6769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.78240966796875, "rewards/margins": -0.5682979822158813, "rewards/rejected": -20.214109420776367, "step": 10755 }, { "epoch": 0.36266810475580574, "grad_norm": 23.52166175842285, "learning_rate": 8.041512001480288e-07, "logits/chosen": -1.079542875289917, "logits/rejected": -1.1581979990005493, "logps/chosen": -1.7527449131011963, "logps/rejected": -2.023618221282959, "loss": 1.5697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.527446746826172, "rewards/margins": 2.7087342739105225, "rewards/rejected": -20.236183166503906, "step": 10760 }, { "epoch": 0.3628366308267889, "grad_norm": 7.493854522705078, "learning_rate": 8.03917691340682e-07, "logits/chosen": -0.7019214034080505, "logits/rejected": -0.9974255561828613, "logps/chosen": -2.165797472000122, "logps/rejected": -2.412288188934326, "loss": 2.6236, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.657974243164062, "rewards/margins": 2.4649100303649902, "rewards/rejected": -24.12288475036621, "step": 10765 }, { "epoch": 0.3630051568977721, "grad_norm": 0.5345020890235901, "learning_rate": 8.036840773593958e-07, "logits/chosen": -1.065690279006958, "logits/rejected": -1.0886703729629517, "logps/chosen": -1.682474136352539, "logps/rejected": -2.0966532230377197, "loss": 2.0629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.824743270874023, "rewards/margins": 4.141788482666016, "rewards/rejected": -20.966529846191406, "step": 10770 }, { "epoch": 0.36317368296875524, "grad_norm": 49.67851638793945, "learning_rate": 8.034503582850154e-07, "logits/chosen": -1.3109495639801025, "logits/rejected": -1.2142508029937744, "logps/chosen": -1.5672677755355835, "logps/rejected": -1.5252450704574585, "loss": 3.6729, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.672677993774414, "rewards/margins": -0.42022705078125, "rewards/rejected": -15.252450942993164, "step": 10775 }, { "epoch": 0.36334220903973846, "grad_norm": 29.32314109802246, "learning_rate": 8.032165341984214e-07, "logits/chosen": -1.1105209589004517, "logits/rejected": -1.3988972902297974, "logps/chosen": -1.9345782995224, "logps/rejected": -1.8571125268936157, "loss": 3.8116, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.345783233642578, "rewards/margins": -0.7746579051017761, "rewards/rejected": -18.571125030517578, "step": 10780 }, { "epoch": 0.36351073511072163, "grad_norm": 24.98900604248047, "learning_rate": 8.029826051805311e-07, "logits/chosen": -0.8778678178787231, "logits/rejected": -1.202693223953247, "logps/chosen": -2.0306835174560547, "logps/rejected": -2.4914937019348145, "loss": 2.0412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.30683708190918, "rewards/margins": 4.608098983764648, "rewards/rejected": -24.914936065673828, "step": 10785 }, { "epoch": 0.3636792611817048, "grad_norm": 51.06911087036133, "learning_rate": 8.027485713122982e-07, "logits/chosen": -0.7300389409065247, "logits/rejected": -0.7390624284744263, "logps/chosen": -2.4046947956085205, "logps/rejected": -2.5226998329162598, "loss": 3.5424, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.046947479248047, "rewards/margins": 1.1800497770309448, "rewards/rejected": -25.226999282836914, "step": 10790 }, { "epoch": 0.363847787252688, "grad_norm": 45.48428726196289, "learning_rate": 8.025144326747126e-07, "logits/chosen": -1.0668237209320068, "logits/rejected": -1.1761319637298584, "logps/chosen": -2.078740358352661, "logps/rejected": -2.179811477661133, "loss": 3.0571, "rewards/accuracies": 0.5, "rewards/chosen": -20.787403106689453, "rewards/margins": 1.010711908340454, "rewards/rejected": -21.798114776611328, "step": 10795 }, { "epoch": 0.3640163133236712, "grad_norm": 26.712162017822266, "learning_rate": 8.022801893488003e-07, "logits/chosen": -1.1064934730529785, "logits/rejected": -1.1405363082885742, "logps/chosen": -1.9824740886688232, "logps/rejected": -2.1991167068481445, "loss": 2.0287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.82474136352539, "rewards/margins": 2.166426420211792, "rewards/rejected": -21.991167068481445, "step": 10800 }, { "epoch": 0.3640163133236712, "eval_logits/chosen": -1.346745491027832, "eval_logits/rejected": -1.4354705810546875, "eval_logps/chosen": -1.899340033531189, "eval_logps/rejected": -1.9798485040664673, "eval_loss": 3.053359270095825, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -18.99340057373047, "eval_rewards/margins": 0.8050832152366638, "eval_rewards/rejected": -19.798484802246094, "eval_runtime": 12.9258, "eval_samples_per_second": 7.736, "eval_steps_per_second": 1.934, "step": 10800 }, { "epoch": 0.36418483939465435, "grad_norm": 30.801185607910156, "learning_rate": 8.020458414156239e-07, "logits/chosen": -1.2585346698760986, "logits/rejected": -1.40419602394104, "logps/chosen": -2.2337536811828613, "logps/rejected": -2.4760305881500244, "loss": 2.0292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.337535858154297, "rewards/margins": 2.422769546508789, "rewards/rejected": -24.760305404663086, "step": 10805 }, { "epoch": 0.3643533654656375, "grad_norm": 48.36637496948242, "learning_rate": 8.018113889562821e-07, "logits/chosen": -1.1833237409591675, "logits/rejected": -1.37998366355896, "logps/chosen": -1.620487928390503, "logps/rejected": -1.7583506107330322, "loss": 2.9234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.20488166809082, "rewards/margins": 1.378624677658081, "rewards/rejected": -17.583505630493164, "step": 10810 }, { "epoch": 0.36452189153662073, "grad_norm": 20.77741813659668, "learning_rate": 8.015768320519094e-07, "logits/chosen": -0.5633755326271057, "logits/rejected": -0.5698398351669312, "logps/chosen": -2.248947858810425, "logps/rejected": -2.2062184810638428, "loss": 3.8129, "rewards/accuracies": 0.5, "rewards/chosen": -22.489479064941406, "rewards/margins": -0.4272943437099457, "rewards/rejected": -22.062185287475586, "step": 10815 }, { "epoch": 0.3646904176076039, "grad_norm": 24.723190307617188, "learning_rate": 8.013421707836767e-07, "logits/chosen": -0.9400695562362671, "logits/rejected": -1.1459122896194458, "logps/chosen": -2.1460351943969727, "logps/rejected": -2.315514087677002, "loss": 2.2302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.46035385131836, "rewards/margins": 1.6947863101959229, "rewards/rejected": -23.155139923095703, "step": 10820 }, { "epoch": 0.36485894367858707, "grad_norm": 32.624755859375, "learning_rate": 8.01107405232791e-07, "logits/chosen": -1.087473750114441, "logits/rejected": -1.2249068021774292, "logps/chosen": -1.879091501235962, "logps/rejected": -2.0642178058624268, "loss": 1.6607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.79091453552246, "rewards/margins": 1.8512637615203857, "rewards/rejected": -20.64217758178711, "step": 10825 }, { "epoch": 0.36502746974957023, "grad_norm": 34.93103790283203, "learning_rate": 8.008725354804957e-07, "logits/chosen": -0.7595319747924805, "logits/rejected": -0.9362818598747253, "logps/chosen": -1.7449228763580322, "logps/rejected": -1.6130717992782593, "loss": 5.0697, "rewards/accuracies": 0.5, "rewards/chosen": -17.449230194091797, "rewards/margins": -1.318509817123413, "rewards/rejected": -16.130718231201172, "step": 10830 }, { "epoch": 0.36519599582055345, "grad_norm": 16.051925659179688, "learning_rate": 8.006375616080697e-07, "logits/chosen": -1.0025599002838135, "logits/rejected": -1.1287715435028076, "logps/chosen": -2.0526137351989746, "logps/rejected": -2.1133031845092773, "loss": 2.8868, "rewards/accuracies": 0.5, "rewards/chosen": -20.526134490966797, "rewards/margins": 0.6068953275680542, "rewards/rejected": -21.13302993774414, "step": 10835 }, { "epoch": 0.3653645218915366, "grad_norm": 31.391122817993164, "learning_rate": 8.004024836968284e-07, "logits/chosen": -1.0124971866607666, "logits/rejected": -1.1490424871444702, "logps/chosen": -2.1096863746643066, "logps/rejected": -2.255751132965088, "loss": 2.6028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.09686279296875, "rewards/margins": 1.4606473445892334, "rewards/rejected": -22.557512283325195, "step": 10840 }, { "epoch": 0.3655330479625198, "grad_norm": 30.329187393188477, "learning_rate": 8.001673018281228e-07, "logits/chosen": -0.9520799517631531, "logits/rejected": -1.063399076461792, "logps/chosen": -2.088949203491211, "logps/rejected": -2.2793190479278564, "loss": 2.4518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.88949203491211, "rewards/margins": 1.9036967754364014, "rewards/rejected": -22.793188095092773, "step": 10845 }, { "epoch": 0.365701574033503, "grad_norm": 37.581329345703125, "learning_rate": 7.9993201608334e-07, "logits/chosen": -0.3347877562046051, "logits/rejected": -0.5965047478675842, "logps/chosen": -1.8404756784439087, "logps/rejected": -2.2952017784118652, "loss": 1.913, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.404754638671875, "rewards/margins": 4.547261714935303, "rewards/rejected": -22.952016830444336, "step": 10850 }, { "epoch": 0.3658701001044862, "grad_norm": 21.93905258178711, "learning_rate": 7.996966265439033e-07, "logits/chosen": -0.9020354151725769, "logits/rejected": -0.9424558877944946, "logps/chosen": -1.7665306329727173, "logps/rejected": -1.5817607641220093, "loss": 5.0133, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.665306091308594, "rewards/margins": -1.8476982116699219, "rewards/rejected": -15.817608833312988, "step": 10855 }, { "epoch": 0.36603862617546934, "grad_norm": 125.510986328125, "learning_rate": 7.994611332912719e-07, "logits/chosen": -1.2232359647750854, "logits/rejected": -1.100111722946167, "logps/chosen": -2.0492358207702637, "logps/rejected": -2.1210994720458984, "loss": 2.5122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.492359161376953, "rewards/margins": 0.7186365127563477, "rewards/rejected": -21.210994720458984, "step": 10860 }, { "epoch": 0.3662071522464525, "grad_norm": 26.81747817993164, "learning_rate": 7.992255364069406e-07, "logits/chosen": -0.8770850896835327, "logits/rejected": -0.8681109547615051, "logps/chosen": -1.8699661493301392, "logps/rejected": -1.9965813159942627, "loss": 2.6638, "rewards/accuracies": 0.5, "rewards/chosen": -18.699661254882812, "rewards/margins": 1.2661510705947876, "rewards/rejected": -19.96581268310547, "step": 10865 }, { "epoch": 0.3663756783174357, "grad_norm": 12.97459888458252, "learning_rate": 7.989898359724401e-07, "logits/chosen": -1.3153835535049438, "logits/rejected": -1.5715057849884033, "logps/chosen": -1.7097463607788086, "logps/rejected": -1.838587999343872, "loss": 2.3408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.097463607788086, "rewards/margins": 1.288414716720581, "rewards/rejected": -18.38587760925293, "step": 10870 }, { "epoch": 0.3665442043884189, "grad_norm": 24.906919479370117, "learning_rate": 7.98754032069337e-07, "logits/chosen": -1.2778717279434204, "logits/rejected": -1.3702975511550903, "logps/chosen": -1.830116629600525, "logps/rejected": -1.8268821239471436, "loss": 3.5659, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.301166534423828, "rewards/margins": -0.03234300762414932, "rewards/rejected": -18.268823623657227, "step": 10875 }, { "epoch": 0.36671273045940206, "grad_norm": 35.55815887451172, "learning_rate": 7.985181247792338e-07, "logits/chosen": -0.7316654324531555, "logits/rejected": -0.8771049380302429, "logps/chosen": -1.9307162761688232, "logps/rejected": -1.8940412998199463, "loss": 3.4595, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.30716323852539, "rewards/margins": -0.36675119400024414, "rewards/rejected": -18.940412521362305, "step": 10880 }, { "epoch": 0.3668812565303852, "grad_norm": 28.160144805908203, "learning_rate": 7.982821141837691e-07, "logits/chosen": -1.1204853057861328, "logits/rejected": -1.2265549898147583, "logps/chosen": -2.3023176193237305, "logps/rejected": -2.135511636734009, "loss": 5.0299, "rewards/accuracies": 0.5, "rewards/chosen": -23.023174285888672, "rewards/margins": -1.6680587530136108, "rewards/rejected": -21.355113983154297, "step": 10885 }, { "epoch": 0.36704978260136845, "grad_norm": 24.9565486907959, "learning_rate": 7.980460003646162e-07, "logits/chosen": -0.9870179295539856, "logits/rejected": -1.0818120241165161, "logps/chosen": -1.8075840473175049, "logps/rejected": -1.8251380920410156, "loss": 3.1207, "rewards/accuracies": 0.5, "rewards/chosen": -18.07583999633789, "rewards/margins": 0.17554087936878204, "rewards/rejected": -18.251379013061523, "step": 10890 }, { "epoch": 0.3672183086723516, "grad_norm": 45.430294036865234, "learning_rate": 7.978097834034851e-07, "logits/chosen": -1.2092764377593994, "logits/rejected": -1.349082350730896, "logps/chosen": -2.282064199447632, "logps/rejected": -2.6417107582092285, "loss": 3.4773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.820642471313477, "rewards/margins": 3.596468448638916, "rewards/rejected": -26.417110443115234, "step": 10895 }, { "epoch": 0.3673868347433348, "grad_norm": 33.20343017578125, "learning_rate": 7.975734633821214e-07, "logits/chosen": -1.1706898212432861, "logits/rejected": -1.1777961254119873, "logps/chosen": -2.0728137493133545, "logps/rejected": -1.885846734046936, "loss": 4.9354, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.728137969970703, "rewards/margins": -1.8696720600128174, "rewards/rejected": -18.85846519470215, "step": 10900 }, { "epoch": 0.367555360814318, "grad_norm": 14.910902976989746, "learning_rate": 7.973370403823059e-07, "logits/chosen": -1.4453526735305786, "logits/rejected": -1.3824089765548706, "logps/chosen": -1.74431574344635, "logps/rejected": -1.8275190591812134, "loss": 2.6178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.443157196044922, "rewards/margins": 0.832032322883606, "rewards/rejected": -18.275188446044922, "step": 10905 }, { "epoch": 0.36772388688530117, "grad_norm": 23.91060447692871, "learning_rate": 7.971005144858553e-07, "logits/chosen": -1.0507118701934814, "logits/rejected": -1.1027783155441284, "logps/chosen": -2.1079440116882324, "logps/rejected": -2.2358126640319824, "loss": 3.259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.07944107055664, "rewards/margins": 1.2786868810653687, "rewards/rejected": -22.35812759399414, "step": 10910 }, { "epoch": 0.36789241295628433, "grad_norm": 18.480350494384766, "learning_rate": 7.968638857746218e-07, "logits/chosen": -0.8534606099128723, "logits/rejected": -0.8906230926513672, "logps/chosen": -2.049269437789917, "logps/rejected": -2.2661845684051514, "loss": 3.0378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.49269676208496, "rewards/margins": 2.169149875640869, "rewards/rejected": -22.661846160888672, "step": 10915 }, { "epoch": 0.3680609390272675, "grad_norm": 20.608287811279297, "learning_rate": 7.966271543304937e-07, "logits/chosen": -0.8241097331047058, "logits/rejected": -0.7778711915016174, "logps/chosen": -2.1182360649108887, "logps/rejected": -2.205749034881592, "loss": 2.646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.182361602783203, "rewards/margins": 0.8751300573348999, "rewards/rejected": -22.057491302490234, "step": 10920 }, { "epoch": 0.3682294650982507, "grad_norm": 22.974990844726562, "learning_rate": 7.963903202353939e-07, "logits/chosen": -1.1302852630615234, "logits/rejected": -1.2610663175582886, "logps/chosen": -1.9656798839569092, "logps/rejected": -2.0262506008148193, "loss": 3.0751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.65679931640625, "rewards/margins": 0.605708122253418, "rewards/rejected": -20.26250648498535, "step": 10925 }, { "epoch": 0.3683979911692339, "grad_norm": 20.506927490234375, "learning_rate": 7.961533835712816e-07, "logits/chosen": -0.9316355586051941, "logits/rejected": -1.1611944437026978, "logps/chosen": -1.856085181236267, "logps/rejected": -1.9529523849487305, "loss": 2.8926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.56085205078125, "rewards/margins": 0.9686723947525024, "rewards/rejected": -19.529521942138672, "step": 10930 }, { "epoch": 0.36856651724021705, "grad_norm": 28.772199630737305, "learning_rate": 7.959163444201512e-07, "logits/chosen": -1.3307853937149048, "logits/rejected": -1.1167147159576416, "logps/chosen": -2.1241683959960938, "logps/rejected": -2.3638079166412354, "loss": 2.7786, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.241682052612305, "rewards/margins": 2.396395444869995, "rewards/rejected": -23.638076782226562, "step": 10935 }, { "epoch": 0.3687350433112002, "grad_norm": 9.803357124328613, "learning_rate": 7.956792028640327e-07, "logits/chosen": -1.039208173751831, "logits/rejected": -0.9944947361946106, "logps/chosen": -2.582061290740967, "logps/rejected": -2.6542296409606934, "loss": 2.7741, "rewards/accuracies": 0.5, "rewards/chosen": -25.82061195373535, "rewards/margins": 0.7216871976852417, "rewards/rejected": -26.54229736328125, "step": 10940 }, { "epoch": 0.36890356938218344, "grad_norm": 25.190404891967773, "learning_rate": 7.954419589849914e-07, "logits/chosen": -1.2668789625167847, "logits/rejected": -1.4306660890579224, "logps/chosen": -1.7445892095565796, "logps/rejected": -1.6929042339324951, "loss": 3.692, "rewards/accuracies": 0.5, "rewards/chosen": -17.445892333984375, "rewards/margins": -0.5168499946594238, "rewards/rejected": -16.92904281616211, "step": 10945 }, { "epoch": 0.3690720954531666, "grad_norm": 30.531227111816406, "learning_rate": 7.952046128651279e-07, "logits/chosen": -0.958761990070343, "logits/rejected": -0.9972974061965942, "logps/chosen": -2.2063372135162354, "logps/rejected": -2.076390266418457, "loss": 4.3272, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.063373565673828, "rewards/margins": -1.2994682788848877, "rewards/rejected": -20.763904571533203, "step": 10950 }, { "epoch": 0.36924062152414977, "grad_norm": 43.369598388671875, "learning_rate": 7.949671645865788e-07, "logits/chosen": -1.0231273174285889, "logits/rejected": -1.2699315547943115, "logps/chosen": -1.7658660411834717, "logps/rejected": -1.9867603778839111, "loss": 2.4551, "rewards/accuracies": 0.5, "rewards/chosen": -17.658658981323242, "rewards/margins": 2.208942413330078, "rewards/rejected": -19.867603302001953, "step": 10955 }, { "epoch": 0.369409147595133, "grad_norm": 25.24970817565918, "learning_rate": 7.94729614231515e-07, "logits/chosen": -1.1766541004180908, "logits/rejected": -1.4894263744354248, "logps/chosen": -1.7530008554458618, "logps/rejected": -1.8725624084472656, "loss": 2.419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.53000831604004, "rewards/margins": 1.1956161260604858, "rewards/rejected": -18.725622177124023, "step": 10960 }, { "epoch": 0.36957767366611616, "grad_norm": 4.202431678771973, "learning_rate": 7.944919618821438e-07, "logits/chosen": -1.3009445667266846, "logits/rejected": -1.4404528141021729, "logps/chosen": -2.084996461868286, "logps/rejected": -2.300058126449585, "loss": 2.1089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.849964141845703, "rewards/margins": 2.150618553161621, "rewards/rejected": -23.00058364868164, "step": 10965 }, { "epoch": 0.3697461997370993, "grad_norm": 23.584318161010742, "learning_rate": 7.942542076207069e-07, "logits/chosen": -0.710057258605957, "logits/rejected": -0.7822506427764893, "logps/chosen": -2.7446389198303223, "logps/rejected": -3.029550075531006, "loss": 2.1846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.446392059326172, "rewards/margins": 2.849109172821045, "rewards/rejected": -30.295501708984375, "step": 10970 }, { "epoch": 0.3699147258080825, "grad_norm": 22.66236114501953, "learning_rate": 7.940163515294819e-07, "logits/chosen": -0.9495447278022766, "logits/rejected": -1.0309523344039917, "logps/chosen": -2.10927677154541, "logps/rejected": -2.3697586059570312, "loss": 2.7297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.0927677154541, "rewards/margins": 2.6048176288604736, "rewards/rejected": -23.697586059570312, "step": 10975 }, { "epoch": 0.3700832518790657, "grad_norm": 32.25993347167969, "learning_rate": 7.937783936907816e-07, "logits/chosen": -0.9410039782524109, "logits/rejected": -0.9836063385009766, "logps/chosen": -2.0231223106384277, "logps/rejected": -2.137643337249756, "loss": 2.2719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.23122215270996, "rewards/margins": 1.1452109813690186, "rewards/rejected": -21.376434326171875, "step": 10980 }, { "epoch": 0.3702517779500489, "grad_norm": 45.95429229736328, "learning_rate": 7.935403341869535e-07, "logits/chosen": -0.5016958713531494, "logits/rejected": -0.5743024945259094, "logps/chosen": -2.0376670360565186, "logps/rejected": -2.1427981853485107, "loss": 2.3295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.37666893005371, "rewards/margins": 1.0513111352920532, "rewards/rejected": -21.427982330322266, "step": 10985 }, { "epoch": 0.37042030402103204, "grad_norm": 9.918193817138672, "learning_rate": 7.933021731003809e-07, "logits/chosen": -1.3090450763702393, "logits/rejected": -1.3329724073410034, "logps/chosen": -2.461430788040161, "logps/rejected": -2.8623244762420654, "loss": 1.8669, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.614307403564453, "rewards/margins": 4.008938789367676, "rewards/rejected": -28.623245239257812, "step": 10990 }, { "epoch": 0.3705888300920152, "grad_norm": 22.17966079711914, "learning_rate": 7.930639105134818e-07, "logits/chosen": -1.0726702213287354, "logits/rejected": -1.2153971195220947, "logps/chosen": -2.1161656379699707, "logps/rejected": -2.127230405807495, "loss": 3.2105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.161657333374023, "rewards/margins": 0.11064748466014862, "rewards/rejected": -21.27230453491211, "step": 10995 }, { "epoch": 0.37075735616299843, "grad_norm": 39.328758239746094, "learning_rate": 7.928255465087094e-07, "logits/chosen": -1.2356667518615723, "logits/rejected": -1.152830719947815, "logps/chosen": -1.9329249858856201, "logps/rejected": -1.9360370635986328, "loss": 3.0736, "rewards/accuracies": 0.5, "rewards/chosen": -19.32925033569336, "rewards/margins": 0.03111877478659153, "rewards/rejected": -19.360370635986328, "step": 11000 }, { "epoch": 0.3709258822339816, "grad_norm": 9.758732795715332, "learning_rate": 7.925870811685523e-07, "logits/chosen": -0.9007709622383118, "logits/rejected": -1.2970943450927734, "logps/chosen": -1.790388822555542, "logps/rejected": -2.073038101196289, "loss": 1.8041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.903888702392578, "rewards/margins": 2.8264918327331543, "rewards/rejected": -20.73038101196289, "step": 11005 }, { "epoch": 0.37109440830496476, "grad_norm": 24.037540435791016, "learning_rate": 7.923485145755339e-07, "logits/chosen": -0.6779791712760925, "logits/rejected": -0.953299880027771, "logps/chosen": -2.328507661819458, "logps/rejected": -2.587146043777466, "loss": 1.678, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.285076141357422, "rewards/margins": 2.586381435394287, "rewards/rejected": -25.871456146240234, "step": 11010 }, { "epoch": 0.371262934375948, "grad_norm": 28.73473358154297, "learning_rate": 7.921098468122127e-07, "logits/chosen": -1.0912433862686157, "logits/rejected": -1.1714380979537964, "logps/chosen": -1.9897487163543701, "logps/rejected": -2.1489391326904297, "loss": 3.4827, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.89748764038086, "rewards/margins": 1.5919034481048584, "rewards/rejected": -21.489391326904297, "step": 11015 }, { "epoch": 0.37143146044693115, "grad_norm": 41.63922119140625, "learning_rate": 7.918710779611822e-07, "logits/chosen": -1.0436906814575195, "logits/rejected": -1.1562167406082153, "logps/chosen": -1.9038293361663818, "logps/rejected": -1.8470821380615234, "loss": 3.6442, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.038293838500977, "rewards/margins": -0.5674721002578735, "rewards/rejected": -18.470823287963867, "step": 11020 }, { "epoch": 0.3715999865179143, "grad_norm": 14.585823059082031, "learning_rate": 7.916322081050709e-07, "logits/chosen": -0.8105791211128235, "logits/rejected": -0.9758694767951965, "logps/chosen": -2.1387295722961426, "logps/rejected": -2.2473695278167725, "loss": 2.674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.38729476928711, "rewards/margins": 1.0864002704620361, "rewards/rejected": -22.47369384765625, "step": 11025 }, { "epoch": 0.3717685125888975, "grad_norm": 56.67366409301758, "learning_rate": 7.91393237326542e-07, "logits/chosen": -1.2288243770599365, "logits/rejected": -1.2605135440826416, "logps/chosen": -2.3369829654693604, "logps/rejected": -2.5689830780029297, "loss": 3.9939, "rewards/accuracies": 0.5, "rewards/chosen": -23.369831085205078, "rewards/margins": 2.320000171661377, "rewards/rejected": -25.689828872680664, "step": 11030 }, { "epoch": 0.3719370386598807, "grad_norm": 39.05540466308594, "learning_rate": 7.911541657082943e-07, "logits/chosen": -0.34584683179855347, "logits/rejected": -0.3778868615627289, "logps/chosen": -1.9564568996429443, "logps/rejected": -2.009974241256714, "loss": 2.9104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.56456756591797, "rewards/margins": 0.5351727604866028, "rewards/rejected": -20.099742889404297, "step": 11035 }, { "epoch": 0.37210556473086387, "grad_norm": 32.99935531616211, "learning_rate": 7.909149933330608e-07, "logits/chosen": -0.9975587129592896, "logits/rejected": -1.0851614475250244, "logps/chosen": -1.80816650390625, "logps/rejected": -1.9597301483154297, "loss": 2.5019, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.081666946411133, "rewards/margins": 1.5156358480453491, "rewards/rejected": -19.597301483154297, "step": 11040 }, { "epoch": 0.37227409080184704, "grad_norm": 30.022205352783203, "learning_rate": 7.906757202836097e-07, "logits/chosen": -1.1794826984405518, "logits/rejected": -1.2481104135513306, "logps/chosen": -2.236422300338745, "logps/rejected": -2.309372663497925, "loss": 3.417, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.36422348022461, "rewards/margins": 0.7295053601264954, "rewards/rejected": -23.09372901916504, "step": 11045 }, { "epoch": 0.3724426168728302, "grad_norm": 24.919965744018555, "learning_rate": 7.90436346642744e-07, "logits/chosen": -1.1151336431503296, "logits/rejected": -1.1000896692276, "logps/chosen": -2.002037286758423, "logps/rejected": -1.9458297491073608, "loss": 3.7713, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.020374298095703, "rewards/margins": -0.5620753169059753, "rewards/rejected": -19.458297729492188, "step": 11050 }, { "epoch": 0.3726111429438134, "grad_norm": 17.231094360351562, "learning_rate": 7.901968724933015e-07, "logits/chosen": -0.9927698969841003, "logits/rejected": -1.0535566806793213, "logps/chosen": -1.9820178747177124, "logps/rejected": -2.3538105487823486, "loss": 2.1248, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.820178985595703, "rewards/margins": 3.7179248332977295, "rewards/rejected": -23.538105010986328, "step": 11055 }, { "epoch": 0.3727796690147966, "grad_norm": 118.672607421875, "learning_rate": 7.899572979181545e-07, "logits/chosen": -1.281635046005249, "logits/rejected": -1.35465407371521, "logps/chosen": -1.8064937591552734, "logps/rejected": -1.8414586782455444, "loss": 3.1491, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.064937591552734, "rewards/margins": 0.34965038299560547, "rewards/rejected": -18.414587020874023, "step": 11060 }, { "epoch": 0.37294819508577975, "grad_norm": 27.215576171875, "learning_rate": 7.897176230002108e-07, "logits/chosen": -1.197999358177185, "logits/rejected": -1.0447529554367065, "logps/chosen": -2.089306592941284, "logps/rejected": -2.06302547454834, "loss": 3.4649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.89306640625, "rewards/margins": -0.26281100511550903, "rewards/rejected": -20.630252838134766, "step": 11065 }, { "epoch": 0.373116721156763, "grad_norm": 65.9457778930664, "learning_rate": 7.894778478224123e-07, "logits/chosen": -1.1507130861282349, "logits/rejected": -1.225836992263794, "logps/chosen": -2.147380828857422, "logps/rejected": -2.0534119606018066, "loss": 4.3662, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.47380828857422, "rewards/margins": -0.9396876096725464, "rewards/rejected": -20.53411865234375, "step": 11070 }, { "epoch": 0.37328524722774614, "grad_norm": 22.707504272460938, "learning_rate": 7.892379724677354e-07, "logits/chosen": -0.7473500370979309, "logits/rejected": -0.9139529466629028, "logps/chosen": -2.0047521591186523, "logps/rejected": -2.5226516723632812, "loss": 1.0853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.04751968383789, "rewards/margins": 5.178994655609131, "rewards/rejected": -25.226516723632812, "step": 11075 }, { "epoch": 0.3734537732987293, "grad_norm": 24.477022171020508, "learning_rate": 7.889979970191918e-07, "logits/chosen": -1.004990816116333, "logits/rejected": -0.9029370546340942, "logps/chosen": -1.7345755100250244, "logps/rejected": -1.6503496170043945, "loss": 3.9531, "rewards/accuracies": 0.5, "rewards/chosen": -17.345754623413086, "rewards/margins": -0.8422587513923645, "rewards/rejected": -16.503498077392578, "step": 11080 }, { "epoch": 0.3736222993697125, "grad_norm": 50.63272476196289, "learning_rate": 7.887579215598277e-07, "logits/chosen": -0.6778031587600708, "logits/rejected": -0.8914194107055664, "logps/chosen": -2.030156373977661, "logps/rejected": -2.2473902702331543, "loss": 2.6024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.301563262939453, "rewards/margins": 2.172337293624878, "rewards/rejected": -22.473901748657227, "step": 11085 }, { "epoch": 0.3737908254406957, "grad_norm": 6.910362720489502, "learning_rate": 7.885177461727233e-07, "logits/chosen": -0.8329635858535767, "logits/rejected": -0.8338441848754883, "logps/chosen": -1.9738490581512451, "logps/rejected": -2.2185306549072266, "loss": 3.3751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.73849105834961, "rewards/margins": 2.4468133449554443, "rewards/rejected": -22.185306549072266, "step": 11090 }, { "epoch": 0.37395935151167886, "grad_norm": 15.434812545776367, "learning_rate": 7.88277470940994e-07, "logits/chosen": -1.3695753812789917, "logits/rejected": -1.4795372486114502, "logps/chosen": -2.154623031616211, "logps/rejected": -2.047244071960449, "loss": 4.2477, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.54623031616211, "rewards/margins": -1.0737907886505127, "rewards/rejected": -20.47243881225586, "step": 11095 }, { "epoch": 0.374127877582662, "grad_norm": 20.866865158081055, "learning_rate": 7.8803709594779e-07, "logits/chosen": -1.5691171884536743, "logits/rejected": -1.4664461612701416, "logps/chosen": -1.8162921667099, "logps/rejected": -1.9663314819335938, "loss": 2.1468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.162921905517578, "rewards/margins": 1.5003941059112549, "rewards/rejected": -19.663318634033203, "step": 11100 }, { "epoch": 0.3742964036536452, "grad_norm": 18.494287490844727, "learning_rate": 7.877966212762952e-07, "logits/chosen": -1.304776906967163, "logits/rejected": -1.2949206829071045, "logps/chosen": -1.9437240362167358, "logps/rejected": -2.265573740005493, "loss": 1.9874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.43724250793457, "rewards/margins": 3.218493938446045, "rewards/rejected": -22.655736923217773, "step": 11105 }, { "epoch": 0.3744649297246284, "grad_norm": 122.1553955078125, "learning_rate": 7.875560470097285e-07, "logits/chosen": -1.307988166809082, "logits/rejected": -1.3546937704086304, "logps/chosen": -2.1325485706329346, "logps/rejected": -2.3272907733917236, "loss": 2.4457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.325485229492188, "rewards/margins": 1.9474217891693115, "rewards/rejected": -23.27290916442871, "step": 11110 }, { "epoch": 0.3746334557956116, "grad_norm": 25.68359375, "learning_rate": 7.873153732313432e-07, "logits/chosen": -1.16599440574646, "logits/rejected": -1.28875732421875, "logps/chosen": -1.7075920104980469, "logps/rejected": -1.694820761680603, "loss": 3.3776, "rewards/accuracies": 0.5, "rewards/chosen": -17.07592010498047, "rewards/margins": -0.1277121603488922, "rewards/rejected": -16.948205947875977, "step": 11115 }, { "epoch": 0.37480198186659475, "grad_norm": 100.46998596191406, "learning_rate": 7.870746000244269e-07, "logits/chosen": -0.9557684063911438, "logits/rejected": -0.9845932126045227, "logps/chosen": -2.7109880447387695, "logps/rejected": -3.0132102966308594, "loss": 3.0756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.109878540039062, "rewards/margins": 3.0222253799438477, "rewards/rejected": -30.132104873657227, "step": 11120 }, { "epoch": 0.37497050793757797, "grad_norm": 20.687875747680664, "learning_rate": 7.868337274723018e-07, "logits/chosen": -1.0712049007415771, "logits/rejected": -1.2100828886032104, "logps/chosen": -1.7976467609405518, "logps/rejected": -2.0642523765563965, "loss": 2.1613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.976465225219727, "rewards/margins": 2.666057586669922, "rewards/rejected": -20.64252471923828, "step": 11125 }, { "epoch": 0.37513903400856113, "grad_norm": 19.09157371520996, "learning_rate": 7.865927556583245e-07, "logits/chosen": -0.9747500419616699, "logits/rejected": -1.2264236211776733, "logps/chosen": -2.065001964569092, "logps/rejected": -2.3609440326690674, "loss": 1.7971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.6500186920166, "rewards/margins": 2.959421396255493, "rewards/rejected": -23.609439849853516, "step": 11130 }, { "epoch": 0.3753075600795443, "grad_norm": 21.92704963684082, "learning_rate": 7.863516846658857e-07, "logits/chosen": -0.8536840677261353, "logits/rejected": -1.1422303915023804, "logps/chosen": -1.911171317100525, "logps/rejected": -1.7837629318237305, "loss": 4.7212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.111713409423828, "rewards/margins": -1.2740838527679443, "rewards/rejected": -17.837631225585938, "step": 11135 }, { "epoch": 0.37547608615052747, "grad_norm": 20.51827621459961, "learning_rate": 7.861105145784108e-07, "logits/chosen": -0.713087260723114, "logits/rejected": -0.7482129335403442, "logps/chosen": -2.575368881225586, "logps/rejected": -2.7310588359832764, "loss": 2.7725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.75368881225586, "rewards/margins": 1.556899070739746, "rewards/rejected": -27.31058692932129, "step": 11140 }, { "epoch": 0.3756446122215107, "grad_norm": 18.282073974609375, "learning_rate": 7.858692454793589e-07, "logits/chosen": -0.7038525938987732, "logits/rejected": -0.787943959236145, "logps/chosen": -2.6329092979431152, "logps/rejected": -2.9737820625305176, "loss": 2.7294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.3290958404541, "rewards/margins": 3.408724308013916, "rewards/rejected": -29.737817764282227, "step": 11145 }, { "epoch": 0.37581313829249385, "grad_norm": 13.00042724609375, "learning_rate": 7.856278774522242e-07, "logits/chosen": -0.9289695024490356, "logits/rejected": -1.0384521484375, "logps/chosen": -1.6467241048812866, "logps/rejected": -1.785021185874939, "loss": 1.9632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.467241287231445, "rewards/margins": 1.3829692602157593, "rewards/rejected": -17.850210189819336, "step": 11150 }, { "epoch": 0.375981664363477, "grad_norm": 18.52608299255371, "learning_rate": 7.853864105805342e-07, "logits/chosen": -1.4320456981658936, "logits/rejected": -1.6049312353134155, "logps/chosen": -1.8154075145721436, "logps/rejected": -1.932138204574585, "loss": 2.4999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.154075622558594, "rewards/margins": 1.1673085689544678, "rewards/rejected": -19.32138442993164, "step": 11155 }, { "epoch": 0.3761501904344602, "grad_norm": 16.68703842163086, "learning_rate": 7.851448449478513e-07, "logits/chosen": -0.8950377702713013, "logits/rejected": -0.6943656206130981, "logps/chosen": -1.9260505437850952, "logps/rejected": -2.0210115909576416, "loss": 3.0619, "rewards/accuracies": 0.5, "rewards/chosen": -19.26050567626953, "rewards/margins": 0.949609100818634, "rewards/rejected": -20.21011734008789, "step": 11160 }, { "epoch": 0.3763187165054434, "grad_norm": 23.949373245239258, "learning_rate": 7.84903180637772e-07, "logits/chosen": -1.0606566667556763, "logits/rejected": -0.9929370880126953, "logps/chosen": -1.6432468891143799, "logps/rejected": -1.7798480987548828, "loss": 2.4106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.43246841430664, "rewards/margins": 1.366010308265686, "rewards/rejected": -17.798480987548828, "step": 11165 }, { "epoch": 0.3764872425764266, "grad_norm": 39.09119415283203, "learning_rate": 7.846614177339264e-07, "logits/chosen": -0.5430514812469482, "logits/rejected": -0.8053268194198608, "logps/chosen": -2.0910189151763916, "logps/rejected": -2.1563382148742676, "loss": 2.8637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.910188674926758, "rewards/margins": 0.6531929969787598, "rewards/rejected": -21.563379287719727, "step": 11170 }, { "epoch": 0.37665576864740974, "grad_norm": 14.804924964904785, "learning_rate": 7.844195563199794e-07, "logits/chosen": -0.9070854187011719, "logits/rejected": -1.100773572921753, "logps/chosen": -2.2509732246398926, "logps/rejected": -2.2927937507629395, "loss": 3.488, "rewards/accuracies": 0.5, "rewards/chosen": -22.50973129272461, "rewards/margins": 0.4182073473930359, "rewards/rejected": -22.92793846130371, "step": 11175 }, { "epoch": 0.37682429471839296, "grad_norm": 65.736083984375, "learning_rate": 7.841775964796296e-07, "logits/chosen": -0.9328392148017883, "logits/rejected": -1.3494576215744019, "logps/chosen": -2.0066604614257812, "logps/rejected": -2.2377495765686035, "loss": 2.3296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.066606521606445, "rewards/margins": 2.3108911514282227, "rewards/rejected": -22.377498626708984, "step": 11180 }, { "epoch": 0.3769928207893761, "grad_norm": 51.13811492919922, "learning_rate": 7.8393553829661e-07, "logits/chosen": -0.9025689959526062, "logits/rejected": -1.1935182809829712, "logps/chosen": -1.9853508472442627, "logps/rejected": -2.0312693119049072, "loss": 3.7426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.85350799560547, "rewards/margins": 0.4591858983039856, "rewards/rejected": -20.312694549560547, "step": 11185 }, { "epoch": 0.3771613468603593, "grad_norm": 23.337675094604492, "learning_rate": 7.83693381854687e-07, "logits/chosen": -0.9358808398246765, "logits/rejected": -1.1639865636825562, "logps/chosen": -1.9129003286361694, "logps/rejected": -2.0121288299560547, "loss": 3.1006, "rewards/accuracies": 0.5, "rewards/chosen": -19.129003524780273, "rewards/margins": 0.992284893989563, "rewards/rejected": -20.121288299560547, "step": 11190 }, { "epoch": 0.37732987293134246, "grad_norm": 22.129470825195312, "learning_rate": 7.834511272376616e-07, "logits/chosen": -1.1202242374420166, "logits/rejected": -1.1665992736816406, "logps/chosen": -1.6695282459259033, "logps/rejected": -1.8295204639434814, "loss": 1.887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.695281982421875, "rewards/margins": 1.5999218225479126, "rewards/rejected": -18.295204162597656, "step": 11195 }, { "epoch": 0.3774983990023257, "grad_norm": 19.949989318847656, "learning_rate": 7.832087745293687e-07, "logits/chosen": -1.039284348487854, "logits/rejected": -1.0724128484725952, "logps/chosen": -1.8849290609359741, "logps/rejected": -2.1825642585754395, "loss": 1.0473, "rewards/accuracies": 1.0, "rewards/chosen": -18.849288940429688, "rewards/margins": 2.9763526916503906, "rewards/rejected": -21.82564353942871, "step": 11200 }, { "epoch": 0.3774983990023257, "eval_logits/chosen": -1.4173237085342407, "eval_logits/rejected": -1.5109046697616577, "eval_logps/chosen": -1.9158105850219727, "eval_logps/rejected": -1.9985766410827637, "eval_loss": 3.0528249740600586, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -19.158105850219727, "eval_rewards/margins": 0.8276617527008057, "eval_rewards/rejected": -19.985767364501953, "eval_runtime": 12.8962, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.939, "step": 11200 }, { "epoch": 0.37766692507330885, "grad_norm": 90.48750305175781, "learning_rate": 7.829663238136769e-07, "logits/chosen": -0.6559727787971497, "logits/rejected": -0.6415562629699707, "logps/chosen": -2.196298122406006, "logps/rejected": -2.155961275100708, "loss": 3.5922, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.96297836303711, "rewards/margins": -0.40336617827415466, "rewards/rejected": -21.559612274169922, "step": 11205 }, { "epoch": 0.377835451144292, "grad_norm": 19.182260513305664, "learning_rate": 7.827237751744889e-07, "logits/chosen": -0.9353266954421997, "logits/rejected": -1.0023210048675537, "logps/chosen": -1.7862240076065063, "logps/rejected": -1.8538305759429932, "loss": 2.629, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.862239837646484, "rewards/margins": 0.6760631799697876, "rewards/rejected": -18.53830337524414, "step": 11210 }, { "epoch": 0.3780039772152752, "grad_norm": 37.58772659301758, "learning_rate": 7.824811286957411e-07, "logits/chosen": -0.5556604862213135, "logits/rejected": -0.8810178637504578, "logps/chosen": -2.6270499229431152, "logps/rejected": -2.7347140312194824, "loss": 4.833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.2705020904541, "rewards/margins": 1.0766421556472778, "rewards/rejected": -27.347143173217773, "step": 11215 }, { "epoch": 0.3781725032862584, "grad_norm": 25.184528350830078, "learning_rate": 7.82238384461404e-07, "logits/chosen": -1.3108174800872803, "logits/rejected": -1.623822808265686, "logps/chosen": -2.8811333179473877, "logps/rejected": -3.0212135314941406, "loss": 4.4953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.811330795288086, "rewards/margins": 1.400800347328186, "rewards/rejected": -30.212133407592773, "step": 11220 }, { "epoch": 0.37834102935724157, "grad_norm": 97.33984375, "learning_rate": 7.819955425554818e-07, "logits/chosen": -1.1681185960769653, "logits/rejected": -1.3820760250091553, "logps/chosen": -2.2876718044281006, "logps/rejected": -2.4516873359680176, "loss": 3.3757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.876718521118164, "rewards/margins": 1.6401538848876953, "rewards/rejected": -24.51687240600586, "step": 11225 }, { "epoch": 0.37850955542822473, "grad_norm": 24.39796257019043, "learning_rate": 7.817526030620125e-07, "logits/chosen": -0.8352615237236023, "logits/rejected": -0.9630219340324402, "logps/chosen": -1.8876619338989258, "logps/rejected": -1.981488823890686, "loss": 2.2906, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.876617431640625, "rewards/margins": 0.9382714033126831, "rewards/rejected": -19.814889907836914, "step": 11230 }, { "epoch": 0.37867808149920795, "grad_norm": 38.77871322631836, "learning_rate": 7.815095660650679e-07, "logits/chosen": -1.0813668966293335, "logits/rejected": -1.2330832481384277, "logps/chosen": -1.8022918701171875, "logps/rejected": -1.9867006540298462, "loss": 1.755, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.022918701171875, "rewards/margins": 1.84408700466156, "rewards/rejected": -19.867008209228516, "step": 11235 }, { "epoch": 0.3788466075701911, "grad_norm": 39.55854034423828, "learning_rate": 7.812664316487534e-07, "logits/chosen": -1.3786113262176514, "logits/rejected": -1.407435655593872, "logps/chosen": -1.9177662134170532, "logps/rejected": -2.2436928749084473, "loss": 1.6302, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.177661895751953, "rewards/margins": 3.2592673301696777, "rewards/rejected": -22.43692970275879, "step": 11240 }, { "epoch": 0.3790151336411743, "grad_norm": 14.872421264648438, "learning_rate": 7.810231998972085e-07, "logits/chosen": -0.9885151982307434, "logits/rejected": -1.0631240606307983, "logps/chosen": -1.8154971599578857, "logps/rejected": -2.111203670501709, "loss": 1.7458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.15497398376465, "rewards/margins": 2.957064151763916, "rewards/rejected": -21.112035751342773, "step": 11245 }, { "epoch": 0.37918365971215745, "grad_norm": 20.53430938720703, "learning_rate": 7.80779870894606e-07, "logits/chosen": -0.9703305959701538, "logits/rejected": -1.236297369003296, "logps/chosen": -1.994520902633667, "logps/rejected": -2.0560126304626465, "loss": 2.6613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.945209503173828, "rewards/margins": 0.6149150729179382, "rewards/rejected": -20.560123443603516, "step": 11250 }, { "epoch": 0.37935218578314067, "grad_norm": 30.372365951538086, "learning_rate": 7.805364447251524e-07, "logits/chosen": -0.8832541704177856, "logits/rejected": -0.8103952407836914, "logps/chosen": -1.8758599758148193, "logps/rejected": -1.8299274444580078, "loss": 3.6637, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.758602142333984, "rewards/margins": -0.45932644605636597, "rewards/rejected": -18.29927635192871, "step": 11255 }, { "epoch": 0.37952071185412384, "grad_norm": 24.143922805786133, "learning_rate": 7.80292921473088e-07, "logits/chosen": -0.9485553503036499, "logits/rejected": -1.2246477603912354, "logps/chosen": -1.9528968334197998, "logps/rejected": -1.9845561981201172, "loss": 2.8398, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.528968811035156, "rewards/margins": 0.31659451127052307, "rewards/rejected": -19.845561981201172, "step": 11260 }, { "epoch": 0.379689237925107, "grad_norm": 19.331287384033203, "learning_rate": 7.800493012226865e-07, "logits/chosen": -1.026592493057251, "logits/rejected": -1.1320085525512695, "logps/chosen": -1.7972173690795898, "logps/rejected": -1.9342625141143799, "loss": 2.1796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.9721736907959, "rewards/margins": 1.370451807975769, "rewards/rejected": -19.34262466430664, "step": 11265 }, { "epoch": 0.37985776399609017, "grad_norm": 45.51408767700195, "learning_rate": 7.798055840582555e-07, "logits/chosen": -0.6429299116134644, "logits/rejected": -0.831339955329895, "logps/chosen": -2.9012444019317627, "logps/rejected": -2.4655685424804688, "loss": 7.7611, "rewards/accuracies": 0.5, "rewards/chosen": -29.0124454498291, "rewards/margins": -4.356759548187256, "rewards/rejected": -24.655685424804688, "step": 11270 }, { "epoch": 0.3800262900670734, "grad_norm": 75.51453399658203, "learning_rate": 7.795617700641356e-07, "logits/chosen": -0.9120687246322632, "logits/rejected": -0.9755932688713074, "logps/chosen": -2.571845769882202, "logps/rejected": -2.770507574081421, "loss": 2.619, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.718456268310547, "rewards/margins": 1.986619234085083, "rewards/rejected": -27.705074310302734, "step": 11275 }, { "epoch": 0.38019481613805656, "grad_norm": 8.264686584472656, "learning_rate": 7.793178593247014e-07, "logits/chosen": -1.3956291675567627, "logits/rejected": -1.326690435409546, "logps/chosen": -2.0324623584747314, "logps/rejected": -2.5627822875976562, "loss": 1.8257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.324626922607422, "rewards/margins": 5.303196430206299, "rewards/rejected": -25.627822875976562, "step": 11280 }, { "epoch": 0.3803633422090397, "grad_norm": 24.20513916015625, "learning_rate": 7.790738519243609e-07, "logits/chosen": -1.0150988101959229, "logits/rejected": -1.3087393045425415, "logps/chosen": -1.8914064168930054, "logps/rejected": -2.2175402641296387, "loss": 2.9225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.914064407348633, "rewards/margins": 3.2613399028778076, "rewards/rejected": -22.175403594970703, "step": 11285 }, { "epoch": 0.38053186828002294, "grad_norm": 31.742076873779297, "learning_rate": 7.788297479475552e-07, "logits/chosen": -1.392856240272522, "logits/rejected": -1.4201936721801758, "logps/chosen": -1.9414689540863037, "logps/rejected": -2.150960922241211, "loss": 2.3635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.414690017700195, "rewards/margins": 2.094921112060547, "rewards/rejected": -21.509611129760742, "step": 11290 }, { "epoch": 0.3807003943510061, "grad_norm": 24.79743194580078, "learning_rate": 7.785855474787593e-07, "logits/chosen": -0.5785871744155884, "logits/rejected": -0.6766026020050049, "logps/chosen": -2.581320285797119, "logps/rejected": -2.841749668121338, "loss": 3.0005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.813201904296875, "rewards/margins": 2.604294538497925, "rewards/rejected": -28.417495727539062, "step": 11295 }, { "epoch": 0.3808689204219893, "grad_norm": 33.3655891418457, "learning_rate": 7.783412506024811e-07, "logits/chosen": -1.11223566532135, "logits/rejected": -1.057945966720581, "logps/chosen": -2.075849771499634, "logps/rejected": -2.0295917987823486, "loss": 3.6025, "rewards/accuracies": 0.5, "rewards/chosen": -20.75849723815918, "rewards/margins": -0.46258020401000977, "rewards/rejected": -20.295917510986328, "step": 11300 }, { "epoch": 0.38103744649297244, "grad_norm": 21.387447357177734, "learning_rate": 7.780968574032625e-07, "logits/chosen": -1.4144738912582397, "logits/rejected": -1.3412866592407227, "logps/chosen": -1.6566816568374634, "logps/rejected": -1.8312675952911377, "loss": 2.6812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.566818237304688, "rewards/margins": 1.7458572387695312, "rewards/rejected": -18.312673568725586, "step": 11305 }, { "epoch": 0.38120597256395566, "grad_norm": 24.164539337158203, "learning_rate": 7.778523679656779e-07, "logits/chosen": -1.141021490097046, "logits/rejected": -1.0232211351394653, "logps/chosen": -2.0743701457977295, "logps/rejected": -2.0848662853240967, "loss": 3.68, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.743701934814453, "rewards/margins": 0.10496292263269424, "rewards/rejected": -20.848665237426758, "step": 11310 }, { "epoch": 0.38137449863493883, "grad_norm": 21.93690299987793, "learning_rate": 7.776077823743357e-07, "logits/chosen": -1.4113761186599731, "logits/rejected": -1.4901399612426758, "logps/chosen": -1.8558003902435303, "logps/rejected": -1.9158865213394165, "loss": 3.2107, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.55800437927246, "rewards/margins": 0.6008610725402832, "rewards/rejected": -19.158864974975586, "step": 11315 }, { "epoch": 0.381543024705922, "grad_norm": 23.51809310913086, "learning_rate": 7.773631007138774e-07, "logits/chosen": -1.0101337432861328, "logits/rejected": -1.177026391029358, "logps/chosen": -1.8464853763580322, "logps/rejected": -1.8554184436798096, "loss": 3.04, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.464853286743164, "rewards/margins": 0.08933134377002716, "rewards/rejected": -18.554183959960938, "step": 11320 }, { "epoch": 0.38171155077690516, "grad_norm": 17.446304321289062, "learning_rate": 7.771183230689777e-07, "logits/chosen": -0.7555183172225952, "logits/rejected": -0.8380460739135742, "logps/chosen": -1.8387556076049805, "logps/rejected": -2.0808956623077393, "loss": 1.905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.387554168701172, "rewards/margins": 2.421400547027588, "rewards/rejected": -20.808956146240234, "step": 11325 }, { "epoch": 0.3818800768478884, "grad_norm": 85.44268035888672, "learning_rate": 7.768734495243443e-07, "logits/chosen": -1.2770483493804932, "logits/rejected": -1.438614845275879, "logps/chosen": -2.207958698272705, "logps/rejected": -2.1826369762420654, "loss": 3.6512, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.079586029052734, "rewards/margins": -0.25321730971336365, "rewards/rejected": -21.826370239257812, "step": 11330 }, { "epoch": 0.38204860291887155, "grad_norm": 24.92569351196289, "learning_rate": 7.766284801647185e-07, "logits/chosen": -0.9356335401535034, "logits/rejected": -0.9447436332702637, "logps/chosen": -1.8274080753326416, "logps/rejected": -2.022212028503418, "loss": 2.2281, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.274080276489258, "rewards/margins": 1.948040246963501, "rewards/rejected": -20.22212028503418, "step": 11335 }, { "epoch": 0.3822171289898547, "grad_norm": 37.96074295043945, "learning_rate": 7.763834150748744e-07, "logits/chosen": -0.9241237640380859, "logits/rejected": -1.1175363063812256, "logps/chosen": -1.5214978456497192, "logps/rejected": -1.6261125802993774, "loss": 3.0458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.21497917175293, "rewards/margins": 1.046147108078003, "rewards/rejected": -16.261127471923828, "step": 11340 }, { "epoch": 0.38238565506083794, "grad_norm": 30.894428253173828, "learning_rate": 7.761382543396194e-07, "logits/chosen": -1.182045340538025, "logits/rejected": -1.2811795473098755, "logps/chosen": -1.7931095361709595, "logps/rejected": -1.8102737665176392, "loss": 2.9517, "rewards/accuracies": 0.5, "rewards/chosen": -17.931095123291016, "rewards/margins": 0.17164134979248047, "rewards/rejected": -18.102737426757812, "step": 11345 }, { "epoch": 0.3825541811318211, "grad_norm": 55.25584030151367, "learning_rate": 7.758929980437938e-07, "logits/chosen": -0.961665153503418, "logits/rejected": -1.0043509006500244, "logps/chosen": -1.9752800464630127, "logps/rejected": -2.2029144763946533, "loss": 2.7367, "rewards/accuracies": 0.5, "rewards/chosen": -19.752798080444336, "rewards/margins": 2.276346206665039, "rewards/rejected": -22.029144287109375, "step": 11350 }, { "epoch": 0.38272270720280427, "grad_norm": 17.4051513671875, "learning_rate": 7.756476462722716e-07, "logits/chosen": -0.5869064331054688, "logits/rejected": -0.6633475422859192, "logps/chosen": -2.171950578689575, "logps/rejected": -2.3819758892059326, "loss": 2.1582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.719507217407227, "rewards/margins": 2.1002535820007324, "rewards/rejected": -23.819759368896484, "step": 11355 }, { "epoch": 0.38289123327378743, "grad_norm": 29.156442642211914, "learning_rate": 7.75402199109959e-07, "logits/chosen": -1.1302787065505981, "logits/rejected": -1.266941785812378, "logps/chosen": -2.077214002609253, "logps/rejected": -2.3803865909576416, "loss": 2.6726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.772140502929688, "rewards/margins": 3.0317249298095703, "rewards/rejected": -23.80386734008789, "step": 11360 }, { "epoch": 0.38305975934477066, "grad_norm": 17.059425354003906, "learning_rate": 7.751566566417957e-07, "logits/chosen": -0.5787457823753357, "logits/rejected": -0.9798294305801392, "logps/chosen": -1.8301093578338623, "logps/rejected": -2.0004382133483887, "loss": 2.7989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.301095962524414, "rewards/margins": 1.7032867670059204, "rewards/rejected": -20.00438117980957, "step": 11365 }, { "epoch": 0.3832282854157538, "grad_norm": 47.650630950927734, "learning_rate": 7.749110189527543e-07, "logits/chosen": -1.153541088104248, "logits/rejected": -1.209975242614746, "logps/chosen": -1.8833907842636108, "logps/rejected": -2.014538288116455, "loss": 3.4553, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -18.833908081054688, "rewards/margins": 1.3114765882492065, "rewards/rejected": -20.145383834838867, "step": 11370 }, { "epoch": 0.383396811486737, "grad_norm": 19.723957061767578, "learning_rate": 7.746652861278403e-07, "logits/chosen": -0.9713567495346069, "logits/rejected": -1.041154384613037, "logps/chosen": -2.3281097412109375, "logps/rejected": -2.4205708503723145, "loss": 3.5797, "rewards/accuracies": 0.5, "rewards/chosen": -23.281097412109375, "rewards/margins": 0.9246100187301636, "rewards/rejected": -24.205707550048828, "step": 11375 }, { "epoch": 0.38356533755772015, "grad_norm": 32.127601623535156, "learning_rate": 7.744194582520922e-07, "logits/chosen": -1.2071858644485474, "logits/rejected": -1.1609976291656494, "logps/chosen": -2.1086649894714355, "logps/rejected": -2.12287974357605, "loss": 3.3836, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.086650848388672, "rewards/margins": 0.14214667677879333, "rewards/rejected": -21.228797912597656, "step": 11380 }, { "epoch": 0.3837338636287034, "grad_norm": 28.18889045715332, "learning_rate": 7.741735354105812e-07, "logits/chosen": -0.9639381170272827, "logits/rejected": -0.873441219329834, "logps/chosen": -1.969211220741272, "logps/rejected": -2.263988733291626, "loss": 2.5078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.69211196899414, "rewards/margins": 2.9477756023406982, "rewards/rejected": -22.639888763427734, "step": 11385 }, { "epoch": 0.38390238969968654, "grad_norm": 19.48329734802246, "learning_rate": 7.739275176884117e-07, "logits/chosen": -1.3636186122894287, "logits/rejected": -1.2289994955062866, "logps/chosen": -2.4750638008117676, "logps/rejected": -2.5021533966064453, "loss": 3.0527, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.75063705444336, "rewards/margins": 0.2708970010280609, "rewards/rejected": -25.021533966064453, "step": 11390 }, { "epoch": 0.3840709157706697, "grad_norm": 13.585640907287598, "learning_rate": 7.736814051707204e-07, "logits/chosen": -1.298323392868042, "logits/rejected": -1.3097971677780151, "logps/chosen": -1.9604408740997314, "logps/rejected": -2.163069486618042, "loss": 2.75, "rewards/accuracies": 0.5, "rewards/chosen": -19.604406356811523, "rewards/margins": 2.026287794113159, "rewards/rejected": -21.630695343017578, "step": 11395 }, { "epoch": 0.38423944184165293, "grad_norm": 19.65452003479004, "learning_rate": 7.734351979426776e-07, "logits/chosen": -0.9546276330947876, "logits/rejected": -1.0646032094955444, "logps/chosen": -1.8960742950439453, "logps/rejected": -2.45497465133667, "loss": 2.2316, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.960744857788086, "rewards/margins": 5.5890021324157715, "rewards/rejected": -24.549747467041016, "step": 11400 }, { "epoch": 0.3844079679126361, "grad_norm": 19.066781997680664, "learning_rate": 7.731888960894857e-07, "logits/chosen": -1.4005488157272339, "logits/rejected": -1.523158311843872, "logps/chosen": -2.310615301132202, "logps/rejected": -2.218581438064575, "loss": 4.5198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.106151580810547, "rewards/margins": -0.92033851146698, "rewards/rejected": -22.185815811157227, "step": 11405 }, { "epoch": 0.38457649398361926, "grad_norm": 22.03188705444336, "learning_rate": 7.7294249969638e-07, "logits/chosen": -1.5152934789657593, "logits/rejected": -1.56234610080719, "logps/chosen": -2.118349313735962, "logps/rejected": -2.1849474906921387, "loss": 2.6433, "rewards/accuracies": 0.5, "rewards/chosen": -21.18349266052246, "rewards/margins": 0.665981650352478, "rewards/rejected": -21.84947395324707, "step": 11410 }, { "epoch": 0.3847450200546024, "grad_norm": 33.175025939941406, "learning_rate": 7.726960088486288e-07, "logits/chosen": -0.6917494535446167, "logits/rejected": -0.6354162693023682, "logps/chosen": -2.822929859161377, "logps/rejected": -2.600625514984131, "loss": 5.3058, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -28.229299545288086, "rewards/margins": -2.2230448722839355, "rewards/rejected": -26.006256103515625, "step": 11415 }, { "epoch": 0.38491354612558565, "grad_norm": 41.19036102294922, "learning_rate": 7.724494236315327e-07, "logits/chosen": -1.2923592329025269, "logits/rejected": -1.0900766849517822, "logps/chosen": -2.182579755783081, "logps/rejected": -2.3524978160858154, "loss": 3.9432, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.8257999420166, "rewards/margins": 1.6991780996322632, "rewards/rejected": -23.524974822998047, "step": 11420 }, { "epoch": 0.3850820721965688, "grad_norm": 23.587520599365234, "learning_rate": 7.722027441304251e-07, "logits/chosen": -1.6018108129501343, "logits/rejected": -1.6550190448760986, "logps/chosen": -1.9862916469573975, "logps/rejected": -2.0539584159851074, "loss": 2.6674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.862918853759766, "rewards/margins": 0.6766681671142578, "rewards/rejected": -20.53958511352539, "step": 11425 }, { "epoch": 0.385250598267552, "grad_norm": 23.030256271362305, "learning_rate": 7.719559704306719e-07, "logits/chosen": -0.8091068267822266, "logits/rejected": -0.777170717716217, "logps/chosen": -2.224095582962036, "logps/rejected": -2.261972665786743, "loss": 3.4069, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.240955352783203, "rewards/margins": 0.3787725865840912, "rewards/rejected": -22.61972427368164, "step": 11430 }, { "epoch": 0.38541912433853515, "grad_norm": 23.55622100830078, "learning_rate": 7.717091026176724e-07, "logits/chosen": -1.5814132690429688, "logits/rejected": -1.5999425649642944, "logps/chosen": -2.2364578247070312, "logps/rejected": -2.3260645866394043, "loss": 3.1145, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.364580154418945, "rewards/margins": 0.8960673213005066, "rewards/rejected": -23.26064682006836, "step": 11435 }, { "epoch": 0.38558765040951837, "grad_norm": 24.407276153564453, "learning_rate": 7.714621407768571e-07, "logits/chosen": -1.0618751049041748, "logits/rejected": -1.2050310373306274, "logps/chosen": -1.7198562622070312, "logps/rejected": -1.9511902332305908, "loss": 2.3609, "rewards/accuracies": 0.5, "rewards/chosen": -17.198562622070312, "rewards/margins": 2.3133392333984375, "rewards/rejected": -19.51190185546875, "step": 11440 }, { "epoch": 0.38575617648050153, "grad_norm": 33.41534423828125, "learning_rate": 7.712150849936902e-07, "logits/chosen": -1.1403298377990723, "logits/rejected": -0.9351509213447571, "logps/chosen": -2.2158753871917725, "logps/rejected": -2.1572763919830322, "loss": 3.8557, "rewards/accuracies": 0.5, "rewards/chosen": -22.158754348754883, "rewards/margins": -0.5859910249710083, "rewards/rejected": -21.572763442993164, "step": 11445 }, { "epoch": 0.3859247025514847, "grad_norm": 25.274030685424805, "learning_rate": 7.709679353536678e-07, "logits/chosen": -1.1853322982788086, "logits/rejected": -0.947810173034668, "logps/chosen": -2.2994515895843506, "logps/rejected": -2.129556894302368, "loss": 5.0149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.994516372680664, "rewards/margins": -1.6989485025405884, "rewards/rejected": -21.295568466186523, "step": 11450 }, { "epoch": 0.3860932286224679, "grad_norm": 26.36060905456543, "learning_rate": 7.707206919423186e-07, "logits/chosen": -1.122575044631958, "logits/rejected": -1.1204006671905518, "logps/chosen": -2.4006028175354004, "logps/rejected": -2.2933762073516846, "loss": 4.2671, "rewards/accuracies": 0.5, "rewards/chosen": -24.006031036376953, "rewards/margins": -1.0722652673721313, "rewards/rejected": -22.93376350402832, "step": 11455 }, { "epoch": 0.3862617546934511, "grad_norm": 19.837413787841797, "learning_rate": 7.704733548452041e-07, "logits/chosen": -1.243242859840393, "logits/rejected": -1.3733158111572266, "logps/chosen": -1.903545618057251, "logps/rejected": -1.9424865245819092, "loss": 3.0499, "rewards/accuracies": 0.5, "rewards/chosen": -19.03545570373535, "rewards/margins": 0.38941067457199097, "rewards/rejected": -19.42486572265625, "step": 11460 }, { "epoch": 0.38643028076443425, "grad_norm": 20.840614318847656, "learning_rate": 7.702259241479174e-07, "logits/chosen": -1.3639594316482544, "logits/rejected": -1.418534517288208, "logps/chosen": -1.9076082706451416, "logps/rejected": -2.043626308441162, "loss": 2.1244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.076082229614258, "rewards/margins": 1.3601806163787842, "rewards/rejected": -20.436264038085938, "step": 11465 }, { "epoch": 0.3865988068354174, "grad_norm": 22.049299240112305, "learning_rate": 7.69978399936085e-07, "logits/chosen": -1.047729730606079, "logits/rejected": -1.1679027080535889, "logps/chosen": -1.8407936096191406, "logps/rejected": -2.011669397354126, "loss": 2.3953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.407936096191406, "rewards/margins": 1.7087585926055908, "rewards/rejected": -20.116695404052734, "step": 11470 }, { "epoch": 0.38676733290640064, "grad_norm": 19.621408462524414, "learning_rate": 7.697307822953651e-07, "logits/chosen": -0.9885059595108032, "logits/rejected": -0.8718851208686829, "logps/chosen": -2.2795028686523438, "logps/rejected": -2.277373790740967, "loss": 3.3437, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.795028686523438, "rewards/margins": -0.021291160956025124, "rewards/rejected": -22.773738861083984, "step": 11475 }, { "epoch": 0.3869358589773838, "grad_norm": 77.31005859375, "learning_rate": 7.694830713114484e-07, "logits/chosen": -0.5937200784683228, "logits/rejected": -0.6495085954666138, "logps/chosen": -3.7535088062286377, "logps/rejected": -3.6097118854522705, "loss": 5.3228, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -37.53508758544922, "rewards/margins": -1.4379686117172241, "rewards/rejected": -36.09711837768555, "step": 11480 }, { "epoch": 0.387104385048367, "grad_norm": 95.22547912597656, "learning_rate": 7.69235267070058e-07, "logits/chosen": -0.9418071508407593, "logits/rejected": -0.9999884366989136, "logps/chosen": -2.5587856769561768, "logps/rejected": -2.812378406524658, "loss": 2.48, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.587852478027344, "rewards/margins": 2.535930871963501, "rewards/rejected": -28.1237850189209, "step": 11485 }, { "epoch": 0.38727291111935014, "grad_norm": 20.330053329467773, "learning_rate": 7.689873696569491e-07, "logits/chosen": -1.4741865396499634, "logits/rejected": -1.4686228036880493, "logps/chosen": -1.855538010597229, "logps/rejected": -2.09028959274292, "loss": 1.9487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.555377960205078, "rewards/margins": 2.347515821456909, "rewards/rejected": -20.902894973754883, "step": 11490 }, { "epoch": 0.38744143719033336, "grad_norm": 26.7628231048584, "learning_rate": 7.687393791579092e-07, "logits/chosen": -0.7385457754135132, "logits/rejected": -0.8334264755249023, "logps/chosen": -1.7570949792861938, "logps/rejected": -2.526315212249756, "loss": 2.4006, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.570947647094727, "rewards/margins": 7.692204475402832, "rewards/rejected": -25.263153076171875, "step": 11495 }, { "epoch": 0.3876099632613165, "grad_norm": 24.974245071411133, "learning_rate": 7.684912956587581e-07, "logits/chosen": -0.8599656224250793, "logits/rejected": -1.045178771018982, "logps/chosen": -1.721909761428833, "logps/rejected": -1.6989558935165405, "loss": 3.4335, "rewards/accuracies": 0.5, "rewards/chosen": -17.219097137451172, "rewards/margins": -0.22953709959983826, "rewards/rejected": -16.989561080932617, "step": 11500 }, { "epoch": 0.3877784893322997, "grad_norm": 36.48255157470703, "learning_rate": 7.682431192453476e-07, "logits/chosen": -1.283911943435669, "logits/rejected": -1.2216994762420654, "logps/chosen": -1.7604789733886719, "logps/rejected": -1.7060072422027588, "loss": 3.5949, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.60478973388672, "rewards/margins": -0.5447174310684204, "rewards/rejected": -17.06007194519043, "step": 11505 }, { "epoch": 0.3879470154032829, "grad_norm": 24.400299072265625, "learning_rate": 7.67994850003562e-07, "logits/chosen": -1.1598316431045532, "logits/rejected": -1.2359213829040527, "logps/chosen": -1.6177875995635986, "logps/rejected": -1.597001075744629, "loss": 4.4583, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.177875518798828, "rewards/margins": -0.20786476135253906, "rewards/rejected": -15.970010757446289, "step": 11510 }, { "epoch": 0.3881155414742661, "grad_norm": 19.435991287231445, "learning_rate": 7.677464880193173e-07, "logits/chosen": -1.1459027528762817, "logits/rejected": -1.52254319190979, "logps/chosen": -1.9466493129730225, "logps/rejected": -2.2855138778686523, "loss": 2.229, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.466493606567383, "rewards/margins": 3.388643980026245, "rewards/rejected": -22.85513687133789, "step": 11515 }, { "epoch": 0.38828406754524925, "grad_norm": 14.826272010803223, "learning_rate": 7.67498033378562e-07, "logits/chosen": -0.6543978452682495, "logits/rejected": -0.8185670971870422, "logps/chosen": -1.837378740310669, "logps/rejected": -1.9322576522827148, "loss": 2.5036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.37378692626953, "rewards/margins": 0.9487897753715515, "rewards/rejected": -19.322574615478516, "step": 11520 }, { "epoch": 0.3884525936162324, "grad_norm": 28.684703826904297, "learning_rate": 7.672494861672763e-07, "logits/chosen": -0.9026684761047363, "logits/rejected": -0.8441116213798523, "logps/chosen": -2.0589985847473145, "logps/rejected": -1.9286243915557861, "loss": 4.411, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.589984893798828, "rewards/margins": -1.303741693496704, "rewards/rejected": -19.286243438720703, "step": 11525 }, { "epoch": 0.38862111968721563, "grad_norm": 74.1058120727539, "learning_rate": 7.670008464714725e-07, "logits/chosen": -1.1678255796432495, "logits/rejected": -1.188652753829956, "logps/chosen": -1.9451920986175537, "logps/rejected": -1.9647331237792969, "loss": 3.2948, "rewards/accuracies": 0.5, "rewards/chosen": -19.451923370361328, "rewards/margins": 0.19540786743164062, "rewards/rejected": -19.647327423095703, "step": 11530 }, { "epoch": 0.3887896457581988, "grad_norm": 36.787261962890625, "learning_rate": 7.667521143771954e-07, "logits/chosen": -1.0473954677581787, "logits/rejected": -1.2901098728179932, "logps/chosen": -1.8299624919891357, "logps/rejected": -2.346789598464966, "loss": 1.9515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.299625396728516, "rewards/margins": 5.168271064758301, "rewards/rejected": -23.4678955078125, "step": 11535 }, { "epoch": 0.38895817182918196, "grad_norm": 38.38254928588867, "learning_rate": 7.665032899705211e-07, "logits/chosen": -1.1778606176376343, "logits/rejected": -1.1312202215194702, "logps/chosen": -2.3744866847991943, "logps/rejected": -2.650757312774658, "loss": 2.4871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.7448673248291, "rewards/margins": 2.762705087661743, "rewards/rejected": -26.507572174072266, "step": 11540 }, { "epoch": 0.38912669790016513, "grad_norm": 20.765256881713867, "learning_rate": 7.662543733375577e-07, "logits/chosen": -1.1510334014892578, "logits/rejected": -1.017518401145935, "logps/chosen": -1.8481746912002563, "logps/rejected": -1.763033151626587, "loss": 4.0422, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.481746673583984, "rewards/margins": -0.8514149785041809, "rewards/rejected": -17.63033103942871, "step": 11545 }, { "epoch": 0.38929522397114835, "grad_norm": 25.51265525817871, "learning_rate": 7.66005364564446e-07, "logits/chosen": -1.084987759590149, "logits/rejected": -1.028355360031128, "logps/chosen": -2.0981199741363525, "logps/rejected": -2.294259548187256, "loss": 2.1252, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.981199264526367, "rewards/margins": 1.961395263671875, "rewards/rejected": -22.942594528198242, "step": 11550 }, { "epoch": 0.3894637500421315, "grad_norm": 15.384166717529297, "learning_rate": 7.657562637373577e-07, "logits/chosen": -1.0935142040252686, "logits/rejected": -0.9374657869338989, "logps/chosen": -1.8439613580703735, "logps/rejected": -1.926548719406128, "loss": 2.879, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.439613342285156, "rewards/margins": 0.8258728981018066, "rewards/rejected": -19.265485763549805, "step": 11555 }, { "epoch": 0.3896322761131147, "grad_norm": 27.608064651489258, "learning_rate": 7.655070709424969e-07, "logits/chosen": -1.1121046543121338, "logits/rejected": -1.1762913465499878, "logps/chosen": -1.8197288513183594, "logps/rejected": -1.8982279300689697, "loss": 3.2518, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.197288513183594, "rewards/margins": 0.7849894762039185, "rewards/rejected": -18.98227882385254, "step": 11560 }, { "epoch": 0.3898008021840979, "grad_norm": 24.960180282592773, "learning_rate": 7.652577862660994e-07, "logits/chosen": -0.9853776097297668, "logits/rejected": -1.296339750289917, "logps/chosen": -1.872950553894043, "logps/rejected": -2.0671050548553467, "loss": 2.1694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.729503631591797, "rewards/margins": 1.9415457248687744, "rewards/rejected": -20.671052932739258, "step": 11565 }, { "epoch": 0.38996932825508107, "grad_norm": 5.3271379470825195, "learning_rate": 7.650084097944327e-07, "logits/chosen": -1.2342129945755005, "logits/rejected": -1.6344501972198486, "logps/chosen": -1.9482825994491577, "logps/rejected": -2.2002644538879395, "loss": 3.2104, "rewards/accuracies": 0.5, "rewards/chosen": -19.482826232910156, "rewards/margins": 2.5198206901550293, "rewards/rejected": -22.00264549255371, "step": 11570 }, { "epoch": 0.39013785432606424, "grad_norm": 18.309335708618164, "learning_rate": 7.647589416137965e-07, "logits/chosen": -1.1587319374084473, "logits/rejected": -1.1315759420394897, "logps/chosen": -2.09301495552063, "logps/rejected": -2.0794355869293213, "loss": 3.4546, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.93014907836914, "rewards/margins": -0.13579444587230682, "rewards/rejected": -20.794353485107422, "step": 11575 }, { "epoch": 0.3903063803970474, "grad_norm": 41.83700942993164, "learning_rate": 7.645093818105215e-07, "logits/chosen": -0.998862624168396, "logits/rejected": -0.9568171501159668, "logps/chosen": -1.9338080883026123, "logps/rejected": -2.008169651031494, "loss": 2.6539, "rewards/accuracies": 0.5, "rewards/chosen": -19.338083267211914, "rewards/margins": 0.7436147928237915, "rewards/rejected": -20.081695556640625, "step": 11580 }, { "epoch": 0.3904749064680306, "grad_norm": 32.795169830322266, "learning_rate": 7.642597304709708e-07, "logits/chosen": -1.2452964782714844, "logits/rejected": -1.3635139465332031, "logps/chosen": -2.2852675914764404, "logps/rejected": -2.5104193687438965, "loss": 2.4394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.852676391601562, "rewards/margins": 2.2515177726745605, "rewards/rejected": -25.104196548461914, "step": 11585 }, { "epoch": 0.3906434325390138, "grad_norm": 199.022705078125, "learning_rate": 7.640099876815388e-07, "logits/chosen": -0.74217689037323, "logits/rejected": -1.0585310459136963, "logps/chosen": -2.1841442584991455, "logps/rejected": -2.2506299018859863, "loss": 2.7874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.841442108154297, "rewards/margins": 0.6648585200309753, "rewards/rejected": -22.50629997253418, "step": 11590 }, { "epoch": 0.39081195860999696, "grad_norm": 32.575645446777344, "learning_rate": 7.637601535286516e-07, "logits/chosen": -0.9163684844970703, "logits/rejected": -1.3957068920135498, "logps/chosen": -2.022493839263916, "logps/rejected": -2.0056674480438232, "loss": 3.5664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.224937438964844, "rewards/margins": -0.1682640016078949, "rewards/rejected": -20.05667495727539, "step": 11595 }, { "epoch": 0.3909804846809801, "grad_norm": 20.356990814208984, "learning_rate": 7.635102280987671e-07, "logits/chosen": -0.9212282299995422, "logits/rejected": -0.7368286848068237, "logps/chosen": -2.0380892753601074, "logps/rejected": -2.2226696014404297, "loss": 2.8106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.38089370727539, "rewards/margins": 1.8458038568496704, "rewards/rejected": -22.226696014404297, "step": 11600 }, { "epoch": 0.3909804846809801, "eval_logits/chosen": -1.420649528503418, "eval_logits/rejected": -1.5138413906097412, "eval_logps/chosen": -1.9176331758499146, "eval_logps/rejected": -1.999894380569458, "eval_loss": 3.043593406677246, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -19.176332473754883, "eval_rewards/margins": 0.8226120471954346, "eval_rewards/rejected": -19.998943328857422, "eval_runtime": 12.8886, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.94, "step": 11600 }, { "epoch": 0.39114901075196334, "grad_norm": 20.27068328857422, "learning_rate": 7.632602114783744e-07, "logits/chosen": -1.3960545063018799, "logits/rejected": -1.362420678138733, "logps/chosen": -1.963235855102539, "logps/rejected": -1.9808452129364014, "loss": 3.4921, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.63235855102539, "rewards/margins": 0.17609205842018127, "rewards/rejected": -19.80845069885254, "step": 11605 }, { "epoch": 0.3913175368229465, "grad_norm": 29.96604347229004, "learning_rate": 7.630101037539947e-07, "logits/chosen": -1.1657246351242065, "logits/rejected": -1.2024915218353271, "logps/chosen": -1.7683417797088623, "logps/rejected": -1.9547497034072876, "loss": 2.4799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.683420181274414, "rewards/margins": 1.864076018333435, "rewards/rejected": -19.547494888305664, "step": 11610 }, { "epoch": 0.3914860628939297, "grad_norm": 19.694589614868164, "learning_rate": 7.627599050121803e-07, "logits/chosen": -1.076236367225647, "logits/rejected": -1.0020965337753296, "logps/chosen": -1.7998813390731812, "logps/rejected": -1.9322502613067627, "loss": 2.5581, "rewards/accuracies": 0.5, "rewards/chosen": -17.99881362915039, "rewards/margins": 1.32368803024292, "rewards/rejected": -19.322500228881836, "step": 11615 }, { "epoch": 0.3916545889649129, "grad_norm": 29.93548011779785, "learning_rate": 7.625096153395149e-07, "logits/chosen": -1.3330743312835693, "logits/rejected": -1.4957467317581177, "logps/chosen": -1.6043914556503296, "logps/rejected": -1.6570093631744385, "loss": 2.6907, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.043914794921875, "rewards/margins": 0.5261794328689575, "rewards/rejected": -16.57009506225586, "step": 11620 }, { "epoch": 0.39182311503589606, "grad_norm": 16.487661361694336, "learning_rate": 7.622592348226142e-07, "logits/chosen": -1.1505143642425537, "logits/rejected": -1.137687087059021, "logps/chosen": -1.5276297330856323, "logps/rejected": -1.5988701581954956, "loss": 2.6423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.276298522949219, "rewards/margins": 0.7124035954475403, "rewards/rejected": -15.988700866699219, "step": 11625 }, { "epoch": 0.39199164110687923, "grad_norm": 27.717079162597656, "learning_rate": 7.62008763548125e-07, "logits/chosen": -0.8346315622329712, "logits/rejected": -1.1313467025756836, "logps/chosen": -1.8140102624893188, "logps/rejected": -2.153585910797119, "loss": 2.2458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.140100479125977, "rewards/margins": 3.395756483078003, "rewards/rejected": -21.535858154296875, "step": 11630 }, { "epoch": 0.3921601671778624, "grad_norm": 18.077486038208008, "learning_rate": 7.617582016027253e-07, "logits/chosen": -0.8229688405990601, "logits/rejected": -0.9524946212768555, "logps/chosen": -1.5641162395477295, "logps/rejected": -1.6118942499160767, "loss": 2.71, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.641161918640137, "rewards/margins": 0.4777793884277344, "rewards/rejected": -16.118942260742188, "step": 11635 }, { "epoch": 0.3923286932488456, "grad_norm": 21.34409523010254, "learning_rate": 7.615075490731249e-07, "logits/chosen": -0.8470916748046875, "logits/rejected": -1.0112509727478027, "logps/chosen": -2.080671787261963, "logps/rejected": -2.3466696739196777, "loss": 3.2121, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.806716918945312, "rewards/margins": 2.6599812507629395, "rewards/rejected": -23.466699600219727, "step": 11640 }, { "epoch": 0.3924972193198288, "grad_norm": 12.055615425109863, "learning_rate": 7.612568060460649e-07, "logits/chosen": -1.058789610862732, "logits/rejected": -1.077118158340454, "logps/chosen": -1.869264006614685, "logps/rejected": -2.0772647857666016, "loss": 2.7164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.69264030456543, "rewards/margins": 2.080007553100586, "rewards/rejected": -20.772647857666016, "step": 11645 }, { "epoch": 0.39266574539081195, "grad_norm": 22.117664337158203, "learning_rate": 7.610059726083174e-07, "logits/chosen": -0.7841507196426392, "logits/rejected": -0.8706964254379272, "logps/chosen": -2.1044468879699707, "logps/rejected": -2.599332094192505, "loss": 2.7502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.04446792602539, "rewards/margins": 4.948855400085449, "rewards/rejected": -25.993322372436523, "step": 11650 }, { "epoch": 0.3928342714617951, "grad_norm": 30.5588321685791, "learning_rate": 7.60755048846686e-07, "logits/chosen": -0.9382207989692688, "logits/rejected": -0.816710352897644, "logps/chosen": -1.7673962116241455, "logps/rejected": -1.7424449920654297, "loss": 3.5536, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.673961639404297, "rewards/margins": -0.24951085448265076, "rewards/rejected": -17.424449920654297, "step": 11655 }, { "epoch": 0.39300279753277834, "grad_norm": 22.62909507751465, "learning_rate": 7.605040348480054e-07, "logits/chosen": -1.1503394842147827, "logits/rejected": -1.2069244384765625, "logps/chosen": -2.2256691455841064, "logps/rejected": -2.470478057861328, "loss": 3.4489, "rewards/accuracies": 0.5, "rewards/chosen": -22.25669288635254, "rewards/margins": 2.448086977005005, "rewards/rejected": -24.70477867126465, "step": 11660 }, { "epoch": 0.3931713236037615, "grad_norm": 24.27674674987793, "learning_rate": 7.602529306991418e-07, "logits/chosen": -1.1156190633773804, "logits/rejected": -1.256882905960083, "logps/chosen": -2.3534419536590576, "logps/rejected": -2.3057656288146973, "loss": 3.8305, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.534420013427734, "rewards/margins": -0.47676581144332886, "rewards/rejected": -23.057653427124023, "step": 11665 }, { "epoch": 0.39333984967474467, "grad_norm": 34.294864654541016, "learning_rate": 7.600017364869926e-07, "logits/chosen": -1.274646520614624, "logits/rejected": -1.1494067907333374, "logps/chosen": -2.619175672531128, "logps/rejected": -2.60260272026062, "loss": 3.4273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.191753387451172, "rewards/margins": -0.1657283753156662, "rewards/rejected": -26.026025772094727, "step": 11670 }, { "epoch": 0.3935083757457279, "grad_norm": 45.172237396240234, "learning_rate": 7.59750452298486e-07, "logits/chosen": -0.9087691307067871, "logits/rejected": -1.0363165140151978, "logps/chosen": -2.048802375793457, "logps/rejected": -2.136763095855713, "loss": 2.8038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.48802375793457, "rewards/margins": 0.8796059489250183, "rewards/rejected": -21.367630004882812, "step": 11675 }, { "epoch": 0.39367690181671106, "grad_norm": 142.33331298828125, "learning_rate": 7.594990782205817e-07, "logits/chosen": -0.9688955545425415, "logits/rejected": -1.3358403444290161, "logps/chosen": -2.2000503540039062, "logps/rejected": -2.168553352355957, "loss": 4.0627, "rewards/accuracies": 0.5, "rewards/chosen": -22.00050163269043, "rewards/margins": -0.31496915221214294, "rewards/rejected": -21.68553352355957, "step": 11680 }, { "epoch": 0.3938454278876942, "grad_norm": 48.449283599853516, "learning_rate": 7.592476143402702e-07, "logits/chosen": -1.389552116394043, "logits/rejected": -1.2141063213348389, "logps/chosen": -2.1961960792541504, "logps/rejected": -2.036904811859131, "loss": 4.7629, "rewards/accuracies": 0.5, "rewards/chosen": -21.961963653564453, "rewards/margins": -1.5929114818572998, "rewards/rejected": -20.369050979614258, "step": 11685 }, { "epoch": 0.3940139539586774, "grad_norm": 15.973644256591797, "learning_rate": 7.589960607445734e-07, "logits/chosen": -0.7461063265800476, "logits/rejected": -1.0605086088180542, "logps/chosen": -2.184495449066162, "logps/rejected": -2.7327818870544434, "loss": 1.8069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.844953536987305, "rewards/margins": 5.482865333557129, "rewards/rejected": -27.32781982421875, "step": 11690 }, { "epoch": 0.3941824800296606, "grad_norm": 15.492846488952637, "learning_rate": 7.587444175205439e-07, "logits/chosen": -1.1022851467132568, "logits/rejected": -1.0290096998214722, "logps/chosen": -2.023465633392334, "logps/rejected": -2.073293924331665, "loss": 2.9022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.234655380249023, "rewards/margins": 0.49828261137008667, "rewards/rejected": -20.732938766479492, "step": 11695 }, { "epoch": 0.3943510061006438, "grad_norm": 22.443706512451172, "learning_rate": 7.584926847552656e-07, "logits/chosen": -1.0928064584732056, "logits/rejected": -0.9536017179489136, "logps/chosen": -1.8068549633026123, "logps/rejected": -1.7727839946746826, "loss": 3.444, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.06855010986328, "rewards/margins": -0.3407100737094879, "rewards/rejected": -17.727840423583984, "step": 11700 }, { "epoch": 0.39451953217162694, "grad_norm": 105.63274383544922, "learning_rate": 7.582408625358534e-07, "logits/chosen": -0.6659219264984131, "logits/rejected": -0.7687760591506958, "logps/chosen": -2.302253007888794, "logps/rejected": -2.3893723487854004, "loss": 3.26, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.02252960205078, "rewards/margins": 0.8711929321289062, "rewards/rejected": -23.893722534179688, "step": 11705 }, { "epoch": 0.3946880582426101, "grad_norm": 9.452350616455078, "learning_rate": 7.579889509494528e-07, "logits/chosen": -0.9181884527206421, "logits/rejected": -1.3316329717636108, "logps/chosen": -1.9574193954467773, "logps/rejected": -2.146073579788208, "loss": 2.8546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.574193954467773, "rewards/margins": 1.8865394592285156, "rewards/rejected": -21.460735321044922, "step": 11710 }, { "epoch": 0.39485658431359333, "grad_norm": 41.55025100708008, "learning_rate": 7.577369500832408e-07, "logits/chosen": -0.987427830696106, "logits/rejected": -1.0094937086105347, "logps/chosen": -2.231602907180786, "logps/rejected": -2.1909897327423096, "loss": 3.7208, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.316028594970703, "rewards/margins": -0.4061313569545746, "rewards/rejected": -21.909896850585938, "step": 11715 }, { "epoch": 0.3950251103845765, "grad_norm": 22.197267532348633, "learning_rate": 7.574848600244249e-07, "logits/chosen": -1.7265431880950928, "logits/rejected": -1.9097391366958618, "logps/chosen": -1.936183214187622, "logps/rejected": -2.317960262298584, "loss": 2.1239, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.361831665039062, "rewards/margins": 3.8177738189697266, "rewards/rejected": -23.17960548400879, "step": 11720 }, { "epoch": 0.39519363645555966, "grad_norm": 102.47838592529297, "learning_rate": 7.572326808602433e-07, "logits/chosen": -1.2377512454986572, "logits/rejected": -1.3128893375396729, "logps/chosen": -2.288329601287842, "logps/rejected": -2.1615147590637207, "loss": 4.3777, "rewards/accuracies": 0.5, "rewards/chosen": -22.8832950592041, "rewards/margins": -1.2681492567062378, "rewards/rejected": -21.61514663696289, "step": 11725 }, { "epoch": 0.3953621625265429, "grad_norm": 20.766952514648438, "learning_rate": 7.569804126779653e-07, "logits/chosen": -1.397312879562378, "logits/rejected": -1.6373573541641235, "logps/chosen": -1.943434476852417, "logps/rejected": -2.0884037017822266, "loss": 2.0901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.43434715270996, "rewards/margins": 1.4496896266937256, "rewards/rejected": -20.884037017822266, "step": 11730 }, { "epoch": 0.39553068859752605, "grad_norm": 37.036781311035156, "learning_rate": 7.567280555648914e-07, "logits/chosen": -1.0928919315338135, "logits/rejected": -1.3050086498260498, "logps/chosen": -1.9236781597137451, "logps/rejected": -2.1296448707580566, "loss": 1.8989, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.23678207397461, "rewards/margins": 2.059666395187378, "rewards/rejected": -21.296449661254883, "step": 11735 }, { "epoch": 0.3956992146685092, "grad_norm": 29.56365966796875, "learning_rate": 7.564756096083519e-07, "logits/chosen": -0.8099555969238281, "logits/rejected": -0.937663197517395, "logps/chosen": -1.5575193166732788, "logps/rejected": -1.8885425329208374, "loss": 2.7032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.575193405151367, "rewards/margins": 3.3102316856384277, "rewards/rejected": -18.885425567626953, "step": 11740 }, { "epoch": 0.3958677407394924, "grad_norm": 30.615787506103516, "learning_rate": 7.562230748957086e-07, "logits/chosen": -1.2532612085342407, "logits/rejected": -1.6562509536743164, "logps/chosen": -1.9327198266983032, "logps/rejected": -1.9758819341659546, "loss": 3.1936, "rewards/accuracies": 0.5, "rewards/chosen": -19.327198028564453, "rewards/margins": 0.4316198229789734, "rewards/rejected": -19.758817672729492, "step": 11745 }, { "epoch": 0.3960362668104756, "grad_norm": 17.918991088867188, "learning_rate": 7.559704515143541e-07, "logits/chosen": -0.8716901540756226, "logits/rejected": -1.0966124534606934, "logps/chosen": -2.116118907928467, "logps/rejected": -2.3539252281188965, "loss": 1.909, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.161190032958984, "rewards/margins": 2.3780627250671387, "rewards/rejected": -23.53925132751465, "step": 11750 }, { "epoch": 0.39620479288145877, "grad_norm": 26.732141494750977, "learning_rate": 7.557177395517111e-07, "logits/chosen": -0.8900313377380371, "logits/rejected": -1.020237684249878, "logps/chosen": -2.0022330284118652, "logps/rejected": -1.9952160120010376, "loss": 3.2698, "rewards/accuracies": 0.5, "rewards/chosen": -20.02233123779297, "rewards/margins": -0.07017116248607635, "rewards/rejected": -19.952159881591797, "step": 11755 }, { "epoch": 0.39637331895244193, "grad_norm": 17.14510726928711, "learning_rate": 7.554649390952333e-07, "logits/chosen": -1.1153548955917358, "logits/rejected": -1.2645814418792725, "logps/chosen": -2.1429028511047363, "logps/rejected": -2.2042899131774902, "loss": 2.8233, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.429027557373047, "rewards/margins": 0.6138709187507629, "rewards/rejected": -22.042898178100586, "step": 11760 }, { "epoch": 0.3965418450234251, "grad_norm": 42.62935256958008, "learning_rate": 7.552120502324048e-07, "logits/chosen": -1.1245473623275757, "logits/rejected": -1.1443026065826416, "logps/chosen": -1.7413899898529053, "logps/rejected": -1.7725694179534912, "loss": 3.104, "rewards/accuracies": 0.5, "rewards/chosen": -17.413898468017578, "rewards/margins": 0.31179675459861755, "rewards/rejected": -17.72569465637207, "step": 11765 }, { "epoch": 0.3967103710944083, "grad_norm": 0.033805444836616516, "learning_rate": 7.549590730507409e-07, "logits/chosen": -1.0588490962982178, "logits/rejected": -1.0548990964889526, "logps/chosen": -1.8918966054916382, "logps/rejected": -2.3487842082977295, "loss": 1.0045, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.91896629333496, "rewards/margins": 4.568876266479492, "rewards/rejected": -23.487842559814453, "step": 11770 }, { "epoch": 0.3968788971653915, "grad_norm": 16.685335159301758, "learning_rate": 7.547060076377868e-07, "logits/chosen": -0.6314164400100708, "logits/rejected": -0.7681409120559692, "logps/chosen": -1.8253930807113647, "logps/rejected": -1.9405262470245361, "loss": 2.4047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.25393295288086, "rewards/margins": 1.1513316631317139, "rewards/rejected": -19.405263900756836, "step": 11775 }, { "epoch": 0.39704742323637465, "grad_norm": 37.280887603759766, "learning_rate": 7.544528540811183e-07, "logits/chosen": -1.0419279336929321, "logits/rejected": -0.9182440638542175, "logps/chosen": -2.2263355255126953, "logps/rejected": -2.186289072036743, "loss": 3.752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.263357162475586, "rewards/margins": -0.40046462416648865, "rewards/rejected": -21.862892150878906, "step": 11780 }, { "epoch": 0.3972159493073579, "grad_norm": 13.038125991821289, "learning_rate": 7.541996124683423e-07, "logits/chosen": -1.2539036273956299, "logits/rejected": -1.3483214378356934, "logps/chosen": -2.142634153366089, "logps/rejected": -2.682434558868408, "loss": 1.4145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.426342010498047, "rewards/margins": 5.398002624511719, "rewards/rejected": -26.824344635009766, "step": 11785 }, { "epoch": 0.39738447537834104, "grad_norm": 36.69297790527344, "learning_rate": 7.539462828870953e-07, "logits/chosen": -0.9368730783462524, "logits/rejected": -1.0406259298324585, "logps/chosen": -1.7773082256317139, "logps/rejected": -2.0813655853271484, "loss": 1.8501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.773082733154297, "rewards/margins": 3.04057240486145, "rewards/rejected": -20.813655853271484, "step": 11790 }, { "epoch": 0.3975530014493242, "grad_norm": 17.663898468017578, "learning_rate": 7.53692865425045e-07, "logits/chosen": -0.9241794347763062, "logits/rejected": -0.6411947011947632, "logps/chosen": -1.8682501316070557, "logps/rejected": -2.117549419403076, "loss": 3.3426, "rewards/accuracies": 0.5, "rewards/chosen": -18.6825008392334, "rewards/margins": 2.4929943084716797, "rewards/rejected": -21.175495147705078, "step": 11795 }, { "epoch": 0.39772152752030737, "grad_norm": 17.26961898803711, "learning_rate": 7.53439360169889e-07, "logits/chosen": -0.8073530197143555, "logits/rejected": -0.9707645177841187, "logps/chosen": -2.1161324977874756, "logps/rejected": -2.29624605178833, "loss": 2.9547, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.161327362060547, "rewards/margins": 1.8011353015899658, "rewards/rejected": -22.96246337890625, "step": 11800 }, { "epoch": 0.3978900535912906, "grad_norm": 17.637361526489258, "learning_rate": 7.531857672093556e-07, "logits/chosen": -1.259456992149353, "logits/rejected": -1.3594366312026978, "logps/chosen": -1.818107008934021, "logps/rejected": -2.2726337909698486, "loss": 2.2051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.18107032775879, "rewards/margins": 4.5452680587768555, "rewards/rejected": -22.726337432861328, "step": 11805 }, { "epoch": 0.39805857966227376, "grad_norm": 34.132537841796875, "learning_rate": 7.529320866312032e-07, "logits/chosen": -1.1170294284820557, "logits/rejected": -1.1737979650497437, "logps/chosen": -1.8334858417510986, "logps/rejected": -1.8769190311431885, "loss": 4.1332, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.334856033325195, "rewards/margins": 0.4343327581882477, "rewards/rejected": -18.769189834594727, "step": 11810 }, { "epoch": 0.3982271057332569, "grad_norm": 11.358888626098633, "learning_rate": 7.526783185232207e-07, "logits/chosen": -0.5799717307090759, "logits/rejected": -0.7505512237548828, "logps/chosen": -2.1831164360046387, "logps/rejected": -2.3155195713043213, "loss": 2.3181, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.831165313720703, "rewards/margins": 1.3240302801132202, "rewards/rejected": -23.155197143554688, "step": 11815 }, { "epoch": 0.3983956318042401, "grad_norm": 31.22028923034668, "learning_rate": 7.524244629732275e-07, "logits/chosen": -1.0025193691253662, "logits/rejected": -1.0711981058120728, "logps/chosen": -1.750725507736206, "logps/rejected": -1.9163103103637695, "loss": 2.8004, "rewards/accuracies": 0.5, "rewards/chosen": -17.50725746154785, "rewards/margins": 1.6558473110198975, "rewards/rejected": -19.163105010986328, "step": 11820 }, { "epoch": 0.3985641578752233, "grad_norm": 27.10166358947754, "learning_rate": 7.521705200690727e-07, "logits/chosen": -1.5414470434188843, "logits/rejected": -1.6252628564834595, "logps/chosen": -2.156869411468506, "logps/rejected": -2.4458861351013184, "loss": 3.313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.568696975708008, "rewards/margins": 2.8901631832122803, "rewards/rejected": -24.458858489990234, "step": 11825 }, { "epoch": 0.3987326839462065, "grad_norm": 31.27543067932129, "learning_rate": 7.519164898986358e-07, "logits/chosen": -0.794817328453064, "logits/rejected": -0.8538041114807129, "logps/chosen": -1.9069769382476807, "logps/rejected": -2.101755142211914, "loss": 2.3943, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.06976890563965, "rewards/margins": 1.9477818012237549, "rewards/rejected": -21.01755142211914, "step": 11830 }, { "epoch": 0.39890121001718964, "grad_norm": 61.70077896118164, "learning_rate": 7.516623725498272e-07, "logits/chosen": -0.8156031370162964, "logits/rejected": -1.2082624435424805, "logps/chosen": -2.378671884536743, "logps/rejected": -3.244913101196289, "loss": 1.4, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.78671646118164, "rewards/margins": 8.662416458129883, "rewards/rejected": -32.449134826660156, "step": 11835 }, { "epoch": 0.39906973608817287, "grad_norm": 18.984455108642578, "learning_rate": 7.514081681105864e-07, "logits/chosen": -1.3033349514007568, "logits/rejected": -1.254258394241333, "logps/chosen": -1.8369280099868774, "logps/rejected": -1.799774169921875, "loss": 4.5203, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.369279861450195, "rewards/margins": -0.3715387284755707, "rewards/rejected": -17.997739791870117, "step": 11840 }, { "epoch": 0.39923826215915603, "grad_norm": 24.976802825927734, "learning_rate": 7.511538766688838e-07, "logits/chosen": -1.1798292398452759, "logits/rejected": -1.2531414031982422, "logps/chosen": -1.946840524673462, "logps/rejected": -2.2215023040771484, "loss": 2.4009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.46840476989746, "rewards/margins": 2.7466187477111816, "rewards/rejected": -22.215023040771484, "step": 11845 }, { "epoch": 0.3994067882301392, "grad_norm": 26.44159507751465, "learning_rate": 7.508994983127194e-07, "logits/chosen": -0.9524277448654175, "logits/rejected": -1.117555022239685, "logps/chosen": -2.0292279720306396, "logps/rejected": -2.4175662994384766, "loss": 2.1058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.292278289794922, "rewards/margins": 3.8833839893341064, "rewards/rejected": -24.175662994384766, "step": 11850 }, { "epoch": 0.39957531430112236, "grad_norm": 21.922222137451172, "learning_rate": 7.506450331301237e-07, "logits/chosen": -1.2023600339889526, "logits/rejected": -1.229527235031128, "logps/chosen": -1.674254059791565, "logps/rejected": -1.7711979150772095, "loss": 2.8766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.742542266845703, "rewards/margins": 0.9694374799728394, "rewards/rejected": -17.711978912353516, "step": 11855 }, { "epoch": 0.3997438403721056, "grad_norm": 34.8781623840332, "learning_rate": 7.503904812091572e-07, "logits/chosen": -1.010766863822937, "logits/rejected": -1.1134612560272217, "logps/chosen": -2.0631964206695557, "logps/rejected": -1.935153603553772, "loss": 4.7799, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -20.6319637298584, "rewards/margins": -1.2804267406463623, "rewards/rejected": -19.35153579711914, "step": 11860 }, { "epoch": 0.39991236644308875, "grad_norm": 23.074020385742188, "learning_rate": 7.501358426379101e-07, "logits/chosen": -1.3153374195098877, "logits/rejected": -1.2573680877685547, "logps/chosen": -2.031836986541748, "logps/rejected": -1.9721667766571045, "loss": 3.8169, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.318370819091797, "rewards/margins": -0.5966991186141968, "rewards/rejected": -19.721668243408203, "step": 11865 }, { "epoch": 0.4000808925140719, "grad_norm": 18.008769989013672, "learning_rate": 7.498811175045028e-07, "logits/chosen": -0.9966435432434082, "logits/rejected": -0.9778487086296082, "logps/chosen": -2.4542758464813232, "logps/rejected": -2.3798739910125732, "loss": 4.6834, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.54275894165039, "rewards/margins": -0.7440201640129089, "rewards/rejected": -23.79874038696289, "step": 11870 }, { "epoch": 0.4002494185850551, "grad_norm": 19.766666412353516, "learning_rate": 7.496263058970855e-07, "logits/chosen": -1.0324809551239014, "logits/rejected": -1.2260249853134155, "logps/chosen": -1.6027014255523682, "logps/rejected": -1.7053003311157227, "loss": 2.4804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.027013778686523, "rewards/margins": 1.025989055633545, "rewards/rejected": -17.053003311157227, "step": 11875 }, { "epoch": 0.4004179446560383, "grad_norm": 23.002748489379883, "learning_rate": 7.493714079038388e-07, "logits/chosen": -0.8100983500480652, "logits/rejected": -1.1202142238616943, "logps/chosen": -2.186525821685791, "logps/rejected": -2.3713173866271973, "loss": 2.4701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.86526107788086, "rewards/margins": 1.8479118347167969, "rewards/rejected": -23.713171005249023, "step": 11880 }, { "epoch": 0.40058647072702147, "grad_norm": 22.952287673950195, "learning_rate": 7.491164236129726e-07, "logits/chosen": -1.3353387117385864, "logits/rejected": -1.3661134243011475, "logps/chosen": -1.7970765829086304, "logps/rejected": -2.0575413703918457, "loss": 3.5522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.970767974853516, "rewards/margins": 2.6046481132507324, "rewards/rejected": -20.57541275024414, "step": 11885 }, { "epoch": 0.40075499679800464, "grad_norm": 14.258259773254395, "learning_rate": 7.48861353112727e-07, "logits/chosen": -0.8800445795059204, "logits/rejected": -1.0589618682861328, "logps/chosen": -1.7092921733856201, "logps/rejected": -1.914419412612915, "loss": 2.247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.09292221069336, "rewards/margins": 2.051269054412842, "rewards/rejected": -19.14419174194336, "step": 11890 }, { "epoch": 0.40092352286898786, "grad_norm": 26.45533561706543, "learning_rate": 7.486061964913719e-07, "logits/chosen": -1.2647030353546143, "logits/rejected": -1.3658673763275146, "logps/chosen": -1.7642624378204346, "logps/rejected": -1.9021352529525757, "loss": 2.7179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.642620086669922, "rewards/margins": 1.3787298202514648, "rewards/rejected": -19.021352767944336, "step": 11895 }, { "epoch": 0.401092048939971, "grad_norm": 17.83465003967285, "learning_rate": 7.483509538372067e-07, "logits/chosen": -1.247041940689087, "logits/rejected": -1.282518744468689, "logps/chosen": -2.0411016941070557, "logps/rejected": -2.189762830734253, "loss": 2.4295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.4110164642334, "rewards/margins": 1.486612319946289, "rewards/rejected": -21.897628784179688, "step": 11900 }, { "epoch": 0.4012605750109542, "grad_norm": 19.71072769165039, "learning_rate": 7.480956252385612e-07, "logits/chosen": -0.6247833371162415, "logits/rejected": -0.8317705392837524, "logps/chosen": -2.314605951309204, "logps/rejected": -2.5528736114501953, "loss": 1.5118, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.146060943603516, "rewards/margins": 2.3826773166656494, "rewards/rejected": -25.528738021850586, "step": 11905 }, { "epoch": 0.40142910108193736, "grad_norm": 28.8355770111084, "learning_rate": 7.478402107837942e-07, "logits/chosen": -1.1492893695831299, "logits/rejected": -1.1314630508422852, "logps/chosen": -1.678655982017517, "logps/rejected": -1.749389886856079, "loss": 2.5565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.786558151245117, "rewards/margins": 0.7073384523391724, "rewards/rejected": -17.493896484375, "step": 11910 }, { "epoch": 0.4015976271529206, "grad_norm": 20.623319625854492, "learning_rate": 7.47584710561295e-07, "logits/chosen": -1.0180232524871826, "logits/rejected": -1.1039505004882812, "logps/chosen": -1.974373459815979, "logps/rejected": -2.00376558303833, "loss": 3.6947, "rewards/accuracies": 0.5, "rewards/chosen": -19.743732452392578, "rewards/margins": 0.2939223349094391, "rewards/rejected": -20.037654876708984, "step": 11915 }, { "epoch": 0.40176615322390374, "grad_norm": 22.25886344909668, "learning_rate": 7.473291246594819e-07, "logits/chosen": -1.4031606912612915, "logits/rejected": -1.4442576169967651, "logps/chosen": -1.717380166053772, "logps/rejected": -1.8811886310577393, "loss": 1.8976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.17380142211914, "rewards/margins": 1.6380836963653564, "rewards/rejected": -18.8118839263916, "step": 11920 }, { "epoch": 0.4019346792948869, "grad_norm": 40.460548400878906, "learning_rate": 7.470734531668029e-07, "logits/chosen": -0.8854770660400391, "logits/rejected": -0.9225967526435852, "logps/chosen": -2.049858570098877, "logps/rejected": -2.3521180152893066, "loss": 1.7058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.498584747314453, "rewards/margins": 3.0225963592529297, "rewards/rejected": -23.521181106567383, "step": 11925 }, { "epoch": 0.4021032053658701, "grad_norm": 26.445152282714844, "learning_rate": 7.468176961717363e-07, "logits/chosen": -0.9759553670883179, "logits/rejected": -0.9978090524673462, "logps/chosen": -1.817072868347168, "logps/rejected": -1.8944177627563477, "loss": 2.4529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.170730590820312, "rewards/margins": 0.7734484672546387, "rewards/rejected": -18.944177627563477, "step": 11930 }, { "epoch": 0.4022717314368533, "grad_norm": 25.400487899780273, "learning_rate": 7.465618537627891e-07, "logits/chosen": -1.1339561939239502, "logits/rejected": -1.1581547260284424, "logps/chosen": -2.5781798362731934, "logps/rejected": -2.6100564002990723, "loss": 3.1132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.78179931640625, "rewards/margins": 0.31876450777053833, "rewards/rejected": -26.100561141967773, "step": 11935 }, { "epoch": 0.40244025750783646, "grad_norm": 90.3258285522461, "learning_rate": 7.463059260284985e-07, "logits/chosen": -0.9859679937362671, "logits/rejected": -1.0732853412628174, "logps/chosen": -2.2488198280334473, "logps/rejected": -2.398329496383667, "loss": 2.7178, "rewards/accuracies": 0.5, "rewards/chosen": -22.48819923400879, "rewards/margins": 1.4950984716415405, "rewards/rejected": -23.983295440673828, "step": 11940 }, { "epoch": 0.40260878357881963, "grad_norm": 34.835697174072266, "learning_rate": 7.46049913057431e-07, "logits/chosen": -0.9262846112251282, "logits/rejected": -1.155128836631775, "logps/chosen": -2.1297969818115234, "logps/rejected": -2.6779632568359375, "loss": 3.041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.2979679107666, "rewards/margins": 5.481663227081299, "rewards/rejected": -26.779632568359375, "step": 11945 }, { "epoch": 0.40277730964980285, "grad_norm": 12.751852035522461, "learning_rate": 7.457938149381826e-07, "logits/chosen": -1.1583601236343384, "logits/rejected": -1.1672272682189941, "logps/chosen": -1.930572509765625, "logps/rejected": -1.9824788570404053, "loss": 3.419, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.30572509765625, "rewards/margins": 0.5190634727478027, "rewards/rejected": -19.82478904724121, "step": 11950 }, { "epoch": 0.402945835720786, "grad_norm": 12.714435577392578, "learning_rate": 7.455376317593787e-07, "logits/chosen": -1.2571144104003906, "logits/rejected": -1.3536908626556396, "logps/chosen": -2.014531373977661, "logps/rejected": -2.2088634967803955, "loss": 2.8636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.145313262939453, "rewards/margins": 1.94331955909729, "rewards/rejected": -22.088634490966797, "step": 11955 }, { "epoch": 0.4031143617917692, "grad_norm": 22.396446228027344, "learning_rate": 7.452813636096742e-07, "logits/chosen": -1.0473554134368896, "logits/rejected": -1.0027496814727783, "logps/chosen": -2.0327858924865723, "logps/rejected": -2.114520311355591, "loss": 3.3383, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.32785987854004, "rewards/margins": 0.8173434138298035, "rewards/rejected": -21.14520263671875, "step": 11960 }, { "epoch": 0.40328288786275235, "grad_norm": 16.245067596435547, "learning_rate": 7.450250105777536e-07, "logits/chosen": -0.6577657461166382, "logits/rejected": -0.7657932043075562, "logps/chosen": -2.4727909564971924, "logps/rejected": -2.837554931640625, "loss": 3.0212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.727909088134766, "rewards/margins": 3.6476409435272217, "rewards/rejected": -28.37554931640625, "step": 11965 }, { "epoch": 0.40345141393373557, "grad_norm": 23.989412307739258, "learning_rate": 7.447685727523303e-07, "logits/chosen": -0.9820396304130554, "logits/rejected": -0.8650194406509399, "logps/chosen": -1.8049871921539307, "logps/rejected": -1.7340768575668335, "loss": 3.8176, "rewards/accuracies": 0.5, "rewards/chosen": -18.049869537353516, "rewards/margins": -0.7091019749641418, "rewards/rejected": -17.340768814086914, "step": 11970 }, { "epoch": 0.40361994000471874, "grad_norm": 27.99403953552246, "learning_rate": 7.445120502221475e-07, "logits/chosen": -0.820398211479187, "logits/rejected": -0.7926065921783447, "logps/chosen": -1.9640032052993774, "logps/rejected": -1.9566271305084229, "loss": 3.3637, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.640033721923828, "rewards/margins": -0.07375955581665039, "rewards/rejected": -19.566272735595703, "step": 11975 }, { "epoch": 0.4037884660757019, "grad_norm": 19.738489151000977, "learning_rate": 7.442554430759775e-07, "logits/chosen": -1.0407991409301758, "logits/rejected": -1.423341989517212, "logps/chosen": -1.6015815734863281, "logps/rejected": -1.8600505590438843, "loss": 2.8412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.01581382751465, "rewards/margins": 2.5846915245056152, "rewards/rejected": -18.600505828857422, "step": 11980 }, { "epoch": 0.40395699214668507, "grad_norm": 31.152938842773438, "learning_rate": 7.43998751402622e-07, "logits/chosen": -1.1023541688919067, "logits/rejected": -1.0939910411834717, "logps/chosen": -2.0495901107788086, "logps/rejected": -2.3085341453552246, "loss": 2.8457, "rewards/accuracies": 0.5, "rewards/chosen": -20.495901107788086, "rewards/margins": 2.5894412994384766, "rewards/rejected": -23.085342407226562, "step": 11985 }, { "epoch": 0.4041255182176683, "grad_norm": 25.51861000061035, "learning_rate": 7.437419752909119e-07, "logits/chosen": -1.249093770980835, "logits/rejected": -1.0771114826202393, "logps/chosen": -2.0163919925689697, "logps/rejected": -2.0079128742218018, "loss": 4.7058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.163917541503906, "rewards/margins": -0.08479070663452148, "rewards/rejected": -20.07912826538086, "step": 11990 }, { "epoch": 0.40429404428865146, "grad_norm": 23.77742576599121, "learning_rate": 7.43485114829707e-07, "logits/chosen": -1.5242725610733032, "logits/rejected": -1.4089118242263794, "logps/chosen": -2.0811915397644043, "logps/rejected": -2.0558857917785645, "loss": 4.1171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.811914443969727, "rewards/margins": -0.25305813550949097, "rewards/rejected": -20.558856964111328, "step": 11995 }, { "epoch": 0.4044625703596346, "grad_norm": 26.791170120239258, "learning_rate": 7.432281701078969e-07, "logits/chosen": -1.3711198568344116, "logits/rejected": -1.464496374130249, "logps/chosen": -1.9427284002304077, "logps/rejected": -1.997841477394104, "loss": 3.0344, "rewards/accuracies": 0.5, "rewards/chosen": -19.427284240722656, "rewards/margins": 0.5511296987533569, "rewards/rejected": -19.97841453552246, "step": 12000 }, { "epoch": 0.4044625703596346, "eval_logits/chosen": -1.465684175491333, "eval_logits/rejected": -1.5627810955047607, "eval_logps/chosen": -1.9252618551254272, "eval_logps/rejected": -2.010789394378662, "eval_loss": 3.0332751274108887, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": -19.252620697021484, "eval_rewards/margins": 0.855275571346283, "eval_rewards/rejected": -20.107894897460938, "eval_runtime": 12.907, "eval_samples_per_second": 7.748, "eval_steps_per_second": 1.937, "step": 12000 }, { "epoch": 0.40463109643061784, "grad_norm": 24.16160774230957, "learning_rate": 7.429711412143999e-07, "logits/chosen": -1.0915724039077759, "logits/rejected": -1.1342499256134033, "logps/chosen": -1.7461715936660767, "logps/rejected": -1.81634521484375, "loss": 2.5254, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.461713790893555, "rewards/margins": 0.7017372846603394, "rewards/rejected": -18.1634521484375, "step": 12005 }, { "epoch": 0.404799622501601, "grad_norm": 11.7499361038208, "learning_rate": 7.427140282381636e-07, "logits/chosen": -0.9528292417526245, "logits/rejected": -1.1537973880767822, "logps/chosen": -1.9370794296264648, "logps/rejected": -2.340834140777588, "loss": 1.3718, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.37079429626465, "rewards/margins": 4.0375471115112305, "rewards/rejected": -23.408340454101562, "step": 12010 }, { "epoch": 0.4049681485725842, "grad_norm": 45.47167205810547, "learning_rate": 7.424568312681647e-07, "logits/chosen": -0.9921124577522278, "logits/rejected": -1.0518230199813843, "logps/chosen": -1.770341157913208, "logps/rejected": -2.001453399658203, "loss": 2.4102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.703411102294922, "rewards/margins": 2.3111231327056885, "rewards/rejected": -20.0145320892334, "step": 12015 }, { "epoch": 0.40513667464356734, "grad_norm": 21.313961029052734, "learning_rate": 7.421995503934088e-07, "logits/chosen": -1.3454720973968506, "logits/rejected": -1.3453900814056396, "logps/chosen": -1.8339992761611938, "logps/rejected": -1.7264811992645264, "loss": 4.1263, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.33999252319336, "rewards/margins": -1.075181245803833, "rewards/rejected": -17.264812469482422, "step": 12020 }, { "epoch": 0.40530520071455056, "grad_norm": 12.074272155761719, "learning_rate": 7.419421857029309e-07, "logits/chosen": -1.1453709602355957, "logits/rejected": -1.3053287267684937, "logps/chosen": -2.2835354804992676, "logps/rejected": -2.569051742553711, "loss": 1.9534, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.835357666015625, "rewards/margins": 2.8551602363586426, "rewards/rejected": -25.69051742553711, "step": 12025 }, { "epoch": 0.40547372678553373, "grad_norm": 19.331594467163086, "learning_rate": 7.416847372857946e-07, "logits/chosen": -1.0492010116577148, "logits/rejected": -1.131423830986023, "logps/chosen": -2.292367458343506, "logps/rejected": -2.3346362113952637, "loss": 3.5757, "rewards/accuracies": 0.5, "rewards/chosen": -22.923675537109375, "rewards/margins": 0.4226861894130707, "rewards/rejected": -23.346363067626953, "step": 12030 }, { "epoch": 0.4056422528565169, "grad_norm": 31.929597854614258, "learning_rate": 7.414272052310928e-07, "logits/chosen": -1.2143497467041016, "logits/rejected": -1.1757112741470337, "logps/chosen": -2.2437641620635986, "logps/rejected": -2.503958225250244, "loss": 3.0289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.437641143798828, "rewards/margins": 2.6019396781921387, "rewards/rejected": -25.039579391479492, "step": 12035 }, { "epoch": 0.40581077892750006, "grad_norm": 22.63580894470215, "learning_rate": 7.41169589627947e-07, "logits/chosen": -1.4139041900634766, "logits/rejected": -1.4436061382293701, "logps/chosen": -1.89009690284729, "logps/rejected": -2.185490131378174, "loss": 2.9563, "rewards/accuracies": 0.5, "rewards/chosen": -18.90096664428711, "rewards/margins": 2.953932523727417, "rewards/rejected": -21.854900360107422, "step": 12040 }, { "epoch": 0.4059793049984833, "grad_norm": 21.127553939819336, "learning_rate": 7.409118905655082e-07, "logits/chosen": -0.6792327761650085, "logits/rejected": -0.8694466352462769, "logps/chosen": -1.8247032165527344, "logps/rejected": -1.8372104167938232, "loss": 3.0029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.247034072875977, "rewards/margins": 0.12507256865501404, "rewards/rejected": -18.37210464477539, "step": 12045 }, { "epoch": 0.40614783106946645, "grad_norm": 57.487403869628906, "learning_rate": 7.406541081329554e-07, "logits/chosen": -1.2932124137878418, "logits/rejected": -1.4253662824630737, "logps/chosen": -2.3572797775268555, "logps/rejected": -2.522393226623535, "loss": 2.8901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.572799682617188, "rewards/margins": 1.6511356830596924, "rewards/rejected": -25.223934173583984, "step": 12050 }, { "epoch": 0.4063163571404496, "grad_norm": 6.55502462387085, "learning_rate": 7.403962424194973e-07, "logits/chosen": -1.4770417213439941, "logits/rejected": -1.7608951330184937, "logps/chosen": -2.6661248207092285, "logps/rejected": -3.192875385284424, "loss": 1.4518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.6612491607666, "rewards/margins": 5.2675065994262695, "rewards/rejected": -31.928752899169922, "step": 12055 }, { "epoch": 0.40648488321143283, "grad_norm": 26.68470001220703, "learning_rate": 7.401382935143709e-07, "logits/chosen": -0.8198683857917786, "logits/rejected": -0.9025104641914368, "logps/chosen": -1.8631515502929688, "logps/rejected": -1.8569438457489014, "loss": 3.1761, "rewards/accuracies": 0.5, "rewards/chosen": -18.63151741027832, "rewards/margins": -0.06207876279950142, "rewards/rejected": -18.56943702697754, "step": 12060 }, { "epoch": 0.406653409282416, "grad_norm": 22.029172897338867, "learning_rate": 7.398802615068421e-07, "logits/chosen": -1.0925521850585938, "logits/rejected": -1.1846771240234375, "logps/chosen": -1.9000976085662842, "logps/rejected": -1.9701240062713623, "loss": 2.6078, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.000974655151367, "rewards/margins": 0.7002660632133484, "rewards/rejected": -19.70124053955078, "step": 12065 }, { "epoch": 0.40682193535339917, "grad_norm": 52.73628234863281, "learning_rate": 7.396221464862058e-07, "logits/chosen": -1.0964667797088623, "logits/rejected": -0.6679283976554871, "logps/chosen": -2.2612481117248535, "logps/rejected": -1.9707530736923218, "loss": 6.3113, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.61248207092285, "rewards/margins": -2.904949188232422, "rewards/rejected": -19.707530975341797, "step": 12070 }, { "epoch": 0.40699046142438233, "grad_norm": 15.641594886779785, "learning_rate": 7.393639485417852e-07, "logits/chosen": -0.9506348371505737, "logits/rejected": -0.9982539415359497, "logps/chosen": -1.7841243743896484, "logps/rejected": -1.9900413751602173, "loss": 1.9571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.841243743896484, "rewards/margins": 2.0591704845428467, "rewards/rejected": -19.900415420532227, "step": 12075 }, { "epoch": 0.40715898749536555, "grad_norm": 23.61943817138672, "learning_rate": 7.391056677629327e-07, "logits/chosen": -1.5612033605575562, "logits/rejected": -1.3875303268432617, "logps/chosen": -2.126805543899536, "logps/rejected": -2.2525746822357178, "loss": 2.73, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.268054962158203, "rewards/margins": 1.2576922178268433, "rewards/rejected": -22.525747299194336, "step": 12080 }, { "epoch": 0.4073275135663487, "grad_norm": 44.1659049987793, "learning_rate": 7.388473042390289e-07, "logits/chosen": -1.2619104385375977, "logits/rejected": -1.3089015483856201, "logps/chosen": -1.9846280813217163, "logps/rejected": -2.134305953979492, "loss": 3.0727, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.84627914428711, "rewards/margins": 1.4967803955078125, "rewards/rejected": -21.343059539794922, "step": 12085 }, { "epoch": 0.4074960396373319, "grad_norm": 30.376798629760742, "learning_rate": 7.385888580594834e-07, "logits/chosen": -1.2105739116668701, "logits/rejected": -1.2595316171646118, "logps/chosen": -2.5818519592285156, "logps/rejected": -2.8327412605285645, "loss": 2.8547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.818517684936523, "rewards/margins": 2.508892774581909, "rewards/rejected": -28.327411651611328, "step": 12090 }, { "epoch": 0.40766456570831505, "grad_norm": 36.32497787475586, "learning_rate": 7.383303293137339e-07, "logits/chosen": -1.0478847026824951, "logits/rejected": -1.2231203317642212, "logps/chosen": -2.192035675048828, "logps/rejected": -2.384169816970825, "loss": 2.2683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.92035484313965, "rewards/margins": 1.9213413000106812, "rewards/rejected": -23.84169578552246, "step": 12095 }, { "epoch": 0.4078330917792983, "grad_norm": 35.671592712402344, "learning_rate": 7.380717180912477e-07, "logits/chosen": -1.0254673957824707, "logits/rejected": -0.9650663137435913, "logps/chosen": -2.426251173019409, "logps/rejected": -2.793447971343994, "loss": 4.8195, "rewards/accuracies": 0.5, "rewards/chosen": -24.26251220703125, "rewards/margins": 3.671968460083008, "rewards/rejected": -27.93448257446289, "step": 12100 }, { "epoch": 0.40800161785028144, "grad_norm": 46.74282455444336, "learning_rate": 7.378130244815191e-07, "logits/chosen": -1.1126468181610107, "logits/rejected": -1.2673817873001099, "logps/chosen": -1.7720956802368164, "logps/rejected": -2.2362868785858154, "loss": 2.1947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.720956802368164, "rewards/margins": 4.641912937164307, "rewards/rejected": -22.362869262695312, "step": 12105 }, { "epoch": 0.4081701439212646, "grad_norm": 29.57602310180664, "learning_rate": 7.375542485740723e-07, "logits/chosen": -1.2394063472747803, "logits/rejected": -1.3660204410552979, "logps/chosen": -1.8587286472320557, "logps/rejected": -2.068803310394287, "loss": 1.7865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.58728790283203, "rewards/margins": 2.1007449626922607, "rewards/rejected": -20.688030242919922, "step": 12110 }, { "epoch": 0.4083386699922478, "grad_norm": 27.55736541748047, "learning_rate": 7.372953904584596e-07, "logits/chosen": -1.0895322561264038, "logits/rejected": -0.9657597541809082, "logps/chosen": -1.6055514812469482, "logps/rejected": -1.532454490661621, "loss": 3.8164, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.05551528930664, "rewards/margins": -0.7309691309928894, "rewards/rejected": -15.324544906616211, "step": 12115 }, { "epoch": 0.408507196063231, "grad_norm": 30.960561752319336, "learning_rate": 7.37036450224261e-07, "logits/chosen": -0.8229540586471558, "logits/rejected": -1.2388590574264526, "logps/chosen": -1.9742721319198608, "logps/rejected": -2.200082302093506, "loss": 2.202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.742721557617188, "rewards/margins": 2.2581019401550293, "rewards/rejected": -22.000823974609375, "step": 12120 }, { "epoch": 0.40867572213421416, "grad_norm": 33.830482482910156, "learning_rate": 7.36777427961086e-07, "logits/chosen": -0.8376771807670593, "logits/rejected": -0.981291651725769, "logps/chosen": -2.4010491371154785, "logps/rejected": -2.5145821571350098, "loss": 2.5353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.010494232177734, "rewards/margins": 1.135331630706787, "rewards/rejected": -25.145824432373047, "step": 12125 }, { "epoch": 0.4088442482051973, "grad_norm": 23.59634780883789, "learning_rate": 7.365183237585718e-07, "logits/chosen": -0.861966609954834, "logits/rejected": -1.0989129543304443, "logps/chosen": -1.946171522140503, "logps/rejected": -1.9714372158050537, "loss": 3.086, "rewards/accuracies": 0.5, "rewards/chosen": -19.461715698242188, "rewards/margins": 0.25265711545944214, "rewards/rejected": -19.714372634887695, "step": 12130 }, { "epoch": 0.40901277427618055, "grad_norm": 53.22785186767578, "learning_rate": 7.362591377063841e-07, "logits/chosen": -1.0354158878326416, "logits/rejected": -1.0143946409225464, "logps/chosen": -1.9543819427490234, "logps/rejected": -1.9405418634414673, "loss": 3.345, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.543819427490234, "rewards/margins": -0.13840040564537048, "rewards/rejected": -19.405418395996094, "step": 12135 }, { "epoch": 0.4091813003471637, "grad_norm": 25.117643356323242, "learning_rate": 7.359998698942173e-07, "logits/chosen": -0.827044665813446, "logits/rejected": -0.9510555267333984, "logps/chosen": -2.408053159713745, "logps/rejected": -2.6223647594451904, "loss": 2.8323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.08053207397461, "rewards/margins": 2.1431150436401367, "rewards/rejected": -26.223644256591797, "step": 12140 }, { "epoch": 0.4093498264181469, "grad_norm": 32.903385162353516, "learning_rate": 7.357405204117934e-07, "logits/chosen": -0.9480105638504028, "logits/rejected": -1.0420299768447876, "logps/chosen": -1.8940540552139282, "logps/rejected": -1.9430221319198608, "loss": 2.7973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.940540313720703, "rewards/margins": 0.489682674407959, "rewards/rejected": -19.43022346496582, "step": 12145 }, { "epoch": 0.40951835248913004, "grad_norm": 13.586479187011719, "learning_rate": 7.354810893488632e-07, "logits/chosen": -1.3252044916152954, "logits/rejected": -1.5341026782989502, "logps/chosen": -2.290491819381714, "logps/rejected": -2.34997296333313, "loss": 2.9688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.904918670654297, "rewards/margins": 0.594811737537384, "rewards/rejected": -23.49972915649414, "step": 12150 }, { "epoch": 0.40968687856011327, "grad_norm": 18.08561897277832, "learning_rate": 7.352215767952056e-07, "logits/chosen": -1.4511380195617676, "logits/rejected": -1.7551714181900024, "logps/chosen": -2.110844373703003, "logps/rejected": -2.080214738845825, "loss": 3.5353, "rewards/accuracies": 0.5, "rewards/chosen": -21.108444213867188, "rewards/margins": -0.3062978684902191, "rewards/rejected": -20.802148818969727, "step": 12155 }, { "epoch": 0.40985540463109643, "grad_norm": 20.255584716796875, "learning_rate": 7.349619828406277e-07, "logits/chosen": -0.935697078704834, "logits/rejected": -0.9104664921760559, "logps/chosen": -2.4696171283721924, "logps/rejected": -2.5782546997070312, "loss": 3.1522, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.696170806884766, "rewards/margins": 1.0863767862319946, "rewards/rejected": -25.782546997070312, "step": 12160 }, { "epoch": 0.4100239307020796, "grad_norm": 22.614980697631836, "learning_rate": 7.347023075749645e-07, "logits/chosen": -0.9744084477424622, "logits/rejected": -0.9399921298027039, "logps/chosen": -1.6697473526000977, "logps/rejected": -2.09625506401062, "loss": 1.7474, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.69747543334961, "rewards/margins": 4.26507568359375, "rewards/rejected": -20.962549209594727, "step": 12165 }, { "epoch": 0.4101924567730628, "grad_norm": 55.02566909790039, "learning_rate": 7.344425510880797e-07, "logits/chosen": -1.2344236373901367, "logits/rejected": -1.445399522781372, "logps/chosen": -2.253945827484131, "logps/rejected": -2.4210329055786133, "loss": 2.2166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.539459228515625, "rewards/margins": 1.6708701848983765, "rewards/rejected": -24.210330963134766, "step": 12170 }, { "epoch": 0.410360982844046, "grad_norm": 12.205888748168945, "learning_rate": 7.341827134698645e-07, "logits/chosen": -0.9991312026977539, "logits/rejected": -1.023807168006897, "logps/chosen": -1.743577003479004, "logps/rejected": -1.9013125896453857, "loss": 2.273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.435771942138672, "rewards/margins": 1.577357292175293, "rewards/rejected": -19.013126373291016, "step": 12175 }, { "epoch": 0.41052950891502915, "grad_norm": 22.69434928894043, "learning_rate": 7.339227948102387e-07, "logits/chosen": -1.2785236835479736, "logits/rejected": -1.3499950170516968, "logps/chosen": -1.9484812021255493, "logps/rejected": -2.2043070793151855, "loss": 2.7824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.48480796813965, "rewards/margins": 2.5582618713378906, "rewards/rejected": -22.043071746826172, "step": 12180 }, { "epoch": 0.4106980349860123, "grad_norm": 70.99961853027344, "learning_rate": 7.336627951991497e-07, "logits/chosen": -0.8217660188674927, "logits/rejected": -0.8359603881835938, "logps/chosen": -1.856405258178711, "logps/rejected": -1.7680613994598389, "loss": 4.2279, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.564050674438477, "rewards/margins": -0.8834368586540222, "rewards/rejected": -17.680614471435547, "step": 12185 }, { "epoch": 0.41086656105699554, "grad_norm": 26.163740158081055, "learning_rate": 7.334027147265734e-07, "logits/chosen": -0.8019935488700867, "logits/rejected": -1.046931505203247, "logps/chosen": -2.2931296825408936, "logps/rejected": -2.110200881958008, "loss": 5.3934, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.931297302246094, "rewards/margins": -1.8292884826660156, "rewards/rejected": -21.102008819580078, "step": 12190 }, { "epoch": 0.4110350871279787, "grad_norm": 22.50705909729004, "learning_rate": 7.331425534825131e-07, "logits/chosen": -1.3941196203231812, "logits/rejected": -1.3452855348587036, "logps/chosen": -2.0837740898132324, "logps/rejected": -2.4105069637298584, "loss": 1.954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.83774185180664, "rewards/margins": 3.267329692840576, "rewards/rejected": -24.105072021484375, "step": 12195 }, { "epoch": 0.41120361319896187, "grad_norm": 19.575517654418945, "learning_rate": 7.328823115570005e-07, "logits/chosen": -1.1170722246170044, "logits/rejected": -1.5405693054199219, "logps/chosen": -1.7801272869110107, "logps/rejected": -2.321302890777588, "loss": 1.2349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.801273345947266, "rewards/margins": 5.411752700805664, "rewards/rejected": -23.213027954101562, "step": 12200 }, { "epoch": 0.41137213926994504, "grad_norm": 18.17782211303711, "learning_rate": 7.326219890400951e-07, "logits/chosen": -1.4105345010757446, "logits/rejected": -1.3899109363555908, "logps/chosen": -2.142193078994751, "logps/rejected": -2.263820171356201, "loss": 2.1952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.42193031311035, "rewards/margins": 1.216269612312317, "rewards/rejected": -22.638198852539062, "step": 12205 }, { "epoch": 0.41154066534092826, "grad_norm": 27.79196548461914, "learning_rate": 7.323615860218842e-07, "logits/chosen": -1.1210639476776123, "logits/rejected": -1.2003812789916992, "logps/chosen": -1.9306762218475342, "logps/rejected": -1.9171720743179321, "loss": 3.6701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.3067626953125, "rewards/margins": -0.13504056632518768, "rewards/rejected": -19.171720504760742, "step": 12210 }, { "epoch": 0.4117091914119114, "grad_norm": 11.662469863891602, "learning_rate": 7.321011025924832e-07, "logits/chosen": -1.377205729484558, "logits/rejected": -1.4924352169036865, "logps/chosen": -2.1127142906188965, "logps/rejected": -2.2229113578796387, "loss": 2.2694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.12714195251465, "rewards/margins": 1.1019700765609741, "rewards/rejected": -22.229114532470703, "step": 12215 }, { "epoch": 0.4118777174828946, "grad_norm": 30.770193099975586, "learning_rate": 7.318405388420349e-07, "logits/chosen": -1.1618680953979492, "logits/rejected": -1.2497812509536743, "logps/chosen": -2.0236659049987793, "logps/rejected": -2.0624430179595947, "loss": 2.8735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.23666000366211, "rewards/margins": 0.3877657949924469, "rewards/rejected": -20.62442970275879, "step": 12220 }, { "epoch": 0.4120462435538778, "grad_norm": 31.006916046142578, "learning_rate": 7.315798948607102e-07, "logits/chosen": -1.724689245223999, "logits/rejected": -1.7922130823135376, "logps/chosen": -1.9403785467147827, "logps/rejected": -1.982122778892517, "loss": 2.7834, "rewards/accuracies": 0.5, "rewards/chosen": -19.40378761291504, "rewards/margins": 0.41744089126586914, "rewards/rejected": -19.82122802734375, "step": 12225 }, { "epoch": 0.412214769624861, "grad_norm": 27.098119735717773, "learning_rate": 7.313191707387079e-07, "logits/chosen": -1.5360838174819946, "logits/rejected": -1.4201209545135498, "logps/chosen": -1.9972326755523682, "logps/rejected": -2.020376682281494, "loss": 3.0258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.972326278686523, "rewards/margins": 0.23143863677978516, "rewards/rejected": -20.203763961791992, "step": 12230 }, { "epoch": 0.41238329569584414, "grad_norm": 37.77764892578125, "learning_rate": 7.310583665662542e-07, "logits/chosen": -1.0606439113616943, "logits/rejected": -1.2241575717926025, "logps/chosen": -2.2028839588165283, "logps/rejected": -2.37382173538208, "loss": 1.8302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.028841018676758, "rewards/margins": 1.7093784809112549, "rewards/rejected": -23.738218307495117, "step": 12235 }, { "epoch": 0.4125518217668273, "grad_norm": 40.7574348449707, "learning_rate": 7.30797482433603e-07, "logits/chosen": -1.1301488876342773, "logits/rejected": -1.4540493488311768, "logps/chosen": -1.9775673151016235, "logps/rejected": -2.2723512649536133, "loss": 2.5747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.775671005249023, "rewards/margins": 2.9478418827056885, "rewards/rejected": -22.723514556884766, "step": 12240 }, { "epoch": 0.41272034783781053, "grad_norm": 14.587956428527832, "learning_rate": 7.305365184310363e-07, "logits/chosen": -0.8185412287712097, "logits/rejected": -0.7269413471221924, "logps/chosen": -1.9818493127822876, "logps/rejected": -2.2220189571380615, "loss": 2.2466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.818492889404297, "rewards/margins": 2.401698112487793, "rewards/rejected": -22.220190048217773, "step": 12245 }, { "epoch": 0.4128888739087937, "grad_norm": 25.93768882751465, "learning_rate": 7.302754746488633e-07, "logits/chosen": -1.1302485466003418, "logits/rejected": -1.3396055698394775, "logps/chosen": -2.0434622764587402, "logps/rejected": -2.2037312984466553, "loss": 2.0975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.434621810913086, "rewards/margins": 1.6026890277862549, "rewards/rejected": -22.037311553955078, "step": 12250 }, { "epoch": 0.41305739997977686, "grad_norm": 77.58814239501953, "learning_rate": 7.300143511774211e-07, "logits/chosen": -1.3720858097076416, "logits/rejected": -1.4968478679656982, "logps/chosen": -2.2451188564300537, "logps/rejected": -2.2659029960632324, "loss": 4.1834, "rewards/accuracies": 0.5, "rewards/chosen": -22.451190948486328, "rewards/margins": 0.20783929526805878, "rewards/rejected": -22.659029006958008, "step": 12255 }, { "epoch": 0.41322592605076003, "grad_norm": 26.527759552001953, "learning_rate": 7.297531481070742e-07, "logits/chosen": -1.1272186040878296, "logits/rejected": -1.2402737140655518, "logps/chosen": -1.7846702337265015, "logps/rejected": -1.966025948524475, "loss": 2.1777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.846702575683594, "rewards/margins": 1.8135563135147095, "rewards/rejected": -19.660259246826172, "step": 12260 }, { "epoch": 0.41339445212174325, "grad_norm": 34.187957763671875, "learning_rate": 7.294918655282145e-07, "logits/chosen": -1.4714405536651611, "logits/rejected": -1.390163779258728, "logps/chosen": -1.9039027690887451, "logps/rejected": -2.0494437217712402, "loss": 1.8899, "rewards/accuracies": 1.0, "rewards/chosen": -19.03902816772461, "rewards/margins": 1.4554088115692139, "rewards/rejected": -20.494434356689453, "step": 12265 }, { "epoch": 0.4135629781927264, "grad_norm": 22.404212951660156, "learning_rate": 7.292305035312618e-07, "logits/chosen": -1.2922292947769165, "logits/rejected": -1.1275092363357544, "logps/chosen": -2.3387680053710938, "logps/rejected": -2.4274537563323975, "loss": 4.3855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.387680053710938, "rewards/margins": 0.8868575096130371, "rewards/rejected": -24.274539947509766, "step": 12270 }, { "epoch": 0.4137315042637096, "grad_norm": 108.76043701171875, "learning_rate": 7.289690622066633e-07, "logits/chosen": -0.7196947336196899, "logits/rejected": -0.6757172346115112, "logps/chosen": -2.1135897636413574, "logps/rejected": -2.088670253753662, "loss": 3.8477, "rewards/accuracies": 0.5, "rewards/chosen": -21.135900497436523, "rewards/margins": -0.24919691681861877, "rewards/rejected": -20.886703491210938, "step": 12275 }, { "epoch": 0.4139000303346928, "grad_norm": 30.70173454284668, "learning_rate": 7.287075416448932e-07, "logits/chosen": -0.8257554173469543, "logits/rejected": -0.939769446849823, "logps/chosen": -2.1116161346435547, "logps/rejected": -2.2537155151367188, "loss": 2.8061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.116161346435547, "rewards/margins": 1.4209929704666138, "rewards/rejected": -22.537155151367188, "step": 12280 }, { "epoch": 0.41406855640567597, "grad_norm": 28.507184982299805, "learning_rate": 7.284459419364537e-07, "logits/chosen": -1.1531884670257568, "logits/rejected": -1.1810081005096436, "logps/chosen": -1.9541816711425781, "logps/rejected": -2.1569201946258545, "loss": 2.3336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.541818618774414, "rewards/margins": 2.0273826122283936, "rewards/rejected": -21.56920051574707, "step": 12285 }, { "epoch": 0.41423708247665914, "grad_norm": 29.22176742553711, "learning_rate": 7.281842631718742e-07, "logits/chosen": -1.2742034196853638, "logits/rejected": -1.4763638973236084, "logps/chosen": -1.9598357677459717, "logps/rejected": -2.181473970413208, "loss": 2.33, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.598360061645508, "rewards/margins": 2.2163803577423096, "rewards/rejected": -21.814739227294922, "step": 12290 }, { "epoch": 0.4144056085476423, "grad_norm": 27.640893936157227, "learning_rate": 7.279225054417113e-07, "logits/chosen": -1.4601396322250366, "logits/rejected": -1.4143357276916504, "logps/chosen": -1.7320646047592163, "logps/rejected": -2.1392834186553955, "loss": 2.036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.320646286010742, "rewards/margins": 4.072186470031738, "rewards/rejected": -21.392831802368164, "step": 12295 }, { "epoch": 0.4145741346186255, "grad_norm": 69.44369506835938, "learning_rate": 7.27660668836549e-07, "logits/chosen": -0.875129222869873, "logits/rejected": -0.9124727249145508, "logps/chosen": -1.7804782390594482, "logps/rejected": -1.9126323461532593, "loss": 2.0381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.80478286743164, "rewards/margins": 1.3215408325195312, "rewards/rejected": -19.126323699951172, "step": 12300 }, { "epoch": 0.4147426606896087, "grad_norm": 18.76422882080078, "learning_rate": 7.273987534469987e-07, "logits/chosen": -1.410447120666504, "logits/rejected": -1.6111469268798828, "logps/chosen": -1.6875078678131104, "logps/rejected": -1.7965885400772095, "loss": 2.3301, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.875080108642578, "rewards/margins": 1.0908066034317017, "rewards/rejected": -17.965885162353516, "step": 12305 }, { "epoch": 0.41491118676059185, "grad_norm": 41.15472412109375, "learning_rate": 7.27136759363699e-07, "logits/chosen": -0.9998579025268555, "logits/rejected": -1.189564824104309, "logps/chosen": -1.9412838220596313, "logps/rejected": -2.197477102279663, "loss": 2.2405, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.412837982177734, "rewards/margins": 2.561933994293213, "rewards/rejected": -21.97477149963379, "step": 12310 }, { "epoch": 0.415079712831575, "grad_norm": 25.710790634155273, "learning_rate": 7.268746866773157e-07, "logits/chosen": -1.1858714818954468, "logits/rejected": -1.3653953075408936, "logps/chosen": -1.9866607189178467, "logps/rejected": -2.078334331512451, "loss": 2.9149, "rewards/accuracies": 0.5, "rewards/chosen": -19.866609573364258, "rewards/margins": 0.9167356491088867, "rewards/rejected": -20.783344268798828, "step": 12315 }, { "epoch": 0.41524823890255824, "grad_norm": 45.082820892333984, "learning_rate": 7.266125354785419e-07, "logits/chosen": -0.9618284106254578, "logits/rejected": -0.9656648635864258, "logps/chosen": -2.5577244758605957, "logps/rejected": -2.5045924186706543, "loss": 4.5745, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.57724380493164, "rewards/margins": -0.5313173532485962, "rewards/rejected": -25.04592514038086, "step": 12320 }, { "epoch": 0.4154167649735414, "grad_norm": 27.387304306030273, "learning_rate": 7.263503058580975e-07, "logits/chosen": -0.8120461702346802, "logits/rejected": -1.0206549167633057, "logps/chosen": -1.7548229694366455, "logps/rejected": -1.9979721307754517, "loss": 3.0462, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.548229217529297, "rewards/margins": 2.4314920902252197, "rewards/rejected": -19.979719161987305, "step": 12325 }, { "epoch": 0.4155852910445246, "grad_norm": 58.04964065551758, "learning_rate": 7.260879979067305e-07, "logits/chosen": -1.2231042385101318, "logits/rejected": -1.3412379026412964, "logps/chosen": -2.661652088165283, "logps/rejected": -2.8398842811584473, "loss": 2.8371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.61652183532715, "rewards/margins": 1.782320261001587, "rewards/rejected": -28.39884376525879, "step": 12330 }, { "epoch": 0.4157538171155078, "grad_norm": 31.14594268798828, "learning_rate": 7.258256117152147e-07, "logits/chosen": -1.147136926651001, "logits/rejected": -1.2516025304794312, "logps/chosen": -2.197524309158325, "logps/rejected": -2.545668601989746, "loss": 1.9896, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.975244522094727, "rewards/margins": 3.4814445972442627, "rewards/rejected": -25.45668601989746, "step": 12335 }, { "epoch": 0.41592234318649096, "grad_norm": 27.909366607666016, "learning_rate": 7.255631473743517e-07, "logits/chosen": -1.7197551727294922, "logits/rejected": -1.5631155967712402, "logps/chosen": -1.870761513710022, "logps/rejected": -1.8892608880996704, "loss": 3.0922, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.70761489868164, "rewards/margins": 0.18499116599559784, "rewards/rejected": -18.892608642578125, "step": 12340 }, { "epoch": 0.4160908692574741, "grad_norm": 30.327396392822266, "learning_rate": 7.253006049749704e-07, "logits/chosen": -0.9778448939323425, "logits/rejected": -1.1327383518218994, "logps/chosen": -1.968382477760315, "logps/rejected": -2.545841693878174, "loss": 2.3467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.683826446533203, "rewards/margins": 5.774592399597168, "rewards/rejected": -25.458415985107422, "step": 12345 }, { "epoch": 0.4162593953284573, "grad_norm": 3.7972683906555176, "learning_rate": 7.250379846079263e-07, "logits/chosen": -0.7681287527084351, "logits/rejected": -0.9513195157051086, "logps/chosen": -2.9177188873291016, "logps/rejected": -3.2327704429626465, "loss": 2.2224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.177188873291016, "rewards/margins": 3.1505179405212402, "rewards/rejected": -32.32770538330078, "step": 12350 }, { "epoch": 0.4164279213994405, "grad_norm": 24.724031448364258, "learning_rate": 7.247752863641018e-07, "logits/chosen": -1.0654830932617188, "logits/rejected": -1.0408488512039185, "logps/chosen": -1.9206384420394897, "logps/rejected": -2.0448288917541504, "loss": 2.7421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.206384658813477, "rewards/margins": 1.24190354347229, "rewards/rejected": -20.448287963867188, "step": 12355 }, { "epoch": 0.4165964474704237, "grad_norm": 34.73861312866211, "learning_rate": 7.245125103344066e-07, "logits/chosen": -1.2476475238800049, "logits/rejected": -1.307342290878296, "logps/chosen": -1.658424735069275, "logps/rejected": -1.6547248363494873, "loss": 3.6264, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.58424949645996, "rewards/margins": -0.036998748779296875, "rewards/rejected": -16.547250747680664, "step": 12360 }, { "epoch": 0.41676497354140685, "grad_norm": 33.121646881103516, "learning_rate": 7.242496566097769e-07, "logits/chosen": -1.4867914915084839, "logits/rejected": -1.541632056236267, "logps/chosen": -1.9344894886016846, "logps/rejected": -2.1640186309814453, "loss": 2.4745, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.344892501831055, "rewards/margins": 2.2952919006347656, "rewards/rejected": -21.64018440246582, "step": 12365 }, { "epoch": 0.41693349961239, "grad_norm": 24.134441375732422, "learning_rate": 7.23986725281176e-07, "logits/chosen": -0.9161200523376465, "logits/rejected": -0.9984935522079468, "logps/chosen": -2.1371476650238037, "logps/rejected": -2.1769909858703613, "loss": 3.7732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.371477127075195, "rewards/margins": 0.3984335958957672, "rewards/rejected": -21.769908905029297, "step": 12370 }, { "epoch": 0.41710202568337323, "grad_norm": 16.265979766845703, "learning_rate": 7.237237164395944e-07, "logits/chosen": -1.368922233581543, "logits/rejected": -1.6064525842666626, "logps/chosen": -1.886877417564392, "logps/rejected": -2.280475378036499, "loss": 1.4464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.868776321411133, "rewards/margins": 3.9359793663024902, "rewards/rejected": -22.80475425720215, "step": 12375 }, { "epoch": 0.4172705517543564, "grad_norm": 14.606618881225586, "learning_rate": 7.234606301760488e-07, "logits/chosen": -0.9598191976547241, "logits/rejected": -1.0696005821228027, "logps/chosen": -1.8257827758789062, "logps/rejected": -1.9989960193634033, "loss": 2.0592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.25782585144043, "rewards/margins": 1.7321323156356812, "rewards/rejected": -19.989957809448242, "step": 12380 }, { "epoch": 0.41743907782533957, "grad_norm": 32.96151351928711, "learning_rate": 7.231974665815831e-07, "logits/chosen": -1.2338770627975464, "logits/rejected": -1.3861429691314697, "logps/chosen": -2.4079480171203613, "logps/rejected": -2.3635220527648926, "loss": 3.9454, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.079477310180664, "rewards/margins": -0.44425565004348755, "rewards/rejected": -23.635223388671875, "step": 12385 }, { "epoch": 0.4176076038963228, "grad_norm": 12.575274467468262, "learning_rate": 7.229342257472678e-07, "logits/chosen": -1.2482370138168335, "logits/rejected": -1.3556302785873413, "logps/chosen": -2.2981467247009277, "logps/rejected": -2.404623508453369, "loss": 3.5159, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.981468200683594, "rewards/margins": 1.064767599105835, "rewards/rejected": -24.046234130859375, "step": 12390 }, { "epoch": 0.41777612996730595, "grad_norm": 38.49678421020508, "learning_rate": 7.226709077642002e-07, "logits/chosen": -0.7314721941947937, "logits/rejected": -0.8654648065567017, "logps/chosen": -2.2405874729156494, "logps/rejected": -2.4284911155700684, "loss": 3.6159, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.405872344970703, "rewards/margins": 1.8790397644042969, "rewards/rejected": -24.284912109375, "step": 12395 }, { "epoch": 0.4179446560382891, "grad_norm": 20.525619506835938, "learning_rate": 7.224075127235044e-07, "logits/chosen": -0.9811423420906067, "logits/rejected": -1.0781348943710327, "logps/chosen": -2.029435157775879, "logps/rejected": -2.214388608932495, "loss": 2.1886, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.294349670410156, "rewards/margins": 1.849535346031189, "rewards/rejected": -22.14388656616211, "step": 12400 }, { "epoch": 0.4179446560382891, "eval_logits/chosen": -1.5217288732528687, "eval_logits/rejected": -1.6246063709259033, "eval_logps/chosen": -1.9450013637542725, "eval_logps/rejected": -2.0381784439086914, "eval_loss": 3.018660068511963, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -19.450016021728516, "eval_rewards/margins": 0.9317706823348999, "eval_rewards/rejected": -20.381784439086914, "eval_runtime": 12.8916, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 12400 }, { "epoch": 0.4181131821092723, "grad_norm": 23.479873657226562, "learning_rate": 7.221440407163309e-07, "logits/chosen": -1.2906603813171387, "logits/rejected": -1.2902982234954834, "logps/chosen": -1.9721931219100952, "logps/rejected": -1.8723487854003906, "loss": 4.2176, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.7219295501709, "rewards/margins": -0.9984428286552429, "rewards/rejected": -18.723487854003906, "step": 12405 }, { "epoch": 0.4182817081802555, "grad_norm": 65.4778823852539, "learning_rate": 7.218804918338572e-07, "logits/chosen": -1.067082166671753, "logits/rejected": -1.1751021146774292, "logps/chosen": -2.318246603012085, "logps/rejected": -2.514338970184326, "loss": 2.0274, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.18246841430664, "rewards/margins": 1.9609248638153076, "rewards/rejected": -25.143390655517578, "step": 12410 }, { "epoch": 0.4184502342512387, "grad_norm": 20.164350509643555, "learning_rate": 7.216168661672868e-07, "logits/chosen": -1.2625491619110107, "logits/rejected": -1.2166945934295654, "logps/chosen": -1.9427082538604736, "logps/rejected": -1.942220687866211, "loss": 3.556, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.427082061767578, "rewards/margins": -0.004874229431152344, "rewards/rejected": -19.42220687866211, "step": 12415 }, { "epoch": 0.41861876032222184, "grad_norm": 27.37095069885254, "learning_rate": 7.213531638078505e-07, "logits/chosen": -1.3327056169509888, "logits/rejected": -1.3353521823883057, "logps/chosen": -2.1947896480560303, "logps/rejected": -2.3421804904937744, "loss": 2.2908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.94789695739746, "rewards/margins": 1.4739079475402832, "rewards/rejected": -23.421804428100586, "step": 12420 }, { "epoch": 0.418787286393205, "grad_norm": 27.996984481811523, "learning_rate": 7.210893848468053e-07, "logits/chosen": -1.1961950063705444, "logits/rejected": -1.2545980215072632, "logps/chosen": -1.6736834049224854, "logps/rejected": -1.796057105064392, "loss": 2.3931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.736835479736328, "rewards/margins": 1.2237374782562256, "rewards/rejected": -17.960573196411133, "step": 12425 }, { "epoch": 0.4189558124641882, "grad_norm": 26.38707160949707, "learning_rate": 7.208255293754342e-07, "logits/chosen": -1.335105299949646, "logits/rejected": -1.4457073211669922, "logps/chosen": -1.7503960132598877, "logps/rejected": -1.9006984233856201, "loss": 2.7425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.503957748413086, "rewards/margins": 1.503026008605957, "rewards/rejected": -19.00698471069336, "step": 12430 }, { "epoch": 0.4191243385351714, "grad_norm": 20.31515884399414, "learning_rate": 7.20561597485048e-07, "logits/chosen": -1.020716667175293, "logits/rejected": -1.0862579345703125, "logps/chosen": -1.5637071132659912, "logps/rejected": -1.6635987758636475, "loss": 2.2843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.63707160949707, "rewards/margins": 0.9989177584648132, "rewards/rejected": -16.635990142822266, "step": 12435 }, { "epoch": 0.41929286460615456, "grad_norm": 97.51506042480469, "learning_rate": 7.202975892669824e-07, "logits/chosen": -0.7687516212463379, "logits/rejected": -0.8372823596000671, "logps/chosen": -2.676095485687256, "logps/rejected": -3.029672861099243, "loss": 1.9994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.76095962524414, "rewards/margins": 3.5357728004455566, "rewards/rejected": -30.296728134155273, "step": 12440 }, { "epoch": 0.4194613906771378, "grad_norm": 22.99105453491211, "learning_rate": 7.200335048126006e-07, "logits/chosen": -0.9765909910202026, "logits/rejected": -1.0797923803329468, "logps/chosen": -2.3902230262756348, "logps/rejected": -2.2430508136749268, "loss": 4.7527, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.902233123779297, "rewards/margins": -1.471724271774292, "rewards/rejected": -22.43050765991211, "step": 12445 }, { "epoch": 0.41962991674812095, "grad_norm": 14.573020935058594, "learning_rate": 7.197693442132917e-07, "logits/chosen": -1.458287000656128, "logits/rejected": -1.5055878162384033, "logps/chosen": -2.1899430751800537, "logps/rejected": -2.093184471130371, "loss": 4.5942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.899433135986328, "rewards/margins": -0.9675881266593933, "rewards/rejected": -20.93184471130371, "step": 12450 }, { "epoch": 0.4197984428191041, "grad_norm": 25.01654052734375, "learning_rate": 7.195051075604715e-07, "logits/chosen": -1.3347218036651611, "logits/rejected": -1.4352153539657593, "logps/chosen": -2.1493160724639893, "logps/rejected": -2.3409645557403564, "loss": 2.7931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.493160247802734, "rewards/margins": 1.9164841175079346, "rewards/rejected": -23.409643173217773, "step": 12455 }, { "epoch": 0.4199669688900873, "grad_norm": 34.97426223754883, "learning_rate": 7.192407949455816e-07, "logits/chosen": -1.1958450078964233, "logits/rejected": -1.3197181224822998, "logps/chosen": -2.0296499729156494, "logps/rejected": -2.766195774078369, "loss": 1.828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.296499252319336, "rewards/margins": 7.365456581115723, "rewards/rejected": -27.661956787109375, "step": 12460 }, { "epoch": 0.4201354949610705, "grad_norm": 31.839874267578125, "learning_rate": 7.189764064600904e-07, "logits/chosen": -1.214929223060608, "logits/rejected": -1.1214382648468018, "logps/chosen": -1.8447256088256836, "logps/rejected": -2.0540318489074707, "loss": 1.8063, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.447256088256836, "rewards/margins": 2.0930612087249756, "rewards/rejected": -20.54031753540039, "step": 12465 }, { "epoch": 0.42030402103205367, "grad_norm": 23.187700271606445, "learning_rate": 7.187119421954921e-07, "logits/chosen": -1.2923511266708374, "logits/rejected": -1.489856481552124, "logps/chosen": -1.7116539478302002, "logps/rejected": -1.5989964008331299, "loss": 4.4184, "rewards/accuracies": 0.5, "rewards/chosen": -17.116540908813477, "rewards/margins": -1.1265767812728882, "rewards/rejected": -15.989962577819824, "step": 12470 }, { "epoch": 0.42047254710303683, "grad_norm": 24.130979537963867, "learning_rate": 7.184474022433075e-07, "logits/chosen": -1.2651400566101074, "logits/rejected": -1.326103925704956, "logps/chosen": -1.6987262964248657, "logps/rejected": -1.8224769830703735, "loss": 2.1328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.987262725830078, "rewards/margins": 1.237506628036499, "rewards/rejected": -18.224769592285156, "step": 12475 }, { "epoch": 0.42064107317402, "grad_norm": 15.125301361083984, "learning_rate": 7.181827866950837e-07, "logits/chosen": -1.270438313484192, "logits/rejected": -1.2889509201049805, "logps/chosen": -1.5895674228668213, "logps/rejected": -1.6432710886001587, "loss": 2.6452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.895675659179688, "rewards/margins": 0.5370348691940308, "rewards/rejected": -16.432708740234375, "step": 12480 }, { "epoch": 0.4208095992450032, "grad_norm": 5.707220554351807, "learning_rate": 7.179180956423933e-07, "logits/chosen": -1.1088390350341797, "logits/rejected": -1.3981083631515503, "logps/chosen": -1.8700625896453857, "logps/rejected": -2.3774728775024414, "loss": 2.1257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.700626373291016, "rewards/margins": 5.07410192489624, "rewards/rejected": -23.774728775024414, "step": 12485 }, { "epoch": 0.4209781253159864, "grad_norm": 17.590496063232422, "learning_rate": 7.176533291768357e-07, "logits/chosen": -1.1576659679412842, "logits/rejected": -1.4298789501190186, "logps/chosen": -2.152177333831787, "logps/rejected": -2.259253740310669, "loss": 2.457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.521774291992188, "rewards/margins": 1.0707619190216064, "rewards/rejected": -22.592538833618164, "step": 12490 }, { "epoch": 0.42114665138696955, "grad_norm": 22.579177856445312, "learning_rate": 7.173884873900362e-07, "logits/chosen": -1.2096421718597412, "logits/rejected": -1.372266173362732, "logps/chosen": -2.0245296955108643, "logps/rejected": -2.4928340911865234, "loss": 3.0018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.245298385620117, "rewards/margins": 4.683045387268066, "rewards/rejected": -24.928342819213867, "step": 12495 }, { "epoch": 0.42131517745795277, "grad_norm": 22.653228759765625, "learning_rate": 7.171235703736458e-07, "logits/chosen": -0.8634752035140991, "logits/rejected": -1.0036604404449463, "logps/chosen": -2.2977168560028076, "logps/rejected": -2.3925795555114746, "loss": 2.9089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.977169036865234, "rewards/margins": 0.9486261606216431, "rewards/rejected": -23.92579460144043, "step": 12500 }, { "epoch": 0.42148370352893594, "grad_norm": 28.098493576049805, "learning_rate": 7.16858578219342e-07, "logits/chosen": -1.356195092201233, "logits/rejected": -1.5597960948944092, "logps/chosen": -2.096545457839966, "logps/rejected": -2.229048252105713, "loss": 2.6018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.965456008911133, "rewards/margins": 1.3250248432159424, "rewards/rejected": -22.29047966003418, "step": 12505 }, { "epoch": 0.4216522295999191, "grad_norm": 19.7722225189209, "learning_rate": 7.165935110188282e-07, "logits/chosen": -1.291948676109314, "logits/rejected": -1.2664096355438232, "logps/chosen": -2.1727402210235596, "logps/rejected": -2.754978656768799, "loss": 2.2238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.727405548095703, "rewards/margins": 5.822384834289551, "rewards/rejected": -27.549789428710938, "step": 12510 }, { "epoch": 0.42182075567090227, "grad_norm": 18.146760940551758, "learning_rate": 7.163283688638338e-07, "logits/chosen": -0.8999283909797668, "logits/rejected": -0.8663008809089661, "logps/chosen": -2.0590133666992188, "logps/rejected": -2.1434433460235596, "loss": 2.928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.590129852294922, "rewards/margins": 0.84429931640625, "rewards/rejected": -21.434432983398438, "step": 12515 }, { "epoch": 0.4219892817418855, "grad_norm": 2.633085012435913, "learning_rate": 7.160631518461138e-07, "logits/chosen": -0.8085635900497437, "logits/rejected": -1.2670907974243164, "logps/chosen": -2.0810775756835938, "logps/rejected": -2.7740936279296875, "loss": 1.1083, "rewards/accuracies": 1.0, "rewards/chosen": -20.810775756835938, "rewards/margins": 6.930161952972412, "rewards/rejected": -27.740936279296875, "step": 12520 }, { "epoch": 0.42215780781286866, "grad_norm": 33.739681243896484, "learning_rate": 7.157978600574494e-07, "logits/chosen": -1.1513853073120117, "logits/rejected": -1.0155327320098877, "logps/chosen": -2.2184622287750244, "logps/rejected": -2.5932672023773193, "loss": 2.3999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.184621810913086, "rewards/margins": 3.7480499744415283, "rewards/rejected": -25.93267250061035, "step": 12525 }, { "epoch": 0.4223263338838518, "grad_norm": 14.900190353393555, "learning_rate": 7.155324935896481e-07, "logits/chosen": -1.1839789152145386, "logits/rejected": -1.4558827877044678, "logps/chosen": -1.641060471534729, "logps/rejected": -1.6600325107574463, "loss": 3.9618, "rewards/accuracies": 0.5, "rewards/chosen": -16.410602569580078, "rewards/margins": 0.18971911072731018, "rewards/rejected": -16.600324630737305, "step": 12530 }, { "epoch": 0.422494859954835, "grad_norm": 63.875736236572266, "learning_rate": 7.152670525345421e-07, "logits/chosen": -1.1068434715270996, "logits/rejected": -1.1477311849594116, "logps/chosen": -2.444314956665039, "logps/rejected": -2.6108994483947754, "loss": 3.4869, "rewards/accuracies": 0.5, "rewards/chosen": -24.44314956665039, "rewards/margins": 1.6658456325531006, "rewards/rejected": -26.108993530273438, "step": 12535 }, { "epoch": 0.4226633860258182, "grad_norm": 21.8823184967041, "learning_rate": 7.150015369839903e-07, "logits/chosen": -0.818661093711853, "logits/rejected": -1.0668702125549316, "logps/chosen": -2.3808517456054688, "logps/rejected": -2.7843680381774902, "loss": 2.1808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.808517456054688, "rewards/margins": 4.035162925720215, "rewards/rejected": -27.843679428100586, "step": 12540 }, { "epoch": 0.4228319120968014, "grad_norm": 27.321157455444336, "learning_rate": 7.147359470298776e-07, "logits/chosen": -1.0066759586334229, "logits/rejected": -0.9979284405708313, "logps/chosen": -2.0069892406463623, "logps/rejected": -1.8785688877105713, "loss": 4.3458, "rewards/accuracies": 0.5, "rewards/chosen": -20.069894790649414, "rewards/margins": -1.2842042446136475, "rewards/rejected": -18.785690307617188, "step": 12545 }, { "epoch": 0.42300043816778454, "grad_norm": 21.502607345581055, "learning_rate": 7.144702827641136e-07, "logits/chosen": -1.2351725101470947, "logits/rejected": -1.4520246982574463, "logps/chosen": -1.9867042303085327, "logps/rejected": -2.3051655292510986, "loss": 2.2522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.867042541503906, "rewards/margins": 3.1846134662628174, "rewards/rejected": -23.051654815673828, "step": 12550 }, { "epoch": 0.42316896423876776, "grad_norm": 23.776119232177734, "learning_rate": 7.142045442786346e-07, "logits/chosen": -0.8708661794662476, "logits/rejected": -1.0048704147338867, "logps/chosen": -1.9848359823226929, "logps/rejected": -2.0720088481903076, "loss": 2.6712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.84836196899414, "rewards/margins": 0.8717293739318848, "rewards/rejected": -20.720088958740234, "step": 12555 }, { "epoch": 0.42333749030975093, "grad_norm": 20.22873878479004, "learning_rate": 7.139387316654024e-07, "logits/chosen": -1.050518274307251, "logits/rejected": -1.279454231262207, "logps/chosen": -2.533379077911377, "logps/rejected": -2.4677693843841553, "loss": 4.2271, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.333789825439453, "rewards/margins": -0.6560948491096497, "rewards/rejected": -24.67769432067871, "step": 12560 }, { "epoch": 0.4235060163807341, "grad_norm": 19.31144905090332, "learning_rate": 7.136728450164038e-07, "logits/chosen": -0.9448343515396118, "logits/rejected": -1.2927018404006958, "logps/chosen": -2.0609188079833984, "logps/rejected": -2.36385440826416, "loss": 1.8349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.609188079833984, "rewards/margins": 3.029357671737671, "rewards/rejected": -23.638545989990234, "step": 12565 }, { "epoch": 0.42367454245171726, "grad_norm": 22.986955642700195, "learning_rate": 7.134068844236518e-07, "logits/chosen": -1.3164246082305908, "logits/rejected": -1.5228006839752197, "logps/chosen": -2.031179428100586, "logps/rejected": -2.2485601902008057, "loss": 1.9017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.31179428100586, "rewards/margins": 2.1738078594207764, "rewards/rejected": -22.4856014251709, "step": 12570 }, { "epoch": 0.4238430685227005, "grad_norm": 21.826759338378906, "learning_rate": 7.131408499791853e-07, "logits/chosen": -1.7282383441925049, "logits/rejected": -2.004629135131836, "logps/chosen": -2.011059045791626, "logps/rejected": -2.124830722808838, "loss": 3.4721, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.1105899810791, "rewards/margins": 1.1377174854278564, "rewards/rejected": -21.248306274414062, "step": 12575 }, { "epoch": 0.42401159459368365, "grad_norm": 36.31961441040039, "learning_rate": 7.128747417750678e-07, "logits/chosen": -1.0672346353530884, "logits/rejected": -1.3110134601593018, "logps/chosen": -2.252652406692505, "logps/rejected": -2.3968546390533447, "loss": 1.9498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.526521682739258, "rewards/margins": 1.4420230388641357, "rewards/rejected": -23.96854591369629, "step": 12580 }, { "epoch": 0.4241801206646668, "grad_norm": 45.022438049316406, "learning_rate": 7.126085599033892e-07, "logits/chosen": -0.8541949987411499, "logits/rejected": -1.1512668132781982, "logps/chosen": -2.3442893028259277, "logps/rejected": -2.7394356727600098, "loss": 2.5909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.442893981933594, "rewards/margins": 3.951462507247925, "rewards/rejected": -27.39435386657715, "step": 12585 }, { "epoch": 0.42434864673565, "grad_norm": 17.76199722290039, "learning_rate": 7.123423044562644e-07, "logits/chosen": -1.2358906269073486, "logits/rejected": -1.426826000213623, "logps/chosen": -1.9580609798431396, "logps/rejected": -2.097620725631714, "loss": 2.9607, "rewards/accuracies": 0.5, "rewards/chosen": -19.580608367919922, "rewards/margins": 1.395599365234375, "rewards/rejected": -20.976207733154297, "step": 12590 }, { "epoch": 0.4245171728066332, "grad_norm": 15.683243751525879, "learning_rate": 7.12075975525834e-07, "logits/chosen": -1.072770118713379, "logits/rejected": -1.5230886936187744, "logps/chosen": -2.1154682636260986, "logps/rejected": -2.5087642669677734, "loss": 1.9563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.154682159423828, "rewards/margins": 3.9329605102539062, "rewards/rejected": -25.0876407623291, "step": 12595 }, { "epoch": 0.42468569887761637, "grad_norm": 25.16454315185547, "learning_rate": 7.118095732042641e-07, "logits/chosen": -0.9468148350715637, "logits/rejected": -0.978870689868927, "logps/chosen": -2.006333589553833, "logps/rejected": -1.969488501548767, "loss": 3.8357, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.063335418701172, "rewards/margins": -0.36844903230667114, "rewards/rejected": -19.69488525390625, "step": 12600 }, { "epoch": 0.42485422494859953, "grad_norm": 15.496404647827148, "learning_rate": 7.115430975837456e-07, "logits/chosen": -1.466344952583313, "logits/rejected": -1.5798033475875854, "logps/chosen": -2.424255847930908, "logps/rejected": -2.755340337753296, "loss": 2.6929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.242555618286133, "rewards/margins": 3.310847520828247, "rewards/rejected": -27.55340576171875, "step": 12605 }, { "epoch": 0.42502275101958276, "grad_norm": 43.10670471191406, "learning_rate": 7.112765487564957e-07, "logits/chosen": -1.076716661453247, "logits/rejected": -1.1707000732421875, "logps/chosen": -2.2483749389648438, "logps/rejected": -2.3862216472625732, "loss": 3.0182, "rewards/accuracies": 0.5, "rewards/chosen": -22.483749389648438, "rewards/margins": 1.3784677982330322, "rewards/rejected": -23.862218856811523, "step": 12610 }, { "epoch": 0.4251912770905659, "grad_norm": 29.91777229309082, "learning_rate": 7.110099268147562e-07, "logits/chosen": -0.9753861427307129, "logits/rejected": -1.0013278722763062, "logps/chosen": -1.9687080383300781, "logps/rejected": -1.981389045715332, "loss": 3.097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.68707847595215, "rewards/margins": 0.12681102752685547, "rewards/rejected": -19.813888549804688, "step": 12615 }, { "epoch": 0.4253598031615491, "grad_norm": 35.663963317871094, "learning_rate": 7.107432318507943e-07, "logits/chosen": -1.1238733530044556, "logits/rejected": -1.0368965864181519, "logps/chosen": -2.0092053413391113, "logps/rejected": -1.9546020030975342, "loss": 4.2135, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.092052459716797, "rewards/margins": -0.5460325479507446, "rewards/rejected": -19.5460205078125, "step": 12620 }, { "epoch": 0.42552832923253225, "grad_norm": 29.770458221435547, "learning_rate": 7.10476463956903e-07, "logits/chosen": -1.1682617664337158, "logits/rejected": -1.2263028621673584, "logps/chosen": -2.1611130237579346, "logps/rejected": -2.257652997970581, "loss": 2.6198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.61113166809082, "rewards/margins": 0.9653974771499634, "rewards/rejected": -22.576528549194336, "step": 12625 }, { "epoch": 0.4256968553035155, "grad_norm": 29.46103858947754, "learning_rate": 7.102096232253999e-07, "logits/chosen": -1.0423948764801025, "logits/rejected": -1.1089481115341187, "logps/chosen": -2.0582234859466553, "logps/rejected": -2.3573241233825684, "loss": 1.906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.582233428955078, "rewards/margins": 2.991006374359131, "rewards/rejected": -23.5732421875, "step": 12630 }, { "epoch": 0.42586538137449864, "grad_norm": 17.78944206237793, "learning_rate": 7.099427097486283e-07, "logits/chosen": -0.9830500483512878, "logits/rejected": -1.443943738937378, "logps/chosen": -2.2791683673858643, "logps/rejected": -2.5998737812042236, "loss": 2.1672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.791683197021484, "rewards/margins": 3.2070529460906982, "rewards/rejected": -25.998737335205078, "step": 12635 }, { "epoch": 0.4260339074454818, "grad_norm": 34.38286590576172, "learning_rate": 7.09675723618956e-07, "logits/chosen": -1.4862444400787354, "logits/rejected": -1.2831571102142334, "logps/chosen": -2.3077995777130127, "logps/rejected": -2.3144662380218506, "loss": 4.1222, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.07799530029297, "rewards/margins": 0.06666803359985352, "rewards/rejected": -23.144662857055664, "step": 12640 }, { "epoch": 0.426202433516465, "grad_norm": 0.0135088711977005, "learning_rate": 7.094086649287768e-07, "logits/chosen": -1.286271333694458, "logits/rejected": -1.4231679439544678, "logps/chosen": -1.9492824077606201, "logps/rejected": -2.2704503536224365, "loss": 2.3507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.49282455444336, "rewards/margins": 3.211678981781006, "rewards/rejected": -22.70450210571289, "step": 12645 }, { "epoch": 0.4263709595874482, "grad_norm": 20.299243927001953, "learning_rate": 7.09141533770509e-07, "logits/chosen": -1.2199132442474365, "logits/rejected": -1.5089060068130493, "logps/chosen": -1.748953104019165, "logps/rejected": -1.8105674982070923, "loss": 2.633, "rewards/accuracies": 0.5, "rewards/chosen": -17.489532470703125, "rewards/margins": 0.6161444783210754, "rewards/rejected": -18.105676651000977, "step": 12650 }, { "epoch": 0.42653948565843136, "grad_norm": 37.17742919921875, "learning_rate": 7.088743302365963e-07, "logits/chosen": -1.1301769018173218, "logits/rejected": -1.4917234182357788, "logps/chosen": -2.305114269256592, "logps/rejected": -2.4861512184143066, "loss": 3.5311, "rewards/accuracies": 0.5, "rewards/chosen": -23.05113983154297, "rewards/margins": 1.8103729486465454, "rewards/rejected": -24.861515045166016, "step": 12655 }, { "epoch": 0.4267080117294145, "grad_norm": 36.21525192260742, "learning_rate": 7.086070544195071e-07, "logits/chosen": -1.2566407918930054, "logits/rejected": -1.1735836267471313, "logps/chosen": -2.330984592437744, "logps/rejected": -2.2364494800567627, "loss": 4.3382, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.309844970703125, "rewards/margins": -0.9453502893447876, "rewards/rejected": -22.3644962310791, "step": 12660 }, { "epoch": 0.42687653780039775, "grad_norm": 40.654571533203125, "learning_rate": 7.083397064117351e-07, "logits/chosen": -1.0398657321929932, "logits/rejected": -1.0622133016586304, "logps/chosen": -2.4192018508911133, "logps/rejected": -2.477231502532959, "loss": 2.9533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.1920166015625, "rewards/margins": 0.5802987813949585, "rewards/rejected": -24.77231788635254, "step": 12665 }, { "epoch": 0.4270450638713809, "grad_norm": 28.800556182861328, "learning_rate": 7.080722863057992e-07, "logits/chosen": -1.5768417119979858, "logits/rejected": -1.5328872203826904, "logps/chosen": -1.8163115978240967, "logps/rejected": -1.9694408178329468, "loss": 2.5645, "rewards/accuracies": 0.5, "rewards/chosen": -18.163116455078125, "rewards/margins": 1.53129243850708, "rewards/rejected": -19.694408416748047, "step": 12670 }, { "epoch": 0.4272135899423641, "grad_norm": 18.046127319335938, "learning_rate": 7.078047941942426e-07, "logits/chosen": -0.7132788896560669, "logits/rejected": -0.9055215716362, "logps/chosen": -2.2301299571990967, "logps/rejected": -2.510833740234375, "loss": 2.5123, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.30130386352539, "rewards/margins": 2.8070342540740967, "rewards/rejected": -25.108333587646484, "step": 12675 }, { "epoch": 0.42738211601334725, "grad_norm": 33.88047409057617, "learning_rate": 7.075372301696339e-07, "logits/chosen": -0.7558620572090149, "logits/rejected": -0.7717004418373108, "logps/chosen": -2.054013967514038, "logps/rejected": -2.0608019828796387, "loss": 3.3235, "rewards/accuracies": 0.5, "rewards/chosen": -20.54014015197754, "rewards/margins": 0.06788072735071182, "rewards/rejected": -20.608020782470703, "step": 12680 }, { "epoch": 0.42755064208433047, "grad_norm": 20.51235580444336, "learning_rate": 7.072695943245664e-07, "logits/chosen": -0.8837175369262695, "logits/rejected": -1.0166467428207397, "logps/chosen": -2.6695451736450195, "logps/rejected": -2.8811819553375244, "loss": 3.5117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.695453643798828, "rewards/margins": 2.116365432739258, "rewards/rejected": -28.811819076538086, "step": 12685 }, { "epoch": 0.42771916815531363, "grad_norm": 43.31715393066406, "learning_rate": 7.070018867516585e-07, "logits/chosen": -1.3836842775344849, "logits/rejected": -1.291332721710205, "logps/chosen": -2.0165090560913086, "logps/rejected": -2.2391884326934814, "loss": 2.756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.165088653564453, "rewards/margins": 2.226795196533203, "rewards/rejected": -22.391883850097656, "step": 12690 }, { "epoch": 0.4278876942262968, "grad_norm": 22.55230140686035, "learning_rate": 7.067341075435531e-07, "logits/chosen": -0.9342721700668335, "logits/rejected": -1.154515027999878, "logps/chosen": -1.9569528102874756, "logps/rejected": -2.1937878131866455, "loss": 2.1746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.569528579711914, "rewards/margins": 2.3683505058288574, "rewards/rejected": -21.937877655029297, "step": 12695 }, { "epoch": 0.42805622029727997, "grad_norm": 26.59225845336914, "learning_rate": 7.06466256792918e-07, "logits/chosen": -1.6623830795288086, "logits/rejected": -1.609900712966919, "logps/chosen": -2.140404462814331, "logps/rejected": -2.286125659942627, "loss": 2.6497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.40404510498047, "rewards/margins": 1.4572112560272217, "rewards/rejected": -22.861255645751953, "step": 12700 }, { "epoch": 0.4282247463682632, "grad_norm": 18.7648868560791, "learning_rate": 7.061983345924462e-07, "logits/chosen": -1.6123731136322021, "logits/rejected": -1.5295023918151855, "logps/chosen": -2.2385873794555664, "logps/rejected": -2.4097483158111572, "loss": 2.9512, "rewards/accuracies": 0.5, "rewards/chosen": -22.385873794555664, "rewards/margins": 1.7116081714630127, "rewards/rejected": -24.097482681274414, "step": 12705 }, { "epoch": 0.42839327243924635, "grad_norm": 27.55744171142578, "learning_rate": 7.059303410348544e-07, "logits/chosen": -1.27418053150177, "logits/rejected": -1.2935948371887207, "logps/chosen": -2.2106258869171143, "logps/rejected": -2.7528538703918457, "loss": 3.1742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.106258392333984, "rewards/margins": 5.4222822189331055, "rewards/rejected": -27.528539657592773, "step": 12710 }, { "epoch": 0.4285617985102295, "grad_norm": 149.60943603515625, "learning_rate": 7.05662276212885e-07, "logits/chosen": -1.3029506206512451, "logits/rejected": -1.5379345417022705, "logps/chosen": -2.348891019821167, "logps/rejected": -2.646965503692627, "loss": 3.6559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.488908767700195, "rewards/margins": 2.980748414993286, "rewards/rejected": -26.469655990600586, "step": 12715 }, { "epoch": 0.42873032458121274, "grad_norm": 84.20378112792969, "learning_rate": 7.053941402193044e-07, "logits/chosen": -0.8661912679672241, "logits/rejected": -0.9065690040588379, "logps/chosen": -1.9864723682403564, "logps/rejected": -1.9592368602752686, "loss": 3.4597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.86472511291504, "rewards/margins": -0.2723536491394043, "rewards/rejected": -19.592369079589844, "step": 12720 }, { "epoch": 0.4288988506521959, "grad_norm": 26.458824157714844, "learning_rate": 7.051259331469044e-07, "logits/chosen": -1.0588595867156982, "logits/rejected": -1.0328760147094727, "logps/chosen": -2.196181535720825, "logps/rejected": -2.3460662364959717, "loss": 3.5815, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.961816787719727, "rewards/margins": 1.4988477230072021, "rewards/rejected": -23.460662841796875, "step": 12725 }, { "epoch": 0.4290673767231791, "grad_norm": 17.77597999572754, "learning_rate": 7.048576550885004e-07, "logits/chosen": -1.0335174798965454, "logits/rejected": -1.2250608205795288, "logps/chosen": -2.217999219894409, "logps/rejected": -2.653132200241089, "loss": 1.2628, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.179994583129883, "rewards/margins": 4.351326942443848, "rewards/rejected": -26.531322479248047, "step": 12730 }, { "epoch": 0.42923590279416224, "grad_norm": 23.028162002563477, "learning_rate": 7.04589306136933e-07, "logits/chosen": -1.1175248622894287, "logits/rejected": -1.1833066940307617, "logps/chosen": -1.847046136856079, "logps/rejected": -2.0023093223571777, "loss": 2.6907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.470462799072266, "rewards/margins": 1.5526316165924072, "rewards/rejected": -20.02309226989746, "step": 12735 }, { "epoch": 0.42940442886514546, "grad_norm": 27.370683670043945, "learning_rate": 7.043208863850672e-07, "logits/chosen": -0.9457284808158875, "logits/rejected": -1.0987629890441895, "logps/chosen": -1.9186127185821533, "logps/rejected": -2.0391685962677, "loss": 2.2472, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.186124801635742, "rewards/margins": 1.2055596113204956, "rewards/rejected": -20.39168357849121, "step": 12740 }, { "epoch": 0.4295729549361286, "grad_norm": 20.669946670532227, "learning_rate": 7.040523959257927e-07, "logits/chosen": -1.6514732837677002, "logits/rejected": -1.6349937915802002, "logps/chosen": -1.930381417274475, "logps/rejected": -1.892491340637207, "loss": 3.7019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.303813934326172, "rewards/margins": -0.3788990080356598, "rewards/rejected": -18.924915313720703, "step": 12745 }, { "epoch": 0.4297414810071118, "grad_norm": 21.375263214111328, "learning_rate": 7.037838348520233e-07, "logits/chosen": -1.4665489196777344, "logits/rejected": -1.5651066303253174, "logps/chosen": -1.99209725856781, "logps/rejected": -2.207446813583374, "loss": 1.8538, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.920970916748047, "rewards/margins": 2.1534957885742188, "rewards/rejected": -22.0744686126709, "step": 12750 }, { "epoch": 0.42991000707809496, "grad_norm": 15.512998580932617, "learning_rate": 7.035152032566973e-07, "logits/chosen": -1.1759886741638184, "logits/rejected": -1.4427909851074219, "logps/chosen": -2.1886658668518066, "logps/rejected": -2.348357677459717, "loss": 1.8191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.88665771484375, "rewards/margins": 1.5969184637069702, "rewards/rejected": -23.48357582092285, "step": 12755 }, { "epoch": 0.4300785331490782, "grad_norm": 35.07164001464844, "learning_rate": 7.032465012327777e-07, "logits/chosen": -1.2698971033096313, "logits/rejected": -1.5194822549819946, "logps/chosen": -2.5975723266601562, "logps/rejected": -2.8108088970184326, "loss": 2.5388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.975723266601562, "rewards/margins": 2.132366895675659, "rewards/rejected": -28.10808753967285, "step": 12760 }, { "epoch": 0.43024705922006135, "grad_norm": 36.83730697631836, "learning_rate": 7.029777288732516e-07, "logits/chosen": -0.8789815902709961, "logits/rejected": -1.0874645709991455, "logps/chosen": -1.9733607769012451, "logps/rejected": -2.3463854789733887, "loss": 2.2044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.73360824584961, "rewards/margins": 3.730245590209961, "rewards/rejected": -23.46385383605957, "step": 12765 }, { "epoch": 0.4304155852910445, "grad_norm": 13.495026588439941, "learning_rate": 7.027088862711305e-07, "logits/chosen": -0.9406919479370117, "logits/rejected": -1.2175328731536865, "logps/chosen": -2.1586251258850098, "logps/rejected": -2.4604978561401367, "loss": 2.6017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.586252212524414, "rewards/margins": 3.0187273025512695, "rewards/rejected": -24.60498046875, "step": 12770 }, { "epoch": 0.43058411136202773, "grad_norm": 24.654760360717773, "learning_rate": 7.024399735194503e-07, "logits/chosen": -1.667497992515564, "logits/rejected": -1.6784369945526123, "logps/chosen": -2.176253080368042, "logps/rejected": -2.3713762760162354, "loss": 2.694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.762531280517578, "rewards/margins": 1.9512317180633545, "rewards/rejected": -23.713764190673828, "step": 12775 }, { "epoch": 0.4307526374330109, "grad_norm": 26.69776153564453, "learning_rate": 7.021709907112711e-07, "logits/chosen": -1.6615867614746094, "logits/rejected": -1.6004226207733154, "logps/chosen": -1.9908841848373413, "logps/rejected": -2.079436779022217, "loss": 3.5278, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.90884017944336, "rewards/margins": 0.8855279684066772, "rewards/rejected": -20.794368743896484, "step": 12780 }, { "epoch": 0.43092116350399406, "grad_norm": 23.461130142211914, "learning_rate": 7.019019379396772e-07, "logits/chosen": -1.2154319286346436, "logits/rejected": -1.3082120418548584, "logps/chosen": -2.370018482208252, "logps/rejected": -2.3778629302978516, "loss": 3.5654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.7001895904541, "rewards/margins": 0.07844285666942596, "rewards/rejected": -23.77863121032715, "step": 12785 }, { "epoch": 0.43108968957497723, "grad_norm": 6.32010505796643e-06, "learning_rate": 7.016328152977773e-07, "logits/chosen": -1.21359384059906, "logits/rejected": -1.5789474248886108, "logps/chosen": -2.312983989715576, "logps/rejected": -2.9869911670684814, "loss": 1.6659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.129838943481445, "rewards/margins": 6.74007511138916, "rewards/rejected": -29.869909286499023, "step": 12790 }, { "epoch": 0.43125821564596045, "grad_norm": 23.659255981445312, "learning_rate": 7.01363622878704e-07, "logits/chosen": -0.9882528185844421, "logits/rejected": -1.1198049783706665, "logps/chosen": -1.950643539428711, "logps/rejected": -1.860714316368103, "loss": 4.0003, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.50643539428711, "rewards/margins": -0.8992928266525269, "rewards/rejected": -18.60714340209961, "step": 12795 }, { "epoch": 0.4314267417169436, "grad_norm": 29.752731323242188, "learning_rate": 7.010943607756142e-07, "logits/chosen": -0.6915519833564758, "logits/rejected": -0.7919496297836304, "logps/chosen": -2.217557430267334, "logps/rejected": -2.159320831298828, "loss": 4.1181, "rewards/accuracies": 0.5, "rewards/chosen": -22.175573348999023, "rewards/margins": -0.5823682546615601, "rewards/rejected": -21.593204498291016, "step": 12800 }, { "epoch": 0.4314267417169436, "eval_logits/chosen": -1.581845760345459, "eval_logits/rejected": -1.6885778903961182, "eval_logps/chosen": -1.9620436429977417, "eval_logps/rejected": -2.061039447784424, "eval_loss": 3.008579969406128, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -19.62043571472168, "eval_rewards/margins": 0.9899570345878601, "eval_rewards/rejected": -20.610393524169922, "eval_runtime": 12.918, "eval_samples_per_second": 7.741, "eval_steps_per_second": 1.935, "step": 12800 }, { "epoch": 0.4315952677879268, "grad_norm": 38.067996978759766, "learning_rate": 7.008250290816888e-07, "logits/chosen": -1.0375231504440308, "logits/rejected": -1.0609022378921509, "logps/chosen": -2.1347129344940186, "logps/rejected": -2.05728816986084, "loss": 4.267, "rewards/accuracies": 0.5, "rewards/chosen": -21.34712791442871, "rewards/margins": -0.774248480796814, "rewards/rejected": -20.5728816986084, "step": 12805 }, { "epoch": 0.43176379385890995, "grad_norm": 59.478187561035156, "learning_rate": 7.005556278901334e-07, "logits/chosen": -1.0637800693511963, "logits/rejected": -1.1035311222076416, "logps/chosen": -2.100837469100952, "logps/rejected": -2.370177745819092, "loss": 2.2027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.00837516784668, "rewards/margins": 2.6934049129486084, "rewards/rejected": -23.701780319213867, "step": 12810 }, { "epoch": 0.43193231992989317, "grad_norm": 30.48736572265625, "learning_rate": 7.002861572941764e-07, "logits/chosen": -1.3173472881317139, "logits/rejected": -1.440666913986206, "logps/chosen": -1.78774094581604, "logps/rejected": -1.9231376647949219, "loss": 3.0321, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.87740707397461, "rewards/margins": 1.3539693355560303, "rewards/rejected": -19.23137664794922, "step": 12815 }, { "epoch": 0.43210084600087634, "grad_norm": 21.779550552368164, "learning_rate": 7.000166173870715e-07, "logits/chosen": -1.1645904779434204, "logits/rejected": -0.9815500974655151, "logps/chosen": -2.5551183223724365, "logps/rejected": -2.453338861465454, "loss": 4.3826, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.551183700561523, "rewards/margins": -1.017795443534851, "rewards/rejected": -24.53338623046875, "step": 12820 }, { "epoch": 0.4322693720718595, "grad_norm": 140.74278259277344, "learning_rate": 6.997470082620955e-07, "logits/chosen": -1.717813491821289, "logits/rejected": -1.7180192470550537, "logps/chosen": -2.6433820724487305, "logps/rejected": -2.638429641723633, "loss": 4.0287, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.433818817138672, "rewards/margins": -0.04952258989214897, "rewards/rejected": -26.384296417236328, "step": 12825 }, { "epoch": 0.4324378981428427, "grad_norm": 59.987083435058594, "learning_rate": 6.994773300125498e-07, "logits/chosen": -0.9183546304702759, "logits/rejected": -0.7480214834213257, "logps/chosen": -3.556931257247925, "logps/rejected": -3.59765625, "loss": 6.2959, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -35.569313049316406, "rewards/margins": 0.40724772214889526, "rewards/rejected": -35.976558685302734, "step": 12830 }, { "epoch": 0.4326064242138259, "grad_norm": 16.872873306274414, "learning_rate": 6.992075827317593e-07, "logits/chosen": -1.4935461282730103, "logits/rejected": -1.6690353155136108, "logps/chosen": -2.410545825958252, "logps/rejected": -2.426908493041992, "loss": 4.5857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.105457305908203, "rewards/margins": 0.16362905502319336, "rewards/rejected": -24.269084930419922, "step": 12835 }, { "epoch": 0.43277495028480906, "grad_norm": 16.417036056518555, "learning_rate": 6.989377665130727e-07, "logits/chosen": -1.4650869369506836, "logits/rejected": -1.4241197109222412, "logps/chosen": -2.010990619659424, "logps/rejected": -2.867649555206299, "loss": 2.4715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.109905242919922, "rewards/margins": 8.566590309143066, "rewards/rejected": -28.676494598388672, "step": 12840 }, { "epoch": 0.4329434763557922, "grad_norm": 151.86782836914062, "learning_rate": 6.986678814498633e-07, "logits/chosen": -0.7954455614089966, "logits/rejected": -0.8803497552871704, "logps/chosen": -2.5972073078155518, "logps/rejected": -2.9544386863708496, "loss": 1.3978, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.972070693969727, "rewards/margins": 3.5723164081573486, "rewards/rejected": -29.544384002685547, "step": 12845 }, { "epoch": 0.43311200242677544, "grad_norm": 24.465486526489258, "learning_rate": 6.98397927635527e-07, "logits/chosen": -1.0608322620391846, "logits/rejected": -1.2614036798477173, "logps/chosen": -2.178588390350342, "logps/rejected": -2.1134209632873535, "loss": 3.7257, "rewards/accuracies": 0.5, "rewards/chosen": -21.7858829498291, "rewards/margins": -0.6516709327697754, "rewards/rejected": -21.13421058654785, "step": 12850 }, { "epoch": 0.4332805284977586, "grad_norm": 71.80946350097656, "learning_rate": 6.981279051634845e-07, "logits/chosen": -1.386091947555542, "logits/rejected": -1.5041242837905884, "logps/chosen": -2.2846813201904297, "logps/rejected": -2.337955951690674, "loss": 3.1754, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.846813201904297, "rewards/margins": 0.5327471494674683, "rewards/rejected": -23.379558563232422, "step": 12855 }, { "epoch": 0.4334490545687418, "grad_norm": 19.060367584228516, "learning_rate": 6.978578141271802e-07, "logits/chosen": -1.181014895439148, "logits/rejected": -1.188372254371643, "logps/chosen": -2.3186752796173096, "logps/rejected": -2.3536510467529297, "loss": 3.1496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.186752319335938, "rewards/margins": 0.34975796937942505, "rewards/rejected": -23.536510467529297, "step": 12860 }, { "epoch": 0.43361758063972494, "grad_norm": 22.087045669555664, "learning_rate": 6.975876546200815e-07, "logits/chosen": -1.2001193761825562, "logits/rejected": -1.1703085899353027, "logps/chosen": -1.8144724369049072, "logps/rejected": -1.942793607711792, "loss": 3.0515, "rewards/accuracies": 0.5, "rewards/chosen": -18.144723892211914, "rewards/margins": 1.2832123041152954, "rewards/rejected": -19.427936553955078, "step": 12865 }, { "epoch": 0.43378610671070816, "grad_norm": 30.62040901184082, "learning_rate": 6.973174267356804e-07, "logits/chosen": -1.490786075592041, "logits/rejected": -1.7894165515899658, "logps/chosen": -2.078627109527588, "logps/rejected": -2.2299420833587646, "loss": 2.8385, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.786270141601562, "rewards/margins": 1.51315176486969, "rewards/rejected": -22.299423217773438, "step": 12870 }, { "epoch": 0.43395463278169133, "grad_norm": 30.947246551513672, "learning_rate": 6.970471305674917e-07, "logits/chosen": -0.5229056477546692, "logits/rejected": -0.6939235925674438, "logps/chosen": -1.9918756484985352, "logps/rejected": -2.3143186569213867, "loss": 1.3739, "rewards/accuracies": 1.0, "rewards/chosen": -19.91875457763672, "rewards/margins": 3.224431276321411, "rewards/rejected": -23.143184661865234, "step": 12875 }, { "epoch": 0.4341231588526745, "grad_norm": 47.894962310791016, "learning_rate": 6.967767662090546e-07, "logits/chosen": -0.884885311126709, "logits/rejected": -0.9361233711242676, "logps/chosen": -2.259148359298706, "logps/rejected": -2.435331344604492, "loss": 2.0028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.59148597717285, "rewards/margins": 1.7618297338485718, "rewards/rejected": -24.353313446044922, "step": 12880 }, { "epoch": 0.4342916849236577, "grad_norm": 30.30147361755371, "learning_rate": 6.965063337539312e-07, "logits/chosen": -1.1772197484970093, "logits/rejected": -1.2234853506088257, "logps/chosen": -1.6218032836914062, "logps/rejected": -2.1243720054626465, "loss": 1.6635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.218032836914062, "rewards/margins": 5.025688171386719, "rewards/rejected": -21.24372100830078, "step": 12885 }, { "epoch": 0.4344602109946409, "grad_norm": 32.74153137207031, "learning_rate": 6.962358332957078e-07, "logits/chosen": -0.8374770879745483, "logits/rejected": -0.9318382143974304, "logps/chosen": -2.2800254821777344, "logps/rejected": -2.363006591796875, "loss": 2.8854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.800256729125977, "rewards/margins": 0.829812228679657, "rewards/rejected": -23.630067825317383, "step": 12890 }, { "epoch": 0.43462873706562405, "grad_norm": 17.24728775024414, "learning_rate": 6.959652649279941e-07, "logits/chosen": -1.3443167209625244, "logits/rejected": -1.3858791589736938, "logps/chosen": -2.0147061347961426, "logps/rejected": -2.057424306869507, "loss": 3.3514, "rewards/accuracies": 0.5, "rewards/chosen": -20.147062301635742, "rewards/margins": 0.4271821081638336, "rewards/rejected": -20.574243545532227, "step": 12895 }, { "epoch": 0.4347972631366072, "grad_norm": 126.15062713623047, "learning_rate": 6.956946287444227e-07, "logits/chosen": -1.0633189678192139, "logits/rejected": -1.052336573600769, "logps/chosen": -2.6452765464782715, "logps/rejected": -2.6808838844299316, "loss": 3.7942, "rewards/accuracies": 0.5, "rewards/chosen": -26.4527645111084, "rewards/margins": 0.3560709059238434, "rewards/rejected": -26.808834075927734, "step": 12900 }, { "epoch": 0.43496578920759044, "grad_norm": 19.88990020751953, "learning_rate": 6.954239248386504e-07, "logits/chosen": -1.5321104526519775, "logits/rejected": -1.4127600193023682, "logps/chosen": -1.8655850887298584, "logps/rejected": -2.0147814750671387, "loss": 2.5387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.65584945678711, "rewards/margins": 1.4919636249542236, "rewards/rejected": -20.147815704345703, "step": 12905 }, { "epoch": 0.4351343152785736, "grad_norm": 84.38320922851562, "learning_rate": 6.951531533043572e-07, "logits/chosen": -0.6111572980880737, "logits/rejected": -0.744185745716095, "logps/chosen": -2.453381061553955, "logps/rejected": -3.282031536102295, "loss": 1.5609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.533809661865234, "rewards/margins": 8.286506652832031, "rewards/rejected": -32.820316314697266, "step": 12910 }, { "epoch": 0.43530284134955677, "grad_norm": 53.10908889770508, "learning_rate": 6.948823142352463e-07, "logits/chosen": -1.4643628597259521, "logits/rejected": -1.636060118675232, "logps/chosen": -1.7974869012832642, "logps/rejected": -1.7600347995758057, "loss": 4.2903, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -17.974870681762695, "rewards/margins": -0.3745214343070984, "rewards/rejected": -17.6003475189209, "step": 12915 }, { "epoch": 0.43547136742053993, "grad_norm": 28.004077911376953, "learning_rate": 6.946114077250445e-07, "logits/chosen": -1.2066113948822021, "logits/rejected": -1.1940003633499146, "logps/chosen": -2.1136012077331543, "logps/rejected": -2.303692579269409, "loss": 2.6832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.13601303100586, "rewards/margins": 1.9009119272232056, "rewards/rejected": -23.03692626953125, "step": 12920 }, { "epoch": 0.43563989349152316, "grad_norm": 26.950162887573242, "learning_rate": 6.943404338675018e-07, "logits/chosen": -0.9007886648178101, "logits/rejected": -0.9533143043518066, "logps/chosen": -2.1714673042297363, "logps/rejected": -2.1655383110046387, "loss": 3.6921, "rewards/accuracies": 0.5, "rewards/chosen": -21.714672088623047, "rewards/margins": -0.05929117277264595, "rewards/rejected": -21.655384063720703, "step": 12925 }, { "epoch": 0.4358084195625063, "grad_norm": 25.70941925048828, "learning_rate": 6.940693927563918e-07, "logits/chosen": -1.064649224281311, "logits/rejected": -1.015718698501587, "logps/chosen": -2.6266140937805176, "logps/rejected": -2.615485668182373, "loss": 4.4821, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.26613998413086, "rewards/margins": -0.11128310859203339, "rewards/rejected": -26.154857635498047, "step": 12930 }, { "epoch": 0.4359769456334895, "grad_norm": 26.513484954833984, "learning_rate": 6.937982844855109e-07, "logits/chosen": -0.5383487343788147, "logits/rejected": -0.8462270498275757, "logps/chosen": -2.0189685821533203, "logps/rejected": -2.3757314682006836, "loss": 2.4246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.189685821533203, "rewards/margins": 3.5676300525665283, "rewards/rejected": -23.757314682006836, "step": 12935 }, { "epoch": 0.4361454717044727, "grad_norm": 20.81569480895996, "learning_rate": 6.935271091486791e-07, "logits/chosen": -1.6532917022705078, "logits/rejected": -1.7369930744171143, "logps/chosen": -2.276045560836792, "logps/rejected": -2.694082736968994, "loss": 2.3589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.760456085205078, "rewards/margins": 4.180371284484863, "rewards/rejected": -26.940826416015625, "step": 12940 }, { "epoch": 0.4363139977754559, "grad_norm": 26.31812286376953, "learning_rate": 6.932558668397395e-07, "logits/chosen": -1.0625091791152954, "logits/rejected": -1.0464208126068115, "logps/chosen": -1.7008140087127686, "logps/rejected": -1.7959041595458984, "loss": 2.5549, "rewards/accuracies": 0.5, "rewards/chosen": -17.008142471313477, "rewards/margins": 0.9508990049362183, "rewards/rejected": -17.959041595458984, "step": 12945 }, { "epoch": 0.43648252384643904, "grad_norm": 38.73316192626953, "learning_rate": 6.929845576525584e-07, "logits/chosen": -1.0894930362701416, "logits/rejected": -1.0746827125549316, "logps/chosen": -2.072542190551758, "logps/rejected": -2.346071720123291, "loss": 1.8593, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.725421905517578, "rewards/margins": 2.7352960109710693, "rewards/rejected": -23.460718154907227, "step": 12950 }, { "epoch": 0.4366510499174222, "grad_norm": 64.09503173828125, "learning_rate": 6.927131816810251e-07, "logits/chosen": -1.3247054815292358, "logits/rejected": -1.6320841312408447, "logps/chosen": -2.17695951461792, "logps/rejected": -2.305966377258301, "loss": 3.0175, "rewards/accuracies": 0.5, "rewards/chosen": -21.769596099853516, "rewards/margins": 1.2900705337524414, "rewards/rejected": -23.05966567993164, "step": 12955 }, { "epoch": 0.43681957598840543, "grad_norm": 30.350339889526367, "learning_rate": 6.924417390190522e-07, "logits/chosen": -0.9131641387939453, "logits/rejected": -0.9161790013313293, "logps/chosen": -2.3749337196350098, "logps/rejected": -2.497476100921631, "loss": 3.0965, "rewards/accuracies": 0.5, "rewards/chosen": -23.749340057373047, "rewards/margins": 1.22542142868042, "rewards/rejected": -24.974760055541992, "step": 12960 }, { "epoch": 0.4369881020593886, "grad_norm": 25.28206443786621, "learning_rate": 6.921702297605755e-07, "logits/chosen": -0.969762921333313, "logits/rejected": -1.2061833143234253, "logps/chosen": -1.714321494102478, "logps/rejected": -2.0063693523406982, "loss": 1.6666, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.14321517944336, "rewards/margins": 2.9204788208007812, "rewards/rejected": -20.06369400024414, "step": 12965 }, { "epoch": 0.43715662813037176, "grad_norm": 88.055419921875, "learning_rate": 6.918986539995533e-07, "logits/chosen": -0.8369825482368469, "logits/rejected": -0.7208142876625061, "logps/chosen": -2.2105679512023926, "logps/rejected": -2.1658036708831787, "loss": 4.0844, "rewards/accuracies": 0.5, "rewards/chosen": -22.10567855834961, "rewards/margins": -0.4476422667503357, "rewards/rejected": -21.658037185668945, "step": 12970 }, { "epoch": 0.4373251542013549, "grad_norm": 27.194414138793945, "learning_rate": 6.916270118299677e-07, "logits/chosen": -1.2619565725326538, "logits/rejected": -1.45753014087677, "logps/chosen": -1.8999649286270142, "logps/rejected": -1.9333438873291016, "loss": 3.1134, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.999652862548828, "rewards/margins": 0.3337882161140442, "rewards/rejected": -19.333438873291016, "step": 12975 }, { "epoch": 0.43749368027233815, "grad_norm": 17.010786056518555, "learning_rate": 6.913553033458228e-07, "logits/chosen": -1.2112401723861694, "logits/rejected": -1.004233717918396, "logps/chosen": -1.9769935607910156, "logps/rejected": -1.8977857828140259, "loss": 4.0243, "rewards/accuracies": 0.5, "rewards/chosen": -19.769933700561523, "rewards/margins": -0.7920783758163452, "rewards/rejected": -18.977855682373047, "step": 12980 }, { "epoch": 0.4376622063433213, "grad_norm": 41.05767822265625, "learning_rate": 6.910835286411466e-07, "logits/chosen": -1.159142255783081, "logits/rejected": -1.3187485933303833, "logps/chosen": -2.0043880939483643, "logps/rejected": -2.213470220565796, "loss": 2.8533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.043880462646484, "rewards/margins": 2.090820789337158, "rewards/rejected": -22.134702682495117, "step": 12985 }, { "epoch": 0.4378307324143045, "grad_norm": 42.45212936401367, "learning_rate": 6.908116878099894e-07, "logits/chosen": -0.8741549253463745, "logits/rejected": -0.9256917238235474, "logps/chosen": -1.7342628240585327, "logps/rejected": -1.8204578161239624, "loss": 2.5843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.34263038635254, "rewards/margins": 0.8619493246078491, "rewards/rejected": -18.204578399658203, "step": 12990 }, { "epoch": 0.4379992584852877, "grad_norm": 25.65916633605957, "learning_rate": 6.90539780946425e-07, "logits/chosen": -1.1083229780197144, "logits/rejected": -1.4796850681304932, "logps/chosen": -2.369807004928589, "logps/rejected": -2.748922824859619, "loss": 2.4815, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.698070526123047, "rewards/margins": 3.7911579608917236, "rewards/rejected": -27.489227294921875, "step": 12995 }, { "epoch": 0.43816778455627087, "grad_norm": 46.00324249267578, "learning_rate": 6.902678081445494e-07, "logits/chosen": -1.6248180866241455, "logits/rejected": -1.7508703470230103, "logps/chosen": -2.150437831878662, "logps/rejected": -2.212613344192505, "loss": 3.3764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.504375457763672, "rewards/margins": 0.6217565536499023, "rewards/rejected": -22.12613296508789, "step": 13000 }, { "epoch": 0.43833631062725403, "grad_norm": 29.252960205078125, "learning_rate": 6.899957694984815e-07, "logits/chosen": -1.0204622745513916, "logits/rejected": -1.358551025390625, "logps/chosen": -1.7499244213104248, "logps/rejected": -2.167311668395996, "loss": 1.7655, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.49924659729004, "rewards/margins": 4.173872947692871, "rewards/rejected": -21.673118591308594, "step": 13005 }, { "epoch": 0.4385048366982372, "grad_norm": 21.64922332763672, "learning_rate": 6.897236651023633e-07, "logits/chosen": -1.1953816413879395, "logits/rejected": -1.064687967300415, "logps/chosen": -2.1950154304504395, "logps/rejected": -2.2868361473083496, "loss": 3.2271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.95015525817871, "rewards/margins": 0.9182068705558777, "rewards/rejected": -22.868358612060547, "step": 13010 }, { "epoch": 0.4386733627692204, "grad_norm": 8.935832977294922, "learning_rate": 6.894514950503594e-07, "logits/chosen": -1.1418415307998657, "logits/rejected": -1.4403326511383057, "logps/chosen": -1.8407323360443115, "logps/rejected": -2.124626874923706, "loss": 2.3638, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.407323837280273, "rewards/margins": 2.8389463424682617, "rewards/rejected": -21.24627113342285, "step": 13015 }, { "epoch": 0.4388418888402036, "grad_norm": 13.57795524597168, "learning_rate": 6.891792594366573e-07, "logits/chosen": -1.514499545097351, "logits/rejected": -1.5799944400787354, "logps/chosen": -1.9179500341415405, "logps/rejected": -2.3520543575286865, "loss": 2.3665, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.179500579833984, "rewards/margins": 4.341042995452881, "rewards/rejected": -23.520544052124023, "step": 13020 }, { "epoch": 0.43901041491118675, "grad_norm": 27.935443878173828, "learning_rate": 6.889069583554667e-07, "logits/chosen": -1.4357717037200928, "logits/rejected": -1.4973537921905518, "logps/chosen": -2.2571046352386475, "logps/rejected": -2.332789659500122, "loss": 2.6409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.571048736572266, "rewards/margins": 0.7568472623825073, "rewards/rejected": -23.32789421081543, "step": 13025 }, { "epoch": 0.4391789409821699, "grad_norm": 21.917020797729492, "learning_rate": 6.886345919010207e-07, "logits/chosen": -1.1875712871551514, "logits/rejected": -1.2291758060455322, "logps/chosen": -2.2137908935546875, "logps/rejected": -2.2709903717041016, "loss": 3.0548, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.137910842895508, "rewards/margins": 0.5719939470291138, "rewards/rejected": -22.709903717041016, "step": 13030 }, { "epoch": 0.43934746705315314, "grad_norm": 15.354704856872559, "learning_rate": 6.883621601675743e-07, "logits/chosen": -1.095472812652588, "logits/rejected": -1.132777452468872, "logps/chosen": -2.2099857330322266, "logps/rejected": -2.343184471130371, "loss": 2.4734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.099857330322266, "rewards/margins": 1.3319861888885498, "rewards/rejected": -23.431842803955078, "step": 13035 }, { "epoch": 0.4395159931241363, "grad_norm": 27.11628532409668, "learning_rate": 6.880896632494052e-07, "logits/chosen": -0.7230058908462524, "logits/rejected": -0.9955340623855591, "logps/chosen": -3.2007107734680176, "logps/rejected": -3.135807514190674, "loss": 4.1483, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -32.00710678100586, "rewards/margins": -0.6490316390991211, "rewards/rejected": -31.358074188232422, "step": 13040 }, { "epoch": 0.43968451919511947, "grad_norm": 17.648286819458008, "learning_rate": 6.878171012408143e-07, "logits/chosen": -1.3936727046966553, "logits/rejected": -1.5251567363739014, "logps/chosen": -1.7271394729614258, "logps/rejected": -1.8262546062469482, "loss": 2.3641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.271394729614258, "rewards/margins": 0.9911512136459351, "rewards/rejected": -18.26254653930664, "step": 13045 }, { "epoch": 0.4398530452661027, "grad_norm": 21.880645751953125, "learning_rate": 6.875444742361243e-07, "logits/chosen": -0.9139540791511536, "logits/rejected": -1.0724513530731201, "logps/chosen": -1.8850370645523071, "logps/rejected": -2.1353516578674316, "loss": 3.3787, "rewards/accuracies": 0.5, "rewards/chosen": -18.85036849975586, "rewards/margins": 2.503145933151245, "rewards/rejected": -21.353517532348633, "step": 13050 }, { "epoch": 0.44002157133708586, "grad_norm": 15.43720531463623, "learning_rate": 6.872717823296806e-07, "logits/chosen": -1.1720114946365356, "logits/rejected": -1.1585838794708252, "logps/chosen": -2.18475341796875, "logps/rejected": -2.2911295890808105, "loss": 3.6322, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.847530364990234, "rewards/margins": 1.0637617111206055, "rewards/rejected": -22.91129493713379, "step": 13055 }, { "epoch": 0.440190097408069, "grad_norm": 25.999244689941406, "learning_rate": 6.869990256158513e-07, "logits/chosen": -1.3059625625610352, "logits/rejected": -1.1716262102127075, "logps/chosen": -2.208430051803589, "logps/rejected": -1.9477787017822266, "loss": 5.7393, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.084300994873047, "rewards/margins": -2.6065142154693604, "rewards/rejected": -19.477787017822266, "step": 13060 }, { "epoch": 0.4403586234790522, "grad_norm": 93.29350280761719, "learning_rate": 6.867262041890267e-07, "logits/chosen": -1.0793451070785522, "logits/rejected": -1.2775110006332397, "logps/chosen": -2.104738712310791, "logps/rejected": -2.279259443283081, "loss": 2.4399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.04738426208496, "rewards/margins": 1.7452077865600586, "rewards/rejected": -22.792593002319336, "step": 13065 }, { "epoch": 0.4405271495500354, "grad_norm": 32.7735710144043, "learning_rate": 6.864533181436197e-07, "logits/chosen": -1.1295160055160522, "logits/rejected": -1.2696306705474854, "logps/chosen": -1.9000717401504517, "logps/rejected": -2.0310187339782715, "loss": 2.4621, "rewards/accuracies": 0.5, "rewards/chosen": -19.000717163085938, "rewards/margins": 1.3094708919525146, "rewards/rejected": -20.3101863861084, "step": 13070 }, { "epoch": 0.4406956756210186, "grad_norm": 115.73983001708984, "learning_rate": 6.861803675740652e-07, "logits/chosen": -1.2367197275161743, "logits/rejected": -1.3083772659301758, "logps/chosen": -2.917975664138794, "logps/rejected": -2.668013572692871, "loss": 5.6267, "rewards/accuracies": 0.5, "rewards/chosen": -29.17975425720215, "rewards/margins": -2.4996211528778076, "rewards/rejected": -26.680133819580078, "step": 13075 }, { "epoch": 0.44086420169200174, "grad_norm": 25.333463668823242, "learning_rate": 6.859073525748207e-07, "logits/chosen": -1.2527525424957275, "logits/rejected": -1.2299823760986328, "logps/chosen": -2.4775257110595703, "logps/rejected": -2.5582261085510254, "loss": 2.7355, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.775257110595703, "rewards/margins": 0.807004451751709, "rewards/rejected": -25.582263946533203, "step": 13080 }, { "epoch": 0.4410327277629849, "grad_norm": 20.128324508666992, "learning_rate": 6.856342732403658e-07, "logits/chosen": -1.1662752628326416, "logits/rejected": -1.07258141040802, "logps/chosen": -1.9656169414520264, "logps/rejected": -2.0915415287017822, "loss": 3.361, "rewards/accuracies": 0.5, "rewards/chosen": -19.656169891357422, "rewards/margins": 1.2592443227767944, "rewards/rejected": -20.915414810180664, "step": 13085 }, { "epoch": 0.44120125383396813, "grad_norm": 29.223894119262695, "learning_rate": 6.853611296652028e-07, "logits/chosen": -1.3792330026626587, "logits/rejected": -1.6818948984146118, "logps/chosen": -1.9177711009979248, "logps/rejected": -2.044464588165283, "loss": 2.8086, "rewards/accuracies": 0.5, "rewards/chosen": -19.177711486816406, "rewards/margins": 1.2669343948364258, "rewards/rejected": -20.44464683532715, "step": 13090 }, { "epoch": 0.4413697799049513, "grad_norm": 32.580604553222656, "learning_rate": 6.850879219438555e-07, "logits/chosen": -1.114403247833252, "logits/rejected": -1.0609710216522217, "logps/chosen": -1.9394359588623047, "logps/rejected": -1.9422838687896729, "loss": 3.1338, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.394359588623047, "rewards/margins": 0.028481101617217064, "rewards/rejected": -19.422840118408203, "step": 13095 }, { "epoch": 0.44153830597593446, "grad_norm": 85.80226135253906, "learning_rate": 6.848146501708709e-07, "logits/chosen": -0.9836652874946594, "logits/rejected": -1.0147119760513306, "logps/chosen": -2.1077113151550293, "logps/rejected": -2.0701069831848145, "loss": 3.5635, "rewards/accuracies": 0.5, "rewards/chosen": -21.077116012573242, "rewards/margins": -0.3760454058647156, "rewards/rejected": -20.70107078552246, "step": 13100 }, { "epoch": 0.4417068320469177, "grad_norm": 25.502391815185547, "learning_rate": 6.845413144408172e-07, "logits/chosen": -1.5241189002990723, "logits/rejected": -1.780220627784729, "logps/chosen": -2.1454973220825195, "logps/rejected": -2.860766887664795, "loss": 1.6927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.454975128173828, "rewards/margins": 7.152696132659912, "rewards/rejected": -28.607669830322266, "step": 13105 }, { "epoch": 0.44187535811790085, "grad_norm": 25.432723999023438, "learning_rate": 6.842679148482851e-07, "logits/chosen": -1.1696789264678955, "logits/rejected": -1.5731089115142822, "logps/chosen": -1.8657867908477783, "logps/rejected": -2.2207283973693848, "loss": 1.8702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.657867431640625, "rewards/margins": 3.5494143962860107, "rewards/rejected": -22.20728302001953, "step": 13110 }, { "epoch": 0.442043884188884, "grad_norm": 0.36097314953804016, "learning_rate": 6.839944514878877e-07, "logits/chosen": -1.2934590578079224, "logits/rejected": -1.476231575012207, "logps/chosen": -2.865110397338867, "logps/rejected": -3.2158114910125732, "loss": 2.2426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.65110206604004, "rewards/margins": 3.507014513015747, "rewards/rejected": -32.158119201660156, "step": 13115 }, { "epoch": 0.4422124102598672, "grad_norm": 27.747323989868164, "learning_rate": 6.837209244542595e-07, "logits/chosen": -1.120197057723999, "logits/rejected": -1.235581398010254, "logps/chosen": -1.9799785614013672, "logps/rejected": -2.0634286403656006, "loss": 2.5903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.799787521362305, "rewards/margins": 0.8344995379447937, "rewards/rejected": -20.634286880493164, "step": 13120 }, { "epoch": 0.4423809363308504, "grad_norm": 90.0943374633789, "learning_rate": 6.834473338420579e-07, "logits/chosen": -1.396087884902954, "logits/rejected": -1.4234291315078735, "logps/chosen": -2.2245163917541504, "logps/rejected": -2.019835948944092, "loss": 5.2797, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.245161056518555, "rewards/margins": -2.0468029975891113, "rewards/rejected": -20.198360443115234, "step": 13125 }, { "epoch": 0.44254946240183357, "grad_norm": 117.42842102050781, "learning_rate": 6.831736797459614e-07, "logits/chosen": -1.3987348079681396, "logits/rejected": -1.3095295429229736, "logps/chosen": -2.476482391357422, "logps/rejected": -2.4489948749542236, "loss": 3.766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.76482582092285, "rewards/margins": -0.27487725019454956, "rewards/rejected": -24.48995018005371, "step": 13130 }, { "epoch": 0.44271798847281674, "grad_norm": 28.931501388549805, "learning_rate": 6.828999622606711e-07, "logits/chosen": -0.5741135478019714, "logits/rejected": -0.7597935199737549, "logps/chosen": -1.8390392065048218, "logps/rejected": -1.8604549169540405, "loss": 2.9438, "rewards/accuracies": 0.5, "rewards/chosen": -18.390392303466797, "rewards/margins": 0.2141561508178711, "rewards/rejected": -18.604549407958984, "step": 13135 }, { "epoch": 0.4428865145437999, "grad_norm": 35.9239501953125, "learning_rate": 6.8262618148091e-07, "logits/chosen": -1.060102939605713, "logits/rejected": -1.0475283861160278, "logps/chosen": -1.917798638343811, "logps/rejected": -2.2860186100006104, "loss": 2.7481, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.17798614501953, "rewards/margins": 3.6821999549865723, "rewards/rejected": -22.860183715820312, "step": 13140 }, { "epoch": 0.4430550406147831, "grad_norm": 19.328611373901367, "learning_rate": 6.823523375014226e-07, "logits/chosen": -0.9829468727111816, "logits/rejected": -1.0679337978363037, "logps/chosen": -2.110246419906616, "logps/rejected": -2.191786527633667, "loss": 3.3953, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.102466583251953, "rewards/margins": 0.815396785736084, "rewards/rejected": -21.917861938476562, "step": 13145 }, { "epoch": 0.4432235666857663, "grad_norm": 71.68951416015625, "learning_rate": 6.820784304169756e-07, "logits/chosen": -1.6284596920013428, "logits/rejected": -1.9653995037078857, "logps/chosen": -2.408257007598877, "logps/rejected": -3.189302921295166, "loss": 1.2931, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.082571029663086, "rewards/margins": 7.8104567527771, "rewards/rejected": -31.89302635192871, "step": 13150 }, { "epoch": 0.44339209275674946, "grad_norm": 20.27888298034668, "learning_rate": 6.818044603223574e-07, "logits/chosen": -1.0515351295471191, "logits/rejected": -1.1399530172348022, "logps/chosen": -2.0861306190490723, "logps/rejected": -2.0385451316833496, "loss": 3.8656, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.86130714416504, "rewards/margins": -0.47585612535476685, "rewards/rejected": -20.38545036315918, "step": 13155 }, { "epoch": 0.4435606188277327, "grad_norm": 46.08041763305664, "learning_rate": 6.815304273123783e-07, "logits/chosen": -1.0970168113708496, "logits/rejected": -1.1916046142578125, "logps/chosen": -2.6436848640441895, "logps/rejected": -2.4543864727020264, "loss": 4.9371, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.436847686767578, "rewards/margins": -1.8929836750030518, "rewards/rejected": -24.543865203857422, "step": 13160 }, { "epoch": 0.44372914489871584, "grad_norm": 37.04818344116211, "learning_rate": 6.812563314818703e-07, "logits/chosen": -1.206762671470642, "logits/rejected": -1.246781349182129, "logps/chosen": -1.8855199813842773, "logps/rejected": -2.097358465194702, "loss": 1.7062, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.855199813842773, "rewards/margins": 2.1183857917785645, "rewards/rejected": -20.973587036132812, "step": 13165 }, { "epoch": 0.443897670969699, "grad_norm": 12.917704582214355, "learning_rate": 6.809821729256873e-07, "logits/chosen": -1.279144287109375, "logits/rejected": -1.3216769695281982, "logps/chosen": -1.8531615734100342, "logps/rejected": -2.089557409286499, "loss": 1.7713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.5316162109375, "rewards/margins": 2.3639566898345947, "rewards/rejected": -20.895572662353516, "step": 13170 }, { "epoch": 0.4440661970406822, "grad_norm": 29.798051834106445, "learning_rate": 6.807079517387046e-07, "logits/chosen": -1.0649659633636475, "logits/rejected": -1.3647143840789795, "logps/chosen": -1.8949416875839233, "logps/rejected": -2.111315965652466, "loss": 2.3844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.949417114257812, "rewards/margins": 2.163743257522583, "rewards/rejected": -21.1131591796875, "step": 13175 }, { "epoch": 0.4442347231116654, "grad_norm": 20.287752151489258, "learning_rate": 6.804336680158192e-07, "logits/chosen": -1.6447932720184326, "logits/rejected": -2.0392098426818848, "logps/chosen": -1.7290363311767578, "logps/rejected": -2.252883195877075, "loss": 1.5029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.290363311767578, "rewards/margins": 5.238468170166016, "rewards/rejected": -22.528831481933594, "step": 13180 }, { "epoch": 0.44440324918264856, "grad_norm": 36.967166900634766, "learning_rate": 6.801593218519504e-07, "logits/chosen": -0.7964226007461548, "logits/rejected": -0.8987895250320435, "logps/chosen": -2.4772469997406006, "logps/rejected": -2.6278481483459473, "loss": 3.3181, "rewards/accuracies": 0.5, "rewards/chosen": -24.772472381591797, "rewards/margins": 1.506009817123413, "rewards/rejected": -26.27848243713379, "step": 13185 }, { "epoch": 0.44457177525363173, "grad_norm": 8.808879852294922, "learning_rate": 6.798849133420381e-07, "logits/chosen": -1.4141619205474854, "logits/rejected": -1.4586751461029053, "logps/chosen": -1.8616573810577393, "logps/rejected": -2.29601788520813, "loss": 1.1475, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.616573333740234, "rewards/margins": 4.34360408782959, "rewards/rejected": -22.96017837524414, "step": 13190 }, { "epoch": 0.4447403013246149, "grad_norm": 42.99454879760742, "learning_rate": 6.796104425810447e-07, "logits/chosen": -1.2810221910476685, "logits/rejected": -1.4269440174102783, "logps/chosen": -2.0660388469696045, "logps/rejected": -2.1136868000030518, "loss": 2.8976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.660388946533203, "rewards/margins": 0.4764803946018219, "rewards/rejected": -21.13686752319336, "step": 13195 }, { "epoch": 0.4449088273955981, "grad_norm": 28.964792251586914, "learning_rate": 6.793359096639533e-07, "logits/chosen": -0.6905937194824219, "logits/rejected": -0.8187843561172485, "logps/chosen": -1.9930311441421509, "logps/rejected": -2.2221527099609375, "loss": 1.6647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.930309295654297, "rewards/margins": 2.291217565536499, "rewards/rejected": -22.221527099609375, "step": 13200 }, { "epoch": 0.4449088273955981, "eval_logits/chosen": -1.6181302070617676, "eval_logits/rejected": -1.7306538820266724, "eval_logps/chosen": -1.9777320623397827, "eval_logps/rejected": -2.079491376876831, "eval_loss": 3.0125772953033447, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -19.77732276916504, "eval_rewards/margins": 1.0175917148590088, "eval_rewards/rejected": -20.79491424560547, "eval_runtime": 12.9112, "eval_samples_per_second": 7.745, "eval_steps_per_second": 1.936, "step": 13200 }, { "epoch": 0.4450773534665813, "grad_norm": 30.03069305419922, "learning_rate": 6.790613146857691e-07, "logits/chosen": -1.3091100454330444, "logits/rejected": -1.6376367807388306, "logps/chosen": -2.148491621017456, "logps/rejected": -2.295868396759033, "loss": 3.0759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.484914779663086, "rewards/margins": 1.473767876625061, "rewards/rejected": -22.958683013916016, "step": 13205 }, { "epoch": 0.44524587953756445, "grad_norm": 137.2454071044922, "learning_rate": 6.787866577415186e-07, "logits/chosen": -1.1861579418182373, "logits/rejected": -1.4750853776931763, "logps/chosen": -2.2643580436706543, "logps/rejected": -2.4095728397369385, "loss": 2.4487, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.64358139038086, "rewards/margins": 1.4521455764770508, "rewards/rejected": -24.09572982788086, "step": 13210 }, { "epoch": 0.44541440560854767, "grad_norm": 195.74488830566406, "learning_rate": 6.7851193892625e-07, "logits/chosen": -1.5892274379730225, "logits/rejected": -1.4221677780151367, "logps/chosen": -2.283761501312256, "logps/rejected": -2.062800168991089, "loss": 5.2939, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.837615966796875, "rewards/margins": -2.2096126079559326, "rewards/rejected": -20.628002166748047, "step": 13215 }, { "epoch": 0.44558293167953084, "grad_norm": 31.29680633544922, "learning_rate": 6.782371583350323e-07, "logits/chosen": -0.9922895431518555, "logits/rejected": -1.0052015781402588, "logps/chosen": -1.8493363857269287, "logps/rejected": -1.829101324081421, "loss": 4.1612, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.493366241455078, "rewards/margins": -0.2023508995771408, "rewards/rejected": -18.291013717651367, "step": 13220 }, { "epoch": 0.445751457750514, "grad_norm": 41.823753356933594, "learning_rate": 6.779623160629563e-07, "logits/chosen": -1.229884386062622, "logits/rejected": -1.6847175359725952, "logps/chosen": -1.9959089756011963, "logps/rejected": -2.059077501296997, "loss": 3.4638, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.959091186523438, "rewards/margins": 0.6316847801208496, "rewards/rejected": -20.590774536132812, "step": 13225 }, { "epoch": 0.44591998382149717, "grad_norm": 27.8101863861084, "learning_rate": 6.776874122051343e-07, "logits/chosen": -1.2641267776489258, "logits/rejected": -1.4562067985534668, "logps/chosen": -2.0732169151306152, "logps/rejected": -2.3724536895751953, "loss": 2.2068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.732168197631836, "rewards/margins": 2.9923672676086426, "rewards/rejected": -23.724536895751953, "step": 13230 }, { "epoch": 0.4460885098924804, "grad_norm": 14.324053764343262, "learning_rate": 6.774124468566994e-07, "logits/chosen": -1.0541682243347168, "logits/rejected": -1.083929181098938, "logps/chosen": -1.5968945026397705, "logps/rejected": -1.8938548564910889, "loss": 1.4582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.968945503234863, "rewards/margins": 2.9696033000946045, "rewards/rejected": -18.938547134399414, "step": 13235 }, { "epoch": 0.44625703596346356, "grad_norm": 23.845109939575195, "learning_rate": 6.771374201128064e-07, "logits/chosen": -1.324191927909851, "logits/rejected": -1.682186484336853, "logps/chosen": -1.9650967121124268, "logps/rejected": -2.437678575515747, "loss": 1.7045, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.65096664428711, "rewards/margins": 4.725818634033203, "rewards/rejected": -24.376785278320312, "step": 13240 }, { "epoch": 0.4464255620344467, "grad_norm": 38.21913528442383, "learning_rate": 6.768623320686315e-07, "logits/chosen": -0.9799416661262512, "logits/rejected": -0.7952501177787781, "logps/chosen": -2.021001100540161, "logps/rejected": -1.8483047485351562, "loss": 4.7785, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.210010528564453, "rewards/margins": -1.7269624471664429, "rewards/rejected": -18.483047485351562, "step": 13245 }, { "epoch": 0.4465940881054299, "grad_norm": 24.604358673095703, "learning_rate": 6.765871828193712e-07, "logits/chosen": -1.4743516445159912, "logits/rejected": -1.4921091794967651, "logps/chosen": -1.9836381673812866, "logps/rejected": -2.2177886962890625, "loss": 2.4211, "rewards/accuracies": 0.5, "rewards/chosen": -19.836380004882812, "rewards/margins": 2.341505289077759, "rewards/rejected": -22.177886962890625, "step": 13250 }, { "epoch": 0.4467626141764131, "grad_norm": 25.316375732421875, "learning_rate": 6.763119724602444e-07, "logits/chosen": -1.157165288925171, "logits/rejected": -1.2132501602172852, "logps/chosen": -2.028961181640625, "logps/rejected": -2.47636079788208, "loss": 1.1852, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.289613723754883, "rewards/margins": 4.473998546600342, "rewards/rejected": -24.763607025146484, "step": 13255 }, { "epoch": 0.4469311402473963, "grad_norm": 65.39524841308594, "learning_rate": 6.760367010864902e-07, "logits/chosen": -1.1245633363723755, "logits/rejected": -1.1890214681625366, "logps/chosen": -2.193920135498047, "logps/rejected": -2.1093883514404297, "loss": 4.1426, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.93920135498047, "rewards/margins": -0.8453181385993958, "rewards/rejected": -21.093883514404297, "step": 13260 }, { "epoch": 0.44709966631837944, "grad_norm": 81.17794036865234, "learning_rate": 6.757613687933694e-07, "logits/chosen": -1.566934585571289, "logits/rejected": -1.8760780096054077, "logps/chosen": -2.550309419631958, "logps/rejected": -2.6105148792266846, "loss": 4.4707, "rewards/accuracies": 0.5, "rewards/chosen": -25.503093719482422, "rewards/margins": 0.6020570993423462, "rewards/rejected": -26.105152130126953, "step": 13265 }, { "epoch": 0.44726819238936266, "grad_norm": 16.7962703704834, "learning_rate": 6.754859756761636e-07, "logits/chosen": -0.7093914747238159, "logits/rejected": -1.3773797750473022, "logps/chosen": -2.6696250438690186, "logps/rejected": -2.9403035640716553, "loss": 2.9442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.69624900817871, "rewards/margins": 2.7067861557006836, "rewards/rejected": -29.403034210205078, "step": 13270 }, { "epoch": 0.44743671846034583, "grad_norm": 21.853174209594727, "learning_rate": 6.752105218301756e-07, "logits/chosen": -1.2181885242462158, "logits/rejected": -1.489119291305542, "logps/chosen": -2.325205087661743, "logps/rejected": -2.50209379196167, "loss": 2.7952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.25205421447754, "rewards/margins": 1.7688862085342407, "rewards/rejected": -25.020938873291016, "step": 13275 }, { "epoch": 0.447605244531329, "grad_norm": 23.012371063232422, "learning_rate": 6.749350073507288e-07, "logits/chosen": -1.453213095664978, "logits/rejected": -1.8205163478851318, "logps/chosen": -1.930599570274353, "logps/rejected": -2.1534509658813477, "loss": 2.5051, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.30599594116211, "rewards/margins": 2.2285144329071045, "rewards/rejected": -21.53451156616211, "step": 13280 }, { "epoch": 0.44777377060231216, "grad_norm": 29.789447784423828, "learning_rate": 6.746594323331681e-07, "logits/chosen": -1.418830394744873, "logits/rejected": -1.1861220598220825, "logps/chosen": -2.4347853660583496, "logps/rejected": -2.4367117881774902, "loss": 3.1936, "rewards/accuracies": 0.5, "rewards/chosen": -24.34785270690918, "rewards/margins": 0.019265126436948776, "rewards/rejected": -24.367116928100586, "step": 13285 }, { "epoch": 0.4479422966732954, "grad_norm": 12.318198204040527, "learning_rate": 6.743837968728594e-07, "logits/chosen": -1.5645182132720947, "logits/rejected": -1.6600430011749268, "logps/chosen": -1.8471921682357788, "logps/rejected": -1.8008241653442383, "loss": 4.0371, "rewards/accuracies": 0.5, "rewards/chosen": -18.471920013427734, "rewards/margins": -0.46367913484573364, "rewards/rejected": -18.008243560791016, "step": 13290 }, { "epoch": 0.44811082274427855, "grad_norm": 16.357254028320312, "learning_rate": 6.741081010651889e-07, "logits/chosen": -1.1210089921951294, "logits/rejected": -1.0778058767318726, "logps/chosen": -1.6563653945922852, "logps/rejected": -1.7912307977676392, "loss": 2.2908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.56365394592285, "rewards/margins": 1.34865403175354, "rewards/rejected": -17.912307739257812, "step": 13295 }, { "epoch": 0.4482793488152617, "grad_norm": 15.02835750579834, "learning_rate": 6.738323450055643e-07, "logits/chosen": -0.8961246609687805, "logits/rejected": -1.0249212980270386, "logps/chosen": -1.69499933719635, "logps/rejected": -2.0932421684265137, "loss": 1.6754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.94999122619629, "rewards/margins": 3.982431411743164, "rewards/rejected": -20.932422637939453, "step": 13300 }, { "epoch": 0.4484478748862449, "grad_norm": 6.8707990646362305, "learning_rate": 6.735565287894138e-07, "logits/chosen": -1.0673387050628662, "logits/rejected": -1.2874033451080322, "logps/chosen": -2.0255348682403564, "logps/rejected": -2.2231597900390625, "loss": 1.8753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.255346298217773, "rewards/margins": 1.9762521982192993, "rewards/rejected": -22.23160171508789, "step": 13305 }, { "epoch": 0.4486164009572281, "grad_norm": 60.034698486328125, "learning_rate": 6.732806525121865e-07, "logits/chosen": -1.4277997016906738, "logits/rejected": -1.6005510091781616, "logps/chosen": -2.0236220359802246, "logps/rejected": -2.0606486797332764, "loss": 3.7093, "rewards/accuracies": 0.5, "rewards/chosen": -20.23621940612793, "rewards/margins": 0.3702692985534668, "rewards/rejected": -20.606489181518555, "step": 13310 }, { "epoch": 0.44878492702821127, "grad_norm": 22.498273849487305, "learning_rate": 6.730047162693524e-07, "logits/chosen": -0.6527112126350403, "logits/rejected": -0.9839221835136414, "logps/chosen": -1.7513904571533203, "logps/rejected": -2.521151304244995, "loss": 1.6066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.513904571533203, "rewards/margins": 7.697610378265381, "rewards/rejected": -25.211515426635742, "step": 13315 }, { "epoch": 0.44895345309919443, "grad_norm": 17.344219207763672, "learning_rate": 6.72728720156402e-07, "logits/chosen": -1.0116724967956543, "logits/rejected": -1.0222210884094238, "logps/chosen": -1.6952499151229858, "logps/rejected": -1.765568494796753, "loss": 2.7574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.952499389648438, "rewards/margins": 0.7031861543655396, "rewards/rejected": -17.655685424804688, "step": 13320 }, { "epoch": 0.4491219791701776, "grad_norm": 209.14768981933594, "learning_rate": 6.724526642688469e-07, "logits/chosen": -0.7147254943847656, "logits/rejected": -0.6004077792167664, "logps/chosen": -2.7620253562927246, "logps/rejected": -2.6541154384613037, "loss": 4.4528, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.620254516601562, "rewards/margins": -1.079100251197815, "rewards/rejected": -26.541156768798828, "step": 13325 }, { "epoch": 0.4492905052411608, "grad_norm": 21.652204513549805, "learning_rate": 6.72176548702219e-07, "logits/chosen": -1.4568471908569336, "logits/rejected": -1.3350114822387695, "logps/chosen": -1.6353132724761963, "logps/rejected": -1.6017754077911377, "loss": 3.6454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.353130340576172, "rewards/margins": -0.33537864685058594, "rewards/rejected": -16.017751693725586, "step": 13330 }, { "epoch": 0.449459031312144, "grad_norm": 20.0999698638916, "learning_rate": 6.719003735520711e-07, "logits/chosen": -1.2092087268829346, "logits/rejected": -1.0637407302856445, "logps/chosen": -2.552417516708374, "logps/rejected": -2.986699342727661, "loss": 1.2213, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.5241756439209, "rewards/margins": 4.3428168296813965, "rewards/rejected": -29.866994857788086, "step": 13335 }, { "epoch": 0.44962755738312715, "grad_norm": 12.592076301574707, "learning_rate": 6.716241389139767e-07, "logits/chosen": -1.3967036008834839, "logits/rejected": -1.6072719097137451, "logps/chosen": -2.0083084106445312, "logps/rejected": -2.2066750526428223, "loss": 2.3997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.083087921142578, "rewards/margins": 1.9836658239364624, "rewards/rejected": -22.066753387451172, "step": 13340 }, { "epoch": 0.4497960834541104, "grad_norm": 25.59599494934082, "learning_rate": 6.713478448835292e-07, "logits/chosen": -1.3335907459259033, "logits/rejected": -1.4632683992385864, "logps/chosen": -1.9616371393203735, "logps/rejected": -2.127497434616089, "loss": 2.5662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.616371154785156, "rewards/margins": 1.6586040258407593, "rewards/rejected": -21.274974822998047, "step": 13345 }, { "epoch": 0.44996460952509354, "grad_norm": 28.009143829345703, "learning_rate": 6.710714915563436e-07, "logits/chosen": -0.913608729839325, "logits/rejected": -0.8590755462646484, "logps/chosen": -2.1531331539154053, "logps/rejected": -2.5827205181121826, "loss": 2.3824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.53133201599121, "rewards/margins": 4.295873165130615, "rewards/rejected": -25.827205657958984, "step": 13350 }, { "epoch": 0.4501331355960767, "grad_norm": 29.852846145629883, "learning_rate": 6.707950790280545e-07, "logits/chosen": -1.193040370941162, "logits/rejected": -1.324516773223877, "logps/chosen": -2.145280599594116, "logps/rejected": -2.0207278728485107, "loss": 4.3902, "rewards/accuracies": 0.5, "rewards/chosen": -21.45280647277832, "rewards/margins": -1.2455263137817383, "rewards/rejected": -20.2072811126709, "step": 13355 }, { "epoch": 0.45030166166705987, "grad_norm": 9.560738563537598, "learning_rate": 6.70518607394318e-07, "logits/chosen": -0.8668038249015808, "logits/rejected": -1.1544456481933594, "logps/chosen": -2.8680739402770996, "logps/rejected": -3.039849042892456, "loss": 3.3841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.680736541748047, "rewards/margins": 1.7177526950836182, "rewards/rejected": -30.39849281311035, "step": 13360 }, { "epoch": 0.4504701877380431, "grad_norm": 28.32439613342285, "learning_rate": 6.702420767508094e-07, "logits/chosen": -1.2721959352493286, "logits/rejected": -1.5142534971237183, "logps/chosen": -2.3183741569519043, "logps/rejected": -2.365943431854248, "loss": 4.5788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.183740615844727, "rewards/margins": 0.47569331526756287, "rewards/rejected": -23.659435272216797, "step": 13365 }, { "epoch": 0.45063871380902626, "grad_norm": 24.397031784057617, "learning_rate": 6.699654871932255e-07, "logits/chosen": -1.5062090158462524, "logits/rejected": -1.5440236330032349, "logps/chosen": -1.8172622919082642, "logps/rejected": -2.0760245323181152, "loss": 1.6939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.172622680664062, "rewards/margins": 2.587623357772827, "rewards/rejected": -20.760244369506836, "step": 13370 }, { "epoch": 0.4508072398800094, "grad_norm": 28.845012664794922, "learning_rate": 6.696888388172827e-07, "logits/chosen": -1.766819715499878, "logits/rejected": -1.8832629919052124, "logps/chosen": -1.7977116107940674, "logps/rejected": -2.1124632358551025, "loss": 2.0223, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.977115631103516, "rewards/margins": 3.147516965866089, "rewards/rejected": -21.124631881713867, "step": 13375 }, { "epoch": 0.4509757659509926, "grad_norm": 35.38254165649414, "learning_rate": 6.694121317187182e-07, "logits/chosen": -1.0718519687652588, "logits/rejected": -1.0068604946136475, "logps/chosen": -2.358485698699951, "logps/rejected": -2.662508010864258, "loss": 4.2874, "rewards/accuracies": 0.5, "rewards/chosen": -23.584857940673828, "rewards/margins": 3.0402235984802246, "rewards/rejected": -26.625080108642578, "step": 13380 }, { "epoch": 0.4511442920219758, "grad_norm": 58.901695251464844, "learning_rate": 6.691353659932895e-07, "logits/chosen": -1.763758659362793, "logits/rejected": -1.3714280128479004, "logps/chosen": -2.5071606636047363, "logps/rejected": -2.5245368480682373, "loss": 2.98, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.071605682373047, "rewards/margins": 0.17376241087913513, "rewards/rejected": -25.245370864868164, "step": 13385 }, { "epoch": 0.451312818092959, "grad_norm": 41.93807601928711, "learning_rate": 6.688585417367744e-07, "logits/chosen": -1.47969388961792, "logits/rejected": -1.490703821182251, "logps/chosen": -2.10553240776062, "logps/rejected": -1.976574182510376, "loss": 4.7305, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.055326461791992, "rewards/margins": -1.2895841598510742, "rewards/rejected": -19.7657413482666, "step": 13390 }, { "epoch": 0.45148134416394214, "grad_norm": 23.69333839416504, "learning_rate": 6.685816590449708e-07, "logits/chosen": -1.6030505895614624, "logits/rejected": -1.6188846826553345, "logps/chosen": -2.3643722534179688, "logps/rejected": -2.304133415222168, "loss": 3.9704, "rewards/accuracies": 0.5, "rewards/chosen": -23.643722534179688, "rewards/margins": -0.6023872494697571, "rewards/rejected": -23.04133415222168, "step": 13395 }, { "epoch": 0.45164987023492537, "grad_norm": 23.56180191040039, "learning_rate": 6.683047180136968e-07, "logits/chosen": -0.9816482663154602, "logits/rejected": -1.2482496500015259, "logps/chosen": -1.8885109424591064, "logps/rejected": -2.057342052459717, "loss": 2.3919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.885107040405273, "rewards/margins": 1.6883150339126587, "rewards/rejected": -20.573421478271484, "step": 13400 }, { "epoch": 0.45181839630590853, "grad_norm": 42.30891418457031, "learning_rate": 6.680277187387908e-07, "logits/chosen": -1.793176293373108, "logits/rejected": -1.7645775079727173, "logps/chosen": -1.954549789428711, "logps/rejected": -2.162538528442383, "loss": 2.726, "rewards/accuracies": 0.5, "rewards/chosen": -19.54549789428711, "rewards/margins": 2.0798871517181396, "rewards/rejected": -21.625385284423828, "step": 13405 }, { "epoch": 0.4519869223768917, "grad_norm": 45.36151885986328, "learning_rate": 6.677506613161115e-07, "logits/chosen": -1.1814281940460205, "logits/rejected": -1.3925037384033203, "logps/chosen": -2.0957963466644287, "logps/rejected": -2.2337698936462402, "loss": 2.4478, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.957962036132812, "rewards/margins": 1.3797346353530884, "rewards/rejected": -22.33769989013672, "step": 13410 }, { "epoch": 0.45215544844787486, "grad_norm": 30.80636978149414, "learning_rate": 6.674735458415371e-07, "logits/chosen": -1.3277122974395752, "logits/rejected": -1.2999038696289062, "logps/chosen": -1.950740098953247, "logps/rejected": -1.9885116815567017, "loss": 3.0388, "rewards/accuracies": 0.5, "rewards/chosen": -19.507400512695312, "rewards/margins": 0.37771645188331604, "rewards/rejected": -19.885116577148438, "step": 13415 }, { "epoch": 0.4523239745188581, "grad_norm": 15.257139205932617, "learning_rate": 6.67196372410967e-07, "logits/chosen": -1.331177830696106, "logits/rejected": -1.614814043045044, "logps/chosen": -1.9961341619491577, "logps/rejected": -2.163208484649658, "loss": 2.151, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.96134376525879, "rewards/margins": 1.6707426309585571, "rewards/rejected": -21.6320858001709, "step": 13420 }, { "epoch": 0.45249250058984125, "grad_norm": 18.619733810424805, "learning_rate": 6.669191411203195e-07, "logits/chosen": -0.8345550298690796, "logits/rejected": -0.8545502424240112, "logps/chosen": -2.327765941619873, "logps/rejected": -2.5256428718566895, "loss": 2.3759, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.277658462524414, "rewards/margins": 1.978769302368164, "rewards/rejected": -25.256427764892578, "step": 13425 }, { "epoch": 0.4526610266608244, "grad_norm": 23.630504608154297, "learning_rate": 6.666418520655337e-07, "logits/chosen": -1.0486633777618408, "logits/rejected": -0.9874933362007141, "logps/chosen": -1.9027341604232788, "logps/rejected": -1.8841438293457031, "loss": 3.8279, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.027341842651367, "rewards/margins": -0.1859048306941986, "rewards/rejected": -18.8414363861084, "step": 13430 }, { "epoch": 0.4528295527318076, "grad_norm": 21.301984786987305, "learning_rate": 6.663645053425684e-07, "logits/chosen": -1.3079849481582642, "logits/rejected": -1.2970564365386963, "logps/chosen": -1.8315374851226807, "logps/rejected": -2.204904794692993, "loss": 3.1383, "rewards/accuracies": 0.5, "rewards/chosen": -18.315372467041016, "rewards/margins": 3.733673095703125, "rewards/rejected": -22.04904556274414, "step": 13435 }, { "epoch": 0.4529980788027908, "grad_norm": 100.8381118774414, "learning_rate": 6.660871010474022e-07, "logits/chosen": -1.1379636526107788, "logits/rejected": -0.9681974649429321, "logps/chosen": -2.4161911010742188, "logps/rejected": -2.1934218406677246, "loss": 5.781, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.16191291809082, "rewards/margins": -2.2276949882507324, "rewards/rejected": -21.934215545654297, "step": 13440 }, { "epoch": 0.45316660487377397, "grad_norm": 57.3202018737793, "learning_rate": 6.658096392760339e-07, "logits/chosen": -1.0857040882110596, "logits/rejected": -1.133437991142273, "logps/chosen": -1.7230842113494873, "logps/rejected": -1.8146679401397705, "loss": 2.8472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.230844497680664, "rewards/margins": 0.9158358573913574, "rewards/rejected": -18.146678924560547, "step": 13445 }, { "epoch": 0.45333513094475714, "grad_norm": 11.185892105102539, "learning_rate": 6.655321201244822e-07, "logits/chosen": -1.4162670373916626, "logits/rejected": -1.4772002696990967, "logps/chosen": -1.9110603332519531, "logps/rejected": -2.6553866863250732, "loss": 1.822, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.1106014251709, "rewards/margins": 7.443265438079834, "rewards/rejected": -26.553869247436523, "step": 13450 }, { "epoch": 0.45350365701574036, "grad_norm": 19.761932373046875, "learning_rate": 6.652545436887853e-07, "logits/chosen": -1.1321470737457275, "logits/rejected": -1.3900673389434814, "logps/chosen": -2.0201334953308105, "logps/rejected": -2.591609477996826, "loss": 2.73, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.201332092285156, "rewards/margins": 5.714763641357422, "rewards/rejected": -25.916095733642578, "step": 13455 }, { "epoch": 0.4536721830867235, "grad_norm": 26.253297805786133, "learning_rate": 6.649769100650016e-07, "logits/chosen": -1.4115238189697266, "logits/rejected": -1.5038487911224365, "logps/chosen": -2.2541861534118652, "logps/rejected": -2.6011769771575928, "loss": 1.8083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.54186248779297, "rewards/margins": 3.469909191131592, "rewards/rejected": -26.01177406311035, "step": 13460 }, { "epoch": 0.4538407091577067, "grad_norm": 66.70148468017578, "learning_rate": 6.646992193492091e-07, "logits/chosen": -1.5807290077209473, "logits/rejected": -1.4729264974594116, "logps/chosen": -1.8306732177734375, "logps/rejected": -1.915564775466919, "loss": 2.5989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.306732177734375, "rewards/margins": 0.8489131927490234, "rewards/rejected": -19.1556453704834, "step": 13465 }, { "epoch": 0.45400923522868986, "grad_norm": 21.561328887939453, "learning_rate": 6.644214716375058e-07, "logits/chosen": -1.6580655574798584, "logits/rejected": -1.7166268825531006, "logps/chosen": -1.9259936809539795, "logps/rejected": -2.1397814750671387, "loss": 3.2724, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.259937286376953, "rewards/margins": 2.137877941131592, "rewards/rejected": -21.397815704345703, "step": 13470 }, { "epoch": 0.4541777612996731, "grad_norm": 74.84004974365234, "learning_rate": 6.641436670260091e-07, "logits/chosen": -0.9843997955322266, "logits/rejected": -1.2904479503631592, "logps/chosen": -2.055634021759033, "logps/rejected": -2.116734504699707, "loss": 3.042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.55634117126465, "rewards/margins": 0.6110035181045532, "rewards/rejected": -21.167346954345703, "step": 13475 }, { "epoch": 0.45434628737065624, "grad_norm": 27.720678329467773, "learning_rate": 6.638658056108563e-07, "logits/chosen": -1.02791166305542, "logits/rejected": -1.3568519353866577, "logps/chosen": -1.9348390102386475, "logps/rejected": -1.9345744848251343, "loss": 3.5751, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.348390579223633, "rewards/margins": -0.0026446818374097347, "rewards/rejected": -19.345746994018555, "step": 13480 }, { "epoch": 0.4545148134416394, "grad_norm": 23.407005310058594, "learning_rate": 6.63587887488204e-07, "logits/chosen": -1.26626718044281, "logits/rejected": -1.209788203239441, "logps/chosen": -1.6449733972549438, "logps/rejected": -1.5845093727111816, "loss": 3.9153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.449735641479492, "rewards/margins": -0.6046417355537415, "rewards/rejected": -15.845094680786133, "step": 13485 }, { "epoch": 0.4546833395126226, "grad_norm": 23.646387100219727, "learning_rate": 6.633099127542292e-07, "logits/chosen": -1.390199899673462, "logits/rejected": -1.3530223369598389, "logps/chosen": -2.141261577606201, "logps/rejected": -2.1175923347473145, "loss": 3.3947, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.412616729736328, "rewards/margins": -0.23669414222240448, "rewards/rejected": -21.175920486450195, "step": 13490 }, { "epoch": 0.4548518655836058, "grad_norm": 20.303586959838867, "learning_rate": 6.630318815051276e-07, "logits/chosen": -1.4647496938705444, "logits/rejected": -1.537502408027649, "logps/chosen": -2.4159634113311768, "logps/rejected": -2.673832416534424, "loss": 1.8577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.159637451171875, "rewards/margins": 2.5786876678466797, "rewards/rejected": -26.738323211669922, "step": 13495 }, { "epoch": 0.45502039165458896, "grad_norm": 29.78251075744629, "learning_rate": 6.627537938371151e-07, "logits/chosen": -1.3127989768981934, "logits/rejected": -1.3036218881607056, "logps/chosen": -1.9458658695220947, "logps/rejected": -1.7856101989746094, "loss": 4.6394, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.458660125732422, "rewards/margins": -1.6025558710098267, "rewards/rejected": -17.856103897094727, "step": 13500 }, { "epoch": 0.45518891772557213, "grad_norm": 56.97768783569336, "learning_rate": 6.624756498464268e-07, "logits/chosen": -1.3639286756515503, "logits/rejected": -1.4781858921051025, "logps/chosen": -3.0983211994171143, "logps/rejected": -2.691847562789917, "loss": 7.4064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.98320960998535, "rewards/margins": -4.064734935760498, "rewards/rejected": -26.918476104736328, "step": 13505 }, { "epoch": 0.45535744379655535, "grad_norm": 23.73890495300293, "learning_rate": 6.621974496293173e-07, "logits/chosen": -1.0127825736999512, "logits/rejected": -1.3120650053024292, "logps/chosen": -2.0623297691345215, "logps/rejected": -2.40421986579895, "loss": 4.3155, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.6232967376709, "rewards/margins": 3.4188995361328125, "rewards/rejected": -24.04219627380371, "step": 13510 }, { "epoch": 0.4555259698675385, "grad_norm": 30.769384384155273, "learning_rate": 6.619191932820608e-07, "logits/chosen": -1.323246717453003, "logits/rejected": -1.534246802330017, "logps/chosen": -2.2586467266082764, "logps/rejected": -2.7533957958221436, "loss": 1.5861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.586467742919922, "rewards/margins": 4.9474921226501465, "rewards/rejected": -27.533960342407227, "step": 13515 }, { "epoch": 0.4556944959385217, "grad_norm": 14.726211547851562, "learning_rate": 6.616408809009508e-07, "logits/chosen": -1.2696958780288696, "logits/rejected": -1.288459062576294, "logps/chosen": -2.2325329780578613, "logps/rejected": -2.7220871448516846, "loss": 1.5197, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.325326919555664, "rewards/margins": 4.895545482635498, "rewards/rejected": -27.220874786376953, "step": 13520 }, { "epoch": 0.45586302200950485, "grad_norm": 18.62717056274414, "learning_rate": 6.613625125823006e-07, "logits/chosen": -0.8771476745605469, "logits/rejected": -1.0887469053268433, "logps/chosen": -2.093808650970459, "logps/rejected": -2.270981550216675, "loss": 1.9294, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.938087463378906, "rewards/margins": 1.7717288732528687, "rewards/rejected": -22.709814071655273, "step": 13525 }, { "epoch": 0.45603154808048807, "grad_norm": 32.54737091064453, "learning_rate": 6.610840884224419e-07, "logits/chosen": -1.5765092372894287, "logits/rejected": -1.587710976600647, "logps/chosen": -2.069793462753296, "logps/rejected": -2.1981959342956543, "loss": 2.4058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.697935104370117, "rewards/margins": 1.2840235233306885, "rewards/rejected": -21.981958389282227, "step": 13530 }, { "epoch": 0.45620007415147124, "grad_norm": 43.75876998901367, "learning_rate": 6.60805608517727e-07, "logits/chosen": -1.2509219646453857, "logits/rejected": -1.4516044855117798, "logps/chosen": -1.8669036626815796, "logps/rejected": -1.8626625537872314, "loss": 3.7962, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.669036865234375, "rewards/margins": -0.04241304472088814, "rewards/rejected": -18.626625061035156, "step": 13535 }, { "epoch": 0.4563686002224544, "grad_norm": 31.220243453979492, "learning_rate": 6.605270729645263e-07, "logits/chosen": -1.1909513473510742, "logits/rejected": -1.2448523044586182, "logps/chosen": -1.9961858987808228, "logps/rejected": -2.35213041305542, "loss": 2.2121, "rewards/accuracies": 0.5, "rewards/chosen": -19.96185874938965, "rewards/margins": 3.5594451427459717, "rewards/rejected": -23.521305084228516, "step": 13540 }, { "epoch": 0.45653712629343757, "grad_norm": 159.19329833984375, "learning_rate": 6.6024848185923e-07, "logits/chosen": -1.0990245342254639, "logits/rejected": -1.2833720445632935, "logps/chosen": -2.3984270095825195, "logps/rejected": -2.343677520751953, "loss": 4.4424, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.984272003173828, "rewards/margins": -0.5474956631660461, "rewards/rejected": -23.4367733001709, "step": 13545 }, { "epoch": 0.4567056523644208, "grad_norm": 24.201881408691406, "learning_rate": 6.599698352982479e-07, "logits/chosen": -1.5832691192626953, "logits/rejected": -1.9115978479385376, "logps/chosen": -1.7946374416351318, "logps/rejected": -1.907883882522583, "loss": 2.2535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.946374893188477, "rewards/margins": 1.132464051246643, "rewards/rejected": -19.078838348388672, "step": 13550 }, { "epoch": 0.45687417843540395, "grad_norm": 28.029586791992188, "learning_rate": 6.596911333780082e-07, "logits/chosen": -1.019972562789917, "logits/rejected": -0.9652736783027649, "logps/chosen": -2.3742733001708984, "logps/rejected": -2.439466953277588, "loss": 2.5481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.742733001708984, "rewards/margins": 0.6519354581832886, "rewards/rejected": -24.394670486450195, "step": 13555 }, { "epoch": 0.4570427045063871, "grad_norm": 20.071243286132812, "learning_rate": 6.594123761949589e-07, "logits/chosen": -1.3693435192108154, "logits/rejected": -1.8643760681152344, "logps/chosen": -1.987213373184204, "logps/rejected": -2.7671642303466797, "loss": 2.0481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.872133255004883, "rewards/margins": 7.799508571624756, "rewards/rejected": -27.671642303466797, "step": 13560 }, { "epoch": 0.45721123057737034, "grad_norm": 30.94683837890625, "learning_rate": 6.591335638455667e-07, "logits/chosen": -1.4474550485610962, "logits/rejected": -1.564068078994751, "logps/chosen": -1.9418039321899414, "logps/rejected": -1.8427324295043945, "loss": 4.245, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.418039321899414, "rewards/margins": -0.9907158613204956, "rewards/rejected": -18.427324295043945, "step": 13565 }, { "epoch": 0.4573797566483535, "grad_norm": 29.03971290588379, "learning_rate": 6.588546964263178e-07, "logits/chosen": -1.3514407873153687, "logits/rejected": -1.6398487091064453, "logps/chosen": -2.3361032009124756, "logps/rejected": -2.876328468322754, "loss": 1.5941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.361032485961914, "rewards/margins": 5.402250289916992, "rewards/rejected": -28.76328468322754, "step": 13570 }, { "epoch": 0.4575482827193367, "grad_norm": 24.678768157958984, "learning_rate": 6.58575774033717e-07, "logits/chosen": -1.069570779800415, "logits/rejected": -1.5706819295883179, "logps/chosen": -1.6676559448242188, "logps/rejected": -2.0189266204833984, "loss": 2.0779, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.676559448242188, "rewards/margins": 3.5127081871032715, "rewards/rejected": -20.189266204833984, "step": 13575 }, { "epoch": 0.45771680879031984, "grad_norm": 23.059268951416016, "learning_rate": 6.582967967642886e-07, "logits/chosen": -1.284484624862671, "logits/rejected": -1.2964885234832764, "logps/chosen": -1.8898823261260986, "logps/rejected": -2.063899278640747, "loss": 2.1698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.898822784423828, "rewards/margins": 1.7401721477508545, "rewards/rejected": -20.638994216918945, "step": 13580 }, { "epoch": 0.45788533486130306, "grad_norm": 4.454768180847168, "learning_rate": 6.580177647145753e-07, "logits/chosen": -0.8917932510375977, "logits/rejected": -0.643142819404602, "logps/chosen": -2.9636824131011963, "logps/rejected": -3.6665992736816406, "loss": 1.6265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.636821746826172, "rewards/margins": 7.029170036315918, "rewards/rejected": -36.665992736816406, "step": 13585 }, { "epoch": 0.4580538609322862, "grad_norm": 27.495288848876953, "learning_rate": 6.577386779811393e-07, "logits/chosen": -1.07036554813385, "logits/rejected": -0.9880205988883972, "logps/chosen": -1.5988483428955078, "logps/rejected": -1.5910999774932861, "loss": 3.2392, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.988485336303711, "rewards/margins": -0.07748489081859589, "rewards/rejected": -15.910998344421387, "step": 13590 }, { "epoch": 0.4582223870032694, "grad_norm": 30.054046630859375, "learning_rate": 6.574595366605616e-07, "logits/chosen": -0.9607011675834656, "logits/rejected": -0.9186193346977234, "logps/chosen": -2.318202257156372, "logps/rejected": -2.5237011909484863, "loss": 3.5981, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.182022094726562, "rewards/margins": 2.054990768432617, "rewards/rejected": -25.237010955810547, "step": 13595 }, { "epoch": 0.45839091307425256, "grad_norm": 30.519376754760742, "learning_rate": 6.571803408494419e-07, "logits/chosen": -1.5208982229232788, "logits/rejected": -1.7436546087265015, "logps/chosen": -2.0867679119110107, "logps/rejected": -2.053692579269409, "loss": 4.8533, "rewards/accuracies": 0.5, "rewards/chosen": -20.867679595947266, "rewards/margins": -0.33075445890426636, "rewards/rejected": -20.53692626953125, "step": 13600 }, { "epoch": 0.45839091307425256, "eval_logits/chosen": -1.6288355588912964, "eval_logits/rejected": -1.7436861991882324, "eval_logps/chosen": -1.9900078773498535, "eval_logps/rejected": -2.096331834793091, "eval_loss": 3.001215934753418, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -19.90007972717285, "eval_rewards/margins": 1.0632379055023193, "eval_rewards/rejected": -20.963319778442383, "eval_runtime": 12.9241, "eval_samples_per_second": 7.737, "eval_steps_per_second": 1.934, "step": 13600 }, { "epoch": 0.4585594391452358, "grad_norm": 22.9370174407959, "learning_rate": 6.569010906443989e-07, "logits/chosen": -1.0309830904006958, "logits/rejected": -1.232157826423645, "logps/chosen": -1.890729546546936, "logps/rejected": -1.792786955833435, "loss": 4.2338, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.90729522705078, "rewards/margins": -0.9794248342514038, "rewards/rejected": -17.927867889404297, "step": 13605 }, { "epoch": 0.45872796521621895, "grad_norm": 25.70821762084961, "learning_rate": 6.566217861420701e-07, "logits/chosen": -1.4500412940979004, "logits/rejected": -1.3498857021331787, "logps/chosen": -2.291757106781006, "logps/rejected": -2.2664847373962402, "loss": 3.3888, "rewards/accuracies": 0.5, "rewards/chosen": -22.917570114135742, "rewards/margins": -0.2527216076850891, "rewards/rejected": -22.66485023498535, "step": 13610 }, { "epoch": 0.4588964912872021, "grad_norm": 50.01831817626953, "learning_rate": 6.563424274391116e-07, "logits/chosen": -1.510155200958252, "logits/rejected": -1.5413382053375244, "logps/chosen": -2.076988697052002, "logps/rejected": -2.128782272338867, "loss": 4.0567, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.769887924194336, "rewards/margins": 0.5179346799850464, "rewards/rejected": -21.287822723388672, "step": 13615 }, { "epoch": 0.45906501735818533, "grad_norm": 24.66239356994629, "learning_rate": 6.560630146321989e-07, "logits/chosen": -1.018226146697998, "logits/rejected": -0.988991916179657, "logps/chosen": -2.174020528793335, "logps/rejected": -2.0794548988342285, "loss": 4.0338, "rewards/accuracies": 0.5, "rewards/chosen": -21.740205764770508, "rewards/margins": -0.9456550478935242, "rewards/rejected": -20.7945499420166, "step": 13620 }, { "epoch": 0.4592335434291685, "grad_norm": 19.06427764892578, "learning_rate": 6.557835478180251e-07, "logits/chosen": -0.6391313672065735, "logits/rejected": -0.686683714389801, "logps/chosen": -2.8946290016174316, "logps/rejected": -2.7074649333953857, "loss": 5.8085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.9462890625, "rewards/margins": -1.8716394901275635, "rewards/rejected": -27.07465171813965, "step": 13625 }, { "epoch": 0.45940206950015167, "grad_norm": 8.433749198913574, "learning_rate": 6.555040270933034e-07, "logits/chosen": -1.2226756811141968, "logits/rejected": -1.4951807260513306, "logps/chosen": -2.156207323074341, "logps/rejected": -2.4813437461853027, "loss": 2.6955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.56207275390625, "rewards/margins": 3.251366376876831, "rewards/rejected": -24.813438415527344, "step": 13630 }, { "epoch": 0.45957059557113483, "grad_norm": 24.13922691345215, "learning_rate": 6.552244525547643e-07, "logits/chosen": -1.204192876815796, "logits/rejected": -1.3513870239257812, "logps/chosen": -1.8572742938995361, "logps/rejected": -2.0948574542999268, "loss": 1.8049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.572742462158203, "rewards/margins": 2.3758318424224854, "rewards/rejected": -20.94857406616211, "step": 13635 }, { "epoch": 0.45973912164211805, "grad_norm": 30.350019454956055, "learning_rate": 6.549448242991579e-07, "logits/chosen": -1.0757592916488647, "logits/rejected": -1.455788016319275, "logps/chosen": -1.5065065622329712, "logps/rejected": -1.5797616243362427, "loss": 2.6693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -15.06506633758545, "rewards/margins": 0.7325505018234253, "rewards/rejected": -15.797616958618164, "step": 13640 }, { "epoch": 0.4599076477131012, "grad_norm": 88.70205688476562, "learning_rate": 6.546651424232523e-07, "logits/chosen": -1.1661994457244873, "logits/rejected": -1.6986852884292603, "logps/chosen": -2.092421293258667, "logps/rejected": -2.2523179054260254, "loss": 2.1306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.924211502075195, "rewards/margins": 1.5989638566970825, "rewards/rejected": -22.523174285888672, "step": 13645 }, { "epoch": 0.4600761737840844, "grad_norm": 17.726274490356445, "learning_rate": 6.543854070238344e-07, "logits/chosen": -0.8371933698654175, "logits/rejected": -1.2864792346954346, "logps/chosen": -2.6805765628814697, "logps/rejected": -2.8042044639587402, "loss": 4.4959, "rewards/accuracies": 0.5, "rewards/chosen": -26.80576515197754, "rewards/margins": 1.2362817525863647, "rewards/rejected": -28.04204750061035, "step": 13650 }, { "epoch": 0.46024469985506755, "grad_norm": 27.410917282104492, "learning_rate": 6.541056181977098e-07, "logits/chosen": -1.1484122276306152, "logits/rejected": -1.2108361721038818, "logps/chosen": -1.7693979740142822, "logps/rejected": -1.885839819908142, "loss": 2.7561, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.693981170654297, "rewards/margins": 1.164419174194336, "rewards/rejected": -18.858400344848633, "step": 13655 }, { "epoch": 0.4604132259260508, "grad_norm": 23.559326171875, "learning_rate": 6.538257760417022e-07, "logits/chosen": -1.1835523843765259, "logits/rejected": -1.2095699310302734, "logps/chosen": -2.0060606002807617, "logps/rejected": -2.050452709197998, "loss": 2.9333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.060604095458984, "rewards/margins": 0.4439212679862976, "rewards/rejected": -20.504526138305664, "step": 13660 }, { "epoch": 0.46058175199703394, "grad_norm": 35.010986328125, "learning_rate": 6.535458806526542e-07, "logits/chosen": -1.0575556755065918, "logits/rejected": -1.188498854637146, "logps/chosen": -2.679745674133301, "logps/rejected": -2.794583559036255, "loss": 2.235, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.797454833984375, "rewards/margins": 1.1483802795410156, "rewards/rejected": -27.945837020874023, "step": 13665 }, { "epoch": 0.4607502780680171, "grad_norm": 9.957199096679688, "learning_rate": 6.532659321274262e-07, "logits/chosen": -1.0374212265014648, "logits/rejected": -1.3127609491348267, "logps/chosen": -2.2107858657836914, "logps/rejected": -2.331003189086914, "loss": 2.5993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.107858657836914, "rewards/margins": 1.2021735906600952, "rewards/rejected": -23.31003189086914, "step": 13670 }, { "epoch": 0.4609188041390003, "grad_norm": 26.413589477539062, "learning_rate": 6.529859305628976e-07, "logits/chosen": -0.9869252443313599, "logits/rejected": -1.2702324390411377, "logps/chosen": -1.952105164527893, "logps/rejected": -2.1264002323150635, "loss": 2.5447, "rewards/accuracies": 0.5, "rewards/chosen": -19.52104949951172, "rewards/margins": 1.7429507970809937, "rewards/rejected": -21.264001846313477, "step": 13675 }, { "epoch": 0.4610873302099835, "grad_norm": 16.234878540039062, "learning_rate": 6.527058760559657e-07, "logits/chosen": -0.9233641624450684, "logits/rejected": -0.9307794570922852, "logps/chosen": -2.134063959121704, "logps/rejected": -2.294506072998047, "loss": 2.1801, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.340639114379883, "rewards/margins": 1.604425072669983, "rewards/rejected": -22.945064544677734, "step": 13680 }, { "epoch": 0.46125585628096666, "grad_norm": 39.736698150634766, "learning_rate": 6.524257687035465e-07, "logits/chosen": -1.769451379776001, "logits/rejected": -1.9414253234863281, "logps/chosen": -2.2306134700775146, "logps/rejected": -2.355893611907959, "loss": 2.6742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.306133270263672, "rewards/margins": 1.2528001070022583, "rewards/rejected": -23.558935165405273, "step": 13685 }, { "epoch": 0.4614243823519498, "grad_norm": 28.430448532104492, "learning_rate": 6.521456086025742e-07, "logits/chosen": -1.4746885299682617, "logits/rejected": -1.8396384716033936, "logps/chosen": -1.8950374126434326, "logps/rejected": -2.66398286819458, "loss": 1.7045, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.950374603271484, "rewards/margins": 7.689455986022949, "rewards/rejected": -26.63983154296875, "step": 13690 }, { "epoch": 0.46159290842293305, "grad_norm": 26.767501831054688, "learning_rate": 6.518653958500008e-07, "logits/chosen": -1.4325040578842163, "logits/rejected": -1.2674458026885986, "logps/chosen": -2.7604289054870605, "logps/rejected": -3.158684253692627, "loss": 4.1102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.60428810119629, "rewards/margins": 3.9825546741485596, "rewards/rejected": -31.586841583251953, "step": 13695 }, { "epoch": 0.4617614344939162, "grad_norm": 40.75784683227539, "learning_rate": 6.515851305427975e-07, "logits/chosen": -1.3811547756195068, "logits/rejected": -1.3409370183944702, "logps/chosen": -2.617219924926758, "logps/rejected": -2.4330475330352783, "loss": 5.5949, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.172199249267578, "rewards/margins": -1.8417232036590576, "rewards/rejected": -24.330474853515625, "step": 13700 }, { "epoch": 0.4619299605648994, "grad_norm": 32.274192810058594, "learning_rate": 6.513048127779526e-07, "logits/chosen": -1.4233167171478271, "logits/rejected": -1.477988839149475, "logps/chosen": -2.823249578475952, "logps/rejected": -2.5230438709259033, "loss": 6.2899, "rewards/accuracies": 0.5, "rewards/chosen": -28.232494354248047, "rewards/margins": -3.0020570755004883, "rewards/rejected": -25.230438232421875, "step": 13705 }, { "epoch": 0.46209848663588254, "grad_norm": 15.3927583694458, "learning_rate": 6.510244426524731e-07, "logits/chosen": -1.1456549167633057, "logits/rejected": -1.2661025524139404, "logps/chosen": -2.2368240356445312, "logps/rejected": -2.704552412033081, "loss": 1.7765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.368236541748047, "rewards/margins": 4.677285194396973, "rewards/rejected": -27.0455265045166, "step": 13710 }, { "epoch": 0.46226701270686577, "grad_norm": 10.991805076599121, "learning_rate": 6.507440202633841e-07, "logits/chosen": -1.2727103233337402, "logits/rejected": -1.5026662349700928, "logps/chosen": -2.2351553440093994, "logps/rejected": -2.759641170501709, "loss": 1.5506, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.351551055908203, "rewards/margins": 5.244860649108887, "rewards/rejected": -27.59641456604004, "step": 13715 }, { "epoch": 0.46243553877784893, "grad_norm": 226.2294921875, "learning_rate": 6.504635457077289e-07, "logits/chosen": -1.145394206047058, "logits/rejected": -1.1384984254837036, "logps/chosen": -2.7437894344329834, "logps/rejected": -2.415273427963257, "loss": 7.1034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.43789291381836, "rewards/margins": -3.2851600646972656, "rewards/rejected": -24.152732849121094, "step": 13720 }, { "epoch": 0.4626040648488321, "grad_norm": 31.03740119934082, "learning_rate": 6.501830190825685e-07, "logits/chosen": -0.7348178625106812, "logits/rejected": -0.8843439817428589, "logps/chosen": -2.0869638919830322, "logps/rejected": -2.4660804271698, "loss": 2.054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.869638442993164, "rewards/margins": 3.7911689281463623, "rewards/rejected": -24.66080665588379, "step": 13725 }, { "epoch": 0.4627725909198153, "grad_norm": 17.494319915771484, "learning_rate": 6.499024404849821e-07, "logits/chosen": -1.1662698984146118, "logits/rejected": -1.1467528343200684, "logps/chosen": -1.6923586130142212, "logps/rejected": -1.6379038095474243, "loss": 3.7571, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.923587799072266, "rewards/margins": -0.5445479154586792, "rewards/rejected": -16.379037857055664, "step": 13730 }, { "epoch": 0.4629411169907985, "grad_norm": 23.104352951049805, "learning_rate": 6.496218100120672e-07, "logits/chosen": -1.1973875761032104, "logits/rejected": -1.36476731300354, "logps/chosen": -1.6315116882324219, "logps/rejected": -1.6547889709472656, "loss": 3.2729, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.31511688232422, "rewards/margins": 0.23277311027050018, "rewards/rejected": -16.54789161682129, "step": 13735 }, { "epoch": 0.46310964306178165, "grad_norm": 27.43927764892578, "learning_rate": 6.493411277609385e-07, "logits/chosen": -1.2858362197875977, "logits/rejected": -1.3546708822250366, "logps/chosen": -1.8596910238265991, "logps/rejected": -2.0080785751342773, "loss": 2.8077, "rewards/accuracies": 0.5, "rewards/chosen": -18.596908569335938, "rewards/margins": 1.4838764667510986, "rewards/rejected": -20.08078384399414, "step": 13740 }, { "epoch": 0.4632781691327648, "grad_norm": 30.223655700683594, "learning_rate": 6.490603938287294e-07, "logits/chosen": -1.3623411655426025, "logits/rejected": -1.4095981121063232, "logps/chosen": -2.1852593421936035, "logps/rejected": -2.1864328384399414, "loss": 3.8224, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.85259437561035, "rewards/margins": 0.011735248379409313, "rewards/rejected": -21.864328384399414, "step": 13745 }, { "epoch": 0.46344669520374804, "grad_norm": 1.1034603118896484, "learning_rate": 6.487796083125907e-07, "logits/chosen": -1.5010812282562256, "logits/rejected": -1.6568992137908936, "logps/chosen": -1.8861267566680908, "logps/rejected": -2.314347267150879, "loss": 2.3827, "rewards/accuracies": 0.5, "rewards/chosen": -18.861268997192383, "rewards/margins": 4.282201290130615, "rewards/rejected": -23.14347267150879, "step": 13750 }, { "epoch": 0.4636152212747312, "grad_norm": 25.887685775756836, "learning_rate": 6.484987713096911e-07, "logits/chosen": -1.2538336515426636, "logits/rejected": -1.1933120489120483, "logps/chosen": -1.827857255935669, "logps/rejected": -2.159615993499756, "loss": 2.7857, "rewards/accuracies": 0.5, "rewards/chosen": -18.278573989868164, "rewards/margins": 3.317587375640869, "rewards/rejected": -21.596160888671875, "step": 13755 }, { "epoch": 0.46378374734571437, "grad_norm": 23.123455047607422, "learning_rate": 6.482178829172175e-07, "logits/chosen": -1.5872116088867188, "logits/rejected": -1.8486239910125732, "logps/chosen": -2.3375518321990967, "logps/rejected": -2.6852965354919434, "loss": 2.7182, "rewards/accuracies": 0.5, "rewards/chosen": -23.375520706176758, "rewards/margins": 3.4774460792541504, "rewards/rejected": -26.85296630859375, "step": 13760 }, { "epoch": 0.46395227341669754, "grad_norm": 30.978561401367188, "learning_rate": 6.479369432323742e-07, "logits/chosen": -0.8670709729194641, "logits/rejected": -1.0293810367584229, "logps/chosen": -1.9717384576797485, "logps/rejected": -2.28629994392395, "loss": 3.0605, "rewards/accuracies": 0.5, "rewards/chosen": -19.71738624572754, "rewards/margins": 3.145613193511963, "rewards/rejected": -22.86299705505371, "step": 13765 }, { "epoch": 0.46412079948768076, "grad_norm": 12.688581466674805, "learning_rate": 6.476559523523833e-07, "logits/chosen": -1.5742883682250977, "logits/rejected": -1.7252527475357056, "logps/chosen": -2.204754590988159, "logps/rejected": -2.2683627605438232, "loss": 2.7556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.04754638671875, "rewards/margins": 0.6360834836959839, "rewards/rejected": -22.683629989624023, "step": 13770 }, { "epoch": 0.4642893255586639, "grad_norm": 10.192408561706543, "learning_rate": 6.473749103744848e-07, "logits/chosen": -1.0795964002609253, "logits/rejected": -1.212799310684204, "logps/chosen": -1.7061240673065186, "logps/rejected": -1.9400373697280884, "loss": 1.8272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.061241149902344, "rewards/margins": 2.3391315937042236, "rewards/rejected": -19.400375366210938, "step": 13775 }, { "epoch": 0.4644578516296471, "grad_norm": 24.896669387817383, "learning_rate": 6.470938173959361e-07, "logits/chosen": -1.1159603595733643, "logits/rejected": -1.3001426458358765, "logps/chosen": -2.2676663398742676, "logps/rejected": -2.2217071056365967, "loss": 3.6825, "rewards/accuracies": 0.5, "rewards/chosen": -22.67666244506836, "rewards/margins": -0.45959025621414185, "rewards/rejected": -22.217071533203125, "step": 13780 }, { "epoch": 0.4646263777006303, "grad_norm": 28.015443801879883, "learning_rate": 6.468126735140122e-07, "logits/chosen": -1.4823081493377686, "logits/rejected": -1.6397498846054077, "logps/chosen": -2.4521548748016357, "logps/rejected": -2.624523162841797, "loss": 2.4646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.521547317504883, "rewards/margins": 1.7236862182617188, "rewards/rejected": -26.2452335357666, "step": 13785 }, { "epoch": 0.4647949037716135, "grad_norm": 102.11254119873047, "learning_rate": 6.465314788260065e-07, "logits/chosen": -0.9647348523139954, "logits/rejected": -1.1873705387115479, "logps/chosen": -2.272239923477173, "logps/rejected": -2.569958448410034, "loss": 2.1275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.722400665283203, "rewards/margins": 2.9771876335144043, "rewards/rejected": -25.6995849609375, "step": 13790 }, { "epoch": 0.46496342984259664, "grad_norm": 17.63508415222168, "learning_rate": 6.46250233429229e-07, "logits/chosen": -1.3895456790924072, "logits/rejected": -1.3987621068954468, "logps/chosen": -2.464350938796997, "logps/rejected": -2.446322202682495, "loss": 4.5296, "rewards/accuracies": 0.5, "rewards/chosen": -24.643508911132812, "rewards/margins": -0.18028488755226135, "rewards/rejected": -24.46322250366211, "step": 13795 }, { "epoch": 0.4651319559135798, "grad_norm": 14.113638877868652, "learning_rate": 6.459689374210078e-07, "logits/chosen": -1.4735864400863647, "logits/rejected": -1.517884373664856, "logps/chosen": -1.9817434549331665, "logps/rejected": -2.366046905517578, "loss": 2.2756, "rewards/accuracies": 0.5, "rewards/chosen": -19.817434310913086, "rewards/margins": 3.8430354595184326, "rewards/rejected": -23.66046714782715, "step": 13800 }, { "epoch": 0.46530048198456303, "grad_norm": 0.7048804759979248, "learning_rate": 6.456875908986882e-07, "logits/chosen": -1.4789502620697021, "logits/rejected": -1.5977550745010376, "logps/chosen": -1.8480441570281982, "logps/rejected": -2.0476698875427246, "loss": 2.3848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.48044204711914, "rewards/margins": 1.9962581396102905, "rewards/rejected": -20.476699829101562, "step": 13805 }, { "epoch": 0.4654690080555462, "grad_norm": 26.780628204345703, "learning_rate": 6.454061939596334e-07, "logits/chosen": -0.9053479433059692, "logits/rejected": -0.8510274887084961, "logps/chosen": -2.4856226444244385, "logps/rejected": -2.364593982696533, "loss": 4.3577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.85622215270996, "rewards/margins": -1.2102824449539185, "rewards/rejected": -23.64594268798828, "step": 13810 }, { "epoch": 0.46563753412652936, "grad_norm": 24.32168960571289, "learning_rate": 6.451247467012234e-07, "logits/chosen": -1.5808491706848145, "logits/rejected": -1.6128294467926025, "logps/chosen": -2.07789945602417, "logps/rejected": -2.0538277626037598, "loss": 3.4658, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.778995513916016, "rewards/margins": -0.24072055518627167, "rewards/rejected": -20.53827476501465, "step": 13815 }, { "epoch": 0.46580606019751253, "grad_norm": 61.70922088623047, "learning_rate": 6.448432492208566e-07, "logits/chosen": -1.8394935131072998, "logits/rejected": -1.8253707885742188, "logps/chosen": -2.7731966972351074, "logps/rejected": -2.8784608840942383, "loss": 3.0673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.73196792602539, "rewards/margins": 1.0526418685913086, "rewards/rejected": -28.784610748291016, "step": 13820 }, { "epoch": 0.46597458626849575, "grad_norm": 17.952144622802734, "learning_rate": 6.445617016159475e-07, "logits/chosen": -1.2500659227371216, "logits/rejected": -1.161217212677002, "logps/chosen": -1.7770369052886963, "logps/rejected": -1.9865745306015015, "loss": 3.5517, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.770368576049805, "rewards/margins": 2.095376491546631, "rewards/rejected": -19.865747451782227, "step": 13825 }, { "epoch": 0.4661431123394789, "grad_norm": 15.38546085357666, "learning_rate": 6.442801039839292e-07, "logits/chosen": -1.3103373050689697, "logits/rejected": -1.481055736541748, "logps/chosen": -1.623786211013794, "logps/rejected": -1.827819585800171, "loss": 2.5167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.23786163330078, "rewards/margins": 2.0403342247009277, "rewards/rejected": -18.278194427490234, "step": 13830 }, { "epoch": 0.4663116384104621, "grad_norm": 14.652449607849121, "learning_rate": 6.439984564222511e-07, "logits/chosen": -1.3200714588165283, "logits/rejected": -1.2847893238067627, "logps/chosen": -2.327244997024536, "logps/rejected": -2.3881309032440186, "loss": 3.8291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.27244758605957, "rewards/margins": 0.6088644862174988, "rewards/rejected": -23.881309509277344, "step": 13835 }, { "epoch": 0.4664801644814453, "grad_norm": 22.924610137939453, "learning_rate": 6.437167590283808e-07, "logits/chosen": -1.0864531993865967, "logits/rejected": -1.2700326442718506, "logps/chosen": -2.320392370223999, "logps/rejected": -2.349609375, "loss": 3.0595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.20392417907715, "rewards/margins": 0.29216688871383667, "rewards/rejected": -23.49609375, "step": 13840 }, { "epoch": 0.46664869055242847, "grad_norm": 33.14518737792969, "learning_rate": 6.434350118998024e-07, "logits/chosen": -1.2175791263580322, "logits/rejected": -1.4766108989715576, "logps/chosen": -1.7124464511871338, "logps/rejected": -2.053079605102539, "loss": 1.7015, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.124462127685547, "rewards/margins": 3.4063332080841064, "rewards/rejected": -20.530797958374023, "step": 13845 }, { "epoch": 0.46681721662341163, "grad_norm": 23.546630859375, "learning_rate": 6.431532151340172e-07, "logits/chosen": -1.2797232866287231, "logits/rejected": -1.1614303588867188, "logps/chosen": -2.258538007736206, "logps/rejected": -2.3078856468200684, "loss": 4.2846, "rewards/accuracies": 0.5, "rewards/chosen": -22.58538055419922, "rewards/margins": 0.4934772551059723, "rewards/rejected": -23.078855514526367, "step": 13850 }, { "epoch": 0.4669857426943948, "grad_norm": 25.121030807495117, "learning_rate": 6.428713688285446e-07, "logits/chosen": -1.067025899887085, "logits/rejected": -1.2792638540267944, "logps/chosen": -2.191856861114502, "logps/rejected": -2.5970585346221924, "loss": 0.9155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.918567657470703, "rewards/margins": 4.052019119262695, "rewards/rejected": -25.970584869384766, "step": 13855 }, { "epoch": 0.467154268765378, "grad_norm": 6.469688892364502, "learning_rate": 6.425894730809198e-07, "logits/chosen": -1.7441765069961548, "logits/rejected": -1.9495182037353516, "logps/chosen": -2.265683889389038, "logps/rejected": -2.710209608078003, "loss": 1.1884, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.65683937072754, "rewards/margins": 4.445258617401123, "rewards/rejected": -27.102096557617188, "step": 13860 }, { "epoch": 0.4673227948363612, "grad_norm": 21.148752212524414, "learning_rate": 6.423075279886963e-07, "logits/chosen": -1.4913420677185059, "logits/rejected": -1.7002441883087158, "logps/chosen": -1.9288151264190674, "logps/rejected": -2.1135830879211426, "loss": 2.7459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.288150787353516, "rewards/margins": 1.847680687904358, "rewards/rejected": -21.13582992553711, "step": 13865 }, { "epoch": 0.46749132090734435, "grad_norm": 18.34513282775879, "learning_rate": 6.420255336494441e-07, "logits/chosen": -1.4038597345352173, "logits/rejected": -1.4210649728775024, "logps/chosen": -2.7172553539276123, "logps/rejected": -3.0526812076568604, "loss": 2.7329, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.172550201416016, "rewards/margins": 3.354259490966797, "rewards/rejected": -30.526809692382812, "step": 13870 }, { "epoch": 0.4676598469783275, "grad_norm": 36.14348602294922, "learning_rate": 6.417434901607504e-07, "logits/chosen": -1.338536262512207, "logits/rejected": -1.4746836423873901, "logps/chosen": -2.4771456718444824, "logps/rejected": -2.722282886505127, "loss": 3.0973, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.77145767211914, "rewards/margins": 2.451371669769287, "rewards/rejected": -27.222829818725586, "step": 13875 }, { "epoch": 0.46782837304931074, "grad_norm": 72.72344207763672, "learning_rate": 6.414613976202192e-07, "logits/chosen": -0.7898836135864258, "logits/rejected": -0.6323626637458801, "logps/chosen": -1.9599090814590454, "logps/rejected": -1.9197161197662354, "loss": 3.5675, "rewards/accuracies": 0.5, "rewards/chosen": -19.599090576171875, "rewards/margins": -0.4019303321838379, "rewards/rejected": -19.197160720825195, "step": 13880 }, { "epoch": 0.4679968991202939, "grad_norm": 20.84559440612793, "learning_rate": 6.411792561254715e-07, "logits/chosen": -1.2231805324554443, "logits/rejected": -1.2022231817245483, "logps/chosen": -1.9406921863555908, "logps/rejected": -2.292463541030884, "loss": 2.1474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.40692138671875, "rewards/margins": 3.5177130699157715, "rewards/rejected": -22.92463493347168, "step": 13885 }, { "epoch": 0.4681654251912771, "grad_norm": 36.6888427734375, "learning_rate": 6.408970657741457e-07, "logits/chosen": -1.4471817016601562, "logits/rejected": -1.2688713073730469, "logps/chosen": -2.1352126598358154, "logps/rejected": -2.2517178058624268, "loss": 3.4562, "rewards/accuracies": 0.5, "rewards/chosen": -21.352127075195312, "rewards/margins": 1.1650497913360596, "rewards/rejected": -22.51717758178711, "step": 13890 }, { "epoch": 0.4683339512622603, "grad_norm": 33.751834869384766, "learning_rate": 6.406148266638963e-07, "logits/chosen": -1.157727599143982, "logits/rejected": -1.255642294883728, "logps/chosen": -2.18957781791687, "logps/rejected": -2.665783405303955, "loss": 1.6078, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.89577865600586, "rewards/margins": 4.7620530128479, "rewards/rejected": -26.657833099365234, "step": 13895 }, { "epoch": 0.46850247733324346, "grad_norm": 43.1579475402832, "learning_rate": 6.403325388923956e-07, "logits/chosen": -1.3292075395584106, "logits/rejected": -1.3559629917144775, "logps/chosen": -1.9967514276504517, "logps/rejected": -2.118342399597168, "loss": 2.8668, "rewards/accuracies": 0.5, "rewards/chosen": -19.967514038085938, "rewards/margins": 1.2159093618392944, "rewards/rejected": -21.183422088623047, "step": 13900 }, { "epoch": 0.4686710034042266, "grad_norm": 16.178476333618164, "learning_rate": 6.400502025573319e-07, "logits/chosen": -1.3434818983078003, "logits/rejected": -1.5653314590454102, "logps/chosen": -1.8827110528945923, "logps/rejected": -2.1971614360809326, "loss": 3.3364, "rewards/accuracies": 0.5, "rewards/chosen": -18.82710838317871, "rewards/margins": 3.144505023956299, "rewards/rejected": -21.971614837646484, "step": 13905 }, { "epoch": 0.4688395294752098, "grad_norm": 27.721033096313477, "learning_rate": 6.397678177564109e-07, "logits/chosen": -1.3376166820526123, "logits/rejected": -1.5822420120239258, "logps/chosen": -1.7184581756591797, "logps/rejected": -2.0001845359802246, "loss": 2.1544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.184581756591797, "rewards/margins": 2.817262649536133, "rewards/rejected": -20.001846313476562, "step": 13910 }, { "epoch": 0.469008055546193, "grad_norm": 50.78194808959961, "learning_rate": 6.39485384587355e-07, "logits/chosen": -1.2960230112075806, "logits/rejected": -1.3487869501113892, "logps/chosen": -1.9403388500213623, "logps/rejected": -1.9828157424926758, "loss": 3.7826, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.40338897705078, "rewards/margins": 0.424767404794693, "rewards/rejected": -19.828155517578125, "step": 13915 }, { "epoch": 0.4691765816171762, "grad_norm": 201.55856323242188, "learning_rate": 6.392029031479029e-07, "logits/chosen": -0.938056468963623, "logits/rejected": -1.2472608089447021, "logps/chosen": -2.160804510116577, "logps/rejected": -2.3859035968780518, "loss": 2.4459, "rewards/accuracies": 0.5, "rewards/chosen": -21.608047485351562, "rewards/margins": 2.2509894371032715, "rewards/rejected": -23.85903549194336, "step": 13920 }, { "epoch": 0.46934510768815935, "grad_norm": 39.64912796020508, "learning_rate": 6.389203735358103e-07, "logits/chosen": -1.1982097625732422, "logits/rejected": -1.1630780696868896, "logps/chosen": -2.0709540843963623, "logps/rejected": -1.9648492336273193, "loss": 4.1928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.709543228149414, "rewards/margins": -1.0610501766204834, "rewards/rejected": -19.64849281311035, "step": 13925 }, { "epoch": 0.4695136337591425, "grad_norm": 4.722727298736572, "learning_rate": 6.386377958488497e-07, "logits/chosen": -1.9029403924942017, "logits/rejected": -1.9499372243881226, "logps/chosen": -2.29158091545105, "logps/rejected": -2.7147128582000732, "loss": 1.8003, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.915807723999023, "rewards/margins": 4.231320381164551, "rewards/rejected": -27.14712905883789, "step": 13930 }, { "epoch": 0.46968215983012573, "grad_norm": 142.9805145263672, "learning_rate": 6.3835517018481e-07, "logits/chosen": -1.6389110088348389, "logits/rejected": -1.3937580585479736, "logps/chosen": -2.1385300159454346, "logps/rejected": -2.0082430839538574, "loss": 4.8887, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.385299682617188, "rewards/margins": -1.302868127822876, "rewards/rejected": -20.08243179321289, "step": 13935 }, { "epoch": 0.4698506859011089, "grad_norm": 29.812519073486328, "learning_rate": 6.38072496641497e-07, "logits/chosen": -1.1409599781036377, "logits/rejected": -1.3570410013198853, "logps/chosen": -2.3800430297851562, "logps/rejected": -2.7600884437561035, "loss": 2.6882, "rewards/accuracies": 0.5, "rewards/chosen": -23.800432205200195, "rewards/margins": 3.8004555702209473, "rewards/rejected": -27.60088539123535, "step": 13940 }, { "epoch": 0.47001921197209207, "grad_norm": 35.49007034301758, "learning_rate": 6.377897753167328e-07, "logits/chosen": -1.118445873260498, "logits/rejected": -1.2374904155731201, "logps/chosen": -1.9628002643585205, "logps/rejected": -2.1348910331726074, "loss": 2.7892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.62800407409668, "rewards/margins": 1.7209068536758423, "rewards/rejected": -21.34891128540039, "step": 13945 }, { "epoch": 0.4701877380430753, "grad_norm": 27.707048416137695, "learning_rate": 6.375070063083558e-07, "logits/chosen": -1.0523077249526978, "logits/rejected": -1.3618173599243164, "logps/chosen": -2.0080909729003906, "logps/rejected": -2.720878839492798, "loss": 1.3578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.080907821655273, "rewards/margins": 7.127877712249756, "rewards/rejected": -27.208789825439453, "step": 13950 }, { "epoch": 0.47035626411405845, "grad_norm": 26.838281631469727, "learning_rate": 6.372241897142217e-07, "logits/chosen": -0.7201957702636719, "logits/rejected": -0.920723557472229, "logps/chosen": -1.9195568561553955, "logps/rejected": -2.176978826522827, "loss": 2.7029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.195566177368164, "rewards/margins": 2.5742220878601074, "rewards/rejected": -21.76978874206543, "step": 13955 }, { "epoch": 0.4705247901850416, "grad_norm": 69.2751693725586, "learning_rate": 6.36941325632202e-07, "logits/chosen": -1.1831648349761963, "logits/rejected": -1.8111673593521118, "logps/chosen": -2.3680479526519775, "logps/rejected": -2.7056612968444824, "loss": 3.0904, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.68048095703125, "rewards/margins": 3.3761305809020996, "rewards/rejected": -27.056610107421875, "step": 13960 }, { "epoch": 0.4706933162560248, "grad_norm": 9.624326705932617, "learning_rate": 6.366584141601845e-07, "logits/chosen": -1.4053133726119995, "logits/rejected": -1.5715116262435913, "logps/chosen": -3.049325466156006, "logps/rejected": -3.178790807723999, "loss": 4.7181, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.493255615234375, "rewards/margins": 1.2946542501449585, "rewards/rejected": -31.787912368774414, "step": 13965 }, { "epoch": 0.470861842327008, "grad_norm": 24.864315032958984, "learning_rate": 6.363754553960743e-07, "logits/chosen": -1.1395964622497559, "logits/rejected": -1.1456211805343628, "logps/chosen": -1.9944721460342407, "logps/rejected": -2.139186382293701, "loss": 2.812, "rewards/accuracies": 0.5, "rewards/chosen": -19.944721221923828, "rewards/margins": 1.4471421241760254, "rewards/rejected": -21.391860961914062, "step": 13970 }, { "epoch": 0.4710303683979912, "grad_norm": 25.200536727905273, "learning_rate": 6.360924494377918e-07, "logits/chosen": -1.249730110168457, "logits/rejected": -1.3911710977554321, "logps/chosen": -2.2585055828094482, "logps/rejected": -2.606250286102295, "loss": 2.649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.58505630493164, "rewards/margins": 3.4774513244628906, "rewards/rejected": -26.062503814697266, "step": 13975 }, { "epoch": 0.47119889446897434, "grad_norm": 22.978961944580078, "learning_rate": 6.358093963832745e-07, "logits/chosen": -1.1088197231292725, "logits/rejected": -1.2002352476119995, "logps/chosen": -1.988402009010315, "logps/rejected": -2.3186020851135254, "loss": 1.9841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.88401985168457, "rewards/margins": 3.301999568939209, "rewards/rejected": -23.186019897460938, "step": 13980 }, { "epoch": 0.4713674205399575, "grad_norm": 29.248794555664062, "learning_rate": 6.355262963304756e-07, "logits/chosen": -2.046112298965454, "logits/rejected": -2.1253397464752197, "logps/chosen": -1.8194358348846436, "logps/rejected": -1.847980260848999, "loss": 2.9246, "rewards/accuracies": 0.5, "rewards/chosen": -18.194358825683594, "rewards/margins": 0.28544360399246216, "rewards/rejected": -18.47980308532715, "step": 13985 }, { "epoch": 0.4715359466109407, "grad_norm": 4.779592037200928, "learning_rate": 6.352431493773651e-07, "logits/chosen": -1.2561156749725342, "logits/rejected": -1.378087043762207, "logps/chosen": -2.207831859588623, "logps/rejected": -2.593492031097412, "loss": 1.459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.078319549560547, "rewards/margins": 3.8566012382507324, "rewards/rejected": -25.934921264648438, "step": 13990 }, { "epoch": 0.4717044726819239, "grad_norm": 70.30220031738281, "learning_rate": 6.349599556219291e-07, "logits/chosen": -1.090355634689331, "logits/rejected": -1.3683557510375977, "logps/chosen": -2.2222657203674316, "logps/rejected": -2.777714252471924, "loss": 2.9842, "rewards/accuracies": 0.5, "rewards/chosen": -22.222658157348633, "rewards/margins": 5.554482460021973, "rewards/rejected": -27.77713966369629, "step": 13995 }, { "epoch": 0.47187299875290706, "grad_norm": 18.473787307739258, "learning_rate": 6.346767151621696e-07, "logits/chosen": -1.13655686378479, "logits/rejected": -0.9696399569511414, "logps/chosen": -1.8745672702789307, "logps/rejected": -1.9711214303970337, "loss": 2.9945, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.74567222595215, "rewards/margins": 0.9655420184135437, "rewards/rejected": -19.71121597290039, "step": 14000 }, { "epoch": 0.47187299875290706, "eval_logits/chosen": -1.6666821241378784, "eval_logits/rejected": -1.7839144468307495, "eval_logps/chosen": -1.9983142614364624, "eval_logps/rejected": -2.1036059856414795, "eval_loss": 3.0070505142211914, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -19.983142852783203, "eval_rewards/margins": 1.052917242050171, "eval_rewards/rejected": -21.036060333251953, "eval_runtime": 12.8911, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 14000 }, { "epoch": 0.4720415248238903, "grad_norm": 80.3492660522461, "learning_rate": 6.343934280961051e-07, "logits/chosen": -1.4856373071670532, "logits/rejected": -1.2735168933868408, "logps/chosen": -2.5143871307373047, "logps/rejected": -2.415238857269287, "loss": 4.5798, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.143877029418945, "rewards/margins": -0.9914867281913757, "rewards/rejected": -24.152387619018555, "step": 14005 }, { "epoch": 0.47221005089487345, "grad_norm": 19.168996810913086, "learning_rate": 6.341100945217699e-07, "logits/chosen": -1.0347862243652344, "logits/rejected": -1.228040337562561, "logps/chosen": -1.7857773303985596, "logps/rejected": -2.0644729137420654, "loss": 2.1059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.85777473449707, "rewards/margins": 2.786953926086426, "rewards/rejected": -20.64472770690918, "step": 14010 }, { "epoch": 0.4723785769658566, "grad_norm": 38.48196029663086, "learning_rate": 6.338267145372147e-07, "logits/chosen": -1.4567896127700806, "logits/rejected": -1.55691659450531, "logps/chosen": -2.0581021308898926, "logps/rejected": -2.0358104705810547, "loss": 3.5322, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.58102035522461, "rewards/margins": -0.22291651368141174, "rewards/rejected": -20.358102798461914, "step": 14015 }, { "epoch": 0.4725471030368398, "grad_norm": 19.195449829101562, "learning_rate": 6.335432882405062e-07, "logits/chosen": -1.2669869661331177, "logits/rejected": -1.4322224855422974, "logps/chosen": -1.9909021854400635, "logps/rejected": -2.0929489135742188, "loss": 2.9498, "rewards/accuracies": 0.5, "rewards/chosen": -19.909021377563477, "rewards/margins": 1.0204694271087646, "rewards/rejected": -20.92949104309082, "step": 14020 }, { "epoch": 0.472715629107823, "grad_norm": 27.397371292114258, "learning_rate": 6.332598157297271e-07, "logits/chosen": -1.4022011756896973, "logits/rejected": -1.1833802461624146, "logps/chosen": -1.933884620666504, "logps/rejected": -1.8614232540130615, "loss": 3.8703, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.33884620666504, "rewards/margins": -0.724612832069397, "rewards/rejected": -18.61423110961914, "step": 14025 }, { "epoch": 0.47288415517880616, "grad_norm": 30.178300857543945, "learning_rate": 6.329762971029759e-07, "logits/chosen": -1.2592933177947998, "logits/rejected": -1.3360066413879395, "logps/chosen": -2.270819902420044, "logps/rejected": -2.3751158714294434, "loss": 2.8132, "rewards/accuracies": 0.5, "rewards/chosen": -22.70819664001465, "rewards/margins": 1.0429630279541016, "rewards/rejected": -23.751161575317383, "step": 14030 }, { "epoch": 0.47305268124978933, "grad_norm": 23.464101791381836, "learning_rate": 6.326927324583674e-07, "logits/chosen": -1.0942169427871704, "logits/rejected": -1.1212034225463867, "logps/chosen": -1.8277006149291992, "logps/rejected": -1.815222978591919, "loss": 3.3575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.277008056640625, "rewards/margins": -0.12477798759937286, "rewards/rejected": -18.15222930908203, "step": 14035 }, { "epoch": 0.4732212073207725, "grad_norm": 34.82871627807617, "learning_rate": 6.324091218940322e-07, "logits/chosen": -1.5310542583465576, "logits/rejected": -1.5248639583587646, "logps/chosen": -2.289816379547119, "logps/rejected": -2.739499807357788, "loss": 4.0646, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.898162841796875, "rewards/margins": 4.4968366622924805, "rewards/rejected": -27.39499855041504, "step": 14040 }, { "epoch": 0.4733897333917557, "grad_norm": 25.91857147216797, "learning_rate": 6.321254655081165e-07, "logits/chosen": -1.0183018445968628, "logits/rejected": -1.0823280811309814, "logps/chosen": -1.8455663919448853, "logps/rejected": -1.8559458255767822, "loss": 3.2614, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.455663681030273, "rewards/margins": 0.10379400104284286, "rewards/rejected": -18.559459686279297, "step": 14045 }, { "epoch": 0.4735582594627389, "grad_norm": 33.44871139526367, "learning_rate": 6.318417633987826e-07, "logits/chosen": -1.2877857685089111, "logits/rejected": -1.4010345935821533, "logps/chosen": -2.4368534088134766, "logps/rejected": -2.478266477584839, "loss": 2.9642, "rewards/accuracies": 0.5, "rewards/chosen": -24.368532180786133, "rewards/margins": 0.4141322076320648, "rewards/rejected": -24.782665252685547, "step": 14050 }, { "epoch": 0.47372678553372205, "grad_norm": 3.0363457202911377, "learning_rate": 6.31558015664209e-07, "logits/chosen": -1.3036028146743774, "logits/rejected": -1.3670432567596436, "logps/chosen": -2.2221055030822754, "logps/rejected": -2.45412015914917, "loss": 2.2587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.221054077148438, "rewards/margins": 2.320145845413208, "rewards/rejected": -24.541200637817383, "step": 14055 }, { "epoch": 0.47389531160470527, "grad_norm": 19.182397842407227, "learning_rate": 6.312742224025891e-07, "logits/chosen": -1.494376301765442, "logits/rejected": -1.308542013168335, "logps/chosen": -1.7802484035491943, "logps/rejected": -1.8587958812713623, "loss": 2.7654, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.802486419677734, "rewards/margins": 0.7854740023612976, "rewards/rejected": -18.58795738220215, "step": 14060 }, { "epoch": 0.47406383767568844, "grad_norm": 24.555452346801758, "learning_rate": 6.30990383712133e-07, "logits/chosen": -1.552181363105774, "logits/rejected": -1.574568510055542, "logps/chosen": -1.7801599502563477, "logps/rejected": -1.9626197814941406, "loss": 2.7652, "rewards/accuracies": 0.5, "rewards/chosen": -17.801597595214844, "rewards/margins": 1.8245998620986938, "rewards/rejected": -19.626197814941406, "step": 14065 }, { "epoch": 0.4742323637466716, "grad_norm": 42.724327087402344, "learning_rate": 6.307064996910658e-07, "logits/chosen": -1.3822429180145264, "logits/rejected": -1.4136865139007568, "logps/chosen": -1.8683557510375977, "logps/rejected": -1.9732444286346436, "loss": 3.3348, "rewards/accuracies": 0.5, "rewards/chosen": -18.683557510375977, "rewards/margins": 1.0488868951797485, "rewards/rejected": -19.732444763183594, "step": 14070 }, { "epoch": 0.47440088981765477, "grad_norm": 26.224445343017578, "learning_rate": 6.304225704376288e-07, "logits/chosen": -1.298749566078186, "logits/rejected": -1.3914250135421753, "logps/chosen": -1.7438873052597046, "logps/rejected": -1.8251692056655884, "loss": 2.9275, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.438873291015625, "rewards/margins": 0.8128176927566528, "rewards/rejected": -18.251689910888672, "step": 14075 }, { "epoch": 0.474569415888638, "grad_norm": 24.768003463745117, "learning_rate": 6.301385960500784e-07, "logits/chosen": -1.6690937280654907, "logits/rejected": -1.5592644214630127, "logps/chosen": -2.0394389629364014, "logps/rejected": -2.316621780395508, "loss": 1.6017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.394390106201172, "rewards/margins": 2.771827220916748, "rewards/rejected": -23.166217803955078, "step": 14080 }, { "epoch": 0.47473794195962116, "grad_norm": 31.17376708984375, "learning_rate": 6.298545766266874e-07, "logits/chosen": -1.3782024383544922, "logits/rejected": -1.5816433429718018, "logps/chosen": -1.900072455406189, "logps/rejected": -2.0293757915496826, "loss": 2.3152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.00072479248047, "rewards/margins": 1.2930349111557007, "rewards/rejected": -20.293758392333984, "step": 14085 }, { "epoch": 0.4749064680306043, "grad_norm": 18.251209259033203, "learning_rate": 6.295705122657435e-07, "logits/chosen": -1.1897153854370117, "logits/rejected": -1.1693168878555298, "logps/chosen": -2.068413257598877, "logps/rejected": -2.2642273902893066, "loss": 1.9694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.684133529663086, "rewards/margins": 1.9581406116485596, "rewards/rejected": -22.64227294921875, "step": 14090 }, { "epoch": 0.4750749941015875, "grad_norm": 69.73340606689453, "learning_rate": 6.2928640306555e-07, "logits/chosen": -1.4802566766738892, "logits/rejected": -1.614418387413025, "logps/chosen": -2.283409357070923, "logps/rejected": -2.8050522804260254, "loss": 3.6148, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.834091186523438, "rewards/margins": 5.216431617736816, "rewards/rejected": -28.050525665283203, "step": 14095 }, { "epoch": 0.4752435201725707, "grad_norm": 27.110261917114258, "learning_rate": 6.290022491244262e-07, "logits/chosen": -0.9759872555732727, "logits/rejected": -1.0511130094528198, "logps/chosen": -1.9783554077148438, "logps/rejected": -2.054938316345215, "loss": 2.7153, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.783552169799805, "rewards/margins": 0.7658289670944214, "rewards/rejected": -20.549381256103516, "step": 14100 }, { "epoch": 0.4754120462435539, "grad_norm": 23.795833587646484, "learning_rate": 6.287180505407065e-07, "logits/chosen": -1.2092092037200928, "logits/rejected": -1.0659422874450684, "logps/chosen": -1.832767128944397, "logps/rejected": -1.7705835103988647, "loss": 3.7276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.32767105102539, "rewards/margins": -0.6218371391296387, "rewards/rejected": -17.705833435058594, "step": 14105 }, { "epoch": 0.47558057231453704, "grad_norm": 14.359508514404297, "learning_rate": 6.284338074127407e-07, "logits/chosen": -1.1778762340545654, "logits/rejected": -1.3249976634979248, "logps/chosen": -1.9273595809936523, "logps/rejected": -2.1541523933410645, "loss": 2.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.27359390258789, "rewards/margins": 2.2679269313812256, "rewards/rejected": -21.541522979736328, "step": 14110 }, { "epoch": 0.47574909838552026, "grad_norm": 13.926602363586426, "learning_rate": 6.281495198388944e-07, "logits/chosen": -1.1781768798828125, "logits/rejected": -1.0687205791473389, "logps/chosen": -1.4076114892959595, "logps/rejected": -1.5143462419509888, "loss": 2.439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -14.0761137008667, "rewards/margins": 1.0673487186431885, "rewards/rejected": -15.143463134765625, "step": 14115 }, { "epoch": 0.47591762445650343, "grad_norm": 30.59756851196289, "learning_rate": 6.278651879175481e-07, "logits/chosen": -1.2486302852630615, "logits/rejected": -1.612501859664917, "logps/chosen": -2.2371091842651367, "logps/rejected": -2.649040937423706, "loss": 1.8951, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.371091842651367, "rewards/margins": 4.119317054748535, "rewards/rejected": -26.49041175842285, "step": 14120 }, { "epoch": 0.4760861505274866, "grad_norm": 27.569778442382812, "learning_rate": 6.275808117470979e-07, "logits/chosen": -1.5844205617904663, "logits/rejected": -1.7609293460845947, "logps/chosen": -2.280905246734619, "logps/rejected": -2.6194040775299072, "loss": 2.4388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.809053421020508, "rewards/margins": 3.384986400604248, "rewards/rejected": -26.194040298461914, "step": 14125 }, { "epoch": 0.47625467659846976, "grad_norm": 31.418270111083984, "learning_rate": 6.272963914259551e-07, "logits/chosen": -1.1242899894714355, "logits/rejected": -1.2510685920715332, "logps/chosen": -2.14827036857605, "logps/rejected": -2.277985095977783, "loss": 3.4789, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.482702255249023, "rewards/margins": 1.2971477508544922, "rewards/rejected": -22.77985191345215, "step": 14130 }, { "epoch": 0.476423202669453, "grad_norm": 51.32257080078125, "learning_rate": 6.270119270525468e-07, "logits/chosen": -0.9274293184280396, "logits/rejected": -1.0708013772964478, "logps/chosen": -2.1544365882873535, "logps/rejected": -2.3122477531433105, "loss": 2.7029, "rewards/accuracies": 0.5, "rewards/chosen": -21.544368743896484, "rewards/margins": 1.5781086683273315, "rewards/rejected": -23.12247657775879, "step": 14135 }, { "epoch": 0.47659172874043615, "grad_norm": 24.99764633178711, "learning_rate": 6.267274187253144e-07, "logits/chosen": -1.298872947692871, "logits/rejected": -1.4449822902679443, "logps/chosen": -2.611276388168335, "logps/rejected": -2.9252381324768066, "loss": 3.4442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.11276626586914, "rewards/margins": 3.1396145820617676, "rewards/rejected": -29.25238037109375, "step": 14140 }, { "epoch": 0.4767602548114193, "grad_norm": 47.8037109375, "learning_rate": 6.264428665427153e-07, "logits/chosen": -1.2006722688674927, "logits/rejected": -1.2854722738265991, "logps/chosen": -1.9553956985473633, "logps/rejected": -2.218827724456787, "loss": 2.0508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.553955078125, "rewards/margins": 2.63432240486145, "rewards/rejected": -22.188278198242188, "step": 14145 }, { "epoch": 0.4769287808824025, "grad_norm": 27.07879066467285, "learning_rate": 6.261582706032218e-07, "logits/chosen": -1.2765188217163086, "logits/rejected": -1.3474172353744507, "logps/chosen": -1.7571680545806885, "logps/rejected": -1.7844899892807007, "loss": 3.4131, "rewards/accuracies": 0.5, "rewards/chosen": -17.571680068969727, "rewards/margins": 0.27321872115135193, "rewards/rejected": -17.844898223876953, "step": 14150 }, { "epoch": 0.4770973069533857, "grad_norm": 10.174779891967773, "learning_rate": 6.258736310053212e-07, "logits/chosen": -1.4351608753204346, "logits/rejected": -1.4350874423980713, "logps/chosen": -2.893874406814575, "logps/rejected": -3.1628687381744385, "loss": 2.306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.93874168395996, "rewards/margins": 2.6899428367614746, "rewards/rejected": -31.628686904907227, "step": 14155 }, { "epoch": 0.47726583302436887, "grad_norm": 14.811348915100098, "learning_rate": 6.255889478475161e-07, "logits/chosen": -1.382568597793579, "logits/rejected": -1.4855680465698242, "logps/chosen": -1.9862836599349976, "logps/rejected": -2.301663637161255, "loss": 1.8944, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.862834930419922, "rewards/margins": 3.153799533843994, "rewards/rejected": -23.01663589477539, "step": 14160 }, { "epoch": 0.47743435909535203, "grad_norm": 23.847272872924805, "learning_rate": 6.253042212283241e-07, "logits/chosen": -1.6445223093032837, "logits/rejected": -1.3189189434051514, "logps/chosen": -1.8566372394561768, "logps/rejected": -1.814192771911621, "loss": 3.7629, "rewards/accuracies": 0.5, "rewards/chosen": -18.566370010375977, "rewards/margins": -0.42444291710853577, "rewards/rejected": -18.14192771911621, "step": 14165 }, { "epoch": 0.47760288516633526, "grad_norm": 39.18086624145508, "learning_rate": 6.250194512462782e-07, "logits/chosen": -1.2912828922271729, "logits/rejected": -1.3584439754486084, "logps/chosen": -1.8556301593780518, "logps/rejected": -1.8538516759872437, "loss": 3.1592, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.55630111694336, "rewards/margins": -0.01778392866253853, "rewards/rejected": -18.538516998291016, "step": 14170 }, { "epoch": 0.4777714112373184, "grad_norm": 25.342227935791016, "learning_rate": 6.247346379999257e-07, "logits/chosen": -1.5513664484024048, "logits/rejected": -1.6413652896881104, "logps/chosen": -2.6058261394500732, "logps/rejected": -2.8640480041503906, "loss": 2.4241, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.058263778686523, "rewards/margins": 2.582216262817383, "rewards/rejected": -28.640478134155273, "step": 14175 }, { "epoch": 0.4779399373083016, "grad_norm": 10.93128490447998, "learning_rate": 6.244497815878292e-07, "logits/chosen": -0.7427780032157898, "logits/rejected": -1.1368509531021118, "logps/chosen": -2.306380033493042, "logps/rejected": -3.289423704147339, "loss": 1.7394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.063800811767578, "rewards/margins": 9.830431938171387, "rewards/rejected": -32.89423751831055, "step": 14180 }, { "epoch": 0.47810846337928475, "grad_norm": 27.874006271362305, "learning_rate": 6.241648821085665e-07, "logits/chosen": -0.9375996589660645, "logits/rejected": -1.060762643814087, "logps/chosen": -2.2712631225585938, "logps/rejected": -2.4879183769226074, "loss": 1.9797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.712631225585938, "rewards/margins": 2.1665539741516113, "rewards/rejected": -24.879186630249023, "step": 14185 }, { "epoch": 0.478276989450268, "grad_norm": 35.346622467041016, "learning_rate": 6.238799396607299e-07, "logits/chosen": -1.0190104246139526, "logits/rejected": -1.0845071077346802, "logps/chosen": -2.458282709121704, "logps/rejected": -2.4083683490753174, "loss": 3.8692, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.58282470703125, "rewards/margins": -0.4991399645805359, "rewards/rejected": -24.08368492126465, "step": 14190 }, { "epoch": 0.47844551552125114, "grad_norm": 30.713102340698242, "learning_rate": 6.235949543429271e-07, "logits/chosen": -1.664607286453247, "logits/rejected": -1.994667649269104, "logps/chosen": -2.1017775535583496, "logps/rejected": -2.7052159309387207, "loss": 3.0572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.01777458190918, "rewards/margins": 6.0343828201293945, "rewards/rejected": -27.052160263061523, "step": 14195 }, { "epoch": 0.4786140415922343, "grad_norm": 16.882923126220703, "learning_rate": 6.233099262537798e-07, "logits/chosen": -1.3837839365005493, "logits/rejected": -1.538823127746582, "logps/chosen": -2.1491425037384033, "logps/rejected": -2.6276285648345947, "loss": 2.3587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.491424560546875, "rewards/margins": 4.784861087799072, "rewards/rejected": -26.27628517150879, "step": 14200 }, { "epoch": 0.4787825676632175, "grad_norm": 15.137643814086914, "learning_rate": 6.230248554919254e-07, "logits/chosen": -1.432544469833374, "logits/rejected": -1.466481328010559, "logps/chosen": -2.5068650245666504, "logps/rejected": -2.5227551460266113, "loss": 3.0185, "rewards/accuracies": 0.5, "rewards/chosen": -25.068649291992188, "rewards/margins": 0.1589018851518631, "rewards/rejected": -25.227550506591797, "step": 14205 }, { "epoch": 0.4789510937342007, "grad_norm": 21.842287063598633, "learning_rate": 6.227397421560156e-07, "logits/chosen": -1.2101644277572632, "logits/rejected": -1.2983782291412354, "logps/chosen": -2.416313648223877, "logps/rejected": -2.7157235145568848, "loss": 2.7991, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.163137435913086, "rewards/margins": 2.9940972328186035, "rewards/rejected": -27.1572322845459, "step": 14210 }, { "epoch": 0.47911961980518386, "grad_norm": 25.819618225097656, "learning_rate": 6.224545863447164e-07, "logits/chosen": -1.0523632764816284, "logits/rejected": -1.081443190574646, "logps/chosen": -1.8073928356170654, "logps/rejected": -1.915116548538208, "loss": 2.2054, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.073930740356445, "rewards/margins": 1.0772359371185303, "rewards/rejected": -19.151165008544922, "step": 14215 }, { "epoch": 0.479288145876167, "grad_norm": 33.45602035522461, "learning_rate": 6.221693881567097e-07, "logits/chosen": -1.0425139665603638, "logits/rejected": -1.362992286682129, "logps/chosen": -1.824812650680542, "logps/rejected": -2.031705141067505, "loss": 3.2246, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.248126983642578, "rewards/margins": 2.0689239501953125, "rewards/rejected": -20.31705093383789, "step": 14220 }, { "epoch": 0.47945667194715025, "grad_norm": 25.50204086303711, "learning_rate": 6.21884147690691e-07, "logits/chosen": -1.3271222114562988, "logits/rejected": -1.2059987783432007, "logps/chosen": -1.9234594106674194, "logps/rejected": -2.1173043251037598, "loss": 1.9135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.234594345092773, "rewards/margins": 1.9384514093399048, "rewards/rejected": -21.173046112060547, "step": 14225 }, { "epoch": 0.4796251980181334, "grad_norm": 0.04183439910411835, "learning_rate": 6.215988650453707e-07, "logits/chosen": -1.230825424194336, "logits/rejected": -1.699907660484314, "logps/chosen": -2.3121089935302734, "logps/rejected": -2.669900894165039, "loss": 2.2012, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.121089935302734, "rewards/margins": 3.577920436859131, "rewards/rejected": -26.699010848999023, "step": 14230 }, { "epoch": 0.4797937240891166, "grad_norm": 103.14031982421875, "learning_rate": 6.21313540319474e-07, "logits/chosen": -1.5439348220825195, "logits/rejected": -2.149160385131836, "logps/chosen": -2.681786298751831, "logps/rejected": -3.1404526233673096, "loss": 3.5701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.817861557006836, "rewards/margins": 4.586663246154785, "rewards/rejected": -31.404525756835938, "step": 14235 }, { "epoch": 0.47996225016009975, "grad_norm": 24.254440307617188, "learning_rate": 6.210281736117407e-07, "logits/chosen": -0.8741234540939331, "logits/rejected": -0.8805392384529114, "logps/chosen": -2.045274019241333, "logps/rejected": -1.999871850013733, "loss": 3.8074, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.452739715576172, "rewards/margins": -0.4540228843688965, "rewards/rejected": -19.99871826171875, "step": 14240 }, { "epoch": 0.48013077623108297, "grad_norm": 37.69783401489258, "learning_rate": 6.20742765020925e-07, "logits/chosen": -1.3421859741210938, "logits/rejected": -1.4220188856124878, "logps/chosen": -1.9036476612091064, "logps/rejected": -1.9306730031967163, "loss": 3.0629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.036474227905273, "rewards/margins": 0.2702566981315613, "rewards/rejected": -19.306730270385742, "step": 14245 }, { "epoch": 0.48029930230206613, "grad_norm": 20.065317153930664, "learning_rate": 6.20457314645795e-07, "logits/chosen": -1.2782478332519531, "logits/rejected": -1.2565407752990723, "logps/chosen": -2.7419238090515137, "logps/rejected": -2.7357048988342285, "loss": 5.0031, "rewards/accuracies": 0.5, "rewards/chosen": -27.419235229492188, "rewards/margins": -0.062186289578676224, "rewards/rejected": -27.3570499420166, "step": 14250 }, { "epoch": 0.4804678283730493, "grad_norm": 48.01775360107422, "learning_rate": 6.201718225851345e-07, "logits/chosen": -1.6276464462280273, "logits/rejected": -1.634319543838501, "logps/chosen": -3.138836145401001, "logps/rejected": -3.149662494659424, "loss": 5.2109, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.38836097717285, "rewards/margins": 0.10826186835765839, "rewards/rejected": -31.496623992919922, "step": 14255 }, { "epoch": 0.48063635444403247, "grad_norm": 61.33388137817383, "learning_rate": 6.198862889377407e-07, "logits/chosen": -0.9420528411865234, "logits/rejected": -1.111307144165039, "logps/chosen": -2.386169910430908, "logps/rejected": -2.489887237548828, "loss": 2.4275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.861698150634766, "rewards/margins": 1.0371739864349365, "rewards/rejected": -24.89887237548828, "step": 14260 }, { "epoch": 0.4808048805150157, "grad_norm": 16.47574806213379, "learning_rate": 6.196007138024257e-07, "logits/chosen": -1.4349069595336914, "logits/rejected": -1.9009329080581665, "logps/chosen": -2.3457863330841064, "logps/rejected": -2.69038462638855, "loss": 2.016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.457860946655273, "rewards/margins": 3.4459869861602783, "rewards/rejected": -26.90384864807129, "step": 14265 }, { "epoch": 0.48097340658599885, "grad_norm": 29.380556106567383, "learning_rate": 6.193150972780156e-07, "logits/chosen": -0.8126411437988281, "logits/rejected": -1.0067518949508667, "logps/chosen": -2.386798620223999, "logps/rejected": -2.546583890914917, "loss": 2.976, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.867984771728516, "rewards/margins": 1.5978561639785767, "rewards/rejected": -25.465839385986328, "step": 14270 }, { "epoch": 0.481141932656982, "grad_norm": 22.0316162109375, "learning_rate": 6.190294394633513e-07, "logits/chosen": -1.327775001525879, "logits/rejected": -1.6017240285873413, "logps/chosen": -2.261784315109253, "logps/rejected": -2.3681483268737793, "loss": 3.1997, "rewards/accuracies": 0.5, "rewards/chosen": -22.617843627929688, "rewards/margins": 1.0636417865753174, "rewards/rejected": -23.68148422241211, "step": 14275 }, { "epoch": 0.48131045872796524, "grad_norm": 16.934232711791992, "learning_rate": 6.187437404572875e-07, "logits/chosen": -1.2591432332992554, "logits/rejected": -1.6536871194839478, "logps/chosen": -1.532456636428833, "logps/rejected": -1.7124006748199463, "loss": 2.4245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.324564933776855, "rewards/margins": 1.7994403839111328, "rewards/rejected": -17.124004364013672, "step": 14280 }, { "epoch": 0.4814789847989484, "grad_norm": 25.954748153686523, "learning_rate": 6.184580003586934e-07, "logits/chosen": -0.8407789468765259, "logits/rejected": -0.9403706789016724, "logps/chosen": -2.3137755393981934, "logps/rejected": -2.3520450592041016, "loss": 2.9743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.137752532958984, "rewards/margins": 0.3826959729194641, "rewards/rejected": -23.520448684692383, "step": 14285 }, { "epoch": 0.48164751086993157, "grad_norm": 33.69615936279297, "learning_rate": 6.181722192664525e-07, "logits/chosen": -1.4487159252166748, "logits/rejected": -1.4866324663162231, "logps/chosen": -2.4114739894866943, "logps/rejected": -2.2642366886138916, "loss": 4.5777, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.11473846435547, "rewards/margins": -1.4723711013793945, "rewards/rejected": -22.64236831665039, "step": 14290 }, { "epoch": 0.48181603694091474, "grad_norm": 14.018428802490234, "learning_rate": 6.178863972794623e-07, "logits/chosen": -1.5550386905670166, "logits/rejected": -1.626044511795044, "logps/chosen": -2.4448163509368896, "logps/rejected": -2.5209197998046875, "loss": 3.7098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.448165893554688, "rewards/margins": 0.7610336542129517, "rewards/rejected": -25.209197998046875, "step": 14295 }, { "epoch": 0.48198456301189796, "grad_norm": 23.618711471557617, "learning_rate": 6.176005344966344e-07, "logits/chosen": -1.7064844369888306, "logits/rejected": -1.9256782531738281, "logps/chosen": -2.080040454864502, "logps/rejected": -2.499323606491089, "loss": 2.6969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.800403594970703, "rewards/margins": 4.1928300857543945, "rewards/rejected": -24.993236541748047, "step": 14300 }, { "epoch": 0.4821530890828811, "grad_norm": 67.77555847167969, "learning_rate": 6.17314631016895e-07, "logits/chosen": -1.3775417804718018, "logits/rejected": -1.1704440116882324, "logps/chosen": -2.4363372325897217, "logps/rejected": -2.379361629486084, "loss": 3.9586, "rewards/accuracies": 0.5, "rewards/chosen": -24.363372802734375, "rewards/margins": -0.5697550773620605, "rewards/rejected": -23.79361915588379, "step": 14305 }, { "epoch": 0.4823216151538643, "grad_norm": 25.39704704284668, "learning_rate": 6.170286869391836e-07, "logits/chosen": -1.0143911838531494, "logits/rejected": -1.1997044086456299, "logps/chosen": -1.80712890625, "logps/rejected": -1.854230523109436, "loss": 2.8305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.0712890625, "rewards/margins": 0.47101593017578125, "rewards/rejected": -18.542306900024414, "step": 14310 }, { "epoch": 0.48249014122484746, "grad_norm": 21.75940704345703, "learning_rate": 6.167427023624547e-07, "logits/chosen": -1.05575692653656, "logits/rejected": -1.2112061977386475, "logps/chosen": -1.9335291385650635, "logps/rejected": -1.9409252405166626, "loss": 3.2197, "rewards/accuracies": 0.5, "rewards/chosen": -19.335290908813477, "rewards/margins": 0.07396335899829865, "rewards/rejected": -19.409252166748047, "step": 14315 }, { "epoch": 0.4826586672958307, "grad_norm": 22.265239715576172, "learning_rate": 6.164566773856757e-07, "logits/chosen": -1.438683271408081, "logits/rejected": -1.3932020664215088, "logps/chosen": -1.8244634866714478, "logps/rejected": -2.039867877960205, "loss": 1.8846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.244632720947266, "rewards/margins": 2.1540427207946777, "rewards/rejected": -20.398677825927734, "step": 14320 }, { "epoch": 0.48282719336681384, "grad_norm": 29.78438949584961, "learning_rate": 6.16170612107829e-07, "logits/chosen": -1.921547532081604, "logits/rejected": -1.909881591796875, "logps/chosen": -1.7852840423583984, "logps/rejected": -2.009737014770508, "loss": 2.2081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.852840423583984, "rewards/margins": 2.2445316314697266, "rewards/rejected": -20.097370147705078, "step": 14325 }, { "epoch": 0.482995719437797, "grad_norm": 17.730634689331055, "learning_rate": 6.158845066279103e-07, "logits/chosen": -1.072278618812561, "logits/rejected": -1.5491522550582886, "logps/chosen": -2.598126173019409, "logps/rejected": -2.719301700592041, "loss": 3.4963, "rewards/accuracies": 0.5, "rewards/chosen": -25.98126220703125, "rewards/margins": 1.2117526531219482, "rewards/rejected": -27.19301414489746, "step": 14330 }, { "epoch": 0.48316424550878023, "grad_norm": 29.073835372924805, "learning_rate": 6.155983610449298e-07, "logits/chosen": -1.2550567388534546, "logits/rejected": -1.3643629550933838, "logps/chosen": -1.7392864227294922, "logps/rejected": -1.9145896434783936, "loss": 2.6637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.392864227294922, "rewards/margins": 1.7530326843261719, "rewards/rejected": -19.145896911621094, "step": 14335 }, { "epoch": 0.4833327715797634, "grad_norm": 33.49995803833008, "learning_rate": 6.153121754579107e-07, "logits/chosen": -1.3938661813735962, "logits/rejected": -1.3946878910064697, "logps/chosen": -2.068878650665283, "logps/rejected": -2.1564369201660156, "loss": 2.9194, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.68878746032715, "rewards/margins": 0.8755823373794556, "rewards/rejected": -21.564369201660156, "step": 14340 }, { "epoch": 0.48350129765074656, "grad_norm": 25.166501998901367, "learning_rate": 6.150259499658909e-07, "logits/chosen": -1.1227186918258667, "logits/rejected": -1.2852933406829834, "logps/chosen": -2.0173442363739014, "logps/rejected": -2.4354665279388428, "loss": 2.1515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.173442840576172, "rewards/margins": 4.181223392486572, "rewards/rejected": -24.354665756225586, "step": 14345 }, { "epoch": 0.48366982372172973, "grad_norm": 26.04254722595215, "learning_rate": 6.147396846679216e-07, "logits/chosen": -1.2595160007476807, "logits/rejected": -1.3415896892547607, "logps/chosen": -2.605316638946533, "logps/rejected": -2.745382785797119, "loss": 3.355, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.05316734313965, "rewards/margins": 1.400661587715149, "rewards/rejected": -27.453826904296875, "step": 14350 }, { "epoch": 0.48383834979271295, "grad_norm": 30.954938888549805, "learning_rate": 6.144533796630678e-07, "logits/chosen": -1.0422312021255493, "logits/rejected": -1.0594663619995117, "logps/chosen": -1.8631160259246826, "logps/rejected": -1.7447509765625, "loss": 4.2904, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.631160736083984, "rewards/margins": -1.1836521625518799, "rewards/rejected": -17.447509765625, "step": 14355 }, { "epoch": 0.4840068758636961, "grad_norm": 29.922924041748047, "learning_rate": 6.141670350504089e-07, "logits/chosen": -1.3065602779388428, "logits/rejected": -1.418766736984253, "logps/chosen": -1.9556224346160889, "logps/rejected": -2.0475106239318848, "loss": 3.0816, "rewards/accuracies": 0.5, "rewards/chosen": -19.556224822998047, "rewards/margins": 0.9188838005065918, "rewards/rejected": -20.475109100341797, "step": 14360 }, { "epoch": 0.4841754019346793, "grad_norm": 34.842384338378906, "learning_rate": 6.13880650929037e-07, "logits/chosen": -1.3274986743927002, "logits/rejected": -1.19040048122406, "logps/chosen": -1.783129334449768, "logps/rejected": -1.8191875219345093, "loss": 2.8422, "rewards/accuracies": 0.5, "rewards/chosen": -17.8312931060791, "rewards/margins": 0.3605828285217285, "rewards/rejected": -18.191875457763672, "step": 14365 }, { "epoch": 0.48434392800566245, "grad_norm": 19.06740379333496, "learning_rate": 6.135942273980586e-07, "logits/chosen": -1.246483325958252, "logits/rejected": -1.3938615322113037, "logps/chosen": -2.066582202911377, "logps/rejected": -2.1912307739257812, "loss": 2.4974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.665822982788086, "rewards/margins": 1.2464841604232788, "rewards/rejected": -21.912309646606445, "step": 14370 }, { "epoch": 0.48451245407664567, "grad_norm": 15.381619453430176, "learning_rate": 6.133077645565935e-07, "logits/chosen": -1.049902319908142, "logits/rejected": -1.299940824508667, "logps/chosen": -1.895345687866211, "logps/rejected": -2.2535789012908936, "loss": 1.1043, "rewards/accuracies": 1.0, "rewards/chosen": -18.95345687866211, "rewards/margins": 3.5823326110839844, "rewards/rejected": -22.53578758239746, "step": 14375 }, { "epoch": 0.48468098014762884, "grad_norm": 15.546119689941406, "learning_rate": 6.130212625037752e-07, "logits/chosen": -1.4986063241958618, "logits/rejected": -1.5750732421875, "logps/chosen": -2.206378936767578, "logps/rejected": -2.759787082672119, "loss": 1.7383, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.063793182373047, "rewards/margins": 5.534076690673828, "rewards/rejected": -27.59786605834961, "step": 14380 }, { "epoch": 0.484849506218612, "grad_norm": 27.19158172607422, "learning_rate": 6.12734721338751e-07, "logits/chosen": -1.4182652235031128, "logits/rejected": -1.5037392377853394, "logps/chosen": -1.734135389328003, "logps/rejected": -1.8829662799835205, "loss": 1.9696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.341354370117188, "rewards/margins": 1.4883079528808594, "rewards/rejected": -18.829662322998047, "step": 14385 }, { "epoch": 0.4850180322895952, "grad_norm": 30.321714401245117, "learning_rate": 6.12448141160681e-07, "logits/chosen": -1.1180613040924072, "logits/rejected": -1.4290263652801514, "logps/chosen": -2.056044101715088, "logps/rejected": -2.676231622695923, "loss": 1.2935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.560440063476562, "rewards/margins": 6.20187520980835, "rewards/rejected": -26.762313842773438, "step": 14390 }, { "epoch": 0.4851865583605784, "grad_norm": 159.37966918945312, "learning_rate": 6.121615220687398e-07, "logits/chosen": -1.4194531440734863, "logits/rejected": -1.2666727304458618, "logps/chosen": -2.57710337638855, "logps/rejected": -2.4502930641174316, "loss": 4.6584, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.77103614807129, "rewards/margins": -1.2681033611297607, "rewards/rejected": -24.5029296875, "step": 14395 }, { "epoch": 0.48535508443156156, "grad_norm": 42.41606140136719, "learning_rate": 6.118748641621148e-07, "logits/chosen": -1.2904958724975586, "logits/rejected": -1.2220988273620605, "logps/chosen": -2.0724587440490723, "logps/rejected": -2.123455047607422, "loss": 2.9377, "rewards/accuracies": 0.5, "rewards/chosen": -20.72458839416504, "rewards/margins": 0.5099626779556274, "rewards/rejected": -21.234548568725586, "step": 14400 }, { "epoch": 0.48535508443156156, "eval_logits/chosen": -1.7178480625152588, "eval_logits/rejected": -1.8386316299438477, "eval_logps/chosen": -2.0116519927978516, "eval_logps/rejected": -2.1217188835144043, "eval_loss": 2.994631290435791, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -20.11652183532715, "eval_rewards/margins": 1.1006675958633423, "eval_rewards/rejected": -21.217187881469727, "eval_runtime": 12.8896, "eval_samples_per_second": 7.758, "eval_steps_per_second": 1.94, "step": 14400 }, { "epoch": 0.4855236105025447, "grad_norm": 15.567980766296387, "learning_rate": 6.11588167540007e-07, "logits/chosen": -1.2109471559524536, "logits/rejected": -1.536413550376892, "logps/chosen": -1.9555637836456299, "logps/rejected": -2.1613593101501465, "loss": 2.1833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.55563735961914, "rewards/margins": 2.0579581260681152, "rewards/rejected": -21.613595962524414, "step": 14405 }, { "epoch": 0.48569213657352794, "grad_norm": 11.709920883178711, "learning_rate": 6.113014323016307e-07, "logits/chosen": -1.006519079208374, "logits/rejected": -1.320786476135254, "logps/chosen": -1.6655772924423218, "logps/rejected": -1.8181434869766235, "loss": 2.2558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.655773162841797, "rewards/margins": 1.5256626605987549, "rewards/rejected": -18.181434631347656, "step": 14410 }, { "epoch": 0.4858606626445111, "grad_norm": 28.14756202697754, "learning_rate": 6.11014658546214e-07, "logits/chosen": -1.0801985263824463, "logits/rejected": -1.355345368385315, "logps/chosen": -1.7694141864776611, "logps/rejected": -2.0815796852111816, "loss": 1.9312, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.694141387939453, "rewards/margins": 3.1216535568237305, "rewards/rejected": -20.815793991088867, "step": 14415 }, { "epoch": 0.4860291887154943, "grad_norm": 57.387203216552734, "learning_rate": 6.107278463729977e-07, "logits/chosen": -1.1415436267852783, "logits/rejected": -1.7277675867080688, "logps/chosen": -2.552786350250244, "logps/rejected": -2.63765287399292, "loss": 3.79, "rewards/accuracies": 0.5, "rewards/chosen": -25.52786636352539, "rewards/margins": 0.8486614227294922, "rewards/rejected": -26.376529693603516, "step": 14420 }, { "epoch": 0.48619771478647744, "grad_norm": 38.32411193847656, "learning_rate": 6.104409958812362e-07, "logits/chosen": -1.50121009349823, "logits/rejected": -1.1826785802841187, "logps/chosen": -1.7358640432357788, "logps/rejected": -1.7259807586669922, "loss": 3.3679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.358638763427734, "rewards/margins": -0.09883232414722443, "rewards/rejected": -17.259807586669922, "step": 14425 }, { "epoch": 0.48636624085746066, "grad_norm": 16.106178283691406, "learning_rate": 6.101541071701974e-07, "logits/chosen": -1.0993359088897705, "logits/rejected": -1.3681131601333618, "logps/chosen": -1.9167248010635376, "logps/rejected": -2.063552141189575, "loss": 2.2598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.16724967956543, "rewards/margins": 1.468271255493164, "rewards/rejected": -20.63551902770996, "step": 14430 }, { "epoch": 0.48653476692844383, "grad_norm": 33.390281677246094, "learning_rate": 6.098671803391618e-07, "logits/chosen": -1.3295332193374634, "logits/rejected": -1.732404351234436, "logps/chosen": -2.3224639892578125, "logps/rejected": -2.6696527004241943, "loss": 2.3211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.22464370727539, "rewards/margins": 3.4718856811523438, "rewards/rejected": -26.6965274810791, "step": 14435 }, { "epoch": 0.486703292999427, "grad_norm": 38.27173614501953, "learning_rate": 6.095802154874238e-07, "logits/chosen": -0.702488124370575, "logits/rejected": -0.9602154493331909, "logps/chosen": -3.133786678314209, "logps/rejected": -2.500781297683716, "loss": 9.6033, "rewards/accuracies": 0.5, "rewards/chosen": -31.337865829467773, "rewards/margins": -6.33005428314209, "rewards/rejected": -25.007810592651367, "step": 14440 }, { "epoch": 0.4868718190704102, "grad_norm": 30.617448806762695, "learning_rate": 6.092932127142904e-07, "logits/chosen": -1.3489675521850586, "logits/rejected": -1.422295331954956, "logps/chosen": -1.9906165599822998, "logps/rejected": -2.0119526386260986, "loss": 3.6732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.90616798400879, "rewards/margins": 0.21335992217063904, "rewards/rejected": -20.119525909423828, "step": 14445 }, { "epoch": 0.4870403451413934, "grad_norm": 44.26568603515625, "learning_rate": 6.09006172119082e-07, "logits/chosen": -1.2701125144958496, "logits/rejected": -1.8389867544174194, "logps/chosen": -2.1882669925689697, "logps/rejected": -2.1582720279693604, "loss": 3.402, "rewards/accuracies": 0.5, "rewards/chosen": -21.882671356201172, "rewards/margins": -0.2999493479728699, "rewards/rejected": -21.582721710205078, "step": 14450 }, { "epoch": 0.48720887121237655, "grad_norm": 34.02650451660156, "learning_rate": 6.087190938011322e-07, "logits/chosen": -1.2723720073699951, "logits/rejected": -1.2750638723373413, "logps/chosen": -2.176562786102295, "logps/rejected": -2.5528616905212402, "loss": 2.337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.765628814697266, "rewards/margins": 3.762988328933716, "rewards/rejected": -25.52861785888672, "step": 14455 }, { "epoch": 0.4873773972833597, "grad_norm": 40.86668014526367, "learning_rate": 6.084319778597875e-07, "logits/chosen": -0.954400897026062, "logits/rejected": -1.1919571161270142, "logps/chosen": -2.130120277404785, "logps/rejected": -2.39589262008667, "loss": 1.5986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.30120277404785, "rewards/margins": 2.6577227115631104, "rewards/rejected": -23.958925247192383, "step": 14460 }, { "epoch": 0.48754592335434294, "grad_norm": 76.33245086669922, "learning_rate": 6.081448243944073e-07, "logits/chosen": -1.9409711360931396, "logits/rejected": -1.8011581897735596, "logps/chosen": -2.597461700439453, "logps/rejected": -2.948514223098755, "loss": 1.7805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.9746150970459, "rewards/margins": 3.510526180267334, "rewards/rejected": -29.485143661499023, "step": 14465 }, { "epoch": 0.4877144494253261, "grad_norm": 27.40862464904785, "learning_rate": 6.07857633504364e-07, "logits/chosen": -0.9310008883476257, "logits/rejected": -0.8623281717300415, "logps/chosen": -2.396544933319092, "logps/rejected": -2.29371976852417, "loss": 4.1105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.9654483795166, "rewards/margins": -1.0282517671585083, "rewards/rejected": -22.937198638916016, "step": 14470 }, { "epoch": 0.48788297549630927, "grad_norm": 17.24898910522461, "learning_rate": 6.075704052890432e-07, "logits/chosen": -1.9165115356445312, "logits/rejected": -2.039600372314453, "logps/chosen": -2.1480746269226074, "logps/rejected": -2.7809290885925293, "loss": 2.4188, "rewards/accuracies": 0.5, "rewards/chosen": -21.48074722290039, "rewards/margins": 6.328543663024902, "rewards/rejected": -27.809289932250977, "step": 14475 }, { "epoch": 0.48805150156729243, "grad_norm": 48.96101760864258, "learning_rate": 6.072831398478433e-07, "logits/chosen": -1.634894609451294, "logits/rejected": -1.6797151565551758, "logps/chosen": -2.297348976135254, "logps/rejected": -2.287741184234619, "loss": 3.8043, "rewards/accuracies": 0.5, "rewards/chosen": -22.97348976135254, "rewards/margins": -0.0960756316781044, "rewards/rejected": -22.87741470336914, "step": 14480 }, { "epoch": 0.48822002763827566, "grad_norm": 9.349481582641602, "learning_rate": 6.069958372801753e-07, "logits/chosen": -1.4112704992294312, "logits/rejected": -1.444657564163208, "logps/chosen": -2.316025495529175, "logps/rejected": -2.368542432785034, "loss": 2.9264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.160253524780273, "rewards/margins": 0.5251716375350952, "rewards/rejected": -23.6854248046875, "step": 14485 }, { "epoch": 0.4883885537092588, "grad_norm": 17.727920532226562, "learning_rate": 6.067084976854637e-07, "logits/chosen": -1.3877828121185303, "logits/rejected": -1.4814679622650146, "logps/chosen": -2.2088212966918945, "logps/rejected": -2.6402947902679443, "loss": 3.2358, "rewards/accuracies": 0.5, "rewards/chosen": -22.088214874267578, "rewards/margins": 4.314736366271973, "rewards/rejected": -26.4029483795166, "step": 14490 }, { "epoch": 0.488557079780242, "grad_norm": 42.89591598510742, "learning_rate": 6.064211211631451e-07, "logits/chosen": -1.392857313156128, "logits/rejected": -1.4012411832809448, "logps/chosen": -2.6497719287872314, "logps/rejected": -2.85886287689209, "loss": 2.3042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.497716903686523, "rewards/margins": 2.090907335281372, "rewards/rejected": -28.588626861572266, "step": 14495 }, { "epoch": 0.4887256058512252, "grad_norm": 45.4229850769043, "learning_rate": 6.061337078126693e-07, "logits/chosen": -1.9174840450286865, "logits/rejected": -1.8754726648330688, "logps/chosen": -2.0455451011657715, "logps/rejected": -2.170490264892578, "loss": 3.5905, "rewards/accuracies": 0.5, "rewards/chosen": -20.45545196533203, "rewards/margins": 1.2494512796401978, "rewards/rejected": -21.70490264892578, "step": 14500 }, { "epoch": 0.4888941319222084, "grad_norm": 44.48363494873047, "learning_rate": 6.058462577334987e-07, "logits/chosen": -1.233435034751892, "logits/rejected": -1.3299095630645752, "logps/chosen": -1.6596050262451172, "logps/rejected": -1.8056104183197021, "loss": 2.2678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.596050262451172, "rewards/margins": 1.4600555896759033, "rewards/rejected": -18.05610466003418, "step": 14505 }, { "epoch": 0.48906265799319154, "grad_norm": 36.73933029174805, "learning_rate": 6.055587710251086e-07, "logits/chosen": -1.4670766592025757, "logits/rejected": -1.645054817199707, "logps/chosen": -2.200580358505249, "logps/rejected": -2.275089979171753, "loss": 2.583, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.00580406188965, "rewards/margins": 0.745094895362854, "rewards/rejected": -22.750900268554688, "step": 14510 }, { "epoch": 0.4892311840641747, "grad_norm": 14.582422256469727, "learning_rate": 6.052712477869866e-07, "logits/chosen": -1.8081855773925781, "logits/rejected": -1.8254966735839844, "logps/chosen": -1.8928664922714233, "logps/rejected": -2.5116868019104004, "loss": 1.3671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.928665161132812, "rewards/margins": 6.188204288482666, "rewards/rejected": -25.116870880126953, "step": 14515 }, { "epoch": 0.48939971013515793, "grad_norm": 24.57520866394043, "learning_rate": 6.049836881186334e-07, "logits/chosen": -1.295364499092102, "logits/rejected": -1.5239925384521484, "logps/chosen": -2.708310842514038, "logps/rejected": -3.2306361198425293, "loss": 2.9936, "rewards/accuracies": 0.5, "rewards/chosen": -27.083110809326172, "rewards/margins": 5.223249912261963, "rewards/rejected": -32.306358337402344, "step": 14520 }, { "epoch": 0.4895682362061411, "grad_norm": 4.418197154998779, "learning_rate": 6.046960921195616e-07, "logits/chosen": -1.1067047119140625, "logits/rejected": -1.0724799633026123, "logps/chosen": -2.7758538722991943, "logps/rejected": -2.9014029502868652, "loss": 2.8487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.7585391998291, "rewards/margins": 1.2554924488067627, "rewards/rejected": -29.0140323638916, "step": 14525 }, { "epoch": 0.48973676227712426, "grad_norm": 82.43099975585938, "learning_rate": 6.044084598892973e-07, "logits/chosen": -1.6011970043182373, "logits/rejected": -1.5154972076416016, "logps/chosen": -2.1913952827453613, "logps/rejected": -2.110191822052002, "loss": 3.9255, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.913951873779297, "rewards/margins": -0.812035083770752, "rewards/rejected": -21.101917266845703, "step": 14530 }, { "epoch": 0.4899052883481074, "grad_norm": 32.03125, "learning_rate": 6.041207915273787e-07, "logits/chosen": -1.1000322103500366, "logits/rejected": -0.9549843072891235, "logps/chosen": -2.2531070709228516, "logps/rejected": -2.343667507171631, "loss": 2.4727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.531070709228516, "rewards/margins": 0.9056074023246765, "rewards/rejected": -23.436676025390625, "step": 14535 }, { "epoch": 0.49007381441909065, "grad_norm": 48.95552062988281, "learning_rate": 6.038330871333563e-07, "logits/chosen": -1.446215033531189, "logits/rejected": -1.434485912322998, "logps/chosen": -1.998910665512085, "logps/rejected": -1.985263466835022, "loss": 3.2234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.98910903930664, "rewards/margins": -0.13647422194480896, "rewards/rejected": -19.852632522583008, "step": 14540 }, { "epoch": 0.4902423404900738, "grad_norm": 29.8745174407959, "learning_rate": 6.035453468067934e-07, "logits/chosen": -1.5774548053741455, "logits/rejected": -1.8568214178085327, "logps/chosen": -1.9072927236557007, "logps/rejected": -2.082167625427246, "loss": 2.8992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.072927474975586, "rewards/margins": 1.7487468719482422, "rewards/rejected": -20.821674346923828, "step": 14545 }, { "epoch": 0.490410866561057, "grad_norm": 20.004297256469727, "learning_rate": 6.032575706472654e-07, "logits/chosen": -1.165950059890747, "logits/rejected": -1.2661291360855103, "logps/chosen": -2.0044305324554443, "logps/rejected": -2.1626477241516113, "loss": 3.1361, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.0443058013916, "rewards/margins": 1.5821691751480103, "rewards/rejected": -21.626474380493164, "step": 14550 }, { "epoch": 0.4905793926320402, "grad_norm": 29.16197395324707, "learning_rate": 6.029697587543603e-07, "logits/chosen": -1.1389122009277344, "logits/rejected": -1.1878808736801147, "logps/chosen": -2.138899803161621, "logps/rejected": -2.3903207778930664, "loss": 1.5864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.388996124267578, "rewards/margins": 2.5142085552215576, "rewards/rejected": -23.903209686279297, "step": 14555 }, { "epoch": 0.49074791870302337, "grad_norm": 48.880123138427734, "learning_rate": 6.026819112276786e-07, "logits/chosen": -1.1174449920654297, "logits/rejected": -1.1852917671203613, "logps/chosen": -2.229419231414795, "logps/rejected": -2.2317538261413574, "loss": 3.3643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.294193267822266, "rewards/margins": 0.023347090929746628, "rewards/rejected": -22.317541122436523, "step": 14560 }, { "epoch": 0.49091644477400653, "grad_norm": 35.779170989990234, "learning_rate": 6.02394028166833e-07, "logits/chosen": -1.393424153327942, "logits/rejected": -1.6953125, "logps/chosen": -2.20637845993042, "logps/rejected": -2.665292739868164, "loss": 1.8522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.063785552978516, "rewards/margins": 4.589139461517334, "rewards/rejected": -26.652923583984375, "step": 14565 }, { "epoch": 0.4910849708449897, "grad_norm": 22.484113693237305, "learning_rate": 6.021061096714484e-07, "logits/chosen": -1.3832494020462036, "logits/rejected": -1.5571014881134033, "logps/chosen": -2.7218856811523438, "logps/rejected": -3.0552916526794434, "loss": 1.4561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.218856811523438, "rewards/margins": 3.334063768386841, "rewards/rejected": -30.552921295166016, "step": 14570 }, { "epoch": 0.4912534969159729, "grad_norm": 37.85688400268555, "learning_rate": 6.01818155841162e-07, "logits/chosen": -1.389123558998108, "logits/rejected": -1.6732994318008423, "logps/chosen": -1.9779088497161865, "logps/rejected": -2.153419017791748, "loss": 2.1336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.779090881347656, "rewards/margins": 1.755099892616272, "rewards/rejected": -21.534189224243164, "step": 14575 }, { "epoch": 0.4914220229869561, "grad_norm": 56.15266418457031, "learning_rate": 6.015301667756233e-07, "logits/chosen": -1.1650410890579224, "logits/rejected": -1.157894492149353, "logps/chosen": -1.8984973430633545, "logps/rejected": -2.231627941131592, "loss": 1.5791, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.984973907470703, "rewards/margins": 3.331307888031006, "rewards/rejected": -22.316282272338867, "step": 14580 }, { "epoch": 0.49159054905793925, "grad_norm": 37.5297737121582, "learning_rate": 6.012421425744941e-07, "logits/chosen": -1.0445148944854736, "logits/rejected": -1.4065015316009521, "logps/chosen": -1.8271509408950806, "logps/rejected": -1.9741817712783813, "loss": 2.9193, "rewards/accuracies": 0.5, "rewards/chosen": -18.271509170532227, "rewards/margins": 1.4703084230422974, "rewards/rejected": -19.741817474365234, "step": 14585 }, { "epoch": 0.4917590751289224, "grad_norm": 39.2474365234375, "learning_rate": 6.009540833374481e-07, "logits/chosen": -1.191789984703064, "logits/rejected": -1.3047010898590088, "logps/chosen": -2.168593645095825, "logps/rejected": -2.7140133380889893, "loss": 1.7459, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.68593406677246, "rewards/margins": 5.454197883605957, "rewards/rejected": -27.140132904052734, "step": 14590 }, { "epoch": 0.49192760119990564, "grad_norm": 24.918357849121094, "learning_rate": 6.006659891641712e-07, "logits/chosen": -0.8684003949165344, "logits/rejected": -1.174020767211914, "logps/chosen": -2.2814254760742188, "logps/rejected": -2.5541939735412598, "loss": 3.1475, "rewards/accuracies": 0.5, "rewards/chosen": -22.814252853393555, "rewards/margins": 2.7276878356933594, "rewards/rejected": -25.541942596435547, "step": 14595 }, { "epoch": 0.4920961272708888, "grad_norm": 59.820919036865234, "learning_rate": 6.003778601543616e-07, "logits/chosen": -1.7482401132583618, "logits/rejected": -1.6752662658691406, "logps/chosen": -2.2109625339508057, "logps/rejected": -2.252445936203003, "loss": 3.9494, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.1096248626709, "rewards/margins": 0.4148353934288025, "rewards/rejected": -22.524459838867188, "step": 14600 }, { "epoch": 0.49226465334187197, "grad_norm": 34.3822135925293, "learning_rate": 6.000896964077295e-07, "logits/chosen": -0.9001103639602661, "logits/rejected": -0.9175226092338562, "logps/chosen": -2.6637980937957764, "logps/rejected": -2.9894144535064697, "loss": 3.8383, "rewards/accuracies": 0.5, "rewards/chosen": -26.63797950744629, "rewards/margins": 3.2561659812927246, "rewards/rejected": -29.89414405822754, "step": 14605 }, { "epoch": 0.4924331794128552, "grad_norm": 20.390865325927734, "learning_rate": 5.998014980239966e-07, "logits/chosen": -1.5274779796600342, "logits/rejected": -1.6380846500396729, "logps/chosen": -2.4721086025238037, "logps/rejected": -2.516230821609497, "loss": 3.6446, "rewards/accuracies": 0.5, "rewards/chosen": -24.721084594726562, "rewards/margins": 0.4412227272987366, "rewards/rejected": -25.162307739257812, "step": 14610 }, { "epoch": 0.49260170548383836, "grad_norm": 28.802907943725586, "learning_rate": 5.995132651028973e-07, "logits/chosen": -1.3802731037139893, "logits/rejected": -1.5385868549346924, "logps/chosen": -2.067274570465088, "logps/rejected": -2.7522835731506348, "loss": 2.2341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.672746658325195, "rewards/margins": 6.850091457366943, "rewards/rejected": -27.522836685180664, "step": 14615 }, { "epoch": 0.4927702315548215, "grad_norm": 33.500038146972656, "learning_rate": 5.992249977441778e-07, "logits/chosen": -1.3794944286346436, "logits/rejected": -1.4457772970199585, "logps/chosen": -2.42356538772583, "logps/rejected": -2.1479406356811523, "loss": 5.8888, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.23565673828125, "rewards/margins": -2.7562496662139893, "rewards/rejected": -21.47940444946289, "step": 14620 }, { "epoch": 0.4929387576258047, "grad_norm": 25.33816909790039, "learning_rate": 5.989366960475956e-07, "logits/chosen": -1.3720991611480713, "logits/rejected": -1.6218681335449219, "logps/chosen": -1.9994127750396729, "logps/rejected": -2.1939616203308105, "loss": 2.91, "rewards/accuracies": 0.5, "rewards/chosen": -19.99412727355957, "rewards/margins": 1.9454885721206665, "rewards/rejected": -21.93961524963379, "step": 14625 }, { "epoch": 0.4931072836967879, "grad_norm": 1.7631736993789673, "learning_rate": 5.986483601129212e-07, "logits/chosen": -0.9612113833427429, "logits/rejected": -1.0274362564086914, "logps/chosen": -2.0222010612487793, "logps/rejected": -2.2431979179382324, "loss": 2.969, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.22201156616211, "rewards/margins": 2.209967851638794, "rewards/rejected": -22.43198013305664, "step": 14630 }, { "epoch": 0.4932758097677711, "grad_norm": 23.257797241210938, "learning_rate": 5.983599900399357e-07, "logits/chosen": -1.47468101978302, "logits/rejected": -1.5135730504989624, "logps/chosen": -2.2988877296447754, "logps/rejected": -1.9558820724487305, "loss": 6.8303, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.988876342773438, "rewards/margins": -3.430056095123291, "rewards/rejected": -19.558818817138672, "step": 14635 }, { "epoch": 0.49344433583875424, "grad_norm": 121.70762634277344, "learning_rate": 5.98071585928433e-07, "logits/chosen": -1.7339370250701904, "logits/rejected": -1.6378087997436523, "logps/chosen": -2.540347099304199, "logps/rejected": -2.673609733581543, "loss": 3.1187, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.40346908569336, "rewards/margins": 1.3326267004013062, "rewards/rejected": -26.736095428466797, "step": 14640 }, { "epoch": 0.4936128619097374, "grad_norm": 22.278928756713867, "learning_rate": 5.977831478782181e-07, "logits/chosen": -1.2727556228637695, "logits/rejected": -1.4096349477767944, "logps/chosen": -1.9860179424285889, "logps/rejected": -2.102168083190918, "loss": 2.8199, "rewards/accuracies": 0.5, "rewards/chosen": -19.860179901123047, "rewards/margins": 1.1615017652511597, "rewards/rejected": -21.02168083190918, "step": 14645 }, { "epoch": 0.49378138798072063, "grad_norm": 12.310074806213379, "learning_rate": 5.974946759891084e-07, "logits/chosen": -1.1900991201400757, "logits/rejected": -1.4266362190246582, "logps/chosen": -2.0711121559143066, "logps/rejected": -2.1807780265808105, "loss": 2.5928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.711122512817383, "rewards/margins": 1.0966581106185913, "rewards/rejected": -21.80777931213379, "step": 14650 }, { "epoch": 0.4939499140517038, "grad_norm": 139.086669921875, "learning_rate": 5.972061703609326e-07, "logits/chosen": -1.0209219455718994, "logits/rejected": -0.8229808807373047, "logps/chosen": -2.4855475425720215, "logps/rejected": -2.6358425617218018, "loss": 3.4862, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.8554744720459, "rewards/margins": 1.502950668334961, "rewards/rejected": -26.35842514038086, "step": 14655 }, { "epoch": 0.49411844012268696, "grad_norm": 32.465938568115234, "learning_rate": 5.969176310935307e-07, "logits/chosen": -1.2797296047210693, "logits/rejected": -1.3183832168579102, "logps/chosen": -1.6758267879486084, "logps/rejected": -1.6694438457489014, "loss": 3.3985, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.758268356323242, "rewards/margins": -0.0638284683227539, "rewards/rejected": -16.694438934326172, "step": 14660 }, { "epoch": 0.4942869661936702, "grad_norm": 57.931907653808594, "learning_rate": 5.966290582867552e-07, "logits/chosen": -1.1228911876678467, "logits/rejected": -1.4107589721679688, "logps/chosen": -2.399655818939209, "logps/rejected": -2.4190433025360107, "loss": 3.325, "rewards/accuracies": 0.5, "rewards/chosen": -23.996559143066406, "rewards/margins": 0.19387368857860565, "rewards/rejected": -24.190433502197266, "step": 14665 }, { "epoch": 0.49445549226465335, "grad_norm": 24.14126205444336, "learning_rate": 5.963404520404696e-07, "logits/chosen": -1.2213075160980225, "logits/rejected": -1.4488023519515991, "logps/chosen": -1.8714441061019897, "logps/rejected": -2.1866447925567627, "loss": 2.0998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.714441299438477, "rewards/margins": 3.152009963989258, "rewards/rejected": -21.8664493560791, "step": 14670 }, { "epoch": 0.4946240183356365, "grad_norm": 32.929386138916016, "learning_rate": 5.960518124545492e-07, "logits/chosen": -1.1682794094085693, "logits/rejected": -1.2816708087921143, "logps/chosen": -2.183274030685425, "logps/rejected": -2.345292329788208, "loss": 3.108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.832740783691406, "rewards/margins": 1.6201798915863037, "rewards/rejected": -23.45292091369629, "step": 14675 }, { "epoch": 0.4947925444066197, "grad_norm": 19.513643264770508, "learning_rate": 5.957631396288809e-07, "logits/chosen": -1.9665130376815796, "logits/rejected": -2.2521214485168457, "logps/chosen": -2.527388334274292, "logps/rejected": -2.6848843097686768, "loss": 3.246, "rewards/accuracies": 0.5, "rewards/chosen": -25.273883819580078, "rewards/margins": 1.5749595165252686, "rewards/rejected": -26.848840713500977, "step": 14680 }, { "epoch": 0.4949610704776029, "grad_norm": 31.406166076660156, "learning_rate": 5.954744336633629e-07, "logits/chosen": -1.33124840259552, "logits/rejected": -1.4813728332519531, "logps/chosen": -2.0210530757904053, "logps/rejected": -2.093916893005371, "loss": 2.7094, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.210533142089844, "rewards/margins": 0.7286360859870911, "rewards/rejected": -20.939167022705078, "step": 14685 }, { "epoch": 0.49512959654858607, "grad_norm": 23.128408432006836, "learning_rate": 5.95185694657905e-07, "logits/chosen": -1.0466244220733643, "logits/rejected": -1.5269792079925537, "logps/chosen": -2.0125339031219482, "logps/rejected": -2.319976329803467, "loss": 1.7864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.125341415405273, "rewards/margins": 3.0744259357452393, "rewards/rejected": -23.199764251708984, "step": 14690 }, { "epoch": 0.49529812261956924, "grad_norm": 50.07123947143555, "learning_rate": 5.948969227124282e-07, "logits/chosen": -0.7739205956459045, "logits/rejected": -1.1197071075439453, "logps/chosen": -2.1050896644592285, "logps/rejected": -2.4487571716308594, "loss": 2.6332, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.050893783569336, "rewards/margins": 3.4366748332977295, "rewards/rejected": -24.487571716308594, "step": 14695 }, { "epoch": 0.4954666486905524, "grad_norm": 11.936738967895508, "learning_rate": 5.946081179268654e-07, "logits/chosen": -1.552634596824646, "logits/rejected": -1.8280937671661377, "logps/chosen": -2.6402595043182373, "logps/rejected": -2.8458423614501953, "loss": 3.8194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.4025936126709, "rewards/margins": 2.0558295249938965, "rewards/rejected": -28.458423614501953, "step": 14700 }, { "epoch": 0.4956351747615356, "grad_norm": 71.45115661621094, "learning_rate": 5.943192804011602e-07, "logits/chosen": -1.7254797220230103, "logits/rejected": -1.431593894958496, "logps/chosen": -3.140167474746704, "logps/rejected": -3.0458261966705322, "loss": 4.3807, "rewards/accuracies": 0.5, "rewards/chosen": -31.401676177978516, "rewards/margins": -0.9434127807617188, "rewards/rejected": -30.4582576751709, "step": 14705 }, { "epoch": 0.4958037008325188, "grad_norm": 17.0635929107666, "learning_rate": 5.940304102352682e-07, "logits/chosen": -1.3017793893814087, "logits/rejected": -1.3800328969955444, "logps/chosen": -1.8271408081054688, "logps/rejected": -1.947548508644104, "loss": 3.4235, "rewards/accuracies": 0.5, "rewards/chosen": -18.271406173706055, "rewards/margins": 1.2040780782699585, "rewards/rejected": -19.475482940673828, "step": 14710 }, { "epoch": 0.49597222690350196, "grad_norm": 18.700368881225586, "learning_rate": 5.93741507529156e-07, "logits/chosen": -1.353197455406189, "logits/rejected": -1.3648579120635986, "logps/chosen": -1.6878719329833984, "logps/rejected": -1.8210432529449463, "loss": 2.3033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.878719329833984, "rewards/margins": 1.3317129611968994, "rewards/rejected": -18.210430145263672, "step": 14715 }, { "epoch": 0.4961407529744852, "grad_norm": 20.979001998901367, "learning_rate": 5.934525723828011e-07, "logits/chosen": -1.2096859216690063, "logits/rejected": -1.4101965427398682, "logps/chosen": -2.714660167694092, "logps/rejected": -2.635310649871826, "loss": 3.9666, "rewards/accuracies": 0.5, "rewards/chosen": -27.146602630615234, "rewards/margins": -0.7934969663619995, "rewards/rejected": -26.353107452392578, "step": 14720 }, { "epoch": 0.49630927904546834, "grad_norm": 28.868972778320312, "learning_rate": 5.931636048961928e-07, "logits/chosen": -2.1657252311706543, "logits/rejected": -2.1079888343811035, "logps/chosen": -1.6346423625946045, "logps/rejected": -1.6175251007080078, "loss": 3.3324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -16.346424102783203, "rewards/margins": -0.1711721420288086, "rewards/rejected": -16.17525291442871, "step": 14725 }, { "epoch": 0.4964778051164515, "grad_norm": 65.39803314208984, "learning_rate": 5.928746051693314e-07, "logits/chosen": -1.4344148635864258, "logits/rejected": -1.676160454750061, "logps/chosen": -2.6490559577941895, "logps/rejected": -2.7858047485351562, "loss": 3.5182, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.490558624267578, "rewards/margins": 1.367490530014038, "rewards/rejected": -27.858051300048828, "step": 14730 }, { "epoch": 0.4966463311874347, "grad_norm": 28.76473617553711, "learning_rate": 5.925855733022284e-07, "logits/chosen": -1.472895860671997, "logits/rejected": -1.8007482290267944, "logps/chosen": -2.340292453765869, "logps/rejected": -2.5708696842193604, "loss": 1.9541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.40292739868164, "rewards/margins": 2.305771827697754, "rewards/rejected": -25.708698272705078, "step": 14735 }, { "epoch": 0.4968148572584179, "grad_norm": 32.8542594909668, "learning_rate": 5.922965093949059e-07, "logits/chosen": -1.4455643892288208, "logits/rejected": -1.7135651111602783, "logps/chosen": -2.043985366821289, "logps/rejected": -2.2156598567962646, "loss": 2.1238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.43985366821289, "rewards/margins": 1.7167431116104126, "rewards/rejected": -22.156597137451172, "step": 14740 }, { "epoch": 0.49698338332940106, "grad_norm": 21.394441604614258, "learning_rate": 5.92007413547398e-07, "logits/chosen": -1.5994513034820557, "logits/rejected": -1.3545200824737549, "logps/chosen": -1.9160792827606201, "logps/rejected": -1.9209601879119873, "loss": 3.2191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.16079330444336, "rewards/margins": 0.048807524144649506, "rewards/rejected": -19.20960235595703, "step": 14745 }, { "epoch": 0.49715190940038423, "grad_norm": 1.6420506238937378, "learning_rate": 5.917182858597493e-07, "logits/chosen": -0.9913978576660156, "logits/rejected": -1.404176950454712, "logps/chosen": -2.147432804107666, "logps/rejected": -2.678109645843506, "loss": 2.0465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.47432518005371, "rewards/margins": 5.3067708015441895, "rewards/rejected": -26.781097412109375, "step": 14750 }, { "epoch": 0.4973204354713674, "grad_norm": 22.916858673095703, "learning_rate": 5.914291264320152e-07, "logits/chosen": -1.6764614582061768, "logits/rejected": -1.8273632526397705, "logps/chosen": -2.4563040733337402, "logps/rejected": -2.4414238929748535, "loss": 4.3066, "rewards/accuracies": 0.5, "rewards/chosen": -24.56304168701172, "rewards/margins": -0.14880123734474182, "rewards/rejected": -24.41423988342285, "step": 14755 }, { "epoch": 0.4974889615423506, "grad_norm": 30.65839195251465, "learning_rate": 5.911399353642629e-07, "logits/chosen": -1.2902991771697998, "logits/rejected": -1.3787257671356201, "logps/chosen": -2.3649189472198486, "logps/rejected": -2.294633388519287, "loss": 4.6882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.649185180664062, "rewards/margins": -0.7028514742851257, "rewards/rejected": -22.946334838867188, "step": 14760 }, { "epoch": 0.4976574876133338, "grad_norm": 44.71762466430664, "learning_rate": 5.908507127565695e-07, "logits/chosen": -1.3229894638061523, "logits/rejected": -1.5332590341567993, "logps/chosen": -2.423475742340088, "logps/rejected": -2.6610312461853027, "loss": 2.5497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.23475456237793, "rewards/margins": 2.375556230545044, "rewards/rejected": -26.61031150817871, "step": 14765 }, { "epoch": 0.49782601368431695, "grad_norm": 11.168540954589844, "learning_rate": 5.905614587090239e-07, "logits/chosen": -1.39534592628479, "logits/rejected": -1.234565258026123, "logps/chosen": -2.0755391120910645, "logps/rejected": -2.285327434539795, "loss": 2.1858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.755390167236328, "rewards/margins": 2.0978846549987793, "rewards/rejected": -22.853275299072266, "step": 14770 }, { "epoch": 0.49799453975530017, "grad_norm": 27.3898983001709, "learning_rate": 5.902721733217254e-07, "logits/chosen": -1.3990356922149658, "logits/rejected": -1.514904499053955, "logps/chosen": -1.944977045059204, "logps/rejected": -1.9022576808929443, "loss": 3.5773, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.449771881103516, "rewards/margins": -0.4271933436393738, "rewards/rejected": -19.0225772857666, "step": 14775 }, { "epoch": 0.49816306582628334, "grad_norm": 7.661595344543457, "learning_rate": 5.899828566947843e-07, "logits/chosen": -1.1851098537445068, "logits/rejected": -1.463303565979004, "logps/chosen": -2.121722459793091, "logps/rejected": -2.8411059379577637, "loss": 1.9996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.21722412109375, "rewards/margins": 7.193833827972412, "rewards/rejected": -28.411062240600586, "step": 14780 }, { "epoch": 0.4983315918972665, "grad_norm": 30.427043914794922, "learning_rate": 5.896935089283217e-07, "logits/chosen": -1.6385900974273682, "logits/rejected": -1.9292678833007812, "logps/chosen": -2.1936774253845215, "logps/rejected": -2.4388184547424316, "loss": 2.0809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.9367733001709, "rewards/margins": 2.4514102935791016, "rewards/rejected": -24.38818359375, "step": 14785 }, { "epoch": 0.49850011796824967, "grad_norm": 22.440723419189453, "learning_rate": 5.894041301224694e-07, "logits/chosen": -1.774770736694336, "logits/rejected": -2.0120785236358643, "logps/chosen": -2.0042014122009277, "logps/rejected": -2.512622356414795, "loss": 1.9292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.04201316833496, "rewards/margins": 5.084211826324463, "rewards/rejected": -25.126224517822266, "step": 14790 }, { "epoch": 0.4986686440392329, "grad_norm": 32.15318298339844, "learning_rate": 5.8911472037737e-07, "logits/chosen": -1.5917437076568604, "logits/rejected": -1.4512475728988647, "logps/chosen": -2.2175662517547607, "logps/rejected": -2.5062191486358643, "loss": 3.2555, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.175662994384766, "rewards/margins": 2.886528253555298, "rewards/rejected": -25.06218910217285, "step": 14795 }, { "epoch": 0.49883717011021605, "grad_norm": 24.423494338989258, "learning_rate": 5.88825279793177e-07, "logits/chosen": -1.39306640625, "logits/rejected": -1.2912009954452515, "logps/chosen": -2.3427510261535645, "logps/rejected": -2.494655132293701, "loss": 2.7856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.427509307861328, "rewards/margins": 1.5190414190292358, "rewards/rejected": -24.946552276611328, "step": 14800 }, { "epoch": 0.49883717011021605, "eval_logits/chosen": -1.746840238571167, "eval_logits/rejected": -1.8720086812973022, "eval_logps/chosen": -2.02829647064209, "eval_logps/rejected": -2.141511917114258, "eval_loss": 2.990762948989868, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -20.2829647064209, "eval_rewards/margins": 1.1321519613265991, "eval_rewards/rejected": -21.415117263793945, "eval_runtime": 12.8988, "eval_samples_per_second": 7.753, "eval_steps_per_second": 1.938, "step": 14800 }, { "epoch": 0.4990056961811992, "grad_norm": 23.94206428527832, "learning_rate": 5.885358084700542e-07, "logits/chosen": -1.1922471523284912, "logits/rejected": -1.102346658706665, "logps/chosen": -2.3455393314361572, "logps/rejected": -2.550114870071411, "loss": 2.7445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.455392837524414, "rewards/margins": 2.045755624771118, "rewards/rejected": -25.501148223876953, "step": 14805 }, { "epoch": 0.4991742222521824, "grad_norm": 27.79281997680664, "learning_rate": 5.882463065081762e-07, "logits/chosen": -1.541636347770691, "logits/rejected": -1.4492474794387817, "logps/chosen": -2.2527847290039062, "logps/rejected": -2.196924924850464, "loss": 3.8975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.52784538269043, "rewards/margins": -0.5585947036743164, "rewards/rejected": -21.96925163269043, "step": 14810 }, { "epoch": 0.4993427483231656, "grad_norm": 16.550588607788086, "learning_rate": 5.879567740077283e-07, "logits/chosen": -1.3408617973327637, "logits/rejected": -1.3791230916976929, "logps/chosen": -1.9284662008285522, "logps/rejected": -2.4746458530426025, "loss": 1.635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.28466033935547, "rewards/margins": 5.461796760559082, "rewards/rejected": -24.746456146240234, "step": 14815 }, { "epoch": 0.4995112743941488, "grad_norm": 26.888790130615234, "learning_rate": 5.876672110689063e-07, "logits/chosen": -1.716398000717163, "logits/rejected": -1.6853067874908447, "logps/chosen": -1.8864831924438477, "logps/rejected": -1.9089370965957642, "loss": 2.9516, "rewards/accuracies": 0.5, "rewards/chosen": -18.864831924438477, "rewards/margins": 0.22453880310058594, "rewards/rejected": -19.089370727539062, "step": 14820 }, { "epoch": 0.49967980046513194, "grad_norm": 142.75662231445312, "learning_rate": 5.873776177919163e-07, "logits/chosen": -2.0140433311462402, "logits/rejected": -2.0492501258850098, "logps/chosen": -1.961703896522522, "logps/rejected": -2.112826108932495, "loss": 2.7826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.61703872680664, "rewards/margins": 1.511224389076233, "rewards/rejected": -21.12826156616211, "step": 14825 }, { "epoch": 0.49984832653611516, "grad_norm": 12.503056526184082, "learning_rate": 5.870879942769757e-07, "logits/chosen": -1.238468050956726, "logits/rejected": -1.2341539859771729, "logps/chosen": -2.7365505695343018, "logps/rejected": -2.764101028442383, "loss": 3.6693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.365509033203125, "rewards/margins": 0.2755018174648285, "rewards/rejected": -27.641006469726562, "step": 14830 }, { "epoch": 0.5000168526070983, "grad_norm": 52.38449478149414, "learning_rate": 5.867983406243111e-07, "logits/chosen": -1.2844483852386475, "logits/rejected": -1.7105772495269775, "logps/chosen": -2.4828011989593506, "logps/rejected": -2.6509883403778076, "loss": 2.5619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.828014373779297, "rewards/margins": 1.6818711757659912, "rewards/rejected": -26.509883880615234, "step": 14835 }, { "epoch": 0.5001853786780815, "grad_norm": 26.385717391967773, "learning_rate": 5.865086569341606e-07, "logits/chosen": -0.7888490557670593, "logits/rejected": -0.9685935974121094, "logps/chosen": -2.1663498878479004, "logps/rejected": -2.4157395362854004, "loss": 3.3503, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.663497924804688, "rewards/margins": 2.493898391723633, "rewards/rejected": -24.157394409179688, "step": 14840 }, { "epoch": 0.5003539047490647, "grad_norm": 26.826446533203125, "learning_rate": 5.862189433067722e-07, "logits/chosen": -1.3891171216964722, "logits/rejected": -1.4050616025924683, "logps/chosen": -1.6742547750473022, "logps/rejected": -1.6001243591308594, "loss": 3.8043, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.7425479888916, "rewards/margins": -0.7413040995597839, "rewards/rejected": -16.001245498657227, "step": 14845 }, { "epoch": 0.5005224308200479, "grad_norm": 291.24468994140625, "learning_rate": 5.859291998424047e-07, "logits/chosen": -1.1373765468597412, "logits/rejected": -1.0353875160217285, "logps/chosen": -2.4684436321258545, "logps/rejected": -2.557105541229248, "loss": 4.6681, "rewards/accuracies": 0.5, "rewards/chosen": -24.684436798095703, "rewards/margins": 0.8866220712661743, "rewards/rejected": -25.57105827331543, "step": 14850 }, { "epoch": 0.500690956891031, "grad_norm": 42.07600784301758, "learning_rate": 5.856394266413264e-07, "logits/chosen": -1.3034619092941284, "logits/rejected": -1.3409957885742188, "logps/chosen": -2.2791152000427246, "logps/rejected": -2.2514889240264893, "loss": 3.4252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.791152954101562, "rewards/margins": -0.27626290917396545, "rewards/rejected": -22.514888763427734, "step": 14855 }, { "epoch": 0.5008594829620142, "grad_norm": 2.786142110824585, "learning_rate": 5.853496238038165e-07, "logits/chosen": -1.2134068012237549, "logits/rejected": -1.3753328323364258, "logps/chosen": -2.052281141281128, "logps/rejected": -2.3305046558380127, "loss": 1.8951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.522811889648438, "rewards/margins": 2.7822327613830566, "rewards/rejected": -23.305044174194336, "step": 14860 }, { "epoch": 0.5010280090329974, "grad_norm": 25.388391494750977, "learning_rate": 5.850597914301646e-07, "logits/chosen": -1.2176518440246582, "logits/rejected": -1.2763893604278564, "logps/chosen": -2.0402097702026367, "logps/rejected": -2.1866259574890137, "loss": 2.6111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.402097702026367, "rewards/margins": 1.4641621112823486, "rewards/rejected": -21.866260528564453, "step": 14865 }, { "epoch": 0.5011965351039805, "grad_norm": 62.56755065917969, "learning_rate": 5.847699296206699e-07, "logits/chosen": -1.7310209274291992, "logits/rejected": -1.6556918621063232, "logps/chosen": -2.3317017555236816, "logps/rejected": -2.410207748413086, "loss": 4.2986, "rewards/accuracies": 0.5, "rewards/chosen": -23.317020416259766, "rewards/margins": 0.7850597500801086, "rewards/rejected": -24.102079391479492, "step": 14870 }, { "epoch": 0.5013650611749638, "grad_norm": 42.40745162963867, "learning_rate": 5.844800384756427e-07, "logits/chosen": -0.902258038520813, "logits/rejected": -0.7612020373344421, "logps/chosen": -2.677332639694214, "logps/rejected": -3.0184645652770996, "loss": 3.5555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.773326873779297, "rewards/margins": 3.4113197326660156, "rewards/rejected": -30.184650421142578, "step": 14875 }, { "epoch": 0.501533587245947, "grad_norm": 34.04011535644531, "learning_rate": 5.841901180954023e-07, "logits/chosen": -1.5480183362960815, "logits/rejected": -1.486154317855835, "logps/chosen": -2.7318997383117676, "logps/rejected": -2.1882028579711914, "loss": 9.325, "rewards/accuracies": 0.5, "rewards/chosen": -27.318994522094727, "rewards/margins": -5.436966896057129, "rewards/rejected": -21.882028579711914, "step": 14880 }, { "epoch": 0.5017021133169302, "grad_norm": 31.812999725341797, "learning_rate": 5.839001685802791e-07, "logits/chosen": -1.213324785232544, "logits/rejected": -1.4107666015625, "logps/chosen": -2.0232436656951904, "logps/rejected": -2.030778646469116, "loss": 3.2473, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.232437133789062, "rewards/margins": 0.0753483772277832, "rewards/rejected": -20.307785034179688, "step": 14885 }, { "epoch": 0.5018706393879133, "grad_norm": 51.604061126708984, "learning_rate": 5.83610190030613e-07, "logits/chosen": -1.1090004444122314, "logits/rejected": -1.0023685693740845, "logps/chosen": -1.8414013385772705, "logps/rejected": -2.0963680744171143, "loss": 3.827, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.414012908935547, "rewards/margins": 2.5496678352355957, "rewards/rejected": -20.963680267333984, "step": 14890 }, { "epoch": 0.5020391654588965, "grad_norm": 6.897836685180664, "learning_rate": 5.833201825467542e-07, "logits/chosen": -1.3187196254730225, "logits/rejected": -1.425402283668518, "logps/chosen": -1.9114145040512085, "logps/rejected": -2.122316598892212, "loss": 2.0474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.114147186279297, "rewards/margins": 2.109020948410034, "rewards/rejected": -21.22316551208496, "step": 14895 }, { "epoch": 0.5022076915298797, "grad_norm": 51.942203521728516, "learning_rate": 5.830301462290631e-07, "logits/chosen": -1.130313515663147, "logits/rejected": -1.3741600513458252, "logps/chosen": -2.0696234703063965, "logps/rejected": -2.4011638164520264, "loss": 2.609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.696231842041016, "rewards/margins": 3.3154044151306152, "rewards/rejected": -24.011638641357422, "step": 14900 }, { "epoch": 0.5023762176008628, "grad_norm": 16.872711181640625, "learning_rate": 5.827400811779094e-07, "logits/chosen": -1.4417014122009277, "logits/rejected": -1.4584705829620361, "logps/chosen": -2.0206668376922607, "logps/rejected": -2.132718324661255, "loss": 2.462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.206668853759766, "rewards/margins": 1.120514154434204, "rewards/rejected": -21.32718276977539, "step": 14905 }, { "epoch": 0.502544743671846, "grad_norm": 36.745567321777344, "learning_rate": 5.824499874936737e-07, "logits/chosen": -1.2186622619628906, "logits/rejected": -1.7788499593734741, "logps/chosen": -2.42600154876709, "logps/rejected": -2.794022560119629, "loss": 2.5989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.2600154876709, "rewards/margins": 3.680208206176758, "rewards/rejected": -27.94022560119629, "step": 14910 }, { "epoch": 0.5027132697428293, "grad_norm": 20.291852951049805, "learning_rate": 5.821598652767456e-07, "logits/chosen": -1.5000197887420654, "logits/rejected": -1.3767282962799072, "logps/chosen": -2.5916619300842285, "logps/rejected": -2.7663090229034424, "loss": 2.7872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.916616439819336, "rewards/margins": 1.7464720010757446, "rewards/rejected": -27.663089752197266, "step": 14915 }, { "epoch": 0.5028817958138124, "grad_norm": 39.71499252319336, "learning_rate": 5.818697146275251e-07, "logits/chosen": -1.3548834323883057, "logits/rejected": -1.595643162727356, "logps/chosen": -2.2784557342529297, "logps/rejected": -2.455989122390747, "loss": 1.7163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.784555435180664, "rewards/margins": 1.7753328084945679, "rewards/rejected": -24.55988883972168, "step": 14920 }, { "epoch": 0.5030503218847956, "grad_norm": 21.26605796813965, "learning_rate": 5.815795356464219e-07, "logits/chosen": -1.3504364490509033, "logits/rejected": -1.2849876880645752, "logps/chosen": -1.9238868951797485, "logps/rejected": -1.8664134740829468, "loss": 4.2078, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.23887062072754, "rewards/margins": -0.5747331380844116, "rewards/rejected": -18.664134979248047, "step": 14925 }, { "epoch": 0.5032188479557788, "grad_norm": 54.63640213012695, "learning_rate": 5.812893284338554e-07, "logits/chosen": -1.5353538990020752, "logits/rejected": -1.5873312950134277, "logps/chosen": -2.139723300933838, "logps/rejected": -2.3303074836730957, "loss": 3.1369, "rewards/accuracies": 0.5, "rewards/chosen": -21.397235870361328, "rewards/margins": 1.9058374166488647, "rewards/rejected": -23.303071975708008, "step": 14930 }, { "epoch": 0.5033873740267619, "grad_norm": 21.923625946044922, "learning_rate": 5.809990930902553e-07, "logits/chosen": -1.544602870941162, "logits/rejected": -1.413293719291687, "logps/chosen": -1.8571140766143799, "logps/rejected": -2.018515110015869, "loss": 2.363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.57114028930664, "rewards/margins": 1.614009141921997, "rewards/rejected": -20.185152053833008, "step": 14935 }, { "epoch": 0.5035559000977451, "grad_norm": 67.67894744873047, "learning_rate": 5.8070882971606e-07, "logits/chosen": -1.0476644039154053, "logits/rejected": -0.980495274066925, "logps/chosen": -2.2644171714782715, "logps/rejected": -2.1518642902374268, "loss": 4.2407, "rewards/accuracies": 0.5, "rewards/chosen": -22.644174575805664, "rewards/margins": -1.1255325078964233, "rewards/rejected": -21.51864242553711, "step": 14940 }, { "epoch": 0.5037244261687283, "grad_norm": 16.13987922668457, "learning_rate": 5.804185384117189e-07, "logits/chosen": -1.2747676372528076, "logits/rejected": -1.362653136253357, "logps/chosen": -1.9708786010742188, "logps/rejected": -2.423872470855713, "loss": 1.4325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.708786010742188, "rewards/margins": 4.529940128326416, "rewards/rejected": -24.238727569580078, "step": 14945 }, { "epoch": 0.5038929522397115, "grad_norm": 15.113187789916992, "learning_rate": 5.801282192776897e-07, "logits/chosen": -0.9203144907951355, "logits/rejected": -1.1155325174331665, "logps/chosen": -1.7292630672454834, "logps/rejected": -2.20222806930542, "loss": 1.5978, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.29262924194336, "rewards/margins": 4.729650974273682, "rewards/rejected": -22.022281646728516, "step": 14950 }, { "epoch": 0.5040614783106947, "grad_norm": 20.39295196533203, "learning_rate": 5.798378724144408e-07, "logits/chosen": -1.5319854021072388, "logits/rejected": -1.7888109683990479, "logps/chosen": -2.368319272994995, "logps/rejected": -2.5020503997802734, "loss": 3.2456, "rewards/accuracies": 0.5, "rewards/chosen": -23.68319320678711, "rewards/margins": 1.3373106718063354, "rewards/rejected": -25.020505905151367, "step": 14955 }, { "epoch": 0.5042300043816779, "grad_norm": 41.03092575073242, "learning_rate": 5.795474979224497e-07, "logits/chosen": -1.3623888492584229, "logits/rejected": -1.5820858478546143, "logps/chosen": -1.880578637123108, "logps/rejected": -1.8766975402832031, "loss": 3.2324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.805784225463867, "rewards/margins": -0.03881063312292099, "rewards/rejected": -18.76697540283203, "step": 14960 }, { "epoch": 0.504398530452661, "grad_norm": 28.810705184936523, "learning_rate": 5.792570959022036e-07, "logits/chosen": -1.0483916997909546, "logits/rejected": -1.350998878479004, "logps/chosen": -1.980957269668579, "logps/rejected": -2.296393394470215, "loss": 2.5662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.809574127197266, "rewards/margins": 3.154359817504883, "rewards/rejected": -22.96393394470215, "step": 14965 }, { "epoch": 0.5045670565236442, "grad_norm": 18.993446350097656, "learning_rate": 5.789666664541995e-07, "logits/chosen": -0.7886873483657837, "logits/rejected": -0.8404957056045532, "logps/chosen": -2.4663655757904053, "logps/rejected": -2.771146774291992, "loss": 2.7351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.663654327392578, "rewards/margins": 3.0478122234344482, "rewards/rejected": -27.711467742919922, "step": 14970 }, { "epoch": 0.5047355825946274, "grad_norm": 61.77982711791992, "learning_rate": 5.78676209678943e-07, "logits/chosen": -1.4243978261947632, "logits/rejected": -1.5058616399765015, "logps/chosen": -3.024482250213623, "logps/rejected": -3.1523780822753906, "loss": 3.9404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.244823455810547, "rewards/margins": 1.2789571285247803, "rewards/rejected": -31.523778915405273, "step": 14975 }, { "epoch": 0.5049041086656105, "grad_norm": 19.27959442138672, "learning_rate": 5.783857256769503e-07, "logits/chosen": -1.178634762763977, "logits/rejected": -1.251468300819397, "logps/chosen": -1.8392133712768555, "logps/rejected": -1.937748670578003, "loss": 2.3974, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.392135620117188, "rewards/margins": 0.9853529930114746, "rewards/rejected": -19.377490997314453, "step": 14980 }, { "epoch": 0.5050726347365938, "grad_norm": 8.128096580505371, "learning_rate": 5.78095214548746e-07, "logits/chosen": -1.1162395477294922, "logits/rejected": -1.6783126592636108, "logps/chosen": -1.9752323627471924, "logps/rejected": -2.4849510192871094, "loss": 1.2585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.752323150634766, "rewards/margins": 5.097184658050537, "rewards/rejected": -24.84950828552246, "step": 14985 }, { "epoch": 0.505241160807577, "grad_norm": 31.431013107299805, "learning_rate": 5.778046763948649e-07, "logits/chosen": -1.3854622840881348, "logits/rejected": -1.2875674962997437, "logps/chosen": -2.1900856494903564, "logps/rejected": -1.9820547103881836, "loss": 5.1442, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.900854110717773, "rewards/margins": -2.080308437347412, "rewards/rejected": -19.820547103881836, "step": 14990 }, { "epoch": 0.5054096868785601, "grad_norm": 1.898257851600647, "learning_rate": 5.775141113158506e-07, "logits/chosen": -1.081305742263794, "logits/rejected": -1.121368646621704, "logps/chosen": -2.686314582824707, "logps/rejected": -2.8313148021698, "loss": 3.5432, "rewards/accuracies": 0.5, "rewards/chosen": -26.863147735595703, "rewards/margins": 1.450002670288086, "rewards/rejected": -28.313146591186523, "step": 14995 }, { "epoch": 0.5055782129495433, "grad_norm": 30.13627815246582, "learning_rate": 5.772235194122564e-07, "logits/chosen": -1.1736291646957397, "logits/rejected": -1.7007135152816772, "logps/chosen": -2.089254856109619, "logps/rejected": -2.541189670562744, "loss": 2.0636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.892545700073242, "rewards/margins": 4.519349098205566, "rewards/rejected": -25.411895751953125, "step": 15000 }, { "epoch": 0.5057467390205265, "grad_norm": 79.65396881103516, "learning_rate": 5.769329007846445e-07, "logits/chosen": -1.4201363325119019, "logits/rejected": -1.2374026775360107, "logps/chosen": -2.4222466945648193, "logps/rejected": -2.21855092048645, "loss": 5.2571, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.22246742248535, "rewards/margins": -2.0369582176208496, "rewards/rejected": -22.18550682067871, "step": 15005 }, { "epoch": 0.5059152650915096, "grad_norm": 36.43207550048828, "learning_rate": 5.766422555335866e-07, "logits/chosen": -1.0224153995513916, "logits/rejected": -1.168966293334961, "logps/chosen": -2.2806081771850586, "logps/rejected": -2.5962941646575928, "loss": 2.3357, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.806079864501953, "rewards/margins": 3.1568641662597656, "rewards/rejected": -25.96294593811035, "step": 15010 }, { "epoch": 0.5060837911624928, "grad_norm": 137.0742645263672, "learning_rate": 5.763515837596638e-07, "logits/chosen": -1.1404750347137451, "logits/rejected": -1.0680490732192993, "logps/chosen": -2.4310100078582764, "logps/rejected": -2.489689350128174, "loss": 2.8058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.310100555419922, "rewards/margins": 0.5867937207221985, "rewards/rejected": -24.896894454956055, "step": 15015 }, { "epoch": 0.506252317233476, "grad_norm": 20.5747127532959, "learning_rate": 5.760608855634661e-07, "logits/chosen": -1.1737111806869507, "logits/rejected": -1.6075645685195923, "logps/chosen": -1.9366137981414795, "logps/rejected": -2.1530511379241943, "loss": 1.5798, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.366138458251953, "rewards/margins": 2.1643741130828857, "rewards/rejected": -21.5305118560791, "step": 15020 }, { "epoch": 0.5064208433044592, "grad_norm": 34.711265563964844, "learning_rate": 5.757701610455924e-07, "logits/chosen": -1.3211795091629028, "logits/rejected": -1.266367793083191, "logps/chosen": -1.9475847482681274, "logps/rejected": -2.24564528465271, "loss": 1.8382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.475847244262695, "rewards/margins": 2.9806065559387207, "rewards/rejected": -22.45645523071289, "step": 15025 }, { "epoch": 0.5065893693754424, "grad_norm": 29.361732482910156, "learning_rate": 5.754794103066511e-07, "logits/chosen": -1.6665149927139282, "logits/rejected": -1.9457658529281616, "logps/chosen": -2.569488048553467, "logps/rejected": -2.8468360900878906, "loss": 1.7092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.69488525390625, "rewards/margins": 2.7734780311584473, "rewards/rejected": -28.46836280822754, "step": 15030 }, { "epoch": 0.5067578954464256, "grad_norm": 27.143373489379883, "learning_rate": 5.751886334472598e-07, "logits/chosen": -0.9758981466293335, "logits/rejected": -1.1668593883514404, "logps/chosen": -2.2181034088134766, "logps/rejected": -2.323904275894165, "loss": 3.4183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.1810359954834, "rewards/margins": 1.0580068826675415, "rewards/rejected": -23.23904037475586, "step": 15035 }, { "epoch": 0.5069264215174087, "grad_norm": 35.79897689819336, "learning_rate": 5.748978305680448e-07, "logits/chosen": -1.2053627967834473, "logits/rejected": -1.2655714750289917, "logps/chosen": -1.6004425287246704, "logps/rejected": -1.8336282968521118, "loss": 1.7421, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.004425048828125, "rewards/margins": 2.3318583965301514, "rewards/rejected": -18.336284637451172, "step": 15040 }, { "epoch": 0.5070949475883919, "grad_norm": 28.32414436340332, "learning_rate": 5.746070017696415e-07, "logits/chosen": -1.4866182804107666, "logits/rejected": -1.7096736431121826, "logps/chosen": -1.7442781925201416, "logps/rejected": -1.6984007358551025, "loss": 3.5889, "rewards/accuracies": 0.5, "rewards/chosen": -17.442779541015625, "rewards/margins": -0.4587737023830414, "rewards/rejected": -16.984004974365234, "step": 15045 }, { "epoch": 0.5072634736593751, "grad_norm": 32.17911911010742, "learning_rate": 5.743161471526943e-07, "logits/chosen": -1.323290467262268, "logits/rejected": -1.168046236038208, "logps/chosen": -1.8326724767684937, "logps/rejected": -1.8872158527374268, "loss": 2.6512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.326725006103516, "rewards/margins": 0.5454355478286743, "rewards/rejected": -18.872159957885742, "step": 15050 }, { "epoch": 0.5074319997303582, "grad_norm": 36.145240783691406, "learning_rate": 5.740252668178565e-07, "logits/chosen": -1.0528347492218018, "logits/rejected": -1.01715886592865, "logps/chosen": -2.278759241104126, "logps/rejected": -2.063829183578491, "loss": 5.1952, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.78759002685547, "rewards/margins": -2.149296522140503, "rewards/rejected": -20.638294219970703, "step": 15055 }, { "epoch": 0.5076005258013415, "grad_norm": 21.49053192138672, "learning_rate": 5.737343608657903e-07, "logits/chosen": -1.3348206281661987, "logits/rejected": -1.3797569274902344, "logps/chosen": -2.138988971710205, "logps/rejected": -2.2335526943206787, "loss": 2.9161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.389888763427734, "rewards/margins": 0.9456375241279602, "rewards/rejected": -22.335527420043945, "step": 15060 }, { "epoch": 0.5077690518723247, "grad_norm": 24.10394859313965, "learning_rate": 5.734434293971668e-07, "logits/chosen": -1.3850148916244507, "logits/rejected": -1.6381429433822632, "logps/chosen": -1.8205817937850952, "logps/rejected": -2.170335292816162, "loss": 2.4044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.2058162689209, "rewards/margins": 3.497534990310669, "rewards/rejected": -21.703351974487305, "step": 15065 }, { "epoch": 0.5079375779433078, "grad_norm": 103.56597900390625, "learning_rate": 5.73152472512666e-07, "logits/chosen": -1.0463709831237793, "logits/rejected": -1.275679588317871, "logps/chosen": -2.388948440551758, "logps/rejected": -2.543187379837036, "loss": 2.6494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.889484405517578, "rewards/margins": 1.5423904657363892, "rewards/rejected": -25.431873321533203, "step": 15070 }, { "epoch": 0.508106104014291, "grad_norm": 124.68612670898438, "learning_rate": 5.728614903129765e-07, "logits/chosen": -1.4812027215957642, "logits/rejected": -1.6524875164031982, "logps/chosen": -2.292492628097534, "logps/rejected": -2.298862934112549, "loss": 3.0968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.9249267578125, "rewards/margins": 0.06370306015014648, "rewards/rejected": -22.988630294799805, "step": 15075 }, { "epoch": 0.5082746300852742, "grad_norm": 26.963624954223633, "learning_rate": 5.725704828987959e-07, "logits/chosen": -1.2273039817810059, "logits/rejected": -1.193224549293518, "logps/chosen": -2.1760504245758057, "logps/rejected": -2.2250120639801025, "loss": 3.1869, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.7605037689209, "rewards/margins": 0.48961561918258667, "rewards/rejected": -22.250120162963867, "step": 15080 }, { "epoch": 0.5084431561562573, "grad_norm": 27.633817672729492, "learning_rate": 5.722794503708303e-07, "logits/chosen": -1.3985062837600708, "logits/rejected": -1.6811736822128296, "logps/chosen": -2.1464569568634033, "logps/rejected": -2.389500379562378, "loss": 2.1451, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.464569091796875, "rewards/margins": 2.4304375648498535, "rewards/rejected": -23.89500617980957, "step": 15085 }, { "epoch": 0.5086116822272405, "grad_norm": 30.096017837524414, "learning_rate": 5.719883928297946e-07, "logits/chosen": -1.3296657800674438, "logits/rejected": -1.5122735500335693, "logps/chosen": -2.052288293838501, "logps/rejected": -2.173229932785034, "loss": 2.4221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.522884368896484, "rewards/margins": 1.2094166278839111, "rewards/rejected": -21.7322998046875, "step": 15090 }, { "epoch": 0.5087802082982238, "grad_norm": 38.03456115722656, "learning_rate": 5.716973103764123e-07, "logits/chosen": -1.9371150732040405, "logits/rejected": -2.0468668937683105, "logps/chosen": -2.0087790489196777, "logps/rejected": -2.5527255535125732, "loss": 2.3194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.08778953552246, "rewards/margins": 5.439466953277588, "rewards/rejected": -25.52725601196289, "step": 15095 }, { "epoch": 0.508948734369207, "grad_norm": 20.301576614379883, "learning_rate": 5.714062031114159e-07, "logits/chosen": -1.9820003509521484, "logits/rejected": -2.280233383178711, "logps/chosen": -1.8788375854492188, "logps/rejected": -2.440950870513916, "loss": 1.7261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.788375854492188, "rewards/margins": 5.621129035949707, "rewards/rejected": -24.409503936767578, "step": 15100 }, { "epoch": 0.5091172604401901, "grad_norm": 124.19692993164062, "learning_rate": 5.711150711355456e-07, "logits/chosen": -1.373255968093872, "logits/rejected": -1.834007978439331, "logps/chosen": -2.469758987426758, "logps/rejected": -2.4483742713928223, "loss": 4.5018, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.697589874267578, "rewards/margins": -0.2138504981994629, "rewards/rejected": -24.483741760253906, "step": 15105 }, { "epoch": 0.5092857865111733, "grad_norm": 27.15669059753418, "learning_rate": 5.70823914549551e-07, "logits/chosen": -1.1241658926010132, "logits/rejected": -1.5976471900939941, "logps/chosen": -2.844392776489258, "logps/rejected": -3.51300311088562, "loss": 1.8059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.443927764892578, "rewards/margins": 6.686103820800781, "rewards/rejected": -35.130027770996094, "step": 15110 }, { "epoch": 0.5094543125821565, "grad_norm": 34.46886444091797, "learning_rate": 5.705327334541901e-07, "logits/chosen": -1.1911742687225342, "logits/rejected": -1.2832351922988892, "logps/chosen": -1.9296703338623047, "logps/rejected": -2.128652811050415, "loss": 2.3969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.296703338623047, "rewards/margins": 1.9898231029510498, "rewards/rejected": -21.28652572631836, "step": 15115 }, { "epoch": 0.5096228386531396, "grad_norm": 19.99987030029297, "learning_rate": 5.702415279502289e-07, "logits/chosen": -1.6466939449310303, "logits/rejected": -1.8893673419952393, "logps/chosen": -2.2943835258483887, "logps/rejected": -2.482844829559326, "loss": 2.9223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.94383430480957, "rewards/margins": 1.8846137523651123, "rewards/rejected": -24.828449249267578, "step": 15120 }, { "epoch": 0.5097913647241228, "grad_norm": 12.42703914642334, "learning_rate": 5.699502981384424e-07, "logits/chosen": -1.3233040571212769, "logits/rejected": -1.3392280340194702, "logps/chosen": -2.143939971923828, "logps/rejected": -2.0123233795166016, "loss": 6.2954, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.43939781188965, "rewards/margins": -1.3161662817001343, "rewards/rejected": -20.123231887817383, "step": 15125 }, { "epoch": 0.509959890795106, "grad_norm": 37.99794006347656, "learning_rate": 5.696590441196137e-07, "logits/chosen": -1.403928518295288, "logits/rejected": -1.2134307622909546, "logps/chosen": -2.2327723503112793, "logps/rejected": -2.414923906326294, "loss": 3.2015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.32772445678711, "rewards/margins": 1.8215110301971436, "rewards/rejected": -24.14923667907715, "step": 15130 }, { "epoch": 0.5101284168660892, "grad_norm": 32.46255111694336, "learning_rate": 5.693677659945342e-07, "logits/chosen": -1.5124094486236572, "logits/rejected": -1.5291669368743896, "logps/chosen": -1.6440776586532593, "logps/rejected": -1.6526644229888916, "loss": 3.0941, "rewards/accuracies": 0.5, "rewards/chosen": -16.440776824951172, "rewards/margins": 0.08586740493774414, "rewards/rejected": -16.526643753051758, "step": 15135 }, { "epoch": 0.5102969429370724, "grad_norm": 26.882415771484375, "learning_rate": 5.690764638640037e-07, "logits/chosen": -1.208224892616272, "logits/rejected": -1.1025320291519165, "logps/chosen": -2.1468870639801025, "logps/rejected": -2.654031276702881, "loss": 1.5857, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.468868255615234, "rewards/margins": 5.071444034576416, "rewards/rejected": -26.540313720703125, "step": 15140 }, { "epoch": 0.5104654690080556, "grad_norm": 33.146541595458984, "learning_rate": 5.687851378288309e-07, "logits/chosen": -1.5718231201171875, "logits/rejected": -1.6999132633209229, "logps/chosen": -2.2277894020080566, "logps/rejected": -2.578054666519165, "loss": 3.0773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.277894973754883, "rewards/margins": 3.502652406692505, "rewards/rejected": -25.78054428100586, "step": 15145 }, { "epoch": 0.5106339950790387, "grad_norm": 25.321151733398438, "learning_rate": 5.684937879898316e-07, "logits/chosen": -0.8661189079284668, "logits/rejected": -0.9531176686286926, "logps/chosen": -2.0083203315734863, "logps/rejected": -2.0512888431549072, "loss": 3.563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.083200454711914, "rewards/margins": 0.42968788743019104, "rewards/rejected": -20.512887954711914, "step": 15150 }, { "epoch": 0.5108025211500219, "grad_norm": 71.89868927001953, "learning_rate": 5.68202414447831e-07, "logits/chosen": -1.2475855350494385, "logits/rejected": -1.1813002824783325, "logps/chosen": -2.005934238433838, "logps/rejected": -2.3727059364318848, "loss": 2.8826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.059343338012695, "rewards/margins": 3.6677181720733643, "rewards/rejected": -23.727060317993164, "step": 15155 }, { "epoch": 0.5109710472210051, "grad_norm": 41.4750862121582, "learning_rate": 5.679110173036619e-07, "logits/chosen": -1.2968660593032837, "logits/rejected": -1.4929004907608032, "logps/chosen": -2.0121240615844727, "logps/rejected": -2.3801047801971436, "loss": 2.112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.121240615844727, "rewards/margins": 3.6798081398010254, "rewards/rejected": -23.801050186157227, "step": 15160 }, { "epoch": 0.5111395732919882, "grad_norm": 24.9608211517334, "learning_rate": 5.67619596658165e-07, "logits/chosen": -1.766427993774414, "logits/rejected": -2.0109450817108154, "logps/chosen": -2.5077290534973145, "logps/rejected": -2.861133337020874, "loss": 3.4502, "rewards/accuracies": 0.5, "rewards/chosen": -25.07729148864746, "rewards/margins": 3.5340423583984375, "rewards/rejected": -28.6113338470459, "step": 15165 }, { "epoch": 0.5113080993629715, "grad_norm": 99.52765655517578, "learning_rate": 5.673281526121901e-07, "logits/chosen": -1.3471081256866455, "logits/rejected": -1.2552907466888428, "logps/chosen": -2.918884754180908, "logps/rejected": -2.788374662399292, "loss": 4.8588, "rewards/accuracies": 0.5, "rewards/chosen": -29.1888484954834, "rewards/margins": -1.3051000833511353, "rewards/rejected": -27.883747100830078, "step": 15170 }, { "epoch": 0.5114766254339547, "grad_norm": 19.62303352355957, "learning_rate": 5.670366852665941e-07, "logits/chosen": -1.0323598384857178, "logits/rejected": -1.5572612285614014, "logps/chosen": -2.121976852416992, "logps/rejected": -2.498755693435669, "loss": 2.5533, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.219768524169922, "rewards/margins": 3.7677853107452393, "rewards/rejected": -24.9875545501709, "step": 15175 }, { "epoch": 0.5116451515049378, "grad_norm": 14.84547233581543, "learning_rate": 5.667451947222424e-07, "logits/chosen": -1.4115025997161865, "logits/rejected": -1.6721994876861572, "logps/chosen": -2.5753791332244873, "logps/rejected": -2.889843463897705, "loss": 3.2453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.7537899017334, "rewards/margins": 3.144641876220703, "rewards/rejected": -28.8984317779541, "step": 15180 }, { "epoch": 0.511813677575921, "grad_norm": 98.96126556396484, "learning_rate": 5.664536810800086e-07, "logits/chosen": -0.8689087629318237, "logits/rejected": -1.1629440784454346, "logps/chosen": -3.3273167610168457, "logps/rejected": -3.6868317127227783, "loss": 3.3101, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.273170471191406, "rewards/margins": 3.5951449871063232, "rewards/rejected": -36.86831283569336, "step": 15185 }, { "epoch": 0.5119822036469042, "grad_norm": 13.844437599182129, "learning_rate": 5.661621444407738e-07, "logits/chosen": -1.2056928873062134, "logits/rejected": -1.2074638605117798, "logps/chosen": -2.4977755546569824, "logps/rejected": -2.657663106918335, "loss": 2.1132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.97775650024414, "rewards/margins": 1.598876953125, "rewards/rejected": -26.576629638671875, "step": 15190 }, { "epoch": 0.5121507297178873, "grad_norm": 21.894990921020508, "learning_rate": 5.658705849054276e-07, "logits/chosen": -0.9881241917610168, "logits/rejected": -1.1200683116912842, "logps/chosen": -1.9984909296035767, "logps/rejected": -2.072193145751953, "loss": 3.4589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.984909057617188, "rewards/margins": 0.7370238304138184, "rewards/rejected": -20.72193145751953, "step": 15195 }, { "epoch": 0.5123192557888705, "grad_norm": 128.0411834716797, "learning_rate": 5.655790025748672e-07, "logits/chosen": -1.3919637203216553, "logits/rejected": -1.5302560329437256, "logps/chosen": -2.3977060317993164, "logps/rejected": -2.2407610416412354, "loss": 4.9446, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.977062225341797, "rewards/margins": -1.5694499015808105, "rewards/rejected": -22.407611846923828, "step": 15200 }, { "epoch": 0.5123192557888705, "eval_logits/chosen": -1.7760441303253174, "eval_logits/rejected": -1.9057296514511108, "eval_logps/chosen": -2.0414392948150635, "eval_logps/rejected": -2.1566874980926514, "eval_loss": 2.990462064743042, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -20.414392471313477, "eval_rewards/margins": 1.1524845361709595, "eval_rewards/rejected": -21.566875457763672, "eval_runtime": 12.9128, "eval_samples_per_second": 7.744, "eval_steps_per_second": 1.936, "step": 15200 }, { "epoch": 0.5124877818598538, "grad_norm": 104.97480010986328, "learning_rate": 5.652873975499977e-07, "logits/chosen": -1.511674165725708, "logits/rejected": -1.7704378366470337, "logps/chosen": -2.450023889541626, "logps/rejected": -2.663987159729004, "loss": 2.8018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.5002384185791, "rewards/margins": 2.1396327018737793, "rewards/rejected": -26.63987159729004, "step": 15205 }, { "epoch": 0.5126563079308369, "grad_norm": 3.2481727600097656, "learning_rate": 5.649957699317319e-07, "logits/chosen": -1.3668832778930664, "logits/rejected": -1.6022402048110962, "logps/chosen": -2.3622775077819824, "logps/rejected": -3.1148085594177246, "loss": 1.7853, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.62277603149414, "rewards/margins": 7.525309085845947, "rewards/rejected": -31.148086547851562, "step": 15210 }, { "epoch": 0.5128248340018201, "grad_norm": 41.21067428588867, "learning_rate": 5.647041198209912e-07, "logits/chosen": -1.402848243713379, "logits/rejected": -1.4903753995895386, "logps/chosen": -2.3762662410736084, "logps/rejected": -2.614976167678833, "loss": 2.2179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.762662887573242, "rewards/margins": 2.387101173400879, "rewards/rejected": -26.149761199951172, "step": 15215 }, { "epoch": 0.5129933600728033, "grad_norm": 18.946138381958008, "learning_rate": 5.644124473187038e-07, "logits/chosen": -1.4866359233856201, "logits/rejected": -1.8214614391326904, "logps/chosen": -2.311133861541748, "logps/rejected": -2.6970841884613037, "loss": 2.1406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.111337661743164, "rewards/margins": 3.8595046997070312, "rewards/rejected": -26.970844268798828, "step": 15220 }, { "epoch": 0.5131618861437864, "grad_norm": 20.410324096679688, "learning_rate": 5.641207525258059e-07, "logits/chosen": -1.355177879333496, "logits/rejected": -1.4015527963638306, "logps/chosen": -1.7830779552459717, "logps/rejected": -1.906141996383667, "loss": 2.4046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.830781936645508, "rewards/margins": 1.2306405305862427, "rewards/rejected": -19.061420440673828, "step": 15225 }, { "epoch": 0.5133304122147696, "grad_norm": 21.630565643310547, "learning_rate": 5.63829035543242e-07, "logits/chosen": -1.3447643518447876, "logits/rejected": -1.690081000328064, "logps/chosen": -2.441915988922119, "logps/rejected": -2.865563154220581, "loss": 2.5523, "rewards/accuracies": 0.5, "rewards/chosen": -24.41915512084961, "rewards/margins": 4.236475944519043, "rewards/rejected": -28.6556339263916, "step": 15230 }, { "epoch": 0.5134989382857528, "grad_norm": 33.107730865478516, "learning_rate": 5.635372964719635e-07, "logits/chosen": -1.1950870752334595, "logits/rejected": -1.6577335596084595, "logps/chosen": -1.9552379846572876, "logps/rejected": -2.3201534748077393, "loss": 2.7634, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.552379608154297, "rewards/margins": 3.6491541862487793, "rewards/rejected": -23.201534271240234, "step": 15235 }, { "epoch": 0.5136674643567359, "grad_norm": 17.149858474731445, "learning_rate": 5.632455354129302e-07, "logits/chosen": -1.1954476833343506, "logits/rejected": -1.427310585975647, "logps/chosen": -2.1515591144561768, "logps/rejected": -2.411625385284424, "loss": 2.4254, "rewards/accuracies": 0.5, "rewards/chosen": -21.51559066772461, "rewards/margins": 2.6006622314453125, "rewards/rejected": -24.116252899169922, "step": 15240 }, { "epoch": 0.5138359904277192, "grad_norm": 62.84499740600586, "learning_rate": 5.629537524671086e-07, "logits/chosen": -1.7799403667449951, "logits/rejected": -1.6227846145629883, "logps/chosen": -1.9937528371810913, "logps/rejected": -1.9934908151626587, "loss": 3.6365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.937528610229492, "rewards/margins": -0.0026217461563646793, "rewards/rejected": -19.934907913208008, "step": 15245 }, { "epoch": 0.5140045164987024, "grad_norm": 27.73377799987793, "learning_rate": 5.626619477354738e-07, "logits/chosen": -1.7326381206512451, "logits/rejected": -1.859344482421875, "logps/chosen": -2.5812244415283203, "logps/rejected": -2.9161319732666016, "loss": 2.1002, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.812240600585938, "rewards/margins": 3.349076509475708, "rewards/rejected": -29.16132164001465, "step": 15250 }, { "epoch": 0.5141730425696855, "grad_norm": 23.284900665283203, "learning_rate": 5.623701213190075e-07, "logits/chosen": -1.6898269653320312, "logits/rejected": -1.8181917667388916, "logps/chosen": -2.469947576522827, "logps/rejected": -2.9490766525268555, "loss": 1.9946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.699474334716797, "rewards/margins": 4.7912917137146, "rewards/rejected": -29.490764617919922, "step": 15255 }, { "epoch": 0.5143415686406687, "grad_norm": 20.285890579223633, "learning_rate": 5.620782733186995e-07, "logits/chosen": -1.0866138935089111, "logits/rejected": -1.1041343212127686, "logps/chosen": -2.3508377075195312, "logps/rejected": -2.452939510345459, "loss": 2.8995, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.50837516784668, "rewards/margins": 1.021020531654358, "rewards/rejected": -24.529394149780273, "step": 15260 }, { "epoch": 0.5145100947116519, "grad_norm": 31.823381423950195, "learning_rate": 5.617864038355469e-07, "logits/chosen": -1.625353455543518, "logits/rejected": -1.5252519845962524, "logps/chosen": -2.1418838500976562, "logps/rejected": -2.5077455043792725, "loss": 2.7457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.418838500976562, "rewards/margins": 3.658615827560425, "rewards/rejected": -25.07745361328125, "step": 15265 }, { "epoch": 0.514678620782635, "grad_norm": 34.04703903198242, "learning_rate": 5.614945129705543e-07, "logits/chosen": -1.2527238130569458, "logits/rejected": -1.4769929647445679, "logps/chosen": -2.013667106628418, "logps/rejected": -2.987143039703369, "loss": 2.7115, "rewards/accuracies": 0.5, "rewards/chosen": -20.136669158935547, "rewards/margins": 9.734761238098145, "rewards/rejected": -29.871429443359375, "step": 15270 }, { "epoch": 0.5148471468536182, "grad_norm": 28.078725814819336, "learning_rate": 5.612026008247336e-07, "logits/chosen": -2.0508275032043457, "logits/rejected": -1.9603971242904663, "logps/chosen": -2.383984327316284, "logps/rejected": -2.6936373710632324, "loss": 2.3759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.839847564697266, "rewards/margins": 3.096529006958008, "rewards/rejected": -26.93637466430664, "step": 15275 }, { "epoch": 0.5150156729246015, "grad_norm": 28.85657501220703, "learning_rate": 5.609106674991038e-07, "logits/chosen": -1.2822644710540771, "logits/rejected": -1.3018858432769775, "logps/chosen": -2.2119603157043457, "logps/rejected": -2.025120973587036, "loss": 5.0357, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.11960220336914, "rewards/margins": -1.8683927059173584, "rewards/rejected": -20.251211166381836, "step": 15280 }, { "epoch": 0.5151841989955847, "grad_norm": 75.3193588256836, "learning_rate": 5.606187130946921e-07, "logits/chosen": -1.9307209253311157, "logits/rejected": -1.889897108078003, "logps/chosen": -2.0299391746520996, "logps/rejected": -2.8823294639587402, "loss": 2.1457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.29939079284668, "rewards/margins": 8.523903846740723, "rewards/rejected": -28.823293685913086, "step": 15285 }, { "epoch": 0.5153527250665678, "grad_norm": 21.13232421875, "learning_rate": 5.603267377125319e-07, "logits/chosen": -1.5366874933242798, "logits/rejected": -1.7862281799316406, "logps/chosen": -2.5976200103759766, "logps/rejected": -3.0689826011657715, "loss": 1.3576, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.976200103759766, "rewards/margins": 4.713629245758057, "rewards/rejected": -30.689828872680664, "step": 15290 }, { "epoch": 0.515521251137551, "grad_norm": 23.98374366760254, "learning_rate": 5.600347414536645e-07, "logits/chosen": -1.440609335899353, "logits/rejected": -1.6278479099273682, "logps/chosen": -2.1875216960906982, "logps/rejected": -2.355437755584717, "loss": 2.2062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.87521743774414, "rewards/margins": 1.6791574954986572, "rewards/rejected": -23.55437469482422, "step": 15295 }, { "epoch": 0.5156897772085342, "grad_norm": 27.256919860839844, "learning_rate": 5.597427244191385e-07, "logits/chosen": -1.8316819667816162, "logits/rejected": -1.9437427520751953, "logps/chosen": -1.7571513652801514, "logps/rejected": -2.0879530906677246, "loss": 2.5299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.571514129638672, "rewards/margins": 3.308016300201416, "rewards/rejected": -20.879528045654297, "step": 15300 }, { "epoch": 0.5158583032795173, "grad_norm": 22.483091354370117, "learning_rate": 5.594506867100092e-07, "logits/chosen": -1.382283329963684, "logits/rejected": -1.304034948348999, "logps/chosen": -2.455326557159424, "logps/rejected": -2.544750213623047, "loss": 3.8275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.553264617919922, "rewards/margins": 0.894239068031311, "rewards/rejected": -25.4475040435791, "step": 15305 }, { "epoch": 0.5160268293505005, "grad_norm": 29.94123077392578, "learning_rate": 5.591586284273396e-07, "logits/chosen": -1.4842712879180908, "logits/rejected": -1.387634038925171, "logps/chosen": -1.6527531147003174, "logps/rejected": -1.783780813217163, "loss": 1.9562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.527530670166016, "rewards/margins": 1.3102777004241943, "rewards/rejected": -17.837810516357422, "step": 15310 }, { "epoch": 0.5161953554214836, "grad_norm": 32.287227630615234, "learning_rate": 5.588665496721994e-07, "logits/chosen": -1.6405065059661865, "logits/rejected": -2.1743381023406982, "logps/chosen": -2.823437213897705, "logps/rejected": -3.476367950439453, "loss": 1.34, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -28.234371185302734, "rewards/margins": 6.529305934906006, "rewards/rejected": -34.76367950439453, "step": 15315 }, { "epoch": 0.5163638814924669, "grad_norm": 29.671825408935547, "learning_rate": 5.585744505456656e-07, "logits/chosen": -1.4803146123886108, "logits/rejected": -1.6037018299102783, "logps/chosen": -1.7614666223526, "logps/rejected": -2.1801557540893555, "loss": 1.7482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.614665985107422, "rewards/margins": 4.186891555786133, "rewards/rejected": -21.801555633544922, "step": 15320 }, { "epoch": 0.5165324075634501, "grad_norm": 32.12974548339844, "learning_rate": 5.582823311488222e-07, "logits/chosen": -1.0687305927276611, "logits/rejected": -1.4367082118988037, "logps/chosen": -2.857874631881714, "logps/rejected": -2.8543312549591064, "loss": 3.6711, "rewards/accuracies": 0.5, "rewards/chosen": -28.578746795654297, "rewards/margins": -0.03543538972735405, "rewards/rejected": -28.543310165405273, "step": 15325 }, { "epoch": 0.5167009336344333, "grad_norm": 56.81086730957031, "learning_rate": 5.579901915827601e-07, "logits/chosen": -1.816873550415039, "logits/rejected": -1.7994168996810913, "logps/chosen": -2.356872320175171, "logps/rejected": -2.617316722869873, "loss": 2.9243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.5687255859375, "rewards/margins": 2.604442596435547, "rewards/rejected": -26.173168182373047, "step": 15330 }, { "epoch": 0.5168694597054164, "grad_norm": 27.684396743774414, "learning_rate": 5.576980319485777e-07, "logits/chosen": -1.2684608697891235, "logits/rejected": -1.4406415224075317, "logps/chosen": -2.101574420928955, "logps/rejected": -2.1958396434783936, "loss": 2.8061, "rewards/accuracies": 0.5, "rewards/chosen": -21.015743255615234, "rewards/margins": 0.9426544308662415, "rewards/rejected": -21.958398818969727, "step": 15335 }, { "epoch": 0.5170379857763996, "grad_norm": 53.1370735168457, "learning_rate": 5.574058523473794e-07, "logits/chosen": -1.0984376668930054, "logits/rejected": -1.0726025104522705, "logps/chosen": -2.186927318572998, "logps/rejected": -2.5625603199005127, "loss": 2.5126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.869272232055664, "rewards/margins": 3.7563300132751465, "rewards/rejected": -25.6256046295166, "step": 15340 }, { "epoch": 0.5172065118473828, "grad_norm": 22.909120559692383, "learning_rate": 5.571136528802775e-07, "logits/chosen": -1.4302793741226196, "logits/rejected": -1.3703594207763672, "logps/chosen": -1.9710960388183594, "logps/rejected": -1.9895975589752197, "loss": 3.0761, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.710962295532227, "rewards/margins": 0.18501415848731995, "rewards/rejected": -19.89597511291504, "step": 15345 }, { "epoch": 0.5173750379183659, "grad_norm": 28.171419143676758, "learning_rate": 5.568214336483904e-07, "logits/chosen": -1.289721131324768, "logits/rejected": -1.4089850187301636, "logps/chosen": -1.9875361919403076, "logps/rejected": -2.204806089401245, "loss": 2.5122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.875362396240234, "rewards/margins": 2.172700881958008, "rewards/rejected": -22.048063278198242, "step": 15350 }, { "epoch": 0.5175435639893492, "grad_norm": 21.062044143676758, "learning_rate": 5.56529194752844e-07, "logits/chosen": -0.7967745065689087, "logits/rejected": -1.043370246887207, "logps/chosen": -2.4613327980041504, "logps/rejected": -2.6952717304229736, "loss": 2.6294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.613330841064453, "rewards/margins": 2.3393893241882324, "rewards/rejected": -26.95271873474121, "step": 15355 }, { "epoch": 0.5177120900603324, "grad_norm": 88.55207824707031, "learning_rate": 5.562369362947703e-07, "logits/chosen": -1.3059293031692505, "logits/rejected": -1.3453352451324463, "logps/chosen": -2.1375696659088135, "logps/rejected": -2.015935182571411, "loss": 4.2888, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.37569808959961, "rewards/margins": -1.216347336769104, "rewards/rejected": -20.15934944152832, "step": 15360 }, { "epoch": 0.5178806161313155, "grad_norm": 86.71758270263672, "learning_rate": 5.559446583753086e-07, "logits/chosen": -1.2455203533172607, "logits/rejected": -1.2034311294555664, "logps/chosen": -2.1374192237854004, "logps/rejected": -2.324183464050293, "loss": 3.5776, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.374195098876953, "rewards/margins": 1.8676401376724243, "rewards/rejected": -23.241836547851562, "step": 15365 }, { "epoch": 0.5180491422022987, "grad_norm": 34.24867248535156, "learning_rate": 5.556523610956047e-07, "logits/chosen": -1.469974160194397, "logits/rejected": -1.6809148788452148, "logps/chosen": -2.598238468170166, "logps/rejected": -2.6210434436798096, "loss": 3.7393, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.982385635375977, "rewards/margins": 0.22805070877075195, "rewards/rejected": -26.210433959960938, "step": 15370 }, { "epoch": 0.5182176682732819, "grad_norm": 36.28508758544922, "learning_rate": 5.553600445568113e-07, "logits/chosen": -1.4834458827972412, "logits/rejected": -1.4371143579483032, "logps/chosen": -2.5968239307403564, "logps/rejected": -2.711268901824951, "loss": 4.7324, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.968236923217773, "rewards/margins": 1.1444545984268188, "rewards/rejected": -27.11269187927246, "step": 15375 }, { "epoch": 0.518386194344265, "grad_norm": 17.586444854736328, "learning_rate": 5.550677088600876e-07, "logits/chosen": -1.2126684188842773, "logits/rejected": -1.5736459493637085, "logps/chosen": -2.134211301803589, "logps/rejected": -2.6339869499206543, "loss": 2.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.342113494873047, "rewards/margins": 4.997755527496338, "rewards/rejected": -26.339868545532227, "step": 15380 }, { "epoch": 0.5185547204152482, "grad_norm": 22.281524658203125, "learning_rate": 5.547753541065993e-07, "logits/chosen": -1.3549929857254028, "logits/rejected": -1.396969199180603, "logps/chosen": -1.6693958044052124, "logps/rejected": -1.815840482711792, "loss": 2.511, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.693960189819336, "rewards/margins": 1.4644463062286377, "rewards/rejected": -18.158405303955078, "step": 15385 }, { "epoch": 0.5187232464862315, "grad_norm": 34.48127746582031, "learning_rate": 5.544829803975193e-07, "logits/chosen": -1.3881969451904297, "logits/rejected": -1.4901460409164429, "logps/chosen": -2.1605522632598877, "logps/rejected": -2.43407940864563, "loss": 2.5932, "rewards/accuracies": 0.5, "rewards/chosen": -21.60552406311035, "rewards/margins": 2.735269784927368, "rewards/rejected": -24.34079360961914, "step": 15390 }, { "epoch": 0.5188917725572146, "grad_norm": 63.25047302246094, "learning_rate": 5.541905878340261e-07, "logits/chosen": -0.8809518814086914, "logits/rejected": -1.2939692735671997, "logps/chosen": -2.41929292678833, "logps/rejected": -2.7904906272888184, "loss": 1.4925, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.192928314208984, "rewards/margins": 3.7119758129119873, "rewards/rejected": -27.904903411865234, "step": 15395 }, { "epoch": 0.5190602986281978, "grad_norm": 86.68781280517578, "learning_rate": 5.538981765173055e-07, "logits/chosen": -1.6341667175292969, "logits/rejected": -1.7704432010650635, "logps/chosen": -2.3183207511901855, "logps/rejected": -2.383631944656372, "loss": 3.7675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.183204650878906, "rewards/margins": 0.6531141400337219, "rewards/rejected": -23.836318969726562, "step": 15400 }, { "epoch": 0.519228824699181, "grad_norm": 29.288299560546875, "learning_rate": 5.536057465485495e-07, "logits/chosen": -1.5594924688339233, "logits/rejected": -1.3881484270095825, "logps/chosen": -1.7050707340240479, "logps/rejected": -1.7124258279800415, "loss": 3.1131, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.050708770751953, "rewards/margins": 0.0735509842634201, "rewards/rejected": -17.124258041381836, "step": 15405 }, { "epoch": 0.5193973507701641, "grad_norm": 59.807960510253906, "learning_rate": 5.533132980289567e-07, "logits/chosen": -1.6147937774658203, "logits/rejected": -1.9224376678466797, "logps/chosen": -2.599799633026123, "logps/rejected": -2.6466755867004395, "loss": 4.5049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.997997283935547, "rewards/margins": 0.46876010298728943, "rewards/rejected": -26.46675682067871, "step": 15410 }, { "epoch": 0.5195658768411473, "grad_norm": 25.46161460876465, "learning_rate": 5.530208310597318e-07, "logits/chosen": -1.5502841472625732, "logits/rejected": -1.5365631580352783, "logps/chosen": -3.0772483348846436, "logps/rejected": -3.437613010406494, "loss": 3.7961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.77248191833496, "rewards/margins": 3.6036434173583984, "rewards/rejected": -34.37612533569336, "step": 15415 }, { "epoch": 0.5197344029121305, "grad_norm": 105.07298278808594, "learning_rate": 5.527283457420862e-07, "logits/chosen": -1.4746211767196655, "logits/rejected": -1.7897018194198608, "logps/chosen": -2.3653385639190674, "logps/rejected": -2.210273027420044, "loss": 5.108, "rewards/accuracies": 0.5, "rewards/chosen": -23.653385162353516, "rewards/margins": -1.5506563186645508, "rewards/rejected": -22.10272789001465, "step": 15420 }, { "epoch": 0.5199029289831136, "grad_norm": 31.118650436401367, "learning_rate": 5.524358421772377e-07, "logits/chosen": -0.7948622703552246, "logits/rejected": -0.8744922876358032, "logps/chosen": -2.0413529872894287, "logps/rejected": -2.088682174682617, "loss": 2.8332, "rewards/accuracies": 0.5, "rewards/chosen": -20.413528442382812, "rewards/margins": 0.47329291701316833, "rewards/rejected": -20.886821746826172, "step": 15425 }, { "epoch": 0.5200714550540969, "grad_norm": 17.658946990966797, "learning_rate": 5.521433204664101e-07, "logits/chosen": -1.8911575078964233, "logits/rejected": -2.300468683242798, "logps/chosen": -1.8099273443222046, "logps/rejected": -2.1652398109436035, "loss": 2.6423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.099271774291992, "rewards/margins": 3.553126573562622, "rewards/rejected": -21.65239906311035, "step": 15430 }, { "epoch": 0.5202399811250801, "grad_norm": 32.41124725341797, "learning_rate": 5.518507807108335e-07, "logits/chosen": -1.2449058294296265, "logits/rejected": -1.6564595699310303, "logps/chosen": -2.1771650314331055, "logps/rejected": -3.255115509033203, "loss": 1.5736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.771650314331055, "rewards/margins": 10.779507637023926, "rewards/rejected": -32.55115509033203, "step": 15435 }, { "epoch": 0.5204085071960632, "grad_norm": 9.971014022827148, "learning_rate": 5.515582230117448e-07, "logits/chosen": -1.3065838813781738, "logits/rejected": -1.6391983032226562, "logps/chosen": -1.7685725688934326, "logps/rejected": -2.0779318809509277, "loss": 1.3878, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.685726165771484, "rewards/margins": 3.09359073638916, "rewards/rejected": -20.779315948486328, "step": 15440 }, { "epoch": 0.5205770332670464, "grad_norm": 16.967708587646484, "learning_rate": 5.512656474703861e-07, "logits/chosen": -1.1333194971084595, "logits/rejected": -1.6499792337417603, "logps/chosen": -2.0808238983154297, "logps/rejected": -2.2162888050079346, "loss": 2.6948, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.808238983154297, "rewards/margins": 1.3546478748321533, "rewards/rejected": -22.16288948059082, "step": 15445 }, { "epoch": 0.5207455593380296, "grad_norm": 22.480432510375977, "learning_rate": 5.509730541880068e-07, "logits/chosen": -1.1527360677719116, "logits/rejected": -1.5316417217254639, "logps/chosen": -2.3182454109191895, "logps/rejected": -2.688753366470337, "loss": 2.913, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.18245506286621, "rewards/margins": 3.7050795555114746, "rewards/rejected": -26.88753318786621, "step": 15450 }, { "epoch": 0.5209140854090127, "grad_norm": 41.222991943359375, "learning_rate": 5.506804432658615e-07, "logits/chosen": -1.7212040424346924, "logits/rejected": -1.7893693447113037, "logps/chosen": -1.9451452493667603, "logps/rejected": -2.0703797340393066, "loss": 3.169, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.451452255249023, "rewards/margins": 1.252342939376831, "rewards/rejected": -20.70379638671875, "step": 15455 }, { "epoch": 0.5210826114799959, "grad_norm": 20.651769638061523, "learning_rate": 5.503878148052118e-07, "logits/chosen": -1.4760338068008423, "logits/rejected": -1.548568844795227, "logps/chosen": -2.059722423553467, "logps/rejected": -2.5742545127868652, "loss": 3.1436, "rewards/accuracies": 0.5, "rewards/chosen": -20.597225189208984, "rewards/margins": 5.145321846008301, "rewards/rejected": -25.7425479888916, "step": 15460 }, { "epoch": 0.5212511375509792, "grad_norm": 40.60196304321289, "learning_rate": 5.500951689073244e-07, "logits/chosen": -0.966766357421875, "logits/rejected": -1.1678035259246826, "logps/chosen": -2.260707139968872, "logps/rejected": -2.3044838905334473, "loss": 3.4371, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.607070922851562, "rewards/margins": 0.4377668499946594, "rewards/rejected": -23.044836044311523, "step": 15465 }, { "epoch": 0.5214196636219623, "grad_norm": 36.945743560791016, "learning_rate": 5.498025056734727e-07, "logits/chosen": -0.9160255193710327, "logits/rejected": -1.3329023122787476, "logps/chosen": -2.5447611808776855, "logps/rejected": -2.7440483570098877, "loss": 5.0179, "rewards/accuracies": 0.5, "rewards/chosen": -25.447612762451172, "rewards/margins": 1.9928712844848633, "rewards/rejected": -27.44048500061035, "step": 15470 }, { "epoch": 0.5215881896929455, "grad_norm": 20.789392471313477, "learning_rate": 5.49509825204936e-07, "logits/chosen": -1.9368362426757812, "logits/rejected": -1.951311469078064, "logps/chosen": -2.766106128692627, "logps/rejected": -2.6983156204223633, "loss": 4.2261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.661060333251953, "rewards/margins": -0.6779062151908875, "rewards/rejected": -26.983154296875, "step": 15475 }, { "epoch": 0.5217567157639287, "grad_norm": 33.815025329589844, "learning_rate": 5.492171276029994e-07, "logits/chosen": -1.4312325716018677, "logits/rejected": -1.7131084203720093, "logps/chosen": -2.202226400375366, "logps/rejected": -2.3178181648254395, "loss": 3.5403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.022266387939453, "rewards/margins": 1.1559193134307861, "rewards/rejected": -23.178184509277344, "step": 15480 }, { "epoch": 0.5219252418349118, "grad_norm": 1.9928309917449951, "learning_rate": 5.48924412968954e-07, "logits/chosen": -0.6165008544921875, "logits/rejected": -0.9194254875183105, "logps/chosen": -2.514402389526367, "logps/rejected": -2.7848479747772217, "loss": 1.7571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.144020080566406, "rewards/margins": 2.7044589519500732, "rewards/rejected": -27.848480224609375, "step": 15485 }, { "epoch": 0.522093767905895, "grad_norm": 28.686857223510742, "learning_rate": 5.486316814040968e-07, "logits/chosen": -1.7597252130508423, "logits/rejected": -1.6932029724121094, "logps/chosen": -2.4415581226348877, "logps/rejected": -2.6324195861816406, "loss": 3.7649, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.41558074951172, "rewards/margins": 1.9086145162582397, "rewards/rejected": -26.324193954467773, "step": 15490 }, { "epoch": 0.5222622939768782, "grad_norm": 27.051673889160156, "learning_rate": 5.483389330097308e-07, "logits/chosen": -1.7283437252044678, "logits/rejected": -1.5780450105667114, "logps/chosen": -2.3943264484405518, "logps/rejected": -2.3182897567749023, "loss": 4.1081, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.94326400756836, "rewards/margins": -0.7603681683540344, "rewards/rejected": -23.18289566040039, "step": 15495 }, { "epoch": 0.5224308200478615, "grad_norm": 37.90834426879883, "learning_rate": 5.480461678871645e-07, "logits/chosen": -1.6101102828979492, "logits/rejected": -1.6730715036392212, "logps/chosen": -1.935234785079956, "logps/rejected": -1.872582197189331, "loss": 3.8334, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.35234832763672, "rewards/margins": -0.6265257596969604, "rewards/rejected": -18.7258243560791, "step": 15500 }, { "epoch": 0.5225993461188446, "grad_norm": 21.659454345703125, "learning_rate": 5.477533861377123e-07, "logits/chosen": -1.2597486972808838, "logits/rejected": -1.7310842275619507, "logps/chosen": -1.9114364385604858, "logps/rejected": -2.3817667961120605, "loss": 2.5768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.11436653137207, "rewards/margins": 4.70330286026001, "rewards/rejected": -23.81766700744629, "step": 15505 }, { "epoch": 0.5227678721898278, "grad_norm": 16.150802612304688, "learning_rate": 5.474605878626948e-07, "logits/chosen": -1.0654808282852173, "logits/rejected": -1.4487148523330688, "logps/chosen": -2.6950135231018066, "logps/rejected": -2.666337490081787, "loss": 4.7582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.95013427734375, "rewards/margins": -0.286760151386261, "rewards/rejected": -26.663372039794922, "step": 15510 }, { "epoch": 0.522936398260811, "grad_norm": 78.9401626586914, "learning_rate": 5.471677731634375e-07, "logits/chosen": -1.832585096359253, "logits/rejected": -1.858319640159607, "logps/chosen": -2.2961020469665527, "logps/rejected": -2.3054680824279785, "loss": 4.1239, "rewards/accuracies": 0.5, "rewards/chosen": -22.961023330688477, "rewards/margins": 0.09365816414356232, "rewards/rejected": -23.0546817779541, "step": 15515 }, { "epoch": 0.5231049243317941, "grad_norm": 31.371814727783203, "learning_rate": 5.468749421412723e-07, "logits/chosen": -1.6428378820419312, "logits/rejected": -1.6746675968170166, "logps/chosen": -2.245944023132324, "logps/rejected": -2.602429151535034, "loss": 1.88, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.45943832397461, "rewards/margins": 3.5648505687713623, "rewards/rejected": -26.024288177490234, "step": 15520 }, { "epoch": 0.5232734504027773, "grad_norm": 16.432018280029297, "learning_rate": 5.465820948975366e-07, "logits/chosen": -1.270250916481018, "logits/rejected": -1.2934939861297607, "logps/chosen": -1.722895622253418, "logps/rejected": -1.7790238857269287, "loss": 3.5555, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.22895622253418, "rewards/margins": 0.5612838864326477, "rewards/rejected": -17.790239334106445, "step": 15525 }, { "epoch": 0.5234419764737605, "grad_norm": 20.018625259399414, "learning_rate": 5.462892315335729e-07, "logits/chosen": -1.6446031332015991, "logits/rejected": -1.5727581977844238, "logps/chosen": -1.9519649744033813, "logps/rejected": -2.2087297439575195, "loss": 2.8335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.519649505615234, "rewards/margins": 2.5676486492156982, "rewards/rejected": -22.087299346923828, "step": 15530 }, { "epoch": 0.5236105025447436, "grad_norm": 44.28361892700195, "learning_rate": 5.4599635215073e-07, "logits/chosen": -1.4462125301361084, "logits/rejected": -1.6265175342559814, "logps/chosen": -2.038994789123535, "logps/rejected": -2.193636178970337, "loss": 3.1313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.38994789123535, "rewards/margins": 1.5464133024215698, "rewards/rejected": -21.93636131286621, "step": 15535 }, { "epoch": 0.5237790286157269, "grad_norm": 110.08293151855469, "learning_rate": 5.457034568503616e-07, "logits/chosen": -1.8243696689605713, "logits/rejected": -1.8024543523788452, "logps/chosen": -2.3185818195343018, "logps/rejected": -2.424232006072998, "loss": 2.6836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.18581771850586, "rewards/margins": 1.0565001964569092, "rewards/rejected": -24.242319107055664, "step": 15540 }, { "epoch": 0.5239475546867101, "grad_norm": 80.31021881103516, "learning_rate": 5.454105457338278e-07, "logits/chosen": -1.5435346364974976, "logits/rejected": -1.5518152713775635, "logps/chosen": -2.4903368949890137, "logps/rejected": -2.444121837615967, "loss": 3.6497, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.903369903564453, "rewards/margins": -0.46214962005615234, "rewards/rejected": -24.441219329833984, "step": 15545 }, { "epoch": 0.5241160807576932, "grad_norm": 40.0004768371582, "learning_rate": 5.45117618902493e-07, "logits/chosen": -1.5282537937164307, "logits/rejected": -1.8590351343154907, "logps/chosen": -2.001547336578369, "logps/rejected": -2.2384941577911377, "loss": 2.0766, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.015472412109375, "rewards/margins": 2.369469404220581, "rewards/rejected": -22.384939193725586, "step": 15550 }, { "epoch": 0.5242846068286764, "grad_norm": 20.842981338500977, "learning_rate": 5.448246764577278e-07, "logits/chosen": -0.5346226096153259, "logits/rejected": -0.5988118648529053, "logps/chosen": -2.4696402549743652, "logps/rejected": -2.6132705211639404, "loss": 2.9447, "rewards/accuracies": 0.5, "rewards/chosen": -24.69640350341797, "rewards/margins": 1.4363019466400146, "rewards/rejected": -26.132705688476562, "step": 15555 }, { "epoch": 0.5244531328996596, "grad_norm": 27.3537654876709, "learning_rate": 5.445317185009082e-07, "logits/chosen": -1.1227928400039673, "logits/rejected": -1.205906867980957, "logps/chosen": -2.1927571296691895, "logps/rejected": -2.4103589057922363, "loss": 1.8171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.92757225036621, "rewards/margins": 2.1760153770446777, "rewards/rejected": -24.103588104248047, "step": 15560 }, { "epoch": 0.5246216589706427, "grad_norm": 35.99321746826172, "learning_rate": 5.442387451334152e-07, "logits/chosen": -1.2954622507095337, "logits/rejected": -1.3853565454483032, "logps/chosen": -1.78121018409729, "logps/rejected": -1.9041932821273804, "loss": 2.5959, "rewards/accuracies": 0.5, "rewards/chosen": -17.812103271484375, "rewards/margins": 1.229830026626587, "rewards/rejected": -19.04193115234375, "step": 15565 }, { "epoch": 0.5247901850416259, "grad_norm": 44.51189041137695, "learning_rate": 5.439457564566356e-07, "logits/chosen": -1.7479088306427002, "logits/rejected": -1.7973203659057617, "logps/chosen": -1.5744832754135132, "logps/rejected": -1.9441041946411133, "loss": 2.0475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.744832038879395, "rewards/margins": 3.6962077617645264, "rewards/rejected": -19.441041946411133, "step": 15570 }, { "epoch": 0.5249587111126092, "grad_norm": 22.456087112426758, "learning_rate": 5.43652752571961e-07, "logits/chosen": -1.5364540815353394, "logits/rejected": -1.588979721069336, "logps/chosen": -2.6818394660949707, "logps/rejected": -2.505458354949951, "loss": 5.4875, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -26.818395614624023, "rewards/margins": -1.7638130187988281, "rewards/rejected": -25.054582595825195, "step": 15575 }, { "epoch": 0.5251272371835923, "grad_norm": 28.281593322753906, "learning_rate": 5.433597335807887e-07, "logits/chosen": -1.2712260484695435, "logits/rejected": -1.3536813259124756, "logps/chosen": -2.145684242248535, "logps/rejected": -2.089423418045044, "loss": 4.0857, "rewards/accuracies": 0.5, "rewards/chosen": -21.456844329833984, "rewards/margins": -0.5626105070114136, "rewards/rejected": -20.89423370361328, "step": 15580 }, { "epoch": 0.5252957632545755, "grad_norm": 19.869348526000977, "learning_rate": 5.430666995845207e-07, "logits/chosen": -1.6280781030654907, "logits/rejected": -1.7816858291625977, "logps/chosen": -2.778658866882324, "logps/rejected": -3.0360519886016846, "loss": 2.51, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.78658676147461, "rewards/margins": 2.5739312171936035, "rewards/rejected": -30.360515594482422, "step": 15585 }, { "epoch": 0.5254642893255587, "grad_norm": 61.13578796386719, "learning_rate": 5.42773650684565e-07, "logits/chosen": -1.3919565677642822, "logits/rejected": -1.010761022567749, "logps/chosen": -2.5698320865631104, "logps/rejected": -2.5252742767333984, "loss": 3.7041, "rewards/accuracies": 0.5, "rewards/chosen": -25.698322296142578, "rewards/margins": -0.4455797076225281, "rewards/rejected": -25.25274085998535, "step": 15590 }, { "epoch": 0.5256328153965418, "grad_norm": 256.7560729980469, "learning_rate": 5.424805869823338e-07, "logits/chosen": -1.6359916925430298, "logits/rejected": -1.6572707891464233, "logps/chosen": -3.07808256149292, "logps/rejected": -2.8269076347351074, "loss": 5.5903, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -30.78082275390625, "rewards/margins": -2.5117459297180176, "rewards/rejected": -28.26907730102539, "step": 15595 }, { "epoch": 0.525801341467525, "grad_norm": 39.58763885498047, "learning_rate": 5.421875085792451e-07, "logits/chosen": -1.4183658361434937, "logits/rejected": -1.4681622982025146, "logps/chosen": -2.238878011703491, "logps/rejected": -2.2670669555664062, "loss": 3.2834, "rewards/accuracies": 0.5, "rewards/chosen": -22.388778686523438, "rewards/margins": 0.28189095854759216, "rewards/rejected": -22.670669555664062, "step": 15600 }, { "epoch": 0.525801341467525, "eval_logits/chosen": -1.7632553577423096, "eval_logits/rejected": -1.8928484916687012, "eval_logps/chosen": -2.0442748069763184, "eval_logps/rejected": -2.1599292755126953, "eval_loss": 2.9858274459838867, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -20.442750930786133, "eval_rewards/margins": 1.156540870666504, "eval_rewards/rejected": -21.599288940429688, "eval_runtime": 12.8962, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.939, "step": 15600 }, { "epoch": 0.5259698675385082, "grad_norm": 22.91073226928711, "learning_rate": 5.41894415576722e-07, "logits/chosen": -1.1644032001495361, "logits/rejected": -1.226810336112976, "logps/chosen": -2.3184869289398193, "logps/rejected": -2.561067819595337, "loss": 2.5009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.18486976623535, "rewards/margins": 2.4258084297180176, "rewards/rejected": -25.610681533813477, "step": 15605 }, { "epoch": 0.5261383936094914, "grad_norm": 30.144290924072266, "learning_rate": 5.416013080761921e-07, "logits/chosen": -1.4322903156280518, "logits/rejected": -1.5939862728118896, "logps/chosen": -2.02023983001709, "logps/rejected": -2.257467269897461, "loss": 1.6984, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.202396392822266, "rewards/margins": 2.372274398803711, "rewards/rejected": -22.57467269897461, "step": 15610 }, { "epoch": 0.5263069196804746, "grad_norm": 12.66912841796875, "learning_rate": 5.413081861790884e-07, "logits/chosen": -1.5032761096954346, "logits/rejected": -1.7086365222930908, "logps/chosen": -1.7659047842025757, "logps/rejected": -2.06687593460083, "loss": 2.1078, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.659048080444336, "rewards/margins": 3.009711742401123, "rewards/rejected": -20.668760299682617, "step": 15615 }, { "epoch": 0.5264754457514578, "grad_norm": 34.912384033203125, "learning_rate": 5.410150499868491e-07, "logits/chosen": -1.7015644311904907, "logits/rejected": -1.7704875469207764, "logps/chosen": -1.6861746311187744, "logps/rejected": -2.1708366870880127, "loss": 1.7276, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.861745834350586, "rewards/margins": 4.846621036529541, "rewards/rejected": -21.70836639404297, "step": 15620 }, { "epoch": 0.5266439718224409, "grad_norm": 23.303464889526367, "learning_rate": 5.407218996009168e-07, "logits/chosen": -1.414489507675171, "logits/rejected": -1.2904560565948486, "logps/chosen": -1.8772773742675781, "logps/rejected": -1.9689223766326904, "loss": 2.6413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.77277183532715, "rewards/margins": 0.9164519309997559, "rewards/rejected": -19.689224243164062, "step": 15625 }, { "epoch": 0.5268124978934241, "grad_norm": 7.382728099822998, "learning_rate": 5.404287351227397e-07, "logits/chosen": -1.51156747341156, "logits/rejected": -1.5182366371154785, "logps/chosen": -2.156038999557495, "logps/rejected": -2.454796552658081, "loss": 2.1912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.56039047241211, "rewards/margins": 2.987576723098755, "rewards/rejected": -24.5479679107666, "step": 15630 }, { "epoch": 0.5269810239644073, "grad_norm": 24.27437973022461, "learning_rate": 5.401355566537698e-07, "logits/chosen": -1.5087625980377197, "logits/rejected": -1.5422756671905518, "logps/chosen": -2.191742420196533, "logps/rejected": -2.490962028503418, "loss": 2.7534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.91742515563965, "rewards/margins": 2.9921936988830566, "rewards/rejected": -24.909618377685547, "step": 15635 }, { "epoch": 0.5271495500353904, "grad_norm": 34.50971984863281, "learning_rate": 5.398423642954654e-07, "logits/chosen": -1.3371788263320923, "logits/rejected": -1.3717623949050903, "logps/chosen": -2.7144930362701416, "logps/rejected": -2.642326831817627, "loss": 3.8951, "rewards/accuracies": 0.5, "rewards/chosen": -27.144927978515625, "rewards/margins": -0.7216583490371704, "rewards/rejected": -26.423269271850586, "step": 15640 }, { "epoch": 0.5273180761063736, "grad_norm": 57.722877502441406, "learning_rate": 5.395491581492883e-07, "logits/chosen": -1.2004446983337402, "logits/rejected": -1.3373278379440308, "logps/chosen": -3.4159648418426514, "logps/rejected": -3.5445168018341064, "loss": 6.5148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.15964889526367, "rewards/margins": 1.2855199575424194, "rewards/rejected": -35.445167541503906, "step": 15645 }, { "epoch": 0.5274866021773569, "grad_norm": 33.64910125732422, "learning_rate": 5.392559383167057e-07, "logits/chosen": -1.420398473739624, "logits/rejected": -1.4803659915924072, "logps/chosen": -2.0454297065734863, "logps/rejected": -1.9843571186065674, "loss": 3.7885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.454294204711914, "rewards/margins": -0.6107238531112671, "rewards/rejected": -19.843570709228516, "step": 15650 }, { "epoch": 0.52765512824834, "grad_norm": 143.58140563964844, "learning_rate": 5.389627048991894e-07, "logits/chosen": -1.1687041521072388, "logits/rejected": -1.2767616510391235, "logps/chosen": -2.470137357711792, "logps/rejected": -2.4478909969329834, "loss": 3.6825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.701374053955078, "rewards/margins": -0.2224619835615158, "rewards/rejected": -24.478910446166992, "step": 15655 }, { "epoch": 0.5278236543193232, "grad_norm": 36.96298599243164, "learning_rate": 5.386694579982161e-07, "logits/chosen": -1.4571011066436768, "logits/rejected": -1.7262732982635498, "logps/chosen": -2.092268466949463, "logps/rejected": -2.3306639194488525, "loss": 1.82, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.92268180847168, "rewards/margins": 2.38395619392395, "rewards/rejected": -23.306636810302734, "step": 15660 }, { "epoch": 0.5279921803903064, "grad_norm": 21.692073822021484, "learning_rate": 5.38376197715267e-07, "logits/chosen": -0.784595251083374, "logits/rejected": -0.8547506332397461, "logps/chosen": -2.1192760467529297, "logps/rejected": -2.105349540710449, "loss": 3.6934, "rewards/accuracies": 0.5, "rewards/chosen": -21.192760467529297, "rewards/margins": -0.13926735520362854, "rewards/rejected": -21.05349349975586, "step": 15665 }, { "epoch": 0.5281607064612895, "grad_norm": 13.716679573059082, "learning_rate": 5.380829241518277e-07, "logits/chosen": -1.217454195022583, "logits/rejected": -1.7114721536636353, "logps/chosen": -2.3606152534484863, "logps/rejected": -2.695775270462036, "loss": 3.1203, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.606151580810547, "rewards/margins": 3.351599931716919, "rewards/rejected": -26.957752227783203, "step": 15670 }, { "epoch": 0.5283292325322727, "grad_norm": 28.68997573852539, "learning_rate": 5.377896374093889e-07, "logits/chosen": -1.3477064371109009, "logits/rejected": -1.5586186647415161, "logps/chosen": -1.914340615272522, "logps/rejected": -1.8554699420928955, "loss": 3.8192, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.14340591430664, "rewards/margins": -0.588705837726593, "rewards/rejected": -18.554698944091797, "step": 15675 }, { "epoch": 0.5284977586032559, "grad_norm": 33.395748138427734, "learning_rate": 5.374963375894452e-07, "logits/chosen": -1.627058982849121, "logits/rejected": -1.6720809936523438, "logps/chosen": -1.5370395183563232, "logps/rejected": -1.6897306442260742, "loss": 2.9374, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.370396614074707, "rewards/margins": 1.5269111394882202, "rewards/rejected": -16.897306442260742, "step": 15680 }, { "epoch": 0.5286662846742392, "grad_norm": 39.92876434326172, "learning_rate": 5.372030247934965e-07, "logits/chosen": -1.1321537494659424, "logits/rejected": -1.3527767658233643, "logps/chosen": -1.9309250116348267, "logps/rejected": -2.1466927528381348, "loss": 3.1527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.309249877929688, "rewards/margins": 2.157675266265869, "rewards/rejected": -21.4669246673584, "step": 15685 }, { "epoch": 0.5288348107452223, "grad_norm": 39.63317108154297, "learning_rate": 5.369096991230467e-07, "logits/chosen": -1.7300984859466553, "logits/rejected": -1.8779960870742798, "logps/chosen": -1.7245010137557983, "logps/rejected": -2.174797773361206, "loss": 2.0401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.245010375976562, "rewards/margins": 4.5029683113098145, "rewards/rejected": -21.74797821044922, "step": 15690 }, { "epoch": 0.5290033368162055, "grad_norm": 30.05023193359375, "learning_rate": 5.366163606796042e-07, "logits/chosen": -1.5409890413284302, "logits/rejected": -1.5917844772338867, "logps/chosen": -1.865624189376831, "logps/rejected": -2.017108201980591, "loss": 2.5183, "rewards/accuracies": 0.5, "rewards/chosen": -18.65624237060547, "rewards/margins": 1.5148383378982544, "rewards/rejected": -20.171083450317383, "step": 15695 }, { "epoch": 0.5291718628871886, "grad_norm": 19.90330696105957, "learning_rate": 5.363230095646818e-07, "logits/chosen": -1.2542389631271362, "logits/rejected": -1.4719120264053345, "logps/chosen": -1.8331972360610962, "logps/rejected": -1.7900031805038452, "loss": 3.6632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.331972122192383, "rewards/margins": -0.4319402575492859, "rewards/rejected": -17.90003204345703, "step": 15700 }, { "epoch": 0.5293403889581718, "grad_norm": 18.70873260498047, "learning_rate": 5.360296458797969e-07, "logits/chosen": -1.2316617965698242, "logits/rejected": -1.6386626958847046, "logps/chosen": -2.234480381011963, "logps/rejected": -2.251962184906006, "loss": 3.1477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.344802856445312, "rewards/margins": 0.17481890320777893, "rewards/rejected": -22.519622802734375, "step": 15705 }, { "epoch": 0.529508915029155, "grad_norm": 3.4370267391204834, "learning_rate": 5.357362697264711e-07, "logits/chosen": -1.5080573558807373, "logits/rejected": -1.6581541299819946, "logps/chosen": -2.3186049461364746, "logps/rejected": -2.8751769065856934, "loss": 1.6822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.186050415039062, "rewards/margins": 5.565718650817871, "rewards/rejected": -28.75177001953125, "step": 15710 }, { "epoch": 0.5296774411001381, "grad_norm": 30.35115623474121, "learning_rate": 5.354428812062303e-07, "logits/chosen": -1.646667718887329, "logits/rejected": -1.6787481307983398, "logps/chosen": -2.1127872467041016, "logps/rejected": -2.3904125690460205, "loss": 3.2484, "rewards/accuracies": 0.5, "rewards/chosen": -21.12787437438965, "rewards/margins": 2.7762503623962402, "rewards/rejected": -23.904123306274414, "step": 15715 }, { "epoch": 0.5298459671711214, "grad_norm": 35.458404541015625, "learning_rate": 5.351494804206047e-07, "logits/chosen": -1.3847260475158691, "logits/rejected": -1.9206058979034424, "logps/chosen": -2.1701271533966064, "logps/rejected": -2.5087239742279053, "loss": 2.0344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.701271057128906, "rewards/margins": 3.3859705924987793, "rewards/rejected": -25.08724021911621, "step": 15720 }, { "epoch": 0.5300144932421046, "grad_norm": 28.80376434326172, "learning_rate": 5.348560674711289e-07, "logits/chosen": -1.468523621559143, "logits/rejected": -1.8824526071548462, "logps/chosen": -2.12115740776062, "logps/rejected": -2.4652764797210693, "loss": 4.2581, "rewards/accuracies": 0.5, "rewards/chosen": -21.21157455444336, "rewards/margins": 3.4411914348602295, "rewards/rejected": -24.65276527404785, "step": 15725 }, { "epoch": 0.5301830193130878, "grad_norm": 16.96817970275879, "learning_rate": 5.345626424593412e-07, "logits/chosen": -1.7553907632827759, "logits/rejected": -1.8569806814193726, "logps/chosen": -2.455338478088379, "logps/rejected": -2.970616102218628, "loss": 2.0558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.55338478088379, "rewards/margins": 5.152777671813965, "rewards/rejected": -29.706165313720703, "step": 15730 }, { "epoch": 0.5303515453840709, "grad_norm": 33.88883590698242, "learning_rate": 5.342692054867848e-07, "logits/chosen": -0.7830812931060791, "logits/rejected": -1.0861982107162476, "logps/chosen": -2.214578628540039, "logps/rejected": -2.686673641204834, "loss": 1.3673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.145784378051758, "rewards/margins": 4.720952033996582, "rewards/rejected": -26.86673927307129, "step": 15735 }, { "epoch": 0.5305200714550541, "grad_norm": 11.75301456451416, "learning_rate": 5.339757566550065e-07, "logits/chosen": -1.480642557144165, "logits/rejected": -1.5226514339447021, "logps/chosen": -2.491964817047119, "logps/rejected": -2.958143949508667, "loss": 1.4401, "rewards/accuracies": 1.0, "rewards/chosen": -24.91964340209961, "rewards/margins": 4.661794185638428, "rewards/rejected": -29.581439971923828, "step": 15740 }, { "epoch": 0.5306885975260373, "grad_norm": 18.430078506469727, "learning_rate": 5.336822960655574e-07, "logits/chosen": -1.3387706279754639, "logits/rejected": -1.4303174018859863, "logps/chosen": -1.6745388507843018, "logps/rejected": -1.630464792251587, "loss": 3.5253, "rewards/accuracies": 0.5, "rewards/chosen": -16.74538803100586, "rewards/margins": -0.44073954224586487, "rewards/rejected": -16.30464744567871, "step": 15745 }, { "epoch": 0.5308571235970204, "grad_norm": 59.17100143432617, "learning_rate": 5.333888238199926e-07, "logits/chosen": -1.1929603815078735, "logits/rejected": -1.2906194925308228, "logps/chosen": -2.2294490337371826, "logps/rejected": -2.385697841644287, "loss": 2.3508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.294490814208984, "rewards/margins": 1.5624854564666748, "rewards/rejected": -23.856977462768555, "step": 15750 }, { "epoch": 0.5310256496680036, "grad_norm": 44.3913688659668, "learning_rate": 5.330953400198715e-07, "logits/chosen": -1.5607414245605469, "logits/rejected": -1.7760826349258423, "logps/chosen": -2.3757803440093994, "logps/rejected": -2.758803606033325, "loss": 2.1524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.757801055908203, "rewards/margins": 3.8302321434020996, "rewards/rejected": -27.588037490844727, "step": 15755 }, { "epoch": 0.5311941757389869, "grad_norm": 19.41259765625, "learning_rate": 5.32801844766757e-07, "logits/chosen": -1.5063214302062988, "logits/rejected": -1.431069016456604, "logps/chosen": -3.2000415325164795, "logps/rejected": -3.125478506088257, "loss": 5.374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.00041580200195, "rewards/margins": -0.7456296682357788, "rewards/rejected": -31.254785537719727, "step": 15760 }, { "epoch": 0.53136270180997, "grad_norm": 17.94995880126953, "learning_rate": 5.325083381622164e-07, "logits/chosen": -1.1592814922332764, "logits/rejected": -1.3412177562713623, "logps/chosen": -2.687490463256836, "logps/rejected": -2.77734375, "loss": 2.949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.874902725219727, "rewards/margins": 0.8985313177108765, "rewards/rejected": -27.7734375, "step": 15765 }, { "epoch": 0.5315312278809532, "grad_norm": 12.215071678161621, "learning_rate": 5.322148203078206e-07, "logits/chosen": -1.4378396272659302, "logits/rejected": -1.5946756601333618, "logps/chosen": -2.456948757171631, "logps/rejected": -2.9714131355285645, "loss": 0.9776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.56949234008789, "rewards/margins": 5.1446404457092285, "rewards/rejected": -29.714130401611328, "step": 15770 }, { "epoch": 0.5316997539519364, "grad_norm": 29.457229614257812, "learning_rate": 5.319212913051449e-07, "logits/chosen": -1.3002904653549194, "logits/rejected": -1.7504593133926392, "logps/chosen": -1.988246202468872, "logps/rejected": -2.1686580181121826, "loss": 4.2004, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.882461547851562, "rewards/margins": 1.8041210174560547, "rewards/rejected": -21.686582565307617, "step": 15775 }, { "epoch": 0.5318682800229195, "grad_norm": 94.38124084472656, "learning_rate": 5.316277512557678e-07, "logits/chosen": -1.3595154285430908, "logits/rejected": -1.8648008108139038, "logps/chosen": -2.644291400909424, "logps/rejected": -2.9432849884033203, "loss": 4.225, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.442913055419922, "rewards/margins": 2.9899346828460693, "rewards/rejected": -29.432849884033203, "step": 15780 }, { "epoch": 0.5320368060939027, "grad_norm": 31.942821502685547, "learning_rate": 5.31334200261272e-07, "logits/chosen": -1.6898002624511719, "logits/rejected": -1.6755949258804321, "logps/chosen": -2.614825963973999, "logps/rejected": -2.7868762016296387, "loss": 3.4297, "rewards/accuracies": 0.5, "rewards/chosen": -26.148258209228516, "rewards/margins": 1.720507025718689, "rewards/rejected": -27.868764877319336, "step": 15785 }, { "epoch": 0.5322053321648859, "grad_norm": 23.538860321044922, "learning_rate": 5.310406384232443e-07, "logits/chosen": -0.8642290234565735, "logits/rejected": -1.1689766645431519, "logps/chosen": -2.2893424034118652, "logps/rejected": -2.6651718616485596, "loss": 2.8563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.8934268951416, "rewards/margins": 3.758291244506836, "rewards/rejected": -26.651714324951172, "step": 15790 }, { "epoch": 0.5323738582358691, "grad_norm": 5.7882914543151855, "learning_rate": 5.307470658432745e-07, "logits/chosen": -1.9477641582489014, "logits/rejected": -2.2056195735931396, "logps/chosen": -2.2544777393341064, "logps/rejected": -2.708986282348633, "loss": 1.7698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.544776916503906, "rewards/margins": 4.545085430145264, "rewards/rejected": -27.089862823486328, "step": 15795 }, { "epoch": 0.5325423843068523, "grad_norm": 14.628530502319336, "learning_rate": 5.304534826229565e-07, "logits/chosen": -1.5568211078643799, "logits/rejected": -2.0258517265319824, "logps/chosen": -2.239051342010498, "logps/rejected": -2.902362108230591, "loss": 1.7635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.39051628112793, "rewards/margins": 6.633103847503662, "rewards/rejected": -29.02362060546875, "step": 15800 }, { "epoch": 0.5327109103778355, "grad_norm": 17.869384765625, "learning_rate": 5.30159888863888e-07, "logits/chosen": -1.4855738878250122, "logits/rejected": -1.5085865259170532, "logps/chosen": -2.267606735229492, "logps/rejected": -2.5644633769989014, "loss": 2.3582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.676069259643555, "rewards/margins": 2.9685654640197754, "rewards/rejected": -25.644634246826172, "step": 15805 }, { "epoch": 0.5328794364488186, "grad_norm": 64.83020782470703, "learning_rate": 5.298662846676702e-07, "logits/chosen": -1.9247585535049438, "logits/rejected": -1.9699634313583374, "logps/chosen": -2.598067283630371, "logps/rejected": -2.7303786277770996, "loss": 3.14, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.98067283630371, "rewards/margins": 1.3231112957000732, "rewards/rejected": -27.303783416748047, "step": 15810 }, { "epoch": 0.5330479625198018, "grad_norm": 152.54348754882812, "learning_rate": 5.295726701359081e-07, "logits/chosen": -1.5798695087432861, "logits/rejected": -1.5234687328338623, "logps/chosen": -2.802265167236328, "logps/rejected": -2.816190719604492, "loss": 4.435, "rewards/accuracies": 0.5, "rewards/chosen": -28.02264976501465, "rewards/margins": 0.13925638794898987, "rewards/rejected": -28.16190528869629, "step": 15815 }, { "epoch": 0.533216488590785, "grad_norm": 40.41664505004883, "learning_rate": 5.292790453702098e-07, "logits/chosen": -0.893968403339386, "logits/rejected": -0.8400181531906128, "logps/chosen": -1.9188792705535889, "logps/rejected": -1.7751582860946655, "loss": 4.5872, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.188793182373047, "rewards/margins": -1.4372103214263916, "rewards/rejected": -17.751583099365234, "step": 15820 }, { "epoch": 0.5333850146617681, "grad_norm": 5.9038262367248535, "learning_rate": 5.289854104721876e-07, "logits/chosen": -1.1986534595489502, "logits/rejected": -1.4004700183868408, "logps/chosen": -2.0523171424865723, "logps/rejected": -2.20119309425354, "loss": 2.5557, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.523168563842773, "rewards/margins": 1.4887616634368896, "rewards/rejected": -22.01192855834961, "step": 15825 }, { "epoch": 0.5335535407327514, "grad_norm": 28.159732818603516, "learning_rate": 5.286917655434568e-07, "logits/chosen": -1.5185017585754395, "logits/rejected": -1.7343857288360596, "logps/chosen": -1.7940750122070312, "logps/rejected": -1.7864980697631836, "loss": 3.3753, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.940750122070312, "rewards/margins": -0.07577009499073029, "rewards/rejected": -17.864978790283203, "step": 15830 }, { "epoch": 0.5337220668037346, "grad_norm": 27.44189453125, "learning_rate": 5.283981106856362e-07, "logits/chosen": -1.4348413944244385, "logits/rejected": -1.4643663167953491, "logps/chosen": -2.1176836490631104, "logps/rejected": -2.0485522747039795, "loss": 3.8682, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.176837921142578, "rewards/margins": -0.6913127899169922, "rewards/rejected": -20.485523223876953, "step": 15835 }, { "epoch": 0.5338905928747177, "grad_norm": 16.955625534057617, "learning_rate": 5.281044460003485e-07, "logits/chosen": -1.3504717350006104, "logits/rejected": -1.4131934642791748, "logps/chosen": -2.345491886138916, "logps/rejected": -2.415341854095459, "loss": 2.9348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.45492172241211, "rewards/margins": 0.6984950304031372, "rewards/rejected": -24.15341567993164, "step": 15840 }, { "epoch": 0.5340591189457009, "grad_norm": 104.76861572265625, "learning_rate": 5.278107715892192e-07, "logits/chosen": -1.08576500415802, "logits/rejected": -1.1210057735443115, "logps/chosen": -2.4601125717163086, "logps/rejected": -2.3961403369903564, "loss": 3.7628, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.601125717163086, "rewards/margins": -0.6397234797477722, "rewards/rejected": -23.961400985717773, "step": 15845 }, { "epoch": 0.5342276450166841, "grad_norm": 27.159868240356445, "learning_rate": 5.275170875538776e-07, "logits/chosen": -1.3380801677703857, "logits/rejected": -1.3865848779678345, "logps/chosen": -1.9246442317962646, "logps/rejected": -1.8412472009658813, "loss": 3.8892, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.246442794799805, "rewards/margins": -0.8339722752571106, "rewards/rejected": -18.412471771240234, "step": 15850 }, { "epoch": 0.5343961710876672, "grad_norm": 102.9177474975586, "learning_rate": 5.272233939959559e-07, "logits/chosen": -1.1191086769104004, "logits/rejected": -0.9336107969284058, "logps/chosen": -2.1518714427948, "logps/rejected": -2.009411334991455, "loss": 4.5251, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.518714904785156, "rewards/margins": -1.42460298538208, "rewards/rejected": -20.0941104888916, "step": 15855 }, { "epoch": 0.5345646971586504, "grad_norm": 26.768428802490234, "learning_rate": 5.269296910170905e-07, "logits/chosen": -1.1772959232330322, "logits/rejected": -1.4456255435943604, "logps/chosen": -1.839946985244751, "logps/rejected": -2.2853474617004395, "loss": 2.4164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.39946937561035, "rewards/margins": 4.454005241394043, "rewards/rejected": -22.85347557067871, "step": 15860 }, { "epoch": 0.5347332232296336, "grad_norm": 31.624685287475586, "learning_rate": 5.266359787189199e-07, "logits/chosen": -1.5025027990341187, "logits/rejected": -1.947291612625122, "logps/chosen": -1.8523876667022705, "logps/rejected": -2.393691301345825, "loss": 1.875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.523876190185547, "rewards/margins": 5.413036823272705, "rewards/rejected": -23.936914443969727, "step": 15865 }, { "epoch": 0.5349017493006168, "grad_norm": 20.274646759033203, "learning_rate": 5.263422572030863e-07, "logits/chosen": -1.776607871055603, "logits/rejected": -1.7238489389419556, "logps/chosen": -1.981871247291565, "logps/rejected": -2.054335832595825, "loss": 2.6384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.818714141845703, "rewards/margins": 0.7246443033218384, "rewards/rejected": -20.54335594177246, "step": 15870 }, { "epoch": 0.5350702753716, "grad_norm": 29.006074905395508, "learning_rate": 5.260485265712355e-07, "logits/chosen": -1.447188138961792, "logits/rejected": -1.4549908638000488, "logps/chosen": -1.9105160236358643, "logps/rejected": -1.9352041482925415, "loss": 3.2192, "rewards/accuracies": 0.5, "rewards/chosen": -19.105159759521484, "rewards/margins": 0.24688215553760529, "rewards/rejected": -19.352041244506836, "step": 15875 }, { "epoch": 0.5352388014425832, "grad_norm": 31.608394622802734, "learning_rate": 5.257547869250159e-07, "logits/chosen": -1.1805084943771362, "logits/rejected": -1.463069200515747, "logps/chosen": -1.7122770547866821, "logps/rejected": -2.39255690574646, "loss": 2.7306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.122770309448242, "rewards/margins": 6.802798271179199, "rewards/rejected": -23.925569534301758, "step": 15880 }, { "epoch": 0.5354073275135663, "grad_norm": 23.542816162109375, "learning_rate": 5.254610383660793e-07, "logits/chosen": -1.764362096786499, "logits/rejected": -1.6827386617660522, "logps/chosen": -2.032930850982666, "logps/rejected": -2.121166944503784, "loss": 2.8885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.32931137084961, "rewards/margins": 0.882359504699707, "rewards/rejected": -21.211669921875, "step": 15885 }, { "epoch": 0.5355758535845495, "grad_norm": 13.081600189208984, "learning_rate": 5.251672809960802e-07, "logits/chosen": -1.3148635625839233, "logits/rejected": -1.5017060041427612, "logps/chosen": -1.7135963439941406, "logps/rejected": -1.729269027709961, "loss": 3.5379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.135963439941406, "rewards/margins": 0.15672659873962402, "rewards/rejected": -17.29269027709961, "step": 15890 }, { "epoch": 0.5357443796555327, "grad_norm": 19.698957443237305, "learning_rate": 5.24873514916677e-07, "logits/chosen": -1.3450462818145752, "logits/rejected": -1.1864540576934814, "logps/chosen": -2.393937826156616, "logps/rejected": -2.188688278198242, "loss": 5.1997, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.93937873840332, "rewards/margins": -2.0524964332580566, "rewards/rejected": -21.886882781982422, "step": 15895 }, { "epoch": 0.5359129057265158, "grad_norm": 29.669261932373047, "learning_rate": 5.245797402295301e-07, "logits/chosen": -1.623167634010315, "logits/rejected": -1.6045089960098267, "logps/chosen": -2.1398420333862305, "logps/rejected": -2.199852466583252, "loss": 2.7214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.398418426513672, "rewards/margins": 0.6001054644584656, "rewards/rejected": -21.998523712158203, "step": 15900 }, { "epoch": 0.5360814317974991, "grad_norm": 32.335594177246094, "learning_rate": 5.242859570363035e-07, "logits/chosen": -1.3742711544036865, "logits/rejected": -1.4153274297714233, "logps/chosen": -1.8817806243896484, "logps/rejected": -1.8834056854248047, "loss": 3.8013, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.817806243896484, "rewards/margins": 0.01625032350420952, "rewards/rejected": -18.834056854248047, "step": 15905 }, { "epoch": 0.5362499578684823, "grad_norm": 24.32137107849121, "learning_rate": 5.239921654386641e-07, "logits/chosen": -1.215399980545044, "logits/rejected": -1.2015448808670044, "logps/chosen": -1.8794883489608765, "logps/rejected": -2.0633597373962402, "loss": 1.7739, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.794885635375977, "rewards/margins": 1.8387138843536377, "rewards/rejected": -20.633596420288086, "step": 15910 }, { "epoch": 0.5364184839394655, "grad_norm": 28.42302131652832, "learning_rate": 5.236983655382813e-07, "logits/chosen": -1.375603199005127, "logits/rejected": -1.2848405838012695, "logps/chosen": -1.872772455215454, "logps/rejected": -1.9896090030670166, "loss": 2.7718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.727725982666016, "rewards/margins": 1.1683663129806519, "rewards/rejected": -19.89609146118164, "step": 15915 }, { "epoch": 0.5365870100104486, "grad_norm": 33.60725784301758, "learning_rate": 5.23404557436828e-07, "logits/chosen": -0.9588086009025574, "logits/rejected": -0.9080830812454224, "logps/chosen": -2.8476717472076416, "logps/rejected": -2.907454013824463, "loss": 3.9853, "rewards/accuracies": 0.5, "rewards/chosen": -28.476715087890625, "rewards/margins": 0.5978223085403442, "rewards/rejected": -29.074539184570312, "step": 15920 }, { "epoch": 0.5367555360814318, "grad_norm": 26.43718910217285, "learning_rate": 5.231107412359794e-07, "logits/chosen": -1.1706427335739136, "logits/rejected": -1.4406774044036865, "logps/chosen": -2.108630895614624, "logps/rejected": -2.3447341918945312, "loss": 2.9799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.0863094329834, "rewards/margins": 2.361030340194702, "rewards/rejected": -23.44734001159668, "step": 15925 }, { "epoch": 0.536924062152415, "grad_norm": 23.173118591308594, "learning_rate": 5.228169170374139e-07, "logits/chosen": -1.5244656801223755, "logits/rejected": -1.5963716506958008, "logps/chosen": -1.9139703512191772, "logps/rejected": -2.1006815433502197, "loss": 2.2664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.13970375061035, "rewards/margins": 1.867110252380371, "rewards/rejected": -21.006813049316406, "step": 15930 }, { "epoch": 0.5370925882233981, "grad_norm": 26.308143615722656, "learning_rate": 5.225230849428124e-07, "logits/chosen": -1.360687494277954, "logits/rejected": -1.4157650470733643, "logps/chosen": -2.1507084369659424, "logps/rejected": -2.197394371032715, "loss": 2.7048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.5070858001709, "rewards/margins": 0.46685847640037537, "rewards/rejected": -21.97394371032715, "step": 15935 }, { "epoch": 0.5372611142943814, "grad_norm": 26.92331314086914, "learning_rate": 5.222292450538584e-07, "logits/chosen": -0.9067287445068359, "logits/rejected": -0.9419302940368652, "logps/chosen": -3.1705856323242188, "logps/rejected": -3.2947776317596436, "loss": 2.4813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.705860137939453, "rewards/margins": 1.2419227361679077, "rewards/rejected": -32.947776794433594, "step": 15940 }, { "epoch": 0.5374296403653646, "grad_norm": 21.930761337280273, "learning_rate": 5.219353974722387e-07, "logits/chosen": -1.3587646484375, "logits/rejected": -1.5184298753738403, "logps/chosen": -2.0198464393615723, "logps/rejected": -2.459423780441284, "loss": 2.6066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.198467254638672, "rewards/margins": 4.395775318145752, "rewards/rejected": -24.59423828125, "step": 15945 }, { "epoch": 0.5375981664363477, "grad_norm": 44.06086349487305, "learning_rate": 5.21641542299642e-07, "logits/chosen": -0.7717021107673645, "logits/rejected": -1.0169395208358765, "logps/chosen": -2.177698850631714, "logps/rejected": -2.4020535945892334, "loss": 2.0949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.77699089050293, "rewards/margins": 2.2435462474823, "rewards/rejected": -24.02053451538086, "step": 15950 }, { "epoch": 0.5377666925073309, "grad_norm": 40.27283477783203, "learning_rate": 5.213476796377603e-07, "logits/chosen": -1.5478652715682983, "logits/rejected": -1.6528785228729248, "logps/chosen": -1.8562246561050415, "logps/rejected": -1.9601234197616577, "loss": 2.7799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.562244415283203, "rewards/margins": 1.0389883518218994, "rewards/rejected": -19.601234436035156, "step": 15955 }, { "epoch": 0.5379352185783141, "grad_norm": 27.30537986755371, "learning_rate": 5.210538095882875e-07, "logits/chosen": -1.1312801837921143, "logits/rejected": -1.1560137271881104, "logps/chosen": -2.146221160888672, "logps/rejected": -2.3140676021575928, "loss": 2.0642, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.46221160888672, "rewards/margins": 1.6784664392471313, "rewards/rejected": -23.14067840576172, "step": 15960 }, { "epoch": 0.5381037446492972, "grad_norm": 18.12936782836914, "learning_rate": 5.207599322529209e-07, "logits/chosen": -1.1602661609649658, "logits/rejected": -1.3525993824005127, "logps/chosen": -1.647727370262146, "logps/rejected": -2.0149059295654297, "loss": 2.121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.47727394104004, "rewards/margins": 3.6717867851257324, "rewards/rejected": -20.14906120300293, "step": 15965 }, { "epoch": 0.5382722707202804, "grad_norm": 31.879697799682617, "learning_rate": 5.204660477333595e-07, "logits/chosen": -1.3323277235031128, "logits/rejected": -1.6333240270614624, "logps/chosen": -2.490701913833618, "logps/rejected": -3.067025661468506, "loss": 2.6407, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.907018661499023, "rewards/margins": 5.763237953186035, "rewards/rejected": -30.67025375366211, "step": 15970 }, { "epoch": 0.5384407967912636, "grad_norm": 22.474822998046875, "learning_rate": 5.201721561313054e-07, "logits/chosen": -1.035434365272522, "logits/rejected": -1.3812066316604614, "logps/chosen": -1.76253342628479, "logps/rejected": -2.0237176418304443, "loss": 1.2305, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.62533187866211, "rewards/margins": 2.6118431091308594, "rewards/rejected": -20.2371768951416, "step": 15975 }, { "epoch": 0.5386093228622468, "grad_norm": 22.040910720825195, "learning_rate": 5.198782575484629e-07, "logits/chosen": -1.5590795278549194, "logits/rejected": -1.5408340692520142, "logps/chosen": -2.3352012634277344, "logps/rejected": -2.256068229675293, "loss": 3.988, "rewards/accuracies": 0.5, "rewards/chosen": -23.352014541625977, "rewards/margins": -0.7913322448730469, "rewards/rejected": -22.56068229675293, "step": 15980 }, { "epoch": 0.53877784893323, "grad_norm": 56.712703704833984, "learning_rate": 5.195843520865385e-07, "logits/chosen": -1.1708321571350098, "logits/rejected": -1.1317713260650635, "logps/chosen": -2.1852142810821533, "logps/rejected": -2.4626965522766113, "loss": 2.6013, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.852140426635742, "rewards/margins": 2.774822950363159, "rewards/rejected": -24.626964569091797, "step": 15985 }, { "epoch": 0.5389463750042132, "grad_norm": 40.07321548461914, "learning_rate": 5.192904398472414e-07, "logits/chosen": -1.5711636543273926, "logits/rejected": -1.618537187576294, "logps/chosen": -2.194077253341675, "logps/rejected": -2.444293975830078, "loss": 2.8609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.940771102905273, "rewards/margins": 2.502168655395508, "rewards/rejected": -24.44293785095215, "step": 15990 }, { "epoch": 0.5391149010751963, "grad_norm": 24.483991622924805, "learning_rate": 5.189965209322832e-07, "logits/chosen": -1.692317008972168, "logits/rejected": -1.6503814458847046, "logps/chosen": -2.3598055839538574, "logps/rejected": -2.362114667892456, "loss": 4.052, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.59805679321289, "rewards/margins": 0.023090552538633347, "rewards/rejected": -23.621145248413086, "step": 15995 }, { "epoch": 0.5392834271461795, "grad_norm": 26.952857971191406, "learning_rate": 5.187025954433775e-07, "logits/chosen": -1.8345916271209717, "logits/rejected": -2.290605068206787, "logps/chosen": -2.6928634643554688, "logps/rejected": -3.4724318981170654, "loss": 1.8705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.928638458251953, "rewards/margins": 7.7956862449646, "rewards/rejected": -34.72432327270508, "step": 16000 }, { "epoch": 0.5392834271461795, "eval_logits/chosen": -1.8009322881698608, "eval_logits/rejected": -1.9339642524719238, "eval_logps/chosen": -2.0592150688171387, "eval_logps/rejected": -2.1777427196502686, "eval_loss": 2.988820791244507, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -20.592151641845703, "eval_rewards/margins": 1.1852753162384033, "eval_rewards/rejected": -21.777429580688477, "eval_runtime": 12.9249, "eval_samples_per_second": 7.737, "eval_steps_per_second": 1.934, "step": 16000 }, { "epoch": 0.5394519532171627, "grad_norm": 35.81848907470703, "learning_rate": 5.184086634822403e-07, "logits/chosen": -1.6082760095596313, "logits/rejected": -1.6484458446502686, "logps/chosen": -2.346975803375244, "logps/rejected": -2.5124051570892334, "loss": 2.7089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.46976089477539, "rewards/margins": 1.654293417930603, "rewards/rejected": -25.124052047729492, "step": 16005 }, { "epoch": 0.5396204792881458, "grad_norm": 16.334346771240234, "learning_rate": 5.1811472515059e-07, "logits/chosen": -1.5007803440093994, "logits/rejected": -1.9453624486923218, "logps/chosen": -1.8338782787322998, "logps/rejected": -2.1191000938415527, "loss": 2.9343, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.338781356811523, "rewards/margins": 2.8522186279296875, "rewards/rejected": -21.191001892089844, "step": 16010 }, { "epoch": 0.5397890053591291, "grad_norm": 17.845060348510742, "learning_rate": 5.17820780550147e-07, "logits/chosen": -1.0943386554718018, "logits/rejected": -1.19479238986969, "logps/chosen": -1.8260581493377686, "logps/rejected": -2.105739116668701, "loss": 1.4134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.260583877563477, "rewards/margins": 2.796811580657959, "rewards/rejected": -21.05739402770996, "step": 16015 }, { "epoch": 0.5399575314301123, "grad_norm": 29.95235824584961, "learning_rate": 5.175268297826339e-07, "logits/chosen": -1.506792664527893, "logits/rejected": -1.7213274240493774, "logps/chosen": -1.9028027057647705, "logps/rejected": -1.939144492149353, "loss": 3.0789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.028026580810547, "rewards/margins": 0.36341866850852966, "rewards/rejected": -19.39144515991211, "step": 16020 }, { "epoch": 0.5401260575010954, "grad_norm": 35.79323959350586, "learning_rate": 5.172328729497757e-07, "logits/chosen": -1.24240243434906, "logits/rejected": -1.4514870643615723, "logps/chosen": -2.0713295936584473, "logps/rejected": -2.2839739322662354, "loss": 1.4821, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.71329689025879, "rewards/margins": 2.1264426708221436, "rewards/rejected": -22.839738845825195, "step": 16025 }, { "epoch": 0.5402945835720786, "grad_norm": 21.9273738861084, "learning_rate": 5.169389101532992e-07, "logits/chosen": -1.3595364093780518, "logits/rejected": -1.4825410842895508, "logps/chosen": -2.9908063411712646, "logps/rejected": -3.0221633911132812, "loss": 4.5762, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.908061981201172, "rewards/margins": 0.31356924772262573, "rewards/rejected": -30.221630096435547, "step": 16030 }, { "epoch": 0.5404631096430618, "grad_norm": 29.29009246826172, "learning_rate": 5.16644941494933e-07, "logits/chosen": -1.5605688095092773, "logits/rejected": -1.5835740566253662, "logps/chosen": -2.1153295040130615, "logps/rejected": -2.410200595855713, "loss": 2.5973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.15329360961914, "rewards/margins": 2.948711395263672, "rewards/rejected": -24.102006912231445, "step": 16035 }, { "epoch": 0.5406316357140449, "grad_norm": 34.95164489746094, "learning_rate": 5.163509670764085e-07, "logits/chosen": -1.321013331413269, "logits/rejected": -1.2671291828155518, "logps/chosen": -1.869821310043335, "logps/rejected": -1.9189687967300415, "loss": 2.6713, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.69821548461914, "rewards/margins": 0.49147263169288635, "rewards/rejected": -19.189685821533203, "step": 16040 }, { "epoch": 0.5408001617850281, "grad_norm": 25.569393157958984, "learning_rate": 5.160569869994583e-07, "logits/chosen": -1.5885627269744873, "logits/rejected": -1.7263247966766357, "logps/chosen": -1.9965381622314453, "logps/rejected": -2.2323851585388184, "loss": 2.582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.96537971496582, "rewards/margins": 2.3584752082824707, "rewards/rejected": -22.323854446411133, "step": 16045 }, { "epoch": 0.5409686878560114, "grad_norm": 27.766555786132812, "learning_rate": 5.157630013658177e-07, "logits/chosen": -1.088120937347412, "logits/rejected": -1.072862148284912, "logps/chosen": -1.9776265621185303, "logps/rejected": -2.1201555728912354, "loss": 2.5222, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.776268005371094, "rewards/margins": 1.4252907037734985, "rewards/rejected": -21.201557159423828, "step": 16050 }, { "epoch": 0.5411372139269945, "grad_norm": 26.957998275756836, "learning_rate": 5.154690102772233e-07, "logits/chosen": -1.8490188121795654, "logits/rejected": -2.186875820159912, "logps/chosen": -1.4853366613388062, "logps/rejected": -1.7421538829803467, "loss": 2.0116, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.853365898132324, "rewards/margins": 2.568171262741089, "rewards/rejected": -17.421539306640625, "step": 16055 }, { "epoch": 0.5413057399979777, "grad_norm": 15.618478775024414, "learning_rate": 5.151750138354139e-07, "logits/chosen": -1.0816246271133423, "logits/rejected": -1.3231147527694702, "logps/chosen": -2.101982831954956, "logps/rejected": -2.9028682708740234, "loss": 1.5279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.01982879638672, "rewards/margins": 8.0088529586792, "rewards/rejected": -29.0286808013916, "step": 16060 }, { "epoch": 0.5414742660689609, "grad_norm": 11.756542205810547, "learning_rate": 5.148810121421301e-07, "logits/chosen": -1.383972406387329, "logits/rejected": -1.4302829504013062, "logps/chosen": -2.4469587802886963, "logps/rejected": -3.174461603164673, "loss": 2.2436, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.469587326049805, "rewards/margins": 7.275025844573975, "rewards/rejected": -31.744617462158203, "step": 16065 }, { "epoch": 0.541642792139944, "grad_norm": 46.22239303588867, "learning_rate": 5.145870052991142e-07, "logits/chosen": -1.664219617843628, "logits/rejected": -2.0187935829162598, "logps/chosen": -1.9643936157226562, "logps/rejected": -2.1999523639678955, "loss": 2.2296, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.64393424987793, "rewards/margins": 2.3555893898010254, "rewards/rejected": -21.999523162841797, "step": 16070 }, { "epoch": 0.5418113182109272, "grad_norm": 20.696165084838867, "learning_rate": 5.142929934081107e-07, "logits/chosen": -1.3868186473846436, "logits/rejected": -1.6188242435455322, "logps/chosen": -1.7030565738677979, "logps/rejected": -1.6263227462768555, "loss": 3.9108, "rewards/accuracies": 0.5, "rewards/chosen": -17.03056526184082, "rewards/margins": -0.7673369646072388, "rewards/rejected": -16.263227462768555, "step": 16075 }, { "epoch": 0.5419798442819104, "grad_norm": 41.721031188964844, "learning_rate": 5.139989765708651e-07, "logits/chosen": -1.031810998916626, "logits/rejected": -1.10875403881073, "logps/chosen": -2.078503131866455, "logps/rejected": -2.17492938041687, "loss": 2.7677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.785030364990234, "rewards/margins": 0.9642614126205444, "rewards/rejected": -21.749292373657227, "step": 16080 }, { "epoch": 0.5421483703528935, "grad_norm": 66.38001251220703, "learning_rate": 5.137049548891253e-07, "logits/chosen": -0.5578786134719849, "logits/rejected": -0.5402613878250122, "logps/chosen": -1.9524940252304077, "logps/rejected": -2.130063533782959, "loss": 2.1677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.524944305419922, "rewards/margins": 1.7756946086883545, "rewards/rejected": -21.300636291503906, "step": 16085 }, { "epoch": 0.5423168964238768, "grad_norm": 129.77603149414062, "learning_rate": 5.134109284646405e-07, "logits/chosen": -1.6398814916610718, "logits/rejected": -1.465562343597412, "logps/chosen": -2.069758415222168, "logps/rejected": -2.019371271133423, "loss": 4.3334, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.697582244873047, "rewards/margins": -0.5038704872131348, "rewards/rejected": -20.193714141845703, "step": 16090 }, { "epoch": 0.54248542249486, "grad_norm": 29.77696418762207, "learning_rate": 5.131168973991618e-07, "logits/chosen": -1.091217279434204, "logits/rejected": -1.1410750150680542, "logps/chosen": -2.4660487174987793, "logps/rejected": -2.612558364868164, "loss": 2.7581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.66048812866211, "rewards/margins": 1.4650931358337402, "rewards/rejected": -26.125579833984375, "step": 16095 }, { "epoch": 0.5426539485658431, "grad_norm": 10.7893705368042, "learning_rate": 5.128228617944418e-07, "logits/chosen": -1.3055378198623657, "logits/rejected": -1.4175994396209717, "logps/chosen": -2.1625494956970215, "logps/rejected": -2.2795069217681885, "loss": 3.7809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.62549591064453, "rewards/margins": 1.1695719957351685, "rewards/rejected": -22.795068740844727, "step": 16100 }, { "epoch": 0.5428224746368263, "grad_norm": 27.38039207458496, "learning_rate": 5.125288217522344e-07, "logits/chosen": -1.5777299404144287, "logits/rejected": -1.7784534692764282, "logps/chosen": -2.2568578720092773, "logps/rejected": -2.5661861896514893, "loss": 3.0235, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.568580627441406, "rewards/margins": 3.09328031539917, "rewards/rejected": -25.6618595123291, "step": 16105 }, { "epoch": 0.5429910007078095, "grad_norm": 32.53067398071289, "learning_rate": 5.122347773742956e-07, "logits/chosen": -1.8470481634140015, "logits/rejected": -1.6745433807373047, "logps/chosen": -2.235819101333618, "logps/rejected": -2.2479605674743652, "loss": 3.0407, "rewards/accuracies": 0.5, "rewards/chosen": -22.358190536499023, "rewards/margins": 0.1214146614074707, "rewards/rejected": -22.479604721069336, "step": 16110 }, { "epoch": 0.5431595267787926, "grad_norm": 21.476905822753906, "learning_rate": 5.11940728762382e-07, "logits/chosen": -1.3058350086212158, "logits/rejected": -1.5694831609725952, "logps/chosen": -2.243704080581665, "logps/rejected": -2.6150970458984375, "loss": 1.2742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.437042236328125, "rewards/margins": 3.7139289379119873, "rewards/rejected": -26.150970458984375, "step": 16115 }, { "epoch": 0.5433280528497758, "grad_norm": 0.27187997102737427, "learning_rate": 5.116466760182529e-07, "logits/chosen": -1.6408789157867432, "logits/rejected": -1.9273831844329834, "logps/chosen": -1.9865741729736328, "logps/rejected": -2.6092066764831543, "loss": 1.2092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.865741729736328, "rewards/margins": 6.22632360458374, "rewards/rejected": -26.092065811157227, "step": 16120 }, { "epoch": 0.5434965789207591, "grad_norm": 36.5335807800293, "learning_rate": 5.11352619243668e-07, "logits/chosen": -1.7007386684417725, "logits/rejected": -2.2129995822906494, "logps/chosen": -2.2495627403259277, "logps/rejected": -2.761093854904175, "loss": 2.9749, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.495628356933594, "rewards/margins": 5.115310192108154, "rewards/rejected": -27.610937118530273, "step": 16125 }, { "epoch": 0.5436651049917423, "grad_norm": 35.36635208129883, "learning_rate": 5.11058558540389e-07, "logits/chosen": -0.7347787618637085, "logits/rejected": -1.0197536945343018, "logps/chosen": -1.9708576202392578, "logps/rejected": -2.1755924224853516, "loss": 2.4735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.708574295043945, "rewards/margins": 2.047349452972412, "rewards/rejected": -21.755924224853516, "step": 16130 }, { "epoch": 0.5438336310627254, "grad_norm": 24.370132446289062, "learning_rate": 5.107644940101784e-07, "logits/chosen": -0.6877826452255249, "logits/rejected": -0.806088924407959, "logps/chosen": -2.206653118133545, "logps/rejected": -2.56797456741333, "loss": 1.1178, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.066530227661133, "rewards/margins": 3.6132149696350098, "rewards/rejected": -25.67974281311035, "step": 16135 }, { "epoch": 0.5440021571337086, "grad_norm": 16.870189666748047, "learning_rate": 5.104704257548005e-07, "logits/chosen": -1.4101765155792236, "logits/rejected": -1.325073480606079, "logps/chosen": -2.379502534866333, "logps/rejected": -2.5285820960998535, "loss": 2.683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.795024871826172, "rewards/margins": 1.4907970428466797, "rewards/rejected": -25.28582191467285, "step": 16140 }, { "epoch": 0.5441706832046918, "grad_norm": 46.246456146240234, "learning_rate": 5.101763538760209e-07, "logits/chosen": -1.5839884281158447, "logits/rejected": -1.957201600074768, "logps/chosen": -1.852923035621643, "logps/rejected": -2.072567939758301, "loss": 2.7152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.52923011779785, "rewards/margins": 2.196450710296631, "rewards/rejected": -20.72568130493164, "step": 16145 }, { "epoch": 0.5443392092756749, "grad_norm": 97.65252685546875, "learning_rate": 5.098822784756061e-07, "logits/chosen": -1.8456027507781982, "logits/rejected": -1.7750129699707031, "logps/chosen": -2.749521255493164, "logps/rejected": -2.648500919342041, "loss": 5.1515, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.495208740234375, "rewards/margins": -1.0102026462554932, "rewards/rejected": -26.48500633239746, "step": 16150 }, { "epoch": 0.5445077353466581, "grad_norm": 80.65924072265625, "learning_rate": 5.095881996553242e-07, "logits/chosen": -1.3277708292007446, "logits/rejected": -1.4319045543670654, "logps/chosen": -1.9848464727401733, "logps/rejected": -1.9942963123321533, "loss": 3.5708, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.848468780517578, "rewards/margins": 0.09449663013219833, "rewards/rejected": -19.942962646484375, "step": 16155 }, { "epoch": 0.5446762614176414, "grad_norm": 54.68977355957031, "learning_rate": 5.09294117516944e-07, "logits/chosen": -1.5335499048233032, "logits/rejected": -1.610508680343628, "logps/chosen": -2.8308680057525635, "logps/rejected": -3.0489935874938965, "loss": 2.3802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.30868148803711, "rewards/margins": 2.181258201599121, "rewards/rejected": -30.489938735961914, "step": 16160 }, { "epoch": 0.5448447874886245, "grad_norm": 27.68987464904785, "learning_rate": 5.090000321622358e-07, "logits/chosen": -1.3854801654815674, "logits/rejected": -1.203540563583374, "logps/chosen": -1.650176763534546, "logps/rejected": -2.056550979614258, "loss": 2.6758, "rewards/accuracies": 0.5, "rewards/chosen": -16.501766204833984, "rewards/margins": 4.063741207122803, "rewards/rejected": -20.565509796142578, "step": 16165 }, { "epoch": 0.5450133135596077, "grad_norm": 54.86668014526367, "learning_rate": 5.087059436929714e-07, "logits/chosen": -0.4492467939853668, "logits/rejected": -0.6218141317367554, "logps/chosen": -2.3843488693237305, "logps/rejected": -2.4283106327056885, "loss": 3.0914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.843486785888672, "rewards/margins": 0.4396181106567383, "rewards/rejected": -24.28310775756836, "step": 16170 }, { "epoch": 0.5451818396305909, "grad_norm": 21.276992797851562, "learning_rate": 5.084118522109225e-07, "logits/chosen": -1.6168181896209717, "logits/rejected": -1.83038330078125, "logps/chosen": -1.897952675819397, "logps/rejected": -2.1104540824890137, "loss": 2.5915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.97952651977539, "rewards/margins": 2.1250123977661133, "rewards/rejected": -21.104537963867188, "step": 16175 }, { "epoch": 0.545350365701574, "grad_norm": 31.154457092285156, "learning_rate": 5.081177578178632e-07, "logits/chosen": -2.0428037643432617, "logits/rejected": -2.1841020584106445, "logps/chosen": -2.541215181350708, "logps/rejected": -3.1152877807617188, "loss": 2.5159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.412151336669922, "rewards/margins": 5.740725994110107, "rewards/rejected": -31.152877807617188, "step": 16180 }, { "epoch": 0.5455188917725572, "grad_norm": 25.28038787841797, "learning_rate": 5.078236606155677e-07, "logits/chosen": -1.2056846618652344, "logits/rejected": -1.2339636087417603, "logps/chosen": -2.2586770057678223, "logps/rejected": -2.537292957305908, "loss": 2.1905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.586769104003906, "rewards/margins": 2.786158800125122, "rewards/rejected": -25.372928619384766, "step": 16185 }, { "epoch": 0.5456874178435404, "grad_norm": 74.27156066894531, "learning_rate": 5.075295607058116e-07, "logits/chosen": -1.1293174028396606, "logits/rejected": -1.1770614385604858, "logps/chosen": -2.194941759109497, "logps/rejected": -2.2006897926330566, "loss": 3.2653, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.949419021606445, "rewards/margins": 0.05747966840863228, "rewards/rejected": -22.00689697265625, "step": 16190 }, { "epoch": 0.5458559439145235, "grad_norm": 63.31100082397461, "learning_rate": 5.072354581903709e-07, "logits/chosen": -1.2766873836517334, "logits/rejected": -1.6367841958999634, "logps/chosen": -1.8417125940322876, "logps/rejected": -1.7709461450576782, "loss": 3.8441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.417125701904297, "rewards/margins": -0.7076643705368042, "rewards/rejected": -17.709461212158203, "step": 16195 }, { "epoch": 0.5460244699855068, "grad_norm": 0.0952446460723877, "learning_rate": 5.069413531710235e-07, "logits/chosen": -1.78704833984375, "logits/rejected": -1.8815076351165771, "logps/chosen": -2.5174808502197266, "logps/rejected": -3.0017216205596924, "loss": 1.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.174808502197266, "rewards/margins": 4.842409610748291, "rewards/rejected": -30.0172176361084, "step": 16200 }, { "epoch": 0.54619299605649, "grad_norm": 4.080849647521973, "learning_rate": 5.066472457495471e-07, "logits/chosen": -1.1902334690093994, "logits/rejected": -1.4749139547348022, "logps/chosen": -1.9703378677368164, "logps/rejected": -2.3434581756591797, "loss": 1.7, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.70337677001953, "rewards/margins": 3.731203556060791, "rewards/rejected": -23.434581756591797, "step": 16205 }, { "epoch": 0.5463615221274731, "grad_norm": 19.52630615234375, "learning_rate": 5.063531360277209e-07, "logits/chosen": -1.5813535451889038, "logits/rejected": -1.4736263751983643, "logps/chosen": -1.56548011302948, "logps/rejected": -1.4851534366607666, "loss": 4.1395, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -15.654800415039062, "rewards/margins": -0.8032673597335815, "rewards/rejected": -14.851534843444824, "step": 16210 }, { "epoch": 0.5465300481984563, "grad_norm": 30.350008010864258, "learning_rate": 5.060590241073245e-07, "logits/chosen": -1.7225860357284546, "logits/rejected": -1.749176263809204, "logps/chosen": -2.0334715843200684, "logps/rejected": -2.5191073417663574, "loss": 1.8731, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.334712982177734, "rewards/margins": 4.856356620788574, "rewards/rejected": -25.191070556640625, "step": 16215 }, { "epoch": 0.5466985742694395, "grad_norm": 53.04943084716797, "learning_rate": 5.057649100901386e-07, "logits/chosen": -1.5750441551208496, "logits/rejected": -1.8706716299057007, "logps/chosen": -1.9647775888442993, "logps/rejected": -2.0691940784454346, "loss": 2.207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.647777557373047, "rewards/margins": 1.0441657304763794, "rewards/rejected": -20.69194221496582, "step": 16220 }, { "epoch": 0.5468671003404226, "grad_norm": 16.87798500061035, "learning_rate": 5.054707940779446e-07, "logits/chosen": -1.5316976308822632, "logits/rejected": -1.7342464923858643, "logps/chosen": -2.130229949951172, "logps/rejected": -2.5026209354400635, "loss": 2.1972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.302297592163086, "rewards/margins": 3.7239105701446533, "rewards/rejected": -25.02621078491211, "step": 16225 }, { "epoch": 0.5470356264114058, "grad_norm": 63.59160232543945, "learning_rate": 5.051766761725241e-07, "logits/chosen": -1.3253222703933716, "logits/rejected": -1.2189723253250122, "logps/chosen": -2.0285019874572754, "logps/rejected": -1.908988356590271, "loss": 4.2631, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.28502082824707, "rewards/margins": -1.1951375007629395, "rewards/rejected": -19.08988380432129, "step": 16230 }, { "epoch": 0.5472041524823891, "grad_norm": 92.26155090332031, "learning_rate": 5.048825564756601e-07, "logits/chosen": -1.8079001903533936, "logits/rejected": -1.8996975421905518, "logps/chosen": -2.1402010917663574, "logps/rejected": -2.247673749923706, "loss": 2.2508, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.40201187133789, "rewards/margins": 1.0747264623641968, "rewards/rejected": -22.47673797607422, "step": 16235 }, { "epoch": 0.5473726785533722, "grad_norm": 69.60755157470703, "learning_rate": 5.045884350891356e-07, "logits/chosen": -1.0289746522903442, "logits/rejected": -1.0647555589675903, "logps/chosen": -2.0716910362243652, "logps/rejected": -2.075446605682373, "loss": 3.2364, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.716907501220703, "rewards/margins": 0.03755836561322212, "rewards/rejected": -20.754467010498047, "step": 16240 }, { "epoch": 0.5475412046243554, "grad_norm": 31.6566219329834, "learning_rate": 5.042943121147345e-07, "logits/chosen": -1.6412330865859985, "logits/rejected": -2.096470355987549, "logps/chosen": -2.3620076179504395, "logps/rejected": -3.1981656551361084, "loss": 1.9026, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.620075225830078, "rewards/margins": 8.361583709716797, "rewards/rejected": -31.981658935546875, "step": 16245 }, { "epoch": 0.5477097306953386, "grad_norm": 19.45050811767578, "learning_rate": 5.040001876542413e-07, "logits/chosen": -1.898911714553833, "logits/rejected": -1.8914964199066162, "logps/chosen": -1.802631139755249, "logps/rejected": -1.9166710376739502, "loss": 2.6816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.02631187438965, "rewards/margins": 1.1403964757919312, "rewards/rejected": -19.166709899902344, "step": 16250 }, { "epoch": 0.5478782567663217, "grad_norm": 26.124082565307617, "learning_rate": 5.037060618094406e-07, "logits/chosen": -1.0378376245498657, "logits/rejected": -1.0364210605621338, "logps/chosen": -2.5162196159362793, "logps/rejected": -2.6829471588134766, "loss": 3.888, "rewards/accuracies": 0.5, "rewards/chosen": -25.16219711303711, "rewards/margins": 1.6672769784927368, "rewards/rejected": -26.8294734954834, "step": 16255 }, { "epoch": 0.5480467828373049, "grad_norm": 24.959684371948242, "learning_rate": 5.034119346821179e-07, "logits/chosen": -1.591496229171753, "logits/rejected": -1.5482546091079712, "logps/chosen": -2.5285449028015137, "logps/rejected": -2.6545941829681396, "loss": 3.1635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.285449981689453, "rewards/margins": 1.2604939937591553, "rewards/rejected": -26.545940399169922, "step": 16260 }, { "epoch": 0.5482153089082881, "grad_norm": 25.58453369140625, "learning_rate": 5.031178063740591e-07, "logits/chosen": -1.7127612829208374, "logits/rejected": -1.9984369277954102, "logps/chosen": -2.3069615364074707, "logps/rejected": -2.550511121749878, "loss": 2.1247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.06961441040039, "rewards/margins": 2.4354963302612305, "rewards/rejected": -25.505109786987305, "step": 16265 }, { "epoch": 0.5483838349792713, "grad_norm": 13.465657234191895, "learning_rate": 5.028236769870503e-07, "logits/chosen": -1.6997343301773071, "logits/rejected": -1.6074409484863281, "logps/chosen": -2.0852694511413574, "logps/rejected": -2.3273866176605225, "loss": 1.6164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.85269546508789, "rewards/margins": 2.4211738109588623, "rewards/rejected": -23.273868560791016, "step": 16270 }, { "epoch": 0.5485523610502545, "grad_norm": 6.806887149810791, "learning_rate": 5.025295466228782e-07, "logits/chosen": -2.103903293609619, "logits/rejected": -2.28928804397583, "logps/chosen": -2.131436824798584, "logps/rejected": -2.859297275543213, "loss": 1.2612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.314367294311523, "rewards/margins": 7.278607368469238, "rewards/rejected": -28.592975616455078, "step": 16275 }, { "epoch": 0.5487208871212377, "grad_norm": 28.80763053894043, "learning_rate": 5.022354153833296e-07, "logits/chosen": -1.6582714319229126, "logits/rejected": -1.998355507850647, "logps/chosen": -2.045281410217285, "logps/rejected": -2.201249599456787, "loss": 2.4043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.452816009521484, "rewards/margins": 1.5596843957901, "rewards/rejected": -22.012500762939453, "step": 16280 }, { "epoch": 0.5488894131922208, "grad_norm": 23.410083770751953, "learning_rate": 5.019412833701917e-07, "logits/chosen": -1.7809860706329346, "logits/rejected": -1.9740571975708008, "logps/chosen": -1.9353249073028564, "logps/rejected": -1.9824997186660767, "loss": 2.9308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.353246688842773, "rewards/margins": 0.4717481732368469, "rewards/rejected": -19.824996948242188, "step": 16285 }, { "epoch": 0.549057939263204, "grad_norm": 34.78386688232422, "learning_rate": 5.016471506852522e-07, "logits/chosen": -1.3661186695098877, "logits/rejected": -1.5314580202102661, "logps/chosen": -1.7690000534057617, "logps/rejected": -1.9127906560897827, "loss": 2.3856, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.690000534057617, "rewards/margins": 1.4379034042358398, "rewards/rejected": -19.127904891967773, "step": 16290 }, { "epoch": 0.5492264653341872, "grad_norm": 24.4421329498291, "learning_rate": 5.013530174302989e-07, "logits/chosen": -1.5516514778137207, "logits/rejected": -1.7300994396209717, "logps/chosen": -2.3746907711029053, "logps/rejected": -2.7230477333068848, "loss": 2.4286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.746906280517578, "rewards/margins": 3.4835681915283203, "rewards/rejected": -27.2304744720459, "step": 16295 }, { "epoch": 0.5493949914051703, "grad_norm": 13.170940399169922, "learning_rate": 5.010588837071196e-07, "logits/chosen": -1.0385621786117554, "logits/rejected": -1.3224549293518066, "logps/chosen": -2.315553665161133, "logps/rejected": -2.626375913619995, "loss": 2.0412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.155536651611328, "rewards/margins": 3.108222723007202, "rewards/rejected": -26.26375961303711, "step": 16300 }, { "epoch": 0.5495635174761535, "grad_norm": 75.63530731201172, "learning_rate": 5.007647496175021e-07, "logits/chosen": -1.0984992980957031, "logits/rejected": -1.217091679573059, "logps/chosen": -3.2464778423309326, "logps/rejected": -3.294538974761963, "loss": 4.7708, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -32.464778900146484, "rewards/margins": 0.4806079864501953, "rewards/rejected": -32.94538879394531, "step": 16305 }, { "epoch": 0.5497320435471368, "grad_norm": 39.796756744384766, "learning_rate": 5.004706152632351e-07, "logits/chosen": -1.3322203159332275, "logits/rejected": -1.4762696027755737, "logps/chosen": -2.046513319015503, "logps/rejected": -2.0476295948028564, "loss": 3.142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.465133666992188, "rewards/margins": 0.011162233538925648, "rewards/rejected": -20.476295471191406, "step": 16310 }, { "epoch": 0.54990056961812, "grad_norm": 38.28547668457031, "learning_rate": 5.001764807461065e-07, "logits/chosen": -1.0382802486419678, "logits/rejected": -1.4499540328979492, "logps/chosen": -2.783287763595581, "logps/rejected": -2.572631359100342, "loss": 5.7225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.832876205444336, "rewards/margins": -2.1065640449523926, "rewards/rejected": -25.7263126373291, "step": 16315 }, { "epoch": 0.5500690956891031, "grad_norm": 24.27626609802246, "learning_rate": 4.998823461679051e-07, "logits/chosen": -1.0992909669876099, "logits/rejected": -1.2056782245635986, "logps/chosen": -2.2541472911834717, "logps/rejected": -2.2591731548309326, "loss": 3.6655, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.54146957397461, "rewards/margins": 0.05026273801922798, "rewards/rejected": -22.591732025146484, "step": 16320 }, { "epoch": 0.5502376217600863, "grad_norm": 1.48041832447052, "learning_rate": 4.995882116304189e-07, "logits/chosen": -1.4299277067184448, "logits/rejected": -1.3262441158294678, "logps/chosen": -2.3866114616394043, "logps/rejected": -2.650369644165039, "loss": 2.7072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.86611557006836, "rewards/margins": 2.6375796794891357, "rewards/rejected": -26.50369644165039, "step": 16325 }, { "epoch": 0.5504061478310694, "grad_norm": 31.15922737121582, "learning_rate": 4.992940772354364e-07, "logits/chosen": -1.0533661842346191, "logits/rejected": -1.3468220233917236, "logps/chosen": -2.4013590812683105, "logps/rejected": -2.3216609954833984, "loss": 5.1866, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.013591766357422, "rewards/margins": -0.7969821691513062, "rewards/rejected": -23.216609954833984, "step": 16330 }, { "epoch": 0.5505746739020526, "grad_norm": 31.58074188232422, "learning_rate": 4.989999430847463e-07, "logits/chosen": -2.013664722442627, "logits/rejected": -1.8949086666107178, "logps/chosen": -1.9614967107772827, "logps/rejected": -2.0524280071258545, "loss": 2.6342, "rewards/accuracies": 0.5, "rewards/chosen": -19.614965438842773, "rewards/margins": 0.9093145132064819, "rewards/rejected": -20.524280548095703, "step": 16335 }, { "epoch": 0.5507431999730358, "grad_norm": 31.68235206604004, "learning_rate": 4.987058092801361e-07, "logits/chosen": -1.184633493423462, "logits/rejected": -1.1958637237548828, "logps/chosen": -1.8362147808074951, "logps/rejected": -1.8909202814102173, "loss": 2.7622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.36214828491211, "rewards/margins": 0.547053337097168, "rewards/rejected": -18.909204483032227, "step": 16340 }, { "epoch": 0.5509117260440191, "grad_norm": 26.33146858215332, "learning_rate": 4.984116759233944e-07, "logits/chosen": -1.4128179550170898, "logits/rejected": -1.6827160120010376, "logps/chosen": -2.2236125469207764, "logps/rejected": -3.1885323524475098, "loss": 2.1096, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.236125946044922, "rewards/margins": 9.649200439453125, "rewards/rejected": -31.885326385498047, "step": 16345 }, { "epoch": 0.5510802521150022, "grad_norm": 37.639930725097656, "learning_rate": 4.981175431163092e-07, "logits/chosen": -1.5246819257736206, "logits/rejected": -1.458280324935913, "logps/chosen": -2.300814151763916, "logps/rejected": -2.4303762912750244, "loss": 2.9189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.00814437866211, "rewards/margins": 1.2956197261810303, "rewards/rejected": -24.30376434326172, "step": 16350 }, { "epoch": 0.5512487781859854, "grad_norm": 33.55547332763672, "learning_rate": 4.978234109606681e-07, "logits/chosen": -1.6795823574066162, "logits/rejected": -1.840654730796814, "logps/chosen": -2.044466733932495, "logps/rejected": -2.403834819793701, "loss": 1.9782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.44466781616211, "rewards/margins": 3.5936789512634277, "rewards/rejected": -24.038349151611328, "step": 16355 }, { "epoch": 0.5514173042569686, "grad_norm": 24.78546714782715, "learning_rate": 4.975292795582588e-07, "logits/chosen": -1.0123234987258911, "logits/rejected": -1.229949712753296, "logps/chosen": -2.1764540672302246, "logps/rejected": -2.229409694671631, "loss": 3.0214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.764545440673828, "rewards/margins": 0.529554009437561, "rewards/rejected": -22.294097900390625, "step": 16360 }, { "epoch": 0.5515858303279517, "grad_norm": 18.02266502380371, "learning_rate": 4.972351490108683e-07, "logits/chosen": -1.6163885593414307, "logits/rejected": -1.8083269596099854, "logps/chosen": -2.210312604904175, "logps/rejected": -2.2474353313446045, "loss": 4.8088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.103126525878906, "rewards/margins": 0.37122592329978943, "rewards/rejected": -22.474353790283203, "step": 16365 }, { "epoch": 0.5517543563989349, "grad_norm": 15.657951354980469, "learning_rate": 4.96941019420284e-07, "logits/chosen": -1.1342637538909912, "logits/rejected": -1.3603713512420654, "logps/chosen": -2.810260772705078, "logps/rejected": -2.916231155395508, "loss": 3.2622, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.102609634399414, "rewards/margins": 1.0597028732299805, "rewards/rejected": -29.162311553955078, "step": 16370 }, { "epoch": 0.551922882469918, "grad_norm": 37.477928161621094, "learning_rate": 4.966468908882921e-07, "logits/chosen": -1.0693776607513428, "logits/rejected": -1.183394193649292, "logps/chosen": -2.5060973167419434, "logps/rejected": -2.6473300457000732, "loss": 2.2649, "rewards/accuracies": 0.5, "rewards/chosen": -25.06097412109375, "rewards/margins": 1.4123274087905884, "rewards/rejected": -26.47330093383789, "step": 16375 }, { "epoch": 0.5520914085409013, "grad_norm": 34.93418884277344, "learning_rate": 4.963527635166793e-07, "logits/chosen": -1.20332670211792, "logits/rejected": -1.1986877918243408, "logps/chosen": -2.035940408706665, "logps/rejected": -2.0706629753112793, "loss": 2.7629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.359403610229492, "rewards/margins": 0.34722432494163513, "rewards/rejected": -20.706628799438477, "step": 16380 }, { "epoch": 0.5522599346118845, "grad_norm": 23.54374885559082, "learning_rate": 4.960586374072316e-07, "logits/chosen": -1.2519137859344482, "logits/rejected": -1.3859598636627197, "logps/chosen": -2.3384459018707275, "logps/rejected": -3.0998260974884033, "loss": 1.8127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.38446044921875, "rewards/margins": 7.6138014793396, "rewards/rejected": -30.998260498046875, "step": 16385 }, { "epoch": 0.5524284606828677, "grad_norm": 24.325088500976562, "learning_rate": 4.957645126617339e-07, "logits/chosen": -1.4304975271224976, "logits/rejected": -1.6704849004745483, "logps/chosen": -2.214526653289795, "logps/rejected": -2.578474521636963, "loss": 1.8956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.145265579223633, "rewards/margins": 3.6394755840301514, "rewards/rejected": -25.784744262695312, "step": 16390 }, { "epoch": 0.5525969867538508, "grad_norm": 4.897208213806152, "learning_rate": 4.954703893819715e-07, "logits/chosen": -1.407099962234497, "logits/rejected": -1.4926787614822388, "logps/chosen": -2.6405577659606934, "logps/rejected": -3.410717487335205, "loss": 2.6697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.405574798583984, "rewards/margins": 7.701594352722168, "rewards/rejected": -34.10717010498047, "step": 16395 }, { "epoch": 0.552765512824834, "grad_norm": 57.99406051635742, "learning_rate": 4.951762676697292e-07, "logits/chosen": -1.7110874652862549, "logits/rejected": -1.8076190948486328, "logps/chosen": -2.230250835418701, "logps/rejected": -2.1426777839660645, "loss": 4.0587, "rewards/accuracies": 0.5, "rewards/chosen": -22.302507400512695, "rewards/margins": -0.8757309913635254, "rewards/rejected": -21.426776885986328, "step": 16400 }, { "epoch": 0.552765512824834, "eval_logits/chosen": -1.862671971321106, "eval_logits/rejected": -2.0018630027770996, "eval_logps/chosen": -2.0881171226501465, "eval_logps/rejected": -2.213587760925293, "eval_loss": 2.9925146102905273, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -20.88117218017578, "eval_rewards/margins": 1.2547067403793335, "eval_rewards/rejected": -22.135875701904297, "eval_runtime": 12.916, "eval_samples_per_second": 7.742, "eval_steps_per_second": 1.936, "step": 16400 }, { "epoch": 0.5529340388958172, "grad_norm": 30.2686824798584, "learning_rate": 4.948821476267902e-07, "logits/chosen": -1.4786027669906616, "logits/rejected": -1.5657284259796143, "logps/chosen": -2.48724627494812, "logps/rejected": -3.2754569053649902, "loss": 1.9817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.87246322631836, "rewards/margins": 7.882106781005859, "rewards/rejected": -32.75457000732422, "step": 16405 }, { "epoch": 0.5531025649668003, "grad_norm": 26.35026741027832, "learning_rate": 4.945880293549384e-07, "logits/chosen": -1.3459535837173462, "logits/rejected": -1.4547795057296753, "logps/chosen": -1.9763944149017334, "logps/rejected": -1.8858134746551514, "loss": 4.0438, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.763944625854492, "rewards/margins": -0.9058086276054382, "rewards/rejected": -18.858135223388672, "step": 16410 }, { "epoch": 0.5532710910377835, "grad_norm": 10.583443641662598, "learning_rate": 4.942939129559564e-07, "logits/chosen": -1.0191407203674316, "logits/rejected": -1.2290555238723755, "logps/chosen": -2.1333813667297363, "logps/rejected": -2.4551522731781006, "loss": 1.9698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.333812713623047, "rewards/margins": 3.217708110809326, "rewards/rejected": -24.5515193939209, "step": 16415 }, { "epoch": 0.5534396171087668, "grad_norm": 69.04009246826172, "learning_rate": 4.939997985316265e-07, "logits/chosen": -1.6069825887680054, "logits/rejected": -1.5675886869430542, "logps/chosen": -1.7600486278533936, "logps/rejected": -2.053833484649658, "loss": 1.7592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.600486755371094, "rewards/margins": 2.9378466606140137, "rewards/rejected": -20.538333892822266, "step": 16420 }, { "epoch": 0.5536081431797499, "grad_norm": 30.52703285217285, "learning_rate": 4.937056861837298e-07, "logits/chosen": -1.2858994007110596, "logits/rejected": -1.4342375993728638, "logps/chosen": -2.0015480518341064, "logps/rejected": -2.1410858631134033, "loss": 2.2274, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.015480041503906, "rewards/margins": 1.3953787088394165, "rewards/rejected": -21.410858154296875, "step": 16425 }, { "epoch": 0.5537766692507331, "grad_norm": 23.32746696472168, "learning_rate": 4.934115760140472e-07, "logits/chosen": -1.1391804218292236, "logits/rejected": -1.6687994003295898, "logps/chosen": -2.4570107460021973, "logps/rejected": -3.014371395111084, "loss": 1.5547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.57010841369629, "rewards/margins": 5.573606014251709, "rewards/rejected": -30.143712997436523, "step": 16430 }, { "epoch": 0.5539451953217163, "grad_norm": 29.025510787963867, "learning_rate": 4.931174681243586e-07, "logits/chosen": -1.0708847045898438, "logits/rejected": -1.4539520740509033, "logps/chosen": -1.974205732345581, "logps/rejected": -2.124587059020996, "loss": 2.2244, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.74205780029297, "rewards/margins": 1.5038119554519653, "rewards/rejected": -21.24587059020996, "step": 16435 }, { "epoch": 0.5541137213926994, "grad_norm": 33.02252197265625, "learning_rate": 4.928233626164428e-07, "logits/chosen": -1.5927103757858276, "logits/rejected": -2.0426902770996094, "logps/chosen": -2.281912326812744, "logps/rejected": -2.7135491371154785, "loss": 1.8576, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.819122314453125, "rewards/margins": 4.316366672515869, "rewards/rejected": -27.135488510131836, "step": 16440 }, { "epoch": 0.5542822474636826, "grad_norm": 26.73567008972168, "learning_rate": 4.925292595920787e-07, "logits/chosen": -1.7700579166412354, "logits/rejected": -1.8333972692489624, "logps/chosen": -1.8393447399139404, "logps/rejected": -2.0610852241516113, "loss": 2.1441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.393447875976562, "rewards/margins": 2.2174034118652344, "rewards/rejected": -20.610851287841797, "step": 16445 }, { "epoch": 0.5544507735346658, "grad_norm": 26.375341415405273, "learning_rate": 4.922351591530434e-07, "logits/chosen": -1.185505747795105, "logits/rejected": -1.398880958557129, "logps/chosen": -2.5037283897399902, "logps/rejected": -2.6086947917938232, "loss": 3.5968, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.03728485107422, "rewards/margins": 1.0496633052825928, "rewards/rejected": -26.08694839477539, "step": 16450 }, { "epoch": 0.554619299605649, "grad_norm": 130.64976501464844, "learning_rate": 4.919410614011138e-07, "logits/chosen": -1.4663383960723877, "logits/rejected": -2.1934359073638916, "logps/chosen": -2.6172523498535156, "logps/rejected": -2.5515918731689453, "loss": 5.0914, "rewards/accuracies": 0.5, "rewards/chosen": -26.17252540588379, "rewards/margins": -0.6566047668457031, "rewards/rejected": -25.515918731689453, "step": 16455 }, { "epoch": 0.5547878256766322, "grad_norm": 20.96159553527832, "learning_rate": 4.916469664380652e-07, "logits/chosen": -1.6890236139297485, "logits/rejected": -1.9183467626571655, "logps/chosen": -2.627854824066162, "logps/rejected": -2.8655261993408203, "loss": 4.5326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.278553009033203, "rewards/margins": 2.3767104148864746, "rewards/rejected": -28.655261993408203, "step": 16460 }, { "epoch": 0.5549563517476154, "grad_norm": 0.08136291056871414, "learning_rate": 4.913528743656724e-07, "logits/chosen": -1.2650330066680908, "logits/rejected": -1.2096878290176392, "logps/chosen": -2.059722661972046, "logps/rejected": -2.423049211502075, "loss": 2.4642, "rewards/accuracies": 0.5, "rewards/chosen": -20.597225189208984, "rewards/margins": 3.6332650184631348, "rewards/rejected": -24.230493545532227, "step": 16465 }, { "epoch": 0.5551248778185985, "grad_norm": 10.423648834228516, "learning_rate": 4.910587852857093e-07, "logits/chosen": -1.5063730478286743, "logits/rejected": -1.798951506614685, "logps/chosen": -2.671128034591675, "logps/rejected": -3.014394521713257, "loss": 2.0443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.711278915405273, "rewards/margins": 3.432664155960083, "rewards/rejected": -30.143945693969727, "step": 16470 }, { "epoch": 0.5552934038895817, "grad_norm": 23.890661239624023, "learning_rate": 4.907646992999481e-07, "logits/chosen": -1.442610740661621, "logits/rejected": -1.9095418453216553, "logps/chosen": -2.0867397785186768, "logps/rejected": -2.4975295066833496, "loss": 1.7156, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.86739730834961, "rewards/margins": 4.1078996658325195, "rewards/rejected": -24.975296020507812, "step": 16475 }, { "epoch": 0.5554619299605649, "grad_norm": 45.274497985839844, "learning_rate": 4.904706165101607e-07, "logits/chosen": -1.3336973190307617, "logits/rejected": -1.534952163696289, "logps/chosen": -3.592500686645508, "logps/rejected": -3.7593085765838623, "loss": 2.1646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.92500686645508, "rewards/margins": 1.6680819988250732, "rewards/rejected": -37.59308624267578, "step": 16480 }, { "epoch": 0.555630456031548, "grad_norm": 29.72709083557129, "learning_rate": 4.901765370181174e-07, "logits/chosen": -1.1310837268829346, "logits/rejected": -1.2525126934051514, "logps/chosen": -1.883829116821289, "logps/rejected": -1.9360030889511108, "loss": 2.8968, "rewards/accuracies": 0.5, "rewards/chosen": -18.83829116821289, "rewards/margins": 0.5217410326004028, "rewards/rejected": -19.360031127929688, "step": 16485 }, { "epoch": 0.5557989821025313, "grad_norm": 35.20262908935547, "learning_rate": 4.898824609255879e-07, "logits/chosen": -1.30862295627594, "logits/rejected": -1.5366251468658447, "logps/chosen": -2.2552974224090576, "logps/rejected": -2.888899087905884, "loss": 2.4783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.552974700927734, "rewards/margins": 6.336014747619629, "rewards/rejected": -28.888988494873047, "step": 16490 }, { "epoch": 0.5559675081735145, "grad_norm": 63.60124588012695, "learning_rate": 4.895883883343398e-07, "logits/chosen": -1.5948011875152588, "logits/rejected": -1.6446367502212524, "logps/chosen": -2.000032663345337, "logps/rejected": -2.2590491771698, "loss": 1.9738, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.00032615661621, "rewards/margins": 2.5901684761047363, "rewards/rejected": -22.59049415588379, "step": 16495 }, { "epoch": 0.5561360342444976, "grad_norm": 68.49860382080078, "learning_rate": 4.892943193461403e-07, "logits/chosen": -1.2705062627792358, "logits/rejected": -1.4505457878112793, "logps/chosen": -1.9772846698760986, "logps/rejected": -1.9667972326278687, "loss": 3.8444, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.77284812927246, "rewards/margins": -0.10487423092126846, "rewards/rejected": -19.667972564697266, "step": 16500 }, { "epoch": 0.5563045603154808, "grad_norm": 29.31913948059082, "learning_rate": 4.890002540627552e-07, "logits/chosen": -1.715620756149292, "logits/rejected": -1.643994688987732, "logps/chosen": -2.7153449058532715, "logps/rejected": -2.9860455989837646, "loss": 2.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.1534481048584, "rewards/margins": 2.7070071697235107, "rewards/rejected": -29.860454559326172, "step": 16505 }, { "epoch": 0.556473086386464, "grad_norm": 14.820429801940918, "learning_rate": 4.887061925859487e-07, "logits/chosen": -1.5017743110656738, "logits/rejected": -1.7025244235992432, "logps/chosen": -2.3334708213806152, "logps/rejected": -2.7292990684509277, "loss": 2.9727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.334707260131836, "rewards/margins": 3.9582855701446533, "rewards/rejected": -27.29298973083496, "step": 16510 }, { "epoch": 0.5566416124574471, "grad_norm": 22.817903518676758, "learning_rate": 4.88412135017484e-07, "logits/chosen": -0.9818054437637329, "logits/rejected": -1.0674546957015991, "logps/chosen": -2.76274037361145, "logps/rejected": -2.9394752979278564, "loss": 2.6797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.627399444580078, "rewards/margins": 1.767350435256958, "rewards/rejected": -29.394750595092773, "step": 16515 }, { "epoch": 0.5568101385284303, "grad_norm": 23.144840240478516, "learning_rate": 4.881180814591226e-07, "logits/chosen": -1.8649994134902954, "logits/rejected": -1.4928921461105347, "logps/chosen": -2.244431972503662, "logps/rejected": -2.0774097442626953, "loss": 4.7936, "rewards/accuracies": 0.5, "rewards/chosen": -22.444316864013672, "rewards/margins": -1.6702207326889038, "rewards/rejected": -20.77409553527832, "step": 16520 }, { "epoch": 0.5569786645994135, "grad_norm": 25.49403190612793, "learning_rate": 4.878240320126256e-07, "logits/chosen": -2.3358254432678223, "logits/rejected": -2.589106798171997, "logps/chosen": -2.41384220123291, "logps/rejected": -2.698692798614502, "loss": 2.5465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.1384220123291, "rewards/margins": 2.8485052585601807, "rewards/rejected": -26.986928939819336, "step": 16525 }, { "epoch": 0.5571471906703968, "grad_norm": 30.329010009765625, "learning_rate": 4.87529986779751e-07, "logits/chosen": -1.149228572845459, "logits/rejected": -1.606133222579956, "logps/chosen": -1.8847917318344116, "logps/rejected": -2.1649017333984375, "loss": 2.6444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.847917556762695, "rewards/margins": 2.8010976314544678, "rewards/rejected": -21.64901351928711, "step": 16530 }, { "epoch": 0.5573157167413799, "grad_norm": 97.66452026367188, "learning_rate": 4.872359458622568e-07, "logits/chosen": -1.7025953531265259, "logits/rejected": -1.810329794883728, "logps/chosen": -1.7309554815292358, "logps/rejected": -1.6912472248077393, "loss": 3.6162, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.309553146362305, "rewards/margins": -0.39708080887794495, "rewards/rejected": -16.912473678588867, "step": 16535 }, { "epoch": 0.5574842428123631, "grad_norm": 17.021034240722656, "learning_rate": 4.869419093618991e-07, "logits/chosen": -1.5559533834457397, "logits/rejected": -1.6489553451538086, "logps/chosen": -2.4044666290283203, "logps/rejected": -2.8175644874572754, "loss": 2.9486, "rewards/accuracies": 0.5, "rewards/chosen": -24.04466438293457, "rewards/margins": 4.130979061126709, "rewards/rejected": -28.175643920898438, "step": 16540 }, { "epoch": 0.5576527688833463, "grad_norm": 73.31028747558594, "learning_rate": 4.866478773804317e-07, "logits/chosen": -1.2781057357788086, "logits/rejected": -1.4325788021087646, "logps/chosen": -1.9948387145996094, "logps/rejected": -2.0437731742858887, "loss": 3.4351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.948389053344727, "rewards/margins": 0.4893454611301422, "rewards/rejected": -20.437732696533203, "step": 16545 }, { "epoch": 0.5578212949543294, "grad_norm": 15.803415298461914, "learning_rate": 4.863538500196081e-07, "logits/chosen": -1.2874476909637451, "logits/rejected": -1.8122501373291016, "logps/chosen": -2.1901726722717285, "logps/rejected": -2.637709140777588, "loss": 1.5549, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.901723861694336, "rewards/margins": 4.475368022918701, "rewards/rejected": -26.377094268798828, "step": 16550 }, { "epoch": 0.5579898210253126, "grad_norm": 39.55556106567383, "learning_rate": 4.860598273811792e-07, "logits/chosen": -1.4781594276428223, "logits/rejected": -1.6253509521484375, "logps/chosen": -2.057183027267456, "logps/rejected": -2.060908555984497, "loss": 3.7672, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.571828842163086, "rewards/margins": 0.03725280612707138, "rewards/rejected": -20.609081268310547, "step": 16555 }, { "epoch": 0.5581583470962957, "grad_norm": 68.54031372070312, "learning_rate": 4.857658095668951e-07, "logits/chosen": -1.6205686330795288, "logits/rejected": -1.6138147115707397, "logps/chosen": -1.9181791543960571, "logps/rejected": -2.0273666381835938, "loss": 2.6595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.181793212890625, "rewards/margins": 1.091873288154602, "rewards/rejected": -20.273664474487305, "step": 16560 }, { "epoch": 0.558326873167279, "grad_norm": 29.88010597229004, "learning_rate": 4.854717966785033e-07, "logits/chosen": -1.522566556930542, "logits/rejected": -1.5306899547576904, "logps/chosen": -1.7374460697174072, "logps/rejected": -1.7058292627334595, "loss": 3.7956, "rewards/accuracies": 0.5, "rewards/chosen": -17.374460220336914, "rewards/margins": -0.3161682188510895, "rewards/rejected": -17.058292388916016, "step": 16565 }, { "epoch": 0.5584953992382622, "grad_norm": 14.674449920654297, "learning_rate": 4.851777888177503e-07, "logits/chosen": -1.492299199104309, "logits/rejected": -1.5254344940185547, "logps/chosen": -2.6740927696228027, "logps/rejected": -2.8673977851867676, "loss": 3.7997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.740930557250977, "rewards/margins": 1.933049201965332, "rewards/rejected": -28.67397689819336, "step": 16570 }, { "epoch": 0.5586639253092454, "grad_norm": 20.321304321289062, "learning_rate": 4.848837860863807e-07, "logits/chosen": -1.5205323696136475, "logits/rejected": -1.4475538730621338, "logps/chosen": -1.978070855140686, "logps/rejected": -2.5035884380340576, "loss": 0.9607, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.780710220336914, "rewards/margins": 5.255176544189453, "rewards/rejected": -25.035886764526367, "step": 16575 }, { "epoch": 0.5588324513802285, "grad_norm": 35.91277313232422, "learning_rate": 4.845897885861371e-07, "logits/chosen": -1.4589111804962158, "logits/rejected": -1.4083130359649658, "logps/chosen": -2.222066640853882, "logps/rejected": -2.1525063514709473, "loss": 3.8385, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.220666885375977, "rewards/margins": -0.6956036686897278, "rewards/rejected": -21.52506446838379, "step": 16580 }, { "epoch": 0.5590009774512117, "grad_norm": 24.30882453918457, "learning_rate": 4.842957964187604e-07, "logits/chosen": -1.6066770553588867, "logits/rejected": -1.6216022968292236, "logps/chosen": -1.8694368600845337, "logps/rejected": -1.9515972137451172, "loss": 2.5597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.69437026977539, "rewards/margins": 0.8216029405593872, "rewards/rejected": -19.51597023010254, "step": 16585 }, { "epoch": 0.5591695035221949, "grad_norm": 128.77745056152344, "learning_rate": 4.8400180968599e-07, "logits/chosen": -1.5237901210784912, "logits/rejected": -1.6502147912979126, "logps/chosen": -2.442279815673828, "logps/rejected": -2.3668129444122314, "loss": 4.9498, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.42279624938965, "rewards/margins": -0.754668116569519, "rewards/rejected": -23.66813087463379, "step": 16590 }, { "epoch": 0.559338029593178, "grad_norm": 31.42833137512207, "learning_rate": 4.837078284895631e-07, "logits/chosen": -2.0330276489257812, "logits/rejected": -1.8534698486328125, "logps/chosen": -2.165086269378662, "logps/rejected": -2.5630440711975098, "loss": 3.0482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.650861740112305, "rewards/margins": 3.9795784950256348, "rewards/rejected": -25.630441665649414, "step": 16595 }, { "epoch": 0.5595065556641613, "grad_norm": 24.275657653808594, "learning_rate": 4.834138529312146e-07, "logits/chosen": -1.7258002758026123, "logits/rejected": -1.767745018005371, "logps/chosen": -1.896641492843628, "logps/rejected": -2.3668007850646973, "loss": 1.7978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.966419219970703, "rewards/margins": 4.701590538024902, "rewards/rejected": -23.66800880432129, "step": 16600 }, { "epoch": 0.5596750817351445, "grad_norm": 21.67288589477539, "learning_rate": 4.831198831126784e-07, "logits/chosen": -1.5068045854568481, "logits/rejected": -1.4906129837036133, "logps/chosen": -1.9527183771133423, "logps/rejected": -2.0906100273132324, "loss": 2.4302, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.527183532714844, "rewards/margins": 1.3789176940917969, "rewards/rejected": -20.90610122680664, "step": 16605 }, { "epoch": 0.5598436078061276, "grad_norm": 29.13378143310547, "learning_rate": 4.828259191356855e-07, "logits/chosen": -1.6987136602401733, "logits/rejected": -1.7911326885223389, "logps/chosen": -1.9743280410766602, "logps/rejected": -2.221207857131958, "loss": 1.8545, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.7432804107666, "rewards/margins": 2.4687979221343994, "rewards/rejected": -22.212078094482422, "step": 16610 }, { "epoch": 0.5600121338771108, "grad_norm": 30.432849884033203, "learning_rate": 4.825319611019653e-07, "logits/chosen": -1.3041839599609375, "logits/rejected": -1.5292479991912842, "logps/chosen": -1.8356603384017944, "logps/rejected": -2.110485315322876, "loss": 2.4426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.356603622436523, "rewards/margins": 2.748248815536499, "rewards/rejected": -21.1048526763916, "step": 16615 }, { "epoch": 0.560180659948094, "grad_norm": 28.228656768798828, "learning_rate": 4.822380091132452e-07, "logits/chosen": -1.496335744857788, "logits/rejected": -1.5300164222717285, "logps/chosen": -2.2147862911224365, "logps/rejected": -2.358285665512085, "loss": 3.8083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.147863388061523, "rewards/margins": 1.4349899291992188, "rewards/rejected": -23.582855224609375, "step": 16620 }, { "epoch": 0.5603491860190771, "grad_norm": 7.746950626373291, "learning_rate": 4.819440632712502e-07, "logits/chosen": -1.1421386003494263, "logits/rejected": -1.6735588312149048, "logps/chosen": -1.7586784362792969, "logps/rejected": -2.379070997238159, "loss": 1.7404, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.5867862701416, "rewards/margins": 6.203924179077148, "rewards/rejected": -23.790708541870117, "step": 16625 }, { "epoch": 0.5605177120900603, "grad_norm": 55.8576774597168, "learning_rate": 4.816501236777038e-07, "logits/chosen": -1.6667125225067139, "logits/rejected": -1.4844706058502197, "logps/chosen": -1.8552688360214233, "logps/rejected": -1.9931942224502563, "loss": 2.5257, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.552688598632812, "rewards/margins": 1.3792531490325928, "rewards/rejected": -19.931941986083984, "step": 16630 }, { "epoch": 0.5606862381610435, "grad_norm": 20.889873504638672, "learning_rate": 4.813561904343265e-07, "logits/chosen": -1.006071925163269, "logits/rejected": -1.044333815574646, "logps/chosen": -2.241271734237671, "logps/rejected": -2.404127836227417, "loss": 1.9726, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.412717819213867, "rewards/margins": 1.6285613775253296, "rewards/rejected": -24.041278839111328, "step": 16635 }, { "epoch": 0.5608547642320267, "grad_norm": 55.024349212646484, "learning_rate": 4.810622636428371e-07, "logits/chosen": -1.270347237586975, "logits/rejected": -1.3348054885864258, "logps/chosen": -1.9361298084259033, "logps/rejected": -1.9386670589447021, "loss": 3.1037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.361297607421875, "rewards/margins": 0.025374317541718483, "rewards/rejected": -19.38667106628418, "step": 16640 }, { "epoch": 0.5610232903030099, "grad_norm": 29.22247886657715, "learning_rate": 4.807683434049522e-07, "logits/chosen": -1.417856216430664, "logits/rejected": -1.9524939060211182, "logps/chosen": -2.4791512489318848, "logps/rejected": -2.824134349822998, "loss": 1.92, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.79151153564453, "rewards/margins": 3.449831485748291, "rewards/rejected": -28.241342544555664, "step": 16645 }, { "epoch": 0.5611918163739931, "grad_norm": 30.65985679626465, "learning_rate": 4.804744298223859e-07, "logits/chosen": -1.4590588808059692, "logits/rejected": -1.7810996770858765, "logps/chosen": -2.255669116973877, "logps/rejected": -2.6106760501861572, "loss": 1.4134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.55669403076172, "rewards/margins": 3.550067186355591, "rewards/rejected": -26.106760025024414, "step": 16650 }, { "epoch": 0.5613603424449762, "grad_norm": 28.592660903930664, "learning_rate": 4.8018052299685e-07, "logits/chosen": -1.8148752450942993, "logits/rejected": -2.0093588829040527, "logps/chosen": -2.1500916481018066, "logps/rejected": -2.5256147384643555, "loss": 1.4492, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.50091552734375, "rewards/margins": 3.755230665206909, "rewards/rejected": -25.256145477294922, "step": 16655 }, { "epoch": 0.5615288685159594, "grad_norm": 83.37126159667969, "learning_rate": 4.798866230300547e-07, "logits/chosen": -0.9690818786621094, "logits/rejected": -1.2768938541412354, "logps/chosen": -2.553854465484619, "logps/rejected": -3.4719252586364746, "loss": 2.0541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.538543701171875, "rewards/margins": 9.180707931518555, "rewards/rejected": -34.71925354003906, "step": 16660 }, { "epoch": 0.5616973945869426, "grad_norm": 31.286157608032227, "learning_rate": 4.795927300237065e-07, "logits/chosen": -1.402068853378296, "logits/rejected": -1.401330590248108, "logps/chosen": -1.912672758102417, "logps/rejected": -1.8983705043792725, "loss": 3.6742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.126728057861328, "rewards/margins": -0.1430220603942871, "rewards/rejected": -18.98370361328125, "step": 16665 }, { "epoch": 0.5618659206579257, "grad_norm": 32.06412887573242, "learning_rate": 4.792988440795103e-07, "logits/chosen": -1.5014759302139282, "logits/rejected": -1.4859968423843384, "logps/chosen": -2.089202880859375, "logps/rejected": -2.309732675552368, "loss": 2.6381, "rewards/accuracies": 0.5, "rewards/chosen": -20.89202880859375, "rewards/margins": 2.2052981853485107, "rewards/rejected": -23.097328186035156, "step": 16670 }, { "epoch": 0.562034446728909, "grad_norm": 0.22517681121826172, "learning_rate": 4.790049652991685e-07, "logits/chosen": -1.5998106002807617, "logits/rejected": -1.8751304149627686, "logps/chosen": -1.7838659286499023, "logps/rejected": -2.068983793258667, "loss": 2.4504, "rewards/accuracies": 0.5, "rewards/chosen": -17.83865737915039, "rewards/margins": 2.8511791229248047, "rewards/rejected": -20.689838409423828, "step": 16675 }, { "epoch": 0.5622029727998922, "grad_norm": 39.847381591796875, "learning_rate": 4.787110937843814e-07, "logits/chosen": -1.1157238483428955, "logits/rejected": -1.418677568435669, "logps/chosen": -2.137622356414795, "logps/rejected": -2.336545944213867, "loss": 2.6717, "rewards/accuracies": 0.5, "rewards/chosen": -21.376224517822266, "rewards/margins": 1.989235281944275, "rewards/rejected": -23.365459442138672, "step": 16680 }, { "epoch": 0.5623714988708753, "grad_norm": 8.367581367492676, "learning_rate": 4.784172296368457e-07, "logits/chosen": -1.6157697439193726, "logits/rejected": -2.3373653888702393, "logps/chosen": -2.44496488571167, "logps/rejected": -3.342592239379883, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": -24.449649810791016, "rewards/margins": 8.976272583007812, "rewards/rejected": -33.425926208496094, "step": 16685 }, { "epoch": 0.5625400249418585, "grad_norm": 64.52335357666016, "learning_rate": 4.781233729582565e-07, "logits/chosen": -0.9468668699264526, "logits/rejected": -0.905168890953064, "logps/chosen": -2.1241188049316406, "logps/rejected": -2.2420175075531006, "loss": 2.7584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.241186141967773, "rewards/margins": 1.1789891719818115, "rewards/rejected": -22.420177459716797, "step": 16690 }, { "epoch": 0.5627085510128417, "grad_norm": 25.68901824951172, "learning_rate": 4.778295238503061e-07, "logits/chosen": -1.390925645828247, "logits/rejected": -1.3971506357192993, "logps/chosen": -2.2836949825286865, "logps/rejected": -2.681293249130249, "loss": 2.3995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.836950302124023, "rewards/margins": 3.9759840965270996, "rewards/rejected": -26.81293296813965, "step": 16695 }, { "epoch": 0.5628770770838248, "grad_norm": 31.65011978149414, "learning_rate": 4.775356824146842e-07, "logits/chosen": -1.9038221836090088, "logits/rejected": -1.802114725112915, "logps/chosen": -2.901379346847534, "logps/rejected": -3.1506600379943848, "loss": 2.4802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.0137939453125, "rewards/margins": 2.4928066730499268, "rewards/rejected": -31.5065975189209, "step": 16700 }, { "epoch": 0.563045603154808, "grad_norm": 23.219512939453125, "learning_rate": 4.772418487530773e-07, "logits/chosen": -1.4731714725494385, "logits/rejected": -1.4118788242340088, "logps/chosen": -2.0316500663757324, "logps/rejected": -2.0555057525634766, "loss": 3.2372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.31650161743164, "rewards/margins": 0.23855523765087128, "rewards/rejected": -20.555057525634766, "step": 16705 }, { "epoch": 0.5632141292257913, "grad_norm": 27.701637268066406, "learning_rate": 4.769480229671699e-07, "logits/chosen": -1.1318514347076416, "logits/rejected": -1.1759233474731445, "logps/chosen": -2.0909242630004883, "logps/rejected": -2.103691816329956, "loss": 3.2503, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.909244537353516, "rewards/margins": 0.12767677009105682, "rewards/rejected": -21.03692054748535, "step": 16710 }, { "epoch": 0.5633826552967744, "grad_norm": 32.668514251708984, "learning_rate": 4.7665420515864374e-07, "logits/chosen": -1.5871978998184204, "logits/rejected": -1.787335991859436, "logps/chosen": -1.980926275253296, "logps/rejected": -2.1331100463867188, "loss": 2.471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.809263229370117, "rewards/margins": 1.5218383073806763, "rewards/rejected": -21.33110237121582, "step": 16715 }, { "epoch": 0.5635511813677576, "grad_norm": 30.054622650146484, "learning_rate": 4.7636039542917716e-07, "logits/chosen": -1.2195504903793335, "logits/rejected": -1.4400713443756104, "logps/chosen": -2.8065850734710693, "logps/rejected": -2.85850191116333, "loss": 4.9515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.06585121154785, "rewards/margins": 0.5191686749458313, "rewards/rejected": -28.585018157958984, "step": 16720 }, { "epoch": 0.5637197074387408, "grad_norm": 34.892494201660156, "learning_rate": 4.760665938804466e-07, "logits/chosen": -1.404201626777649, "logits/rejected": -1.6849861145019531, "logps/chosen": -2.6109631061553955, "logps/rejected": -2.7899582386016846, "loss": 2.5493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.109630584716797, "rewards/margins": 1.7899490594863892, "rewards/rejected": -27.899578094482422, "step": 16725 }, { "epoch": 0.563888233509724, "grad_norm": 32.79019546508789, "learning_rate": 4.7577280061412474e-07, "logits/chosen": -1.413694977760315, "logits/rejected": -1.3809101581573486, "logps/chosen": -2.7751529216766357, "logps/rejected": -2.6814167499542236, "loss": 4.4692, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.751529693603516, "rewards/margins": -0.9373645782470703, "rewards/rejected": -26.814163208007812, "step": 16730 }, { "epoch": 0.5640567595807071, "grad_norm": 6.341630935668945, "learning_rate": 4.754790157318822e-07, "logits/chosen": -1.2289912700653076, "logits/rejected": -1.5710041522979736, "logps/chosen": -2.0926711559295654, "logps/rejected": -2.535799503326416, "loss": 1.8432, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.926708221435547, "rewards/margins": 4.431286334991455, "rewards/rejected": -25.35799789428711, "step": 16735 }, { "epoch": 0.5642252856516903, "grad_norm": 49.58246612548828, "learning_rate": 4.7518523933538613e-07, "logits/chosen": -1.8374273777008057, "logits/rejected": -1.8193261623382568, "logps/chosen": -3.3235645294189453, "logps/rejected": -3.626161575317383, "loss": 3.2184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.23564529418945, "rewards/margins": 3.025968313217163, "rewards/rejected": -36.26161193847656, "step": 16740 }, { "epoch": 0.5643938117226734, "grad_norm": 25.741788864135742, "learning_rate": 4.7489147152630104e-07, "logits/chosen": -1.4145604372024536, "logits/rejected": -1.5160937309265137, "logps/chosen": -2.352865219116211, "logps/rejected": -2.9049177169799805, "loss": 2.2004, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.528654098510742, "rewards/margins": 5.520522117614746, "rewards/rejected": -29.049175262451172, "step": 16745 }, { "epoch": 0.5645623377936567, "grad_norm": 26.192583084106445, "learning_rate": 4.745977124062887e-07, "logits/chosen": -1.2434992790222168, "logits/rejected": -1.0916662216186523, "logps/chosen": -2.044055938720703, "logps/rejected": -2.2048280239105225, "loss": 3.14, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.44055938720703, "rewards/margins": 1.6077194213867188, "rewards/rejected": -22.048280715942383, "step": 16750 }, { "epoch": 0.5647308638646399, "grad_norm": 140.10960388183594, "learning_rate": 4.74303962077007e-07, "logits/chosen": -1.3205041885375977, "logits/rejected": -1.650608777999878, "logps/chosen": -2.4417591094970703, "logps/rejected": -2.5953195095062256, "loss": 2.543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.417591094970703, "rewards/margins": 1.5356042385101318, "rewards/rejected": -25.953195571899414, "step": 16755 }, { "epoch": 0.564899389935623, "grad_norm": 42.08061218261719, "learning_rate": 4.740102206401117e-07, "logits/chosen": -2.0617008209228516, "logits/rejected": -2.03901743888855, "logps/chosen": -2.1589865684509277, "logps/rejected": -2.2602858543395996, "loss": 3.0386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.58986473083496, "rewards/margins": 1.0129940509796143, "rewards/rejected": -22.602859497070312, "step": 16760 }, { "epoch": 0.5650679160066062, "grad_norm": 37.95011520385742, "learning_rate": 4.737164881972551e-07, "logits/chosen": -0.8276047706604004, "logits/rejected": -0.9013395309448242, "logps/chosen": -1.9446722269058228, "logps/rejected": -2.0602924823760986, "loss": 2.992, "rewards/accuracies": 0.5, "rewards/chosen": -19.446720123291016, "rewards/margins": 1.156203269958496, "rewards/rejected": -20.602924346923828, "step": 16765 }, { "epoch": 0.5652364420775894, "grad_norm": 43.475223541259766, "learning_rate": 4.7342276485008654e-07, "logits/chosen": -1.6008399724960327, "logits/rejected": -1.54447340965271, "logps/chosen": -2.948169708251953, "logps/rejected": -3.309300661087036, "loss": 2.6869, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.481698989868164, "rewards/margins": 3.6113078594207764, "rewards/rejected": -33.0930061340332, "step": 16770 }, { "epoch": 0.5654049681485726, "grad_norm": 78.703125, "learning_rate": 4.7312905070025177e-07, "logits/chosen": -0.9836323857307434, "logits/rejected": -0.9874518513679504, "logps/chosen": -2.5067899227142334, "logps/rejected": -3.001052141189575, "loss": 3.4794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.067901611328125, "rewards/margins": 4.942620277404785, "rewards/rejected": -30.010522842407227, "step": 16775 }, { "epoch": 0.5655734942195557, "grad_norm": 36.86214065551758, "learning_rate": 4.728353458493939e-07, "logits/chosen": -1.0935938358306885, "logits/rejected": -1.7105945348739624, "logps/chosen": -2.6541755199432373, "logps/rejected": -3.2404963970184326, "loss": 2.6482, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.5417537689209, "rewards/margins": 5.863210201263428, "rewards/rejected": -32.404964447021484, "step": 16780 }, { "epoch": 0.565742020290539, "grad_norm": 0.07047080993652344, "learning_rate": 4.7254165039915265e-07, "logits/chosen": -1.5007909536361694, "logits/rejected": -1.7393224239349365, "logps/chosen": -2.6301045417785645, "logps/rejected": -3.297396183013916, "loss": 1.2243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.301044464111328, "rewards/margins": 6.672916412353516, "rewards/rejected": -32.973960876464844, "step": 16785 }, { "epoch": 0.5659105463615222, "grad_norm": 43.58036422729492, "learning_rate": 4.7224796445116446e-07, "logits/chosen": -1.2965277433395386, "logits/rejected": -1.6817373037338257, "logps/chosen": -2.2248032093048096, "logps/rejected": -2.647348642349243, "loss": 2.0617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.248035430908203, "rewards/margins": 4.225451946258545, "rewards/rejected": -26.473485946655273, "step": 16790 }, { "epoch": 0.5660790724325053, "grad_norm": 33.99382019042969, "learning_rate": 4.7195428810706224e-07, "logits/chosen": -1.4129993915557861, "logits/rejected": -1.655768632888794, "logps/chosen": -2.2138402462005615, "logps/rejected": -2.304614782333374, "loss": 3.2053, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.138404846191406, "rewards/margins": 0.9077442288398743, "rewards/rejected": -23.046146392822266, "step": 16795 }, { "epoch": 0.5662475985034885, "grad_norm": 25.1485595703125, "learning_rate": 4.7166062146847593e-07, "logits/chosen": -1.9000380039215088, "logits/rejected": -1.7676162719726562, "logps/chosen": -2.2173843383789062, "logps/rejected": -2.3960306644439697, "loss": 3.0706, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.173843383789062, "rewards/margins": 1.7864621877670288, "rewards/rejected": -23.96030616760254, "step": 16800 }, { "epoch": 0.5662475985034885, "eval_logits/chosen": -1.9103525876998901, "eval_logits/rejected": -2.0533409118652344, "eval_logps/chosen": -2.1100523471832275, "eval_logps/rejected": -2.24176025390625, "eval_loss": 2.994614362716675, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -21.100522994995117, "eval_rewards/margins": 1.3170799016952515, "eval_rewards/rejected": -22.417604446411133, "eval_runtime": 12.9176, "eval_samples_per_second": 7.741, "eval_steps_per_second": 1.935, "step": 16800 }, { "epoch": 0.5664161245744717, "grad_norm": 11.869410514831543, "learning_rate": 4.713669646370321e-07, "logits/chosen": -0.9209138751029968, "logits/rejected": -1.0028040409088135, "logps/chosen": -1.5366700887680054, "logps/rejected": -1.610845923423767, "loss": 3.0163, "rewards/accuracies": 0.5, "rewards/chosen": -15.366701126098633, "rewards/margins": 0.7417588233947754, "rewards/rejected": -16.10845947265625, "step": 16805 }, { "epoch": 0.5665846506454548, "grad_norm": 41.69317626953125, "learning_rate": 4.7107331771435366e-07, "logits/chosen": -1.6482641696929932, "logits/rejected": -1.9954522848129272, "logps/chosen": -2.4083595275878906, "logps/rejected": -2.967284679412842, "loss": 3.5173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.08359718322754, "rewards/margins": 5.5892534255981445, "rewards/rejected": -29.6728515625, "step": 16810 }, { "epoch": 0.566753176716438, "grad_norm": 23.685644149780273, "learning_rate": 4.7077968080206025e-07, "logits/chosen": -1.785162329673767, "logits/rejected": -1.7933508157730103, "logps/chosen": -2.0810201168060303, "logps/rejected": -2.196655511856079, "loss": 2.9084, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.81020164489746, "rewards/margins": 1.1563531160354614, "rewards/rejected": -21.966556549072266, "step": 16815 }, { "epoch": 0.5669217027874213, "grad_norm": 170.0150909423828, "learning_rate": 4.7048605400176835e-07, "logits/chosen": -2.2140371799468994, "logits/rejected": -1.9532238245010376, "logps/chosen": -2.8031115531921387, "logps/rejected": -3.2334632873535156, "loss": 3.6139, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.031116485595703, "rewards/margins": 4.303518295288086, "rewards/rejected": -32.334632873535156, "step": 16820 }, { "epoch": 0.5670902288584044, "grad_norm": 23.4686279296875, "learning_rate": 4.701924374150901e-07, "logits/chosen": -1.7128368616104126, "logits/rejected": -1.8601795434951782, "logps/chosen": -2.399580478668213, "logps/rejected": -2.2932586669921875, "loss": 5.407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.995803833007812, "rewards/margins": -1.063218116760254, "rewards/rejected": -22.932588577270508, "step": 16825 }, { "epoch": 0.5672587549293876, "grad_norm": 215.03001403808594, "learning_rate": 4.6989883114363486e-07, "logits/chosen": -1.6193939447402954, "logits/rejected": -2.1604743003845215, "logps/chosen": -2.8643462657928467, "logps/rejected": -3.4826483726501465, "loss": 3.6061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.643463134765625, "rewards/margins": 6.183023452758789, "rewards/rejected": -34.82648468017578, "step": 16830 }, { "epoch": 0.5674272810003708, "grad_norm": 42.07413864135742, "learning_rate": 4.6960523528900823e-07, "logits/chosen": -1.6678909063339233, "logits/rejected": -1.6212352514266968, "logps/chosen": -1.9012730121612549, "logps/rejected": -2.26922869682312, "loss": 2.7834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.01272964477539, "rewards/margins": 3.679558277130127, "rewards/rejected": -22.692289352416992, "step": 16835 }, { "epoch": 0.5675958070713539, "grad_norm": 13.32239818572998, "learning_rate": 4.693116499528124e-07, "logits/chosen": -1.7592246532440186, "logits/rejected": -2.169157028198242, "logps/chosen": -2.433260917663574, "logps/rejected": -3.041790246963501, "loss": 1.4877, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.33260726928711, "rewards/margins": 6.085291862487793, "rewards/rejected": -30.41790199279785, "step": 16840 }, { "epoch": 0.5677643331423371, "grad_norm": 31.183002471923828, "learning_rate": 4.690180752366453e-07, "logits/chosen": -1.2726290225982666, "logits/rejected": -1.692091703414917, "logps/chosen": -2.0409367084503174, "logps/rejected": -2.078146457672119, "loss": 2.8352, "rewards/accuracies": 0.5, "rewards/chosen": -20.40936851501465, "rewards/margins": 0.3720945417881012, "rewards/rejected": -20.781463623046875, "step": 16845 }, { "epoch": 0.5679328592133203, "grad_norm": 25.570552825927734, "learning_rate": 4.687245112421016e-07, "logits/chosen": -1.5729811191558838, "logits/rejected": -1.711248755455017, "logps/chosen": -2.6292476654052734, "logps/rejected": -2.7394912242889404, "loss": 2.8007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.292476654052734, "rewards/margins": 1.1024351119995117, "rewards/rejected": -27.394912719726562, "step": 16850 }, { "epoch": 0.5681013852843034, "grad_norm": 32.55588150024414, "learning_rate": 4.684309580707727e-07, "logits/chosen": -1.56328547000885, "logits/rejected": -1.5660221576690674, "logps/chosen": -3.4068398475646973, "logps/rejected": -4.1345391273498535, "loss": 2.0818, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.068397521972656, "rewards/margins": 7.276989936828613, "rewards/rejected": -41.34539031982422, "step": 16855 }, { "epoch": 0.5682699113552867, "grad_norm": 2.4072084426879883, "learning_rate": 4.681374158242451e-07, "logits/chosen": -1.4371076822280884, "logits/rejected": -1.7409675121307373, "logps/chosen": -2.604935884475708, "logps/rejected": -3.037245273590088, "loss": 2.1005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.049358367919922, "rewards/margins": 4.32309103012085, "rewards/rejected": -30.372451782226562, "step": 16860 }, { "epoch": 0.5684384374262699, "grad_norm": 30.55173110961914, "learning_rate": 4.6784388460410257e-07, "logits/chosen": -1.604943037033081, "logits/rejected": -1.6222620010375977, "logps/chosen": -2.298088788986206, "logps/rejected": -1.992645263671875, "loss": 6.1099, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.98088836669922, "rewards/margins": -3.0544333457946777, "rewards/rejected": -19.926454544067383, "step": 16865 }, { "epoch": 0.568606963497253, "grad_norm": 21.035503387451172, "learning_rate": 4.675503645119247e-07, "logits/chosen": -1.6293731927871704, "logits/rejected": -1.6573808193206787, "logps/chosen": -2.3817954063415527, "logps/rejected": -2.569385528564453, "loss": 2.4276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.817955017089844, "rewards/margins": 1.8758999109268188, "rewards/rejected": -25.6938533782959, "step": 16870 }, { "epoch": 0.5687754895682362, "grad_norm": 42.433319091796875, "learning_rate": 4.672568556492873e-07, "logits/chosen": -0.7304778695106506, "logits/rejected": -0.9662677645683289, "logps/chosen": -2.424504041671753, "logps/rejected": -3.062227725982666, "loss": 1.7874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.24504280090332, "rewards/margins": 6.377236843109131, "rewards/rejected": -30.622278213500977, "step": 16875 }, { "epoch": 0.5689440156392194, "grad_norm": 30.08279037475586, "learning_rate": 4.669633581177621e-07, "logits/chosen": -1.5956250429153442, "logits/rejected": -1.4057915210723877, "logps/chosen": -1.5880109071731567, "logps/rejected": -1.5237188339233398, "loss": 3.6982, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -15.880106925964355, "rewards/margins": -0.6429195404052734, "rewards/rejected": -15.237188339233398, "step": 16880 }, { "epoch": 0.5691125417102025, "grad_norm": 32.80216979980469, "learning_rate": 4.66669872018917e-07, "logits/chosen": -1.4292566776275635, "logits/rejected": -1.4082731008529663, "logps/chosen": -2.8172738552093506, "logps/rejected": -3.159550189971924, "loss": 1.5456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.172739028930664, "rewards/margins": 3.4227638244628906, "rewards/rejected": -31.595500946044922, "step": 16885 }, { "epoch": 0.5692810677811857, "grad_norm": 31.284208297729492, "learning_rate": 4.6637639745431626e-07, "logits/chosen": -0.8667623400688171, "logits/rejected": -0.9213277697563171, "logps/chosen": -1.9619200229644775, "logps/rejected": -2.0677216053009033, "loss": 2.7827, "rewards/accuracies": 0.5, "rewards/chosen": -19.619197845458984, "rewards/margins": 1.0580158233642578, "rewards/rejected": -20.677213668823242, "step": 16890 }, { "epoch": 0.569449593852169, "grad_norm": 23.00290870666504, "learning_rate": 4.6608293452551947e-07, "logits/chosen": -1.3267605304718018, "logits/rejected": -1.2977806329727173, "logps/chosen": -2.193504810333252, "logps/rejected": -2.4723658561706543, "loss": 2.3626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.935049057006836, "rewards/margins": 2.78861403465271, "rewards/rejected": -24.723661422729492, "step": 16895 }, { "epoch": 0.5696181199231521, "grad_norm": 66.30839538574219, "learning_rate": 4.657894833340827e-07, "logits/chosen": -1.3002276420593262, "logits/rejected": -1.927890419960022, "logps/chosen": -2.195603609085083, "logps/rejected": -2.5019192695617676, "loss": 2.504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.95603370666504, "rewards/margins": 3.0631580352783203, "rewards/rejected": -25.01919174194336, "step": 16900 }, { "epoch": 0.5697866459941353, "grad_norm": 30.63758087158203, "learning_rate": 4.654960439815581e-07, "logits/chosen": -1.4829597473144531, "logits/rejected": -1.4372055530548096, "logps/chosen": -1.5658553838729858, "logps/rejected": -1.7043907642364502, "loss": 2.202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.658554077148438, "rewards/margins": 1.3853533267974854, "rewards/rejected": -17.043907165527344, "step": 16905 }, { "epoch": 0.5699551720651185, "grad_norm": 25.67203712463379, "learning_rate": 4.6520261656949315e-07, "logits/chosen": -1.7035623788833618, "logits/rejected": -1.4969885349273682, "logps/chosen": -2.9192066192626953, "logps/rejected": -3.232933759689331, "loss": 5.5975, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.192066192626953, "rewards/margins": 3.1372737884521484, "rewards/rejected": -32.32933807373047, "step": 16910 }, { "epoch": 0.5701236981361016, "grad_norm": 33.323753356933594, "learning_rate": 4.649092011994316e-07, "logits/chosen": -1.6091026067733765, "logits/rejected": -1.7056491374969482, "logps/chosen": -2.0236144065856934, "logps/rejected": -2.0775866508483887, "loss": 3.0293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.23614501953125, "rewards/margins": 0.5397213697433472, "rewards/rejected": -20.77586555480957, "step": 16915 }, { "epoch": 0.5702922242070848, "grad_norm": 145.63626098632812, "learning_rate": 4.64615797972913e-07, "logits/chosen": -1.4785640239715576, "logits/rejected": -1.8875732421875, "logps/chosen": -2.968642234802246, "logps/rejected": -3.1860897541046143, "loss": 4.7265, "rewards/accuracies": 0.5, "rewards/chosen": -29.686426162719727, "rewards/margins": 2.174473285675049, "rewards/rejected": -31.860897064208984, "step": 16920 }, { "epoch": 0.570460750278068, "grad_norm": 48.70518112182617, "learning_rate": 4.6432240699147283e-07, "logits/chosen": -1.6808608770370483, "logits/rejected": -1.7317768335342407, "logps/chosen": -3.0904078483581543, "logps/rejected": -3.425537586212158, "loss": 2.7118, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.904077529907227, "rewards/margins": 3.3512985706329346, "rewards/rejected": -34.25537872314453, "step": 16925 }, { "epoch": 0.5706292763490513, "grad_norm": 23.636653900146484, "learning_rate": 4.6402902835664177e-07, "logits/chosen": -1.3742854595184326, "logits/rejected": -1.3601338863372803, "logps/chosen": -2.2360167503356934, "logps/rejected": -2.3575634956359863, "loss": 3.1559, "rewards/accuracies": 0.5, "rewards/chosen": -22.36016845703125, "rewards/margins": 1.215468168258667, "rewards/rejected": -23.57563591003418, "step": 16930 }, { "epoch": 0.5707978024200344, "grad_norm": 4.566060543060303, "learning_rate": 4.637356621699468e-07, "logits/chosen": -1.5993291139602661, "logits/rejected": -1.9258677959442139, "logps/chosen": -2.4710073471069336, "logps/rejected": -3.1647419929504395, "loss": 2.4363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.710071563720703, "rewards/margins": 6.937350273132324, "rewards/rejected": -31.647424697875977, "step": 16935 }, { "epoch": 0.5709663284910176, "grad_norm": 29.32889175415039, "learning_rate": 4.634423085329105e-07, "logits/chosen": -1.786794900894165, "logits/rejected": -1.8577169179916382, "logps/chosen": -2.4155313968658447, "logps/rejected": -2.431154489517212, "loss": 3.2679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.155315399169922, "rewards/margins": 0.15623120963573456, "rewards/rejected": -24.31154441833496, "step": 16940 }, { "epoch": 0.5711348545620007, "grad_norm": 25.315120697021484, "learning_rate": 4.6314896754705075e-07, "logits/chosen": -1.314682960510254, "logits/rejected": -1.5629719495773315, "logps/chosen": -1.8124099969863892, "logps/rejected": -1.908725380897522, "loss": 2.6222, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.124099731445312, "rewards/margins": 0.9631543159484863, "rewards/rejected": -19.08725357055664, "step": 16945 }, { "epoch": 0.5713033806329839, "grad_norm": 13.031963348388672, "learning_rate": 4.628556393138815e-07, "logits/chosen": -1.0483187437057495, "logits/rejected": -1.5672765970230103, "logps/chosen": -2.2765183448791504, "logps/rejected": -2.5370960235595703, "loss": 2.0186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.765186309814453, "rewards/margins": 2.6057753562927246, "rewards/rejected": -25.370960235595703, "step": 16950 }, { "epoch": 0.5714719067039671, "grad_norm": 20.017946243286133, "learning_rate": 4.625623239349121e-07, "logits/chosen": -1.6982698440551758, "logits/rejected": -1.9008452892303467, "logps/chosen": -2.2406089305877686, "logps/rejected": -2.3991425037384033, "loss": 2.4108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.406091690063477, "rewards/margins": 1.585334062576294, "rewards/rejected": -23.991424560546875, "step": 16955 }, { "epoch": 0.5716404327749502, "grad_norm": 62.65876388549805, "learning_rate": 4.622690215116475e-07, "logits/chosen": -1.2825062274932861, "logits/rejected": -1.6194877624511719, "logps/chosen": -1.9833399057388306, "logps/rejected": -2.9209718704223633, "loss": 1.831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.833398818969727, "rewards/margins": 9.37631893157959, "rewards/rejected": -29.209716796875, "step": 16960 }, { "epoch": 0.5718089588459334, "grad_norm": 2.7115726470947266, "learning_rate": 4.619757321455879e-07, "logits/chosen": -1.9648278951644897, "logits/rejected": -1.9568313360214233, "logps/chosen": -2.6452038288116455, "logps/rejected": -3.012327194213867, "loss": 2.7066, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.452037811279297, "rewards/margins": 3.671234130859375, "rewards/rejected": -30.123271942138672, "step": 16965 }, { "epoch": 0.5719774849169167, "grad_norm": 0.24442708492279053, "learning_rate": 4.6168245593822923e-07, "logits/chosen": -1.2114921808242798, "logits/rejected": -1.5997109413146973, "logps/chosen": -1.999610185623169, "logps/rejected": -2.6512880325317383, "loss": 1.4446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.996103286743164, "rewards/margins": 6.516776084899902, "rewards/rejected": -26.51287841796875, "step": 16970 }, { "epoch": 0.5721460109878999, "grad_norm": 9.06457233428955, "learning_rate": 4.613891929910632e-07, "logits/chosen": -1.558680772781372, "logits/rejected": -1.8719091415405273, "logps/chosen": -2.5277926921844482, "logps/rejected": -3.1365044116973877, "loss": 2.7746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.277929306030273, "rewards/margins": 6.0871148109436035, "rewards/rejected": -31.365041732788086, "step": 16975 }, { "epoch": 0.572314537058883, "grad_norm": 7.560523509979248, "learning_rate": 4.61095943405576e-07, "logits/chosen": -1.4988155364990234, "logits/rejected": -1.532820463180542, "logps/chosen": -1.8676990270614624, "logps/rejected": -2.1474297046661377, "loss": 2.0479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.676990509033203, "rewards/margins": 2.7973062992095947, "rewards/rejected": -21.47429656982422, "step": 16980 }, { "epoch": 0.5724830631298662, "grad_norm": 58.599769592285156, "learning_rate": 4.6080270728325006e-07, "logits/chosen": -1.50767183303833, "logits/rejected": -1.8170439004898071, "logps/chosen": -2.332676887512207, "logps/rejected": -2.4412198066711426, "loss": 2.7629, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.326770782470703, "rewards/margins": 1.0854309797286987, "rewards/rejected": -24.412199020385742, "step": 16985 }, { "epoch": 0.5726515892008494, "grad_norm": 13.395484924316406, "learning_rate": 4.605094847255628e-07, "logits/chosen": -1.3667609691619873, "logits/rejected": -1.3412193059921265, "logps/chosen": -1.8395763635635376, "logps/rejected": -2.1235289573669434, "loss": 2.2519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.395763397216797, "rewards/margins": 2.839524745941162, "rewards/rejected": -21.235288619995117, "step": 16990 }, { "epoch": 0.5728201152718325, "grad_norm": 10.290194511413574, "learning_rate": 4.602162758339873e-07, "logits/chosen": -1.4792242050170898, "logits/rejected": -2.0029921531677246, "logps/chosen": -2.0362067222595215, "logps/rejected": -2.3818199634552, "loss": 1.0937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.36206817626953, "rewards/margins": 3.4561309814453125, "rewards/rejected": -23.818199157714844, "step": 16995 }, { "epoch": 0.5729886413428157, "grad_norm": 33.30671310424805, "learning_rate": 4.5992308070999096e-07, "logits/chosen": -1.5765219926834106, "logits/rejected": -2.062494993209839, "logps/chosen": -1.8226219415664673, "logps/rejected": -2.2019715309143066, "loss": 2.1542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.226221084594727, "rewards/margins": 3.7934963703155518, "rewards/rejected": -22.019716262817383, "step": 17000 }, { "epoch": 0.573157167413799, "grad_norm": 23.92595672607422, "learning_rate": 4.596298994550375e-07, "logits/chosen": -1.568203330039978, "logits/rejected": -1.6613171100616455, "logps/chosen": -2.349910259246826, "logps/rejected": -2.626659870147705, "loss": 2.1305, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.499103546142578, "rewards/margins": 2.7674946784973145, "rewards/rejected": -26.2665958404541, "step": 17005 }, { "epoch": 0.5733256934847821, "grad_norm": 82.73783111572266, "learning_rate": 4.5933673217058543e-07, "logits/chosen": -1.2131386995315552, "logits/rejected": -1.4366130828857422, "logps/chosen": -2.282435655593872, "logps/rejected": -2.7135918140411377, "loss": 3.3078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.824357986450195, "rewards/margins": 4.311557769775391, "rewards/rejected": -27.135913848876953, "step": 17010 }, { "epoch": 0.5734942195557653, "grad_norm": 35.42367172241211, "learning_rate": 4.5904357895808815e-07, "logits/chosen": -1.7238378524780273, "logits/rejected": -1.8969109058380127, "logps/chosen": -1.975142478942871, "logps/rejected": -2.1562931537628174, "loss": 2.2837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.751422882080078, "rewards/margins": 1.8115079402923584, "rewards/rejected": -21.56293296813965, "step": 17015 }, { "epoch": 0.5736627456267485, "grad_norm": 26.47319221496582, "learning_rate": 4.587504399189946e-07, "logits/chosen": -1.3628904819488525, "logits/rejected": -1.342140793800354, "logps/chosen": -1.874319314956665, "logps/rejected": -2.0152149200439453, "loss": 1.8363, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.74319076538086, "rewards/margins": 1.4089562892913818, "rewards/rejected": -20.152149200439453, "step": 17020 }, { "epoch": 0.5738312716977316, "grad_norm": 15.671334266662598, "learning_rate": 4.5845731515474873e-07, "logits/chosen": -1.347197413444519, "logits/rejected": -1.512352466583252, "logps/chosen": -1.7438786029815674, "logps/rejected": -2.090334415435791, "loss": 1.4458, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.438785552978516, "rewards/margins": 3.464556932449341, "rewards/rejected": -20.903343200683594, "step": 17025 }, { "epoch": 0.5739997977687148, "grad_norm": 33.49119186401367, "learning_rate": 4.5816420476678936e-07, "logits/chosen": -1.4836363792419434, "logits/rejected": -1.5364410877227783, "logps/chosen": -2.810619831085205, "logps/rejected": -2.9169609546661377, "loss": 3.2122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.106197357177734, "rewards/margins": 1.0634124279022217, "rewards/rejected": -29.16961097717285, "step": 17030 }, { "epoch": 0.574168323839698, "grad_norm": 27.717208862304688, "learning_rate": 4.578711088565504e-07, "logits/chosen": -1.3634545803070068, "logits/rejected": -1.412527322769165, "logps/chosen": -2.6148791313171387, "logps/rejected": -3.1877834796905518, "loss": 1.6711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.148792266845703, "rewards/margins": 5.7290449142456055, "rewards/rejected": -31.87783432006836, "step": 17035 }, { "epoch": 0.5743368499106812, "grad_norm": 0.033766914159059525, "learning_rate": 4.5757802752546074e-07, "logits/chosen": -1.450412631034851, "logits/rejected": -1.698773980140686, "logps/chosen": -2.0109925270080566, "logps/rejected": -2.916412115097046, "loss": 1.2433, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.109926223754883, "rewards/margins": 9.054194450378418, "rewards/rejected": -29.164119720458984, "step": 17040 }, { "epoch": 0.5745053759816644, "grad_norm": 8.274916648864746, "learning_rate": 4.572849608749447e-07, "logits/chosen": -1.4016960859298706, "logits/rejected": -1.663762092590332, "logps/chosen": -2.1348116397857666, "logps/rejected": -2.4447596073150635, "loss": 1.6618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.34811782836914, "rewards/margins": 3.099478006362915, "rewards/rejected": -24.447593688964844, "step": 17045 }, { "epoch": 0.5746739020526476, "grad_norm": 23.06145668029785, "learning_rate": 4.5699190900642057e-07, "logits/chosen": -1.5405018329620361, "logits/rejected": -1.792536973953247, "logps/chosen": -2.386561632156372, "logps/rejected": -2.670846462249756, "loss": 2.5033, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.865615844726562, "rewards/margins": 2.8428475856781006, "rewards/rejected": -26.70846176147461, "step": 17050 }, { "epoch": 0.5748424281236307, "grad_norm": 12.915722846984863, "learning_rate": 4.566988720213023e-07, "logits/chosen": -1.1587172746658325, "logits/rejected": -1.3730021715164185, "logps/chosen": -1.8425954580307007, "logps/rejected": -2.190491199493408, "loss": 3.0152, "rewards/accuracies": 0.5, "rewards/chosen": -18.425954818725586, "rewards/margins": 3.478956699371338, "rewards/rejected": -21.904911041259766, "step": 17055 }, { "epoch": 0.5750109541946139, "grad_norm": 20.196578979492188, "learning_rate": 4.5640585002099835e-07, "logits/chosen": -1.5783436298370361, "logits/rejected": -1.5012297630310059, "logps/chosen": -2.2923500537872314, "logps/rejected": -2.4385783672332764, "loss": 3.1821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.923500061035156, "rewards/margins": 1.4622838497161865, "rewards/rejected": -24.385784149169922, "step": 17060 }, { "epoch": 0.5751794802655971, "grad_norm": 16.635955810546875, "learning_rate": 4.5611284310691246e-07, "logits/chosen": -1.9602220058441162, "logits/rejected": -2.0683677196502686, "logps/chosen": -2.7638659477233887, "logps/rejected": -3.1422085762023926, "loss": 1.454, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.638662338256836, "rewards/margins": 3.7834243774414062, "rewards/rejected": -31.422088623046875, "step": 17065 }, { "epoch": 0.5753480063365802, "grad_norm": 20.59967613220215, "learning_rate": 4.558198513804422e-07, "logits/chosen": -1.8842146396636963, "logits/rejected": -1.9829429388046265, "logps/chosen": -2.4420864582061768, "logps/rejected": -2.790311336517334, "loss": 1.5725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.420862197875977, "rewards/margins": 3.482252836227417, "rewards/rejected": -27.90311622619629, "step": 17070 }, { "epoch": 0.5755165324075634, "grad_norm": 20.192537307739258, "learning_rate": 4.555268749429808e-07, "logits/chosen": -1.5222409963607788, "logits/rejected": -1.757651925086975, "logps/chosen": -2.3623366355895996, "logps/rejected": -2.4851298332214355, "loss": 4.5759, "rewards/accuracies": 0.5, "rewards/chosen": -23.623369216918945, "rewards/margins": 1.2279298305511475, "rewards/rejected": -24.851299285888672, "step": 17075 }, { "epoch": 0.5756850584785467, "grad_norm": 32.318233489990234, "learning_rate": 4.5523391389591595e-07, "logits/chosen": -1.657088041305542, "logits/rejected": -1.7773571014404297, "logps/chosen": -2.460700273513794, "logps/rejected": -2.5693631172180176, "loss": 2.61, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.60700035095215, "rewards/margins": 1.0866304636001587, "rewards/rejected": -25.69363021850586, "step": 17080 }, { "epoch": 0.5758535845495298, "grad_norm": 26.118494033813477, "learning_rate": 4.5494096834062963e-07, "logits/chosen": -0.7557857632637024, "logits/rejected": -0.8230286836624146, "logps/chosen": -2.1196682453155518, "logps/rejected": -2.166259288787842, "loss": 3.5991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.19668197631836, "rewards/margins": 0.46591252088546753, "rewards/rejected": -21.662593841552734, "step": 17085 }, { "epoch": 0.576022110620513, "grad_norm": 0.6773899793624878, "learning_rate": 4.5464803837849904e-07, "logits/chosen": -1.4761030673980713, "logits/rejected": -1.5611333847045898, "logps/chosen": -2.6379261016845703, "logps/rejected": -2.967628240585327, "loss": 3.1823, "rewards/accuracies": 0.5, "rewards/chosen": -26.379261016845703, "rewards/margins": 3.2970237731933594, "rewards/rejected": -29.676280975341797, "step": 17090 }, { "epoch": 0.5761906366914962, "grad_norm": 82.37804412841797, "learning_rate": 4.5435512411089545e-07, "logits/chosen": -1.3212661743164062, "logits/rejected": -2.0065817832946777, "logps/chosen": -1.9370911121368408, "logps/rejected": -2.3542609214782715, "loss": 1.4699, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.37091064453125, "rewards/margins": 4.17169713973999, "rewards/rejected": -23.542606353759766, "step": 17095 }, { "epoch": 0.5763591627624793, "grad_norm": 34.555946350097656, "learning_rate": 4.5406222563918515e-07, "logits/chosen": -1.3918941020965576, "logits/rejected": -1.291394591331482, "logps/chosen": -2.2668352127075195, "logps/rejected": -2.3042871952056885, "loss": 3.7059, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.668350219726562, "rewards/margins": 0.37451982498168945, "rewards/rejected": -23.042869567871094, "step": 17100 }, { "epoch": 0.5765276888334625, "grad_norm": 35.844886779785156, "learning_rate": 4.537693430647286e-07, "logits/chosen": -1.9387611150741577, "logits/rejected": -2.1475517749786377, "logps/chosen": -2.0147666931152344, "logps/rejected": -2.07336163520813, "loss": 2.7322, "rewards/accuracies": 0.5, "rewards/chosen": -20.147666931152344, "rewards/margins": 0.585949718952179, "rewards/rejected": -20.73361587524414, "step": 17105 }, { "epoch": 0.5766962149044457, "grad_norm": 132.61204528808594, "learning_rate": 4.53476476488881e-07, "logits/chosen": -1.348154067993164, "logits/rejected": -1.716099739074707, "logps/chosen": -2.2341794967651367, "logps/rejected": -2.8116281032562256, "loss": 1.878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.341794967651367, "rewards/margins": 5.774487495422363, "rewards/rejected": -28.116281509399414, "step": 17110 }, { "epoch": 0.576864740975429, "grad_norm": 37.44850158691406, "learning_rate": 4.5318362601299217e-07, "logits/chosen": -1.4237921237945557, "logits/rejected": -1.8257821798324585, "logps/chosen": -1.934027910232544, "logps/rejected": -2.2331252098083496, "loss": 1.3484, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.340280532836914, "rewards/margins": 2.990973711013794, "rewards/rejected": -22.331253051757812, "step": 17115 }, { "epoch": 0.5770332670464121, "grad_norm": 42.07789611816406, "learning_rate": 4.528907917384056e-07, "logits/chosen": -1.6937841176986694, "logits/rejected": -1.7856550216674805, "logps/chosen": -2.3766939640045166, "logps/rejected": -2.406951427459717, "loss": 3.1782, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.766937255859375, "rewards/margins": 0.3025781512260437, "rewards/rejected": -24.069515228271484, "step": 17120 }, { "epoch": 0.5772017931173953, "grad_norm": 20.366586685180664, "learning_rate": 4.5259797376646007e-07, "logits/chosen": -1.8341726064682007, "logits/rejected": -2.335084915161133, "logps/chosen": -2.359943151473999, "logps/rejected": -3.0176966190338135, "loss": 1.9511, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.599430084228516, "rewards/margins": 6.577535152435303, "rewards/rejected": -30.176965713500977, "step": 17125 }, { "epoch": 0.5773703191883784, "grad_norm": 71.93550109863281, "learning_rate": 4.5230517219848816e-07, "logits/chosen": -1.4756571054458618, "logits/rejected": -1.6808812618255615, "logps/chosen": -2.351409912109375, "logps/rejected": -2.6855571269989014, "loss": 1.1408, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.514097213745117, "rewards/margins": 3.341470241546631, "rewards/rejected": -26.85556983947754, "step": 17130 }, { "epoch": 0.5775388452593616, "grad_norm": 21.230552673339844, "learning_rate": 4.5201238713581735e-07, "logits/chosen": -1.8363711833953857, "logits/rejected": -1.81638503074646, "logps/chosen": -2.2280547618865967, "logps/rejected": -2.2871644496917725, "loss": 2.8284, "rewards/accuracies": 0.5, "rewards/chosen": -22.280548095703125, "rewards/margins": 0.5910956263542175, "rewards/rejected": -22.87164306640625, "step": 17135 }, { "epoch": 0.5777073713303448, "grad_norm": 31.255447387695312, "learning_rate": 4.5171961867976847e-07, "logits/chosen": -1.5670968294143677, "logits/rejected": -2.0457816123962402, "logps/chosen": -2.262490749359131, "logps/rejected": -2.6034095287323, "loss": 2.4048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.624908447265625, "rewards/margins": 3.409191131591797, "rewards/rejected": -26.034099578857422, "step": 17140 }, { "epoch": 0.5778758974013279, "grad_norm": 43.904598236083984, "learning_rate": 4.5142686693165744e-07, "logits/chosen": -1.659224510192871, "logits/rejected": -1.6053335666656494, "logps/chosen": -2.29905366897583, "logps/rejected": -2.3424086570739746, "loss": 3.2108, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.990535736083984, "rewards/margins": 0.43354931473731995, "rewards/rejected": -23.424083709716797, "step": 17145 }, { "epoch": 0.5780444234723112, "grad_norm": 21.321529388427734, "learning_rate": 4.511341319927941e-07, "logits/chosen": -1.8803768157958984, "logits/rejected": -1.943765640258789, "logps/chosen": -2.151759386062622, "logps/rejected": -2.35597562789917, "loss": 3.3356, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.517593383789062, "rewards/margins": 2.0421626567840576, "rewards/rejected": -23.559757232666016, "step": 17150 }, { "epoch": 0.5782129495432944, "grad_norm": 55.82296371459961, "learning_rate": 4.5084141396448245e-07, "logits/chosen": -1.3013948202133179, "logits/rejected": -1.8239634037017822, "logps/chosen": -1.8853652477264404, "logps/rejected": -2.176085948944092, "loss": 2.8686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.853652954101562, "rewards/margins": 2.907205581665039, "rewards/rejected": -21.7608585357666, "step": 17155 }, { "epoch": 0.5783814756142776, "grad_norm": 21.96685218811035, "learning_rate": 4.5054871294802056e-07, "logits/chosen": -1.6689783334732056, "logits/rejected": -1.7678687572479248, "logps/chosen": -2.287881374359131, "logps/rejected": -2.6410670280456543, "loss": 2.5601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.87881088256836, "rewards/margins": 3.531856060028076, "rewards/rejected": -26.410669326782227, "step": 17160 }, { "epoch": 0.5785500016852607, "grad_norm": 87.77336120605469, "learning_rate": 4.5025602904470084e-07, "logits/chosen": -1.5644152164459229, "logits/rejected": -1.750683069229126, "logps/chosen": -2.5638442039489746, "logps/rejected": -2.700592041015625, "loss": 3.1935, "rewards/accuracies": 0.5, "rewards/chosen": -25.638439178466797, "rewards/margins": 1.367479681968689, "rewards/rejected": -27.00592041015625, "step": 17165 }, { "epoch": 0.5787185277562439, "grad_norm": 22.31219482421875, "learning_rate": 4.499633623558097e-07, "logits/chosen": -2.0407156944274902, "logits/rejected": -1.9517043828964233, "logps/chosen": -2.0538718700408936, "logps/rejected": -2.209723949432373, "loss": 3.3678, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.538721084594727, "rewards/margins": 1.5585215091705322, "rewards/rejected": -22.097240447998047, "step": 17170 }, { "epoch": 0.578887053827227, "grad_norm": 24.899978637695312, "learning_rate": 4.496707129826274e-07, "logits/chosen": -1.3175891637802124, "logits/rejected": -1.6883060932159424, "logps/chosen": -2.261355400085449, "logps/rejected": -2.381133556365967, "loss": 2.5608, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.61355209350586, "rewards/margins": 1.1977792978286743, "rewards/rejected": -23.81133460998535, "step": 17175 }, { "epoch": 0.5790555798982102, "grad_norm": 18.139432907104492, "learning_rate": 4.493780810264284e-07, "logits/chosen": -1.407244324684143, "logits/rejected": -1.4826358556747437, "logps/chosen": -2.506412982940674, "logps/rejected": -2.7521300315856934, "loss": 1.9552, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.064128875732422, "rewards/margins": 2.4571704864501953, "rewards/rejected": -27.521297454833984, "step": 17180 }, { "epoch": 0.5792241059691934, "grad_norm": 27.345720291137695, "learning_rate": 4.490854665884814e-07, "logits/chosen": -1.4379069805145264, "logits/rejected": -1.525801420211792, "logps/chosen": -2.407961368560791, "logps/rejected": -2.475123167037964, "loss": 3.0396, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.07961654663086, "rewards/margins": 0.6716176271438599, "rewards/rejected": -24.751232147216797, "step": 17185 }, { "epoch": 0.5793926320401767, "grad_norm": 22.395584106445312, "learning_rate": 4.487928697700482e-07, "logits/chosen": -1.871355652809143, "logits/rejected": -1.9694700241088867, "logps/chosen": -2.5732643604278564, "logps/rejected": -3.7709145545959473, "loss": 1.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.732641220092773, "rewards/margins": 11.976503372192383, "rewards/rejected": -37.709144592285156, "step": 17190 }, { "epoch": 0.5795611581111598, "grad_norm": 99.64684295654297, "learning_rate": 4.4850029067238536e-07, "logits/chosen": -1.2232414484024048, "logits/rejected": -1.756906270980835, "logps/chosen": -2.6735825538635254, "logps/rejected": -3.592395782470703, "loss": 1.9555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.735824584960938, "rewards/margins": 9.18813419342041, "rewards/rejected": -35.92395782470703, "step": 17195 }, { "epoch": 0.579729684182143, "grad_norm": 20.795007705688477, "learning_rate": 4.4820772939674286e-07, "logits/chosen": -1.6001487970352173, "logits/rejected": -1.7138382196426392, "logps/chosen": -2.831948757171631, "logps/rejected": -3.1137797832489014, "loss": 3.152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.31948471069336, "rewards/margins": 2.8183116912841797, "rewards/rejected": -31.137798309326172, "step": 17200 }, { "epoch": 0.579729684182143, "eval_logits/chosen": -1.9627069234848022, "eval_logits/rejected": -2.10943603515625, "eval_logps/chosen": -2.1293704509735107, "eval_logps/rejected": -2.267233371734619, "eval_loss": 2.9915850162506104, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.2937068939209, "eval_rewards/margins": 1.3786267042160034, "eval_rewards/rejected": -22.672330856323242, "eval_runtime": 12.9271, "eval_samples_per_second": 7.736, "eval_steps_per_second": 1.934, "step": 17200 }, { "epoch": 0.5798982102531262, "grad_norm": 20.8897647857666, "learning_rate": 4.479151860443649e-07, "logits/chosen": -2.012810468673706, "logits/rejected": -2.060838222503662, "logps/chosen": -2.4459307193756104, "logps/rejected": -2.3485682010650635, "loss": 4.6796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.459306716918945, "rewards/margins": -0.9736261367797852, "rewards/rejected": -23.485681533813477, "step": 17205 }, { "epoch": 0.5800667363241093, "grad_norm": 43.36587905883789, "learning_rate": 4.476226607164888e-07, "logits/chosen": -2.007704973220825, "logits/rejected": -1.7056808471679688, "logps/chosen": -2.324647903442383, "logps/rejected": -2.256413459777832, "loss": 4.0325, "rewards/accuracies": 0.5, "rewards/chosen": -23.246479034423828, "rewards/margins": -0.6823431849479675, "rewards/rejected": -22.564136505126953, "step": 17210 }, { "epoch": 0.5802352623950925, "grad_norm": 18.71867561340332, "learning_rate": 4.473301535143462e-07, "logits/chosen": -1.7911930084228516, "logits/rejected": -1.8705055713653564, "logps/chosen": -1.8644154071807861, "logps/rejected": -1.8933528661727905, "loss": 3.1285, "rewards/accuracies": 0.5, "rewards/chosen": -18.644153594970703, "rewards/margins": 0.2893770635128021, "rewards/rejected": -18.933528900146484, "step": 17215 }, { "epoch": 0.5804037884660757, "grad_norm": 30.68587875366211, "learning_rate": 4.4703766453916263e-07, "logits/chosen": -1.858590841293335, "logits/rejected": -2.0414671897888184, "logps/chosen": -2.1605629920959473, "logps/rejected": -2.3674232959747314, "loss": 3.1348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.605627059936523, "rewards/margins": 2.068603515625, "rewards/rejected": -23.674230575561523, "step": 17220 }, { "epoch": 0.5805723145370589, "grad_norm": 31.98077964782715, "learning_rate": 4.467451938921565e-07, "logits/chosen": -1.209812879562378, "logits/rejected": -1.4913972616195679, "logps/chosen": -1.8544597625732422, "logps/rejected": -2.0227410793304443, "loss": 3.0677, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.544597625732422, "rewards/margins": 1.682814359664917, "rewards/rejected": -20.227413177490234, "step": 17225 }, { "epoch": 0.5807408406080421, "grad_norm": 56.92514419555664, "learning_rate": 4.4645274167454053e-07, "logits/chosen": -1.7451435327529907, "logits/rejected": -1.7276217937469482, "logps/chosen": -2.22159743309021, "logps/rejected": -2.313671112060547, "loss": 3.6201, "rewards/accuracies": 0.5, "rewards/chosen": -22.215974807739258, "rewards/margins": 0.9207379221916199, "rewards/rejected": -23.136709213256836, "step": 17230 }, { "epoch": 0.5809093666790253, "grad_norm": 30.251081466674805, "learning_rate": 4.4616030798752106e-07, "logits/chosen": -1.287561297416687, "logits/rejected": -1.501903772354126, "logps/chosen": -2.222918748855591, "logps/rejected": -2.593648910522461, "loss": 1.8023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.22918701171875, "rewards/margins": 3.7073047161102295, "rewards/rejected": -25.93648910522461, "step": 17235 }, { "epoch": 0.5810778927500084, "grad_norm": 28.843341827392578, "learning_rate": 4.458678929322979e-07, "logits/chosen": -1.5157232284545898, "logits/rejected": -1.330273985862732, "logps/chosen": -3.0622832775115967, "logps/rejected": -3.581799268722534, "loss": 2.1884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.622833251953125, "rewards/margins": 5.195162296295166, "rewards/rejected": -35.8179931640625, "step": 17240 }, { "epoch": 0.5812464188209916, "grad_norm": 14.910189628601074, "learning_rate": 4.4557549661006417e-07, "logits/chosen": -2.0796093940734863, "logits/rejected": -2.4145195484161377, "logps/chosen": -2.002706527709961, "logps/rejected": -2.060037612915039, "loss": 3.3767, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.027063369750977, "rewards/margins": 0.5733121037483215, "rewards/rejected": -20.60037612915039, "step": 17245 }, { "epoch": 0.5814149448919748, "grad_norm": 5.407770156860352, "learning_rate": 4.4528311912200685e-07, "logits/chosen": -1.4320753812789917, "logits/rejected": -1.643031120300293, "logps/chosen": -2.594674587249756, "logps/rejected": -3.0666213035583496, "loss": 1.6656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.946746826171875, "rewards/margins": 4.719465255737305, "rewards/rejected": -30.666210174560547, "step": 17250 }, { "epoch": 0.5815834709629579, "grad_norm": 294.2926940917969, "learning_rate": 4.449907605693064e-07, "logits/chosen": -1.483459711074829, "logits/rejected": -1.3290231227874756, "logps/chosen": -2.4088211059570312, "logps/rejected": -2.4589452743530273, "loss": 3.8198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.088211059570312, "rewards/margins": 0.501244843006134, "rewards/rejected": -24.589452743530273, "step": 17255 }, { "epoch": 0.5817519970339412, "grad_norm": 20.795665740966797, "learning_rate": 4.446984210531363e-07, "logits/chosen": -1.3401178121566772, "logits/rejected": -1.5118831396102905, "logps/chosen": -1.9169412851333618, "logps/rejected": -1.932276964187622, "loss": 3.2381, "rewards/accuracies": 0.5, "rewards/chosen": -19.169414520263672, "rewards/margins": 0.1533549278974533, "rewards/rejected": -19.322769165039062, "step": 17260 }, { "epoch": 0.5819205231049244, "grad_norm": 7.758377552032471, "learning_rate": 4.444061006746638e-07, "logits/chosen": -1.9944626092910767, "logits/rejected": -2.3274025917053223, "logps/chosen": -2.7576446533203125, "logps/rejected": -3.061986207962036, "loss": 2.1118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.57644271850586, "rewards/margins": 3.043414354324341, "rewards/rejected": -30.619861602783203, "step": 17265 }, { "epoch": 0.5820890491759075, "grad_norm": 18.494991302490234, "learning_rate": 4.441137995350496e-07, "logits/chosen": -1.7012603282928467, "logits/rejected": -1.7783534526824951, "logps/chosen": -2.5444867610931396, "logps/rejected": -2.4805233478546143, "loss": 5.1853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.444866180419922, "rewards/margins": -0.6396337747573853, "rewards/rejected": -24.805233001708984, "step": 17270 }, { "epoch": 0.5822575752468907, "grad_norm": 51.310508728027344, "learning_rate": 4.438215177354477e-07, "logits/chosen": -1.655430555343628, "logits/rejected": -1.7902733087539673, "logps/chosen": -1.914145827293396, "logps/rejected": -1.9054285287857056, "loss": 3.3195, "rewards/accuracies": 0.5, "rewards/chosen": -19.141460418701172, "rewards/margins": -0.08717469871044159, "rewards/rejected": -19.054285049438477, "step": 17275 }, { "epoch": 0.5824261013178739, "grad_norm": 22.34128189086914, "learning_rate": 4.43529255377005e-07, "logits/chosen": -1.6213443279266357, "logits/rejected": -1.6654713153839111, "logps/chosen": -2.4206910133361816, "logps/rejected": -2.515307903289795, "loss": 3.7372, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.206905364990234, "rewards/margins": 0.9461703300476074, "rewards/rejected": -25.153076171875, "step": 17280 }, { "epoch": 0.582594627388857, "grad_norm": 27.15727424621582, "learning_rate": 4.432370125608622e-07, "logits/chosen": -1.858704924583435, "logits/rejected": -2.0804834365844727, "logps/chosen": -1.887465238571167, "logps/rejected": -2.244636058807373, "loss": 1.7039, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.874652862548828, "rewards/margins": 3.5717086791992188, "rewards/rejected": -22.446361541748047, "step": 17285 }, { "epoch": 0.5827631534598402, "grad_norm": 50.979434967041016, "learning_rate": 4.429447893881531e-07, "logits/chosen": -1.3986759185791016, "logits/rejected": -1.5457799434661865, "logps/chosen": -1.9006513357162476, "logps/rejected": -1.94772207736969, "loss": 3.0563, "rewards/accuracies": 0.5, "rewards/chosen": -19.006513595581055, "rewards/margins": 0.4707070291042328, "rewards/rejected": -19.47722053527832, "step": 17290 }, { "epoch": 0.5829316795308234, "grad_norm": 43.54460525512695, "learning_rate": 4.4265258596000434e-07, "logits/chosen": -1.9640693664550781, "logits/rejected": -2.306492567062378, "logps/chosen": -2.4300239086151123, "logps/rejected": -3.905445098876953, "loss": 1.4328, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.30023956298828, "rewards/margins": 14.75421142578125, "rewards/rejected": -39.05445098876953, "step": 17295 }, { "epoch": 0.5831002056018066, "grad_norm": 17.482149124145508, "learning_rate": 4.423604023775361e-07, "logits/chosen": -1.5966932773590088, "logits/rejected": -1.7289857864379883, "logps/chosen": -3.195779323577881, "logps/rejected": -3.491218090057373, "loss": 2.7372, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.957794189453125, "rewards/margins": 2.9543869495391846, "rewards/rejected": -34.91218185424805, "step": 17300 }, { "epoch": 0.5832687316727898, "grad_norm": 47.70451354980469, "learning_rate": 4.4206823874186184e-07, "logits/chosen": -1.1667314767837524, "logits/rejected": -1.2670345306396484, "logps/chosen": -3.233905076980591, "logps/rejected": -3.1098103523254395, "loss": 5.1382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.339054107666016, "rewards/margins": -1.2409486770629883, "rewards/rejected": -31.098102569580078, "step": 17305 }, { "epoch": 0.583437257743773, "grad_norm": 42.176513671875, "learning_rate": 4.4177609515408773e-07, "logits/chosen": -2.0141780376434326, "logits/rejected": -2.237900495529175, "logps/chosen": -2.178071975708008, "logps/rejected": -2.326620101928711, "loss": 4.262, "rewards/accuracies": 0.5, "rewards/chosen": -21.780719757080078, "rewards/margins": 1.4854825735092163, "rewards/rejected": -23.26620101928711, "step": 17310 }, { "epoch": 0.5836057838147561, "grad_norm": 37.14058303833008, "learning_rate": 4.414839717153131e-07, "logits/chosen": -1.9146333932876587, "logits/rejected": -2.0396671295166016, "logps/chosen": -2.8835787773132324, "logps/rejected": -2.882293701171875, "loss": 4.3341, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.83578872680664, "rewards/margins": -0.012850761413574219, "rewards/rejected": -28.82293701171875, "step": 17315 }, { "epoch": 0.5837743098857393, "grad_norm": 35.45816421508789, "learning_rate": 4.411918685266304e-07, "logits/chosen": -1.8805458545684814, "logits/rejected": -1.6515352725982666, "logps/chosen": -2.736062526702881, "logps/rejected": -2.856140613555908, "loss": 3.9732, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.360626220703125, "rewards/margins": 1.200781226158142, "rewards/rejected": -28.5614070892334, "step": 17320 }, { "epoch": 0.5839428359567225, "grad_norm": 38.65712356567383, "learning_rate": 4.408997856891253e-07, "logits/chosen": -1.8081859350204468, "logits/rejected": -2.07198166847229, "logps/chosen": -2.44268536567688, "logps/rejected": -3.3284401893615723, "loss": 2.2041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.42685317993164, "rewards/margins": 8.85754680633545, "rewards/rejected": -33.284400939941406, "step": 17325 }, { "epoch": 0.5841113620277056, "grad_norm": 15.86058235168457, "learning_rate": 4.4060772330387564e-07, "logits/chosen": -1.563691258430481, "logits/rejected": -1.9738092422485352, "logps/chosen": -2.079322099685669, "logps/rejected": -2.628232717514038, "loss": 1.3509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.793222427368164, "rewards/margins": 5.489107131958008, "rewards/rejected": -26.28232765197754, "step": 17330 }, { "epoch": 0.5842798880986889, "grad_norm": 16.25583839416504, "learning_rate": 4.40315681471953e-07, "logits/chosen": -1.6212646961212158, "logits/rejected": -1.8121631145477295, "logps/chosen": -2.2856125831604004, "logps/rejected": -2.2472023963928223, "loss": 3.7451, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.85612678527832, "rewards/margins": -0.38410359621047974, "rewards/rejected": -22.47202491760254, "step": 17335 }, { "epoch": 0.5844484141696721, "grad_norm": 29.76826286315918, "learning_rate": 4.4002366029442154e-07, "logits/chosen": -1.1239495277404785, "logits/rejected": -1.1791913509368896, "logps/chosen": -3.060885190963745, "logps/rejected": -3.046093702316284, "loss": 4.3675, "rewards/accuracies": 0.5, "rewards/chosen": -30.608850479125977, "rewards/margins": -0.14791660010814667, "rewards/rejected": -30.4609375, "step": 17340 }, { "epoch": 0.5846169402406552, "grad_norm": 20.629085540771484, "learning_rate": 4.397316598723385e-07, "logits/chosen": -1.5575644969940186, "logits/rejected": -2.092625141143799, "logps/chosen": -2.773283004760742, "logps/rejected": -3.0996828079223633, "loss": 2.2424, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.732830047607422, "rewards/margins": 3.2639968395233154, "rewards/rejected": -30.996826171875, "step": 17345 }, { "epoch": 0.5847854663116384, "grad_norm": 18.53481101989746, "learning_rate": 4.394396803067533e-07, "logits/chosen": -1.792412519454956, "logits/rejected": -1.9561207294464111, "logps/chosen": -2.1911630630493164, "logps/rejected": -2.5339317321777344, "loss": 3.261, "rewards/accuracies": 0.5, "rewards/chosen": -21.911630630493164, "rewards/margins": 3.4276881217956543, "rewards/rejected": -25.339317321777344, "step": 17350 }, { "epoch": 0.5849539923826216, "grad_norm": 9.772783279418945, "learning_rate": 4.391477216987088e-07, "logits/chosen": -0.737585723400116, "logits/rejected": -1.3849313259124756, "logps/chosen": -2.2479326725006104, "logps/rejected": -3.033356189727783, "loss": 2.179, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.479326248168945, "rewards/margins": 7.8542375564575195, "rewards/rejected": -30.33356285095215, "step": 17355 }, { "epoch": 0.5851225184536047, "grad_norm": 35.8283805847168, "learning_rate": 4.3885578414924054e-07, "logits/chosen": -1.527430534362793, "logits/rejected": -1.5946701765060425, "logps/chosen": -2.0074801445007324, "logps/rejected": -1.9396559000015259, "loss": 3.876, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.07480239868164, "rewards/margins": -0.6782447695732117, "rewards/rejected": -19.396560668945312, "step": 17360 }, { "epoch": 0.5852910445245879, "grad_norm": 44.97746658325195, "learning_rate": 4.385638677593761e-07, "logits/chosen": -1.3549509048461914, "logits/rejected": -1.7077839374542236, "logps/chosen": -2.1543612480163574, "logps/rejected": -2.287923574447632, "loss": 2.8444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.543611526489258, "rewards/margins": 1.3356250524520874, "rewards/rejected": -22.87923812866211, "step": 17365 }, { "epoch": 0.5854595705955712, "grad_norm": 40.57670974731445, "learning_rate": 4.382719726301366e-07, "logits/chosen": -0.7811486124992371, "logits/rejected": -1.128101110458374, "logps/chosen": -2.0559234619140625, "logps/rejected": -2.356846332550049, "loss": 1.8674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.559234619140625, "rewards/margins": 3.0092263221740723, "rewards/rejected": -23.568462371826172, "step": 17370 }, { "epoch": 0.5856280966665544, "grad_norm": 23.4588565826416, "learning_rate": 4.379800988625354e-07, "logits/chosen": -1.4437754154205322, "logits/rejected": -1.4596506357192993, "logps/chosen": -1.832118272781372, "logps/rejected": -1.9595096111297607, "loss": 2.1277, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.321184158325195, "rewards/margins": 1.273911952972412, "rewards/rejected": -19.595096588134766, "step": 17375 }, { "epoch": 0.5857966227375375, "grad_norm": 68.99215698242188, "learning_rate": 4.3768824655757854e-07, "logits/chosen": -1.3112378120422363, "logits/rejected": -1.148902416229248, "logps/chosen": -2.7348835468292236, "logps/rejected": -2.5241928100585938, "loss": 5.5747, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.348834991455078, "rewards/margins": -2.1069068908691406, "rewards/rejected": -25.241928100585938, "step": 17380 }, { "epoch": 0.5859651488085207, "grad_norm": 2.611574649810791, "learning_rate": 4.3739641581626453e-07, "logits/chosen": -1.5041404962539673, "logits/rejected": -1.6578476428985596, "logps/chosen": -2.2694530487060547, "logps/rejected": -2.3508238792419434, "loss": 3.3058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.694530487060547, "rewards/margins": 0.8137068748474121, "rewards/rejected": -23.508237838745117, "step": 17385 }, { "epoch": 0.5861336748795039, "grad_norm": 28.102005004882812, "learning_rate": 4.371046067395846e-07, "logits/chosen": -1.4805330038070679, "logits/rejected": -1.919136643409729, "logps/chosen": -1.7919280529022217, "logps/rejected": -1.934385061264038, "loss": 2.1956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.919281005859375, "rewards/margins": 1.4245693683624268, "rewards/rejected": -19.34385108947754, "step": 17390 }, { "epoch": 0.586302200950487, "grad_norm": 78.70321655273438, "learning_rate": 4.368128194285223e-07, "logits/chosen": -1.7666633129119873, "logits/rejected": -1.8986902236938477, "logps/chosen": -2.514099597930908, "logps/rejected": -2.6970717906951904, "loss": 2.7538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.140995025634766, "rewards/margins": 1.829724669456482, "rewards/rejected": -26.970718383789062, "step": 17395 }, { "epoch": 0.5864707270214702, "grad_norm": 33.610618591308594, "learning_rate": 4.365210539840536e-07, "logits/chosen": -1.8147218227386475, "logits/rejected": -1.7239938974380493, "logps/chosen": -1.9119583368301392, "logps/rejected": -1.9474995136260986, "loss": 3.2428, "rewards/accuracies": 0.5, "rewards/chosen": -19.119583129882812, "rewards/margins": 0.3554133474826813, "rewards/rejected": -19.47499656677246, "step": 17400 }, { "epoch": 0.5866392530924533, "grad_norm": 78.48345947265625, "learning_rate": 4.3622931050714713e-07, "logits/chosen": -1.8521192073822021, "logits/rejected": -1.9285533428192139, "logps/chosen": -2.5423624515533447, "logps/rejected": -2.8399410247802734, "loss": 2.6434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.42362403869629, "rewards/margins": 2.9757864475250244, "rewards/rejected": -28.399410247802734, "step": 17405 }, { "epoch": 0.5868077791634366, "grad_norm": 25.898334503173828, "learning_rate": 4.359375890987641e-07, "logits/chosen": -1.4780672788619995, "logits/rejected": -1.896118402481079, "logps/chosen": -2.5103867053985596, "logps/rejected": -3.384552001953125, "loss": 1.8391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.103864669799805, "rewards/margins": 8.741653442382812, "rewards/rejected": -33.84552001953125, "step": 17410 }, { "epoch": 0.5869763052344198, "grad_norm": 102.30123901367188, "learning_rate": 4.356458898598572e-07, "logits/chosen": -1.2664697170257568, "logits/rejected": -1.3566185235977173, "logps/chosen": -2.1262001991271973, "logps/rejected": -2.1818442344665527, "loss": 3.5012, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.26200294494629, "rewards/margins": 0.5564396977424622, "rewards/rejected": -21.81844139099121, "step": 17415 }, { "epoch": 0.587144831305403, "grad_norm": 192.08316040039062, "learning_rate": 4.353542128913725e-07, "logits/chosen": -1.8345438241958618, "logits/rejected": -1.7843061685562134, "logps/chosen": -3.290046215057373, "logps/rejected": -2.989392042160034, "loss": 7.0941, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -32.90045928955078, "rewards/margins": -3.0065383911132812, "rewards/rejected": -29.8939208984375, "step": 17420 }, { "epoch": 0.5873133573763861, "grad_norm": 25.507287979125977, "learning_rate": 4.350625582942477e-07, "logits/chosen": -1.751320481300354, "logits/rejected": -1.5903288125991821, "logps/chosen": -2.034891366958618, "logps/rejected": -2.0643208026885986, "loss": 3.1321, "rewards/accuracies": 0.5, "rewards/chosen": -20.34891128540039, "rewards/margins": 0.2942947745323181, "rewards/rejected": -20.643207550048828, "step": 17425 }, { "epoch": 0.5874818834473693, "grad_norm": 51.03654479980469, "learning_rate": 4.347709261694133e-07, "logits/chosen": -0.9774719476699829, "logits/rejected": -1.4587715864181519, "logps/chosen": -2.1797828674316406, "logps/rejected": -2.532813549041748, "loss": 2.059, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.797828674316406, "rewards/margins": 3.530308485031128, "rewards/rejected": -25.328136444091797, "step": 17430 }, { "epoch": 0.5876504095183525, "grad_norm": 13.80114459991455, "learning_rate": 4.3447931661779117e-07, "logits/chosen": -2.0016977787017822, "logits/rejected": -2.1547093391418457, "logps/chosen": -2.693709135055542, "logps/rejected": -2.9496166706085205, "loss": 3.2403, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.937091827392578, "rewards/margins": 2.559072732925415, "rewards/rejected": -29.496164321899414, "step": 17435 }, { "epoch": 0.5878189355893356, "grad_norm": 26.799381256103516, "learning_rate": 4.341877297402962e-07, "logits/chosen": -1.6604926586151123, "logits/rejected": -2.074387550354004, "logps/chosen": -1.9511470794677734, "logps/rejected": -2.075636386871338, "loss": 2.371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.5114688873291, "rewards/margins": 1.2448922395706177, "rewards/rejected": -20.756362915039062, "step": 17440 }, { "epoch": 0.5879874616603189, "grad_norm": 20.89389419555664, "learning_rate": 4.3389616563783513e-07, "logits/chosen": -1.4348245859146118, "logits/rejected": -1.3038320541381836, "logps/chosen": -2.3064918518066406, "logps/rejected": -2.6267943382263184, "loss": 3.0722, "rewards/accuracies": 0.5, "rewards/chosen": -23.064918518066406, "rewards/margins": 3.203024387359619, "rewards/rejected": -26.2679443359375, "step": 17445 }, { "epoch": 0.5881559877313021, "grad_norm": 63.85792541503906, "learning_rate": 4.336046244113066e-07, "logits/chosen": -1.6739234924316406, "logits/rejected": -1.5325616598129272, "logps/chosen": -1.8348830938339233, "logps/rejected": -1.812511682510376, "loss": 3.3974, "rewards/accuracies": 0.5, "rewards/chosen": -18.348831176757812, "rewards/margins": -0.2237166464328766, "rewards/rejected": -18.1251163482666, "step": 17450 }, { "epoch": 0.5883245138022852, "grad_norm": 24.068666458129883, "learning_rate": 4.3331310616160187e-07, "logits/chosen": -1.8047908544540405, "logits/rejected": -1.758195161819458, "logps/chosen": -1.8868213891983032, "logps/rejected": -1.9505535364151, "loss": 2.784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.868213653564453, "rewards/margins": 0.6373217701911926, "rewards/rejected": -19.505535125732422, "step": 17455 }, { "epoch": 0.5884930398732684, "grad_norm": 0.08875492960214615, "learning_rate": 4.3302161098960364e-07, "logits/chosen": -1.6345468759536743, "logits/rejected": -1.958866834640503, "logps/chosen": -2.1410791873931885, "logps/rejected": -3.067662000656128, "loss": 2.0759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.41079330444336, "rewards/margins": 9.265830039978027, "rewards/rejected": -30.676620483398438, "step": 17460 }, { "epoch": 0.5886615659442516, "grad_norm": 3.284158083260991e-05, "learning_rate": 4.3273013899618704e-07, "logits/chosen": -1.3332509994506836, "logits/rejected": -1.8917903900146484, "logps/chosen": -2.107168436050415, "logps/rejected": -2.826239824295044, "loss": 1.5995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.07168197631836, "rewards/margins": 7.1907148361206055, "rewards/rejected": -28.262399673461914, "step": 17465 }, { "epoch": 0.5888300920152347, "grad_norm": 44.276790618896484, "learning_rate": 4.32438690282219e-07, "logits/chosen": -1.041285753250122, "logits/rejected": -1.167011022567749, "logps/chosen": -2.1292452812194824, "logps/rejected": -2.432764768600464, "loss": 2.8508, "rewards/accuracies": 0.5, "rewards/chosen": -21.292451858520508, "rewards/margins": 3.0351970195770264, "rewards/rejected": -24.327648162841797, "step": 17470 }, { "epoch": 0.5889986180862179, "grad_norm": 0.5383897423744202, "learning_rate": 4.3214726494855836e-07, "logits/chosen": -1.7013810873031616, "logits/rejected": -1.6997826099395752, "logps/chosen": -3.0948708057403564, "logps/rejected": -3.3002877235412598, "loss": 4.2099, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.948705673217773, "rewards/margins": 2.0541675090789795, "rewards/rejected": -33.00287628173828, "step": 17475 }, { "epoch": 0.5891671441572012, "grad_norm": 19.071176528930664, "learning_rate": 4.3185586309605627e-07, "logits/chosen": -1.5275566577911377, "logits/rejected": -1.7071269750595093, "logps/chosen": -2.2619528770446777, "logps/rejected": -2.693704128265381, "loss": 2.9347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.619525909423828, "rewards/margins": 4.317513465881348, "rewards/rejected": -26.937042236328125, "step": 17480 }, { "epoch": 0.5893356702281843, "grad_norm": 36.96125411987305, "learning_rate": 4.3156448482555494e-07, "logits/chosen": -1.653538465499878, "logits/rejected": -1.7368927001953125, "logps/chosen": -2.780684232711792, "logps/rejected": -2.837693214416504, "loss": 2.7991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.80684471130371, "rewards/margins": 0.5700892210006714, "rewards/rejected": -28.376934051513672, "step": 17485 }, { "epoch": 0.5895041962991675, "grad_norm": 66.32109069824219, "learning_rate": 4.312731302378892e-07, "logits/chosen": -1.5482505559921265, "logits/rejected": -1.755061149597168, "logps/chosen": -2.361372470855713, "logps/rejected": -2.75365948677063, "loss": 1.4334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.613723754882812, "rewards/margins": 3.922870635986328, "rewards/rejected": -27.536596298217773, "step": 17490 }, { "epoch": 0.5896727223701507, "grad_norm": 29.832361221313477, "learning_rate": 4.3098179943388535e-07, "logits/chosen": -1.2624939680099487, "logits/rejected": -1.3678514957427979, "logps/chosen": -2.529294729232788, "logps/rejected": -2.463853120803833, "loss": 5.1863, "rewards/accuracies": 0.5, "rewards/chosen": -25.29294776916504, "rewards/margins": -0.6544168591499329, "rewards/rejected": -24.638530731201172, "step": 17495 }, { "epoch": 0.5898412484411338, "grad_norm": 32.630340576171875, "learning_rate": 4.3069049251436175e-07, "logits/chosen": -1.7602989673614502, "logits/rejected": -1.858923316001892, "logps/chosen": -2.544949769973755, "logps/rejected": -2.550809860229492, "loss": 5.0199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.44949722290039, "rewards/margins": 0.05859985202550888, "rewards/rejected": -25.508098602294922, "step": 17500 }, { "epoch": 0.590009774512117, "grad_norm": 22.349571228027344, "learning_rate": 4.3039920958012776e-07, "logits/chosen": -1.661318063735962, "logits/rejected": -1.627806305885315, "logps/chosen": -2.583085298538208, "logps/rejected": -2.57246470451355, "loss": 3.8321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.830852508544922, "rewards/margins": -0.10620470345020294, "rewards/rejected": -25.724645614624023, "step": 17505 }, { "epoch": 0.5901783005831002, "grad_norm": 68.3171615600586, "learning_rate": 4.3010795073198513e-07, "logits/chosen": -1.728459119796753, "logits/rejected": -1.9351695775985718, "logps/chosen": -2.076179265975952, "logps/rejected": -2.5930678844451904, "loss": 2.3447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.761795043945312, "rewards/margins": 5.168886661529541, "rewards/rejected": -25.930679321289062, "step": 17510 }, { "epoch": 0.5903468266540833, "grad_norm": 27.479597091674805, "learning_rate": 4.2981671607072727e-07, "logits/chosen": -1.640201210975647, "logits/rejected": -1.7427374124526978, "logps/chosen": -2.0141711235046387, "logps/rejected": -2.28794527053833, "loss": 1.9525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.141712188720703, "rewards/margins": 2.7377383708953857, "rewards/rejected": -22.87944984436035, "step": 17515 }, { "epoch": 0.5905153527250666, "grad_norm": 37.086692810058594, "learning_rate": 4.295255056971389e-07, "logits/chosen": -1.4790493249893188, "logits/rejected": -1.347656488418579, "logps/chosen": -2.7744202613830566, "logps/rejected": -2.695765972137451, "loss": 5.654, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.74420166015625, "rewards/margins": -0.7865422368049622, "rewards/rejected": -26.957660675048828, "step": 17520 }, { "epoch": 0.5906838787960498, "grad_norm": 16.76278305053711, "learning_rate": 4.2923431971199624e-07, "logits/chosen": -1.0991032123565674, "logits/rejected": -1.166424036026001, "logps/chosen": -2.3206911087036133, "logps/rejected": -2.3980796337127686, "loss": 2.8055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.2069091796875, "rewards/margins": 0.7738858461380005, "rewards/rejected": -23.980796813964844, "step": 17525 }, { "epoch": 0.5908524048670329, "grad_norm": 17.811054229736328, "learning_rate": 4.289431582160675e-07, "logits/chosen": -1.578426718711853, "logits/rejected": -2.2804975509643555, "logps/chosen": -2.642085313796997, "logps/rejected": -3.4870471954345703, "loss": 2.2679, "rewards/accuracies": 0.5, "rewards/chosen": -26.420848846435547, "rewards/margins": 8.449621200561523, "rewards/rejected": -34.87046813964844, "step": 17530 }, { "epoch": 0.5910209309380161, "grad_norm": 24.69849967956543, "learning_rate": 4.286520213101123e-07, "logits/chosen": -1.7663204669952393, "logits/rejected": -1.9045088291168213, "logps/chosen": -2.673767566680908, "logps/rejected": -2.9358999729156494, "loss": 2.1756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.7376766204834, "rewards/margins": 2.621324300765991, "rewards/rejected": -29.358999252319336, "step": 17535 }, { "epoch": 0.5911894570089993, "grad_norm": 53.869384765625, "learning_rate": 4.283609090948814e-07, "logits/chosen": -1.6090246438980103, "logits/rejected": -1.5867881774902344, "logps/chosen": -2.0779833793640137, "logps/rejected": -1.9043381214141846, "loss": 4.9391, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.779834747314453, "rewards/margins": -1.7364532947540283, "rewards/rejected": -19.043380737304688, "step": 17540 }, { "epoch": 0.5913579830799824, "grad_norm": 14.778437614440918, "learning_rate": 4.280698216711174e-07, "logits/chosen": -1.898503303527832, "logits/rejected": -2.0935044288635254, "logps/chosen": -2.9049651622772217, "logps/rejected": -3.0754752159118652, "loss": 2.9957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.049652099609375, "rewards/margins": 1.7050997018814087, "rewards/rejected": -30.754751205444336, "step": 17545 }, { "epoch": 0.5915265091509656, "grad_norm": 60.79536056518555, "learning_rate": 4.2777875913955443e-07, "logits/chosen": -1.1955629587173462, "logits/rejected": -1.308807134628296, "logps/chosen": -2.1304657459259033, "logps/rejected": -2.615813732147217, "loss": 2.2048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.304656982421875, "rewards/margins": 4.853476047515869, "rewards/rejected": -26.158132553100586, "step": 17550 }, { "epoch": 0.5916950352219489, "grad_norm": 35.70866775512695, "learning_rate": 4.2748772160091727e-07, "logits/chosen": -2.0288426876068115, "logits/rejected": -2.134122610092163, "logps/chosen": -2.0519161224365234, "logps/rejected": -2.507661819458008, "loss": 3.2737, "rewards/accuracies": 0.5, "rewards/chosen": -20.519161224365234, "rewards/margins": 4.557457447052002, "rewards/rejected": -25.076618194580078, "step": 17555 }, { "epoch": 0.591863561292932, "grad_norm": 26.37740707397461, "learning_rate": 4.271967091559228e-07, "logits/chosen": -1.6266578435897827, "logits/rejected": -1.8625901937484741, "logps/chosen": -1.8834251165390015, "logps/rejected": -1.9434534311294556, "loss": 3.822, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.834253311157227, "rewards/margins": 0.6002805829048157, "rewards/rejected": -19.434532165527344, "step": 17560 }, { "epoch": 0.5920320873639152, "grad_norm": 20.472694396972656, "learning_rate": 4.2690572190527895e-07, "logits/chosen": -1.5904171466827393, "logits/rejected": -2.1096081733703613, "logps/chosen": -2.228877305984497, "logps/rejected": -2.7297446727752686, "loss": 1.0736, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.28877067565918, "rewards/margins": 5.008677005767822, "rewards/rejected": -27.297449111938477, "step": 17565 }, { "epoch": 0.5922006134348984, "grad_norm": 34.900054931640625, "learning_rate": 4.266147599496852e-07, "logits/chosen": -1.158190131187439, "logits/rejected": -1.2331039905548096, "logps/chosen": -2.0225605964660645, "logps/rejected": -2.1574673652648926, "loss": 2.6277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.22560691833496, "rewards/margins": 1.349064588546753, "rewards/rejected": -21.57467269897461, "step": 17570 }, { "epoch": 0.5923691395058815, "grad_norm": 54.93463897705078, "learning_rate": 4.2632382338983153e-07, "logits/chosen": -1.7310314178466797, "logits/rejected": -1.6517353057861328, "logps/chosen": -2.441493511199951, "logps/rejected": -2.7141823768615723, "loss": 3.6634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.414932250976562, "rewards/margins": 2.7268924713134766, "rewards/rejected": -27.14182472229004, "step": 17575 }, { "epoch": 0.5925376655768647, "grad_norm": 43.11684799194336, "learning_rate": 4.2603291232639984e-07, "logits/chosen": -1.8522008657455444, "logits/rejected": -1.6992623805999756, "logps/chosen": -2.5552072525024414, "logps/rejected": -2.281377077102661, "loss": 5.8346, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.552072525024414, "rewards/margins": -2.7383031845092773, "rewards/rejected": -22.813770294189453, "step": 17580 }, { "epoch": 0.5927061916478479, "grad_norm": 16.700607299804688, "learning_rate": 4.257420268600632e-07, "logits/chosen": -1.6553192138671875, "logits/rejected": -1.7701327800750732, "logps/chosen": -3.3446273803710938, "logps/rejected": -4.0541839599609375, "loss": 1.9073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.44627380371094, "rewards/margins": 7.095564365386963, "rewards/rejected": -40.541839599609375, "step": 17585 }, { "epoch": 0.5928747177188312, "grad_norm": 25.058446884155273, "learning_rate": 4.2545116709148526e-07, "logits/chosen": -1.351576328277588, "logits/rejected": -1.4393280744552612, "logps/chosen": -1.845435380935669, "logps/rejected": -1.8508222103118896, "loss": 3.1528, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.4543514251709, "rewards/margins": 0.05386962741613388, "rewards/rejected": -18.508222579956055, "step": 17590 }, { "epoch": 0.5930432437898143, "grad_norm": 40.690330505371094, "learning_rate": 4.251603331213213e-07, "logits/chosen": -0.8327827453613281, "logits/rejected": -1.0143380165100098, "logps/chosen": -2.506579875946045, "logps/rejected": -2.4614579677581787, "loss": 3.6507, "rewards/accuracies": 0.5, "rewards/chosen": -25.065799713134766, "rewards/margins": -0.451219379901886, "rewards/rejected": -24.614582061767578, "step": 17595 }, { "epoch": 0.5932117698607975, "grad_norm": 37.29527282714844, "learning_rate": 4.248695250502174e-07, "logits/chosen": -1.3765560388565063, "logits/rejected": -1.6383224725723267, "logps/chosen": -2.117464780807495, "logps/rejected": -2.629913330078125, "loss": 1.8856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.17464828491211, "rewards/margins": 5.124485969543457, "rewards/rejected": -26.29913330078125, "step": 17600 }, { "epoch": 0.5932117698607975, "eval_logits/chosen": -1.9636667966842651, "eval_logits/rejected": -2.110840082168579, "eval_logps/chosen": -2.127274513244629, "eval_logps/rejected": -2.264631509780884, "eval_loss": 2.9846508502960205, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.272747039794922, "eval_rewards/margins": 1.3735666275024414, "eval_rewards/rejected": -22.646312713623047, "eval_runtime": 12.9237, "eval_samples_per_second": 7.738, "eval_steps_per_second": 1.934, "step": 17600 }, { "epoch": 0.5933802959317807, "grad_norm": 19.281713485717773, "learning_rate": 4.2457874297881105e-07, "logits/chosen": -1.4907379150390625, "logits/rejected": -1.6906015872955322, "logps/chosen": -2.0307023525238037, "logps/rejected": -2.305128574371338, "loss": 2.3951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.307024002075195, "rewards/margins": 2.744263172149658, "rewards/rejected": -23.051288604736328, "step": 17605 }, { "epoch": 0.5935488220027638, "grad_norm": 32.606544494628906, "learning_rate": 4.242879870077301e-07, "logits/chosen": -1.3404357433319092, "logits/rejected": -1.6816742420196533, "logps/chosen": -2.534003734588623, "logps/rejected": -2.823920726776123, "loss": 2.3159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.340038299560547, "rewards/margins": 2.899167060852051, "rewards/rejected": -28.23920249938965, "step": 17610 }, { "epoch": 0.593717348073747, "grad_norm": 21.681140899658203, "learning_rate": 4.2399725723759395e-07, "logits/chosen": -1.5766531229019165, "logits/rejected": -2.337092399597168, "logps/chosen": -2.5060973167419434, "logps/rejected": -2.8640053272247314, "loss": 1.2001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.060977935791016, "rewards/margins": 3.5790741443634033, "rewards/rejected": -28.64004898071289, "step": 17615 }, { "epoch": 0.5938858741447302, "grad_norm": 30.393402099609375, "learning_rate": 4.2370655376901286e-07, "logits/chosen": -1.6135174036026, "logits/rejected": -1.6738744974136353, "logps/chosen": -2.3300719261169434, "logps/rejected": -2.5740814208984375, "loss": 1.3831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.30072021484375, "rewards/margins": 2.4400906562805176, "rewards/rejected": -25.740814208984375, "step": 17620 }, { "epoch": 0.5940544002157133, "grad_norm": 48.35916519165039, "learning_rate": 4.234158767025875e-07, "logits/chosen": -1.544013261795044, "logits/rejected": -1.8595256805419922, "logps/chosen": -2.1459286212921143, "logps/rejected": -2.489976406097412, "loss": 1.4423, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.459285736083984, "rewards/margins": 3.4404807090759277, "rewards/rejected": -24.89976692199707, "step": 17625 }, { "epoch": 0.5942229262866966, "grad_norm": 18.79473876953125, "learning_rate": 4.231252261389099e-07, "logits/chosen": -1.719613790512085, "logits/rejected": -1.609312653541565, "logps/chosen": -2.18891978263855, "logps/rejected": -2.1651229858398438, "loss": 3.4594, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.88920021057129, "rewards/margins": -0.23796892166137695, "rewards/rejected": -21.651229858398438, "step": 17630 }, { "epoch": 0.5943914523576798, "grad_norm": 35.13705062866211, "learning_rate": 4.2283460217856275e-07, "logits/chosen": -1.4112998247146606, "logits/rejected": -1.5887477397918701, "logps/chosen": -2.176684856414795, "logps/rejected": -2.39996337890625, "loss": 2.3311, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.766849517822266, "rewards/margins": 2.232788562774658, "rewards/rejected": -23.999637603759766, "step": 17635 }, { "epoch": 0.5945599784286629, "grad_norm": 41.7555046081543, "learning_rate": 4.225440049221198e-07, "logits/chosen": -1.7434478998184204, "logits/rejected": -1.6766493320465088, "logps/chosen": -1.836517333984375, "logps/rejected": -1.7924884557724, "loss": 3.6239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.365171432495117, "rewards/margins": -0.4402883052825928, "rewards/rejected": -17.924884796142578, "step": 17640 }, { "epoch": 0.5947285044996461, "grad_norm": 21.95073890686035, "learning_rate": 4.222534344701449e-07, "logits/chosen": -1.4925669431686401, "logits/rejected": -1.6031299829483032, "logps/chosen": -2.6258418560028076, "logps/rejected": -2.924691677093506, "loss": 2.6878, "rewards/accuracies": 0.5, "rewards/chosen": -26.258419036865234, "rewards/margins": 2.9885010719299316, "rewards/rejected": -29.24692153930664, "step": 17645 }, { "epoch": 0.5948970305706293, "grad_norm": 30.1873779296875, "learning_rate": 4.2196289092319333e-07, "logits/chosen": -1.2937986850738525, "logits/rejected": -1.3382008075714111, "logps/chosen": -2.9433345794677734, "logps/rejected": -2.8709471225738525, "loss": 5.1872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.4333438873291, "rewards/margins": -0.7238739728927612, "rewards/rejected": -28.709468841552734, "step": 17650 }, { "epoch": 0.5950655566416124, "grad_norm": 0.43724510073661804, "learning_rate": 4.216723743818108e-07, "logits/chosen": -1.4383492469787598, "logits/rejected": -1.4420338869094849, "logps/chosen": -1.880933403968811, "logps/rejected": -2.153778553009033, "loss": 2.3401, "rewards/accuracies": 0.5, "rewards/chosen": -18.8093318939209, "rewards/margins": 2.728450059890747, "rewards/rejected": -21.537784576416016, "step": 17655 }, { "epoch": 0.5952340827125956, "grad_norm": 60.6141357421875, "learning_rate": 4.2138188494653336e-07, "logits/chosen": -1.254163146018982, "logits/rejected": -1.2855738401412964, "logps/chosen": -1.8590952157974243, "logps/rejected": -2.130317211151123, "loss": 2.0457, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.590951919555664, "rewards/margins": 2.712218999862671, "rewards/rejected": -21.303173065185547, "step": 17660 }, { "epoch": 0.5954026087835789, "grad_norm": 222.0001678466797, "learning_rate": 4.2109142271788805e-07, "logits/chosen": -1.451233983039856, "logits/rejected": -1.878170371055603, "logps/chosen": -2.879201889038086, "logps/rejected": -3.0601067543029785, "loss": 5.7789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.79201889038086, "rewards/margins": 1.8090499639511108, "rewards/rejected": -30.6010684967041, "step": 17665 }, { "epoch": 0.595571134854562, "grad_norm": 18.758010864257812, "learning_rate": 4.208009877963925e-07, "logits/chosen": -1.7818104028701782, "logits/rejected": -2.403594493865967, "logps/chosen": -2.9406161308288574, "logps/rejected": -3.6528496742248535, "loss": 1.6939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.406158447265625, "rewards/margins": 7.1223344802856445, "rewards/rejected": -36.52849578857422, "step": 17670 }, { "epoch": 0.5957396609255452, "grad_norm": 15.573593139648438, "learning_rate": 4.205105802825548e-07, "logits/chosen": -1.4992997646331787, "logits/rejected": -1.5927377939224243, "logps/chosen": -2.3704192638397217, "logps/rejected": -2.571882724761963, "loss": 2.4671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.704195022583008, "rewards/margins": 2.0146334171295166, "rewards/rejected": -25.718826293945312, "step": 17675 }, { "epoch": 0.5959081869965284, "grad_norm": 59.856201171875, "learning_rate": 4.2022020027687344e-07, "logits/chosen": -1.6479030847549438, "logits/rejected": -1.8206367492675781, "logps/chosen": -2.8668527603149414, "logps/rejected": -3.6330673694610596, "loss": 1.8714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.66852378845215, "rewards/margins": 7.662148952484131, "rewards/rejected": -36.33067321777344, "step": 17680 }, { "epoch": 0.5960767130675115, "grad_norm": 26.6012020111084, "learning_rate": 4.199298478798376e-07, "logits/chosen": -1.8493913412094116, "logits/rejected": -1.903660774230957, "logps/chosen": -1.858231782913208, "logps/rejected": -2.213120937347412, "loss": 1.4821, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.582317352294922, "rewards/margins": 3.5488898754119873, "rewards/rejected": -22.131208419799805, "step": 17685 }, { "epoch": 0.5962452391384947, "grad_norm": 101.23904418945312, "learning_rate": 4.1963952319192695e-07, "logits/chosen": -1.0026901960372925, "logits/rejected": -1.2731549739837646, "logps/chosen": -2.68685245513916, "logps/rejected": -2.9386978149414062, "loss": 2.6075, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.868526458740234, "rewards/margins": 2.518451452255249, "rewards/rejected": -29.386978149414062, "step": 17690 }, { "epoch": 0.5964137652094779, "grad_norm": 27.395618438720703, "learning_rate": 4.1934922631361104e-07, "logits/chosen": -1.4647185802459717, "logits/rejected": -1.7177009582519531, "logps/chosen": -2.9183998107910156, "logps/rejected": -3.2998645305633545, "loss": 1.8271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.183996200561523, "rewards/margins": 3.8146488666534424, "rewards/rejected": -32.99864959716797, "step": 17695 }, { "epoch": 0.5965822912804611, "grad_norm": 13.962396621704102, "learning_rate": 4.190589573453504e-07, "logits/chosen": -1.97426438331604, "logits/rejected": -1.9399926662445068, "logps/chosen": -2.150571823120117, "logps/rejected": -2.5680508613586426, "loss": 2.7481, "rewards/accuracies": 0.5, "rewards/chosen": -21.505718231201172, "rewards/margins": 4.174788951873779, "rewards/rejected": -25.680505752563477, "step": 17700 }, { "epoch": 0.5967508173514443, "grad_norm": 188.11080932617188, "learning_rate": 4.1876871638759564e-07, "logits/chosen": -2.3743977546691895, "logits/rejected": -2.6676254272460938, "logps/chosen": -3.02490234375, "logps/rejected": -3.4125359058380127, "loss": 4.419, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.2490234375, "rewards/margins": 3.8763325214385986, "rewards/rejected": -34.12535858154297, "step": 17705 }, { "epoch": 0.5969193434224275, "grad_norm": 22.14044761657715, "learning_rate": 4.18478503540788e-07, "logits/chosen": -1.5527372360229492, "logits/rejected": -1.6013343334197998, "logps/chosen": -2.241884708404541, "logps/rejected": -2.361104965209961, "loss": 2.9046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.418848037719727, "rewards/margins": 1.1922025680541992, "rewards/rejected": -23.61104965209961, "step": 17710 }, { "epoch": 0.5970878694934106, "grad_norm": 19.193300247192383, "learning_rate": 4.181883189053582e-07, "logits/chosen": -1.4520524740219116, "logits/rejected": -1.5924266576766968, "logps/chosen": -1.887930154800415, "logps/rejected": -2.1257078647613525, "loss": 2.7585, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.879301071166992, "rewards/margins": 2.377777338027954, "rewards/rejected": -21.257076263427734, "step": 17715 }, { "epoch": 0.5972563955643938, "grad_norm": 23.460098266601562, "learning_rate": 4.1789816258172805e-07, "logits/chosen": -1.358764886856079, "logits/rejected": -2.004973888397217, "logps/chosen": -2.920631170272827, "logps/rejected": -3.5469462871551514, "loss": 3.206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.206314086914062, "rewards/margins": 6.263152122497559, "rewards/rejected": -35.46946334838867, "step": 17720 }, { "epoch": 0.597424921635377, "grad_norm": 37.642578125, "learning_rate": 4.176080346703094e-07, "logits/chosen": -1.4246867895126343, "logits/rejected": -1.5094234943389893, "logps/chosen": -1.9959986209869385, "logps/rejected": -2.001661539077759, "loss": 3.374, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.95998764038086, "rewards/margins": 0.056629084050655365, "rewards/rejected": -20.016613006591797, "step": 17725 }, { "epoch": 0.5975934477063601, "grad_norm": 21.875577926635742, "learning_rate": 4.173179352715035e-07, "logits/chosen": -1.5243735313415527, "logits/rejected": -1.5015779733657837, "logps/chosen": -2.428837299346924, "logps/rejected": -2.3328146934509277, "loss": 4.3343, "rewards/accuracies": 0.5, "rewards/chosen": -24.288372039794922, "rewards/margins": -0.9602264165878296, "rewards/rejected": -23.32814598083496, "step": 17730 }, { "epoch": 0.5977619737773433, "grad_norm": 6.548096179962158, "learning_rate": 4.170278644857027e-07, "logits/chosen": -1.662825584411621, "logits/rejected": -1.599381446838379, "logps/chosen": -2.056088924407959, "logps/rejected": -2.751908540725708, "loss": 1.445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.560888290405273, "rewards/margins": 6.958195686340332, "rewards/rejected": -27.519084930419922, "step": 17735 }, { "epoch": 0.5979304998483266, "grad_norm": 63.508277893066406, "learning_rate": 4.16737822413289e-07, "logits/chosen": -2.2823574542999268, "logits/rejected": -2.3403286933898926, "logps/chosen": -2.6337478160858154, "logps/rejected": -2.5546653270721436, "loss": 5.8875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.337478637695312, "rewards/margins": -0.7908235788345337, "rewards/rejected": -25.546653747558594, "step": 17740 }, { "epoch": 0.5980990259193097, "grad_norm": 18.882797241210938, "learning_rate": 4.1644780915463475e-07, "logits/chosen": -1.9262142181396484, "logits/rejected": -2.1272881031036377, "logps/chosen": -1.8121017217636108, "logps/rejected": -2.1528286933898926, "loss": 1.4948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.121017456054688, "rewards/margins": 3.4072678089141846, "rewards/rejected": -21.52828598022461, "step": 17745 }, { "epoch": 0.5982675519902929, "grad_norm": 18.638851165771484, "learning_rate": 4.1615782481010176e-07, "logits/chosen": -1.487438678741455, "logits/rejected": -1.7235790491104126, "logps/chosen": -1.9567781686782837, "logps/rejected": -2.219252347946167, "loss": 1.372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.56778335571289, "rewards/margins": 2.6247432231903076, "rewards/rejected": -22.19252586364746, "step": 17750 }, { "epoch": 0.5984360780612761, "grad_norm": 37.2097053527832, "learning_rate": 4.158678694800425e-07, "logits/chosen": -1.640181303024292, "logits/rejected": -1.6364223957061768, "logps/chosen": -2.0055267810821533, "logps/rejected": -2.185953378677368, "loss": 3.3978, "rewards/accuracies": 0.5, "rewards/chosen": -20.055267333984375, "rewards/margins": 1.8042659759521484, "rewards/rejected": -21.859533309936523, "step": 17755 }, { "epoch": 0.5986046041322592, "grad_norm": 22.888492584228516, "learning_rate": 4.155779432647989e-07, "logits/chosen": -1.4140545129776, "logits/rejected": -1.5783103704452515, "logps/chosen": -2.144395589828491, "logps/rejected": -2.281733989715576, "loss": 2.4318, "rewards/accuracies": 0.5, "rewards/chosen": -21.443958282470703, "rewards/margins": 1.3733841180801392, "rewards/rejected": -22.817340850830078, "step": 17760 }, { "epoch": 0.5987731302032424, "grad_norm": 24.782909393310547, "learning_rate": 4.1528804626470295e-07, "logits/chosen": -1.4038830995559692, "logits/rejected": -1.4523396492004395, "logps/chosen": -2.3506646156311035, "logps/rejected": -2.3315212726593018, "loss": 3.7075, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.506643295288086, "rewards/margins": -0.1914331465959549, "rewards/rejected": -23.31521224975586, "step": 17765 }, { "epoch": 0.5989416562742256, "grad_norm": 31.55765151977539, "learning_rate": 4.149981785800767e-07, "logits/chosen": -1.700679063796997, "logits/rejected": -1.971419095993042, "logps/chosen": -2.1803455352783203, "logps/rejected": -2.843294858932495, "loss": 1.6472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.803455352783203, "rewards/margins": 6.629496097564697, "rewards/rejected": -28.43294906616211, "step": 17770 }, { "epoch": 0.5991101823452089, "grad_norm": 29.38243293762207, "learning_rate": 4.147083403112319e-07, "logits/chosen": -1.5240113735198975, "logits/rejected": -1.3688710927963257, "logps/chosen": -1.7080087661743164, "logps/rejected": -1.7847347259521484, "loss": 2.4782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.080087661743164, "rewards/margins": 0.7672606706619263, "rewards/rejected": -17.847349166870117, "step": 17775 }, { "epoch": 0.599278708416192, "grad_norm": 67.73983001708984, "learning_rate": 4.144185315584703e-07, "logits/chosen": -1.9726877212524414, "logits/rejected": -1.9675414562225342, "logps/chosen": -2.4346508979797363, "logps/rejected": -2.096621036529541, "loss": 6.6933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.34650993347168, "rewards/margins": -3.380300521850586, "rewards/rejected": -20.966209411621094, "step": 17780 }, { "epoch": 0.5994472344871752, "grad_norm": 29.38736915588379, "learning_rate": 4.14128752422083e-07, "logits/chosen": -1.3103973865509033, "logits/rejected": -1.4525163173675537, "logps/chosen": -1.706693410873413, "logps/rejected": -1.8178211450576782, "loss": 2.1655, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.06693458557129, "rewards/margins": 1.1112741231918335, "rewards/rejected": -18.17820930480957, "step": 17785 }, { "epoch": 0.5996157605581584, "grad_norm": 31.563465118408203, "learning_rate": 4.1383900300235125e-07, "logits/chosen": -2.0533287525177, "logits/rejected": -2.32878041267395, "logps/chosen": -2.641245126724243, "logps/rejected": -2.7328808307647705, "loss": 3.7078, "rewards/accuracies": 0.5, "rewards/chosen": -26.412450790405273, "rewards/margins": 0.9163557887077332, "rewards/rejected": -27.328805923461914, "step": 17790 }, { "epoch": 0.5997842866291415, "grad_norm": 49.81260681152344, "learning_rate": 4.135492833995462e-07, "logits/chosen": -1.5277329683303833, "logits/rejected": -1.401781678199768, "logps/chosen": -2.6558830738067627, "logps/rejected": -3.2873096466064453, "loss": 3.8545, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.5588321685791, "rewards/margins": 6.31426477432251, "rewards/rejected": -32.87309646606445, "step": 17795 }, { "epoch": 0.5999528127001247, "grad_norm": 36.82050323486328, "learning_rate": 4.1325959371392796e-07, "logits/chosen": -0.9483410120010376, "logits/rejected": -1.0351123809814453, "logps/chosen": -2.4580655097961426, "logps/rejected": -2.5040602684020996, "loss": 3.2038, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.580652236938477, "rewards/margins": 0.4599494934082031, "rewards/rejected": -25.040603637695312, "step": 17800 }, { "epoch": 0.6001213387711078, "grad_norm": 17.785057067871094, "learning_rate": 4.1296993404574687e-07, "logits/chosen": -1.2796690464019775, "logits/rejected": -2.216463327407837, "logps/chosen": -2.1483981609344482, "logps/rejected": -2.617771863937378, "loss": 1.394, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.48398208618164, "rewards/margins": 4.693737506866455, "rewards/rejected": -26.177722930908203, "step": 17805 }, { "epoch": 0.6002898648420911, "grad_norm": 22.225467681884766, "learning_rate": 4.1268030449524275e-07, "logits/chosen": -2.2345798015594482, "logits/rejected": -2.384531021118164, "logps/chosen": -2.7949657440185547, "logps/rejected": -2.8740978240966797, "loss": 3.4624, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.949655532836914, "rewards/margins": 0.7913219332695007, "rewards/rejected": -28.740978240966797, "step": 17810 }, { "epoch": 0.6004583909130743, "grad_norm": 18.74883270263672, "learning_rate": 4.1239070516264506e-07, "logits/chosen": -1.6532881259918213, "logits/rejected": -1.6195957660675049, "logps/chosen": -2.23626446723938, "logps/rejected": -2.477332353591919, "loss": 3.5523, "rewards/accuracies": 0.5, "rewards/chosen": -22.36264419555664, "rewards/margins": 2.4106783866882324, "rewards/rejected": -24.7733211517334, "step": 17815 }, { "epoch": 0.6006269169840575, "grad_norm": 24.84805679321289, "learning_rate": 4.1210113614817273e-07, "logits/chosen": -1.1539795398712158, "logits/rejected": -1.2512967586517334, "logps/chosen": -2.1468093395233154, "logps/rejected": -2.287897825241089, "loss": 2.038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.468093872070312, "rewards/margins": 1.4108854532241821, "rewards/rejected": -22.878978729248047, "step": 17820 }, { "epoch": 0.6007954430550406, "grad_norm": 45.94430923461914, "learning_rate": 4.11811597552034e-07, "logits/chosen": -1.8820756673812866, "logits/rejected": -1.816383719444275, "logps/chosen": -2.616534948348999, "logps/rejected": -2.635910987854004, "loss": 3.6579, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.16534996032715, "rewards/margins": 0.1937606781721115, "rewards/rejected": -26.35910987854004, "step": 17825 }, { "epoch": 0.6009639691260238, "grad_norm": 35.467071533203125, "learning_rate": 4.115220894744269e-07, "logits/chosen": -1.4625093936920166, "logits/rejected": -1.6223608255386353, "logps/chosen": -1.9035320281982422, "logps/rejected": -2.360581159591675, "loss": 2.4854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.035320281982422, "rewards/margins": 4.570491790771484, "rewards/rejected": -23.605812072753906, "step": 17830 }, { "epoch": 0.601132495197007, "grad_norm": 17.500070571899414, "learning_rate": 4.1123261201553867e-07, "logits/chosen": -1.5405447483062744, "logits/rejected": -1.847246527671814, "logps/chosen": -2.1715755462646484, "logps/rejected": -2.882485866546631, "loss": 3.3008, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.71575355529785, "rewards/margins": 7.109103202819824, "rewards/rejected": -28.824859619140625, "step": 17835 }, { "epoch": 0.6013010212679901, "grad_norm": 141.09652709960938, "learning_rate": 4.109431652755461e-07, "logits/chosen": -1.4695584774017334, "logits/rejected": -1.6095157861709595, "logps/chosen": -2.1822612285614014, "logps/rejected": -2.1554157733917236, "loss": 3.5666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.822612762451172, "rewards/margins": -0.2684548497200012, "rewards/rejected": -21.554157257080078, "step": 17840 }, { "epoch": 0.6014695473389733, "grad_norm": 21.075143814086914, "learning_rate": 4.106537493546154e-07, "logits/chosen": -1.46273672580719, "logits/rejected": -1.464318871498108, "logps/chosen": -2.298161268234253, "logps/rejected": -2.3293509483337402, "loss": 3.4996, "rewards/accuracies": 0.5, "rewards/chosen": -22.981613159179688, "rewards/margins": 0.3118970990180969, "rewards/rejected": -23.293508529663086, "step": 17845 }, { "epoch": 0.6016380734099566, "grad_norm": 25.357160568237305, "learning_rate": 4.1036436435290207e-07, "logits/chosen": -1.5115927457809448, "logits/rejected": -1.9485218524932861, "logps/chosen": -1.806051254272461, "logps/rejected": -2.0344676971435547, "loss": 2.5676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.06051254272461, "rewards/margins": 2.2841649055480957, "rewards/rejected": -20.344676971435547, "step": 17850 }, { "epoch": 0.6018065994809397, "grad_norm": 25.672758102416992, "learning_rate": 4.100750103705506e-07, "logits/chosen": -1.677353858947754, "logits/rejected": -2.0314478874206543, "logps/chosen": -2.9779555797576904, "logps/rejected": -3.4202136993408203, "loss": 1.8307, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.779556274414062, "rewards/margins": 4.422582626342773, "rewards/rejected": -34.2021369934082, "step": 17855 }, { "epoch": 0.6019751255519229, "grad_norm": 16.537107467651367, "learning_rate": 4.09785687507695e-07, "logits/chosen": -1.6343332529067993, "logits/rejected": -1.8614110946655273, "logps/chosen": -2.0546581745147705, "logps/rejected": -2.017251491546631, "loss": 3.8123, "rewards/accuracies": 0.5, "rewards/chosen": -20.546581268310547, "rewards/margins": -0.37406882643699646, "rewards/rejected": -20.17251205444336, "step": 17860 }, { "epoch": 0.6021436516229061, "grad_norm": 44.8232421875, "learning_rate": 4.0949639586445907e-07, "logits/chosen": -1.3961999416351318, "logits/rejected": -1.5271848440170288, "logps/chosen": -2.1533312797546387, "logps/rejected": -2.27251935005188, "loss": 2.4688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.533313751220703, "rewards/margins": 1.1918811798095703, "rewards/rejected": -22.725194931030273, "step": 17865 }, { "epoch": 0.6023121776938892, "grad_norm": 21.053346633911133, "learning_rate": 4.092071355409545e-07, "logits/chosen": -1.7526776790618896, "logits/rejected": -1.7933683395385742, "logps/chosen": -1.8899450302124023, "logps/rejected": -1.890005111694336, "loss": 3.3379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.899450302124023, "rewards/margins": 0.0006008148193359375, "rewards/rejected": -18.90005111694336, "step": 17870 }, { "epoch": 0.6024807037648724, "grad_norm": 44.42392349243164, "learning_rate": 4.089179066372832e-07, "logits/chosen": -1.336414098739624, "logits/rejected": -1.5446573495864868, "logps/chosen": -2.743844985961914, "logps/rejected": -3.301013231277466, "loss": 2.7564, "rewards/accuracies": 0.5, "rewards/chosen": -27.438451766967773, "rewards/margins": 5.571681022644043, "rewards/rejected": -33.0101318359375, "step": 17875 }, { "epoch": 0.6026492298358556, "grad_norm": 35.24571990966797, "learning_rate": 4.0862870925353597e-07, "logits/chosen": -1.813269019126892, "logits/rejected": -1.7184432744979858, "logps/chosen": -2.3771960735321045, "logps/rejected": -2.766353130340576, "loss": 2.1354, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.771961212158203, "rewards/margins": 3.8915724754333496, "rewards/rejected": -27.66353416442871, "step": 17880 }, { "epoch": 0.6028177559068388, "grad_norm": 29.940200805664062, "learning_rate": 4.083395434897928e-07, "logits/chosen": -2.2942934036254883, "logits/rejected": -2.215756893157959, "logps/chosen": -2.2887065410614014, "logps/rejected": -2.3103342056274414, "loss": 3.4214, "rewards/accuracies": 0.5, "rewards/chosen": -22.887065887451172, "rewards/margins": 0.21627846360206604, "rewards/rejected": -23.103343963623047, "step": 17885 }, { "epoch": 0.602986281977822, "grad_norm": 15.23022174835205, "learning_rate": 4.0805040944612215e-07, "logits/chosen": -1.6742618083953857, "logits/rejected": -2.1392934322357178, "logps/chosen": -2.9912867546081543, "logps/rejected": -3.2775497436523438, "loss": 2.0122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.91286849975586, "rewards/margins": 2.8626296520233154, "rewards/rejected": -32.7755012512207, "step": 17890 }, { "epoch": 0.6031548080488052, "grad_norm": 29.538070678710938, "learning_rate": 4.0776130722258207e-07, "logits/chosen": -1.221923589706421, "logits/rejected": -1.2887871265411377, "logps/chosen": -2.2940049171447754, "logps/rejected": -2.515733003616333, "loss": 2.6864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.940052032470703, "rewards/margins": 2.2172799110412598, "rewards/rejected": -25.157331466674805, "step": 17895 }, { "epoch": 0.6033233341197883, "grad_norm": 0.11861720681190491, "learning_rate": 4.0747223691921963e-07, "logits/chosen": -1.7873830795288086, "logits/rejected": -1.839321494102478, "logps/chosen": -2.739753007888794, "logps/rejected": -3.109933376312256, "loss": 2.1349, "rewards/accuracies": 0.5, "rewards/chosen": -27.39752769470215, "rewards/margins": 3.7018027305603027, "rewards/rejected": -31.099334716796875, "step": 17900 }, { "epoch": 0.6034918601907715, "grad_norm": 23.25163459777832, "learning_rate": 4.071831986360704e-07, "logits/chosen": -1.5324798822402954, "logits/rejected": -1.7978417873382568, "logps/chosen": -3.7456448078155518, "logps/rejected": -3.5534369945526123, "loss": 5.4472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -37.456443786621094, "rewards/margins": -1.9220736026763916, "rewards/rejected": -35.53437042236328, "step": 17905 }, { "epoch": 0.6036603862617547, "grad_norm": 23.313236236572266, "learning_rate": 4.0689419247315935e-07, "logits/chosen": -1.3383638858795166, "logits/rejected": -1.4649317264556885, "logps/chosen": -2.1240153312683105, "logps/rejected": -2.2492001056671143, "loss": 2.3375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.240154266357422, "rewards/margins": 1.251846194267273, "rewards/rejected": -22.492000579833984, "step": 17910 }, { "epoch": 0.6038289123327378, "grad_norm": 16.772438049316406, "learning_rate": 4.066052185305e-07, "logits/chosen": -1.185973882675171, "logits/rejected": -1.493067979812622, "logps/chosen": -1.9985774755477905, "logps/rejected": -2.1876299381256104, "loss": 1.9702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.985774993896484, "rewards/margins": 1.8905220031738281, "rewards/rejected": -21.876296997070312, "step": 17915 }, { "epoch": 0.6039974384037211, "grad_norm": 30.339557647705078, "learning_rate": 4.063162769080952e-07, "logits/chosen": -1.7155368328094482, "logits/rejected": -1.5918371677398682, "logps/chosen": -2.586151599884033, "logps/rejected": -2.4468390941619873, "loss": 5.2645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.86151695251465, "rewards/margins": -1.3931264877319336, "rewards/rejected": -24.4683895111084, "step": 17920 }, { "epoch": 0.6041659644747043, "grad_norm": 71.9335708618164, "learning_rate": 4.060273677059357e-07, "logits/chosen": -2.01371431350708, "logits/rejected": -2.0844101905822754, "logps/chosen": -2.2792601585388184, "logps/rejected": -2.360816478729248, "loss": 2.8228, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.792600631713867, "rewards/margins": 0.8155611157417297, "rewards/rejected": -23.60816192626953, "step": 17925 }, { "epoch": 0.6043344905456874, "grad_norm": 9.240427017211914, "learning_rate": 4.0573849102400185e-07, "logits/chosen": -1.902093529701233, "logits/rejected": -2.0091750621795654, "logps/chosen": -2.6528334617614746, "logps/rejected": -3.194786787033081, "loss": 1.0115, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.528331756591797, "rewards/margins": 5.419533729553223, "rewards/rejected": -31.947866439819336, "step": 17930 }, { "epoch": 0.6045030166166706, "grad_norm": 26.025230407714844, "learning_rate": 4.054496469622628e-07, "logits/chosen": -1.269789695739746, "logits/rejected": -1.9125232696533203, "logps/chosen": -2.085925579071045, "logps/rejected": -2.6767005920410156, "loss": 1.9414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.859254837036133, "rewards/margins": 5.907750129699707, "rewards/rejected": -26.767004013061523, "step": 17935 }, { "epoch": 0.6046715426876538, "grad_norm": 52.2057991027832, "learning_rate": 4.051608356206755e-07, "logits/chosen": -1.7119247913360596, "logits/rejected": -1.4851834774017334, "logps/chosen": -1.791717767715454, "logps/rejected": -1.9203579425811768, "loss": 2.4681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.917177200317383, "rewards/margins": 1.2864009141921997, "rewards/rejected": -19.20357894897461, "step": 17940 }, { "epoch": 0.6048400687586369, "grad_norm": 17.035730361938477, "learning_rate": 4.048720570991865e-07, "logits/chosen": -1.4668445587158203, "logits/rejected": -1.8648614883422852, "logps/chosen": -2.6843013763427734, "logps/rejected": -3.5758934020996094, "loss": 1.7242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.843013763427734, "rewards/margins": 8.915924072265625, "rewards/rejected": -35.75893783569336, "step": 17945 }, { "epoch": 0.6050085948296201, "grad_norm": 17.1804141998291, "learning_rate": 4.045833114977309e-07, "logits/chosen": -1.7299177646636963, "logits/rejected": -2.482475996017456, "logps/chosen": -2.2938883304595947, "logps/rejected": -3.3217194080352783, "loss": 1.6015, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.93888282775879, "rewards/margins": 10.27830982208252, "rewards/rejected": -33.21718978881836, "step": 17950 }, { "epoch": 0.6051771209006033, "grad_norm": 3.4808218479156494, "learning_rate": 4.0429459891623165e-07, "logits/chosen": -1.123740315437317, "logits/rejected": -1.7537816762924194, "logps/chosen": -1.9857628345489502, "logps/rejected": -2.486551523208618, "loss": 1.0033, "rewards/accuracies": 1.0, "rewards/chosen": -19.85762596130371, "rewards/margins": 5.0078911781311035, "rewards/rejected": -24.86551856994629, "step": 17955 }, { "epoch": 0.6053456469715865, "grad_norm": 51.88289260864258, "learning_rate": 4.040059194546011e-07, "logits/chosen": -1.659250259399414, "logits/rejected": -1.814026117324829, "logps/chosen": -2.484945774078369, "logps/rejected": -2.844405174255371, "loss": 2.0936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.849456787109375, "rewards/margins": 3.594592571258545, "rewards/rejected": -28.44405174255371, "step": 17960 }, { "epoch": 0.6055141730425697, "grad_norm": 9.73103141784668, "learning_rate": 4.0371727321273987e-07, "logits/chosen": -1.4572718143463135, "logits/rejected": -1.7130321264266968, "logps/chosen": -2.1037278175354004, "logps/rejected": -2.5024209022521973, "loss": 1.9432, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.037277221679688, "rewards/margins": 3.986931324005127, "rewards/rejected": -25.02420997619629, "step": 17965 }, { "epoch": 0.6056826991135529, "grad_norm": 20.67357063293457, "learning_rate": 4.0342866029053703e-07, "logits/chosen": -1.54425847530365, "logits/rejected": -1.5728416442871094, "logps/chosen": -1.8009445667266846, "logps/rejected": -1.9770351648330688, "loss": 2.644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.009445190429688, "rewards/margins": 1.7609062194824219, "rewards/rejected": -19.77035140991211, "step": 17970 }, { "epoch": 0.605851225184536, "grad_norm": 32.49703598022461, "learning_rate": 4.0314008078787e-07, "logits/chosen": -1.3170311450958252, "logits/rejected": -1.2635383605957031, "logps/chosen": -2.206963062286377, "logps/rejected": -2.1144444942474365, "loss": 4.0325, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.069629669189453, "rewards/margins": -0.9251849055290222, "rewards/rejected": -21.144445419311523, "step": 17975 }, { "epoch": 0.6060197512555192, "grad_norm": 39.733985900878906, "learning_rate": 4.028515348046049e-07, "logits/chosen": -2.097215175628662, "logits/rejected": -2.3491263389587402, "logps/chosen": -2.108121395111084, "logps/rejected": -2.3394417762756348, "loss": 2.8359, "rewards/accuracies": 0.5, "rewards/chosen": -21.081212997436523, "rewards/margins": 2.3132030963897705, "rewards/rejected": -23.3944149017334, "step": 17980 }, { "epoch": 0.6061882773265024, "grad_norm": 12.164617538452148, "learning_rate": 4.0256302244059623e-07, "logits/chosen": -1.217043161392212, "logits/rejected": -1.5332605838775635, "logps/chosen": -2.25597882270813, "logps/rejected": -2.373389720916748, "loss": 2.6482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.55978775024414, "rewards/margins": 1.1741094589233398, "rewards/rejected": -23.733896255493164, "step": 17985 }, { "epoch": 0.6063568033974855, "grad_norm": 31.064668655395508, "learning_rate": 4.0227454379568653e-07, "logits/chosen": -1.4396828413009644, "logits/rejected": -1.7883985042572021, "logps/chosen": -2.1677372455596924, "logps/rejected": -2.3215463161468506, "loss": 2.3949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.677370071411133, "rewards/margins": 1.538094162940979, "rewards/rejected": -23.215463638305664, "step": 17990 }, { "epoch": 0.6065253294684688, "grad_norm": 33.58404541015625, "learning_rate": 4.01986098969707e-07, "logits/chosen": -1.8123286962509155, "logits/rejected": -1.6961183547973633, "logps/chosen": -2.0836260318756104, "logps/rejected": -2.062624454498291, "loss": 3.3407, "rewards/accuracies": 0.5, "rewards/chosen": -20.836257934570312, "rewards/margins": -0.21001510322093964, "rewards/rejected": -20.626243591308594, "step": 17995 }, { "epoch": 0.606693855539452, "grad_norm": 0.05182600021362305, "learning_rate": 4.0169768806247697e-07, "logits/chosen": -1.3305574655532837, "logits/rejected": -1.6693122386932373, "logps/chosen": -2.4600489139556885, "logps/rejected": -3.0529866218566895, "loss": 1.1291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.600488662719727, "rewards/margins": 5.929378509521484, "rewards/rejected": -30.52986717224121, "step": 18000 }, { "epoch": 0.606693855539452, "eval_logits/chosen": -2.0212080478668213, "eval_logits/rejected": -2.1736414432525635, "eval_logps/chosen": -2.1531260013580322, "eval_logps/rejected": -2.2950658798217773, "eval_loss": 2.9981372356414795, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.531261444091797, "eval_rewards/margins": 1.4193978309631348, "eval_rewards/rejected": -22.950658798217773, "eval_runtime": 12.8857, "eval_samples_per_second": 7.761, "eval_steps_per_second": 1.94, "step": 18000 }, { "epoch": 0.6068623816104352, "grad_norm": 89.49707794189453, "learning_rate": 4.0140931117380437e-07, "logits/chosen": -1.5910829305648804, "logits/rejected": -1.7629798650741577, "logps/chosen": -2.1482040882110596, "logps/rejected": -2.142286777496338, "loss": 3.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.482038497924805, "rewards/margins": -0.059171102941036224, "rewards/rejected": -21.422866821289062, "step": 18005 }, { "epoch": 0.6070309076814183, "grad_norm": 2.001401298912242e-05, "learning_rate": 4.011209684034846e-07, "logits/chosen": -1.7829933166503906, "logits/rejected": -2.354343891143799, "logps/chosen": -2.8509857654571533, "logps/rejected": -3.5344605445861816, "loss": 2.6997, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.50986099243164, "rewards/margins": 6.834742546081543, "rewards/rejected": -35.34459686279297, "step": 18010 }, { "epoch": 0.6071994337524015, "grad_norm": 29.717529296875, "learning_rate": 4.008326598513021e-07, "logits/chosen": -1.4794981479644775, "logits/rejected": -1.6470489501953125, "logps/chosen": -2.9904773235321045, "logps/rejected": -3.15329909324646, "loss": 2.9079, "rewards/accuracies": 0.5, "rewards/chosen": -29.904775619506836, "rewards/margins": 1.6282180547714233, "rewards/rejected": -31.53299331665039, "step": 18015 }, { "epoch": 0.6073679598233847, "grad_norm": 51.709434509277344, "learning_rate": 4.005443856170291e-07, "logits/chosen": -1.5839917659759521, "logits/rejected": -1.4346948862075806, "logps/chosen": -2.345694065093994, "logps/rejected": -2.851513385772705, "loss": 2.9015, "rewards/accuracies": 0.5, "rewards/chosen": -23.456941604614258, "rewards/margins": 5.058190822601318, "rewards/rejected": -28.515132904052734, "step": 18020 }, { "epoch": 0.6075364858943678, "grad_norm": 19.4397029876709, "learning_rate": 4.0025614580042565e-07, "logits/chosen": -1.346427321434021, "logits/rejected": -1.7577238082885742, "logps/chosen": -2.1209535598754883, "logps/rejected": -2.2037816047668457, "loss": 2.8587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.20953369140625, "rewards/margins": 0.8282777667045593, "rewards/rejected": -22.03781509399414, "step": 18025 }, { "epoch": 0.6077050119653511, "grad_norm": 23.9757137298584, "learning_rate": 3.999679405012404e-07, "logits/chosen": -2.53595232963562, "logits/rejected": -2.367096185684204, "logps/chosen": -3.45710825920105, "logps/rejected": -3.697390079498291, "loss": 3.8363, "rewards/accuracies": 0.5, "rewards/chosen": -34.571083068847656, "rewards/margins": 2.402817964553833, "rewards/rejected": -36.973899841308594, "step": 18030 }, { "epoch": 0.6078735380363343, "grad_norm": 33.260257720947266, "learning_rate": 3.9967976981920987e-07, "logits/chosen": -1.5834105014801025, "logits/rejected": -1.6174606084823608, "logps/chosen": -2.2329487800598145, "logps/rejected": -2.2986979484558105, "loss": 2.947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.32948875427246, "rewards/margins": 0.6574923396110535, "rewards/rejected": -22.986980438232422, "step": 18035 }, { "epoch": 0.6080420641073174, "grad_norm": 28.550710678100586, "learning_rate": 3.993916338540586e-07, "logits/chosen": -1.7848262786865234, "logits/rejected": -1.7436761856079102, "logps/chosen": -2.402068853378296, "logps/rejected": -2.468722343444824, "loss": 3.0888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.020687103271484, "rewards/margins": 0.6665343046188354, "rewards/rejected": -24.68722152709961, "step": 18040 }, { "epoch": 0.6082105901783006, "grad_norm": 18.684982299804688, "learning_rate": 3.9910353270549895e-07, "logits/chosen": -1.147879958152771, "logits/rejected": -1.5647351741790771, "logps/chosen": -2.320190906524658, "logps/rejected": -2.9074273109436035, "loss": 2.8898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.2019100189209, "rewards/margins": 5.872366428375244, "rewards/rejected": -29.07427406311035, "step": 18045 }, { "epoch": 0.6083791162492838, "grad_norm": 29.315391540527344, "learning_rate": 3.988154664732315e-07, "logits/chosen": -1.4805309772491455, "logits/rejected": -1.7728363275527954, "logps/chosen": -2.3441691398620605, "logps/rejected": -3.008974552154541, "loss": 2.8736, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.441692352294922, "rewards/margins": 6.648050785064697, "rewards/rejected": -30.089740753173828, "step": 18050 }, { "epoch": 0.6085476423202669, "grad_norm": 30.374649047851562, "learning_rate": 3.9852743525694477e-07, "logits/chosen": -1.4703962802886963, "logits/rejected": -1.6875982284545898, "logps/chosen": -2.152924060821533, "logps/rejected": -2.2890422344207764, "loss": 2.2531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.529239654541016, "rewards/margins": 1.3611841201782227, "rewards/rejected": -22.890422821044922, "step": 18055 }, { "epoch": 0.6087161683912501, "grad_norm": 48.82684326171875, "learning_rate": 3.9823943915631466e-07, "logits/chosen": -1.6669085025787354, "logits/rejected": -2.1665635108947754, "logps/chosen": -2.514723777770996, "logps/rejected": -2.925302028656006, "loss": 3.5687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.14723777770996, "rewards/margins": 4.1057844161987305, "rewards/rejected": -29.253021240234375, "step": 18060 }, { "epoch": 0.6088846944622333, "grad_norm": 194.8758087158203, "learning_rate": 3.979514782710054e-07, "logits/chosen": -1.4292688369750977, "logits/rejected": -1.3867441415786743, "logps/chosen": -2.669638156890869, "logps/rejected": -2.704360008239746, "loss": 3.4805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.69638442993164, "rewards/margins": 0.347219854593277, "rewards/rejected": -27.043603897094727, "step": 18065 }, { "epoch": 0.6090532205332165, "grad_norm": 23.801340103149414, "learning_rate": 3.97663552700669e-07, "logits/chosen": -1.9906822443008423, "logits/rejected": -1.7741702795028687, "logps/chosen": -1.8173282146453857, "logps/rejected": -1.7739219665527344, "loss": 3.493, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.173282623291016, "rewards/margins": -0.43406257033348083, "rewards/rejected": -17.739219665527344, "step": 18070 }, { "epoch": 0.6092217466041997, "grad_norm": 17.24144172668457, "learning_rate": 3.9737566254494533e-07, "logits/chosen": -1.5699522495269775, "logits/rejected": -1.4470058679580688, "logps/chosen": -2.3631958961486816, "logps/rejected": -2.5016205310821533, "loss": 3.8795, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.631961822509766, "rewards/margins": 1.3842445611953735, "rewards/rejected": -25.016204833984375, "step": 18075 }, { "epoch": 0.6093902726751829, "grad_norm": 26.506839752197266, "learning_rate": 3.9708780790346133e-07, "logits/chosen": -1.7675163745880127, "logits/rejected": -1.9242709875106812, "logps/chosen": -2.3664536476135254, "logps/rejected": -2.7819604873657227, "loss": 2.7749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.66453742980957, "rewards/margins": 4.155068397521973, "rewards/rejected": -27.819604873657227, "step": 18080 }, { "epoch": 0.609558798746166, "grad_norm": 229.21978759765625, "learning_rate": 3.967999888758325e-07, "logits/chosen": -1.8047516345977783, "logits/rejected": -2.22436261177063, "logps/chosen": -3.2236645221710205, "logps/rejected": -3.230384349822998, "loss": 7.2379, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.23664474487305, "rewards/margins": 0.06719855964183807, "rewards/rejected": -32.30384063720703, "step": 18085 }, { "epoch": 0.6097273248171492, "grad_norm": 26.054292678833008, "learning_rate": 3.9651220556166183e-07, "logits/chosen": -1.5580518245697021, "logits/rejected": -1.891649603843689, "logps/chosen": -2.72765851020813, "logps/rejected": -2.717656135559082, "loss": 3.3043, "rewards/accuracies": 0.5, "rewards/chosen": -27.276586532592773, "rewards/margins": -0.10002384334802628, "rewards/rejected": -27.176563262939453, "step": 18090 }, { "epoch": 0.6098958508881324, "grad_norm": 21.03340721130371, "learning_rate": 3.9622445806053925e-07, "logits/chosen": -1.4648394584655762, "logits/rejected": -1.4504142999649048, "logps/chosen": -2.3387691974639893, "logps/rejected": -2.473045825958252, "loss": 2.2089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.387691497802734, "rewards/margins": 1.3427667617797852, "rewards/rejected": -24.730457305908203, "step": 18095 }, { "epoch": 0.6100643769591155, "grad_norm": 6.288976669311523, "learning_rate": 3.959367464720433e-07, "logits/chosen": -1.4934344291687012, "logits/rejected": -1.6204888820648193, "logps/chosen": -2.0959277153015137, "logps/rejected": -2.4498965740203857, "loss": 2.2194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.959278106689453, "rewards/margins": 3.5396888256073, "rewards/rejected": -24.498966217041016, "step": 18100 }, { "epoch": 0.6102329030300988, "grad_norm": 36.37209701538086, "learning_rate": 3.9564907089573934e-07, "logits/chosen": -1.1374117136001587, "logits/rejected": -1.1836864948272705, "logps/chosen": -2.0730018615722656, "logps/rejected": -2.164442777633667, "loss": 2.4365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.730016708374023, "rewards/margins": 0.9144119024276733, "rewards/rejected": -21.64443016052246, "step": 18105 }, { "epoch": 0.610401429101082, "grad_norm": 45.56204605102539, "learning_rate": 3.953614314311808e-07, "logits/chosen": -1.6337623596191406, "logits/rejected": -1.5129172801971436, "logps/chosen": -2.0202136039733887, "logps/rejected": -2.1366028785705566, "loss": 3.4906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.202136993408203, "rewards/margins": 1.1638901233673096, "rewards/rejected": -21.36602783203125, "step": 18110 }, { "epoch": 0.6105699551720651, "grad_norm": 14.165609359741211, "learning_rate": 3.950738281779082e-07, "logits/chosen": -1.9523528814315796, "logits/rejected": -2.077697992324829, "logps/chosen": -2.342078447341919, "logps/rejected": -2.615379810333252, "loss": 2.851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.4207820892334, "rewards/margins": 2.7330145835876465, "rewards/rejected": -26.153797149658203, "step": 18115 }, { "epoch": 0.6107384812430483, "grad_norm": 40.21343994140625, "learning_rate": 3.9478626123544985e-07, "logits/chosen": -1.9517990350723267, "logits/rejected": -2.384939193725586, "logps/chosen": -2.8035922050476074, "logps/rejected": -2.9411544799804688, "loss": 3.0387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.035924911499023, "rewards/margins": 1.375620722770691, "rewards/rejected": -29.411544799804688, "step": 18120 }, { "epoch": 0.6109070073140315, "grad_norm": 30.41344451904297, "learning_rate": 3.944987307033212e-07, "logits/chosen": -1.464687705039978, "logits/rejected": -1.6759631633758545, "logps/chosen": -1.9217870235443115, "logps/rejected": -2.374483585357666, "loss": 1.3678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.217870712280273, "rewards/margins": 4.5269646644592285, "rewards/rejected": -23.744834899902344, "step": 18125 }, { "epoch": 0.6110755333850146, "grad_norm": 43.67235565185547, "learning_rate": 3.9421123668102515e-07, "logits/chosen": -1.7854055166244507, "logits/rejected": -2.2990849018096924, "logps/chosen": -2.1993279457092285, "logps/rejected": -2.435128688812256, "loss": 1.8303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.99327850341797, "rewards/margins": 2.3580093383789062, "rewards/rejected": -24.351287841796875, "step": 18130 }, { "epoch": 0.6112440594559978, "grad_norm": 46.27460479736328, "learning_rate": 3.939237792680522e-07, "logits/chosen": -1.3446651697158813, "logits/rejected": -1.3674166202545166, "logps/chosen": -2.7552475929260254, "logps/rejected": -3.0105865001678467, "loss": 1.5578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.552471160888672, "rewards/margins": 2.5533928871154785, "rewards/rejected": -30.105865478515625, "step": 18135 }, { "epoch": 0.6114125855269811, "grad_norm": 20.77875518798828, "learning_rate": 3.9363635856388e-07, "logits/chosen": -1.725376844406128, "logits/rejected": -1.628379464149475, "logps/chosen": -2.1728854179382324, "logps/rejected": -2.4002127647399902, "loss": 2.1471, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.72885513305664, "rewards/margins": 2.2732746601104736, "rewards/rejected": -24.002126693725586, "step": 18140 }, { "epoch": 0.6115811115979642, "grad_norm": 31.333084106445312, "learning_rate": 3.933489746679737e-07, "logits/chosen": -1.8086239099502563, "logits/rejected": -1.7974382638931274, "logps/chosen": -1.94889235496521, "logps/rejected": -2.0811383724212646, "loss": 2.4491, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.48892593383789, "rewards/margins": 1.3224602937698364, "rewards/rejected": -20.811384201049805, "step": 18145 }, { "epoch": 0.6117496376689474, "grad_norm": 132.87933349609375, "learning_rate": 3.9306162767978526e-07, "logits/chosen": -1.6830793619155884, "logits/rejected": -2.086782932281494, "logps/chosen": -2.5300583839416504, "logps/rejected": -2.8638129234313965, "loss": 1.7163, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.300582885742188, "rewards/margins": 3.337545871734619, "rewards/rejected": -28.63812828063965, "step": 18150 }, { "epoch": 0.6119181637399306, "grad_norm": 17.956872940063477, "learning_rate": 3.9277431769875425e-07, "logits/chosen": -1.276592493057251, "logits/rejected": -1.4645140171051025, "logps/chosen": -3.1796116828918457, "logps/rejected": -3.4596123695373535, "loss": 2.8719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.796117782592773, "rewards/margins": 2.8000075817108154, "rewards/rejected": -34.596126556396484, "step": 18155 }, { "epoch": 0.6120866898109137, "grad_norm": 2.6117608547210693, "learning_rate": 3.924870448243075e-07, "logits/chosen": -1.7809474468231201, "logits/rejected": -2.395023822784424, "logps/chosen": -2.2598633766174316, "logps/rejected": -2.346357822418213, "loss": 2.9899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.5986328125, "rewards/margins": 0.8649458885192871, "rewards/rejected": -23.463581085205078, "step": 18160 }, { "epoch": 0.6122552158818969, "grad_norm": 36.84818649291992, "learning_rate": 3.921998091558586e-07, "logits/chosen": -1.473480224609375, "logits/rejected": -1.471651315689087, "logps/chosen": -2.3036561012268066, "logps/rejected": -2.150559663772583, "loss": 4.7204, "rewards/accuracies": 0.5, "rewards/chosen": -23.03656005859375, "rewards/margins": -1.53096604347229, "rewards/rejected": -21.505596160888672, "step": 18165 }, { "epoch": 0.6124237419528801, "grad_norm": 39.324222564697266, "learning_rate": 3.919126107928085e-07, "logits/chosen": -1.783125877380371, "logits/rejected": -1.8939613103866577, "logps/chosen": -2.0952346324920654, "logps/rejected": -2.0697779655456543, "loss": 3.5832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.952346801757812, "rewards/margins": -0.2545679211616516, "rewards/rejected": -20.697778701782227, "step": 18170 }, { "epoch": 0.6125922680238632, "grad_norm": 34.55978775024414, "learning_rate": 3.916254498345454e-07, "logits/chosen": -2.1542255878448486, "logits/rejected": -2.322671890258789, "logps/chosen": -2.3772964477539062, "logps/rejected": -3.2974255084991455, "loss": 2.7457, "rewards/accuracies": 0.5, "rewards/chosen": -23.772964477539062, "rewards/margins": 9.201289176940918, "rewards/rejected": -32.97425079345703, "step": 18175 }, { "epoch": 0.6127607940948465, "grad_norm": 35.717376708984375, "learning_rate": 3.913383263804444e-07, "logits/chosen": -1.193623423576355, "logits/rejected": -1.332782506942749, "logps/chosen": -2.154371738433838, "logps/rejected": -2.138970375061035, "loss": 3.3613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.543718338012695, "rewards/margins": -0.15401363372802734, "rewards/rejected": -21.389705657958984, "step": 18180 }, { "epoch": 0.6129293201658297, "grad_norm": 31.9857120513916, "learning_rate": 3.910512405298675e-07, "logits/chosen": -1.9725778102874756, "logits/rejected": -1.8313255310058594, "logps/chosen": -1.9979254007339478, "logps/rejected": -2.13980770111084, "loss": 2.3521, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.9792537689209, "rewards/margins": 1.418821930885315, "rewards/rejected": -21.3980770111084, "step": 18185 }, { "epoch": 0.6130978462368128, "grad_norm": 110.69065856933594, "learning_rate": 3.907641923821638e-07, "logits/chosen": -1.2381213903427124, "logits/rejected": -1.6605393886566162, "logps/chosen": -2.7820382118225098, "logps/rejected": -2.967272996902466, "loss": 2.7824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.820383071899414, "rewards/margins": 1.8523496389389038, "rewards/rejected": -29.672733306884766, "step": 18190 }, { "epoch": 0.613266372307796, "grad_norm": 41.08340072631836, "learning_rate": 3.9047718203666947e-07, "logits/chosen": -1.500327706336975, "logits/rejected": -1.98642098903656, "logps/chosen": -2.553170919418335, "logps/rejected": -3.072920560836792, "loss": 2.2151, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.53171157836914, "rewards/margins": 5.1974968910217285, "rewards/rejected": -30.729206085205078, "step": 18195 }, { "epoch": 0.6134348983787792, "grad_norm": 37.24290084838867, "learning_rate": 3.9019020959270733e-07, "logits/chosen": -1.6461604833602905, "logits/rejected": -1.7050399780273438, "logps/chosen": -2.5508205890655518, "logps/rejected": -2.390434741973877, "loss": 5.6425, "rewards/accuracies": 0.5, "rewards/chosen": -25.50820541381836, "rewards/margins": -1.6038585901260376, "rewards/rejected": -23.904346466064453, "step": 18200 }, { "epoch": 0.6136034244497623, "grad_norm": 40.20940017700195, "learning_rate": 3.899032751495873e-07, "logits/chosen": -1.0868529081344604, "logits/rejected": -1.1445951461791992, "logps/chosen": -2.8452606201171875, "logps/rejected": -2.932924270629883, "loss": 3.5403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.452606201171875, "rewards/margins": 0.8766378164291382, "rewards/rejected": -29.32924461364746, "step": 18205 }, { "epoch": 0.6137719505207455, "grad_norm": 61.71173858642578, "learning_rate": 3.896163788066061e-07, "logits/chosen": -1.2802660465240479, "logits/rejected": -1.3684922456741333, "logps/chosen": -3.304487943649292, "logps/rejected": -3.5455081462860107, "loss": 4.7107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.04487991333008, "rewards/margins": 2.410205364227295, "rewards/rejected": -35.45508575439453, "step": 18210 }, { "epoch": 0.6139404765917288, "grad_norm": 73.96290588378906, "learning_rate": 3.8932952066304745e-07, "logits/chosen": -1.6128242015838623, "logits/rejected": -1.8162364959716797, "logps/chosen": -3.207860231399536, "logps/rejected": -3.3573410511016846, "loss": 5.1996, "rewards/accuracies": 0.5, "rewards/chosen": -32.0786018371582, "rewards/margins": 1.4948112964630127, "rewards/rejected": -33.57341384887695, "step": 18215 }, { "epoch": 0.614109002662712, "grad_norm": 26.064023971557617, "learning_rate": 3.8904270081818125e-07, "logits/chosen": -2.0467605590820312, "logits/rejected": -2.187319278717041, "logps/chosen": -2.2105517387390137, "logps/rejected": -2.0833380222320557, "loss": 4.5352, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.105518341064453, "rewards/margins": -1.2721388339996338, "rewards/rejected": -20.833377838134766, "step": 18220 }, { "epoch": 0.6142775287336951, "grad_norm": 12.313178062438965, "learning_rate": 3.8875591937126477e-07, "logits/chosen": -1.4248065948486328, "logits/rejected": -2.334559202194214, "logps/chosen": -1.9030323028564453, "logps/rejected": -2.7566123008728027, "loss": 2.0344, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.030323028564453, "rewards/margins": 8.535801887512207, "rewards/rejected": -27.566125869750977, "step": 18225 }, { "epoch": 0.6144460548046783, "grad_norm": 48.34172439575195, "learning_rate": 3.88469176421542e-07, "logits/chosen": -1.7388198375701904, "logits/rejected": -1.784240961074829, "logps/chosen": -2.641664743423462, "logps/rejected": -2.8418049812316895, "loss": 3.8033, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.416645050048828, "rewards/margins": 2.0014047622680664, "rewards/rejected": -28.41805076599121, "step": 18230 }, { "epoch": 0.6146145808756615, "grad_norm": 34.234649658203125, "learning_rate": 3.8818247206824284e-07, "logits/chosen": -1.5583927631378174, "logits/rejected": -1.8008098602294922, "logps/chosen": -1.9230226278305054, "logps/rejected": -2.3101422786712646, "loss": 2.172, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.230226516723633, "rewards/margins": 3.8711953163146973, "rewards/rejected": -23.101421356201172, "step": 18235 }, { "epoch": 0.6147831069466446, "grad_norm": 16.408966064453125, "learning_rate": 3.878958064105847e-07, "logits/chosen": -1.8375132083892822, "logits/rejected": -2.240048885345459, "logps/chosen": -1.6549434661865234, "logps/rejected": -2.166231393814087, "loss": 1.856, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.549434661865234, "rewards/margins": 5.112878799438477, "rewards/rejected": -21.66231346130371, "step": 18240 }, { "epoch": 0.6149516330176278, "grad_norm": 17.53336524963379, "learning_rate": 3.8760917954777123e-07, "logits/chosen": -1.7445443868637085, "logits/rejected": -1.7051893472671509, "logps/chosen": -2.308375835418701, "logps/rejected": -2.4736084938049316, "loss": 3.2146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.083759307861328, "rewards/margins": 1.6523252725601196, "rewards/rejected": -24.736083984375, "step": 18245 }, { "epoch": 0.6151201590886111, "grad_norm": 87.04015350341797, "learning_rate": 3.8732259157899295e-07, "logits/chosen": -1.6780481338500977, "logits/rejected": -2.038491725921631, "logps/chosen": -3.206683397293091, "logps/rejected": -3.6300296783447266, "loss": 1.6632, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -32.06683349609375, "rewards/margins": 4.233463287353516, "rewards/rejected": -36.30029296875, "step": 18250 }, { "epoch": 0.6152886851595942, "grad_norm": 25.186378479003906, "learning_rate": 3.8703604260342616e-07, "logits/chosen": -1.5214375257492065, "logits/rejected": -1.491634488105774, "logps/chosen": -2.2915260791778564, "logps/rejected": -2.593151092529297, "loss": 1.7989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.915258407592773, "rewards/margins": 3.016251564025879, "rewards/rejected": -25.9315128326416, "step": 18255 }, { "epoch": 0.6154572112305774, "grad_norm": 27.948070526123047, "learning_rate": 3.8674953272023443e-07, "logits/chosen": -1.2256909608840942, "logits/rejected": -1.4115798473358154, "logps/chosen": -2.4634010791778564, "logps/rejected": -2.0862600803375244, "loss": 6.8228, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -24.634008407592773, "rewards/margins": -3.771408796310425, "rewards/rejected": -20.862600326538086, "step": 18260 }, { "epoch": 0.6156257373015606, "grad_norm": 0.05216868966817856, "learning_rate": 3.864630620285676e-07, "logits/chosen": -1.603329062461853, "logits/rejected": -2.0473170280456543, "logps/chosen": -2.211454391479492, "logps/rejected": -2.6565604209899902, "loss": 1.4562, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.114543914794922, "rewards/margins": 4.4510602951049805, "rewards/rejected": -26.565603256225586, "step": 18265 }, { "epoch": 0.6157942633725437, "grad_norm": 150.82110595703125, "learning_rate": 3.8617663062756177e-07, "logits/chosen": -1.921468734741211, "logits/rejected": -2.095893144607544, "logps/chosen": -2.2001423835754395, "logps/rejected": -2.6294565200805664, "loss": 2.827, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.00142478942871, "rewards/margins": 4.293142318725586, "rewards/rejected": -26.294567108154297, "step": 18270 }, { "epoch": 0.6159627894435269, "grad_norm": 0.3024599552154541, "learning_rate": 3.8589023861633965e-07, "logits/chosen": -2.103994369506836, "logits/rejected": -2.313030242919922, "logps/chosen": -2.3458099365234375, "logps/rejected": -2.509472608566284, "loss": 3.5971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.458097457885742, "rewards/margins": 1.6366304159164429, "rewards/rejected": -25.094730377197266, "step": 18275 }, { "epoch": 0.6161313155145101, "grad_norm": 81.68231201171875, "learning_rate": 3.8560388609401015e-07, "logits/chosen": -1.8704826831817627, "logits/rejected": -1.8536460399627686, "logps/chosen": -2.5050511360168457, "logps/rejected": -2.513740062713623, "loss": 3.299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.05051040649414, "rewards/margins": 0.08688764274120331, "rewards/rejected": -25.13739585876465, "step": 18280 }, { "epoch": 0.6162998415854932, "grad_norm": 28.812223434448242, "learning_rate": 3.8531757315966883e-07, "logits/chosen": -1.7152595520019531, "logits/rejected": -1.6138585805892944, "logps/chosen": -2.5530338287353516, "logps/rejected": -2.981071949005127, "loss": 2.2196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.530338287353516, "rewards/margins": 4.280381679534912, "rewards/rejected": -29.810720443725586, "step": 18285 }, { "epoch": 0.6164683676564765, "grad_norm": 58.12255859375, "learning_rate": 3.8503129991239695e-07, "logits/chosen": -1.7070395946502686, "logits/rejected": -1.852764368057251, "logps/chosen": -2.6107370853424072, "logps/rejected": -2.647188186645508, "loss": 3.906, "rewards/accuracies": 0.5, "rewards/chosen": -26.107370376586914, "rewards/margins": 0.36450958251953125, "rewards/rejected": -26.471881866455078, "step": 18290 }, { "epoch": 0.6166368937274597, "grad_norm": 68.67720031738281, "learning_rate": 3.8474506645126257e-07, "logits/chosen": -1.6032400131225586, "logits/rejected": -1.540244221687317, "logps/chosen": -2.1491074562072754, "logps/rejected": -2.198559284210205, "loss": 2.7402, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.491073608398438, "rewards/margins": 0.4945201873779297, "rewards/rejected": -21.985593795776367, "step": 18295 }, { "epoch": 0.6168054197984428, "grad_norm": 102.33248138427734, "learning_rate": 3.8445887287532006e-07, "logits/chosen": -1.7181625366210938, "logits/rejected": -1.9780042171478271, "logps/chosen": -2.8356316089630127, "logps/rejected": -3.348367691040039, "loss": 1.4323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.3563175201416, "rewards/margins": 5.127358913421631, "rewards/rejected": -33.483680725097656, "step": 18300 }, { "epoch": 0.616973945869426, "grad_norm": 36.54032516479492, "learning_rate": 3.8417271928360934e-07, "logits/chosen": -1.6353356838226318, "logits/rejected": -1.8744697570800781, "logps/chosen": -1.9833621978759766, "logps/rejected": -2.1706643104553223, "loss": 3.0216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.833620071411133, "rewards/margins": 1.873023271560669, "rewards/rejected": -21.70664405822754, "step": 18305 }, { "epoch": 0.6171424719404092, "grad_norm": 25.216777801513672, "learning_rate": 3.83886605775157e-07, "logits/chosen": -1.900887131690979, "logits/rejected": -1.8457438945770264, "logps/chosen": -3.0988681316375732, "logps/rejected": -3.0910229682922363, "loss": 4.2173, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.988683700561523, "rewards/margins": -0.07845230400562286, "rewards/rejected": -30.910228729248047, "step": 18310 }, { "epoch": 0.6173109980113923, "grad_norm": 48.769683837890625, "learning_rate": 3.8360053244897573e-07, "logits/chosen": -1.7366855144500732, "logits/rejected": -1.7216198444366455, "logps/chosen": -2.0870800018310547, "logps/rejected": -2.0824666023254395, "loss": 3.4526, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.870800018310547, "rewards/margins": -0.04613447189331055, "rewards/rejected": -20.824665069580078, "step": 18315 }, { "epoch": 0.6174795240823755, "grad_norm": 31.436220169067383, "learning_rate": 3.8331449940406444e-07, "logits/chosen": -2.0017778873443604, "logits/rejected": -2.344151496887207, "logps/chosen": -3.020869493484497, "logps/rejected": -3.665548801422119, "loss": 1.5649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.208694458007812, "rewards/margins": 6.4467949867248535, "rewards/rejected": -36.655487060546875, "step": 18320 }, { "epoch": 0.6176480501533588, "grad_norm": 11.105485916137695, "learning_rate": 3.8302850673940745e-07, "logits/chosen": -1.6430978775024414, "logits/rejected": -1.8722255229949951, "logps/chosen": -2.1614890098571777, "logps/rejected": -2.7056336402893066, "loss": 2.1378, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.614891052246094, "rewards/margins": 5.441445827484131, "rewards/rejected": -27.05633544921875, "step": 18325 }, { "epoch": 0.6178165762243419, "grad_norm": 35.78452682495117, "learning_rate": 3.8274255455397585e-07, "logits/chosen": -1.2529391050338745, "logits/rejected": -1.4416625499725342, "logps/chosen": -2.2270255088806152, "logps/rejected": -2.205737352371216, "loss": 3.5502, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.2702579498291, "rewards/margins": -0.21288509666919708, "rewards/rejected": -22.057371139526367, "step": 18330 }, { "epoch": 0.6179851022953251, "grad_norm": 64.93318939208984, "learning_rate": 3.8245664294672644e-07, "logits/chosen": -1.5049943923950195, "logits/rejected": -1.4447296857833862, "logps/chosen": -1.9939762353897095, "logps/rejected": -2.002965211868286, "loss": 3.1059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.93976593017578, "rewards/margins": 0.08988761901855469, "rewards/rejected": -20.029653549194336, "step": 18335 }, { "epoch": 0.6181536283663083, "grad_norm": 29.312524795532227, "learning_rate": 3.821707720166018e-07, "logits/chosen": -1.8286025524139404, "logits/rejected": -1.8722509145736694, "logps/chosen": -1.8341219425201416, "logps/rejected": -1.8297138214111328, "loss": 3.2439, "rewards/accuracies": 0.5, "rewards/chosen": -18.341217041015625, "rewards/margins": -0.044080257415771484, "rewards/rejected": -18.297138214111328, "step": 18340 }, { "epoch": 0.6183221544372914, "grad_norm": 127.18154907226562, "learning_rate": 3.818849418625306e-07, "logits/chosen": -1.9834444522857666, "logits/rejected": -1.7992169857025146, "logps/chosen": -2.6786978244781494, "logps/rejected": -2.7950711250305176, "loss": 4.7649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.786977767944336, "rewards/margins": 1.1637320518493652, "rewards/rejected": -27.950708389282227, "step": 18345 }, { "epoch": 0.6184906805082746, "grad_norm": 34.55775833129883, "learning_rate": 3.815991525834276e-07, "logits/chosen": -2.2684998512268066, "logits/rejected": -2.30085825920105, "logps/chosen": -3.213099241256714, "logps/rejected": -3.595430850982666, "loss": 4.0705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.13098907470703, "rewards/margins": 3.8233160972595215, "rewards/rejected": -35.954307556152344, "step": 18350 }, { "epoch": 0.6186592065792578, "grad_norm": 21.957063674926758, "learning_rate": 3.8131340427819307e-07, "logits/chosen": -1.6553875207901, "logits/rejected": -1.6048953533172607, "logps/chosen": -1.9786068201065063, "logps/rejected": -2.27418851852417, "loss": 2.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.786067962646484, "rewards/margins": 2.955817222595215, "rewards/rejected": -22.741886138916016, "step": 18355 }, { "epoch": 0.618827732650241, "grad_norm": 43.23855972290039, "learning_rate": 3.810276970457132e-07, "logits/chosen": -2.354893445968628, "logits/rejected": -2.361724853515625, "logps/chosen": -2.808964729309082, "logps/rejected": -3.1177735328674316, "loss": 3.0821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.089649200439453, "rewards/margins": 3.088087558746338, "rewards/rejected": -31.177734375, "step": 18360 }, { "epoch": 0.6189962587212242, "grad_norm": 30.515972137451172, "learning_rate": 3.8074203098486004e-07, "logits/chosen": -1.6836265325546265, "logits/rejected": -2.1789212226867676, "logps/chosen": -2.1196069717407227, "logps/rejected": -2.5858001708984375, "loss": 2.8524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.19607162475586, "rewards/margins": 4.66193151473999, "rewards/rejected": -25.858001708984375, "step": 18365 }, { "epoch": 0.6191647847922074, "grad_norm": 30.81611442565918, "learning_rate": 3.804564061944916e-07, "logits/chosen": -1.2702562808990479, "logits/rejected": -1.6156784296035767, "logps/chosen": -2.31827974319458, "logps/rejected": -2.7410683631896973, "loss": 1.8617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.182796478271484, "rewards/margins": 4.227887153625488, "rewards/rejected": -27.41068458557129, "step": 18370 }, { "epoch": 0.6193333108631905, "grad_norm": 25.21677589416504, "learning_rate": 3.801708227734509e-07, "logits/chosen": -1.7795536518096924, "logits/rejected": -2.1119651794433594, "logps/chosen": -1.9674450159072876, "logps/rejected": -2.2505908012390137, "loss": 3.0825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.674449920654297, "rewards/margins": 2.8314576148986816, "rewards/rejected": -22.505908966064453, "step": 18375 }, { "epoch": 0.6195018369341737, "grad_norm": 162.6773681640625, "learning_rate": 3.798852808205674e-07, "logits/chosen": -2.0111989974975586, "logits/rejected": -2.380748748779297, "logps/chosen": -3.5007591247558594, "logps/rejected": -3.966818332672119, "loss": 2.9209, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.007591247558594, "rewards/margins": 4.660592079162598, "rewards/rejected": -39.668182373046875, "step": 18380 }, { "epoch": 0.6196703630051569, "grad_norm": 29.002161026000977, "learning_rate": 3.7959978043465584e-07, "logits/chosen": -1.1822447776794434, "logits/rejected": -1.4777024984359741, "logps/chosen": -2.1516406536102295, "logps/rejected": -2.1278204917907715, "loss": 4.0319, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.516408920288086, "rewards/margins": -0.2382035255432129, "rewards/rejected": -21.2782039642334, "step": 18385 }, { "epoch": 0.61983888907614, "grad_norm": 33.008941650390625, "learning_rate": 3.7931432171451695e-07, "logits/chosen": -1.0619652271270752, "logits/rejected": -1.018744707107544, "logps/chosen": -3.1147468090057373, "logps/rejected": -3.2658798694610596, "loss": 2.2896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.1474666595459, "rewards/margins": 1.5113319158554077, "rewards/rejected": -32.6588020324707, "step": 18390 }, { "epoch": 0.6200074151471232, "grad_norm": 29.102638244628906, "learning_rate": 3.7902890475893625e-07, "logits/chosen": -1.483496904373169, "logits/rejected": -1.4409980773925781, "logps/chosen": -2.2974536418914795, "logps/rejected": -2.2440028190612793, "loss": 4.9437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.974538803100586, "rewards/margins": -0.5345101356506348, "rewards/rejected": -22.44002914428711, "step": 18395 }, { "epoch": 0.6201759412181065, "grad_norm": 26.34992218017578, "learning_rate": 3.787435296666855e-07, "logits/chosen": -1.703963279724121, "logits/rejected": -1.8446147441864014, "logps/chosen": -2.14034366607666, "logps/rejected": -2.2772395610809326, "loss": 2.9894, "rewards/accuracies": 0.5, "rewards/chosen": -21.403438568115234, "rewards/margins": 1.3689591884613037, "rewards/rejected": -22.772396087646484, "step": 18400 }, { "epoch": 0.6201759412181065, "eval_logits/chosen": -2.0543246269226074, "eval_logits/rejected": -2.208879232406616, "eval_logps/chosen": -2.161914110183716, "eval_logps/rejected": -2.3027634620666504, "eval_loss": 3.003286838531494, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.619140625, "eval_rewards/margins": 1.4084923267364502, "eval_rewards/rejected": -23.027631759643555, "eval_runtime": 12.9004, "eval_samples_per_second": 7.752, "eval_steps_per_second": 1.938, "step": 18400 }, { "epoch": 0.6203444672890897, "grad_norm": 158.8337860107422, "learning_rate": 3.78458196536522e-07, "logits/chosen": -1.6829944849014282, "logits/rejected": -1.850947618484497, "logps/chosen": -2.789701461791992, "logps/rejected": -2.4470295906066895, "loss": 6.636, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.89701271057129, "rewards/margins": -3.426718235015869, "rewards/rejected": -24.470294952392578, "step": 18405 }, { "epoch": 0.6205129933600728, "grad_norm": 36.712547302246094, "learning_rate": 3.7817290546718796e-07, "logits/chosen": -1.125700831413269, "logits/rejected": -1.4983158111572266, "logps/chosen": -2.071444034576416, "logps/rejected": -2.1972525119781494, "loss": 3.7233, "rewards/accuracies": 0.5, "rewards/chosen": -20.714439392089844, "rewards/margins": 1.2580852508544922, "rewards/rejected": -21.972524642944336, "step": 18410 }, { "epoch": 0.620681519431056, "grad_norm": 16.614980697631836, "learning_rate": 3.7788765655741165e-07, "logits/chosen": -1.7382490634918213, "logits/rejected": -2.124817132949829, "logps/chosen": -2.513784408569336, "logps/rejected": -2.7320122718811035, "loss": 2.1141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.13784408569336, "rewards/margins": 2.182276964187622, "rewards/rejected": -27.32012367248535, "step": 18415 }, { "epoch": 0.6208500455020391, "grad_norm": 21.424095153808594, "learning_rate": 3.7760244990590627e-07, "logits/chosen": -1.7637748718261719, "logits/rejected": -1.7967971563339233, "logps/chosen": -2.9038190841674805, "logps/rejected": -3.031123399734497, "loss": 3.3259, "rewards/accuracies": 0.5, "rewards/chosen": -29.038188934326172, "rewards/margins": 1.2730454206466675, "rewards/rejected": -30.311237335205078, "step": 18420 }, { "epoch": 0.6210185715730223, "grad_norm": 24.72469711303711, "learning_rate": 3.773172856113709e-07, "logits/chosen": -1.4290255308151245, "logits/rejected": -1.7143604755401611, "logps/chosen": -1.9013131856918335, "logps/rejected": -2.246443510055542, "loss": 2.1578, "rewards/accuracies": 0.5, "rewards/chosen": -19.013132095336914, "rewards/margins": 3.4513046741485596, "rewards/rejected": -22.464435577392578, "step": 18425 }, { "epoch": 0.6211870976440055, "grad_norm": 33.128170013427734, "learning_rate": 3.770321637724893e-07, "logits/chosen": -1.646192193031311, "logits/rejected": -1.898533582687378, "logps/chosen": -3.1607718467712402, "logps/rejected": -3.6098225116729736, "loss": 3.0863, "rewards/accuracies": 0.5, "rewards/chosen": -31.607717514038086, "rewards/margins": 4.490505218505859, "rewards/rejected": -36.09822463989258, "step": 18430 }, { "epoch": 0.6213556237149888, "grad_norm": 36.44154357910156, "learning_rate": 3.7674708448793105e-07, "logits/chosen": -1.2727489471435547, "logits/rejected": -1.7410869598388672, "logps/chosen": -2.239375114440918, "logps/rejected": -3.270235776901245, "loss": 1.8125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.393749237060547, "rewards/margins": 10.308609008789062, "rewards/rejected": -32.70235824584961, "step": 18435 }, { "epoch": 0.6215241497859719, "grad_norm": 28.19940948486328, "learning_rate": 3.764620478563511e-07, "logits/chosen": -1.2106577157974243, "logits/rejected": -1.5160796642303467, "logps/chosen": -2.4029972553253174, "logps/rejected": -2.482666492462158, "loss": 2.6059, "rewards/accuracies": 0.5, "rewards/chosen": -24.02997398376465, "rewards/margins": 0.7966904640197754, "rewards/rejected": -24.826663970947266, "step": 18440 }, { "epoch": 0.6216926758569551, "grad_norm": 24.05192756652832, "learning_rate": 3.76177053976389e-07, "logits/chosen": -1.7198930978775024, "logits/rejected": -2.0885720252990723, "logps/chosen": -1.9368069171905518, "logps/rejected": -2.056891918182373, "loss": 3.4735, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.36806869506836, "rewards/margins": 1.200850009918213, "rewards/rejected": -20.568918228149414, "step": 18445 }, { "epoch": 0.6218612019279383, "grad_norm": 17.315340042114258, "learning_rate": 3.758921029466701e-07, "logits/chosen": -1.7316211462020874, "logits/rejected": -1.7989072799682617, "logps/chosen": -2.3561413288116455, "logps/rejected": -2.818152904510498, "loss": 2.7612, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.561410903930664, "rewards/margins": 4.620118141174316, "rewards/rejected": -28.181528091430664, "step": 18450 }, { "epoch": 0.6220297279989214, "grad_norm": 35.148826599121094, "learning_rate": 3.7560719486580494e-07, "logits/chosen": -2.0707952976226807, "logits/rejected": -2.0945842266082764, "logps/chosen": -2.033674955368042, "logps/rejected": -1.9517732858657837, "loss": 4.0518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.33675193786621, "rewards/margins": -0.8190194368362427, "rewards/rejected": -19.517730712890625, "step": 18455 }, { "epoch": 0.6221982540699046, "grad_norm": 41.66608810424805, "learning_rate": 3.7532232983238847e-07, "logits/chosen": -1.5666892528533936, "logits/rejected": -1.5280810594558716, "logps/chosen": -1.975049614906311, "logps/rejected": -1.9352716207504272, "loss": 3.5712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.7504940032959, "rewards/margins": -0.39777764678001404, "rewards/rejected": -19.352718353271484, "step": 18460 }, { "epoch": 0.6223667801408878, "grad_norm": 65.85220336914062, "learning_rate": 3.750375079450016e-07, "logits/chosen": -1.7846683263778687, "logits/rejected": -1.8842815160751343, "logps/chosen": -2.4808125495910645, "logps/rejected": -2.9712767601013184, "loss": 2.332, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.80812644958496, "rewards/margins": 4.90463924407959, "rewards/rejected": -29.712764739990234, "step": 18465 }, { "epoch": 0.622535306211871, "grad_norm": 44.47334289550781, "learning_rate": 3.747527293022099e-07, "logits/chosen": -1.5464017391204834, "logits/rejected": -1.6421788930892944, "logps/chosen": -2.0164692401885986, "logps/rejected": -1.9157434701919556, "loss": 4.4096, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.164691925048828, "rewards/margins": -1.0072581768035889, "rewards/rejected": -19.157434463500977, "step": 18470 }, { "epoch": 0.6227038322828542, "grad_norm": 54.05680847167969, "learning_rate": 3.7446799400256415e-07, "logits/chosen": -1.3993407487869263, "logits/rejected": -1.7749563455581665, "logps/chosen": -1.9316022396087646, "logps/rejected": -2.392576217651367, "loss": 2.0747, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.316022872924805, "rewards/margins": 4.609738349914551, "rewards/rejected": -23.92576026916504, "step": 18475 }, { "epoch": 0.6228723583538374, "grad_norm": 101.51625061035156, "learning_rate": 3.741833021445999e-07, "logits/chosen": -1.7461254596710205, "logits/rejected": -1.7653440237045288, "logps/chosen": -2.2352042198181152, "logps/rejected": -2.2143850326538086, "loss": 3.3942, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.35204315185547, "rewards/margins": -0.20819291472434998, "rewards/rejected": -22.143848419189453, "step": 18480 }, { "epoch": 0.6230408844248205, "grad_norm": 170.65745544433594, "learning_rate": 3.7389865382683774e-07, "logits/chosen": -1.717813491821289, "logits/rejected": -1.915365219116211, "logps/chosen": -2.594407081604004, "logps/rejected": -2.9728779792785645, "loss": 4.5426, "rewards/accuracies": 0.5, "rewards/chosen": -25.94407081604004, "rewards/margins": 3.784705400466919, "rewards/rejected": -29.728778839111328, "step": 18485 }, { "epoch": 0.6232094104958037, "grad_norm": 30.940324783325195, "learning_rate": 3.7361404914778326e-07, "logits/chosen": -1.867049217224121, "logits/rejected": -2.008021354675293, "logps/chosen": -2.037647008895874, "logps/rejected": -2.712981700897217, "loss": 2.4101, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.3764705657959, "rewards/margins": 6.753344535827637, "rewards/rejected": -27.12981605529785, "step": 18490 }, { "epoch": 0.6233779365667869, "grad_norm": 42.33964538574219, "learning_rate": 3.73329488205927e-07, "logits/chosen": -1.4565184116363525, "logits/rejected": -1.9266363382339478, "logps/chosen": -2.5136559009552, "logps/rejected": -3.28765606880188, "loss": 1.3486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.136554718017578, "rewards/margins": 7.7400031089782715, "rewards/rejected": -32.87656021118164, "step": 18495 }, { "epoch": 0.62354646263777, "grad_norm": 22.314546585083008, "learning_rate": 3.730449710997442e-07, "logits/chosen": -1.321411371231079, "logits/rejected": -1.730507254600525, "logps/chosen": -1.797588586807251, "logps/rejected": -1.8665136098861694, "loss": 2.5434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.97588539123535, "rewards/margins": 0.6892514228820801, "rewards/rejected": -18.665136337280273, "step": 18500 }, { "epoch": 0.6237149887087532, "grad_norm": 21.432235717773438, "learning_rate": 3.727604979276951e-07, "logits/chosen": -1.1045910120010376, "logits/rejected": -1.2482346296310425, "logps/chosen": -2.0690102577209473, "logps/rejected": -2.294459581375122, "loss": 2.9685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.69010353088379, "rewards/margins": 2.2544922828674316, "rewards/rejected": -22.944595336914062, "step": 18505 }, { "epoch": 0.6238835147797365, "grad_norm": 30.58997344970703, "learning_rate": 3.724760687882248e-07, "logits/chosen": -1.981286644935608, "logits/rejected": -1.9685707092285156, "logps/chosen": -1.7274444103240967, "logps/rejected": -1.8757808208465576, "loss": 2.8326, "rewards/accuracies": 0.5, "rewards/chosen": -17.274444580078125, "rewards/margins": 1.483363151550293, "rewards/rejected": -18.757808685302734, "step": 18510 }, { "epoch": 0.6240520408507196, "grad_norm": 28.33938217163086, "learning_rate": 3.7219168377976267e-07, "logits/chosen": -1.7231643199920654, "logits/rejected": -1.5711719989776611, "logps/chosen": -2.3871026039123535, "logps/rejected": -2.6765151023864746, "loss": 3.2495, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.87102508544922, "rewards/margins": 2.89412522315979, "rewards/rejected": -26.765151977539062, "step": 18515 }, { "epoch": 0.6242205669217028, "grad_norm": 27.47281837463379, "learning_rate": 3.7190734300072336e-07, "logits/chosen": -2.195241689682007, "logits/rejected": -2.453075885772705, "logps/chosen": -1.7438371181488037, "logps/rejected": -1.9724502563476562, "loss": 2.3724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.438373565673828, "rewards/margins": 2.2861297130584717, "rewards/rejected": -19.724502563476562, "step": 18520 }, { "epoch": 0.624389092992686, "grad_norm": 28.859298706054688, "learning_rate": 3.7162304654950614e-07, "logits/chosen": -1.7981083393096924, "logits/rejected": -1.8701518774032593, "logps/chosen": -2.183924436569214, "logps/rejected": -2.228469133377075, "loss": 3.7653, "rewards/accuracies": 0.5, "rewards/chosen": -21.839244842529297, "rewards/margins": 0.4454454481601715, "rewards/rejected": -22.28468894958496, "step": 18525 }, { "epoch": 0.6245576190636691, "grad_norm": 28.86895751953125, "learning_rate": 3.7133879452449446e-07, "logits/chosen": -1.7908798456192017, "logits/rejected": -1.7884708642959595, "logps/chosen": -2.4270756244659424, "logps/rejected": -2.6708824634552, "loss": 2.6429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.2707576751709, "rewards/margins": 2.438068151473999, "rewards/rejected": -26.708826065063477, "step": 18530 }, { "epoch": 0.6247261451346523, "grad_norm": 50.5154914855957, "learning_rate": 3.71054587024057e-07, "logits/chosen": -1.2883013486862183, "logits/rejected": -1.6345545053482056, "logps/chosen": -1.8493115901947021, "logps/rejected": -2.293156385421753, "loss": 2.6221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.493114471435547, "rewards/margins": 4.438447952270508, "rewards/rejected": -22.931560516357422, "step": 18535 }, { "epoch": 0.6248946712056355, "grad_norm": 30.61741065979004, "learning_rate": 3.707704241465467e-07, "logits/chosen": -0.9950865507125854, "logits/rejected": -1.1939284801483154, "logps/chosen": -2.134911298751831, "logps/rejected": -3.1475205421447754, "loss": 1.5967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.349111557006836, "rewards/margins": 10.126092910766602, "rewards/rejected": -31.475208282470703, "step": 18540 }, { "epoch": 0.6250631972766187, "grad_norm": 84.05421447753906, "learning_rate": 3.7048630599030134e-07, "logits/chosen": -1.2695919275283813, "logits/rejected": -1.594789743423462, "logps/chosen": -1.8781297206878662, "logps/rejected": -2.194685697555542, "loss": 1.3803, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.78129768371582, "rewards/margins": 3.1655619144439697, "rewards/rejected": -21.946857452392578, "step": 18545 }, { "epoch": 0.6252317233476019, "grad_norm": 52.571868896484375, "learning_rate": 3.7020223265364264e-07, "logits/chosen": -1.192299485206604, "logits/rejected": -1.9590389728546143, "logps/chosen": -2.2939975261688232, "logps/rejected": -2.977759838104248, "loss": 2.1195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.939977645874023, "rewards/margins": 6.837620735168457, "rewards/rejected": -29.777597427368164, "step": 18550 }, { "epoch": 0.6254002494185851, "grad_norm": 33.046817779541016, "learning_rate": 3.699182042348774e-07, "logits/chosen": -1.9198243618011475, "logits/rejected": -1.962834358215332, "logps/chosen": -3.0863873958587646, "logps/rejected": -3.3279757499694824, "loss": 5.3368, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.863876342773438, "rewards/margins": 2.415881633758545, "rewards/rejected": -33.27975845336914, "step": 18555 }, { "epoch": 0.6255687754895682, "grad_norm": 45.52570724487305, "learning_rate": 3.6963422083229676e-07, "logits/chosen": -1.1146682500839233, "logits/rejected": -1.5024160146713257, "logps/chosen": -2.387057304382324, "logps/rejected": -2.8693313598632812, "loss": 3.674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.870573043823242, "rewards/margins": 4.822741508483887, "rewards/rejected": -28.693313598632812, "step": 18560 }, { "epoch": 0.6257373015605514, "grad_norm": 34.899070739746094, "learning_rate": 3.6935028254417597e-07, "logits/chosen": -1.2353322505950928, "logits/rejected": -1.1657068729400635, "logps/chosen": -1.8352171182632446, "logps/rejected": -1.9835834503173828, "loss": 2.1929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.352170944213867, "rewards/margins": 1.4836634397506714, "rewards/rejected": -19.835834503173828, "step": 18565 }, { "epoch": 0.6259058276315346, "grad_norm": 16.575700759887695, "learning_rate": 3.69066389468775e-07, "logits/chosen": -1.628157377243042, "logits/rejected": -1.5780006647109985, "logps/chosen": -1.9046649932861328, "logps/rejected": -2.117918014526367, "loss": 2.2805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.046649932861328, "rewards/margins": 2.1325302124023438, "rewards/rejected": -21.179180145263672, "step": 18570 }, { "epoch": 0.6260743537025177, "grad_norm": 32.12807846069336, "learning_rate": 3.687825417043381e-07, "logits/chosen": -1.7738593816757202, "logits/rejected": -1.9184929132461548, "logps/chosen": -2.171799659729004, "logps/rejected": -2.3933589458465576, "loss": 3.5577, "rewards/accuracies": 0.5, "rewards/chosen": -21.71799659729004, "rewards/margins": 2.215592622756958, "rewards/rejected": -23.933589935302734, "step": 18575 }, { "epoch": 0.626242879773501, "grad_norm": 34.624046325683594, "learning_rate": 3.684987393490939e-07, "logits/chosen": -0.7456585764884949, "logits/rejected": -0.8155146837234497, "logps/chosen": -2.190500259399414, "logps/rejected": -2.361079692840576, "loss": 1.9946, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.90500259399414, "rewards/margins": 1.705796480178833, "rewards/rejected": -23.61079978942871, "step": 18580 }, { "epoch": 0.6264114058444842, "grad_norm": 23.193527221679688, "learning_rate": 3.6821498250125494e-07, "logits/chosen": -1.531273603439331, "logits/rejected": -1.4955469369888306, "logps/chosen": -2.295144557952881, "logps/rejected": -2.4351449012756348, "loss": 3.1515, "rewards/accuracies": 0.5, "rewards/chosen": -22.951446533203125, "rewards/margins": 1.4000024795532227, "rewards/rejected": -24.3514461517334, "step": 18585 }, { "epoch": 0.6265799319154673, "grad_norm": 35.371726989746094, "learning_rate": 3.679312712590183e-07, "logits/chosen": -1.5566551685333252, "logits/rejected": -1.6705825328826904, "logps/chosen": -2.2006218433380127, "logps/rejected": -2.512608766555786, "loss": 2.6644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.00621795654297, "rewards/margins": 3.1198689937591553, "rewards/rejected": -25.126087188720703, "step": 18590 }, { "epoch": 0.6267484579864505, "grad_norm": 46.78335189819336, "learning_rate": 3.6764760572056567e-07, "logits/chosen": -1.7316887378692627, "logits/rejected": -1.6605297327041626, "logps/chosen": -1.9013340473175049, "logps/rejected": -2.0264945030212402, "loss": 2.8801, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.01333999633789, "rewards/margins": 1.2516052722930908, "rewards/rejected": -20.26494598388672, "step": 18595 }, { "epoch": 0.6269169840574337, "grad_norm": 80.91983032226562, "learning_rate": 3.6736398598406205e-07, "logits/chosen": -1.4064184427261353, "logits/rejected": -1.4780693054199219, "logps/chosen": -2.071498155593872, "logps/rejected": -2.684077739715576, "loss": 2.1366, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.71497917175293, "rewards/margins": 6.125799179077148, "rewards/rejected": -26.840778350830078, "step": 18600 }, { "epoch": 0.6270855101284168, "grad_norm": 22.150487899780273, "learning_rate": 3.670804121476571e-07, "logits/chosen": -1.170275330543518, "logits/rejected": -1.510514259338379, "logps/chosen": -2.4041285514831543, "logps/rejected": -3.1954643726348877, "loss": 2.4695, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.04128646850586, "rewards/margins": 7.913358211517334, "rewards/rejected": -31.95464515686035, "step": 18605 }, { "epoch": 0.6272540361994, "grad_norm": 16.839929580688477, "learning_rate": 3.6679688430948477e-07, "logits/chosen": -1.8504005670547485, "logits/rejected": -2.0550291538238525, "logps/chosen": -2.6161468029022217, "logps/rejected": -4.031783103942871, "loss": 2.3714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.161468505859375, "rewards/margins": 14.156366348266602, "rewards/rejected": -40.317832946777344, "step": 18610 }, { "epoch": 0.6274225622703832, "grad_norm": 25.382099151611328, "learning_rate": 3.66513402567663e-07, "logits/chosen": -1.4021594524383545, "logits/rejected": -1.5731501579284668, "logps/chosen": -2.061854600906372, "logps/rejected": -2.361215114593506, "loss": 1.9625, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.61854362487793, "rewards/margins": 2.9936070442199707, "rewards/rejected": -23.612152099609375, "step": 18615 }, { "epoch": 0.6275910883413665, "grad_norm": 26.835275650024414, "learning_rate": 3.6622996702029317e-07, "logits/chosen": -1.4052951335906982, "logits/rejected": -1.4855515956878662, "logps/chosen": -1.8088502883911133, "logps/rejected": -2.0100693702697754, "loss": 1.6008, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.088504791259766, "rewards/margins": 2.0121896266937256, "rewards/rejected": -20.100692749023438, "step": 18620 }, { "epoch": 0.6277596144123496, "grad_norm": 23.951950073242188, "learning_rate": 3.659465777654615e-07, "logits/chosen": -1.716301679611206, "logits/rejected": -1.5523309707641602, "logps/chosen": -1.6933162212371826, "logps/rejected": -1.782979965209961, "loss": 2.6362, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.933162689208984, "rewards/margins": 0.8966361880302429, "rewards/rejected": -17.82979965209961, "step": 18625 }, { "epoch": 0.6279281404833328, "grad_norm": 28.52524757385254, "learning_rate": 3.6566323490123785e-07, "logits/chosen": -1.995849370956421, "logits/rejected": -2.2262625694274902, "logps/chosen": -2.3244833946228027, "logps/rejected": -2.6942267417907715, "loss": 3.0486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.24483299255371, "rewards/margins": 3.697434663772583, "rewards/rejected": -26.942270278930664, "step": 18630 }, { "epoch": 0.628096666554316, "grad_norm": 25.172317504882812, "learning_rate": 3.6537993852567584e-07, "logits/chosen": -1.965527892112732, "logits/rejected": -2.0461528301239014, "logps/chosen": -2.907482147216797, "logps/rejected": -3.6528172492980957, "loss": 2.0917, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.074819564819336, "rewards/margins": 7.453348636627197, "rewards/rejected": -36.528175354003906, "step": 18635 }, { "epoch": 0.6282651926252991, "grad_norm": 25.730562210083008, "learning_rate": 3.6509668873681327e-07, "logits/chosen": -1.723894715309143, "logits/rejected": -1.7027992010116577, "logps/chosen": -2.7941901683807373, "logps/rejected": -2.780871868133545, "loss": 3.8698, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.941904067993164, "rewards/margins": -0.1331832855939865, "rewards/rejected": -27.8087215423584, "step": 18640 }, { "epoch": 0.6284337186962823, "grad_norm": 26.492856979370117, "learning_rate": 3.6481348563267176e-07, "logits/chosen": -1.3790489435195923, "logits/rejected": -1.5762332677841187, "logps/chosen": -1.9941978454589844, "logps/rejected": -2.02683687210083, "loss": 2.8477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.941980361938477, "rewards/margins": 0.3263886570930481, "rewards/rejected": -20.268369674682617, "step": 18645 }, { "epoch": 0.6286022447672655, "grad_norm": 21.514509201049805, "learning_rate": 3.6453032931125695e-07, "logits/chosen": -2.6226284503936768, "logits/rejected": -2.9686291217803955, "logps/chosen": -3.428541898727417, "logps/rejected": -4.568081855773926, "loss": 2.2685, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.28541946411133, "rewards/margins": 11.395398139953613, "rewards/rejected": -45.680816650390625, "step": 18650 }, { "epoch": 0.6287707708382487, "grad_norm": 40.166297912597656, "learning_rate": 3.642472198705576e-07, "logits/chosen": -1.676222562789917, "logits/rejected": -1.9952523708343506, "logps/chosen": -2.4934425354003906, "logps/rejected": -2.7475247383117676, "loss": 2.4238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.93442726135254, "rewards/margins": 2.540821075439453, "rewards/rejected": -27.47524642944336, "step": 18655 }, { "epoch": 0.6289392969092319, "grad_norm": 43.843589782714844, "learning_rate": 3.6396415740854715e-07, "logits/chosen": -1.8575794696807861, "logits/rejected": -2.393746852874756, "logps/chosen": -2.755718946456909, "logps/rejected": -3.4682717323303223, "loss": 2.7489, "rewards/accuracies": 0.5, "rewards/chosen": -27.55718994140625, "rewards/margins": 7.1255292892456055, "rewards/rejected": -34.682716369628906, "step": 18660 }, { "epoch": 0.6291078229802151, "grad_norm": 34.07551193237305, "learning_rate": 3.6368114202318234e-07, "logits/chosen": -2.294307231903076, "logits/rejected": -2.680905818939209, "logps/chosen": -2.486971139907837, "logps/rejected": -2.966942310333252, "loss": 1.8959, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.869714736938477, "rewards/margins": 4.799710750579834, "rewards/rejected": -29.669422149658203, "step": 18665 }, { "epoch": 0.6292763490511982, "grad_norm": 22.968944549560547, "learning_rate": 3.6339817381240336e-07, "logits/chosen": -1.6254408359527588, "logits/rejected": -1.8740613460540771, "logps/chosen": -2.6052937507629395, "logps/rejected": -3.0102014541625977, "loss": 1.7571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.05293846130371, "rewards/margins": 4.049075126647949, "rewards/rejected": -30.102014541625977, "step": 18670 }, { "epoch": 0.6294448751221814, "grad_norm": 39.4991455078125, "learning_rate": 3.631152528741345e-07, "logits/chosen": -1.8259556293487549, "logits/rejected": -1.862667441368103, "logps/chosen": -1.8470585346221924, "logps/rejected": -1.8689968585968018, "loss": 3.0458, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.470584869384766, "rewards/margins": 0.2193818986415863, "rewards/rejected": -18.68996810913086, "step": 18675 }, { "epoch": 0.6296134011931646, "grad_norm": 22.74688148498535, "learning_rate": 3.6283237930628354e-07, "logits/chosen": -1.3108330965042114, "logits/rejected": -1.2930123805999756, "logps/chosen": -2.7357821464538574, "logps/rejected": -2.361436367034912, "loss": 8.0913, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.357824325561523, "rewards/margins": -3.7434592247009277, "rewards/rejected": -23.614362716674805, "step": 18680 }, { "epoch": 0.6297819272641477, "grad_norm": 33.82335662841797, "learning_rate": 3.6254955320674215e-07, "logits/chosen": -1.7322124242782593, "logits/rejected": -2.1520094871520996, "logps/chosen": -2.2102978229522705, "logps/rejected": -3.8706984519958496, "loss": 1.6132, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.102977752685547, "rewards/margins": 16.604013442993164, "rewards/rejected": -38.706993103027344, "step": 18685 }, { "epoch": 0.629950453335131, "grad_norm": 39.25462341308594, "learning_rate": 3.6226677467338486e-07, "logits/chosen": -1.723928451538086, "logits/rejected": -1.8014103174209595, "logps/chosen": -2.4551825523376465, "logps/rejected": -2.642056941986084, "loss": 3.9859, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.551822662353516, "rewards/margins": 1.868748664855957, "rewards/rejected": -26.420568466186523, "step": 18690 }, { "epoch": 0.6301189794061142, "grad_norm": 40.322025299072266, "learning_rate": 3.6198404380407034e-07, "logits/chosen": -1.3129554986953735, "logits/rejected": -1.4795254468917847, "logps/chosen": -2.9755096435546875, "logps/rejected": -2.795775890350342, "loss": 6.003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.755096435546875, "rewards/margins": -1.7973363399505615, "rewards/rejected": -27.9577579498291, "step": 18695 }, { "epoch": 0.6302875054770973, "grad_norm": 14.616473197937012, "learning_rate": 3.617013606966408e-07, "logits/chosen": -1.6453460454940796, "logits/rejected": -2.410950183868408, "logps/chosen": -1.7156639099121094, "logps/rejected": -2.1873421669006348, "loss": 2.3051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.156639099121094, "rewards/margins": 4.716780185699463, "rewards/rejected": -21.8734188079834, "step": 18700 }, { "epoch": 0.6304560315480805, "grad_norm": 52.16621780395508, "learning_rate": 3.614187254489215e-07, "logits/chosen": -1.852903962135315, "logits/rejected": -1.9818837642669678, "logps/chosen": -2.1786704063415527, "logps/rejected": -2.2185847759246826, "loss": 3.7794, "rewards/accuracies": 0.5, "rewards/chosen": -21.786705017089844, "rewards/margins": 0.39914292097091675, "rewards/rejected": -22.185848236083984, "step": 18705 }, { "epoch": 0.6306245576190637, "grad_norm": 40.281700134277344, "learning_rate": 3.6113613815872136e-07, "logits/chosen": -1.6567414999008179, "logits/rejected": -1.8920471668243408, "logps/chosen": -2.4621388912200928, "logps/rejected": -2.8170342445373535, "loss": 3.8563, "rewards/accuracies": 0.5, "rewards/chosen": -24.621389389038086, "rewards/margins": 3.5489540100097656, "rewards/rejected": -28.17034339904785, "step": 18710 }, { "epoch": 0.6307930836900468, "grad_norm": 18.528535842895508, "learning_rate": 3.6085359892383293e-07, "logits/chosen": -1.451129674911499, "logits/rejected": -1.6147100925445557, "logps/chosen": -2.878284454345703, "logps/rejected": -3.1676900386810303, "loss": 3.541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.782846450805664, "rewards/margins": 2.8940553665161133, "rewards/rejected": -31.676898956298828, "step": 18715 }, { "epoch": 0.63096160976103, "grad_norm": 15.683385848999023, "learning_rate": 3.6057110784203174e-07, "logits/chosen": -1.6452653408050537, "logits/rejected": -1.8250606060028076, "logps/chosen": -1.8918319940567017, "logps/rejected": -2.104295492172241, "loss": 1.6575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.918319702148438, "rewards/margins": 2.124636173248291, "rewards/rejected": -21.04295539855957, "step": 18720 }, { "epoch": 0.6311301358320132, "grad_norm": 559.3909301757812, "learning_rate": 3.602886650110768e-07, "logits/chosen": -1.2559688091278076, "logits/rejected": -1.6382989883422852, "logps/chosen": -2.667348861694336, "logps/rejected": -2.693481922149658, "loss": 4.4701, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.67348861694336, "rewards/margins": 0.26132678985595703, "rewards/rejected": -26.934814453125, "step": 18725 }, { "epoch": 0.6312986619029964, "grad_norm": 26.204259872436523, "learning_rate": 3.600062705287105e-07, "logits/chosen": -1.9202476739883423, "logits/rejected": -2.0799126625061035, "logps/chosen": -2.1364963054656982, "logps/rejected": -2.337533473968506, "loss": 2.3524, "rewards/accuracies": 0.5, "rewards/chosen": -21.364961624145508, "rewards/margins": 2.01037335395813, "rewards/rejected": -23.375335693359375, "step": 18730 }, { "epoch": 0.6314671879739796, "grad_norm": 71.08409118652344, "learning_rate": 3.5972392449265854e-07, "logits/chosen": -1.4504081010818481, "logits/rejected": -1.4608997106552124, "logps/chosen": -2.9600231647491455, "logps/rejected": -2.9094111919403076, "loss": 3.7138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.600229263305664, "rewards/margins": -0.5061157941818237, "rewards/rejected": -29.0941104888916, "step": 18735 }, { "epoch": 0.6316357140449628, "grad_norm": 38.98657989501953, "learning_rate": 3.594416270006295e-07, "logits/chosen": -1.581471562385559, "logits/rejected": -1.865757942199707, "logps/chosen": -2.142603635787964, "logps/rejected": -2.3634285926818848, "loss": 2.3702, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.426036834716797, "rewards/margins": 2.2082512378692627, "rewards/rejected": -23.634288787841797, "step": 18740 }, { "epoch": 0.6318042401159459, "grad_norm": 39.91908264160156, "learning_rate": 3.591593781503156e-07, "logits/chosen": -1.7952802181243896, "logits/rejected": -1.8274444341659546, "logps/chosen": -2.224351167678833, "logps/rejected": -2.292336940765381, "loss": 2.7999, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.243511199951172, "rewards/margins": 0.6798585057258606, "rewards/rejected": -22.923370361328125, "step": 18745 }, { "epoch": 0.6319727661869291, "grad_norm": 66.18889617919922, "learning_rate": 3.58877178039392e-07, "logits/chosen": -1.6533877849578857, "logits/rejected": -1.7162796258926392, "logps/chosen": -2.54699969291687, "logps/rejected": -2.590531587600708, "loss": 3.4206, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.46999740600586, "rewards/margins": 0.43531855940818787, "rewards/rejected": -25.905315399169922, "step": 18750 }, { "epoch": 0.6321412922579123, "grad_norm": 30.72490119934082, "learning_rate": 3.5859502676551736e-07, "logits/chosen": -1.9574337005615234, "logits/rejected": -1.5460635423660278, "logps/chosen": -2.3032872676849365, "logps/rejected": -2.066685438156128, "loss": 6.0732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.032873153686523, "rewards/margins": -2.366018772125244, "rewards/rejected": -20.666854858398438, "step": 18755 }, { "epoch": 0.6323098183288954, "grad_norm": 27.569772720336914, "learning_rate": 3.583129244263325e-07, "logits/chosen": -1.2762352228164673, "logits/rejected": -1.673752784729004, "logps/chosen": -2.3343570232391357, "logps/rejected": -3.042248010635376, "loss": 1.5226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.343570709228516, "rewards/margins": 7.0789079666137695, "rewards/rejected": -30.422481536865234, "step": 18760 }, { "epoch": 0.6324783443998787, "grad_norm": 31.950868606567383, "learning_rate": 3.5803087111946226e-07, "logits/chosen": -0.9125463366508484, "logits/rejected": -1.0970439910888672, "logps/chosen": -2.523500919342041, "logps/rejected": -2.504364490509033, "loss": 3.4955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.23501205444336, "rewards/margins": -0.19136667251586914, "rewards/rejected": -25.043643951416016, "step": 18765 }, { "epoch": 0.6326468704708619, "grad_norm": 27.117694854736328, "learning_rate": 3.5774886694251426e-07, "logits/chosen": -1.55544912815094, "logits/rejected": -2.047374963760376, "logps/chosen": -2.2005863189697266, "logps/rejected": -2.595324754714966, "loss": 2.0615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.005863189697266, "rewards/margins": 3.947382688522339, "rewards/rejected": -25.9532470703125, "step": 18770 }, { "epoch": 0.632815396541845, "grad_norm": 32.97996520996094, "learning_rate": 3.574669119930789e-07, "logits/chosen": -1.7787446975708008, "logits/rejected": -2.044855833053589, "logps/chosen": -2.233386278152466, "logps/rejected": -2.2153115272521973, "loss": 3.3101, "rewards/accuracies": 0.5, "rewards/chosen": -22.3338623046875, "rewards/margins": -0.18074798583984375, "rewards/rejected": -22.153112411499023, "step": 18775 }, { "epoch": 0.6329839226128282, "grad_norm": 32.860652923583984, "learning_rate": 3.5718500636872983e-07, "logits/chosen": -1.2347030639648438, "logits/rejected": -1.4687144756317139, "logps/chosen": -2.5832676887512207, "logps/rejected": -2.9072065353393555, "loss": 2.0371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.83267593383789, "rewards/margins": 3.2393898963928223, "rewards/rejected": -29.072063446044922, "step": 18780 }, { "epoch": 0.6331524486838114, "grad_norm": 92.78843688964844, "learning_rate": 3.569031501670232e-07, "logits/chosen": -1.433812141418457, "logits/rejected": -1.804659128189087, "logps/chosen": -2.26090669631958, "logps/rejected": -2.6193065643310547, "loss": 2.3625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.609066009521484, "rewards/margins": 3.5839996337890625, "rewards/rejected": -26.193065643310547, "step": 18785 }, { "epoch": 0.6333209747547945, "grad_norm": 72.81118774414062, "learning_rate": 3.5662134348549867e-07, "logits/chosen": -1.4099839925765991, "logits/rejected": -1.9020986557006836, "logps/chosen": -2.07269549369812, "logps/rejected": -2.3570284843444824, "loss": 1.9019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.72695541381836, "rewards/margins": 2.8433303833007812, "rewards/rejected": -23.57028579711914, "step": 18790 }, { "epoch": 0.6334895008257777, "grad_norm": 74.02709197998047, "learning_rate": 3.563395864216781e-07, "logits/chosen": -1.6922391653060913, "logits/rejected": -1.9072993993759155, "logps/chosen": -2.716442823410034, "logps/rejected": -3.1500327587127686, "loss": 2.693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.1644287109375, "rewards/margins": 4.335899829864502, "rewards/rejected": -31.50032615661621, "step": 18795 }, { "epoch": 0.633658026896761, "grad_norm": 108.89360809326172, "learning_rate": 3.560578790730667e-07, "logits/chosen": -1.768334150314331, "logits/rejected": -1.6725940704345703, "logps/chosen": -2.8877758979797363, "logps/rejected": -2.924286365509033, "loss": 3.497, "rewards/accuracies": 0.5, "rewards/chosen": -28.877758026123047, "rewards/margins": 0.3651053309440613, "rewards/rejected": -29.24286460876465, "step": 18800 }, { "epoch": 0.633658026896761, "eval_logits/chosen": -2.071402072906494, "eval_logits/rejected": -2.2284653186798096, "eval_logps/chosen": -2.1819846630096436, "eval_logps/rejected": -2.3242621421813965, "eval_loss": 3.0252206325531006, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.819847106933594, "eval_rewards/margins": 1.42277193069458, "eval_rewards/rejected": -23.24262046813965, "eval_runtime": 12.8999, "eval_samples_per_second": 7.752, "eval_steps_per_second": 1.938, "step": 18800 }, { "epoch": 0.6338265529677442, "grad_norm": 41.892486572265625, "learning_rate": 3.557762215371525e-07, "logits/chosen": -1.066054105758667, "logits/rejected": -1.4403715133666992, "logps/chosen": -1.9089603424072266, "logps/rejected": -2.2967567443847656, "loss": 1.7666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.089603424072266, "rewards/margins": 3.877964496612549, "rewards/rejected": -22.96756935119629, "step": 18805 }, { "epoch": 0.6339950790387273, "grad_norm": 29.515609741210938, "learning_rate": 3.5549461391140557e-07, "logits/chosen": -1.927973747253418, "logits/rejected": -2.1799073219299316, "logps/chosen": -2.299041748046875, "logps/rejected": -2.4362006187438965, "loss": 3.3194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.99041748046875, "rewards/margins": 1.3715906143188477, "rewards/rejected": -24.36200714111328, "step": 18810 }, { "epoch": 0.6341636051097105, "grad_norm": 29.148569107055664, "learning_rate": 3.5521305629327953e-07, "logits/chosen": -1.8402526378631592, "logits/rejected": -1.7619283199310303, "logps/chosen": -1.8162330389022827, "logps/rejected": -2.185051679611206, "loss": 1.8324, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.16233253479004, "rewards/margins": 3.688185214996338, "rewards/rejected": -21.850515365600586, "step": 18815 }, { "epoch": 0.6343321311806936, "grad_norm": 54.641075134277344, "learning_rate": 3.549315487802103e-07, "logits/chosen": -0.9678794741630554, "logits/rejected": -0.9939224123954773, "logps/chosen": -2.24064302444458, "logps/rejected": -2.242785930633545, "loss": 3.2498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.406429290771484, "rewards/margins": 0.021430587396025658, "rewards/rejected": -22.427860260009766, "step": 18820 }, { "epoch": 0.6345006572516768, "grad_norm": 15.241156578063965, "learning_rate": 3.546500914696168e-07, "logits/chosen": -1.4445557594299316, "logits/rejected": -1.633917212486267, "logps/chosen": -2.0695130825042725, "logps/rejected": -2.4762346744537354, "loss": 1.6071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.695133209228516, "rewards/margins": 4.067215442657471, "rewards/rejected": -24.762348175048828, "step": 18825 }, { "epoch": 0.63466918332266, "grad_norm": 31.20334815979004, "learning_rate": 3.543686844588999e-07, "logits/chosen": -1.7547988891601562, "logits/rejected": -2.0018696784973145, "logps/chosen": -2.5072829723358154, "logps/rejected": -3.3072426319122314, "loss": 2.0993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.072830200195312, "rewards/margins": 7.999594211578369, "rewards/rejected": -33.072425842285156, "step": 18830 }, { "epoch": 0.6348377093936431, "grad_norm": 38.99212646484375, "learning_rate": 3.540873278454436e-07, "logits/chosen": -1.6647943258285522, "logits/rejected": -2.0520567893981934, "logps/chosen": -2.5589611530303955, "logps/rejected": -2.987727403640747, "loss": 3.0522, "rewards/accuracies": 0.5, "rewards/chosen": -25.589611053466797, "rewards/margins": 4.287664413452148, "rewards/rejected": -29.877277374267578, "step": 18835 }, { "epoch": 0.6350062354646264, "grad_norm": 17.255659103393555, "learning_rate": 3.5380602172661454e-07, "logits/chosen": -1.7852544784545898, "logits/rejected": -1.8239473104476929, "logps/chosen": -2.2291624546051025, "logps/rejected": -2.2721869945526123, "loss": 3.1657, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.291624069213867, "rewards/margins": 0.43024587631225586, "rewards/rejected": -22.721872329711914, "step": 18840 }, { "epoch": 0.6351747615356096, "grad_norm": 43.60550308227539, "learning_rate": 3.535247661997616e-07, "logits/chosen": -1.6584608554840088, "logits/rejected": -1.7699321508407593, "logps/chosen": -2.3036136627197266, "logps/rejected": -2.459486484527588, "loss": 4.5349, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.036136627197266, "rewards/margins": 1.5587265491485596, "rewards/rejected": -24.594863891601562, "step": 18845 }, { "epoch": 0.6353432876065928, "grad_norm": 22.359338760375977, "learning_rate": 3.53243561362216e-07, "logits/chosen": -1.2959634065628052, "logits/rejected": -1.7488740682601929, "logps/chosen": -1.9807817935943604, "logps/rejected": -2.1481728553771973, "loss": 2.394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.807817459106445, "rewards/margins": 1.673910140991211, "rewards/rejected": -21.481727600097656, "step": 18850 }, { "epoch": 0.6355118136775759, "grad_norm": 18.181415557861328, "learning_rate": 3.529624073112918e-07, "logits/chosen": -1.4957122802734375, "logits/rejected": -1.6004817485809326, "logps/chosen": -2.4971108436584473, "logps/rejected": -2.80725359916687, "loss": 1.7375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.971107482910156, "rewards/margins": 3.101428270339966, "rewards/rejected": -28.07253646850586, "step": 18855 }, { "epoch": 0.6356803397485591, "grad_norm": 22.70155143737793, "learning_rate": 3.526813041442855e-07, "logits/chosen": -1.983559012413025, "logits/rejected": -2.0745460987091064, "logps/chosen": -1.7819048166275024, "logps/rejected": -2.053633213043213, "loss": 2.3117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.819047927856445, "rewards/margins": 2.717284679412842, "rewards/rejected": -20.536331176757812, "step": 18860 }, { "epoch": 0.6358488658195423, "grad_norm": 47.017906188964844, "learning_rate": 3.524002519584757e-07, "logits/chosen": -1.9625564813613892, "logits/rejected": -2.141477108001709, "logps/chosen": -1.758323073387146, "logps/rejected": -1.9290144443511963, "loss": 2.6992, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.58323097229004, "rewards/margins": 1.706911325454712, "rewards/rejected": -19.290143966674805, "step": 18865 }, { "epoch": 0.6360173918905254, "grad_norm": 29.950515747070312, "learning_rate": 3.5211925085112347e-07, "logits/chosen": -1.0737838745117188, "logits/rejected": -1.1902484893798828, "logps/chosen": -2.2974486351013184, "logps/rejected": -2.5132625102996826, "loss": 3.3409, "rewards/accuracies": 0.5, "rewards/chosen": -22.974483489990234, "rewards/margins": 2.15814208984375, "rewards/rejected": -25.132625579833984, "step": 18870 }, { "epoch": 0.6361859179615087, "grad_norm": 20.321805953979492, "learning_rate": 3.518383009194724e-07, "logits/chosen": -1.7747859954833984, "logits/rejected": -1.9187867641448975, "logps/chosen": -2.972046375274658, "logps/rejected": -2.7977707386016846, "loss": 6.132, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.7204647064209, "rewards/margins": -1.7427597045898438, "rewards/rejected": -27.977706909179688, "step": 18875 }, { "epoch": 0.6363544440324919, "grad_norm": 22.178987503051758, "learning_rate": 3.5155740226074793e-07, "logits/chosen": -2.397124767303467, "logits/rejected": -2.566413402557373, "logps/chosen": -2.435213565826416, "logps/rejected": -2.9507038593292236, "loss": 2.6944, "rewards/accuracies": 0.5, "rewards/chosen": -24.35213851928711, "rewards/margins": 5.154902458190918, "rewards/rejected": -29.50704002380371, "step": 18880 }, { "epoch": 0.636522970103475, "grad_norm": 14.269989013671875, "learning_rate": 3.512765549721581e-07, "logits/chosen": -1.5212559700012207, "logits/rejected": -1.7017109394073486, "logps/chosen": -1.7232303619384766, "logps/rejected": -1.922149658203125, "loss": 2.4591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.232303619384766, "rewards/margins": 1.989192247390747, "rewards/rejected": -19.221494674682617, "step": 18885 }, { "epoch": 0.6366914961744582, "grad_norm": 23.027902603149414, "learning_rate": 3.5099575915089307e-07, "logits/chosen": -1.8852980136871338, "logits/rejected": -2.2309741973876953, "logps/chosen": -2.5858306884765625, "logps/rejected": -2.9226760864257812, "loss": 2.2128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.858306884765625, "rewards/margins": 3.3684539794921875, "rewards/rejected": -29.226760864257812, "step": 18890 }, { "epoch": 0.6368600222454414, "grad_norm": 57.29917907714844, "learning_rate": 3.507150148941255e-07, "logits/chosen": -1.6230862140655518, "logits/rejected": -2.040215015411377, "logps/chosen": -1.9775832891464233, "logps/rejected": -2.1819052696228027, "loss": 2.2303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.775833129882812, "rewards/margins": 2.0432209968566895, "rewards/rejected": -21.819053649902344, "step": 18895 }, { "epoch": 0.6370285483164245, "grad_norm": 62.11034393310547, "learning_rate": 3.5043432229900946e-07, "logits/chosen": -1.6436758041381836, "logits/rejected": -2.031874179840088, "logps/chosen": -1.860642671585083, "logps/rejected": -1.9398397207260132, "loss": 2.9126, "rewards/accuracies": 0.5, "rewards/chosen": -18.606426239013672, "rewards/margins": 0.7919692993164062, "rewards/rejected": -19.398395538330078, "step": 18900 }, { "epoch": 0.6371970743874077, "grad_norm": 253.6606903076172, "learning_rate": 3.5015368146268186e-07, "logits/chosen": -1.4768943786621094, "logits/rejected": -1.5061800479888916, "logps/chosen": -2.92598295211792, "logps/rejected": -3.0826478004455566, "loss": 3.2916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.25983238220215, "rewards/margins": 1.566645860671997, "rewards/rejected": -30.82647705078125, "step": 18905 }, { "epoch": 0.637365600458391, "grad_norm": 38.190616607666016, "learning_rate": 3.498730924822616e-07, "logits/chosen": -1.4217101335525513, "logits/rejected": -1.5524682998657227, "logps/chosen": -2.2179980278015137, "logps/rejected": -2.49385404586792, "loss": 2.591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.179981231689453, "rewards/margins": 2.7585608959198, "rewards/rejected": -24.938539505004883, "step": 18910 }, { "epoch": 0.6375341265293741, "grad_norm": 14.970929145812988, "learning_rate": 3.4959255545484907e-07, "logits/chosen": -1.656989336013794, "logits/rejected": -1.7032983303070068, "logps/chosen": -1.9883617162704468, "logps/rejected": -2.0293631553649902, "loss": 3.119, "rewards/accuracies": 0.5, "rewards/chosen": -19.883617401123047, "rewards/margins": 0.41001588106155396, "rewards/rejected": -20.29363441467285, "step": 18915 }, { "epoch": 0.6377026526003573, "grad_norm": 106.11602020263672, "learning_rate": 3.4931207047752725e-07, "logits/chosen": -1.71733820438385, "logits/rejected": -1.701205849647522, "logps/chosen": -2.496513843536377, "logps/rejected": -2.5461325645446777, "loss": 2.7739, "rewards/accuracies": 0.5, "rewards/chosen": -24.965137481689453, "rewards/margins": 0.4961865544319153, "rewards/rejected": -25.461322784423828, "step": 18920 }, { "epoch": 0.6378711786713405, "grad_norm": 25.053218841552734, "learning_rate": 3.49031637647361e-07, "logits/chosen": -1.6383533477783203, "logits/rejected": -2.0752038955688477, "logps/chosen": -2.378575563430786, "logps/rejected": -2.8636727333068848, "loss": 2.3228, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.785757064819336, "rewards/margins": 4.850972652435303, "rewards/rejected": -28.636730194091797, "step": 18925 }, { "epoch": 0.6380397047423236, "grad_norm": 26.07686996459961, "learning_rate": 3.487512570613971e-07, "logits/chosen": -1.9094693660736084, "logits/rejected": -2.106689453125, "logps/chosen": -2.4322314262390137, "logps/rejected": -3.324227809906006, "loss": 1.8016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.322317123413086, "rewards/margins": 8.919961929321289, "rewards/rejected": -33.242279052734375, "step": 18930 }, { "epoch": 0.6382082308133068, "grad_norm": 19.347030639648438, "learning_rate": 3.484709288166641e-07, "logits/chosen": -1.3109431266784668, "logits/rejected": -1.3125630617141724, "logps/chosen": -2.019881010055542, "logps/rejected": -2.2754335403442383, "loss": 2.3485, "rewards/accuracies": 0.5, "rewards/chosen": -20.198810577392578, "rewards/margins": 2.555525302886963, "rewards/rejected": -22.754335403442383, "step": 18935 }, { "epoch": 0.63837675688429, "grad_norm": 24.105995178222656, "learning_rate": 3.481906530101726e-07, "logits/chosen": -1.6839663982391357, "logits/rejected": -1.6865516901016235, "logps/chosen": -3.2025482654571533, "logps/rejected": -3.766237735748291, "loss": 2.4375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -32.02547836303711, "rewards/margins": 5.636894226074219, "rewards/rejected": -37.66237258911133, "step": 18940 }, { "epoch": 0.6385452829552731, "grad_norm": 30.606367111206055, "learning_rate": 3.4791042973891524e-07, "logits/chosen": -2.0236551761627197, "logits/rejected": -2.570330858230591, "logps/chosen": -3.1450514793395996, "logps/rejected": -4.160052299499512, "loss": 1.957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.450511932373047, "rewards/margins": 10.15001106262207, "rewards/rejected": -41.60052490234375, "step": 18945 }, { "epoch": 0.6387138090262564, "grad_norm": 49.75942611694336, "learning_rate": 3.476302590998659e-07, "logits/chosen": -2.274714469909668, "logits/rejected": -2.318877935409546, "logps/chosen": -2.3748977184295654, "logps/rejected": -2.564833164215088, "loss": 3.0908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.74897575378418, "rewards/margins": 1.899355173110962, "rewards/rejected": -25.648334503173828, "step": 18950 }, { "epoch": 0.6388823350972396, "grad_norm": 31.032297134399414, "learning_rate": 3.4735014118998073e-07, "logits/chosen": -1.6689374446868896, "logits/rejected": -1.5681211948394775, "logps/chosen": -2.400958299636841, "logps/rejected": -2.3238704204559326, "loss": 4.0307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.009584426879883, "rewards/margins": -0.7708789110183716, "rewards/rejected": -23.238704681396484, "step": 18955 }, { "epoch": 0.6390508611682227, "grad_norm": 91.0964584350586, "learning_rate": 3.4707007610619777e-07, "logits/chosen": -1.6875574588775635, "logits/rejected": -1.9102585315704346, "logps/chosen": -2.6997759342193604, "logps/rejected": -3.2260589599609375, "loss": 2.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.997756958007812, "rewards/margins": 5.2628350257873535, "rewards/rejected": -32.26059341430664, "step": 18960 }, { "epoch": 0.6392193872392059, "grad_norm": 118.5235824584961, "learning_rate": 3.4679006394543606e-07, "logits/chosen": -1.7751781940460205, "logits/rejected": -1.6583400964736938, "logps/chosen": -2.2704877853393555, "logps/rejected": -2.3859403133392334, "loss": 4.3287, "rewards/accuracies": 0.5, "rewards/chosen": -22.704877853393555, "rewards/margins": 1.154525637626648, "rewards/rejected": -23.859405517578125, "step": 18965 }, { "epoch": 0.6393879133101891, "grad_norm": 38.85346221923828, "learning_rate": 3.4651010480459697e-07, "logits/chosen": -1.6581932306289673, "logits/rejected": -1.7150039672851562, "logps/chosen": -2.4210574626922607, "logps/rejected": -2.385411500930786, "loss": 3.6377, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.210575103759766, "rewards/margins": -0.3564607501029968, "rewards/rejected": -23.854114532470703, "step": 18970 }, { "epoch": 0.6395564393811722, "grad_norm": 38.01725769042969, "learning_rate": 3.462301987805634e-07, "logits/chosen": -1.2921791076660156, "logits/rejected": -1.5788782835006714, "logps/chosen": -2.8870227336883545, "logps/rejected": -3.223814010620117, "loss": 3.081, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.870227813720703, "rewards/margins": 3.3679096698760986, "rewards/rejected": -32.238136291503906, "step": 18975 }, { "epoch": 0.6397249654521554, "grad_norm": 83.44391632080078, "learning_rate": 3.459503459701998e-07, "logits/chosen": -1.817146897315979, "logits/rejected": -2.30041766166687, "logps/chosen": -2.1918838024139404, "logps/rejected": -2.373908281326294, "loss": 1.9129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.918842315673828, "rewards/margins": 1.8202422857284546, "rewards/rejected": -23.73908233642578, "step": 18980 }, { "epoch": 0.6398934915231387, "grad_norm": 35.81073760986328, "learning_rate": 3.456705464703521e-07, "logits/chosen": -1.5047098398208618, "logits/rejected": -1.621691107749939, "logps/chosen": -1.9369847774505615, "logps/rejected": -2.070429563522339, "loss": 2.3392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.369848251342773, "rewards/margins": 1.3344460725784302, "rewards/rejected": -20.704296112060547, "step": 18985 }, { "epoch": 0.6400620175941218, "grad_norm": 69.3550796508789, "learning_rate": 3.4539080037784783e-07, "logits/chosen": -1.557337999343872, "logits/rejected": -1.6153488159179688, "logps/chosen": -2.2617194652557373, "logps/rejected": -2.4085395336151123, "loss": 3.6134, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.617197036743164, "rewards/margins": 1.468200922012329, "rewards/rejected": -24.085397720336914, "step": 18990 }, { "epoch": 0.640230543665105, "grad_norm": 36.558536529541016, "learning_rate": 3.451111077894963e-07, "logits/chosen": -1.361604928970337, "logits/rejected": -1.8662503957748413, "logps/chosen": -2.5303022861480713, "logps/rejected": -2.9746086597442627, "loss": 2.3299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.303020477294922, "rewards/margins": 4.443061828613281, "rewards/rejected": -29.746084213256836, "step": 18995 }, { "epoch": 0.6403990697360882, "grad_norm": 42.665504455566406, "learning_rate": 3.448314688020879e-07, "logits/chosen": -1.1075172424316406, "logits/rejected": -0.9852533340454102, "logps/chosen": -2.483982801437378, "logps/rejected": -2.329929828643799, "loss": 4.841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.839826583862305, "rewards/margins": -1.540529489517212, "rewards/rejected": -23.299297332763672, "step": 19000 }, { "epoch": 0.6405675958070713, "grad_norm": 19.328079223632812, "learning_rate": 3.445518835123948e-07, "logits/chosen": -1.4079978466033936, "logits/rejected": -1.3210872411727905, "logps/chosen": -2.058558225631714, "logps/rejected": -1.71990168094635, "loss": 6.4632, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.585582733154297, "rewards/margins": -3.386566638946533, "rewards/rejected": -17.199016571044922, "step": 19005 }, { "epoch": 0.6407361218780545, "grad_norm": 41.33991622924805, "learning_rate": 3.442723520171703e-07, "logits/chosen": -2.0147464275360107, "logits/rejected": -1.8899329900741577, "logps/chosen": -2.5618722438812256, "logps/rejected": -2.6130149364471436, "loss": 3.2193, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.618722915649414, "rewards/margins": 0.5114272236824036, "rewards/rejected": -26.130151748657227, "step": 19010 }, { "epoch": 0.6409046479490377, "grad_norm": 17.656309127807617, "learning_rate": 3.439928744131497e-07, "logits/chosen": -1.745766043663025, "logits/rejected": -1.926134467124939, "logps/chosen": -2.9451346397399902, "logps/rejected": -3.5164706707000732, "loss": 2.238, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.45134925842285, "rewards/margins": 5.713357925415039, "rewards/rejected": -35.164703369140625, "step": 19015 }, { "epoch": 0.6410731740200208, "grad_norm": 90.32007598876953, "learning_rate": 3.437134507970485e-07, "logits/chosen": -1.8085434436798096, "logits/rejected": -1.9374297857284546, "logps/chosen": -2.5853800773620605, "logps/rejected": -2.4945881366729736, "loss": 4.7094, "rewards/accuracies": 0.5, "rewards/chosen": -25.853801727294922, "rewards/margins": -0.9079216122627258, "rewards/rejected": -24.94588279724121, "step": 19020 }, { "epoch": 0.6412417000910041, "grad_norm": 20.479576110839844, "learning_rate": 3.4343408126556455e-07, "logits/chosen": -1.6717798709869385, "logits/rejected": -1.769441843032837, "logps/chosen": -1.9906914234161377, "logps/rejected": -2.4700753688812256, "loss": 1.6959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.90691375732422, "rewards/margins": 4.793841361999512, "rewards/rejected": -24.700756072998047, "step": 19025 }, { "epoch": 0.6414102261619873, "grad_norm": 35.265438079833984, "learning_rate": 3.4315476591537683e-07, "logits/chosen": -1.5250223875045776, "logits/rejected": -1.7313295602798462, "logps/chosen": -2.3894386291503906, "logps/rejected": -2.6715071201324463, "loss": 3.4896, "rewards/accuracies": 0.5, "rewards/chosen": -23.894384384155273, "rewards/margins": 2.8206865787506104, "rewards/rejected": -26.715068817138672, "step": 19030 }, { "epoch": 0.6415787522329705, "grad_norm": 69.1385269165039, "learning_rate": 3.4287550484314497e-07, "logits/chosen": -1.355473518371582, "logits/rejected": -1.6162292957305908, "logps/chosen": -2.0230188369750977, "logps/rejected": -2.365042209625244, "loss": 2.5705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.23019027709961, "rewards/margins": 3.420231580734253, "rewards/rejected": -23.650421142578125, "step": 19035 }, { "epoch": 0.6417472783039536, "grad_norm": 47.425376892089844, "learning_rate": 3.425962981455105e-07, "logits/chosen": -1.7746198177337646, "logits/rejected": -1.81686532497406, "logps/chosen": -2.368058681488037, "logps/rejected": -2.510368585586548, "loss": 2.9706, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.680587768554688, "rewards/margins": 1.423097848892212, "rewards/rejected": -25.10368537902832, "step": 19040 }, { "epoch": 0.6419158043749368, "grad_norm": 52.12995529174805, "learning_rate": 3.4231714591909573e-07, "logits/chosen": -1.6075427532196045, "logits/rejected": -1.6615245342254639, "logps/chosen": -1.8489784002304077, "logps/rejected": -1.945669412612915, "loss": 2.9096, "rewards/accuracies": 0.5, "rewards/chosen": -18.489784240722656, "rewards/margins": 0.9669092893600464, "rewards/rejected": -19.456695556640625, "step": 19045 }, { "epoch": 0.64208433044592, "grad_norm": 27.588619232177734, "learning_rate": 3.420380482605045e-07, "logits/chosen": -1.9126943349838257, "logits/rejected": -1.8093080520629883, "logps/chosen": -2.4399096965789795, "logps/rejected": -3.0001025199890137, "loss": 2.9082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.399097442626953, "rewards/margins": 5.6019287109375, "rewards/rejected": -30.001026153564453, "step": 19050 }, { "epoch": 0.6422528565169031, "grad_norm": 90.55980682373047, "learning_rate": 3.417590052663211e-07, "logits/chosen": -2.1148598194122314, "logits/rejected": -2.1183550357818604, "logps/chosen": -2.115551710128784, "logps/rejected": -2.486673593521118, "loss": 2.0638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.155513763427734, "rewards/margins": 3.7112205028533936, "rewards/rejected": -24.866735458374023, "step": 19055 }, { "epoch": 0.6424213825878864, "grad_norm": 18.421188354492188, "learning_rate": 3.414800170331116e-07, "logits/chosen": -1.5110455751419067, "logits/rejected": -1.6215041875839233, "logps/chosen": -1.7555958032608032, "logps/rejected": -1.8505535125732422, "loss": 2.9856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.555957794189453, "rewards/margins": 0.9495766758918762, "rewards/rejected": -18.505535125732422, "step": 19060 }, { "epoch": 0.6425899086588696, "grad_norm": 24.81529998779297, "learning_rate": 3.4120108365742274e-07, "logits/chosen": -1.4183194637298584, "logits/rejected": -1.4200835227966309, "logps/chosen": -2.245863437652588, "logps/rejected": -2.2999191284179688, "loss": 2.7156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.458633422851562, "rewards/margins": 0.5405560731887817, "rewards/rejected": -22.999189376831055, "step": 19065 }, { "epoch": 0.6427584347298527, "grad_norm": 18.90172576904297, "learning_rate": 3.4092220523578244e-07, "logits/chosen": -1.430742859840393, "logits/rejected": -1.766953468322754, "logps/chosen": -2.45387601852417, "logps/rejected": -2.677732467651367, "loss": 2.8874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.538759231567383, "rewards/margins": 2.23856520652771, "rewards/rejected": -26.777324676513672, "step": 19070 }, { "epoch": 0.6429269608008359, "grad_norm": 93.47918701171875, "learning_rate": 3.406433818646993e-07, "logits/chosen": -1.84613835811615, "logits/rejected": -2.163917064666748, "logps/chosen": -2.7454466819763184, "logps/rejected": -3.1951904296875, "loss": 2.0116, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.4544677734375, "rewards/margins": 4.497437477111816, "rewards/rejected": -31.951904296875, "step": 19075 }, { "epoch": 0.643095486871819, "grad_norm": 256.0754699707031, "learning_rate": 3.403646136406636e-07, "logits/chosen": -1.865033745765686, "logits/rejected": -1.7697813510894775, "logps/chosen": -2.9825804233551025, "logps/rejected": -3.022819995880127, "loss": 4.0846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.8258056640625, "rewards/margins": 0.4023931622505188, "rewards/rejected": -30.228199005126953, "step": 19080 }, { "epoch": 0.6432640129428022, "grad_norm": 52.34347915649414, "learning_rate": 3.4008590066014564e-07, "logits/chosen": -1.9278236627578735, "logits/rejected": -2.2672438621520996, "logps/chosen": -3.0003390312194824, "logps/rejected": -3.2712783813476562, "loss": 3.44, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.003393173217773, "rewards/margins": 2.7093968391418457, "rewards/rejected": -32.712791442871094, "step": 19085 }, { "epoch": 0.6434325390137854, "grad_norm": 18.355587005615234, "learning_rate": 3.3980724301959704e-07, "logits/chosen": -1.8786852359771729, "logits/rejected": -2.1181275844573975, "logps/chosen": -1.9924042224884033, "logps/rejected": -2.353553295135498, "loss": 2.2139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.924043655395508, "rewards/margins": 3.611489772796631, "rewards/rejected": -23.535533905029297, "step": 19090 }, { "epoch": 0.6436010650847687, "grad_norm": 28.176570892333984, "learning_rate": 3.3952864081545017e-07, "logits/chosen": -1.975188970565796, "logits/rejected": -2.2873916625976562, "logps/chosen": -1.818352460861206, "logps/rejected": -2.311967134475708, "loss": 2.5429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.18352699279785, "rewards/margins": 4.936144828796387, "rewards/rejected": -23.119670867919922, "step": 19095 }, { "epoch": 0.6437695911557518, "grad_norm": 18.773603439331055, "learning_rate": 3.392500941441188e-07, "logits/chosen": -1.4203368425369263, "logits/rejected": -2.007047653198242, "logps/chosen": -2.115145206451416, "logps/rejected": -2.8709805011749268, "loss": 1.8211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.15144920349121, "rewards/margins": 7.558354377746582, "rewards/rejected": -28.709802627563477, "step": 19100 }, { "epoch": 0.643938117226735, "grad_norm": 31.940587997436523, "learning_rate": 3.389716031019962e-07, "logits/chosen": -0.8345028758049011, "logits/rejected": -0.8378894925117493, "logps/chosen": -2.056838274002075, "logps/rejected": -2.223741054534912, "loss": 3.4076, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.568382263183594, "rewards/margins": 1.6690285205841064, "rewards/rejected": -22.237411499023438, "step": 19105 }, { "epoch": 0.6441066432977182, "grad_norm": 10.822291374206543, "learning_rate": 3.3869316778545754e-07, "logits/chosen": -2.2176527976989746, "logits/rejected": -2.4825215339660645, "logps/chosen": -2.1370999813079834, "logps/rejected": -2.312455654144287, "loss": 2.7543, "rewards/accuracies": 0.5, "rewards/chosen": -21.37099838256836, "rewards/margins": 1.7535556554794312, "rewards/rejected": -23.124553680419922, "step": 19110 }, { "epoch": 0.6442751693687013, "grad_norm": 29.39801788330078, "learning_rate": 3.384147882908582e-07, "logits/chosen": -1.725995659828186, "logits/rejected": -2.026106834411621, "logps/chosen": -1.9508934020996094, "logps/rejected": -2.574272871017456, "loss": 1.511, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.508935928344727, "rewards/margins": 6.233796119689941, "rewards/rejected": -25.74273109436035, "step": 19115 }, { "epoch": 0.6444436954396845, "grad_norm": 27.90772247314453, "learning_rate": 3.3813646471453473e-07, "logits/chosen": -1.8614917993545532, "logits/rejected": -1.9006048440933228, "logps/chosen": -2.628399610519409, "logps/rejected": -2.999600410461426, "loss": 2.4409, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.28399658203125, "rewards/margins": 3.712005615234375, "rewards/rejected": -29.996002197265625, "step": 19120 }, { "epoch": 0.6446122215106677, "grad_norm": 21.94499397277832, "learning_rate": 3.3785819715280343e-07, "logits/chosen": -1.3559751510620117, "logits/rejected": -1.900770902633667, "logps/chosen": -2.3102097511291504, "logps/rejected": -2.738304853439331, "loss": 1.551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.10209846496582, "rewards/margins": 4.2809529304504395, "rewards/rejected": -27.3830509185791, "step": 19125 }, { "epoch": 0.6447807475816508, "grad_norm": 60.39374923706055, "learning_rate": 3.37579985701962e-07, "logits/chosen": -2.079019069671631, "logits/rejected": -2.3765835762023926, "logps/chosen": -1.9837615489959717, "logps/rejected": -2.609318494796753, "loss": 1.6134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.837615966796875, "rewards/margins": 6.255568504333496, "rewards/rejected": -26.093181610107422, "step": 19130 }, { "epoch": 0.6449492736526341, "grad_norm": 36.971927642822266, "learning_rate": 3.373018304582884e-07, "logits/chosen": -1.6064996719360352, "logits/rejected": -1.6553875207901, "logps/chosen": -2.9100847244262695, "logps/rejected": -2.7735984325408936, "loss": 4.6983, "rewards/accuracies": 0.5, "rewards/chosen": -29.100849151611328, "rewards/margins": -1.3648655414581299, "rewards/rejected": -27.73598289489746, "step": 19135 }, { "epoch": 0.6451177997236173, "grad_norm": 76.00817108154297, "learning_rate": 3.3702373151804124e-07, "logits/chosen": -1.5281689167022705, "logits/rejected": -1.7328161001205444, "logps/chosen": -2.793159246444702, "logps/rejected": -3.744621753692627, "loss": 2.5595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.931591033935547, "rewards/margins": 9.514626502990723, "rewards/rejected": -37.44622039794922, "step": 19140 }, { "epoch": 0.6452863257946004, "grad_norm": 36.74580764770508, "learning_rate": 3.367456889774597e-07, "logits/chosen": -2.091965675354004, "logits/rejected": -2.1845428943634033, "logps/chosen": -2.186652660369873, "logps/rejected": -2.5208215713500977, "loss": 2.1896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.86652946472168, "rewards/margins": 3.3416855335235596, "rewards/rejected": -25.208215713500977, "step": 19145 }, { "epoch": 0.6454548518655836, "grad_norm": 78.41200256347656, "learning_rate": 3.3646770293276303e-07, "logits/chosen": -1.791298270225525, "logits/rejected": -1.397806167602539, "logps/chosen": -2.0304665565490723, "logps/rejected": -2.0542068481445312, "loss": 3.3975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.30466651916504, "rewards/margins": 0.2374025285243988, "rewards/rejected": -20.542068481445312, "step": 19150 }, { "epoch": 0.6456233779365668, "grad_norm": 38.56045150756836, "learning_rate": 3.3618977348015166e-07, "logits/chosen": -1.378115177154541, "logits/rejected": -1.7747703790664673, "logps/chosen": -1.6445846557617188, "logps/rejected": -1.9529813528060913, "loss": 1.9329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.44584846496582, "rewards/margins": 3.0839669704437256, "rewards/rejected": -19.529815673828125, "step": 19155 }, { "epoch": 0.6457919040075499, "grad_norm": 39.86232376098633, "learning_rate": 3.3591190071580574e-07, "logits/chosen": -1.5744518041610718, "logits/rejected": -1.954493761062622, "logps/chosen": -1.6901108026504517, "logps/rejected": -2.289529323577881, "loss": 1.4474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.90110969543457, "rewards/margins": 5.9941816329956055, "rewards/rejected": -22.89529037475586, "step": 19160 }, { "epoch": 0.6459604300785331, "grad_norm": 20.42445945739746, "learning_rate": 3.356340847358861e-07, "logits/chosen": -1.4461759328842163, "logits/rejected": -1.6852458715438843, "logps/chosen": -3.33441424369812, "logps/rejected": -3.3620505332946777, "loss": 3.4576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.34414291381836, "rewards/margins": 0.27636146545410156, "rewards/rejected": -33.620506286621094, "step": 19165 }, { "epoch": 0.6461289561495164, "grad_norm": 25.505706787109375, "learning_rate": 3.353563256365342e-07, "logits/chosen": -1.3414726257324219, "logits/rejected": -1.9695316553115845, "logps/chosen": -1.883315086364746, "logps/rejected": -2.6686549186706543, "loss": 1.8392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.833148956298828, "rewards/margins": 7.853400230407715, "rewards/rejected": -26.686548233032227, "step": 19170 }, { "epoch": 0.6462974822204995, "grad_norm": 55.38985061645508, "learning_rate": 3.350786235138711e-07, "logits/chosen": -1.4357484579086304, "logits/rejected": -2.1208090782165527, "logps/chosen": -1.828082799911499, "logps/rejected": -2.372054100036621, "loss": 2.4524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.280826568603516, "rewards/margins": 5.4397125244140625, "rewards/rejected": -23.720539093017578, "step": 19175 }, { "epoch": 0.6464660082914827, "grad_norm": 37.25720977783203, "learning_rate": 3.348009784639988e-07, "logits/chosen": -1.5288727283477783, "logits/rejected": -2.281703233718872, "logps/chosen": -2.1740665435791016, "logps/rejected": -3.175799608230591, "loss": 2.1291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.740665435791016, "rewards/margins": 10.017329216003418, "rewards/rejected": -31.757991790771484, "step": 19180 }, { "epoch": 0.6466345343624659, "grad_norm": 32.539100646972656, "learning_rate": 3.3452339058299914e-07, "logits/chosen": -1.4856078624725342, "logits/rejected": -1.3350521326065063, "logps/chosen": -2.816305160522461, "logps/rejected": -2.017688751220703, "loss": 11.0178, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.16305160522461, "rewards/margins": -7.986166477203369, "rewards/rejected": -20.176883697509766, "step": 19185 }, { "epoch": 0.646803060433449, "grad_norm": 61.53752517700195, "learning_rate": 3.3424585996693483e-07, "logits/chosen": -1.2890323400497437, "logits/rejected": -1.2937710285186768, "logps/chosen": -2.464566707611084, "logps/rejected": -2.404175281524658, "loss": 6.1059, "rewards/accuracies": 0.5, "rewards/chosen": -24.64566993713379, "rewards/margins": -0.6039150357246399, "rewards/rejected": -24.0417537689209, "step": 19190 }, { "epoch": 0.6469715865044322, "grad_norm": 17.450136184692383, "learning_rate": 3.339683867118477e-07, "logits/chosen": -1.4439754486083984, "logits/rejected": -1.957381010055542, "logps/chosen": -3.2206637859344482, "logps/rejected": -3.8896000385284424, "loss": 1.3379, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.206642150878906, "rewards/margins": 6.689364433288574, "rewards/rejected": -38.89600372314453, "step": 19195 }, { "epoch": 0.6471401125754154, "grad_norm": 25.236295700073242, "learning_rate": 3.3369097091376045e-07, "logits/chosen": -1.224825143814087, "logits/rejected": -1.328102946281433, "logps/chosen": -2.1852471828460693, "logps/rejected": -2.2270686626434326, "loss": 3.18, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.852474212646484, "rewards/margins": 0.41821593046188354, "rewards/rejected": -22.270687103271484, "step": 19200 }, { "epoch": 0.6471401125754154, "eval_logits/chosen": -2.086235761642456, "eval_logits/rejected": -2.2461965084075928, "eval_logps/chosen": -2.1888742446899414, "eval_logps/rejected": -2.3300468921661377, "eval_loss": 3.0306525230407715, "eval_rewards/accuracies": 0.6200000047683716, "eval_rewards/chosen": -21.88874053955078, "eval_rewards/margins": 1.4117244482040405, "eval_rewards/rejected": -23.30046844482422, "eval_runtime": 12.8872, "eval_samples_per_second": 7.76, "eval_steps_per_second": 1.94, "step": 19200 }, { "epoch": 0.6473086386463986, "grad_norm": 30.1646785736084, "learning_rate": 3.3341361266867607e-07, "logits/chosen": -1.3984628915786743, "logits/rejected": -1.7773818969726562, "logps/chosen": -1.8794208765029907, "logps/rejected": -2.1332011222839355, "loss": 2.7122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.79421043395996, "rewards/margins": 2.5378024578094482, "rewards/rejected": -21.33201026916504, "step": 19205 }, { "epoch": 0.6474771647173818, "grad_norm": 9.875208854675293, "learning_rate": 3.33136312072577e-07, "logits/chosen": -1.5803048610687256, "logits/rejected": -1.8212120532989502, "logps/chosen": -1.9327504634857178, "logps/rejected": -1.9376983642578125, "loss": 3.4788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.327505111694336, "rewards/margins": 0.04947824403643608, "rewards/rejected": -19.376985549926758, "step": 19210 }, { "epoch": 0.647645690788365, "grad_norm": 22.80118751525879, "learning_rate": 3.3285906922142615e-07, "logits/chosen": -1.0638067722320557, "logits/rejected": -1.1562116146087646, "logps/chosen": -2.210082530975342, "logps/rejected": -2.3568356037139893, "loss": 1.9867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.100826263427734, "rewards/margins": 1.4675300121307373, "rewards/rejected": -23.568355560302734, "step": 19215 }, { "epoch": 0.6478142168593481, "grad_norm": 48.995601654052734, "learning_rate": 3.325818842111663e-07, "logits/chosen": -1.4287619590759277, "logits/rejected": -1.3261592388153076, "logps/chosen": -2.314736843109131, "logps/rejected": -2.5384268760681152, "loss": 3.3182, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.147369384765625, "rewards/margins": 2.2368996143341064, "rewards/rejected": -25.384267807006836, "step": 19220 }, { "epoch": 0.6479827429303313, "grad_norm": 5.7302073400933295e-05, "learning_rate": 3.3230475713772044e-07, "logits/chosen": -1.295851230621338, "logits/rejected": -1.7130746841430664, "logps/chosen": -2.718839406967163, "logps/rejected": -3.571959972381592, "loss": 2.3557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.188396453857422, "rewards/margins": 8.531206130981445, "rewards/rejected": -35.719600677490234, "step": 19225 }, { "epoch": 0.6481512690013145, "grad_norm": 45.10464096069336, "learning_rate": 3.3202768809699106e-07, "logits/chosen": -1.7688840627670288, "logits/rejected": -1.9455190896987915, "logps/chosen": -2.6712350845336914, "logps/rejected": -2.82537841796875, "loss": 2.169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.712352752685547, "rewards/margins": 1.541433334350586, "rewards/rejected": -28.2537841796875, "step": 19230 }, { "epoch": 0.6483197950722976, "grad_norm": 10.823365211486816, "learning_rate": 3.3175067718486103e-07, "logits/chosen": -1.8669370412826538, "logits/rejected": -2.175741672515869, "logps/chosen": -2.35686993598938, "logps/rejected": -3.105584144592285, "loss": 1.8598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.56869888305664, "rewards/margins": 7.487143516540527, "rewards/rejected": -31.05584144592285, "step": 19235 }, { "epoch": 0.6484883211432808, "grad_norm": 68.01353454589844, "learning_rate": 3.3147372449719304e-07, "logits/chosen": -1.0006816387176514, "logits/rejected": -1.3833884000778198, "logps/chosen": -2.709277629852295, "logps/rejected": -3.23040509223938, "loss": 1.3677, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.0927734375, "rewards/margins": 5.211273670196533, "rewards/rejected": -32.304046630859375, "step": 19240 }, { "epoch": 0.6486568472142641, "grad_norm": 25.578697204589844, "learning_rate": 3.311968301298291e-07, "logits/chosen": -1.8713645935058594, "logits/rejected": -1.9751352071762085, "logps/chosen": -2.5791544914245605, "logps/rejected": -2.552963972091675, "loss": 4.1632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.79154396057129, "rewards/margins": -0.2619051933288574, "rewards/rejected": -25.529638290405273, "step": 19245 }, { "epoch": 0.6488253732852473, "grad_norm": 18.423656463623047, "learning_rate": 3.3091999417859174e-07, "logits/chosen": -1.3926771879196167, "logits/rejected": -1.9350593090057373, "logps/chosen": -2.724782943725586, "logps/rejected": -3.3932957649230957, "loss": 1.9378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.24782943725586, "rewards/margins": 6.685128211975098, "rewards/rejected": -33.932960510253906, "step": 19250 }, { "epoch": 0.6489938993562304, "grad_norm": 27.966829299926758, "learning_rate": 3.306432167392829e-07, "logits/chosen": -1.7937549352645874, "logits/rejected": -1.6492153406143188, "logps/chosen": -2.1946823596954346, "logps/rejected": -2.183840751647949, "loss": 3.5634, "rewards/accuracies": 0.5, "rewards/chosen": -21.946823120117188, "rewards/margins": -0.10841550678014755, "rewards/rejected": -21.83840560913086, "step": 19255 }, { "epoch": 0.6491624254272136, "grad_norm": 32.890052795410156, "learning_rate": 3.3036649790768454e-07, "logits/chosen": -1.5309436321258545, "logits/rejected": -1.5625251531600952, "logps/chosen": -1.864524483680725, "logps/rejected": -1.8880809545516968, "loss": 2.9337, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.645244598388672, "rewards/margins": 0.2355668991804123, "rewards/rejected": -18.880809783935547, "step": 19260 }, { "epoch": 0.6493309514981968, "grad_norm": 15.722186088562012, "learning_rate": 3.300898377795578e-07, "logits/chosen": -1.6200459003448486, "logits/rejected": -1.7546627521514893, "logps/chosen": -2.278107166290283, "logps/rejected": -2.427480459213257, "loss": 3.1703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.78107261657715, "rewards/margins": 1.4937334060668945, "rewards/rejected": -24.27480697631836, "step": 19265 }, { "epoch": 0.6494994775691799, "grad_norm": 37.55821228027344, "learning_rate": 3.2981323645064397e-07, "logits/chosen": -1.5431878566741943, "logits/rejected": -1.5832126140594482, "logps/chosen": -1.9823036193847656, "logps/rejected": -2.360731840133667, "loss": 1.3685, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.82303810119629, "rewards/margins": 3.784282684326172, "rewards/rejected": -23.607318878173828, "step": 19270 }, { "epoch": 0.6496680036401631, "grad_norm": 33.9102783203125, "learning_rate": 3.2953669401666405e-07, "logits/chosen": -1.7654139995574951, "logits/rejected": -2.2431235313415527, "logps/chosen": -3.234036922454834, "logps/rejected": -3.5689334869384766, "loss": 4.0651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.340370178222656, "rewards/margins": 3.348963499069214, "rewards/rejected": -35.689334869384766, "step": 19275 }, { "epoch": 0.6498365297111464, "grad_norm": 16.531702041625977, "learning_rate": 3.292602105733182e-07, "logits/chosen": -1.793821096420288, "logits/rejected": -2.153486728668213, "logps/chosen": -2.093123435974121, "logps/rejected": -2.2675602436065674, "loss": 3.106, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.931236267089844, "rewards/margins": 1.7443650960922241, "rewards/rejected": -22.675600051879883, "step": 19280 }, { "epoch": 0.6500050557821295, "grad_norm": 29.99871063232422, "learning_rate": 3.2898378621628663e-07, "logits/chosen": -1.416312575340271, "logits/rejected": -1.304660677909851, "logps/chosen": -2.879304885864258, "logps/rejected": -2.3571860790252686, "loss": 8.2708, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.79305076599121, "rewards/margins": -5.221189975738525, "rewards/rejected": -23.57185935974121, "step": 19285 }, { "epoch": 0.6501735818531127, "grad_norm": 76.10118865966797, "learning_rate": 3.2870742104122885e-07, "logits/chosen": -1.12397038936615, "logits/rejected": -1.5121071338653564, "logps/chosen": -2.4723703861236572, "logps/rejected": -3.379451274871826, "loss": 1.7656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.723703384399414, "rewards/margins": 9.070809364318848, "rewards/rejected": -33.79451370239258, "step": 19290 }, { "epoch": 0.6503421079240959, "grad_norm": 0.30136585235595703, "learning_rate": 3.2843111514378406e-07, "logits/chosen": -1.2250374555587769, "logits/rejected": -1.424181342124939, "logps/chosen": -3.2688241004943848, "logps/rejected": -3.828373670578003, "loss": 3.0649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.68824005126953, "rewards/margins": 5.595494270324707, "rewards/rejected": -38.28373336791992, "step": 19295 }, { "epoch": 0.650510633995079, "grad_norm": 21.783082962036133, "learning_rate": 3.2815486861957073e-07, "logits/chosen": -1.941187858581543, "logits/rejected": -1.7948150634765625, "logps/chosen": -2.5010275840759277, "logps/rejected": -2.561431407928467, "loss": 3.7398, "rewards/accuracies": 0.5, "rewards/chosen": -25.010276794433594, "rewards/margins": 0.6040407419204712, "rewards/rejected": -25.614315032958984, "step": 19300 }, { "epoch": 0.6506791600660622, "grad_norm": 204.24371337890625, "learning_rate": 3.2787868156418697e-07, "logits/chosen": -1.5785753726959229, "logits/rejected": -1.9905147552490234, "logps/chosen": -2.33290696144104, "logps/rejected": -3.1187796592712402, "loss": 2.1347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.329069137573242, "rewards/margins": 7.858725070953369, "rewards/rejected": -31.18779945373535, "step": 19305 }, { "epoch": 0.6508476861370454, "grad_norm": 61.92455291748047, "learning_rate": 3.276025540732104e-07, "logits/chosen": -1.7628555297851562, "logits/rejected": -1.9517265558242798, "logps/chosen": -2.293363332748413, "logps/rejected": -2.615460157394409, "loss": 2.8875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.933635711669922, "rewards/margins": 3.2209677696228027, "rewards/rejected": -26.15460205078125, "step": 19310 }, { "epoch": 0.6510162122080286, "grad_norm": 28.794397354125977, "learning_rate": 3.273264862421974e-07, "logits/chosen": -1.9501034021377563, "logits/rejected": -2.310251235961914, "logps/chosen": -2.6617469787597656, "logps/rejected": -2.8715603351593018, "loss": 3.1637, "rewards/accuracies": 0.5, "rewards/chosen": -26.617467880249023, "rewards/margins": 2.0981335639953613, "rewards/rejected": -28.71560287475586, "step": 19315 }, { "epoch": 0.6511847382790118, "grad_norm": 55.15589904785156, "learning_rate": 3.270504781666845e-07, "logits/chosen": -1.3807077407836914, "logits/rejected": -1.6009747982025146, "logps/chosen": -2.3612663745880127, "logps/rejected": -2.615450382232666, "loss": 3.8383, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.6126651763916, "rewards/margins": 2.5418403148651123, "rewards/rejected": -26.154504776000977, "step": 19320 }, { "epoch": 0.651353264349995, "grad_norm": 28.586013793945312, "learning_rate": 3.267745299421871e-07, "logits/chosen": -1.59463369846344, "logits/rejected": -1.751935601234436, "logps/chosen": -2.453174114227295, "logps/rejected": -2.8395793437957764, "loss": 3.0157, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.531740188598633, "rewards/margins": 3.864053726196289, "rewards/rejected": -28.395793914794922, "step": 19325 }, { "epoch": 0.6515217904209781, "grad_norm": 17.22910499572754, "learning_rate": 3.2649864166420037e-07, "logits/chosen": -1.5936510562896729, "logits/rejected": -1.992539405822754, "logps/chosen": -2.317352771759033, "logps/rejected": -3.149284839630127, "loss": 1.7068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.17352867126465, "rewards/margins": 8.319319725036621, "rewards/rejected": -31.492849349975586, "step": 19330 }, { "epoch": 0.6516903164919613, "grad_norm": 46.21883010864258, "learning_rate": 3.262228134281978e-07, "logits/chosen": -1.5569813251495361, "logits/rejected": -1.4711658954620361, "logps/chosen": -3.3634109497070312, "logps/rejected": -3.3014659881591797, "loss": 3.8962, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.63411331176758, "rewards/margins": -0.6194513440132141, "rewards/rejected": -33.0146598815918, "step": 19335 }, { "epoch": 0.6518588425629445, "grad_norm": 29.301965713500977, "learning_rate": 3.2594704532963304e-07, "logits/chosen": -0.8339643478393555, "logits/rejected": -1.0341277122497559, "logps/chosen": -2.964996576309204, "logps/rejected": -3.104041576385498, "loss": 3.136, "rewards/accuracies": 0.5, "rewards/chosen": -29.649967193603516, "rewards/margins": 1.3904485702514648, "rewards/rejected": -31.040414810180664, "step": 19340 }, { "epoch": 0.6520273686339276, "grad_norm": 64.72969818115234, "learning_rate": 3.256713374639386e-07, "logits/chosen": -2.102191209793091, "logits/rejected": -2.0357930660247803, "logps/chosen": -2.5758633613586426, "logps/rejected": -2.6760551929473877, "loss": 2.8529, "rewards/accuracies": 0.5, "rewards/chosen": -25.758636474609375, "rewards/margins": 1.001916766166687, "rewards/rejected": -26.76055335998535, "step": 19345 }, { "epoch": 0.6521958947049108, "grad_norm": 35.41569900512695, "learning_rate": 3.253956899265258e-07, "logits/chosen": -1.9102615118026733, "logits/rejected": -2.1288914680480957, "logps/chosen": -2.2416248321533203, "logps/rejected": -2.2110986709594727, "loss": 3.7389, "rewards/accuracies": 0.5, "rewards/chosen": -22.416248321533203, "rewards/margins": -0.30526217818260193, "rewards/rejected": -22.110986709594727, "step": 19350 }, { "epoch": 0.6523644207758941, "grad_norm": 26.262041091918945, "learning_rate": 3.251201028127856e-07, "logits/chosen": -1.384140968322754, "logits/rejected": -1.5131553411483765, "logps/chosen": -1.9074770212173462, "logps/rejected": -1.9531618356704712, "loss": 3.0518, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.074771881103516, "rewards/margins": 0.4568476676940918, "rewards/rejected": -19.531620025634766, "step": 19355 }, { "epoch": 0.6525329468468772, "grad_norm": 18.94156837463379, "learning_rate": 3.248445762180878e-07, "logits/chosen": -2.1064388751983643, "logits/rejected": -2.721240520477295, "logps/chosen": -2.8058600425720215, "logps/rejected": -3.3915016651153564, "loss": 4.1979, "rewards/accuracies": 0.5, "rewards/chosen": -28.0585994720459, "rewards/margins": 5.856412887573242, "rewards/rejected": -33.915016174316406, "step": 19360 }, { "epoch": 0.6527014729178604, "grad_norm": 25.645156860351562, "learning_rate": 3.245691102377814e-07, "logits/chosen": -1.8133264780044556, "logits/rejected": -2.2402541637420654, "logps/chosen": -2.4837303161621094, "logps/rejected": -3.0722298622131348, "loss": 1.9654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.83730125427246, "rewards/margins": 5.884993076324463, "rewards/rejected": -30.7222957611084, "step": 19365 }, { "epoch": 0.6528699989888436, "grad_norm": 41.649349212646484, "learning_rate": 3.2429370496719425e-07, "logits/chosen": -1.8655116558074951, "logits/rejected": -1.7398531436920166, "logps/chosen": -2.2461180686950684, "logps/rejected": -2.325387477874756, "loss": 3.5284, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.461181640625, "rewards/margins": 0.7926937341690063, "rewards/rejected": -23.253875732421875, "step": 19370 }, { "epoch": 0.6530385250598267, "grad_norm": 38.51300048828125, "learning_rate": 3.2401836050163323e-07, "logits/chosen": -1.1705830097198486, "logits/rejected": -2.015150308609009, "logps/chosen": -2.3863024711608887, "logps/rejected": -3.1402745246887207, "loss": 2.464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.863025665283203, "rewards/margins": 7.5397186279296875, "rewards/rejected": -31.402746200561523, "step": 19375 }, { "epoch": 0.6532070511308099, "grad_norm": 61.92262649536133, "learning_rate": 3.2374307693638444e-07, "logits/chosen": -1.3954817056655884, "logits/rejected": -1.8168102502822876, "logps/chosen": -2.647247791290283, "logps/rejected": -3.5583183765411377, "loss": 3.0354, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.472476959228516, "rewards/margins": 9.11070442199707, "rewards/rejected": -35.58317947387695, "step": 19380 }, { "epoch": 0.6533755772017931, "grad_norm": 0.01797177828848362, "learning_rate": 3.234678543667122e-07, "logits/chosen": -1.431006669998169, "logits/rejected": -1.6097593307495117, "logps/chosen": -3.0501410961151123, "logps/rejected": -3.4992783069610596, "loss": 2.6282, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.501415252685547, "rewards/margins": 4.491368293762207, "rewards/rejected": -34.99277877807617, "step": 19385 }, { "epoch": 0.6535441032727763, "grad_norm": 55.26557159423828, "learning_rate": 3.2319269288786057e-07, "logits/chosen": -1.0813277959823608, "logits/rejected": -1.3444488048553467, "logps/chosen": -2.77297306060791, "logps/rejected": -3.128937244415283, "loss": 3.0345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.7297306060791, "rewards/margins": 3.559643268585205, "rewards/rejected": -31.28937339782715, "step": 19390 }, { "epoch": 0.6537126293437595, "grad_norm": 54.47288513183594, "learning_rate": 3.229175925950519e-07, "logits/chosen": -1.9032952785491943, "logits/rejected": -2.183346748352051, "logps/chosen": -2.3724331855773926, "logps/rejected": -2.8111534118652344, "loss": 3.0137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.72433090209961, "rewards/margins": 4.387204647064209, "rewards/rejected": -28.111536026000977, "step": 19395 }, { "epoch": 0.6538811554147427, "grad_norm": 190.4893798828125, "learning_rate": 3.226425535834879e-07, "logits/chosen": -1.549120306968689, "logits/rejected": -1.7237541675567627, "logps/chosen": -3.474799633026123, "logps/rejected": -3.365884780883789, "loss": 4.8594, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -34.74799728393555, "rewards/margins": -1.0891517400741577, "rewards/rejected": -33.65884780883789, "step": 19400 }, { "epoch": 0.6540496814857258, "grad_norm": 35.21147918701172, "learning_rate": 3.2236757594834834e-07, "logits/chosen": -1.985065221786499, "logits/rejected": -1.8145793676376343, "logps/chosen": -1.9986995458602905, "logps/rejected": -1.9813121557235718, "loss": 3.5778, "rewards/accuracies": 0.5, "rewards/chosen": -19.98699378967285, "rewards/margins": -0.17387238144874573, "rewards/rejected": -19.813121795654297, "step": 19405 }, { "epoch": 0.654218207556709, "grad_norm": 38.15839767456055, "learning_rate": 3.220926597847923e-07, "logits/chosen": -1.4903342723846436, "logits/rejected": -1.9833576679229736, "logps/chosen": -2.1800193786621094, "logps/rejected": -2.4973537921905518, "loss": 3.3582, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.800195693969727, "rewards/margins": 3.1733405590057373, "rewards/rejected": -24.97353744506836, "step": 19410 }, { "epoch": 0.6543867336276922, "grad_norm": 48.588050842285156, "learning_rate": 3.2181780518795765e-07, "logits/chosen": -1.302261471748352, "logits/rejected": -1.409800410270691, "logps/chosen": -2.139906406402588, "logps/rejected": -2.2833118438720703, "loss": 1.8871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.399066925048828, "rewards/margins": 1.4340507984161377, "rewards/rejected": -22.833118438720703, "step": 19415 }, { "epoch": 0.6545552596986753, "grad_norm": 0.09502626210451126, "learning_rate": 3.2154301225296033e-07, "logits/chosen": -1.509668231010437, "logits/rejected": -1.8733934164047241, "logps/chosen": -2.271191120147705, "logps/rejected": -3.536611557006836, "loss": 0.8511, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.7119083404541, "rewards/margins": 12.654206275939941, "rewards/rejected": -35.36611557006836, "step": 19420 }, { "epoch": 0.6547237857696586, "grad_norm": 26.83744239807129, "learning_rate": 3.212682810748955e-07, "logits/chosen": -1.461922287940979, "logits/rejected": -1.5134670734405518, "logps/chosen": -3.206855297088623, "logps/rejected": -3.866116762161255, "loss": 2.3329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.06855392456055, "rewards/margins": 6.592613220214844, "rewards/rejected": -38.661170959472656, "step": 19425 }, { "epoch": 0.6548923118406418, "grad_norm": 39.84358596801758, "learning_rate": 3.2099361174883683e-07, "logits/chosen": -1.2554028034210205, "logits/rejected": -1.2571032047271729, "logps/chosen": -2.093191623687744, "logps/rejected": -2.3733489513397217, "loss": 2.7174, "rewards/accuracies": 0.5, "rewards/chosen": -20.931913375854492, "rewards/margins": 2.8015756607055664, "rewards/rejected": -23.733489990234375, "step": 19430 }, { "epoch": 0.655060837911625, "grad_norm": 69.73009490966797, "learning_rate": 3.207190043698367e-07, "logits/chosen": -2.204716920852661, "logits/rejected": -2.272737741470337, "logps/chosen": -2.727163791656494, "logps/rejected": -2.85206937789917, "loss": 4.5027, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.271636962890625, "rewards/margins": 1.2490571737289429, "rewards/rejected": -28.52069664001465, "step": 19435 }, { "epoch": 0.6552293639826081, "grad_norm": 40.216697692871094, "learning_rate": 3.204444590329256e-07, "logits/chosen": -1.6425358057022095, "logits/rejected": -1.55009126663208, "logps/chosen": -2.5580005645751953, "logps/rejected": -2.6416759490966797, "loss": 4.1616, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.580005645751953, "rewards/margins": 0.8367554545402527, "rewards/rejected": -26.416757583618164, "step": 19440 }, { "epoch": 0.6553978900535913, "grad_norm": 17.301225662231445, "learning_rate": 3.2016997583311323e-07, "logits/chosen": -1.6697677373886108, "logits/rejected": -1.9882276058197021, "logps/chosen": -2.2435336112976074, "logps/rejected": -2.221325159072876, "loss": 3.8423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.43533706665039, "rewards/margins": -0.2220836579799652, "rewards/rejected": -22.213253021240234, "step": 19445 }, { "epoch": 0.6555664161245744, "grad_norm": 42.60205078125, "learning_rate": 3.1989555486538716e-07, "logits/chosen": -1.0797595977783203, "logits/rejected": -1.429962158203125, "logps/chosen": -2.0917086601257324, "logps/rejected": -2.2917962074279785, "loss": 2.1877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.91708755493164, "rewards/margins": 2.0008738040924072, "rewards/rejected": -22.9179630279541, "step": 19450 }, { "epoch": 0.6557349421955576, "grad_norm": 50.20454406738281, "learning_rate": 3.196211962247136e-07, "logits/chosen": -1.5836347341537476, "logits/rejected": -1.5361778736114502, "logps/chosen": -2.3230605125427246, "logps/rejected": -2.3607373237609863, "loss": 2.9018, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.230606079101562, "rewards/margins": 0.37676936388015747, "rewards/rejected": -23.607372283935547, "step": 19455 }, { "epoch": 0.6559034682665408, "grad_norm": 31.224794387817383, "learning_rate": 3.193469000060374e-07, "logits/chosen": -1.5918365716934204, "logits/rejected": -1.2786345481872559, "logps/chosen": -2.7298786640167236, "logps/rejected": -2.472374439239502, "loss": 6.6671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.298786163330078, "rewards/margins": -2.5750393867492676, "rewards/rejected": -24.723745346069336, "step": 19460 }, { "epoch": 0.6560719943375241, "grad_norm": 24.223526000976562, "learning_rate": 3.1907266630428165e-07, "logits/chosen": -1.9028345346450806, "logits/rejected": -2.3104963302612305, "logps/chosen": -3.585310697555542, "logps/rejected": -3.973658323287964, "loss": 3.0605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -35.85310745239258, "rewards/margins": 3.8834731578826904, "rewards/rejected": -39.73657989501953, "step": 19465 }, { "epoch": 0.6562405204085072, "grad_norm": 32.01412582397461, "learning_rate": 3.187984952143481e-07, "logits/chosen": -1.718444585800171, "logits/rejected": -1.6569147109985352, "logps/chosen": -3.2525010108947754, "logps/rejected": -3.6777184009552, "loss": 4.2341, "rewards/accuracies": 0.5, "rewards/chosen": -32.52500915527344, "rewards/margins": 4.252173900604248, "rewards/rejected": -36.777183532714844, "step": 19470 }, { "epoch": 0.6564090464794904, "grad_norm": 212.0823974609375, "learning_rate": 3.1852438683111603e-07, "logits/chosen": -1.5602266788482666, "logits/rejected": -1.596228837966919, "logps/chosen": -2.254568576812744, "logps/rejected": -2.299839496612549, "loss": 2.8802, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.54568099975586, "rewards/margins": 0.4527137875556946, "rewards/rejected": -22.998395919799805, "step": 19475 }, { "epoch": 0.6565775725504736, "grad_norm": 0.14562419056892395, "learning_rate": 3.1825034124944384e-07, "logits/chosen": -2.2185866832733154, "logits/rejected": -2.6798908710479736, "logps/chosen": -3.334772825241089, "logps/rejected": -3.952873706817627, "loss": 4.9938, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.34772872924805, "rewards/margins": 6.18100643157959, "rewards/rejected": -39.52873611450195, "step": 19480 }, { "epoch": 0.6567460986214567, "grad_norm": 17.60808563232422, "learning_rate": 3.179763585641681e-07, "logits/chosen": -1.7934448719024658, "logits/rejected": -1.743231177330017, "logps/chosen": -2.447374105453491, "logps/rejected": -2.9404006004333496, "loss": 1.6274, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.473739624023438, "rewards/margins": 4.930263519287109, "rewards/rejected": -29.404003143310547, "step": 19485 }, { "epoch": 0.6569146246924399, "grad_norm": 134.12086486816406, "learning_rate": 3.17702438870103e-07, "logits/chosen": -1.1001700162887573, "logits/rejected": -1.2772667407989502, "logps/chosen": -2.1697604656219482, "logps/rejected": -2.358163595199585, "loss": 2.444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.697607040405273, "rewards/margins": 1.884027123451233, "rewards/rejected": -23.581636428833008, "step": 19490 }, { "epoch": 0.657083150763423, "grad_norm": 26.62551498413086, "learning_rate": 3.174285822620416e-07, "logits/chosen": -1.5494797229766846, "logits/rejected": -1.5621850490570068, "logps/chosen": -2.2668309211730957, "logps/rejected": -2.612889528274536, "loss": 2.6577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.66830825805664, "rewards/margins": 3.4605870246887207, "rewards/rejected": -26.128894805908203, "step": 19495 }, { "epoch": 0.6572516768344063, "grad_norm": 39.074554443359375, "learning_rate": 3.1715478883475495e-07, "logits/chosen": -2.109074592590332, "logits/rejected": -2.007145404815674, "logps/chosen": -2.1038336753845215, "logps/rejected": -2.416748523712158, "loss": 2.8232, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.0383358001709, "rewards/margins": 3.129146099090576, "rewards/rejected": -24.167484283447266, "step": 19500 }, { "epoch": 0.6574202029053895, "grad_norm": 28.4183349609375, "learning_rate": 3.1688105868299193e-07, "logits/chosen": -1.3516571521759033, "logits/rejected": -1.6358953714370728, "logps/chosen": -2.264101505279541, "logps/rejected": -2.4743664264678955, "loss": 2.117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.64101219177246, "rewards/margins": 2.102649688720703, "rewards/rejected": -24.743663787841797, "step": 19505 }, { "epoch": 0.6575887289763727, "grad_norm": 20.12200164794922, "learning_rate": 3.1660739190148e-07, "logits/chosen": -1.7156364917755127, "logits/rejected": -1.9024250507354736, "logps/chosen": -2.3485307693481445, "logps/rejected": -3.1334264278411865, "loss": 2.453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.485305786132812, "rewards/margins": 7.848960876464844, "rewards/rejected": -31.334264755249023, "step": 19510 }, { "epoch": 0.6577572550473558, "grad_norm": 27.374279022216797, "learning_rate": 3.163337885849243e-07, "logits/chosen": -1.6456096172332764, "logits/rejected": -1.7837450504302979, "logps/chosen": -1.983033537864685, "logps/rejected": -1.9265260696411133, "loss": 3.7782, "rewards/accuracies": 0.5, "rewards/chosen": -19.830333709716797, "rewards/margins": -0.5650733709335327, "rewards/rejected": -19.265262603759766, "step": 19515 }, { "epoch": 0.657925781118339, "grad_norm": 109.21072387695312, "learning_rate": 3.160602488280083e-07, "logits/chosen": -1.409911036491394, "logits/rejected": -1.4950873851776123, "logps/chosen": -3.305628538131714, "logps/rejected": -3.413794755935669, "loss": 3.2585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.05628204345703, "rewards/margins": 1.0816668272018433, "rewards/rejected": -34.1379508972168, "step": 19520 }, { "epoch": 0.6580943071893222, "grad_norm": 25.108606338500977, "learning_rate": 3.1578677272539313e-07, "logits/chosen": -1.7747972011566162, "logits/rejected": -2.1871495246887207, "logps/chosen": -1.8980529308319092, "logps/rejected": -2.155996799468994, "loss": 1.6926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.98052978515625, "rewards/margins": 2.579437255859375, "rewards/rejected": -21.559967041015625, "step": 19525 }, { "epoch": 0.6582628332603053, "grad_norm": 55.106502532958984, "learning_rate": 3.155133603717182e-07, "logits/chosen": -1.4665242433547974, "logits/rejected": -1.449638843536377, "logps/chosen": -2.216754913330078, "logps/rejected": -2.4463374614715576, "loss": 2.5417, "rewards/accuracies": 0.5, "rewards/chosen": -22.16754913330078, "rewards/margins": 2.2958261966705322, "rewards/rejected": -24.463375091552734, "step": 19530 }, { "epoch": 0.6584313593312886, "grad_norm": 31.020858764648438, "learning_rate": 3.15240011861601e-07, "logits/chosen": -1.8325130939483643, "logits/rejected": -2.208038330078125, "logps/chosen": -1.9647200107574463, "logps/rejected": -2.779484987258911, "loss": 1.5215, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.647197723388672, "rewards/margins": 8.147647857666016, "rewards/rejected": -27.794849395751953, "step": 19535 }, { "epoch": 0.6585998854022718, "grad_norm": 52.70795822143555, "learning_rate": 3.1496672728963625e-07, "logits/chosen": -1.2615281343460083, "logits/rejected": -1.745319128036499, "logps/chosen": -2.6752312183380127, "logps/rejected": -2.9537129402160645, "loss": 2.1299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.7523136138916, "rewards/margins": 2.7848167419433594, "rewards/rejected": -29.537128448486328, "step": 19540 }, { "epoch": 0.6587684114732549, "grad_norm": 32.79944610595703, "learning_rate": 3.1469350675039706e-07, "logits/chosen": -1.7413183450698853, "logits/rejected": -1.7953037023544312, "logps/chosen": -2.31355881690979, "logps/rejected": -2.3361732959747314, "loss": 3.5028, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.13558578491211, "rewards/margins": 0.22614488005638123, "rewards/rejected": -23.361730575561523, "step": 19545 }, { "epoch": 0.6589369375442381, "grad_norm": 6.505997657775879, "learning_rate": 3.144203503384345e-07, "logits/chosen": -1.4171102046966553, "logits/rejected": -1.7000093460083008, "logps/chosen": -2.309157133102417, "logps/rejected": -2.585911512374878, "loss": 3.0135, "rewards/accuracies": 0.5, "rewards/chosen": -23.091571807861328, "rewards/margins": 2.7675464153289795, "rewards/rejected": -25.859119415283203, "step": 19550 }, { "epoch": 0.6591054636152213, "grad_norm": 18.754776000976562, "learning_rate": 3.1414725814827735e-07, "logits/chosen": -1.275989294052124, "logits/rejected": -1.4211461544036865, "logps/chosen": -2.091052293777466, "logps/rejected": -2.1982831954956055, "loss": 2.5494, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.9105224609375, "rewards/margins": 1.072310209274292, "rewards/rejected": -21.982831954956055, "step": 19555 }, { "epoch": 0.6592739896862044, "grad_norm": 46.30010223388672, "learning_rate": 3.138742302744316e-07, "logits/chosen": -1.2075470685958862, "logits/rejected": -1.5399049520492554, "logps/chosen": -2.470909595489502, "logps/rejected": -3.1514031887054443, "loss": 1.9756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.70909881591797, "rewards/margins": 6.804935455322266, "rewards/rejected": -31.514034271240234, "step": 19560 }, { "epoch": 0.6594425157571876, "grad_norm": 18.922914505004883, "learning_rate": 3.1360126681138164e-07, "logits/chosen": -2.1684887409210205, "logits/rejected": -2.0825414657592773, "logps/chosen": -1.9475510120391846, "logps/rejected": -2.1734211444854736, "loss": 1.7755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.475509643554688, "rewards/margins": 2.258702039718628, "rewards/rejected": -21.734210968017578, "step": 19565 }, { "epoch": 0.6596110418281708, "grad_norm": 44.96382141113281, "learning_rate": 3.1332836785358964e-07, "logits/chosen": -1.8090126514434814, "logits/rejected": -1.8098065853118896, "logps/chosen": -2.098357677459717, "logps/rejected": -2.2060370445251465, "loss": 2.5793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.98357582092285, "rewards/margins": 1.0767956972122192, "rewards/rejected": -22.06036949157715, "step": 19570 }, { "epoch": 0.659779567899154, "grad_norm": 24.288698196411133, "learning_rate": 3.130555334954949e-07, "logits/chosen": -1.448203206062317, "logits/rejected": -1.4640809297561646, "logps/chosen": -2.248979091644287, "logps/rejected": -2.4473307132720947, "loss": 2.1316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.489791870117188, "rewards/margins": 1.9835140705108643, "rewards/rejected": -24.47330665588379, "step": 19575 }, { "epoch": 0.6599480939701372, "grad_norm": 29.102195739746094, "learning_rate": 3.127827638315146e-07, "logits/chosen": -1.6922508478164673, "logits/rejected": -1.743398666381836, "logps/chosen": -2.742082118988037, "logps/rejected": -3.214224338531494, "loss": 2.7073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.420822143554688, "rewards/margins": 4.721421241760254, "rewards/rejected": -32.142242431640625, "step": 19580 }, { "epoch": 0.6601166200411204, "grad_norm": 69.89996337890625, "learning_rate": 3.1251005895604363e-07, "logits/chosen": -1.2156705856323242, "logits/rejected": -1.1699566841125488, "logps/chosen": -2.3415446281433105, "logps/rejected": -2.812303066253662, "loss": 3.7849, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.41544532775879, "rewards/margins": 4.707589149475098, "rewards/rejected": -28.123035430908203, "step": 19585 }, { "epoch": 0.6602851461121035, "grad_norm": 28.79608726501465, "learning_rate": 3.122374189634546e-07, "logits/chosen": -1.7804441452026367, "logits/rejected": -2.0019617080688477, "logps/chosen": -1.7656739950180054, "logps/rejected": -1.958898901939392, "loss": 2.2752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.656740188598633, "rewards/margins": 1.9322493076324463, "rewards/rejected": -19.5889892578125, "step": 19590 }, { "epoch": 0.6604536721830867, "grad_norm": 208.58645629882812, "learning_rate": 3.119648439480972e-07, "logits/chosen": -1.7226556539535522, "logits/rejected": -1.7559149265289307, "logps/chosen": -2.9009034633636475, "logps/rejected": -3.0689263343811035, "loss": 2.6959, "rewards/accuracies": 0.5, "rewards/chosen": -29.009033203125, "rewards/margins": 1.6802289485931396, "rewards/rejected": -30.689266204833984, "step": 19595 }, { "epoch": 0.6606221982540699, "grad_norm": 20.864017486572266, "learning_rate": 3.1169233400429907e-07, "logits/chosen": -1.8954029083251953, "logits/rejected": -2.0006442070007324, "logps/chosen": -2.0070273876190186, "logps/rejected": -2.6326088905334473, "loss": 1.9522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.070276260375977, "rewards/margins": 6.255814552307129, "rewards/rejected": -26.32608985900879, "step": 19600 }, { "epoch": 0.6606221982540699, "eval_logits/chosen": -2.0875275135040283, "eval_logits/rejected": -2.2476277351379395, "eval_logps/chosen": -2.1917946338653564, "eval_logps/rejected": -2.3321449756622314, "eval_loss": 3.0390613079071045, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -21.917945861816406, "eval_rewards/margins": 1.4035052061080933, "eval_rewards/rejected": -23.321449279785156, "eval_runtime": 12.9389, "eval_samples_per_second": 7.729, "eval_steps_per_second": 1.932, "step": 19600 }, { "epoch": 0.660790724325053, "grad_norm": 31.91147804260254, "learning_rate": 3.1141988922636525e-07, "logits/chosen": -1.6353679895401, "logits/rejected": -1.8300693035125732, "logps/chosen": -2.4113149642944336, "logps/rejected": -3.3126251697540283, "loss": 2.4313, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.113149642944336, "rewards/margins": 9.013103485107422, "rewards/rejected": -33.126251220703125, "step": 19605 }, { "epoch": 0.6609592503960363, "grad_norm": 30.281909942626953, "learning_rate": 3.1114750970857784e-07, "logits/chosen": -1.512751817703247, "logits/rejected": -2.048257827758789, "logps/chosen": -2.5183193683624268, "logps/rejected": -4.193761825561523, "loss": 0.8449, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.18319320678711, "rewards/margins": 16.754425048828125, "rewards/rejected": -41.93761444091797, "step": 19610 }, { "epoch": 0.6611277764670195, "grad_norm": 23.415830612182617, "learning_rate": 3.108751955451968e-07, "logits/chosen": -1.5418158769607544, "logits/rejected": -1.8583043813705444, "logps/chosen": -2.7758288383483887, "logps/rejected": -3.548363208770752, "loss": 1.8529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.758289337158203, "rewards/margins": 7.725339412689209, "rewards/rejected": -35.4836311340332, "step": 19615 }, { "epoch": 0.6612963025380026, "grad_norm": 21.576784133911133, "learning_rate": 3.106029468304594e-07, "logits/chosen": -1.6991370916366577, "logits/rejected": -1.7369697093963623, "logps/chosen": -2.20546293258667, "logps/rejected": -2.2247977256774902, "loss": 3.1269, "rewards/accuracies": 0.5, "rewards/chosen": -22.054630279541016, "rewards/margins": 0.1933467835187912, "rewards/rejected": -22.247976303100586, "step": 19620 }, { "epoch": 0.6614648286089858, "grad_norm": 208.43551635742188, "learning_rate": 3.1033076365858036e-07, "logits/chosen": -1.1789934635162354, "logits/rejected": -2.0805132389068604, "logps/chosen": -2.531059741973877, "logps/rejected": -3.4619078636169434, "loss": 2.0194, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.310596466064453, "rewards/margins": 9.308481216430664, "rewards/rejected": -34.61907958984375, "step": 19625 }, { "epoch": 0.661633354679969, "grad_norm": 1.221866488456726, "learning_rate": 3.100586461237511e-07, "logits/chosen": -1.6778274774551392, "logits/rejected": -1.9052091836929321, "logps/chosen": -2.725083827972412, "logps/rejected": -3.2458865642547607, "loss": 1.9867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.250839233398438, "rewards/margins": 5.208024024963379, "rewards/rejected": -32.4588623046875, "step": 19630 }, { "epoch": 0.6618018807509521, "grad_norm": 22.51280975341797, "learning_rate": 3.0978659432014103e-07, "logits/chosen": -1.9016478061676025, "logits/rejected": -2.3998100757598877, "logps/chosen": -3.3028228282928467, "logps/rejected": -3.3499350547790527, "loss": 6.1235, "rewards/accuracies": 0.5, "rewards/chosen": -33.02823257446289, "rewards/margins": 0.47111815214157104, "rewards/rejected": -33.49934768676758, "step": 19635 }, { "epoch": 0.6619704068219353, "grad_norm": 62.46411895751953, "learning_rate": 3.095146083418968e-07, "logits/chosen": -1.3130820989608765, "logits/rejected": -1.4224445819854736, "logps/chosen": -2.2050185203552246, "logps/rejected": -2.242039918899536, "loss": 4.3249, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.050186157226562, "rewards/margins": 0.3702128529548645, "rewards/rejected": -22.420398712158203, "step": 19640 }, { "epoch": 0.6621389328929186, "grad_norm": 55.873966217041016, "learning_rate": 3.092426882831416e-07, "logits/chosen": -1.7375984191894531, "logits/rejected": -1.5786654949188232, "logps/chosen": -2.8116352558135986, "logps/rejected": -2.9780755043029785, "loss": 3.8662, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.116352081298828, "rewards/margins": 1.664402961730957, "rewards/rejected": -29.780752182006836, "step": 19645 }, { "epoch": 0.6623074589639018, "grad_norm": 60.345523834228516, "learning_rate": 3.089708342379764e-07, "logits/chosen": -1.3604744672775269, "logits/rejected": -1.313848614692688, "logps/chosen": -2.6019206047058105, "logps/rejected": -2.614013910293579, "loss": 3.0137, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.01920509338379, "rewards/margins": 0.120935820043087, "rewards/rejected": -26.140140533447266, "step": 19650 }, { "epoch": 0.6624759850348849, "grad_norm": 47.997467041015625, "learning_rate": 3.086990463004792e-07, "logits/chosen": -1.4324225187301636, "logits/rejected": -1.6653724908828735, "logps/chosen": -1.972364068031311, "logps/rejected": -2.192746639251709, "loss": 2.0297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.723642349243164, "rewards/margins": 2.2038259506225586, "rewards/rejected": -21.927465438842773, "step": 19655 }, { "epoch": 0.6626445111058681, "grad_norm": 49.364105224609375, "learning_rate": 3.0842732456470527e-07, "logits/chosen": -1.995145559310913, "logits/rejected": -2.4126739501953125, "logps/chosen": -2.6052517890930176, "logps/rejected": -3.049837827682495, "loss": 1.9231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.052515029907227, "rewards/margins": 4.445866107940674, "rewards/rejected": -30.49837875366211, "step": 19660 }, { "epoch": 0.6628130371768512, "grad_norm": 68.35008239746094, "learning_rate": 3.0815566912468657e-07, "logits/chosen": -2.1833343505859375, "logits/rejected": -2.2756967544555664, "logps/chosen": -2.228520631790161, "logps/rejected": -2.616534471511841, "loss": 2.9025, "rewards/accuracies": 0.5, "rewards/chosen": -22.285205841064453, "rewards/margins": 3.8801398277282715, "rewards/rejected": -26.16534423828125, "step": 19665 }, { "epoch": 0.6629815632478344, "grad_norm": 23.333072662353516, "learning_rate": 3.0788408007443234e-07, "logits/chosen": -1.176478624343872, "logits/rejected": -1.5110712051391602, "logps/chosen": -1.7166290283203125, "logps/rejected": -1.8860689401626587, "loss": 1.9023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.166288375854492, "rewards/margins": 1.6943992376327515, "rewards/rejected": -18.86069107055664, "step": 19670 }, { "epoch": 0.6631500893188176, "grad_norm": 18.771587371826172, "learning_rate": 3.0761255750792923e-07, "logits/chosen": -1.8307679891586304, "logits/rejected": -2.152930736541748, "logps/chosen": -2.309884786605835, "logps/rejected": -2.8270676136016846, "loss": 2.0373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.098846435546875, "rewards/margins": 5.1718268394470215, "rewards/rejected": -28.270675659179688, "step": 19675 }, { "epoch": 0.6633186153898007, "grad_norm": 33.46798324584961, "learning_rate": 3.0734110151913995e-07, "logits/chosen": -1.5328623056411743, "logits/rejected": -1.9079945087432861, "logps/chosen": -2.2217116355895996, "logps/rejected": -2.8022282123565674, "loss": 1.8753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.21711540222168, "rewards/margins": 5.8051652908325195, "rewards/rejected": -28.02227783203125, "step": 19680 }, { "epoch": 0.663487141460784, "grad_norm": 31.7010498046875, "learning_rate": 3.0706971220200494e-07, "logits/chosen": -1.3515177965164185, "logits/rejected": -1.5968728065490723, "logps/chosen": -2.9291114807128906, "logps/rejected": -3.167548656463623, "loss": 2.6971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.291112899780273, "rewards/margins": 2.384373664855957, "rewards/rejected": -31.675487518310547, "step": 19685 }, { "epoch": 0.6636556675317672, "grad_norm": 19.04203987121582, "learning_rate": 3.0679838965044147e-07, "logits/chosen": -1.6338005065917969, "logits/rejected": -1.7416985034942627, "logps/chosen": -2.837009906768799, "logps/rejected": -3.1580300331115723, "loss": 4.8603, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.370098114013672, "rewards/margins": 3.2102017402648926, "rewards/rejected": -31.58030128479004, "step": 19690 }, { "epoch": 0.6638241936027504, "grad_norm": 17.882585525512695, "learning_rate": 3.065271339583436e-07, "logits/chosen": -1.746091604232788, "logits/rejected": -2.024017810821533, "logps/chosen": -1.873356580734253, "logps/rejected": -2.518157720565796, "loss": 1.3999, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.733566284179688, "rewards/margins": 6.4480085372924805, "rewards/rejected": -25.181575775146484, "step": 19695 }, { "epoch": 0.6639927196737335, "grad_norm": 48.590450286865234, "learning_rate": 3.06255945219582e-07, "logits/chosen": -1.691332221031189, "logits/rejected": -2.2088327407836914, "logps/chosen": -2.7080318927764893, "logps/rejected": -2.8886380195617676, "loss": 4.762, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.080318450927734, "rewards/margins": 1.8060623407363892, "rewards/rejected": -28.88637924194336, "step": 19700 }, { "epoch": 0.6641612457447167, "grad_norm": 215.69692993164062, "learning_rate": 3.0598482352800457e-07, "logits/chosen": -1.6511586904525757, "logits/rejected": -1.527772307395935, "logps/chosen": -4.019328594207764, "logps/rejected": -4.086058616638184, "loss": 3.9622, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -40.19328689575195, "rewards/margins": 0.6673009991645813, "rewards/rejected": -40.86058807373047, "step": 19705 }, { "epoch": 0.6643297718156999, "grad_norm": 116.02570343017578, "learning_rate": 3.0571376897743606e-07, "logits/chosen": -1.7304351329803467, "logits/rejected": -1.770742416381836, "logps/chosen": -2.940293312072754, "logps/rejected": -2.576066493988037, "loss": 7.1427, "rewards/accuracies": 0.5, "rewards/chosen": -29.40293312072754, "rewards/margins": -3.6422653198242188, "rewards/rejected": -25.760665893554688, "step": 19710 }, { "epoch": 0.664498297886683, "grad_norm": 19.49298858642578, "learning_rate": 3.0544278166167725e-07, "logits/chosen": -1.6837724447250366, "logits/rejected": -1.6870133876800537, "logps/chosen": -2.900510787963867, "logps/rejected": -3.3730030059814453, "loss": 1.0817, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.005102157592773, "rewards/margins": 4.724923133850098, "rewards/rejected": -33.73003005981445, "step": 19715 }, { "epoch": 0.6646668239576663, "grad_norm": 21.581035614013672, "learning_rate": 3.0517186167450647e-07, "logits/chosen": -1.40189528465271, "logits/rejected": -1.6113313436508179, "logps/chosen": -2.0576136112213135, "logps/rejected": -2.196462869644165, "loss": 2.172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.576135635375977, "rewards/margins": 1.388492465019226, "rewards/rejected": -21.964628219604492, "step": 19720 }, { "epoch": 0.6648353500286495, "grad_norm": 11.661018371582031, "learning_rate": 3.049010091096784e-07, "logits/chosen": -2.140958309173584, "logits/rejected": -2.1676430702209473, "logps/chosen": -1.8523032665252686, "logps/rejected": -2.0676932334899902, "loss": 3.0915, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.523033142089844, "rewards/margins": 2.153900384902954, "rewards/rejected": -20.67693519592285, "step": 19725 }, { "epoch": 0.6650038760996326, "grad_norm": 50.89949035644531, "learning_rate": 3.046302240609247e-07, "logits/chosen": -1.6583229303359985, "logits/rejected": -1.9790871143341064, "logps/chosen": -2.465359687805176, "logps/rejected": -2.7347371578216553, "loss": 2.9139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.653594970703125, "rewards/margins": 2.6937763690948486, "rewards/rejected": -27.347375869750977, "step": 19730 }, { "epoch": 0.6651724021706158, "grad_norm": 51.48941421508789, "learning_rate": 3.04359506621953e-07, "logits/chosen": -1.8139331340789795, "logits/rejected": -1.7735220193862915, "logps/chosen": -2.390505313873291, "logps/rejected": -2.332714319229126, "loss": 3.8516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.905052185058594, "rewards/margins": -0.5779077410697937, "rewards/rejected": -23.3271427154541, "step": 19735 }, { "epoch": 0.665340928241599, "grad_norm": 37.086307525634766, "learning_rate": 3.040888568864482e-07, "logits/chosen": -1.5597200393676758, "logits/rejected": -1.7371556758880615, "logps/chosen": -1.9877293109893799, "logps/rejected": -2.0962672233581543, "loss": 2.5319, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.877294540405273, "rewards/margins": 1.085381269454956, "rewards/rejected": -20.962675094604492, "step": 19740 }, { "epoch": 0.6655094543125821, "grad_norm": 13.57636833190918, "learning_rate": 3.038182749480716e-07, "logits/chosen": -1.6128686666488647, "logits/rejected": -1.851527214050293, "logps/chosen": -2.4245975017547607, "logps/rejected": -3.140558958053589, "loss": 1.9717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.2459774017334, "rewards/margins": 7.15961217880249, "rewards/rejected": -31.405590057373047, "step": 19745 }, { "epoch": 0.6656779803835653, "grad_norm": 50.33108139038086, "learning_rate": 3.035477609004606e-07, "logits/chosen": -2.2638416290283203, "logits/rejected": -2.2421774864196777, "logps/chosen": -2.322725772857666, "logps/rejected": -2.16920804977417, "loss": 4.5887, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.22725486755371, "rewards/margins": -1.5351752042770386, "rewards/rejected": -21.692081451416016, "step": 19750 }, { "epoch": 0.6658465064545486, "grad_norm": 47.44742202758789, "learning_rate": 3.0327731483722965e-07, "logits/chosen": -1.5761334896087646, "logits/rejected": -1.4156379699707031, "logps/chosen": -2.0456478595733643, "logps/rejected": -2.1322078704833984, "loss": 2.6839, "rewards/accuracies": 0.5, "rewards/chosen": -20.456480026245117, "rewards/margins": 0.8655961751937866, "rewards/rejected": -21.32207679748535, "step": 19755 }, { "epoch": 0.6660150325255317, "grad_norm": 135.6229705810547, "learning_rate": 3.030069368519694e-07, "logits/chosen": -1.7876793146133423, "logits/rejected": -2.074373245239258, "logps/chosen": -2.859711170196533, "logps/rejected": -3.498471736907959, "loss": 1.6566, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.59711265563965, "rewards/margins": 6.3876051902771, "rewards/rejected": -34.984718322753906, "step": 19760 }, { "epoch": 0.6661835585965149, "grad_norm": 0.10053889453411102, "learning_rate": 3.0273662703824737e-07, "logits/chosen": -1.8345619440078735, "logits/rejected": -1.8314402103424072, "logps/chosen": -2.1931605339050293, "logps/rejected": -2.520170211791992, "loss": 2.2617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.93160629272461, "rewards/margins": 3.27009654045105, "rewards/rejected": -25.201702117919922, "step": 19765 }, { "epoch": 0.6663520846674981, "grad_norm": 43.061500549316406, "learning_rate": 3.024663854896067e-07, "logits/chosen": -1.4732874631881714, "logits/rejected": -1.4747284650802612, "logps/chosen": -2.1525635719299316, "logps/rejected": -2.309382915496826, "loss": 3.044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.525634765625, "rewards/margins": 1.5681923627853394, "rewards/rejected": -23.093828201293945, "step": 19770 }, { "epoch": 0.6665206107384812, "grad_norm": 5.061534404754639, "learning_rate": 3.0219621229956735e-07, "logits/chosen": -1.9069154262542725, "logits/rejected": -2.5560789108276367, "logps/chosen": -1.9357631206512451, "logps/rejected": -2.6078197956085205, "loss": 1.6453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.357629776000977, "rewards/margins": 6.7205657958984375, "rewards/rejected": -26.078197479248047, "step": 19775 }, { "epoch": 0.6666891368094644, "grad_norm": 12.537591934204102, "learning_rate": 3.0192610756162606e-07, "logits/chosen": -1.9024620056152344, "logits/rejected": -2.025599241256714, "logps/chosen": -1.7322824001312256, "logps/rejected": -1.9820201396942139, "loss": 1.6466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.322824478149414, "rewards/margins": 2.4973764419555664, "rewards/rejected": -19.820201873779297, "step": 19780 }, { "epoch": 0.6668576628804476, "grad_norm": 23.594646453857422, "learning_rate": 3.0165607136925496e-07, "logits/chosen": -1.9728679656982422, "logits/rejected": -1.887183427810669, "logps/chosen": -2.2772622108459473, "logps/rejected": -2.125269889831543, "loss": 4.8542, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.772621154785156, "rewards/margins": -1.519923448562622, "rewards/rejected": -21.252696990966797, "step": 19785 }, { "epoch": 0.6670261889514307, "grad_norm": 31.816057205200195, "learning_rate": 3.013861038159031e-07, "logits/chosen": -1.525428056716919, "logits/rejected": -1.2926287651062012, "logps/chosen": -2.7483131885528564, "logps/rejected": -3.068668842315674, "loss": 2.3267, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.483129501342773, "rewards/margins": 3.203556537628174, "rewards/rejected": -30.686687469482422, "step": 19790 }, { "epoch": 0.667194715022414, "grad_norm": 19.48959732055664, "learning_rate": 3.0111620499499555e-07, "logits/chosen": -1.4664475917816162, "logits/rejected": -1.9527008533477783, "logps/chosen": -2.288588523864746, "logps/rejected": -2.7013888359069824, "loss": 3.1565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.88588523864746, "rewards/margins": 4.128004550933838, "rewards/rejected": -27.01388931274414, "step": 19795 }, { "epoch": 0.6673632410933972, "grad_norm": 43.20466995239258, "learning_rate": 3.008463749999339e-07, "logits/chosen": -1.6210861206054688, "logits/rejected": -1.7428817749023438, "logps/chosen": -2.5191102027893066, "logps/rejected": -2.910252571105957, "loss": 2.0976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.19110107421875, "rewards/margins": 3.911424160003662, "rewards/rejected": -29.102527618408203, "step": 19800 }, { "epoch": 0.6675317671643803, "grad_norm": 72.56021118164062, "learning_rate": 3.005766139240955e-07, "logits/chosen": -2.2898874282836914, "logits/rejected": -2.5569610595703125, "logps/chosen": -2.3946192264556885, "logps/rejected": -2.511489152908325, "loss": 3.3885, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.946191787719727, "rewards/margins": 1.1687005758285522, "rewards/rejected": -25.11488914489746, "step": 19805 }, { "epoch": 0.6677002932353635, "grad_norm": 32.11191940307617, "learning_rate": 3.0030692186083405e-07, "logits/chosen": -2.098188877105713, "logits/rejected": -2.284949541091919, "logps/chosen": -2.6981968879699707, "logps/rejected": -2.643249988555908, "loss": 3.9236, "rewards/accuracies": 0.5, "rewards/chosen": -26.981969833374023, "rewards/margins": -0.5494720339775085, "rewards/rejected": -26.432498931884766, "step": 19810 }, { "epoch": 0.6678688193063467, "grad_norm": 13.116031646728516, "learning_rate": 3.000372989034794e-07, "logits/chosen": -1.3008522987365723, "logits/rejected": -1.7032623291015625, "logps/chosen": -2.2843902111053467, "logps/rejected": -2.548161745071411, "loss": 2.0715, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.843904495239258, "rewards/margins": 2.637714147567749, "rewards/rejected": -25.481616973876953, "step": 19815 }, { "epoch": 0.6680373453773298, "grad_norm": 35.49582290649414, "learning_rate": 2.997677451453373e-07, "logits/chosen": -1.7446882724761963, "logits/rejected": -1.876037836074829, "logps/chosen": -2.699276924133301, "logps/rejected": -2.7415661811828613, "loss": 2.7519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.99277114868164, "rewards/margins": 0.4228929579257965, "rewards/rejected": -27.415660858154297, "step": 19820 }, { "epoch": 0.668205871448313, "grad_norm": 12.453043937683105, "learning_rate": 2.9949826067968977e-07, "logits/chosen": -1.5163322687149048, "logits/rejected": -1.9970420598983765, "logps/chosen": -2.086151599884033, "logps/rejected": -2.855980157852173, "loss": 1.0368, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.861515045166016, "rewards/margins": 7.698286533355713, "rewards/rejected": -28.559803009033203, "step": 19825 }, { "epoch": 0.6683743975192963, "grad_norm": 51.53904342651367, "learning_rate": 2.992288455997947e-07, "logits/chosen": -1.2252388000488281, "logits/rejected": -1.4193706512451172, "logps/chosen": -2.433497667312622, "logps/rejected": -2.7764475345611572, "loss": 2.9424, "rewards/accuracies": 0.5, "rewards/chosen": -24.334978103637695, "rewards/margins": 3.4294967651367188, "rewards/rejected": -27.764474868774414, "step": 19830 }, { "epoch": 0.6685429235902794, "grad_norm": 62.04422378540039, "learning_rate": 2.989594999988864e-07, "logits/chosen": -2.034576177597046, "logits/rejected": -2.177302837371826, "logps/chosen": -2.8381359577178955, "logps/rejected": -3.122462034225464, "loss": 2.5747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.381359100341797, "rewards/margins": 2.8432610034942627, "rewards/rejected": -31.224618911743164, "step": 19835 }, { "epoch": 0.6687114496612626, "grad_norm": 32.734657287597656, "learning_rate": 2.9869022397017417e-07, "logits/chosen": -2.064215898513794, "logits/rejected": -2.021044969558716, "logps/chosen": -2.555880546569824, "logps/rejected": -2.721607208251953, "loss": 2.5477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.55880355834961, "rewards/margins": 1.657266616821289, "rewards/rejected": -27.2160701751709, "step": 19840 }, { "epoch": 0.6688799757322458, "grad_norm": 33.11255645751953, "learning_rate": 2.9842101760684413e-07, "logits/chosen": -1.5624778270721436, "logits/rejected": -1.7367737293243408, "logps/chosen": -2.5347161293029785, "logps/rejected": -2.9798059463500977, "loss": 1.5732, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.3471622467041, "rewards/margins": 4.45089864730835, "rewards/rejected": -29.79806137084961, "step": 19845 }, { "epoch": 0.669048501803229, "grad_norm": 23.20467758178711, "learning_rate": 2.9815188100205824e-07, "logits/chosen": -1.9864904880523682, "logits/rejected": -2.2545325756073, "logps/chosen": -2.7851338386535645, "logps/rejected": -3.3120033740997314, "loss": 1.7605, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.851337432861328, "rewards/margins": 5.2686967849731445, "rewards/rejected": -33.120033264160156, "step": 19850 }, { "epoch": 0.6692170278742121, "grad_norm": 27.634586334228516, "learning_rate": 2.978828142489537e-07, "logits/chosen": -1.9006726741790771, "logits/rejected": -1.9433460235595703, "logps/chosen": -2.817883014678955, "logps/rejected": -3.0188820362091064, "loss": 2.6239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.178829193115234, "rewards/margins": 2.0099892616271973, "rewards/rejected": -30.18882179260254, "step": 19855 }, { "epoch": 0.6693855539451953, "grad_norm": 17.127777099609375, "learning_rate": 2.9761381744064396e-07, "logits/chosen": -2.049617290496826, "logits/rejected": -2.2871267795562744, "logps/chosen": -1.9521675109863281, "logps/rejected": -2.246138095855713, "loss": 1.4065, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.52167320251465, "rewards/margins": 2.939706325531006, "rewards/rejected": -22.461380004882812, "step": 19860 }, { "epoch": 0.6695540800161786, "grad_norm": 56.237342834472656, "learning_rate": 2.9734489067021836e-07, "logits/chosen": -1.6401517391204834, "logits/rejected": -1.9211409091949463, "logps/chosen": -2.022190570831299, "logps/rejected": -2.0948269367218018, "loss": 2.5922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.221904754638672, "rewards/margins": 0.7263639569282532, "rewards/rejected": -20.94826889038086, "step": 19865 }, { "epoch": 0.6697226060871617, "grad_norm": 43.48698425292969, "learning_rate": 2.9707603403074187e-07, "logits/chosen": -2.499911308288574, "logits/rejected": -2.7677135467529297, "logps/chosen": -3.4701619148254395, "logps/rejected": -3.94292950630188, "loss": 2.2561, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.70161819458008, "rewards/margins": 4.72767448425293, "rewards/rejected": -39.429290771484375, "step": 19870 }, { "epoch": 0.6698911321581449, "grad_norm": 0.6691756844520569, "learning_rate": 2.9680724761525513e-07, "logits/chosen": -1.852129578590393, "logits/rejected": -1.8271507024765015, "logps/chosen": -3.7556660175323486, "logps/rejected": -4.138113498687744, "loss": 3.4457, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -37.55665969848633, "rewards/margins": 3.8244755268096924, "rewards/rejected": -41.381134033203125, "step": 19875 }, { "epoch": 0.670059658229128, "grad_norm": 58.062313079833984, "learning_rate": 2.9653853151677443e-07, "logits/chosen": -1.591839075088501, "logits/rejected": -2.509289264678955, "logps/chosen": -2.5534749031066895, "logps/rejected": -4.027466773986816, "loss": 2.536, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.534751892089844, "rewards/margins": 14.73991870880127, "rewards/rejected": -40.2746696472168, "step": 19880 }, { "epoch": 0.6702281843001112, "grad_norm": 23.800106048583984, "learning_rate": 2.9626988582829197e-07, "logits/chosen": -1.6431554555892944, "logits/rejected": -2.029618740081787, "logps/chosen": -2.032714605331421, "logps/rejected": -2.3359105587005615, "loss": 2.3751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.327144622802734, "rewards/margins": 3.0319602489471436, "rewards/rejected": -23.359106063842773, "step": 19885 }, { "epoch": 0.6703967103710944, "grad_norm": 18.587617874145508, "learning_rate": 2.9600131064277534e-07, "logits/chosen": -1.9746696949005127, "logits/rejected": -2.062826633453369, "logps/chosen": -2.09183406829834, "logps/rejected": -2.246274471282959, "loss": 2.9238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.9183406829834, "rewards/margins": 1.5444018840789795, "rewards/rejected": -22.46274185180664, "step": 19890 }, { "epoch": 0.6705652364420776, "grad_norm": 18.86264419555664, "learning_rate": 2.957328060531678e-07, "logits/chosen": -2.121891498565674, "logits/rejected": -1.968225121498108, "logps/chosen": -2.1642074584960938, "logps/rejected": -2.0462965965270996, "loss": 4.5091, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.642074584960938, "rewards/margins": -1.179107666015625, "rewards/rejected": -20.462966918945312, "step": 19895 }, { "epoch": 0.6707337625130607, "grad_norm": 46.66054153442383, "learning_rate": 2.9546437215238827e-07, "logits/chosen": -1.9920495748519897, "logits/rejected": -2.144864320755005, "logps/chosen": -1.730360984802246, "logps/rejected": -1.6745617389678955, "loss": 3.6781, "rewards/accuracies": 0.5, "rewards/chosen": -17.30360984802246, "rewards/margins": -0.5579929351806641, "rewards/rejected": -16.745616912841797, "step": 19900 }, { "epoch": 0.670902288584044, "grad_norm": 33.12007141113281, "learning_rate": 2.951960090333314e-07, "logits/chosen": -1.970428228378296, "logits/rejected": -1.892177939414978, "logps/chosen": -2.675799608230591, "logps/rejected": -2.7696533203125, "loss": 3.9091, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.75799560546875, "rewards/margins": 0.9385347366333008, "rewards/rejected": -27.696533203125, "step": 19905 }, { "epoch": 0.6710708146550272, "grad_norm": 25.728837966918945, "learning_rate": 2.9492771678886675e-07, "logits/chosen": -1.4254019260406494, "logits/rejected": -1.8275810480117798, "logps/chosen": -2.298034429550171, "logps/rejected": -3.1815507411956787, "loss": 2.7914, "rewards/accuracies": 0.5, "rewards/chosen": -22.980342864990234, "rewards/margins": 8.835161209106445, "rewards/rejected": -31.815505981445312, "step": 19910 }, { "epoch": 0.6712393407260103, "grad_norm": 27.136716842651367, "learning_rate": 2.9465949551183966e-07, "logits/chosen": -2.2184853553771973, "logits/rejected": -2.2532405853271484, "logps/chosen": -3.0618937015533447, "logps/rejected": -3.311782121658325, "loss": 3.4152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.61893653869629, "rewards/margins": 2.4988853931427, "rewards/rejected": -33.117820739746094, "step": 19915 }, { "epoch": 0.6714078667969935, "grad_norm": 25.29293441772461, "learning_rate": 2.9439134529507127e-07, "logits/chosen": -1.8177769184112549, "logits/rejected": -2.0616250038146973, "logps/chosen": -2.5643460750579834, "logps/rejected": -2.580888509750366, "loss": 3.0727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.643463134765625, "rewards/margins": 0.16542330384254456, "rewards/rejected": -25.808883666992188, "step": 19920 }, { "epoch": 0.6715763928679767, "grad_norm": 24.884119033813477, "learning_rate": 2.9412326623135755e-07, "logits/chosen": -1.701251745223999, "logits/rejected": -2.263763904571533, "logps/chosen": -2.0247585773468018, "logps/rejected": -3.072613000869751, "loss": 1.6354, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.24758529663086, "rewards/margins": 10.478544235229492, "rewards/rejected": -30.726131439208984, "step": 19925 }, { "epoch": 0.6717449189389598, "grad_norm": 27.353504180908203, "learning_rate": 2.9385525841347004e-07, "logits/chosen": -1.5897279977798462, "logits/rejected": -1.5412429571151733, "logps/chosen": -1.9933135509490967, "logps/rejected": -2.19027042388916, "loss": 1.8097, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.933135986328125, "rewards/margins": 1.9695701599121094, "rewards/rejected": -21.9027042388916, "step": 19930 }, { "epoch": 0.671913445009943, "grad_norm": 36.17189025878906, "learning_rate": 2.935873219341558e-07, "logits/chosen": -1.9658607244491577, "logits/rejected": -2.1116909980773926, "logps/chosen": -2.512491464614868, "logps/rejected": -2.5998706817626953, "loss": 2.6017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.124914169311523, "rewards/margins": 0.873791515827179, "rewards/rejected": -25.998706817626953, "step": 19935 }, { "epoch": 0.6720819710809263, "grad_norm": 99.07889556884766, "learning_rate": 2.9331945688613736e-07, "logits/chosen": -1.8254836797714233, "logits/rejected": -2.083385944366455, "logps/chosen": -2.940991163253784, "logps/rejected": -3.8571383953094482, "loss": 1.2032, "rewards/accuracies": 1.0, "rewards/chosen": -29.409912109375, "rewards/margins": 9.161473274230957, "rewards/rejected": -38.571388244628906, "step": 19940 }, { "epoch": 0.6722504971519094, "grad_norm": 22.57912254333496, "learning_rate": 2.9305166336211187e-07, "logits/chosen": -1.6052013635635376, "logits/rejected": -1.779809594154358, "logps/chosen": -1.822079062461853, "logps/rejected": -2.1713473796844482, "loss": 1.7866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.22079086303711, "rewards/margins": 3.4926846027374268, "rewards/rejected": -21.71347427368164, "step": 19945 }, { "epoch": 0.6724190232228926, "grad_norm": 41.973777770996094, "learning_rate": 2.9278394145475214e-07, "logits/chosen": -1.430271863937378, "logits/rejected": -1.4732592105865479, "logps/chosen": -2.8167452812194824, "logps/rejected": -3.216135025024414, "loss": 1.7358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.167449951171875, "rewards/margins": 3.993901014328003, "rewards/rejected": -32.161354064941406, "step": 19950 }, { "epoch": 0.6725875492938758, "grad_norm": 87.2608413696289, "learning_rate": 2.925162912567064e-07, "logits/chosen": -2.058218240737915, "logits/rejected": -2.2669763565063477, "logps/chosen": -2.5060524940490723, "logps/rejected": -2.9572110176086426, "loss": 3.5245, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.06052589416504, "rewards/margins": 4.511587142944336, "rewards/rejected": -29.572113037109375, "step": 19955 }, { "epoch": 0.6727560753648589, "grad_norm": 22.192533493041992, "learning_rate": 2.922487128605977e-07, "logits/chosen": -1.6927998065948486, "logits/rejected": -1.853003740310669, "logps/chosen": -2.4295566082000732, "logps/rejected": -2.8034374713897705, "loss": 3.1124, "rewards/accuracies": 0.5, "rewards/chosen": -24.29556655883789, "rewards/margins": 3.7388081550598145, "rewards/rejected": -28.034374237060547, "step": 19960 }, { "epoch": 0.6729246014358421, "grad_norm": 25.506179809570312, "learning_rate": 2.9198120635902437e-07, "logits/chosen": -2.323963165283203, "logits/rejected": -2.281276226043701, "logps/chosen": -1.8685518503189087, "logps/rejected": -2.068497896194458, "loss": 2.338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.685518264770508, "rewards/margins": 1.9994609355926514, "rewards/rejected": -20.684978485107422, "step": 19965 }, { "epoch": 0.6730931275068253, "grad_norm": 40.8975830078125, "learning_rate": 2.917137718445598e-07, "logits/chosen": -1.6007277965545654, "logits/rejected": -1.6875572204589844, "logps/chosen": -2.6180386543273926, "logps/rejected": -2.8947513103485107, "loss": 3.3363, "rewards/accuracies": 0.5, "rewards/chosen": -26.180383682250977, "rewards/margins": 2.7671265602111816, "rewards/rejected": -28.947513580322266, "step": 19970 }, { "epoch": 0.6732616535778085, "grad_norm": 36.157615661621094, "learning_rate": 2.9144640940975296e-07, "logits/chosen": -1.505743384361267, "logits/rejected": -1.3464056253433228, "logps/chosen": -2.9090559482574463, "logps/rejected": -2.8138246536254883, "loss": 4.7922, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -29.090557098388672, "rewards/margins": -0.9523128271102905, "rewards/rejected": -28.138248443603516, "step": 19975 }, { "epoch": 0.6734301796487917, "grad_norm": 44.5861930847168, "learning_rate": 2.911791191471269e-07, "logits/chosen": -1.250292181968689, "logits/rejected": -1.2156884670257568, "logps/chosen": -2.3468449115753174, "logps/rejected": -2.5400891304016113, "loss": 2.9251, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.46845054626465, "rewards/margins": 1.932440161705017, "rewards/rejected": -25.400888442993164, "step": 19980 }, { "epoch": 0.6735987057197749, "grad_norm": 41.93446350097656, "learning_rate": 2.909119011491805e-07, "logits/chosen": -1.940731406211853, "logits/rejected": -2.3662357330322266, "logps/chosen": -1.8077341318130493, "logps/rejected": -2.249481678009033, "loss": 2.6443, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.07733917236328, "rewards/margins": 4.417477607727051, "rewards/rejected": -22.494815826416016, "step": 19985 }, { "epoch": 0.673767231790758, "grad_norm": 24.274431228637695, "learning_rate": 2.9064475550838764e-07, "logits/chosen": -1.6842902898788452, "logits/rejected": -2.0487284660339355, "logps/chosen": -2.380117416381836, "logps/rejected": -2.9363510608673096, "loss": 2.1863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.80117416381836, "rewards/margins": 5.562333583831787, "rewards/rejected": -29.363506317138672, "step": 19990 }, { "epoch": 0.6739357578617412, "grad_norm": 19.593360900878906, "learning_rate": 2.9037768231719636e-07, "logits/chosen": -0.8565117716789246, "logits/rejected": -1.156738042831421, "logps/chosen": -2.1061644554138184, "logps/rejected": -2.46700382232666, "loss": 1.9911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.0616455078125, "rewards/margins": 3.608396053314209, "rewards/rejected": -24.670042037963867, "step": 19995 }, { "epoch": 0.6741042839327244, "grad_norm": 12.377226829528809, "learning_rate": 2.9011068166803046e-07, "logits/chosen": -1.6227328777313232, "logits/rejected": -1.8104091882705688, "logps/chosen": -2.387129306793213, "logps/rejected": -2.4981789588928223, "loss": 2.4878, "rewards/accuracies": 0.5, "rewards/chosen": -23.871295928955078, "rewards/margins": 1.1104968786239624, "rewards/rejected": -24.981792449951172, "step": 20000 }, { "epoch": 0.6741042839327244, "eval_logits/chosen": -2.1333370208740234, "eval_logits/rejected": -2.2969448566436768, "eval_logps/chosen": -2.2102105617523193, "eval_logps/rejected": -2.3554303646087646, "eval_loss": 3.0430943965911865, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.10210418701172, "eval_rewards/margins": 1.4521974325180054, "eval_rewards/rejected": -23.554304122924805, "eval_runtime": 12.9424, "eval_samples_per_second": 7.727, "eval_steps_per_second": 1.932, "step": 20000 }, { "epoch": 0.6742728100037075, "grad_norm": 4.8359270095825195, "learning_rate": 2.898437536532885e-07, "logits/chosen": -1.0973485708236694, "logits/rejected": -1.7246170043945312, "logps/chosen": -3.404980182647705, "logps/rejected": -4.6295485496521, "loss": 1.127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -34.04979705810547, "rewards/margins": 12.245687484741211, "rewards/rejected": -46.29548645019531, "step": 20005 }, { "epoch": 0.6744413360746907, "grad_norm": 67.98348236083984, "learning_rate": 2.8957689836534336e-07, "logits/chosen": -1.6053552627563477, "logits/rejected": -1.9920330047607422, "logps/chosen": -2.480193614959717, "logps/rejected": -2.6489195823669434, "loss": 3.5401, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.801939010620117, "rewards/margins": 1.687256097793579, "rewards/rejected": -26.489192962646484, "step": 20010 }, { "epoch": 0.674609862145674, "grad_norm": 42.41163635253906, "learning_rate": 2.893101158965434e-07, "logits/chosen": -1.9345791339874268, "logits/rejected": -2.219799757003784, "logps/chosen": -2.2220282554626465, "logps/rejected": -2.655217170715332, "loss": 2.3733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.22028160095215, "rewards/margins": 4.331890106201172, "rewards/rejected": -26.552169799804688, "step": 20015 }, { "epoch": 0.6747783882166571, "grad_norm": 35.31697463989258, "learning_rate": 2.890434063392114e-07, "logits/chosen": -1.5198293924331665, "logits/rejected": -1.8901045322418213, "logps/chosen": -2.567343235015869, "logps/rejected": -3.333820343017578, "loss": 2.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.67343521118164, "rewards/margins": 7.66477108001709, "rewards/rejected": -33.33820724487305, "step": 20020 }, { "epoch": 0.6749469142876403, "grad_norm": 29.586748123168945, "learning_rate": 2.887767697856454e-07, "logits/chosen": -1.6142957210540771, "logits/rejected": -2.133453369140625, "logps/chosen": -2.0409905910491943, "logps/rejected": -2.410146713256836, "loss": 2.2076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.4099063873291, "rewards/margins": 3.6915602684020996, "rewards/rejected": -24.10146713256836, "step": 20025 }, { "epoch": 0.6751154403586235, "grad_norm": 104.16493225097656, "learning_rate": 2.885102063281173e-07, "logits/chosen": -1.5088083744049072, "logits/rejected": -1.4387980699539185, "logps/chosen": -2.9790401458740234, "logps/rejected": -3.0391666889190674, "loss": 4.008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.790401458740234, "rewards/margins": 0.6012633442878723, "rewards/rejected": -30.39166831970215, "step": 20030 }, { "epoch": 0.6752839664296066, "grad_norm": 37.88565444946289, "learning_rate": 2.882437160588744e-07, "logits/chosen": -1.602617859840393, "logits/rejected": -1.81451416015625, "logps/chosen": -2.366647243499756, "logps/rejected": -2.5041327476501465, "loss": 2.6427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.666473388671875, "rewards/margins": 1.374853253364563, "rewards/rejected": -25.04132652282715, "step": 20035 }, { "epoch": 0.6754524925005898, "grad_norm": 43.18614196777344, "learning_rate": 2.879772990701387e-07, "logits/chosen": -1.196619987487793, "logits/rejected": -1.1416454315185547, "logps/chosen": -2.1735050678253174, "logps/rejected": -2.4336700439453125, "loss": 2.4065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.73505401611328, "rewards/margins": 2.6016478538513184, "rewards/rejected": -24.336700439453125, "step": 20040 }, { "epoch": 0.675621018571573, "grad_norm": 32.108089447021484, "learning_rate": 2.8771095545410627e-07, "logits/chosen": -1.4238228797912598, "logits/rejected": -1.4724472761154175, "logps/chosen": -2.4174728393554688, "logps/rejected": -2.5063812732696533, "loss": 3.1065, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.174728393554688, "rewards/margins": 0.8890829086303711, "rewards/rejected": -25.063812255859375, "step": 20045 }, { "epoch": 0.6757895446425563, "grad_norm": 30.244998931884766, "learning_rate": 2.874446853029483e-07, "logits/chosen": -1.2692375183105469, "logits/rejected": -1.465065598487854, "logps/chosen": -2.7822213172912598, "logps/rejected": -2.912348508834839, "loss": 3.6016, "rewards/accuracies": 0.5, "rewards/chosen": -27.822214126586914, "rewards/margins": 1.3012707233428955, "rewards/rejected": -29.123483657836914, "step": 20050 }, { "epoch": 0.6759580707135394, "grad_norm": 24.151615142822266, "learning_rate": 2.8717848870881033e-07, "logits/chosen": -1.6752557754516602, "logits/rejected": -2.0266435146331787, "logps/chosen": -2.9445536136627197, "logps/rejected": -3.129530429840088, "loss": 5.0165, "rewards/accuracies": 0.5, "rewards/chosen": -29.445537567138672, "rewards/margins": 1.8497695922851562, "rewards/rejected": -31.295307159423828, "step": 20055 }, { "epoch": 0.6761265967845226, "grad_norm": 28.681072235107422, "learning_rate": 2.869123657638126e-07, "logits/chosen": -2.0757529735565186, "logits/rejected": -2.134483575820923, "logps/chosen": -1.8745758533477783, "logps/rejected": -2.227299928665161, "loss": 1.2008, "rewards/accuracies": 1.0, "rewards/chosen": -18.745756149291992, "rewards/margins": 3.5272421836853027, "rewards/rejected": -22.272998809814453, "step": 20060 }, { "epoch": 0.6762951228555057, "grad_norm": 31.584806442260742, "learning_rate": 2.8664631656004984e-07, "logits/chosen": -1.453299880027771, "logits/rejected": -1.7309995889663696, "logps/chosen": -2.1703834533691406, "logps/rejected": -2.3654937744140625, "loss": 2.009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.703832626342773, "rewards/margins": 1.9511051177978516, "rewards/rejected": -23.65494155883789, "step": 20065 }, { "epoch": 0.6764636489264889, "grad_norm": 25.078779220581055, "learning_rate": 2.863803411895911e-07, "logits/chosen": -1.1493384838104248, "logits/rejected": -1.3260166645050049, "logps/chosen": -2.2854998111724854, "logps/rejected": -2.580888509750366, "loss": 2.1081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.854999542236328, "rewards/margins": 2.9538865089416504, "rewards/rejected": -25.808887481689453, "step": 20070 }, { "epoch": 0.6766321749974721, "grad_norm": 38.19968795776367, "learning_rate": 2.8611443974448015e-07, "logits/chosen": -1.5478785037994385, "logits/rejected": -1.4771803617477417, "logps/chosen": -2.46025013923645, "logps/rejected": -2.3180558681488037, "loss": 4.5903, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.602500915527344, "rewards/margins": -1.421942114830017, "rewards/rejected": -23.180557250976562, "step": 20075 }, { "epoch": 0.6768007010684552, "grad_norm": 35.97868347167969, "learning_rate": 2.858486123167346e-07, "logits/chosen": -2.461002826690674, "logits/rejected": -2.8161003589630127, "logps/chosen": -2.063539743423462, "logps/rejected": -2.8001794815063477, "loss": 1.391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.635398864746094, "rewards/margins": 7.366394996643066, "rewards/rejected": -28.001794815063477, "step": 20080 }, { "epoch": 0.6769692271394385, "grad_norm": 78.83557891845703, "learning_rate": 2.855828589983472e-07, "logits/chosen": -1.955715537071228, "logits/rejected": -1.9469232559204102, "logps/chosen": -1.9905261993408203, "logps/rejected": -1.7431453466415405, "loss": 5.602, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.905261993408203, "rewards/margins": -2.4738082885742188, "rewards/rejected": -17.431453704833984, "step": 20085 }, { "epoch": 0.6771377532104217, "grad_norm": 51.79839324951172, "learning_rate": 2.8531717988128463e-07, "logits/chosen": -1.5449891090393066, "logits/rejected": -1.8164851665496826, "logps/chosen": -2.6464550495147705, "logps/rejected": -3.3289992809295654, "loss": 3.4826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.464550018310547, "rewards/margins": 6.825445652008057, "rewards/rejected": -33.28999710083008, "step": 20090 }, { "epoch": 0.6773062792814049, "grad_norm": 30.013301849365234, "learning_rate": 2.8505157505748804e-07, "logits/chosen": -1.1776177883148193, "logits/rejected": -1.3167506456375122, "logps/chosen": -2.100921154022217, "logps/rejected": -2.1228299140930176, "loss": 3.0622, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.00921058654785, "rewards/margins": 0.21908855438232422, "rewards/rejected": -21.22829818725586, "step": 20095 }, { "epoch": 0.677474805352388, "grad_norm": 19.57428741455078, "learning_rate": 2.8478604461887255e-07, "logits/chosen": -1.5491914749145508, "logits/rejected": -1.5144532918930054, "logps/chosen": -2.411500930786133, "logps/rejected": -2.600080966949463, "loss": 3.1888, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.115009307861328, "rewards/margins": 1.8858016729354858, "rewards/rejected": -26.000812530517578, "step": 20100 }, { "epoch": 0.6776433314233712, "grad_norm": 40.35907745361328, "learning_rate": 2.845205886573279e-07, "logits/chosen": -1.5372756719589233, "logits/rejected": -1.847243309020996, "logps/chosen": -2.269357919692993, "logps/rejected": -2.4268269538879395, "loss": 2.3753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.693578720092773, "rewards/margins": 1.5746897459030151, "rewards/rejected": -24.268268585205078, "step": 20105 }, { "epoch": 0.6778118574943544, "grad_norm": 47.303367614746094, "learning_rate": 2.842552072647182e-07, "logits/chosen": -1.74314284324646, "logits/rejected": -1.6884737014770508, "logps/chosen": -2.1921029090881348, "logps/rejected": -3.185455560684204, "loss": 2.4485, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.921030044555664, "rewards/margins": 9.933526039123535, "rewards/rejected": -31.854557037353516, "step": 20110 }, { "epoch": 0.6779803835653375, "grad_norm": 123.9562759399414, "learning_rate": 2.83989900532881e-07, "logits/chosen": -1.790907859802246, "logits/rejected": -2.0922181606292725, "logps/chosen": -2.6318843364715576, "logps/rejected": -2.8555569648742676, "loss": 2.0623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.3188419342041, "rewards/margins": 2.2367305755615234, "rewards/rejected": -28.555572509765625, "step": 20115 }, { "epoch": 0.6781489096363207, "grad_norm": 21.239953994750977, "learning_rate": 2.8372466855362883e-07, "logits/chosen": -2.271080493927002, "logits/rejected": -3.0720982551574707, "logps/chosen": -3.933926820755005, "logps/rejected": -5.070860385894775, "loss": 6.406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -39.33926773071289, "rewards/margins": 11.36933422088623, "rewards/rejected": -50.70859909057617, "step": 20120 }, { "epoch": 0.678317435707304, "grad_norm": 23.059619903564453, "learning_rate": 2.834595114187479e-07, "logits/chosen": -1.4889625310897827, "logits/rejected": -1.7724645137786865, "logps/chosen": -1.981604814529419, "logps/rejected": -2.2343153953552246, "loss": 2.1921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.81604766845703, "rewards/margins": 2.5271058082580566, "rewards/rejected": -22.343151092529297, "step": 20125 }, { "epoch": 0.6784859617782871, "grad_norm": 24.74485969543457, "learning_rate": 2.83194429219999e-07, "logits/chosen": -1.1593676805496216, "logits/rejected": -1.7265560626983643, "logps/chosen": -2.5378623008728027, "logps/rejected": -3.2800393104553223, "loss": 1.6983, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.378623962402344, "rewards/margins": 7.421769618988037, "rewards/rejected": -32.800392150878906, "step": 20130 }, { "epoch": 0.6786544878492703, "grad_norm": 98.45122528076172, "learning_rate": 2.829294220491161e-07, "logits/chosen": -1.537777304649353, "logits/rejected": -1.8030803203582764, "logps/chosen": -2.5062530040740967, "logps/rejected": -3.077735185623169, "loss": 3.7697, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.062528610229492, "rewards/margins": 5.714821815490723, "rewards/rejected": -30.7773494720459, "step": 20135 }, { "epoch": 0.6788230139202535, "grad_norm": 19.68750762939453, "learning_rate": 2.82664489997808e-07, "logits/chosen": -1.295419692993164, "logits/rejected": -1.708397626876831, "logps/chosen": -2.520289659500122, "logps/rejected": -2.7188165187835693, "loss": 3.2872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.20289421081543, "rewards/margins": 1.985269546508789, "rewards/rejected": -27.18816566467285, "step": 20140 }, { "epoch": 0.6789915399912366, "grad_norm": 16.885944366455078, "learning_rate": 2.823996331577574e-07, "logits/chosen": -1.9829976558685303, "logits/rejected": -2.066995143890381, "logps/chosen": -2.4660446643829346, "logps/rejected": -3.0983173847198486, "loss": 1.8963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.660446166992188, "rewards/margins": 6.322728633880615, "rewards/rejected": -30.983173370361328, "step": 20145 }, { "epoch": 0.6791600660622198, "grad_norm": 30.617427825927734, "learning_rate": 2.821348516206204e-07, "logits/chosen": -1.7437076568603516, "logits/rejected": -2.1927478313446045, "logps/chosen": -1.8544002771377563, "logps/rejected": -2.3719706535339355, "loss": 1.7637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.544002532958984, "rewards/margins": 5.1757025718688965, "rewards/rejected": -23.719707489013672, "step": 20150 }, { "epoch": 0.679328592133203, "grad_norm": 23.765220642089844, "learning_rate": 2.8187014547802783e-07, "logits/chosen": -1.847679853439331, "logits/rejected": -1.8367445468902588, "logps/chosen": -2.689016819000244, "logps/rejected": -3.5438976287841797, "loss": 1.2487, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.890167236328125, "rewards/margins": 8.548810005187988, "rewards/rejected": -35.4389762878418, "step": 20155 }, { "epoch": 0.6794971182041862, "grad_norm": 33.91009521484375, "learning_rate": 2.816055148215839e-07, "logits/chosen": -1.806243658065796, "logits/rejected": -1.8416646718978882, "logps/chosen": -1.8404827117919922, "logps/rejected": -1.952391266822815, "loss": 2.6134, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.404827117919922, "rewards/margins": 1.1190874576568604, "rewards/rejected": -19.523914337158203, "step": 20160 }, { "epoch": 0.6796656442751694, "grad_norm": 32.12192153930664, "learning_rate": 2.813409597428671e-07, "logits/chosen": -1.9678351879119873, "logits/rejected": -2.0218756198883057, "logps/chosen": -3.2655386924743652, "logps/rejected": -3.7912089824676514, "loss": 3.1875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.65538787841797, "rewards/margins": 5.256703853607178, "rewards/rejected": -37.91209030151367, "step": 20165 }, { "epoch": 0.6798341703461526, "grad_norm": 43.30851745605469, "learning_rate": 2.8107648033342914e-07, "logits/chosen": -1.8117700815200806, "logits/rejected": -1.8282169103622437, "logps/chosen": -2.4342598915100098, "logps/rejected": -2.890298366546631, "loss": 2.7606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.342599868774414, "rewards/margins": 4.560386657714844, "rewards/rejected": -28.90298843383789, "step": 20170 }, { "epoch": 0.6800026964171357, "grad_norm": 34.93639373779297, "learning_rate": 2.80812076684796e-07, "logits/chosen": -1.8555113077163696, "logits/rejected": -1.8551677465438843, "logps/chosen": -2.5153555870056152, "logps/rejected": -2.742401599884033, "loss": 3.1483, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.153554916381836, "rewards/margins": 2.2704625129699707, "rewards/rejected": -27.42401695251465, "step": 20175 }, { "epoch": 0.6801712224881189, "grad_norm": 44.3549919128418, "learning_rate": 2.805477488884677e-07, "logits/chosen": -1.478435754776001, "logits/rejected": -1.7246806621551514, "logps/chosen": -1.7173773050308228, "logps/rejected": -1.8425076007843018, "loss": 3.089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.173770904541016, "rewards/margins": 1.2513021230697632, "rewards/rejected": -18.42507553100586, "step": 20180 }, { "epoch": 0.6803397485591021, "grad_norm": 122.29318237304688, "learning_rate": 2.8028349703591727e-07, "logits/chosen": -1.7741448879241943, "logits/rejected": -1.771937608718872, "logps/chosen": -2.4081344604492188, "logps/rejected": -2.4229705333709717, "loss": 4.366, "rewards/accuracies": 0.5, "rewards/chosen": -24.081344604492188, "rewards/margins": 0.14835968613624573, "rewards/rejected": -24.22970199584961, "step": 20185 }, { "epoch": 0.6805082746300852, "grad_norm": 31.629911422729492, "learning_rate": 2.8001932121859195e-07, "logits/chosen": -1.6347239017486572, "logits/rejected": -1.6003844738006592, "logps/chosen": -1.9684116840362549, "logps/rejected": -1.9572652578353882, "loss": 3.2296, "rewards/accuracies": 0.5, "rewards/chosen": -19.68411636352539, "rewards/margins": -0.11146555095911026, "rewards/rejected": -19.572650909423828, "step": 20190 }, { "epoch": 0.6806768007010685, "grad_norm": 36.26969528198242, "learning_rate": 2.7975522152791274e-07, "logits/chosen": -1.643143892288208, "logits/rejected": -1.7509486675262451, "logps/chosen": -2.7773661613464355, "logps/rejected": -3.1645007133483887, "loss": 2.8131, "rewards/accuracies": 0.5, "rewards/chosen": -27.773662567138672, "rewards/margins": 3.871345043182373, "rewards/rejected": -31.645008087158203, "step": 20195 }, { "epoch": 0.6808453267720517, "grad_norm": 26.0379638671875, "learning_rate": 2.7949119805527406e-07, "logits/chosen": -1.2348109483718872, "logits/rejected": -1.275914192199707, "logps/chosen": -2.392324686050415, "logps/rejected": -2.7133028507232666, "loss": 0.9852, "rewards/accuracies": 1.0, "rewards/chosen": -23.92324447631836, "rewards/margins": 3.2097830772399902, "rewards/rejected": -27.13302993774414, "step": 20200 }, { "epoch": 0.6810138528430348, "grad_norm": 36.44670104980469, "learning_rate": 2.7922725089204425e-07, "logits/chosen": -1.4761666059494019, "logits/rejected": -1.729008674621582, "logps/chosen": -1.9980462789535522, "logps/rejected": -2.1607513427734375, "loss": 2.8235, "rewards/accuracies": 0.5, "rewards/chosen": -19.9804630279541, "rewards/margins": 1.6270501613616943, "rewards/rejected": -21.607511520385742, "step": 20205 }, { "epoch": 0.681182378914018, "grad_norm": 14.305713653564453, "learning_rate": 2.789633801295645e-07, "logits/chosen": -1.8914934396743774, "logits/rejected": -2.159989833831787, "logps/chosen": -1.9110405445098877, "logps/rejected": -2.0172300338745117, "loss": 2.6372, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.110403060913086, "rewards/margins": 1.0618977546691895, "rewards/rejected": -20.17230224609375, "step": 20210 }, { "epoch": 0.6813509049850012, "grad_norm": 39.7442741394043, "learning_rate": 2.786995858591505e-07, "logits/chosen": -1.5674628019332886, "logits/rejected": -1.659325361251831, "logps/chosen": -2.3448400497436523, "logps/rejected": -2.6547160148620605, "loss": 3.0196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.448400497436523, "rewards/margins": 3.0987625122070312, "rewards/rejected": -26.547161102294922, "step": 20215 }, { "epoch": 0.6815194310559843, "grad_norm": 35.93793487548828, "learning_rate": 2.784358681720909e-07, "logits/chosen": -1.6853892803192139, "logits/rejected": -1.776785135269165, "logps/chosen": -1.8474948406219482, "logps/rejected": -1.982184648513794, "loss": 2.7998, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.47494888305664, "rewards/margins": 1.3468974828720093, "rewards/rejected": -19.821847915649414, "step": 20220 }, { "epoch": 0.6816879571269675, "grad_norm": 24.072486877441406, "learning_rate": 2.7817222715964807e-07, "logits/chosen": -1.866061806678772, "logits/rejected": -2.0546302795410156, "logps/chosen": -2.9646809101104736, "logps/rejected": -3.516381025314331, "loss": 2.0249, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.646808624267578, "rewards/margins": 5.517003536224365, "rewards/rejected": -35.16381072998047, "step": 20225 }, { "epoch": 0.6818564831979507, "grad_norm": 41.478538513183594, "learning_rate": 2.779086629130577e-07, "logits/chosen": -1.5552170276641846, "logits/rejected": -2.01354718208313, "logps/chosen": -2.577141284942627, "logps/rejected": -3.2818055152893066, "loss": 1.9669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.771411895751953, "rewards/margins": 7.046643257141113, "rewards/rejected": -32.81805419921875, "step": 20230 }, { "epoch": 0.682025009268934, "grad_norm": 34.2642707824707, "learning_rate": 2.776451755235293e-07, "logits/chosen": -1.435533881187439, "logits/rejected": -1.877467155456543, "logps/chosen": -2.060547351837158, "logps/rejected": -2.422755002975464, "loss": 2.2323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.605472564697266, "rewards/margins": 3.622079372406006, "rewards/rejected": -24.227550506591797, "step": 20235 }, { "epoch": 0.6821935353399171, "grad_norm": 45.35870361328125, "learning_rate": 2.77381765082245e-07, "logits/chosen": -1.6485698223114014, "logits/rejected": -1.6269375085830688, "logps/chosen": -3.022157669067383, "logps/rejected": -3.5015807151794434, "loss": 3.5989, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.221576690673828, "rewards/margins": 4.7942352294921875, "rewards/rejected": -35.01581573486328, "step": 20240 }, { "epoch": 0.6823620614109003, "grad_norm": 42.239959716796875, "learning_rate": 2.77118431680361e-07, "logits/chosen": -1.5377813577651978, "logits/rejected": -1.3784878253936768, "logps/chosen": -2.4813880920410156, "logps/rejected": -3.0015201568603516, "loss": 4.149, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.813879013061523, "rewards/margins": 5.201320648193359, "rewards/rejected": -30.015201568603516, "step": 20245 }, { "epoch": 0.6825305874818834, "grad_norm": 27.377073287963867, "learning_rate": 2.768551754090068e-07, "logits/chosen": -1.577036738395691, "logits/rejected": -1.7084392309188843, "logps/chosen": -2.2118425369262695, "logps/rejected": -2.247002363204956, "loss": 3.2371, "rewards/accuracies": 0.5, "rewards/chosen": -22.118425369262695, "rewards/margins": 0.35159969329833984, "rewards/rejected": -22.47002601623535, "step": 20250 }, { "epoch": 0.6826991135528666, "grad_norm": 19.689481735229492, "learning_rate": 2.7659199635928465e-07, "logits/chosen": -1.7298429012298584, "logits/rejected": -2.325814962387085, "logps/chosen": -1.8912522792816162, "logps/rejected": -2.167006015777588, "loss": 2.8236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.91252326965332, "rewards/margins": 2.7575371265411377, "rewards/rejected": -21.670061111450195, "step": 20255 }, { "epoch": 0.6828676396238498, "grad_norm": 33.25885009765625, "learning_rate": 2.763288946222707e-07, "logits/chosen": -1.6983985900878906, "logits/rejected": -1.697054147720337, "logps/chosen": -2.0792253017425537, "logps/rejected": -2.391404867172241, "loss": 2.6382, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.792253494262695, "rewards/margins": 3.1217968463897705, "rewards/rejected": -23.914051055908203, "step": 20260 }, { "epoch": 0.6830361656948329, "grad_norm": 27.963102340698242, "learning_rate": 2.7606587028901395e-07, "logits/chosen": -1.5231996774673462, "logits/rejected": -1.3576040267944336, "logps/chosen": -2.1193814277648926, "logps/rejected": -2.295095443725586, "loss": 2.6454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.19381332397461, "rewards/margins": 1.7571430206298828, "rewards/rejected": -22.950956344604492, "step": 20265 }, { "epoch": 0.6832046917658162, "grad_norm": 28.47199821472168, "learning_rate": 2.75802923450537e-07, "logits/chosen": -1.8896493911743164, "logits/rejected": -1.9708207845687866, "logps/chosen": -2.278593063354492, "logps/rejected": -2.3892312049865723, "loss": 2.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.785930633544922, "rewards/margins": 1.1063793897628784, "rewards/rejected": -23.89231300354004, "step": 20270 }, { "epoch": 0.6833732178367994, "grad_norm": 28.51179313659668, "learning_rate": 2.7554005419783516e-07, "logits/chosen": -1.6566784381866455, "logits/rejected": -1.8939708471298218, "logps/chosen": -2.288196325302124, "logps/rejected": -2.2760863304138184, "loss": 3.7188, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.881961822509766, "rewards/margins": -0.12109851837158203, "rewards/rejected": -22.7608642578125, "step": 20275 }, { "epoch": 0.6835417439077826, "grad_norm": 29.09737205505371, "learning_rate": 2.752772626218771e-07, "logits/chosen": -1.6456453800201416, "logits/rejected": -1.872393012046814, "logps/chosen": -2.492414951324463, "logps/rejected": -3.059534788131714, "loss": 2.0609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.924148559570312, "rewards/margins": 5.671200752258301, "rewards/rejected": -30.595348358154297, "step": 20280 }, { "epoch": 0.6837102699787657, "grad_norm": 23.130584716796875, "learning_rate": 2.7501454881360496e-07, "logits/chosen": -2.1061911582946777, "logits/rejected": -2.1469058990478516, "logps/chosen": -2.26118540763855, "logps/rejected": -2.4657797813415527, "loss": 2.4224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.611852645874023, "rewards/margins": 2.0459442138671875, "rewards/rejected": -24.65779685974121, "step": 20285 }, { "epoch": 0.6838787960497489, "grad_norm": 4.321815013885498, "learning_rate": 2.7475191286393316e-07, "logits/chosen": -1.7793070077896118, "logits/rejected": -1.7671607732772827, "logps/chosen": -1.7020763158798218, "logps/rejected": -1.816414475440979, "loss": 3.0518, "rewards/accuracies": 0.5, "rewards/chosen": -17.020761489868164, "rewards/margins": 1.1433823108673096, "rewards/rejected": -18.16414451599121, "step": 20290 }, { "epoch": 0.684047322120732, "grad_norm": 19.862804412841797, "learning_rate": 2.7448935486374994e-07, "logits/chosen": -2.074662685394287, "logits/rejected": -2.0828239917755127, "logps/chosen": -2.2630491256713867, "logps/rejected": -2.365959644317627, "loss": 3.3963, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.630489349365234, "rewards/margins": 1.0291064977645874, "rewards/rejected": -23.659595489501953, "step": 20295 }, { "epoch": 0.6842158481917152, "grad_norm": 67.76535034179688, "learning_rate": 2.7422687490391627e-07, "logits/chosen": -1.5843805074691772, "logits/rejected": -1.6821810007095337, "logps/chosen": -2.2501184940338135, "logps/rejected": -2.322180986404419, "loss": 2.7301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.50118637084961, "rewards/margins": 0.7206247448921204, "rewards/rejected": -23.221811294555664, "step": 20300 }, { "epoch": 0.6843843742626985, "grad_norm": 20.505523681640625, "learning_rate": 2.739644730752662e-07, "logits/chosen": -1.7529728412628174, "logits/rejected": -1.9020655155181885, "logps/chosen": -2.148254871368408, "logps/rejected": -2.5298733711242676, "loss": 1.4661, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.4825496673584, "rewards/margins": 3.816183567047119, "rewards/rejected": -25.29873275756836, "step": 20305 }, { "epoch": 0.6845529003336817, "grad_norm": 101.92227172851562, "learning_rate": 2.737021494686064e-07, "logits/chosen": -2.2013256549835205, "logits/rejected": -2.3193464279174805, "logps/chosen": -2.3510169982910156, "logps/rejected": -2.4058213233947754, "loss": 2.9042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.51017189025879, "rewards/margins": 0.5480403900146484, "rewards/rejected": -24.058212280273438, "step": 20310 }, { "epoch": 0.6847214264046648, "grad_norm": 22.332473754882812, "learning_rate": 2.734399041747169e-07, "logits/chosen": -1.2684409618377686, "logits/rejected": -1.8357795476913452, "logps/chosen": -2.1953673362731934, "logps/rejected": -2.5396382808685303, "loss": 1.8932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.953670501708984, "rewards/margins": 3.4427096843719482, "rewards/rejected": -25.396381378173828, "step": 20315 }, { "epoch": 0.684889952475648, "grad_norm": 9.619364738464355, "learning_rate": 2.7317773728435067e-07, "logits/chosen": -1.5909945964813232, "logits/rejected": -1.9193031787872314, "logps/chosen": -2.6117565631866455, "logps/rejected": -3.1586432456970215, "loss": 2.584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.117565155029297, "rewards/margins": 5.468869686126709, "rewards/rejected": -31.5864315032959, "step": 20320 }, { "epoch": 0.6850584785466312, "grad_norm": 39.636714935302734, "learning_rate": 2.7291564888823287e-07, "logits/chosen": -1.0083258152008057, "logits/rejected": -1.1197991371154785, "logps/chosen": -2.370436906814575, "logps/rejected": -2.7647347450256348, "loss": 2.3407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.704368591308594, "rewards/margins": 3.942978620529175, "rewards/rejected": -27.6473445892334, "step": 20325 }, { "epoch": 0.6852270046176143, "grad_norm": 47.81154251098633, "learning_rate": 2.726536390770623e-07, "logits/chosen": -1.4197077751159668, "logits/rejected": -2.2119290828704834, "logps/chosen": -2.0732133388519287, "logps/rejected": -2.5548577308654785, "loss": 3.0506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.732135772705078, "rewards/margins": 4.816441059112549, "rewards/rejected": -25.54857635498047, "step": 20330 }, { "epoch": 0.6853955306885975, "grad_norm": 29.99540901184082, "learning_rate": 2.7239170794151007e-07, "logits/chosen": -2.2027525901794434, "logits/rejected": -2.388345718383789, "logps/chosen": -3.244304656982422, "logps/rejected": -3.8606619834899902, "loss": 0.961, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -32.44304656982422, "rewards/margins": 6.163573265075684, "rewards/rejected": -38.60662078857422, "step": 20335 }, { "epoch": 0.6855640567595807, "grad_norm": 17.19338607788086, "learning_rate": 2.7212985557222056e-07, "logits/chosen": -2.053317070007324, "logits/rejected": -2.0944182872772217, "logps/chosen": -2.624318838119507, "logps/rejected": -2.6115059852600098, "loss": 4.0212, "rewards/accuracies": 0.5, "rewards/chosen": -26.243188858032227, "rewards/margins": -0.12812776863574982, "rewards/rejected": -26.115060806274414, "step": 20340 }, { "epoch": 0.6857325828305639, "grad_norm": 24.04823112487793, "learning_rate": 2.718680820598101e-07, "logits/chosen": -1.5281686782836914, "logits/rejected": -2.1130008697509766, "logps/chosen": -1.9165928363800049, "logps/rejected": -2.1830132007598877, "loss": 2.018, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.16592788696289, "rewards/margins": 2.6642043590545654, "rewards/rejected": -21.83013343811035, "step": 20345 }, { "epoch": 0.6859011089015471, "grad_norm": 1.5715937479399145e-05, "learning_rate": 2.716063874948684e-07, "logits/chosen": -1.7485952377319336, "logits/rejected": -1.8180173635482788, "logps/chosen": -2.7975385189056396, "logps/rejected": -3.3084304332733154, "loss": 2.3908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.975383758544922, "rewards/margins": 5.108920097351074, "rewards/rejected": -33.08430480957031, "step": 20350 }, { "epoch": 0.6860696349725303, "grad_norm": 25.99323272705078, "learning_rate": 2.7134477196795764e-07, "logits/chosen": -2.0262441635131836, "logits/rejected": -2.1174750328063965, "logps/chosen": -1.8926416635513306, "logps/rejected": -2.128100633621216, "loss": 3.0479, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.926416397094727, "rewards/margins": 2.35459041595459, "rewards/rejected": -21.281007766723633, "step": 20355 }, { "epoch": 0.6862381610435134, "grad_norm": 23.152528762817383, "learning_rate": 2.7108323556961266e-07, "logits/chosen": -1.719618558883667, "logits/rejected": -2.6587231159210205, "logps/chosen": -3.0434048175811768, "logps/rejected": -4.10079288482666, "loss": 1.2253, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -30.43404769897461, "rewards/margins": 10.573877334594727, "rewards/rejected": -41.00792694091797, "step": 20360 }, { "epoch": 0.6864066871144966, "grad_norm": 15.43291187286377, "learning_rate": 2.7082177839034087e-07, "logits/chosen": -2.2665624618530273, "logits/rejected": -2.249471426010132, "logps/chosen": -2.7999186515808105, "logps/rejected": -2.9842801094055176, "loss": 3.6626, "rewards/accuracies": 0.5, "rewards/chosen": -27.999187469482422, "rewards/margins": 1.8436133861541748, "rewards/rejected": -29.84280014038086, "step": 20365 }, { "epoch": 0.6865752131854798, "grad_norm": 51.50522232055664, "learning_rate": 2.705604005206223e-07, "logits/chosen": -1.8549646139144897, "logits/rejected": -1.838091254234314, "logps/chosen": -2.249647617340088, "logps/rejected": -2.5575718879699707, "loss": 3.3206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.496477127075195, "rewards/margins": 3.079240560531616, "rewards/rejected": -25.57571792602539, "step": 20370 }, { "epoch": 0.6867437392564629, "grad_norm": 33.62204360961914, "learning_rate": 2.7029910205090975e-07, "logits/chosen": -1.884319543838501, "logits/rejected": -1.66828191280365, "logps/chosen": -2.4199607372283936, "logps/rejected": -2.259284019470215, "loss": 5.1467, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.19960594177246, "rewards/margins": -1.6067683696746826, "rewards/rejected": -22.59284019470215, "step": 20375 }, { "epoch": 0.6869122653274462, "grad_norm": 187.69085693359375, "learning_rate": 2.70037883071628e-07, "logits/chosen": -2.0082826614379883, "logits/rejected": -1.971462607383728, "logps/chosen": -3.232483386993408, "logps/rejected": -3.4104182720184326, "loss": 2.9896, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -32.324832916259766, "rewards/margins": 1.7793477773666382, "rewards/rejected": -34.10417938232422, "step": 20380 }, { "epoch": 0.6870807913984294, "grad_norm": 37.74311065673828, "learning_rate": 2.697767436731747e-07, "logits/chosen": -1.6654325723648071, "logits/rejected": -2.0131616592407227, "logps/chosen": -2.0143442153930664, "logps/rejected": -2.516138792037964, "loss": 1.6317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.143442153930664, "rewards/margins": 5.017943859100342, "rewards/rejected": -25.161386489868164, "step": 20385 }, { "epoch": 0.6872493174694125, "grad_norm": 5.722283363342285, "learning_rate": 2.6951568394592024e-07, "logits/chosen": -0.9658223390579224, "logits/rejected": -1.3423798084259033, "logps/chosen": -2.0055198669433594, "logps/rejected": -3.0546345710754395, "loss": 1.2205, "rewards/accuracies": 1.0, "rewards/chosen": -20.055200576782227, "rewards/margins": 10.491147994995117, "rewards/rejected": -30.54634666442871, "step": 20390 }, { "epoch": 0.6874178435403957, "grad_norm": 28.412220001220703, "learning_rate": 2.6925470398020656e-07, "logits/chosen": -2.0573315620422363, "logits/rejected": -2.2049522399902344, "logps/chosen": -1.9439353942871094, "logps/rejected": -2.2200284004211426, "loss": 1.806, "rewards/accuracies": 1.0, "rewards/chosen": -19.439355850219727, "rewards/margins": 2.760927438735962, "rewards/rejected": -22.20028305053711, "step": 20395 }, { "epoch": 0.6875863696113789, "grad_norm": 29.253477096557617, "learning_rate": 2.689938038663489e-07, "logits/chosen": -1.6225097179412842, "logits/rejected": -2.028738498687744, "logps/chosen": -2.0318007469177246, "logps/rejected": -2.413212299346924, "loss": 2.3506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.318008422851562, "rewards/margins": 3.814117908477783, "rewards/rejected": -24.132125854492188, "step": 20400 }, { "epoch": 0.6875863696113789, "eval_logits/chosen": -2.160346746444702, "eval_logits/rejected": -2.325753688812256, "eval_logps/chosen": -2.223787546157837, "eval_logps/rejected": -2.3722009658813477, "eval_loss": 3.045292854309082, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.237878799438477, "eval_rewards/margins": 1.484131932258606, "eval_rewards/rejected": -23.722009658813477, "eval_runtime": 12.8876, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.94, "step": 20400 }, { "epoch": 0.687754895682362, "grad_norm": 41.0540657043457, "learning_rate": 2.6873298369463443e-07, "logits/chosen": -1.7240146398544312, "logits/rejected": -1.768457055091858, "logps/chosen": -2.686223030090332, "logps/rejected": -2.7694153785705566, "loss": 4.6757, "rewards/accuracies": 0.5, "rewards/chosen": -26.862232208251953, "rewards/margins": 0.8319219350814819, "rewards/rejected": -27.69415283203125, "step": 20405 }, { "epoch": 0.6879234217533452, "grad_norm": 40.388038635253906, "learning_rate": 2.6847224355532296e-07, "logits/chosen": -1.528822660446167, "logits/rejected": -1.8464807271957397, "logps/chosen": -2.5221710205078125, "logps/rejected": -2.816987991333008, "loss": 2.6089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.221710205078125, "rewards/margins": 2.948167324066162, "rewards/rejected": -28.169879913330078, "step": 20410 }, { "epoch": 0.6880919478243285, "grad_norm": 32.86744689941406, "learning_rate": 2.6821158353864595e-07, "logits/chosen": -2.0796568393707275, "logits/rejected": -2.2831714153289795, "logps/chosen": -2.6853480339050293, "logps/rejected": -2.882089138031006, "loss": 3.618, "rewards/accuracies": 0.5, "rewards/chosen": -26.853485107421875, "rewards/margins": 1.9674084186553955, "rewards/rejected": -28.820892333984375, "step": 20415 }, { "epoch": 0.6882604738953116, "grad_norm": 95.84664916992188, "learning_rate": 2.679510037348077e-07, "logits/chosen": -1.9441862106323242, "logits/rejected": -2.119619607925415, "logps/chosen": -3.1259143352508545, "logps/rejected": -3.42171049118042, "loss": 3.0782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.259143829345703, "rewards/margins": 2.9579625129699707, "rewards/rejected": -34.217105865478516, "step": 20420 }, { "epoch": 0.6884289999662948, "grad_norm": 24.20979118347168, "learning_rate": 2.67690504233985e-07, "logits/chosen": -2.0083043575286865, "logits/rejected": -2.086310863494873, "logps/chosen": -2.2964296340942383, "logps/rejected": -2.89279842376709, "loss": 2.1356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.964298248291016, "rewards/margins": 5.963687896728516, "rewards/rejected": -28.9279842376709, "step": 20425 }, { "epoch": 0.688597526037278, "grad_norm": 31.725812911987305, "learning_rate": 2.674300851263259e-07, "logits/chosen": -1.6422641277313232, "logits/rejected": -1.8533084392547607, "logps/chosen": -1.8975311517715454, "logps/rejected": -2.256784200668335, "loss": 1.8337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.975311279296875, "rewards/margins": 3.592531204223633, "rewards/rejected": -22.567840576171875, "step": 20430 }, { "epoch": 0.6887660521082611, "grad_norm": 59.62810134887695, "learning_rate": 2.671697465019515e-07, "logits/chosen": -1.5278303623199463, "logits/rejected": -1.8468002080917358, "logps/chosen": -1.755629539489746, "logps/rejected": -1.7148040533065796, "loss": 3.7058, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.55629539489746, "rewards/margins": -0.40825533866882324, "rewards/rejected": -17.148038864135742, "step": 20435 }, { "epoch": 0.6889345781792443, "grad_norm": 377.57159423828125, "learning_rate": 2.669094884509546e-07, "logits/chosen": -1.3748226165771484, "logits/rejected": -1.3104236125946045, "logps/chosen": -3.0436301231384277, "logps/rejected": -3.194868803024292, "loss": 3.356, "rewards/accuracies": 0.5, "rewards/chosen": -30.436298370361328, "rewards/margins": 1.5123873949050903, "rewards/rejected": -31.948688507080078, "step": 20440 }, { "epoch": 0.6891031042502275, "grad_norm": 46.6607551574707, "learning_rate": 2.6664931106340064e-07, "logits/chosen": -2.249420166015625, "logits/rejected": -2.2896761894226074, "logps/chosen": -2.5717592239379883, "logps/rejected": -2.9195244312286377, "loss": 1.9671, "rewards/accuracies": 0.5, "rewards/chosen": -25.71759033203125, "rewards/margins": 3.477654218673706, "rewards/rejected": -29.19524574279785, "step": 20445 }, { "epoch": 0.6892716303212106, "grad_norm": 46.07664108276367, "learning_rate": 2.6638921442932627e-07, "logits/chosen": -1.4022904634475708, "logits/rejected": -1.3712940216064453, "logps/chosen": -2.298978328704834, "logps/rejected": -2.069422721862793, "loss": 5.3755, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.98978614807129, "rewards/margins": -2.2955572605133057, "rewards/rejected": -20.694225311279297, "step": 20450 }, { "epoch": 0.6894401563921939, "grad_norm": 39.8826904296875, "learning_rate": 2.6612919863874084e-07, "logits/chosen": -1.74521005153656, "logits/rejected": -1.838160514831543, "logps/chosen": -2.082080364227295, "logps/rejected": -2.506239652633667, "loss": 2.9543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.820804595947266, "rewards/margins": 4.241593837738037, "rewards/rejected": -25.062397003173828, "step": 20455 }, { "epoch": 0.6896086824631771, "grad_norm": 12.789778709411621, "learning_rate": 2.658692637816258e-07, "logits/chosen": -1.5193207263946533, "logits/rejected": -1.6041972637176514, "logps/chosen": -2.3925070762634277, "logps/rejected": -2.643028736114502, "loss": 1.9822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.92506980895996, "rewards/margins": 2.5052151679992676, "rewards/rejected": -26.430286407470703, "step": 20460 }, { "epoch": 0.6897772085341602, "grad_norm": 31.83477210998535, "learning_rate": 2.6560940994793403e-07, "logits/chosen": -1.491260290145874, "logits/rejected": -1.6489356756210327, "logps/chosen": -2.430779218673706, "logps/rejected": -2.6424241065979004, "loss": 3.0858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.30779457092285, "rewards/margins": 2.1164519786834717, "rewards/rejected": -26.424243927001953, "step": 20465 }, { "epoch": 0.6899457346051434, "grad_norm": 16.46843719482422, "learning_rate": 2.6534963722759085e-07, "logits/chosen": -2.0733752250671387, "logits/rejected": -1.9191596508026123, "logps/chosen": -1.8992841243743896, "logps/rejected": -1.9869730472564697, "loss": 2.9444, "rewards/accuracies": 0.5, "rewards/chosen": -18.992841720581055, "rewards/margins": 0.8768887519836426, "rewards/rejected": -19.869731903076172, "step": 20470 }, { "epoch": 0.6901142606761266, "grad_norm": 93.31346130371094, "learning_rate": 2.6508994571049337e-07, "logits/chosen": -1.970423936843872, "logits/rejected": -2.2703733444213867, "logps/chosen": -3.0131518840789795, "logps/rejected": -3.596086025238037, "loss": 1.6824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.131519317626953, "rewards/margins": 5.829344272613525, "rewards/rejected": -35.96086120605469, "step": 20475 }, { "epoch": 0.6902827867471097, "grad_norm": 59.351375579833984, "learning_rate": 2.648303354865108e-07, "logits/chosen": -1.5795114040374756, "logits/rejected": -1.784462571144104, "logps/chosen": -2.711531162261963, "logps/rejected": -2.824735164642334, "loss": 3.4056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.115314483642578, "rewards/margins": 1.1320381164550781, "rewards/rejected": -28.247350692749023, "step": 20480 }, { "epoch": 0.6904513128180929, "grad_norm": 36.14025115966797, "learning_rate": 2.645708066454836e-07, "logits/chosen": -2.1569674015045166, "logits/rejected": -2.4725911617279053, "logps/chosen": -2.0673184394836426, "logps/rejected": -2.4919426441192627, "loss": 2.1936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.67318344116211, "rewards/margins": 4.246241569519043, "rewards/rejected": -24.91942596435547, "step": 20485 }, { "epoch": 0.6906198388890762, "grad_norm": 36.176048278808594, "learning_rate": 2.643113592772247e-07, "logits/chosen": -1.268363356590271, "logits/rejected": -1.4549598693847656, "logps/chosen": -2.6716246604919434, "logps/rejected": -3.0097477436065674, "loss": 1.3822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.71624755859375, "rewards/margins": 3.38122820854187, "rewards/rejected": -30.09747314453125, "step": 20490 }, { "epoch": 0.6907883649600594, "grad_norm": 26.0761661529541, "learning_rate": 2.6405199347151853e-07, "logits/chosen": -1.4917685985565186, "logits/rejected": -1.7317289113998413, "logps/chosen": -2.0684714317321777, "logps/rejected": -2.4049010276794434, "loss": 2.7544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.68471336364746, "rewards/margins": 3.3642945289611816, "rewards/rejected": -24.049007415771484, "step": 20495 }, { "epoch": 0.6909568910310425, "grad_norm": 40.55641555786133, "learning_rate": 2.637927093181215e-07, "logits/chosen": -1.4711533784866333, "logits/rejected": -1.232787847518921, "logps/chosen": -2.4316444396972656, "logps/rejected": -2.3839781284332275, "loss": 3.7814, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.316442489624023, "rewards/margins": -0.4766610264778137, "rewards/rejected": -23.83978271484375, "step": 20500 }, { "epoch": 0.6911254171020257, "grad_norm": 34.28879928588867, "learning_rate": 2.635335069067617e-07, "logits/chosen": -1.5314624309539795, "logits/rejected": -1.6001373529434204, "logps/chosen": -2.8761401176452637, "logps/rejected": -2.869184970855713, "loss": 3.1917, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.761402130126953, "rewards/margins": -0.06955299526453018, "rewards/rejected": -28.691852569580078, "step": 20505 }, { "epoch": 0.6912939431730089, "grad_norm": 21.706228256225586, "learning_rate": 2.632743863271386e-07, "logits/chosen": -1.8885313272476196, "logits/rejected": -2.1504628658294678, "logps/chosen": -2.0663483142852783, "logps/rejected": -2.186183452606201, "loss": 2.5927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.663482666015625, "rewards/margins": 1.1983544826507568, "rewards/rejected": -21.861835479736328, "step": 20510 }, { "epoch": 0.691462469243992, "grad_norm": 41.952720642089844, "learning_rate": 2.6301534766892383e-07, "logits/chosen": -1.4128437042236328, "logits/rejected": -1.5041228532791138, "logps/chosen": -1.9707103967666626, "logps/rejected": -2.113816738128662, "loss": 2.0716, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.707103729248047, "rewards/margins": 1.4310643672943115, "rewards/rejected": -21.138168334960938, "step": 20515 }, { "epoch": 0.6916309953149752, "grad_norm": 18.8629207611084, "learning_rate": 2.627563910217603e-07, "logits/chosen": -1.3868954181671143, "logits/rejected": -1.3035885095596313, "logps/chosen": -3.475510358810425, "logps/rejected": -3.5753989219665527, "loss": 5.7918, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -34.755104064941406, "rewards/margins": 0.9988861083984375, "rewards/rejected": -35.753990173339844, "step": 20520 }, { "epoch": 0.6917995213859585, "grad_norm": 20.878881454467773, "learning_rate": 2.6249751647526284e-07, "logits/chosen": -1.4501540660858154, "logits/rejected": -1.6170969009399414, "logps/chosen": -1.8804584741592407, "logps/rejected": -2.7165768146514893, "loss": 2.1882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.804584503173828, "rewards/margins": 8.361184120178223, "rewards/rejected": -27.165767669677734, "step": 20525 }, { "epoch": 0.6919680474569416, "grad_norm": 41.715492248535156, "learning_rate": 2.62238724119018e-07, "logits/chosen": -1.8132165670394897, "logits/rejected": -1.8326460123062134, "logps/chosen": -2.0126876831054688, "logps/rejected": -2.183061122894287, "loss": 2.6684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.126874923706055, "rewards/margins": 1.7037353515625, "rewards/rejected": -21.830608367919922, "step": 20530 }, { "epoch": 0.6921365735279248, "grad_norm": 78.58806610107422, "learning_rate": 2.6198001404258306e-07, "logits/chosen": -1.914900779724121, "logits/rejected": -2.038290500640869, "logps/chosen": -2.3590197563171387, "logps/rejected": -2.56164288520813, "loss": 2.936, "rewards/accuracies": 0.5, "rewards/chosen": -23.590198516845703, "rewards/margins": 2.0262298583984375, "rewards/rejected": -25.616430282592773, "step": 20535 }, { "epoch": 0.692305099598908, "grad_norm": 21.73616600036621, "learning_rate": 2.617213863354876e-07, "logits/chosen": -1.6423991918563843, "logits/rejected": -1.8211101293563843, "logps/chosen": -2.6549861431121826, "logps/rejected": -2.900494337081909, "loss": 2.0109, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.549861907958984, "rewards/margins": 2.4550812244415283, "rewards/rejected": -29.00494384765625, "step": 20540 }, { "epoch": 0.6924736256698911, "grad_norm": 28.13576316833496, "learning_rate": 2.614628410872328e-07, "logits/chosen": -1.8414901494979858, "logits/rejected": -1.8130661249160767, "logps/chosen": -3.396226406097412, "logps/rejected": -3.634342908859253, "loss": 2.4793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.96226501464844, "rewards/margins": 2.3811628818511963, "rewards/rejected": -36.34342956542969, "step": 20545 }, { "epoch": 0.6926421517408743, "grad_norm": 57.43549346923828, "learning_rate": 2.612043783872905e-07, "logits/chosen": -1.3797636032104492, "logits/rejected": -1.239461064338684, "logps/chosen": -2.233649730682373, "logps/rejected": -2.3911919593811035, "loss": 3.2192, "rewards/accuracies": 0.5, "rewards/chosen": -22.336496353149414, "rewards/margins": 1.5754238367080688, "rewards/rejected": -23.91192054748535, "step": 20550 }, { "epoch": 0.6928106778118575, "grad_norm": 47.604774475097656, "learning_rate": 2.609459983251046e-07, "logits/chosen": -1.5278592109680176, "logits/rejected": -1.573185682296753, "logps/chosen": -2.482006072998047, "logps/rejected": -2.4904625415802, "loss": 3.5412, "rewards/accuracies": 0.5, "rewards/chosen": -24.820064544677734, "rewards/margins": 0.08456306159496307, "rewards/rejected": -24.904626846313477, "step": 20555 }, { "epoch": 0.6929792038828406, "grad_norm": 16.714946746826172, "learning_rate": 2.606877009900904e-07, "logits/chosen": -1.9386390447616577, "logits/rejected": -1.9331505298614502, "logps/chosen": -1.9429633617401123, "logps/rejected": -2.433903694152832, "loss": 2.6649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.42963218688965, "rewards/margins": 4.909402370452881, "rewards/rejected": -24.339035034179688, "step": 20560 }, { "epoch": 0.6931477299538239, "grad_norm": 5.052648544311523, "learning_rate": 2.6042948647163456e-07, "logits/chosen": -2.081897020339966, "logits/rejected": -2.2000374794006348, "logps/chosen": -2.3183398246765137, "logps/rejected": -2.636362075805664, "loss": 1.475, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.183399200439453, "rewards/margins": 3.180218458175659, "rewards/rejected": -26.36362075805664, "step": 20565 }, { "epoch": 0.6933162560248071, "grad_norm": 20.045211791992188, "learning_rate": 2.6017135485909445e-07, "logits/chosen": -1.1111745834350586, "logits/rejected": -1.1631591320037842, "logps/chosen": -1.8602969646453857, "logps/rejected": -1.9264259338378906, "loss": 2.8664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.602970123291016, "rewards/margins": 0.6612892150878906, "rewards/rejected": -19.264257431030273, "step": 20570 }, { "epoch": 0.6934847820957902, "grad_norm": 31.42527198791504, "learning_rate": 2.5991330624179967e-07, "logits/chosen": -1.3147919178009033, "logits/rejected": -2.23165225982666, "logps/chosen": -1.960165023803711, "logps/rejected": -2.15248966217041, "loss": 2.2321, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.60165023803711, "rewards/margins": 1.9232457876205444, "rewards/rejected": -21.5248966217041, "step": 20575 }, { "epoch": 0.6936533081667734, "grad_norm": 21.13239288330078, "learning_rate": 2.596553407090507e-07, "logits/chosen": -1.5352892875671387, "logits/rejected": -1.9442498683929443, "logps/chosen": -2.198362350463867, "logps/rejected": -2.5642826557159424, "loss": 2.603, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.983623504638672, "rewards/margins": 3.659203290939331, "rewards/rejected": -25.6428279876709, "step": 20580 }, { "epoch": 0.6938218342377566, "grad_norm": 0.00042211037361994386, "learning_rate": 2.5939745835011895e-07, "logits/chosen": -1.496949553489685, "logits/rejected": -1.9834476709365845, "logps/chosen": -2.410764455795288, "logps/rejected": -2.8452978134155273, "loss": 2.2064, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.10764503479004, "rewards/margins": 4.345332622528076, "rewards/rejected": -28.452978134155273, "step": 20585 }, { "epoch": 0.6939903603087397, "grad_norm": 22.815431594848633, "learning_rate": 2.5913965925424754e-07, "logits/chosen": -1.6021522283554077, "logits/rejected": -1.7010421752929688, "logps/chosen": -1.9284210205078125, "logps/rejected": -2.514775514602661, "loss": 1.6196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.28420639038086, "rewards/margins": 5.863546848297119, "rewards/rejected": -25.147754669189453, "step": 20590 }, { "epoch": 0.6941588863797229, "grad_norm": 128.66635131835938, "learning_rate": 2.588819435106504e-07, "logits/chosen": -2.0419507026672363, "logits/rejected": -1.9064838886260986, "logps/chosen": -2.3883216381073, "logps/rejected": -2.404067277908325, "loss": 2.9514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.883216857910156, "rewards/margins": 0.15745744109153748, "rewards/rejected": -24.040674209594727, "step": 20595 }, { "epoch": 0.6943274124507062, "grad_norm": 30.456762313842773, "learning_rate": 2.5862431120851324e-07, "logits/chosen": -1.8142229318618774, "logits/rejected": -2.2456400394439697, "logps/chosen": -2.5279510021209717, "logps/rejected": -3.1119022369384766, "loss": 2.8751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.279508590698242, "rewards/margins": 5.839513778686523, "rewards/rejected": -31.1190242767334, "step": 20600 }, { "epoch": 0.6944959385216893, "grad_norm": 26.131343841552734, "learning_rate": 2.58366762436992e-07, "logits/chosen": -1.7309370040893555, "logits/rejected": -2.1586380004882812, "logps/chosen": -2.619094133377075, "logps/rejected": -2.662580966949463, "loss": 4.1593, "rewards/accuracies": 0.5, "rewards/chosen": -26.190942764282227, "rewards/margins": 0.434866338968277, "rewards/rejected": -26.625812530517578, "step": 20605 }, { "epoch": 0.6946644645926725, "grad_norm": 43.826229095458984, "learning_rate": 2.5810929728521417e-07, "logits/chosen": -1.9372804164886475, "logits/rejected": -1.9223651885986328, "logps/chosen": -2.482267379760742, "logps/rejected": -2.6594791412353516, "loss": 2.3857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.822673797607422, "rewards/margins": 1.7721197605133057, "rewards/rejected": -26.594791412353516, "step": 20610 }, { "epoch": 0.6948329906636557, "grad_norm": 15.244111061096191, "learning_rate": 2.578519158422787e-07, "logits/chosen": -1.0707063674926758, "logits/rejected": -1.3074023723602295, "logps/chosen": -2.4689464569091797, "logps/rejected": -2.585275173187256, "loss": 2.8056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.68946647644043, "rewards/margins": 1.1632845401763916, "rewards/rejected": -25.852752685546875, "step": 20615 }, { "epoch": 0.6950015167346388, "grad_norm": 35.30781555175781, "learning_rate": 2.575946181972547e-07, "logits/chosen": -1.7129011154174805, "logits/rejected": -1.8900816440582275, "logps/chosen": -2.7587363719940186, "logps/rejected": -3.468085527420044, "loss": 3.7825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.58736228942871, "rewards/margins": 7.093493461608887, "rewards/rejected": -34.68085479736328, "step": 20620 }, { "epoch": 0.695170042805622, "grad_norm": 27.71534538269043, "learning_rate": 2.57337404439183e-07, "logits/chosen": -1.5547401905059814, "logits/rejected": -1.6852400302886963, "logps/chosen": -2.4740939140319824, "logps/rejected": -2.8958001136779785, "loss": 2.7703, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.74094009399414, "rewards/margins": 4.217062473297119, "rewards/rejected": -28.9580020904541, "step": 20625 }, { "epoch": 0.6953385688766052, "grad_norm": 17.399873733520508, "learning_rate": 2.5708027465707507e-07, "logits/chosen": -1.6606941223144531, "logits/rejected": -1.7552680969238281, "logps/chosen": -2.045785427093506, "logps/rejected": -2.209364175796509, "loss": 3.1454, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.45785140991211, "rewards/margins": 1.6357879638671875, "rewards/rejected": -22.093639373779297, "step": 20630 }, { "epoch": 0.6955070949475884, "grad_norm": 5.373404026031494, "learning_rate": 2.568232289399136e-07, "logits/chosen": -2.0698084831237793, "logits/rejected": -2.2751216888427734, "logps/chosen": -2.794049024581909, "logps/rejected": -3.2170653343200684, "loss": 1.6037, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.940486907958984, "rewards/margins": 4.230165004730225, "rewards/rejected": -32.170654296875, "step": 20635 }, { "epoch": 0.6956756210185716, "grad_norm": 26.428239822387695, "learning_rate": 2.5656626737665166e-07, "logits/chosen": -1.5411195755004883, "logits/rejected": -1.6710189580917358, "logps/chosen": -2.2027552127838135, "logps/rejected": -2.4791810512542725, "loss": 2.2223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.02755355834961, "rewards/margins": 2.7642579078674316, "rewards/rejected": -24.791810989379883, "step": 20640 }, { "epoch": 0.6958441470895548, "grad_norm": 25.9244384765625, "learning_rate": 2.5630939005621367e-07, "logits/chosen": -1.068873643875122, "logits/rejected": -1.6588131189346313, "logps/chosen": -2.1675708293914795, "logps/rejected": -2.701908588409424, "loss": 1.2802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.675708770751953, "rewards/margins": 5.343373775482178, "rewards/rejected": -27.01908302307129, "step": 20645 }, { "epoch": 0.6960126731605379, "grad_norm": 16.891956329345703, "learning_rate": 2.560525970674947e-07, "logits/chosen": -1.6296188831329346, "logits/rejected": -1.6298997402191162, "logps/chosen": -2.7089664936065674, "logps/rejected": -3.0198957920074463, "loss": 2.3573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.089664459228516, "rewards/margins": 3.1092934608459473, "rewards/rejected": -30.198955535888672, "step": 20650 }, { "epoch": 0.6961811992315211, "grad_norm": 21.705472946166992, "learning_rate": 2.557958884993607e-07, "logits/chosen": -2.1672890186309814, "logits/rejected": -2.3492963314056396, "logps/chosen": -1.8646361827850342, "logps/rejected": -2.2616653442382812, "loss": 2.0145, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.6463623046875, "rewards/margins": 3.970294237136841, "rewards/rejected": -22.616657257080078, "step": 20655 }, { "epoch": 0.6963497253025043, "grad_norm": 36.769920349121094, "learning_rate": 2.5553926444064856e-07, "logits/chosen": -1.727367639541626, "logits/rejected": -2.071427822113037, "logps/chosen": -2.185485363006592, "logps/rejected": -2.6844260692596436, "loss": 1.6868, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.8548526763916, "rewards/margins": 4.989406108856201, "rewards/rejected": -26.84425926208496, "step": 20660 }, { "epoch": 0.6965182513734874, "grad_norm": 18.47760009765625, "learning_rate": 2.552827249801656e-07, "logits/chosen": -1.3826062679290771, "logits/rejected": -1.8453384637832642, "logps/chosen": -2.333775281906128, "logps/rejected": -2.763153553009033, "loss": 1.6567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.337753295898438, "rewards/margins": 4.293783664703369, "rewards/rejected": -27.63153648376465, "step": 20665 }, { "epoch": 0.6966867774444706, "grad_norm": 68.66205596923828, "learning_rate": 2.550262702066902e-07, "logits/chosen": -1.6487001180648804, "logits/rejected": -2.1160476207733154, "logps/chosen": -1.9262282848358154, "logps/rejected": -2.10188889503479, "loss": 2.5036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.262283325195312, "rewards/margins": 1.756604552268982, "rewards/rejected": -21.01888656616211, "step": 20670 }, { "epoch": 0.6968553035154539, "grad_norm": 18.352575302124023, "learning_rate": 2.547699002089709e-07, "logits/chosen": -1.6079021692276, "logits/rejected": -1.6530320644378662, "logps/chosen": -3.115535259246826, "logps/rejected": -3.553027391433716, "loss": 2.8882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.155353546142578, "rewards/margins": 4.374924659729004, "rewards/rejected": -35.5302734375, "step": 20675 }, { "epoch": 0.697023829586437, "grad_norm": 4.454761505126953, "learning_rate": 2.545136150757275e-07, "logits/chosen": -1.9702221155166626, "logits/rejected": -2.310147285461426, "logps/chosen": -3.109532117843628, "logps/rejected": -3.1416773796081543, "loss": 3.6061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.095317840576172, "rewards/margins": 0.3214544355869293, "rewards/rejected": -31.41677474975586, "step": 20680 }, { "epoch": 0.6971923556574202, "grad_norm": 24.627824783325195, "learning_rate": 2.5425741489565035e-07, "logits/chosen": -1.629888892173767, "logits/rejected": -1.887681007385254, "logps/chosen": -2.459501028060913, "logps/rejected": -3.4645283222198486, "loss": 1.9909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.595012664794922, "rewards/margins": 10.050271987915039, "rewards/rejected": -34.64528274536133, "step": 20685 }, { "epoch": 0.6973608817284034, "grad_norm": 21.283910751342773, "learning_rate": 2.5400129975739973e-07, "logits/chosen": -1.552143931388855, "logits/rejected": -1.7271095514297485, "logps/chosen": -1.927848219871521, "logps/rejected": -2.278165340423584, "loss": 1.8268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.27848243713379, "rewards/margins": 3.503171443939209, "rewards/rejected": -22.78165626525879, "step": 20690 }, { "epoch": 0.6975294077993865, "grad_norm": 42.463069915771484, "learning_rate": 2.537452697496074e-07, "logits/chosen": -1.7690792083740234, "logits/rejected": -2.0024960041046143, "logps/chosen": -3.1089041233062744, "logps/rejected": -3.203892946243286, "loss": 6.6679, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -31.089040756225586, "rewards/margins": 0.9498880505561829, "rewards/rejected": -32.03893280029297, "step": 20695 }, { "epoch": 0.6976979338703697, "grad_norm": 219.65562438964844, "learning_rate": 2.5348932496087514e-07, "logits/chosen": -1.3836790323257446, "logits/rejected": -1.856666922569275, "logps/chosen": -2.4839062690734863, "logps/rejected": -3.0888781547546387, "loss": 2.7525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.839061737060547, "rewards/margins": 6.04971981048584, "rewards/rejected": -30.888782501220703, "step": 20700 }, { "epoch": 0.6978664599413529, "grad_norm": 164.99838256835938, "learning_rate": 2.532334654797756e-07, "logits/chosen": -1.8446996212005615, "logits/rejected": -2.135688304901123, "logps/chosen": -2.1551125049591064, "logps/rejected": -2.0222866535186768, "loss": 4.7932, "rewards/accuracies": 0.5, "rewards/chosen": -21.55112075805664, "rewards/margins": -1.3282577991485596, "rewards/rejected": -20.222864151000977, "step": 20705 }, { "epoch": 0.6980349860123362, "grad_norm": 44.29646682739258, "learning_rate": 2.5297769139485126e-07, "logits/chosen": -1.5946813821792603, "logits/rejected": -1.9342540502548218, "logps/chosen": -2.563175916671753, "logps/rejected": -2.7002692222595215, "loss": 4.7077, "rewards/accuracies": 0.5, "rewards/chosen": -25.631759643554688, "rewards/margins": 1.3709338903427124, "rewards/rejected": -27.002695083618164, "step": 20710 }, { "epoch": 0.6982035120833193, "grad_norm": 27.76547622680664, "learning_rate": 2.5272200279461554e-07, "logits/chosen": -1.6258023977279663, "logits/rejected": -1.7282302379608154, "logps/chosen": -2.21496319770813, "logps/rejected": -2.229520797729492, "loss": 4.102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.149633407592773, "rewards/margins": 0.14557456970214844, "rewards/rejected": -22.29520606994629, "step": 20715 }, { "epoch": 0.6983720381543025, "grad_norm": 20.176912307739258, "learning_rate": 2.5246639976755256e-07, "logits/chosen": -1.7401567697525024, "logits/rejected": -2.296189546585083, "logps/chosen": -2.269164562225342, "logps/rejected": -2.47044038772583, "loss": 3.4469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.69164276123047, "rewards/margins": 2.012756824493408, "rewards/rejected": -24.704402923583984, "step": 20720 }, { "epoch": 0.6985405642252857, "grad_norm": 26.711063385009766, "learning_rate": 2.5221088240211595e-07, "logits/chosen": -1.606109619140625, "logits/rejected": -2.1354870796203613, "logps/chosen": -2.067237377166748, "logps/rejected": -2.362346649169922, "loss": 2.7425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.672372817993164, "rewards/margins": 2.9510934352874756, "rewards/rejected": -23.62346839904785, "step": 20725 }, { "epoch": 0.6987090902962688, "grad_norm": 33.88330841064453, "learning_rate": 2.5195545078673055e-07, "logits/chosen": -1.2273902893066406, "logits/rejected": -1.6899696588516235, "logps/chosen": -2.1341545581817627, "logps/rejected": -2.3928472995758057, "loss": 2.0875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.341543197631836, "rewards/margins": 2.586925983428955, "rewards/rejected": -23.928470611572266, "step": 20730 }, { "epoch": 0.698877616367252, "grad_norm": 28.957759857177734, "learning_rate": 2.517001050097909e-07, "logits/chosen": -1.882333755493164, "logits/rejected": -1.8907684087753296, "logps/chosen": -1.8769035339355469, "logps/rejected": -2.2192816734313965, "loss": 3.232, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.769033432006836, "rewards/margins": 3.423780918121338, "rewards/rejected": -22.19281578063965, "step": 20735 }, { "epoch": 0.6990461424382352, "grad_norm": 38.6989631652832, "learning_rate": 2.5144484515966257e-07, "logits/chosen": -1.7968485355377197, "logits/rejected": -2.1946663856506348, "logps/chosen": -1.6905304193496704, "logps/rejected": -1.97846257686615, "loss": 2.7855, "rewards/accuracies": 0.5, "rewards/chosen": -16.905303955078125, "rewards/margins": 2.8793225288391113, "rewards/rejected": -19.784626007080078, "step": 20740 }, { "epoch": 0.6992146685092184, "grad_norm": 36.020259857177734, "learning_rate": 2.511896713246804e-07, "logits/chosen": -2.1380550861358643, "logits/rejected": -2.318734645843506, "logps/chosen": -2.142519950866699, "logps/rejected": -2.305680513381958, "loss": 2.8405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.425201416015625, "rewards/margins": 1.6316025257110596, "rewards/rejected": -23.05680274963379, "step": 20745 }, { "epoch": 0.6993831945802016, "grad_norm": 31.211973190307617, "learning_rate": 2.509345835931503e-07, "logits/chosen": -1.5926382541656494, "logits/rejected": -1.6798031330108643, "logps/chosen": -2.4966330528259277, "logps/rejected": -2.52616810798645, "loss": 3.0846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.966331481933594, "rewards/margins": 0.2953473925590515, "rewards/rejected": -25.26167869567871, "step": 20750 }, { "epoch": 0.6995517206511848, "grad_norm": 17.617544174194336, "learning_rate": 2.506795820533483e-07, "logits/chosen": -1.350720763206482, "logits/rejected": -1.8617594242095947, "logps/chosen": -2.2058634757995605, "logps/rejected": -2.441958427429199, "loss": 4.0364, "rewards/accuracies": 0.5, "rewards/chosen": -22.058635711669922, "rewards/margins": 2.360948085784912, "rewards/rejected": -24.41958236694336, "step": 20755 }, { "epoch": 0.6997202467221679, "grad_norm": 215.24154663085938, "learning_rate": 2.504246667935198e-07, "logits/chosen": -2.053504228591919, "logits/rejected": -2.227134943008423, "logps/chosen": -2.590571880340576, "logps/rejected": -2.6792871952056885, "loss": 6.1276, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.905715942382812, "rewards/margins": 0.8871553540229797, "rewards/rejected": -26.79287338256836, "step": 20760 }, { "epoch": 0.6998887727931511, "grad_norm": 46.3771858215332, "learning_rate": 2.501698379018813e-07, "logits/chosen": -1.6019909381866455, "logits/rejected": -1.7526795864105225, "logps/chosen": -1.9972448348999023, "logps/rejected": -2.048427104949951, "loss": 3.7901, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.972448348999023, "rewards/margins": 0.5118247270584106, "rewards/rejected": -20.48427391052246, "step": 20765 }, { "epoch": 0.7000572988641343, "grad_norm": 35.8875617980957, "learning_rate": 2.4991509546661896e-07, "logits/chosen": -1.7468448877334595, "logits/rejected": -2.6992127895355225, "logps/chosen": -2.6763198375701904, "logps/rejected": -3.5259883403778076, "loss": 1.6343, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.763198852539062, "rewards/margins": 8.496684074401855, "rewards/rejected": -35.25988006591797, "step": 20770 }, { "epoch": 0.7002258249351174, "grad_norm": 20.1556339263916, "learning_rate": 2.496604395758893e-07, "logits/chosen": -1.9715760946273804, "logits/rejected": -2.0841028690338135, "logps/chosen": -2.025458574295044, "logps/rejected": -2.2173969745635986, "loss": 2.2974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.254587173461914, "rewards/margins": 1.9193828105926514, "rewards/rejected": -22.173969268798828, "step": 20775 }, { "epoch": 0.7003943510061006, "grad_norm": 35.320377349853516, "learning_rate": 2.494058703178184e-07, "logits/chosen": -1.4174038171768188, "logits/rejected": -1.5131930112838745, "logps/chosen": -2.280522584915161, "logps/rejected": -2.666675090789795, "loss": 1.6929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.805225372314453, "rewards/margins": 3.8615264892578125, "rewards/rejected": -26.666751861572266, "step": 20780 }, { "epoch": 0.7005628770770839, "grad_norm": 25.47304344177246, "learning_rate": 2.491513877805027e-07, "logits/chosen": -1.593122959136963, "logits/rejected": -1.8874908685684204, "logps/chosen": -2.6540579795837402, "logps/rejected": -3.080254077911377, "loss": 3.1685, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.54058265686035, "rewards/margins": 4.261958122253418, "rewards/rejected": -30.802539825439453, "step": 20785 }, { "epoch": 0.700731403148067, "grad_norm": 30.636350631713867, "learning_rate": 2.4889699205200873e-07, "logits/chosen": -1.7904140949249268, "logits/rejected": -1.8597707748413086, "logps/chosen": -2.0977234840393066, "logps/rejected": -2.4295527935028076, "loss": 2.3094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.977237701416016, "rewards/margins": 3.3182921409606934, "rewards/rejected": -24.295530319213867, "step": 20790 }, { "epoch": 0.7008999292190502, "grad_norm": 33.05244827270508, "learning_rate": 2.486426832203727e-07, "logits/chosen": -1.911708116531372, "logits/rejected": -1.7518056631088257, "logps/chosen": -2.130955696105957, "logps/rejected": -2.179682970046997, "loss": 4.1876, "rewards/accuracies": 0.5, "rewards/chosen": -21.309558868408203, "rewards/margins": 0.4872714877128601, "rewards/rejected": -21.796829223632812, "step": 20795 }, { "epoch": 0.7010684552900334, "grad_norm": 136.613037109375, "learning_rate": 2.483884613736009e-07, "logits/chosen": -1.943140983581543, "logits/rejected": -1.82207453250885, "logps/chosen": -2.9635822772979736, "logps/rejected": -2.933582305908203, "loss": 3.9719, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -29.63582420349121, "rewards/margins": -0.29999876022338867, "rewards/rejected": -29.3358211517334, "step": 20800 }, { "epoch": 0.7010684552900334, "eval_logits/chosen": -2.159958600997925, "eval_logits/rejected": -2.326280117034912, "eval_logps/chosen": -2.2271804809570312, "eval_logps/rejected": -2.3731677532196045, "eval_loss": 3.059114694595337, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.271800994873047, "eval_rewards/margins": 1.4598737955093384, "eval_rewards/rejected": -23.731678009033203, "eval_runtime": 12.8911, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 20800 }, { "epoch": 0.7012369813610165, "grad_norm": 9.893948554992676, "learning_rate": 2.481343265996697e-07, "logits/chosen": -2.0953831672668457, "logits/rejected": -2.207719087600708, "logps/chosen": -2.3842544555664062, "logps/rejected": -2.877701997756958, "loss": 1.0689, "rewards/accuracies": 1.0, "rewards/chosen": -23.84254264831543, "rewards/margins": 4.934475898742676, "rewards/rejected": -28.777019500732422, "step": 20805 }, { "epoch": 0.7014055074319997, "grad_norm": 32.668060302734375, "learning_rate": 2.478802789865248e-07, "logits/chosen": -1.3642785549163818, "logits/rejected": -1.369171380996704, "logps/chosen": -2.1678147315979004, "logps/rejected": -2.1231637001037598, "loss": 3.666, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.67814826965332, "rewards/margins": -0.44651031494140625, "rewards/rejected": -21.231637954711914, "step": 20810 }, { "epoch": 0.7015740335029829, "grad_norm": 20.985637664794922, "learning_rate": 2.476263186220822e-07, "logits/chosen": -1.3993022441864014, "logits/rejected": -1.4622784852981567, "logps/chosen": -2.4692177772521973, "logps/rejected": -2.36029314994812, "loss": 4.523, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.69217872619629, "rewards/margins": -1.0892469882965088, "rewards/rejected": -23.60293197631836, "step": 20815 }, { "epoch": 0.7017425595739661, "grad_norm": 19.307912826538086, "learning_rate": 2.4737244559422765e-07, "logits/chosen": -1.744577407836914, "logits/rejected": -1.6804134845733643, "logps/chosen": -2.9628689289093018, "logps/rejected": -3.169175386428833, "loss": 3.7887, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.628686904907227, "rewards/margins": 2.0630671977996826, "rewards/rejected": -31.691753387451172, "step": 20820 }, { "epoch": 0.7019110856449493, "grad_norm": 99.77359008789062, "learning_rate": 2.471186599908167e-07, "logits/chosen": -1.737992525100708, "logits/rejected": -1.4738280773162842, "logps/chosen": -3.280036211013794, "logps/rejected": -3.5343995094299316, "loss": 3.4004, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.80036544799805, "rewards/margins": 2.543631076812744, "rewards/rejected": -35.343994140625, "step": 20825 }, { "epoch": 0.7020796117159325, "grad_norm": 34.829166412353516, "learning_rate": 2.468649618996742e-07, "logits/chosen": -1.4518276453018188, "logits/rejected": -1.854933500289917, "logps/chosen": -2.5755703449249268, "logps/rejected": -2.7659616470336914, "loss": 2.5933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.755701065063477, "rewards/margins": 1.903914451599121, "rewards/rejected": -27.659616470336914, "step": 20830 }, { "epoch": 0.7022481377869156, "grad_norm": 41.89667510986328, "learning_rate": 2.466113514085953e-07, "logits/chosen": -1.7497894763946533, "logits/rejected": -1.406106948852539, "logps/chosen": -2.2218799591064453, "logps/rejected": -2.411547899246216, "loss": 2.422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.218799591064453, "rewards/margins": 1.8966801166534424, "rewards/rejected": -24.115480422973633, "step": 20835 }, { "epoch": 0.7024166638578988, "grad_norm": 29.444116592407227, "learning_rate": 2.4635782860534454e-07, "logits/chosen": -1.7633018493652344, "logits/rejected": -1.7266566753387451, "logps/chosen": -1.7943710088729858, "logps/rejected": -1.8583873510360718, "loss": 2.841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.94371223449707, "rewards/margins": 0.6401627659797668, "rewards/rejected": -18.583873748779297, "step": 20840 }, { "epoch": 0.702585189928882, "grad_norm": 20.048221588134766, "learning_rate": 2.4610439357765637e-07, "logits/chosen": -2.148761034011841, "logits/rejected": -2.7610249519348145, "logps/chosen": -2.7624688148498535, "logps/rejected": -3.8066444396972656, "loss": 1.4623, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.62468910217285, "rewards/margins": 10.441758155822754, "rewards/rejected": -38.066444396972656, "step": 20845 }, { "epoch": 0.7027537159998651, "grad_norm": 19.170835494995117, "learning_rate": 2.458510464132343e-07, "logits/chosen": -1.7249538898468018, "logits/rejected": -1.8920398950576782, "logps/chosen": -1.8914012908935547, "logps/rejected": -2.133737087249756, "loss": 2.353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.914012908935547, "rewards/margins": 2.4233574867248535, "rewards/rejected": -21.337369918823242, "step": 20850 }, { "epoch": 0.7029222420708484, "grad_norm": 12.12578296661377, "learning_rate": 2.4559778719975207e-07, "logits/chosen": -0.9357792139053345, "logits/rejected": -1.007215142250061, "logps/chosen": -2.2251079082489014, "logps/rejected": -2.2693939208984375, "loss": 3.4564, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.251079559326172, "rewards/margins": 0.4428592622280121, "rewards/rejected": -22.693939208984375, "step": 20855 }, { "epoch": 0.7030907681418316, "grad_norm": 16.79740333557129, "learning_rate": 2.453446160248528e-07, "logits/chosen": -1.680640459060669, "logits/rejected": -1.6724519729614258, "logps/chosen": -2.378614664077759, "logps/rejected": -2.7835144996643066, "loss": 2.2559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.78614616394043, "rewards/margins": 4.048998832702637, "rewards/rejected": -27.835147857666016, "step": 20860 }, { "epoch": 0.7032592942128147, "grad_norm": 19.344402313232422, "learning_rate": 2.4509153297614865e-07, "logits/chosen": -0.960638701915741, "logits/rejected": -1.0211657285690308, "logps/chosen": -2.17290997505188, "logps/rejected": -2.6399240493774414, "loss": 2.242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.729097366333008, "rewards/margins": 4.67014217376709, "rewards/rejected": -26.399240493774414, "step": 20865 }, { "epoch": 0.7034278202837979, "grad_norm": 33.54787826538086, "learning_rate": 2.44838538141222e-07, "logits/chosen": -1.5037105083465576, "logits/rejected": -1.8484117984771729, "logps/chosen": -2.09523344039917, "logps/rejected": -2.2684638500213623, "loss": 2.36, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.952335357666016, "rewards/margins": 1.7323029041290283, "rewards/rejected": -22.684640884399414, "step": 20870 }, { "epoch": 0.7035963463547811, "grad_norm": 24.71225929260254, "learning_rate": 2.4458563160762435e-07, "logits/chosen": -1.7637755870819092, "logits/rejected": -1.9183088541030884, "logps/chosen": -2.2071845531463623, "logps/rejected": -2.446314811706543, "loss": 1.6382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.07184600830078, "rewards/margins": 2.391300678253174, "rewards/rejected": -24.463146209716797, "step": 20875 }, { "epoch": 0.7037648724257642, "grad_norm": 35.04969787597656, "learning_rate": 2.4433281346287683e-07, "logits/chosen": -1.8257334232330322, "logits/rejected": -1.829982042312622, "logps/chosen": -1.7968488931655884, "logps/rejected": -1.9402484893798828, "loss": 2.2478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -17.968490600585938, "rewards/margins": 1.433994174003601, "rewards/rejected": -19.402484893798828, "step": 20880 }, { "epoch": 0.7039333984967474, "grad_norm": 15.392056465148926, "learning_rate": 2.4408008379446956e-07, "logits/chosen": -1.6506710052490234, "logits/rejected": -1.8374980688095093, "logps/chosen": -2.2430996894836426, "logps/rejected": -2.7005362510681152, "loss": 2.1661, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.43099594116211, "rewards/margins": 4.574366092681885, "rewards/rejected": -27.005361557006836, "step": 20885 }, { "epoch": 0.7041019245677306, "grad_norm": 218.97528076171875, "learning_rate": 2.4382744268986235e-07, "logits/chosen": -1.8860218524932861, "logits/rejected": -2.2579519748687744, "logps/chosen": -2.500788927078247, "logps/rejected": -2.6374239921569824, "loss": 3.5651, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.007888793945312, "rewards/margins": 1.3663525581359863, "rewards/rejected": -26.37424087524414, "step": 20890 }, { "epoch": 0.7042704506387139, "grad_norm": 124.85858154296875, "learning_rate": 2.435748902364847e-07, "logits/chosen": -1.345632791519165, "logits/rejected": -1.4876656532287598, "logps/chosen": -2.6240341663360596, "logps/rejected": -2.737029552459717, "loss": 4.7415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.240345001220703, "rewards/margins": 1.1299524307250977, "rewards/rejected": -27.37029457092285, "step": 20895 }, { "epoch": 0.704438976709697, "grad_norm": 20.623889923095703, "learning_rate": 2.4332242652173455e-07, "logits/chosen": -2.132624387741089, "logits/rejected": -2.2081000804901123, "logps/chosen": -2.1483864784240723, "logps/rejected": -2.2890305519104004, "loss": 3.4368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.483861923217773, "rewards/margins": 1.4064419269561768, "rewards/rejected": -22.89030647277832, "step": 20900 }, { "epoch": 0.7046075027806802, "grad_norm": 18.654865264892578, "learning_rate": 2.430700516329799e-07, "logits/chosen": -1.672620415687561, "logits/rejected": -1.7438329458236694, "logps/chosen": -2.1926429271698, "logps/rejected": -2.2783713340759277, "loss": 2.5433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.926427841186523, "rewards/margins": 0.8572842478752136, "rewards/rejected": -22.783714294433594, "step": 20905 }, { "epoch": 0.7047760288516633, "grad_norm": 30.57003402709961, "learning_rate": 2.4281776565755776e-07, "logits/chosen": -1.8434076309204102, "logits/rejected": -1.8538297414779663, "logps/chosen": -3.4328250885009766, "logps/rejected": -3.8476052284240723, "loss": 4.098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.328250885009766, "rewards/margins": 4.147800922393799, "rewards/rejected": -38.476051330566406, "step": 20910 }, { "epoch": 0.7049445549226465, "grad_norm": 32.916893005371094, "learning_rate": 2.425655686827745e-07, "logits/chosen": -2.005236864089966, "logits/rejected": -2.2738490104675293, "logps/chosen": -2.3984122276306152, "logps/rejected": -2.629697561264038, "loss": 2.0332, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.984121322631836, "rewards/margins": 2.3128538131713867, "rewards/rejected": -26.29697608947754, "step": 20915 }, { "epoch": 0.7051130809936297, "grad_norm": 22.032899856567383, "learning_rate": 2.4231346079590525e-07, "logits/chosen": -1.897334337234497, "logits/rejected": -2.2675790786743164, "logps/chosen": -2.951364517211914, "logps/rejected": -2.873973846435547, "loss": 4.657, "rewards/accuracies": 0.5, "rewards/chosen": -29.513647079467773, "rewards/margins": -0.7739073038101196, "rewards/rejected": -28.7397403717041, "step": 20920 }, { "epoch": 0.7052816070646128, "grad_norm": 175.79537963867188, "learning_rate": 2.4206144208419484e-07, "logits/chosen": -1.4970096349716187, "logits/rejected": -1.7550102472305298, "logps/chosen": -2.5946249961853027, "logps/rejected": -2.578613758087158, "loss": 3.6519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.946252822875977, "rewards/margins": -0.16011133790016174, "rewards/rejected": -25.7861385345459, "step": 20925 }, { "epoch": 0.7054501331355961, "grad_norm": 37.29673767089844, "learning_rate": 2.418095126348568e-07, "logits/chosen": -1.363229751586914, "logits/rejected": -1.3798617124557495, "logps/chosen": -2.4268722534179688, "logps/rejected": -2.4810168743133545, "loss": 3.277, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.26872444152832, "rewards/margins": 0.5414448976516724, "rewards/rejected": -24.810169219970703, "step": 20930 }, { "epoch": 0.7056186592065793, "grad_norm": 32.56948471069336, "learning_rate": 2.415576725350745e-07, "logits/chosen": -1.3033921718597412, "logits/rejected": -1.7104156017303467, "logps/chosen": -1.9265705347061157, "logps/rejected": -1.993730902671814, "loss": 2.7227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.265705108642578, "rewards/margins": 0.6716042757034302, "rewards/rejected": -19.93731117248535, "step": 20935 }, { "epoch": 0.7057871852775625, "grad_norm": 88.04150390625, "learning_rate": 2.413059218719992e-07, "logits/chosen": -1.3537064790725708, "logits/rejected": -1.228635549545288, "logps/chosen": -2.790931463241577, "logps/rejected": -2.846214771270752, "loss": 4.0388, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.909313201904297, "rewards/margins": 0.5528322458267212, "rewards/rejected": -28.462146759033203, "step": 20940 }, { "epoch": 0.7059557113485456, "grad_norm": 106.84761047363281, "learning_rate": 2.4105426073275227e-07, "logits/chosen": -1.6752468347549438, "logits/rejected": -1.9212989807128906, "logps/chosen": -2.588087558746338, "logps/rejected": -2.9983863830566406, "loss": 2.8237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.880874633789062, "rewards/margins": 4.1029887199401855, "rewards/rejected": -29.983861923217773, "step": 20945 }, { "epoch": 0.7061242374195288, "grad_norm": 23.526409149169922, "learning_rate": 2.408026892044236e-07, "logits/chosen": -1.0860170125961304, "logits/rejected": -1.2228302955627441, "logps/chosen": -2.1994411945343018, "logps/rejected": -2.527712345123291, "loss": 1.9611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.994409561157227, "rewards/margins": 3.2827117443084717, "rewards/rejected": -25.277124404907227, "step": 20950 }, { "epoch": 0.706292763490512, "grad_norm": 119.02529907226562, "learning_rate": 2.405512073740722e-07, "logits/chosen": -1.8486545085906982, "logits/rejected": -2.03426194190979, "logps/chosen": -3.235337018966675, "logps/rejected": -3.7842392921447754, "loss": 3.0592, "rewards/accuracies": 0.5, "rewards/chosen": -32.353370666503906, "rewards/margins": 5.489018440246582, "rewards/rejected": -37.84238815307617, "step": 20955 }, { "epoch": 0.7064612895614951, "grad_norm": 157.5975799560547, "learning_rate": 2.402998153287261e-07, "logits/chosen": -1.907758355140686, "logits/rejected": -1.7703840732574463, "logps/chosen": -3.10050630569458, "logps/rejected": -2.9990952014923096, "loss": 4.1782, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.00506019592285, "rewards/margins": -1.0141105651855469, "rewards/rejected": -29.990951538085938, "step": 20960 }, { "epoch": 0.7066298156324784, "grad_norm": 33.8292121887207, "learning_rate": 2.400485131553823e-07, "logits/chosen": -1.400076150894165, "logits/rejected": -1.7552769184112549, "logps/chosen": -2.0182392597198486, "logps/rejected": -2.322455644607544, "loss": 2.0623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.182388305664062, "rewards/margins": 3.0421645641326904, "rewards/rejected": -23.22455596923828, "step": 20965 }, { "epoch": 0.7067983417034616, "grad_norm": 28.48842430114746, "learning_rate": 2.397973009410063e-07, "logits/chosen": -1.8866052627563477, "logits/rejected": -2.1081717014312744, "logps/chosen": -2.4975154399871826, "logps/rejected": -2.766672134399414, "loss": 2.5916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.97515869140625, "rewards/margins": 2.6915647983551025, "rewards/rejected": -27.666723251342773, "step": 20970 }, { "epoch": 0.7069668677744447, "grad_norm": 31.44121551513672, "learning_rate": 2.395461787725328e-07, "logits/chosen": -1.7562042474746704, "logits/rejected": -1.8775659799575806, "logps/chosen": -2.86181378364563, "logps/rejected": -3.6873226165771484, "loss": 1.5245, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.618139266967773, "rewards/margins": 8.255086898803711, "rewards/rejected": -36.87322235107422, "step": 20975 }, { "epoch": 0.7071353938454279, "grad_norm": 24.774744033813477, "learning_rate": 2.392951467368654e-07, "logits/chosen": -1.6393998861312866, "logits/rejected": -1.6322141885757446, "logps/chosen": -2.874117612838745, "logps/rejected": -3.365924835205078, "loss": 1.5556, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -28.741174697875977, "rewards/margins": 4.918069362640381, "rewards/rejected": -33.659244537353516, "step": 20980 }, { "epoch": 0.7073039199164111, "grad_norm": 21.506885528564453, "learning_rate": 2.390442049208765e-07, "logits/chosen": -1.6719391345977783, "logits/rejected": -2.0707993507385254, "logps/chosen": -1.975489854812622, "logps/rejected": -2.183727741241455, "loss": 1.8297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.754898071289062, "rewards/margins": 2.0823795795440674, "rewards/rejected": -21.837276458740234, "step": 20985 }, { "epoch": 0.7074724459873942, "grad_norm": 8.369352340698242, "learning_rate": 2.3879335341140684e-07, "logits/chosen": -1.5787404775619507, "logits/rejected": -1.8783071041107178, "logps/chosen": -1.9943172931671143, "logps/rejected": -2.20145583152771, "loss": 2.0157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.943172454833984, "rewards/margins": 2.071387767791748, "rewards/rejected": -22.01456069946289, "step": 20990 }, { "epoch": 0.7076409720583774, "grad_norm": 34.375, "learning_rate": 2.3854259229526647e-07, "logits/chosen": -1.6097021102905273, "logits/rejected": -2.271876096725464, "logps/chosen": -2.341718912124634, "logps/rejected": -2.7711498737335205, "loss": 1.8549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.417186737060547, "rewards/margins": 4.294312477111816, "rewards/rejected": -27.711498260498047, "step": 20995 }, { "epoch": 0.7078094981293606, "grad_norm": 54.840301513671875, "learning_rate": 2.3829192165923407e-07, "logits/chosen": -2.0436813831329346, "logits/rejected": -2.13887095451355, "logps/chosen": -2.341813802719116, "logps/rejected": -2.5769951343536377, "loss": 2.6942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.418140411376953, "rewards/margins": 2.351813793182373, "rewards/rejected": -25.76995277404785, "step": 21000 }, { "epoch": 0.7079780242003438, "grad_norm": 41.41640853881836, "learning_rate": 2.3804134159005652e-07, "logits/chosen": -1.459006667137146, "logits/rejected": -1.6995465755462646, "logps/chosen": -2.172149181365967, "logps/rejected": -2.6103742122650146, "loss": 3.07, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.721492767333984, "rewards/margins": 4.382250785827637, "rewards/rejected": -26.103744506835938, "step": 21005 }, { "epoch": 0.708146550271327, "grad_norm": 48.500545501708984, "learning_rate": 2.3779085217444983e-07, "logits/chosen": -1.4249950647354126, "logits/rejected": -1.457912802696228, "logps/chosen": -2.4428913593292236, "logps/rejected": -2.856995105743408, "loss": 1.5174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.428913116455078, "rewards/margins": 4.141038417816162, "rewards/rejected": -28.5699520111084, "step": 21010 }, { "epoch": 0.7083150763423102, "grad_norm": 55.933231353759766, "learning_rate": 2.3754045349909862e-07, "logits/chosen": -2.118337869644165, "logits/rejected": -2.549415111541748, "logps/chosen": -2.5966598987579346, "logps/rejected": -2.9401957988739014, "loss": 2.3674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.966602325439453, "rewards/margins": 3.435359239578247, "rewards/rejected": -29.40195655822754, "step": 21015 }, { "epoch": 0.7084836024132933, "grad_norm": 33.618892669677734, "learning_rate": 2.3729014565065614e-07, "logits/chosen": -1.8896814584732056, "logits/rejected": -2.2039222717285156, "logps/chosen": -2.2860429286956787, "logps/rejected": -2.5437560081481934, "loss": 1.7982, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.860427856445312, "rewards/margins": 2.577130079269409, "rewards/rejected": -25.43756103515625, "step": 21020 }, { "epoch": 0.7086521284842765, "grad_norm": 43.32668685913086, "learning_rate": 2.3703992871574367e-07, "logits/chosen": -1.524289846420288, "logits/rejected": -1.53403902053833, "logps/chosen": -2.3910186290740967, "logps/rejected": -2.4618887901306152, "loss": 3.302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.910188674926758, "rewards/margins": 0.7087002992630005, "rewards/rejected": -24.618886947631836, "step": 21025 }, { "epoch": 0.7088206545552597, "grad_norm": 21.206262588500977, "learning_rate": 2.3678980278095174e-07, "logits/chosen": -2.2566733360290527, "logits/rejected": -2.3251144886016846, "logps/chosen": -2.616288423538208, "logps/rejected": -2.5294570922851562, "loss": 4.4404, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.16288185119629, "rewards/margins": -0.8683112263679504, "rewards/rejected": -25.294570922851562, "step": 21030 }, { "epoch": 0.7089891806262428, "grad_norm": 30.227651596069336, "learning_rate": 2.3653976793283913e-07, "logits/chosen": -1.8887183666229248, "logits/rejected": -2.139434337615967, "logps/chosen": -2.6790931224823, "logps/rejected": -3.093452215194702, "loss": 2.1032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.790935516357422, "rewards/margins": 4.143587112426758, "rewards/rejected": -30.934520721435547, "step": 21035 }, { "epoch": 0.7091577066972261, "grad_norm": 29.77997589111328, "learning_rate": 2.3628982425793276e-07, "logits/chosen": -2.377270460128784, "logits/rejected": -2.5480237007141113, "logps/chosen": -3.316850185394287, "logps/rejected": -3.6830153465270996, "loss": 2.4897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.16849899291992, "rewards/margins": 3.6616523265838623, "rewards/rejected": -36.83015441894531, "step": 21040 }, { "epoch": 0.7093262327682093, "grad_norm": 26.390913009643555, "learning_rate": 2.3603997184272845e-07, "logits/chosen": -1.281688928604126, "logits/rejected": -1.5339549779891968, "logps/chosen": -2.191223621368408, "logps/rejected": -2.870591878890991, "loss": 2.2178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.9122371673584, "rewards/margins": 6.793682098388672, "rewards/rejected": -28.705921173095703, "step": 21045 }, { "epoch": 0.7094947588391924, "grad_norm": 0.2677067816257477, "learning_rate": 2.3579021077369045e-07, "logits/chosen": -1.5607569217681885, "logits/rejected": -1.7881824970245361, "logps/chosen": -3.0007331371307373, "logps/rejected": -3.3100173473358154, "loss": 3.1781, "rewards/accuracies": 0.5, "rewards/chosen": -30.0073299407959, "rewards/margins": 3.092841625213623, "rewards/rejected": -33.10017776489258, "step": 21050 }, { "epoch": 0.7096632849101756, "grad_norm": 26.878314971923828, "learning_rate": 2.3554054113725087e-07, "logits/chosen": -1.8124243021011353, "logits/rejected": -1.9376004934310913, "logps/chosen": -2.7536587715148926, "logps/rejected": -3.0808892250061035, "loss": 2.0869, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.536584854125977, "rewards/margins": 3.272303819656372, "rewards/rejected": -30.808889389038086, "step": 21055 }, { "epoch": 0.7098318109811588, "grad_norm": 27.73183250427246, "learning_rate": 2.3529096301981066e-07, "logits/chosen": -1.2320560216903687, "logits/rejected": -1.4639769792556763, "logps/chosen": -2.032846689224243, "logps/rejected": -2.275391101837158, "loss": 2.2245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.328466415405273, "rewards/margins": 2.425446033477783, "rewards/rejected": -22.7539119720459, "step": 21060 }, { "epoch": 0.7100003370521419, "grad_norm": 36.29167556762695, "learning_rate": 2.3504147650773908e-07, "logits/chosen": -1.829874038696289, "logits/rejected": -2.2702250480651855, "logps/chosen": -1.9189624786376953, "logps/rejected": -2.343379020690918, "loss": 2.3178, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.189624786376953, "rewards/margins": 4.24416446685791, "rewards/rejected": -23.433788299560547, "step": 21065 }, { "epoch": 0.7101688631231251, "grad_norm": 16.855567932128906, "learning_rate": 2.3479208168737375e-07, "logits/chosen": -1.4624030590057373, "logits/rejected": -1.6850998401641846, "logps/chosen": -2.534311294555664, "logps/rejected": -3.1778666973114014, "loss": 1.4996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.34311294555664, "rewards/margins": 6.435555458068848, "rewards/rejected": -31.778667449951172, "step": 21070 }, { "epoch": 0.7103373891941084, "grad_norm": 33.9194221496582, "learning_rate": 2.3454277864501993e-07, "logits/chosen": -1.355196237564087, "logits/rejected": -1.965765357017517, "logps/chosen": -2.3128390312194824, "logps/rejected": -2.840618133544922, "loss": 2.5798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.12839126586914, "rewards/margins": 5.2777886390686035, "rewards/rejected": -28.40618324279785, "step": 21075 }, { "epoch": 0.7105059152650915, "grad_norm": 116.49337005615234, "learning_rate": 2.3429356746695183e-07, "logits/chosen": -2.102574348449707, "logits/rejected": -2.062224864959717, "logps/chosen": -3.443988800048828, "logps/rejected": -3.4509689807891846, "loss": 7.7899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.43988800048828, "rewards/margins": 0.06980228424072266, "rewards/rejected": -34.50969314575195, "step": 21080 }, { "epoch": 0.7106744413360747, "grad_norm": 51.51720428466797, "learning_rate": 2.340444482394116e-07, "logits/chosen": -2.1616411209106445, "logits/rejected": -1.7606723308563232, "logps/chosen": -2.331195831298828, "logps/rejected": -2.2554385662078857, "loss": 4.5934, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -23.311954498291016, "rewards/margins": -0.7575688362121582, "rewards/rejected": -22.55438804626465, "step": 21085 }, { "epoch": 0.7108429674070579, "grad_norm": 23.593618392944336, "learning_rate": 2.3379542104860966e-07, "logits/chosen": -1.722654104232788, "logits/rejected": -2.115140438079834, "logps/chosen": -2.236232042312622, "logps/rejected": -2.5139546394348145, "loss": 1.7579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.36231803894043, "rewards/margins": 2.7772271633148193, "rewards/rejected": -25.139545440673828, "step": 21090 }, { "epoch": 0.711011493478041, "grad_norm": 31.937267303466797, "learning_rate": 2.335464859807244e-07, "logits/chosen": -1.7309108972549438, "logits/rejected": -1.8715778589248657, "logps/chosen": -1.8176692724227905, "logps/rejected": -2.0656280517578125, "loss": 1.9252, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.176692962646484, "rewards/margins": 2.479588747024536, "rewards/rejected": -20.656280517578125, "step": 21095 }, { "epoch": 0.7111800195490242, "grad_norm": 1.1448978185653687, "learning_rate": 2.3329764312190252e-07, "logits/chosen": -1.9613860845565796, "logits/rejected": -2.300971269607544, "logps/chosen": -2.1797537803649902, "logps/rejected": -3.2657864093780518, "loss": 0.8235, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.797536849975586, "rewards/margins": 10.860325813293457, "rewards/rejected": -32.65786361694336, "step": 21100 }, { "epoch": 0.7113485456200074, "grad_norm": 23.70174217224121, "learning_rate": 2.3304889255825894e-07, "logits/chosen": -1.4914168119430542, "logits/rejected": -1.7354528903961182, "logps/chosen": -2.097081184387207, "logps/rejected": -2.174804210662842, "loss": 3.2589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.970809936523438, "rewards/margins": 0.7772310376167297, "rewards/rejected": -21.7480411529541, "step": 21105 }, { "epoch": 0.7115170716909905, "grad_norm": 29.179771423339844, "learning_rate": 2.3280023437587592e-07, "logits/chosen": -1.5822515487670898, "logits/rejected": -1.7653348445892334, "logps/chosen": -2.2154393196105957, "logps/rejected": -2.228299617767334, "loss": 2.9847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.15439224243164, "rewards/margins": 0.12860460579395294, "rewards/rejected": -22.28299903869629, "step": 21110 }, { "epoch": 0.7116855977619738, "grad_norm": 26.02365493774414, "learning_rate": 2.3255166866080456e-07, "logits/chosen": -1.494226098060608, "logits/rejected": -1.6853415966033936, "logps/chosen": -2.2476089000701904, "logps/rejected": -2.3578267097473145, "loss": 2.6978, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.47608757019043, "rewards/margins": 1.1021760702133179, "rewards/rejected": -23.578266143798828, "step": 21115 }, { "epoch": 0.711854123832957, "grad_norm": 103.08861541748047, "learning_rate": 2.3230319549906385e-07, "logits/chosen": -1.0157320499420166, "logits/rejected": -1.02422034740448, "logps/chosen": -2.9622600078582764, "logps/rejected": -3.1863622665405273, "loss": 3.1049, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -29.622600555419922, "rewards/margins": 2.2410223484039307, "rewards/rejected": -31.86362075805664, "step": 21120 }, { "epoch": 0.7120226499039402, "grad_norm": 46.99019241333008, "learning_rate": 2.320548149766401e-07, "logits/chosen": -2.065828800201416, "logits/rejected": -1.975515604019165, "logps/chosen": -2.1992688179016113, "logps/rejected": -2.2332983016967773, "loss": 3.2294, "rewards/accuracies": 0.5, "rewards/chosen": -21.992687225341797, "rewards/margins": 0.3402930200099945, "rewards/rejected": -22.33298110961914, "step": 21125 }, { "epoch": 0.7121911759749233, "grad_norm": 46.34275436401367, "learning_rate": 2.3180652717948828e-07, "logits/chosen": -1.742457389831543, "logits/rejected": -1.711629867553711, "logps/chosen": -2.0267436504364014, "logps/rejected": -2.6312241554260254, "loss": 2.1364, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.267440795898438, "rewards/margins": 6.044801235198975, "rewards/rejected": -26.312240600585938, "step": 21130 }, { "epoch": 0.7123597020459065, "grad_norm": 23.298921585083008, "learning_rate": 2.3155833219353104e-07, "logits/chosen": -1.6443456411361694, "logits/rejected": -1.826258897781372, "logps/chosen": -2.348782777786255, "logps/rejected": -2.5271294116973877, "loss": 2.8658, "rewards/accuracies": 0.5, "rewards/chosen": -23.487829208374023, "rewards/margins": 1.7834653854370117, "rewards/rejected": -25.27129554748535, "step": 21135 }, { "epoch": 0.7125282281168897, "grad_norm": 23.32587242126465, "learning_rate": 2.31310230104659e-07, "logits/chosen": -1.3168076276779175, "logits/rejected": -1.4803967475891113, "logps/chosen": -2.104248523712158, "logps/rejected": -2.55351185798645, "loss": 2.4836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.042484283447266, "rewards/margins": 4.4926347732543945, "rewards/rejected": -25.535120010375977, "step": 21140 }, { "epoch": 0.7126967541878728, "grad_norm": 20.411678314208984, "learning_rate": 2.3106222099873023e-07, "logits/chosen": -1.807246208190918, "logits/rejected": -1.8495361804962158, "logps/chosen": -2.8721916675567627, "logps/rejected": -2.8132948875427246, "loss": 4.907, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -28.721914291381836, "rewards/margins": -0.5889650583267212, "rewards/rejected": -28.132949829101562, "step": 21145 }, { "epoch": 0.7128652802588561, "grad_norm": 39.6473503112793, "learning_rate": 2.30814304961571e-07, "logits/chosen": -1.4973148107528687, "logits/rejected": -1.563876748085022, "logps/chosen": -2.365460157394409, "logps/rejected": -2.8280510902404785, "loss": 2.2636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.65460205078125, "rewards/margins": 4.625908851623535, "rewards/rejected": -28.2805118560791, "step": 21150 }, { "epoch": 0.7130338063298393, "grad_norm": 50.854774475097656, "learning_rate": 2.3056648207897555e-07, "logits/chosen": -1.129978060722351, "logits/rejected": -1.2141058444976807, "logps/chosen": -2.1422510147094727, "logps/rejected": -2.164353370666504, "loss": 3.4514, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.422510147094727, "rewards/margins": 0.2210227996110916, "rewards/rejected": -21.64353370666504, "step": 21155 }, { "epoch": 0.7132023324008224, "grad_norm": 32.68754959106445, "learning_rate": 2.3031875243670519e-07, "logits/chosen": -2.121105432510376, "logits/rejected": -2.453059673309326, "logps/chosen": -2.97033953666687, "logps/rejected": -3.187030792236328, "loss": 2.5426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.703393936157227, "rewards/margins": 2.1669116020202637, "rewards/rejected": -31.87030601501465, "step": 21160 }, { "epoch": 0.7133708584718056, "grad_norm": 21.080669403076172, "learning_rate": 2.3007111612048958e-07, "logits/chosen": -2.093998908996582, "logits/rejected": -2.063572883605957, "logps/chosen": -3.4782378673553467, "logps/rejected": -3.571840286254883, "loss": 2.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.78237533569336, "rewards/margins": 0.9360250234603882, "rewards/rejected": -35.71840286254883, "step": 21165 }, { "epoch": 0.7135393845427888, "grad_norm": 17.68836784362793, "learning_rate": 2.298235732160259e-07, "logits/chosen": -1.7278655767440796, "logits/rejected": -1.7685997486114502, "logps/chosen": -2.0162787437438965, "logps/rejected": -2.3139195442199707, "loss": 2.8728, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.162790298461914, "rewards/margins": 2.9764046669006348, "rewards/rejected": -23.13919448852539, "step": 21170 }, { "epoch": 0.7137079106137719, "grad_norm": 24.257272720336914, "learning_rate": 2.2957612380897924e-07, "logits/chosen": -1.8252532482147217, "logits/rejected": -1.8329311609268188, "logps/chosen": -2.2917399406433105, "logps/rejected": -2.804558277130127, "loss": 2.0278, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.917400360107422, "rewards/margins": 5.128180503845215, "rewards/rejected": -28.045581817626953, "step": 21175 }, { "epoch": 0.7138764366847551, "grad_norm": 108.06800079345703, "learning_rate": 2.2932876798498164e-07, "logits/chosen": -1.6782376766204834, "logits/rejected": -2.0210893154144287, "logps/chosen": -2.508038282394409, "logps/rejected": -2.7307159900665283, "loss": 2.2803, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.080385208129883, "rewards/margins": 2.226774215698242, "rewards/rejected": -27.307159423828125, "step": 21180 }, { "epoch": 0.7140449627557384, "grad_norm": 37.349796295166016, "learning_rate": 2.2908150582963343e-07, "logits/chosen": -1.6880900859832764, "logits/rejected": -1.6230026483535767, "logps/chosen": -2.2963674068450928, "logps/rejected": -2.5860037803649902, "loss": 2.5056, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.963674545288086, "rewards/margins": 2.896362781524658, "rewards/rejected": -25.860036849975586, "step": 21185 }, { "epoch": 0.7142134888267215, "grad_norm": 54.41622543334961, "learning_rate": 2.2883433742850245e-07, "logits/chosen": -1.4802095890045166, "logits/rejected": -1.592818021774292, "logps/chosen": -2.937121629714966, "logps/rejected": -3.137934923171997, "loss": 2.6537, "rewards/accuracies": 0.5, "rewards/chosen": -29.371212005615234, "rewards/margins": 2.008134365081787, "rewards/rejected": -31.379344940185547, "step": 21190 }, { "epoch": 0.7143820148977047, "grad_norm": 160.5109405517578, "learning_rate": 2.285872628671236e-07, "logits/chosen": -1.8429698944091797, "logits/rejected": -2.2430481910705566, "logps/chosen": -3.6799235343933105, "logps/rejected": -4.076421737670898, "loss": 4.0979, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -36.79923629760742, "rewards/margins": 3.964979648590088, "rewards/rejected": -40.764217376708984, "step": 21195 }, { "epoch": 0.7145505409686879, "grad_norm": 48.12770462036133, "learning_rate": 2.2834028223099982e-07, "logits/chosen": -1.7956221103668213, "logits/rejected": -2.4896199703216553, "logps/chosen": -2.2983362674713135, "logps/rejected": -2.975998640060425, "loss": 1.4942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.98336410522461, "rewards/margins": 6.776625633239746, "rewards/rejected": -29.759984970092773, "step": 21200 }, { "epoch": 0.7145505409686879, "eval_logits/chosen": -2.167952537536621, "eval_logits/rejected": -2.3352444171905518, "eval_logps/chosen": -2.232255697250366, "eval_logps/rejected": -2.380444288253784, "eval_loss": 3.0574114322662354, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.322555541992188, "eval_rewards/margins": 1.4818838834762573, "eval_rewards/rejected": -23.8044376373291, "eval_runtime": 12.8946, "eval_samples_per_second": 7.755, "eval_steps_per_second": 1.939, "step": 21200 }, { "epoch": 0.714719067039671, "grad_norm": 82.61286926269531, "learning_rate": 2.2809339560560143e-07, "logits/chosen": -1.8070186376571655, "logits/rejected": -1.924077033996582, "logps/chosen": -2.382108449935913, "logps/rejected": -2.96075439453125, "loss": 2.6643, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.82108497619629, "rewards/margins": 5.7864580154418945, "rewards/rejected": -29.6075439453125, "step": 21205 }, { "epoch": 0.7148875931106542, "grad_norm": 60.51424026489258, "learning_rate": 2.2784660307636632e-07, "logits/chosen": -2.4727723598480225, "logits/rejected": -2.318603038787842, "logps/chosen": -2.613403558731079, "logps/rejected": -2.723193645477295, "loss": 4.2397, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.134037017822266, "rewards/margins": 1.097900390625, "rewards/rejected": -27.23193359375, "step": 21210 }, { "epoch": 0.7150561191816374, "grad_norm": 28.987451553344727, "learning_rate": 2.2759990472869926e-07, "logits/chosen": -1.657486915588379, "logits/rejected": -1.9913294315338135, "logps/chosen": -2.1268715858459473, "logps/rejected": -2.752849817276001, "loss": 2.7899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.26871681213379, "rewards/margins": 6.259782314300537, "rewards/rejected": -27.52849769592285, "step": 21215 }, { "epoch": 0.7152246452526205, "grad_norm": 32.00741195678711, "learning_rate": 2.273533006479731e-07, "logits/chosen": -1.8473243713378906, "logits/rejected": -1.6231390237808228, "logps/chosen": -2.018846035003662, "logps/rejected": -2.16618013381958, "loss": 2.0689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.188459396362305, "rewards/margins": 1.4733377695083618, "rewards/rejected": -21.66179847717285, "step": 21220 }, { "epoch": 0.7153931713236038, "grad_norm": 45.651912689208984, "learning_rate": 2.2710679091952767e-07, "logits/chosen": -2.4465718269348145, "logits/rejected": -2.3830413818359375, "logps/chosen": -2.0005125999450684, "logps/rejected": -2.189295768737793, "loss": 2.6768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.005125045776367, "rewards/margins": 1.8878326416015625, "rewards/rejected": -21.892959594726562, "step": 21225 }, { "epoch": 0.715561697394587, "grad_norm": 35.45180130004883, "learning_rate": 2.2686037562867033e-07, "logits/chosen": -1.246711254119873, "logits/rejected": -1.3242307901382446, "logps/chosen": -2.182126522064209, "logps/rejected": -2.203878164291382, "loss": 3.1722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.821266174316406, "rewards/margins": 0.21751537919044495, "rewards/rejected": -22.038782119750977, "step": 21230 }, { "epoch": 0.7157302234655701, "grad_norm": 17.120080947875977, "learning_rate": 2.2661405486067593e-07, "logits/chosen": -2.0206122398376465, "logits/rejected": -2.1763134002685547, "logps/chosen": -2.324014902114868, "logps/rejected": -2.6340198516845703, "loss": 2.9857, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.240148544311523, "rewards/margins": 3.100048065185547, "rewards/rejected": -26.340198516845703, "step": 21235 }, { "epoch": 0.7158987495365533, "grad_norm": 23.65846061706543, "learning_rate": 2.2636782870078598e-07, "logits/chosen": -1.6090011596679688, "logits/rejected": -1.6499748229980469, "logps/chosen": -2.645697832107544, "logps/rejected": -2.6968419551849365, "loss": 3.213, "rewards/accuracies": 0.5, "rewards/chosen": -26.456979751586914, "rewards/margins": 0.5114401578903198, "rewards/rejected": -26.968420028686523, "step": 21240 }, { "epoch": 0.7160672756075365, "grad_norm": 5.814166069030762, "learning_rate": 2.2612169723420983e-07, "logits/chosen": -1.8302971124649048, "logits/rejected": -2.1456241607666016, "logps/chosen": -2.0068020820617676, "logps/rejected": -2.1064953804016113, "loss": 3.3416, "rewards/accuracies": 0.5, "rewards/chosen": -20.06801986694336, "rewards/margins": 0.9969308972358704, "rewards/rejected": -21.064952850341797, "step": 21245 }, { "epoch": 0.7162358016785196, "grad_norm": 189.8372344970703, "learning_rate": 2.258756605461239e-07, "logits/chosen": -1.873355507850647, "logits/rejected": -1.8405964374542236, "logps/chosen": -2.281764268875122, "logps/rejected": -2.165700912475586, "loss": 4.2068, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.817642211914062, "rewards/margins": -1.1606316566467285, "rewards/rejected": -21.657011032104492, "step": 21250 }, { "epoch": 0.7164043277495028, "grad_norm": 34.90528106689453, "learning_rate": 2.2562971872167175e-07, "logits/chosen": -1.7773349285125732, "logits/rejected": -1.6610603332519531, "logps/chosen": -2.300401449203491, "logps/rejected": -2.500701904296875, "loss": 3.2863, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.004016876220703, "rewards/margins": 2.0030055046081543, "rewards/rejected": -25.007020950317383, "step": 21255 }, { "epoch": 0.7165728538204861, "grad_norm": 30.30763816833496, "learning_rate": 2.2538387184596443e-07, "logits/chosen": -1.5406862497329712, "logits/rejected": -1.7500499486923218, "logps/chosen": -2.7660608291625977, "logps/rejected": -2.8649933338165283, "loss": 3.7678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.660608291625977, "rewards/margins": 0.9893225431442261, "rewards/rejected": -28.64992904663086, "step": 21260 }, { "epoch": 0.7167413798914692, "grad_norm": 63.39667510986328, "learning_rate": 2.251381200040794e-07, "logits/chosen": -1.757519006729126, "logits/rejected": -1.9305756092071533, "logps/chosen": -3.3314311504364014, "logps/rejected": -3.6929244995117188, "loss": 4.2631, "rewards/accuracies": 0.5, "rewards/chosen": -33.31431198120117, "rewards/margins": 3.6149322986602783, "rewards/rejected": -36.92924499511719, "step": 21265 }, { "epoch": 0.7169099059624524, "grad_norm": 29.48974609375, "learning_rate": 2.2489246328106193e-07, "logits/chosen": -1.7308318614959717, "logits/rejected": -1.8175618648529053, "logps/chosen": -2.5321240425109863, "logps/rejected": -2.9798617362976074, "loss": 1.3057, "rewards/accuracies": 1.0, "rewards/chosen": -25.32124137878418, "rewards/margins": 4.477375507354736, "rewards/rejected": -29.79861831665039, "step": 21270 }, { "epoch": 0.7170784320334356, "grad_norm": 28.888696670532227, "learning_rate": 2.2464690176192413e-07, "logits/chosen": -2.0127971172332764, "logits/rejected": -2.030998468399048, "logps/chosen": -1.9513905048370361, "logps/rejected": -2.0580296516418457, "loss": 2.8038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.513904571533203, "rewards/margins": 1.066389799118042, "rewards/rejected": -20.58029556274414, "step": 21275 }, { "epoch": 0.7172469581044187, "grad_norm": 62.83280563354492, "learning_rate": 2.244014355316453e-07, "logits/chosen": -1.9647674560546875, "logits/rejected": -2.2305192947387695, "logps/chosen": -2.1211609840393066, "logps/rejected": -2.109130859375, "loss": 3.6599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.21160888671875, "rewards/margins": -0.12029991298913956, "rewards/rejected": -21.09130859375, "step": 21280 }, { "epoch": 0.7174154841754019, "grad_norm": 28.013324737548828, "learning_rate": 2.2415606467517134e-07, "logits/chosen": -1.4304853677749634, "logits/rejected": -1.919377326965332, "logps/chosen": -2.0542383193969727, "logps/rejected": -2.3345751762390137, "loss": 2.6828, "rewards/accuracies": 0.5, "rewards/chosen": -20.54238510131836, "rewards/margins": 2.803367853164673, "rewards/rejected": -23.34575080871582, "step": 21285 }, { "epoch": 0.7175840102463851, "grad_norm": 35.321327209472656, "learning_rate": 2.2391078927741552e-07, "logits/chosen": -1.984724760055542, "logits/rejected": -1.718353509902954, "logps/chosen": -2.211531162261963, "logps/rejected": -2.1570630073547363, "loss": 4.121, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -22.115312576293945, "rewards/margins": -0.5446780920028687, "rewards/rejected": -21.57063102722168, "step": 21290 }, { "epoch": 0.7177525363173684, "grad_norm": 13.352673530578613, "learning_rate": 2.2366560942325828e-07, "logits/chosen": -1.9127241373062134, "logits/rejected": -1.9945110082626343, "logps/chosen": -2.2655529975891113, "logps/rejected": -2.767404079437256, "loss": 2.0763, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.655529022216797, "rewards/margins": 5.01850700378418, "rewards/rejected": -27.674036026000977, "step": 21295 }, { "epoch": 0.7179210623883515, "grad_norm": 5.086029052734375, "learning_rate": 2.234205251975463e-07, "logits/chosen": -1.4625164270401, "logits/rejected": -1.3165785074234009, "logps/chosen": -1.9075345993041992, "logps/rejected": -1.817773461341858, "loss": 4.7245, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.07534408569336, "rewards/margins": -0.8976105451583862, "rewards/rejected": -18.177734375, "step": 21300 }, { "epoch": 0.7180895884593347, "grad_norm": 91.38097381591797, "learning_rate": 2.231755366850937e-07, "logits/chosen": -1.311603307723999, "logits/rejected": -1.4673454761505127, "logps/chosen": -3.1911635398864746, "logps/rejected": -3.2235331535339355, "loss": 4.8665, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.911640167236328, "rewards/margins": 0.3236920237541199, "rewards/rejected": -32.235328674316406, "step": 21305 }, { "epoch": 0.7182581145303178, "grad_norm": 8.877885818481445, "learning_rate": 2.2293064397068144e-07, "logits/chosen": -1.6854312419891357, "logits/rejected": -1.9205318689346313, "logps/chosen": -2.0600550174713135, "logps/rejected": -2.436044692993164, "loss": 1.8768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.600549697875977, "rewards/margins": 3.7598977088928223, "rewards/rejected": -24.36044692993164, "step": 21310 }, { "epoch": 0.718426640601301, "grad_norm": 161.917236328125, "learning_rate": 2.226858471390574e-07, "logits/chosen": -1.7705590724945068, "logits/rejected": -1.7386350631713867, "logps/chosen": -2.8510050773620605, "logps/rejected": -2.8456594944000244, "loss": 3.4966, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.510046005249023, "rewards/margins": -0.05345315858721733, "rewards/rejected": -28.456594467163086, "step": 21315 }, { "epoch": 0.7185951666722842, "grad_norm": 34.89402770996094, "learning_rate": 2.2244114627493578e-07, "logits/chosen": -1.9679571390151978, "logits/rejected": -2.3387842178344727, "logps/chosen": -2.206179618835449, "logps/rejected": -2.8861498832702637, "loss": 3.2962, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.061796188354492, "rewards/margins": 6.799704551696777, "rewards/rejected": -28.861501693725586, "step": 21320 }, { "epoch": 0.7187636927432673, "grad_norm": 23.90579605102539, "learning_rate": 2.2219654146299794e-07, "logits/chosen": -1.3234543800354004, "logits/rejected": -1.437170386314392, "logps/chosen": -2.106889247894287, "logps/rejected": -2.464322090148926, "loss": 1.503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.068889617919922, "rewards/margins": 3.574329376220703, "rewards/rejected": -24.643220901489258, "step": 21325 }, { "epoch": 0.7189322188142505, "grad_norm": 0.4208122193813324, "learning_rate": 2.2195203278789232e-07, "logits/chosen": -1.780714750289917, "logits/rejected": -1.8529062271118164, "logps/chosen": -1.8633159399032593, "logps/rejected": -2.1635186672210693, "loss": 1.4399, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.633159637451172, "rewards/margins": 3.0020289421081543, "rewards/rejected": -21.635189056396484, "step": 21330 }, { "epoch": 0.7191007448852338, "grad_norm": 24.755207061767578, "learning_rate": 2.2170762033423334e-07, "logits/chosen": -2.186357021331787, "logits/rejected": -2.4847683906555176, "logps/chosen": -2.5217902660369873, "logps/rejected": -3.0550482273101807, "loss": 1.1454, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.2179012298584, "rewards/margins": 5.332579135894775, "rewards/rejected": -30.55048179626465, "step": 21335 }, { "epoch": 0.719269270956217, "grad_norm": 37.603004455566406, "learning_rate": 2.2146330418660265e-07, "logits/chosen": -1.308406114578247, "logits/rejected": -1.3398100137710571, "logps/chosen": -2.137680768966675, "logps/rejected": -2.3950021266937256, "loss": 2.558, "rewards/accuracies": 0.5, "rewards/chosen": -21.376808166503906, "rewards/margins": 2.5732150077819824, "rewards/rejected": -23.950023651123047, "step": 21340 }, { "epoch": 0.7194377970272001, "grad_norm": 58.8840446472168, "learning_rate": 2.2121908442954852e-07, "logits/chosen": -1.9633537530899048, "logits/rejected": -2.081953525543213, "logps/chosen": -2.1084682941436768, "logps/rejected": -2.199111223220825, "loss": 2.9718, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.08468246459961, "rewards/margins": 0.9064277410507202, "rewards/rejected": -21.99110984802246, "step": 21345 }, { "epoch": 0.7196063230981833, "grad_norm": 142.93063354492188, "learning_rate": 2.2097496114758585e-07, "logits/chosen": -1.987000823020935, "logits/rejected": -2.1771368980407715, "logps/chosen": -2.0718207359313965, "logps/rejected": -1.9550584554672241, "loss": 4.651, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.71820640563965, "rewards/margins": -1.1676223278045654, "rewards/rejected": -19.55058479309082, "step": 21350 }, { "epoch": 0.7197748491691665, "grad_norm": 26.261070251464844, "learning_rate": 2.2073093442519587e-07, "logits/chosen": -1.6336174011230469, "logits/rejected": -2.0398337841033936, "logps/chosen": -2.14435076713562, "logps/rejected": -2.5782923698425293, "loss": 2.0267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.44350814819336, "rewards/margins": 4.339417934417725, "rewards/rejected": -25.782922744750977, "step": 21355 }, { "epoch": 0.7199433752401496, "grad_norm": 300.9296875, "learning_rate": 2.2048700434682666e-07, "logits/chosen": -1.2496579885482788, "logits/rejected": -1.2768909931182861, "logps/chosen": -2.8326752185821533, "logps/rejected": -2.3903040885925293, "loss": 8.9275, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -28.326751708984375, "rewards/margins": -4.42371129989624, "rewards/rejected": -23.903039932250977, "step": 21360 }, { "epoch": 0.7201119013111328, "grad_norm": 39.62416076660156, "learning_rate": 2.202431709968931e-07, "logits/chosen": -1.3857898712158203, "logits/rejected": -1.5687038898468018, "logps/chosen": -2.131641387939453, "logps/rejected": -2.549037456512451, "loss": 1.6817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.316415786743164, "rewards/margins": 4.173960208892822, "rewards/rejected": -25.490375518798828, "step": 21365 }, { "epoch": 0.7202804273821161, "grad_norm": 74.57008361816406, "learning_rate": 2.1999943445977586e-07, "logits/chosen": -1.8909308910369873, "logits/rejected": -1.9793068170547485, "logps/chosen": -2.3767809867858887, "logps/rejected": -2.5203075408935547, "loss": 2.5488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.767810821533203, "rewards/margins": 1.4352657794952393, "rewards/rejected": -25.203075408935547, "step": 21370 }, { "epoch": 0.7204489534530992, "grad_norm": 23.843839645385742, "learning_rate": 2.1975579481982283e-07, "logits/chosen": -1.8189566135406494, "logits/rejected": -1.9706977605819702, "logps/chosen": -2.760148525238037, "logps/rejected": -3.534060001373291, "loss": 2.303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.601486206054688, "rewards/margins": 7.739114284515381, "rewards/rejected": -35.340599060058594, "step": 21375 }, { "epoch": 0.7206174795240824, "grad_norm": 26.39014434814453, "learning_rate": 2.1951225216134795e-07, "logits/chosen": -1.7611877918243408, "logits/rejected": -2.1785178184509277, "logps/chosen": -2.0668892860412598, "logps/rejected": -2.2843990325927734, "loss": 2.5125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.668895721435547, "rewards/margins": 2.1750943660736084, "rewards/rejected": -22.843990325927734, "step": 21380 }, { "epoch": 0.7207860055950656, "grad_norm": 46.218658447265625, "learning_rate": 2.192688065686319e-07, "logits/chosen": -1.968400239944458, "logits/rejected": -2.208972930908203, "logps/chosen": -2.1618003845214844, "logps/rejected": -2.215686321258545, "loss": 3.6735, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.618003845214844, "rewards/margins": 0.538860023021698, "rewards/rejected": -22.156864166259766, "step": 21385 }, { "epoch": 0.7209545316660487, "grad_norm": 31.82733154296875, "learning_rate": 2.1902545812592144e-07, "logits/chosen": -1.3051466941833496, "logits/rejected": -1.4728295803070068, "logps/chosen": -2.800342559814453, "logps/rejected": -2.817983388900757, "loss": 3.601, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.0034236907959, "rewards/margins": 0.1764104813337326, "rewards/rejected": -28.179834365844727, "step": 21390 }, { "epoch": 0.7211230577370319, "grad_norm": 9.380635261535645, "learning_rate": 2.1878220691743005e-07, "logits/chosen": -1.5431849956512451, "logits/rejected": -1.637035608291626, "logps/chosen": -2.2975473403930664, "logps/rejected": -2.5784294605255127, "loss": 2.5353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.975473403930664, "rewards/margins": 2.8088207244873047, "rewards/rejected": -25.784292221069336, "step": 21395 }, { "epoch": 0.7212915838080151, "grad_norm": 25.51203155517578, "learning_rate": 2.1853905302733744e-07, "logits/chosen": -1.9188916683197021, "logits/rejected": -1.8357101678848267, "logps/chosen": -3.159135341644287, "logps/rejected": -4.11862850189209, "loss": 1.667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.591350555419922, "rewards/margins": 9.59493637084961, "rewards/rejected": -41.18628692626953, "step": 21400 }, { "epoch": 0.7214601098789983, "grad_norm": 20.456008911132812, "learning_rate": 2.1829599653978932e-07, "logits/chosen": -1.7088804244995117, "logits/rejected": -2.2845683097839355, "logps/chosen": -1.9112327098846436, "logps/rejected": -2.8263041973114014, "loss": 2.1699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.112329483032227, "rewards/margins": 9.150714874267578, "rewards/rejected": -28.263042449951172, "step": 21405 }, { "epoch": 0.7216286359499815, "grad_norm": 28.70714569091797, "learning_rate": 2.1805303753889803e-07, "logits/chosen": -1.6716718673706055, "logits/rejected": -1.8000621795654297, "logps/chosen": -2.972921371459961, "logps/rejected": -2.9745922088623047, "loss": 4.9837, "rewards/accuracies": 0.5, "rewards/chosen": -29.72921371459961, "rewards/margins": 0.016707420349121094, "rewards/rejected": -29.745920181274414, "step": 21410 }, { "epoch": 0.7217971620209647, "grad_norm": 21.922080993652344, "learning_rate": 2.1781017610874224e-07, "logits/chosen": -1.9619964361190796, "logits/rejected": -2.128202199935913, "logps/chosen": -2.0648131370544434, "logps/rejected": -2.676927089691162, "loss": 2.4329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.64813232421875, "rewards/margins": 6.121140003204346, "rewards/rejected": -26.769271850585938, "step": 21415 }, { "epoch": 0.7219656880919478, "grad_norm": 49.752410888671875, "learning_rate": 2.1756741233336683e-07, "logits/chosen": -1.1550843715667725, "logits/rejected": -1.7084548473358154, "logps/chosen": -2.2608838081359863, "logps/rejected": -2.5417706966400146, "loss": 2.1431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.60883903503418, "rewards/margins": 2.808867931365967, "rewards/rejected": -25.417705535888672, "step": 21420 }, { "epoch": 0.722134214162931, "grad_norm": 20.315689086914062, "learning_rate": 2.1732474629678243e-07, "logits/chosen": -2.0127696990966797, "logits/rejected": -2.288529872894287, "logps/chosen": -2.3630669116973877, "logps/rejected": -2.7239952087402344, "loss": 2.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.63067054748535, "rewards/margins": 3.609281539916992, "rewards/rejected": -27.239953994750977, "step": 21425 }, { "epoch": 0.7223027402339142, "grad_norm": 24.40152359008789, "learning_rate": 2.1708217808296642e-07, "logits/chosen": -1.7259283065795898, "logits/rejected": -2.223496675491333, "logps/chosen": -2.1215977668762207, "logps/rejected": -2.2175374031066895, "loss": 3.1128, "rewards/accuracies": 0.5, "rewards/chosen": -21.21597671508789, "rewards/margins": 0.9593954086303711, "rewards/rejected": -22.175373077392578, "step": 21430 }, { "epoch": 0.7224712663048973, "grad_norm": 24.670318603515625, "learning_rate": 2.168397077758622e-07, "logits/chosen": -1.7781130075454712, "logits/rejected": -2.312152147293091, "logps/chosen": -2.622331142425537, "logps/rejected": -3.033611536026001, "loss": 2.44, "rewards/accuracies": 0.5, "rewards/chosen": -26.223312377929688, "rewards/margins": 4.112800598144531, "rewards/rejected": -30.33611488342285, "step": 21435 }, { "epoch": 0.7226397923758805, "grad_norm": 21.8465518951416, "learning_rate": 2.1659733545937886e-07, "logits/chosen": -2.0275444984436035, "logits/rejected": -2.103527545928955, "logps/chosen": -3.047454357147217, "logps/rejected": -3.24442982673645, "loss": 3.1846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.474544525146484, "rewards/margins": 1.9697548151016235, "rewards/rejected": -32.444297790527344, "step": 21440 }, { "epoch": 0.7228083184468638, "grad_norm": 29.372770309448242, "learning_rate": 2.163550612173921e-07, "logits/chosen": -1.7228193283081055, "logits/rejected": -1.6601688861846924, "logps/chosen": -2.5070953369140625, "logps/rejected": -2.589710235595703, "loss": 3.0728, "rewards/accuracies": 0.5, "rewards/chosen": -25.070951461791992, "rewards/margins": 0.8261513710021973, "rewards/rejected": -25.897104263305664, "step": 21445 }, { "epoch": 0.7229768445178469, "grad_norm": 31.65980339050293, "learning_rate": 2.161128851337435e-07, "logits/chosen": -1.6353622674942017, "logits/rejected": -1.20289945602417, "logps/chosen": -2.7376632690429688, "logps/rejected": -2.623929500579834, "loss": 4.7818, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.376636505126953, "rewards/margins": -1.137336015701294, "rewards/rejected": -26.23929786682129, "step": 21450 }, { "epoch": 0.7231453705888301, "grad_norm": 16.73038673400879, "learning_rate": 2.1587080729224082e-07, "logits/chosen": -1.5904583930969238, "logits/rejected": -1.8517332077026367, "logps/chosen": -2.191336154937744, "logps/rejected": -2.4386138916015625, "loss": 1.7566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.913360595703125, "rewards/margins": 2.472780704498291, "rewards/rejected": -24.386140823364258, "step": 21455 }, { "epoch": 0.7233138966598133, "grad_norm": 47.32572555541992, "learning_rate": 2.156288277766573e-07, "logits/chosen": -1.8892157077789307, "logits/rejected": -1.5970206260681152, "logps/chosen": -2.255237579345703, "logps/rejected": -2.3379788398742676, "loss": 3.1796, "rewards/accuracies": 0.5, "rewards/chosen": -22.5523738861084, "rewards/margins": 0.827414870262146, "rewards/rejected": -23.379789352416992, "step": 21460 }, { "epoch": 0.7234824227307964, "grad_norm": 26.47463035583496, "learning_rate": 2.153869466707327e-07, "logits/chosen": -1.2520034313201904, "logits/rejected": -1.5026814937591553, "logps/chosen": -2.371990203857422, "logps/rejected": -2.86955189704895, "loss": 1.6533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.71990394592285, "rewards/margins": 4.975613594055176, "rewards/rejected": -28.69551658630371, "step": 21465 }, { "epoch": 0.7236509488017796, "grad_norm": 36.53496170043945, "learning_rate": 2.151451640581728e-07, "logits/chosen": -1.8263139724731445, "logits/rejected": -2.1204657554626465, "logps/chosen": -1.880746603012085, "logps/rejected": -2.126354932785034, "loss": 2.4378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.80746841430664, "rewards/margins": 2.4560837745666504, "rewards/rejected": -21.2635498046875, "step": 21470 }, { "epoch": 0.7238194748727628, "grad_norm": 25.182451248168945, "learning_rate": 2.1490348002264852e-07, "logits/chosen": -1.9280401468276978, "logits/rejected": -1.8660892248153687, "logps/chosen": -2.6166787147521973, "logps/rejected": -2.53410005569458, "loss": 5.44, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -26.166784286499023, "rewards/margins": -0.8257870674133301, "rewards/rejected": -25.34099769592285, "step": 21475 }, { "epoch": 0.723988000943746, "grad_norm": 33.53519821166992, "learning_rate": 2.146618946477975e-07, "logits/chosen": -2.0185818672180176, "logits/rejected": -1.8434902429580688, "logps/chosen": -2.1258037090301514, "logps/rejected": -2.264908790588379, "loss": 2.8339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.25803565979004, "rewards/margins": 1.3910521268844604, "rewards/rejected": -22.649089813232422, "step": 21480 }, { "epoch": 0.7241565270147292, "grad_norm": 22.76723289489746, "learning_rate": 2.144204080172229e-07, "logits/chosen": -1.6953926086425781, "logits/rejected": -1.9088554382324219, "logps/chosen": -2.2550201416015625, "logps/rejected": -2.693312883377075, "loss": 1.5315, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.550201416015625, "rewards/margins": 4.3829264640808105, "rewards/rejected": -26.933130264282227, "step": 21485 }, { "epoch": 0.7243250530857124, "grad_norm": 26.930259704589844, "learning_rate": 2.141790202144938e-07, "logits/chosen": -1.878997564315796, "logits/rejected": -1.7778708934783936, "logps/chosen": -2.3526453971862793, "logps/rejected": -2.3866665363311768, "loss": 3.0321, "rewards/accuracies": 0.5, "rewards/chosen": -23.526456832885742, "rewards/margins": 0.34020644426345825, "rewards/rejected": -23.86666488647461, "step": 21490 }, { "epoch": 0.7244935791566955, "grad_norm": 9.314671516418457, "learning_rate": 2.1393773132314479e-07, "logits/chosen": -1.6867443323135376, "logits/rejected": -2.1116082668304443, "logps/chosen": -2.4089341163635254, "logps/rejected": -3.092846393585205, "loss": 1.388, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.08934211730957, "rewards/margins": 6.839125156402588, "rewards/rejected": -30.928466796875, "step": 21495 }, { "epoch": 0.7246621052276787, "grad_norm": 23.345691680908203, "learning_rate": 2.1369654142667653e-07, "logits/chosen": -1.89193856716156, "logits/rejected": -2.0643019676208496, "logps/chosen": -2.404980182647705, "logps/rejected": -2.6729843616485596, "loss": 1.9652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.049800872802734, "rewards/margins": 2.680041551589966, "rewards/rejected": -26.729843139648438, "step": 21500 }, { "epoch": 0.7248306312986619, "grad_norm": 7.167284965515137, "learning_rate": 2.1345545060855558e-07, "logits/chosen": -1.5573482513427734, "logits/rejected": -1.5786257982254028, "logps/chosen": -2.6754653453826904, "logps/rejected": -2.7059824466705322, "loss": 3.1412, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.754650115966797, "rewards/margins": 0.30517178773880005, "rewards/rejected": -27.059825897216797, "step": 21505 }, { "epoch": 0.724999157369645, "grad_norm": 20.261817932128906, "learning_rate": 2.1321445895221357e-07, "logits/chosen": -2.548706531524658, "logits/rejected": -2.389806032180786, "logps/chosen": -3.0154504776000977, "logps/rejected": -3.1051926612854004, "loss": 3.0242, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.154504776000977, "rewards/margins": 0.8974201083183289, "rewards/rejected": -31.051921844482422, "step": 21510 }, { "epoch": 0.7251676834406283, "grad_norm": 43.0755615234375, "learning_rate": 2.129735665410484e-07, "logits/chosen": -1.6869655847549438, "logits/rejected": -2.2092459201812744, "logps/chosen": -2.0269999504089355, "logps/rejected": -2.411229133605957, "loss": 2.5056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.270000457763672, "rewards/margins": 3.8422927856445312, "rewards/rejected": -24.112289428710938, "step": 21515 }, { "epoch": 0.7253362095116115, "grad_norm": 36.908592224121094, "learning_rate": 2.127327734584235e-07, "logits/chosen": -1.4574025869369507, "logits/rejected": -1.7088209390640259, "logps/chosen": -2.0809457302093506, "logps/rejected": -2.0220389366149902, "loss": 3.6727, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.809457778930664, "rewards/margins": -0.5890698432922363, "rewards/rejected": -20.220388412475586, "step": 21520 }, { "epoch": 0.7255047355825947, "grad_norm": 31.039684295654297, "learning_rate": 2.124920797876678e-07, "logits/chosen": -1.992561936378479, "logits/rejected": -2.0856804847717285, "logps/chosen": -2.183537244796753, "logps/rejected": -2.502293348312378, "loss": 1.6649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.835372924804688, "rewards/margins": 3.1875603199005127, "rewards/rejected": -25.022933959960938, "step": 21525 }, { "epoch": 0.7256732616535778, "grad_norm": 18.880573272705078, "learning_rate": 2.1225148561207596e-07, "logits/chosen": -1.826438307762146, "logits/rejected": -2.0268962383270264, "logps/chosen": -2.406874179840088, "logps/rejected": -2.645177125930786, "loss": 2.9077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.068742752075195, "rewards/margins": 2.383025646209717, "rewards/rejected": -26.451770782470703, "step": 21530 }, { "epoch": 0.725841787724561, "grad_norm": 110.51516723632812, "learning_rate": 2.1201099101490828e-07, "logits/chosen": -1.7749906778335571, "logits/rejected": -1.9072506427764893, "logps/chosen": -2.6400880813598633, "logps/rejected": -2.54280424118042, "loss": 4.5673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.400882720947266, "rewards/margins": -0.9728401303291321, "rewards/rejected": -25.428041458129883, "step": 21535 }, { "epoch": 0.7260103137955441, "grad_norm": 79.18938446044922, "learning_rate": 2.1177059607939014e-07, "logits/chosen": -2.305410861968994, "logits/rejected": -2.0641355514526367, "logps/chosen": -3.205970287322998, "logps/rejected": -3.1420233249664307, "loss": 6.4197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -32.0597038269043, "rewards/margins": -0.6394695043563843, "rewards/rejected": -31.42023277282715, "step": 21540 }, { "epoch": 0.7261788398665273, "grad_norm": 4.201801300048828, "learning_rate": 2.1153030088871286e-07, "logits/chosen": -1.6951990127563477, "logits/rejected": -2.0906145572662354, "logps/chosen": -2.429492473602295, "logps/rejected": -2.592778444290161, "loss": 2.5728, "rewards/accuracies": 0.5, "rewards/chosen": -24.294925689697266, "rewards/margins": 1.6328589916229248, "rewards/rejected": -25.927783966064453, "step": 21545 }, { "epoch": 0.7263473659375105, "grad_norm": 38.4725341796875, "learning_rate": 2.112901055260332e-07, "logits/chosen": -1.0017874240875244, "logits/rejected": -1.5967439413070679, "logps/chosen": -1.7733027935028076, "logps/rejected": -2.150214195251465, "loss": 1.7988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.733028411865234, "rewards/margins": 3.7691142559051514, "rewards/rejected": -21.50214195251465, "step": 21550 }, { "epoch": 0.7265158920084938, "grad_norm": 39.86610412597656, "learning_rate": 2.1105001007447348e-07, "logits/chosen": -1.7047908306121826, "logits/rejected": -1.934041976928711, "logps/chosen": -3.5561611652374268, "logps/rejected": -3.844970703125, "loss": 6.4161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.56161117553711, "rewards/margins": 2.8880958557128906, "rewards/rejected": -38.44970703125, "step": 21555 }, { "epoch": 0.7266844180794769, "grad_norm": 34.7709846496582, "learning_rate": 2.1081001461712096e-07, "logits/chosen": -2.2614529132843018, "logits/rejected": -2.164703369140625, "logps/chosen": -2.1901984214782715, "logps/rejected": -2.157827854156494, "loss": 3.7716, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.901987075805664, "rewards/margins": -0.3237057626247406, "rewards/rejected": -21.57828140258789, "step": 21560 }, { "epoch": 0.7268529441504601, "grad_norm": 48.2080078125, "learning_rate": 2.1057011923702872e-07, "logits/chosen": -2.027026653289795, "logits/rejected": -2.1803174018859863, "logps/chosen": -2.5710997581481934, "logps/rejected": -2.8667140007019043, "loss": 2.6841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.71099853515625, "rewards/margins": 2.9561429023742676, "rewards/rejected": -28.667144775390625, "step": 21565 }, { "epoch": 0.7270214702214433, "grad_norm": 26.905202865600586, "learning_rate": 2.103303240172151e-07, "logits/chosen": -1.483336329460144, "logits/rejected": -1.5595595836639404, "logps/chosen": -2.6003048419952393, "logps/rejected": -2.9392549991607666, "loss": 1.9289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.003047943115234, "rewards/margins": 3.3895020484924316, "rewards/rejected": -29.39255142211914, "step": 21570 }, { "epoch": 0.7271899962924264, "grad_norm": 77.60855865478516, "learning_rate": 2.1009062904066404e-07, "logits/chosen": -1.9670803546905518, "logits/rejected": -2.1249303817749023, "logps/chosen": -2.6444172859191895, "logps/rejected": -2.739316463470459, "loss": 3.3144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.444171905517578, "rewards/margins": 0.948990523815155, "rewards/rejected": -27.393163681030273, "step": 21575 }, { "epoch": 0.7273585223634096, "grad_norm": 32.01676559448242, "learning_rate": 2.098510343903241e-07, "logits/chosen": -1.2383935451507568, "logits/rejected": -1.593308687210083, "logps/chosen": -2.18575382232666, "logps/rejected": -2.4231770038604736, "loss": 3.2295, "rewards/accuracies": 0.5, "rewards/chosen": -21.8575382232666, "rewards/margins": 2.3742308616638184, "rewards/rejected": -24.231767654418945, "step": 21580 }, { "epoch": 0.7275270484343928, "grad_norm": 13.398002624511719, "learning_rate": 2.096115401491097e-07, "logits/chosen": -2.0275418758392334, "logits/rejected": -2.2678370475769043, "logps/chosen": -1.8518550395965576, "logps/rejected": -2.0165534019470215, "loss": 2.3763, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.518550872802734, "rewards/margins": 1.6469831466674805, "rewards/rejected": -20.1655330657959, "step": 21585 }, { "epoch": 0.727695574505376, "grad_norm": 19.24317169189453, "learning_rate": 2.0937214639990064e-07, "logits/chosen": -2.054893732070923, "logits/rejected": -2.430537700653076, "logps/chosen": -2.9003663063049316, "logps/rejected": -2.5996203422546387, "loss": 7.1915, "rewards/accuracies": 0.5, "rewards/chosen": -29.003662109375, "rewards/margins": -3.0074591636657715, "rewards/rejected": -25.996204376220703, "step": 21590 }, { "epoch": 0.7278641005763592, "grad_norm": 18.928489685058594, "learning_rate": 2.0913285322554126e-07, "logits/chosen": -1.5994584560394287, "logits/rejected": -1.9036979675292969, "logps/chosen": -2.419440746307373, "logps/rejected": -2.740779161453247, "loss": 3.6663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.194406509399414, "rewards/margins": 3.213385820388794, "rewards/rejected": -27.407794952392578, "step": 21595 }, { "epoch": 0.7280326266473424, "grad_norm": 24.36540985107422, "learning_rate": 2.0889366070884161e-07, "logits/chosen": -1.7841564416885376, "logits/rejected": -2.3233582973480225, "logps/chosen": -3.2880959510803223, "logps/rejected": -4.030459403991699, "loss": 0.8797, "rewards/accuracies": 1.0, "rewards/chosen": -32.880958557128906, "rewards/margins": 7.4236345291137695, "rewards/rejected": -40.304588317871094, "step": 21600 }, { "epoch": 0.7280326266473424, "eval_logits/chosen": -2.172097682952881, "eval_logits/rejected": -2.3394265174865723, "eval_logps/chosen": -2.234185218811035, "eval_logps/rejected": -2.3823485374450684, "eval_loss": 3.061626672744751, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.34185218811035, "eval_rewards/margins": 1.4816354513168335, "eval_rewards/rejected": -23.823486328125, "eval_runtime": 12.8908, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 21600 }, { "epoch": 0.7282011527183255, "grad_norm": 31.25184440612793, "learning_rate": 2.0865456893257688e-07, "logits/chosen": -1.7714402675628662, "logits/rejected": -2.0110652446746826, "logps/chosen": -2.775097608566284, "logps/rejected": -3.070955753326416, "loss": 3.4729, "rewards/accuracies": 0.5, "rewards/chosen": -27.750980377197266, "rewards/margins": 2.9585795402526855, "rewards/rejected": -30.709558486938477, "step": 21605 }, { "epoch": 0.7283696787893087, "grad_norm": 27.05602264404297, "learning_rate": 2.084155779794875e-07, "logits/chosen": -2.4862136840820312, "logits/rejected": -2.4018337726593018, "logps/chosen": -3.081584930419922, "logps/rejected": -3.008526086807251, "loss": 6.1938, "rewards/accuracies": 0.5, "rewards/chosen": -30.81585121154785, "rewards/margins": -0.7305895090103149, "rewards/rejected": -30.08526039123535, "step": 21610 }, { "epoch": 0.7285382048602919, "grad_norm": 19.466569900512695, "learning_rate": 2.0817668793227845e-07, "logits/chosen": -1.7015174627304077, "logits/rejected": -2.1962692737579346, "logps/chosen": -2.1845622062683105, "logps/rejected": -3.028851270675659, "loss": 2.3892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.845623016357422, "rewards/margins": 8.442889213562012, "rewards/rejected": -30.28851318359375, "step": 21615 }, { "epoch": 0.728706730931275, "grad_norm": 27.886024475097656, "learning_rate": 2.0793789887362022e-07, "logits/chosen": -1.9878759384155273, "logits/rejected": -2.0809290409088135, "logps/chosen": -2.6173412799835205, "logps/rejected": -2.7520089149475098, "loss": 2.9027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.173410415649414, "rewards/margins": 1.3466761112213135, "rewards/rejected": -27.520090103149414, "step": 21620 }, { "epoch": 0.7288752570022583, "grad_norm": 59.48930740356445, "learning_rate": 2.0769921088614867e-07, "logits/chosen": -2.083529472351074, "logits/rejected": -1.9938551187515259, "logps/chosen": -2.6014790534973145, "logps/rejected": -2.4967360496520996, "loss": 4.5187, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -26.014789581298828, "rewards/margins": -1.0474289655685425, "rewards/rejected": -24.967361450195312, "step": 21625 }, { "epoch": 0.7290437830732415, "grad_norm": 35.612510681152344, "learning_rate": 2.0746062405246384e-07, "logits/chosen": -1.6287791728973389, "logits/rejected": -1.6877319812774658, "logps/chosen": -2.3290133476257324, "logps/rejected": -2.4741005897521973, "loss": 2.4639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.29013442993164, "rewards/margins": 1.4508706331253052, "rewards/rejected": -24.741003036499023, "step": 21630 }, { "epoch": 0.7292123091442246, "grad_norm": 14.086270332336426, "learning_rate": 2.0722213845513147e-07, "logits/chosen": -2.198855400085449, "logits/rejected": -2.485460042953491, "logps/chosen": -2.5541272163391113, "logps/rejected": -2.784548282623291, "loss": 3.081, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.54127311706543, "rewards/margins": 2.3042120933532715, "rewards/rejected": -27.84548568725586, "step": 21635 }, { "epoch": 0.7293808352152078, "grad_norm": 24.064496994018555, "learning_rate": 2.0698375417668194e-07, "logits/chosen": -1.8448317050933838, "logits/rejected": -2.0081303119659424, "logps/chosen": -1.7897602319717407, "logps/rejected": -2.2101948261260986, "loss": 0.8858, "rewards/accuracies": 1.0, "rewards/chosen": -17.897600173950195, "rewards/margins": 4.204346179962158, "rewards/rejected": -22.101947784423828, "step": 21640 }, { "epoch": 0.729549361286191, "grad_norm": 52.95700454711914, "learning_rate": 2.0674547129961096e-07, "logits/chosen": -1.9925181865692139, "logits/rejected": -2.0233118534088135, "logps/chosen": -2.1348519325256348, "logps/rejected": -2.157832622528076, "loss": 3.098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.34851837158203, "rewards/margins": 0.22980785369873047, "rewards/rejected": -21.57832908630371, "step": 21645 }, { "epoch": 0.7297178873571741, "grad_norm": 41.6870002746582, "learning_rate": 2.0650728990637833e-07, "logits/chosen": -1.3519022464752197, "logits/rejected": -1.4920880794525146, "logps/chosen": -2.5849311351776123, "logps/rejected": -2.673745632171631, "loss": 2.6725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.84930992126465, "rewards/margins": 0.8881477117538452, "rewards/rejected": -26.737457275390625, "step": 21650 }, { "epoch": 0.7298864134281573, "grad_norm": 79.82129669189453, "learning_rate": 2.0626921007940955e-07, "logits/chosen": -1.5634056329727173, "logits/rejected": -2.13328218460083, "logps/chosen": -2.2908542156219482, "logps/rejected": -2.903014659881592, "loss": 1.9705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.90854263305664, "rewards/margins": 6.121607303619385, "rewards/rejected": -29.030147552490234, "step": 21655 }, { "epoch": 0.7300549394991405, "grad_norm": 124.51602935791016, "learning_rate": 2.0603123190109468e-07, "logits/chosen": -1.8668529987335205, "logits/rejected": -2.3392491340637207, "logps/chosen": -2.717681407928467, "logps/rejected": -2.839423418045044, "loss": 3.7129, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.17681312561035, "rewards/margins": 1.2174217700958252, "rewards/rejected": -28.394235610961914, "step": 21660 }, { "epoch": 0.7302234655701237, "grad_norm": 25.56239891052246, "learning_rate": 2.0579335545378862e-07, "logits/chosen": -1.793898582458496, "logits/rejected": -1.988956093788147, "logps/chosen": -2.0297439098358154, "logps/rejected": -2.320495843887329, "loss": 2.1599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.297439575195312, "rewards/margins": 2.907517910003662, "rewards/rejected": -23.204957962036133, "step": 21665 }, { "epoch": 0.7303919916411069, "grad_norm": 220.97686767578125, "learning_rate": 2.0555558081981085e-07, "logits/chosen": -1.5371949672698975, "logits/rejected": -1.4318342208862305, "logps/chosen": -3.315932035446167, "logps/rejected": -3.3558952808380127, "loss": 5.6326, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -33.15932083129883, "rewards/margins": 0.3996322751045227, "rewards/rejected": -33.55895233154297, "step": 21670 }, { "epoch": 0.7305605177120901, "grad_norm": 18.90399742126465, "learning_rate": 2.053179080814459e-07, "logits/chosen": -1.383420705795288, "logits/rejected": -1.8868669271469116, "logps/chosen": -2.0650277137756348, "logps/rejected": -2.419672727584839, "loss": 1.9787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.6502742767334, "rewards/margins": 3.5464529991149902, "rewards/rejected": -24.196727752685547, "step": 21675 }, { "epoch": 0.7307290437830732, "grad_norm": 25.166046142578125, "learning_rate": 2.0508033732094294e-07, "logits/chosen": -1.5126234292984009, "logits/rejected": -2.2251594066619873, "logps/chosen": -2.121800661087036, "logps/rejected": -2.5393898487091064, "loss": 1.339, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.218008041381836, "rewards/margins": 4.17588996887207, "rewards/rejected": -25.393896102905273, "step": 21680 }, { "epoch": 0.7308975698540564, "grad_norm": 1.5466541051864624, "learning_rate": 2.0484286862051585e-07, "logits/chosen": -1.4658101797103882, "logits/rejected": -1.957542061805725, "logps/chosen": -2.613654851913452, "logps/rejected": -2.6913158893585205, "loss": 4.5056, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.136547088623047, "rewards/margins": 0.7766073942184448, "rewards/rejected": -26.913158416748047, "step": 21685 }, { "epoch": 0.7310660959250396, "grad_norm": 38.16489791870117, "learning_rate": 2.0460550206234323e-07, "logits/chosen": -2.2058348655700684, "logits/rejected": -2.6453592777252197, "logps/chosen": -2.519449234008789, "logps/rejected": -3.174541473388672, "loss": 1.4793, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.19449234008789, "rewards/margins": 6.550921440124512, "rewards/rejected": -31.745412826538086, "step": 21690 }, { "epoch": 0.7312346219960227, "grad_norm": 81.57676696777344, "learning_rate": 2.0436823772856843e-07, "logits/chosen": -2.1794464588165283, "logits/rejected": -2.2463772296905518, "logps/chosen": -3.040895938873291, "logps/rejected": -2.8675930500030518, "loss": 5.1556, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -30.408960342407227, "rewards/margins": -1.7330306768417358, "rewards/rejected": -28.675933837890625, "step": 21695 }, { "epoch": 0.731403148067006, "grad_norm": 39.454044342041016, "learning_rate": 2.0413107570129894e-07, "logits/chosen": -1.1450724601745605, "logits/rejected": -1.4699748754501343, "logps/chosen": -2.2500405311584473, "logps/rejected": -2.3740577697753906, "loss": 2.1046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.500404357910156, "rewards/margins": 1.2401739358901978, "rewards/rejected": -23.740581512451172, "step": 21700 }, { "epoch": 0.7315716741379892, "grad_norm": 29.22942352294922, "learning_rate": 2.0389401606260743e-07, "logits/chosen": -2.020216941833496, "logits/rejected": -2.078310489654541, "logps/chosen": -2.3463661670684814, "logps/rejected": -2.4976742267608643, "loss": 2.2146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.463661193847656, "rewards/margins": 1.513080358505249, "rewards/rejected": -24.976741790771484, "step": 21705 }, { "epoch": 0.7317402002089723, "grad_norm": 39.88352966308594, "learning_rate": 2.0365705889453083e-07, "logits/chosen": -1.6458019018173218, "logits/rejected": -1.7097450494766235, "logps/chosen": -2.1416940689086914, "logps/rejected": -2.4000911712646484, "loss": 3.2204, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.416940689086914, "rewards/margins": 2.5839715003967285, "rewards/rejected": -24.000911712646484, "step": 21710 }, { "epoch": 0.7319087262799555, "grad_norm": 36.849876403808594, "learning_rate": 2.0342020427907086e-07, "logits/chosen": -1.592294692993164, "logits/rejected": -1.305525541305542, "logps/chosen": -2.492601156234741, "logps/rejected": -2.5987658500671387, "loss": 2.9679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.926013946533203, "rewards/margins": 1.061646580696106, "rewards/rejected": -25.987659454345703, "step": 21715 }, { "epoch": 0.7320772523509387, "grad_norm": 59.083961486816406, "learning_rate": 2.0318345229819324e-07, "logits/chosen": -1.4433258771896362, "logits/rejected": -2.345215320587158, "logps/chosen": -2.0695242881774902, "logps/rejected": -3.579845428466797, "loss": 2.4304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.695240020751953, "rewards/margins": 15.103212356567383, "rewards/rejected": -35.79845428466797, "step": 21720 }, { "epoch": 0.7322457784219218, "grad_norm": 30.461536407470703, "learning_rate": 2.0294680303382867e-07, "logits/chosen": -1.554012656211853, "logits/rejected": -1.4713451862335205, "logps/chosen": -2.2490663528442383, "logps/rejected": -2.099107027053833, "loss": 4.9726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.490663528442383, "rewards/margins": -1.499592661857605, "rewards/rejected": -20.991069793701172, "step": 21725 }, { "epoch": 0.732414304492905, "grad_norm": 0.02380460500717163, "learning_rate": 2.0271025656787232e-07, "logits/chosen": -2.025754690170288, "logits/rejected": -2.0077271461486816, "logps/chosen": -3.112980842590332, "logps/rejected": -2.8330283164978027, "loss": 7.9069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.129806518554688, "rewards/margins": -2.7995221614837646, "rewards/rejected": -28.330286026000977, "step": 21730 }, { "epoch": 0.7325828305638883, "grad_norm": 56.081871032714844, "learning_rate": 2.0247381298218324e-07, "logits/chosen": -1.9024174213409424, "logits/rejected": -2.314527988433838, "logps/chosen": -2.104179859161377, "logps/rejected": -2.716348648071289, "loss": 1.859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.041799545288086, "rewards/margins": 6.121685028076172, "rewards/rejected": -27.163482666015625, "step": 21735 }, { "epoch": 0.7327513566348715, "grad_norm": 26.001535415649414, "learning_rate": 2.022374723585854e-07, "logits/chosen": -1.456640601158142, "logits/rejected": -1.551238775253296, "logps/chosen": -2.132565975189209, "logps/rejected": -2.308089256286621, "loss": 2.2998, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.325660705566406, "rewards/margins": 1.755231261253357, "rewards/rejected": -23.080890655517578, "step": 21740 }, { "epoch": 0.7329198827058546, "grad_norm": 0.0018961316673085093, "learning_rate": 2.0200123477886706e-07, "logits/chosen": -1.740966558456421, "logits/rejected": -2.4154534339904785, "logps/chosen": -2.9531397819519043, "logps/rejected": -3.728076934814453, "loss": 1.3412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.53139877319336, "rewards/margins": 7.7493696212768555, "rewards/rejected": -37.28076934814453, "step": 21745 }, { "epoch": 0.7330884087768378, "grad_norm": 71.07219696044922, "learning_rate": 2.0176510032478083e-07, "logits/chosen": -0.8832302093505859, "logits/rejected": -0.9097579717636108, "logps/chosen": -2.8612914085388184, "logps/rejected": -2.9164257049560547, "loss": 3.4554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.6129150390625, "rewards/margins": 0.5513428449630737, "rewards/rejected": -29.164257049560547, "step": 21750 }, { "epoch": 0.733256934847821, "grad_norm": 0.0001923279487527907, "learning_rate": 2.0152906907804317e-07, "logits/chosen": -2.055886745452881, "logits/rejected": -2.815279006958008, "logps/chosen": -2.7214102745056152, "logps/rejected": -3.637563705444336, "loss": 2.0419, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.2141056060791, "rewards/margins": 9.161534309387207, "rewards/rejected": -36.375640869140625, "step": 21755 }, { "epoch": 0.7334254609188041, "grad_norm": 26.346817016601562, "learning_rate": 2.0129314112033552e-07, "logits/chosen": -1.566145658493042, "logits/rejected": -1.60275137424469, "logps/chosen": -2.2926278114318848, "logps/rejected": -2.2890093326568604, "loss": 3.6163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.926279067993164, "rewards/margins": -0.03618669509887695, "rewards/rejected": -22.890094757080078, "step": 21760 }, { "epoch": 0.7335939869897873, "grad_norm": 35.32753372192383, "learning_rate": 2.010573165333032e-07, "logits/chosen": -1.9177045822143555, "logits/rejected": -2.006000280380249, "logps/chosen": -2.9610650539398193, "logps/rejected": -3.1530046463012695, "loss": 2.6623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.61065101623535, "rewards/margins": 1.9193947315216064, "rewards/rejected": -31.530048370361328, "step": 21765 }, { "epoch": 0.7337625130607704, "grad_norm": 35.251258850097656, "learning_rate": 2.008215953985557e-07, "logits/chosen": -2.1620099544525146, "logits/rejected": -2.229039430618286, "logps/chosen": -1.9183037281036377, "logps/rejected": -1.9755512475967407, "loss": 2.9403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.18303871154785, "rewards/margins": 0.5724747776985168, "rewards/rejected": -19.755512237548828, "step": 21770 }, { "epoch": 0.7339310391317537, "grad_norm": 38.914920806884766, "learning_rate": 2.0058597779766677e-07, "logits/chosen": -1.7548776865005493, "logits/rejected": -1.7073177099227905, "logps/chosen": -1.9944050312042236, "logps/rejected": -2.2155983448028564, "loss": 1.486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.944049835205078, "rewards/margins": 2.211933135986328, "rewards/rejected": -22.155981063842773, "step": 21775 }, { "epoch": 0.7340995652027369, "grad_norm": 63.15382766723633, "learning_rate": 2.0035046381217458e-07, "logits/chosen": -1.6686474084854126, "logits/rejected": -2.051553249359131, "logps/chosen": -2.022413730621338, "logps/rejected": -2.373833417892456, "loss": 2.5306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.224136352539062, "rewards/margins": 3.5141963958740234, "rewards/rejected": -23.73833465576172, "step": 21780 }, { "epoch": 0.7342680912737201, "grad_norm": 27.028345108032227, "learning_rate": 2.0011505352358126e-07, "logits/chosen": -2.099684715270996, "logits/rejected": -2.4309449195861816, "logps/chosen": -2.7647461891174316, "logps/rejected": -2.9898247718811035, "loss": 1.6346, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.6474609375, "rewards/margins": 2.2507858276367188, "rewards/rejected": -29.89824867248535, "step": 21785 }, { "epoch": 0.7344366173447032, "grad_norm": 16.62060546875, "learning_rate": 1.9987974701335276e-07, "logits/chosen": -1.8711074590682983, "logits/rejected": -1.8878005743026733, "logps/chosen": -1.918784499168396, "logps/rejected": -2.1599090099334717, "loss": 1.6143, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.18784523010254, "rewards/margins": 2.4112448692321777, "rewards/rejected": -21.599090576171875, "step": 21790 }, { "epoch": 0.7346051434156864, "grad_norm": 23.89042091369629, "learning_rate": 1.9964454436291955e-07, "logits/chosen": -1.636922836303711, "logits/rejected": -1.909189224243164, "logps/chosen": -2.8264358043670654, "logps/rejected": -3.0672054290771484, "loss": 2.023, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.264358520507812, "rewards/margins": 2.40769624710083, "rewards/rejected": -30.672054290771484, "step": 21795 }, { "epoch": 0.7347736694866696, "grad_norm": 40.953765869140625, "learning_rate": 1.9940944565367617e-07, "logits/chosen": -1.708216667175293, "logits/rejected": -1.8522497415542603, "logps/chosen": -2.2710158824920654, "logps/rejected": -2.5011610984802246, "loss": 2.6869, "rewards/accuracies": 0.5, "rewards/chosen": -22.710155487060547, "rewards/margins": 2.301453113555908, "rewards/rejected": -25.011611938476562, "step": 21800 }, { "epoch": 0.7349421955576527, "grad_norm": 37.34510803222656, "learning_rate": 1.9917445096698065e-07, "logits/chosen": -1.5141806602478027, "logits/rejected": -2.053837299346924, "logps/chosen": -2.2500662803649902, "logps/rejected": -2.249181032180786, "loss": 3.3769, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.500661849975586, "rewards/margins": -0.008852005004882812, "rewards/rejected": -22.491809844970703, "step": 21805 }, { "epoch": 0.735110721628636, "grad_norm": 21.374662399291992, "learning_rate": 1.9893956038415565e-07, "logits/chosen": -2.0776031017303467, "logits/rejected": -2.0971264839172363, "logps/chosen": -2.3597798347473145, "logps/rejected": -2.544868230819702, "loss": 2.7494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.597797393798828, "rewards/margins": 1.850882887840271, "rewards/rejected": -25.448680877685547, "step": 21810 }, { "epoch": 0.7352792476996192, "grad_norm": 32.04645538330078, "learning_rate": 1.987047739864875e-07, "logits/chosen": -1.3755344152450562, "logits/rejected": -1.5694763660430908, "logps/chosen": -2.604581832885742, "logps/rejected": -2.738844633102417, "loss": 2.8888, "rewards/accuracies": 0.5, "rewards/chosen": -26.045818328857422, "rewards/margins": 1.3426287174224854, "rewards/rejected": -27.388446807861328, "step": 21815 }, { "epoch": 0.7354477737706023, "grad_norm": 39.647884368896484, "learning_rate": 1.9847009185522644e-07, "logits/chosen": -1.5242843627929688, "logits/rejected": -1.7097200155258179, "logps/chosen": -2.7526488304138184, "logps/rejected": -2.7933335304260254, "loss": 3.2408, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.5264892578125, "rewards/margins": 0.4068460464477539, "rewards/rejected": -27.933338165283203, "step": 21820 }, { "epoch": 0.7356162998415855, "grad_norm": 23.728511810302734, "learning_rate": 1.982355140715869e-07, "logits/chosen": -1.4998576641082764, "logits/rejected": -2.016965389251709, "logps/chosen": -2.0560142993927, "logps/rejected": -2.4636600017547607, "loss": 1.8951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.560142517089844, "rewards/margins": 4.0764570236206055, "rewards/rejected": -24.636600494384766, "step": 21825 }, { "epoch": 0.7357848259125687, "grad_norm": 149.23117065429688, "learning_rate": 1.9800104071674677e-07, "logits/chosen": -1.884606957435608, "logits/rejected": -1.9795339107513428, "logps/chosen": -2.4407334327697754, "logps/rejected": -2.8185982704162598, "loss": 3.4615, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.407337188720703, "rewards/margins": 3.7786457538604736, "rewards/rejected": -28.18597984313965, "step": 21830 }, { "epoch": 0.7359533519835518, "grad_norm": 30.48876953125, "learning_rate": 1.9776667187184842e-07, "logits/chosen": -2.699343204498291, "logits/rejected": -2.3093960285186768, "logps/chosen": -2.7900753021240234, "logps/rejected": -3.133699893951416, "loss": 1.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.9007511138916, "rewards/margins": 3.4362499713897705, "rewards/rejected": -31.336999893188477, "step": 21835 }, { "epoch": 0.736121878054535, "grad_norm": 26.56561279296875, "learning_rate": 1.9753240761799722e-07, "logits/chosen": -1.8246999979019165, "logits/rejected": -1.5348434448242188, "logps/chosen": -1.9646905660629272, "logps/rejected": -1.8137729167938232, "loss": 4.6944, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.64690589904785, "rewards/margins": -1.5091748237609863, "rewards/rejected": -18.13772964477539, "step": 21840 }, { "epoch": 0.7362904041255183, "grad_norm": 31.89987564086914, "learning_rate": 1.9729824803626299e-07, "logits/chosen": -1.7160171270370483, "logits/rejected": -2.2106146812438965, "logps/chosen": -2.4249229431152344, "logps/rejected": -4.063868999481201, "loss": 1.6264, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.24922752380371, "rewards/margins": 16.38946533203125, "rewards/rejected": -40.638694763183594, "step": 21845 }, { "epoch": 0.7364589301965014, "grad_norm": 38.55632019042969, "learning_rate": 1.9706419320767915e-07, "logits/chosen": -1.6241347789764404, "logits/rejected": -1.7468106746673584, "logps/chosen": -2.586089849472046, "logps/rejected": -2.7664897441864014, "loss": 2.2135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.860897064208984, "rewards/margins": 1.8039993047714233, "rewards/rejected": -27.66489601135254, "step": 21850 }, { "epoch": 0.7366274562674846, "grad_norm": 16.978574752807617, "learning_rate": 1.9683024321324304e-07, "logits/chosen": -2.144387722015381, "logits/rejected": -2.554112195968628, "logps/chosen": -1.8638019561767578, "logps/rejected": -2.06876802444458, "loss": 2.8064, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.638019561767578, "rewards/margins": 2.0496623516082764, "rewards/rejected": -20.687681198120117, "step": 21855 }, { "epoch": 0.7367959823384678, "grad_norm": 27.162925720214844, "learning_rate": 1.9659639813391515e-07, "logits/chosen": -1.3586866855621338, "logits/rejected": -1.717599868774414, "logps/chosen": -2.207620620727539, "logps/rejected": -2.515143394470215, "loss": 2.1462, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.07620620727539, "rewards/margins": 3.075228214263916, "rewards/rejected": -25.151432037353516, "step": 21860 }, { "epoch": 0.7369645084094509, "grad_norm": 26.866268157958984, "learning_rate": 1.9636265805062025e-07, "logits/chosen": -1.8105628490447998, "logits/rejected": -2.0254015922546387, "logps/chosen": -1.65911865234375, "logps/rejected": -1.9263187646865845, "loss": 1.389, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.5911865234375, "rewards/margins": 2.6720001697540283, "rewards/rejected": -19.263187408447266, "step": 21865 }, { "epoch": 0.7371330344804341, "grad_norm": 12.93826675415039, "learning_rate": 1.9612902304424672e-07, "logits/chosen": -1.498923897743225, "logits/rejected": -1.587410569190979, "logps/chosen": -2.786804676055908, "logps/rejected": -3.3624961376190186, "loss": 1.4787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.868045806884766, "rewards/margins": 5.75691556930542, "rewards/rejected": -33.624961853027344, "step": 21870 }, { "epoch": 0.7373015605514173, "grad_norm": 19.367074966430664, "learning_rate": 1.9589549319564607e-07, "logits/chosen": -2.255361557006836, "logits/rejected": -2.319303035736084, "logps/chosen": -1.961960792541504, "logps/rejected": -2.3379883766174316, "loss": 1.6691, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.619606018066406, "rewards/margins": 3.7602760791778564, "rewards/rejected": -23.3798828125, "step": 21875 }, { "epoch": 0.7374700866224004, "grad_norm": 24.227609634399414, "learning_rate": 1.9566206858563406e-07, "logits/chosen": -2.2398581504821777, "logits/rejected": -2.4608020782470703, "logps/chosen": -2.5069398880004883, "logps/rejected": -2.7963309288024902, "loss": 2.5762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.06939697265625, "rewards/margins": 2.893913745880127, "rewards/rejected": -27.96331214904785, "step": 21880 }, { "epoch": 0.7376386126933837, "grad_norm": 34.777587890625, "learning_rate": 1.9542874929498964e-07, "logits/chosen": -2.167205572128296, "logits/rejected": -2.2443249225616455, "logps/chosen": -3.534986972808838, "logps/rejected": -4.648120880126953, "loss": 4.5919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.3498649597168, "rewards/margins": 11.131341934204102, "rewards/rejected": -46.48120880126953, "step": 21885 }, { "epoch": 0.7378071387643669, "grad_norm": 24.37884521484375, "learning_rate": 1.9519553540445562e-07, "logits/chosen": -1.5812952518463135, "logits/rejected": -1.5319125652313232, "logps/chosen": -2.2089767456054688, "logps/rejected": -2.3872642517089844, "loss": 2.5415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.089771270751953, "rewards/margins": 1.7828737497329712, "rewards/rejected": -23.872644424438477, "step": 21890 }, { "epoch": 0.73797566483535, "grad_norm": 62.177066802978516, "learning_rate": 1.9496242699473782e-07, "logits/chosen": -1.7420861721038818, "logits/rejected": -1.941145658493042, "logps/chosen": -2.2418718338012695, "logps/rejected": -2.2937049865722656, "loss": 3.5042, "rewards/accuracies": 0.5, "rewards/chosen": -22.418716430664062, "rewards/margins": 0.5183313488960266, "rewards/rejected": -22.937047958374023, "step": 21895 }, { "epoch": 0.7381441909063332, "grad_norm": 49.71686935424805, "learning_rate": 1.9472942414650607e-07, "logits/chosen": -1.7992541790008545, "logits/rejected": -1.9160549640655518, "logps/chosen": -1.7772258520126343, "logps/rejected": -1.9301140308380127, "loss": 2.9361, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.772258758544922, "rewards/margins": 1.5288810729980469, "rewards/rejected": -19.30113983154297, "step": 21900 }, { "epoch": 0.7383127169773164, "grad_norm": 23.071325302124023, "learning_rate": 1.9449652694039353e-07, "logits/chosen": -1.5916250944137573, "logits/rejected": -1.8258146047592163, "logps/chosen": -2.6305465698242188, "logps/rejected": -3.2942707538604736, "loss": 2.7448, "rewards/accuracies": 0.5, "rewards/chosen": -26.305465698242188, "rewards/margins": 6.637242317199707, "rewards/rejected": -32.942710876464844, "step": 21905 }, { "epoch": 0.7384812430482995, "grad_norm": 31.649948120117188, "learning_rate": 1.9426373545699658e-07, "logits/chosen": -1.8089519739151, "logits/rejected": -1.8158622980117798, "logps/chosen": -2.6677355766296387, "logps/rejected": -3.247864246368408, "loss": 2.4772, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.677358627319336, "rewards/margins": 5.801283836364746, "rewards/rejected": -32.478641510009766, "step": 21910 }, { "epoch": 0.7386497691192827, "grad_norm": 3.3252029418945312, "learning_rate": 1.9403104977687524e-07, "logits/chosen": -1.416884422302246, "logits/rejected": -1.889899492263794, "logps/chosen": -2.3453822135925293, "logps/rejected": -2.5371739864349365, "loss": 2.646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.453821182250977, "rewards/margins": 1.9179189205169678, "rewards/rejected": -25.37173843383789, "step": 21915 }, { "epoch": 0.738818295190266, "grad_norm": 80.28643798828125, "learning_rate": 1.9379846998055282e-07, "logits/chosen": -1.7270739078521729, "logits/rejected": -1.7358312606811523, "logps/chosen": -3.119675874710083, "logps/rejected": -3.312995195388794, "loss": 4.8238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.196758270263672, "rewards/margins": 1.933192491531372, "rewards/rejected": -33.12995147705078, "step": 21920 }, { "epoch": 0.7389868212612491, "grad_norm": 26.531578063964844, "learning_rate": 1.935659961485163e-07, "logits/chosen": -1.6422741413116455, "logits/rejected": -1.9076168537139893, "logps/chosen": -2.1916511058807373, "logps/rejected": -2.3707642555236816, "loss": 2.3595, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.91651153564453, "rewards/margins": 1.7911300659179688, "rewards/rejected": -23.7076416015625, "step": 21925 }, { "epoch": 0.7391553473322323, "grad_norm": 40.92249298095703, "learning_rate": 1.933336283612153e-07, "logits/chosen": -1.8644893169403076, "logits/rejected": -2.33244252204895, "logps/chosen": -1.8013432025909424, "logps/rejected": -2.192413806915283, "loss": 1.865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.013431549072266, "rewards/margins": 3.9107062816619873, "rewards/rejected": -21.924137115478516, "step": 21930 }, { "epoch": 0.7393238734032155, "grad_norm": 25.538143157958984, "learning_rate": 1.9310136669906342e-07, "logits/chosen": -1.5520654916763306, "logits/rejected": -1.9955809116363525, "logps/chosen": -1.8076364994049072, "logps/rejected": -2.3288650512695312, "loss": 1.831, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.076366424560547, "rewards/margins": 5.21228551864624, "rewards/rejected": -23.288650512695312, "step": 21935 }, { "epoch": 0.7394923994741986, "grad_norm": 30.85343360900879, "learning_rate": 1.9286921124243727e-07, "logits/chosen": -1.5067452192306519, "logits/rejected": -1.6436704397201538, "logps/chosen": -2.2649459838867188, "logps/rejected": -2.452059507369995, "loss": 2.6169, "rewards/accuracies": 0.5, "rewards/chosen": -22.649459838867188, "rewards/margins": 1.8711345195770264, "rewards/rejected": -24.520593643188477, "step": 21940 }, { "epoch": 0.7396609255451818, "grad_norm": 42.99625778198242, "learning_rate": 1.9263716207167652e-07, "logits/chosen": -1.8859636783599854, "logits/rejected": -2.007584810256958, "logps/chosen": -2.6758522987365723, "logps/rejected": -2.7993979454040527, "loss": 2.8658, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.75852394104004, "rewards/margins": 1.2354570627212524, "rewards/rejected": -27.993982315063477, "step": 21945 }, { "epoch": 0.739829451616165, "grad_norm": 9.061617851257324, "learning_rate": 1.9240521926708437e-07, "logits/chosen": -1.9813066720962524, "logits/rejected": -2.2144722938537598, "logps/chosen": -3.057587146759033, "logps/rejected": -3.302992343902588, "loss": 4.0341, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.57587242126465, "rewards/margins": 2.454049587249756, "rewards/rejected": -33.02992248535156, "step": 21950 }, { "epoch": 0.7399979776871483, "grad_norm": 53.799285888671875, "learning_rate": 1.9217338290892704e-07, "logits/chosen": -1.5386488437652588, "logits/rejected": -2.009295701980591, "logps/chosen": -2.03143310546875, "logps/rejected": -2.4899253845214844, "loss": 2.1955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.3143310546875, "rewards/margins": 4.584921836853027, "rewards/rejected": -24.89925193786621, "step": 21955 }, { "epoch": 0.7401665037581314, "grad_norm": 24.71092414855957, "learning_rate": 1.9194165307743403e-07, "logits/chosen": -1.5193939208984375, "logits/rejected": -1.3868157863616943, "logps/chosen": -2.746831178665161, "logps/rejected": -2.5405361652374268, "loss": 5.1587, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.468311309814453, "rewards/margins": -2.062947988510132, "rewards/rejected": -25.40536117553711, "step": 21960 }, { "epoch": 0.7403350298291146, "grad_norm": 22.135698318481445, "learning_rate": 1.917100298527981e-07, "logits/chosen": -1.7267589569091797, "logits/rejected": -1.7475883960723877, "logps/chosen": -2.539499282836914, "logps/rejected": -2.4604432582855225, "loss": 4.0905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.39499282836914, "rewards/margins": -0.7905600666999817, "rewards/rejected": -24.60443115234375, "step": 21965 }, { "epoch": 0.7405035559000978, "grad_norm": 241.43174743652344, "learning_rate": 1.9147851331517445e-07, "logits/chosen": -1.717034935951233, "logits/rejected": -1.8810991048812866, "logps/chosen": -2.7885756492614746, "logps/rejected": -2.564950704574585, "loss": 7.6565, "rewards/accuracies": 0.5, "rewards/chosen": -27.885757446289062, "rewards/margins": -2.236248731613159, "rewards/rejected": -25.649505615234375, "step": 21970 }, { "epoch": 0.7406720819710809, "grad_norm": 0.6432590484619141, "learning_rate": 1.912471035446821e-07, "logits/chosen": -1.8602104187011719, "logits/rejected": -2.3250601291656494, "logps/chosen": -2.2839043140411377, "logps/rejected": -2.538177967071533, "loss": 1.9896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.83904457092285, "rewards/margins": 2.5427379608154297, "rewards/rejected": -25.38178062438965, "step": 21975 }, { "epoch": 0.7408406080420641, "grad_norm": 59.0754280090332, "learning_rate": 1.910158006214029e-07, "logits/chosen": -2.180945873260498, "logits/rejected": -2.2051949501037598, "logps/chosen": -2.14461612701416, "logps/rejected": -2.3139724731445312, "loss": 3.2225, "rewards/accuracies": 0.5, "rewards/chosen": -21.4461612701416, "rewards/margins": 1.6935627460479736, "rewards/rejected": -23.139724731445312, "step": 21980 }, { "epoch": 0.7410091341130473, "grad_norm": 47.94268798828125, "learning_rate": 1.907846046253815e-07, "logits/chosen": -1.857304573059082, "logits/rejected": -1.8857357501983643, "logps/chosen": -2.83512544631958, "logps/rejected": -2.9262855052948, "loss": 3.4687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.35125160217285, "rewards/margins": 0.9116016626358032, "rewards/rejected": -29.262853622436523, "step": 21985 }, { "epoch": 0.7411776601840304, "grad_norm": 22.46523666381836, "learning_rate": 1.9055351563662593e-07, "logits/chosen": -1.1714767217636108, "logits/rejected": -1.649778962135315, "logps/chosen": -2.0318779945373535, "logps/rejected": -2.8038489818573, "loss": 1.9946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.31878089904785, "rewards/margins": 7.7197113037109375, "rewards/rejected": -28.038488388061523, "step": 21990 }, { "epoch": 0.7413461862550137, "grad_norm": 76.65923309326172, "learning_rate": 1.9032253373510697e-07, "logits/chosen": -1.9239925146102905, "logits/rejected": -1.8971790075302124, "logps/chosen": -2.953947067260742, "logps/rejected": -3.118098735809326, "loss": 3.1773, "rewards/accuracies": 0.5, "rewards/chosen": -29.539474487304688, "rewards/margins": 1.6415166854858398, "rewards/rejected": -31.18099021911621, "step": 21995 }, { "epoch": 0.7415147123259969, "grad_norm": 24.52005958557129, "learning_rate": 1.9009165900075819e-07, "logits/chosen": -1.0461251735687256, "logits/rejected": -1.2524454593658447, "logps/chosen": -2.185828447341919, "logps/rejected": -2.4685850143432617, "loss": 2.8176, "rewards/accuracies": 0.5, "rewards/chosen": -21.858285903930664, "rewards/margins": 2.8275644779205322, "rewards/rejected": -24.685850143432617, "step": 22000 }, { "epoch": 0.7415147123259969, "eval_logits/chosen": -2.2072925567626953, "eval_logits/rejected": -2.376696825027466, "eval_logps/chosen": -2.2478811740875244, "eval_logps/rejected": -2.396430015563965, "eval_loss": 3.075082302093506, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.47881317138672, "eval_rewards/margins": 1.4854872226715088, "eval_rewards/rejected": -23.96430015563965, "eval_runtime": 12.91, "eval_samples_per_second": 7.746, "eval_steps_per_second": 1.936, "step": 22000 }, { "epoch": 0.74168323839698, "grad_norm": 27.378284454345703, "learning_rate": 1.8986089151347628e-07, "logits/chosen": -1.6426585912704468, "logits/rejected": -1.6798603534698486, "logps/chosen": -2.4855704307556152, "logps/rejected": -2.8420779705047607, "loss": 2.8563, "rewards/accuracies": 0.5, "rewards/chosen": -24.855703353881836, "rewards/margins": 3.5650744438171387, "rewards/rejected": -28.4207820892334, "step": 22005 }, { "epoch": 0.7418517644679632, "grad_norm": 24.746652603149414, "learning_rate": 1.8963023135312105e-07, "logits/chosen": -1.9978439807891846, "logits/rejected": -2.34228515625, "logps/chosen": -2.251620292663574, "logps/rejected": -2.3410422801971436, "loss": 3.9954, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.51620101928711, "rewards/margins": 0.8942203521728516, "rewards/rejected": -23.41042137145996, "step": 22010 }, { "epoch": 0.7420202905389464, "grad_norm": 64.21375274658203, "learning_rate": 1.8939967859951445e-07, "logits/chosen": -1.7721850872039795, "logits/rejected": -2.458425998687744, "logps/chosen": -2.4934096336364746, "logps/rejected": -3.225297451019287, "loss": 2.2091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.93409538269043, "rewards/margins": 7.318881034851074, "rewards/rejected": -32.25297927856445, "step": 22015 }, { "epoch": 0.7421888166099295, "grad_norm": 26.012672424316406, "learning_rate": 1.8916923333244195e-07, "logits/chosen": -1.8884680271148682, "logits/rejected": -1.7687313556671143, "logps/chosen": -2.0241780281066895, "logps/rejected": -2.5447230339050293, "loss": 2.282, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.24178123474121, "rewards/margins": 5.20544958114624, "rewards/rejected": -25.44723129272461, "step": 22020 }, { "epoch": 0.7423573426809127, "grad_norm": 29.735652923583984, "learning_rate": 1.8893889563165154e-07, "logits/chosen": -1.8298534154891968, "logits/rejected": -1.7624599933624268, "logps/chosen": -1.849662184715271, "logps/rejected": -1.876684546470642, "loss": 2.8919, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.49662208557129, "rewards/margins": 0.2702246606349945, "rewards/rejected": -18.766845703125, "step": 22025 }, { "epoch": 0.742525868751896, "grad_norm": 35.02006530761719, "learning_rate": 1.8870866557685421e-07, "logits/chosen": -2.155496597290039, "logits/rejected": -2.304814577102661, "logps/chosen": -2.7024080753326416, "logps/rejected": -2.8451664447784424, "loss": 3.3615, "rewards/accuracies": 0.5, "rewards/chosen": -27.024084091186523, "rewards/margins": 1.4275829792022705, "rewards/rejected": -28.4516658782959, "step": 22030 }, { "epoch": 0.7426943948228791, "grad_norm": 18.64474105834961, "learning_rate": 1.8847854324772316e-07, "logits/chosen": -1.7854385375976562, "logits/rejected": -2.331878662109375, "logps/chosen": -2.454838991165161, "logps/rejected": -3.2257533073425293, "loss": 1.3929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.548389434814453, "rewards/margins": 7.709145545959473, "rewards/rejected": -32.25753402709961, "step": 22035 }, { "epoch": 0.7428629208938623, "grad_norm": 124.89165496826172, "learning_rate": 1.8824852872389486e-07, "logits/chosen": -1.9832309484481812, "logits/rejected": -2.0631701946258545, "logps/chosen": -2.7330057621002197, "logps/rejected": -2.7496206760406494, "loss": 3.7363, "rewards/accuracies": 0.5, "rewards/chosen": -27.330059051513672, "rewards/margins": 0.16614732146263123, "rewards/rejected": -27.4962100982666, "step": 22040 }, { "epoch": 0.7430314469648455, "grad_norm": 32.47340774536133, "learning_rate": 1.8801862208496838e-07, "logits/chosen": -2.0714099407196045, "logits/rejected": -2.1332778930664062, "logps/chosen": -1.9636199474334717, "logps/rejected": -2.1338706016540527, "loss": 2.1992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.636199951171875, "rewards/margins": 1.702506422996521, "rewards/rejected": -21.33870506286621, "step": 22045 }, { "epoch": 0.7431999730358286, "grad_norm": 318.710693359375, "learning_rate": 1.8778882341050505e-07, "logits/chosen": -1.1691004037857056, "logits/rejected": -1.5123542547225952, "logps/chosen": -3.776531219482422, "logps/rejected": -4.898382663726807, "loss": 2.9377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -37.76531219482422, "rewards/margins": 11.218512535095215, "rewards/rejected": -48.98382568359375, "step": 22050 }, { "epoch": 0.7433684991068118, "grad_norm": 30.12228012084961, "learning_rate": 1.8755913278002933e-07, "logits/chosen": -2.0730338096618652, "logits/rejected": -2.237550973892212, "logps/chosen": -2.0106124877929688, "logps/rejected": -2.141998767852783, "loss": 2.3136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.10612678527832, "rewards/margins": 1.3138636350631714, "rewards/rejected": -21.41998863220215, "step": 22055 }, { "epoch": 0.743537025177795, "grad_norm": 53.70478057861328, "learning_rate": 1.8732955027302805e-07, "logits/chosen": -1.5072424411773682, "logits/rejected": -2.0281248092651367, "logps/chosen": -3.2066917419433594, "logps/rejected": -4.702208518981934, "loss": 1.5268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.066917419433594, "rewards/margins": 14.955164909362793, "rewards/rejected": -47.02208709716797, "step": 22060 }, { "epoch": 0.7437055512487782, "grad_norm": 34.362606048583984, "learning_rate": 1.8710007596895088e-07, "logits/chosen": -2.142402410507202, "logits/rejected": -2.3605005741119385, "logps/chosen": -2.8594348430633545, "logps/rejected": -3.0598931312561035, "loss": 2.1342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.594350814819336, "rewards/margins": 2.0045814514160156, "rewards/rejected": -30.59893226623535, "step": 22065 }, { "epoch": 0.7438740773197614, "grad_norm": 22.987091064453125, "learning_rate": 1.868707099472095e-07, "logits/chosen": -1.7191402912139893, "logits/rejected": -1.91571044921875, "logps/chosen": -2.633007287979126, "logps/rejected": -3.2162888050079346, "loss": 1.0829, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.3300724029541, "rewards/margins": 5.832815647125244, "rewards/rejected": -32.16288757324219, "step": 22070 }, { "epoch": 0.7440426033907446, "grad_norm": 43.26948165893555, "learning_rate": 1.866414522871786e-07, "logits/chosen": -1.5649702548980713, "logits/rejected": -2.07462739944458, "logps/chosen": -2.266348123550415, "logps/rejected": -2.657762050628662, "loss": 2.2072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.663482666015625, "rewards/margins": 3.91413950920105, "rewards/rejected": -26.577621459960938, "step": 22075 }, { "epoch": 0.7442111294617277, "grad_norm": 70.24766540527344, "learning_rate": 1.864123030681954e-07, "logits/chosen": -2.052701234817505, "logits/rejected": -2.133179187774658, "logps/chosen": -3.3242931365966797, "logps/rejected": -3.694363832473755, "loss": 2.0295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.24292755126953, "rewards/margins": 3.7007064819335938, "rewards/rejected": -36.943634033203125, "step": 22080 }, { "epoch": 0.7443796555327109, "grad_norm": 29.723024368286133, "learning_rate": 1.8618326236955906e-07, "logits/chosen": -1.730105996131897, "logits/rejected": -2.710209369659424, "logps/chosen": -2.478010416030884, "logps/rejected": -3.3492538928985596, "loss": 2.1972, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.780101776123047, "rewards/margins": 8.712434768676758, "rewards/rejected": -33.49253463745117, "step": 22085 }, { "epoch": 0.7445481816036941, "grad_norm": 17.219541549682617, "learning_rate": 1.8595433027053177e-07, "logits/chosen": -1.7738357782363892, "logits/rejected": -2.1546788215637207, "logps/chosen": -2.251901626586914, "logps/rejected": -2.3508944511413574, "loss": 2.7481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.51901626586914, "rewards/margins": 0.989930272102356, "rewards/rejected": -23.50894546508789, "step": 22090 }, { "epoch": 0.7447167076746772, "grad_norm": 23.76531219482422, "learning_rate": 1.85725506850338e-07, "logits/chosen": -2.312504768371582, "logits/rejected": -1.9970581531524658, "logps/chosen": -2.0622830390930176, "logps/rejected": -2.181011199951172, "loss": 3.7324, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.62282943725586, "rewards/margins": 1.187281847000122, "rewards/rejected": -21.81011390686035, "step": 22095 }, { "epoch": 0.7448852337456604, "grad_norm": 11.535994529724121, "learning_rate": 1.854967921881642e-07, "logits/chosen": -1.7529932260513306, "logits/rejected": -2.226323366165161, "logps/chosen": -2.4964802265167236, "logps/rejected": -2.9389634132385254, "loss": 1.7485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.964801788330078, "rewards/margins": 4.424830436706543, "rewards/rejected": -29.389633178710938, "step": 22100 }, { "epoch": 0.7450537598166437, "grad_norm": 184.34542846679688, "learning_rate": 1.852681863631597e-07, "logits/chosen": -1.7626209259033203, "logits/rejected": -1.7059158086776733, "logps/chosen": -2.2885444164276123, "logps/rejected": -2.4290459156036377, "loss": 2.3917, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.88544464111328, "rewards/margins": 1.4050140380859375, "rewards/rejected": -24.29045867919922, "step": 22105 }, { "epoch": 0.7452222858876268, "grad_norm": 154.77064514160156, "learning_rate": 1.8503968945443599e-07, "logits/chosen": -1.4389687776565552, "logits/rejected": -1.8824350833892822, "logps/chosen": -2.680974245071411, "logps/rejected": -2.93117094039917, "loss": 1.974, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.809743881225586, "rewards/margins": 2.5019686222076416, "rewards/rejected": -29.31171226501465, "step": 22110 }, { "epoch": 0.74539081195861, "grad_norm": 39.07148361206055, "learning_rate": 1.8481130154106684e-07, "logits/chosen": -1.7964918613433838, "logits/rejected": -1.9969040155410767, "logps/chosen": -2.613196611404419, "logps/rejected": -2.705244302749634, "loss": 2.9414, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.1319637298584, "rewards/margins": 0.9204772710800171, "rewards/rejected": -27.052440643310547, "step": 22115 }, { "epoch": 0.7455593380295932, "grad_norm": 76.31917572021484, "learning_rate": 1.8458302270208825e-07, "logits/chosen": -1.2408196926116943, "logits/rejected": -1.2059440612792969, "logps/chosen": -2.816004514694214, "logps/rejected": -2.9252734184265137, "loss": 2.7429, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.160045623779297, "rewards/margins": 1.092685341835022, "rewards/rejected": -29.252731323242188, "step": 22120 }, { "epoch": 0.7457278641005763, "grad_norm": 77.58131408691406, "learning_rate": 1.8435485301649857e-07, "logits/chosen": -2.2920916080474854, "logits/rejected": -2.2795474529266357, "logps/chosen": -2.514387845993042, "logps/rejected": -2.436331033706665, "loss": 4.8348, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.14388084411621, "rewards/margins": -0.7805711030960083, "rewards/rejected": -24.363309860229492, "step": 22125 }, { "epoch": 0.7458963901715595, "grad_norm": 2.1885435581207275, "learning_rate": 1.8412679256325852e-07, "logits/chosen": -1.1843568086624146, "logits/rejected": -1.6690727472305298, "logps/chosen": -2.1616756916046143, "logps/rejected": -2.833143711090088, "loss": 1.5591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.616756439208984, "rewards/margins": 6.714682579040527, "rewards/rejected": -28.331436157226562, "step": 22130 }, { "epoch": 0.7460649162425427, "grad_norm": 33.07001876831055, "learning_rate": 1.8389884142129047e-07, "logits/chosen": -2.085460662841797, "logits/rejected": -2.267630100250244, "logps/chosen": -2.2407565116882324, "logps/rejected": -2.623178005218506, "loss": 1.4193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.40756607055664, "rewards/margins": 3.824214458465576, "rewards/rejected": -26.231781005859375, "step": 22135 }, { "epoch": 0.746233442313526, "grad_norm": 90.47783660888672, "learning_rate": 1.8367099966947952e-07, "logits/chosen": -1.9926780462265015, "logits/rejected": -1.9720783233642578, "logps/chosen": -2.5652921199798584, "logps/rejected": -3.247729778289795, "loss": 1.9665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.65291976928711, "rewards/margins": 6.8243818283081055, "rewards/rejected": -32.47730255126953, "step": 22140 }, { "epoch": 0.7464019683845091, "grad_norm": 9.481569290161133, "learning_rate": 1.834432673866727e-07, "logits/chosen": -1.9665133953094482, "logits/rejected": -2.1472795009613037, "logps/chosen": -2.3283274173736572, "logps/rejected": -2.7122232913970947, "loss": 2.1223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.283275604248047, "rewards/margins": 3.838958740234375, "rewards/rejected": -27.12223243713379, "step": 22145 }, { "epoch": 0.7465704944554923, "grad_norm": 30.502643585205078, "learning_rate": 1.8321564465167943e-07, "logits/chosen": -1.9200775623321533, "logits/rejected": -1.9199256896972656, "logps/chosen": -2.313190460205078, "logps/rejected": -2.3956024646759033, "loss": 3.3351, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.13190460205078, "rewards/margins": 0.8241198658943176, "rewards/rejected": -23.956024169921875, "step": 22150 }, { "epoch": 0.7467390205264754, "grad_norm": 0.6498861908912659, "learning_rate": 1.8298813154327052e-07, "logits/chosen": -1.862210988998413, "logits/rejected": -2.1515557765960693, "logps/chosen": -2.6309103965759277, "logps/rejected": -2.9605705738067627, "loss": 2.0137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.30910301208496, "rewards/margins": 3.296602725982666, "rewards/rejected": -29.6057071685791, "step": 22155 }, { "epoch": 0.7469075465974586, "grad_norm": 24.51485252380371, "learning_rate": 1.827607281401795e-07, "logits/chosen": -1.7067981958389282, "logits/rejected": -1.831194519996643, "logps/chosen": -1.6155074834823608, "logps/rejected": -1.7414348125457764, "loss": 2.4547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.155075073242188, "rewards/margins": 1.2592729330062866, "rewards/rejected": -17.414348602294922, "step": 22160 }, { "epoch": 0.7470760726684418, "grad_norm": 21.368152618408203, "learning_rate": 1.8253343452110197e-07, "logits/chosen": -1.6644436120986938, "logits/rejected": -1.6071170568466187, "logps/chosen": -1.9112002849578857, "logps/rejected": -2.0044965744018555, "loss": 2.83, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.112003326416016, "rewards/margins": 0.9329609870910645, "rewards/rejected": -20.044963836669922, "step": 22165 }, { "epoch": 0.747244598739425, "grad_norm": 32.65996170043945, "learning_rate": 1.8230625076469486e-07, "logits/chosen": -2.3414549827575684, "logits/rejected": -2.433055877685547, "logps/chosen": -2.396184206008911, "logps/rejected": -2.6614813804626465, "loss": 2.0128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.961841583251953, "rewards/margins": 2.652974843978882, "rewards/rejected": -26.61481285095215, "step": 22170 }, { "epoch": 0.7474131248104082, "grad_norm": 28.22422981262207, "learning_rate": 1.8207917694957775e-07, "logits/chosen": -1.752981424331665, "logits/rejected": -2.565385103225708, "logps/chosen": -2.0491116046905518, "logps/rejected": -2.334890842437744, "loss": 2.5923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.49111557006836, "rewards/margins": 2.857790946960449, "rewards/rejected": -23.348905563354492, "step": 22175 }, { "epoch": 0.7475816508813914, "grad_norm": 32.882240295410156, "learning_rate": 1.818522131543319e-07, "logits/chosen": -1.775252103805542, "logits/rejected": -1.9257984161376953, "logps/chosen": -3.0990793704986572, "logps/rejected": -3.4123873710632324, "loss": 3.3253, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.990793228149414, "rewards/margins": 3.133082628250122, "rewards/rejected": -34.12387466430664, "step": 22180 }, { "epoch": 0.7477501769523746, "grad_norm": 30.46446418762207, "learning_rate": 1.8162535945750072e-07, "logits/chosen": -1.8090412616729736, "logits/rejected": -2.1050121784210205, "logps/chosen": -2.088987112045288, "logps/rejected": -2.3451297283172607, "loss": 2.756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.889873504638672, "rewards/margins": 2.561424970626831, "rewards/rejected": -23.451297760009766, "step": 22185 }, { "epoch": 0.7479187030233577, "grad_norm": 21.73251724243164, "learning_rate": 1.8139861593758903e-07, "logits/chosen": -1.5950464010238647, "logits/rejected": -1.5290101766586304, "logps/chosen": -1.873583197593689, "logps/rejected": -1.7633142471313477, "loss": 4.2461, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -18.7358341217041, "rewards/margins": -1.1026910543441772, "rewards/rejected": -17.633142471313477, "step": 22190 }, { "epoch": 0.7480872290943409, "grad_norm": 44.56904220581055, "learning_rate": 1.8117198267306394e-07, "logits/chosen": -1.9961559772491455, "logits/rejected": -2.7102062702178955, "logps/chosen": -2.2197203636169434, "logps/rejected": -2.4373273849487305, "loss": 2.4196, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.197200775146484, "rewards/margins": 2.176071882247925, "rewards/rejected": -24.373271942138672, "step": 22195 }, { "epoch": 0.748255755165324, "grad_norm": 22.732515335083008, "learning_rate": 1.8094545974235453e-07, "logits/chosen": -1.7332115173339844, "logits/rejected": -1.7054576873779297, "logps/chosen": -2.0260987281799316, "logps/rejected": -2.120617389678955, "loss": 2.4321, "rewards/accuracies": 0.5, "rewards/chosen": -20.260986328125, "rewards/margins": 0.9451854825019836, "rewards/rejected": -21.206172943115234, "step": 22200 }, { "epoch": 0.7484242812363072, "grad_norm": 134.5745391845703, "learning_rate": 1.8071904722385107e-07, "logits/chosen": -1.819849967956543, "logits/rejected": -1.9714431762695312, "logps/chosen": -3.182410717010498, "logps/rejected": -3.592799663543701, "loss": 2.2189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.824108123779297, "rewards/margins": 4.103890419006348, "rewards/rejected": -35.928001403808594, "step": 22205 }, { "epoch": 0.7485928073072904, "grad_norm": 19.461380004882812, "learning_rate": 1.8049274519590618e-07, "logits/chosen": -1.819338083267212, "logits/rejected": -2.1516706943511963, "logps/chosen": -2.2335047721862793, "logps/rejected": -2.5605053901672363, "loss": 3.4628, "rewards/accuracies": 0.5, "rewards/chosen": -22.335046768188477, "rewards/margins": 3.2700066566467285, "rewards/rejected": -25.605051040649414, "step": 22210 }, { "epoch": 0.7487613333782737, "grad_norm": 43.49894332885742, "learning_rate": 1.8026655373683407e-07, "logits/chosen": -1.8901605606079102, "logits/rejected": -2.1546928882598877, "logps/chosen": -2.9359912872314453, "logps/rejected": -3.4877407550811768, "loss": 2.6148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.359912872314453, "rewards/margins": 5.517492771148682, "rewards/rejected": -34.877403259277344, "step": 22215 }, { "epoch": 0.7489298594492568, "grad_norm": 39.32984924316406, "learning_rate": 1.8004047292491094e-07, "logits/chosen": -1.7076501846313477, "logits/rejected": -1.9654871225357056, "logps/chosen": -2.4198029041290283, "logps/rejected": -2.534273862838745, "loss": 2.2676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.198028564453125, "rewards/margins": 1.1447094678878784, "rewards/rejected": -25.34273910522461, "step": 22220 }, { "epoch": 0.74909838552024, "grad_norm": 20.094335556030273, "learning_rate": 1.79814502838374e-07, "logits/chosen": -1.2183277606964111, "logits/rejected": -1.6691389083862305, "logps/chosen": -2.220008373260498, "logps/rejected": -2.553109884262085, "loss": 2.2165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.200082778930664, "rewards/margins": 3.3310158252716064, "rewards/rejected": -25.531099319458008, "step": 22225 }, { "epoch": 0.7492669115912232, "grad_norm": 13.931550025939941, "learning_rate": 1.795886435554229e-07, "logits/chosen": -1.722768783569336, "logits/rejected": -1.9010473489761353, "logps/chosen": -2.112097978591919, "logps/rejected": -2.6837105751037598, "loss": 1.1627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.120981216430664, "rewards/margins": 5.71612548828125, "rewards/rejected": -26.837106704711914, "step": 22230 }, { "epoch": 0.7494354376622063, "grad_norm": 9.187200546264648, "learning_rate": 1.793628951542187e-07, "logits/chosen": -1.9483964443206787, "logits/rejected": -1.9009296894073486, "logps/chosen": -2.5094029903411865, "logps/rejected": -2.2897486686706543, "loss": 5.5848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.09402847290039, "rewards/margins": -2.1965441703796387, "rewards/rejected": -22.89748764038086, "step": 22235 }, { "epoch": 0.7496039637331895, "grad_norm": 52.516719818115234, "learning_rate": 1.7913725771288368e-07, "logits/chosen": -1.4216723442077637, "logits/rejected": -1.5206291675567627, "logps/chosen": -1.9325135946273804, "logps/rejected": -2.0983996391296387, "loss": 2.3383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.325138092041016, "rewards/margins": 1.6588608026504517, "rewards/rejected": -20.983993530273438, "step": 22240 }, { "epoch": 0.7497724898041727, "grad_norm": 28.921022415161133, "learning_rate": 1.7891173130950233e-07, "logits/chosen": -1.7567373514175415, "logits/rejected": -2.1790812015533447, "logps/chosen": -2.419783115386963, "logps/rejected": -2.6635146141052246, "loss": 1.936, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.197826385498047, "rewards/margins": 2.437316417694092, "rewards/rejected": -26.635147094726562, "step": 22245 }, { "epoch": 0.7499410158751559, "grad_norm": 28.339021682739258, "learning_rate": 1.7868631602212037e-07, "logits/chosen": -2.138418674468994, "logits/rejected": -2.2020459175109863, "logps/chosen": -3.1503100395202637, "logps/rejected": -3.557222366333008, "loss": 3.0416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.503103256225586, "rewards/margins": 4.069121360778809, "rewards/rejected": -35.57222366333008, "step": 22250 }, { "epoch": 0.7501095419461391, "grad_norm": 31.886789321899414, "learning_rate": 1.784610119287452e-07, "logits/chosen": -1.6445305347442627, "logits/rejected": -1.7116506099700928, "logps/chosen": -2.769115447998047, "logps/rejected": -2.8107247352600098, "loss": 3.139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.6911563873291, "rewards/margins": 0.41608962416648865, "rewards/rejected": -28.10724449157715, "step": 22255 }, { "epoch": 0.7502780680171223, "grad_norm": 46.03837966918945, "learning_rate": 1.7823581910734564e-07, "logits/chosen": -2.1681454181671143, "logits/rejected": -2.4093782901763916, "logps/chosen": -2.065337657928467, "logps/rejected": -2.368126392364502, "loss": 2.1653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.653377532958984, "rewards/margins": 3.027885913848877, "rewards/rejected": -23.681264877319336, "step": 22260 }, { "epoch": 0.7504465940881054, "grad_norm": 31.069231033325195, "learning_rate": 1.7801073763585227e-07, "logits/chosen": -1.8537037372589111, "logits/rejected": -2.0984387397766113, "logps/chosen": -1.594420313835144, "logps/rejected": -2.136216640472412, "loss": 1.6542, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.94420337677002, "rewards/margins": 5.417963981628418, "rewards/rejected": -21.362167358398438, "step": 22265 }, { "epoch": 0.7506151201590886, "grad_norm": 32.4260368347168, "learning_rate": 1.7778576759215663e-07, "logits/chosen": -1.5697544813156128, "logits/rejected": -1.641465187072754, "logps/chosen": -2.854722738265991, "logps/rejected": -2.992177724838257, "loss": 3.0165, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.547225952148438, "rewards/margins": 1.3745505809783936, "rewards/rejected": -29.921777725219727, "step": 22270 }, { "epoch": 0.7507836462300718, "grad_norm": 24.50908851623535, "learning_rate": 1.7756090905411204e-07, "logits/chosen": -1.6597118377685547, "logits/rejected": -2.5056252479553223, "logps/chosen": -2.06673002243042, "logps/rejected": -2.915621757507324, "loss": 1.316, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.667299270629883, "rewards/margins": 8.488917350769043, "rewards/rejected": -29.15621566772461, "step": 22275 }, { "epoch": 0.7509521723010549, "grad_norm": 16.406259536743164, "learning_rate": 1.7733616209953317e-07, "logits/chosen": -2.0685009956359863, "logits/rejected": -2.1869888305664062, "logps/chosen": -2.3974456787109375, "logps/rejected": -2.5520505905151367, "loss": 2.4664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.974454879760742, "rewards/margins": 1.5460479259490967, "rewards/rejected": -25.520503997802734, "step": 22280 }, { "epoch": 0.7511206983720382, "grad_norm": 22.887910842895508, "learning_rate": 1.7711152680619622e-07, "logits/chosen": -1.8368467092514038, "logits/rejected": -1.9409831762313843, "logps/chosen": -2.277331829071045, "logps/rejected": -2.750244617462158, "loss": 1.0961, "rewards/accuracies": 1.0, "rewards/chosen": -22.773319244384766, "rewards/margins": 4.729128837585449, "rewards/rejected": -27.5024471282959, "step": 22285 }, { "epoch": 0.7512892244430214, "grad_norm": 60.598419189453125, "learning_rate": 1.768870032518387e-07, "logits/chosen": -2.0507781505584717, "logits/rejected": -2.0986313819885254, "logps/chosen": -2.0975115299224854, "logps/rejected": -2.2042901515960693, "loss": 2.1705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.975116729736328, "rewards/margins": 1.067787766456604, "rewards/rejected": -22.04290199279785, "step": 22290 }, { "epoch": 0.7514577505140045, "grad_norm": 28.564823150634766, "learning_rate": 1.7666259151415908e-07, "logits/chosen": -1.376070261001587, "logits/rejected": -1.6946004629135132, "logps/chosen": -2.3124992847442627, "logps/rejected": -2.3266541957855225, "loss": 4.3577, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.124990463256836, "rewards/margins": 0.14155101776123047, "rewards/rejected": -23.26654052734375, "step": 22295 }, { "epoch": 0.7516262765849877, "grad_norm": 144.43304443359375, "learning_rate": 1.7643829167081746e-07, "logits/chosen": -2.3734333515167236, "logits/rejected": -2.6592297554016113, "logps/chosen": -2.635132312774658, "logps/rejected": -2.699852228164673, "loss": 3.4685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.351318359375, "rewards/margins": 0.647201657295227, "rewards/rejected": -26.998523712158203, "step": 22300 }, { "epoch": 0.7517948026559709, "grad_norm": 30.19963264465332, "learning_rate": 1.7621410379943551e-07, "logits/chosen": -1.405311942100525, "logits/rejected": -1.310832142829895, "logps/chosen": -2.3792216777801514, "logps/rejected": -2.3998732566833496, "loss": 3.3591, "rewards/accuracies": 0.5, "rewards/chosen": -23.792217254638672, "rewards/margins": 0.20651578903198242, "rewards/rejected": -23.998733520507812, "step": 22305 }, { "epoch": 0.751963328726954, "grad_norm": 73.10344696044922, "learning_rate": 1.7599002797759542e-07, "logits/chosen": -1.719347596168518, "logits/rejected": -2.036027431488037, "logps/chosen": -2.421614170074463, "logps/rejected": -3.365711212158203, "loss": 2.121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.216140747070312, "rewards/margins": 9.440972328186035, "rewards/rejected": -33.65711212158203, "step": 22310 }, { "epoch": 0.7521318547979372, "grad_norm": 25.012081146240234, "learning_rate": 1.7576606428284114e-07, "logits/chosen": -1.8271840810775757, "logits/rejected": -2.110874652862549, "logps/chosen": -2.605572462081909, "logps/rejected": -3.6537890434265137, "loss": 1.924, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.05572509765625, "rewards/margins": 10.48216438293457, "rewards/rejected": -36.53789138793945, "step": 22315 }, { "epoch": 0.7523003808689204, "grad_norm": 3.9147238731384277, "learning_rate": 1.7554221279267768e-07, "logits/chosen": -1.5303199291229248, "logits/rejected": -1.9406137466430664, "logps/chosen": -2.8770370483398438, "logps/rejected": -3.2161426544189453, "loss": 3.0375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -28.770370483398438, "rewards/margins": 3.391056776046753, "rewards/rejected": -32.16143035888672, "step": 22320 }, { "epoch": 0.7524689069399036, "grad_norm": 32.75675964355469, "learning_rate": 1.7531847358457148e-07, "logits/chosen": -1.7144191265106201, "logits/rejected": -2.0489983558654785, "logps/chosen": -2.0209603309631348, "logps/rejected": -2.873371124267578, "loss": 2.5432, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.20960235595703, "rewards/margins": 8.52410888671875, "rewards/rejected": -28.73370933532715, "step": 22325 }, { "epoch": 0.7526374330108868, "grad_norm": 141.016845703125, "learning_rate": 1.7509484673594938e-07, "logits/chosen": -1.8166608810424805, "logits/rejected": -1.8591245412826538, "logps/chosen": -2.8313632011413574, "logps/rejected": -2.9321489334106445, "loss": 3.0837, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.313629150390625, "rewards/margins": 1.0078595876693726, "rewards/rejected": -29.321491241455078, "step": 22330 }, { "epoch": 0.75280595908187, "grad_norm": 20.358863830566406, "learning_rate": 1.748713323242001e-07, "logits/chosen": -1.726928472518921, "logits/rejected": -1.9667476415634155, "logps/chosen": -3.285855531692505, "logps/rejected": -4.028448581695557, "loss": 3.8553, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.858558654785156, "rewards/margins": 7.425924777984619, "rewards/rejected": -40.28447723388672, "step": 22335 }, { "epoch": 0.7529744851528531, "grad_norm": 130.31146240234375, "learning_rate": 1.7464793042667337e-07, "logits/chosen": -1.8716411590576172, "logits/rejected": -2.0586330890655518, "logps/chosen": -2.2499797344207764, "logps/rejected": -2.5665526390075684, "loss": 4.8833, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.499799728393555, "rewards/margins": 3.165727376937866, "rewards/rejected": -25.66552734375, "step": 22340 }, { "epoch": 0.7531430112238363, "grad_norm": 44.93946838378906, "learning_rate": 1.7442464112067935e-07, "logits/chosen": -1.6285629272460938, "logits/rejected": -1.6942113637924194, "logps/chosen": -2.2660062313079834, "logps/rejected": -2.411550998687744, "loss": 3.1253, "rewards/accuracies": 0.5, "rewards/chosen": -22.66006088256836, "rewards/margins": 1.4554458856582642, "rewards/rejected": -24.115509033203125, "step": 22345 }, { "epoch": 0.7533115372948195, "grad_norm": 180.3627471923828, "learning_rate": 1.7420146448348982e-07, "logits/chosen": -1.4327377080917358, "logits/rejected": -1.430065393447876, "logps/chosen": -3.0352001190185547, "logps/rejected": -2.8664352893829346, "loss": 4.7893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.352001190185547, "rewards/margins": -1.6876468658447266, "rewards/rejected": -28.664356231689453, "step": 22350 }, { "epoch": 0.7534800633658026, "grad_norm": 30.93425750732422, "learning_rate": 1.7397840059233754e-07, "logits/chosen": -1.6150524616241455, "logits/rejected": -2.075601100921631, "logps/chosen": -2.180380344390869, "logps/rejected": -2.4855568408966064, "loss": 1.4605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.803804397583008, "rewards/margins": 3.051762580871582, "rewards/rejected": -24.855566024780273, "step": 22355 }, { "epoch": 0.7536485894367859, "grad_norm": 120.5937728881836, "learning_rate": 1.7375544952441628e-07, "logits/chosen": -1.124226450920105, "logits/rejected": -1.2850375175476074, "logps/chosen": -3.2305736541748047, "logps/rejected": -3.4257309436798096, "loss": 4.1787, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -32.30573654174805, "rewards/margins": 1.9515708684921265, "rewards/rejected": -34.25730895996094, "step": 22360 }, { "epoch": 0.7538171155077691, "grad_norm": 112.42485046386719, "learning_rate": 1.735326113568802e-07, "logits/chosen": -1.9617102146148682, "logits/rejected": -2.3836417198181152, "logps/chosen": -2.8195998668670654, "logps/rejected": -3.2562496662139893, "loss": 4.2705, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.196002960205078, "rewards/margins": 4.366497039794922, "rewards/rejected": -32.562496185302734, "step": 22365 }, { "epoch": 0.7539856415787523, "grad_norm": 34.154544830322266, "learning_rate": 1.7330988616684505e-07, "logits/chosen": -2.2110228538513184, "logits/rejected": -2.7860636711120605, "logps/chosen": -2.109682321548462, "logps/rejected": -2.5072388648986816, "loss": 1.4138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.09682273864746, "rewards/margins": 3.9755663871765137, "rewards/rejected": -25.072391510009766, "step": 22370 }, { "epoch": 0.7541541676497354, "grad_norm": 55.39371871948242, "learning_rate": 1.7308727403138734e-07, "logits/chosen": -0.8848699331283569, "logits/rejected": -0.8680378198623657, "logps/chosen": -3.3436615467071533, "logps/rejected": -3.356152057647705, "loss": 3.4747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.436614990234375, "rewards/margins": 0.12490396201610565, "rewards/rejected": -33.561519622802734, "step": 22375 }, { "epoch": 0.7543226937207186, "grad_norm": 33.278385162353516, "learning_rate": 1.7286477502754415e-07, "logits/chosen": -1.8989862203598022, "logits/rejected": -2.0420544147491455, "logps/chosen": -1.8241840600967407, "logps/rejected": -1.867881178855896, "loss": 2.7088, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.24184226989746, "rewards/margins": 0.43697088956832886, "rewards/rejected": -18.67881202697754, "step": 22380 }, { "epoch": 0.7544912197917018, "grad_norm": 50.16250228881836, "learning_rate": 1.7264238923231366e-07, "logits/chosen": -1.6946766376495361, "logits/rejected": -2.0134615898132324, "logps/chosen": -2.77992582321167, "logps/rejected": -2.884464979171753, "loss": 3.8556, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.79926109313965, "rewards/margins": 1.0453927516937256, "rewards/rejected": -28.844654083251953, "step": 22385 }, { "epoch": 0.7546597458626849, "grad_norm": 61.37696075439453, "learning_rate": 1.724201167226549e-07, "logits/chosen": -1.5504992008209229, "logits/rejected": -1.4514929056167603, "logps/chosen": -3.9573092460632324, "logps/rejected": -3.8085074424743652, "loss": 6.4388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -39.573097229003906, "rewards/margins": -1.488017201423645, "rewards/rejected": -38.085079193115234, "step": 22390 }, { "epoch": 0.7548282719336682, "grad_norm": 22.884544372558594, "learning_rate": 1.7219795757548778e-07, "logits/chosen": -1.9424610137939453, "logits/rejected": -2.200577974319458, "logps/chosen": -2.2687830924987793, "logps/rejected": -3.1191654205322266, "loss": 2.0236, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.68783187866211, "rewards/margins": 8.50382137298584, "rewards/rejected": -31.191654205322266, "step": 22395 }, { "epoch": 0.7549967980046514, "grad_norm": 48.091827392578125, "learning_rate": 1.7197591186769245e-07, "logits/chosen": -1.8664665222167969, "logits/rejected": -1.8764442205429077, "logps/chosen": -2.2680325508117676, "logps/rejected": -2.3142781257629395, "loss": 3.3744, "rewards/accuracies": 0.5, "rewards/chosen": -22.68032455444336, "rewards/margins": 0.4624575674533844, "rewards/rejected": -23.142780303955078, "step": 22400 }, { "epoch": 0.7549967980046514, "eval_logits/chosen": -2.2423062324523926, "eval_logits/rejected": -2.414647102355957, "eval_logps/chosen": -2.2602834701538086, "eval_logps/rejected": -2.4113729000091553, "eval_loss": 3.0775012969970703, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.602834701538086, "eval_rewards/margins": 1.5108985900878906, "eval_rewards/rejected": -24.113731384277344, "eval_runtime": 12.8925, "eval_samples_per_second": 7.756, "eval_steps_per_second": 1.939, "step": 22400 }, { "epoch": 0.7551653240756345, "grad_norm": 15.63908576965332, "learning_rate": 1.7175397967611043e-07, "logits/chosen": -1.7012121677398682, "logits/rejected": -2.0834169387817383, "logps/chosen": -2.3098204135894775, "logps/rejected": -2.574169158935547, "loss": 3.1278, "rewards/accuracies": 0.5, "rewards/chosen": -23.09820556640625, "rewards/margins": 2.643484592437744, "rewards/rejected": -25.741689682006836, "step": 22405 }, { "epoch": 0.7553338501466177, "grad_norm": 186.98109436035156, "learning_rate": 1.7153216107754365e-07, "logits/chosen": -1.6505035161972046, "logits/rejected": -1.967118263244629, "logps/chosen": -2.6132192611694336, "logps/rejected": -3.1035962104797363, "loss": 3.2179, "rewards/accuracies": 0.5, "rewards/chosen": -26.132190704345703, "rewards/margins": 4.903773307800293, "rewards/rejected": -31.035964965820312, "step": 22410 }, { "epoch": 0.7555023762176009, "grad_norm": 60.38026809692383, "learning_rate": 1.7131045614875484e-07, "logits/chosen": -1.731563925743103, "logits/rejected": -1.6696627140045166, "logps/chosen": -2.5402698516845703, "logps/rejected": -2.457115888595581, "loss": 4.8036, "rewards/accuracies": 0.5, "rewards/chosen": -25.402698516845703, "rewards/margins": -0.8315426111221313, "rewards/rejected": -24.571157455444336, "step": 22415 }, { "epoch": 0.755670902288584, "grad_norm": 122.55125427246094, "learning_rate": 1.710888649664673e-07, "logits/chosen": -1.7970136404037476, "logits/rejected": -1.8179333209991455, "logps/chosen": -2.197624683380127, "logps/rejected": -2.4766898155212402, "loss": 2.565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.976247787475586, "rewards/margins": 2.7906508445739746, "rewards/rejected": -24.766897201538086, "step": 22420 }, { "epoch": 0.7558394283595672, "grad_norm": 206.24832153320312, "learning_rate": 1.7086738760736497e-07, "logits/chosen": -2.3038604259490967, "logits/rejected": -2.688873052597046, "logps/chosen": -2.979055881500244, "logps/rejected": -3.692131757736206, "loss": 2.4169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.79056167602539, "rewards/margins": 7.130758762359619, "rewards/rejected": -36.92131805419922, "step": 22425 }, { "epoch": 0.7560079544305504, "grad_norm": 21.21005630493164, "learning_rate": 1.7064602414809266e-07, "logits/chosen": -1.6251780986785889, "logits/rejected": -2.547797203063965, "logps/chosen": -2.238882064819336, "logps/rejected": -2.8598296642303467, "loss": 2.0533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.388818740844727, "rewards/margins": 6.209475517272949, "rewards/rejected": -28.598297119140625, "step": 22430 }, { "epoch": 0.7561764805015336, "grad_norm": 23.16986656188965, "learning_rate": 1.7042477466525522e-07, "logits/chosen": -1.8212858438491821, "logits/rejected": -1.88141667842865, "logps/chosen": -2.4218106269836426, "logps/rejected": -2.696655750274658, "loss": 3.2363, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.21810531616211, "rewards/margins": 2.7484498023986816, "rewards/rejected": -26.9665584564209, "step": 22435 }, { "epoch": 0.7563450065725168, "grad_norm": 31.574495315551758, "learning_rate": 1.7020363923541853e-07, "logits/chosen": -1.8986423015594482, "logits/rejected": -1.8368682861328125, "logps/chosen": -3.057534694671631, "logps/rejected": -3.26399302482605, "loss": 3.2009, "rewards/accuracies": 0.5, "rewards/chosen": -30.57534408569336, "rewards/margins": 2.064589500427246, "rewards/rejected": -32.63993453979492, "step": 22440 }, { "epoch": 0.7565135326435, "grad_norm": 83.39399719238281, "learning_rate": 1.6998261793510898e-07, "logits/chosen": -2.201836109161377, "logits/rejected": -2.4175848960876465, "logps/chosen": -2.4501421451568604, "logps/rejected": -3.2026398181915283, "loss": 3.1793, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.501422882080078, "rewards/margins": 7.524975776672363, "rewards/rejected": -32.026397705078125, "step": 22445 }, { "epoch": 0.7566820587144831, "grad_norm": 39.38967514038086, "learning_rate": 1.6976171084081304e-07, "logits/chosen": -1.823545217514038, "logits/rejected": -2.4631857872009277, "logps/chosen": -2.3719120025634766, "logps/rejected": -3.7122325897216797, "loss": 1.0425, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.719120025634766, "rewards/margins": 13.403205871582031, "rewards/rejected": -37.1223258972168, "step": 22450 }, { "epoch": 0.7568505847854663, "grad_norm": 152.68515014648438, "learning_rate": 1.6954091802897807e-07, "logits/chosen": -1.5671319961547852, "logits/rejected": -1.5362260341644287, "logps/chosen": -2.3477840423583984, "logps/rejected": -2.3875017166137695, "loss": 3.1308, "rewards/accuracies": 0.5, "rewards/chosen": -23.47783851623535, "rewards/margins": 0.3971790373325348, "rewards/rejected": -23.875019073486328, "step": 22455 }, { "epoch": 0.7570191108564495, "grad_norm": 33.73069381713867, "learning_rate": 1.6932023957601187e-07, "logits/chosen": -1.9830052852630615, "logits/rejected": -1.470280647277832, "logps/chosen": -3.3719642162323, "logps/rejected": -3.146393299102783, "loss": 5.4149, "rewards/accuracies": 0.5, "rewards/chosen": -33.719642639160156, "rewards/margins": -2.2557122707366943, "rewards/rejected": -31.463932037353516, "step": 22460 }, { "epoch": 0.7571876369274326, "grad_norm": 26.94874382019043, "learning_rate": 1.6909967555828263e-07, "logits/chosen": -1.3639678955078125, "logits/rejected": -1.6912784576416016, "logps/chosen": -2.296612024307251, "logps/rejected": -2.5016751289367676, "loss": 2.0897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.96611976623535, "rewards/margins": 2.050632953643799, "rewards/rejected": -25.01675033569336, "step": 22465 }, { "epoch": 0.7573561629984159, "grad_norm": 44.422882080078125, "learning_rate": 1.6887922605211858e-07, "logits/chosen": -2.0387985706329346, "logits/rejected": -2.1799306869506836, "logps/chosen": -2.5838475227355957, "logps/rejected": -2.6631133556365967, "loss": 3.9902, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.83847427368164, "rewards/margins": 0.792662501335144, "rewards/rejected": -26.631134033203125, "step": 22470 }, { "epoch": 0.7575246890693991, "grad_norm": 43.87267303466797, "learning_rate": 1.686588911338087e-07, "logits/chosen": -1.1579951047897339, "logits/rejected": -1.377206563949585, "logps/chosen": -2.371851682662964, "logps/rejected": -2.8188374042510986, "loss": 2.9854, "rewards/accuracies": 0.5, "rewards/chosen": -23.718515396118164, "rewards/margins": 4.469855785369873, "rewards/rejected": -28.188369750976562, "step": 22475 }, { "epoch": 0.7576932151403822, "grad_norm": 35.04606628417969, "learning_rate": 1.6843867087960251e-07, "logits/chosen": -1.3556041717529297, "logits/rejected": -1.4663169384002686, "logps/chosen": -2.151585102081299, "logps/rejected": -2.2910666465759277, "loss": 3.1251, "rewards/accuracies": 0.5, "rewards/chosen": -21.515850067138672, "rewards/margins": 1.3948184251785278, "rewards/rejected": -22.910667419433594, "step": 22480 }, { "epoch": 0.7578617412113654, "grad_norm": 39.85191345214844, "learning_rate": 1.682185653657091e-07, "logits/chosen": -2.1124298572540283, "logits/rejected": -2.0933234691619873, "logps/chosen": -2.2313594818115234, "logps/rejected": -2.3498880863189697, "loss": 2.6802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.3135929107666, "rewards/margins": 1.1852903366088867, "rewards/rejected": -23.498882293701172, "step": 22485 }, { "epoch": 0.7580302672823486, "grad_norm": 54.735355377197266, "learning_rate": 1.6799857466829858e-07, "logits/chosen": -1.6174736022949219, "logits/rejected": -1.3525209426879883, "logps/chosen": -2.1450111865997314, "logps/rejected": -2.0265889167785645, "loss": 4.3679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.450109481811523, "rewards/margins": -1.1842209100723267, "rewards/rejected": -20.26589012145996, "step": 22490 }, { "epoch": 0.7581987933533317, "grad_norm": 24.917158126831055, "learning_rate": 1.6777869886350104e-07, "logits/chosen": -2.9526991844177246, "logits/rejected": -3.172231674194336, "logps/chosen": -2.476539134979248, "logps/rejected": -2.646613836288452, "loss": 3.8738, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.765390396118164, "rewards/margins": 1.7007482051849365, "rewards/rejected": -26.466136932373047, "step": 22495 }, { "epoch": 0.7583673194243149, "grad_norm": 23.80600357055664, "learning_rate": 1.67558938027407e-07, "logits/chosen": -2.386019229888916, "logits/rejected": -2.5371181964874268, "logps/chosen": -2.0828425884246826, "logps/rejected": -2.561619758605957, "loss": 1.7093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.828426361083984, "rewards/margins": 4.7877702713012695, "rewards/rejected": -25.616199493408203, "step": 22500 }, { "epoch": 0.7585358454952982, "grad_norm": 86.9273910522461, "learning_rate": 1.673392922360667e-07, "logits/chosen": -1.7680097818374634, "logits/rejected": -2.501622438430786, "logps/chosen": -2.6157374382019043, "logps/rejected": -3.2141737937927246, "loss": 2.935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.157373428344727, "rewards/margins": 5.984361171722412, "rewards/rejected": -32.14173889160156, "step": 22505 }, { "epoch": 0.7587043715662813, "grad_norm": 37.30070114135742, "learning_rate": 1.67119761565491e-07, "logits/chosen": -1.8771638870239258, "logits/rejected": -1.8926589488983154, "logps/chosen": -3.0613274574279785, "logps/rejected": -3.1823253631591797, "loss": 3.0842, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.6132755279541, "rewards/margins": 1.2099756002426147, "rewards/rejected": -31.823253631591797, "step": 22510 }, { "epoch": 0.7588728976372645, "grad_norm": 1.0162453651428223, "learning_rate": 1.669003460916511e-07, "logits/chosen": -1.4025744199752808, "logits/rejected": -1.448706030845642, "logps/chosen": -2.603241443634033, "logps/rejected": -2.6898093223571777, "loss": 3.7204, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.03241539001465, "rewards/margins": 0.8656784296035767, "rewards/rejected": -26.89809226989746, "step": 22515 }, { "epoch": 0.7590414237082477, "grad_norm": 23.106170654296875, "learning_rate": 1.666810458904776e-07, "logits/chosen": -2.604548692703247, "logits/rejected": -2.551825523376465, "logps/chosen": -2.1498825550079346, "logps/rejected": -2.1720776557922363, "loss": 3.3861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.49882698059082, "rewards/margins": 0.22195252776145935, "rewards/rejected": -21.720775604248047, "step": 22520 }, { "epoch": 0.7592099497792308, "grad_norm": 21.098031997680664, "learning_rate": 1.6646186103786187e-07, "logits/chosen": -1.3105382919311523, "logits/rejected": -2.053884983062744, "logps/chosen": -2.0351314544677734, "logps/rejected": -3.2010607719421387, "loss": 1.3937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.3513126373291, "rewards/margins": 11.659296035766602, "rewards/rejected": -32.01061248779297, "step": 22525 }, { "epoch": 0.759378475850214, "grad_norm": 33.70224380493164, "learning_rate": 1.6624279160965522e-07, "logits/chosen": -1.2020736932754517, "logits/rejected": -1.5865360498428345, "logps/chosen": -2.410283327102661, "logps/rejected": -3.008296489715576, "loss": 2.2057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.102832794189453, "rewards/margins": 5.980130195617676, "rewards/rejected": -30.082965850830078, "step": 22530 }, { "epoch": 0.7595470019211972, "grad_norm": 42.09781265258789, "learning_rate": 1.6602383768166895e-07, "logits/chosen": -0.8306490778923035, "logits/rejected": -1.1787729263305664, "logps/chosen": -2.244062662124634, "logps/rejected": -2.4677631855010986, "loss": 2.303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.440624237060547, "rewards/margins": 2.2370076179504395, "rewards/rejected": -24.67763328552246, "step": 22535 }, { "epoch": 0.7597155279921803, "grad_norm": 21.09007453918457, "learning_rate": 1.6580499932967424e-07, "logits/chosen": -2.110147476196289, "logits/rejected": -2.182302474975586, "logps/chosen": -3.0315299034118652, "logps/rejected": -3.077430248260498, "loss": 2.7172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.3153018951416, "rewards/margins": 0.45900383591651917, "rewards/rejected": -30.774303436279297, "step": 22540 }, { "epoch": 0.7598840540631636, "grad_norm": 53.60516357421875, "learning_rate": 1.6558627662940245e-07, "logits/chosen": -1.1278090476989746, "logits/rejected": -1.2264466285705566, "logps/chosen": -1.8437979221343994, "logps/rejected": -1.921600341796875, "loss": 2.8029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.43798065185547, "rewards/margins": 0.778022289276123, "rewards/rejected": -19.216001510620117, "step": 22545 }, { "epoch": 0.7600525801341468, "grad_norm": 45.398101806640625, "learning_rate": 1.6536766965654497e-07, "logits/chosen": -1.9748245477676392, "logits/rejected": -1.8975473642349243, "logps/chosen": -2.218376636505127, "logps/rejected": -2.4128360748291016, "loss": 4.6966, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.183765411376953, "rewards/margins": 1.9445937871932983, "rewards/rejected": -24.128360748291016, "step": 22550 }, { "epoch": 0.76022110620513, "grad_norm": 66.98893737792969, "learning_rate": 1.6514917848675302e-07, "logits/chosen": -1.7163118124008179, "logits/rejected": -2.033578395843506, "logps/chosen": -2.355811595916748, "logps/rejected": -2.6961662769317627, "loss": 2.6563, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.558116912841797, "rewards/margins": 3.4035465717315674, "rewards/rejected": -26.9616641998291, "step": 22555 }, { "epoch": 0.7603896322761131, "grad_norm": 41.96858596801758, "learning_rate": 1.6493080319563786e-07, "logits/chosen": -1.6755859851837158, "logits/rejected": -2.1102638244628906, "logps/chosen": -2.6356406211853027, "logps/rejected": -3.725564479827881, "loss": 1.5622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.356409072875977, "rewards/margins": 10.899238586425781, "rewards/rejected": -37.25564956665039, "step": 22560 }, { "epoch": 0.7605581583470963, "grad_norm": 24.43367576599121, "learning_rate": 1.6471254385877058e-07, "logits/chosen": -1.9659569263458252, "logits/rejected": -2.5237998962402344, "logps/chosen": -2.4973161220550537, "logps/rejected": -3.3989574909210205, "loss": 1.214, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.973163604736328, "rewards/margins": 9.016408920288086, "rewards/rejected": -33.98957061767578, "step": 22565 }, { "epoch": 0.7607266844180794, "grad_norm": 32.95793151855469, "learning_rate": 1.6449440055168197e-07, "logits/chosen": -1.3517484664916992, "logits/rejected": -1.5391706228256226, "logps/chosen": -2.0335590839385986, "logps/rejected": -2.13909649848938, "loss": 3.0144, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.335590362548828, "rewards/margins": 1.0553737878799438, "rewards/rejected": -21.39096450805664, "step": 22570 }, { "epoch": 0.7608952104890626, "grad_norm": 34.37199020385742, "learning_rate": 1.6427637334986295e-07, "logits/chosen": -2.1165642738342285, "logits/rejected": -2.1103622913360596, "logps/chosen": -2.2701714038848877, "logps/rejected": -2.4335379600524902, "loss": 4.2387, "rewards/accuracies": 0.5, "rewards/chosen": -22.701711654663086, "rewards/margins": 1.6336677074432373, "rewards/rejected": -24.335378646850586, "step": 22575 }, { "epoch": 0.7610637365600459, "grad_norm": 28.55931282043457, "learning_rate": 1.640584623287641e-07, "logits/chosen": -1.371361494064331, "logits/rejected": -1.538761019706726, "logps/chosen": -1.875836968421936, "logps/rejected": -1.9495811462402344, "loss": 2.4925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.758371353149414, "rewards/margins": 0.7374424338340759, "rewards/rejected": -19.495811462402344, "step": 22580 }, { "epoch": 0.761232262631029, "grad_norm": 25.89690399169922, "learning_rate": 1.6384066756379606e-07, "logits/chosen": -1.989154577255249, "logits/rejected": -2.138568878173828, "logps/chosen": -2.7180099487304688, "logps/rejected": -2.8691840171813965, "loss": 2.7374, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.180099487304688, "rewards/margins": 1.511740803718567, "rewards/rejected": -28.69183921813965, "step": 22585 }, { "epoch": 0.7614007887020122, "grad_norm": 69.94366455078125, "learning_rate": 1.6362298913032861e-07, "logits/chosen": -1.814008355140686, "logits/rejected": -1.816664695739746, "logps/chosen": -2.587378978729248, "logps/rejected": -2.3037238121032715, "loss": 5.8606, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -25.873790740966797, "rewards/margins": -2.836550712585449, "rewards/rejected": -23.03723907470703, "step": 22590 }, { "epoch": 0.7615693147729954, "grad_norm": 28.40800666809082, "learning_rate": 1.6340542710369193e-07, "logits/chosen": -1.4182841777801514, "logits/rejected": -1.7591663599014282, "logps/chosen": -1.932254433631897, "logps/rejected": -2.272496461868286, "loss": 3.8638, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -19.32254409790039, "rewards/margins": 3.402418851852417, "rewards/rejected": -22.724964141845703, "step": 22595 }, { "epoch": 0.7617378408439786, "grad_norm": 14.029182434082031, "learning_rate": 1.631879815591758e-07, "logits/chosen": -1.8987289667129517, "logits/rejected": -2.287255048751831, "logps/chosen": -2.929877996444702, "logps/rejected": -4.1070146560668945, "loss": 2.4117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.298778533935547, "rewards/margins": 11.7713623046875, "rewards/rejected": -41.07014083862305, "step": 22600 }, { "epoch": 0.7619063669149617, "grad_norm": 8.976080894470215, "learning_rate": 1.6297065257202924e-07, "logits/chosen": -1.4358826875686646, "logits/rejected": -1.8369331359863281, "logps/chosen": -2.0056796073913574, "logps/rejected": -2.3201420307159424, "loss": 1.7562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.05679702758789, "rewards/margins": 3.1446213722229004, "rewards/rejected": -23.201419830322266, "step": 22605 }, { "epoch": 0.7620748929859449, "grad_norm": 97.5491714477539, "learning_rate": 1.6275344021746135e-07, "logits/chosen": -1.5037356615066528, "logits/rejected": -1.7426496744155884, "logps/chosen": -2.2114577293395996, "logps/rejected": -2.473027467727661, "loss": 2.3805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.11457633972168, "rewards/margins": 2.6156959533691406, "rewards/rejected": -24.730274200439453, "step": 22610 }, { "epoch": 0.7622434190569282, "grad_norm": 42.989097595214844, "learning_rate": 1.6253634457064085e-07, "logits/chosen": -1.2196903228759766, "logits/rejected": -1.2983381748199463, "logps/chosen": -2.3565046787261963, "logps/rejected": -2.637000560760498, "loss": 2.1788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.565048217773438, "rewards/margins": 2.804961681365967, "rewards/rejected": -26.370006561279297, "step": 22615 }, { "epoch": 0.7624119451279113, "grad_norm": 14.925318717956543, "learning_rate": 1.6231936570669614e-07, "logits/chosen": -1.4819129705429077, "logits/rejected": -1.4523422718048096, "logps/chosen": -1.8426742553710938, "logps/rejected": -2.0914957523345947, "loss": 2.7286, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.426740646362305, "rewards/margins": 2.4882164001464844, "rewards/rejected": -20.91495704650879, "step": 22620 }, { "epoch": 0.7625804711988945, "grad_norm": 14.706117630004883, "learning_rate": 1.6210250370071465e-07, "logits/chosen": -2.005725860595703, "logits/rejected": -1.8718922138214111, "logps/chosen": -2.3563499450683594, "logps/rejected": -2.945220708847046, "loss": 0.9005, "rewards/accuracies": 1.0, "rewards/chosen": -23.563499450683594, "rewards/margins": 5.888709545135498, "rewards/rejected": -29.45220947265625, "step": 22625 }, { "epoch": 0.7627489972698777, "grad_norm": 81.91179656982422, "learning_rate": 1.6188575862774405e-07, "logits/chosen": -1.7499510049819946, "logits/rejected": -1.71432626247406, "logps/chosen": -2.5580358505249023, "logps/rejected": -2.9207465648651123, "loss": 2.9396, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.580360412597656, "rewards/margins": 3.6271071434020996, "rewards/rejected": -29.207468032836914, "step": 22630 }, { "epoch": 0.7629175233408608, "grad_norm": 3.5403726617033726e-09, "learning_rate": 1.6166913056279136e-07, "logits/chosen": -1.9457800388336182, "logits/rejected": -2.086674928665161, "logps/chosen": -3.528853178024292, "logps/rejected": -4.209619045257568, "loss": 2.6988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.288536071777344, "rewards/margins": 6.807660102844238, "rewards/rejected": -42.09619140625, "step": 22635 }, { "epoch": 0.763086049411844, "grad_norm": 86.31592559814453, "learning_rate": 1.6145261958082273e-07, "logits/chosen": -1.683431625366211, "logits/rejected": -2.1868739128112793, "logps/chosen": -2.386417865753174, "logps/rejected": -2.968527317047119, "loss": 2.3259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.864177703857422, "rewards/margins": 5.8210954666137695, "rewards/rejected": -29.685272216796875, "step": 22640 }, { "epoch": 0.7632545754828272, "grad_norm": 28.384031295776367, "learning_rate": 1.6123622575676422e-07, "logits/chosen": -1.5124573707580566, "logits/rejected": -1.656079649925232, "logps/chosen": -2.3514010906219482, "logps/rejected": -2.981489896774292, "loss": 2.9862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.514013290405273, "rewards/margins": 6.300887584686279, "rewards/rejected": -29.81490135192871, "step": 22645 }, { "epoch": 0.7634231015538103, "grad_norm": 44.00370407104492, "learning_rate": 1.610199491655012e-07, "logits/chosen": -1.9806187152862549, "logits/rejected": -2.0687432289123535, "logps/chosen": -2.1002681255340576, "logps/rejected": -2.3335559368133545, "loss": 2.0197, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.002683639526367, "rewards/margins": 2.3328773975372314, "rewards/rejected": -23.335561752319336, "step": 22650 }, { "epoch": 0.7635916276247936, "grad_norm": 1.1379077434539795, "learning_rate": 1.608037898818787e-07, "logits/chosen": -1.4696893692016602, "logits/rejected": -1.8817825317382812, "logps/chosen": -2.316413402557373, "logps/rejected": -2.648519992828369, "loss": 2.2046, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.16413688659668, "rewards/margins": 3.3210597038269043, "rewards/rejected": -26.48519515991211, "step": 22655 }, { "epoch": 0.7637601536957768, "grad_norm": 26.41794776916504, "learning_rate": 1.605877479807005e-07, "logits/chosen": -1.6426475048065186, "logits/rejected": -2.1441574096679688, "logps/chosen": -1.9908136129379272, "logps/rejected": -2.6799676418304443, "loss": 1.9165, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.90813636779785, "rewards/margins": 6.891541957855225, "rewards/rejected": -26.799678802490234, "step": 22660 }, { "epoch": 0.7639286797667599, "grad_norm": 28.51700782775879, "learning_rate": 1.6037182353673044e-07, "logits/chosen": -1.8227428197860718, "logits/rejected": -1.5971324443817139, "logps/chosen": -2.399543285369873, "logps/rejected": -2.171342372894287, "loss": 5.7566, "rewards/accuracies": 0.5, "rewards/chosen": -23.995433807373047, "rewards/margins": -2.2820091247558594, "rewards/rejected": -21.71342658996582, "step": 22665 }, { "epoch": 0.7640972058377431, "grad_norm": 36.324859619140625, "learning_rate": 1.6015601662469164e-07, "logits/chosen": -1.6870263814926147, "logits/rejected": -1.814182996749878, "logps/chosen": -2.6212759017944336, "logps/rejected": -3.208042860031128, "loss": 2.8824, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.212759017944336, "rewards/margins": 5.867671966552734, "rewards/rejected": -32.08042907714844, "step": 22670 }, { "epoch": 0.7642657319087263, "grad_norm": 27.71980094909668, "learning_rate": 1.59940327319266e-07, "logits/chosen": -1.6754287481307983, "logits/rejected": -1.8766758441925049, "logps/chosen": -2.4628703594207764, "logps/rejected": -2.6281509399414062, "loss": 4.069, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.62870216369629, "rewards/margins": 1.6528069972991943, "rewards/rejected": -26.281509399414062, "step": 22675 }, { "epoch": 0.7644342579797094, "grad_norm": 16.1252498626709, "learning_rate": 1.597247556950952e-07, "logits/chosen": -2.084392547607422, "logits/rejected": -2.031343460083008, "logps/chosen": -2.1783878803253174, "logps/rejected": -2.549133062362671, "loss": 2.1847, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.78388023376465, "rewards/margins": 3.7074522972106934, "rewards/rejected": -25.491331100463867, "step": 22680 }, { "epoch": 0.7646027840506926, "grad_norm": 29.193500518798828, "learning_rate": 1.595093018267802e-07, "logits/chosen": -1.4438226222991943, "logits/rejected": -1.5522502660751343, "logps/chosen": -2.001771926879883, "logps/rejected": -2.1155412197113037, "loss": 3.0072, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.017719268798828, "rewards/margins": 1.1376922130584717, "rewards/rejected": -21.155412673950195, "step": 22685 }, { "epoch": 0.7647713101216759, "grad_norm": 22.629920959472656, "learning_rate": 1.59293965788881e-07, "logits/chosen": -1.7114652395248413, "logits/rejected": -1.8377281427383423, "logps/chosen": -1.7160522937774658, "logps/rejected": -1.9835751056671143, "loss": 1.5033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.160524368286133, "rewards/margins": 2.6752288341522217, "rewards/rejected": -19.835750579833984, "step": 22690 }, { "epoch": 0.764939836192659, "grad_norm": 31.570363998413086, "learning_rate": 1.5907874765591717e-07, "logits/chosen": -1.819737434387207, "logits/rejected": -2.656897783279419, "logps/chosen": -2.095639705657959, "logps/rejected": -2.8702781200408936, "loss": 1.6942, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.956396102905273, "rewards/margins": 7.7463860511779785, "rewards/rejected": -28.702783584594727, "step": 22695 }, { "epoch": 0.7651083622636422, "grad_norm": 29.860578536987305, "learning_rate": 1.588636475023668e-07, "logits/chosen": -1.793116569519043, "logits/rejected": -1.9782568216323853, "logps/chosen": -3.1871731281280518, "logps/rejected": -3.6866703033447266, "loss": 1.7431, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.87173080444336, "rewards/margins": 4.9949727058410645, "rewards/rejected": -36.86670684814453, "step": 22700 }, { "epoch": 0.7652768883346254, "grad_norm": 29.30422592163086, "learning_rate": 1.586486654026678e-07, "logits/chosen": -1.2873570919036865, "logits/rejected": -2.1461853981018066, "logps/chosen": -2.487175464630127, "logps/rejected": -3.387939929962158, "loss": 2.3507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.871755599975586, "rewards/margins": 9.007641792297363, "rewards/rejected": -33.87939453125, "step": 22705 }, { "epoch": 0.7654454144056085, "grad_norm": 14.670337677001953, "learning_rate": 1.5843380143121703e-07, "logits/chosen": -1.8407291173934937, "logits/rejected": -1.8218971490859985, "logps/chosen": -2.770521640777588, "logps/rejected": -2.9160516262054443, "loss": 3.4715, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.705215454101562, "rewards/margins": 1.455300211906433, "rewards/rejected": -29.1605167388916, "step": 22710 }, { "epoch": 0.7656139404765917, "grad_norm": 36.592506408691406, "learning_rate": 1.5821905566237038e-07, "logits/chosen": -2.127350330352783, "logits/rejected": -2.3790395259857178, "logps/chosen": -2.19892954826355, "logps/rejected": -2.3953166007995605, "loss": 2.4215, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.989294052124023, "rewards/margins": 1.9638723134994507, "rewards/rejected": -23.953166961669922, "step": 22715 }, { "epoch": 0.7657824665475749, "grad_norm": 19.594825744628906, "learning_rate": 1.5800442817044297e-07, "logits/chosen": -2.041064739227295, "logits/rejected": -2.1058108806610107, "logps/chosen": -2.9583261013031006, "logps/rejected": -3.4392478466033936, "loss": 1.9416, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.583261489868164, "rewards/margins": 4.809213161468506, "rewards/rejected": -34.39247512817383, "step": 22720 }, { "epoch": 0.7659509926185581, "grad_norm": 0.45482581853866577, "learning_rate": 1.57789919029709e-07, "logits/chosen": -1.9175220727920532, "logits/rejected": -2.370159387588501, "logps/chosen": -2.3254458904266357, "logps/rejected": -3.4092516899108887, "loss": 1.4107, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.25446128845215, "rewards/margins": 10.838057518005371, "rewards/rejected": -34.09252166748047, "step": 22725 }, { "epoch": 0.7661195186895413, "grad_norm": 22.42098617553711, "learning_rate": 1.5757552831440141e-07, "logits/chosen": -2.0766656398773193, "logits/rejected": -2.1389594078063965, "logps/chosen": -2.143092393875122, "logps/rejected": -2.2686245441436768, "loss": 2.7787, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.430923461914062, "rewards/margins": 1.2553188800811768, "rewards/rejected": -22.68623924255371, "step": 22730 }, { "epoch": 0.7662880447605245, "grad_norm": 48.564796447753906, "learning_rate": 1.5736125609871243e-07, "logits/chosen": -2.0202689170837402, "logits/rejected": -2.1596198081970215, "logps/chosen": -2.347080945968628, "logps/rejected": -2.568044662475586, "loss": 2.9386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.470806121826172, "rewards/margins": 2.2096376419067383, "rewards/rejected": -25.680444717407227, "step": 22735 }, { "epoch": 0.7664565708315076, "grad_norm": 25.688514709472656, "learning_rate": 1.5714710245679346e-07, "logits/chosen": -1.8988473415374756, "logits/rejected": -2.291731119155884, "logps/chosen": -1.5387709140777588, "logps/rejected": -1.7353506088256836, "loss": 1.9236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -15.38770866394043, "rewards/margins": 1.9657974243164062, "rewards/rejected": -17.353506088256836, "step": 22740 }, { "epoch": 0.7666250969024908, "grad_norm": 42.179935455322266, "learning_rate": 1.5693306746275432e-07, "logits/chosen": -1.7268253564834595, "logits/rejected": -1.8562453985214233, "logps/chosen": -2.071321487426758, "logps/rejected": -2.1327714920043945, "loss": 3.8071, "rewards/accuracies": 0.5, "rewards/chosen": -20.713214874267578, "rewards/margins": 0.6145005226135254, "rewards/rejected": -21.327716827392578, "step": 22745 }, { "epoch": 0.766793622973474, "grad_norm": 25.364154815673828, "learning_rate": 1.5671915119066426e-07, "logits/chosen": -1.301816701889038, "logits/rejected": -1.5432207584381104, "logps/chosen": -2.3390283584594727, "logps/rejected": -2.549079418182373, "loss": 2.4451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.390283584594727, "rewards/margins": 2.1005122661590576, "rewards/rejected": -25.490795135498047, "step": 22750 }, { "epoch": 0.7669621490444571, "grad_norm": 33.825782775878906, "learning_rate": 1.565053537145512e-07, "logits/chosen": -1.6760823726654053, "logits/rejected": -1.7630828619003296, "logps/chosen": -2.232607364654541, "logps/rejected": -2.3341293334960938, "loss": 3.0354, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.32607078552246, "rewards/margins": 1.015222191810608, "rewards/rejected": -23.341297149658203, "step": 22755 }, { "epoch": 0.7671306751154403, "grad_norm": 15.665143966674805, "learning_rate": 1.5629167510840224e-07, "logits/chosen": -1.711958646774292, "logits/rejected": -1.994261384010315, "logps/chosen": -2.732503890991211, "logps/rejected": -3.117694139480591, "loss": 1.5793, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.32503890991211, "rewards/margins": 3.8519043922424316, "rewards/rejected": -31.17694091796875, "step": 22760 }, { "epoch": 0.7672992011864236, "grad_norm": 31.382097244262695, "learning_rate": 1.560781154461628e-07, "logits/chosen": -1.7627441883087158, "logits/rejected": -1.7876489162445068, "logps/chosen": -2.144718885421753, "logps/rejected": -2.2064361572265625, "loss": 2.6592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.447189331054688, "rewards/margins": 0.6171743273735046, "rewards/rejected": -22.064361572265625, "step": 22765 }, { "epoch": 0.7674677272574068, "grad_norm": 28.25269889831543, "learning_rate": 1.5586467480173766e-07, "logits/chosen": -2.0783777236938477, "logits/rejected": -2.0805037021636963, "logps/chosen": -2.2239603996276855, "logps/rejected": -2.4322445392608643, "loss": 2.3538, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.239604949951172, "rewards/margins": 2.082839012145996, "rewards/rejected": -24.322444915771484, "step": 22770 }, { "epoch": 0.7676362533283899, "grad_norm": 26.404125213623047, "learning_rate": 1.5565135324899026e-07, "logits/chosen": -2.1612420082092285, "logits/rejected": -2.238856077194214, "logps/chosen": -2.175217628479004, "logps/rejected": -2.4848060607910156, "loss": 2.6324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.752174377441406, "rewards/margins": 3.0958826541900635, "rewards/rejected": -24.848058700561523, "step": 22775 }, { "epoch": 0.7678047793993731, "grad_norm": 40.29751968383789, "learning_rate": 1.554381508617426e-07, "logits/chosen": -2.263197660446167, "logits/rejected": -2.3222174644470215, "logps/chosen": -2.2314186096191406, "logps/rejected": -2.469470262527466, "loss": 1.9906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.31418800354004, "rewards/margins": 2.3805153369903564, "rewards/rejected": -24.6947021484375, "step": 22780 }, { "epoch": 0.7679733054703562, "grad_norm": 22.63186264038086, "learning_rate": 1.5522506771377576e-07, "logits/chosen": -1.6402429342269897, "logits/rejected": -1.5860573053359985, "logps/chosen": -1.707369089126587, "logps/rejected": -1.9794480800628662, "loss": 1.7679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.07369041442871, "rewards/margins": 2.720792055130005, "rewards/rejected": -19.794483184814453, "step": 22785 }, { "epoch": 0.7681418315413394, "grad_norm": 52.35350036621094, "learning_rate": 1.5501210387882933e-07, "logits/chosen": -1.4197752475738525, "logits/rejected": -1.8860286474227905, "logps/chosen": -2.8877930641174316, "logps/rejected": -3.428576707839966, "loss": 1.8367, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.8779296875, "rewards/margins": 5.4078369140625, "rewards/rejected": -34.2857666015625, "step": 22790 }, { "epoch": 0.7683103576123226, "grad_norm": 26.972814559936523, "learning_rate": 1.5479925943060195e-07, "logits/chosen": -1.6417248249053955, "logits/rejected": -1.6887985467910767, "logps/chosen": -2.1866393089294434, "logps/rejected": -2.9747371673583984, "loss": 2.9489, "rewards/accuracies": 0.5, "rewards/chosen": -21.86639404296875, "rewards/margins": 7.880978584289551, "rewards/rejected": -29.747371673583984, "step": 22795 }, { "epoch": 0.7684788836833059, "grad_norm": 21.10647964477539, "learning_rate": 1.5458653444275038e-07, "logits/chosen": -1.5778142213821411, "logits/rejected": -1.9015181064605713, "logps/chosen": -2.6732568740844727, "logps/rejected": -3.028304100036621, "loss": 1.9708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.732568740844727, "rewards/margins": 3.550471544265747, "rewards/rejected": -30.283039093017578, "step": 22800 }, { "epoch": 0.7684788836833059, "eval_logits/chosen": -2.248206853866577, "eval_logits/rejected": -2.421609878540039, "eval_logps/chosen": -2.2624881267547607, "eval_logps/rejected": -2.4147942066192627, "eval_loss": 3.0768299102783203, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.624879837036133, "eval_rewards/margins": 1.5230610370635986, "eval_rewards/rejected": -24.1479434967041, "eval_runtime": 12.9007, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 22800 }, { "epoch": 0.768647409754289, "grad_norm": 24.942798614501953, "learning_rate": 1.5437392898889046e-07, "logits/chosen": -1.7756656408309937, "logits/rejected": -1.8584327697753906, "logps/chosen": -1.8894437551498413, "logps/rejected": -1.8411388397216797, "loss": 3.5599, "rewards/accuracies": 0.5, "rewards/chosen": -18.894437789916992, "rewards/margins": -0.4830484390258789, "rewards/rejected": -18.411388397216797, "step": 22805 }, { "epoch": 0.7688159358252722, "grad_norm": 39.55795669555664, "learning_rate": 1.5416144314259677e-07, "logits/chosen": -1.845928430557251, "logits/rejected": -2.0967960357666016, "logps/chosen": -2.7359180450439453, "logps/rejected": -2.971412420272827, "loss": 1.9504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.359180450439453, "rewards/margins": 2.354943037033081, "rewards/rejected": -29.714122772216797, "step": 22810 }, { "epoch": 0.7689844618962554, "grad_norm": 131.97705078125, "learning_rate": 1.5394907697740194e-07, "logits/chosen": -1.8618195056915283, "logits/rejected": -2.1782288551330566, "logps/chosen": -2.682870388031006, "logps/rejected": -2.9631354808807373, "loss": 1.5972, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.82870864868164, "rewards/margins": 2.8026511669158936, "rewards/rejected": -29.631357192993164, "step": 22815 }, { "epoch": 0.7691529879672385, "grad_norm": 0.32372578978538513, "learning_rate": 1.537368305667977e-07, "logits/chosen": -2.0023844242095947, "logits/rejected": -2.6936569213867188, "logps/chosen": -2.029672145843506, "logps/rejected": -2.9066014289855957, "loss": 2.5648, "rewards/accuracies": 0.5, "rewards/chosen": -20.296720504760742, "rewards/margins": 8.769292831420898, "rewards/rejected": -29.066015243530273, "step": 22820 }, { "epoch": 0.7693215140382217, "grad_norm": 58.08989334106445, "learning_rate": 1.5352470398423423e-07, "logits/chosen": -2.2171883583068848, "logits/rejected": -2.2867395877838135, "logps/chosen": -3.2307028770446777, "logps/rejected": -3.6406402587890625, "loss": 2.1023, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -32.307029724121094, "rewards/margins": 4.09937047958374, "rewards/rejected": -36.406402587890625, "step": 22825 }, { "epoch": 0.7694900401092049, "grad_norm": 36.715049743652344, "learning_rate": 1.5331269730312025e-07, "logits/chosen": -1.6262295246124268, "logits/rejected": -2.023651361465454, "logps/chosen": -2.8353304862976074, "logps/rejected": -3.7760651111602783, "loss": 1.2619, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.35330581665039, "rewards/margins": 9.407342910766602, "rewards/rejected": -37.76064682006836, "step": 22830 }, { "epoch": 0.769658566180188, "grad_norm": 34.941463470458984, "learning_rate": 1.531008105968226e-07, "logits/chosen": -1.6637403964996338, "logits/rejected": -2.284741163253784, "logps/chosen": -1.8302547931671143, "logps/rejected": -2.269709825515747, "loss": 2.3833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.302549362182617, "rewards/margins": 4.394549369812012, "rewards/rejected": -22.697097778320312, "step": 22835 }, { "epoch": 0.7698270922511713, "grad_norm": 26.151050567626953, "learning_rate": 1.528890439386672e-07, "logits/chosen": -1.7768512964248657, "logits/rejected": -1.8191230297088623, "logps/chosen": -2.1474437713623047, "logps/rejected": -2.3349575996398926, "loss": 2.3431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.474435806274414, "rewards/margins": 1.8751392364501953, "rewards/rejected": -23.34957504272461, "step": 22840 }, { "epoch": 0.7699956183221545, "grad_norm": 35.694366455078125, "learning_rate": 1.5267739740193801e-07, "logits/chosen": -2.0323410034179688, "logits/rejected": -2.495180606842041, "logps/chosen": -2.026014804840088, "logps/rejected": -2.490914821624756, "loss": 2.4506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.260150909423828, "rewards/margins": 4.648998260498047, "rewards/rejected": -24.909147262573242, "step": 22845 }, { "epoch": 0.7701641443931376, "grad_norm": 26.962989807128906, "learning_rate": 1.5246587105987762e-07, "logits/chosen": -1.271196961402893, "logits/rejected": -1.243116021156311, "logps/chosen": -2.028836965560913, "logps/rejected": -2.181765079498291, "loss": 2.8807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.28837013244629, "rewards/margins": 1.5292824506759644, "rewards/rejected": -21.817651748657227, "step": 22850 }, { "epoch": 0.7703326704641208, "grad_norm": 35.68492889404297, "learning_rate": 1.5225446498568694e-07, "logits/chosen": -1.7652562856674194, "logits/rejected": -2.1971378326416016, "logps/chosen": -2.2682933807373047, "logps/rejected": -2.5781054496765137, "loss": 2.8534, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.682933807373047, "rewards/margins": 3.0981197357177734, "rewards/rejected": -25.781055450439453, "step": 22855 }, { "epoch": 0.770501196535104, "grad_norm": 50.4202880859375, "learning_rate": 1.5204317925252553e-07, "logits/chosen": -1.9249420166015625, "logits/rejected": -2.014530658721924, "logps/chosen": -2.3662045001983643, "logps/rejected": -2.5241458415985107, "loss": 3.5158, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.662044525146484, "rewards/margins": 1.5794137716293335, "rewards/rejected": -25.241458892822266, "step": 22860 }, { "epoch": 0.7706697226060871, "grad_norm": 4.7089128494262695, "learning_rate": 1.5183201393351064e-07, "logits/chosen": -1.807294487953186, "logits/rejected": -1.8672775030136108, "logps/chosen": -2.128382921218872, "logps/rejected": -2.1345269680023193, "loss": 3.4027, "rewards/accuracies": 0.5, "rewards/chosen": -21.283828735351562, "rewards/margins": 0.061441611498594284, "rewards/rejected": -21.34527015686035, "step": 22865 }, { "epoch": 0.7708382486770703, "grad_norm": 17.503753662109375, "learning_rate": 1.516209691017184e-07, "logits/chosen": -1.9709889888763428, "logits/rejected": -2.1105175018310547, "logps/chosen": -2.447917938232422, "logps/rejected": -2.793539047241211, "loss": 1.2948, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.47917938232422, "rewards/margins": 3.456209182739258, "rewards/rejected": -27.935388565063477, "step": 22870 }, { "epoch": 0.7710067747480536, "grad_norm": 42.22770690917969, "learning_rate": 1.5141004483018322e-07, "logits/chosen": -2.3059794902801514, "logits/rejected": -2.0448668003082275, "logps/chosen": -3.028981924057007, "logps/rejected": -3.5030417442321777, "loss": 4.0377, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.289819717407227, "rewards/margins": 4.740601539611816, "rewards/rejected": -35.030418395996094, "step": 22875 }, { "epoch": 0.7711753008190367, "grad_norm": 29.942577362060547, "learning_rate": 1.511992411918978e-07, "logits/chosen": -2.524641513824463, "logits/rejected": -2.246544361114502, "logps/chosen": -2.187622547149658, "logps/rejected": -2.3799984455108643, "loss": 2.3435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.876224517822266, "rewards/margins": 1.9237569570541382, "rewards/rejected": -23.79998207092285, "step": 22880 }, { "epoch": 0.7713438268900199, "grad_norm": 0.0972161665558815, "learning_rate": 1.509885582598126e-07, "logits/chosen": -1.412776231765747, "logits/rejected": -2.6804697513580322, "logps/chosen": -2.9052116870880127, "logps/rejected": -3.8140385150909424, "loss": 1.1475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.0521183013916, "rewards/margins": 9.088268280029297, "rewards/rejected": -38.140384674072266, "step": 22885 }, { "epoch": 0.7715123529610031, "grad_norm": 28.691585540771484, "learning_rate": 1.5077799610683694e-07, "logits/chosen": -2.3171160221099854, "logits/rejected": -2.5718681812286377, "logps/chosen": -2.6705522537231445, "logps/rejected": -2.6094350814819336, "loss": 3.9495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.705524444580078, "rewards/margins": -0.6111720204353333, "rewards/rejected": -26.094350814819336, "step": 22890 }, { "epoch": 0.7716808790319862, "grad_norm": 18.407123565673828, "learning_rate": 1.50567554805838e-07, "logits/chosen": -1.6417083740234375, "logits/rejected": -1.7448304891586304, "logps/chosen": -1.9469597339630127, "logps/rejected": -1.9432523250579834, "loss": 3.5781, "rewards/accuracies": 0.5, "rewards/chosen": -19.46959686279297, "rewards/margins": -0.037073612213134766, "rewards/rejected": -19.432523727416992, "step": 22895 }, { "epoch": 0.7718494051029694, "grad_norm": 207.2261962890625, "learning_rate": 1.5035723442964137e-07, "logits/chosen": -1.3821115493774414, "logits/rejected": -1.5796051025390625, "logps/chosen": -2.5829050540924072, "logps/rejected": -2.776493787765503, "loss": 2.1161, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.829050064086914, "rewards/margins": 1.9358898401260376, "rewards/rejected": -27.764938354492188, "step": 22900 }, { "epoch": 0.7720179311739526, "grad_norm": 18.419729232788086, "learning_rate": 1.5014703505103042e-07, "logits/chosen": -1.5121935606002808, "logits/rejected": -2.028642177581787, "logps/chosen": -2.0915684700012207, "logps/rejected": -2.209536075592041, "loss": 2.8085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.91568374633789, "rewards/margins": 1.1796770095825195, "rewards/rejected": -22.095359802246094, "step": 22905 }, { "epoch": 0.7721864572449358, "grad_norm": 17.541202545166016, "learning_rate": 1.4993695674274697e-07, "logits/chosen": -1.6946613788604736, "logits/rejected": -1.8450828790664673, "logps/chosen": -2.4303412437438965, "logps/rejected": -2.487248182296753, "loss": 2.9624, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.30341148376465, "rewards/margins": 0.5690677762031555, "rewards/rejected": -24.872478485107422, "step": 22910 }, { "epoch": 0.772354983315919, "grad_norm": 30.39205551147461, "learning_rate": 1.4972699957749102e-07, "logits/chosen": -1.9528687000274658, "logits/rejected": -1.8657668828964233, "logps/chosen": -2.3502016067504883, "logps/rejected": -2.5009114742279053, "loss": 2.6588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.502017974853516, "rewards/margins": 1.5070993900299072, "rewards/rejected": -25.00911521911621, "step": 22915 }, { "epoch": 0.7725235093869022, "grad_norm": 35.62753677368164, "learning_rate": 1.4951716362792017e-07, "logits/chosen": -2.1138663291931152, "logits/rejected": -2.126997470855713, "logps/chosen": -1.9578602313995361, "logps/rejected": -1.9695158004760742, "loss": 3.4066, "rewards/accuracies": 0.5, "rewards/chosen": -19.578601837158203, "rewards/margins": 0.1165565475821495, "rewards/rejected": -19.695158004760742, "step": 22920 }, { "epoch": 0.7726920354578853, "grad_norm": 26.032629013061523, "learning_rate": 1.4930744896665048e-07, "logits/chosen": -1.8728666305541992, "logits/rejected": -1.8308374881744385, "logps/chosen": -2.678856372833252, "logps/rejected": -2.6533203125, "loss": 5.4955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.788562774658203, "rewards/margins": -0.2553566098213196, "rewards/rejected": -26.533206939697266, "step": 22925 }, { "epoch": 0.7728605615288685, "grad_norm": 21.67523193359375, "learning_rate": 1.4909785566625598e-07, "logits/chosen": -1.8303050994873047, "logits/rejected": -2.086219549179077, "logps/chosen": -1.8535133600234985, "logps/rejected": -2.158536195755005, "loss": 1.6427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.535133361816406, "rewards/margins": 3.050227403640747, "rewards/rejected": -21.58536148071289, "step": 22930 }, { "epoch": 0.7730290875998517, "grad_norm": 44.84750747680664, "learning_rate": 1.4888838379926883e-07, "logits/chosen": -1.8299903869628906, "logits/rejected": -2.076554775238037, "logps/chosen": -2.6862049102783203, "logps/rejected": -3.086336612701416, "loss": 2.0808, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.862049102783203, "rewards/margins": 4.001317501068115, "rewards/rejected": -30.863367080688477, "step": 22935 }, { "epoch": 0.7731976136708348, "grad_norm": 18.299293518066406, "learning_rate": 1.486790334381786e-07, "logits/chosen": -2.179386615753174, "logits/rejected": -2.3787121772766113, "logps/chosen": -2.733924150466919, "logps/rejected": -2.6912307739257812, "loss": 5.3055, "rewards/accuracies": 0.5, "rewards/chosen": -27.3392391204834, "rewards/margins": -0.426931768655777, "rewards/rejected": -26.912307739257812, "step": 22940 }, { "epoch": 0.773366139741818, "grad_norm": 30.414318084716797, "learning_rate": 1.4846980465543347e-07, "logits/chosen": -1.5662306547164917, "logits/rejected": -1.731276512145996, "logps/chosen": -2.159959077835083, "logps/rejected": -3.0159831047058105, "loss": 2.9906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.599590301513672, "rewards/margins": 8.560237884521484, "rewards/rejected": -30.159826278686523, "step": 22945 }, { "epoch": 0.7735346658128013, "grad_norm": 22.8912353515625, "learning_rate": 1.4826069752343928e-07, "logits/chosen": -1.644881248474121, "logits/rejected": -1.354028344154358, "logps/chosen": -2.835266590118408, "logps/rejected": -2.5100321769714355, "loss": 7.9276, "rewards/accuracies": 0.5, "rewards/chosen": -28.352664947509766, "rewards/margins": -3.2523417472839355, "rewards/rejected": -25.10032081604004, "step": 22950 }, { "epoch": 0.7737031918837844, "grad_norm": 10.242867469787598, "learning_rate": 1.480517121145596e-07, "logits/chosen": -1.512997031211853, "logits/rejected": -1.731942892074585, "logps/chosen": -2.153104782104492, "logps/rejected": -2.394892454147339, "loss": 2.8295, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.531047821044922, "rewards/margins": 2.417874574661255, "rewards/rejected": -23.948925018310547, "step": 22955 }, { "epoch": 0.7738717179547676, "grad_norm": 26.37079429626465, "learning_rate": 1.4784284850111611e-07, "logits/chosen": -1.8254725933074951, "logits/rejected": -2.0033957958221436, "logps/chosen": -2.168997287750244, "logps/rejected": -2.5505526065826416, "loss": 1.7174, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.689970016479492, "rewards/margins": 3.81555438041687, "rewards/rejected": -25.50552749633789, "step": 22960 }, { "epoch": 0.7740402440257508, "grad_norm": 57.707820892333984, "learning_rate": 1.4763410675538835e-07, "logits/chosen": -2.0322554111480713, "logits/rejected": -2.0387206077575684, "logps/chosen": -2.6514925956726074, "logps/rejected": -2.516125440597534, "loss": 4.4089, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.514923095703125, "rewards/margins": -1.3536683320999146, "rewards/rejected": -25.1612548828125, "step": 22965 }, { "epoch": 0.774208770096734, "grad_norm": 18.135936737060547, "learning_rate": 1.4742548694961377e-07, "logits/chosen": -2.2560200691223145, "logits/rejected": -2.3864707946777344, "logps/chosen": -2.3666298389434814, "logps/rejected": -2.5894882678985596, "loss": 2.7261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.666301727294922, "rewards/margins": 2.2285819053649902, "rewards/rejected": -25.894882202148438, "step": 22970 }, { "epoch": 0.7743772961677171, "grad_norm": 20.712677001953125, "learning_rate": 1.4721698915598702e-07, "logits/chosen": -1.5596562623977661, "logits/rejected": -1.7954515218734741, "logps/chosen": -2.4734067916870117, "logps/rejected": -3.3166375160217285, "loss": 2.9244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.734067916870117, "rewards/margins": 8.432307243347168, "rewards/rejected": -33.16637420654297, "step": 22975 }, { "epoch": 0.7745458222387003, "grad_norm": 10.812604904174805, "learning_rate": 1.4700861344666132e-07, "logits/chosen": -1.5280256271362305, "logits/rejected": -1.799912691116333, "logps/chosen": -1.7556688785552979, "logps/rejected": -1.943488359451294, "loss": 2.2683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.556686401367188, "rewards/margins": 1.8781960010528564, "rewards/rejected": -19.434885025024414, "step": 22980 }, { "epoch": 0.7747143483096836, "grad_norm": 30.655550003051758, "learning_rate": 1.4680035989374718e-07, "logits/chosen": -1.5087058544158936, "logits/rejected": -2.066072463989258, "logps/chosen": -1.9316574335098267, "logps/rejected": -2.208230972290039, "loss": 2.4866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.316572189331055, "rewards/margins": 2.7657370567321777, "rewards/rejected": -22.08230972290039, "step": 22985 }, { "epoch": 0.7748828743806667, "grad_norm": 45.217041015625, "learning_rate": 1.4659222856931308e-07, "logits/chosen": -1.862138032913208, "logits/rejected": -2.3082938194274902, "logps/chosen": -2.321685314178467, "logps/rejected": -2.6933255195617676, "loss": 1.8816, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.216854095458984, "rewards/margins": 3.7164008617401123, "rewards/rejected": -26.933252334594727, "step": 22990 }, { "epoch": 0.7750514004516499, "grad_norm": 61.95263671875, "learning_rate": 1.4638421954538482e-07, "logits/chosen": -1.5636898279190063, "logits/rejected": -1.2484227418899536, "logps/chosen": -2.4466958045959473, "logps/rejected": -2.9692986011505127, "loss": 3.9933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.46695899963379, "rewards/margins": 5.226029396057129, "rewards/rejected": -29.6929874420166, "step": 22995 }, { "epoch": 0.775219926522633, "grad_norm": 24.363027572631836, "learning_rate": 1.4617633289394633e-07, "logits/chosen": -2.067631483078003, "logits/rejected": -2.001176357269287, "logps/chosen": -2.2669143676757812, "logps/rejected": -2.5735621452331543, "loss": 2.9926, "rewards/accuracies": 0.5, "rewards/chosen": -22.669143676757812, "rewards/margins": 3.066478967666626, "rewards/rejected": -25.735620498657227, "step": 23000 }, { "epoch": 0.7753884525936162, "grad_norm": 16.292213439941406, "learning_rate": 1.4596856868693885e-07, "logits/chosen": -1.9688247442245483, "logits/rejected": -2.3371658325195312, "logps/chosen": -2.187854290008545, "logps/rejected": -2.4243526458740234, "loss": 1.6859, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.878543853759766, "rewards/margins": 2.364981174468994, "rewards/rejected": -24.2435245513916, "step": 23005 }, { "epoch": 0.7755569786645994, "grad_norm": 7.324104309082031, "learning_rate": 1.4576092699626152e-07, "logits/chosen": -1.372618317604065, "logits/rejected": -1.5508311986923218, "logps/chosen": -2.4179587364196777, "logps/rejected": -2.629817008972168, "loss": 1.7698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.17958641052246, "rewards/margins": 2.118582248687744, "rewards/rejected": -26.298168182373047, "step": 23010 }, { "epoch": 0.7757255047355825, "grad_norm": 62.67533493041992, "learning_rate": 1.4555340789377085e-07, "logits/chosen": -1.9175342321395874, "logits/rejected": -2.3815016746520996, "logps/chosen": -2.818748950958252, "logps/rejected": -3.300917863845825, "loss": 3.7856, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.187490463256836, "rewards/margins": 4.821689128875732, "rewards/rejected": -33.009178161621094, "step": 23015 }, { "epoch": 0.7758940308065658, "grad_norm": 61.15652847290039, "learning_rate": 1.4534601145128128e-07, "logits/chosen": -1.769521713256836, "logits/rejected": -1.6290092468261719, "logps/chosen": -2.7396092414855957, "logps/rejected": -3.124126434326172, "loss": 2.4957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.39609146118164, "rewards/margins": 3.8451716899871826, "rewards/rejected": -31.241262435913086, "step": 23020 }, { "epoch": 0.776062556877549, "grad_norm": 45.90425491333008, "learning_rate": 1.4513873774056412e-07, "logits/chosen": -1.338941216468811, "logits/rejected": -1.2785598039627075, "logps/chosen": -2.2803165912628174, "logps/rejected": -2.18070912361145, "loss": 4.1639, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.803163528442383, "rewards/margins": -0.9960732460021973, "rewards/rejected": -21.807090759277344, "step": 23025 }, { "epoch": 0.7762310829485322, "grad_norm": 23.82891082763672, "learning_rate": 1.449315868333489e-07, "logits/chosen": -1.6797630786895752, "logits/rejected": -1.6392351388931274, "logps/chosen": -2.1757025718688965, "logps/rejected": -2.223583221435547, "loss": 2.8337, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.75702476501465, "rewards/margins": 0.4788082242012024, "rewards/rejected": -22.2358341217041, "step": 23030 }, { "epoch": 0.7763996090195153, "grad_norm": 135.72125244140625, "learning_rate": 1.4472455880132234e-07, "logits/chosen": -1.7913395166397095, "logits/rejected": -1.7838003635406494, "logps/chosen": -2.557555675506592, "logps/rejected": -2.5798721313476562, "loss": 3.3435, "rewards/accuracies": 0.5, "rewards/chosen": -25.575559616088867, "rewards/margins": 0.22316360473632812, "rewards/rejected": -25.798721313476562, "step": 23035 }, { "epoch": 0.7765681350904985, "grad_norm": 31.134138107299805, "learning_rate": 1.4451765371612878e-07, "logits/chosen": -1.6820363998413086, "logits/rejected": -1.8307892084121704, "logps/chosen": -2.7770934104919434, "logps/rejected": -3.1867949962615967, "loss": 2.6863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.77093505859375, "rewards/margins": 4.0970139503479, "rewards/rejected": -31.867950439453125, "step": 23040 }, { "epoch": 0.7767366611614817, "grad_norm": 45.60488510131836, "learning_rate": 1.4431087164936972e-07, "logits/chosen": -1.9355823993682861, "logits/rejected": -2.1896002292633057, "logps/chosen": -3.5485992431640625, "logps/rejected": -3.7806289196014404, "loss": 3.903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.48598861694336, "rewards/margins": 2.320297956466675, "rewards/rejected": -37.80629348754883, "step": 23045 }, { "epoch": 0.7769051872324648, "grad_norm": 7.562737941741943, "learning_rate": 1.441042126726044e-07, "logits/chosen": -2.149552822113037, "logits/rejected": -2.076618194580078, "logps/chosen": -2.2214457988739014, "logps/rejected": -2.2276124954223633, "loss": 4.1425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.214458465576172, "rewards/margins": 0.06166648864746094, "rewards/rejected": -22.276126861572266, "step": 23050 }, { "epoch": 0.777073713303448, "grad_norm": 22.309995651245117, "learning_rate": 1.438976768573495e-07, "logits/chosen": -1.158691644668579, "logits/rejected": -1.3676048517227173, "logps/chosen": -2.538264513015747, "logps/rejected": -3.150280475616455, "loss": 1.7112, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.382644653320312, "rewards/margins": 6.120162487030029, "rewards/rejected": -31.5028076171875, "step": 23055 }, { "epoch": 0.7772422393744313, "grad_norm": 24.967859268188477, "learning_rate": 1.4369126427507855e-07, "logits/chosen": -1.9301315546035767, "logits/rejected": -2.185620069503784, "logps/chosen": -2.295980930328369, "logps/rejected": -2.754180431365967, "loss": 2.413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.95981216430664, "rewards/margins": 4.581993579864502, "rewards/rejected": -27.541805267333984, "step": 23060 }, { "epoch": 0.7774107654454144, "grad_norm": 30.21352195739746, "learning_rate": 1.4348497499722306e-07, "logits/chosen": -2.2400124073028564, "logits/rejected": -2.523691177368164, "logps/chosen": -2.7316737174987793, "logps/rejected": -3.1302685737609863, "loss": 1.3373, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.316736221313477, "rewards/margins": 3.985947370529175, "rewards/rejected": -31.302684783935547, "step": 23065 }, { "epoch": 0.7775792915163976, "grad_norm": 24.961088180541992, "learning_rate": 1.4327880909517166e-07, "logits/chosen": -1.9912430047988892, "logits/rejected": -2.3256280422210693, "logps/chosen": -2.2469804286956787, "logps/rejected": -2.587294578552246, "loss": 1.5903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.469806671142578, "rewards/margins": 3.4031403064727783, "rewards/rejected": -25.872943878173828, "step": 23070 }, { "epoch": 0.7777478175873808, "grad_norm": 31.824460983276367, "learning_rate": 1.4307276664027024e-07, "logits/chosen": -1.4603766202926636, "logits/rejected": -1.5443477630615234, "logps/chosen": -1.936466932296753, "logps/rejected": -2.063175916671753, "loss": 3.2179, "rewards/accuracies": 0.5, "rewards/chosen": -19.364669799804688, "rewards/margins": 1.2670886516571045, "rewards/rejected": -20.631759643554688, "step": 23075 }, { "epoch": 0.7779163436583639, "grad_norm": 33.54826736450195, "learning_rate": 1.4286684770382178e-07, "logits/chosen": -1.6126028299331665, "logits/rejected": -1.8728916645050049, "logps/chosen": -2.1415352821350098, "logps/rejected": -2.4509973526000977, "loss": 2.4713, "rewards/accuracies": 0.5, "rewards/chosen": -21.415353775024414, "rewards/margins": 3.0946192741394043, "rewards/rejected": -24.509973526000977, "step": 23080 }, { "epoch": 0.7780848697293471, "grad_norm": 63.88078689575195, "learning_rate": 1.4266105235708687e-07, "logits/chosen": -1.996490716934204, "logits/rejected": -2.11602783203125, "logps/chosen": -2.670262575149536, "logps/rejected": -2.7985289096832275, "loss": 2.7266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.702627182006836, "rewards/margins": 1.2826632261276245, "rewards/rejected": -27.985286712646484, "step": 23085 }, { "epoch": 0.7782533958003303, "grad_norm": 224.8692169189453, "learning_rate": 1.4245538067128331e-07, "logits/chosen": -1.309300184249878, "logits/rejected": -1.3923568725585938, "logps/chosen": -2.338399887084961, "logps/rejected": -2.3223764896392822, "loss": 3.6622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.384000778198242, "rewards/margins": -0.16023759543895721, "rewards/rejected": -23.223764419555664, "step": 23090 }, { "epoch": 0.7784219218713135, "grad_norm": 117.828125, "learning_rate": 1.422498327175856e-07, "logits/chosen": -1.7504974603652954, "logits/rejected": -2.420707941055298, "logps/chosen": -3.289490222930908, "logps/rejected": -4.021206378936768, "loss": 2.4788, "rewards/accuracies": 0.5, "rewards/chosen": -32.894901275634766, "rewards/margins": 7.317163944244385, "rewards/rejected": -40.212066650390625, "step": 23095 }, { "epoch": 0.7785904479422967, "grad_norm": 154.18460083007812, "learning_rate": 1.42044408567126e-07, "logits/chosen": -1.9211801290512085, "logits/rejected": -2.178114414215088, "logps/chosen": -2.7812247276306152, "logps/rejected": -2.6831390857696533, "loss": 4.6546, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.812246322631836, "rewards/margins": -0.9808561205863953, "rewards/rejected": -26.831390380859375, "step": 23100 }, { "epoch": 0.7787589740132799, "grad_norm": 7.306661427719519e-05, "learning_rate": 1.4183910829099393e-07, "logits/chosen": -1.4283428192138672, "logits/rejected": -1.7082526683807373, "logps/chosen": -2.776681900024414, "logps/rejected": -3.5103487968444824, "loss": 2.2582, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.766815185546875, "rewards/margins": 7.336672782897949, "rewards/rejected": -35.10348892211914, "step": 23105 }, { "epoch": 0.778927500084263, "grad_norm": 30.130817413330078, "learning_rate": 1.4163393196023532e-07, "logits/chosen": -2.1264195442199707, "logits/rejected": -2.2917914390563965, "logps/chosen": -2.5027756690979004, "logps/rejected": -2.6923680305480957, "loss": 2.0476, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.02775764465332, "rewards/margins": 1.8959224224090576, "rewards/rejected": -26.92367935180664, "step": 23110 }, { "epoch": 0.7790960261552462, "grad_norm": 36.973567962646484, "learning_rate": 1.4142887964585375e-07, "logits/chosen": -1.9158130884170532, "logits/rejected": -2.056734800338745, "logps/chosen": -2.228407382965088, "logps/rejected": -2.534636974334717, "loss": 2.6497, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.284072875976562, "rewards/margins": 3.0622963905334473, "rewards/rejected": -25.34636878967285, "step": 23115 }, { "epoch": 0.7792645522262294, "grad_norm": 31.22389793395996, "learning_rate": 1.4122395141880983e-07, "logits/chosen": -1.7744057178497314, "logits/rejected": -2.0564589500427246, "logps/chosen": -2.008413076400757, "logps/rejected": -2.5604119300842285, "loss": 1.9569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.084131240844727, "rewards/margins": 5.519989490509033, "rewards/rejected": -25.6041202545166, "step": 23120 }, { "epoch": 0.7794330782972125, "grad_norm": 36.156959533691406, "learning_rate": 1.4101914735002128e-07, "logits/chosen": -1.4270397424697876, "logits/rejected": -1.7191730737686157, "logps/chosen": -2.226762294769287, "logps/rejected": -2.4052212238311768, "loss": 2.4218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.267623901367188, "rewards/margins": 1.784589409828186, "rewards/rejected": -24.052213668823242, "step": 23125 }, { "epoch": 0.7796016043681958, "grad_norm": 12.364837646484375, "learning_rate": 1.4081446751036242e-07, "logits/chosen": -1.8880256414413452, "logits/rejected": -2.7479958534240723, "logps/chosen": -2.484072208404541, "logps/rejected": -3.452897548675537, "loss": 0.7836, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.84072494506836, "rewards/margins": 9.688249588012695, "rewards/rejected": -34.52897644042969, "step": 23130 }, { "epoch": 0.779770130439179, "grad_norm": 11.456212043762207, "learning_rate": 1.4060991197066496e-07, "logits/chosen": -1.887709379196167, "logits/rejected": -1.9433212280273438, "logps/chosen": -2.3668179512023926, "logps/rejected": -2.5772693157196045, "loss": 2.0588, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.66817855834961, "rewards/margins": 2.1045122146606445, "rewards/rejected": -25.772689819335938, "step": 23135 }, { "epoch": 0.7799386565101621, "grad_norm": 0.0005731512210331857, "learning_rate": 1.4040548080171754e-07, "logits/chosen": -1.120009183883667, "logits/rejected": -1.2262059450149536, "logps/chosen": -2.447812557220459, "logps/rejected": -2.5543696880340576, "loss": 5.2049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.47812271118164, "rewards/margins": 1.0655739307403564, "rewards/rejected": -25.543697357177734, "step": 23140 }, { "epoch": 0.7801071825811453, "grad_norm": 46.96331787109375, "learning_rate": 1.402011740742658e-07, "logits/chosen": -1.763390302658081, "logits/rejected": -2.1351780891418457, "logps/chosen": -1.7994730472564697, "logps/rejected": -1.8151382207870483, "loss": 3.6935, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.99472999572754, "rewards/margins": 0.15665188431739807, "rewards/rejected": -18.15138053894043, "step": 23145 }, { "epoch": 0.7802757086521285, "grad_norm": 37.356719970703125, "learning_rate": 1.3999699185901222e-07, "logits/chosen": -1.8704512119293213, "logits/rejected": -2.1423542499542236, "logps/chosen": -2.3729965686798096, "logps/rejected": -2.660909652709961, "loss": 2.2502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.729965209960938, "rewards/margins": 2.8791308403015137, "rewards/rejected": -26.609094619750977, "step": 23150 }, { "epoch": 0.7804442347231116, "grad_norm": 70.5757064819336, "learning_rate": 1.397929342266162e-07, "logits/chosen": -1.9407947063446045, "logits/rejected": -1.8652915954589844, "logps/chosen": -2.179978132247925, "logps/rejected": -2.2899811267852783, "loss": 2.6966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.799779891967773, "rewards/margins": 1.100031852722168, "rewards/rejected": -22.899810791015625, "step": 23155 }, { "epoch": 0.7806127607940948, "grad_norm": 19.091533660888672, "learning_rate": 1.395890012476942e-07, "logits/chosen": -1.7762082815170288, "logits/rejected": -1.6420847177505493, "logps/chosen": -1.8515875339508057, "logps/rejected": -2.181715488433838, "loss": 2.9497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.515872955322266, "rewards/margins": 3.301278591156006, "rewards/rejected": -21.81715202331543, "step": 23160 }, { "epoch": 0.780781286865078, "grad_norm": 20.589876174926758, "learning_rate": 1.3938519299281903e-07, "logits/chosen": -2.0344557762145996, "logits/rejected": -2.105841636657715, "logps/chosen": -1.8849172592163086, "logps/rejected": -2.2034912109375, "loss": 2.7466, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.849172592163086, "rewards/margins": 3.1857378482818604, "rewards/rejected": -22.034912109375, "step": 23165 }, { "epoch": 0.7809498129360612, "grad_norm": 43.91060256958008, "learning_rate": 1.3918150953252096e-07, "logits/chosen": -1.9605381488800049, "logits/rejected": -2.1528801918029785, "logps/chosen": -2.89855694770813, "logps/rejected": -3.4155421257019043, "loss": 2.7643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.98556900024414, "rewards/margins": 5.1698527336120605, "rewards/rejected": -34.155418395996094, "step": 23170 }, { "epoch": 0.7811183390070444, "grad_norm": 38.220924377441406, "learning_rate": 1.3897795093728692e-07, "logits/chosen": -2.337184429168701, "logits/rejected": -2.533107280731201, "logps/chosen": -2.7642264366149902, "logps/rejected": -3.6560683250427246, "loss": 1.7109, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.64226722717285, "rewards/margins": 8.918416023254395, "rewards/rejected": -36.5606803894043, "step": 23175 }, { "epoch": 0.7812868650780276, "grad_norm": 51.32099151611328, "learning_rate": 1.3877451727756017e-07, "logits/chosen": -1.914634108543396, "logits/rejected": -1.7938172817230225, "logps/chosen": -2.7175326347351074, "logps/rejected": -2.4325027465820312, "loss": 6.0719, "rewards/accuracies": 0.5, "rewards/chosen": -27.17532730102539, "rewards/margins": -2.850299596786499, "rewards/rejected": -24.325027465820312, "step": 23180 }, { "epoch": 0.7814553911490107, "grad_norm": 17.17637825012207, "learning_rate": 1.3857120862374134e-07, "logits/chosen": -1.8925994634628296, "logits/rejected": -2.0988128185272217, "logps/chosen": -2.4949610233306885, "logps/rejected": -2.6762216091156006, "loss": 3.1127, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.949609756469727, "rewards/margins": 1.8126052618026733, "rewards/rejected": -26.762216567993164, "step": 23185 }, { "epoch": 0.7816239172199939, "grad_norm": 14.754379272460938, "learning_rate": 1.3836802504618743e-07, "logits/chosen": -1.419999361038208, "logits/rejected": -1.9414488077163696, "logps/chosen": -2.772274971008301, "logps/rejected": -3.133202075958252, "loss": 3.968, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.722747802734375, "rewards/margins": 3.6092727184295654, "rewards/rejected": -31.3320255279541, "step": 23190 }, { "epoch": 0.7817924432909771, "grad_norm": 83.89252471923828, "learning_rate": 1.3816496661521247e-07, "logits/chosen": -1.7252734899520874, "logits/rejected": -1.954825758934021, "logps/chosen": -2.2423574924468994, "logps/rejected": -2.609626531600952, "loss": 2.8739, "rewards/accuracies": 0.5, "rewards/chosen": -22.423574447631836, "rewards/margins": 3.672689914703369, "rewards/rejected": -26.096263885498047, "step": 23195 }, { "epoch": 0.7819609693619602, "grad_norm": 50.774505615234375, "learning_rate": 1.3796203340108669e-07, "logits/chosen": -2.3243584632873535, "logits/rejected": -2.7399606704711914, "logps/chosen": -2.7891287803649902, "logps/rejected": -3.1757395267486572, "loss": 2.1589, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.891284942626953, "rewards/margins": 3.8661091327667236, "rewards/rejected": -31.75739097595215, "step": 23200 }, { "epoch": 0.7819609693619602, "eval_logits/chosen": -2.2590906620025635, "eval_logits/rejected": -2.4323461055755615, "eval_logps/chosen": -2.2656962871551514, "eval_logps/rejected": -2.419363498687744, "eval_loss": 3.069692373275757, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.656963348388672, "eval_rewards/margins": 1.5366699695587158, "eval_rewards/rejected": -24.193632125854492, "eval_runtime": 12.9266, "eval_samples_per_second": 7.736, "eval_steps_per_second": 1.934, "step": 23200 }, { "epoch": 0.7821294954329435, "grad_norm": 9.897990226745605, "learning_rate": 1.3775922547403747e-07, "logits/chosen": -1.3797047138214111, "logits/rejected": -1.4906001091003418, "logps/chosen": -2.4802145957946777, "logps/rejected": -2.776656150817871, "loss": 1.7795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.802146911621094, "rewards/margins": 2.9644157886505127, "rewards/rejected": -27.76656150817871, "step": 23205 }, { "epoch": 0.7822980215039267, "grad_norm": 224.88113403320312, "learning_rate": 1.3755654290424867e-07, "logits/chosen": -2.0499682426452637, "logits/rejected": -2.2290892601013184, "logps/chosen": -2.3797082901000977, "logps/rejected": -2.57685923576355, "loss": 3.0556, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.79707908630371, "rewards/margins": 1.9715118408203125, "rewards/rejected": -25.768590927124023, "step": 23210 }, { "epoch": 0.7824665475749099, "grad_norm": 5.163259983062744, "learning_rate": 1.3735398576186058e-07, "logits/chosen": -1.5662257671356201, "logits/rejected": -1.7009817361831665, "logps/chosen": -2.46049427986145, "logps/rejected": -2.8982386589050293, "loss": 1.4512, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.604944229125977, "rewards/margins": 4.377443790435791, "rewards/rejected": -28.982385635375977, "step": 23215 }, { "epoch": 0.782635073645893, "grad_norm": 25.366844177246094, "learning_rate": 1.3715155411697028e-07, "logits/chosen": -1.9533096551895142, "logits/rejected": -2.640411615371704, "logps/chosen": -2.7735981941223145, "logps/rejected": -2.4231772422790527, "loss": 7.7772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.735980987548828, "rewards/margins": -3.5042080879211426, "rewards/rejected": -24.231773376464844, "step": 23220 }, { "epoch": 0.7828035997168762, "grad_norm": 273.75946044921875, "learning_rate": 1.3694924803963147e-07, "logits/chosen": -1.8088428974151611, "logits/rejected": -1.6246687173843384, "logps/chosen": -2.374206304550171, "logps/rejected": -2.3597331047058105, "loss": 3.3866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.7420654296875, "rewards/margins": -0.14473219215869904, "rewards/rejected": -23.597332000732422, "step": 23225 }, { "epoch": 0.7829721257878594, "grad_norm": 1.5523768663406372, "learning_rate": 1.3674706759985444e-07, "logits/chosen": -2.1185832023620605, "logits/rejected": -2.421220541000366, "logps/chosen": -2.440904140472412, "logps/rejected": -3.2022106647491455, "loss": 0.6537, "rewards/accuracies": 1.0, "rewards/chosen": -24.409038543701172, "rewards/margins": 7.613066673278809, "rewards/rejected": -32.0221061706543, "step": 23230 }, { "epoch": 0.7831406518588425, "grad_norm": 116.09601593017578, "learning_rate": 1.3654501286760555e-07, "logits/chosen": -1.8746535778045654, "logits/rejected": -2.2961676120758057, "logps/chosen": -2.961397647857666, "logps/rejected": -3.421003818511963, "loss": 2.4693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.613977432250977, "rewards/margins": 4.596056938171387, "rewards/rejected": -34.21003341674805, "step": 23235 }, { "epoch": 0.7833091779298258, "grad_norm": 26.170934677124023, "learning_rate": 1.3634308391280818e-07, "logits/chosen": -2.3364009857177734, "logits/rejected": -2.408446788787842, "logps/chosen": -2.0243003368377686, "logps/rejected": -2.1905949115753174, "loss": 2.5335, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.243005752563477, "rewards/margins": 1.6629432439804077, "rewards/rejected": -21.905948638916016, "step": 23240 }, { "epoch": 0.783477704000809, "grad_norm": 35.10651397705078, "learning_rate": 1.361412808053421e-07, "logits/chosen": -1.689139723777771, "logits/rejected": -1.9725558757781982, "logps/chosen": -2.2845559120178223, "logps/rejected": -2.4833407402038574, "loss": 2.0314, "rewards/accuracies": 0.5, "rewards/chosen": -22.845556259155273, "rewards/margins": 1.9878495931625366, "rewards/rejected": -24.83340835571289, "step": 23245 }, { "epoch": 0.7836462300717921, "grad_norm": 22.14246940612793, "learning_rate": 1.359396036150431e-07, "logits/chosen": -1.912557601928711, "logits/rejected": -1.9069467782974243, "logps/chosen": -2.4497103691101074, "logps/rejected": -2.5474560260772705, "loss": 2.9122, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.49710464477539, "rewards/margins": 0.9774559140205383, "rewards/rejected": -25.474559783935547, "step": 23250 }, { "epoch": 0.7838147561427753, "grad_norm": 32.53044891357422, "learning_rate": 1.3573805241170388e-07, "logits/chosen": -1.7146705389022827, "logits/rejected": -1.7869758605957031, "logps/chosen": -2.8418736457824707, "logps/rejected": -2.825669765472412, "loss": 4.9545, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.41873550415039, "rewards/margins": -0.16203632950782776, "rewards/rejected": -28.256698608398438, "step": 23255 }, { "epoch": 0.7839832822137585, "grad_norm": 0.5539440512657166, "learning_rate": 1.3553662726507343e-07, "logits/chosen": -1.527477502822876, "logits/rejected": -1.8698577880859375, "logps/chosen": -2.849269390106201, "logps/rejected": -3.2912509441375732, "loss": 2.1832, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.492691040039062, "rewards/margins": 4.419821262359619, "rewards/rejected": -32.912513732910156, "step": 23260 }, { "epoch": 0.7841518082847416, "grad_norm": 18.821990966796875, "learning_rate": 1.353353282448571e-07, "logits/chosen": -1.9978233575820923, "logits/rejected": -2.158778429031372, "logps/chosen": -1.6932750940322876, "logps/rejected": -1.9092578887939453, "loss": 1.9132, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.932750701904297, "rewards/margins": 2.1598281860351562, "rewards/rejected": -19.092578887939453, "step": 23265 }, { "epoch": 0.7843203343557248, "grad_norm": 0.9460822939872742, "learning_rate": 1.3513415542071627e-07, "logits/chosen": -1.9015109539031982, "logits/rejected": -2.185220241546631, "logps/chosen": -1.9466800689697266, "logps/rejected": -2.194200038909912, "loss": 2.1957, "rewards/accuracies": 0.5, "rewards/chosen": -19.466800689697266, "rewards/margins": 2.4751992225646973, "rewards/rejected": -21.941997528076172, "step": 23270 }, { "epoch": 0.784488860426708, "grad_norm": 93.4100112915039, "learning_rate": 1.3493310886226917e-07, "logits/chosen": -1.8904443979263306, "logits/rejected": -1.738896131515503, "logps/chosen": -2.364542007446289, "logps/rejected": -2.4334211349487305, "loss": 4.0342, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.64542007446289, "rewards/margins": 0.6887893676757812, "rewards/rejected": -24.334209442138672, "step": 23275 }, { "epoch": 0.7846573864976912, "grad_norm": 28.355125427246094, "learning_rate": 1.3473218863909002e-07, "logits/chosen": -1.9899078607559204, "logits/rejected": -2.364184856414795, "logps/chosen": -2.944575786590576, "logps/rejected": -3.3720479011535645, "loss": 5.6914, "rewards/accuracies": 0.5, "rewards/chosen": -29.445758819580078, "rewards/margins": 4.274716377258301, "rewards/rejected": -33.7204704284668, "step": 23280 }, { "epoch": 0.7848259125686744, "grad_norm": 30.1297664642334, "learning_rate": 1.3453139482070936e-07, "logits/chosen": -1.5648924112319946, "logits/rejected": -1.971692681312561, "logps/chosen": -2.311891555786133, "logps/rejected": -3.1094870567321777, "loss": 2.2711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.118911743164062, "rewards/margins": 7.975958347320557, "rewards/rejected": -31.094873428344727, "step": 23285 }, { "epoch": 0.7849944386396576, "grad_norm": 47.018577575683594, "learning_rate": 1.3433072747661427e-07, "logits/chosen": -1.5098412036895752, "logits/rejected": -1.6824413537979126, "logps/chosen": -2.133009195327759, "logps/rejected": -2.2505862712860107, "loss": 2.5595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.330089569091797, "rewards/margins": 1.1757726669311523, "rewards/rejected": -22.505863189697266, "step": 23290 }, { "epoch": 0.7851629647106407, "grad_norm": 19.697479248046875, "learning_rate": 1.3413018667624742e-07, "logits/chosen": -1.9054797887802124, "logits/rejected": -2.207296371459961, "logps/chosen": -2.088989734649658, "logps/rejected": -2.263735055923462, "loss": 3.8373, "rewards/accuracies": 0.5, "rewards/chosen": -20.889896392822266, "rewards/margins": 1.747450590133667, "rewards/rejected": -22.637348175048828, "step": 23295 }, { "epoch": 0.7853314907816239, "grad_norm": 107.42939758300781, "learning_rate": 1.3392977248900827e-07, "logits/chosen": -1.213679552078247, "logits/rejected": -2.19240665435791, "logps/chosen": -2.506530284881592, "logps/rejected": -3.082782030105591, "loss": 2.0125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.0653018951416, "rewards/margins": 5.762515068054199, "rewards/rejected": -30.82781982421875, "step": 23300 }, { "epoch": 0.7855000168526071, "grad_norm": 43.644493103027344, "learning_rate": 1.3372948498425229e-07, "logits/chosen": -1.627753496170044, "logits/rejected": -1.5755800008773804, "logps/chosen": -2.293330669403076, "logps/rejected": -2.4177563190460205, "loss": 2.4611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.933305740356445, "rewards/margins": 1.2442573308944702, "rewards/rejected": -24.177562713623047, "step": 23305 }, { "epoch": 0.7856685429235902, "grad_norm": 26.87129783630371, "learning_rate": 1.335293242312911e-07, "logits/chosen": -2.177293300628662, "logits/rejected": -2.4072842597961426, "logps/chosen": -1.9114128351211548, "logps/rejected": -1.9970004558563232, "loss": 2.9647, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.114126205444336, "rewards/margins": 0.8558780550956726, "rewards/rejected": -19.97000503540039, "step": 23310 }, { "epoch": 0.7858370689945735, "grad_norm": 158.449462890625, "learning_rate": 1.3332929029939249e-07, "logits/chosen": -1.6649404764175415, "logits/rejected": -1.7888128757476807, "logps/chosen": -2.672362804412842, "logps/rejected": -2.7433390617370605, "loss": 4.5421, "rewards/accuracies": 0.5, "rewards/chosen": -26.7236270904541, "rewards/margins": 0.7097650766372681, "rewards/rejected": -27.433391571044922, "step": 23315 }, { "epoch": 0.7860055950655567, "grad_norm": 40.22739791870117, "learning_rate": 1.3312938325778017e-07, "logits/chosen": -1.8461437225341797, "logits/rejected": -1.9166162014007568, "logps/chosen": -2.19929575920105, "logps/rejected": -2.339583158493042, "loss": 2.528, "rewards/accuracies": 0.5, "rewards/chosen": -21.99295425415039, "rewards/margins": 1.4028747081756592, "rewards/rejected": -23.395832061767578, "step": 23320 }, { "epoch": 0.7861741211365398, "grad_norm": 16.53434944152832, "learning_rate": 1.3292960317563416e-07, "logits/chosen": -1.9835697412490845, "logits/rejected": -2.3433592319488525, "logps/chosen": -2.381385326385498, "logps/rejected": -2.6794729232788086, "loss": 1.9686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.813854217529297, "rewards/margins": 2.9808764457702637, "rewards/rejected": -26.794729232788086, "step": 23325 }, { "epoch": 0.786342647207523, "grad_norm": 171.1400604248047, "learning_rate": 1.3272995012209054e-07, "logits/chosen": -2.139479398727417, "logits/rejected": -2.3308639526367188, "logps/chosen": -2.979889392852783, "logps/rejected": -3.3997435569763184, "loss": 4.3946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.79889488220215, "rewards/margins": 4.19854211807251, "rewards/rejected": -33.9974365234375, "step": 23330 }, { "epoch": 0.7865111732785062, "grad_norm": 19.148937225341797, "learning_rate": 1.3253042416624145e-07, "logits/chosen": -2.0896546840667725, "logits/rejected": -2.177683115005493, "logps/chosen": -2.602734327316284, "logps/rejected": -2.8803181648254395, "loss": 2.4611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.02734375, "rewards/margins": 2.775836229324341, "rewards/rejected": -28.803180694580078, "step": 23335 }, { "epoch": 0.7866796993494893, "grad_norm": 45.68458557128906, "learning_rate": 1.3233102537713465e-07, "logits/chosen": -1.3132305145263672, "logits/rejected": -1.513753890991211, "logps/chosen": -2.453195095062256, "logps/rejected": -3.172518014907837, "loss": 1.2306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.531949996948242, "rewards/margins": 7.193228244781494, "rewards/rejected": -31.725177764892578, "step": 23340 }, { "epoch": 0.7868482254204725, "grad_norm": 54.43955993652344, "learning_rate": 1.321317538237744e-07, "logits/chosen": -1.7532365322113037, "logits/rejected": -2.0115723609924316, "logps/chosen": -2.954211950302124, "logps/rejected": -3.273526430130005, "loss": 3.8185, "rewards/accuracies": 0.5, "rewards/chosen": -29.5421199798584, "rewards/margins": 3.1931443214416504, "rewards/rejected": -32.735267639160156, "step": 23345 }, { "epoch": 0.7870167514914558, "grad_norm": 158.7488555908203, "learning_rate": 1.3193260957512087e-07, "logits/chosen": -2.010417938232422, "logits/rejected": -2.2069597244262695, "logps/chosen": -3.5379459857940674, "logps/rejected": -3.6150882244110107, "loss": 3.2882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.37946319580078, "rewards/margins": 0.771422266960144, "rewards/rejected": -36.150882720947266, "step": 23350 }, { "epoch": 0.787185277562439, "grad_norm": 33.453224182128906, "learning_rate": 1.317335927000897e-07, "logits/chosen": -1.8199208974838257, "logits/rejected": -2.153441905975342, "logps/chosen": -2.2132675647735596, "logps/rejected": -2.4548451900482178, "loss": 3.2995, "rewards/accuracies": 0.5, "rewards/chosen": -22.132671356201172, "rewards/margins": 2.4157779216766357, "rewards/rejected": -24.548452377319336, "step": 23355 }, { "epoch": 0.7873538036334221, "grad_norm": 28.82305145263672, "learning_rate": 1.3153470326755307e-07, "logits/chosen": -1.5860540866851807, "logits/rejected": -1.6239850521087646, "logps/chosen": -2.708892822265625, "logps/rejected": -3.2508621215820312, "loss": 2.395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.08892822265625, "rewards/margins": 5.419692516326904, "rewards/rejected": -32.50861740112305, "step": 23360 }, { "epoch": 0.7875223297044053, "grad_norm": 31.444128036499023, "learning_rate": 1.3133594134633862e-07, "logits/chosen": -1.9128261804580688, "logits/rejected": -2.18424391746521, "logps/chosen": -2.671001672744751, "logps/rejected": -3.0663018226623535, "loss": 2.4916, "rewards/accuracies": 0.5, "rewards/chosen": -26.71001625061035, "rewards/margins": 3.953002452850342, "rewards/rejected": -30.663015365600586, "step": 23365 }, { "epoch": 0.7876908557753884, "grad_norm": 18.52704620361328, "learning_rate": 1.3113730700523024e-07, "logits/chosen": -2.3228516578674316, "logits/rejected": -2.5755486488342285, "logps/chosen": -2.3647756576538086, "logps/rejected": -2.4866623878479004, "loss": 2.6455, "rewards/accuracies": 0.5, "rewards/chosen": -23.647756576538086, "rewards/margins": 1.2188684940338135, "rewards/rejected": -24.86662483215332, "step": 23370 }, { "epoch": 0.7878593818463716, "grad_norm": 233.28515625, "learning_rate": 1.3093880031296718e-07, "logits/chosen": -1.7100486755371094, "logits/rejected": -2.266857624053955, "logps/chosen": -3.6958794593811035, "logps/rejected": -4.913882255554199, "loss": 2.0399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.95878982543945, "rewards/margins": 12.180027961730957, "rewards/rejected": -49.138816833496094, "step": 23375 }, { "epoch": 0.7880279079173548, "grad_norm": 29.36772918701172, "learning_rate": 1.3074042133824486e-07, "logits/chosen": -1.528952956199646, "logits/rejected": -1.6609337329864502, "logps/chosen": -2.4949135780334473, "logps/rejected": -2.8517425060272217, "loss": 2.036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.94913673400879, "rewards/margins": 3.568286418914795, "rewards/rejected": -28.51742172241211, "step": 23380 }, { "epoch": 0.7881964339883379, "grad_norm": 20.27956771850586, "learning_rate": 1.3054217014971465e-07, "logits/chosen": -1.9842973947525024, "logits/rejected": -2.2391180992126465, "logps/chosen": -2.5889785289764404, "logps/rejected": -3.751030445098877, "loss": 1.8571, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.889789581298828, "rewards/margins": 11.620512008666992, "rewards/rejected": -37.51030349731445, "step": 23385 }, { "epoch": 0.7883649600593212, "grad_norm": 206.58560180664062, "learning_rate": 1.3034404681598316e-07, "logits/chosen": -1.965024709701538, "logits/rejected": -1.7408416271209717, "logps/chosen": -2.933290481567383, "logps/rejected": -2.8030002117156982, "loss": 5.3126, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.332904815673828, "rewards/margins": -1.3029001951217651, "rewards/rejected": -28.030004501342773, "step": 23390 }, { "epoch": 0.7885334861303044, "grad_norm": 47.58219528198242, "learning_rate": 1.3014605140561314e-07, "logits/chosen": -1.651240587234497, "logits/rejected": -1.6010128259658813, "logps/chosen": -2.0883679389953613, "logps/rejected": -2.7429397106170654, "loss": 2.0386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.883678436279297, "rewards/margins": 6.545716762542725, "rewards/rejected": -27.429393768310547, "step": 23395 }, { "epoch": 0.7887020122012876, "grad_norm": 55.15037536621094, "learning_rate": 1.2994818398712309e-07, "logits/chosen": -1.6969373226165771, "logits/rejected": -1.8868554830551147, "logps/chosen": -1.8029073476791382, "logps/rejected": -1.7463340759277344, "loss": 3.8652, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.02907371520996, "rewards/margins": -0.5657330751419067, "rewards/rejected": -17.463340759277344, "step": 23400 }, { "epoch": 0.7888705382722707, "grad_norm": 45.554691314697266, "learning_rate": 1.2975044462898727e-07, "logits/chosen": -1.935333251953125, "logits/rejected": -2.3411459922790527, "logps/chosen": -2.15376615524292, "logps/rejected": -3.0946743488311768, "loss": 1.3598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.537662506103516, "rewards/margins": 9.409080505371094, "rewards/rejected": -30.946741104125977, "step": 23405 }, { "epoch": 0.7890390643432539, "grad_norm": 40.636802673339844, "learning_rate": 1.295528333996352e-07, "logits/chosen": -1.7083832025527954, "logits/rejected": -2.2345995903015137, "logps/chosen": -2.28090763092041, "logps/rejected": -2.776479482650757, "loss": 1.9808, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.809078216552734, "rewards/margins": 4.955718040466309, "rewards/rejected": -27.764795303344727, "step": 23410 }, { "epoch": 0.789207590414237, "grad_norm": 264.3153991699219, "learning_rate": 1.2935535036745238e-07, "logits/chosen": -1.5900719165802002, "logits/rejected": -1.3120791912078857, "logps/chosen": -3.130431652069092, "logps/rejected": -3.3375420570373535, "loss": 3.7296, "rewards/accuracies": 0.5, "rewards/chosen": -31.3043212890625, "rewards/margins": 2.0710978507995605, "rewards/rejected": -33.37541961669922, "step": 23415 }, { "epoch": 0.7893761164852202, "grad_norm": 65.93145751953125, "learning_rate": 1.2915799560078017e-07, "logits/chosen": -1.4226365089416504, "logits/rejected": -1.7168121337890625, "logps/chosen": -2.6043052673339844, "logps/rejected": -2.3740992546081543, "loss": 6.4785, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.043054580688477, "rewards/margins": -2.302061080932617, "rewards/rejected": -23.74099349975586, "step": 23420 }, { "epoch": 0.7895446425562035, "grad_norm": 38.1806755065918, "learning_rate": 1.2896076916791493e-07, "logits/chosen": -1.9888120889663696, "logits/rejected": -2.302474021911621, "logps/chosen": -2.144757032394409, "logps/rejected": -2.4852633476257324, "loss": 2.3075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.44757080078125, "rewards/margins": 3.4050631523132324, "rewards/rejected": -24.85263442993164, "step": 23425 }, { "epoch": 0.7897131686271867, "grad_norm": 16.340255737304688, "learning_rate": 1.2876367113710912e-07, "logits/chosen": -1.9023936986923218, "logits/rejected": -1.8551537990570068, "logps/chosen": -3.38207745552063, "logps/rejected": -3.6513431072235107, "loss": 2.8637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.820777893066406, "rewards/margins": 2.692657709121704, "rewards/rejected": -36.513431549072266, "step": 23430 }, { "epoch": 0.7898816946981698, "grad_norm": 13.2061128616333, "learning_rate": 1.2856670157657063e-07, "logits/chosen": -1.9358323812484741, "logits/rejected": -1.9602285623550415, "logps/chosen": -3.3243343830108643, "logps/rejected": -4.002040863037109, "loss": 2.5865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.24333953857422, "rewards/margins": 6.777066707611084, "rewards/rejected": -40.020408630371094, "step": 23435 }, { "epoch": 0.790050220769153, "grad_norm": 63.23234939575195, "learning_rate": 1.2836986055446282e-07, "logits/chosen": -1.6777303218841553, "logits/rejected": -1.462426781654358, "logps/chosen": -2.3692269325256348, "logps/rejected": -2.870598316192627, "loss": 2.7527, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.6922664642334, "rewards/margins": 5.013715744018555, "rewards/rejected": -28.705982208251953, "step": 23440 }, { "epoch": 0.7902187468401362, "grad_norm": 36.042152404785156, "learning_rate": 1.2817314813890462e-07, "logits/chosen": -1.772571325302124, "logits/rejected": -1.8282972574234009, "logps/chosen": -2.993535280227661, "logps/rejected": -2.908937931060791, "loss": 6.4062, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.935354232788086, "rewards/margins": -0.8459756970405579, "rewards/rejected": -29.08937644958496, "step": 23445 }, { "epoch": 0.7903872729111193, "grad_norm": 5.146857606774802e-09, "learning_rate": 1.2797656439797045e-07, "logits/chosen": -1.7889522314071655, "logits/rejected": -2.487628698348999, "logps/chosen": -3.045865058898926, "logps/rejected": -4.883096218109131, "loss": 1.714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.458648681640625, "rewards/margins": 18.372310638427734, "rewards/rejected": -48.830955505371094, "step": 23450 }, { "epoch": 0.7905557989821025, "grad_norm": 18.409088134765625, "learning_rate": 1.2778010939969036e-07, "logits/chosen": -1.5572829246520996, "logits/rejected": -1.7321348190307617, "logps/chosen": -1.9669567346572876, "logps/rejected": -2.5498642921447754, "loss": 1.7725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.669567108154297, "rewards/margins": 5.829077243804932, "rewards/rejected": -25.498645782470703, "step": 23455 }, { "epoch": 0.7907243250530858, "grad_norm": 25.91119956970215, "learning_rate": 1.2758378321204937e-07, "logits/chosen": -1.4228665828704834, "logits/rejected": -1.7013216018676758, "logps/chosen": -2.8095977306365967, "logps/rejected": -2.6884658336639404, "loss": 4.7333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.095977783203125, "rewards/margins": -1.2113181352615356, "rewards/rejected": -26.884658813476562, "step": 23460 }, { "epoch": 0.7908928511240689, "grad_norm": 36.09959030151367, "learning_rate": 1.2738758590298837e-07, "logits/chosen": -1.487137794494629, "logits/rejected": -1.973081350326538, "logps/chosen": -2.0125210285186768, "logps/rejected": -2.5702998638153076, "loss": 2.3875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.12520980834961, "rewards/margins": 5.577790260314941, "rewards/rejected": -25.702999114990234, "step": 23465 }, { "epoch": 0.7910613771950521, "grad_norm": 18.3976993560791, "learning_rate": 1.271915175404036e-07, "logits/chosen": -1.9622561931610107, "logits/rejected": -2.2915892601013184, "logps/chosen": -2.484745740890503, "logps/rejected": -3.077031373977661, "loss": 1.5175, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.847457885742188, "rewards/margins": 5.922853469848633, "rewards/rejected": -30.770313262939453, "step": 23470 }, { "epoch": 0.7912299032660353, "grad_norm": 56.609535217285156, "learning_rate": 1.2699557819214668e-07, "logits/chosen": -1.1058142185211182, "logits/rejected": -1.6427888870239258, "logps/chosen": -2.4761815071105957, "logps/rejected": -3.5051283836364746, "loss": 3.4386, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.76181411743164, "rewards/margins": 10.289472579956055, "rewards/rejected": -35.05128479003906, "step": 23475 }, { "epoch": 0.7913984293370184, "grad_norm": 288.9089050292969, "learning_rate": 1.267997679260242e-07, "logits/chosen": -1.9424368143081665, "logits/rejected": -1.7826957702636719, "logps/chosen": -3.042111396789551, "logps/rejected": -2.601473331451416, "loss": 10.5931, "rewards/accuracies": 0.5, "rewards/chosen": -30.42111587524414, "rewards/margins": -4.406381607055664, "rewards/rejected": -26.01473045349121, "step": 23480 }, { "epoch": 0.7915669554080016, "grad_norm": 29.078601837158203, "learning_rate": 1.2660408680979855e-07, "logits/chosen": -1.6248836517333984, "logits/rejected": -1.9362905025482178, "logps/chosen": -2.0807981491088867, "logps/rejected": -2.4495043754577637, "loss": 1.7965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.807979583740234, "rewards/margins": 3.6870627403259277, "rewards/rejected": -24.495044708251953, "step": 23485 }, { "epoch": 0.7917354814789848, "grad_norm": 17.588529586791992, "learning_rate": 1.2640853491118736e-07, "logits/chosen": -2.079242706298828, "logits/rejected": -2.186800479888916, "logps/chosen": -3.2707207202911377, "logps/rejected": -3.467264175415039, "loss": 3.3609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.70720672607422, "rewards/margins": 1.965433120727539, "rewards/rejected": -34.67264175415039, "step": 23490 }, { "epoch": 0.7919040075499679, "grad_norm": 55.01066207885742, "learning_rate": 1.262131122978632e-07, "logits/chosen": -1.6255840063095093, "logits/rejected": -1.6399444341659546, "logps/chosen": -2.202665328979492, "logps/rejected": -2.0976974964141846, "loss": 4.2069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.026653289794922, "rewards/margins": -1.0496762990951538, "rewards/rejected": -20.976974487304688, "step": 23495 }, { "epoch": 0.7920725336209512, "grad_norm": 43.822635650634766, "learning_rate": 1.2601781903745428e-07, "logits/chosen": -2.414681911468506, "logits/rejected": -2.323493480682373, "logps/chosen": -2.716387987136841, "logps/rejected": -3.0716395378112793, "loss": 1.8628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.16387939453125, "rewards/margins": 3.5525169372558594, "rewards/rejected": -30.71639633178711, "step": 23500 }, { "epoch": 0.7922410596919344, "grad_norm": 30.146278381347656, "learning_rate": 1.2582265519754383e-07, "logits/chosen": -1.63968026638031, "logits/rejected": -1.736728310585022, "logps/chosen": -2.398235559463501, "logps/rejected": -2.7017436027526855, "loss": 2.6609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.98235511779785, "rewards/margins": 3.0350775718688965, "rewards/rejected": -27.017431259155273, "step": 23505 }, { "epoch": 0.7924095857629175, "grad_norm": 15.89415168762207, "learning_rate": 1.256276208456706e-07, "logits/chosen": -1.4977285861968994, "logits/rejected": -1.5650742053985596, "logps/chosen": -2.1227152347564697, "logps/rejected": -2.2276690006256104, "loss": 2.452, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.22715187072754, "rewards/margins": 1.0495368242263794, "rewards/rejected": -22.276689529418945, "step": 23510 }, { "epoch": 0.7925781118339007, "grad_norm": 18.189443588256836, "learning_rate": 1.2543271604932798e-07, "logits/chosen": -1.3050501346588135, "logits/rejected": -1.4186303615570068, "logps/chosen": -2.170626163482666, "logps/rejected": -2.408052921295166, "loss": 3.0794, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.70625877380371, "rewards/margins": 2.3742713928222656, "rewards/rejected": -24.080530166625977, "step": 23515 }, { "epoch": 0.7927466379048839, "grad_norm": 21.401525497436523, "learning_rate": 1.2523794087596497e-07, "logits/chosen": -1.9882118701934814, "logits/rejected": -2.178072214126587, "logps/chosen": -2.76887583732605, "logps/rejected": -2.8130533695220947, "loss": 4.3253, "rewards/accuracies": 0.5, "rewards/chosen": -27.688756942749023, "rewards/margins": 0.44177302718162537, "rewards/rejected": -28.13053321838379, "step": 23520 }, { "epoch": 0.792915163975867, "grad_norm": 20.015840530395508, "learning_rate": 1.250432953929857e-07, "logits/chosen": -0.9792146682739258, "logits/rejected": -1.1626332998275757, "logps/chosen": -3.1012685298919678, "logps/rejected": -3.83912992477417, "loss": 4.7123, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -31.012685775756836, "rewards/margins": 7.3786163330078125, "rewards/rejected": -38.39130401611328, "step": 23525 }, { "epoch": 0.7930836900468502, "grad_norm": 17.142728805541992, "learning_rate": 1.2484877966774903e-07, "logits/chosen": -1.9387973546981812, "logits/rejected": -2.458888292312622, "logps/chosen": -2.6190226078033447, "logps/rejected": -3.124077320098877, "loss": 1.9793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.190227508544922, "rewards/margins": 5.050547122955322, "rewards/rejected": -31.240772247314453, "step": 23530 }, { "epoch": 0.7932522161178335, "grad_norm": 67.72013092041016, "learning_rate": 1.2465439376756937e-07, "logits/chosen": -2.1118016242980957, "logits/rejected": -2.0462646484375, "logps/chosen": -2.5230441093444824, "logps/rejected": -2.5223631858825684, "loss": 3.2112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.230443954467773, "rewards/margins": -0.006808662321418524, "rewards/rejected": -25.223636627197266, "step": 23535 }, { "epoch": 0.7934207421888166, "grad_norm": 1.92659592628479, "learning_rate": 1.2446013775971604e-07, "logits/chosen": -1.891710638999939, "logits/rejected": -2.7494044303894043, "logps/chosen": -1.8182204961776733, "logps/rejected": -2.3620352745056152, "loss": 0.9304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.182205200195312, "rewards/margins": 5.438145637512207, "rewards/rejected": -23.620351791381836, "step": 23540 }, { "epoch": 0.7935892682597998, "grad_norm": 16.09982681274414, "learning_rate": 1.2426601171141344e-07, "logits/chosen": -1.489386796951294, "logits/rejected": -1.820673942565918, "logps/chosen": -1.9332993030548096, "logps/rejected": -2.682340621948242, "loss": 1.2905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.332992553710938, "rewards/margins": 7.490413665771484, "rewards/rejected": -26.823406219482422, "step": 23545 }, { "epoch": 0.793757794330783, "grad_norm": 16.377511978149414, "learning_rate": 1.240720156898407e-07, "logits/chosen": -1.8569927215576172, "logits/rejected": -1.9203824996948242, "logps/chosen": -2.1887829303741455, "logps/rejected": -2.242539882659912, "loss": 3.3256, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.887828826904297, "rewards/margins": 0.5375703573226929, "rewards/rejected": -22.425399780273438, "step": 23550 }, { "epoch": 0.7939263204017661, "grad_norm": 290.3534240722656, "learning_rate": 1.238781497621324e-07, "logits/chosen": -2.299469470977783, "logits/rejected": -2.2020373344421387, "logps/chosen": -2.653339385986328, "logps/rejected": -2.5723299980163574, "loss": 4.8782, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.533395767211914, "rewards/margins": -0.8100942373275757, "rewards/rejected": -25.72330093383789, "step": 23555 }, { "epoch": 0.7940948464727493, "grad_norm": 3.090402603149414, "learning_rate": 1.2368441399537804e-07, "logits/chosen": -1.4215683937072754, "logits/rejected": -1.6714990139007568, "logps/chosen": -2.486147403717041, "logps/rejected": -3.095050811767578, "loss": 1.8828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.86147689819336, "rewards/margins": 6.089034080505371, "rewards/rejected": -30.950510025024414, "step": 23560 }, { "epoch": 0.7942633725437325, "grad_norm": 92.12581634521484, "learning_rate": 1.234908084566215e-07, "logits/chosen": -1.4645830392837524, "logits/rejected": -1.3868095874786377, "logps/chosen": -1.9499835968017578, "logps/rejected": -1.8435981273651123, "loss": 4.3021, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.499835968017578, "rewards/margins": -1.0638525485992432, "rewards/rejected": -18.435983657836914, "step": 23565 }, { "epoch": 0.7944318986147157, "grad_norm": 26.912521362304688, "learning_rate": 1.232973332128624e-07, "logits/chosen": -1.7435210943222046, "logits/rejected": -2.2686400413513184, "logps/chosen": -2.8063995838165283, "logps/rejected": -3.7166335582733154, "loss": 1.2754, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.063995361328125, "rewards/margins": 9.102341651916504, "rewards/rejected": -37.16633987426758, "step": 23570 }, { "epoch": 0.7946004246856989, "grad_norm": 23.525171279907227, "learning_rate": 1.2310398833105473e-07, "logits/chosen": -1.9364773035049438, "logits/rejected": -2.058380603790283, "logps/chosen": -2.36995792388916, "logps/rejected": -2.8152060508728027, "loss": 1.8172, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.6995792388916, "rewards/margins": 4.452480792999268, "rewards/rejected": -28.15205955505371, "step": 23575 }, { "epoch": 0.7947689507566821, "grad_norm": 23.474639892578125, "learning_rate": 1.229107738781076e-07, "logits/chosen": -1.6277767419815063, "logits/rejected": -2.2872207164764404, "logps/chosen": -3.2178142070770264, "logps/rejected": -4.609158515930176, "loss": 2.4449, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -32.178138732910156, "rewards/margins": 13.913442611694336, "rewards/rejected": -46.091583251953125, "step": 23580 }, { "epoch": 0.7949374768276652, "grad_norm": 70.98563385009766, "learning_rate": 1.227176899208849e-07, "logits/chosen": -1.8348493576049805, "logits/rejected": -2.1374592781066895, "logps/chosen": -2.2831220626831055, "logps/rejected": -2.608473777770996, "loss": 2.3469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.831220626831055, "rewards/margins": 3.253516674041748, "rewards/rejected": -26.08473777770996, "step": 23585 }, { "epoch": 0.7951060028986484, "grad_norm": 30.179494857788086, "learning_rate": 1.2252473652620555e-07, "logits/chosen": -1.1523596048355103, "logits/rejected": -1.3122644424438477, "logps/chosen": -1.8992496728897095, "logps/rejected": -1.9855247735977173, "loss": 2.7997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.992496490478516, "rewards/margins": 0.8627524375915527, "rewards/rejected": -19.855249404907227, "step": 23590 }, { "epoch": 0.7952745289696316, "grad_norm": 34.42359924316406, "learning_rate": 1.2233191376084278e-07, "logits/chosen": -1.2946655750274658, "logits/rejected": -1.474416732788086, "logps/chosen": -2.49336838722229, "logps/rejected": -2.689332962036133, "loss": 2.6393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.933683395385742, "rewards/margins": 1.9596456289291382, "rewards/rejected": -26.893329620361328, "step": 23595 }, { "epoch": 0.7954430550406147, "grad_norm": 32.83942794799805, "learning_rate": 1.2213922169152512e-07, "logits/chosen": -2.071537494659424, "logits/rejected": -1.9408347606658936, "logps/chosen": -2.3998780250549316, "logps/rejected": -2.4216084480285645, "loss": 3.0872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.998775482177734, "rewards/margins": 0.21730585396289825, "rewards/rejected": -24.216081619262695, "step": 23600 }, { "epoch": 0.7954430550406147, "eval_logits/chosen": -2.268324375152588, "eval_logits/rejected": -2.4430205821990967, "eval_logps/chosen": -2.2717368602752686, "eval_logps/rejected": -2.42488694190979, "eval_loss": 3.081331491470337, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.717369079589844, "eval_rewards/margins": 1.5315022468566895, "eval_rewards/rejected": -24.248868942260742, "eval_runtime": 12.9011, "eval_samples_per_second": 7.751, "eval_steps_per_second": 1.938, "step": 23600 }, { "epoch": 0.7956115811115979, "grad_norm": 23.862712860107422, "learning_rate": 1.2194666038493572e-07, "logits/chosen": -2.4514338970184326, "logits/rejected": -2.3811779022216797, "logps/chosen": -2.263093948364258, "logps/rejected": -2.6213536262512207, "loss": 2.0757, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.63094139099121, "rewards/margins": 3.582595109939575, "rewards/rejected": -26.213537216186523, "step": 23605 }, { "epoch": 0.7957801071825812, "grad_norm": 18.74156951904297, "learning_rate": 1.217542299077125e-07, "logits/chosen": -1.8089141845703125, "logits/rejected": -2.0515694618225098, "logps/chosen": -3.133936643600464, "logps/rejected": -2.848822593688965, "loss": 6.858, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -31.339366912841797, "rewards/margins": -2.8511433601379395, "rewards/rejected": -28.48822593688965, "step": 23610 }, { "epoch": 0.7959486332535644, "grad_norm": 16.450408935546875, "learning_rate": 1.2156193032644814e-07, "logits/chosen": -1.2309521436691284, "logits/rejected": -1.6049835681915283, "logps/chosen": -1.9556461572647095, "logps/rejected": -2.1087331771850586, "loss": 2.3946, "rewards/accuracies": 0.5, "rewards/chosen": -19.556461334228516, "rewards/margins": 1.5308706760406494, "rewards/rejected": -21.087329864501953, "step": 23615 }, { "epoch": 0.7961171593245475, "grad_norm": 23.52358627319336, "learning_rate": 1.2136976170768964e-07, "logits/chosen": -2.2708792686462402, "logits/rejected": -2.4395358562469482, "logps/chosen": -2.665672540664673, "logps/rejected": -2.482041358947754, "loss": 5.9053, "rewards/accuracies": 0.5, "rewards/chosen": -26.656723022460938, "rewards/margins": -1.8363081216812134, "rewards/rejected": -24.820415496826172, "step": 23620 }, { "epoch": 0.7962856853955307, "grad_norm": 43.37750244140625, "learning_rate": 1.2117772411793926e-07, "logits/chosen": -1.9793859720230103, "logits/rejected": -2.1826746463775635, "logps/chosen": -2.1819968223571777, "logps/rejected": -2.199878215789795, "loss": 2.9864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.819969177246094, "rewards/margins": 0.17881163954734802, "rewards/rejected": -21.998783111572266, "step": 23625 }, { "epoch": 0.7964542114665139, "grad_norm": 59.14590835571289, "learning_rate": 1.2098581762365362e-07, "logits/chosen": -1.2757107019424438, "logits/rejected": -1.9035346508026123, "logps/chosen": -2.4866976737976074, "logps/rejected": -3.309864044189453, "loss": 2.3694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.86697769165039, "rewards/margins": 8.23166561126709, "rewards/rejected": -33.09864044189453, "step": 23630 }, { "epoch": 0.796622737537497, "grad_norm": 185.86553955078125, "learning_rate": 1.2079404229124384e-07, "logits/chosen": -2.433590888977051, "logits/rejected": -2.8083548545837402, "logps/chosen": -3.4761478900909424, "logps/rejected": -3.9497249126434326, "loss": 3.5581, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.761478424072266, "rewards/margins": 4.73577356338501, "rewards/rejected": -39.497249603271484, "step": 23635 }, { "epoch": 0.7967912636084802, "grad_norm": 29.933055877685547, "learning_rate": 1.206023981870759e-07, "logits/chosen": -1.9267528057098389, "logits/rejected": -2.0702621936798096, "logps/chosen": -2.0923218727111816, "logps/rejected": -2.2572989463806152, "loss": 2.6568, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.923219680786133, "rewards/margins": 1.6497719287872314, "rewards/rejected": -22.57299041748047, "step": 23640 }, { "epoch": 0.7969597896794635, "grad_norm": 35.02862548828125, "learning_rate": 1.204108853774704e-07, "logits/chosen": -2.425234317779541, "logits/rejected": -2.033950090408325, "logps/chosen": -2.5497958660125732, "logps/rejected": -2.62798810005188, "loss": 2.8787, "rewards/accuracies": 0.5, "rewards/chosen": -25.49795913696289, "rewards/margins": 0.7819207906723022, "rewards/rejected": -26.27988052368164, "step": 23645 }, { "epoch": 0.7971283157504466, "grad_norm": 36.276649475097656, "learning_rate": 1.2021950392870217e-07, "logits/chosen": -1.880113959312439, "logits/rejected": -2.2360730171203613, "logps/chosen": -2.114194393157959, "logps/rejected": -2.259273052215576, "loss": 2.6131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.14194107055664, "rewards/margins": 1.4507843255996704, "rewards/rejected": -22.592727661132812, "step": 23650 }, { "epoch": 0.7972968418214298, "grad_norm": 0.8350377678871155, "learning_rate": 1.2002825390700083e-07, "logits/chosen": -1.2839758396148682, "logits/rejected": -1.7719905376434326, "logps/chosen": -2.047053575515747, "logps/rejected": -2.2489142417907715, "loss": 4.0195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.470537185668945, "rewards/margins": 2.0186100006103516, "rewards/rejected": -22.489147186279297, "step": 23655 }, { "epoch": 0.797465367892413, "grad_norm": 30.018207550048828, "learning_rate": 1.1983713537855057e-07, "logits/chosen": -1.1491228342056274, "logits/rejected": -1.546242356300354, "logps/chosen": -1.9170169830322266, "logps/rejected": -1.9767497777938843, "loss": 3.0863, "rewards/accuracies": 0.5, "rewards/chosen": -19.170169830322266, "rewards/margins": 0.5973286628723145, "rewards/rejected": -19.767498016357422, "step": 23660 }, { "epoch": 0.7976338939633961, "grad_norm": 19.978004455566406, "learning_rate": 1.1964614840949e-07, "logits/chosen": -1.8129030466079712, "logits/rejected": -2.4642412662506104, "logps/chosen": -2.960409164428711, "logps/rejected": -3.7300522327423096, "loss": 1.272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.604089736938477, "rewards/margins": 7.696431636810303, "rewards/rejected": -37.30052185058594, "step": 23665 }, { "epoch": 0.7978024200343793, "grad_norm": 30.25568962097168, "learning_rate": 1.19455293065912e-07, "logits/chosen": -2.0144896507263184, "logits/rejected": -2.042292833328247, "logps/chosen": -2.054476499557495, "logps/rejected": -2.0532383918762207, "loss": 3.2688, "rewards/accuracies": 0.5, "rewards/chosen": -20.54476547241211, "rewards/margins": -0.012381648644804955, "rewards/rejected": -20.532384872436523, "step": 23670 }, { "epoch": 0.7979709461053625, "grad_norm": 14.91845417022705, "learning_rate": 1.1926456941386427e-07, "logits/chosen": -1.9736425876617432, "logits/rejected": -2.074195146560669, "logps/chosen": -2.37538743019104, "logps/rejected": -3.2154929637908936, "loss": 1.028, "rewards/accuracies": 1.0, "rewards/chosen": -23.753873825073242, "rewards/margins": 8.401058197021484, "rewards/rejected": -32.154930114746094, "step": 23675 }, { "epoch": 0.7981394721763457, "grad_norm": 25.458602905273438, "learning_rate": 1.1907397751934878e-07, "logits/chosen": -2.014819622039795, "logits/rejected": -2.256798028945923, "logps/chosen": -2.9026269912719727, "logps/rejected": -3.2940673828125, "loss": 1.7844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.026269912719727, "rewards/margins": 3.9144043922424316, "rewards/rejected": -32.940677642822266, "step": 23680 }, { "epoch": 0.7983079982473289, "grad_norm": 41.000709533691406, "learning_rate": 1.1888351744832165e-07, "logits/chosen": -1.6192891597747803, "logits/rejected": -1.9515184164047241, "logps/chosen": -2.7270865440368652, "logps/rejected": -2.7937889099121094, "loss": 3.3432, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.270864486694336, "rewards/margins": 0.6670233011245728, "rewards/rejected": -27.937885284423828, "step": 23685 }, { "epoch": 0.7984765243183121, "grad_norm": 53.08238983154297, "learning_rate": 1.1869318926669375e-07, "logits/chosen": -1.8159668445587158, "logits/rejected": -2.2407050132751465, "logps/chosen": -2.7758102416992188, "logps/rejected": -2.7419583797454834, "loss": 4.1484, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.758102416992188, "rewards/margins": -0.3385186195373535, "rewards/rejected": -27.419586181640625, "step": 23690 }, { "epoch": 0.7986450503892952, "grad_norm": 31.205656051635742, "learning_rate": 1.1850299304033012e-07, "logits/chosen": -1.4135322570800781, "logits/rejected": -2.1289544105529785, "logps/chosen": -2.77282452583313, "logps/rejected": -3.640005588531494, "loss": 3.2513, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.728246688842773, "rewards/margins": 8.671809196472168, "rewards/rejected": -36.400054931640625, "step": 23695 }, { "epoch": 0.7988135764602784, "grad_norm": 11.864728927612305, "learning_rate": 1.183129288350504e-07, "logits/chosen": -1.782881736755371, "logits/rejected": -1.5927501916885376, "logps/chosen": -2.4381699562072754, "logps/rejected": -2.653639793395996, "loss": 3.9229, "rewards/accuracies": 0.5, "rewards/chosen": -24.381698608398438, "rewards/margins": 2.1546998023986816, "rewards/rejected": -26.536401748657227, "step": 23700 }, { "epoch": 0.7989821025312616, "grad_norm": 36.167850494384766, "learning_rate": 1.1812299671662801e-07, "logits/chosen": -1.3345615863800049, "logits/rejected": -1.5490849018096924, "logps/chosen": -2.146177053451538, "logps/rejected": -3.1967315673828125, "loss": 2.8808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.461772918701172, "rewards/margins": 10.505544662475586, "rewards/rejected": -31.967315673828125, "step": 23705 }, { "epoch": 0.7991506286022447, "grad_norm": 25.301176071166992, "learning_rate": 1.1793319675079105e-07, "logits/chosen": -1.9392036199569702, "logits/rejected": -2.0048937797546387, "logps/chosen": -2.289959669113159, "logps/rejected": -2.714442491531372, "loss": 1.4998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.899595260620117, "rewards/margins": 4.244827747344971, "rewards/rejected": -27.144420623779297, "step": 23710 }, { "epoch": 0.7993191546732279, "grad_norm": 68.43116760253906, "learning_rate": 1.1774352900322193e-07, "logits/chosen": -2.4753763675689697, "logits/rejected": -2.485348701477051, "logps/chosen": -3.014070510864258, "logps/rejected": -3.5698540210723877, "loss": 2.7535, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.140705108642578, "rewards/margins": 5.557833671569824, "rewards/rejected": -35.69853973388672, "step": 23715 }, { "epoch": 0.7994876807442112, "grad_norm": 18.36795425415039, "learning_rate": 1.1755399353955719e-07, "logits/chosen": -2.0299389362335205, "logits/rejected": -2.1144871711730957, "logps/chosen": -2.5242087841033936, "logps/rejected": -2.5602338314056396, "loss": 4.3361, "rewards/accuracies": 0.5, "rewards/chosen": -25.24208641052246, "rewards/margins": 0.36025285720825195, "rewards/rejected": -25.602340698242188, "step": 23720 }, { "epoch": 0.7996562068151943, "grad_norm": 22.82927131652832, "learning_rate": 1.1736459042538744e-07, "logits/chosen": -2.183875799179077, "logits/rejected": -2.2232208251953125, "logps/chosen": -2.3259165287017822, "logps/rejected": -2.5631721019744873, "loss": 2.9509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.259164810180664, "rewards/margins": 2.3725597858428955, "rewards/rejected": -25.631725311279297, "step": 23725 }, { "epoch": 0.7998247328861775, "grad_norm": 42.674198150634766, "learning_rate": 1.1717531972625766e-07, "logits/chosen": -1.6192405223846436, "logits/rejected": -1.6916412115097046, "logps/chosen": -1.889173150062561, "logps/rejected": -1.9974628686904907, "loss": 3.0027, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -18.891727447509766, "rewards/margins": 1.082900047302246, "rewards/rejected": -19.974628448486328, "step": 23730 }, { "epoch": 0.7999932589571607, "grad_norm": 29.702539443969727, "learning_rate": 1.1698618150766703e-07, "logits/chosen": -1.8145778179168701, "logits/rejected": -2.477713108062744, "logps/chosen": -2.2364907264709473, "logps/rejected": -2.771986484527588, "loss": 1.8934, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.36490821838379, "rewards/margins": 5.354956150054932, "rewards/rejected": -27.719863891601562, "step": 23735 }, { "epoch": 0.8001617850281438, "grad_norm": 117.44536590576172, "learning_rate": 1.1679717583506887e-07, "logits/chosen": -2.1266894340515137, "logits/rejected": -2.262590169906616, "logps/chosen": -2.91984224319458, "logps/rejected": -2.8861470222473145, "loss": 4.2483, "rewards/accuracies": 0.5, "rewards/chosen": -29.198421478271484, "rewards/margins": -0.33695316314697266, "rewards/rejected": -28.861469268798828, "step": 23740 }, { "epoch": 0.800330311099127, "grad_norm": 27.362186431884766, "learning_rate": 1.1660830277387057e-07, "logits/chosen": -1.4639588594436646, "logits/rejected": -1.4785763025283813, "logps/chosen": -2.717275619506836, "logps/rejected": -2.8771772384643555, "loss": 3.1879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.17275619506836, "rewards/margins": 1.5990158319473267, "rewards/rejected": -28.771770477294922, "step": 23745 }, { "epoch": 0.8004988371701102, "grad_norm": 36.77997589111328, "learning_rate": 1.1641956238943374e-07, "logits/chosen": -2.3897011280059814, "logits/rejected": -2.4779114723205566, "logps/chosen": -2.4858672618865967, "logps/rejected": -3.1868858337402344, "loss": 2.9593, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.858673095703125, "rewards/margins": 7.010186672210693, "rewards/rejected": -31.868860244750977, "step": 23750 }, { "epoch": 0.8006673632410934, "grad_norm": 48.805233001708984, "learning_rate": 1.1623095474707384e-07, "logits/chosen": -2.0982353687286377, "logits/rejected": -2.279383897781372, "logps/chosen": -2.692783832550049, "logps/rejected": -2.985405206680298, "loss": 2.3231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.927837371826172, "rewards/margins": 2.926213502883911, "rewards/rejected": -29.854053497314453, "step": 23755 }, { "epoch": 0.8008358893120766, "grad_norm": 32.98495101928711, "learning_rate": 1.160424799120605e-07, "logits/chosen": -1.9733015298843384, "logits/rejected": -2.725742816925049, "logps/chosen": -2.0132980346679688, "logps/rejected": -2.7367465496063232, "loss": 0.4751, "rewards/accuracies": 1.0, "rewards/chosen": -20.132980346679688, "rewards/margins": 7.234484672546387, "rewards/rejected": -27.36746597290039, "step": 23760 }, { "epoch": 0.8010044153830598, "grad_norm": 9.345693588256836, "learning_rate": 1.1585413794961763e-07, "logits/chosen": -2.085667848587036, "logits/rejected": -2.239039659500122, "logps/chosen": -2.438840866088867, "logps/rejected": -3.195539712905884, "loss": 1.1036, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.388408660888672, "rewards/margins": 7.566989898681641, "rewards/rejected": -31.955394744873047, "step": 23765 }, { "epoch": 0.8011729414540429, "grad_norm": 45.57161331176758, "learning_rate": 1.1566592892492299e-07, "logits/chosen": -1.311514139175415, "logits/rejected": -1.605931043624878, "logps/chosen": -2.6677563190460205, "logps/rejected": -2.8092823028564453, "loss": 2.9337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.677562713623047, "rewards/margins": 1.415259599685669, "rewards/rejected": -28.092823028564453, "step": 23770 }, { "epoch": 0.8013414675250261, "grad_norm": 34.057655334472656, "learning_rate": 1.1547785290310801e-07, "logits/chosen": -0.9410927891731262, "logits/rejected": -1.327196717262268, "logps/chosen": -3.036648988723755, "logps/rejected": -3.29571533203125, "loss": 3.1449, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.366491317749023, "rewards/margins": 2.5906639099121094, "rewards/rejected": -32.9571533203125, "step": 23775 }, { "epoch": 0.8015099935960093, "grad_norm": 61.107078552246094, "learning_rate": 1.1528990994925864e-07, "logits/chosen": -1.8493750095367432, "logits/rejected": -2.0228495597839355, "logps/chosen": -3.3258419036865234, "logps/rejected": -3.1576480865478516, "loss": 6.4411, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.2584228515625, "rewards/margins": -1.6819394826889038, "rewards/rejected": -31.576480865478516, "step": 23780 }, { "epoch": 0.8016785196669924, "grad_norm": 46.79604721069336, "learning_rate": 1.1510210012841454e-07, "logits/chosen": -0.7938812375068665, "logits/rejected": -0.8427656292915344, "logps/chosen": -2.245758533477783, "logps/rejected": -2.696263551712036, "loss": 2.7041, "rewards/accuracies": 0.5, "rewards/chosen": -22.457584381103516, "rewards/margins": 4.505049705505371, "rewards/rejected": -26.962635040283203, "step": 23785 }, { "epoch": 0.8018470457379757, "grad_norm": 10.535088539123535, "learning_rate": 1.1491442350556913e-07, "logits/chosen": -1.5898624658584595, "logits/rejected": -1.7450120449066162, "logps/chosen": -2.6972622871398926, "logps/rejected": -2.9257194995880127, "loss": 1.6859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.972620010375977, "rewards/margins": 2.2845757007598877, "rewards/rejected": -29.257198333740234, "step": 23790 }, { "epoch": 0.8020155718089589, "grad_norm": 22.262107849121094, "learning_rate": 1.1472688014566994e-07, "logits/chosen": -2.0982842445373535, "logits/rejected": -2.317521095275879, "logps/chosen": -3.2699222564697266, "logps/rejected": -3.510211944580078, "loss": 4.1141, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.69922637939453, "rewards/margins": 2.4028992652893066, "rewards/rejected": -35.10212326049805, "step": 23795 }, { "epoch": 0.802184097879942, "grad_norm": 19.394956588745117, "learning_rate": 1.1453947011361837e-07, "logits/chosen": -1.8001686334609985, "logits/rejected": -2.0557053089141846, "logps/chosen": -2.1141293048858643, "logps/rejected": -2.363154888153076, "loss": 2.1705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.141294479370117, "rewards/margins": 2.490255832672119, "rewards/rejected": -23.631549835205078, "step": 23800 }, { "epoch": 0.8023526239509252, "grad_norm": 33.673126220703125, "learning_rate": 1.1435219347426983e-07, "logits/chosen": -1.997754693031311, "logits/rejected": -2.667611598968506, "logps/chosen": -2.3104987144470215, "logps/rejected": -3.8297152519226074, "loss": 1.729, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.10498809814453, "rewards/margins": 15.192166328430176, "rewards/rejected": -38.297157287597656, "step": 23805 }, { "epoch": 0.8025211500219084, "grad_norm": 46.4930534362793, "learning_rate": 1.1416505029243307e-07, "logits/chosen": -1.2600901126861572, "logits/rejected": -1.6382179260253906, "logps/chosen": -2.0742053985595703, "logps/rejected": -2.07768177986145, "loss": 3.2559, "rewards/accuracies": 0.5, "rewards/chosen": -20.742053985595703, "rewards/margins": 0.03476228564977646, "rewards/rejected": -20.776817321777344, "step": 23810 }, { "epoch": 0.8026896760928915, "grad_norm": 20.000106811523438, "learning_rate": 1.1397804063287109e-07, "logits/chosen": -1.8872020244598389, "logits/rejected": -2.278292179107666, "logps/chosen": -2.424549102783203, "logps/rejected": -2.9823575019836426, "loss": 2.7308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.2454891204834, "rewards/margins": 5.578084468841553, "rewards/rejected": -29.82357406616211, "step": 23815 }, { "epoch": 0.8028582021638747, "grad_norm": 77.44605255126953, "learning_rate": 1.1379116456030074e-07, "logits/chosen": -1.7925255298614502, "logits/rejected": -1.7629890441894531, "logps/chosen": -2.2893612384796143, "logps/rejected": -2.439579486846924, "loss": 2.0319, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.89361000061035, "rewards/margins": 1.5021837949752808, "rewards/rejected": -24.395793914794922, "step": 23820 }, { "epoch": 0.8030267282348579, "grad_norm": 43.680755615234375, "learning_rate": 1.1360442213939215e-07, "logits/chosen": -1.9656673669815063, "logits/rejected": -2.002274990081787, "logps/chosen": -2.452765941619873, "logps/rejected": -2.5063955783843994, "loss": 4.3844, "rewards/accuracies": 0.5, "rewards/chosen": -24.52766227722168, "rewards/margins": 0.5362951159477234, "rewards/rejected": -25.063955307006836, "step": 23825 }, { "epoch": 0.8031952543058412, "grad_norm": 58.82378005981445, "learning_rate": 1.1341781343476969e-07, "logits/chosen": -2.0588455200195312, "logits/rejected": -2.2516112327575684, "logps/chosen": -2.668566942214966, "logps/rejected": -2.5644493103027344, "loss": 4.2421, "rewards/accuracies": 0.5, "rewards/chosen": -26.685672760009766, "rewards/margins": -1.0411770343780518, "rewards/rejected": -25.64449691772461, "step": 23830 }, { "epoch": 0.8033637803768243, "grad_norm": 20.719966888427734, "learning_rate": 1.132313385110113e-07, "logits/chosen": -1.1614582538604736, "logits/rejected": -1.6384683847427368, "logps/chosen": -2.3579354286193848, "logps/rejected": -2.794785499572754, "loss": 0.9973, "rewards/accuracies": 1.0, "rewards/chosen": -23.579357147216797, "rewards/margins": 4.368496417999268, "rewards/rejected": -27.94785499572754, "step": 23835 }, { "epoch": 0.8035323064478075, "grad_norm": 256.3408203125, "learning_rate": 1.1304499743264867e-07, "logits/chosen": -2.2006125450134277, "logits/rejected": -2.2175331115722656, "logps/chosen": -3.5510592460632324, "logps/rejected": -3.5691275596618652, "loss": 4.0592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.510597229003906, "rewards/margins": 0.18068209290504456, "rewards/rejected": -35.69127655029297, "step": 23840 }, { "epoch": 0.8037008325187907, "grad_norm": 39.30622863769531, "learning_rate": 1.1285879026416689e-07, "logits/chosen": -1.4549888372421265, "logits/rejected": -1.4136347770690918, "logps/chosen": -2.2924153804779053, "logps/rejected": -2.366925001144409, "loss": 2.9058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.924156188964844, "rewards/margins": 0.7450965046882629, "rewards/rejected": -23.66925048828125, "step": 23845 }, { "epoch": 0.8038693585897738, "grad_norm": 30.57029151916504, "learning_rate": 1.1267271707000509e-07, "logits/chosen": -1.4190336465835571, "logits/rejected": -1.3612552881240845, "logps/chosen": -2.9762558937072754, "logps/rejected": -3.1376760005950928, "loss": 2.493, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.762561798095703, "rewards/margins": 1.6142009496688843, "rewards/rejected": -31.376760482788086, "step": 23850 }, { "epoch": 0.804037884660757, "grad_norm": 34.12413787841797, "learning_rate": 1.124867779145559e-07, "logits/chosen": -1.8991806507110596, "logits/rejected": -1.9499584436416626, "logps/chosen": -2.0986106395721436, "logps/rejected": -2.1971726417541504, "loss": 2.7685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.986108779907227, "rewards/margins": 0.985619068145752, "rewards/rejected": -21.97172737121582, "step": 23855 }, { "epoch": 0.8042064107317402, "grad_norm": 44.969398498535156, "learning_rate": 1.1230097286216539e-07, "logits/chosen": -1.7429790496826172, "logits/rejected": -1.8025766611099243, "logps/chosen": -2.431539297103882, "logps/rejected": -2.4856417179107666, "loss": 3.4906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.31539535522461, "rewards/margins": 0.5410224199295044, "rewards/rejected": -24.856416702270508, "step": 23860 }, { "epoch": 0.8043749368027234, "grad_norm": 257.18902587890625, "learning_rate": 1.1211530197713337e-07, "logits/chosen": -1.8138539791107178, "logits/rejected": -1.8681204319000244, "logps/chosen": -2.7075095176696777, "logps/rejected": -2.7071266174316406, "loss": 5.5266, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.075098037719727, "rewards/margins": -0.0038280487060546875, "rewards/rejected": -27.071269989013672, "step": 23865 }, { "epoch": 0.8045434628737066, "grad_norm": 23.051807403564453, "learning_rate": 1.1192976532371334e-07, "logits/chosen": -1.281162142753601, "logits/rejected": -1.1859794855117798, "logps/chosen": -2.280212879180908, "logps/rejected": -2.2272915840148926, "loss": 3.8509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.802125930786133, "rewards/margins": -0.5292104482650757, "rewards/rejected": -22.27291488647461, "step": 23870 }, { "epoch": 0.8047119889446898, "grad_norm": 20.657026290893555, "learning_rate": 1.1174436296611212e-07, "logits/chosen": -1.8478273153305054, "logits/rejected": -1.8438236713409424, "logps/chosen": -3.1947686672210693, "logps/rejected": -3.2079625129699707, "loss": 4.3144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.94768714904785, "rewards/margins": 0.13193626701831818, "rewards/rejected": -32.07962417602539, "step": 23875 }, { "epoch": 0.8048805150156729, "grad_norm": 45.678001403808594, "learning_rate": 1.1155909496849026e-07, "logits/chosen": -1.617417335510254, "logits/rejected": -1.949262022972107, "logps/chosen": -2.0899531841278076, "logps/rejected": -2.289635419845581, "loss": 2.7244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.899532318115234, "rewards/margins": 1.9968221187591553, "rewards/rejected": -22.89635467529297, "step": 23880 }, { "epoch": 0.8050490410866561, "grad_norm": 34.41527557373047, "learning_rate": 1.1137396139496164e-07, "logits/chosen": -1.926828145980835, "logits/rejected": -1.8516514301300049, "logps/chosen": -2.007347583770752, "logps/rejected": -2.1980199813842773, "loss": 2.5529, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.073474884033203, "rewards/margins": 1.9067256450653076, "rewards/rejected": -21.980199813842773, "step": 23885 }, { "epoch": 0.8052175671576393, "grad_norm": 85.15401458740234, "learning_rate": 1.111889623095939e-07, "logits/chosen": -1.903693437576294, "logits/rejected": -2.0786874294281006, "logps/chosen": -2.0593502521514893, "logps/rejected": -2.3961517810821533, "loss": 2.7698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.593502044677734, "rewards/margins": 3.3680152893066406, "rewards/rejected": -23.961519241333008, "step": 23890 }, { "epoch": 0.8053860932286224, "grad_norm": 24.764240264892578, "learning_rate": 1.1100409777640762e-07, "logits/chosen": -1.818833351135254, "logits/rejected": -1.9280961751937866, "logps/chosen": -2.0653393268585205, "logps/rejected": -2.341346263885498, "loss": 1.8859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.653390884399414, "rewards/margins": 2.760075330734253, "rewards/rejected": -23.413467407226562, "step": 23895 }, { "epoch": 0.8055546192996057, "grad_norm": 29.278295516967773, "learning_rate": 1.1081936785937724e-07, "logits/chosen": -1.7818940877914429, "logits/rejected": -1.3903292417526245, "logps/chosen": -2.088256597518921, "logps/rejected": -2.1572296619415283, "loss": 3.4176, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.882568359375, "rewards/margins": 0.6897293329238892, "rewards/rejected": -21.572296142578125, "step": 23900 }, { "epoch": 0.8057231453705889, "grad_norm": 67.3364486694336, "learning_rate": 1.106347726224306e-07, "logits/chosen": -2.0864202976226807, "logits/rejected": -1.8650718927383423, "logps/chosen": -2.2361273765563965, "logps/rejected": -2.6703240871429443, "loss": 2.9271, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.36127281188965, "rewards/margins": 4.341969966888428, "rewards/rejected": -26.7032413482666, "step": 23905 }, { "epoch": 0.805891671441572, "grad_norm": 31.025312423706055, "learning_rate": 1.1045031212944884e-07, "logits/chosen": -1.1579580307006836, "logits/rejected": -1.2433195114135742, "logps/chosen": -2.173211097717285, "logps/rejected": -2.2033867835998535, "loss": 3.1625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.73211097717285, "rewards/margins": 0.30175837874412537, "rewards/rejected": -22.033870697021484, "step": 23910 }, { "epoch": 0.8060601975125552, "grad_norm": 30.541427612304688, "learning_rate": 1.1026598644426632e-07, "logits/chosen": -1.7815994024276733, "logits/rejected": -1.8972110748291016, "logps/chosen": -2.0432934761047363, "logps/rejected": -2.1355528831481934, "loss": 2.3722, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.432933807373047, "rewards/margins": 0.9225980043411255, "rewards/rejected": -21.355531692504883, "step": 23915 }, { "epoch": 0.8062287235835384, "grad_norm": 19.195152282714844, "learning_rate": 1.1008179563067093e-07, "logits/chosen": -2.0765273571014404, "logits/rejected": -2.0552916526794434, "logps/chosen": -3.042235851287842, "logps/rejected": -3.305041790008545, "loss": 2.5792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.422359466552734, "rewards/margins": 2.6280593872070312, "rewards/rejected": -33.050418853759766, "step": 23920 }, { "epoch": 0.8063972496545215, "grad_norm": 21.681163787841797, "learning_rate": 1.0989773975240412e-07, "logits/chosen": -1.1462697982788086, "logits/rejected": -1.4292978048324585, "logps/chosen": -2.134056568145752, "logps/rejected": -2.217719793319702, "loss": 2.7037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.340566635131836, "rewards/margins": 0.8366325497627258, "rewards/rejected": -22.17719841003418, "step": 23925 }, { "epoch": 0.8065657757255047, "grad_norm": 10.539362907409668, "learning_rate": 1.0971381887316e-07, "logits/chosen": -1.6412235498428345, "logits/rejected": -2.147385835647583, "logps/chosen": -2.3741135597229004, "logps/rejected": -2.6469130516052246, "loss": 3.3071, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.741130828857422, "rewards/margins": 2.7279977798461914, "rewards/rejected": -26.469131469726562, "step": 23930 }, { "epoch": 0.8067343017964879, "grad_norm": 35.09379577636719, "learning_rate": 1.0953003305658648e-07, "logits/chosen": -1.7946780920028687, "logits/rejected": -1.9496517181396484, "logps/chosen": -2.2903881072998047, "logps/rejected": -2.7255663871765137, "loss": 2.1984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.903881072998047, "rewards/margins": 4.351784706115723, "rewards/rejected": -27.255664825439453, "step": 23935 }, { "epoch": 0.8069028278674711, "grad_norm": 73.50012969970703, "learning_rate": 1.0934638236628463e-07, "logits/chosen": -1.9948482513427734, "logits/rejected": -2.410553455352783, "logps/chosen": -2.9056334495544434, "logps/rejected": -3.6007399559020996, "loss": 4.061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.05633544921875, "rewards/margins": 6.951064109802246, "rewards/rejected": -36.00739669799805, "step": 23940 }, { "epoch": 0.8070713539384543, "grad_norm": 11.287801742553711, "learning_rate": 1.0916286686580884e-07, "logits/chosen": -1.756542444229126, "logits/rejected": -1.8862693309783936, "logps/chosen": -2.3044886589050293, "logps/rejected": -2.6915547847747803, "loss": 3.0784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.04488754272461, "rewards/margins": 3.8706603050231934, "rewards/rejected": -26.91554832458496, "step": 23945 }, { "epoch": 0.8072398800094375, "grad_norm": 24.094289779663086, "learning_rate": 1.0897948661866636e-07, "logits/chosen": -1.5406244993209839, "logits/rejected": -2.3224754333496094, "logps/chosen": -1.9037678241729736, "logps/rejected": -2.257594585418701, "loss": 1.1585, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.037675857543945, "rewards/margins": 3.53826904296875, "rewards/rejected": -22.575944900512695, "step": 23950 }, { "epoch": 0.8074084060804206, "grad_norm": 22.659090042114258, "learning_rate": 1.0879624168831792e-07, "logits/chosen": -1.7213237285614014, "logits/rejected": -2.021697521209717, "logps/chosen": -2.254326105117798, "logps/rejected": -2.6272685527801514, "loss": 1.8801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.543262481689453, "rewards/margins": 3.7294254302978516, "rewards/rejected": -26.272686004638672, "step": 23955 }, { "epoch": 0.8075769321514038, "grad_norm": 33.168418884277344, "learning_rate": 1.0861313213817758e-07, "logits/chosen": -2.592442035675049, "logits/rejected": -2.4539074897766113, "logps/chosen": -2.8346259593963623, "logps/rejected": -3.7187466621398926, "loss": 1.586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.34625816345215, "rewards/margins": 8.841202735900879, "rewards/rejected": -37.187461853027344, "step": 23960 }, { "epoch": 0.807745458222387, "grad_norm": 21.68806266784668, "learning_rate": 1.0843015803161204e-07, "logits/chosen": -1.7804466485977173, "logits/rejected": -2.094736337661743, "logps/chosen": -2.116252899169922, "logps/rejected": -2.531083106994629, "loss": 1.1304, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.16252899169922, "rewards/margins": 4.148301124572754, "rewards/rejected": -25.31083106994629, "step": 23965 }, { "epoch": 0.8079139842933701, "grad_norm": 26.970895767211914, "learning_rate": 1.0824731943194154e-07, "logits/chosen": -2.08791184425354, "logits/rejected": -2.282702922821045, "logps/chosen": -1.9778106212615967, "logps/rejected": -2.2054924964904785, "loss": 2.0294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.778106689453125, "rewards/margins": 2.276818037033081, "rewards/rejected": -22.0549259185791, "step": 23970 }, { "epoch": 0.8080825103643534, "grad_norm": 25.497480392456055, "learning_rate": 1.0806461640243941e-07, "logits/chosen": -1.6113965511322021, "logits/rejected": -1.5287015438079834, "logps/chosen": -2.3509488105773926, "logps/rejected": -2.640751361846924, "loss": 1.7576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.509485244750977, "rewards/margins": 2.8980259895324707, "rewards/rejected": -26.40751075744629, "step": 23975 }, { "epoch": 0.8082510364353366, "grad_norm": 57.712249755859375, "learning_rate": 1.0788204900633196e-07, "logits/chosen": -1.449377417564392, "logits/rejected": -1.6721227169036865, "logps/chosen": -2.294976234436035, "logps/rejected": -2.4277760982513428, "loss": 3.0375, "rewards/accuracies": 0.5, "rewards/chosen": -22.94976234436035, "rewards/margins": 1.3279987573623657, "rewards/rejected": -24.277761459350586, "step": 23980 }, { "epoch": 0.8084195625063197, "grad_norm": 17.932680130004883, "learning_rate": 1.0769961730679844e-07, "logits/chosen": -1.9043604135513306, "logits/rejected": -2.0780739784240723, "logps/chosen": -2.340055227279663, "logps/rejected": -2.5520808696746826, "loss": 3.3177, "rewards/accuracies": 0.5, "rewards/chosen": -23.400554656982422, "rewards/margins": 2.120255708694458, "rewards/rejected": -25.520809173583984, "step": 23985 }, { "epoch": 0.8085880885773029, "grad_norm": 29.642797470092773, "learning_rate": 1.0751732136697134e-07, "logits/chosen": -1.8419221639633179, "logits/rejected": -2.1341745853424072, "logps/chosen": -2.4016568660736084, "logps/rejected": -2.761948347091675, "loss": 2.234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.01656723022461, "rewards/margins": 3.6029136180877686, "rewards/rejected": -27.619482040405273, "step": 23990 }, { "epoch": 0.8087566146482861, "grad_norm": 34.91474151611328, "learning_rate": 1.0733516124993625e-07, "logits/chosen": -2.087256908416748, "logits/rejected": -2.281942129135132, "logps/chosen": -2.0903306007385254, "logps/rejected": -2.9100751876831055, "loss": 1.656, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.903305053710938, "rewards/margins": 8.1974458694458, "rewards/rejected": -29.100749969482422, "step": 23995 }, { "epoch": 0.8089251407192692, "grad_norm": 30.762310028076172, "learning_rate": 1.0715313701873135e-07, "logits/chosen": -1.854391098022461, "logits/rejected": -2.0410971641540527, "logps/chosen": -2.2059502601623535, "logps/rejected": -2.372504711151123, "loss": 3.9705, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.05950164794922, "rewards/margins": 1.6655467748641968, "rewards/rejected": -23.725046157836914, "step": 24000 }, { "epoch": 0.8089251407192692, "eval_logits/chosen": -2.2840046882629395, "eval_logits/rejected": -2.4597790241241455, "eval_logps/chosen": -2.2764413356781006, "eval_logps/rejected": -2.4307594299316406, "eval_loss": 3.080594539642334, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.764413833618164, "eval_rewards/margins": 1.5431796312332153, "eval_rewards/rejected": -24.307592391967773, "eval_runtime": 12.8927, "eval_samples_per_second": 7.756, "eval_steps_per_second": 1.939, "step": 24000 }, { "epoch": 0.8090936667902524, "grad_norm": 76.99275970458984, "learning_rate": 1.0697124873634816e-07, "logits/chosen": -1.6733137369155884, "logits/rejected": -1.85502028465271, "logps/chosen": -2.536296844482422, "logps/rejected": -2.6223788261413574, "loss": 2.8727, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.36296844482422, "rewards/margins": 0.8608211278915405, "rewards/rejected": -26.22378921508789, "step": 24005 }, { "epoch": 0.8092621928612357, "grad_norm": 25.34918212890625, "learning_rate": 1.0678949646573104e-07, "logits/chosen": -2.3985438346862793, "logits/rejected": -2.3265886306762695, "logps/chosen": -2.819058656692505, "logps/rejected": -2.8983523845672607, "loss": 4.5749, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.190587997436523, "rewards/margins": 0.7929363250732422, "rewards/rejected": -28.983524322509766, "step": 24010 }, { "epoch": 0.8094307189322189, "grad_norm": 51.82828903198242, "learning_rate": 1.0660788026977735e-07, "logits/chosen": -2.010450839996338, "logits/rejected": -1.9830372333526611, "logps/chosen": -2.299954891204834, "logps/rejected": -2.283881664276123, "loss": 3.8194, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.99955177307129, "rewards/margins": -0.16073504090309143, "rewards/rejected": -22.838817596435547, "step": 24015 }, { "epoch": 0.809599245003202, "grad_norm": 3.4491851329803467, "learning_rate": 1.0642640021133742e-07, "logits/chosen": -1.538646936416626, "logits/rejected": -1.7420097589492798, "logps/chosen": -3.4108078479766846, "logps/rejected": -4.116189479827881, "loss": 0.9612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -34.10807418823242, "rewards/margins": 7.053819179534912, "rewards/rejected": -41.161895751953125, "step": 24020 }, { "epoch": 0.8097677710741852, "grad_norm": 9.866703033447266, "learning_rate": 1.0624505635321406e-07, "logits/chosen": -2.0352749824523926, "logits/rejected": -2.267897129058838, "logps/chosen": -2.5204594135284424, "logps/rejected": -3.058020830154419, "loss": 1.6614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.20458984375, "rewards/margins": 5.375618934631348, "rewards/rejected": -30.580211639404297, "step": 24025 }, { "epoch": 0.8099362971451683, "grad_norm": 63.24565124511719, "learning_rate": 1.0606384875816332e-07, "logits/chosen": -2.1843369007110596, "logits/rejected": -2.30873966217041, "logps/chosen": -2.49751615524292, "logps/rejected": -2.7526285648345947, "loss": 3.4675, "rewards/accuracies": 0.5, "rewards/chosen": -24.975162506103516, "rewards/margins": 2.5511229038238525, "rewards/rejected": -27.526287078857422, "step": 24030 }, { "epoch": 0.8101048232161515, "grad_norm": 35.56220245361328, "learning_rate": 1.0588277748889412e-07, "logits/chosen": -1.576695442199707, "logits/rejected": -1.7100387811660767, "logps/chosen": -2.1149020195007324, "logps/rejected": -2.810107707977295, "loss": 2.6821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.14902114868164, "rewards/margins": 6.952054500579834, "rewards/rejected": -28.10107421875, "step": 24035 }, { "epoch": 0.8102733492871347, "grad_norm": 37.3370246887207, "learning_rate": 1.0570184260806802e-07, "logits/chosen": -1.415395975112915, "logits/rejected": -1.6331355571746826, "logps/chosen": -2.5750272274017334, "logps/rejected": -2.3799424171447754, "loss": 5.4584, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.750274658203125, "rewards/margins": -1.9508476257324219, "rewards/rejected": -23.799427032470703, "step": 24040 }, { "epoch": 0.8104418753581178, "grad_norm": 47.88873291015625, "learning_rate": 1.0552104417829944e-07, "logits/chosen": -1.9850772619247437, "logits/rejected": -2.4515490531921387, "logps/chosen": -2.7267796993255615, "logps/rejected": -3.2105274200439453, "loss": 1.6153, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.26779556274414, "rewards/margins": 4.8374786376953125, "rewards/rejected": -32.10527801513672, "step": 24045 }, { "epoch": 0.8106104014291011, "grad_norm": 29.122587203979492, "learning_rate": 1.0534038226215574e-07, "logits/chosen": -2.318523406982422, "logits/rejected": -2.468736410140991, "logps/chosen": -3.361487865447998, "logps/rejected": -3.160301923751831, "loss": 6.1576, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -33.6148796081543, "rewards/margins": -2.0118582248687744, "rewards/rejected": -31.603023529052734, "step": 24050 }, { "epoch": 0.8107789275000843, "grad_norm": 80.43611907958984, "learning_rate": 1.0515985692215667e-07, "logits/chosen": -2.533133029937744, "logits/rejected": -2.5773842334747314, "logps/chosen": -3.062659740447998, "logps/rejected": -3.4902923107147217, "loss": 2.0103, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.626602172851562, "rewards/margins": 4.2763237953186035, "rewards/rejected": -34.902923583984375, "step": 24055 }, { "epoch": 0.8109474535710675, "grad_norm": 33.907955169677734, "learning_rate": 1.0497946822077503e-07, "logits/chosen": -2.0659396648406982, "logits/rejected": -2.092923641204834, "logps/chosen": -2.0031232833862305, "logps/rejected": -2.340515375137329, "loss": 3.295, "rewards/accuracies": 0.5, "rewards/chosen": -20.031230926513672, "rewards/margins": 3.37391996383667, "rewards/rejected": -23.405153274536133, "step": 24060 }, { "epoch": 0.8111159796420506, "grad_norm": 6.4482102394104, "learning_rate": 1.0479921622043642e-07, "logits/chosen": -2.037416934967041, "logits/rejected": -2.331026792526245, "logps/chosen": -2.0453298091888428, "logps/rejected": -3.061678409576416, "loss": 1.5471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.453298568725586, "rewards/margins": 10.163487434387207, "rewards/rejected": -30.616785049438477, "step": 24065 }, { "epoch": 0.8112845057130338, "grad_norm": 0.9323909878730774, "learning_rate": 1.0461910098351862e-07, "logits/chosen": -1.7518703937530518, "logits/rejected": -2.306114435195923, "logps/chosen": -2.4827165603637695, "logps/rejected": -3.222071886062622, "loss": 1.9647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.827163696289062, "rewards/margins": 7.3935546875, "rewards/rejected": -32.22071838378906, "step": 24070 }, { "epoch": 0.811453031784017, "grad_norm": 28.054025650024414, "learning_rate": 1.044391225723526e-07, "logits/chosen": -2.0947256088256836, "logits/rejected": -2.361001491546631, "logps/chosen": -3.1201844215393066, "logps/rejected": -3.464200258255005, "loss": 1.8679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.201847076416016, "rewards/margins": 3.4401588439941406, "rewards/rejected": -34.642005920410156, "step": 24075 }, { "epoch": 0.8116215578550001, "grad_norm": 19.00840187072754, "learning_rate": 1.0425928104922171e-07, "logits/chosen": -1.4683290719985962, "logits/rejected": -1.6970717906951904, "logps/chosen": -2.109851360321045, "logps/rejected": -2.150763750076294, "loss": 3.2336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.098514556884766, "rewards/margins": 0.40912121534347534, "rewards/rejected": -21.50763511657715, "step": 24080 }, { "epoch": 0.8117900839259834, "grad_norm": 36.81915283203125, "learning_rate": 1.0407957647636229e-07, "logits/chosen": -2.2459583282470703, "logits/rejected": -2.238729953765869, "logps/chosen": -2.171417713165283, "logps/rejected": -2.372509002685547, "loss": 2.351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.71417808532715, "rewards/margins": 2.010913372039795, "rewards/rejected": -23.725093841552734, "step": 24085 }, { "epoch": 0.8119586099969666, "grad_norm": 48.59904861450195, "learning_rate": 1.0390000891596268e-07, "logits/chosen": -1.8941738605499268, "logits/rejected": -2.4483718872070312, "logps/chosen": -1.977226972579956, "logps/rejected": -2.3452813625335693, "loss": 2.1481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.77227210998535, "rewards/margins": 3.6805419921875, "rewards/rejected": -23.45281410217285, "step": 24090 }, { "epoch": 0.8121271360679497, "grad_norm": 25.916887283325195, "learning_rate": 1.0372057843016424e-07, "logits/chosen": -1.7557846307754517, "logits/rejected": -2.196135997772217, "logps/chosen": -2.585275650024414, "logps/rejected": -3.183326005935669, "loss": 2.5426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.852752685546875, "rewards/margins": 5.980503082275391, "rewards/rejected": -31.8332576751709, "step": 24095 }, { "epoch": 0.8122956621389329, "grad_norm": 30.466371536254883, "learning_rate": 1.0354128508106098e-07, "logits/chosen": -2.0006163120269775, "logits/rejected": -2.707191228866577, "logps/chosen": -2.314324140548706, "logps/rejected": -2.785937547683716, "loss": 2.3359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.14324188232422, "rewards/margins": 4.716134071350098, "rewards/rejected": -27.859375, "step": 24100 }, { "epoch": 0.8124641882099161, "grad_norm": 94.53244018554688, "learning_rate": 1.0336212893069895e-07, "logits/chosen": -2.5970489978790283, "logits/rejected": -2.78641939163208, "logps/chosen": -3.6945462226867676, "logps/rejected": -3.814669132232666, "loss": 6.3957, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -36.94546127319336, "rewards/margins": 1.2012335062026978, "rewards/rejected": -38.14669418334961, "step": 24105 }, { "epoch": 0.8126327142808992, "grad_norm": 37.557125091552734, "learning_rate": 1.0318311004107716e-07, "logits/chosen": -1.805131196975708, "logits/rejected": -1.9349620342254639, "logps/chosen": -2.4366652965545654, "logps/rejected": -2.357182264328003, "loss": 4.5073, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.366655349731445, "rewards/margins": -0.7948305010795593, "rewards/rejected": -23.571823120117188, "step": 24110 }, { "epoch": 0.8128012403518824, "grad_norm": 1.4744274616241455, "learning_rate": 1.0300422847414708e-07, "logits/chosen": -2.667433738708496, "logits/rejected": -2.4561023712158203, "logps/chosen": -3.353752851486206, "logps/rejected": -3.409027576446533, "loss": 4.8988, "rewards/accuracies": 0.5, "rewards/chosen": -33.53752899169922, "rewards/margins": 0.5527437329292297, "rewards/rejected": -34.09027862548828, "step": 24115 }, { "epoch": 0.8129697664228657, "grad_norm": 41.911800384521484, "learning_rate": 1.0282548429181265e-07, "logits/chosen": -1.0657398700714111, "logits/rejected": -1.0967696905136108, "logps/chosen": -1.9195921421051025, "logps/rejected": -2.0219099521636963, "loss": 3.1227, "rewards/accuracies": 0.5, "rewards/chosen": -19.195920944213867, "rewards/margins": 1.023177981376648, "rewards/rejected": -20.219097137451172, "step": 24120 }, { "epoch": 0.8131382924938488, "grad_norm": 22.111970901489258, "learning_rate": 1.0264687755592987e-07, "logits/chosen": -1.5700323581695557, "logits/rejected": -1.79294753074646, "logps/chosen": -2.4795994758605957, "logps/rejected": -2.7823715209960938, "loss": 1.6587, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.79599380493164, "rewards/margins": 3.027724027633667, "rewards/rejected": -27.823715209960938, "step": 24125 }, { "epoch": 0.813306818564832, "grad_norm": 86.154052734375, "learning_rate": 1.0246840832830772e-07, "logits/chosen": -1.6293919086456299, "logits/rejected": -1.7340667247772217, "logps/chosen": -2.80242657661438, "logps/rejected": -2.5946459770202637, "loss": 5.1913, "rewards/accuracies": 0.5, "rewards/chosen": -28.02426528930664, "rewards/margins": -2.077803134918213, "rewards/rejected": -25.946462631225586, "step": 24130 }, { "epoch": 0.8134753446358152, "grad_norm": 34.58270263671875, "learning_rate": 1.0229007667070743e-07, "logits/chosen": -1.5737640857696533, "logits/rejected": -1.3462755680084229, "logps/chosen": -2.6154274940490723, "logps/rejected": -2.8824868202209473, "loss": 2.1968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.15427589416504, "rewards/margins": 2.6705925464630127, "rewards/rejected": -28.82486915588379, "step": 24135 }, { "epoch": 0.8136438707067983, "grad_norm": 191.56494140625, "learning_rate": 1.0211188264484233e-07, "logits/chosen": -1.8556255102157593, "logits/rejected": -1.7768996953964233, "logps/chosen": -3.0176990032196045, "logps/rejected": -3.110203504562378, "loss": 4.2338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.176992416381836, "rewards/margins": 0.9250432848930359, "rewards/rejected": -31.102035522460938, "step": 24140 }, { "epoch": 0.8138123967777815, "grad_norm": 52.166526794433594, "learning_rate": 1.0193382631237851e-07, "logits/chosen": -1.8037277460098267, "logits/rejected": -1.9889767169952393, "logps/chosen": -2.7616565227508545, "logps/rejected": -2.6427969932556152, "loss": 5.5955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.616565704345703, "rewards/margins": -1.1885948181152344, "rewards/rejected": -26.427968978881836, "step": 24145 }, { "epoch": 0.8139809228487647, "grad_norm": 26.984853744506836, "learning_rate": 1.0175590773493431e-07, "logits/chosen": -2.0901906490325928, "logits/rejected": -1.9249852895736694, "logps/chosen": -2.3587594032287598, "logps/rejected": -2.214332342147827, "loss": 5.0239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.58759117126465, "rewards/margins": -1.4442687034606934, "rewards/rejected": -22.14332389831543, "step": 24150 }, { "epoch": 0.8141494489197478, "grad_norm": 30.049564361572266, "learning_rate": 1.0157812697408019e-07, "logits/chosen": -1.4776041507720947, "logits/rejected": -1.5003935098648071, "logps/chosen": -2.169267416000366, "logps/rejected": -2.48649001121521, "loss": 2.6431, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.692672729492188, "rewards/margins": 3.172226667404175, "rewards/rejected": -24.864900588989258, "step": 24155 }, { "epoch": 0.8143179749907311, "grad_norm": 43.715599060058594, "learning_rate": 1.0140048409133906e-07, "logits/chosen": -1.9859644174575806, "logits/rejected": -2.3302559852600098, "logps/chosen": -2.533536195755005, "logps/rejected": -2.7264256477355957, "loss": 3.4219, "rewards/accuracies": 0.5, "rewards/chosen": -25.33536148071289, "rewards/margins": 1.9288944005966187, "rewards/rejected": -27.264257431030273, "step": 24160 }, { "epoch": 0.8144865010617143, "grad_norm": 36.16670608520508, "learning_rate": 1.0122297914818623e-07, "logits/chosen": -2.3879990577697754, "logits/rejected": -2.2910168170928955, "logps/chosen": -3.131276845932007, "logps/rejected": -3.8100860118865967, "loss": 3.3137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.312768936157227, "rewards/margins": 6.788092136383057, "rewards/rejected": -38.100860595703125, "step": 24165 }, { "epoch": 0.8146550271326974, "grad_norm": 40.567203521728516, "learning_rate": 1.0104561220604913e-07, "logits/chosen": -2.137629270553589, "logits/rejected": -2.922985553741455, "logps/chosen": -2.1473476886749268, "logps/rejected": -2.918635606765747, "loss": 2.4425, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.47347640991211, "rewards/margins": 7.712876796722412, "rewards/rejected": -29.186355590820312, "step": 24170 }, { "epoch": 0.8148235532036806, "grad_norm": 183.20619201660156, "learning_rate": 1.0086838332630743e-07, "logits/chosen": -2.144543409347534, "logits/rejected": -2.3791539669036865, "logps/chosen": -2.6417734622955322, "logps/rejected": -2.595959186553955, "loss": 3.6555, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.417734146118164, "rewards/margins": -0.45814236998558044, "rewards/rejected": -25.959590911865234, "step": 24175 }, { "epoch": 0.8149920792746638, "grad_norm": 54.81478500366211, "learning_rate": 1.0069129257029313e-07, "logits/chosen": -1.75040602684021, "logits/rejected": -2.152054786682129, "logps/chosen": -2.6521573066711426, "logps/rejected": -2.500854969024658, "loss": 4.9655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.52157211303711, "rewards/margins": -1.5130245685577393, "rewards/rejected": -25.008548736572266, "step": 24180 }, { "epoch": 0.8151606053456469, "grad_norm": 25.453004837036133, "learning_rate": 1.0051433999929049e-07, "logits/chosen": -1.3161516189575195, "logits/rejected": -1.4525598287582397, "logps/chosen": -2.7192330360412598, "logps/rejected": -3.028667449951172, "loss": 1.4393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.19232749938965, "rewards/margins": 3.094343900680542, "rewards/rejected": -30.28667640686035, "step": 24185 }, { "epoch": 0.8153291314166301, "grad_norm": 17.820598602294922, "learning_rate": 1.0033752567453551e-07, "logits/chosen": -1.4301345348358154, "logits/rejected": -1.5557069778442383, "logps/chosen": -2.266484498977661, "logps/rejected": -2.372802972793579, "loss": 2.297, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.664844512939453, "rewards/margins": 1.0631874799728394, "rewards/rejected": -23.728031158447266, "step": 24190 }, { "epoch": 0.8154976574876134, "grad_norm": 114.78311157226562, "learning_rate": 1.0016084965721682e-07, "logits/chosen": -1.9529441595077515, "logits/rejected": -1.9129083156585693, "logps/chosen": -2.329055070877075, "logps/rejected": -2.5822417736053467, "loss": 3.8328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.29054832458496, "rewards/margins": 2.5318689346313477, "rewards/rejected": -25.82242202758789, "step": 24195 }, { "epoch": 0.8156661835585965, "grad_norm": 50.35905838012695, "learning_rate": 9.998431200847506e-08, "logits/chosen": -1.9478317499160767, "logits/rejected": -1.6423200368881226, "logps/chosen": -2.32122540473938, "logps/rejected": -2.347656726837158, "loss": 2.9022, "rewards/accuracies": 0.5, "rewards/chosen": -23.212251663208008, "rewards/margins": 0.26431483030319214, "rewards/rejected": -23.476566314697266, "step": 24200 }, { "epoch": 0.8158347096295797, "grad_norm": 142.4918975830078, "learning_rate": 9.980791278940304e-08, "logits/chosen": -2.4216580390930176, "logits/rejected": -2.449982166290283, "logps/chosen": -2.682478666305542, "logps/rejected": -2.609151840209961, "loss": 4.4322, "rewards/accuracies": 0.5, "rewards/chosen": -26.824787139892578, "rewards/margins": -0.7332667112350464, "rewards/rejected": -26.09151840209961, "step": 24205 }, { "epoch": 0.8160032357005629, "grad_norm": 29.56013298034668, "learning_rate": 9.963165206104529e-08, "logits/chosen": -1.9350833892822266, "logits/rejected": -1.5915058851242065, "logps/chosen": -2.3900949954986572, "logps/rejected": -1.978994607925415, "loss": 7.1794, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.90094566345215, "rewards/margins": -4.111001491546631, "rewards/rejected": -19.789945602416992, "step": 24210 }, { "epoch": 0.816171761771546, "grad_norm": 27.79477882385254, "learning_rate": 9.945552988439893e-08, "logits/chosen": -1.9351444244384766, "logits/rejected": -1.809313416481018, "logps/chosen": -3.3555362224578857, "logps/rejected": -3.6098411083221436, "loss": 2.7335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.555362701416016, "rewards/margins": 2.543045997619629, "rewards/rejected": -36.09840774536133, "step": 24215 }, { "epoch": 0.8163402878425292, "grad_norm": 33.100006103515625, "learning_rate": 9.927954632041297e-08, "logits/chosen": -1.5907175540924072, "logits/rejected": -1.7621221542358398, "logps/chosen": -1.743699312210083, "logps/rejected": -2.0336194038391113, "loss": 1.9667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.43699073791504, "rewards/margins": 2.899199962615967, "rewards/rejected": -20.336193084716797, "step": 24220 }, { "epoch": 0.8165088139135124, "grad_norm": 31.314096450805664, "learning_rate": 9.910370142998814e-08, "logits/chosen": -2.312352418899536, "logits/rejected": -2.5222764015197754, "logps/chosen": -2.5663654804229736, "logps/rejected": -3.113084554672241, "loss": 2.6303, "rewards/accuracies": 0.5, "rewards/chosen": -25.663654327392578, "rewards/margins": 5.467194557189941, "rewards/rejected": -31.130847930908203, "step": 24225 }, { "epoch": 0.8166773399844957, "grad_norm": 52.19409942626953, "learning_rate": 9.892799527397755e-08, "logits/chosen": -1.5479198694229126, "logits/rejected": -1.8709392547607422, "logps/chosen": -2.036996603012085, "logps/rejected": -2.095541477203369, "loss": 2.6826, "rewards/accuracies": 0.5, "rewards/chosen": -20.36996841430664, "rewards/margins": 0.5854486227035522, "rewards/rejected": -20.95541763305664, "step": 24230 }, { "epoch": 0.8168458660554788, "grad_norm": 19.438608169555664, "learning_rate": 9.875242791318622e-08, "logits/chosen": -1.9275665283203125, "logits/rejected": -2.6109588146209717, "logps/chosen": -2.3209636211395264, "logps/rejected": -3.3178200721740723, "loss": 2.6408, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.209636688232422, "rewards/margins": 9.968561172485352, "rewards/rejected": -33.178199768066406, "step": 24235 }, { "epoch": 0.817014392126462, "grad_norm": 50.3494873046875, "learning_rate": 9.857699940837116e-08, "logits/chosen": -1.8657255172729492, "logits/rejected": -2.113823413848877, "logps/chosen": -1.711350679397583, "logps/rejected": -1.7632169723510742, "loss": 2.9808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.113506317138672, "rewards/margins": 0.5186625719070435, "rewards/rejected": -17.63216781616211, "step": 24240 }, { "epoch": 0.8171829181974452, "grad_norm": 27.25106430053711, "learning_rate": 9.84017098202411e-08, "logits/chosen": -1.7077363729476929, "logits/rejected": -2.0198585987091064, "logps/chosen": -2.3715415000915527, "logps/rejected": -2.9775185585021973, "loss": 2.3465, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.715417861938477, "rewards/margins": 6.05977201461792, "rewards/rejected": -29.775188446044922, "step": 24245 }, { "epoch": 0.8173514442684283, "grad_norm": 57.114891052246094, "learning_rate": 9.822655920945689e-08, "logits/chosen": -1.7086464166641235, "logits/rejected": -1.7238966226577759, "logps/chosen": -1.6781396865844727, "logps/rejected": -1.5659065246582031, "loss": 4.1638, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -16.781396865844727, "rewards/margins": -1.122330904006958, "rewards/rejected": -15.659067153930664, "step": 24250 }, { "epoch": 0.8175199703394115, "grad_norm": 23.778400421142578, "learning_rate": 9.805154763663143e-08, "logits/chosen": -2.0748565196990967, "logits/rejected": -2.5501840114593506, "logps/chosen": -3.339693069458008, "logps/rejected": -4.343489646911621, "loss": 1.7495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -33.39692687988281, "rewards/margins": 10.037964820861816, "rewards/rejected": -43.43489456176758, "step": 24255 }, { "epoch": 0.8176884964103946, "grad_norm": 21.111228942871094, "learning_rate": 9.787667516232906e-08, "logits/chosen": -1.0510246753692627, "logits/rejected": -1.7527587413787842, "logps/chosen": -2.5654733180999756, "logps/rejected": -3.7719593048095703, "loss": 1.7378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.654733657836914, "rewards/margins": 12.064860343933105, "rewards/rejected": -37.71959686279297, "step": 24260 }, { "epoch": 0.8178570224813778, "grad_norm": 16.419200897216797, "learning_rate": 9.770194184706637e-08, "logits/chosen": -2.0978894233703613, "logits/rejected": -2.425337553024292, "logps/chosen": -2.3975110054016113, "logps/rejected": -3.037055253982544, "loss": 2.1612, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.975107192993164, "rewards/margins": 6.395442008972168, "rewards/rejected": -30.37055015563965, "step": 24265 }, { "epoch": 0.8180255485523611, "grad_norm": 38.14176940917969, "learning_rate": 9.752734775131171e-08, "logits/chosen": -1.8208221197128296, "logits/rejected": -1.9604995250701904, "logps/chosen": -1.9669945240020752, "logps/rejected": -1.8961633443832397, "loss": 3.8409, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.669946670532227, "rewards/margins": -0.7083131670951843, "rewards/rejected": -18.961633682250977, "step": 24270 }, { "epoch": 0.8181940746233443, "grad_norm": 47.77272033691406, "learning_rate": 9.735289293548537e-08, "logits/chosen": -2.3018887042999268, "logits/rejected": -2.4231314659118652, "logps/chosen": -2.951404094696045, "logps/rejected": -2.9356772899627686, "loss": 4.2997, "rewards/accuracies": 0.5, "rewards/chosen": -29.514041900634766, "rewards/margins": -0.15726737678050995, "rewards/rejected": -29.356775283813477, "step": 24275 }, { "epoch": 0.8183626006943274, "grad_norm": 1.0143815279006958, "learning_rate": 9.717857745995894e-08, "logits/chosen": -1.3401901721954346, "logits/rejected": -1.683653473854065, "logps/chosen": -2.7770957946777344, "logps/rejected": -3.4555981159210205, "loss": 1.8986, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.770959854125977, "rewards/margins": 6.785020351409912, "rewards/rejected": -34.55597686767578, "step": 24280 }, { "epoch": 0.8185311267653106, "grad_norm": 82.10999298095703, "learning_rate": 9.700440138505633e-08, "logits/chosen": -1.3875019550323486, "logits/rejected": -1.4035696983337402, "logps/chosen": -2.593018054962158, "logps/rejected": -2.754159927368164, "loss": 3.7198, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.930179595947266, "rewards/margins": 1.6114212274551392, "rewards/rejected": -27.541601181030273, "step": 24285 }, { "epoch": 0.8186996528362938, "grad_norm": 32.000885009765625, "learning_rate": 9.683036477105316e-08, "logits/chosen": -2.18599796295166, "logits/rejected": -2.2437338829040527, "logps/chosen": -2.5854225158691406, "logps/rejected": -2.7304766178131104, "loss": 3.1102, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.854223251342773, "rewards/margins": 1.4505420923233032, "rewards/rejected": -27.304767608642578, "step": 24290 }, { "epoch": 0.8188681789072769, "grad_norm": 28.451539993286133, "learning_rate": 9.665646767817636e-08, "logits/chosen": -2.0066299438476562, "logits/rejected": -2.0902113914489746, "logps/chosen": -2.9684078693389893, "logps/rejected": -3.7975857257843018, "loss": 2.2862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.6840763092041, "rewards/margins": 8.291781425476074, "rewards/rejected": -37.97585678100586, "step": 24295 }, { "epoch": 0.8190367049782601, "grad_norm": 24.359365463256836, "learning_rate": 9.648271016660503e-08, "logits/chosen": -2.2018818855285645, "logits/rejected": -2.1886439323425293, "logps/chosen": -2.38197660446167, "logps/rejected": -2.835458278656006, "loss": 2.2134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.819766998291016, "rewards/margins": 4.53481388092041, "rewards/rejected": -28.35457992553711, "step": 24300 }, { "epoch": 0.8192052310492434, "grad_norm": 19.370195388793945, "learning_rate": 9.630909229646972e-08, "logits/chosen": -1.8896777629852295, "logits/rejected": -2.023592472076416, "logps/chosen": -1.7500760555267334, "logps/rejected": -1.8328708410263062, "loss": 2.6741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.500761032104492, "rewards/margins": 0.8279494047164917, "rewards/rejected": -18.328710556030273, "step": 24305 }, { "epoch": 0.8193737571202265, "grad_norm": 30.398090362548828, "learning_rate": 9.613561412785277e-08, "logits/chosen": -2.4260175228118896, "logits/rejected": -2.715618133544922, "logps/chosen": -2.8365581035614014, "logps/rejected": -2.6541521549224854, "loss": 5.9238, "rewards/accuracies": 0.5, "rewards/chosen": -28.365581512451172, "rewards/margins": -1.8240602016448975, "rewards/rejected": -26.541522979736328, "step": 24310 }, { "epoch": 0.8195422831912097, "grad_norm": 71.60824584960938, "learning_rate": 9.596227572078819e-08, "logits/chosen": -2.237004041671753, "logits/rejected": -2.439582109451294, "logps/chosen": -2.3587868213653564, "logps/rejected": -2.8456122875213623, "loss": 1.5354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.587865829467773, "rewards/margins": 4.868255615234375, "rewards/rejected": -28.45612144470215, "step": 24315 }, { "epoch": 0.8197108092621929, "grad_norm": 24.230173110961914, "learning_rate": 9.578907713526163e-08, "logits/chosen": -2.4077305793762207, "logits/rejected": -2.8008816242218018, "logps/chosen": -1.9239914417266846, "logps/rejected": -2.4241859912872314, "loss": 2.336, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.239913940429688, "rewards/margins": 5.0019450187683105, "rewards/rejected": -24.241859436035156, "step": 24320 }, { "epoch": 0.819879335333176, "grad_norm": 17.881893157958984, "learning_rate": 9.561601843121003e-08, "logits/chosen": -2.061577320098877, "logits/rejected": -2.34552264213562, "logps/chosen": -3.029741048812866, "logps/rejected": -3.769167423248291, "loss": 1.8675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.297412872314453, "rewards/margins": 7.394262790679932, "rewards/rejected": -37.691673278808594, "step": 24325 }, { "epoch": 0.8200478614041592, "grad_norm": 45.83840560913086, "learning_rate": 9.544309966852243e-08, "logits/chosen": -1.9961233139038086, "logits/rejected": -2.096349000930786, "logps/chosen": -2.4318995475769043, "logps/rejected": -2.354107618331909, "loss": 4.9792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.318994522094727, "rewards/margins": -0.7779159545898438, "rewards/rejected": -23.54107666015625, "step": 24330 }, { "epoch": 0.8202163874751424, "grad_norm": 19.41057777404785, "learning_rate": 9.527032090703913e-08, "logits/chosen": -1.7249418497085571, "logits/rejected": -1.6345096826553345, "logps/chosen": -1.8474910259246826, "logps/rejected": -2.0146989822387695, "loss": 2.3606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.474910736083984, "rewards/margins": 1.672079086303711, "rewards/rejected": -20.146991729736328, "step": 24335 }, { "epoch": 0.8203849135461256, "grad_norm": 202.65652465820312, "learning_rate": 9.509768220655201e-08, "logits/chosen": -2.2872161865234375, "logits/rejected": -2.4015910625457764, "logps/chosen": -3.318181276321411, "logps/rejected": -3.330636501312256, "loss": 6.8898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.18181228637695, "rewards/margins": 0.12455234676599503, "rewards/rejected": -33.306365966796875, "step": 24340 }, { "epoch": 0.8205534396171088, "grad_norm": 44.25444412231445, "learning_rate": 9.492518362680469e-08, "logits/chosen": -2.0388379096984863, "logits/rejected": -2.184521436691284, "logps/chosen": -2.8452885150909424, "logps/rejected": -3.5608794689178467, "loss": 2.6052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.452880859375, "rewards/margins": 7.155913352966309, "rewards/rejected": -35.608795166015625, "step": 24345 }, { "epoch": 0.820721965688092, "grad_norm": 21.56203269958496, "learning_rate": 9.475282522749189e-08, "logits/chosen": -1.3232519626617432, "logits/rejected": -1.813665747642517, "logps/chosen": -2.3560421466827393, "logps/rejected": -2.6592459678649902, "loss": 2.2359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.560420989990234, "rewards/margins": 3.032042980194092, "rewards/rejected": -26.59246253967285, "step": 24350 }, { "epoch": 0.8208904917590751, "grad_norm": 31.2237606048584, "learning_rate": 9.458060706826021e-08, "logits/chosen": -1.8143165111541748, "logits/rejected": -1.8684632778167725, "logps/chosen": -2.4288649559020996, "logps/rejected": -2.423241376876831, "loss": 3.5141, "rewards/accuracies": 0.5, "rewards/chosen": -24.288654327392578, "rewards/margins": -0.05624275282025337, "rewards/rejected": -24.232410430908203, "step": 24355 }, { "epoch": 0.8210590178300583, "grad_norm": 14.442959785461426, "learning_rate": 9.440852920870762e-08, "logits/chosen": -2.293940305709839, "logits/rejected": -2.269188404083252, "logps/chosen": -2.117640256881714, "logps/rejected": -2.502702236175537, "loss": 2.5759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.176403045654297, "rewards/margins": 3.8506178855895996, "rewards/rejected": -25.027021408081055, "step": 24360 }, { "epoch": 0.8212275439010415, "grad_norm": 21.63381576538086, "learning_rate": 9.423659170838327e-08, "logits/chosen": -1.926725149154663, "logits/rejected": -2.1903584003448486, "logps/chosen": -2.3233580589294434, "logps/rejected": -2.667724132537842, "loss": 2.1465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.233577728271484, "rewards/margins": 3.443665027618408, "rewards/rejected": -26.677242279052734, "step": 24365 }, { "epoch": 0.8213960699720246, "grad_norm": 101.54517364501953, "learning_rate": 9.406479462678812e-08, "logits/chosen": -2.2762808799743652, "logits/rejected": -2.214491605758667, "logps/chosen": -2.777794599533081, "logps/rejected": -2.526498317718506, "loss": 6.0451, "rewards/accuracies": 0.5, "rewards/chosen": -27.7779483795166, "rewards/margins": -2.512964963912964, "rewards/rejected": -25.26498031616211, "step": 24370 }, { "epoch": 0.8215645960430078, "grad_norm": 11.986262321472168, "learning_rate": 9.389313802337434e-08, "logits/chosen": -2.0726375579833984, "logits/rejected": -2.5973610877990723, "logps/chosen": -2.276418685913086, "logps/rejected": -2.66465425491333, "loss": 1.4631, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.76418685913086, "rewards/margins": 3.882354259490967, "rewards/rejected": -26.64653968811035, "step": 24375 }, { "epoch": 0.8217331221139911, "grad_norm": 26.458959579467773, "learning_rate": 9.372162195754563e-08, "logits/chosen": -1.9591572284698486, "logits/rejected": -1.9244375228881836, "logps/chosen": -2.5298287868499756, "logps/rejected": -2.657670497894287, "loss": 2.2302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.298288345336914, "rewards/margins": 1.2784183025360107, "rewards/rejected": -26.576705932617188, "step": 24380 }, { "epoch": 0.8219016481849742, "grad_norm": 45.323184967041016, "learning_rate": 9.355024648865673e-08, "logits/chosen": -1.9862937927246094, "logits/rejected": -2.0360422134399414, "logps/chosen": -2.7361838817596436, "logps/rejected": -3.230208158493042, "loss": 1.8254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.361841201782227, "rewards/margins": 4.940243721008301, "rewards/rejected": -32.302085876464844, "step": 24385 }, { "epoch": 0.8220701742559574, "grad_norm": 0.9159602522850037, "learning_rate": 9.337901167601404e-08, "logits/chosen": -1.2810310125350952, "logits/rejected": -1.7395668029785156, "logps/chosen": -2.1692142486572266, "logps/rejected": -2.639930248260498, "loss": 1.484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.692142486572266, "rewards/margins": 4.707159996032715, "rewards/rejected": -26.399301528930664, "step": 24390 }, { "epoch": 0.8222387003269406, "grad_norm": 29.296329498291016, "learning_rate": 9.320791757887525e-08, "logits/chosen": -1.996145248413086, "logits/rejected": -2.350928544998169, "logps/chosen": -2.3054845333099365, "logps/rejected": -2.567873001098633, "loss": 4.1882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.054845809936523, "rewards/margins": 2.623884677886963, "rewards/rejected": -25.678730010986328, "step": 24395 }, { "epoch": 0.8224072263979237, "grad_norm": 28.01740074157715, "learning_rate": 9.303696425644914e-08, "logits/chosen": -1.6943343877792358, "logits/rejected": -2.1938259601593018, "logps/chosen": -2.7483012676239014, "logps/rejected": -3.2290358543395996, "loss": 3.5691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.48301124572754, "rewards/margins": 4.807345867156982, "rewards/rejected": -32.29035949707031, "step": 24400 }, { "epoch": 0.8224072263979237, "eval_logits/chosen": -2.2856757640838623, "eval_logits/rejected": -2.4621024131774902, "eval_logps/chosen": -2.276271104812622, "eval_logps/rejected": -2.429314613342285, "eval_loss": 3.0806543827056885, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.762710571289062, "eval_rewards/margins": 1.5304350852966309, "eval_rewards/rejected": -24.29314422607422, "eval_runtime": 12.9037, "eval_samples_per_second": 7.75, "eval_steps_per_second": 1.937, "step": 24400 }, { "epoch": 0.8225757524689069, "grad_norm": 21.572216033935547, "learning_rate": 9.286615176789603e-08, "logits/chosen": -1.6033306121826172, "logits/rejected": -2.2495296001434326, "logps/chosen": -2.1205878257751465, "logps/rejected": -3.219911575317383, "loss": 1.5082, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.20587921142578, "rewards/margins": 10.99323844909668, "rewards/rejected": -32.199119567871094, "step": 24405 }, { "epoch": 0.8227442785398901, "grad_norm": 29.8074893951416, "learning_rate": 9.269548017232731e-08, "logits/chosen": -1.3725274801254272, "logits/rejected": -2.1665830612182617, "logps/chosen": -2.617130994796753, "logps/rejected": -3.4140186309814453, "loss": 2.4409, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.171310424804688, "rewards/margins": 7.968874931335449, "rewards/rejected": -34.14018249511719, "step": 24410 }, { "epoch": 0.8229128046108733, "grad_norm": 367.8448791503906, "learning_rate": 9.252494952880585e-08, "logits/chosen": -2.3928165435791016, "logits/rejected": -2.5799431800842285, "logps/chosen": -2.7322371006011963, "logps/rejected": -2.877946376800537, "loss": 3.0924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.322372436523438, "rewards/margins": 1.4570938348770142, "rewards/rejected": -28.779464721679688, "step": 24415 }, { "epoch": 0.8230813306818565, "grad_norm": 30.02773666381836, "learning_rate": 9.235455989634539e-08, "logits/chosen": -1.4501005411148071, "logits/rejected": -1.9619977474212646, "logps/chosen": -2.1792657375335693, "logps/rejected": -2.3502280712127686, "loss": 2.8787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.79265785217285, "rewards/margins": 1.7096226215362549, "rewards/rejected": -23.50227928161621, "step": 24420 }, { "epoch": 0.8232498567528397, "grad_norm": 23.372652053833008, "learning_rate": 9.218431133391119e-08, "logits/chosen": -2.177403211593628, "logits/rejected": -2.612704038619995, "logps/chosen": -2.234631061553955, "logps/rejected": -2.9745616912841797, "loss": 1.5008, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.3463077545166, "rewards/margins": 7.399305820465088, "rewards/rejected": -29.745615005493164, "step": 24425 }, { "epoch": 0.8234183828238228, "grad_norm": 33.17243194580078, "learning_rate": 9.201420390041964e-08, "logits/chosen": -2.2947487831115723, "logits/rejected": -2.219944477081299, "logps/chosen": -2.5415103435516357, "logps/rejected": -2.7752723693847656, "loss": 2.4336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.41510581970215, "rewards/margins": 2.337618350982666, "rewards/rejected": -27.752721786499023, "step": 24430 }, { "epoch": 0.823586908894806, "grad_norm": 93.92717742919922, "learning_rate": 9.184423765473798e-08, "logits/chosen": -1.8181613683700562, "logits/rejected": -2.419252395629883, "logps/chosen": -3.1017751693725586, "logps/rejected": -3.9386048316955566, "loss": 1.7762, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.01775550842285, "rewards/margins": 8.368292808532715, "rewards/rejected": -39.386043548583984, "step": 24435 }, { "epoch": 0.8237554349657892, "grad_norm": 44.434810638427734, "learning_rate": 9.167441265568499e-08, "logits/chosen": -1.3838471174240112, "logits/rejected": -1.4956175088882446, "logps/chosen": -2.231163740158081, "logps/rejected": -2.0925040245056152, "loss": 4.5103, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.311635971069336, "rewards/margins": -1.3865975141525269, "rewards/rejected": -20.925039291381836, "step": 24440 }, { "epoch": 0.8239239610367723, "grad_norm": 48.002071380615234, "learning_rate": 9.150472896203038e-08, "logits/chosen": -2.1138761043548584, "logits/rejected": -2.857396364212036, "logps/chosen": -1.9746402502059937, "logps/rejected": -2.4951186180114746, "loss": 2.845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.746402740478516, "rewards/margins": 5.204786777496338, "rewards/rejected": -24.951187133789062, "step": 24445 }, { "epoch": 0.8240924871077556, "grad_norm": 38.526649475097656, "learning_rate": 9.133518663249512e-08, "logits/chosen": -1.917676568031311, "logits/rejected": -1.9917066097259521, "logps/chosen": -2.883349895477295, "logps/rejected": -3.125060558319092, "loss": 2.1997, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.833499908447266, "rewards/margins": 2.417109251022339, "rewards/rejected": -31.250606536865234, "step": 24450 }, { "epoch": 0.8242610131787388, "grad_norm": 30.155296325683594, "learning_rate": 9.11657857257509e-08, "logits/chosen": -2.3301873207092285, "logits/rejected": -2.3411762714385986, "logps/chosen": -2.7758543491363525, "logps/rejected": -2.8965957164764404, "loss": 2.9838, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.758544921875, "rewards/margins": 1.207413911819458, "rewards/rejected": -28.965957641601562, "step": 24455 }, { "epoch": 0.824429539249722, "grad_norm": 56.81505584716797, "learning_rate": 9.099652630042082e-08, "logits/chosen": -1.6030937433242798, "logits/rejected": -1.4450093507766724, "logps/chosen": -2.240893840789795, "logps/rejected": -2.38731050491333, "loss": 2.5314, "rewards/accuracies": 0.5, "rewards/chosen": -22.408939361572266, "rewards/margins": 1.4641635417938232, "rewards/rejected": -23.873104095458984, "step": 24460 }, { "epoch": 0.8245980653207051, "grad_norm": 26.720672607421875, "learning_rate": 9.082740841507891e-08, "logits/chosen": -1.6110947132110596, "logits/rejected": -1.9232873916625977, "logps/chosen": -2.1672120094299316, "logps/rejected": -2.193887233734131, "loss": 3.3855, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.672119140625, "rewards/margins": 0.266756147146225, "rewards/rejected": -21.938875198364258, "step": 24465 }, { "epoch": 0.8247665913916883, "grad_norm": 8.478957176208496, "learning_rate": 9.065843212825014e-08, "logits/chosen": -2.654512405395508, "logits/rejected": -2.91544508934021, "logps/chosen": -2.9694647789001465, "logps/rejected": -3.8186492919921875, "loss": 1.6066, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.69464683532715, "rewards/margins": 8.491846084594727, "rewards/rejected": -38.186492919921875, "step": 24470 }, { "epoch": 0.8249351174626715, "grad_norm": 49.12944793701172, "learning_rate": 9.048959749841067e-08, "logits/chosen": -1.9129912853240967, "logits/rejected": -2.236804485321045, "logps/chosen": -2.4809210300445557, "logps/rejected": -2.9880154132843018, "loss": 3.2204, "rewards/accuracies": 0.5, "rewards/chosen": -24.8092098236084, "rewards/margins": 5.070943355560303, "rewards/rejected": -29.880151748657227, "step": 24475 }, { "epoch": 0.8251036435336546, "grad_norm": 43.3686637878418, "learning_rate": 9.03209045839874e-08, "logits/chosen": -1.8041359186172485, "logits/rejected": -2.211690664291382, "logps/chosen": -2.227994441986084, "logps/rejected": -2.4465460777282715, "loss": 3.7016, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.279943466186523, "rewards/margins": 2.1855177879333496, "rewards/rejected": -24.4654598236084, "step": 24480 }, { "epoch": 0.8252721696046378, "grad_norm": 57.436214447021484, "learning_rate": 9.015235344335848e-08, "logits/chosen": -1.828051209449768, "logits/rejected": -1.9708757400512695, "logps/chosen": -2.138115406036377, "logps/rejected": -2.4871132373809814, "loss": 1.6225, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.381155014038086, "rewards/margins": 3.489978313446045, "rewards/rejected": -24.87113380432129, "step": 24485 }, { "epoch": 0.8254406956756211, "grad_norm": 139.59628295898438, "learning_rate": 8.998394413485249e-08, "logits/chosen": -2.0176608562469482, "logits/rejected": -2.130763530731201, "logps/chosen": -2.8235087394714355, "logps/rejected": -3.0206499099731445, "loss": 3.2285, "rewards/accuracies": 0.5, "rewards/chosen": -28.235088348388672, "rewards/margins": 1.9714100360870361, "rewards/rejected": -30.206497192382812, "step": 24490 }, { "epoch": 0.8256092217466042, "grad_norm": 33.01143264770508, "learning_rate": 8.981567671674956e-08, "logits/chosen": -2.0660080909729004, "logits/rejected": -2.4264297485351562, "logps/chosen": -2.2755260467529297, "logps/rejected": -3.569042205810547, "loss": 1.0104, "rewards/accuracies": 1.0, "rewards/chosen": -22.755260467529297, "rewards/margins": 12.935162544250488, "rewards/rejected": -35.69042205810547, "step": 24495 }, { "epoch": 0.8257777478175874, "grad_norm": 62.452022552490234, "learning_rate": 8.964755124728035e-08, "logits/chosen": -1.6145250797271729, "logits/rejected": -1.9589201211929321, "logps/chosen": -2.197807550430298, "logps/rejected": -2.4428963661193848, "loss": 2.2725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.978076934814453, "rewards/margins": 2.450887680053711, "rewards/rejected": -24.428964614868164, "step": 24500 }, { "epoch": 0.8259462738885706, "grad_norm": 34.67888259887695, "learning_rate": 8.947956778462628e-08, "logits/chosen": -1.963996171951294, "logits/rejected": -1.929396629333496, "logps/chosen": -2.4872591495513916, "logps/rejected": -2.4773287773132324, "loss": 3.2287, "rewards/accuracies": 0.5, "rewards/chosen": -24.87259292602539, "rewards/margins": -0.0993032455444336, "rewards/rejected": -24.77328872680664, "step": 24505 }, { "epoch": 0.8261147999595537, "grad_norm": 21.794479370117188, "learning_rate": 8.931172638691998e-08, "logits/chosen": -1.7423756122589111, "logits/rejected": -2.079932689666748, "logps/chosen": -2.141746997833252, "logps/rejected": -2.7574732303619385, "loss": 1.5836, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.417470932006836, "rewards/margins": 6.15726375579834, "rewards/rejected": -27.57473373413086, "step": 24510 }, { "epoch": 0.8262833260305369, "grad_norm": 33.65979766845703, "learning_rate": 8.914402711224466e-08, "logits/chosen": -1.8532603979110718, "logits/rejected": -1.9771888256072998, "logps/chosen": -2.039398670196533, "logps/rejected": -2.1789710521698, "loss": 2.1006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.39398956298828, "rewards/margins": 1.3957213163375854, "rewards/rejected": -21.789709091186523, "step": 24515 }, { "epoch": 0.8264518521015201, "grad_norm": 19.37758445739746, "learning_rate": 8.897647001863467e-08, "logits/chosen": -2.056464433670044, "logits/rejected": -2.1448254585266113, "logps/chosen": -2.0192456245422363, "logps/rejected": -2.1061623096466064, "loss": 3.4028, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.192453384399414, "rewards/margins": 0.8691700100898743, "rewards/rejected": -21.06162452697754, "step": 24520 }, { "epoch": 0.8266203781725033, "grad_norm": 65.1499252319336, "learning_rate": 8.880905516407456e-08, "logits/chosen": -1.6522516012191772, "logits/rejected": -2.139988899230957, "logps/chosen": -3.164013385772705, "logps/rejected": -3.8580169677734375, "loss": 2.2586, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.6401309967041, "rewards/margins": 6.940039157867432, "rewards/rejected": -38.58017349243164, "step": 24525 }, { "epoch": 0.8267889042434865, "grad_norm": 61.83774948120117, "learning_rate": 8.864178260650018e-08, "logits/chosen": -1.669846534729004, "logits/rejected": -2.1356027126312256, "logps/chosen": -2.799243688583374, "logps/rejected": -4.0782670974731445, "loss": 1.836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.9924373626709, "rewards/margins": 12.79023551940918, "rewards/rejected": -40.782676696777344, "step": 24530 }, { "epoch": 0.8269574303144697, "grad_norm": 23.5116024017334, "learning_rate": 8.847465240379809e-08, "logits/chosen": -1.7187703847885132, "logits/rejected": -2.0123953819274902, "logps/chosen": -2.03355073928833, "logps/rejected": -3.2723755836486816, "loss": 1.617, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.335506439208984, "rewards/margins": 12.388250350952148, "rewards/rejected": -32.7237548828125, "step": 24535 }, { "epoch": 0.8271259563854528, "grad_norm": 27.858238220214844, "learning_rate": 8.830766461380523e-08, "logits/chosen": -1.7541147470474243, "logits/rejected": -1.9249897003173828, "logps/chosen": -2.0622360706329346, "logps/rejected": -2.1741714477539062, "loss": 2.5385, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.622358322143555, "rewards/margins": 1.1193554401397705, "rewards/rejected": -21.741714477539062, "step": 24540 }, { "epoch": 0.827294482456436, "grad_norm": 23.42979621887207, "learning_rate": 8.814081929430967e-08, "logits/chosen": -1.9817981719970703, "logits/rejected": -1.6340019702911377, "logps/chosen": -2.1842682361602783, "logps/rejected": -2.083808422088623, "loss": 4.1161, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -21.842681884765625, "rewards/margins": -1.0045974254608154, "rewards/rejected": -20.838083267211914, "step": 24545 }, { "epoch": 0.8274630085274192, "grad_norm": 62.881187438964844, "learning_rate": 8.797411650304986e-08, "logits/chosen": -1.9652912616729736, "logits/rejected": -1.9075905084609985, "logps/chosen": -2.004683017730713, "logps/rejected": -2.0662283897399902, "loss": 2.7007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.046829223632812, "rewards/margins": 0.6154546737670898, "rewards/rejected": -20.662282943725586, "step": 24550 }, { "epoch": 0.8276315345984023, "grad_norm": 22.706445693969727, "learning_rate": 8.780755629771536e-08, "logits/chosen": -2.144796848297119, "logits/rejected": -2.1325831413269043, "logps/chosen": -2.8483757972717285, "logps/rejected": -2.9826736450195312, "loss": 3.5277, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.4837589263916, "rewards/margins": 1.3429784774780273, "rewards/rejected": -29.826736450195312, "step": 24555 }, { "epoch": 0.8278000606693856, "grad_norm": 14.372481346130371, "learning_rate": 8.764113873594575e-08, "logits/chosen": -1.8322279453277588, "logits/rejected": -1.653124451637268, "logps/chosen": -2.967151403427124, "logps/rejected": -3.2807929515838623, "loss": 2.5602, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.6715145111084, "rewards/margins": 3.1364188194274902, "rewards/rejected": -32.80793380737305, "step": 24560 }, { "epoch": 0.8279685867403688, "grad_norm": 151.4964141845703, "learning_rate": 8.747486387533171e-08, "logits/chosen": -2.8778085708618164, "logits/rejected": -2.7823691368103027, "logps/chosen": -2.7946856021881104, "logps/rejected": -2.818932056427002, "loss": 3.4742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.946857452392578, "rewards/margins": 0.2424612045288086, "rewards/rejected": -28.189319610595703, "step": 24565 }, { "epoch": 0.8281371128113519, "grad_norm": 25.029939651489258, "learning_rate": 8.730873177341458e-08, "logits/chosen": -1.8756672143936157, "logits/rejected": -2.349959135055542, "logps/chosen": -1.728493332862854, "logps/rejected": -1.984562635421753, "loss": 1.9933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.28493309020996, "rewards/margins": 2.5606913566589355, "rewards/rejected": -19.845626831054688, "step": 24570 }, { "epoch": 0.8283056388823351, "grad_norm": 71.39663696289062, "learning_rate": 8.714274248768583e-08, "logits/chosen": -1.9558569192886353, "logits/rejected": -1.8518749475479126, "logps/chosen": -2.5088768005371094, "logps/rejected": -2.4664669036865234, "loss": 3.622, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.088764190673828, "rewards/margins": -0.42409858107566833, "rewards/rejected": -24.6646671295166, "step": 24575 }, { "epoch": 0.8284741649533183, "grad_norm": 64.44959259033203, "learning_rate": 8.697689607558801e-08, "logits/chosen": -2.6909384727478027, "logits/rejected": -2.5570266246795654, "logps/chosen": -2.24957537651062, "logps/rejected": -2.215425968170166, "loss": 3.8927, "rewards/accuracies": 0.5, "rewards/chosen": -22.49575424194336, "rewards/margins": -0.34149590134620667, "rewards/rejected": -22.154260635375977, "step": 24580 }, { "epoch": 0.8286426910243014, "grad_norm": 19.722009658813477, "learning_rate": 8.681119259451403e-08, "logits/chosen": -1.6167974472045898, "logits/rejected": -1.870568037033081, "logps/chosen": -2.371868848800659, "logps/rejected": -2.4427285194396973, "loss": 3.2064, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.718690872192383, "rewards/margins": 0.7085939645767212, "rewards/rejected": -24.42728614807129, "step": 24585 }, { "epoch": 0.8288112170952846, "grad_norm": 19.004295349121094, "learning_rate": 8.664563210180736e-08, "logits/chosen": -1.7222764492034912, "logits/rejected": -2.047950267791748, "logps/chosen": -1.800819754600525, "logps/rejected": -2.29779052734375, "loss": 1.6882, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.008197784423828, "rewards/margins": 4.969709873199463, "rewards/rejected": -22.9779052734375, "step": 24590 }, { "epoch": 0.8289797431662678, "grad_norm": 63.078643798828125, "learning_rate": 8.648021465476185e-08, "logits/chosen": -2.3208119869232178, "logits/rejected": -2.6440043449401855, "logps/chosen": -2.479806423187256, "logps/rejected": -2.5025460720062256, "loss": 5.985, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.798065185546875, "rewards/margins": 0.22739505767822266, "rewards/rejected": -25.025461196899414, "step": 24595 }, { "epoch": 0.829148269237251, "grad_norm": 22.138776779174805, "learning_rate": 8.631494031062197e-08, "logits/chosen": -1.9542160034179688, "logits/rejected": -1.8714958429336548, "logps/chosen": -2.1395716667175293, "logps/rejected": -2.96736478805542, "loss": 1.4994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.39571762084961, "rewards/margins": 8.277932167053223, "rewards/rejected": -29.673648834228516, "step": 24600 }, { "epoch": 0.8293167953082342, "grad_norm": 23.740110397338867, "learning_rate": 8.61498091265827e-08, "logits/chosen": -1.223777174949646, "logits/rejected": -1.5701664686203003, "logps/chosen": -1.949045181274414, "logps/rejected": -2.445789337158203, "loss": 2.6138, "rewards/accuracies": 0.5, "rewards/chosen": -19.49045181274414, "rewards/margins": 4.967441558837891, "rewards/rejected": -24.457895278930664, "step": 24605 }, { "epoch": 0.8294853213792174, "grad_norm": 35.623653411865234, "learning_rate": 8.59848211597895e-08, "logits/chosen": -1.9874868392944336, "logits/rejected": -2.273397445678711, "logps/chosen": -1.8599836826324463, "logps/rejected": -2.0248303413391113, "loss": 2.7576, "rewards/accuracies": 0.5, "rewards/chosen": -18.599836349487305, "rewards/margins": 1.6484657526016235, "rewards/rejected": -20.248302459716797, "step": 24610 }, { "epoch": 0.8296538474502005, "grad_norm": 18.77968406677246, "learning_rate": 8.581997646733812e-08, "logits/chosen": -2.080842971801758, "logits/rejected": -2.308432102203369, "logps/chosen": -1.9153869152069092, "logps/rejected": -2.152315139770508, "loss": 3.1835, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.153867721557617, "rewards/margins": 2.369281768798828, "rewards/rejected": -21.523151397705078, "step": 24615 }, { "epoch": 0.8298223735211837, "grad_norm": 44.094932556152344, "learning_rate": 8.565527510627496e-08, "logits/chosen": -1.8816394805908203, "logits/rejected": -1.940839171409607, "logps/chosen": -2.2022149562835693, "logps/rejected": -2.478142499923706, "loss": 2.3778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.022151947021484, "rewards/margins": 2.759272813796997, "rewards/rejected": -24.781423568725586, "step": 24620 }, { "epoch": 0.8299908995921669, "grad_norm": 29.145606994628906, "learning_rate": 8.549071713359646e-08, "logits/chosen": -1.9039310216903687, "logits/rejected": -1.9348599910736084, "logps/chosen": -2.906386375427246, "logps/rejected": -3.3684539794921875, "loss": 2.2814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.063861846923828, "rewards/margins": 4.6206769943237305, "rewards/rejected": -33.684539794921875, "step": 24625 }, { "epoch": 0.83015942566315, "grad_norm": 27.65276336669922, "learning_rate": 8.532630260624974e-08, "logits/chosen": -1.5308537483215332, "logits/rejected": -1.6142187118530273, "logps/chosen": -2.1859543323516846, "logps/rejected": -2.2249884605407715, "loss": 2.8996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.859542846679688, "rewards/margins": 0.3903442323207855, "rewards/rejected": -22.24988555908203, "step": 24630 }, { "epoch": 0.8303279517341333, "grad_norm": 19.151803970336914, "learning_rate": 8.516203158113216e-08, "logits/chosen": -1.777120590209961, "logits/rejected": -1.6125797033309937, "logps/chosen": -2.1398093700408936, "logps/rejected": -2.089583396911621, "loss": 3.9823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.398096084594727, "rewards/margins": -0.5022621750831604, "rewards/rejected": -20.895832061767578, "step": 24635 }, { "epoch": 0.8304964778051165, "grad_norm": 20.655881881713867, "learning_rate": 8.499790411509161e-08, "logits/chosen": -1.474410057067871, "logits/rejected": -1.4502770900726318, "logps/chosen": -2.045407295227051, "logps/rejected": -2.0277533531188965, "loss": 3.5089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.454071044921875, "rewards/margins": -0.17653894424438477, "rewards/rejected": -20.27753257751465, "step": 24640 }, { "epoch": 0.8306650038760997, "grad_norm": 32.811119079589844, "learning_rate": 8.483392026492592e-08, "logits/chosen": -1.5620620250701904, "logits/rejected": -1.6248960494995117, "logps/chosen": -2.8196754455566406, "logps/rejected": -2.5971808433532715, "loss": 5.5318, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.19675636291504, "rewards/margins": -2.224942684173584, "rewards/rejected": -25.971811294555664, "step": 24645 }, { "epoch": 0.8308335299470828, "grad_norm": 220.64158630371094, "learning_rate": 8.467008008738352e-08, "logits/chosen": -1.8569514751434326, "logits/rejected": -1.8670618534088135, "logps/chosen": -2.6269702911376953, "logps/rejected": -2.8930702209472656, "loss": 2.7798, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.269702911376953, "rewards/margins": 2.6609978675842285, "rewards/rejected": -28.93070411682129, "step": 24650 }, { "epoch": 0.831002056018066, "grad_norm": 37.396507263183594, "learning_rate": 8.450638363916324e-08, "logits/chosen": -1.9712955951690674, "logits/rejected": -2.224093437194824, "logps/chosen": -2.5156314373016357, "logps/rejected": -3.635897159576416, "loss": 0.8962, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.15631675720215, "rewards/margins": 11.202653884887695, "rewards/rejected": -36.358970642089844, "step": 24655 }, { "epoch": 0.8311705820890491, "grad_norm": 32.989925384521484, "learning_rate": 8.434283097691359e-08, "logits/chosen": -1.557471513748169, "logits/rejected": -1.5122106075286865, "logps/chosen": -3.2903800010681152, "logps/rejected": -3.4535770416259766, "loss": 2.5971, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.90380096435547, "rewards/margins": 1.631969690322876, "rewards/rejected": -34.53577423095703, "step": 24660 }, { "epoch": 0.8313391081600323, "grad_norm": 34.660987854003906, "learning_rate": 8.417942215723394e-08, "logits/chosen": -1.938816785812378, "logits/rejected": -2.1259303092956543, "logps/chosen": -2.5374019145965576, "logps/rejected": -2.644392490386963, "loss": 3.3285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.374019622802734, "rewards/margins": 1.0699094533920288, "rewards/rejected": -26.443927764892578, "step": 24665 }, { "epoch": 0.8315076342310156, "grad_norm": 27.914525985717773, "learning_rate": 8.401615723667354e-08, "logits/chosen": -2.4170756340026855, "logits/rejected": -2.3611080646514893, "logps/chosen": -2.390336513519287, "logps/rejected": -2.480630397796631, "loss": 2.8004, "rewards/accuracies": 0.5, "rewards/chosen": -23.903366088867188, "rewards/margins": 0.9029388427734375, "rewards/rejected": -24.806303024291992, "step": 24670 }, { "epoch": 0.8316761603019988, "grad_norm": 16.455949783325195, "learning_rate": 8.385303627173223e-08, "logits/chosen": -1.707933783531189, "logits/rejected": -1.9285682439804077, "logps/chosen": -2.2291674613952637, "logps/rejected": -2.5350022315979004, "loss": 1.8161, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.291675567626953, "rewards/margins": 3.0583457946777344, "rewards/rejected": -25.350019454956055, "step": 24675 }, { "epoch": 0.8318446863729819, "grad_norm": 18.660741806030273, "learning_rate": 8.369005931885936e-08, "logits/chosen": -1.756413221359253, "logits/rejected": -2.0539305210113525, "logps/chosen": -2.5960230827331543, "logps/rejected": -3.1703128814697266, "loss": 1.8669, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.96023178100586, "rewards/margins": 5.742895603179932, "rewards/rejected": -31.703128814697266, "step": 24680 }, { "epoch": 0.8320132124439651, "grad_norm": 21.118820190429688, "learning_rate": 8.352722643445498e-08, "logits/chosen": -2.343622922897339, "logits/rejected": -2.329498291015625, "logps/chosen": -2.4431777000427246, "logps/rejected": -2.580986499786377, "loss": 2.7023, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.431777954101562, "rewards/margins": 1.3780863285064697, "rewards/rejected": -25.809864044189453, "step": 24685 }, { "epoch": 0.8321817385149483, "grad_norm": 33.85835647583008, "learning_rate": 8.336453767486929e-08, "logits/chosen": -1.9275716543197632, "logits/rejected": -2.071688413619995, "logps/chosen": -2.496863842010498, "logps/rejected": -3.0280239582061768, "loss": 2.5997, "rewards/accuracies": 0.5, "rewards/chosen": -24.968637466430664, "rewards/margins": 5.311602592468262, "rewards/rejected": -30.280237197875977, "step": 24690 }, { "epoch": 0.8323502645859314, "grad_norm": 24.390417098999023, "learning_rate": 8.320199309640224e-08, "logits/chosen": -1.8352603912353516, "logits/rejected": -2.3818843364715576, "logps/chosen": -2.051905870437622, "logps/rejected": -2.1428139209747314, "loss": 3.232, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.519058227539062, "rewards/margins": 0.9090802073478699, "rewards/rejected": -21.428136825561523, "step": 24695 }, { "epoch": 0.8325187906569146, "grad_norm": 33.29798126220703, "learning_rate": 8.303959275530415e-08, "logits/chosen": -2.1693029403686523, "logits/rejected": -2.5497283935546875, "logps/chosen": -2.0754661560058594, "logps/rejected": -2.577219247817993, "loss": 1.7509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.75465965270996, "rewards/margins": 5.017529010772705, "rewards/rejected": -25.772192001342773, "step": 24700 }, { "epoch": 0.8326873167278978, "grad_norm": 91.18734741210938, "learning_rate": 8.287733670777547e-08, "logits/chosen": -1.9818317890167236, "logits/rejected": -1.9187809228897095, "logps/chosen": -3.0059261322021484, "logps/rejected": -3.015350580215454, "loss": 4.522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.05925941467285, "rewards/margins": 0.09424237906932831, "rewards/rejected": -30.15350341796875, "step": 24705 }, { "epoch": 0.832855842798881, "grad_norm": 29.47276496887207, "learning_rate": 8.27152250099667e-08, "logits/chosen": -2.0462698936462402, "logits/rejected": -2.1758196353912354, "logps/chosen": -2.5688459873199463, "logps/rejected": -3.169067859649658, "loss": 1.4751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.688457489013672, "rewards/margins": 6.002219200134277, "rewards/rejected": -31.690677642822266, "step": 24710 }, { "epoch": 0.8330243688698642, "grad_norm": 35.716827392578125, "learning_rate": 8.255325771797799e-08, "logits/chosen": -1.1083600521087646, "logits/rejected": -1.4641042947769165, "logps/chosen": -2.1212143898010254, "logps/rejected": -2.767564296722412, "loss": 1.8724, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.212142944335938, "rewards/margins": 6.463498115539551, "rewards/rejected": -27.675640106201172, "step": 24715 }, { "epoch": 0.8331928949408474, "grad_norm": 24.23432159423828, "learning_rate": 8.23914348878601e-08, "logits/chosen": -1.3574097156524658, "logits/rejected": -1.5571671724319458, "logps/chosen": -1.7626625299453735, "logps/rejected": -2.3101305961608887, "loss": 3.3752, "rewards/accuracies": 0.5, "rewards/chosen": -17.626623153686523, "rewards/margins": 5.474681854248047, "rewards/rejected": -23.101306915283203, "step": 24720 }, { "epoch": 0.8333614210118305, "grad_norm": 100.50273895263672, "learning_rate": 8.222975657561359e-08, "logits/chosen": -1.8818786144256592, "logits/rejected": -1.8507133722305298, "logps/chosen": -3.0965118408203125, "logps/rejected": -3.2869343757629395, "loss": 2.5407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.96511459350586, "rewards/margins": 1.9042247533798218, "rewards/rejected": -32.86934280395508, "step": 24725 }, { "epoch": 0.8335299470828137, "grad_norm": 51.20134353637695, "learning_rate": 8.206822283718873e-08, "logits/chosen": -2.1385796070098877, "logits/rejected": -2.4248080253601074, "logps/chosen": -1.8668806552886963, "logps/rejected": -2.1117656230926514, "loss": 3.0479, "rewards/accuracies": 0.5, "rewards/chosen": -18.668806076049805, "rewards/margins": 2.44884991645813, "rewards/rejected": -21.117656707763672, "step": 24730 }, { "epoch": 0.8336984731537969, "grad_norm": 20.14624786376953, "learning_rate": 8.190683372848612e-08, "logits/chosen": -1.7058826684951782, "logits/rejected": -1.75360107421875, "logps/chosen": -2.259692668914795, "logps/rejected": -2.3245644569396973, "loss": 3.055, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.596923828125, "rewards/margins": 0.6487201452255249, "rewards/rejected": -23.245647430419922, "step": 24735 }, { "epoch": 0.83386699922478, "grad_norm": 24.15453338623047, "learning_rate": 8.174558930535608e-08, "logits/chosen": -2.644989490509033, "logits/rejected": -2.87270450592041, "logps/chosen": -2.3526439666748047, "logps/rejected": -2.963080883026123, "loss": 1.8598, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.526439666748047, "rewards/margins": 6.104373931884766, "rewards/rejected": -29.630813598632812, "step": 24740 }, { "epoch": 0.8340355252957633, "grad_norm": 24.785890579223633, "learning_rate": 8.158448962359903e-08, "logits/chosen": -1.4721378087997437, "logits/rejected": -1.8219953775405884, "logps/chosen": -1.9124889373779297, "logps/rejected": -2.7076058387756348, "loss": 1.8269, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.124889373779297, "rewards/margins": 7.951168060302734, "rewards/rejected": -27.0760555267334, "step": 24745 }, { "epoch": 0.8342040513667465, "grad_norm": 17.511924743652344, "learning_rate": 8.142353473896535e-08, "logits/chosen": -1.6391900777816772, "logits/rejected": -1.8954302072525024, "logps/chosen": -2.1554198265075684, "logps/rejected": -2.301879405975342, "loss": 2.0994, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.55419921875, "rewards/margins": 1.4645962715148926, "rewards/rejected": -23.018795013427734, "step": 24750 }, { "epoch": 0.8343725774377296, "grad_norm": 31.78805923461914, "learning_rate": 8.126272470715489e-08, "logits/chosen": -1.901834487915039, "logits/rejected": -2.024052619934082, "logps/chosen": -2.2320590019226074, "logps/rejected": -2.8132669925689697, "loss": 2.1427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.320592880249023, "rewards/margins": 5.812076568603516, "rewards/rejected": -28.13266944885254, "step": 24755 }, { "epoch": 0.8345411035087128, "grad_norm": 14.08447551727295, "learning_rate": 8.110205958381783e-08, "logits/chosen": -1.8176250457763672, "logits/rejected": -1.7208465337753296, "logps/chosen": -1.8047428131103516, "logps/rejected": -2.0405871868133545, "loss": 1.8179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.047428131103516, "rewards/margins": 2.3584465980529785, "rewards/rejected": -20.405874252319336, "step": 24760 }, { "epoch": 0.834709629579696, "grad_norm": 8.739221572875977, "learning_rate": 8.094153942455406e-08, "logits/chosen": -2.625211715698242, "logits/rejected": -3.1657795906066895, "logps/chosen": -4.064680576324463, "logps/rejected": -4.758119106292725, "loss": 3.9832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -40.64680480957031, "rewards/margins": 6.934388637542725, "rewards/rejected": -47.58119583129883, "step": 24765 }, { "epoch": 0.8348781556506791, "grad_norm": 32.949581146240234, "learning_rate": 8.078116428491322e-08, "logits/chosen": -2.286863327026367, "logits/rejected": -2.008315324783325, "logps/chosen": -2.388113260269165, "logps/rejected": -2.4670004844665527, "loss": 2.6394, "rewards/accuracies": 0.5, "rewards/chosen": -23.881132125854492, "rewards/margins": 0.7888728976249695, "rewards/rejected": -24.670005798339844, "step": 24770 }, { "epoch": 0.8350466817216623, "grad_norm": 3.3901429176330566, "learning_rate": 8.062093422039484e-08, "logits/chosen": -2.0986361503601074, "logits/rejected": -2.428938388824463, "logps/chosen": -2.475968360900879, "logps/rejected": -2.965801954269409, "loss": 2.1051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.759685516357422, "rewards/margins": 4.89833927154541, "rewards/rejected": -29.65802001953125, "step": 24775 }, { "epoch": 0.8352152077926456, "grad_norm": 88.18102264404297, "learning_rate": 8.046084928644841e-08, "logits/chosen": -1.3793189525604248, "logits/rejected": -2.444030284881592, "logps/chosen": -2.2284750938415527, "logps/rejected": -3.5414485931396484, "loss": 3.5037, "rewards/accuracies": 0.5, "rewards/chosen": -22.28474998474121, "rewards/margins": 13.129733085632324, "rewards/rejected": -35.414485931396484, "step": 24780 }, { "epoch": 0.8353837338636287, "grad_norm": 26.334148406982422, "learning_rate": 8.030090953847274e-08, "logits/chosen": -2.3015308380126953, "logits/rejected": -2.426604747772217, "logps/chosen": -1.8632885217666626, "logps/rejected": -2.1273903846740723, "loss": 2.4709, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.632884979248047, "rewards/margins": 2.641018867492676, "rewards/rejected": -21.27390480041504, "step": 24785 }, { "epoch": 0.8355522599346119, "grad_norm": 17.940793991088867, "learning_rate": 8.014111503181675e-08, "logits/chosen": -2.3400533199310303, "logits/rejected": -1.8803203105926514, "logps/chosen": -2.7042131423950195, "logps/rejected": -3.0254132747650146, "loss": 4.6191, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.042133331298828, "rewards/margins": 3.2120022773742676, "rewards/rejected": -30.254131317138672, "step": 24790 }, { "epoch": 0.8357207860055951, "grad_norm": 46.67781066894531, "learning_rate": 7.998146582177923e-08, "logits/chosen": -2.4982428550720215, "logits/rejected": -2.790593385696411, "logps/chosen": -3.1638150215148926, "logps/rejected": -3.3133628368377686, "loss": 3.7873, "rewards/accuracies": 0.5, "rewards/chosen": -31.638153076171875, "rewards/margins": 1.4954760074615479, "rewards/rejected": -33.133628845214844, "step": 24795 }, { "epoch": 0.8358893120765782, "grad_norm": 29.687307357788086, "learning_rate": 7.982196196360819e-08, "logits/chosen": -1.8978008031845093, "logits/rejected": -2.2759690284729004, "logps/chosen": -2.5504889488220215, "logps/rejected": -3.3600997924804688, "loss": 1.4467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.504892349243164, "rewards/margins": 8.096107482910156, "rewards/rejected": -33.60099792480469, "step": 24800 }, { "epoch": 0.8358893120765782, "eval_logits/chosen": -2.2963404655456543, "eval_logits/rejected": -2.4741809368133545, "eval_logps/chosen": -2.281320810317993, "eval_logps/rejected": -2.4352529048919678, "eval_loss": 3.0853753089904785, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.81320571899414, "eval_rewards/margins": 1.5393211841583252, "eval_rewards/rejected": -24.352527618408203, "eval_runtime": 12.9051, "eval_samples_per_second": 7.749, "eval_steps_per_second": 1.937, "step": 24800 }, { "epoch": 0.8360578381475614, "grad_norm": 299.58172607421875, "learning_rate": 7.966260351250176e-08, "logits/chosen": -1.731415033340454, "logits/rejected": -1.6957142353057861, "logps/chosen": -2.898508310317993, "logps/rejected": -3.6182937622070312, "loss": 1.572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.985082626342773, "rewards/margins": 7.1978559494018555, "rewards/rejected": -36.18293762207031, "step": 24805 }, { "epoch": 0.8362263642185446, "grad_norm": 11.822115898132324, "learning_rate": 7.950339052360761e-08, "logits/chosen": -1.9383857250213623, "logits/rejected": -2.0658161640167236, "logps/chosen": -2.1835899353027344, "logps/rejected": -2.295480966567993, "loss": 2.6226, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.835901260375977, "rewards/margins": 1.1189079284667969, "rewards/rejected": -22.954809188842773, "step": 24810 }, { "epoch": 0.8363948902895277, "grad_norm": 35.01670455932617, "learning_rate": 7.934432305202321e-08, "logits/chosen": -2.0714824199676514, "logits/rejected": -2.514486789703369, "logps/chosen": -2.0434935092926025, "logps/rejected": -2.7274627685546875, "loss": 1.6889, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.434932708740234, "rewards/margins": 6.839694023132324, "rewards/rejected": -27.274627685546875, "step": 24815 }, { "epoch": 0.836563416360511, "grad_norm": 28.495513916015625, "learning_rate": 7.918540115279538e-08, "logits/chosen": -2.2264180183410645, "logits/rejected": -2.5804407596588135, "logps/chosen": -2.661252737045288, "logps/rejected": -4.125033378601074, "loss": 1.6787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.61252784729004, "rewards/margins": 14.637804985046387, "rewards/rejected": -41.250335693359375, "step": 24820 }, { "epoch": 0.8367319424314942, "grad_norm": 41.43541717529297, "learning_rate": 7.902662488092071e-08, "logits/chosen": -1.8777663707733154, "logits/rejected": -2.1416497230529785, "logps/chosen": -2.1105735301971436, "logps/rejected": -2.2156014442443848, "loss": 2.7785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.10573387145996, "rewards/margins": 1.0502779483795166, "rewards/rejected": -22.15601348876953, "step": 24825 }, { "epoch": 0.8369004685024773, "grad_norm": 28.61469078063965, "learning_rate": 7.88679942913456e-08, "logits/chosen": -1.651914358139038, "logits/rejected": -1.790924310684204, "logps/chosen": -2.0216152667999268, "logps/rejected": -2.168140411376953, "loss": 2.639, "rewards/accuracies": 0.5, "rewards/chosen": -20.21615219116211, "rewards/margins": 1.4652526378631592, "rewards/rejected": -21.68140411376953, "step": 24830 }, { "epoch": 0.8370689945734605, "grad_norm": 20.57470703125, "learning_rate": 7.870950943896559e-08, "logits/chosen": -1.73696768283844, "logits/rejected": -1.9076378345489502, "logps/chosen": -2.084214687347412, "logps/rejected": -2.452416181564331, "loss": 2.1224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.842144012451172, "rewards/margins": 3.6820149421691895, "rewards/rejected": -24.52416229248047, "step": 24835 }, { "epoch": 0.8372375206444437, "grad_norm": 12.077346801757812, "learning_rate": 7.855117037862624e-08, "logits/chosen": -1.2803000211715698, "logits/rejected": -1.6471437215805054, "logps/chosen": -2.5319161415100098, "logps/rejected": -2.940267562866211, "loss": 2.2514, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.31916046142578, "rewards/margins": 4.083517551422119, "rewards/rejected": -29.402679443359375, "step": 24840 }, { "epoch": 0.8374060467154268, "grad_norm": 50.33457946777344, "learning_rate": 7.839297716512233e-08, "logits/chosen": -1.7693202495574951, "logits/rejected": -1.9289439916610718, "logps/chosen": -2.4417946338653564, "logps/rejected": -3.2712059020996094, "loss": 3.0719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.417943954467773, "rewards/margins": 8.294113159179688, "rewards/rejected": -32.712059020996094, "step": 24845 }, { "epoch": 0.83757457278641, "grad_norm": 28.605205535888672, "learning_rate": 7.823492985319857e-08, "logits/chosen": -1.704897165298462, "logits/rejected": -1.7164885997772217, "logps/chosen": -2.399608612060547, "logps/rejected": -2.45776104927063, "loss": 3.4444, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.9960880279541, "rewards/margins": 0.5815240740776062, "rewards/rejected": -24.577611923217773, "step": 24850 }, { "epoch": 0.8377430988573933, "grad_norm": 39.61781311035156, "learning_rate": 7.807702849754854e-08, "logits/chosen": -1.7676922082901, "logits/rejected": -1.6432530879974365, "logps/chosen": -2.191768169403076, "logps/rejected": -2.3741490840911865, "loss": 2.5188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.917682647705078, "rewards/margins": 1.823809027671814, "rewards/rejected": -23.74148941040039, "step": 24855 }, { "epoch": 0.8379116249283765, "grad_norm": 28.61402130126953, "learning_rate": 7.791927315281582e-08, "logits/chosen": -1.956154227256775, "logits/rejected": -2.0019774436950684, "logps/chosen": -2.687230110168457, "logps/rejected": -3.2162253856658936, "loss": 1.9956, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.872303009033203, "rewards/margins": 5.289952278137207, "rewards/rejected": -32.162254333496094, "step": 24860 }, { "epoch": 0.8380801509993596, "grad_norm": 30.63452911376953, "learning_rate": 7.77616638735935e-08, "logits/chosen": -1.0301138162612915, "logits/rejected": -1.3081434965133667, "logps/chosen": -3.067924976348877, "logps/rejected": -3.3489010334014893, "loss": 3.011, "rewards/accuracies": 0.5, "rewards/chosen": -30.679248809814453, "rewards/margins": 2.8097636699676514, "rewards/rejected": -33.489013671875, "step": 24865 }, { "epoch": 0.8382486770703428, "grad_norm": 36.37446594238281, "learning_rate": 7.76042007144237e-08, "logits/chosen": -1.5581233501434326, "logits/rejected": -1.9835189580917358, "logps/chosen": -2.5085432529449463, "logps/rejected": -2.1060569286346436, "loss": 8.3278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.085430145263672, "rewards/margins": -4.024866104125977, "rewards/rejected": -21.060565948486328, "step": 24870 }, { "epoch": 0.838417203141326, "grad_norm": 34.3663215637207, "learning_rate": 7.744688372979824e-08, "logits/chosen": -1.7154502868652344, "logits/rejected": -1.7041962146759033, "logps/chosen": -2.716712474822998, "logps/rejected": -2.6436030864715576, "loss": 3.8873, "rewards/accuracies": 0.5, "rewards/chosen": -27.167123794555664, "rewards/margins": -0.7310911417007446, "rewards/rejected": -26.436031341552734, "step": 24875 }, { "epoch": 0.8385857292123091, "grad_norm": 235.8781280517578, "learning_rate": 7.728971297415843e-08, "logits/chosen": -1.8020703792572021, "logits/rejected": -2.0559287071228027, "logps/chosen": -3.1783413887023926, "logps/rejected": -3.4103896617889404, "loss": 6.2529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.783416748046875, "rewards/margins": 2.3204798698425293, "rewards/rejected": -34.1038932800293, "step": 24880 }, { "epoch": 0.8387542552832923, "grad_norm": 16.511754989624023, "learning_rate": 7.713268850189492e-08, "logits/chosen": -1.7213910818099976, "logits/rejected": -1.850507140159607, "logps/chosen": -2.0041606426239014, "logps/rejected": -2.1789450645446777, "loss": 2.7103, "rewards/accuracies": 0.5, "rewards/chosen": -20.041606903076172, "rewards/margins": 1.7478454113006592, "rewards/rejected": -21.789453506469727, "step": 24885 }, { "epoch": 0.8389227813542756, "grad_norm": 29.656295776367188, "learning_rate": 7.697581036734752e-08, "logits/chosen": -1.7956311702728271, "logits/rejected": -1.9855846166610718, "logps/chosen": -3.3117146492004395, "logps/rejected": -3.126675605773926, "loss": 6.4772, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -33.11714553833008, "rewards/margins": -1.8503891229629517, "rewards/rejected": -31.26675796508789, "step": 24890 }, { "epoch": 0.8390913074252587, "grad_norm": 23.54472541809082, "learning_rate": 7.681907862480569e-08, "logits/chosen": -1.9615615606307983, "logits/rejected": -1.8660144805908203, "logps/chosen": -2.3936643600463867, "logps/rejected": -2.4466681480407715, "loss": 2.8234, "rewards/accuracies": 0.5, "rewards/chosen": -23.936643600463867, "rewards/margins": 0.5300378799438477, "rewards/rejected": -24.4666805267334, "step": 24895 }, { "epoch": 0.8392598334962419, "grad_norm": 36.822994232177734, "learning_rate": 7.666249332850805e-08, "logits/chosen": -1.5024158954620361, "logits/rejected": -1.6682920455932617, "logps/chosen": -1.9932746887207031, "logps/rejected": -2.1869759559631348, "loss": 2.5083, "rewards/accuracies": 0.5, "rewards/chosen": -19.93274688720703, "rewards/margins": 1.937015175819397, "rewards/rejected": -21.869760513305664, "step": 24900 }, { "epoch": 0.8394283595672251, "grad_norm": 47.96998596191406, "learning_rate": 7.650605453264263e-08, "logits/chosen": -1.840775489807129, "logits/rejected": -2.2100350856781006, "logps/chosen": -2.1522419452667236, "logps/rejected": -2.880509853363037, "loss": 1.1539, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.522417068481445, "rewards/margins": 7.282681941986084, "rewards/rejected": -28.805099487304688, "step": 24905 }, { "epoch": 0.8395968856382082, "grad_norm": 26.887475967407227, "learning_rate": 7.634976229134677e-08, "logits/chosen": -1.9716761112213135, "logits/rejected": -2.277108907699585, "logps/chosen": -2.2041330337524414, "logps/rejected": -2.7298479080200195, "loss": 2.846, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.041330337524414, "rewards/margins": 5.257149696350098, "rewards/rejected": -27.298480987548828, "step": 24910 }, { "epoch": 0.8397654117091914, "grad_norm": 55.090423583984375, "learning_rate": 7.619361665870699e-08, "logits/chosen": -1.6557731628417969, "logits/rejected": -1.8671079874038696, "logps/chosen": -2.6471877098083496, "logps/rejected": -2.532266855239868, "loss": 5.3774, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.471878051757812, "rewards/margins": -1.1492098569869995, "rewards/rejected": -25.32266616821289, "step": 24915 }, { "epoch": 0.8399339377801746, "grad_norm": 119.17420196533203, "learning_rate": 7.603761768875933e-08, "logits/chosen": -1.460766315460205, "logits/rejected": -1.6520782709121704, "logps/chosen": -3.4117214679718018, "logps/rejected": -3.665818691253662, "loss": 4.2282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.11721420288086, "rewards/margins": 2.54097580909729, "rewards/rejected": -36.65818786621094, "step": 24920 }, { "epoch": 0.8401024638511577, "grad_norm": 44.00700378417969, "learning_rate": 7.588176543548863e-08, "logits/chosen": -2.1163854598999023, "logits/rejected": -2.0090603828430176, "logps/chosen": -2.562870502471924, "logps/rejected": -2.6641266345977783, "loss": 3.5659, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.628704071044922, "rewards/margins": 1.0125625133514404, "rewards/rejected": -26.641265869140625, "step": 24925 }, { "epoch": 0.840270989922141, "grad_norm": 61.31565475463867, "learning_rate": 7.572605995282932e-08, "logits/chosen": -1.1729356050491333, "logits/rejected": -2.2421317100524902, "logps/chosen": -2.4731881618499756, "logps/rejected": -3.079061985015869, "loss": 3.0966, "rewards/accuracies": 0.5, "rewards/chosen": -24.731884002685547, "rewards/margins": 6.058734893798828, "rewards/rejected": -30.79061508178711, "step": 24930 }, { "epoch": 0.8404395159931242, "grad_norm": 26.54155731201172, "learning_rate": 7.557050129466503e-08, "logits/chosen": -1.2745441198349, "logits/rejected": -1.866265058517456, "logps/chosen": -3.1167454719543457, "logps/rejected": -3.6627628803253174, "loss": 1.9667, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.16745376586914, "rewards/margins": 5.46017599105835, "rewards/rejected": -36.62763214111328, "step": 24935 }, { "epoch": 0.8406080420641073, "grad_norm": 37.995361328125, "learning_rate": 7.541508951482828e-08, "logits/chosen": -1.3408405780792236, "logits/rejected": -1.9099143743515015, "logps/chosen": -2.429614543914795, "logps/rejected": -2.8554434776306152, "loss": 1.9876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.296146392822266, "rewards/margins": 4.2582902908325195, "rewards/rejected": -28.554433822631836, "step": 24940 }, { "epoch": 0.8407765681350905, "grad_norm": 11.022102355957031, "learning_rate": 7.525982466710107e-08, "logits/chosen": -1.692125678062439, "logits/rejected": -2.2618660926818848, "logps/chosen": -2.951873302459717, "logps/rejected": -3.3857064247131348, "loss": 3.0067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.518728256225586, "rewards/margins": 4.33833122253418, "rewards/rejected": -33.85706329345703, "step": 24945 }, { "epoch": 0.8409450942060737, "grad_norm": 43.10789489746094, "learning_rate": 7.510470680521442e-08, "logits/chosen": -1.518248438835144, "logits/rejected": -1.6292740106582642, "logps/chosen": -2.6451168060302734, "logps/rejected": -2.6429200172424316, "loss": 4.3998, "rewards/accuracies": 0.5, "rewards/chosen": -26.4511661529541, "rewards/margins": -0.02196817472577095, "rewards/rejected": -26.42919921875, "step": 24950 }, { "epoch": 0.8411136202770568, "grad_norm": 23.555614471435547, "learning_rate": 7.494973598284871e-08, "logits/chosen": -1.820709228515625, "logits/rejected": -2.0816328525543213, "logps/chosen": -2.2437901496887207, "logps/rejected": -3.068833112716675, "loss": 1.043, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.43790054321289, "rewards/margins": 8.2504301071167, "rewards/rejected": -30.688329696655273, "step": 24955 }, { "epoch": 0.84128214634804, "grad_norm": 4.327147483825684, "learning_rate": 7.479491225363289e-08, "logits/chosen": -1.9262707233428955, "logits/rejected": -2.1278395652770996, "logps/chosen": -2.530325412750244, "logps/rejected": -3.2573580741882324, "loss": 1.5586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.30324935913086, "rewards/margins": 7.27032995223999, "rewards/rejected": -32.57358169555664, "step": 24960 }, { "epoch": 0.8414506724190233, "grad_norm": 21.99888038635254, "learning_rate": 7.464023567114558e-08, "logits/chosen": -1.6207927465438843, "logits/rejected": -1.5682424306869507, "logps/chosen": -2.9613633155822754, "logps/rejected": -2.809797763824463, "loss": 5.7672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.613636016845703, "rewards/margins": -1.515655755996704, "rewards/rejected": -28.097976684570312, "step": 24965 }, { "epoch": 0.8416191984900064, "grad_norm": 36.24042510986328, "learning_rate": 7.448570628891426e-08, "logits/chosen": -1.524966835975647, "logits/rejected": -1.888681173324585, "logps/chosen": -2.220306634902954, "logps/rejected": -2.5548994541168213, "loss": 2.9771, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.203067779541016, "rewards/margins": 3.345930814743042, "rewards/rejected": -25.548995971679688, "step": 24970 }, { "epoch": 0.8417877245609896, "grad_norm": 47.27100372314453, "learning_rate": 7.433132416041532e-08, "logits/chosen": -1.385962724685669, "logits/rejected": -1.4744789600372314, "logps/chosen": -1.9234859943389893, "logps/rejected": -2.1203255653381348, "loss": 2.8177, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.234861373901367, "rewards/margins": 1.9683917760849, "rewards/rejected": -21.2032527923584, "step": 24975 }, { "epoch": 0.8419562506319728, "grad_norm": 65.24883270263672, "learning_rate": 7.41770893390744e-08, "logits/chosen": -2.4800801277160645, "logits/rejected": -2.3463408946990967, "logps/chosen": -2.263475179672241, "logps/rejected": -2.277034282684326, "loss": 3.3677, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.634754180908203, "rewards/margins": 0.13558892905712128, "rewards/rejected": -22.770343780517578, "step": 24980 }, { "epoch": 0.8421247767029559, "grad_norm": 22.446992874145508, "learning_rate": 7.40230018782661e-08, "logits/chosen": -2.0303831100463867, "logits/rejected": -2.356431722640991, "logps/chosen": -2.162581443786621, "logps/rejected": -2.779609203338623, "loss": 1.9192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.62581443786621, "rewards/margins": 6.1702752113342285, "rewards/rejected": -27.796092987060547, "step": 24985 }, { "epoch": 0.8422933027739391, "grad_norm": 403.3161926269531, "learning_rate": 7.386906183131414e-08, "logits/chosen": -1.5163092613220215, "logits/rejected": -1.9733335971832275, "logps/chosen": -2.665555238723755, "logps/rejected": -2.582866907119751, "loss": 4.4604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.65555191040039, "rewards/margins": -0.8268814086914062, "rewards/rejected": -25.82866859436035, "step": 24990 }, { "epoch": 0.8424618288449223, "grad_norm": 64.99507141113281, "learning_rate": 7.37152692514909e-08, "logits/chosen": -1.9403884410858154, "logits/rejected": -2.4308600425720215, "logps/chosen": -2.6366209983825684, "logps/rejected": -2.773503541946411, "loss": 3.0987, "rewards/accuracies": 0.5, "rewards/chosen": -26.366207122802734, "rewards/margins": 1.3688265085220337, "rewards/rejected": -27.735034942626953, "step": 24995 }, { "epoch": 0.8426303549159055, "grad_norm": 43.74968338012695, "learning_rate": 7.3561624192018e-08, "logits/chosen": -2.2320027351379395, "logits/rejected": -2.564361333847046, "logps/chosen": -2.5689501762390137, "logps/rejected": -2.8458104133605957, "loss": 2.9299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.689504623413086, "rewards/margins": 2.7685999870300293, "rewards/rejected": -28.458105087280273, "step": 25000 }, { "epoch": 0.8427988809868887, "grad_norm": 49.172855377197266, "learning_rate": 7.340812670606611e-08, "logits/chosen": -1.9074571132659912, "logits/rejected": -2.3194408416748047, "logps/chosen": -2.8225278854370117, "logps/rejected": -2.9414353370666504, "loss": 6.2644, "rewards/accuracies": 0.5, "rewards/chosen": -28.22528076171875, "rewards/margins": 1.1890745162963867, "rewards/rejected": -29.414352416992188, "step": 25005 }, { "epoch": 0.8429674070578719, "grad_norm": 20.995792388916016, "learning_rate": 7.32547768467544e-08, "logits/chosen": -1.8266541957855225, "logits/rejected": -2.029745578765869, "logps/chosen": -1.8279956579208374, "logps/rejected": -2.4323911666870117, "loss": 2.3263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.279956817626953, "rewards/margins": 6.043955326080322, "rewards/rejected": -24.323911666870117, "step": 25010 }, { "epoch": 0.843135933128855, "grad_norm": 41.86899185180664, "learning_rate": 7.310157466715133e-08, "logits/chosen": -0.9306265115737915, "logits/rejected": -2.0167274475097656, "logps/chosen": -2.461030960083008, "logps/rejected": -3.634398937225342, "loss": 2.6665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.610309600830078, "rewards/margins": 11.733680725097656, "rewards/rejected": -36.343990325927734, "step": 25015 }, { "epoch": 0.8433044591998382, "grad_norm": 44.50137710571289, "learning_rate": 7.294852022027409e-08, "logits/chosen": -1.8055055141448975, "logits/rejected": -2.123600721359253, "logps/chosen": -2.092360019683838, "logps/rejected": -2.493429660797119, "loss": 2.0098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.923603057861328, "rewards/margins": 4.010695457458496, "rewards/rejected": -24.934295654296875, "step": 25020 }, { "epoch": 0.8434729852708214, "grad_norm": 30.74772071838379, "learning_rate": 7.279561355908903e-08, "logits/chosen": -1.1198111772537231, "logits/rejected": -1.6509565114974976, "logps/chosen": -2.195266008377075, "logps/rejected": -2.4629082679748535, "loss": 3.6763, "rewards/accuracies": 0.5, "rewards/chosen": -21.95265769958496, "rewards/margins": 2.6764230728149414, "rewards/rejected": -24.62908172607422, "step": 25025 }, { "epoch": 0.8436415113418045, "grad_norm": 39.40415954589844, "learning_rate": 7.264285473651078e-08, "logits/chosen": -1.5459685325622559, "logits/rejected": -1.7445091009140015, "logps/chosen": -2.3278846740722656, "logps/rejected": -2.5380375385284424, "loss": 1.627, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.27884864807129, "rewards/margins": 2.101527690887451, "rewards/rejected": -25.3803768157959, "step": 25030 }, { "epoch": 0.8438100374127877, "grad_norm": 13.756035804748535, "learning_rate": 7.249024380540331e-08, "logits/chosen": -2.231560230255127, "logits/rejected": -2.355930805206299, "logps/chosen": -2.355832576751709, "logps/rejected": -2.809471607208252, "loss": 1.2205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.55832862854004, "rewards/margins": 4.5363874435424805, "rewards/rejected": -28.094715118408203, "step": 25035 }, { "epoch": 0.843978563483771, "grad_norm": 290.2860107421875, "learning_rate": 7.233778081857928e-08, "logits/chosen": -1.5744444131851196, "logits/rejected": -2.3242926597595215, "logps/chosen": -3.13602352142334, "logps/rejected": -3.7153167724609375, "loss": 4.2686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -31.360239028930664, "rewards/margins": 5.792929649353027, "rewards/rejected": -37.153160095214844, "step": 25040 }, { "epoch": 0.8441470895547541, "grad_norm": 29.042400360107422, "learning_rate": 7.218546582880003e-08, "logits/chosen": -1.6595700979232788, "logits/rejected": -1.9214191436767578, "logps/chosen": -2.1897659301757812, "logps/rejected": -2.8830249309539795, "loss": 1.8043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.897661209106445, "rewards/margins": 6.932587623596191, "rewards/rejected": -28.830249786376953, "step": 25045 }, { "epoch": 0.8443156156257373, "grad_norm": 38.23175811767578, "learning_rate": 7.203329888877602e-08, "logits/chosen": -1.7221580743789673, "logits/rejected": -1.9687376022338867, "logps/chosen": -3.106229782104492, "logps/rejected": -3.474976062774658, "loss": 3.0086, "rewards/accuracies": 0.5, "rewards/chosen": -31.062297821044922, "rewards/margins": 3.687462568283081, "rewards/rejected": -34.74976348876953, "step": 25050 }, { "epoch": 0.8444841416967205, "grad_norm": 19.823972702026367, "learning_rate": 7.188128005116589e-08, "logits/chosen": -1.8666727542877197, "logits/rejected": -2.0208840370178223, "logps/chosen": -1.9966617822647095, "logps/rejected": -2.352268934249878, "loss": 2.7333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.966617584228516, "rewards/margins": 3.556072950363159, "rewards/rejected": -23.522689819335938, "step": 25055 }, { "epoch": 0.8446526677677036, "grad_norm": 34.18622589111328, "learning_rate": 7.172940936857751e-08, "logits/chosen": -1.8579282760620117, "logits/rejected": -1.8631632328033447, "logps/chosen": -2.143566846847534, "logps/rejected": -2.2731566429138184, "loss": 2.4161, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.4356689453125, "rewards/margins": 1.295898199081421, "rewards/rejected": -22.7315673828125, "step": 25060 }, { "epoch": 0.8448211938386868, "grad_norm": 37.837615966796875, "learning_rate": 7.157768689356741e-08, "logits/chosen": -2.0021746158599854, "logits/rejected": -2.181589365005493, "logps/chosen": -2.607229709625244, "logps/rejected": -3.0638108253479004, "loss": 2.0169, "rewards/accuracies": 0.5, "rewards/chosen": -26.072296142578125, "rewards/margins": 4.565813064575195, "rewards/rejected": -30.638107299804688, "step": 25065 }, { "epoch": 0.84498971990967, "grad_norm": 43.49299240112305, "learning_rate": 7.142611267864068e-08, "logits/chosen": -2.0716075897216797, "logits/rejected": -2.165670871734619, "logps/chosen": -2.8804638385772705, "logps/rejected": -3.0383782386779785, "loss": 3.4523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.804637908935547, "rewards/margins": 1.5791432857513428, "rewards/rejected": -30.3837833404541, "step": 25070 }, { "epoch": 0.8451582459806533, "grad_norm": 33.29661560058594, "learning_rate": 7.127468677625137e-08, "logits/chosen": -1.3675150871276855, "logits/rejected": -1.3157413005828857, "logps/chosen": -2.02780818939209, "logps/rejected": -2.133814573287964, "loss": 3.5942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.2780818939209, "rewards/margins": 1.060063362121582, "rewards/rejected": -21.338146209716797, "step": 25075 }, { "epoch": 0.8453267720516364, "grad_norm": 7.868017673492432, "learning_rate": 7.112340923880172e-08, "logits/chosen": -1.3394761085510254, "logits/rejected": -1.7868926525115967, "logps/chosen": -1.9955241680145264, "logps/rejected": -2.234886646270752, "loss": 2.258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.955242156982422, "rewards/margins": 2.393623113632202, "rewards/rejected": -22.348867416381836, "step": 25080 }, { "epoch": 0.8454952981226196, "grad_norm": 40.71536636352539, "learning_rate": 7.097228011864304e-08, "logits/chosen": -1.532164216041565, "logits/rejected": -2.4663033485412598, "logps/chosen": -2.945025682449341, "logps/rejected": -3.815264940261841, "loss": 1.6375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.45025634765625, "rewards/margins": 8.702392578125, "rewards/rejected": -38.15264892578125, "step": 25085 }, { "epoch": 0.8456638241936028, "grad_norm": 107.01415252685547, "learning_rate": 7.082129946807525e-08, "logits/chosen": -1.9736740589141846, "logits/rejected": -1.782965898513794, "logps/chosen": -2.528587818145752, "logps/rejected": -2.898869276046753, "loss": 2.9653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.285877227783203, "rewards/margins": 3.702814817428589, "rewards/rejected": -28.988689422607422, "step": 25090 }, { "epoch": 0.8458323502645859, "grad_norm": 39.172874450683594, "learning_rate": 7.067046733934685e-08, "logits/chosen": -1.4259833097457886, "logits/rejected": -1.6555812358856201, "logps/chosen": -2.3975255489349365, "logps/rejected": -3.425919771194458, "loss": 2.7265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.975255966186523, "rewards/margins": 10.283941268920898, "rewards/rejected": -34.25919723510742, "step": 25095 }, { "epoch": 0.8460008763355691, "grad_norm": 36.24737548828125, "learning_rate": 7.051978378465461e-08, "logits/chosen": -1.5988880395889282, "logits/rejected": -1.908831000328064, "logps/chosen": -2.4758269786834717, "logps/rejected": -2.7519752979278564, "loss": 3.4556, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.758270263671875, "rewards/margins": 2.7614827156066895, "rewards/rejected": -27.519750595092773, "step": 25100 }, { "epoch": 0.8461694024065523, "grad_norm": 113.6759262084961, "learning_rate": 7.036924885614443e-08, "logits/chosen": -1.8109318017959595, "logits/rejected": -1.8489185571670532, "logps/chosen": -2.460669994354248, "logps/rejected": -2.535710096359253, "loss": 2.8731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.606698989868164, "rewards/margins": 0.7504032254219055, "rewards/rejected": -25.357101440429688, "step": 25105 }, { "epoch": 0.8463379284775355, "grad_norm": 48.524330139160156, "learning_rate": 7.021886260591053e-08, "logits/chosen": -1.6381628513336182, "logits/rejected": -1.8552592992782593, "logps/chosen": -1.9970314502716064, "logps/rejected": -2.153109073638916, "loss": 2.0627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.97031593322754, "rewards/margins": 1.5607770681381226, "rewards/rejected": -21.531091690063477, "step": 25110 }, { "epoch": 0.8465064545485187, "grad_norm": 44.247013092041016, "learning_rate": 7.006862508599554e-08, "logits/chosen": -1.9095268249511719, "logits/rejected": -2.1885132789611816, "logps/chosen": -2.5970306396484375, "logps/rejected": -3.2741634845733643, "loss": 3.3046, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.970306396484375, "rewards/margins": 6.771327018737793, "rewards/rejected": -32.741634368896484, "step": 25115 }, { "epoch": 0.8466749806195019, "grad_norm": 0.39335116744041443, "learning_rate": 6.991853634839068e-08, "logits/chosen": -2.1617605686187744, "logits/rejected": -2.1641488075256348, "logps/chosen": -2.821693181991577, "logps/rejected": -3.0508570671081543, "loss": 2.5592, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.216930389404297, "rewards/margins": 2.291637897491455, "rewards/rejected": -30.508569717407227, "step": 25120 }, { "epoch": 0.846843506690485, "grad_norm": 67.44718170166016, "learning_rate": 6.976859644503591e-08, "logits/chosen": -1.5310341119766235, "logits/rejected": -1.79361891746521, "logps/chosen": -2.7277884483337402, "logps/rejected": -2.9582366943359375, "loss": 4.3421, "rewards/accuracies": 0.5, "rewards/chosen": -27.277883529663086, "rewards/margins": 2.304482936859131, "rewards/rejected": -29.582366943359375, "step": 25125 }, { "epoch": 0.8470120327614682, "grad_norm": 33.54582977294922, "learning_rate": 6.961880542781962e-08, "logits/chosen": -1.2129520177841187, "logits/rejected": -1.4452576637268066, "logps/chosen": -2.5996367931365967, "logps/rejected": -3.155336380004883, "loss": 0.7775, "rewards/accuracies": 1.0, "rewards/chosen": -25.996368408203125, "rewards/margins": 5.5569963455200195, "rewards/rejected": -31.553363800048828, "step": 25130 }, { "epoch": 0.8471805588324514, "grad_norm": 11.366177558898926, "learning_rate": 6.946916334857822e-08, "logits/chosen": -1.721011757850647, "logits/rejected": -2.0394160747528076, "logps/chosen": -2.0530264377593994, "logps/rejected": -2.2458395957946777, "loss": 2.0855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.530263900756836, "rewards/margins": 1.9281337261199951, "rewards/rejected": -22.45839500427246, "step": 25135 }, { "epoch": 0.8473490849034345, "grad_norm": 44.85331344604492, "learning_rate": 6.931967025909724e-08, "logits/chosen": -1.4946274757385254, "logits/rejected": -1.5228080749511719, "logps/chosen": -2.5096378326416016, "logps/rejected": -2.6178455352783203, "loss": 3.6777, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.096378326416016, "rewards/margins": 1.082077980041504, "rewards/rejected": -26.178457260131836, "step": 25140 }, { "epoch": 0.8475176109744177, "grad_norm": 30.542884826660156, "learning_rate": 6.917032621111029e-08, "logits/chosen": -2.2814676761627197, "logits/rejected": -2.2030251026153564, "logps/chosen": -2.5095839500427246, "logps/rejected": -2.711360216140747, "loss": 2.8255, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.095840454101562, "rewards/margins": 2.017761468887329, "rewards/rejected": -27.113601684570312, "step": 25145 }, { "epoch": 0.847686137045401, "grad_norm": 104.68621063232422, "learning_rate": 6.902113125629938e-08, "logits/chosen": -1.8296295404434204, "logits/rejected": -2.0239923000335693, "logps/chosen": -2.2184555530548096, "logps/rejected": -2.414580821990967, "loss": 2.8458, "rewards/accuracies": 0.5, "rewards/chosen": -22.184555053710938, "rewards/margins": 1.9612529277801514, "rewards/rejected": -24.145809173583984, "step": 25150 }, { "epoch": 0.8478546631163841, "grad_norm": 63.11360168457031, "learning_rate": 6.887208544629503e-08, "logits/chosen": -1.5033972263336182, "logits/rejected": -1.4420006275177002, "logps/chosen": -2.1528759002685547, "logps/rejected": -2.4601898193359375, "loss": 2.2622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.528759002685547, "rewards/margins": 3.0731396675109863, "rewards/rejected": -24.601898193359375, "step": 25155 }, { "epoch": 0.8480231891873673, "grad_norm": 151.5568084716797, "learning_rate": 6.872318883267614e-08, "logits/chosen": -2.212207317352295, "logits/rejected": -1.8575718402862549, "logps/chosen": -2.4121429920196533, "logps/rejected": -2.3576273918151855, "loss": 4.9464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.121427536010742, "rewards/margins": -0.5451576113700867, "rewards/rejected": -23.576269149780273, "step": 25160 }, { "epoch": 0.8481917152583505, "grad_norm": 32.88406753540039, "learning_rate": 6.857444146697006e-08, "logits/chosen": -1.6049182415008545, "logits/rejected": -1.7114994525909424, "logps/chosen": -2.603238344192505, "logps/rejected": -2.758378028869629, "loss": 2.7873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.03238296508789, "rewards/margins": 1.5513970851898193, "rewards/rejected": -27.58378028869629, "step": 25165 }, { "epoch": 0.8483602413293336, "grad_norm": 44.524539947509766, "learning_rate": 6.842584340065222e-08, "logits/chosen": -1.3670152425765991, "logits/rejected": -1.4763129949569702, "logps/chosen": -1.884411096572876, "logps/rejected": -2.149660110473633, "loss": 2.0465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.8441104888916, "rewards/margins": 2.652489185333252, "rewards/rejected": -21.496601104736328, "step": 25170 }, { "epoch": 0.8485287674003168, "grad_norm": 60.07644271850586, "learning_rate": 6.827739468514659e-08, "logits/chosen": -2.2666361331939697, "logits/rejected": -2.264575719833374, "logps/chosen": -2.6530709266662598, "logps/rejected": -2.750534772872925, "loss": 3.4756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.53070640563965, "rewards/margins": 0.9746391177177429, "rewards/rejected": -27.505346298217773, "step": 25175 }, { "epoch": 0.8486972934713, "grad_norm": 34.915287017822266, "learning_rate": 6.812909537182565e-08, "logits/chosen": -1.5165358781814575, "logits/rejected": -1.5831248760223389, "logps/chosen": -2.374108076095581, "logps/rejected": -2.758392810821533, "loss": 2.4535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.741079330444336, "rewards/margins": 3.8428473472595215, "rewards/rejected": -27.58392906188965, "step": 25180 }, { "epoch": 0.8488658195422832, "grad_norm": 42.44291305541992, "learning_rate": 6.798094551200961e-08, "logits/chosen": -1.5595664978027344, "logits/rejected": -1.9098215103149414, "logps/chosen": -3.4170849323272705, "logps/rejected": -3.8340396881103516, "loss": 2.9079, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.17084503173828, "rewards/margins": 4.1695475578308105, "rewards/rejected": -38.340396881103516, "step": 25185 }, { "epoch": 0.8490343456132664, "grad_norm": 15.514145851135254, "learning_rate": 6.783294515696747e-08, "logits/chosen": -2.5971148014068604, "logits/rejected": -2.3608932495117188, "logps/chosen": -2.288546323776245, "logps/rejected": -2.5332279205322266, "loss": 3.8562, "rewards/accuracies": 0.5, "rewards/chosen": -22.88546371459961, "rewards/margins": 2.4468159675598145, "rewards/rejected": -25.332279205322266, "step": 25190 }, { "epoch": 0.8492028716842496, "grad_norm": 41.55581283569336, "learning_rate": 6.768509435791631e-08, "logits/chosen": -2.2747750282287598, "logits/rejected": -2.2851319313049316, "logps/chosen": -2.667318820953369, "logps/rejected": -2.459958076477051, "loss": 6.2535, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.67318344116211, "rewards/margins": -2.073606491088867, "rewards/rejected": -24.599578857421875, "step": 25195 }, { "epoch": 0.8493713977552327, "grad_norm": 75.3543701171875, "learning_rate": 6.753739316602148e-08, "logits/chosen": -2.2415099143981934, "logits/rejected": -2.157219171524048, "logps/chosen": -2.4603915214538574, "logps/rejected": -2.5724644660949707, "loss": 2.7241, "rewards/accuracies": 0.5, "rewards/chosen": -24.603918075561523, "rewards/margins": 1.1207275390625, "rewards/rejected": -25.724645614624023, "step": 25200 }, { "epoch": 0.8493713977552327, "eval_logits/chosen": -2.2988386154174805, "eval_logits/rejected": -2.477008104324341, "eval_logps/chosen": -2.283003330230713, "eval_logps/rejected": -2.4374542236328125, "eval_loss": 3.086169958114624, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.830032348632812, "eval_rewards/margins": 1.5445094108581543, "eval_rewards/rejected": -24.374540328979492, "eval_runtime": 12.887, "eval_samples_per_second": 7.76, "eval_steps_per_second": 1.94, "step": 25200 }, { "epoch": 0.8495399238262159, "grad_norm": 26.182111740112305, "learning_rate": 6.738984163239647e-08, "logits/chosen": -1.3157546520233154, "logits/rejected": -1.4888827800750732, "logps/chosen": -2.333944320678711, "logps/rejected": -3.176734447479248, "loss": 1.0272, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.33944320678711, "rewards/margins": 8.427900314331055, "rewards/rejected": -31.767343521118164, "step": 25205 }, { "epoch": 0.8497084498971991, "grad_norm": 21.64483070373535, "learning_rate": 6.724243980810319e-08, "logits/chosen": -2.0497689247131348, "logits/rejected": -2.2630741596221924, "logps/chosen": -2.9822685718536377, "logps/rejected": -3.110841989517212, "loss": 2.0844, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.82268714904785, "rewards/margins": 1.285732388496399, "rewards/rejected": -31.10841941833496, "step": 25210 }, { "epoch": 0.8498769759681822, "grad_norm": 9.70484447479248, "learning_rate": 6.709518774415157e-08, "logits/chosen": -1.9983733892440796, "logits/rejected": -2.8238205909729004, "logps/chosen": -2.225658416748047, "logps/rejected": -2.7286124229431152, "loss": 1.9975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.25658416748047, "rewards/margins": 5.029541969299316, "rewards/rejected": -27.2861270904541, "step": 25215 }, { "epoch": 0.8500455020391655, "grad_norm": 39.07335662841797, "learning_rate": 6.69480854914996e-08, "logits/chosen": -1.7552353143692017, "logits/rejected": -2.2177629470825195, "logps/chosen": -2.202458620071411, "logps/rejected": -2.6856517791748047, "loss": 1.6069, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.024587631225586, "rewards/margins": 4.831931114196777, "rewards/rejected": -26.856517791748047, "step": 25220 }, { "epoch": 0.8502140281101487, "grad_norm": 50.24772262573242, "learning_rate": 6.680113310105373e-08, "logits/chosen": -1.8044487237930298, "logits/rejected": -1.6741470098495483, "logps/chosen": -3.385709762573242, "logps/rejected": -3.506852626800537, "loss": 2.8577, "rewards/accuracies": 0.5, "rewards/chosen": -33.85709762573242, "rewards/margins": 1.2114317417144775, "rewards/rejected": -35.06852722167969, "step": 25225 }, { "epoch": 0.8503825541811318, "grad_norm": 231.3321075439453, "learning_rate": 6.665433062366838e-08, "logits/chosen": -1.8543148040771484, "logits/rejected": -1.9665143489837646, "logps/chosen": -2.7595534324645996, "logps/rejected": -3.008307456970215, "loss": 2.334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.595539093017578, "rewards/margins": 2.4875378608703613, "rewards/rejected": -30.08307456970215, "step": 25230 }, { "epoch": 0.850551080252115, "grad_norm": 34.387332916259766, "learning_rate": 6.650767811014602e-08, "logits/chosen": -1.3960977792739868, "logits/rejected": -1.4877779483795166, "logps/chosen": -2.183588743209839, "logps/rejected": -2.609048366546631, "loss": 2.8054, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.835886001586914, "rewards/margins": 4.254596710205078, "rewards/rejected": -26.09048080444336, "step": 25235 }, { "epoch": 0.8507196063230982, "grad_norm": 23.472320556640625, "learning_rate": 6.636117561123733e-08, "logits/chosen": -1.3947211503982544, "logits/rejected": -1.6472208499908447, "logps/chosen": -2.369253396987915, "logps/rejected": -2.7023870944976807, "loss": 2.2897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.69253158569336, "rewards/margins": 3.3313395977020264, "rewards/rejected": -27.02387046813965, "step": 25240 }, { "epoch": 0.8508881323940813, "grad_norm": 26.026569366455078, "learning_rate": 6.621482317764104e-08, "logits/chosen": -1.4750478267669678, "logits/rejected": -1.8304048776626587, "logps/chosen": -2.4229671955108643, "logps/rejected": -2.976175308227539, "loss": 1.5136, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.229671478271484, "rewards/margins": 5.53208065032959, "rewards/rejected": -29.76175308227539, "step": 25245 }, { "epoch": 0.8510566584650645, "grad_norm": 33.33852005004883, "learning_rate": 6.606862086000414e-08, "logits/chosen": -1.5895836353302002, "logits/rejected": -2.616696357727051, "logps/chosen": -2.1377921104431152, "logps/rejected": -2.691991090774536, "loss": 2.8422, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.37792205810547, "rewards/margins": 5.541991233825684, "rewards/rejected": -26.919910430908203, "step": 25250 }, { "epoch": 0.8512251845360477, "grad_norm": 32.822242736816406, "learning_rate": 6.592256870892122e-08, "logits/chosen": -1.5423781871795654, "logits/rejected": -2.161618709564209, "logps/chosen": -2.338876247406006, "logps/rejected": -3.336784839630127, "loss": 1.0311, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.388761520385742, "rewards/margins": 9.979085922241211, "rewards/rejected": -33.36784744262695, "step": 25255 }, { "epoch": 0.851393710607031, "grad_norm": 45.84088134765625, "learning_rate": 6.577666677493532e-08, "logits/chosen": -2.2524895668029785, "logits/rejected": -1.8200067281723022, "logps/chosen": -3.145887851715088, "logps/rejected": -3.471153736114502, "loss": 2.6785, "rewards/accuracies": 0.5, "rewards/chosen": -31.458881378173828, "rewards/margins": 3.252655029296875, "rewards/rejected": -34.7115364074707, "step": 25260 }, { "epoch": 0.8515622366780141, "grad_norm": 20.831850051879883, "learning_rate": 6.563091510853741e-08, "logits/chosen": -1.4504529237747192, "logits/rejected": -1.597791075706482, "logps/chosen": -1.9485721588134766, "logps/rejected": -2.133368492126465, "loss": 2.2884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.485719680786133, "rewards/margins": 1.8479652404785156, "rewards/rejected": -21.333683013916016, "step": 25265 }, { "epoch": 0.8517307627489973, "grad_norm": 26.64033317565918, "learning_rate": 6.548531376016619e-08, "logits/chosen": -2.3206093311309814, "logits/rejected": -2.648336887359619, "logps/chosen": -2.4551775455474854, "logps/rejected": -2.920405864715576, "loss": 2.0748, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.551776885986328, "rewards/margins": 4.652281284332275, "rewards/rejected": -29.204059600830078, "step": 25270 }, { "epoch": 0.8518992888199804, "grad_norm": 521.4677734375, "learning_rate": 6.533986278020875e-08, "logits/chosen": -1.5181517601013184, "logits/rejected": -2.314953327178955, "logps/chosen": -3.4335227012634277, "logps/rejected": -3.6828982830047607, "loss": 6.0014, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.335227966308594, "rewards/margins": 2.493752956390381, "rewards/rejected": -36.828983306884766, "step": 25275 }, { "epoch": 0.8520678148909636, "grad_norm": 32.94503402709961, "learning_rate": 6.519456221899982e-08, "logits/chosen": -2.042977809906006, "logits/rejected": -2.5099101066589355, "logps/chosen": -2.3646905422210693, "logps/rejected": -2.729539394378662, "loss": 1.9035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.64690589904785, "rewards/margins": 3.6484885215759277, "rewards/rejected": -27.295394897460938, "step": 25280 }, { "epoch": 0.8522363409619468, "grad_norm": 18.743024826049805, "learning_rate": 6.50494121268224e-08, "logits/chosen": -1.6273609399795532, "logits/rejected": -1.8308128118515015, "logps/chosen": -1.9369827508926392, "logps/rejected": -2.1672778129577637, "loss": 2.0784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.369827270507812, "rewards/margins": 2.3029518127441406, "rewards/rejected": -21.672779083251953, "step": 25285 }, { "epoch": 0.85240486703293, "grad_norm": 1.6357653141021729, "learning_rate": 6.4904412553907e-08, "logits/chosen": -1.824774146080017, "logits/rejected": -2.023179769515991, "logps/chosen": -2.229252338409424, "logps/rejected": -2.7864389419555664, "loss": 2.2768, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.292522430419922, "rewards/margins": 5.571867942810059, "rewards/rejected": -27.864391326904297, "step": 25290 }, { "epoch": 0.8525733931039132, "grad_norm": 31.789331436157227, "learning_rate": 6.475956355043227e-08, "logits/chosen": -2.1489577293395996, "logits/rejected": -2.3417067527770996, "logps/chosen": -2.325360059738159, "logps/rejected": -2.52240252494812, "loss": 2.8407, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.25360107421875, "rewards/margins": 1.9704246520996094, "rewards/rejected": -25.22402572631836, "step": 25295 }, { "epoch": 0.8527419191748964, "grad_norm": 33.04253005981445, "learning_rate": 6.461486516652492e-08, "logits/chosen": -1.5875952243804932, "logits/rejected": -1.7361282110214233, "logps/chosen": -2.2500598430633545, "logps/rejected": -2.3189072608947754, "loss": 3.0055, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.500600814819336, "rewards/margins": 0.6884740591049194, "rewards/rejected": -23.189075469970703, "step": 25300 }, { "epoch": 0.8529104452458796, "grad_norm": 28.588184356689453, "learning_rate": 6.447031745225917e-08, "logits/chosen": -1.8044865131378174, "logits/rejected": -2.0297751426696777, "logps/chosen": -3.2455849647521973, "logps/rejected": -3.45509672164917, "loss": 2.4975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.45585250854492, "rewards/margins": 2.095115900039673, "rewards/rejected": -34.550968170166016, "step": 25305 }, { "epoch": 0.8530789713168627, "grad_norm": 48.96995544433594, "learning_rate": 6.432592045765733e-08, "logits/chosen": -1.811680555343628, "logits/rejected": -2.0719192028045654, "logps/chosen": -2.452927350997925, "logps/rejected": -2.5236716270446777, "loss": 2.9467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.529273986816406, "rewards/margins": 0.707441508769989, "rewards/rejected": -25.236713409423828, "step": 25310 }, { "epoch": 0.8532474973878459, "grad_norm": 43.726924896240234, "learning_rate": 6.41816742326896e-08, "logits/chosen": -1.991275429725647, "logits/rejected": -1.9130961894989014, "logps/chosen": -2.3691442012786865, "logps/rejected": -2.2164864540100098, "loss": 5.1097, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.69144058227539, "rewards/margins": -1.526573657989502, "rewards/rejected": -22.164867401123047, "step": 25315 }, { "epoch": 0.853416023458829, "grad_norm": 23.73866081237793, "learning_rate": 6.403757882727389e-08, "logits/chosen": -1.9794378280639648, "logits/rejected": -2.502854108810425, "logps/chosen": -3.8947620391845703, "logps/rejected": -4.647494792938232, "loss": 1.7916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -38.9476203918457, "rewards/margins": 7.527327060699463, "rewards/rejected": -46.474945068359375, "step": 25320 }, { "epoch": 0.8535845495298122, "grad_norm": 25.906620025634766, "learning_rate": 6.389363429127586e-08, "logits/chosen": -1.6756868362426758, "logits/rejected": -1.8262481689453125, "logps/chosen": -2.3997318744659424, "logps/rejected": -2.5467724800109863, "loss": 2.8854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.997316360473633, "rewards/margins": 1.4704080820083618, "rewards/rejected": -25.467723846435547, "step": 25325 }, { "epoch": 0.8537530756007955, "grad_norm": 96.08812713623047, "learning_rate": 6.374984067450912e-08, "logits/chosen": -3.0671684741973877, "logits/rejected": -3.0086820125579834, "logps/chosen": -3.6582512855529785, "logps/rejected": -3.5938503742218018, "loss": 4.6296, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.58251190185547, "rewards/margins": -0.6440094113349915, "rewards/rejected": -35.938499450683594, "step": 25330 }, { "epoch": 0.8539216016717787, "grad_norm": 44.67426681518555, "learning_rate": 6.36061980267349e-08, "logits/chosen": -1.7330238819122314, "logits/rejected": -1.7280261516571045, "logps/chosen": -2.71097993850708, "logps/rejected": -2.6598942279815674, "loss": 4.144, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -27.109798431396484, "rewards/margins": -0.5108593106269836, "rewards/rejected": -26.598941802978516, "step": 25335 }, { "epoch": 0.8540901277427618, "grad_norm": 459.4154357910156, "learning_rate": 6.346270639766232e-08, "logits/chosen": -2.0010597705841064, "logits/rejected": -2.0018951892852783, "logps/chosen": -2.6174869537353516, "logps/rejected": -2.702815055847168, "loss": 2.7935, "rewards/accuracies": 0.5, "rewards/chosen": -26.174869537353516, "rewards/margins": 0.8532818555831909, "rewards/rejected": -27.028152465820312, "step": 25340 }, { "epoch": 0.854258653813745, "grad_norm": 51.30471420288086, "learning_rate": 6.331936583694819e-08, "logits/chosen": -2.0278751850128174, "logits/rejected": -1.9472980499267578, "logps/chosen": -2.5634241104125977, "logps/rejected": -2.626685857772827, "loss": 2.7637, "rewards/accuracies": 0.5, "rewards/chosen": -25.634241104125977, "rewards/margins": 0.6326183080673218, "rewards/rejected": -26.266857147216797, "step": 25345 }, { "epoch": 0.8544271798847282, "grad_norm": 19.242332458496094, "learning_rate": 6.317617639419714e-08, "logits/chosen": -1.845207929611206, "logits/rejected": -2.0394089221954346, "logps/chosen": -3.0137135982513428, "logps/rejected": -3.1849112510681152, "loss": 3.7604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.137136459350586, "rewards/margins": 1.7119754552841187, "rewards/rejected": -31.849109649658203, "step": 25350 }, { "epoch": 0.8545957059557113, "grad_norm": 16.459596633911133, "learning_rate": 6.303313811896111e-08, "logits/chosen": -1.489900827407837, "logits/rejected": -1.5980645418167114, "logps/chosen": -2.0218257904052734, "logps/rejected": -2.2330009937286377, "loss": 2.9988, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.218257904052734, "rewards/margins": 2.1117513179779053, "rewards/rejected": -22.33000946044922, "step": 25355 }, { "epoch": 0.8547642320266945, "grad_norm": 24.089542388916016, "learning_rate": 6.289025106074019e-08, "logits/chosen": -1.951390027999878, "logits/rejected": -2.3046865463256836, "logps/chosen": -2.4620213508605957, "logps/rejected": -2.7450006008148193, "loss": 2.0841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.62021255493164, "rewards/margins": 2.8297929763793945, "rewards/rejected": -27.45000648498535, "step": 25360 }, { "epoch": 0.8549327580976777, "grad_norm": 55.347660064697266, "learning_rate": 6.274751526898197e-08, "logits/chosen": -1.7793890237808228, "logits/rejected": -1.7488059997558594, "logps/chosen": -2.639188051223755, "logps/rejected": -3.5281052589416504, "loss": 1.8899, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.39188003540039, "rewards/margins": 8.889172554016113, "rewards/rejected": -35.28104782104492, "step": 25365 }, { "epoch": 0.8551012841686609, "grad_norm": 18.582015991210938, "learning_rate": 6.260493079308176e-08, "logits/chosen": -2.4654316902160645, "logits/rejected": -2.598381519317627, "logps/chosen": -2.64225697517395, "logps/rejected": -2.852651834487915, "loss": 3.1262, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.42256736755371, "rewards/margins": 2.1039481163024902, "rewards/rejected": -28.52651596069336, "step": 25370 }, { "epoch": 0.8552698102396441, "grad_norm": 36.65127944946289, "learning_rate": 6.24624976823822e-08, "logits/chosen": -2.479947328567505, "logits/rejected": -2.5645601749420166, "logps/chosen": -2.4463698863983154, "logps/rejected": -2.914379596710205, "loss": 2.9974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.463699340820312, "rewards/margins": 4.680096626281738, "rewards/rejected": -29.143795013427734, "step": 25375 }, { "epoch": 0.8554383363106273, "grad_norm": 4.597733974456787, "learning_rate": 6.232021598617388e-08, "logits/chosen": -1.8337205648422241, "logits/rejected": -2.154306411743164, "logps/chosen": -2.193875789642334, "logps/rejected": -3.0177786350250244, "loss": 1.1959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.93876075744629, "rewards/margins": 8.23902416229248, "rewards/rejected": -30.177783966064453, "step": 25380 }, { "epoch": 0.8556068623816104, "grad_norm": 2.200549602508545, "learning_rate": 6.217808575369493e-08, "logits/chosen": -1.9514930248260498, "logits/rejected": -1.7945950031280518, "logps/chosen": -1.9390647411346436, "logps/rejected": -2.213057041168213, "loss": 2.0697, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.390647888183594, "rewards/margins": 2.7399203777313232, "rewards/rejected": -22.13056755065918, "step": 25385 }, { "epoch": 0.8557753884525936, "grad_norm": 30.521926879882812, "learning_rate": 6.203610703413114e-08, "logits/chosen": -1.8817222118377686, "logits/rejected": -2.326267719268799, "logps/chosen": -2.600776195526123, "logps/rejected": -3.109726667404175, "loss": 1.8063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.007761001586914, "rewards/margins": 5.089505672454834, "rewards/rejected": -31.097265243530273, "step": 25390 }, { "epoch": 0.8559439145235768, "grad_norm": 35.248165130615234, "learning_rate": 6.18942798766155e-08, "logits/chosen": -1.4958255290985107, "logits/rejected": -1.527790904045105, "logps/chosen": -2.2628679275512695, "logps/rejected": -2.3205666542053223, "loss": 2.8091, "rewards/accuracies": 0.5, "rewards/chosen": -22.628681182861328, "rewards/margins": 0.5769863128662109, "rewards/rejected": -23.205665588378906, "step": 25395 }, { "epoch": 0.8561124405945599, "grad_norm": 22.03529167175293, "learning_rate": 6.175260433022889e-08, "logits/chosen": -2.300389528274536, "logits/rejected": -2.2869935035705566, "logps/chosen": -2.078281879425049, "logps/rejected": -2.5390524864196777, "loss": 2.0494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.782817840576172, "rewards/margins": 4.607706546783447, "rewards/rejected": -25.39052391052246, "step": 25400 }, { "epoch": 0.8562809666655432, "grad_norm": 34.80204391479492, "learning_rate": 6.161108044399976e-08, "logits/chosen": -2.219104528427124, "logits/rejected": -2.532747507095337, "logps/chosen": -2.918858766555786, "logps/rejected": -3.3544135093688965, "loss": 5.1036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.188587188720703, "rewards/margins": 4.355550289154053, "rewards/rejected": -33.54413604736328, "step": 25405 }, { "epoch": 0.8564494927365264, "grad_norm": 34.77595520019531, "learning_rate": 6.146970826690378e-08, "logits/chosen": -2.205582857131958, "logits/rejected": -2.330531597137451, "logps/chosen": -2.4941062927246094, "logps/rejected": -2.755558967590332, "loss": 2.306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.941062927246094, "rewards/margins": 2.61452579498291, "rewards/rejected": -27.555587768554688, "step": 25410 }, { "epoch": 0.8566180188075095, "grad_norm": 56.285179138183594, "learning_rate": 6.132848784786437e-08, "logits/chosen": -1.6364961862564087, "logits/rejected": -1.7433507442474365, "logps/chosen": -2.412095308303833, "logps/rejected": -2.6662497520446777, "loss": 2.2664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.120952606201172, "rewards/margins": 2.5415430068969727, "rewards/rejected": -26.662494659423828, "step": 25415 }, { "epoch": 0.8567865448784927, "grad_norm": 47.27323532104492, "learning_rate": 6.118741923575233e-08, "logits/chosen": -1.5887492895126343, "logits/rejected": -1.5800529718399048, "logps/chosen": -2.2246580123901367, "logps/rejected": -2.341395616531372, "loss": 2.3448, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.246578216552734, "rewards/margins": 1.167377233505249, "rewards/rejected": -23.413955688476562, "step": 25420 }, { "epoch": 0.8569550709494759, "grad_norm": 21.56765365600586, "learning_rate": 6.104650247938609e-08, "logits/chosen": -1.5196692943572998, "logits/rejected": -1.9559457302093506, "logps/chosen": -2.3416597843170166, "logps/rejected": -2.5563902854919434, "loss": 2.3498, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.41659927368164, "rewards/margins": 2.147305727005005, "rewards/rejected": -25.56390380859375, "step": 25425 }, { "epoch": 0.857123597020459, "grad_norm": 150.3126678466797, "learning_rate": 6.090573762753115e-08, "logits/chosen": -1.950821876525879, "logits/rejected": -2.2176384925842285, "logps/chosen": -2.4310436248779297, "logps/rejected": -2.4594826698303223, "loss": 3.4541, "rewards/accuracies": 0.5, "rewards/chosen": -24.310436248779297, "rewards/margins": 0.28439101576805115, "rewards/rejected": -24.59482765197754, "step": 25430 }, { "epoch": 0.8572921230914422, "grad_norm": 148.96141052246094, "learning_rate": 6.076512472890077e-08, "logits/chosen": -1.6106939315795898, "logits/rejected": -1.8222036361694336, "logps/chosen": -1.9590381383895874, "logps/rejected": -2.043668270111084, "loss": 3.5791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.590383529663086, "rewards/margins": 0.8463004231452942, "rewards/rejected": -20.436681747436523, "step": 25435 }, { "epoch": 0.8574606491624255, "grad_norm": 44.487586975097656, "learning_rate": 6.06246638321557e-08, "logits/chosen": -1.8768688440322876, "logits/rejected": -2.0317156314849854, "logps/chosen": -2.3081698417663574, "logps/rejected": -2.3036158084869385, "loss": 3.8499, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -23.081695556640625, "rewards/margins": -0.045539092272520065, "rewards/rejected": -23.036157608032227, "step": 25440 }, { "epoch": 0.8576291752334086, "grad_norm": 24.27766227722168, "learning_rate": 6.048435498590366e-08, "logits/chosen": -0.9871917963027954, "logits/rejected": -1.2592726945877075, "logps/chosen": -3.0321362018585205, "logps/rejected": -3.322997570037842, "loss": 2.0057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.321359634399414, "rewards/margins": 2.9086170196533203, "rewards/rejected": -33.22998046875, "step": 25445 }, { "epoch": 0.8577977013043918, "grad_norm": 85.73827362060547, "learning_rate": 6.034419823870012e-08, "logits/chosen": -2.156827926635742, "logits/rejected": -2.120016574859619, "logps/chosen": -2.8094635009765625, "logps/rejected": -2.7601757049560547, "loss": 4.3052, "rewards/accuracies": 0.5, "rewards/chosen": -28.094635009765625, "rewards/margins": -0.4928779602050781, "rewards/rejected": -27.601755142211914, "step": 25450 }, { "epoch": 0.857966227375375, "grad_norm": 30.530527114868164, "learning_rate": 6.020419363904783e-08, "logits/chosen": -1.977306604385376, "logits/rejected": -2.1351287364959717, "logps/chosen": -3.0446646213531494, "logps/rejected": -3.5633749961853027, "loss": 1.9825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.4466495513916, "rewards/margins": 5.187103271484375, "rewards/rejected": -35.633750915527344, "step": 25455 }, { "epoch": 0.8581347534463581, "grad_norm": 171.41098022460938, "learning_rate": 6.0064341235397e-08, "logits/chosen": -1.567341923713684, "logits/rejected": -1.6634018421173096, "logps/chosen": -3.08695650100708, "logps/rejected": -2.7849831581115723, "loss": 6.2163, "rewards/accuracies": 0.5, "rewards/chosen": -30.86956787109375, "rewards/margins": -3.019737720489502, "rewards/rejected": -27.849828720092773, "step": 25460 }, { "epoch": 0.8583032795173413, "grad_norm": 61.39971160888672, "learning_rate": 5.992464107614475e-08, "logits/chosen": -1.4290657043457031, "logits/rejected": -1.4986751079559326, "logps/chosen": -2.2665841579437256, "logps/rejected": -2.458251953125, "loss": 2.5043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.66584014892578, "rewards/margins": 1.91667902469635, "rewards/rejected": -24.58251953125, "step": 25465 }, { "epoch": 0.8584718055883245, "grad_norm": 27.6011962890625, "learning_rate": 5.978509320963593e-08, "logits/chosen": -1.7557885646820068, "logits/rejected": -1.6499592065811157, "logps/chosen": -2.4323935508728027, "logps/rejected": -2.376591444015503, "loss": 4.2695, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.32393455505371, "rewards/margins": -0.558021068572998, "rewards/rejected": -23.765914916992188, "step": 25470 }, { "epoch": 0.8586403316593076, "grad_norm": 42.387786865234375, "learning_rate": 5.964569768416261e-08, "logits/chosen": -1.96505868434906, "logits/rejected": -2.3774993419647217, "logps/chosen": -2.227853298187256, "logps/rejected": -2.4281139373779297, "loss": 2.3639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.278533935546875, "rewards/margins": 2.0026066303253174, "rewards/rejected": -24.28114128112793, "step": 25475 }, { "epoch": 0.8588088577302909, "grad_norm": 51.79988479614258, "learning_rate": 5.950645454796416e-08, "logits/chosen": -2.068077802658081, "logits/rejected": -2.061128854751587, "logps/chosen": -2.6374306678771973, "logps/rejected": -2.715590238571167, "loss": 4.849, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.37430763244629, "rewards/margins": 0.7815954089164734, "rewards/rejected": -27.155902862548828, "step": 25480 }, { "epoch": 0.8589773838012741, "grad_norm": 0.0009354325011372566, "learning_rate": 5.936736384922691e-08, "logits/chosen": -1.0480647087097168, "logits/rejected": -1.550246238708496, "logps/chosen": -1.7990306615829468, "logps/rejected": -2.71217679977417, "loss": 0.5304, "rewards/accuracies": 1.0, "rewards/chosen": -17.990306854248047, "rewards/margins": 9.13145923614502, "rewards/rejected": -27.12176513671875, "step": 25485 }, { "epoch": 0.8591459098722573, "grad_norm": 14.77606201171875, "learning_rate": 5.9228425636084824e-08, "logits/chosen": -1.341812252998352, "logits/rejected": -2.1341910362243652, "logps/chosen": -2.212036609649658, "logps/rejected": -2.9296481609344482, "loss": 3.4865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.1203670501709, "rewards/margins": 7.1761155128479, "rewards/rejected": -29.29648208618164, "step": 25490 }, { "epoch": 0.8593144359432404, "grad_norm": 68.93023681640625, "learning_rate": 5.908963995661892e-08, "logits/chosen": -1.236950397491455, "logits/rejected": -1.241420030593872, "logps/chosen": -2.6127872467041016, "logps/rejected": -2.8742825984954834, "loss": 2.2873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.12787437438965, "rewards/margins": 2.614952325820923, "rewards/rejected": -28.742828369140625, "step": 25495 }, { "epoch": 0.8594829620142236, "grad_norm": 10.785711288452148, "learning_rate": 5.895100685885745e-08, "logits/chosen": -1.7537428140640259, "logits/rejected": -1.8643490076065063, "logps/chosen": -2.1532981395721436, "logps/rejected": -2.436422824859619, "loss": 1.1377, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.53297996520996, "rewards/margins": 2.8312461376190186, "rewards/rejected": -24.364227294921875, "step": 25500 }, { "epoch": 0.8596514880852067, "grad_norm": 42.79972457885742, "learning_rate": 5.881252639077583e-08, "logits/chosen": -1.8643391132354736, "logits/rejected": -1.891916036605835, "logps/chosen": -2.975830554962158, "logps/rejected": -3.31604266166687, "loss": 3.4354, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.7583065032959, "rewards/margins": 3.402117967605591, "rewards/rejected": -33.160423278808594, "step": 25505 }, { "epoch": 0.8598200141561899, "grad_norm": 42.81157302856445, "learning_rate": 5.867419860029688e-08, "logits/chosen": -1.5780177116394043, "logits/rejected": -2.0716984272003174, "logps/chosen": -2.1099133491516113, "logps/rejected": -3.069995164871216, "loss": 2.5412, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.099132537841797, "rewards/margins": 9.600818634033203, "rewards/rejected": -30.699951171875, "step": 25510 }, { "epoch": 0.8599885402271732, "grad_norm": 28.209131240844727, "learning_rate": 5.8536023535290134e-08, "logits/chosen": -1.9131946563720703, "logits/rejected": -2.2496771812438965, "logps/chosen": -1.9939839839935303, "logps/rejected": -2.5202078819274902, "loss": 1.3354, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.93984031677246, "rewards/margins": 5.262241840362549, "rewards/rejected": -25.202083587646484, "step": 25515 }, { "epoch": 0.8601570662981564, "grad_norm": 57.89436340332031, "learning_rate": 5.839800124357264e-08, "logits/chosen": -1.967872977256775, "logits/rejected": -1.999243140220642, "logps/chosen": -2.5514752864837646, "logps/rejected": -2.5840890407562256, "loss": 3.0422, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.514755249023438, "rewards/margins": 0.3261362910270691, "rewards/rejected": -25.840890884399414, "step": 25520 }, { "epoch": 0.8603255923691395, "grad_norm": 25.394502639770508, "learning_rate": 5.8260131772908504e-08, "logits/chosen": -1.37850022315979, "logits/rejected": -1.5792301893234253, "logps/chosen": -2.2832894325256348, "logps/rejected": -2.744271755218506, "loss": 2.6536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.8328914642334, "rewards/margins": 4.609823703765869, "rewards/rejected": -27.442718505859375, "step": 25525 }, { "epoch": 0.8604941184401227, "grad_norm": 22.741680145263672, "learning_rate": 5.812241517100902e-08, "logits/chosen": -2.1774706840515137, "logits/rejected": -2.6091322898864746, "logps/chosen": -2.7606756687164307, "logps/rejected": -3.212062358856201, "loss": 2.5673, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.60675621032715, "rewards/margins": 4.513869762420654, "rewards/rejected": -32.120628356933594, "step": 25530 }, { "epoch": 0.8606626445111059, "grad_norm": 26.45964241027832, "learning_rate": 5.7984851485532284e-08, "logits/chosen": -1.8604751825332642, "logits/rejected": -1.7118768692016602, "logps/chosen": -3.140791654586792, "logps/rejected": -2.977461338043213, "loss": 5.6263, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -31.407917022705078, "rewards/margins": -1.6333030462265015, "rewards/rejected": -29.774616241455078, "step": 25535 }, { "epoch": 0.860831170582089, "grad_norm": 32.870521545410156, "learning_rate": 5.784744076408371e-08, "logits/chosen": -1.555826187133789, "logits/rejected": -1.7253106832504272, "logps/chosen": -2.182738780975342, "logps/rejected": -2.4404165744781494, "loss": 2.0564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.8273868560791, "rewards/margins": 2.576777935028076, "rewards/rejected": -24.404163360595703, "step": 25540 }, { "epoch": 0.8609996966530722, "grad_norm": 6.715198516845703, "learning_rate": 5.771018305421588e-08, "logits/chosen": -1.9538259506225586, "logits/rejected": -2.2901089191436768, "logps/chosen": -2.500892400741577, "logps/rejected": -2.8917453289031982, "loss": 3.0437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.008922576904297, "rewards/margins": 3.9085326194763184, "rewards/rejected": -28.917455673217773, "step": 25545 }, { "epoch": 0.8611682227240555, "grad_norm": 36.01880645751953, "learning_rate": 5.757307840342807e-08, "logits/chosen": -1.9131215810775757, "logits/rejected": -1.9809229373931885, "logps/chosen": -2.1885132789611816, "logps/rejected": -2.4454269409179688, "loss": 2.4507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.8851318359375, "rewards/margins": 2.569138288497925, "rewards/rejected": -24.45427131652832, "step": 25550 }, { "epoch": 0.8613367487950386, "grad_norm": 156.0620880126953, "learning_rate": 5.743612685916688e-08, "logits/chosen": -1.8777625560760498, "logits/rejected": -1.917538046836853, "logps/chosen": -3.0116991996765137, "logps/rejected": -2.9883906841278076, "loss": 4.478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.116992950439453, "rewards/margins": -0.23308487236499786, "rewards/rejected": -29.8839054107666, "step": 25555 }, { "epoch": 0.8615052748660218, "grad_norm": 517.7734985351562, "learning_rate": 5.72993284688259e-08, "logits/chosen": -1.6766964197158813, "logits/rejected": -1.5441253185272217, "logps/chosen": -2.456958055496216, "logps/rejected": -2.3334765434265137, "loss": 4.4397, "rewards/accuracies": 0.5, "rewards/chosen": -24.569580078125, "rewards/margins": -1.2348133325576782, "rewards/rejected": -23.334766387939453, "step": 25560 }, { "epoch": 0.861673800937005, "grad_norm": 36.23284912109375, "learning_rate": 5.7162683279745715e-08, "logits/chosen": -2.2240848541259766, "logits/rejected": -2.341034412384033, "logps/chosen": -2.754894971847534, "logps/rejected": -2.6347315311431885, "loss": 4.8563, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -27.5489501953125, "rewards/margins": -1.201634407043457, "rewards/rejected": -26.347314834594727, "step": 25565 }, { "epoch": 0.8618423270079881, "grad_norm": 17.289134979248047, "learning_rate": 5.7026191339213655e-08, "logits/chosen": -2.197456121444702, "logits/rejected": -2.4068684577941895, "logps/chosen": -2.623168468475342, "logps/rejected": -3.1079468727111816, "loss": 1.5349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.231685638427734, "rewards/margins": 4.847784519195557, "rewards/rejected": -31.079471588134766, "step": 25570 }, { "epoch": 0.8620108530789713, "grad_norm": 39.08678436279297, "learning_rate": 5.688985269446428e-08, "logits/chosen": -1.9539051055908203, "logits/rejected": -1.9796451330184937, "logps/chosen": -2.2235019207000732, "logps/rejected": -2.5791525840759277, "loss": 2.3173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.23501968383789, "rewards/margins": 3.556506395339966, "rewards/rejected": -25.79152488708496, "step": 25575 }, { "epoch": 0.8621793791499545, "grad_norm": 27.619142532348633, "learning_rate": 5.675366739267917e-08, "logits/chosen": -1.809984803199768, "logits/rejected": -2.1333956718444824, "logps/chosen": -2.193274974822998, "logps/rejected": -2.231731414794922, "loss": 2.9135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.932748794555664, "rewards/margins": 0.38456735014915466, "rewards/rejected": -22.31731605529785, "step": 25580 }, { "epoch": 0.8623479052209376, "grad_norm": 33.4080696105957, "learning_rate": 5.661763548098647e-08, "logits/chosen": -1.8288103342056274, "logits/rejected": -1.947704553604126, "logps/chosen": -2.4664788246154785, "logps/rejected": -2.844200611114502, "loss": 2.104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.6647891998291, "rewards/margins": 3.7772185802459717, "rewards/rejected": -28.442005157470703, "step": 25585 }, { "epoch": 0.8625164312919209, "grad_norm": 16.371057510375977, "learning_rate": 5.648175700646152e-08, "logits/chosen": -1.4505841732025146, "logits/rejected": -1.5557242631912231, "logps/chosen": -2.552976131439209, "logps/rejected": -3.2438766956329346, "loss": 1.9105, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.52976417541504, "rewards/margins": 6.909002780914307, "rewards/rejected": -32.43876647949219, "step": 25590 }, { "epoch": 0.8626849573629041, "grad_norm": 26.774486541748047, "learning_rate": 5.6346032016126585e-08, "logits/chosen": -1.6347877979278564, "logits/rejected": -1.58437979221344, "logps/chosen": -2.2510108947753906, "logps/rejected": -2.346017360687256, "loss": 3.0834, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.510108947753906, "rewards/margins": 0.9500652551651001, "rewards/rejected": -23.460172653198242, "step": 25595 }, { "epoch": 0.8628534834338872, "grad_norm": 32.13697052001953, "learning_rate": 5.621046055695078e-08, "logits/chosen": -1.5043491125106812, "logits/rejected": -2.238797426223755, "logps/chosen": -2.6151254177093506, "logps/rejected": -3.404712677001953, "loss": 2.7441, "rewards/accuracies": 0.5, "rewards/chosen": -26.1512508392334, "rewards/margins": 7.895873069763184, "rewards/rejected": -34.047122955322266, "step": 25600 }, { "epoch": 0.8628534834338872, "eval_logits/chosen": -2.3047690391540527, "eval_logits/rejected": -2.4822678565979004, "eval_logps/chosen": -2.284496545791626, "eval_logps/rejected": -2.4387638568878174, "eval_loss": 3.0866053104400635, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.84496307373047, "eval_rewards/margins": 1.5426740646362305, "eval_rewards/rejected": -24.38763999938965, "eval_runtime": 12.8886, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.94, "step": 25600 }, { "epoch": 0.8630220095048704, "grad_norm": 36.613040924072266, "learning_rate": 5.6075042675849896e-08, "logits/chosen": -1.3739955425262451, "logits/rejected": -1.321131944656372, "logps/chosen": -2.1500613689422607, "logps/rejected": -2.17360520362854, "loss": 4.0157, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.500614166259766, "rewards/margins": 0.23543719947338104, "rewards/rejected": -21.736053466796875, "step": 25605 }, { "epoch": 0.8631905355758536, "grad_norm": 2.961085796356201, "learning_rate": 5.593977841968678e-08, "logits/chosen": -1.9631197452545166, "logits/rejected": -2.171518325805664, "logps/chosen": -2.591064691543579, "logps/rejected": -2.8690671920776367, "loss": 1.9633, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.91064453125, "rewards/margins": 2.7800254821777344, "rewards/rejected": -28.690670013427734, "step": 25610 }, { "epoch": 0.8633590616468367, "grad_norm": 35.69943618774414, "learning_rate": 5.580466783527116e-08, "logits/chosen": -1.8889795541763306, "logits/rejected": -2.0595734119415283, "logps/chosen": -1.7020518779754639, "logps/rejected": -1.756344199180603, "loss": 2.9206, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.020517349243164, "rewards/margins": 0.5429241061210632, "rewards/rejected": -17.563440322875977, "step": 25615 }, { "epoch": 0.8635275877178199, "grad_norm": 0.00012647907715290785, "learning_rate": 5.566971096935935e-08, "logits/chosen": -1.9607864618301392, "logits/rejected": -2.334564685821533, "logps/chosen": -2.574162244796753, "logps/rejected": -3.8408493995666504, "loss": 1.7019, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.741619110107422, "rewards/margins": 12.666872024536133, "rewards/rejected": -38.40849304199219, "step": 25620 }, { "epoch": 0.8636961137888032, "grad_norm": 7.7881598472595215, "learning_rate": 5.5534907868654615e-08, "logits/chosen": -2.3653645515441895, "logits/rejected": -2.4128830432891846, "logps/chosen": -2.2764759063720703, "logps/rejected": -2.3745293617248535, "loss": 4.8364, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.764759063720703, "rewards/margins": 0.9805337190628052, "rewards/rejected": -23.745290756225586, "step": 25625 }, { "epoch": 0.8638646398597863, "grad_norm": 29.671714782714844, "learning_rate": 5.540025857980707e-08, "logits/chosen": -1.6906957626342773, "logits/rejected": -2.0325279235839844, "logps/chosen": -1.7866567373275757, "logps/rejected": -1.9048383235931396, "loss": 2.205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.866567611694336, "rewards/margins": 1.1818159818649292, "rewards/rejected": -19.048383712768555, "step": 25630 }, { "epoch": 0.8640331659307695, "grad_norm": 44.803955078125, "learning_rate": 5.52657631494135e-08, "logits/chosen": -1.8340305089950562, "logits/rejected": -1.7409149408340454, "logps/chosen": -2.6429009437561035, "logps/rejected": -3.0195226669311523, "loss": 3.688, "rewards/accuracies": 0.5, "rewards/chosen": -26.429006576538086, "rewards/margins": 3.7662174701690674, "rewards/rejected": -30.195226669311523, "step": 25635 }, { "epoch": 0.8642016920017527, "grad_norm": 59.5605583190918, "learning_rate": 5.513142162401746e-08, "logits/chosen": -1.4266693592071533, "logits/rejected": -1.5339360237121582, "logps/chosen": -1.7503912448883057, "logps/rejected": -1.7414312362670898, "loss": 3.3267, "rewards/accuracies": 0.5, "rewards/chosen": -17.50391387939453, "rewards/margins": -0.08959989249706268, "rewards/rejected": -17.4143123626709, "step": 25640 }, { "epoch": 0.8643702180727358, "grad_norm": 112.27110290527344, "learning_rate": 5.4997234050109365e-08, "logits/chosen": -2.070991277694702, "logits/rejected": -2.8527050018310547, "logps/chosen": -2.9822659492492676, "logps/rejected": -3.033463478088379, "loss": 3.4673, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.822656631469727, "rewards/margins": 0.5119756460189819, "rewards/rejected": -30.334630966186523, "step": 25645 }, { "epoch": 0.864538744143719, "grad_norm": 45.651275634765625, "learning_rate": 5.486320047412607e-08, "logits/chosen": -2.4427013397216797, "logits/rejected": -2.478302001953125, "logps/chosen": -3.080040216445923, "logps/rejected": -3.496181011199951, "loss": 2.1427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.800403594970703, "rewards/margins": 4.161408424377441, "rewards/rejected": -34.96180725097656, "step": 25650 }, { "epoch": 0.8647072702147022, "grad_norm": 128.43927001953125, "learning_rate": 5.4729320942451417e-08, "logits/chosen": -2.290976047515869, "logits/rejected": -2.463477611541748, "logps/chosen": -2.309473752975464, "logps/rejected": -2.7989444732666016, "loss": 1.4247, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.094736099243164, "rewards/margins": 4.894708156585693, "rewards/rejected": -27.98944664001465, "step": 25655 }, { "epoch": 0.8648757962856854, "grad_norm": 33.28281021118164, "learning_rate": 5.459559550141579e-08, "logits/chosen": -1.9118398427963257, "logits/rejected": -2.3658390045166016, "logps/chosen": -2.0546092987060547, "logps/rejected": -2.8094446659088135, "loss": 1.5514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.546092987060547, "rewards/margins": 7.548353672027588, "rewards/rejected": -28.094446182250977, "step": 25660 }, { "epoch": 0.8650443223566686, "grad_norm": 112.78716278076172, "learning_rate": 5.446202419729634e-08, "logits/chosen": -2.431443691253662, "logits/rejected": -2.530750274658203, "logps/chosen": -3.1777215003967285, "logps/rejected": -3.2929470539093018, "loss": 6.256, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.7772159576416, "rewards/margins": 1.1522538661956787, "rewards/rejected": -32.929466247558594, "step": 25665 }, { "epoch": 0.8652128484276518, "grad_norm": 18.13178062438965, "learning_rate": 5.432860707631692e-08, "logits/chosen": -1.9325920343399048, "logits/rejected": -2.1956613063812256, "logps/chosen": -2.1996188163757324, "logps/rejected": -2.414581775665283, "loss": 2.0302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.996187210083008, "rewards/margins": 2.149627685546875, "rewards/rejected": -24.145816802978516, "step": 25670 }, { "epoch": 0.865381374498635, "grad_norm": 18.623947143554688, "learning_rate": 5.419534418464772e-08, "logits/chosen": -1.4391006231307983, "logits/rejected": -1.5431092977523804, "logps/chosen": -2.181546688079834, "logps/rejected": -2.4147443771362305, "loss": 1.8495, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.815465927124023, "rewards/margins": 2.331977367401123, "rewards/rejected": -24.147441864013672, "step": 25675 }, { "epoch": 0.8655499005696181, "grad_norm": 38.271759033203125, "learning_rate": 5.406223556840594e-08, "logits/chosen": -1.714686393737793, "logits/rejected": -2.1915745735168457, "logps/chosen": -1.994512915611267, "logps/rejected": -2.8179125785827637, "loss": 2.5671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.94512939453125, "rewards/margins": 8.233994483947754, "rewards/rejected": -28.179126739501953, "step": 25680 }, { "epoch": 0.8657184266406013, "grad_norm": 22.912878036499023, "learning_rate": 5.3929281273655255e-08, "logits/chosen": -2.155128002166748, "logits/rejected": -2.2736685276031494, "logps/chosen": -2.6940789222717285, "logps/rejected": -2.630833864212036, "loss": 3.7778, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.940786361694336, "rewards/margins": -0.6324483156204224, "rewards/rejected": -26.308338165283203, "step": 25685 }, { "epoch": 0.8658869527115844, "grad_norm": 18.871042251586914, "learning_rate": 5.379648134640574e-08, "logits/chosen": -2.1314008235931396, "logits/rejected": -2.387678861618042, "logps/chosen": -2.2513153553009033, "logps/rejected": -2.300379753112793, "loss": 3.4506, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.513153076171875, "rewards/margins": 0.49064674973487854, "rewards/rejected": -23.003799438476562, "step": 25690 }, { "epoch": 0.8660554787825676, "grad_norm": 13.355888366699219, "learning_rate": 5.36638358326143e-08, "logits/chosen": -1.9050413370132446, "logits/rejected": -2.75042724609375, "logps/chosen": -2.610729932785034, "logps/rejected": -3.4782874584198, "loss": 1.1352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.1072998046875, "rewards/margins": 8.67557430267334, "rewards/rejected": -34.782875061035156, "step": 25695 }, { "epoch": 0.8662240048535509, "grad_norm": 28.12961196899414, "learning_rate": 5.353134477818444e-08, "logits/chosen": -1.6968810558319092, "logits/rejected": -1.7211993932724, "logps/chosen": -1.8958885669708252, "logps/rejected": -2.062389612197876, "loss": 2.7024, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.958887100219727, "rewards/margins": 1.665008544921875, "rewards/rejected": -20.6238956451416, "step": 25700 }, { "epoch": 0.866392530924534, "grad_norm": 41.80595779418945, "learning_rate": 5.3399008228965926e-08, "logits/chosen": -1.6834443807601929, "logits/rejected": -1.737204909324646, "logps/chosen": -2.680849313735962, "logps/rejected": -2.803086519241333, "loss": 2.2861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.80849266052246, "rewards/margins": 1.2223708629608154, "rewards/rejected": -28.03086280822754, "step": 25705 }, { "epoch": 0.8665610569955172, "grad_norm": 32.27495574951172, "learning_rate": 5.3266826230755234e-08, "logits/chosen": -1.3415998220443726, "logits/rejected": -1.842403769493103, "logps/chosen": -2.7923684120178223, "logps/rejected": -3.519589900970459, "loss": 0.7945, "rewards/accuracies": 1.0, "rewards/chosen": -27.923686981201172, "rewards/margins": 7.272216796875, "rewards/rejected": -35.195899963378906, "step": 25710 }, { "epoch": 0.8667295830665004, "grad_norm": 23.474517822265625, "learning_rate": 5.313479882929545e-08, "logits/chosen": -1.6315898895263672, "logits/rejected": -1.831046462059021, "logps/chosen": -2.6967477798461914, "logps/rejected": -2.7854743003845215, "loss": 3.9346, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.967477798461914, "rewards/margins": 0.8872681856155396, "rewards/rejected": -27.854745864868164, "step": 25715 }, { "epoch": 0.8668981091374836, "grad_norm": 63.77497482299805, "learning_rate": 5.3002926070276065e-08, "logits/chosen": -1.9106180667877197, "logits/rejected": -2.332305908203125, "logps/chosen": -2.7835779190063477, "logps/rejected": -2.867992877960205, "loss": 2.92, "rewards/accuracies": 0.5, "rewards/chosen": -27.835779190063477, "rewards/margins": 0.8441485166549683, "rewards/rejected": -28.679927825927734, "step": 25720 }, { "epoch": 0.8670666352084667, "grad_norm": 43.49818420410156, "learning_rate": 5.2871207999332866e-08, "logits/chosen": -1.9081672430038452, "logits/rejected": -2.2695870399475098, "logps/chosen": -3.0630500316619873, "logps/rejected": -3.210258960723877, "loss": 2.3696, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.630502700805664, "rewards/margins": 1.472088098526001, "rewards/rejected": -32.10258865356445, "step": 25725 }, { "epoch": 0.8672351612794499, "grad_norm": 42.61958312988281, "learning_rate": 5.273964466204844e-08, "logits/chosen": -1.745123267173767, "logits/rejected": -2.2044224739074707, "logps/chosen": -2.550452709197998, "logps/rejected": -2.8602192401885986, "loss": 2.4402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.504528045654297, "rewards/margins": 3.0976648330688477, "rewards/rejected": -28.60219383239746, "step": 25730 }, { "epoch": 0.8674036873504332, "grad_norm": 20.689411163330078, "learning_rate": 5.260823610395177e-08, "logits/chosen": -1.9419893026351929, "logits/rejected": -2.1809206008911133, "logps/chosen": -1.7887070178985596, "logps/rejected": -1.9395878314971924, "loss": 2.787, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.887067794799805, "rewards/margins": 1.5088112354278564, "rewards/rejected": -19.395877838134766, "step": 25735 }, { "epoch": 0.8675722134214163, "grad_norm": 37.88859939575195, "learning_rate": 5.2476982370517895e-08, "logits/chosen": -2.154304265975952, "logits/rejected": -2.248838424682617, "logps/chosen": -2.048656940460205, "logps/rejected": -2.063976764678955, "loss": 3.0845, "rewards/accuracies": 0.5, "rewards/chosen": -20.4865665435791, "rewards/margins": 0.15320205688476562, "rewards/rejected": -20.639766693115234, "step": 25740 }, { "epoch": 0.8677407394923995, "grad_norm": 32.98319625854492, "learning_rate": 5.234588350716879e-08, "logits/chosen": -1.9644311666488647, "logits/rejected": -2.0829384326934814, "logps/chosen": -2.6873435974121094, "logps/rejected": -3.071718692779541, "loss": 2.2025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.873437881469727, "rewards/margins": 3.843745470046997, "rewards/rejected": -30.717182159423828, "step": 25745 }, { "epoch": 0.8679092655633827, "grad_norm": 60.62992858886719, "learning_rate": 5.2214939559272474e-08, "logits/chosen": -1.1950910091400146, "logits/rejected": -1.9384139776229858, "logps/chosen": -2.483628034591675, "logps/rejected": -3.1380577087402344, "loss": 2.8334, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.83627700805664, "rewards/margins": 6.544297218322754, "rewards/rejected": -31.38057518005371, "step": 25750 }, { "epoch": 0.8680777916343658, "grad_norm": 1.7990132570266724, "learning_rate": 5.208415057214366e-08, "logits/chosen": -1.8497015237808228, "logits/rejected": -2.1536459922790527, "logps/chosen": -2.151120901107788, "logps/rejected": -2.640761137008667, "loss": 1.3812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.51120948791504, "rewards/margins": 4.896402359008789, "rewards/rejected": -26.40761375427246, "step": 25755 }, { "epoch": 0.868246317705349, "grad_norm": 83.80754852294922, "learning_rate": 5.195351659104308e-08, "logits/chosen": -2.031888723373413, "logits/rejected": -1.8762810230255127, "logps/chosen": -2.5077152252197266, "logps/rejected": -2.4214892387390137, "loss": 5.0355, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.077150344848633, "rewards/margins": -0.8622571229934692, "rewards/rejected": -24.214893341064453, "step": 25760 }, { "epoch": 0.8684148437763322, "grad_norm": 23.788127899169922, "learning_rate": 5.182303766117807e-08, "logits/chosen": -1.7114553451538086, "logits/rejected": -1.8850023746490479, "logps/chosen": -2.594632625579834, "logps/rejected": -2.736550807952881, "loss": 3.2035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.946325302124023, "rewards/margins": 1.4191802740097046, "rewards/rejected": -27.36550521850586, "step": 25765 }, { "epoch": 0.8685833698473154, "grad_norm": 31.160070419311523, "learning_rate": 5.169271382770224e-08, "logits/chosen": -2.122729778289795, "logits/rejected": -2.1520445346832275, "logps/chosen": -2.044503688812256, "logps/rejected": -2.158705949783325, "loss": 2.3285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.445035934448242, "rewards/margins": 1.1420232057571411, "rewards/rejected": -21.587059020996094, "step": 25770 }, { "epoch": 0.8687518959182986, "grad_norm": 37.71006774902344, "learning_rate": 5.1562545135715676e-08, "logits/chosen": -2.032743453979492, "logits/rejected": -1.8914331197738647, "logps/chosen": -2.209226608276367, "logps/rejected": -2.465100049972534, "loss": 3.1635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.092266082763672, "rewards/margins": 2.5587353706359863, "rewards/rejected": -24.6510009765625, "step": 25775 }, { "epoch": 0.8689204219892818, "grad_norm": 26.651168823242188, "learning_rate": 5.14325316302644e-08, "logits/chosen": -1.7310502529144287, "logits/rejected": -1.8083226680755615, "logps/chosen": -2.462341785430908, "logps/rejected": -2.4454903602600098, "loss": 3.7791, "rewards/accuracies": 0.5, "rewards/chosen": -24.623416900634766, "rewards/margins": -0.16851606965065002, "rewards/rejected": -24.45490264892578, "step": 25780 }, { "epoch": 0.8690889480602649, "grad_norm": 101.77996063232422, "learning_rate": 5.130267335634103e-08, "logits/chosen": -1.7319138050079346, "logits/rejected": -1.8407793045043945, "logps/chosen": -2.481172561645508, "logps/rejected": -2.444258213043213, "loss": 3.4822, "rewards/accuracies": 0.5, "rewards/chosen": -24.811725616455078, "rewards/margins": -0.3691454827785492, "rewards/rejected": -24.442581176757812, "step": 25785 }, { "epoch": 0.8692574741312481, "grad_norm": 96.85649108886719, "learning_rate": 5.117297035888451e-08, "logits/chosen": -1.9268567562103271, "logits/rejected": -2.3404312133789062, "logps/chosen": -2.1492671966552734, "logps/rejected": -2.1940503120422363, "loss": 3.1017, "rewards/accuracies": 0.5, "rewards/chosen": -21.492671966552734, "rewards/margins": 0.44782838225364685, "rewards/rejected": -21.940500259399414, "step": 25790 }, { "epoch": 0.8694260002022313, "grad_norm": 53.152503967285156, "learning_rate": 5.1043422682779837e-08, "logits/chosen": -1.959204912185669, "logits/rejected": -2.3568460941314697, "logps/chosen": -2.5286505222320557, "logps/rejected": -2.8187003135681152, "loss": 2.6502, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.2865047454834, "rewards/margins": 2.9004974365234375, "rewards/rejected": -28.187002182006836, "step": 25795 }, { "epoch": 0.8695945262732144, "grad_norm": 36.7203254699707, "learning_rate": 5.091403037285841e-08, "logits/chosen": -2.4292044639587402, "logits/rejected": -2.6584384441375732, "logps/chosen": -2.8159584999084473, "logps/rejected": -3.5781993865966797, "loss": 1.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.15958595275879, "rewards/margins": 7.622408390045166, "rewards/rejected": -35.7819938659668, "step": 25800 }, { "epoch": 0.8697630523441976, "grad_norm": 12.048297882080078, "learning_rate": 5.078479347389786e-08, "logits/chosen": -1.6142889261245728, "logits/rejected": -2.0917158126831055, "logps/chosen": -1.6926759481430054, "logps/rejected": -2.018369674682617, "loss": 1.713, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.926759719848633, "rewards/margins": 3.2569377422332764, "rewards/rejected": -20.183696746826172, "step": 25805 }, { "epoch": 0.8699315784151809, "grad_norm": 17.070327758789062, "learning_rate": 5.065571203062186e-08, "logits/chosen": -2.337498903274536, "logits/rejected": -2.5879263877868652, "logps/chosen": -2.1566128730773926, "logps/rejected": -2.3633060455322266, "loss": 2.4247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.56612777709961, "rewards/margins": 2.066929578781128, "rewards/rejected": -23.633060455322266, "step": 25810 }, { "epoch": 0.870100104486164, "grad_norm": 39.20555114746094, "learning_rate": 5.0526786087700446e-08, "logits/chosen": -2.312941551208496, "logits/rejected": -2.5641887187957764, "logps/chosen": -2.590254783630371, "logps/rejected": -2.675225019454956, "loss": 3.7626, "rewards/accuracies": 0.5, "rewards/chosen": -25.902551651000977, "rewards/margins": 0.8497023582458496, "rewards/rejected": -26.75225257873535, "step": 25815 }, { "epoch": 0.8702686305571472, "grad_norm": 9.575358853908256e-05, "learning_rate": 5.039801568974983e-08, "logits/chosen": -2.032660961151123, "logits/rejected": -2.1150026321411133, "logps/chosen": -3.270076036453247, "logps/rejected": -4.2751970291137695, "loss": 1.2892, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -32.70075607299805, "rewards/margins": 10.05120849609375, "rewards/rejected": -42.75196838378906, "step": 25820 }, { "epoch": 0.8704371566281304, "grad_norm": 21.379125595092773, "learning_rate": 5.0269400881332415e-08, "logits/chosen": -1.2569396495819092, "logits/rejected": -2.036778450012207, "logps/chosen": -1.7933368682861328, "logps/rejected": -2.9262733459472656, "loss": 1.9382, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.933368682861328, "rewards/margins": 11.329366683959961, "rewards/rejected": -29.26273536682129, "step": 25825 }, { "epoch": 0.8706056826991135, "grad_norm": 15.96414852142334, "learning_rate": 5.014094170695665e-08, "logits/chosen": -1.8142383098602295, "logits/rejected": -2.0156233310699463, "logps/chosen": -2.004096508026123, "logps/rejected": -2.1798839569091797, "loss": 2.7522, "rewards/accuracies": 0.5, "rewards/chosen": -20.040966033935547, "rewards/margins": 1.7578716278076172, "rewards/rejected": -21.798837661743164, "step": 25830 }, { "epoch": 0.8707742087700967, "grad_norm": 20.892578125, "learning_rate": 5.0012638211077205e-08, "logits/chosen": -2.0305237770080566, "logits/rejected": -2.6673262119293213, "logps/chosen": -2.4386634826660156, "logps/rejected": -2.5656256675720215, "loss": 3.8098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.386632919311523, "rewards/margins": 1.269622802734375, "rewards/rejected": -25.6562557220459, "step": 25835 }, { "epoch": 0.8709427348410799, "grad_norm": 32.19673156738281, "learning_rate": 4.988449043809495e-08, "logits/chosen": -1.9713561534881592, "logits/rejected": -1.8127353191375732, "logps/chosen": -2.835386276245117, "logps/rejected": -2.4795279502868652, "loss": 7.098, "rewards/accuracies": 0.5, "rewards/chosen": -28.353862762451172, "rewards/margins": -3.558582305908203, "rewards/rejected": -24.79528045654297, "step": 25840 }, { "epoch": 0.8711112609120631, "grad_norm": 14.6963472366333, "learning_rate": 4.975649843235663e-08, "logits/chosen": -1.4432746171951294, "logits/rejected": -1.6536836624145508, "logps/chosen": -2.234740972518921, "logps/rejected": -2.4873673915863037, "loss": 2.6795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.347408294677734, "rewards/margins": 2.5262651443481445, "rewards/rejected": -24.873674392700195, "step": 25845 }, { "epoch": 0.8712797869830463, "grad_norm": 27.49606704711914, "learning_rate": 4.9628662238155375e-08, "logits/chosen": -2.0613508224487305, "logits/rejected": -2.6506898403167725, "logps/chosen": -2.446141242980957, "logps/rejected": -2.9206721782684326, "loss": 2.1984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.46141242980957, "rewards/margins": 4.745309352874756, "rewards/rejected": -29.206722259521484, "step": 25850 }, { "epoch": 0.8714483130540295, "grad_norm": 82.02774810791016, "learning_rate": 4.950098189973012e-08, "logits/chosen": -2.126680374145508, "logits/rejected": -2.285808801651001, "logps/chosen": -2.632131576538086, "logps/rejected": -2.502464771270752, "loss": 4.6964, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.32131576538086, "rewards/margins": -1.296666145324707, "rewards/rejected": -25.024648666381836, "step": 25855 }, { "epoch": 0.8716168391250126, "grad_norm": 19.395328521728516, "learning_rate": 4.9373457461266196e-08, "logits/chosen": -2.0173676013946533, "logits/rejected": -2.2325971126556396, "logps/chosen": -2.738555669784546, "logps/rejected": -3.175352096557617, "loss": 2.064, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.385555267333984, "rewards/margins": 4.367962837219238, "rewards/rejected": -31.753520965576172, "step": 25860 }, { "epoch": 0.8717853651959958, "grad_norm": 79.3759994506836, "learning_rate": 4.9246088966894586e-08, "logits/chosen": -1.773151159286499, "logits/rejected": -1.7250587940216064, "logps/chosen": -2.4536876678466797, "logps/rejected": -2.5901026725769043, "loss": 3.0366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.536874771118164, "rewards/margins": 1.364152193069458, "rewards/rejected": -25.90102767944336, "step": 25865 }, { "epoch": 0.871953891266979, "grad_norm": 258.291259765625, "learning_rate": 4.911887646069257e-08, "logits/chosen": -1.7847461700439453, "logits/rejected": -1.9289964437484741, "logps/chosen": -2.952078104019165, "logps/rejected": -2.842191219329834, "loss": 4.685, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.520776748657227, "rewards/margins": -1.0988658666610718, "rewards/rejected": -28.42191505432129, "step": 25870 }, { "epoch": 0.8721224173379621, "grad_norm": 53.33089065551758, "learning_rate": 4.8991819986683506e-08, "logits/chosen": -2.109039068222046, "logits/rejected": -2.342543125152588, "logps/chosen": -2.3845467567443848, "logps/rejected": -2.819420337677002, "loss": 2.336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.845470428466797, "rewards/margins": 4.348735809326172, "rewards/rejected": -28.194204330444336, "step": 25875 }, { "epoch": 0.8722909434089454, "grad_norm": 34.33763122558594, "learning_rate": 4.8864919588836425e-08, "logits/chosen": -1.5160837173461914, "logits/rejected": -1.320216417312622, "logps/chosen": -2.544429302215576, "logps/rejected": -2.622082471847534, "loss": 3.5551, "rewards/accuracies": 0.5, "rewards/chosen": -25.444292068481445, "rewards/margins": 0.7765324711799622, "rewards/rejected": -26.2208251953125, "step": 25880 }, { "epoch": 0.8724594694799286, "grad_norm": 72.6720962524414, "learning_rate": 4.8738175311066665e-08, "logits/chosen": -1.8243458271026611, "logits/rejected": -1.7153728008270264, "logps/chosen": -2.4128189086914062, "logps/rejected": -2.5200035572052, "loss": 2.5664, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.128189086914062, "rewards/margins": 1.071846604347229, "rewards/rejected": -25.200035095214844, "step": 25885 }, { "epoch": 0.8726279955509118, "grad_norm": 27.856830596923828, "learning_rate": 4.861158719723546e-08, "logits/chosen": -1.793646216392517, "logits/rejected": -1.7063287496566772, "logps/chosen": -2.146275043487549, "logps/rejected": -2.327519416809082, "loss": 2.6674, "rewards/accuracies": 0.5, "rewards/chosen": -21.462749481201172, "rewards/margins": 1.8124430179595947, "rewards/rejected": -23.275196075439453, "step": 25890 }, { "epoch": 0.8727965216218949, "grad_norm": 56.65886688232422, "learning_rate": 4.848515529114999e-08, "logits/chosen": -1.4978837966918945, "logits/rejected": -1.857505440711975, "logps/chosen": -2.9919285774230957, "logps/rejected": -3.0575897693634033, "loss": 4.2514, "rewards/accuracies": 0.5, "rewards/chosen": -29.919286727905273, "rewards/margins": 0.6566106081008911, "rewards/rejected": -30.575897216796875, "step": 25895 }, { "epoch": 0.8729650476928781, "grad_norm": 80.9780502319336, "learning_rate": 4.835887963656321e-08, "logits/chosen": -1.6397136449813843, "logits/rejected": -1.7722076177597046, "logps/chosen": -2.625100612640381, "logps/rejected": -3.0324058532714844, "loss": 3.1167, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.251007080078125, "rewards/margins": 4.073052883148193, "rewards/rejected": -30.32405662536621, "step": 25900 }, { "epoch": 0.8731335737638612, "grad_norm": 42.08506774902344, "learning_rate": 4.823276027717427e-08, "logits/chosen": -1.019819974899292, "logits/rejected": -1.865755319595337, "logps/chosen": -2.091823101043701, "logps/rejected": -2.5190513134002686, "loss": 1.7671, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.918230056762695, "rewards/margins": 4.272280693054199, "rewards/rejected": -25.19051170349121, "step": 25905 }, { "epoch": 0.8733020998348444, "grad_norm": 30.96110725402832, "learning_rate": 4.810679725662814e-08, "logits/chosen": -1.2372939586639404, "logits/rejected": -1.3814033269882202, "logps/chosen": -2.071434497833252, "logps/rejected": -2.412691354751587, "loss": 1.7943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.714344024658203, "rewards/margins": 3.4125685691833496, "rewards/rejected": -24.12691307067871, "step": 25910 }, { "epoch": 0.8734706259058276, "grad_norm": 24.661306381225586, "learning_rate": 4.798099061851546e-08, "logits/chosen": -1.249774694442749, "logits/rejected": -1.1749866008758545, "logps/chosen": -2.4756853580474854, "logps/rejected": -2.424419403076172, "loss": 3.7921, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.756851196289062, "rewards/margins": -0.5126598477363586, "rewards/rejected": -24.24419403076172, "step": 25915 }, { "epoch": 0.8736391519768109, "grad_norm": 22.849403381347656, "learning_rate": 4.785534040637318e-08, "logits/chosen": -2.1637206077575684, "logits/rejected": -2.4329028129577637, "logps/chosen": -2.336874008178711, "logps/rejected": -2.866360902786255, "loss": 1.73, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.36874008178711, "rewards/margins": 5.294869422912598, "rewards/rejected": -28.663610458374023, "step": 25920 }, { "epoch": 0.873807678047794, "grad_norm": 16.065292358398438, "learning_rate": 4.7729846663683734e-08, "logits/chosen": -1.3076118230819702, "logits/rejected": -1.5740753412246704, "logps/chosen": -2.1626815795898438, "logps/rejected": -2.5529274940490723, "loss": 2.3872, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.626815795898438, "rewards/margins": 3.902461290359497, "rewards/rejected": -25.52927589416504, "step": 25925 }, { "epoch": 0.8739762041187772, "grad_norm": 57.940948486328125, "learning_rate": 4.7604509433875674e-08, "logits/chosen": -1.8605324029922485, "logits/rejected": -1.996279001235962, "logps/chosen": -2.231952428817749, "logps/rejected": -2.156428098678589, "loss": 4.6811, "rewards/accuracies": 0.5, "rewards/chosen": -22.31952667236328, "rewards/margins": -0.7552453875541687, "rewards/rejected": -21.564281463623047, "step": 25930 }, { "epoch": 0.8741447301897604, "grad_norm": 132.3805389404297, "learning_rate": 4.747932876032318e-08, "logits/chosen": -2.017028331756592, "logits/rejected": -1.755444884300232, "logps/chosen": -3.1288609504699707, "logps/rejected": -3.00249981880188, "loss": 4.5781, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -31.288610458374023, "rewards/margins": -1.2636101245880127, "rewards/rejected": -30.02499771118164, "step": 25935 }, { "epoch": 0.8743132562607435, "grad_norm": 77.25647735595703, "learning_rate": 4.7354304686346436e-08, "logits/chosen": -1.965309739112854, "logits/rejected": -2.3678011894226074, "logps/chosen": -2.950040102005005, "logps/rejected": -4.675734996795654, "loss": 1.7058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.50040054321289, "rewards/margins": 17.25695037841797, "rewards/rejected": -46.75735092163086, "step": 25940 }, { "epoch": 0.8744817823317267, "grad_norm": 21.622243881225586, "learning_rate": 4.7229437255211394e-08, "logits/chosen": -1.2503682374954224, "logits/rejected": -1.5136396884918213, "logps/chosen": -2.392681837081909, "logps/rejected": -2.7116854190826416, "loss": 2.1726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.926820755004883, "rewards/margins": 3.1900343894958496, "rewards/rejected": -27.11685562133789, "step": 25945 }, { "epoch": 0.8746503084027099, "grad_norm": 18.216819763183594, "learning_rate": 4.710472651012953e-08, "logits/chosen": -1.824148416519165, "logits/rejected": -2.0601725578308105, "logps/chosen": -2.657719135284424, "logps/rejected": -2.9636952877044678, "loss": 3.0157, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.577190399169922, "rewards/margins": 3.059760570526123, "rewards/rejected": -29.636951446533203, "step": 25950 }, { "epoch": 0.8748188344736931, "grad_norm": 18.032672882080078, "learning_rate": 4.6980172494258505e-08, "logits/chosen": -2.3354194164276123, "logits/rejected": -2.417893886566162, "logps/chosen": -2.268496036529541, "logps/rejected": -2.806499481201172, "loss": 2.4009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.68495750427246, "rewards/margins": 5.380034923553467, "rewards/rejected": -28.064992904663086, "step": 25955 }, { "epoch": 0.8749873605446763, "grad_norm": 31.231897354125977, "learning_rate": 4.68557752507015e-08, "logits/chosen": -1.844909906387329, "logits/rejected": -1.9287885427474976, "logps/chosen": -1.8651247024536133, "logps/rejected": -2.1045570373535156, "loss": 1.6024, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.651248931884766, "rewards/margins": 2.3943233489990234, "rewards/rejected": -21.045570373535156, "step": 25960 }, { "epoch": 0.8751558866156595, "grad_norm": 92.8125, "learning_rate": 4.673153482250763e-08, "logits/chosen": -2.0548250675201416, "logits/rejected": -1.8608801364898682, "logps/chosen": -2.5349326133728027, "logps/rejected": -2.7556471824645996, "loss": 2.8412, "rewards/accuracies": 0.5, "rewards/chosen": -25.349323272705078, "rewards/margins": 2.2071495056152344, "rewards/rejected": -27.556472778320312, "step": 25965 }, { "epoch": 0.8753244126866426, "grad_norm": 57.874595642089844, "learning_rate": 4.66074512526714e-08, "logits/chosen": -1.6241235733032227, "logits/rejected": -1.7461202144622803, "logps/chosen": -2.530500888824463, "logps/rejected": -2.4094245433807373, "loss": 4.4387, "rewards/accuracies": 0.5, "rewards/chosen": -25.305007934570312, "rewards/margins": -1.2107617855072021, "rewards/rejected": -24.0942440032959, "step": 25970 }, { "epoch": 0.8754929387576258, "grad_norm": 64.47271728515625, "learning_rate": 4.648352458413329e-08, "logits/chosen": -1.1945641040802002, "logits/rejected": -1.3513495922088623, "logps/chosen": -2.8699965476989746, "logps/rejected": -3.5107929706573486, "loss": 1.2196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.699966430664062, "rewards/margins": 6.40796422958374, "rewards/rejected": -35.10792922973633, "step": 25975 }, { "epoch": 0.875661464828609, "grad_norm": 102.32059478759766, "learning_rate": 4.635975485977961e-08, "logits/chosen": -1.6279737949371338, "logits/rejected": -1.6798328161239624, "logps/chosen": -3.12237286567688, "logps/rejected": -3.4952163696289062, "loss": 3.6478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.22372817993164, "rewards/margins": 3.7284350395202637, "rewards/rejected": -34.95216369628906, "step": 25980 }, { "epoch": 0.8758299908995921, "grad_norm": 20.18497085571289, "learning_rate": 4.623614212244198e-08, "logits/chosen": -1.6860414743423462, "logits/rejected": -1.9204658269882202, "logps/chosen": -2.681396961212158, "logps/rejected": -3.1187262535095215, "loss": 1.9732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.813968658447266, "rewards/margins": 4.373295783996582, "rewards/rejected": -31.1872615814209, "step": 25985 }, { "epoch": 0.8759985169705754, "grad_norm": 30.457250595092773, "learning_rate": 4.611268641489796e-08, "logits/chosen": -2.0881166458129883, "logits/rejected": -2.1515793800354004, "logps/chosen": -2.6387476921081543, "logps/rejected": -2.6230766773223877, "loss": 4.4297, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.387475967407227, "rewards/margins": -0.15670490264892578, "rewards/rejected": -26.230770111083984, "step": 25990 }, { "epoch": 0.8761670430415586, "grad_norm": 98.93780517578125, "learning_rate": 4.5989387779870716e-08, "logits/chosen": -2.187082529067993, "logits/rejected": -2.343308925628662, "logps/chosen": -2.834789752960205, "logps/rejected": -2.9538867473602295, "loss": 2.9953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.347896575927734, "rewards/margins": 1.190970778465271, "rewards/rejected": -29.538867950439453, "step": 25995 }, { "epoch": 0.8763355691125417, "grad_norm": 38.53907775878906, "learning_rate": 4.586624626002916e-08, "logits/chosen": -1.936500906944275, "logits/rejected": -2.458440065383911, "logps/chosen": -2.3346505165100098, "logps/rejected": -2.676438093185425, "loss": 1.4801, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.34650421142578, "rewards/margins": 3.417875289916992, "rewards/rejected": -26.764379501342773, "step": 26000 }, { "epoch": 0.8763355691125417, "eval_logits/chosen": -2.305676221847534, "eval_logits/rejected": -2.4827284812927246, "eval_logps/chosen": -2.2852182388305664, "eval_logps/rejected": -2.440098762512207, "eval_loss": 3.083927631378174, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.852182388305664, "eval_rewards/margins": 1.548802137374878, "eval_rewards/rejected": -24.400985717773438, "eval_runtime": 12.8886, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.94, "step": 26000 }, { "epoch": 0.8765040951835249, "grad_norm": 18.142152786254883, "learning_rate": 4.574326189798755e-08, "logits/chosen": -1.9278751611709595, "logits/rejected": -2.1549553871154785, "logps/chosen": -2.518207550048828, "logps/rejected": -3.117119550704956, "loss": 2.1305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.18207359313965, "rewards/margins": 5.989120960235596, "rewards/rejected": -31.17119789123535, "step": 26005 }, { "epoch": 0.8766726212545081, "grad_norm": 42.60398483276367, "learning_rate": 4.562043473630595e-08, "logits/chosen": -2.0454294681549072, "logits/rejected": -2.2847232818603516, "logps/chosen": -2.7390663623809814, "logps/rejected": -2.778416156768799, "loss": 2.8106, "rewards/accuracies": 0.5, "rewards/chosen": -27.390661239624023, "rewards/margins": 0.39350032806396484, "rewards/rejected": -27.784160614013672, "step": 26010 }, { "epoch": 0.8768411473254912, "grad_norm": 33.4304084777832, "learning_rate": 4.549776481749018e-08, "logits/chosen": -1.3675581216812134, "logits/rejected": -1.3049651384353638, "logps/chosen": -2.0452096462249756, "logps/rejected": -2.0717859268188477, "loss": 3.8067, "rewards/accuracies": 0.5, "rewards/chosen": -20.452096939086914, "rewards/margins": 0.2657632827758789, "rewards/rejected": -20.71786117553711, "step": 26015 }, { "epoch": 0.8770096733964744, "grad_norm": 48.97243881225586, "learning_rate": 4.537525218399124e-08, "logits/chosen": -1.8618223667144775, "logits/rejected": -1.8452171087265015, "logps/chosen": -2.2914299964904785, "logps/rejected": -2.3020145893096924, "loss": 4.5788, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.91429901123047, "rewards/margins": 0.10584697872400284, "rewards/rejected": -23.020145416259766, "step": 26020 }, { "epoch": 0.8771781994674576, "grad_norm": 43.23052978515625, "learning_rate": 4.525289687820599e-08, "logits/chosen": -2.1048130989074707, "logits/rejected": -2.744718074798584, "logps/chosen": -3.903913974761963, "logps/rejected": -4.380173206329346, "loss": 2.1283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -39.03913497924805, "rewards/margins": 4.76259708404541, "rewards/rejected": -43.801734924316406, "step": 26025 }, { "epoch": 0.8773467255384408, "grad_norm": 200.03939819335938, "learning_rate": 4.5130698942476843e-08, "logits/chosen": -1.6560029983520508, "logits/rejected": -1.8514435291290283, "logps/chosen": -2.5157668590545654, "logps/rejected": -2.5783932209014893, "loss": 3.1845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.157665252685547, "rewards/margins": 0.626263439655304, "rewards/rejected": -25.783931732177734, "step": 26030 }, { "epoch": 0.877515251609424, "grad_norm": 27.59550666809082, "learning_rate": 4.5008658419091686e-08, "logits/chosen": -1.7573442459106445, "logits/rejected": -1.9589412212371826, "logps/chosen": -2.2838704586029053, "logps/rejected": -2.5899243354797363, "loss": 1.5338, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.83870506286621, "rewards/margins": 3.0605380535125732, "rewards/rejected": -25.899242401123047, "step": 26035 }, { "epoch": 0.8776837776804072, "grad_norm": 32.58989334106445, "learning_rate": 4.48867753502839e-08, "logits/chosen": -2.00469708442688, "logits/rejected": -2.1252007484436035, "logps/chosen": -2.224625587463379, "logps/rejected": -2.202439785003662, "loss": 3.6983, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.246257781982422, "rewards/margins": -0.22186097502708435, "rewards/rejected": -22.024394989013672, "step": 26040 }, { "epoch": 0.8778523037513903, "grad_norm": 32.600189208984375, "learning_rate": 4.476504977823237e-08, "logits/chosen": -1.4882522821426392, "logits/rejected": -1.4651497602462769, "logps/chosen": -2.2987568378448486, "logps/rejected": -2.344611406326294, "loss": 3.488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.987567901611328, "rewards/margins": 0.4585467278957367, "rewards/rejected": -23.44611358642578, "step": 26045 }, { "epoch": 0.8780208298223735, "grad_norm": 22.328445434570312, "learning_rate": 4.4643481745061664e-08, "logits/chosen": -0.9219368696212769, "logits/rejected": -1.0782215595245361, "logps/chosen": -2.841899871826172, "logps/rejected": -3.263303279876709, "loss": 0.9888, "rewards/accuracies": 1.0, "rewards/chosen": -28.41900062561035, "rewards/margins": 4.214035987854004, "rewards/rejected": -32.633033752441406, "step": 26050 }, { "epoch": 0.8781893558933567, "grad_norm": 81.40164184570312, "learning_rate": 4.4522071292841524e-08, "logits/chosen": -1.9810707569122314, "logits/rejected": -2.1055080890655518, "logps/chosen": -2.5610156059265137, "logps/rejected": -3.1644368171691895, "loss": 2.5395, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.610157012939453, "rewards/margins": 6.034211158752441, "rewards/rejected": -31.64436912536621, "step": 26055 }, { "epoch": 0.8783578819643398, "grad_norm": 28.70875358581543, "learning_rate": 4.440081846358734e-08, "logits/chosen": -1.7828556299209595, "logits/rejected": -1.9503099918365479, "logps/chosen": -3.098968744277954, "logps/rejected": -3.4416584968566895, "loss": 3.4186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.989688873291016, "rewards/margins": 3.4268956184387207, "rewards/rejected": -34.41658401489258, "step": 26060 }, { "epoch": 0.8785264080353231, "grad_norm": 39.028114318847656, "learning_rate": 4.4279723299260053e-08, "logits/chosen": -2.1715588569641113, "logits/rejected": -1.9619592428207397, "logps/chosen": -2.756772518157959, "logps/rejected": -2.7361457347869873, "loss": 4.1748, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.567724227905273, "rewards/margins": -0.20626945793628693, "rewards/rejected": -27.3614559173584, "step": 26065 }, { "epoch": 0.8786949341063063, "grad_norm": 25.182750701904297, "learning_rate": 4.415878584176586e-08, "logits/chosen": -1.9110186100006104, "logits/rejected": -2.0006096363067627, "logps/chosen": -1.853926420211792, "logps/rejected": -2.0113346576690674, "loss": 2.167, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.539264678955078, "rewards/margins": 1.5740830898284912, "rewards/rejected": -20.11334800720215, "step": 26070 }, { "epoch": 0.8788634601772894, "grad_norm": 27.858596801757812, "learning_rate": 4.4038006132956554e-08, "logits/chosen": -2.290611505508423, "logits/rejected": -2.5834479331970215, "logps/chosen": -2.9701969623565674, "logps/rejected": -3.535025119781494, "loss": 3.1378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.70197105407715, "rewards/margins": 5.64827823638916, "rewards/rejected": -35.35024642944336, "step": 26075 }, { "epoch": 0.8790319862482726, "grad_norm": 34.910274505615234, "learning_rate": 4.3917384214629035e-08, "logits/chosen": -2.2534584999084473, "logits/rejected": -2.124990940093994, "logps/chosen": -2.2603816986083984, "logps/rejected": -2.226010322570801, "loss": 3.7679, "rewards/accuracies": 0.5, "rewards/chosen": -22.603816986083984, "rewards/margins": -0.3437148928642273, "rewards/rejected": -22.260103225708008, "step": 26080 }, { "epoch": 0.8792005123192558, "grad_norm": 18.28013038635254, "learning_rate": 4.3796920128525927e-08, "logits/chosen": -1.7659509181976318, "logits/rejected": -2.554335117340088, "logps/chosen": -2.34553861618042, "logps/rejected": -3.6819541454315186, "loss": 3.374, "rewards/accuracies": 0.5, "rewards/chosen": -23.455387115478516, "rewards/margins": 13.364153861999512, "rewards/rejected": -36.819541931152344, "step": 26085 }, { "epoch": 0.8793690383902389, "grad_norm": 95.5963363647461, "learning_rate": 4.367661391633504e-08, "logits/chosen": -2.4856085777282715, "logits/rejected": -2.6605522632598877, "logps/chosen": -3.307602643966675, "logps/rejected": -3.840097427368164, "loss": 3.1067, "rewards/accuracies": 0.5, "rewards/chosen": -33.076026916503906, "rewards/margins": 5.324949741363525, "rewards/rejected": -38.400978088378906, "step": 26090 }, { "epoch": 0.8795375644612221, "grad_norm": 39.2291259765625, "learning_rate": 4.355646561968968e-08, "logits/chosen": -2.148571491241455, "logits/rejected": -2.437236785888672, "logps/chosen": -2.1277217864990234, "logps/rejected": -2.223153591156006, "loss": 2.7293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.2772159576416, "rewards/margins": 0.9543191194534302, "rewards/rejected": -22.231534957885742, "step": 26095 }, { "epoch": 0.8797060905322054, "grad_norm": 31.748767852783203, "learning_rate": 4.343647528016842e-08, "logits/chosen": -2.2617831230163574, "logits/rejected": -2.2275707721710205, "logps/chosen": -2.0656771659851074, "logps/rejected": -2.070960521697998, "loss": 4.1375, "rewards/accuracies": 0.5, "rewards/chosen": -20.656770706176758, "rewards/margins": 0.05283470079302788, "rewards/rejected": -20.709606170654297, "step": 26100 }, { "epoch": 0.8798746166031886, "grad_norm": 29.608827590942383, "learning_rate": 4.331664293929521e-08, "logits/chosen": -2.194572925567627, "logits/rejected": -2.268022060394287, "logps/chosen": -2.4366941452026367, "logps/rejected": -2.6624813079833984, "loss": 2.9006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.366941452026367, "rewards/margins": 2.2578701972961426, "rewards/rejected": -26.62481117248535, "step": 26105 }, { "epoch": 0.8800431426741717, "grad_norm": 34.71430587768555, "learning_rate": 4.3196968638539224e-08, "logits/chosen": -1.5167404413223267, "logits/rejected": -1.8624671697616577, "logps/chosen": -2.115821361541748, "logps/rejected": -2.019786834716797, "loss": 4.231, "rewards/accuracies": 0.5, "rewards/chosen": -21.158214569091797, "rewards/margins": -0.9603476524353027, "rewards/rejected": -20.197866439819336, "step": 26110 }, { "epoch": 0.8802116687451549, "grad_norm": 30.95269775390625, "learning_rate": 4.30774524193151e-08, "logits/chosen": -1.7086502313613892, "logits/rejected": -1.7376470565795898, "logps/chosen": -2.0986180305480957, "logps/rejected": -2.125507354736328, "loss": 2.8685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.98617935180664, "rewards/margins": 0.2688938081264496, "rewards/rejected": -21.25507164001465, "step": 26115 }, { "epoch": 0.880380194816138, "grad_norm": 30.516836166381836, "learning_rate": 4.29580943229827e-08, "logits/chosen": -1.6464307308197021, "logits/rejected": -2.455376148223877, "logps/chosen": -1.8904457092285156, "logps/rejected": -2.61106538772583, "loss": 2.0733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.904457092285156, "rewards/margins": 7.206197261810303, "rewards/rejected": -26.110652923583984, "step": 26120 }, { "epoch": 0.8805487208871212, "grad_norm": 10.775001525878906, "learning_rate": 4.283889439084709e-08, "logits/chosen": -1.5866488218307495, "logits/rejected": -1.9692538976669312, "logps/chosen": -2.8774304389953613, "logps/rejected": -3.5010387897491455, "loss": 2.0928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.774303436279297, "rewards/margins": 6.236086368560791, "rewards/rejected": -35.01038360595703, "step": 26125 }, { "epoch": 0.8807172469581044, "grad_norm": 38.87605667114258, "learning_rate": 4.2719852664158673e-08, "logits/chosen": -1.7289457321166992, "logits/rejected": -2.259916305541992, "logps/chosen": -2.7738230228424072, "logps/rejected": -3.8895785808563232, "loss": 2.9428, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.738229751586914, "rewards/margins": 11.15755844116211, "rewards/rejected": -38.895790100097656, "step": 26130 }, { "epoch": 0.8808857730290875, "grad_norm": 4.833995342254639, "learning_rate": 4.26009691841131e-08, "logits/chosen": -2.0140976905822754, "logits/rejected": -2.562851667404175, "logps/chosen": -2.6443378925323486, "logps/rejected": -3.1197803020477295, "loss": 2.7417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.443378448486328, "rewards/margins": 4.754426002502441, "rewards/rejected": -31.197805404663086, "step": 26135 }, { "epoch": 0.8810542991000708, "grad_norm": 20.582605361938477, "learning_rate": 4.2482243991851405e-08, "logits/chosen": -1.6481196880340576, "logits/rejected": -2.0094666481018066, "logps/chosen": -2.9820003509521484, "logps/rejected": -3.30322003364563, "loss": 2.1949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.82000160217285, "rewards/margins": 3.212195873260498, "rewards/rejected": -33.03219985961914, "step": 26140 }, { "epoch": 0.881222825171054, "grad_norm": 17.267221450805664, "learning_rate": 4.236367712845951e-08, "logits/chosen": -1.5387892723083496, "logits/rejected": -1.8134196996688843, "logps/chosen": -2.9204771518707275, "logps/rejected": -3.2025482654571533, "loss": 2.188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.20477294921875, "rewards/margins": 2.820711612701416, "rewards/rejected": -32.025482177734375, "step": 26145 }, { "epoch": 0.8813913512420372, "grad_norm": 127.3127212524414, "learning_rate": 4.22452686349688e-08, "logits/chosen": -1.8970210552215576, "logits/rejected": -1.8642895221710205, "logps/chosen": -3.043975353240967, "logps/rejected": -3.17755389213562, "loss": 2.802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.43975257873535, "rewards/margins": 1.3357876539230347, "rewards/rejected": -31.77553939819336, "step": 26150 }, { "epoch": 0.8815598773130203, "grad_norm": 28.518186569213867, "learning_rate": 4.21270185523559e-08, "logits/chosen": -1.7622315883636475, "logits/rejected": -2.2663474082946777, "logps/chosen": -2.4464478492736816, "logps/rejected": -2.898188829421997, "loss": 3.1741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.464481353759766, "rewards/margins": 4.5174055099487305, "rewards/rejected": -28.981884002685547, "step": 26155 }, { "epoch": 0.8817284033840035, "grad_norm": 130.8482208251953, "learning_rate": 4.2008926921542285e-08, "logits/chosen": -1.9923702478408813, "logits/rejected": -2.0565195083618164, "logps/chosen": -2.6995887756347656, "logps/rejected": -2.7402122020721436, "loss": 5.303, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.995885848999023, "rewards/margins": 0.4062366485595703, "rewards/rejected": -27.402124404907227, "step": 26160 }, { "epoch": 0.8818969294549867, "grad_norm": 79.5093994140625, "learning_rate": 4.189099378339495e-08, "logits/chosen": -2.1510725021362305, "logits/rejected": -2.7977688312530518, "logps/chosen": -2.7006373405456543, "logps/rejected": -3.728480577468872, "loss": 2.0349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.006372451782227, "rewards/margins": 10.278432846069336, "rewards/rejected": -37.2848014831543, "step": 26165 }, { "epoch": 0.8820654555259698, "grad_norm": 37.60076141357422, "learning_rate": 4.177321917872589e-08, "logits/chosen": -1.7785587310791016, "logits/rejected": -2.1146063804626465, "logps/chosen": -2.0391170978546143, "logps/rejected": -2.5872554779052734, "loss": 1.6566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.391170501708984, "rewards/margins": 5.481385707855225, "rewards/rejected": -25.872554779052734, "step": 26170 }, { "epoch": 0.8822339815969531, "grad_norm": 33.79237365722656, "learning_rate": 4.165560314829236e-08, "logits/chosen": -1.832585334777832, "logits/rejected": -1.8615341186523438, "logps/chosen": -2.010195732116699, "logps/rejected": -2.08314847946167, "loss": 2.6019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.10195541381836, "rewards/margins": 0.7295287847518921, "rewards/rejected": -20.831485748291016, "step": 26175 }, { "epoch": 0.8824025076679363, "grad_norm": 131.44534301757812, "learning_rate": 4.153814573279646e-08, "logits/chosen": -1.9326139688491821, "logits/rejected": -1.7735908031463623, "logps/chosen": -3.275007724761963, "logps/rejected": -3.273677110671997, "loss": 4.6015, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -32.75007629394531, "rewards/margins": -0.01330490130931139, "rewards/rejected": -32.73677444458008, "step": 26180 }, { "epoch": 0.8825710337389194, "grad_norm": 26.220003128051758, "learning_rate": 4.1420846972885613e-08, "logits/chosen": -1.9239784479141235, "logits/rejected": -1.7237510681152344, "logps/chosen": -2.5050339698791504, "logps/rejected": -2.7856767177581787, "loss": 4.8332, "rewards/accuracies": 0.5, "rewards/chosen": -25.050338745117188, "rewards/margins": 2.8064308166503906, "rewards/rejected": -27.856769561767578, "step": 26185 }, { "epoch": 0.8827395598099026, "grad_norm": 335.8841552734375, "learning_rate": 4.1303706909152414e-08, "logits/chosen": -2.3145358562469482, "logits/rejected": -2.4512641429901123, "logps/chosen": -2.4827306270599365, "logps/rejected": -2.8225104808807373, "loss": 1.8292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.827306747436523, "rewards/margins": 3.397799015045166, "rewards/rejected": -28.225107192993164, "step": 26190 }, { "epoch": 0.8829080858808858, "grad_norm": 46.0804328918457, "learning_rate": 4.1186725582134264e-08, "logits/chosen": -1.4715369939804077, "logits/rejected": -1.7713149785995483, "logps/chosen": -2.4654946327209473, "logps/rejected": -2.7149832248687744, "loss": 3.7104, "rewards/accuracies": 0.5, "rewards/chosen": -24.65494728088379, "rewards/margins": 2.494884490966797, "rewards/rejected": -27.149831771850586, "step": 26195 }, { "epoch": 0.8830766119518689, "grad_norm": 37.92697525024414, "learning_rate": 4.106990303231389e-08, "logits/chosen": -2.0186607837677, "logits/rejected": -1.8988367319107056, "logps/chosen": -2.6057591438293457, "logps/rejected": -2.699638843536377, "loss": 3.3298, "rewards/accuracies": 0.5, "rewards/chosen": -26.057592391967773, "rewards/margins": 0.9387954473495483, "rewards/rejected": -26.996387481689453, "step": 26200 }, { "epoch": 0.8832451380228521, "grad_norm": 40.06204605102539, "learning_rate": 4.0953239300119016e-08, "logits/chosen": -1.4002172946929932, "logits/rejected": -1.7599208354949951, "logps/chosen": -2.041703462600708, "logps/rejected": -2.2836692333221436, "loss": 2.1846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.417034149169922, "rewards/margins": 2.41965913772583, "rewards/rejected": -22.836694717407227, "step": 26205 }, { "epoch": 0.8834136640938354, "grad_norm": 42.123268127441406, "learning_rate": 4.083673442592217e-08, "logits/chosen": -1.9567673206329346, "logits/rejected": -2.1912286281585693, "logps/chosen": -2.125568151473999, "logps/rejected": -2.2718589305877686, "loss": 2.6719, "rewards/accuracies": 0.5, "rewards/chosen": -21.25568199157715, "rewards/margins": 1.4629076719284058, "rewards/rejected": -22.718591690063477, "step": 26210 }, { "epoch": 0.8835821901648185, "grad_norm": 127.27120208740234, "learning_rate": 4.072038845004128e-08, "logits/chosen": -1.8005714416503906, "logits/rejected": -2.0479798316955566, "logps/chosen": -2.3783552646636963, "logps/rejected": -2.9461581707000732, "loss": 1.5435, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.783552169799805, "rewards/margins": 5.678030490875244, "rewards/rejected": -29.46158218383789, "step": 26215 }, { "epoch": 0.8837507162358017, "grad_norm": 36.57089614868164, "learning_rate": 4.060420141273907e-08, "logits/chosen": -1.8329858779907227, "logits/rejected": -1.9664011001586914, "logps/chosen": -1.9984794855117798, "logps/rejected": -2.5146331787109375, "loss": 1.5813, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.984792709350586, "rewards/margins": 5.161539554595947, "rewards/rejected": -25.14633560180664, "step": 26220 }, { "epoch": 0.8839192423067849, "grad_norm": 19.129188537597656, "learning_rate": 4.048817335422327e-08, "logits/chosen": -1.6715996265411377, "logits/rejected": -1.9743512868881226, "logps/chosen": -2.968926429748535, "logps/rejected": -3.169696092605591, "loss": 4.191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.68926429748535, "rewards/margins": 2.0076935291290283, "rewards/rejected": -31.69696044921875, "step": 26225 }, { "epoch": 0.884087768377768, "grad_norm": 1.8036651611328125, "learning_rate": 4.037230431464661e-08, "logits/chosen": -1.4815946817398071, "logits/rejected": -1.510654091835022, "logps/chosen": -1.9119043350219727, "logps/rejected": -2.0924339294433594, "loss": 2.2048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.119043350219727, "rewards/margins": 1.805295705795288, "rewards/rejected": -20.92433738708496, "step": 26230 }, { "epoch": 0.8842562944487512, "grad_norm": 51.39049530029297, "learning_rate": 4.025659433410683e-08, "logits/chosen": -1.695776343345642, "logits/rejected": -2.1014962196350098, "logps/chosen": -2.6338090896606445, "logps/rejected": -3.248582124710083, "loss": 1.1106, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.338092803955078, "rewards/margins": 6.1477274894714355, "rewards/rejected": -32.485816955566406, "step": 26235 }, { "epoch": 0.8844248205197344, "grad_norm": 13.321321487426758, "learning_rate": 4.014104345264663e-08, "logits/chosen": -1.6323163509368896, "logits/rejected": -1.794264554977417, "logps/chosen": -3.0402426719665527, "logps/rejected": -2.900519847869873, "loss": 5.725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.402429580688477, "rewards/margins": -1.3972291946411133, "rewards/rejected": -29.005199432373047, "step": 26240 }, { "epoch": 0.8845933465907175, "grad_norm": 35.8294677734375, "learning_rate": 4.002565171025352e-08, "logits/chosen": -2.30120849609375, "logits/rejected": -2.334444761276245, "logps/chosen": -2.891282320022583, "logps/rejected": -4.049437522888184, "loss": 1.6171, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.91282081604004, "rewards/margins": 11.581551551818848, "rewards/rejected": -40.4943733215332, "step": 26245 }, { "epoch": 0.8847618726617008, "grad_norm": 15.660163879394531, "learning_rate": 3.991041914686011e-08, "logits/chosen": -1.8731937408447266, "logits/rejected": -1.8050944805145264, "logps/chosen": -2.4904088973999023, "logps/rejected": -3.021503210067749, "loss": 2.0161, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.904088973999023, "rewards/margins": 5.310944557189941, "rewards/rejected": -30.21503257751465, "step": 26250 }, { "epoch": 0.884930398732684, "grad_norm": 21.406620025634766, "learning_rate": 3.979534580234378e-08, "logits/chosen": -1.6166706085205078, "logits/rejected": -2.2093665599823, "logps/chosen": -2.2637438774108887, "logps/rejected": -2.64788556098938, "loss": 1.6625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.637439727783203, "rewards/margins": 3.841416835784912, "rewards/rejected": -26.478857040405273, "step": 26255 }, { "epoch": 0.8850989248036671, "grad_norm": 164.92747497558594, "learning_rate": 3.968043171652707e-08, "logits/chosen": -2.1059999465942383, "logits/rejected": -2.2262609004974365, "logps/chosen": -3.2196247577667236, "logps/rejected": -3.4764976501464844, "loss": 3.888, "rewards/accuracies": 0.5, "rewards/chosen": -32.196250915527344, "rewards/margins": 2.568727970123291, "rewards/rejected": -34.764976501464844, "step": 26260 }, { "epoch": 0.8852674508746503, "grad_norm": 54.82228088378906, "learning_rate": 3.956567692917695e-08, "logits/chosen": -2.0362296104431152, "logits/rejected": -2.172341823577881, "logps/chosen": -2.940056324005127, "logps/rejected": -3.6462929248809814, "loss": 2.2166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.400562286376953, "rewards/margins": 7.062368869781494, "rewards/rejected": -36.462928771972656, "step": 26265 }, { "epoch": 0.8854359769456335, "grad_norm": 212.3646697998047, "learning_rate": 3.9451081480005647e-08, "logits/chosen": -1.7555129528045654, "logits/rejected": -1.7756602764129639, "logps/chosen": -2.326939105987549, "logps/rejected": -2.392819404602051, "loss": 3.7741, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.269390106201172, "rewards/margins": 0.6588034629821777, "rewards/rejected": -23.92819595336914, "step": 26270 }, { "epoch": 0.8856045030166166, "grad_norm": 42.445858001708984, "learning_rate": 3.933664540867027e-08, "logits/chosen": -1.9917614459991455, "logits/rejected": -2.2271552085876465, "logps/chosen": -2.3137385845184326, "logps/rejected": -2.756070613861084, "loss": 2.4639, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.137386322021484, "rewards/margins": 4.4233198165893555, "rewards/rejected": -27.56070899963379, "step": 26275 }, { "epoch": 0.8857730290875998, "grad_norm": 43.41066360473633, "learning_rate": 3.922236875477236e-08, "logits/chosen": -2.333465099334717, "logits/rejected": -2.0408828258514404, "logps/chosen": -2.147674083709717, "logps/rejected": -1.9634158611297607, "loss": 5.2754, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.47673797607422, "rewards/margins": -1.8425800800323486, "rewards/rejected": -19.634159088134766, "step": 26280 }, { "epoch": 0.8859415551585831, "grad_norm": 32.42685317993164, "learning_rate": 3.910825155785874e-08, "logits/chosen": -1.8978359699249268, "logits/rejected": -1.8385225534439087, "logps/chosen": -2.6863465309143066, "logps/rejected": -2.741611957550049, "loss": 4.7944, "rewards/accuracies": 0.5, "rewards/chosen": -26.86346435546875, "rewards/margins": 0.5526536703109741, "rewards/rejected": -27.416118621826172, "step": 26285 }, { "epoch": 0.8861100812295662, "grad_norm": 3.309293031692505, "learning_rate": 3.899429385742087e-08, "logits/chosen": -1.8833850622177124, "logits/rejected": -1.8865734338760376, "logps/chosen": -2.6383042335510254, "logps/rejected": -2.7358269691467285, "loss": 2.9002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.383041381835938, "rewards/margins": 0.9752256274223328, "rewards/rejected": -27.3582706451416, "step": 26290 }, { "epoch": 0.8862786073005494, "grad_norm": 34.39920425415039, "learning_rate": 3.888049569289503e-08, "logits/chosen": -1.429771900177002, "logits/rejected": -1.9705921411514282, "logps/chosen": -2.29795241355896, "logps/rejected": -2.6995646953582764, "loss": 1.6437, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.979524612426758, "rewards/margins": 4.016123294830322, "rewards/rejected": -26.995647430419922, "step": 26295 }, { "epoch": 0.8864471333715326, "grad_norm": 58.80097579956055, "learning_rate": 3.876685710366223e-08, "logits/chosen": -2.0685536861419678, "logits/rejected": -2.290975332260132, "logps/chosen": -2.396440029144287, "logps/rejected": -2.4646482467651367, "loss": 2.9537, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.964397430419922, "rewards/margins": 0.6820847392082214, "rewards/rejected": -24.646480560302734, "step": 26300 }, { "epoch": 0.8866156594425157, "grad_norm": 41.23129653930664, "learning_rate": 3.8653378129048285e-08, "logits/chosen": -1.9296531677246094, "logits/rejected": -1.8945322036743164, "logps/chosen": -2.165827989578247, "logps/rejected": -2.2192344665527344, "loss": 2.6034, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.658279418945312, "rewards/margins": 0.5340660214424133, "rewards/rejected": -22.192346572875977, "step": 26305 }, { "epoch": 0.8867841855134989, "grad_norm": 29.026077270507812, "learning_rate": 3.854005880832395e-08, "logits/chosen": -1.916271448135376, "logits/rejected": -1.7464656829833984, "logps/chosen": -1.7863132953643799, "logps/rejected": -2.0211892127990723, "loss": 1.7257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.86313247680664, "rewards/margins": 2.3487584590911865, "rewards/rejected": -20.211891174316406, "step": 26310 }, { "epoch": 0.8869527115844821, "grad_norm": 125.84819793701172, "learning_rate": 3.8426899180704356e-08, "logits/chosen": -1.8223766088485718, "logits/rejected": -2.0062925815582275, "logps/chosen": -3.252180576324463, "logps/rejected": -3.5337576866149902, "loss": 2.8062, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -32.52180480957031, "rewards/margins": 2.8157730102539062, "rewards/rejected": -35.33757781982422, "step": 26315 }, { "epoch": 0.8871212376554654, "grad_norm": 42.38909149169922, "learning_rate": 3.831389928534967e-08, "logits/chosen": -1.7225860357284546, "logits/rejected": -2.0962350368499756, "logps/chosen": -2.7228047847747803, "logps/rejected": -3.8885185718536377, "loss": 2.579, "rewards/accuracies": 0.5, "rewards/chosen": -27.22804832458496, "rewards/margins": 11.657136917114258, "rewards/rejected": -38.88518524169922, "step": 26320 }, { "epoch": 0.8872897637264485, "grad_norm": 43.3592643737793, "learning_rate": 3.820105916136479e-08, "logits/chosen": -1.665102243423462, "logits/rejected": -2.1733100414276123, "logps/chosen": -2.2348780632019043, "logps/rejected": -2.450758934020996, "loss": 2.2147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.348779678344727, "rewards/margins": 2.158808469772339, "rewards/rejected": -24.507587432861328, "step": 26325 }, { "epoch": 0.8874582897974317, "grad_norm": 34.18486404418945, "learning_rate": 3.808837884779925e-08, "logits/chosen": -1.429547667503357, "logits/rejected": -1.861356496810913, "logps/chosen": -2.4993577003479004, "logps/rejected": -2.9565072059631348, "loss": 1.7211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.993579864501953, "rewards/margins": 4.57149600982666, "rewards/rejected": -29.565073013305664, "step": 26330 }, { "epoch": 0.8876268158684149, "grad_norm": 52.628761291503906, "learning_rate": 3.7975858383647086e-08, "logits/chosen": -1.957058310508728, "logits/rejected": -1.9958652257919312, "logps/chosen": -2.476940155029297, "logps/rejected": -3.013596296310425, "loss": 3.4843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.76940155029297, "rewards/margins": 5.366562843322754, "rewards/rejected": -30.13596534729004, "step": 26335 }, { "epoch": 0.887795341939398, "grad_norm": 43.5367546081543, "learning_rate": 3.786349780784731e-08, "logits/chosen": -1.6538121700286865, "logits/rejected": -1.627824068069458, "logps/chosen": -2.146531820297241, "logps/rejected": -2.0789332389831543, "loss": 3.8805, "rewards/accuracies": 0.5, "rewards/chosen": -21.465316772460938, "rewards/margins": -0.675983726978302, "rewards/rejected": -20.78933334350586, "step": 26340 }, { "epoch": 0.8879638680103812, "grad_norm": 8.637046278181515e-08, "learning_rate": 3.77512971592836e-08, "logits/chosen": -2.0448696613311768, "logits/rejected": -2.352041244506836, "logps/chosen": -3.145742416381836, "logps/rejected": -4.253350734710693, "loss": 1.4902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.457427978515625, "rewards/margins": 11.076081275939941, "rewards/rejected": -42.533504486083984, "step": 26345 }, { "epoch": 0.8881323940813644, "grad_norm": 0.0413767471909523, "learning_rate": 3.763925647678401e-08, "logits/chosen": -1.8336588144302368, "logits/rejected": -1.9427735805511475, "logps/chosen": -2.932631015777588, "logps/rejected": -3.345829486846924, "loss": 2.6405, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.326309204101562, "rewards/margins": 4.131980895996094, "rewards/rejected": -33.458290100097656, "step": 26350 }, { "epoch": 0.8883009201523475, "grad_norm": 18.382659912109375, "learning_rate": 3.752737579912146e-08, "logits/chosen": -1.733275055885315, "logits/rejected": -1.9531488418579102, "logps/chosen": -1.8927421569824219, "logps/rejected": -2.2471954822540283, "loss": 2.1427, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.92742347717285, "rewards/margins": 3.5445313453674316, "rewards/rejected": -22.471952438354492, "step": 26355 }, { "epoch": 0.8884694462233308, "grad_norm": 17.96552276611328, "learning_rate": 3.7415655165013435e-08, "logits/chosen": -2.0436959266662598, "logits/rejected": -2.390925884246826, "logps/chosen": -1.8439857959747314, "logps/rejected": -2.1458404064178467, "loss": 1.234, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.43985939025879, "rewards/margins": 3.018548011779785, "rewards/rejected": -21.458406448364258, "step": 26360 }, { "epoch": 0.888637972294314, "grad_norm": 5.227560043334961, "learning_rate": 3.7304094613122064e-08, "logits/chosen": -1.5187642574310303, "logits/rejected": -1.4792652130126953, "logps/chosen": -2.2077884674072266, "logps/rejected": -2.3878226280212402, "loss": 3.1346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.0778865814209, "rewards/margins": 1.800341248512268, "rewards/rejected": -23.87822723388672, "step": 26365 }, { "epoch": 0.8888064983652971, "grad_norm": 42.939727783203125, "learning_rate": 3.7192694182054065e-08, "logits/chosen": -2.4460108280181885, "logits/rejected": -2.9244384765625, "logps/chosen": -2.6990339756011963, "logps/rejected": -3.735548496246338, "loss": 1.3055, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.990337371826172, "rewards/margins": 10.365147590637207, "rewards/rejected": -37.35548400878906, "step": 26370 }, { "epoch": 0.8889750244362803, "grad_norm": 16.590620040893555, "learning_rate": 3.708145391036077e-08, "logits/chosen": -1.39180588722229, "logits/rejected": -1.8719170093536377, "logps/chosen": -2.1872692108154297, "logps/rejected": -2.8928062915802, "loss": 1.5447, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.872692108154297, "rewards/margins": 7.055368900299072, "rewards/rejected": -28.92806053161621, "step": 26375 }, { "epoch": 0.8891435505072635, "grad_norm": 82.56770324707031, "learning_rate": 3.697037383653795e-08, "logits/chosen": -2.03816556930542, "logits/rejected": -2.2176055908203125, "logps/chosen": -3.07786226272583, "logps/rejected": -3.490302324295044, "loss": 2.1655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.77862548828125, "rewards/margins": 4.124399662017822, "rewards/rejected": -34.90302276611328, "step": 26380 }, { "epoch": 0.8893120765782466, "grad_norm": 20.41136360168457, "learning_rate": 3.685945399902612e-08, "logits/chosen": -2.1533596515655518, "logits/rejected": -2.9651713371276855, "logps/chosen": -2.3581838607788086, "logps/rejected": -2.778751850128174, "loss": 1.6457, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.581836700439453, "rewards/margins": 4.205681324005127, "rewards/rejected": -27.787517547607422, "step": 26385 }, { "epoch": 0.8894806026492298, "grad_norm": 31.72279930114746, "learning_rate": 3.674869443621026e-08, "logits/chosen": -2.089503049850464, "logits/rejected": -2.2000720500946045, "logps/chosen": -2.883258581161499, "logps/rejected": -3.233872175216675, "loss": 2.5709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.832584381103516, "rewards/margins": 3.5061347484588623, "rewards/rejected": -32.338722229003906, "step": 26390 }, { "epoch": 0.8896491287202131, "grad_norm": 72.49952697753906, "learning_rate": 3.6638095186419915e-08, "logits/chosen": -1.2590601444244385, "logits/rejected": -2.207016706466675, "logps/chosen": -2.256216049194336, "logps/rejected": -3.127159833908081, "loss": 2.2759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.56216049194336, "rewards/margins": 8.709436416625977, "rewards/rejected": -31.271595001220703, "step": 26395 }, { "epoch": 0.8898176547911962, "grad_norm": 47.8337287902832, "learning_rate": 3.652765628792914e-08, "logits/chosen": -2.090017080307007, "logits/rejected": -2.1647074222564697, "logps/chosen": -2.8364415168762207, "logps/rejected": -3.1691741943359375, "loss": 2.5965, "rewards/accuracies": 0.5, "rewards/chosen": -28.364416122436523, "rewards/margins": 3.3273262977600098, "rewards/rejected": -31.691741943359375, "step": 26400 }, { "epoch": 0.8898176547911962, "eval_logits/chosen": -2.3095388412475586, "eval_logits/rejected": -2.4877052307128906, "eval_logps/chosen": -2.2862913608551025, "eval_logps/rejected": -2.441688299179077, "eval_loss": 3.084059953689575, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.8629150390625, "eval_rewards/margins": 1.553971767425537, "eval_rewards/rejected": -24.416885375976562, "eval_runtime": 12.9458, "eval_samples_per_second": 7.724, "eval_steps_per_second": 1.931, "step": 26400 }, { "epoch": 0.8899861808621794, "grad_norm": 15.262998580932617, "learning_rate": 3.641737777895631e-08, "logits/chosen": -1.386071801185608, "logits/rejected": -1.60391104221344, "logps/chosen": -1.9214773178100586, "logps/rejected": -2.374145269393921, "loss": 0.8926, "rewards/accuracies": 1.0, "rewards/chosen": -19.214773178100586, "rewards/margins": 4.526679992675781, "rewards/rejected": -23.741451263427734, "step": 26405 }, { "epoch": 0.8901547069331626, "grad_norm": 27.034141540527344, "learning_rate": 3.6307259697664684e-08, "logits/chosen": -1.3661694526672363, "logits/rejected": -1.655531883239746, "logps/chosen": -1.7544902563095093, "logps/rejected": -1.987342119216919, "loss": 1.7387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.544902801513672, "rewards/margins": 2.328519105911255, "rewards/rejected": -19.873422622680664, "step": 26410 }, { "epoch": 0.8903232330041457, "grad_norm": 36.613914489746094, "learning_rate": 3.619730208216176e-08, "logits/chosen": -2.304776668548584, "logits/rejected": -2.4895987510681152, "logps/chosen": -1.8933130502700806, "logps/rejected": -2.0928142070770264, "loss": 2.1845, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.933130264282227, "rewards/margins": 1.995012640953064, "rewards/rejected": -20.928142547607422, "step": 26415 }, { "epoch": 0.8904917590751289, "grad_norm": 30.089303970336914, "learning_rate": 3.6087504970499394e-08, "logits/chosen": -1.9915310144424438, "logits/rejected": -2.441012144088745, "logps/chosen": -2.0041680335998535, "logps/rejected": -2.1425201892852783, "loss": 3.4509, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.041683197021484, "rewards/margins": 1.383520483970642, "rewards/rejected": -21.425201416015625, "step": 26420 }, { "epoch": 0.8906602851461121, "grad_norm": 75.44097137451172, "learning_rate": 3.597786840067418e-08, "logits/chosen": -1.8589273691177368, "logits/rejected": -1.8614015579223633, "logps/chosen": -2.8449208736419678, "logps/rejected": -3.298020124435425, "loss": 2.9951, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.449207305908203, "rewards/margins": 4.530993461608887, "rewards/rejected": -32.980201721191406, "step": 26425 }, { "epoch": 0.8908288112170953, "grad_norm": 40.12236022949219, "learning_rate": 3.586839241062695e-08, "logits/chosen": -1.7321977615356445, "logits/rejected": -1.890179991722107, "logps/chosen": -3.3235104084014893, "logps/rejected": -3.320343494415283, "loss": 4.4035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.23509979248047, "rewards/margins": -0.03166971355676651, "rewards/rejected": -33.20343780517578, "step": 26430 }, { "epoch": 0.8909973372880785, "grad_norm": 11.368515968322754, "learning_rate": 3.5759077038243105e-08, "logits/chosen": -1.9310667514801025, "logits/rejected": -2.2706170082092285, "logps/chosen": -2.7760231494903564, "logps/rejected": -3.5364856719970703, "loss": 1.2416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.760229110717773, "rewards/margins": 7.604626655578613, "rewards/rejected": -35.36486053466797, "step": 26435 }, { "epoch": 0.8911658633590617, "grad_norm": 33.3480339050293, "learning_rate": 3.5649922321352276e-08, "logits/chosen": -1.975754976272583, "logits/rejected": -2.155705451965332, "logps/chosen": -2.017496109008789, "logps/rejected": -2.319976329803467, "loss": 2.4917, "rewards/accuracies": 0.5, "rewards/chosen": -20.17496109008789, "rewards/margins": 3.0248000621795654, "rewards/rejected": -23.19976234436035, "step": 26440 }, { "epoch": 0.8913343894300448, "grad_norm": 60.67802810668945, "learning_rate": 3.5540928297728644e-08, "logits/chosen": -1.498110294342041, "logits/rejected": -1.9327552318572998, "logps/chosen": -2.3765645027160645, "logps/rejected": -2.515491008758545, "loss": 2.4061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.765644073486328, "rewards/margins": 1.3892669677734375, "rewards/rejected": -25.154911041259766, "step": 26445 }, { "epoch": 0.891502915501028, "grad_norm": 5.0874528884887695, "learning_rate": 3.543209500509087e-08, "logits/chosen": -2.114600658416748, "logits/rejected": -2.1396377086639404, "logps/chosen": -2.263089895248413, "logps/rejected": -2.864563465118408, "loss": 2.5865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.63089942932129, "rewards/margins": 6.014737129211426, "rewards/rejected": -28.6456356048584, "step": 26450 }, { "epoch": 0.8916714415720112, "grad_norm": 218.70321655273438, "learning_rate": 3.5323422481101704e-08, "logits/chosen": -1.9041109085083008, "logits/rejected": -1.7146583795547485, "logps/chosen": -2.972308874130249, "logps/rejected": -2.422968626022339, "loss": 8.8578, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.72308921813965, "rewards/margins": -5.49340295791626, "rewards/rejected": -24.229686737060547, "step": 26455 }, { "epoch": 0.8918399676429943, "grad_norm": 23.796815872192383, "learning_rate": 3.5214910763368465e-08, "logits/chosen": -1.6301389932632446, "logits/rejected": -1.4314095973968506, "logps/chosen": -2.698103427886963, "logps/rejected": -3.5130105018615723, "loss": 3.2043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.981029510498047, "rewards/margins": 8.149070739746094, "rewards/rejected": -35.130104064941406, "step": 26460 }, { "epoch": 0.8920084937139775, "grad_norm": 29.702604293823242, "learning_rate": 3.5106559889442834e-08, "logits/chosen": -2.535851240158081, "logits/rejected": -2.5843117237091064, "logps/chosen": -3.023690938949585, "logps/rejected": -3.4369590282440186, "loss": 2.2928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.236907958984375, "rewards/margins": 4.132681369781494, "rewards/rejected": -34.369590759277344, "step": 26465 }, { "epoch": 0.8921770197849608, "grad_norm": 34.1428337097168, "learning_rate": 3.499836989682081e-08, "logits/chosen": -1.4994641542434692, "logits/rejected": -1.624497413635254, "logps/chosen": -1.7730028629302979, "logps/rejected": -1.9339821338653564, "loss": 2.1891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.730030059814453, "rewards/margins": 1.6097911596298218, "rewards/rejected": -19.339818954467773, "step": 26470 }, { "epoch": 0.892345545855944, "grad_norm": 43.83182907104492, "learning_rate": 3.489034082294257e-08, "logits/chosen": -2.2628543376922607, "logits/rejected": -2.2911453247070312, "logps/chosen": -2.642298698425293, "logps/rejected": -2.8167061805725098, "loss": 2.734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.422988891601562, "rewards/margins": 1.7440717220306396, "rewards/rejected": -28.16705894470215, "step": 26475 }, { "epoch": 0.8925140719269271, "grad_norm": 24.363571166992188, "learning_rate": 3.47824727051928e-08, "logits/chosen": -2.286649703979492, "logits/rejected": -2.66743803024292, "logps/chosen": -3.0622658729553223, "logps/rejected": -3.6419472694396973, "loss": 2.8387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.62265968322754, "rewards/margins": 5.796811103820801, "rewards/rejected": -36.419471740722656, "step": 26480 }, { "epoch": 0.8926825979979103, "grad_norm": 34.195926666259766, "learning_rate": 3.4674765580900435e-08, "logits/chosen": -1.6512067317962646, "logits/rejected": -1.8246276378631592, "logps/chosen": -2.3566603660583496, "logps/rejected": -2.6705374717712402, "loss": 1.879, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.566604614257812, "rewards/margins": 3.1387696266174316, "rewards/rejected": -26.705373764038086, "step": 26485 }, { "epoch": 0.8928511240688934, "grad_norm": 34.41433334350586, "learning_rate": 3.456721948733854e-08, "logits/chosen": -1.578489065170288, "logits/rejected": -2.0936756134033203, "logps/chosen": -2.417600154876709, "logps/rejected": -2.9172120094299316, "loss": 2.9168, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.17600440979004, "rewards/margins": 4.996116638183594, "rewards/rejected": -29.172119140625, "step": 26490 }, { "epoch": 0.8930196501398766, "grad_norm": 36.56930923461914, "learning_rate": 3.445983446172468e-08, "logits/chosen": -1.7800805568695068, "logits/rejected": -1.7682125568389893, "logps/chosen": -2.447359800338745, "logps/rejected": -2.3159613609313965, "loss": 4.43, "rewards/accuracies": 0.5, "rewards/chosen": -24.47359848022461, "rewards/margins": -1.3139829635620117, "rewards/rejected": -23.159616470336914, "step": 26495 }, { "epoch": 0.8931881762108598, "grad_norm": 48.0949592590332, "learning_rate": 3.4352610541220574e-08, "logits/chosen": -2.0843327045440674, "logits/rejected": -2.2306456565856934, "logps/chosen": -2.0241684913635254, "logps/rejected": -2.457520008087158, "loss": 2.3178, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.241683959960938, "rewards/margins": 4.333517074584961, "rewards/rejected": -24.5752010345459, "step": 26500 }, { "epoch": 0.893356702281843, "grad_norm": 69.8865966796875, "learning_rate": 3.42455477629322e-08, "logits/chosen": -1.8679606914520264, "logits/rejected": -2.1544744968414307, "logps/chosen": -2.019822120666504, "logps/rejected": -1.9206161499023438, "loss": 4.1463, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.198219299316406, "rewards/margins": -0.9920600056648254, "rewards/rejected": -19.206159591674805, "step": 26505 }, { "epoch": 0.8935252283528262, "grad_norm": 22.330610275268555, "learning_rate": 3.4138646163909715e-08, "logits/chosen": -1.898535132408142, "logits/rejected": -2.080216646194458, "logps/chosen": -2.839162826538086, "logps/rejected": -3.4196839332580566, "loss": 2.588, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.391626358032227, "rewards/margins": 5.805212020874023, "rewards/rejected": -34.19683837890625, "step": 26510 }, { "epoch": 0.8936937544238094, "grad_norm": 46.52693557739258, "learning_rate": 3.403190578114762e-08, "logits/chosen": -1.8540111780166626, "logits/rejected": -1.9055122137069702, "logps/chosen": -2.3714959621429443, "logps/rejected": -2.411144256591797, "loss": 4.4685, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.7149600982666, "rewards/margins": 0.39648064970970154, "rewards/rejected": -24.111440658569336, "step": 26515 }, { "epoch": 0.8938622804947925, "grad_norm": 27.26725959777832, "learning_rate": 3.392532665158449e-08, "logits/chosen": -2.230833053588867, "logits/rejected": -2.185213327407837, "logps/chosen": -2.181652545928955, "logps/rejected": -2.0785341262817383, "loss": 4.192, "rewards/accuracies": 0.5, "rewards/chosen": -21.816524505615234, "rewards/margins": -1.031184434890747, "rewards/rejected": -20.785343170166016, "step": 26520 }, { "epoch": 0.8940308065657757, "grad_norm": 29.42605972290039, "learning_rate": 3.38189088121032e-08, "logits/chosen": -2.070993661880493, "logits/rejected": -2.3387389183044434, "logps/chosen": -2.1574041843414307, "logps/rejected": -2.6259658336639404, "loss": 1.657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.57404136657715, "rewards/margins": 4.685616970062256, "rewards/rejected": -26.259658813476562, "step": 26525 }, { "epoch": 0.8941993326367589, "grad_norm": 2.8149056434631348, "learning_rate": 3.371265229953074e-08, "logits/chosen": -1.9254567623138428, "logits/rejected": -2.2278952598571777, "logps/chosen": -2.96343731880188, "logps/rejected": -3.121615171432495, "loss": 5.3805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.634368896484375, "rewards/margins": 1.5817797183990479, "rewards/rejected": -31.21615219116211, "step": 26530 }, { "epoch": 0.894367858707742, "grad_norm": 27.930755615234375, "learning_rate": 3.360655715063837e-08, "logits/chosen": -1.7379354238510132, "logits/rejected": -1.865264892578125, "logps/chosen": -2.278048515319824, "logps/rejected": -2.7165865898132324, "loss": 1.135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.780485153198242, "rewards/margins": 4.385384559631348, "rewards/rejected": -27.165868759155273, "step": 26535 }, { "epoch": 0.8945363847787253, "grad_norm": 29.486385345458984, "learning_rate": 3.350062340214149e-08, "logits/chosen": -1.9781516790390015, "logits/rejected": -2.348292350769043, "logps/chosen": -2.722933053970337, "logps/rejected": -3.038951873779297, "loss": 1.3001, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.229333877563477, "rewards/margins": 3.160186529159546, "rewards/rejected": -30.389516830444336, "step": 26540 }, { "epoch": 0.8947049108497085, "grad_norm": 24.48255157470703, "learning_rate": 3.339485109069939e-08, "logits/chosen": -1.8980258703231812, "logits/rejected": -2.232142925262451, "logps/chosen": -2.2230277061462402, "logps/rejected": -2.9788966178894043, "loss": 1.5709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.230276107788086, "rewards/margins": 7.558690547943115, "rewards/rejected": -29.788965225219727, "step": 26545 }, { "epoch": 0.8948734369206917, "grad_norm": 30.568679809570312, "learning_rate": 3.328924025291585e-08, "logits/chosen": -1.8301811218261719, "logits/rejected": -1.786790132522583, "logps/chosen": -2.1006762981414795, "logps/rejected": -2.4340219497680664, "loss": 1.2593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.00676155090332, "rewards/margins": 3.333458662033081, "rewards/rejected": -24.340219497680664, "step": 26550 }, { "epoch": 0.8950419629916748, "grad_norm": 32.12637710571289, "learning_rate": 3.318379092533868e-08, "logits/chosen": -1.7069917917251587, "logits/rejected": -1.9681928157806396, "logps/chosen": -2.761687755584717, "logps/rejected": -2.6828818321228027, "loss": 3.8563, "rewards/accuracies": 0.5, "rewards/chosen": -27.61687660217285, "rewards/margins": -0.7880581021308899, "rewards/rejected": -26.828815460205078, "step": 26555 }, { "epoch": 0.895210489062658, "grad_norm": 28.803434371948242, "learning_rate": 3.3078503144459535e-08, "logits/chosen": -1.1517115831375122, "logits/rejected": -1.3329699039459229, "logps/chosen": -2.1754512786865234, "logps/rejected": -3.084625720977783, "loss": 2.7049, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.754512786865234, "rewards/margins": 9.091741561889648, "rewards/rejected": -30.846256256103516, "step": 26560 }, { "epoch": 0.8953790151336412, "grad_norm": 47.28593444824219, "learning_rate": 3.297337694671448e-08, "logits/chosen": -2.179915428161621, "logits/rejected": -2.6756978034973145, "logps/chosen": -2.4682414531707764, "logps/rejected": -3.088728666305542, "loss": 1.6676, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.682416915893555, "rewards/margins": 6.20487117767334, "rewards/rejected": -30.88728904724121, "step": 26565 }, { "epoch": 0.8955475412046243, "grad_norm": 39.167423248291016, "learning_rate": 3.286841236848353e-08, "logits/chosen": -1.4234455823898315, "logits/rejected": -1.5056092739105225, "logps/chosen": -2.820444345474243, "logps/rejected": -3.1854355335235596, "loss": 2.5673, "rewards/accuracies": 0.5, "rewards/chosen": -28.204442977905273, "rewards/margins": 3.649911403656006, "rewards/rejected": -31.854354858398438, "step": 26570 }, { "epoch": 0.8957160672756075, "grad_norm": 41.25774002075195, "learning_rate": 3.2763609446090966e-08, "logits/chosen": -2.3403804302215576, "logits/rejected": -2.43021821975708, "logps/chosen": -3.2246127128601074, "logps/rejected": -3.2669715881347656, "loss": 3.8261, "rewards/accuracies": 0.5, "rewards/chosen": -32.246131896972656, "rewards/margins": 0.42358970642089844, "rewards/rejected": -32.66971969604492, "step": 26575 }, { "epoch": 0.8958845933465908, "grad_norm": 34.58900451660156, "learning_rate": 3.265896821580466e-08, "logits/chosen": -1.5373561382293701, "logits/rejected": -1.5862390995025635, "logps/chosen": -2.270263195037842, "logps/rejected": -2.307436227798462, "loss": 3.3203, "rewards/accuracies": 0.5, "rewards/chosen": -22.702632904052734, "rewards/margins": 0.37172871828079224, "rewards/rejected": -23.074359893798828, "step": 26580 }, { "epoch": 0.8960531194175739, "grad_norm": 29.757532119750977, "learning_rate": 3.255448871383692e-08, "logits/chosen": -1.7115838527679443, "logits/rejected": -1.9038822650909424, "logps/chosen": -2.302544116973877, "logps/rejected": -2.024075746536255, "loss": 6.2287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.025440216064453, "rewards/margins": -2.7846837043762207, "rewards/rejected": -20.24075698852539, "step": 26585 }, { "epoch": 0.8962216454885571, "grad_norm": 5.816464424133301, "learning_rate": 3.245017097634417e-08, "logits/chosen": -2.073551893234253, "logits/rejected": -2.096770763397217, "logps/chosen": -1.9916664361953735, "logps/rejected": -2.167724847793579, "loss": 2.6002, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.91666603088379, "rewards/margins": 1.7605831623077393, "rewards/rejected": -21.677249908447266, "step": 26590 }, { "epoch": 0.8963901715595403, "grad_norm": 29.604887008666992, "learning_rate": 3.234601503942641e-08, "logits/chosen": -1.8629038333892822, "logits/rejected": -1.8070766925811768, "logps/chosen": -2.160930871963501, "logps/rejected": -2.353682041168213, "loss": 3.0509, "rewards/accuracies": 0.5, "rewards/chosen": -21.60930824279785, "rewards/margins": 1.9275100231170654, "rewards/rejected": -23.536815643310547, "step": 26595 }, { "epoch": 0.8965586976305234, "grad_norm": 15.401908874511719, "learning_rate": 3.224202093912798e-08, "logits/chosen": -1.1934399604797363, "logits/rejected": -1.2277182340621948, "logps/chosen": -2.0062804222106934, "logps/rejected": -2.2444908618927, "loss": 1.7024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.06280517578125, "rewards/margins": 2.3821024894714355, "rewards/rejected": -22.444908142089844, "step": 26600 }, { "epoch": 0.8967272237015066, "grad_norm": 28.409311294555664, "learning_rate": 3.21381887114372e-08, "logits/chosen": -1.805456519126892, "logits/rejected": -2.1640634536743164, "logps/chosen": -2.77734375, "logps/rejected": -2.9346604347229004, "loss": 2.6559, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.7734375, "rewards/margins": 1.5731639862060547, "rewards/rejected": -29.346603393554688, "step": 26605 }, { "epoch": 0.8968957497724898, "grad_norm": 97.36381530761719, "learning_rate": 3.203451839228638e-08, "logits/chosen": -1.751865029335022, "logits/rejected": -2.0187325477600098, "logps/chosen": -1.852243185043335, "logps/rejected": -1.9989274740219116, "loss": 4.0031, "rewards/accuracies": 0.5, "rewards/chosen": -18.522432327270508, "rewards/margins": 1.4668428897857666, "rewards/rejected": -19.989276885986328, "step": 26610 }, { "epoch": 0.897064275843473, "grad_norm": 34.04191970825195, "learning_rate": 3.1931010017551555e-08, "logits/chosen": -1.436517596244812, "logits/rejected": -2.0932650566101074, "logps/chosen": -2.7484524250030518, "logps/rejected": -3.309058427810669, "loss": 3.0777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.484527587890625, "rewards/margins": 5.606059551239014, "rewards/rejected": -33.09058380126953, "step": 26615 }, { "epoch": 0.8972328019144562, "grad_norm": 42.45766830444336, "learning_rate": 3.1827663623052945e-08, "logits/chosen": -1.7623097896575928, "logits/rejected": -1.7391726970672607, "logps/chosen": -1.8129985332489014, "logps/rejected": -1.9569908380508423, "loss": 2.2725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.129985809326172, "rewards/margins": 1.4399234056472778, "rewards/rejected": -19.569908142089844, "step": 26620 }, { "epoch": 0.8974013279854394, "grad_norm": 54.091957092285156, "learning_rate": 3.1724479244554794e-08, "logits/chosen": -1.7639528512954712, "logits/rejected": -2.0980749130249023, "logps/chosen": -2.110844135284424, "logps/rejected": -2.2267391681671143, "loss": 2.7418, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.108442306518555, "rewards/margins": 1.158947467803955, "rewards/rejected": -22.26738929748535, "step": 26625 }, { "epoch": 0.8975698540564225, "grad_norm": 28.083711624145508, "learning_rate": 3.1621456917765025e-08, "logits/chosen": -1.6799514293670654, "logits/rejected": -2.156489133834839, "logps/chosen": -2.3385581970214844, "logps/rejected": -2.5456862449645996, "loss": 2.7011, "rewards/accuracies": 0.5, "rewards/chosen": -23.38558006286621, "rewards/margins": 2.071280002593994, "rewards/rejected": -25.45686149597168, "step": 26630 }, { "epoch": 0.8977383801274057, "grad_norm": 8.380878448486328, "learning_rate": 3.151859667833562e-08, "logits/chosen": -2.053631544113159, "logits/rejected": -1.9986978769302368, "logps/chosen": -1.9755045175552368, "logps/rejected": -2.13932728767395, "loss": 2.5733, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.75504493713379, "rewards/margins": 1.638228178024292, "rewards/rejected": -21.393274307250977, "step": 26635 }, { "epoch": 0.8979069061983889, "grad_norm": 123.43840789794922, "learning_rate": 3.141589856186244e-08, "logits/chosen": -1.7218620777130127, "logits/rejected": -2.00181245803833, "logps/chosen": -3.3388209342956543, "logps/rejected": -3.430068254470825, "loss": 4.0124, "rewards/accuracies": 0.5, "rewards/chosen": -33.388206481933594, "rewards/margins": 0.9124706387519836, "rewards/rejected": -34.300682067871094, "step": 26640 }, { "epoch": 0.898075432269372, "grad_norm": 54.00300979614258, "learning_rate": 3.13133626038854e-08, "logits/chosen": -2.025984525680542, "logits/rejected": -1.9393415451049805, "logps/chosen": -2.0208346843719482, "logps/rejected": -2.30167818069458, "loss": 1.65, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.20834732055664, "rewards/margins": 2.8084373474121094, "rewards/rejected": -23.01678466796875, "step": 26645 }, { "epoch": 0.8982439583403552, "grad_norm": 100.82463073730469, "learning_rate": 3.121098883988793e-08, "logits/chosen": -1.9338929653167725, "logits/rejected": -1.9727376699447632, "logps/chosen": -3.4275753498077393, "logps/rejected": -3.6173622608184814, "loss": 5.8341, "rewards/accuracies": 0.5, "rewards/chosen": -34.275753021240234, "rewards/margins": 1.8978685140609741, "rewards/rejected": -36.173622131347656, "step": 26650 }, { "epoch": 0.8984124844113385, "grad_norm": 32.94302749633789, "learning_rate": 3.110877730529771e-08, "logits/chosen": -2.066908121109009, "logits/rejected": -2.4684619903564453, "logps/chosen": -2.7884440422058105, "logps/rejected": -3.1394457817077637, "loss": 2.0568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.884441375732422, "rewards/margins": 3.510014057159424, "rewards/rejected": -31.394454956054688, "step": 26655 }, { "epoch": 0.8985810104823216, "grad_norm": 31.880945205688477, "learning_rate": 3.1006728035486095e-08, "logits/chosen": -2.197225570678711, "logits/rejected": -2.540008544921875, "logps/chosen": -2.043921709060669, "logps/rejected": -2.7333791255950928, "loss": 1.9214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.43921661376953, "rewards/margins": 6.8945770263671875, "rewards/rejected": -27.33379554748535, "step": 26660 }, { "epoch": 0.8987495365533048, "grad_norm": 33.723976135253906, "learning_rate": 3.0904841065768293e-08, "logits/chosen": -1.9747921228408813, "logits/rejected": -2.0234017372131348, "logps/chosen": -2.2100253105163574, "logps/rejected": -2.1898884773254395, "loss": 3.3422, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.10025405883789, "rewards/margins": -0.20136889815330505, "rewards/rejected": -21.898883819580078, "step": 26665 }, { "epoch": 0.898918062624288, "grad_norm": 7.845767498016357, "learning_rate": 3.0803116431403375e-08, "logits/chosen": -1.9601243734359741, "logits/rejected": -2.368412733078003, "logps/chosen": -1.7174384593963623, "logps/rejected": -1.9996525049209595, "loss": 2.791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.17438507080078, "rewards/margins": 2.8221397399902344, "rewards/rejected": -19.996524810791016, "step": 26670 }, { "epoch": 0.8990865886952711, "grad_norm": 33.07391357421875, "learning_rate": 3.0701554167594345e-08, "logits/chosen": -2.0543577671051025, "logits/rejected": -1.900191068649292, "logps/chosen": -3.7917778491973877, "logps/rejected": -3.9905028343200684, "loss": 2.6151, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -37.91777801513672, "rewards/margins": 1.9872547388076782, "rewards/rejected": -39.905033111572266, "step": 26675 }, { "epoch": 0.8992551147662543, "grad_norm": 159.2845916748047, "learning_rate": 3.06001543094877e-08, "logits/chosen": -1.7436946630477905, "logits/rejected": -2.141364574432373, "logps/chosen": -2.9924213886260986, "logps/rejected": -3.945605516433716, "loss": 2.1637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.924213409423828, "rewards/margins": 9.531842231750488, "rewards/rejected": -39.4560546875, "step": 26680 }, { "epoch": 0.8994236408372375, "grad_norm": 31.464847564697266, "learning_rate": 3.049891689217404e-08, "logits/chosen": -1.6657555103302002, "logits/rejected": -1.8241376876831055, "logps/chosen": -2.2361302375793457, "logps/rejected": -2.5087828636169434, "loss": 1.8575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.361303329467773, "rewards/margins": 2.7265238761901855, "rewards/rejected": -25.08782958984375, "step": 26685 }, { "epoch": 0.8995921669082207, "grad_norm": 28.74648666381836, "learning_rate": 3.039784195068762e-08, "logits/chosen": -2.540984630584717, "logits/rejected": -2.3905649185180664, "logps/chosen": -4.0224199295043945, "logps/rejected": -4.216039657592773, "loss": 4.7423, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -40.22419738769531, "rewards/margins": 1.9361984729766846, "rewards/rejected": -42.160396575927734, "step": 26690 }, { "epoch": 0.8997606929792039, "grad_norm": 26.239681243896484, "learning_rate": 3.029692952000662e-08, "logits/chosen": -1.6646521091461182, "logits/rejected": -2.0250906944274902, "logps/chosen": -2.4633376598358154, "logps/rejected": -3.2082221508026123, "loss": 2.9795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.633377075195312, "rewards/margins": 7.448843955993652, "rewards/rejected": -32.08222198486328, "step": 26695 }, { "epoch": 0.8999292190501871, "grad_norm": 29.84377098083496, "learning_rate": 3.0196179635052664e-08, "logits/chosen": -2.0663959980010986, "logits/rejected": -1.9202702045440674, "logps/chosen": -2.9367499351501465, "logps/rejected": -3.0594124794006348, "loss": 3.022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.36749839782715, "rewards/margins": 1.226623773574829, "rewards/rejected": -30.5941219329834, "step": 26700 }, { "epoch": 0.9000977451211702, "grad_norm": 34.29310989379883, "learning_rate": 3.009559233069142e-08, "logits/chosen": -1.9751489162445068, "logits/rejected": -1.7080457210540771, "logps/chosen": -2.1225452423095703, "logps/rejected": -1.9370813369750977, "loss": 5.146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.225452423095703, "rewards/margins": -1.8546397686004639, "rewards/rejected": -19.370811462402344, "step": 26705 }, { "epoch": 0.9002662711921534, "grad_norm": 79.37320709228516, "learning_rate": 2.9995167641732154e-08, "logits/chosen": -1.799004316329956, "logits/rejected": -2.029813289642334, "logps/chosen": -2.426522731781006, "logps/rejected": -2.999939203262329, "loss": 2.2679, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.265226364135742, "rewards/margins": 5.73416805267334, "rewards/rejected": -29.9993953704834, "step": 26710 }, { "epoch": 0.9004347972631366, "grad_norm": 27.92984962463379, "learning_rate": 2.989490560292801e-08, "logits/chosen": -1.9473021030426025, "logits/rejected": -2.1453399658203125, "logps/chosen": -2.4092183113098145, "logps/rejected": -2.7499892711639404, "loss": 2.1248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.092182159423828, "rewards/margins": 3.407710552215576, "rewards/rejected": -27.499893188476562, "step": 26715 }, { "epoch": 0.9006033233341197, "grad_norm": 26.3646240234375, "learning_rate": 2.9794806248975512e-08, "logits/chosen": -1.6984527111053467, "logits/rejected": -2.1335010528564453, "logps/chosen": -2.008679151535034, "logps/rejected": -2.133533000946045, "loss": 2.3707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.0867919921875, "rewards/margins": 1.2485383749008179, "rewards/rejected": -21.335330963134766, "step": 26720 }, { "epoch": 0.900771849405103, "grad_norm": 10.612837791442871, "learning_rate": 2.9694869614515283e-08, "logits/chosen": -1.4333100318908691, "logits/rejected": -1.9090903997421265, "logps/chosen": -2.660550594329834, "logps/rejected": -3.347529649734497, "loss": 1.4041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.60550308227539, "rewards/margins": 6.8697919845581055, "rewards/rejected": -33.47529983520508, "step": 26725 }, { "epoch": 0.9009403754760862, "grad_norm": 54.768768310546875, "learning_rate": 2.9595095734131438e-08, "logits/chosen": -1.4375172853469849, "logits/rejected": -1.9926517009735107, "logps/chosen": -2.2313895225524902, "logps/rejected": -2.748983383178711, "loss": 2.7583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.31389617919922, "rewards/margins": 5.175940990447998, "rewards/rejected": -27.489837646484375, "step": 26730 }, { "epoch": 0.9011089015470694, "grad_norm": 34.80324172973633, "learning_rate": 2.9495484642351686e-08, "logits/chosen": -2.2112724781036377, "logits/rejected": -2.3973257541656494, "logps/chosen": -1.9748632907867432, "logps/rejected": -2.477585554122925, "loss": 1.9833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.748634338378906, "rewards/margins": 5.027222156524658, "rewards/rejected": -24.775854110717773, "step": 26735 }, { "epoch": 0.9012774276180525, "grad_norm": 0.01907912641763687, "learning_rate": 2.939603637364757e-08, "logits/chosen": -1.949859619140625, "logits/rejected": -2.05171537399292, "logps/chosen": -2.8990979194641113, "logps/rejected": -3.3347411155700684, "loss": 2.8307, "rewards/accuracies": 0.5, "rewards/chosen": -28.990982055664062, "rewards/margins": 4.3564276695251465, "rewards/rejected": -33.347412109375, "step": 26740 }, { "epoch": 0.9014459536890357, "grad_norm": 228.8786163330078, "learning_rate": 2.929675096243428e-08, "logits/chosen": -1.9393707513809204, "logits/rejected": -2.239442825317383, "logps/chosen": -2.230768918991089, "logps/rejected": -2.1282057762145996, "loss": 4.1624, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.307689666748047, "rewards/margins": -1.0256315469741821, "rewards/rejected": -21.282058715820312, "step": 26745 }, { "epoch": 0.9016144797600188, "grad_norm": 58.850704193115234, "learning_rate": 2.9197628443070443e-08, "logits/chosen": -1.6945288181304932, "logits/rejected": -1.7264130115509033, "logps/chosen": -2.3006865978240967, "logps/rejected": -2.233219623565674, "loss": 3.7347, "rewards/accuracies": 0.5, "rewards/chosen": -23.006868362426758, "rewards/margins": -0.6746741533279419, "rewards/rejected": -22.332195281982422, "step": 26750 }, { "epoch": 0.901783005831002, "grad_norm": 3.1833982467651367, "learning_rate": 2.9098668849858508e-08, "logits/chosen": -1.402016043663025, "logits/rejected": -1.3957245349884033, "logps/chosen": -2.561000347137451, "logps/rejected": -3.177316188812256, "loss": 1.8452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.61000633239746, "rewards/margins": 6.1631574630737305, "rewards/rejected": -31.77316665649414, "step": 26755 }, { "epoch": 0.9019515319019852, "grad_norm": 25.571414947509766, "learning_rate": 2.899987221704453e-08, "logits/chosen": -1.9028421640396118, "logits/rejected": -2.308382749557495, "logps/chosen": -2.1655309200286865, "logps/rejected": -2.9270224571228027, "loss": 2.0631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.655309677124023, "rewards/margins": 7.614912986755371, "rewards/rejected": -29.270221710205078, "step": 26760 }, { "epoch": 0.9021200579729685, "grad_norm": 16.592662811279297, "learning_rate": 2.8901238578818153e-08, "logits/chosen": -1.417004942893982, "logits/rejected": -1.7401018142700195, "logps/chosen": -2.598989725112915, "logps/rejected": -3.228271961212158, "loss": 2.0323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.98989486694336, "rewards/margins": 6.292824745178223, "rewards/rejected": -32.28272247314453, "step": 26765 }, { "epoch": 0.9022885840439516, "grad_norm": 19.315271377563477, "learning_rate": 2.8802767969312524e-08, "logits/chosen": -2.464322328567505, "logits/rejected": -2.841975688934326, "logps/chosen": -2.9235432147979736, "logps/rejected": -3.1510472297668457, "loss": 2.342, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.235431671142578, "rewards/margins": 2.27504301071167, "rewards/rejected": -31.510473251342773, "step": 26770 }, { "epoch": 0.9024571101149348, "grad_norm": 56.24665832519531, "learning_rate": 2.870446042260444e-08, "logits/chosen": -1.5425212383270264, "logits/rejected": -1.7652885913848877, "logps/chosen": -2.8841872215270996, "logps/rejected": -3.6044490337371826, "loss": 1.5722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.841873168945312, "rewards/margins": 7.2026166915893555, "rewards/rejected": -36.044490814208984, "step": 26775 }, { "epoch": 0.902625636185918, "grad_norm": 21.616291046142578, "learning_rate": 2.86063159727144e-08, "logits/chosen": -2.3555169105529785, "logits/rejected": -2.294743061065674, "logps/chosen": -3.1372265815734863, "logps/rejected": -3.617166042327881, "loss": 2.1903, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.372264862060547, "rewards/margins": 4.799395561218262, "rewards/rejected": -36.17166519165039, "step": 26780 }, { "epoch": 0.9027941622569011, "grad_norm": 17.52286148071289, "learning_rate": 2.8508334653606135e-08, "logits/chosen": -2.1238174438476562, "logits/rejected": -2.157766580581665, "logps/chosen": -2.153778553009033, "logps/rejected": -2.369436502456665, "loss": 3.4831, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.537784576416016, "rewards/margins": 2.1565799713134766, "rewards/rejected": -23.694364547729492, "step": 26785 }, { "epoch": 0.9029626883278843, "grad_norm": 69.37480926513672, "learning_rate": 2.8410516499187244e-08, "logits/chosen": -1.8046211004257202, "logits/rejected": -2.1276800632476807, "logps/chosen": -2.6248655319213867, "logps/rejected": -3.009124279022217, "loss": 2.1481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.2486572265625, "rewards/margins": 3.8425865173339844, "rewards/rejected": -30.09124183654785, "step": 26790 }, { "epoch": 0.9031312143988675, "grad_norm": 31.732303619384766, "learning_rate": 2.8312861543308696e-08, "logits/chosen": -2.1751065254211426, "logits/rejected": -2.1534345149993896, "logps/chosen": -2.7119526863098145, "logps/rejected": -2.8299505710601807, "loss": 4.864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.11952781677246, "rewards/margins": 1.1799787282943726, "rewards/rejected": -28.29950523376465, "step": 26795 }, { "epoch": 0.9032997404698507, "grad_norm": 25.450721740722656, "learning_rate": 2.821536981976502e-08, "logits/chosen": -1.3617388010025024, "logits/rejected": -1.6299562454223633, "logps/chosen": -2.2438454627990723, "logps/rejected": -2.4280383586883545, "loss": 3.6415, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.43845558166504, "rewards/margins": 1.8419297933578491, "rewards/rejected": -24.280384063720703, "step": 26800 }, { "epoch": 0.9032997404698507, "eval_logits/chosen": -2.3114259243011475, "eval_logits/rejected": -2.489382743835449, "eval_logps/chosen": -2.2882986068725586, "eval_logps/rejected": -2.4433987140655518, "eval_loss": 3.089292287826538, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.882986068725586, "eval_rewards/margins": 1.5510010719299316, "eval_rewards/rejected": -24.433984756469727, "eval_runtime": 12.9054, "eval_samples_per_second": 7.749, "eval_steps_per_second": 1.937, "step": 26800 }, { "epoch": 0.9034682665408339, "grad_norm": 26.823387145996094, "learning_rate": 2.81180413622944e-08, "logits/chosen": -1.6327470541000366, "logits/rejected": -1.6029590368270874, "logps/chosen": -3.0612990856170654, "logps/rejected": -3.3704915046691895, "loss": 4.5638, "rewards/accuracies": 0.5, "rewards/chosen": -30.612987518310547, "rewards/margins": 3.091923713684082, "rewards/rejected": -33.70491409301758, "step": 26805 }, { "epoch": 0.9036367926118171, "grad_norm": 42.258033752441406, "learning_rate": 2.8020876204578104e-08, "logits/chosen": -2.2854976654052734, "logits/rejected": -2.360747814178467, "logps/chosen": -3.3987221717834473, "logps/rejected": -3.3430073261260986, "loss": 5.2192, "rewards/accuracies": 0.5, "rewards/chosen": -33.987220764160156, "rewards/margins": -0.5571478605270386, "rewards/rejected": -33.430076599121094, "step": 26810 }, { "epoch": 0.9038053186828002, "grad_norm": 3.3188095092773438, "learning_rate": 2.7923874380241407e-08, "logits/chosen": -1.265821933746338, "logits/rejected": -1.7384541034698486, "logps/chosen": -2.1968064308166504, "logps/rejected": -2.7010929584503174, "loss": 1.1806, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.968059539794922, "rewards/margins": 5.042864799499512, "rewards/rejected": -27.01092529296875, "step": 26815 }, { "epoch": 0.9039738447537834, "grad_norm": 42.46717834472656, "learning_rate": 2.7827035922852682e-08, "logits/chosen": -1.985002875328064, "logits/rejected": -2.063169240951538, "logps/chosen": -2.1471776962280273, "logps/rejected": -2.5167198181152344, "loss": 1.2743, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.471778869628906, "rewards/margins": 3.6954193115234375, "rewards/rejected": -25.16719627380371, "step": 26820 }, { "epoch": 0.9041423708247666, "grad_norm": 3.5700883865356445, "learning_rate": 2.7730360865923952e-08, "logits/chosen": -1.6788097620010376, "logits/rejected": -2.0639233589172363, "logps/chosen": -1.9001522064208984, "logps/rejected": -2.203819751739502, "loss": 1.5051, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.001523971557617, "rewards/margins": 3.0366764068603516, "rewards/rejected": -22.0382022857666, "step": 26825 }, { "epoch": 0.9043108968957497, "grad_norm": 29.52191734313965, "learning_rate": 2.7633849242910622e-08, "logits/chosen": -1.4773155450820923, "logits/rejected": -1.8487679958343506, "logps/chosen": -2.4244544506073, "logps/rejected": -2.540086030960083, "loss": 2.6833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.244544982910156, "rewards/margins": 1.1563156843185425, "rewards/rejected": -25.400859832763672, "step": 26830 }, { "epoch": 0.904479422966733, "grad_norm": 40.38792419433594, "learning_rate": 2.75375010872117e-08, "logits/chosen": -1.6742206811904907, "logits/rejected": -1.5991586446762085, "logps/chosen": -2.3412322998046875, "logps/rejected": -2.2697975635528564, "loss": 4.0595, "rewards/accuracies": 0.5, "rewards/chosen": -23.412322998046875, "rewards/margins": -0.7143500447273254, "rewards/rejected": -22.697973251342773, "step": 26835 }, { "epoch": 0.9046479490377162, "grad_norm": 25.706867218017578, "learning_rate": 2.744131643216929e-08, "logits/chosen": -1.7045414447784424, "logits/rejected": -2.089465856552124, "logps/chosen": -1.8297027349472046, "logps/rejected": -2.1544618606567383, "loss": 1.8551, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.297027587890625, "rewards/margins": 3.247591495513916, "rewards/rejected": -21.544620513916016, "step": 26840 }, { "epoch": 0.9048164751086993, "grad_norm": 19.23711395263672, "learning_rate": 2.734529531106916e-08, "logits/chosen": -1.7439048290252686, "logits/rejected": -1.5896174907684326, "logps/chosen": -3.032853364944458, "logps/rejected": -3.0909409523010254, "loss": 5.9402, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.328533172607422, "rewards/margins": 0.5808780789375305, "rewards/rejected": -30.909412384033203, "step": 26845 }, { "epoch": 0.9049850011796825, "grad_norm": 46.411834716796875, "learning_rate": 2.7249437757140615e-08, "logits/chosen": -1.701541543006897, "logits/rejected": -1.8513438701629639, "logps/chosen": -2.831533908843994, "logps/rejected": -3.0360515117645264, "loss": 2.4463, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.31533432006836, "rewards/margins": 2.045178174972534, "rewards/rejected": -30.360515594482422, "step": 26850 }, { "epoch": 0.9051535272506657, "grad_norm": 26.993629455566406, "learning_rate": 2.7153743803555894e-08, "logits/chosen": -1.7826759815216064, "logits/rejected": -2.196767807006836, "logps/chosen": -1.8802204132080078, "logps/rejected": -2.171701192855835, "loss": 2.3952, "rewards/accuracies": 0.5, "rewards/chosen": -18.80220603942871, "rewards/margins": 2.9148058891296387, "rewards/rejected": -21.717010498046875, "step": 26855 }, { "epoch": 0.9053220533216488, "grad_norm": 72.22422790527344, "learning_rate": 2.705821348343107e-08, "logits/chosen": -1.5827332735061646, "logits/rejected": -2.21030855178833, "logps/chosen": -2.4274351596832275, "logps/rejected": -3.115079641342163, "loss": 0.7714, "rewards/accuracies": 1.0, "rewards/chosen": -24.274351119995117, "rewards/margins": 6.876446723937988, "rewards/rejected": -31.15079689025879, "step": 26860 }, { "epoch": 0.905490579392632, "grad_norm": 3.0414493083953857, "learning_rate": 2.6962846829825415e-08, "logits/chosen": -2.1721713542938232, "logits/rejected": -2.610666275024414, "logps/chosen": -3.422571897506714, "logps/rejected": -4.277789115905762, "loss": 1.0687, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -34.2257194519043, "rewards/margins": 8.55217170715332, "rewards/rejected": -42.77789306640625, "step": 26865 }, { "epoch": 0.9056591054636152, "grad_norm": 97.23381805419922, "learning_rate": 2.6867643875741585e-08, "logits/chosen": -1.9782911539077759, "logits/rejected": -2.014655351638794, "logps/chosen": -2.3666396141052246, "logps/rejected": -2.3032031059265137, "loss": 4.2141, "rewards/accuracies": 0.5, "rewards/chosen": -23.666393280029297, "rewards/margins": -0.6343621015548706, "rewards/rejected": -23.032032012939453, "step": 26870 }, { "epoch": 0.9058276315345984, "grad_norm": 44.421600341796875, "learning_rate": 2.677260465412551e-08, "logits/chosen": -1.8927501440048218, "logits/rejected": -2.187912940979004, "logps/chosen": -2.850724458694458, "logps/rejected": -3.9353561401367188, "loss": 3.3546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.50724220275879, "rewards/margins": 10.846318244934082, "rewards/rejected": -39.35356521606445, "step": 26875 }, { "epoch": 0.9059961576055816, "grad_norm": 50.800689697265625, "learning_rate": 2.667772919786648e-08, "logits/chosen": -2.2175285816192627, "logits/rejected": -2.1087779998779297, "logps/chosen": -2.667564630508423, "logps/rejected": -3.046851396560669, "loss": 2.6457, "rewards/accuracies": 0.5, "rewards/chosen": -26.675647735595703, "rewards/margins": 3.792868137359619, "rewards/rejected": -30.4685115814209, "step": 26880 }, { "epoch": 0.9061646836765648, "grad_norm": 34.15726089477539, "learning_rate": 2.6583017539797358e-08, "logits/chosen": -1.3783862590789795, "logits/rejected": -1.9121935367584229, "logps/chosen": -2.2632646560668945, "logps/rejected": -2.575612783432007, "loss": 1.7138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.632644653320312, "rewards/margins": 3.1234829425811768, "rewards/rejected": -25.756128311157227, "step": 26885 }, { "epoch": 0.9063332097475479, "grad_norm": 20.80045509338379, "learning_rate": 2.6488469712693862e-08, "logits/chosen": -1.5416743755340576, "logits/rejected": -1.7947609424591064, "logps/chosen": -2.3976223468780518, "logps/rejected": -2.6126463413238525, "loss": 2.0554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.97622299194336, "rewards/margins": 2.150240898132324, "rewards/rejected": -26.12646484375, "step": 26890 }, { "epoch": 0.9065017358185311, "grad_norm": 0.6815192103385925, "learning_rate": 2.639408574927543e-08, "logits/chosen": -1.4656648635864258, "logits/rejected": -1.8038467168807983, "logps/chosen": -2.7530505657196045, "logps/rejected": -2.9685773849487305, "loss": 2.3417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.530506134033203, "rewards/margins": 2.155269145965576, "rewards/rejected": -29.685771942138672, "step": 26895 }, { "epoch": 0.9066702618895143, "grad_norm": 140.0187530517578, "learning_rate": 2.62998656822046e-08, "logits/chosen": -1.39266836643219, "logits/rejected": -2.597870111465454, "logps/chosen": -2.699371814727783, "logps/rejected": -4.062556743621826, "loss": 1.7151, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.993717193603516, "rewards/margins": 13.631851196289062, "rewards/rejected": -40.62556838989258, "step": 26900 }, { "epoch": 0.9068387879604974, "grad_norm": 40.37078094482422, "learning_rate": 2.620580954408724e-08, "logits/chosen": -2.158634901046753, "logits/rejected": -2.2582380771636963, "logps/chosen": -2.825108051300049, "logps/rejected": -3.085275888442993, "loss": 2.8587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.251079559326172, "rewards/margins": 2.6016762256622314, "rewards/rejected": -30.85275650024414, "step": 26905 }, { "epoch": 0.9070073140314807, "grad_norm": 0.6319014430046082, "learning_rate": 2.6111917367472425e-08, "logits/chosen": -1.7111196517944336, "logits/rejected": -1.805869698524475, "logps/chosen": -3.093212604522705, "logps/rejected": -3.5046744346618652, "loss": 1.9236, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.9321231842041, "rewards/margins": 4.114621639251709, "rewards/rejected": -35.04674530029297, "step": 26910 }, { "epoch": 0.9071758401024639, "grad_norm": 28.448257446289062, "learning_rate": 2.6018189184852545e-08, "logits/chosen": -1.9640458822250366, "logits/rejected": -2.1212968826293945, "logps/chosen": -1.9360281229019165, "logps/rejected": -1.9871841669082642, "loss": 2.8041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.360280990600586, "rewards/margins": 0.5115610361099243, "rewards/rejected": -19.871841430664062, "step": 26915 }, { "epoch": 0.907344366173447, "grad_norm": 50.9765625, "learning_rate": 2.592462502866333e-08, "logits/chosen": -1.5658257007598877, "logits/rejected": -1.8782110214233398, "logps/chosen": -2.493435859680176, "logps/rejected": -2.6581640243530273, "loss": 2.7056, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.93436050415039, "rewards/margins": 1.6472809314727783, "rewards/rejected": -26.581640243530273, "step": 26920 }, { "epoch": 0.9075128922444302, "grad_norm": 24.455228805541992, "learning_rate": 2.583122493128348e-08, "logits/chosen": -1.6781708002090454, "logits/rejected": -1.9250268936157227, "logps/chosen": -2.064728021621704, "logps/rejected": -2.7096962928771973, "loss": 2.4928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.64727783203125, "rewards/margins": 6.449685573577881, "rewards/rejected": -27.096965789794922, "step": 26925 }, { "epoch": 0.9076814183154134, "grad_norm": 74.76783752441406, "learning_rate": 2.5737988925035204e-08, "logits/chosen": -1.885584831237793, "logits/rejected": -2.2635011672973633, "logps/chosen": -2.8694005012512207, "logps/rejected": -3.353682279586792, "loss": 2.965, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.694005966186523, "rewards/margins": 4.842817306518555, "rewards/rejected": -33.53682327270508, "step": 26930 }, { "epoch": 0.9078499443863965, "grad_norm": 40.00210189819336, "learning_rate": 2.5644917042183745e-08, "logits/chosen": -1.3769192695617676, "logits/rejected": -1.2565984725952148, "logps/chosen": -2.0572023391723633, "logps/rejected": -2.2845962047576904, "loss": 1.8896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.572025299072266, "rewards/margins": 2.2739388942718506, "rewards/rejected": -22.845962524414062, "step": 26935 }, { "epoch": 0.9080184704573797, "grad_norm": 28.726268768310547, "learning_rate": 2.5552009314937728e-08, "logits/chosen": -2.2686069011688232, "logits/rejected": -2.4023404121398926, "logps/chosen": -2.7572731971740723, "logps/rejected": -2.9134395122528076, "loss": 2.7201, "rewards/accuracies": 0.5, "rewards/chosen": -27.57273292541504, "rewards/margins": 1.5616614818572998, "rewards/rejected": -29.1343936920166, "step": 26940 }, { "epoch": 0.908186996528363, "grad_norm": 25.02350425720215, "learning_rate": 2.545926577544877e-08, "logits/chosen": -1.8146655559539795, "logits/rejected": -1.8045673370361328, "logps/chosen": -2.1324949264526367, "logps/rejected": -2.3172719478607178, "loss": 2.4044, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.324947357177734, "rewards/margins": 1.8477720022201538, "rewards/rejected": -23.172719955444336, "step": 26945 }, { "epoch": 0.9083555225993462, "grad_norm": 31.74529266357422, "learning_rate": 2.5366686455811693e-08, "logits/chosen": -2.118863344192505, "logits/rejected": -2.5201308727264404, "logps/chosen": -2.5623345375061035, "logps/rejected": -3.3792407512664795, "loss": 2.1214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.623342514038086, "rewards/margins": 8.16906452178955, "rewards/rejected": -33.79241180419922, "step": 26950 }, { "epoch": 0.9085240486703293, "grad_norm": 17.955358505249023, "learning_rate": 2.527427138806465e-08, "logits/chosen": -1.5059000253677368, "logits/rejected": -1.6303247213363647, "logps/chosen": -2.6259942054748535, "logps/rejected": -3.205542802810669, "loss": 1.4593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.25994300842285, "rewards/margins": 5.795483589172363, "rewards/rejected": -32.05542755126953, "step": 26955 }, { "epoch": 0.9086925747413125, "grad_norm": 30.412948608398438, "learning_rate": 2.5182020604188892e-08, "logits/chosen": -1.84066903591156, "logits/rejected": -2.3871397972106934, "logps/chosen": -3.038327693939209, "logps/rejected": -3.112415313720703, "loss": 4.9481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.38327980041504, "rewards/margins": 0.7408765554428101, "rewards/rejected": -31.124155044555664, "step": 26960 }, { "epoch": 0.9088611008122957, "grad_norm": 25.51723289489746, "learning_rate": 2.508993413610866e-08, "logits/chosen": -1.8257606029510498, "logits/rejected": -2.3894286155700684, "logps/chosen": -2.486851453781128, "logps/rejected": -3.019463062286377, "loss": 2.6325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.868515014648438, "rewards/margins": 5.326113224029541, "rewards/rejected": -30.194629669189453, "step": 26965 }, { "epoch": 0.9090296268832788, "grad_norm": 35.03866195678711, "learning_rate": 2.4998012015691517e-08, "logits/chosen": -1.8151493072509766, "logits/rejected": -2.12388277053833, "logps/chosen": -2.1048531532287598, "logps/rejected": -2.4690277576446533, "loss": 3.1153, "rewards/accuracies": 0.5, "rewards/chosen": -21.04853057861328, "rewards/margins": 3.6417458057403564, "rewards/rejected": -24.690277099609375, "step": 26970 }, { "epoch": 0.909198152954262, "grad_norm": 57.44851303100586, "learning_rate": 2.4906254274748182e-08, "logits/chosen": -1.9289391040802002, "logits/rejected": -2.2314047813415527, "logps/chosen": -2.174452781677246, "logps/rejected": -2.5939812660217285, "loss": 2.597, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.744531631469727, "rewards/margins": 4.195285797119141, "rewards/rejected": -25.939815521240234, "step": 26975 }, { "epoch": 0.9093666790252452, "grad_norm": 39.91963195800781, "learning_rate": 2.4814660945032206e-08, "logits/chosen": -1.9026374816894531, "logits/rejected": -2.213430166244507, "logps/chosen": -2.6050117015838623, "logps/rejected": -2.9902827739715576, "loss": 1.2653, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.050119400024414, "rewards/margins": 3.8527092933654785, "rewards/rejected": -29.902828216552734, "step": 26980 }, { "epoch": 0.9095352050962284, "grad_norm": 40.34387969970703, "learning_rate": 2.4723232058240507e-08, "logits/chosen": -2.1123440265655518, "logits/rejected": -1.9164499044418335, "logps/chosen": -3.002830982208252, "logps/rejected": -3.1687307357788086, "loss": 2.5419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -30.028308868408203, "rewards/margins": 1.6589953899383545, "rewards/rejected": -31.687305450439453, "step": 26985 }, { "epoch": 0.9097037311672116, "grad_norm": 28.5035400390625, "learning_rate": 2.4631967646013107e-08, "logits/chosen": -1.6512867212295532, "logits/rejected": -1.7113101482391357, "logps/chosen": -2.0377159118652344, "logps/rejected": -2.164827585220337, "loss": 2.3158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.377161026000977, "rewards/margins": 1.271114706993103, "rewards/rejected": -21.64827537536621, "step": 26990 }, { "epoch": 0.9098722572381948, "grad_norm": 47.506649017333984, "learning_rate": 2.4540867739932912e-08, "logits/chosen": -1.38832688331604, "logits/rejected": -1.7318493127822876, "logps/chosen": -2.3842415809631348, "logps/rejected": -2.51163911819458, "loss": 4.2326, "rewards/accuracies": 0.5, "rewards/chosen": -23.84241485595703, "rewards/margins": 1.2739765644073486, "rewards/rejected": -25.116390228271484, "step": 26995 }, { "epoch": 0.9100407833091779, "grad_norm": 39.53714370727539, "learning_rate": 2.444993237152604e-08, "logits/chosen": -2.342810869216919, "logits/rejected": -2.1015408039093018, "logps/chosen": -2.428750991821289, "logps/rejected": -2.727332592010498, "loss": 1.7182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.28750991821289, "rewards/margins": 2.9858198165893555, "rewards/rejected": -27.273326873779297, "step": 27000 }, { "epoch": 0.9102093093801611, "grad_norm": 21.055631637573242, "learning_rate": 2.4359161572261645e-08, "logits/chosen": -1.832567572593689, "logits/rejected": -1.7975997924804688, "logps/chosen": -2.410696506500244, "logps/rejected": -2.769796848297119, "loss": 2.5338, "rewards/accuracies": 0.5, "rewards/chosen": -24.10696792602539, "rewards/margins": 3.59100341796875, "rewards/rejected": -27.69797134399414, "step": 27005 }, { "epoch": 0.9103778354511443, "grad_norm": 52.577049255371094, "learning_rate": 2.426855537355199e-08, "logits/chosen": -0.961966872215271, "logits/rejected": -1.447507381439209, "logps/chosen": -2.2045254707336426, "logps/rejected": -2.5315704345703125, "loss": 1.8555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.04525375366211, "rewards/margins": 3.2704505920410156, "rewards/rejected": -25.315704345703125, "step": 27010 }, { "epoch": 0.9105463615221274, "grad_norm": 42.69902801513672, "learning_rate": 2.4178113806752222e-08, "logits/chosen": -2.3947811126708984, "logits/rejected": -2.06548810005188, "logps/chosen": -2.6121039390563965, "logps/rejected": -2.8605058193206787, "loss": 4.3609, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.121036529541016, "rewards/margins": 2.4840221405029297, "rewards/rejected": -28.605060577392578, "step": 27015 }, { "epoch": 0.9107148875931107, "grad_norm": 21.63785171508789, "learning_rate": 2.4087836903160574e-08, "logits/chosen": -1.3393892049789429, "logits/rejected": -1.6054449081420898, "logps/chosen": -2.1144537925720215, "logps/rejected": -2.4885573387145996, "loss": 1.193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.14453887939453, "rewards/margins": 3.741034746170044, "rewards/rejected": -24.885574340820312, "step": 27020 }, { "epoch": 0.9108834136640939, "grad_norm": 34.35652160644531, "learning_rate": 2.399772469401845e-08, "logits/chosen": -1.684240698814392, "logits/rejected": -1.5888200998306274, "logps/chosen": -2.075859785079956, "logps/rejected": -1.9586131572723389, "loss": 5.9731, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.75859832763672, "rewards/margins": -1.1724650859832764, "rewards/rejected": -19.58613395690918, "step": 27025 }, { "epoch": 0.911051939735077, "grad_norm": 61.34150695800781, "learning_rate": 2.390777721051007e-08, "logits/chosen": -1.950811743736267, "logits/rejected": -2.438870668411255, "logps/chosen": -2.0603041648864746, "logps/rejected": -2.328369617462158, "loss": 3.326, "rewards/accuracies": 0.5, "rewards/chosen": -20.60304069519043, "rewards/margins": 2.6806557178497314, "rewards/rejected": -23.2836971282959, "step": 27030 }, { "epoch": 0.9112204658060602, "grad_norm": 43.496246337890625, "learning_rate": 2.3817994483762648e-08, "logits/chosen": -1.5817230939865112, "logits/rejected": -2.0574898719787598, "logps/chosen": -2.211698055267334, "logps/rejected": -2.8117573261260986, "loss": 2.0304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.116981506347656, "rewards/margins": 6.000591278076172, "rewards/rejected": -28.117572784423828, "step": 27035 }, { "epoch": 0.9113889918770434, "grad_norm": 8.805611610412598, "learning_rate": 2.372837654484655e-08, "logits/chosen": -1.6177377700805664, "logits/rejected": -1.8455016613006592, "logps/chosen": -2.7482638359069824, "logps/rejected": -2.7214274406433105, "loss": 4.0024, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.48263931274414, "rewards/margins": -0.2683609127998352, "rewards/rejected": -27.214275360107422, "step": 27040 }, { "epoch": 0.9115575179480265, "grad_norm": 47.26149368286133, "learning_rate": 2.3638923424775025e-08, "logits/chosen": -2.373136520385742, "logits/rejected": -2.4441769123077393, "logps/chosen": -3.1748688220977783, "logps/rejected": -3.6415977478027344, "loss": 1.6607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.748687744140625, "rewards/margins": 4.667288303375244, "rewards/rejected": -36.415977478027344, "step": 27045 }, { "epoch": 0.9117260440190097, "grad_norm": 141.83863830566406, "learning_rate": 2.3549635154504145e-08, "logits/chosen": -1.7244266271591187, "logits/rejected": -2.0603253841400146, "logps/chosen": -3.489849805831909, "logps/rejected": -3.5083630084991455, "loss": 6.7995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.898494720458984, "rewards/margins": 0.1851322203874588, "rewards/rejected": -35.08362579345703, "step": 27050 }, { "epoch": 0.911894570089993, "grad_norm": 22.03189468383789, "learning_rate": 2.3460511764933187e-08, "logits/chosen": -2.0633015632629395, "logits/rejected": -2.145843029022217, "logps/chosen": -1.955177664756775, "logps/rejected": -2.733556032180786, "loss": 2.2386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.55177879333496, "rewards/margins": 7.7837815284729, "rewards/rejected": -27.335559844970703, "step": 27055 }, { "epoch": 0.9120630961609761, "grad_norm": 32.86399459838867, "learning_rate": 2.337155328690421e-08, "logits/chosen": -1.5522150993347168, "logits/rejected": -1.4998700618743896, "logps/chosen": -2.131031036376953, "logps/rejected": -2.2082178592681885, "loss": 2.7185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.31031036376953, "rewards/margins": 0.7718679308891296, "rewards/rejected": -22.082176208496094, "step": 27060 }, { "epoch": 0.9122316222319593, "grad_norm": 39.76874542236328, "learning_rate": 2.3282759751202197e-08, "logits/chosen": -2.256934642791748, "logits/rejected": -2.1311392784118652, "logps/chosen": -2.158473253250122, "logps/rejected": -2.0872859954833984, "loss": 4.0022, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -21.584732055664062, "rewards/margins": -0.7118738889694214, "rewards/rejected": -20.87285804748535, "step": 27065 }, { "epoch": 0.9124001483029425, "grad_norm": 34.26323318481445, "learning_rate": 2.319413118855512e-08, "logits/chosen": -1.6916424036026, "logits/rejected": -2.1182878017425537, "logps/chosen": -2.520982027053833, "logps/rejected": -2.8135578632354736, "loss": 2.329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.209819793701172, "rewards/margins": 2.9257562160491943, "rewards/rejected": -28.135578155517578, "step": 27070 }, { "epoch": 0.9125686743739256, "grad_norm": 61.415306091308594, "learning_rate": 2.310566762963384e-08, "logits/chosen": -2.140953779220581, "logits/rejected": -2.252530813217163, "logps/chosen": -2.918928623199463, "logps/rejected": -3.3146369457244873, "loss": 5.3396, "rewards/accuracies": 0.5, "rewards/chosen": -29.189289093017578, "rewards/margins": 3.957087755203247, "rewards/rejected": -33.1463737487793, "step": 27075 }, { "epoch": 0.9127372004449088, "grad_norm": 30.03462791442871, "learning_rate": 2.3017369105052142e-08, "logits/chosen": -1.6456882953643799, "logits/rejected": -1.8969169855117798, "logps/chosen": -2.3969192504882812, "logps/rejected": -3.0997815132141113, "loss": 2.3552, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.969192504882812, "rewards/margins": 7.028619289398193, "rewards/rejected": -30.997814178466797, "step": 27080 }, { "epoch": 0.912905726515892, "grad_norm": 20.82946014404297, "learning_rate": 2.292923564536664e-08, "logits/chosen": -1.9467077255249023, "logits/rejected": -2.043200731277466, "logps/chosen": -2.5897421836853027, "logps/rejected": -3.2874908447265625, "loss": 3.6496, "rewards/accuracies": 0.5, "rewards/chosen": -25.897424697875977, "rewards/margins": 6.9774885177612305, "rewards/rejected": -32.874908447265625, "step": 27085 }, { "epoch": 0.9130742525868751, "grad_norm": 83.79644775390625, "learning_rate": 2.284126728107677e-08, "logits/chosen": -1.6562871932983398, "logits/rejected": -1.8409792184829712, "logps/chosen": -2.63004207611084, "logps/rejected": -2.590404748916626, "loss": 4.0595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.3004207611084, "rewards/margins": -0.39637669920921326, "rewards/rejected": -25.9040470123291, "step": 27090 }, { "epoch": 0.9132427786578584, "grad_norm": 23.544960021972656, "learning_rate": 2.2753464042625015e-08, "logits/chosen": -1.9590803384780884, "logits/rejected": -2.0734570026397705, "logps/chosen": -2.8419137001037598, "logps/rejected": -3.1839821338653564, "loss": 2.4574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.41913414001465, "rewards/margins": 3.4206855297088623, "rewards/rejected": -31.839818954467773, "step": 27095 }, { "epoch": 0.9134113047288416, "grad_norm": 45.71376037597656, "learning_rate": 2.2665825960396624e-08, "logits/chosen": -1.905147910118103, "logits/rejected": -2.3859221935272217, "logps/chosen": -1.8513004779815674, "logps/rejected": -2.118018627166748, "loss": 1.5718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.513004302978516, "rewards/margins": 2.667184591293335, "rewards/rejected": -21.180187225341797, "step": 27100 }, { "epoch": 0.9135798307998247, "grad_norm": 52.81638717651367, "learning_rate": 2.257835306471967e-08, "logits/chosen": -2.118196487426758, "logits/rejected": -2.160170793533325, "logps/chosen": -2.148125648498535, "logps/rejected": -2.3048596382141113, "loss": 2.4082, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.481258392333984, "rewards/margins": 1.5673408508300781, "rewards/rejected": -23.04859733581543, "step": 27105 }, { "epoch": 0.9137483568708079, "grad_norm": 55.22969055175781, "learning_rate": 2.2491045385864993e-08, "logits/chosen": -1.9875249862670898, "logits/rejected": -2.3192827701568604, "logps/chosen": -3.277228593826294, "logps/rejected": -3.8430697917938232, "loss": 2.3816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.77228927612305, "rewards/margins": 5.658409595489502, "rewards/rejected": -38.430702209472656, "step": 27110 }, { "epoch": 0.9139168829417911, "grad_norm": 45.046607971191406, "learning_rate": 2.2403902954046427e-08, "logits/chosen": -1.7681725025177002, "logits/rejected": -1.7429002523422241, "logps/chosen": -2.472874641418457, "logps/rejected": -2.4763524532318115, "loss": 3.3829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.728748321533203, "rewards/margins": 0.0347774513065815, "rewards/rejected": -24.76352310180664, "step": 27115 }, { "epoch": 0.9140854090127742, "grad_norm": 22.940006256103516, "learning_rate": 2.2316925799420517e-08, "logits/chosen": -2.224156618118286, "logits/rejected": -2.271947145462036, "logps/chosen": -2.6618289947509766, "logps/rejected": -2.998880386352539, "loss": 2.5628, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.618289947509766, "rewards/margins": 3.370516300201416, "rewards/rejected": -29.988805770874023, "step": 27120 }, { "epoch": 0.9142539350837574, "grad_norm": 19.00252914428711, "learning_rate": 2.2230113952086626e-08, "logits/chosen": -1.829633116722107, "logits/rejected": -2.575827121734619, "logps/chosen": -2.6892051696777344, "logps/rejected": -3.115910291671753, "loss": 4.5612, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.89204978942871, "rewards/margins": 4.267054080963135, "rewards/rejected": -31.159103393554688, "step": 27125 }, { "epoch": 0.9144224611547407, "grad_norm": 25.019968032836914, "learning_rate": 2.2143467442086948e-08, "logits/chosen": -1.8367170095443726, "logits/rejected": -1.8480660915374756, "logps/chosen": -2.1454520225524902, "logps/rejected": -2.04133939743042, "loss": 4.9755, "rewards/accuracies": 0.5, "rewards/chosen": -21.454519271850586, "rewards/margins": -1.041122317314148, "rewards/rejected": -20.41339683532715, "step": 27130 }, { "epoch": 0.9145909872257239, "grad_norm": 72.38630676269531, "learning_rate": 2.205698629940639e-08, "logits/chosen": -1.979882836341858, "logits/rejected": -2.1404590606689453, "logps/chosen": -2.3046226501464844, "logps/rejected": -2.704655408859253, "loss": 1.9762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.046226501464844, "rewards/margins": 4.000330448150635, "rewards/rejected": -27.046558380126953, "step": 27135 }, { "epoch": 0.914759513296707, "grad_norm": 617.034423828125, "learning_rate": 2.1970670553972613e-08, "logits/chosen": -2.043015480041504, "logits/rejected": -1.733640432357788, "logps/chosen": -2.702033758163452, "logps/rejected": -2.0956649780273438, "loss": 9.1843, "rewards/accuracies": 0.5, "rewards/chosen": -27.020336151123047, "rewards/margins": -6.063687801361084, "rewards/rejected": -20.956649780273438, "step": 27140 }, { "epoch": 0.9149280393676902, "grad_norm": 37.748779296875, "learning_rate": 2.188452023565618e-08, "logits/chosen": -1.8703845739364624, "logits/rejected": -1.954056739807129, "logps/chosen": -2.447855234146118, "logps/rejected": -3.0836079120635986, "loss": 2.5521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.478551864624023, "rewards/margins": 6.357526779174805, "rewards/rejected": -30.836078643798828, "step": 27145 }, { "epoch": 0.9150965654386733, "grad_norm": 35.00310134887695, "learning_rate": 2.1798535374270345e-08, "logits/chosen": -1.4112383127212524, "logits/rejected": -1.43263840675354, "logps/chosen": -2.427001476287842, "logps/rejected": -2.7232165336608887, "loss": 2.4778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.270015716552734, "rewards/margins": 2.9621503353118896, "rewards/rejected": -27.232168197631836, "step": 27150 }, { "epoch": 0.9152650915096565, "grad_norm": 51.57535171508789, "learning_rate": 2.1712715999570974e-08, "logits/chosen": -2.230093479156494, "logits/rejected": -2.553774833679199, "logps/chosen": -3.5125930309295654, "logps/rejected": -4.026968955993652, "loss": 2.2864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.12593078613281, "rewards/margins": 5.1437578201293945, "rewards/rejected": -40.26968765258789, "step": 27155 }, { "epoch": 0.9154336175806397, "grad_norm": 79.23214721679688, "learning_rate": 2.1627062141256815e-08, "logits/chosen": -1.558318853378296, "logits/rejected": -1.3582528829574585, "logps/chosen": -2.803522825241089, "logps/rejected": -2.6959805488586426, "loss": 4.4538, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.035228729248047, "rewards/margins": -1.0754238367080688, "rewards/rejected": -26.95980453491211, "step": 27160 }, { "epoch": 0.915602143651623, "grad_norm": 14.498846054077148, "learning_rate": 2.154157382896943e-08, "logits/chosen": -1.7168614864349365, "logits/rejected": -2.117745876312256, "logps/chosen": -3.605928897857666, "logps/rejected": -4.526423454284668, "loss": 2.3218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -36.059288024902344, "rewards/margins": 9.204951286315918, "rewards/rejected": -45.26424026489258, "step": 27165 }, { "epoch": 0.9157706697226061, "grad_norm": 1.3730069398880005, "learning_rate": 2.145625109229271e-08, "logits/chosen": -2.0063352584838867, "logits/rejected": -2.3878989219665527, "logps/chosen": -2.6619865894317627, "logps/rejected": -3.246828079223633, "loss": 2.4983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.6198673248291, "rewards/margins": 5.848411560058594, "rewards/rejected": -32.46827697753906, "step": 27170 }, { "epoch": 0.9159391957935893, "grad_norm": 33.612762451171875, "learning_rate": 2.137109396075365e-08, "logits/chosen": -1.2773253917694092, "logits/rejected": -1.6957318782806396, "logps/chosen": -2.4628424644470215, "logps/rejected": -2.8134007453918457, "loss": 1.7423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.6284236907959, "rewards/margins": 3.5055809020996094, "rewards/rejected": -28.13400650024414, "step": 27175 }, { "epoch": 0.9161077218645725, "grad_norm": 9.232412338256836, "learning_rate": 2.1286102463821675e-08, "logits/chosen": -2.0160505771636963, "logits/rejected": -2.3261797428131104, "logps/chosen": -2.102545738220215, "logps/rejected": -2.440828323364258, "loss": 1.8755, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.025455474853516, "rewards/margins": 3.3828253746032715, "rewards/rejected": -24.408281326293945, "step": 27180 }, { "epoch": 0.9162762479355556, "grad_norm": 15.199311256408691, "learning_rate": 2.1201276630909203e-08, "logits/chosen": -2.2370071411132812, "logits/rejected": -2.714974880218506, "logps/chosen": -1.9336721897125244, "logps/rejected": -2.636110305786133, "loss": 1.5309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.336721420288086, "rewards/margins": 7.024382591247559, "rewards/rejected": -26.36110496520996, "step": 27185 }, { "epoch": 0.9164447740065388, "grad_norm": 39.4886589050293, "learning_rate": 2.1116616491370863e-08, "logits/chosen": -2.0533199310302734, "logits/rejected": -2.326814889907837, "logps/chosen": -2.4584240913391113, "logps/rejected": -2.459068775177002, "loss": 3.996, "rewards/accuracies": 0.5, "rewards/chosen": -24.584243774414062, "rewards/margins": 0.006443119142204523, "rewards/rejected": -24.590682983398438, "step": 27190 }, { "epoch": 0.916613300077522, "grad_norm": 26.44153594970703, "learning_rate": 2.1032122074504332e-08, "logits/chosen": -1.8153555393218994, "logits/rejected": -2.659576177597046, "logps/chosen": -2.7765450477600098, "logps/rejected": -4.102330207824707, "loss": 2.3122, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.76544761657715, "rewards/margins": 13.257855415344238, "rewards/rejected": -41.0233039855957, "step": 27195 }, { "epoch": 0.9167818261485051, "grad_norm": 50.18436813354492, "learning_rate": 2.094779340954983e-08, "logits/chosen": -2.3786637783050537, "logits/rejected": -2.708362102508545, "logps/chosen": -2.495008945465088, "logps/rejected": -3.038865804672241, "loss": 2.0584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.950092315673828, "rewards/margins": 5.438567161560059, "rewards/rejected": -30.388656616210938, "step": 27200 }, { "epoch": 0.9167818261485051, "eval_logits/chosen": -2.3133950233459473, "eval_logits/rejected": -2.4917452335357666, "eval_logps/chosen": -2.2887940406799316, "eval_logps/rejected": -2.442683219909668, "eval_loss": 3.089421272277832, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.887939453125, "eval_rewards/margins": 1.5388928651809692, "eval_rewards/rejected": -24.426830291748047, "eval_runtime": 12.895, "eval_samples_per_second": 7.755, "eval_steps_per_second": 1.939, "step": 27200 }, { "epoch": 0.9169503522194884, "grad_norm": 1.289928913116455, "learning_rate": 2.0863630525690066e-08, "logits/chosen": -1.750335931777954, "logits/rejected": -2.047384738922119, "logps/chosen": -2.2318363189697266, "logps/rejected": -2.868582248687744, "loss": 2.0977, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.318363189697266, "rewards/margins": 6.367456912994385, "rewards/rejected": -28.685821533203125, "step": 27205 }, { "epoch": 0.9171188782904716, "grad_norm": 8.582649230957031, "learning_rate": 2.0779633452050526e-08, "logits/chosen": -1.8326599597930908, "logits/rejected": -2.413167953491211, "logps/chosen": -2.6722311973571777, "logps/rejected": -3.0735504627227783, "loss": 1.4892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.722314834594727, "rewards/margins": 4.013188362121582, "rewards/rejected": -30.73550033569336, "step": 27210 }, { "epoch": 0.9172874043614547, "grad_norm": 20.33465576171875, "learning_rate": 2.0695802217699344e-08, "logits/chosen": -2.121351718902588, "logits/rejected": -2.4303436279296875, "logps/chosen": -2.191392421722412, "logps/rejected": -2.4328086376190186, "loss": 1.7268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.913923263549805, "rewards/margins": 2.414163112640381, "rewards/rejected": -24.32808494567871, "step": 27215 }, { "epoch": 0.9174559304324379, "grad_norm": 35.218204498291016, "learning_rate": 2.0612136851647255e-08, "logits/chosen": -1.718423843383789, "logits/rejected": -2.4133172035217285, "logps/chosen": -2.11830735206604, "logps/rejected": -2.890991687774658, "loss": 1.1518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.183073043823242, "rewards/margins": 7.72684383392334, "rewards/rejected": -28.909915924072266, "step": 27220 }, { "epoch": 0.9176244565034211, "grad_norm": 75.08181762695312, "learning_rate": 2.052863738284738e-08, "logits/chosen": -2.1429359912872314, "logits/rejected": -2.147839069366455, "logps/chosen": -3.5561728477478027, "logps/rejected": -3.924323558807373, "loss": 2.0425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.561729431152344, "rewards/margins": 3.681507110595703, "rewards/rejected": -39.24324035644531, "step": 27225 }, { "epoch": 0.9177929825744042, "grad_norm": 186.15933227539062, "learning_rate": 2.0445303840195717e-08, "logits/chosen": -2.096203327178955, "logits/rejected": -2.370142936706543, "logps/chosen": -3.1219077110290527, "logps/rejected": -3.7952494621276855, "loss": 1.9524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.219079971313477, "rewards/margins": 6.733415126800537, "rewards/rejected": -37.952491760253906, "step": 27230 }, { "epoch": 0.9179615086453874, "grad_norm": 32.30589294433594, "learning_rate": 2.0362136252530748e-08, "logits/chosen": -1.6504993438720703, "logits/rejected": -1.96540105342865, "logps/chosen": -2.1283984184265137, "logps/rejected": -2.554733991622925, "loss": 1.497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.283985137939453, "rewards/margins": 4.263354301452637, "rewards/rejected": -25.547340393066406, "step": 27235 }, { "epoch": 0.9181300347163707, "grad_norm": 36.14775466918945, "learning_rate": 2.02791346486334e-08, "logits/chosen": -1.7485713958740234, "logits/rejected": -1.8560590744018555, "logps/chosen": -2.0910072326660156, "logps/rejected": -2.087937116622925, "loss": 3.6899, "rewards/accuracies": 0.5, "rewards/chosen": -20.91007423400879, "rewards/margins": -0.030699919909238815, "rewards/rejected": -20.87937355041504, "step": 27240 }, { "epoch": 0.9182985607873538, "grad_norm": 20.150341033935547, "learning_rate": 2.019629905722725e-08, "logits/chosen": -2.292171001434326, "logits/rejected": -2.4348361492156982, "logps/chosen": -2.8926589488983154, "logps/rejected": -3.336768388748169, "loss": 1.9384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.926589965820312, "rewards/margins": 4.441091060638428, "rewards/rejected": -33.36768341064453, "step": 27245 }, { "epoch": 0.918467086858337, "grad_norm": 41.496490478515625, "learning_rate": 2.0113629506978536e-08, "logits/chosen": -1.8662121295928955, "logits/rejected": -2.5918049812316895, "logps/chosen": -2.452751874923706, "logps/rejected": -3.2109732627868652, "loss": 1.6352, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.52752113342285, "rewards/margins": 7.582211494445801, "rewards/rejected": -32.10973358154297, "step": 27250 }, { "epoch": 0.9186356129293202, "grad_norm": 12.877457618713379, "learning_rate": 2.0031126026495872e-08, "logits/chosen": -1.6692726612091064, "logits/rejected": -1.9603191614151, "logps/chosen": -2.6382012367248535, "logps/rejected": -3.2847511768341064, "loss": 0.9585, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.382009506225586, "rewards/margins": 6.4654998779296875, "rewards/rejected": -32.847511291503906, "step": 27255 }, { "epoch": 0.9188041390003033, "grad_norm": 27.91530418395996, "learning_rate": 1.9948788644330473e-08, "logits/chosen": -1.5431195497512817, "logits/rejected": -1.6130651235580444, "logps/chosen": -2.2890515327453613, "logps/rejected": -2.5770554542541504, "loss": 2.7111, "rewards/accuracies": 0.5, "rewards/chosen": -22.890514373779297, "rewards/margins": 2.8800418376922607, "rewards/rejected": -25.770557403564453, "step": 27260 }, { "epoch": 0.9189726650712865, "grad_norm": 28.36544418334961, "learning_rate": 1.9866617388976047e-08, "logits/chosen": -1.4692294597625732, "logits/rejected": -1.737244963645935, "logps/chosen": -1.9486863613128662, "logps/rejected": -2.105894088745117, "loss": 2.7634, "rewards/accuracies": 0.5, "rewards/chosen": -19.486865997314453, "rewards/margins": 1.572076439857483, "rewards/rejected": -21.058942794799805, "step": 27265 }, { "epoch": 0.9191411911422697, "grad_norm": 27.256328582763672, "learning_rate": 1.9784612288868907e-08, "logits/chosen": -2.2276055812835693, "logits/rejected": -2.251549243927002, "logps/chosen": -1.954451322555542, "logps/rejected": -2.037261486053467, "loss": 2.6429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.544513702392578, "rewards/margins": 0.8281000852584839, "rewards/rejected": -20.37261390686035, "step": 27270 }, { "epoch": 0.9193097172132529, "grad_norm": 25.21340560913086, "learning_rate": 1.970277337238768e-08, "logits/chosen": -2.331439256668091, "logits/rejected": -2.3402552604675293, "logps/chosen": -2.5144410133361816, "logps/rejected": -2.983175754547119, "loss": 2.0745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.1444091796875, "rewards/margins": 4.687346935272217, "rewards/rejected": -29.831756591796875, "step": 27275 }, { "epoch": 0.9194782432842361, "grad_norm": 61.691226959228516, "learning_rate": 1.962110066785361e-08, "logits/chosen": -1.3510804176330566, "logits/rejected": -1.6053918600082397, "logps/chosen": -2.667389392852783, "logps/rejected": -2.8345603942871094, "loss": 2.735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.673892974853516, "rewards/margins": 1.6717100143432617, "rewards/rejected": -28.345605850219727, "step": 27280 }, { "epoch": 0.9196467693552193, "grad_norm": 22.086963653564453, "learning_rate": 1.9539594203530464e-08, "logits/chosen": -1.633506178855896, "logits/rejected": -1.676327109336853, "logps/chosen": -2.520235300064087, "logps/rejected": -2.5373756885528564, "loss": 3.0548, "rewards/accuracies": 0.5, "rewards/chosen": -25.20235252380371, "rewards/margins": 0.17140674591064453, "rewards/rejected": -25.373760223388672, "step": 27285 }, { "epoch": 0.9198152954262024, "grad_norm": 72.09431457519531, "learning_rate": 1.945825400762435e-08, "logits/chosen": -1.7755218744277954, "logits/rejected": -1.8230489492416382, "logps/chosen": -2.6445462703704834, "logps/rejected": -2.7068397998809814, "loss": 2.7089, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.445465087890625, "rewards/margins": 0.6229327321052551, "rewards/rejected": -27.068395614624023, "step": 27290 }, { "epoch": 0.9199838214971856, "grad_norm": 27.323516845703125, "learning_rate": 1.937708010828393e-08, "logits/chosen": -1.8622583150863647, "logits/rejected": -2.1557297706604004, "logps/chosen": -2.5883989334106445, "logps/rejected": -2.788325071334839, "loss": 2.2512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.883991241455078, "rewards/margins": 1.999259352684021, "rewards/rejected": -27.883249282836914, "step": 27295 }, { "epoch": 0.9201523475681688, "grad_norm": 17.495512008666992, "learning_rate": 1.9296072533600326e-08, "logits/chosen": -1.857142448425293, "logits/rejected": -2.222248077392578, "logps/chosen": -2.382188081741333, "logps/rejected": -2.569136381149292, "loss": 2.7625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.82187843322754, "rewards/margins": 1.86948561668396, "rewards/rejected": -25.69136619567871, "step": 27300 }, { "epoch": 0.9203208736391519, "grad_norm": 50.69965362548828, "learning_rate": 1.921523131160707e-08, "logits/chosen": -2.3246703147888184, "logits/rejected": -2.7445061206817627, "logps/chosen": -2.288409471511841, "logps/rejected": -3.077025890350342, "loss": 3.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.88409423828125, "rewards/margins": 7.886164665222168, "rewards/rejected": -30.7702579498291, "step": 27305 }, { "epoch": 0.9204893997101351, "grad_norm": 28.7850284576416, "learning_rate": 1.913455647028006e-08, "logits/chosen": -2.21712327003479, "logits/rejected": -2.5007641315460205, "logps/chosen": -3.346345901489258, "logps/rejected": -3.8853752613067627, "loss": 3.0171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.463462829589844, "rewards/margins": 5.390293121337891, "rewards/rejected": -38.85375213623047, "step": 27310 }, { "epoch": 0.9206579257811184, "grad_norm": 19.60544776916504, "learning_rate": 1.9054048037537683e-08, "logits/chosen": -1.5601160526275635, "logits/rejected": -2.036533832550049, "logps/chosen": -2.6824698448181152, "logps/rejected": -3.735724925994873, "loss": 1.3956, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.8247013092041, "rewards/margins": 10.532548904418945, "rewards/rejected": -37.35724639892578, "step": 27315 }, { "epoch": 0.9208264518521015, "grad_norm": 33.12553787231445, "learning_rate": 1.8973706041240824e-08, "logits/chosen": -1.1942306756973267, "logits/rejected": -1.300018072128296, "logps/chosen": -2.2763888835906982, "logps/rejected": -2.3771214485168457, "loss": 2.6452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.76388931274414, "rewards/margins": 1.0073254108428955, "rewards/rejected": -23.77121353149414, "step": 27320 }, { "epoch": 0.9209949779230847, "grad_norm": 30.314062118530273, "learning_rate": 1.889353050919257e-08, "logits/chosen": -1.6723153591156006, "logits/rejected": -1.867597222328186, "logps/chosen": -1.978061318397522, "logps/rejected": -2.1816296577453613, "loss": 2.0179, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.780614852905273, "rewards/margins": 2.0356831550598145, "rewards/rejected": -21.816295623779297, "step": 27325 }, { "epoch": 0.9211635039940679, "grad_norm": 33.596317291259766, "learning_rate": 1.881352146913856e-08, "logits/chosen": -1.8963005542755127, "logits/rejected": -1.8901876211166382, "logps/chosen": -2.6146950721740723, "logps/rejected": -2.9768056869506836, "loss": 3.3404, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.14695167541504, "rewards/margins": 3.621105909347534, "rewards/rejected": -29.768056869506836, "step": 27330 }, { "epoch": 0.921332030065051, "grad_norm": 22.420825958251953, "learning_rate": 1.8733678948766816e-08, "logits/chosen": -2.086516857147217, "logits/rejected": -1.8573553562164307, "logps/chosen": -2.346881628036499, "logps/rejected": -2.5964813232421875, "loss": 1.9698, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.46881675720215, "rewards/margins": 2.4959959983825684, "rewards/rejected": -25.964813232421875, "step": 27335 }, { "epoch": 0.9215005561360342, "grad_norm": 74.21624755859375, "learning_rate": 1.8654002975707684e-08, "logits/chosen": -1.608689546585083, "logits/rejected": -1.6286035776138306, "logps/chosen": -2.3946728706359863, "logps/rejected": -2.4410223960876465, "loss": 2.7656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.946725845336914, "rewards/margins": 0.46349868178367615, "rewards/rejected": -24.410226821899414, "step": 27340 }, { "epoch": 0.9216690822070174, "grad_norm": 50.7723274230957, "learning_rate": 1.8574493577533768e-08, "logits/chosen": -2.1765570640563965, "logits/rejected": -1.717564344406128, "logps/chosen": -2.544821262359619, "logps/rejected": -2.759453535079956, "loss": 2.9138, "rewards/accuracies": 0.5, "rewards/chosen": -25.448213577270508, "rewards/margins": 2.146322727203369, "rewards/rejected": -27.59453773498535, "step": 27345 }, { "epoch": 0.9218376082780007, "grad_norm": 60.63186264038086, "learning_rate": 1.8495150781760283e-08, "logits/chosen": -1.9756433963775635, "logits/rejected": -2.058195114135742, "logps/chosen": -1.8520549535751343, "logps/rejected": -1.9105947017669678, "loss": 2.9764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.520549774169922, "rewards/margins": 0.5853978991508484, "rewards/rejected": -19.105945587158203, "step": 27350 }, { "epoch": 0.9220061343489838, "grad_norm": 36.562808990478516, "learning_rate": 1.8415974615844598e-08, "logits/chosen": -2.3384909629821777, "logits/rejected": -2.3781991004943848, "logps/chosen": -2.5374786853790283, "logps/rejected": -2.66489839553833, "loss": 2.4687, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.374786376953125, "rewards/margins": 1.2741988897323608, "rewards/rejected": -26.64898681640625, "step": 27355 }, { "epoch": 0.922174660419967, "grad_norm": 33.754371643066406, "learning_rate": 1.8336965107186354e-08, "logits/chosen": -1.54237961769104, "logits/rejected": -1.7503583431243896, "logps/chosen": -2.4724485874176025, "logps/rejected": -2.766716480255127, "loss": 2.126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.724483489990234, "rewards/margins": 2.9426774978637695, "rewards/rejected": -27.667163848876953, "step": 27360 }, { "epoch": 0.9223431864909502, "grad_norm": 28.843944549560547, "learning_rate": 1.8258122283127787e-08, "logits/chosen": -1.7064812183380127, "logits/rejected": -1.896023154258728, "logps/chosen": -2.667555332183838, "logps/rejected": -2.815514087677002, "loss": 2.5606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.675552368164062, "rewards/margins": 1.4795873165130615, "rewards/rejected": -28.155141830444336, "step": 27365 }, { "epoch": 0.9225117125619333, "grad_norm": 79.54619598388672, "learning_rate": 1.8179446170953182e-08, "logits/chosen": -1.9791038036346436, "logits/rejected": -1.836622953414917, "logps/chosen": -2.6727569103240967, "logps/rejected": -2.673759698867798, "loss": 3.3764, "rewards/accuracies": 0.5, "rewards/chosen": -26.727569580078125, "rewards/margins": 0.010026884265244007, "rewards/rejected": -26.737598419189453, "step": 27370 }, { "epoch": 0.9226802386329165, "grad_norm": 0.006284057628363371, "learning_rate": 1.810093679788932e-08, "logits/chosen": -1.9792457818984985, "logits/rejected": -2.220228433609009, "logps/chosen": -2.9561426639556885, "logps/rejected": -3.098778486251831, "loss": 6.8838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.561426162719727, "rewards/margins": 1.4263594150543213, "rewards/rejected": -30.9877872467041, "step": 27375 }, { "epoch": 0.9228487647038996, "grad_norm": 38.517547607421875, "learning_rate": 1.8022594191105133e-08, "logits/chosen": -1.7412769794464111, "logits/rejected": -2.1089096069335938, "logps/chosen": -2.4893736839294434, "logps/rejected": -3.0103344917297363, "loss": 1.5992, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.893733978271484, "rewards/margins": 5.209610939025879, "rewards/rejected": -30.103343963623047, "step": 27380 }, { "epoch": 0.9230172907748829, "grad_norm": 15.737530708312988, "learning_rate": 1.794441837771199e-08, "logits/chosen": -2.132612705230713, "logits/rejected": -1.8385169506072998, "logps/chosen": -2.0549371242523193, "logps/rejected": -2.159449338912964, "loss": 3.5841, "rewards/accuracies": 0.5, "rewards/chosen": -20.54937171936035, "rewards/margins": 1.045123815536499, "rewards/rejected": -21.594493865966797, "step": 27385 }, { "epoch": 0.9231858168458661, "grad_norm": 42.83122634887695, "learning_rate": 1.786640938476336e-08, "logits/chosen": -1.5979821681976318, "logits/rejected": -1.888920545578003, "logps/chosen": -2.3727355003356934, "logps/rejected": -2.5195016860961914, "loss": 2.7367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.727354049682617, "rewards/margins": 1.4676618576049805, "rewards/rejected": -25.195018768310547, "step": 27390 }, { "epoch": 0.9233543429168493, "grad_norm": 31.570093154907227, "learning_rate": 1.778856723925515e-08, "logits/chosen": -1.9060827493667603, "logits/rejected": -1.9400758743286133, "logps/chosen": -2.1560046672821045, "logps/rejected": -2.0583724975585938, "loss": 5.3615, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.560047149658203, "rewards/margins": -0.9763216972351074, "rewards/rejected": -20.58372688293457, "step": 27395 }, { "epoch": 0.9235228689878324, "grad_norm": 50.94015884399414, "learning_rate": 1.771089196812542e-08, "logits/chosen": -1.942318320274353, "logits/rejected": -2.1093525886535645, "logps/chosen": -2.1805660724639893, "logps/rejected": -2.2652573585510254, "loss": 2.756, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.805662155151367, "rewards/margins": 0.8469133377075195, "rewards/rejected": -22.65257453918457, "step": 27400 }, { "epoch": 0.9236913950588156, "grad_norm": 301.8170166015625, "learning_rate": 1.763338359825467e-08, "logits/chosen": -1.502886414527893, "logits/rejected": -1.5221701860427856, "logps/chosen": -2.850468873977661, "logps/rejected": -2.600606679916382, "loss": 6.5051, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -28.504688262939453, "rewards/margins": -2.498622417449951, "rewards/rejected": -26.006067276000977, "step": 27405 }, { "epoch": 0.9238599211297988, "grad_norm": 33.5927848815918, "learning_rate": 1.7556042156465278e-08, "logits/chosen": -2.5625860691070557, "logits/rejected": -3.1480767726898193, "logps/chosen": -1.8674657344818115, "logps/rejected": -2.3821492195129395, "loss": 1.9451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.674657821655273, "rewards/margins": 5.1468329429626465, "rewards/rejected": -23.821491241455078, "step": 27410 }, { "epoch": 0.9240284472007819, "grad_norm": 68.57820892333984, "learning_rate": 1.747886766952217e-08, "logits/chosen": -1.8768354654312134, "logits/rejected": -2.275322437286377, "logps/chosen": -2.3458003997802734, "logps/rejected": -3.2828738689422607, "loss": 2.3436, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.458003997802734, "rewards/margins": 9.370733261108398, "rewards/rejected": -32.828739166259766, "step": 27415 }, { "epoch": 0.9241969732717651, "grad_norm": 8.633049964904785, "learning_rate": 1.7401860164132364e-08, "logits/chosen": -1.939244270324707, "logits/rejected": -1.7692468166351318, "logps/chosen": -3.509154796600342, "logps/rejected": -2.917518138885498, "loss": 11.9724, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -35.09154510498047, "rewards/margins": -5.916362762451172, "rewards/rejected": -29.175182342529297, "step": 27420 }, { "epoch": 0.9243654993427484, "grad_norm": 0.3071482479572296, "learning_rate": 1.7325019666945217e-08, "logits/chosen": -2.091637134552002, "logits/rejected": -2.435678005218506, "logps/chosen": -2.8514938354492188, "logps/rejected": -3.5164198875427246, "loss": 2.0882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.514938354492188, "rewards/margins": 6.649260520935059, "rewards/rejected": -35.16419982910156, "step": 27425 }, { "epoch": 0.9245340254137315, "grad_norm": 31.114391326904297, "learning_rate": 1.7248346204552065e-08, "logits/chosen": -2.094665050506592, "logits/rejected": -2.374276638031006, "logps/chosen": -2.6455271244049072, "logps/rejected": -3.161177158355713, "loss": 2.3138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.455270767211914, "rewards/margins": 5.156498908996582, "rewards/rejected": -31.611770629882812, "step": 27430 }, { "epoch": 0.9247025514847147, "grad_norm": 114.97472381591797, "learning_rate": 1.717183980348663e-08, "logits/chosen": -2.3799057006835938, "logits/rejected": -2.057694911956787, "logps/chosen": -2.764962673187256, "logps/rejected": -2.583735227584839, "loss": 6.0765, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.649627685546875, "rewards/margins": -1.8122737407684326, "rewards/rejected": -25.837352752685547, "step": 27435 }, { "epoch": 0.9248710775556979, "grad_norm": 151.6468505859375, "learning_rate": 1.709550049022479e-08, "logits/chosen": -1.7874248027801514, "logits/rejected": -1.8235124349594116, "logps/chosen": -3.8881237506866455, "logps/rejected": -5.001030445098877, "loss": 2.4004, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -38.8812370300293, "rewards/margins": 11.12906551361084, "rewards/rejected": -50.01030731201172, "step": 27440 }, { "epoch": 0.925039603626681, "grad_norm": 45.867191314697266, "learning_rate": 1.7019328291184632e-08, "logits/chosen": -1.7721633911132812, "logits/rejected": -1.5765838623046875, "logps/chosen": -2.182168483734131, "logps/rejected": -2.279179096221924, "loss": 2.5337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.821685791015625, "rewards/margins": 0.9701029062271118, "rewards/rejected": -22.79178810119629, "step": 27445 }, { "epoch": 0.9252081296976642, "grad_norm": 28.50269889831543, "learning_rate": 1.6943323232726182e-08, "logits/chosen": -1.6506497859954834, "logits/rejected": -1.994645357131958, "logps/chosen": -2.5445361137390137, "logps/rejected": -2.7690863609313965, "loss": 2.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.44536018371582, "rewards/margins": 2.2455050945281982, "rewards/rejected": -27.69086265563965, "step": 27450 }, { "epoch": 0.9253766557686474, "grad_norm": 23.46358871459961, "learning_rate": 1.68674853411519e-08, "logits/chosen": -1.7911182641983032, "logits/rejected": -2.173764705657959, "logps/chosen": -3.1003613471984863, "logps/rejected": -3.6647541522979736, "loss": 1.7456, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -31.003612518310547, "rewards/margins": 5.643929481506348, "rewards/rejected": -36.647544860839844, "step": 27455 }, { "epoch": 0.9255451818396306, "grad_norm": 158.92230224609375, "learning_rate": 1.6791814642706292e-08, "logits/chosen": -2.188016653060913, "logits/rejected": -2.542320489883423, "logps/chosen": -2.734858989715576, "logps/rejected": -2.99385929107666, "loss": 2.6771, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.348590850830078, "rewards/margins": 2.5900049209594727, "rewards/rejected": -29.9385929107666, "step": 27460 }, { "epoch": 0.9257137079106138, "grad_norm": 35.6912841796875, "learning_rate": 1.6716311163575967e-08, "logits/chosen": -1.6770681142807007, "logits/rejected": -1.688140630722046, "logps/chosen": -2.8789820671081543, "logps/rejected": -2.7300705909729004, "loss": 5.1186, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.789819717407227, "rewards/margins": -1.48911452293396, "rewards/rejected": -27.300708770751953, "step": 27465 }, { "epoch": 0.925882233981597, "grad_norm": 137.83705139160156, "learning_rate": 1.664097492988975e-08, "logits/chosen": -2.104797840118408, "logits/rejected": -2.3004043102264404, "logps/chosen": -3.1804895401000977, "logps/rejected": -3.556551456451416, "loss": 2.7026, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.804895401000977, "rewards/margins": 3.7606215476989746, "rewards/rejected": -35.565513610839844, "step": 27470 }, { "epoch": 0.9260507600525801, "grad_norm": 92.81694793701172, "learning_rate": 1.6565805967718504e-08, "logits/chosen": -2.285025119781494, "logits/rejected": -2.4701247215270996, "logps/chosen": -2.6418814659118652, "logps/rejected": -3.3334007263183594, "loss": 2.3526, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.4188175201416, "rewards/margins": 6.915187835693359, "rewards/rejected": -33.33400344848633, "step": 27475 }, { "epoch": 0.9262192861235633, "grad_norm": 45.628963470458984, "learning_rate": 1.649080430307537e-08, "logits/chosen": -1.9886986017227173, "logits/rejected": -2.3399429321289062, "logps/chosen": -2.4697680473327637, "logps/rejected": -2.8362088203430176, "loss": 2.7542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.69767951965332, "rewards/margins": 3.6644082069396973, "rewards/rejected": -28.36208724975586, "step": 27480 }, { "epoch": 0.9263878121945465, "grad_norm": 53.76228713989258, "learning_rate": 1.6415969961915245e-08, "logits/chosen": -2.2061126232147217, "logits/rejected": -2.1689679622650146, "logps/chosen": -3.3630974292755127, "logps/rejected": -3.4911270141601562, "loss": 3.0385, "rewards/accuracies": 0.5, "rewards/chosen": -33.6309700012207, "rewards/margins": 1.2802989482879639, "rewards/rejected": -34.91127014160156, "step": 27485 }, { "epoch": 0.9265563382655296, "grad_norm": 84.9283447265625, "learning_rate": 1.6341302970135472e-08, "logits/chosen": -1.81033456325531, "logits/rejected": -1.845776915550232, "logps/chosen": -3.6270880699157715, "logps/rejected": -4.127110481262207, "loss": 2.1203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -36.27088165283203, "rewards/margins": 5.00022029876709, "rewards/rejected": -41.2711067199707, "step": 27490 }, { "epoch": 0.9267248643365129, "grad_norm": 34.078857421875, "learning_rate": 1.6266803353575444e-08, "logits/chosen": -1.784132957458496, "logits/rejected": -1.6151421070098877, "logps/chosen": -2.0967466831207275, "logps/rejected": -2.0627224445343018, "loss": 3.4931, "rewards/accuracies": 0.5, "rewards/chosen": -20.967464447021484, "rewards/margins": -0.3402392268180847, "rewards/rejected": -20.627225875854492, "step": 27495 }, { "epoch": 0.9268933904074961, "grad_norm": 36.228755950927734, "learning_rate": 1.619247113801636e-08, "logits/chosen": -1.7772010564804077, "logits/rejected": -1.9724609851837158, "logps/chosen": -2.2621092796325684, "logps/rejected": -2.5008437633514404, "loss": 2.521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.62109375, "rewards/margins": 2.387342929840088, "rewards/rejected": -25.008438110351562, "step": 27500 }, { "epoch": 0.9270619164784792, "grad_norm": 130.93255615234375, "learning_rate": 1.6118306349181766e-08, "logits/chosen": -1.3324534893035889, "logits/rejected": -1.5228675603866577, "logps/chosen": -2.476607322692871, "logps/rejected": -2.4428625106811523, "loss": 3.8933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.766071319580078, "rewards/margins": -0.3374488949775696, "rewards/rejected": -24.428625106811523, "step": 27505 }, { "epoch": 0.9272304425494624, "grad_norm": 26.975387573242188, "learning_rate": 1.604430901273718e-08, "logits/chosen": -1.7993533611297607, "logits/rejected": -2.2579345703125, "logps/chosen": -2.5446598529815674, "logps/rejected": -3.0560150146484375, "loss": 2.2713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.446598052978516, "rewards/margins": 5.113553047180176, "rewards/rejected": -30.560150146484375, "step": 27510 }, { "epoch": 0.9273989686204456, "grad_norm": 31.938961029052734, "learning_rate": 1.5970479154290228e-08, "logits/chosen": -1.9177414178848267, "logits/rejected": -2.183879852294922, "logps/chosen": -2.496314764022827, "logps/rejected": -2.7273929119110107, "loss": 1.8621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.963146209716797, "rewards/margins": 2.310781955718994, "rewards/rejected": -27.273929595947266, "step": 27515 }, { "epoch": 0.9275674946914287, "grad_norm": 17.566181182861328, "learning_rate": 1.5896816799390313e-08, "logits/chosen": -2.3122284412384033, "logits/rejected": -2.619051456451416, "logps/chosen": -2.220881700515747, "logps/rejected": -2.4604485034942627, "loss": 2.3981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.208816528320312, "rewards/margins": 2.3956680297851562, "rewards/rejected": -24.604482650756836, "step": 27520 }, { "epoch": 0.9277360207624119, "grad_norm": 25.077037811279297, "learning_rate": 1.5823321973529256e-08, "logits/chosen": -1.534099817276001, "logits/rejected": -1.9863879680633545, "logps/chosen": -2.358691692352295, "logps/rejected": -2.79862117767334, "loss": 2.2013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.586915969848633, "rewards/margins": 4.399293422698975, "rewards/rejected": -27.986209869384766, "step": 27525 }, { "epoch": 0.9279045468333951, "grad_norm": 126.60164642333984, "learning_rate": 1.5749994702140666e-08, "logits/chosen": -2.381648540496826, "logits/rejected": -2.3383431434631348, "logps/chosen": -2.6393046379089355, "logps/rejected": -2.7791647911071777, "loss": 3.071, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.39304542541504, "rewards/margins": 1.3986046314239502, "rewards/rejected": -27.791650772094727, "step": 27530 }, { "epoch": 0.9280730729043783, "grad_norm": 25.092164993286133, "learning_rate": 1.5676835010600242e-08, "logits/chosen": -2.014479160308838, "logits/rejected": -2.1772711277008057, "logps/chosen": -2.1367969512939453, "logps/rejected": -2.154160261154175, "loss": 3.3433, "rewards/accuracies": 0.5, "rewards/chosen": -21.367969512939453, "rewards/margins": 0.1736338585615158, "rewards/rejected": -21.54160499572754, "step": 27535 }, { "epoch": 0.9282415989753615, "grad_norm": 44.35649108886719, "learning_rate": 1.560384292422562e-08, "logits/chosen": -1.437814712524414, "logits/rejected": -1.5548362731933594, "logps/chosen": -2.0173325538635254, "logps/rejected": -2.0877983570098877, "loss": 2.8418, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.173322677612305, "rewards/margins": 0.7046583294868469, "rewards/rejected": -20.877981185913086, "step": 27540 }, { "epoch": 0.9284101250463447, "grad_norm": 53.693397521972656, "learning_rate": 1.553101846827648e-08, "logits/chosen": -1.4574278593063354, "logits/rejected": -2.2143871784210205, "logps/chosen": -2.498276472091675, "logps/rejected": -3.071702480316162, "loss": 3.0937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.982765197753906, "rewards/margins": 5.734262943267822, "rewards/rejected": -30.717029571533203, "step": 27545 }, { "epoch": 0.9285786511173278, "grad_norm": 20.134761810302734, "learning_rate": 1.5458361667954612e-08, "logits/chosen": -1.9929225444793701, "logits/rejected": -2.411076307296753, "logps/chosen": -2.3681139945983887, "logps/rejected": -2.5899434089660645, "loss": 2.8595, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.681140899658203, "rewards/margins": 2.218296527862549, "rewards/rejected": -25.899438858032227, "step": 27550 }, { "epoch": 0.928747177188311, "grad_norm": 19.580636978149414, "learning_rate": 1.5385872548403513e-08, "logits/chosen": -1.4577335119247437, "logits/rejected": -1.5620262622833252, "logps/chosen": -2.7865166664123535, "logps/rejected": -2.948183536529541, "loss": 2.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.86516761779785, "rewards/margins": 1.616668701171875, "rewards/rejected": -29.48183250427246, "step": 27555 }, { "epoch": 0.9289157032592942, "grad_norm": 61.952980041503906, "learning_rate": 1.531355113470889e-08, "logits/chosen": -2.2046194076538086, "logits/rejected": -2.318603038787842, "logps/chosen": -2.6069815158843994, "logps/rejected": -2.8184146881103516, "loss": 3.1143, "rewards/accuracies": 0.5, "rewards/chosen": -26.069812774658203, "rewards/margins": 2.114332675933838, "rewards/rejected": -28.184146881103516, "step": 27560 }, { "epoch": 0.9290842293302773, "grad_norm": 11.280858039855957, "learning_rate": 1.524139745189845e-08, "logits/chosen": -1.6384538412094116, "logits/rejected": -1.7444041967391968, "logps/chosen": -2.4805355072021484, "logps/rejected": -2.4746363162994385, "loss": 3.8022, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.805355072021484, "rewards/margins": -0.058992959558963776, "rewards/rejected": -24.746362686157227, "step": 27565 }, { "epoch": 0.9292527554012606, "grad_norm": 61.026634216308594, "learning_rate": 1.5169411524941556e-08, "logits/chosen": -2.0152387619018555, "logits/rejected": -2.6300861835479736, "logps/chosen": -1.8867985010147095, "logps/rejected": -2.0875132083892822, "loss": 3.0549, "rewards/accuracies": 0.5, "rewards/chosen": -18.867984771728516, "rewards/margins": 2.0071449279785156, "rewards/rejected": -20.8751277923584, "step": 27570 }, { "epoch": 0.9294212814722438, "grad_norm": 45.335079193115234, "learning_rate": 1.5097593378749717e-08, "logits/chosen": -1.8455007076263428, "logits/rejected": -1.8004382848739624, "logps/chosen": -2.500025987625122, "logps/rejected": -2.3104910850524902, "loss": 5.0566, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -25.000259399414062, "rewards/margins": -1.895347237586975, "rewards/rejected": -23.10491180419922, "step": 27575 }, { "epoch": 0.929589807543227, "grad_norm": 10.588850975036621, "learning_rate": 1.5025943038176447e-08, "logits/chosen": -1.7851600646972656, "logits/rejected": -2.0314362049102783, "logps/chosen": -1.9325587749481201, "logps/rejected": -2.1342849731445312, "loss": 2.7042, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.32558822631836, "rewards/margins": 2.017261028289795, "rewards/rejected": -21.342849731445312, "step": 27580 }, { "epoch": 0.9297583336142101, "grad_norm": 27.62746810913086, "learning_rate": 1.4954460528017132e-08, "logits/chosen": -1.3435360193252563, "logits/rejected": -1.7085888385772705, "logps/chosen": -2.0199694633483887, "logps/rejected": -2.219791889190674, "loss": 2.4459, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.19969367980957, "rewards/margins": 1.9982258081436157, "rewards/rejected": -22.197919845581055, "step": 27585 }, { "epoch": 0.9299268596851933, "grad_norm": 147.57992553710938, "learning_rate": 1.4883145873008984e-08, "logits/chosen": -1.373422384262085, "logits/rejected": -2.1292011737823486, "logps/chosen": -3.485476016998291, "logps/rejected": -4.046762466430664, "loss": 3.336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.85475540161133, "rewards/margins": 5.6128668785095215, "rewards/rejected": -40.467628479003906, "step": 27590 }, { "epoch": 0.9300953857561765, "grad_norm": 27.67316436767578, "learning_rate": 1.4811999097831151e-08, "logits/chosen": -1.812901258468628, "logits/rejected": -1.9904277324676514, "logps/chosen": -2.89308500289917, "logps/rejected": -3.0367727279663086, "loss": 4.4566, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.930850982666016, "rewards/margins": 1.436877965927124, "rewards/rejected": -30.367727279663086, "step": 27595 }, { "epoch": 0.9302639118271596, "grad_norm": 54.67453384399414, "learning_rate": 1.4741020227104883e-08, "logits/chosen": -1.3734403848648071, "logits/rejected": -1.4352593421936035, "logps/chosen": -2.109225034713745, "logps/rejected": -2.2077791690826416, "loss": 2.5068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.09225082397461, "rewards/margins": 0.9855405688285828, "rewards/rejected": -22.07779312133789, "step": 27600 }, { "epoch": 0.9302639118271596, "eval_logits/chosen": -2.3133976459503174, "eval_logits/rejected": -2.492166519165039, "eval_logps/chosen": -2.2893614768981934, "eval_logps/rejected": -2.4440784454345703, "eval_loss": 3.0896363258361816, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.893613815307617, "eval_rewards/margins": 1.547170877456665, "eval_rewards/rejected": -24.440786361694336, "eval_runtime": 12.8916, "eval_samples_per_second": 7.757, "eval_steps_per_second": 1.939, "step": 27600 }, { "epoch": 0.9304324378981429, "grad_norm": 12.913957595825195, "learning_rate": 1.4670209285392975e-08, "logits/chosen": -1.9860661029815674, "logits/rejected": -2.458383321762085, "logps/chosen": -2.9501705169677734, "logps/rejected": -3.6025662422180176, "loss": 1.6886, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.501705169677734, "rewards/margins": 6.523956298828125, "rewards/rejected": -36.025657653808594, "step": 27605 }, { "epoch": 0.9306009639691261, "grad_norm": 20.887788772583008, "learning_rate": 1.4599566297200438e-08, "logits/chosen": -1.6999485492706299, "logits/rejected": -2.1493802070617676, "logps/chosen": -2.7761943340301514, "logps/rejected": -3.2419955730438232, "loss": 2.472, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.761943817138672, "rewards/margins": 4.6580095291137695, "rewards/rejected": -32.41995620727539, "step": 27610 }, { "epoch": 0.9307694900401092, "grad_norm": 95.52549743652344, "learning_rate": 1.4529091286973993e-08, "logits/chosen": -1.990030288696289, "logits/rejected": -1.9676322937011719, "logps/chosen": -2.367176055908203, "logps/rejected": -2.256159543991089, "loss": 4.5058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.671756744384766, "rewards/margins": -1.110162377357483, "rewards/rejected": -22.561595916748047, "step": 27615 }, { "epoch": 0.9309380161110924, "grad_norm": 95.74559020996094, "learning_rate": 1.4458784279102299e-08, "logits/chosen": -1.9741586446762085, "logits/rejected": -2.3909668922424316, "logps/chosen": -1.9129314422607422, "logps/rejected": -2.1065430641174316, "loss": 2.8111, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.129314422607422, "rewards/margins": 1.9361183643341064, "rewards/rejected": -21.065433502197266, "step": 27620 }, { "epoch": 0.9311065421820756, "grad_norm": 4.898240566253662, "learning_rate": 1.4388645297915725e-08, "logits/chosen": -1.7137861251831055, "logits/rejected": -2.158115863800049, "logps/chosen": -2.891925811767578, "logps/rejected": -3.6723380088806152, "loss": 1.5774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.919260025024414, "rewards/margins": 7.804121971130371, "rewards/rejected": -36.72338104248047, "step": 27625 }, { "epoch": 0.9312750682530587, "grad_norm": 32.084014892578125, "learning_rate": 1.4318674367686745e-08, "logits/chosen": -1.2843127250671387, "logits/rejected": -1.3736934661865234, "logps/chosen": -1.9384397268295288, "logps/rejected": -2.291755437850952, "loss": 2.4342, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.3843994140625, "rewards/margins": 3.5331573486328125, "rewards/rejected": -22.917556762695312, "step": 27630 }, { "epoch": 0.9314435943240419, "grad_norm": 12.00391960144043, "learning_rate": 1.424887151262949e-08, "logits/chosen": -2.211188554763794, "logits/rejected": -2.742053270339966, "logps/chosen": -2.813009738922119, "logps/rejected": -3.6539814472198486, "loss": 3.0753, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.130096435546875, "rewards/margins": 8.409716606140137, "rewards/rejected": -36.53981399536133, "step": 27635 }, { "epoch": 0.9316121203950251, "grad_norm": 28.257776260375977, "learning_rate": 1.4179236756899971e-08, "logits/chosen": -1.988883376121521, "logits/rejected": -2.2060391902923584, "logps/chosen": -2.281707286834717, "logps/rejected": -2.3848066329956055, "loss": 3.1235, "rewards/accuracies": 0.5, "rewards/chosen": -22.81707191467285, "rewards/margins": 1.0309938192367554, "rewards/rejected": -23.848064422607422, "step": 27640 }, { "epoch": 0.9317806464660083, "grad_norm": 39.309444427490234, "learning_rate": 1.4109770124596022e-08, "logits/chosen": -1.8199679851531982, "logits/rejected": -1.969831109046936, "logps/chosen": -2.4952075481414795, "logps/rejected": -2.6376724243164062, "loss": 2.4044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.952075958251953, "rewards/margins": 1.4246470928192139, "rewards/rejected": -26.376724243164062, "step": 27645 }, { "epoch": 0.9319491725369915, "grad_norm": 40.65642547607422, "learning_rate": 1.4040471639757301e-08, "logits/chosen": -1.3425325155258179, "logits/rejected": -1.6632499694824219, "logps/chosen": -2.3197176456451416, "logps/rejected": -2.7622852325439453, "loss": 2.0812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.19717788696289, "rewards/margins": 4.425675392150879, "rewards/rejected": -27.622852325439453, "step": 27650 }, { "epoch": 0.9321176986079747, "grad_norm": 47.1500244140625, "learning_rate": 1.3971341326365349e-08, "logits/chosen": -2.0653042793273926, "logits/rejected": -1.8965908288955688, "logps/chosen": -2.4217300415039062, "logps/rejected": -2.5961432456970215, "loss": 2.6185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.217300415039062, "rewards/margins": 1.7441316843032837, "rewards/rejected": -25.9614315032959, "step": 27655 }, { "epoch": 0.9322862246789578, "grad_norm": 149.48915100097656, "learning_rate": 1.3902379208343362e-08, "logits/chosen": -1.7947008609771729, "logits/rejected": -1.8451206684112549, "logps/chosen": -2.4357595443725586, "logps/rejected": -2.420999050140381, "loss": 3.5261, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -24.357593536376953, "rewards/margins": -0.14760398864746094, "rewards/rejected": -24.209991455078125, "step": 27660 }, { "epoch": 0.932454750749941, "grad_norm": 27.083669662475586, "learning_rate": 1.3833585309556472e-08, "logits/chosen": -1.52590012550354, "logits/rejected": -1.748573660850525, "logps/chosen": -2.4408881664276123, "logps/rejected": -2.4321718215942383, "loss": 3.2916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.408884048461914, "rewards/margins": -0.08716096729040146, "rewards/rejected": -24.32172203063965, "step": 27665 }, { "epoch": 0.9326232768209242, "grad_norm": 64.89007568359375, "learning_rate": 1.3764959653811525e-08, "logits/chosen": -1.835463523864746, "logits/rejected": -2.141990900039673, "logps/chosen": -2.914647340774536, "logps/rejected": -3.2299137115478516, "loss": 2.2172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -29.146469116210938, "rewards/margins": 3.15266489982605, "rewards/rejected": -32.29913330078125, "step": 27670 }, { "epoch": 0.9327918028919073, "grad_norm": 69.75782775878906, "learning_rate": 1.3696502264857134e-08, "logits/chosen": -1.6715993881225586, "logits/rejected": -2.1183884143829346, "logps/chosen": -2.2679061889648438, "logps/rejected": -2.700915575027466, "loss": 3.0478, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.679061889648438, "rewards/margins": 4.3300933837890625, "rewards/rejected": -27.0091552734375, "step": 27675 }, { "epoch": 0.9329603289628906, "grad_norm": 17.441205978393555, "learning_rate": 1.3628213166383684e-08, "logits/chosen": -1.648181676864624, "logits/rejected": -1.6260621547698975, "logps/chosen": -1.8544126749038696, "logps/rejected": -1.884833574295044, "loss": 3.2328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.544126510620117, "rewards/margins": 0.3042069375514984, "rewards/rejected": -18.84833335876465, "step": 27680 }, { "epoch": 0.9331288550338738, "grad_norm": 54.063350677490234, "learning_rate": 1.356009238202338e-08, "logits/chosen": -1.9545844793319702, "logits/rejected": -2.0981831550598145, "logps/chosen": -3.1395256519317627, "logps/rejected": -3.4566750526428223, "loss": 3.5834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.3952579498291, "rewards/margins": 3.1714961528778076, "rewards/rejected": -34.56675338745117, "step": 27685 }, { "epoch": 0.9332973811048569, "grad_norm": 16.980487823486328, "learning_rate": 1.3492139935350143e-08, "logits/chosen": -1.3540923595428467, "logits/rejected": -1.7177197933197021, "logps/chosen": -2.7987303733825684, "logps/rejected": -2.9920051097869873, "loss": 5.1382, "rewards/accuracies": 0.5, "rewards/chosen": -27.9873046875, "rewards/margins": 1.9327491521835327, "rewards/rejected": -29.920055389404297, "step": 27690 }, { "epoch": 0.9334659071758401, "grad_norm": 2.9056694507598877, "learning_rate": 1.3424355849879665e-08, "logits/chosen": -2.472014904022217, "logits/rejected": -3.0095813274383545, "logps/chosen": -2.2502903938293457, "logps/rejected": -2.774362802505493, "loss": 1.2752, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.502906799316406, "rewards/margins": 5.240719795227051, "rewards/rejected": -27.743627548217773, "step": 27695 }, { "epoch": 0.9336344332468233, "grad_norm": 30.565427780151367, "learning_rate": 1.3356740149069234e-08, "logits/chosen": -1.6824843883514404, "logits/rejected": -1.5276223421096802, "logps/chosen": -3.4816088676452637, "logps/rejected": -3.265540361404419, "loss": 7.1031, "rewards/accuracies": 0.5, "rewards/chosen": -34.81608963012695, "rewards/margins": -2.160688877105713, "rewards/rejected": -32.65540313720703, "step": 27700 }, { "epoch": 0.9338029593178064, "grad_norm": 148.01853942871094, "learning_rate": 1.328929285631819e-08, "logits/chosen": -1.693103551864624, "logits/rejected": -2.0444936752319336, "logps/chosen": -2.936856508255005, "logps/rejected": -3.2731621265411377, "loss": 3.5818, "rewards/accuracies": 0.5, "rewards/chosen": -29.368566513061523, "rewards/margins": 3.3630542755126953, "rewards/rejected": -32.73162078857422, "step": 27705 }, { "epoch": 0.9339714853887896, "grad_norm": 20.24788475036621, "learning_rate": 1.322201399496714e-08, "logits/chosen": -2.2496120929718018, "logits/rejected": -2.1728973388671875, "logps/chosen": -3.4590187072753906, "logps/rejected": -3.597189426422119, "loss": 4.7234, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -34.590187072753906, "rewards/margins": 1.3817100524902344, "rewards/rejected": -35.971893310546875, "step": 27710 }, { "epoch": 0.9341400114597729, "grad_norm": 61.02878952026367, "learning_rate": 1.3154903588298794e-08, "logits/chosen": -2.0132148265838623, "logits/rejected": -2.283881664276123, "logps/chosen": -2.496445417404175, "logps/rejected": -2.786395311355591, "loss": 2.5994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.964452743530273, "rewards/margins": 2.8994970321655273, "rewards/rejected": -27.86395263671875, "step": 27715 }, { "epoch": 0.934308537530756, "grad_norm": 53.6756477355957, "learning_rate": 1.3087961659537349e-08, "logits/chosen": -1.861713171005249, "logits/rejected": -1.7294833660125732, "logps/chosen": -2.40018892288208, "logps/rejected": -2.500532865524292, "loss": 2.574, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.001888275146484, "rewards/margins": 1.0034393072128296, "rewards/rejected": -25.005329132080078, "step": 27720 }, { "epoch": 0.9344770636017392, "grad_norm": 159.35438537597656, "learning_rate": 1.302118823184889e-08, "logits/chosen": -1.8387367725372314, "logits/rejected": -1.9457080364227295, "logps/chosen": -2.5194616317749023, "logps/rejected": -2.489358425140381, "loss": 4.4858, "rewards/accuracies": 0.5, "rewards/chosen": -25.19462013244629, "rewards/margins": -0.3010326325893402, "rewards/rejected": -24.893585205078125, "step": 27725 }, { "epoch": 0.9346455896727224, "grad_norm": 0.06013474240899086, "learning_rate": 1.2954583328340929e-08, "logits/chosen": -1.674951195716858, "logits/rejected": -2.087681293487549, "logps/chosen": -2.5127158164978027, "logps/rejected": -2.8834900856018066, "loss": 2.2586, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.127161026000977, "rewards/margins": 3.7077407836914062, "rewards/rejected": -28.83489990234375, "step": 27730 }, { "epoch": 0.9348141157437055, "grad_norm": 64.38245391845703, "learning_rate": 1.2888146972062863e-08, "logits/chosen": -2.249094009399414, "logits/rejected": -2.2135531902313232, "logps/chosen": -1.886885643005371, "logps/rejected": -2.272331476211548, "loss": 2.1438, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -18.868854522705078, "rewards/margins": 3.8544578552246094, "rewards/rejected": -22.723316192626953, "step": 27735 }, { "epoch": 0.9349826418146887, "grad_norm": 27.971521377563477, "learning_rate": 1.2821879186005747e-08, "logits/chosen": -1.8382654190063477, "logits/rejected": -1.9038264751434326, "logps/chosen": -1.9858547449111938, "logps/rejected": -2.219144821166992, "loss": 2.2832, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.858545303344727, "rewards/margins": 2.3329017162323, "rewards/rejected": -22.191448211669922, "step": 27740 }, { "epoch": 0.9351511678856719, "grad_norm": 42.19049835205078, "learning_rate": 1.2755779993102122e-08, "logits/chosen": -2.663670301437378, "logits/rejected": -2.474766492843628, "logps/chosen": -2.775183916091919, "logps/rejected": -2.7223057746887207, "loss": 7.3642, "rewards/accuracies": 0.5, "rewards/chosen": -27.751840591430664, "rewards/margins": -0.528782069683075, "rewards/rejected": -27.223058700561523, "step": 27745 }, { "epoch": 0.935319693956655, "grad_norm": 12.594414710998535, "learning_rate": 1.2689849416226362e-08, "logits/chosen": -1.594200849533081, "logits/rejected": -1.7864547967910767, "logps/chosen": -2.1359341144561768, "logps/rejected": -2.3643715381622314, "loss": 1.7207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.35934066772461, "rewards/margins": 2.2843754291534424, "rewards/rejected": -23.643714904785156, "step": 27750 }, { "epoch": 0.9354882200276383, "grad_norm": 118.9895248413086, "learning_rate": 1.2624087478194545e-08, "logits/chosen": -2.010768413543701, "logits/rejected": -1.775254249572754, "logps/chosen": -2.5973310470581055, "logps/rejected": -2.587301254272461, "loss": 4.139, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.973312377929688, "rewards/margins": -0.10029850155115128, "rewards/rejected": -25.873010635375977, "step": 27755 }, { "epoch": 0.9356567460986215, "grad_norm": 62.92037582397461, "learning_rate": 1.2558494201764136e-08, "logits/chosen": -1.943966269493103, "logits/rejected": -2.2178397178649902, "logps/chosen": -2.327336072921753, "logps/rejected": -2.6325855255126953, "loss": 3.4514, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.273361206054688, "rewards/margins": 3.0524938106536865, "rewards/rejected": -26.325855255126953, "step": 27760 }, { "epoch": 0.9358252721696046, "grad_norm": 44.706966400146484, "learning_rate": 1.2493069609634477e-08, "logits/chosen": -2.0918984413146973, "logits/rejected": -1.994227409362793, "logps/chosen": -2.0638251304626465, "logps/rejected": -2.0130505561828613, "loss": 3.8463, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.638248443603516, "rewards/margins": -0.5077415704727173, "rewards/rejected": -20.130508422851562, "step": 27765 }, { "epoch": 0.9359937982405878, "grad_norm": 28.990034103393555, "learning_rate": 1.24278137244464e-08, "logits/chosen": -1.9322535991668701, "logits/rejected": -2.4840035438537598, "logps/chosen": -2.13610577583313, "logps/rejected": -2.6279749870300293, "loss": 2.0504, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.36105728149414, "rewards/margins": 4.918692588806152, "rewards/rejected": -26.279748916625977, "step": 27770 }, { "epoch": 0.936162324311571, "grad_norm": 25.019968032836914, "learning_rate": 1.2362726568782512e-08, "logits/chosen": -1.871313452720642, "logits/rejected": -1.842206358909607, "logps/chosen": -2.322143077850342, "logps/rejected": -3.0833826065063477, "loss": 1.9137, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.221431732177734, "rewards/margins": 7.6123948097229, "rewards/rejected": -30.833826065063477, "step": 27775 }, { "epoch": 0.9363308503825541, "grad_norm": 40.041114807128906, "learning_rate": 1.2297808165166735e-08, "logits/chosen": -2.6236162185668945, "logits/rejected": -3.0642149448394775, "logps/chosen": -2.3841915130615234, "logps/rejected": -2.809760332107544, "loss": 2.8452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.841915130615234, "rewards/margins": 4.2556891441345215, "rewards/rejected": -28.097604751586914, "step": 27780 }, { "epoch": 0.9364993764535373, "grad_norm": 78.90538787841797, "learning_rate": 1.2233058536064821e-08, "logits/chosen": -1.3989120721817017, "logits/rejected": -1.1721596717834473, "logps/chosen": -2.9932949542999268, "logps/rejected": -3.3058419227600098, "loss": 3.0073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.93294906616211, "rewards/margins": 3.125467300415039, "rewards/rejected": -33.05841827392578, "step": 27785 }, { "epoch": 0.9366679025245206, "grad_norm": 12.254767417907715, "learning_rate": 1.2168477703884184e-08, "logits/chosen": -2.0970308780670166, "logits/rejected": -2.6638078689575195, "logps/chosen": -1.7973181009292603, "logps/rejected": -2.045571804046631, "loss": 1.8345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -17.973180770874023, "rewards/margins": 2.4825377464294434, "rewards/rejected": -20.455718994140625, "step": 27790 }, { "epoch": 0.9368364285955038, "grad_norm": 38.09868621826172, "learning_rate": 1.2104065690973554e-08, "logits/chosen": -2.2299957275390625, "logits/rejected": -2.420968532562256, "logps/chosen": -2.7009854316711426, "logps/rejected": -2.7875897884368896, "loss": 3.2235, "rewards/accuracies": 0.5, "rewards/chosen": -27.00985336303711, "rewards/margins": 0.866042971611023, "rewards/rejected": -27.875896453857422, "step": 27795 }, { "epoch": 0.9370049546664869, "grad_norm": 22.674205780029297, "learning_rate": 1.2039822519623489e-08, "logits/chosen": -1.987546682357788, "logits/rejected": -2.2283618450164795, "logps/chosen": -2.184810161590576, "logps/rejected": -2.653113603591919, "loss": 1.1945, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.848100662231445, "rewards/margins": 4.683035373687744, "rewards/rejected": -26.5311336517334, "step": 27800 }, { "epoch": 0.9371734807374701, "grad_norm": 4.418429671204649e-05, "learning_rate": 1.1975748212065928e-08, "logits/chosen": -2.031785011291504, "logits/rejected": -2.4191551208496094, "logps/chosen": -2.511854410171509, "logps/rejected": -3.39988374710083, "loss": 1.1578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.118541717529297, "rewards/margins": 8.88029670715332, "rewards/rejected": -33.998836517333984, "step": 27805 }, { "epoch": 0.9373420068084533, "grad_norm": 191.81787109375, "learning_rate": 1.1911842790474635e-08, "logits/chosen": -2.0759177207946777, "logits/rejected": -2.140557289123535, "logps/chosen": -3.1038882732391357, "logps/rejected": -3.3400981426239014, "loss": 3.7893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.038883209228516, "rewards/margins": 2.362098455429077, "rewards/rejected": -33.400978088378906, "step": 27810 }, { "epoch": 0.9375105328794364, "grad_norm": 28.452058792114258, "learning_rate": 1.184810627696453e-08, "logits/chosen": -2.5447916984558105, "logits/rejected": -2.1735787391662598, "logps/chosen": -2.8184876441955566, "logps/rejected": -2.4024946689605713, "loss": 7.4393, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.184871673583984, "rewards/margins": -4.1599273681640625, "rewards/rejected": -24.024944305419922, "step": 27815 }, { "epoch": 0.9376790589504196, "grad_norm": 26.47957992553711, "learning_rate": 1.1784538693592472e-08, "logits/chosen": -1.9781200885772705, "logits/rejected": -2.3153228759765625, "logps/chosen": -2.557616710662842, "logps/rejected": -3.2911009788513184, "loss": 2.8156, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.5761661529541, "rewards/margins": 7.334845542907715, "rewards/rejected": -32.911014556884766, "step": 27820 }, { "epoch": 0.9378475850214029, "grad_norm": 15.519754409790039, "learning_rate": 1.1721140062356638e-08, "logits/chosen": -1.6726192235946655, "logits/rejected": -1.926715612411499, "logps/chosen": -2.3190712928771973, "logps/rejected": -2.477382183074951, "loss": 2.8953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.190710067749023, "rewards/margins": 1.5831115245819092, "rewards/rejected": -24.773822784423828, "step": 27825 }, { "epoch": 0.938016111092386, "grad_norm": 19.423538208007812, "learning_rate": 1.1657910405196814e-08, "logits/chosen": -1.5616414546966553, "logits/rejected": -1.7062292098999023, "logps/chosen": -2.029991865158081, "logps/rejected": -2.273928165435791, "loss": 1.7689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.2999210357666, "rewards/margins": 2.4393606185913086, "rewards/rejected": -22.739282608032227, "step": 27830 }, { "epoch": 0.9381846371633692, "grad_norm": 31.801822662353516, "learning_rate": 1.1594849743994384e-08, "logits/chosen": -2.074384927749634, "logits/rejected": -1.9508994817733765, "logps/chosen": -2.286508083343506, "logps/rejected": -2.482470989227295, "loss": 1.9202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.865079879760742, "rewards/margins": 1.9596277475357056, "rewards/rejected": -24.824708938598633, "step": 27835 }, { "epoch": 0.9383531632343524, "grad_norm": 66.98432922363281, "learning_rate": 1.1531958100571948e-08, "logits/chosen": -2.0815181732177734, "logits/rejected": -2.2014503479003906, "logps/chosen": -2.952211856842041, "logps/rejected": -3.1800284385681152, "loss": 3.0369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.522119522094727, "rewards/margins": 2.2781662940979004, "rewards/rejected": -31.8002872467041, "step": 27840 }, { "epoch": 0.9385216893053355, "grad_norm": 28.881742477416992, "learning_rate": 1.146923549669393e-08, "logits/chosen": -2.072078227996826, "logits/rejected": -2.109978437423706, "logps/chosen": -2.861506462097168, "logps/rejected": -3.2766547203063965, "loss": 3.2541, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.615062713623047, "rewards/margins": 4.151480674743652, "rewards/rejected": -32.76654815673828, "step": 27845 }, { "epoch": 0.9386902153763187, "grad_norm": 106.37144470214844, "learning_rate": 1.1406681954066244e-08, "logits/chosen": -1.798151969909668, "logits/rejected": -1.6892074346542358, "logps/chosen": -2.5100529193878174, "logps/rejected": -2.3787970542907715, "loss": 4.5418, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.100528717041016, "rewards/margins": -1.3125566244125366, "rewards/rejected": -23.78797149658203, "step": 27850 }, { "epoch": 0.9388587414473019, "grad_norm": 54.06602096557617, "learning_rate": 1.1344297494336075e-08, "logits/chosen": -1.5469924211502075, "logits/rejected": -1.8411296606063843, "logps/chosen": -1.773207426071167, "logps/rejected": -1.9032232761383057, "loss": 3.1252, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.732074737548828, "rewards/margins": 1.3001577854156494, "rewards/rejected": -19.0322322845459, "step": 27855 }, { "epoch": 0.939027267518285, "grad_norm": 33.818641662597656, "learning_rate": 1.1282082139092319e-08, "logits/chosen": -1.8127334117889404, "logits/rejected": -2.0438244342803955, "logps/chosen": -2.7030246257781982, "logps/rejected": -2.8986270427703857, "loss": 2.6653, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.03024673461914, "rewards/margins": 1.9560247659683228, "rewards/rejected": -28.98626708984375, "step": 27860 }, { "epoch": 0.9391957935892683, "grad_norm": 287.0670471191406, "learning_rate": 1.1220035909865145e-08, "logits/chosen": -1.7046902179718018, "logits/rejected": -1.8295114040374756, "logps/chosen": -2.8792572021484375, "logps/rejected": -3.215181350708008, "loss": 4.1455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.792572021484375, "rewards/margins": 3.3592400550842285, "rewards/rejected": -32.15180969238281, "step": 27865 }, { "epoch": 0.9393643196602515, "grad_norm": 81.16524505615234, "learning_rate": 1.115815882812643e-08, "logits/chosen": -2.333130121231079, "logits/rejected": -2.2076778411865234, "logps/chosen": -3.074096918106079, "logps/rejected": -2.455504894256592, "loss": 9.8043, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.740970611572266, "rewards/margins": -6.185919761657715, "rewards/rejected": -24.5550479888916, "step": 27870 }, { "epoch": 0.9395328457312346, "grad_norm": 279.53173828125, "learning_rate": 1.1096450915289324e-08, "logits/chosen": -2.2081775665283203, "logits/rejected": -1.953830361366272, "logps/chosen": -2.8438355922698975, "logps/rejected": -2.6367883682250977, "loss": 7.0671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.4383544921875, "rewards/margins": -2.0704727172851562, "rewards/rejected": -26.367883682250977, "step": 27875 }, { "epoch": 0.9397013718022178, "grad_norm": 24.882822036743164, "learning_rate": 1.103491219270858e-08, "logits/chosen": -2.1974661350250244, "logits/rejected": -2.2992677688598633, "logps/chosen": -2.638702869415283, "logps/rejected": -2.751085042953491, "loss": 3.4128, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.38702964782715, "rewards/margins": 1.1238213777542114, "rewards/rejected": -27.510848999023438, "step": 27880 }, { "epoch": 0.939869897873201, "grad_norm": 0.040229279547929764, "learning_rate": 1.0973542681680215e-08, "logits/chosen": -2.1547296047210693, "logits/rejected": -2.1364502906799316, "logps/chosen": -2.7284128665924072, "logps/rejected": -2.731088161468506, "loss": 5.0208, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.284130096435547, "rewards/margins": 0.026755237951874733, "rewards/rejected": -27.310882568359375, "step": 27885 }, { "epoch": 0.9400384239441841, "grad_norm": 69.75837707519531, "learning_rate": 1.0912342403441854e-08, "logits/chosen": -2.2461647987365723, "logits/rejected": -2.149193525314331, "logps/chosen": -2.4559860229492188, "logps/rejected": -2.5802507400512695, "loss": 2.2705, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.559860229492188, "rewards/margins": 1.2426480054855347, "rewards/rejected": -25.802509307861328, "step": 27890 }, { "epoch": 0.9402069500151673, "grad_norm": 21.39365005493164, "learning_rate": 1.0851311379172556e-08, "logits/chosen": -1.7585290670394897, "logits/rejected": -1.6060632467269897, "logps/chosen": -2.1874899864196777, "logps/rejected": -2.264883041381836, "loss": 3.467, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.874902725219727, "rewards/margins": 0.7739295959472656, "rewards/rejected": -22.64883041381836, "step": 27895 }, { "epoch": 0.9403754760861506, "grad_norm": 101.61637115478516, "learning_rate": 1.0790449629992648e-08, "logits/chosen": -2.101813793182373, "logits/rejected": -2.3753764629364014, "logps/chosen": -3.024099826812744, "logps/rejected": -3.160783290863037, "loss": 2.7098, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.240997314453125, "rewards/margins": 1.366835117340088, "rewards/rejected": -31.607837677001953, "step": 27900 }, { "epoch": 0.9405440021571337, "grad_norm": 42.03780746459961, "learning_rate": 1.0729757176964005e-08, "logits/chosen": -2.129361867904663, "logits/rejected": -2.212040424346924, "logps/chosen": -2.0724599361419678, "logps/rejected": -2.3914830684661865, "loss": 1.864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.724597930908203, "rewards/margins": 3.1902308464050293, "rewards/rejected": -23.91482925415039, "step": 27905 }, { "epoch": 0.9407125282281169, "grad_norm": 33.489646911621094, "learning_rate": 1.0669234041089991e-08, "logits/chosen": -1.8179874420166016, "logits/rejected": -2.0074830055236816, "logps/chosen": -2.027855634689331, "logps/rejected": -2.0084097385406494, "loss": 3.5226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.2785587310791, "rewards/margins": -0.1944606751203537, "rewards/rejected": -20.084096908569336, "step": 27910 }, { "epoch": 0.9408810542991001, "grad_norm": 53.953433990478516, "learning_rate": 1.0608880243315188e-08, "logits/chosen": -2.8705196380615234, "logits/rejected": -2.637935161590576, "logps/chosen": -2.8548240661621094, "logps/rejected": -2.816099166870117, "loss": 4.5831, "rewards/accuracies": 0.5, "rewards/chosen": -28.548242568969727, "rewards/margins": -0.3872489929199219, "rewards/rejected": -28.16098976135254, "step": 27915 }, { "epoch": 0.9410495803700832, "grad_norm": 26.6319522857666, "learning_rate": 1.054869580452572e-08, "logits/chosen": -1.5271388292312622, "logits/rejected": -1.7312593460083008, "logps/chosen": -1.947431206703186, "logps/rejected": -2.1562416553497314, "loss": 2.0796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.47431182861328, "rewards/margins": 2.0881032943725586, "rewards/rejected": -21.562414169311523, "step": 27920 }, { "epoch": 0.9412181064410664, "grad_norm": 1.690170407295227, "learning_rate": 1.0488680745548983e-08, "logits/chosen": -1.8683593273162842, "logits/rejected": -2.092747211456299, "logps/chosen": -2.7897651195526123, "logps/rejected": -3.438420057296753, "loss": 1.1328, "rewards/accuracies": 1.0, "rewards/chosen": -27.89764976501465, "rewards/margins": 6.486549377441406, "rewards/rejected": -34.38420104980469, "step": 27925 }, { "epoch": 0.9413866325120496, "grad_norm": 24.103199005126953, "learning_rate": 1.042883508715392e-08, "logits/chosen": -2.063265323638916, "logits/rejected": -2.2874794006347656, "logps/chosen": -2.7128069400787354, "logps/rejected": -2.7375307083129883, "loss": 3.6814, "rewards/accuracies": 0.5, "rewards/chosen": -27.128067016601562, "rewards/margins": 0.24723930656909943, "rewards/rejected": -27.375308990478516, "step": 27930 }, { "epoch": 0.9415551585830328, "grad_norm": 30.12285614013672, "learning_rate": 1.036915885005063e-08, "logits/chosen": -2.5748400688171387, "logits/rejected": -2.4589247703552246, "logps/chosen": -2.301511526107788, "logps/rejected": -2.3860554695129395, "loss": 2.9559, "rewards/accuracies": 0.5, "rewards/chosen": -23.01511573791504, "rewards/margins": 0.8454399108886719, "rewards/rejected": -23.860553741455078, "step": 27935 }, { "epoch": 0.941723684654016, "grad_norm": 29.477624893188477, "learning_rate": 1.0309652054890816e-08, "logits/chosen": -1.42814302444458, "logits/rejected": -1.6454927921295166, "logps/chosen": -2.172950029373169, "logps/rejected": -2.59196138381958, "loss": 1.1831, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.7294979095459, "rewards/margins": 4.190115451812744, "rewards/rejected": -25.919612884521484, "step": 27940 }, { "epoch": 0.9418922107249992, "grad_norm": 22.671653747558594, "learning_rate": 1.025031472226734e-08, "logits/chosen": -1.8475160598754883, "logits/rejected": -2.1124308109283447, "logps/chosen": -1.6322600841522217, "logps/rejected": -2.4699718952178955, "loss": 1.9411, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -16.322601318359375, "rewards/margins": 8.377116203308105, "rewards/rejected": -24.699716567993164, "step": 27945 }, { "epoch": 0.9420607367959823, "grad_norm": 25.53143882751465, "learning_rate": 1.0191146872714662e-08, "logits/chosen": -1.754601240158081, "logits/rejected": -2.093977451324463, "logps/chosen": -1.7095845937728882, "logps/rejected": -2.059696674346924, "loss": 0.7661, "rewards/accuracies": 1.0, "rewards/chosen": -17.095844268798828, "rewards/margins": 3.5011227130889893, "rewards/rejected": -20.596969604492188, "step": 27950 }, { "epoch": 0.9422292628669655, "grad_norm": 39.68607711791992, "learning_rate": 1.0132148526708296e-08, "logits/chosen": -1.9065732955932617, "logits/rejected": -2.1846835613250732, "logps/chosen": -1.9765098094940186, "logps/rejected": -2.0324313640594482, "loss": 2.7704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.765098571777344, "rewards/margins": 0.5592161417007446, "rewards/rejected": -20.32431411743164, "step": 27955 }, { "epoch": 0.9423977889379487, "grad_norm": 27.000288009643555, "learning_rate": 1.0073319704665295e-08, "logits/chosen": -2.5076403617858887, "logits/rejected": -2.666171073913574, "logps/chosen": -2.1777713298797607, "logps/rejected": -2.539689302444458, "loss": 2.1136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.777713775634766, "rewards/margins": 3.619178056716919, "rewards/rejected": -25.396892547607422, "step": 27960 }, { "epoch": 0.9425663150089318, "grad_norm": 21.133543014526367, "learning_rate": 1.0014660426944044e-08, "logits/chosen": -1.6421520709991455, "logits/rejected": -2.5268847942352295, "logps/chosen": -2.5491130352020264, "logps/rejected": -2.8137052059173584, "loss": 2.273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.491130828857422, "rewards/margins": 2.645921230316162, "rewards/rejected": -28.13705062866211, "step": 27965 }, { "epoch": 0.942734841079915, "grad_norm": 49.087364196777344, "learning_rate": 9.956170713844136e-09, "logits/chosen": -1.8143583536148071, "logits/rejected": -2.109388828277588, "logps/chosen": -2.2308526039123535, "logps/rejected": -2.328622817993164, "loss": 2.7613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.308523178100586, "rewards/margins": 0.9777054786682129, "rewards/rejected": -23.28622817993164, "step": 27970 }, { "epoch": 0.9429033671508983, "grad_norm": 16.130882263183594, "learning_rate": 9.897850585606605e-09, "logits/chosen": -2.2523300647735596, "logits/rejected": -2.478358507156372, "logps/chosen": -2.370288133621216, "logps/rejected": -2.562955379486084, "loss": 1.9819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.702880859375, "rewards/margins": 1.926670789718628, "rewards/rejected": -25.629552841186523, "step": 27975 }, { "epoch": 0.9430718932218815, "grad_norm": 49.06074905395508, "learning_rate": 9.839700062413692e-09, "logits/chosen": -1.795732855796814, "logits/rejected": -2.104189157485962, "logps/chosen": -2.573464870452881, "logps/rejected": -2.9868791103363037, "loss": 3.0401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.734649658203125, "rewards/margins": 4.134143829345703, "rewards/rejected": -29.868793487548828, "step": 27980 }, { "epoch": 0.9432404192928646, "grad_norm": 35.39234924316406, "learning_rate": 9.78171916438908e-09, "logits/chosen": -2.184047222137451, "logits/rejected": -2.3190789222717285, "logps/chosen": -3.3273043632507324, "logps/rejected": -3.9008243083953857, "loss": 2.3656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.273040771484375, "rewards/margins": 5.735200881958008, "rewards/rejected": -39.008243560791016, "step": 27985 }, { "epoch": 0.9434089453638478, "grad_norm": 36.98085403442383, "learning_rate": 9.723907911597607e-09, "logits/chosen": -1.1470811367034912, "logits/rejected": -1.2830091714859009, "logps/chosen": -2.3072681427001953, "logps/rejected": -2.4950547218322754, "loss": 2.3621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.072681427001953, "rewards/margins": 1.877862572669983, "rewards/rejected": -24.950544357299805, "step": 27990 }, { "epoch": 0.943577471434831, "grad_norm": 60.771636962890625, "learning_rate": 9.666266324045547e-09, "logits/chosen": -2.091439723968506, "logits/rejected": -1.8897613286972046, "logps/chosen": -2.765847682952881, "logps/rejected": -2.8236958980560303, "loss": 5.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.658477783203125, "rewards/margins": 0.5784798860549927, "rewards/rejected": -28.236957550048828, "step": 27995 }, { "epoch": 0.9437459975058141, "grad_norm": 0.013381626456975937, "learning_rate": 9.608794421680334e-09, "logits/chosen": -2.3415331840515137, "logits/rejected": -2.6215662956237793, "logps/chosen": -3.6000258922576904, "logps/rejected": -4.7026519775390625, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": -36.00026321411133, "rewards/margins": 11.026254653930664, "rewards/rejected": -47.02651596069336, "step": 28000 }, { "epoch": 0.9437459975058141, "eval_logits/chosen": -2.313400983810425, "eval_logits/rejected": -2.4919190406799316, "eval_logps/chosen": -2.2887589931488037, "eval_logps/rejected": -2.4447197914123535, "eval_loss": 3.0835182666778564, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.887590408325195, "eval_rewards/margins": 1.5596061944961548, "eval_rewards/rejected": -24.447195053100586, "eval_runtime": 12.8885, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.94, "step": 28000 }, { "epoch": 0.9439145235767973, "grad_norm": 52.55608367919922, "learning_rate": 9.551492224390666e-09, "logits/chosen": -2.1889519691467285, "logits/rejected": -2.4650866985321045, "logps/chosen": -2.833047866821289, "logps/rejected": -3.5559210777282715, "loss": 2.5163, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.33047866821289, "rewards/margins": 7.228735446929932, "rewards/rejected": -35.5592155456543, "step": 28005 }, { "epoch": 0.9440830496477806, "grad_norm": 28.72416114807129, "learning_rate": 9.494359752006686e-09, "logits/chosen": -1.4533464908599854, "logits/rejected": -2.3801748752593994, "logps/chosen": -2.3763134479522705, "logps/rejected": -3.6961147785186768, "loss": 1.967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.76313591003418, "rewards/margins": 13.19801139831543, "rewards/rejected": -36.961151123046875, "step": 28010 }, { "epoch": 0.9442515757187637, "grad_norm": 8.59366512298584, "learning_rate": 9.437397024299631e-09, "logits/chosen": -1.1914489269256592, "logits/rejected": -1.926476240158081, "logps/chosen": -2.2145838737487793, "logps/rejected": -3.3406097888946533, "loss": 1.6371, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.145837783813477, "rewards/margins": 11.26025676727295, "rewards/rejected": -33.40609359741211, "step": 28015 }, { "epoch": 0.9444201017897469, "grad_norm": 30.539743423461914, "learning_rate": 9.380604060982123e-09, "logits/chosen": -1.732550024986267, "logits/rejected": -2.0217487812042236, "logps/chosen": -2.098297595977783, "logps/rejected": -2.4610893726348877, "loss": 1.9242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.982975006103516, "rewards/margins": 3.6279189586639404, "rewards/rejected": -24.61089515686035, "step": 28020 }, { "epoch": 0.9445886278607301, "grad_norm": 27.034799575805664, "learning_rate": 9.323980881707827e-09, "logits/chosen": -1.8111553192138672, "logits/rejected": -1.981406807899475, "logps/chosen": -2.0265769958496094, "logps/rejected": -2.132596015930176, "loss": 2.2946, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.265771865844727, "rewards/margins": 1.0601909160614014, "rewards/rejected": -21.32596206665039, "step": 28025 }, { "epoch": 0.9447571539317132, "grad_norm": 50.454833984375, "learning_rate": 9.26752750607196e-09, "logits/chosen": -2.05342173576355, "logits/rejected": -1.770754098892212, "logps/chosen": -3.303680419921875, "logps/rejected": -2.9499616622924805, "loss": 7.4913, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -33.03680419921875, "rewards/margins": -3.537187099456787, "rewards/rejected": -29.499618530273438, "step": 28030 }, { "epoch": 0.9449256800026964, "grad_norm": 129.8310089111328, "learning_rate": 9.211243953610726e-09, "logits/chosen": -1.383266568183899, "logits/rejected": -1.8043930530548096, "logps/chosen": -2.948296308517456, "logps/rejected": -2.9599292278289795, "loss": 3.9439, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -29.482959747314453, "rewards/margins": 0.11632823944091797, "rewards/rejected": -29.599292755126953, "step": 28035 }, { "epoch": 0.9450942060736796, "grad_norm": 44.37803268432617, "learning_rate": 9.1551302438016e-09, "logits/chosen": -2.1227376461029053, "logits/rejected": -2.096128225326538, "logps/chosen": -2.1822402477264404, "logps/rejected": -2.4015820026397705, "loss": 2.6138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.822399139404297, "rewards/margins": 2.193415880203247, "rewards/rejected": -24.015817642211914, "step": 28040 }, { "epoch": 0.9452627321446628, "grad_norm": 20.145410537719727, "learning_rate": 9.09918639606344e-09, "logits/chosen": -1.7266706228256226, "logits/rejected": -1.9664170742034912, "logps/chosen": -3.275007963180542, "logps/rejected": -3.9894778728485107, "loss": 1.1419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -32.75008010864258, "rewards/margins": 7.1446990966796875, "rewards/rejected": -39.89478302001953, "step": 28045 }, { "epoch": 0.945431258215646, "grad_norm": 27.345746994018555, "learning_rate": 9.043412429756091e-09, "logits/chosen": -1.701939344406128, "logits/rejected": -1.802038550376892, "logps/chosen": -2.7647578716278076, "logps/rejected": -2.5529465675354004, "loss": 5.8626, "rewards/accuracies": 0.5, "rewards/chosen": -27.647579193115234, "rewards/margins": -2.1181139945983887, "rewards/rejected": -25.529464721679688, "step": 28050 }, { "epoch": 0.9455997842866292, "grad_norm": 46.19438552856445, "learning_rate": 8.987808364180837e-09, "logits/chosen": -2.0644924640655518, "logits/rejected": -2.2390027046203613, "logps/chosen": -2.9007680416107178, "logps/rejected": -2.8708555698394775, "loss": 4.7119, "rewards/accuracies": 0.5, "rewards/chosen": -29.007680892944336, "rewards/margins": -0.29912489652633667, "rewards/rejected": -28.70855712890625, "step": 28055 }, { "epoch": 0.9457683103576123, "grad_norm": 59.57084655761719, "learning_rate": 8.932374218579953e-09, "logits/chosen": -1.6476083993911743, "logits/rejected": -1.338379144668579, "logps/chosen": -2.277503728866577, "logps/rejected": -2.210233211517334, "loss": 3.7347, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -22.775039672851562, "rewards/margins": -0.6727081537246704, "rewards/rejected": -22.102331161499023, "step": 28060 }, { "epoch": 0.9459368364285955, "grad_norm": 162.84632873535156, "learning_rate": 8.87711001213709e-09, "logits/chosen": -2.315368413925171, "logits/rejected": -2.393383741378784, "logps/chosen": -2.475985050201416, "logps/rejected": -2.5510942935943604, "loss": 3.2253, "rewards/accuracies": 0.5, "rewards/chosen": -24.75984764099121, "rewards/margins": 0.7510935068130493, "rewards/rejected": -25.510942459106445, "step": 28065 }, { "epoch": 0.9461053624995787, "grad_norm": 27.281959533691406, "learning_rate": 8.822015763977009e-09, "logits/chosen": -2.1430652141571045, "logits/rejected": -2.3477883338928223, "logps/chosen": -2.2344632148742676, "logps/rejected": -2.4346368312835693, "loss": 1.9776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.34463119506836, "rewards/margins": 2.0017340183258057, "rewards/rejected": -24.34636688232422, "step": 28070 }, { "epoch": 0.9462738885705618, "grad_norm": 5.122872829437256, "learning_rate": 8.767091493165568e-09, "logits/chosen": -1.899531602859497, "logits/rejected": -2.1331827640533447, "logps/chosen": -2.2442357540130615, "logps/rejected": -3.0054221153259277, "loss": 1.1727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.44235610961914, "rewards/margins": 7.6118669509887695, "rewards/rejected": -30.054224014282227, "step": 28075 }, { "epoch": 0.946442414641545, "grad_norm": 0.05286615341901779, "learning_rate": 8.712337218710009e-09, "logits/chosen": -1.9935481548309326, "logits/rejected": -2.8229637145996094, "logps/chosen": -2.3799171447753906, "logps/rejected": -3.0323686599731445, "loss": 1.2227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.79917335510254, "rewards/margins": 6.524515628814697, "rewards/rejected": -30.323688507080078, "step": 28080 }, { "epoch": 0.9466109407125283, "grad_norm": 30.209877014160156, "learning_rate": 8.657752959558562e-09, "logits/chosen": -1.6426223516464233, "logits/rejected": -1.5145375728607178, "logps/chosen": -2.6865198612213135, "logps/rejected": -2.789952516555786, "loss": 3.3766, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -26.865198135375977, "rewards/margins": 1.0343248844146729, "rewards/rejected": -27.899524688720703, "step": 28085 }, { "epoch": 0.9467794667835114, "grad_norm": 12.501883506774902, "learning_rate": 8.60333873460073e-09, "logits/chosen": -1.8514074087142944, "logits/rejected": -2.0068726539611816, "logps/chosen": -2.2644107341766357, "logps/rejected": -2.4691319465637207, "loss": 1.6423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.64410972595215, "rewards/margins": 2.0472114086151123, "rewards/rejected": -24.691320419311523, "step": 28090 }, { "epoch": 0.9469479928544946, "grad_norm": 19.746841430664062, "learning_rate": 8.549094562667059e-09, "logits/chosen": -2.1093087196350098, "logits/rejected": -2.5620765686035156, "logps/chosen": -2.370896100997925, "logps/rejected": -3.1036839485168457, "loss": 2.4403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.708959579467773, "rewards/margins": 7.327878475189209, "rewards/rejected": -31.036840438842773, "step": 28095 }, { "epoch": 0.9471165189254778, "grad_norm": 51.745853424072266, "learning_rate": 8.495020462529368e-09, "logits/chosen": -1.7082334756851196, "logits/rejected": -2.372655153274536, "logps/chosen": -2.9153425693511963, "logps/rejected": -4.1864423751831055, "loss": 4.3572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.153427124023438, "rewards/margins": 12.710992813110352, "rewards/rejected": -41.864418029785156, "step": 28100 }, { "epoch": 0.9472850449964609, "grad_norm": 62.38421630859375, "learning_rate": 8.441116452900632e-09, "logits/chosen": -1.7928497791290283, "logits/rejected": -1.9549148082733154, "logps/chosen": -2.526888370513916, "logps/rejected": -2.4071240425109863, "loss": 4.8002, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -25.26888084411621, "rewards/margins": -1.1976430416107178, "rewards/rejected": -24.071239471435547, "step": 28105 }, { "epoch": 0.9474535710674441, "grad_norm": 18.18597412109375, "learning_rate": 8.387382552434763e-09, "logits/chosen": -2.213923454284668, "logits/rejected": -2.179241180419922, "logps/chosen": -2.6775755882263184, "logps/rejected": -3.191340208053589, "loss": 4.1458, "rewards/accuracies": 0.5, "rewards/chosen": -26.7757568359375, "rewards/margins": 5.137645721435547, "rewards/rejected": -31.913400650024414, "step": 28110 }, { "epoch": 0.9476220971384273, "grad_norm": 54.165409088134766, "learning_rate": 8.333818779727053e-09, "logits/chosen": -1.8373768329620361, "logits/rejected": -1.9318361282348633, "logps/chosen": -2.7003121376037598, "logps/rejected": -3.0800962448120117, "loss": 2.3767, "rewards/accuracies": 0.5, "rewards/chosen": -27.003122329711914, "rewards/margins": 3.7978405952453613, "rewards/rejected": -30.80096435546875, "step": 28115 }, { "epoch": 0.9477906232094105, "grad_norm": 78.94615936279297, "learning_rate": 8.280425153313786e-09, "logits/chosen": -1.6564782857894897, "logits/rejected": -1.58085298538208, "logps/chosen": -2.667534828186035, "logps/rejected": -2.594963550567627, "loss": 4.0251, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -26.67534828186035, "rewards/margins": -0.725711464881897, "rewards/rejected": -25.949636459350586, "step": 28120 }, { "epoch": 0.9479591492803937, "grad_norm": 162.03231811523438, "learning_rate": 8.227201691672403e-09, "logits/chosen": -2.3832004070281982, "logits/rejected": -2.4587082862854004, "logps/chosen": -3.8883845806121826, "logps/rejected": -3.8318228721618652, "loss": 4.067, "rewards/accuracies": 0.5, "rewards/chosen": -38.88385009765625, "rewards/margins": -0.5656188726425171, "rewards/rejected": -38.318233489990234, "step": 28125 }, { "epoch": 0.9481276753513769, "grad_norm": 0.09015277773141861, "learning_rate": 8.174148413221448e-09, "logits/chosen": -1.579756736755371, "logits/rejected": -2.15185809135437, "logps/chosen": -2.458775758743286, "logps/rejected": -2.8276195526123047, "loss": 2.6917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.587757110595703, "rewards/margins": 3.688439130783081, "rewards/rejected": -28.276195526123047, "step": 28130 }, { "epoch": 0.94829620142236, "grad_norm": 47.610538482666016, "learning_rate": 8.121265336320572e-09, "logits/chosen": -1.3809497356414795, "logits/rejected": -1.3594849109649658, "logps/chosen": -2.757594108581543, "logps/rejected": -3.185593605041504, "loss": 2.5558, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.575939178466797, "rewards/margins": 4.279994964599609, "rewards/rejected": -31.855932235717773, "step": 28135 }, { "epoch": 0.9484647274933432, "grad_norm": 61.344520568847656, "learning_rate": 8.068552479270519e-09, "logits/chosen": -2.294250249862671, "logits/rejected": -2.034433603286743, "logps/chosen": -3.1079373359680176, "logps/rejected": -3.056140899658203, "loss": 4.0428, "rewards/accuracies": 0.5, "rewards/chosen": -31.079376220703125, "rewards/margins": -0.5179659128189087, "rewards/rejected": -30.5614070892334, "step": 28140 }, { "epoch": 0.9486332535643264, "grad_norm": 33.442928314208984, "learning_rate": 8.016009860313089e-09, "logits/chosen": -1.8951114416122437, "logits/rejected": -2.076521635055542, "logps/chosen": -2.8847079277038574, "logps/rejected": -3.0558080673217773, "loss": 2.3421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.847076416015625, "rewards/margins": 1.7110036611557007, "rewards/rejected": -30.558080673217773, "step": 28145 }, { "epoch": 0.9488017796353095, "grad_norm": 28.92125701904297, "learning_rate": 7.963637497631237e-09, "logits/chosen": -2.5039522647857666, "logits/rejected": -2.384413242340088, "logps/chosen": -2.762303352355957, "logps/rejected": -3.159745454788208, "loss": 1.1603, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.623035430908203, "rewards/margins": 3.9744210243225098, "rewards/rejected": -31.597454071044922, "step": 28150 }, { "epoch": 0.9489703057062928, "grad_norm": 18.608285903930664, "learning_rate": 7.91143540934902e-09, "logits/chosen": -1.7014240026474, "logits/rejected": -2.3139872550964355, "logps/chosen": -3.1759629249572754, "logps/rejected": -3.790762424468994, "loss": 2.2991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.759624481201172, "rewards/margins": 6.147995948791504, "rewards/rejected": -37.907623291015625, "step": 28155 }, { "epoch": 0.949138831777276, "grad_norm": 58.06821060180664, "learning_rate": 7.859403613531546e-09, "logits/chosen": -1.2894471883773804, "logits/rejected": -1.6731243133544922, "logps/chosen": -2.6358516216278076, "logps/rejected": -3.137530565261841, "loss": 2.8082, "rewards/accuracies": 0.5, "rewards/chosen": -26.358516693115234, "rewards/margins": 5.01678991317749, "rewards/rejected": -31.37530517578125, "step": 28160 }, { "epoch": 0.9493073578482591, "grad_norm": 24.11583709716797, "learning_rate": 7.807542128184852e-09, "logits/chosen": -2.01277756690979, "logits/rejected": -2.4013025760650635, "logps/chosen": -1.914523720741272, "logps/rejected": -2.132474184036255, "loss": 2.4283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.14523696899414, "rewards/margins": 2.179502487182617, "rewards/rejected": -21.324739456176758, "step": 28165 }, { "epoch": 0.9494758839192423, "grad_norm": 37.180076599121094, "learning_rate": 7.75585097125625e-09, "logits/chosen": -1.6620514392852783, "logits/rejected": -2.2578060626983643, "logps/chosen": -2.9266459941864014, "logps/rejected": -3.3216185569763184, "loss": 2.0754, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.266460418701172, "rewards/margins": 3.949725389480591, "rewards/rejected": -33.2161865234375, "step": 28170 }, { "epoch": 0.9496444099902255, "grad_norm": 17.247196197509766, "learning_rate": 7.704330160633987e-09, "logits/chosen": -1.8131141662597656, "logits/rejected": -2.2728631496429443, "logps/chosen": -2.863938808441162, "logps/rejected": -3.2482426166534424, "loss": 2.2238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.639385223388672, "rewards/margins": 3.8430371284484863, "rewards/rejected": -32.482421875, "step": 28175 }, { "epoch": 0.9498129360612086, "grad_norm": 24.438737869262695, "learning_rate": 7.652979714147357e-09, "logits/chosen": -2.0364794731140137, "logits/rejected": -2.189481019973755, "logps/chosen": -2.011589288711548, "logps/rejected": -1.904343605041504, "loss": 4.4351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.11589241027832, "rewards/margins": -1.0724555253982544, "rewards/rejected": -19.04343605041504, "step": 28180 }, { "epoch": 0.9499814621321918, "grad_norm": 21.459518432617188, "learning_rate": 7.601799649566699e-09, "logits/chosen": -1.9324400424957275, "logits/rejected": -1.7644433975219727, "logps/chosen": -2.0944879055023193, "logps/rejected": -2.2868826389312744, "loss": 2.7438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.94487953186035, "rewards/margins": 1.9239448308944702, "rewards/rejected": -22.868825912475586, "step": 28185 }, { "epoch": 0.950149988203175, "grad_norm": 41.32154083251953, "learning_rate": 7.550789984603512e-09, "logits/chosen": -1.8872880935668945, "logits/rejected": -2.033036470413208, "logps/chosen": -2.2491378784179688, "logps/rejected": -2.213416337966919, "loss": 3.4831, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.49138069152832, "rewards/margins": -0.3572167456150055, "rewards/rejected": -22.1341609954834, "step": 28190 }, { "epoch": 0.9503185142741583, "grad_norm": 60.450828552246094, "learning_rate": 7.499950736910232e-09, "logits/chosen": -1.3878852128982544, "logits/rejected": -1.6997029781341553, "logps/chosen": -1.9645227193832397, "logps/rejected": -2.1677870750427246, "loss": 2.4453, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.64522933959961, "rewards/margins": 2.032642364501953, "rewards/rejected": -21.677871704101562, "step": 28195 }, { "epoch": 0.9504870403451414, "grad_norm": 24.949216842651367, "learning_rate": 7.449281924080231e-09, "logits/chosen": -2.2458033561706543, "logits/rejected": -2.445981502532959, "logps/chosen": -1.9202144145965576, "logps/rejected": -2.4266836643218994, "loss": 1.3608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.202144622802734, "rewards/margins": 5.064691066741943, "rewards/rejected": -24.266836166381836, "step": 28200 }, { "epoch": 0.9506555664161246, "grad_norm": 202.5205535888672, "learning_rate": 7.398783563648037e-09, "logits/chosen": -1.5952340364456177, "logits/rejected": -1.6486787796020508, "logps/chosen": -2.4221603870391846, "logps/rejected": -2.4342808723449707, "loss": 5.1397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.22160530090332, "rewards/margins": 0.12120513617992401, "rewards/rejected": -24.34280776977539, "step": 28205 }, { "epoch": 0.9508240924871078, "grad_norm": 33.384708404541016, "learning_rate": 7.348455673089171e-09, "logits/chosen": -1.8759981393814087, "logits/rejected": -1.9951766729354858, "logps/chosen": -1.9931285381317139, "logps/rejected": -2.0459671020507812, "loss": 3.0892, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -19.93128776550293, "rewards/margins": 0.5283831357955933, "rewards/rejected": -20.459671020507812, "step": 28210 }, { "epoch": 0.9509926185580909, "grad_norm": 34.56550598144531, "learning_rate": 7.298298269820091e-09, "logits/chosen": -1.8990745544433594, "logits/rejected": -1.8381602764129639, "logps/chosen": -2.5596415996551514, "logps/rejected": -2.999729633331299, "loss": 3.3995, "rewards/accuracies": 0.5, "rewards/chosen": -25.596416473388672, "rewards/margins": 4.40087890625, "rewards/rejected": -29.997295379638672, "step": 28215 }, { "epoch": 0.9511611446290741, "grad_norm": 73.5636215209961, "learning_rate": 7.248311371198246e-09, "logits/chosen": -1.5171552896499634, "logits/rejected": -2.3751957416534424, "logps/chosen": -2.4360406398773193, "logps/rejected": -4.005704402923584, "loss": 1.3572, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.360408782958984, "rewards/margins": 15.696635246276855, "rewards/rejected": -40.057044982910156, "step": 28220 }, { "epoch": 0.9513296707000573, "grad_norm": 72.0538101196289, "learning_rate": 7.198494994522242e-09, "logits/chosen": -2.095430374145508, "logits/rejected": -1.976109266281128, "logps/chosen": -3.0543994903564453, "logps/rejected": -3.1049439907073975, "loss": 3.3299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.543996810913086, "rewards/margins": 0.5054424405097961, "rewards/rejected": -31.0494384765625, "step": 28225 }, { "epoch": 0.9514981967710405, "grad_norm": 42.48782730102539, "learning_rate": 7.1488491570315116e-09, "logits/chosen": -2.2845242023468018, "logits/rejected": -2.450240135192871, "logps/chosen": -2.590700626373291, "logps/rejected": -2.9526290893554688, "loss": 2.2784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.907007217407227, "rewards/margins": 3.619286298751831, "rewards/rejected": -29.526294708251953, "step": 28230 }, { "epoch": 0.9516667228420237, "grad_norm": 26.9639949798584, "learning_rate": 7.099373875906534e-09, "logits/chosen": -1.91399347782135, "logits/rejected": -1.7625631093978882, "logps/chosen": -2.7082173824310303, "logps/rejected": -2.8804149627685547, "loss": 2.7748, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.08217430114746, "rewards/margins": 1.7219760417938232, "rewards/rejected": -28.804149627685547, "step": 28235 }, { "epoch": 0.9518352489130069, "grad_norm": 0.7547553777694702, "learning_rate": 7.050069168268724e-09, "logits/chosen": -1.9236023426055908, "logits/rejected": -2.384753704071045, "logps/chosen": -2.392350435256958, "logps/rejected": -2.638664484024048, "loss": 3.2697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.92350196838379, "rewards/margins": 2.4631431102752686, "rewards/rejected": -26.386646270751953, "step": 28240 }, { "epoch": 0.95200377498399, "grad_norm": 44.27131271362305, "learning_rate": 7.000935051180546e-09, "logits/chosen": -1.9845330715179443, "logits/rejected": -2.276456117630005, "logps/chosen": -2.1075778007507324, "logps/rejected": -2.3886475563049316, "loss": 1.9211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.07577896118164, "rewards/margins": 2.810696840286255, "rewards/rejected": -23.886472702026367, "step": 28245 }, { "epoch": 0.9521723010549732, "grad_norm": 195.17190551757812, "learning_rate": 6.951971541645341e-09, "logits/chosen": -2.1520159244537354, "logits/rejected": -2.5742106437683105, "logps/chosen": -1.8013156652450562, "logps/rejected": -1.940410852432251, "loss": 2.6314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.01315689086914, "rewards/margins": 1.3909523487091064, "rewards/rejected": -19.404109954833984, "step": 28250 }, { "epoch": 0.9523408271259564, "grad_norm": 36.765411376953125, "learning_rate": 6.9031786566075e-09, "logits/chosen": -1.9867013692855835, "logits/rejected": -2.1289658546447754, "logps/chosen": -1.9412078857421875, "logps/rejected": -1.8753058910369873, "loss": 4.0751, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.412078857421875, "rewards/margins": -0.6590215563774109, "rewards/rejected": -18.7530574798584, "step": 28255 }, { "epoch": 0.9525093531969395, "grad_norm": 26.724512100219727, "learning_rate": 6.854556412952239e-09, "logits/chosen": -2.8886868953704834, "logits/rejected": -2.6446754932403564, "logps/chosen": -2.0121846199035645, "logps/rejected": -2.0627729892730713, "loss": 4.1562, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.121845245361328, "rewards/margins": 0.5058833360671997, "rewards/rejected": -20.627731323242188, "step": 28260 }, { "epoch": 0.9526778792679228, "grad_norm": 29.14841079711914, "learning_rate": 6.806104827505932e-09, "logits/chosen": -1.601732850074768, "logits/rejected": -1.7263450622558594, "logps/chosen": -2.5978128910064697, "logps/rejected": -2.745781421661377, "loss": 3.0066, "rewards/accuracies": 0.5, "rewards/chosen": -25.97812843322754, "rewards/margins": 1.4796851873397827, "rewards/rejected": -27.457813262939453, "step": 28265 }, { "epoch": 0.952846405338906, "grad_norm": 55.97766876220703, "learning_rate": 6.75782391703561e-09, "logits/chosen": -1.4899791479110718, "logits/rejected": -1.2709168195724487, "logps/chosen": -2.8961353302001953, "logps/rejected": -2.7889018058776855, "loss": 4.4169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.961355209350586, "rewards/margins": -1.072336196899414, "rewards/rejected": -27.88901710510254, "step": 28270 }, { "epoch": 0.9530149314098891, "grad_norm": 30.594099044799805, "learning_rate": 6.709713698249464e-09, "logits/chosen": -2.045694589614868, "logits/rejected": -1.9917447566986084, "logps/chosen": -2.5798110961914062, "logps/rejected": -2.8535544872283936, "loss": 2.7851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.798110961914062, "rewards/margins": 2.7374346256256104, "rewards/rejected": -28.535547256469727, "step": 28275 }, { "epoch": 0.9531834574808723, "grad_norm": 44.43939208984375, "learning_rate": 6.66177418779651e-09, "logits/chosen": -1.5889904499053955, "logits/rejected": -1.861187219619751, "logps/chosen": -2.142317295074463, "logps/rejected": -2.2729101181030273, "loss": 2.6055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.423171997070312, "rewards/margins": 1.305925965309143, "rewards/rejected": -22.72909927368164, "step": 28280 }, { "epoch": 0.9533519835518555, "grad_norm": 77.94812774658203, "learning_rate": 6.614005402266809e-09, "logits/chosen": -2.1064066886901855, "logits/rejected": -2.0448784828186035, "logps/chosen": -2.8452343940734863, "logps/rejected": -2.7356557846069336, "loss": 4.3889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.452342987060547, "rewards/margins": -1.0957868099212646, "rewards/rejected": -27.356555938720703, "step": 28285 }, { "epoch": 0.9535205096228386, "grad_norm": 34.475746154785156, "learning_rate": 6.566407358191195e-09, "logits/chosen": -1.9868141412734985, "logits/rejected": -1.904761552810669, "logps/chosen": -2.3303680419921875, "logps/rejected": -2.568265199661255, "loss": 2.4678, "rewards/accuracies": 0.5, "rewards/chosen": -23.303680419921875, "rewards/margins": 2.37896990776062, "rewards/rejected": -25.68265151977539, "step": 28290 }, { "epoch": 0.9536890356938218, "grad_norm": 32.694644927978516, "learning_rate": 6.5189800720415465e-09, "logits/chosen": -1.7498209476470947, "logits/rejected": -1.8394416570663452, "logps/chosen": -2.193845272064209, "logps/rejected": -2.44431734085083, "loss": 1.7126, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.938451766967773, "rewards/margins": 2.5047202110290527, "rewards/rejected": -24.443172454833984, "step": 28295 }, { "epoch": 0.953857561764805, "grad_norm": 19.778499603271484, "learning_rate": 6.471723560230458e-09, "logits/chosen": -1.4242563247680664, "logits/rejected": -1.4941645860671997, "logps/chosen": -2.2942872047424316, "logps/rejected": -2.5150489807128906, "loss": 2.0971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.942873001098633, "rewards/margins": 2.2076144218444824, "rewards/rejected": -25.150487899780273, "step": 28300 }, { "epoch": 0.9540260878357882, "grad_norm": 167.14210510253906, "learning_rate": 6.424637839111624e-09, "logits/chosen": -1.7573789358139038, "logits/rejected": -2.0239624977111816, "logps/chosen": -3.1321041584014893, "logps/rejected": -3.244879961013794, "loss": 2.6003, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.3210391998291, "rewards/margins": 1.127755880355835, "rewards/rejected": -32.44879913330078, "step": 28305 }, { "epoch": 0.9541946139067714, "grad_norm": 37.638282775878906, "learning_rate": 6.3777229249795114e-09, "logits/chosen": -2.3085360527038574, "logits/rejected": -2.0975089073181152, "logps/chosen": -2.0547664165496826, "logps/rejected": -2.1216301918029785, "loss": 4.1529, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -20.54766273498535, "rewards/margins": 0.6686397790908813, "rewards/rejected": -21.216304779052734, "step": 28310 }, { "epoch": 0.9543631399777546, "grad_norm": 52.93343734741211, "learning_rate": 6.330978834069578e-09, "logits/chosen": -1.2301470041275024, "logits/rejected": -1.3939309120178223, "logps/chosen": -2.5767438411712646, "logps/rejected": -2.3654720783233643, "loss": 5.5037, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.767436981201172, "rewards/margins": -2.112718105316162, "rewards/rejected": -23.654720306396484, "step": 28315 }, { "epoch": 0.9545316660487377, "grad_norm": 58.48306655883789, "learning_rate": 6.284405582558106e-09, "logits/chosen": -1.9192569255828857, "logits/rejected": -2.261803150177002, "logps/chosen": -2.3037891387939453, "logps/rejected": -2.5604360103607178, "loss": 3.1239, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.037891387939453, "rewards/margins": 2.5664706230163574, "rewards/rejected": -25.604360580444336, "step": 28320 }, { "epoch": 0.9547001921197209, "grad_norm": 49.80613327026367, "learning_rate": 6.2380031865622015e-09, "logits/chosen": -2.0661492347717285, "logits/rejected": -1.951005220413208, "logps/chosen": -2.4906983375549316, "logps/rejected": -3.1569619178771973, "loss": 4.9785, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.906984329223633, "rewards/margins": 6.662638187408447, "rewards/rejected": -31.56962013244629, "step": 28325 }, { "epoch": 0.9548687181907041, "grad_norm": 24.512651443481445, "learning_rate": 6.191771662140022e-09, "logits/chosen": -1.8547757863998413, "logits/rejected": -2.033181667327881, "logps/chosen": -2.024604320526123, "logps/rejected": -2.131096601486206, "loss": 2.4034, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.246042251586914, "rewards/margins": 1.0649257898330688, "rewards/rejected": -21.31096839904785, "step": 28330 }, { "epoch": 0.9550372442616872, "grad_norm": 23.979116439819336, "learning_rate": 6.145711025290323e-09, "logits/chosen": -2.1622776985168457, "logits/rejected": -2.1714184284210205, "logps/chosen": -2.2897789478302, "logps/rejected": -2.0312511920928955, "loss": 5.7362, "rewards/accuracies": 0.5, "rewards/chosen": -22.897790908813477, "rewards/margins": -2.585280179977417, "rewards/rejected": -20.312509536743164, "step": 28335 }, { "epoch": 0.9552057703326705, "grad_norm": 26.935739517211914, "learning_rate": 6.099821291952967e-09, "logits/chosen": -2.1228065490722656, "logits/rejected": -1.8896598815917969, "logps/chosen": -2.216864824295044, "logps/rejected": -2.186297655105591, "loss": 3.8784, "rewards/accuracies": 0.5, "rewards/chosen": -22.16864585876465, "rewards/margins": -0.30566978454589844, "rewards/rejected": -21.86297607421875, "step": 28340 }, { "epoch": 0.9553742964036537, "grad_norm": 42.45960235595703, "learning_rate": 6.0541024780085824e-09, "logits/chosen": -1.6388975381851196, "logits/rejected": -1.9698559045791626, "logps/chosen": -2.4808647632598877, "logps/rejected": -2.721128225326538, "loss": 2.4064, "rewards/accuracies": 0.5, "rewards/chosen": -24.80864906311035, "rewards/margins": 2.4026341438293457, "rewards/rejected": -27.21128273010254, "step": 28345 }, { "epoch": 0.9555428224746368, "grad_norm": 39.344261169433594, "learning_rate": 6.008554599278681e-09, "logits/chosen": -1.4452064037322998, "logits/rejected": -1.2238776683807373, "logps/chosen": -2.0691027641296387, "logps/rejected": -2.0181994438171387, "loss": 4.1986, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -20.691028594970703, "rewards/margins": -0.5090330839157104, "rewards/rejected": -20.181995391845703, "step": 28350 }, { "epoch": 0.95571134854562, "grad_norm": 25.936553955078125, "learning_rate": 5.9631776715254876e-09, "logits/chosen": -1.562342643737793, "logits/rejected": -1.6943756341934204, "logps/chosen": -1.931775450706482, "logps/rejected": -2.0938315391540527, "loss": 2.2773, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.3177547454834, "rewards/margins": 1.6205610036849976, "rewards/rejected": -20.938316345214844, "step": 28355 }, { "epoch": 0.9558798746166032, "grad_norm": 18.464405059814453, "learning_rate": 5.917971710452274e-09, "logits/chosen": -1.4668428897857666, "logits/rejected": -1.9650462865829468, "logps/chosen": -2.674292802810669, "logps/rejected": -3.530560255050659, "loss": 1.4807, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.742929458618164, "rewards/margins": 8.562676429748535, "rewards/rejected": -35.30560302734375, "step": 28360 }, { "epoch": 0.9560484006875863, "grad_norm": 20.47051239013672, "learning_rate": 5.872936731702971e-09, "logits/chosen": -2.486109972000122, "logits/rejected": -2.605259656906128, "logps/chosen": -2.237062692642212, "logps/rejected": -2.565539598464966, "loss": 1.8784, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.37062644958496, "rewards/margins": 3.2847702503204346, "rewards/rejected": -25.6553955078125, "step": 28365 }, { "epoch": 0.9562169267585695, "grad_norm": 96.48336791992188, "learning_rate": 5.828072750862445e-09, "logits/chosen": -1.6551952362060547, "logits/rejected": -1.9391210079193115, "logps/chosen": -2.6091928482055664, "logps/rejected": -2.7079176902770996, "loss": 3.4794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.091930389404297, "rewards/margins": 0.9872471690177917, "rewards/rejected": -27.079174041748047, "step": 28370 }, { "epoch": 0.9563854528295528, "grad_norm": 13.303318977355957, "learning_rate": 5.783379783456332e-09, "logits/chosen": -1.881603479385376, "logits/rejected": -2.7143008708953857, "logps/chosen": -2.1637027263641357, "logps/rejected": -3.5254616737365723, "loss": 1.3403, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.637027740478516, "rewards/margins": 13.617584228515625, "rewards/rejected": -35.254615783691406, "step": 28375 }, { "epoch": 0.956553978900536, "grad_norm": 0.08836426585912704, "learning_rate": 5.738857844951095e-09, "logits/chosen": -1.431921124458313, "logits/rejected": -1.5725934505462646, "logps/chosen": -2.3832743167877197, "logps/rejected": -2.58109188079834, "loss": 2.9704, "rewards/accuracies": 0.5, "rewards/chosen": -23.832740783691406, "rewards/margins": 1.9781758785247803, "rewards/rejected": -25.8109188079834, "step": 28380 }, { "epoch": 0.9567225049715191, "grad_norm": 21.63187026977539, "learning_rate": 5.69450695075413e-09, "logits/chosen": -1.9501020908355713, "logits/rejected": -2.476367712020874, "logps/chosen": -2.1850900650024414, "logps/rejected": -2.6595330238342285, "loss": 1.6813, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.850900650024414, "rewards/margins": 4.7444257736206055, "rewards/rejected": -26.595327377319336, "step": 28385 }, { "epoch": 0.9568910310425023, "grad_norm": 26.18231964111328, "learning_rate": 5.650327116213383e-09, "logits/chosen": -1.7134357690811157, "logits/rejected": -2.0885303020477295, "logps/chosen": -2.298783540725708, "logps/rejected": -2.608452558517456, "loss": 3.0266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.987834930419922, "rewards/margins": 3.0966885089874268, "rewards/rejected": -26.084524154663086, "step": 28390 }, { "epoch": 0.9570595571134854, "grad_norm": 30.45747184753418, "learning_rate": 5.6063183566177894e-09, "logits/chosen": -2.2303929328918457, "logits/rejected": -2.2387430667877197, "logps/chosen": -2.4746217727661133, "logps/rejected": -2.5235071182250977, "loss": 2.9599, "rewards/accuracies": 0.5, "rewards/chosen": -24.7462158203125, "rewards/margins": 0.48885470628738403, "rewards/rejected": -25.235071182250977, "step": 28395 }, { "epoch": 0.9572280831844686, "grad_norm": 25.11550521850586, "learning_rate": 5.562480687197169e-09, "logits/chosen": -1.8280874490737915, "logits/rejected": -2.0225062370300293, "logps/chosen": -2.3772523403167725, "logps/rejected": -2.5516517162323, "loss": 2.5931, "rewards/accuracies": 0.5, "rewards/chosen": -23.772525787353516, "rewards/margins": 1.7439934015274048, "rewards/rejected": -25.516517639160156, "step": 28400 }, { "epoch": 0.9572280831844686, "eval_logits/chosen": -2.3117282390594482, "eval_logits/rejected": -2.4906811714172363, "eval_logps/chosen": -2.2893803119659424, "eval_logps/rejected": -2.444187641143799, "eval_loss": 3.087453603744507, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.8938045501709, "eval_rewards/margins": 1.5480728149414062, "eval_rewards/rejected": -24.441877365112305, "eval_runtime": 12.8967, "eval_samples_per_second": 7.754, "eval_steps_per_second": 1.938, "step": 28400 }, { "epoch": 0.9573966092554518, "grad_norm": 38.26020812988281, "learning_rate": 5.518814123121884e-09, "logits/chosen": -1.7564789056777954, "logits/rejected": -2.1984901428222656, "logps/chosen": -2.915069341659546, "logps/rejected": -3.5751430988311768, "loss": 2.0598, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.15069580078125, "rewards/margins": 6.60073709487915, "rewards/rejected": -35.75143051147461, "step": 28405 }, { "epoch": 0.957565135326435, "grad_norm": 22.221555709838867, "learning_rate": 5.475318679503238e-09, "logits/chosen": -1.768252968788147, "logits/rejected": -2.038252353668213, "logps/chosen": -2.356065034866333, "logps/rejected": -2.307173252105713, "loss": 3.9019, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.560649871826172, "rewards/margins": -0.4889196455478668, "rewards/rejected": -23.07172966003418, "step": 28410 }, { "epoch": 0.9577336613974182, "grad_norm": 33.3132438659668, "learning_rate": 5.4319943713933e-09, "logits/chosen": -2.22514009475708, "logits/rejected": -2.325124979019165, "logps/chosen": -2.7837955951690674, "logps/rejected": -3.3888633251190186, "loss": 1.191, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -27.837955474853516, "rewards/margins": 6.050678253173828, "rewards/rejected": -33.888633728027344, "step": 28415 }, { "epoch": 0.9579021874684014, "grad_norm": 0.19005419313907623, "learning_rate": 5.388841213784911e-09, "logits/chosen": -1.3929895162582397, "logits/rejected": -1.7694988250732422, "logps/chosen": -2.0488882064819336, "logps/rejected": -2.4571712017059326, "loss": 1.7978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.48888397216797, "rewards/margins": 4.082829475402832, "rewards/rejected": -24.571712493896484, "step": 28420 }, { "epoch": 0.9580707135393846, "grad_norm": 56.7274169921875, "learning_rate": 5.345859221611626e-09, "logits/chosen": -2.0072145462036133, "logits/rejected": -1.8055555820465088, "logps/chosen": -3.5600218772888184, "logps/rejected": -3.5715057849884033, "loss": 4.3241, "rewards/accuracies": 0.5, "rewards/chosen": -35.600215911865234, "rewards/margins": 0.11484356224536896, "rewards/rejected": -35.71506118774414, "step": 28425 }, { "epoch": 0.9582392396103677, "grad_norm": 50.16498565673828, "learning_rate": 5.30304840974799e-09, "logits/chosen": -1.5627635717391968, "logits/rejected": -1.7483123540878296, "logps/chosen": -2.061707019805908, "logps/rejected": -2.066906452178955, "loss": 3.4939, "rewards/accuracies": 0.5, "rewards/chosen": -20.6170711517334, "rewards/margins": 0.05199117586016655, "rewards/rejected": -20.669063568115234, "step": 28430 }, { "epoch": 0.9584077656813509, "grad_norm": 153.0499267578125, "learning_rate": 5.26040879300893e-09, "logits/chosen": -1.2752379179000854, "logits/rejected": -1.482933759689331, "logps/chosen": -2.8088629245758057, "logps/rejected": -3.7246298789978027, "loss": 4.4737, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.0886287689209, "rewards/margins": 9.157669067382812, "rewards/rejected": -37.246299743652344, "step": 28435 }, { "epoch": 0.958576291752334, "grad_norm": 13.519747734069824, "learning_rate": 5.2179403861504215e-09, "logits/chosen": -1.7635982036590576, "logits/rejected": -1.942718267440796, "logps/chosen": -2.7124428749084473, "logps/rejected": -3.15181303024292, "loss": 2.2837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.12442970275879, "rewards/margins": 4.39370059967041, "rewards/rejected": -31.518131256103516, "step": 28440 }, { "epoch": 0.9587448178233172, "grad_norm": 17.492719650268555, "learning_rate": 5.175643203869151e-09, "logits/chosen": -1.8395544290542603, "logits/rejected": -2.241781711578369, "logps/chosen": -2.3206355571746826, "logps/rejected": -2.714784622192383, "loss": 2.0138, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.206356048583984, "rewards/margins": 3.9414877891540527, "rewards/rejected": -27.147846221923828, "step": 28445 }, { "epoch": 0.9589133438943005, "grad_norm": 3.5783886909484863, "learning_rate": 5.133517260802411e-09, "logits/chosen": -2.2439727783203125, "logits/rejected": -2.1783652305603027, "logps/chosen": -3.2132935523986816, "logps/rejected": -3.4586052894592285, "loss": 3.8842, "rewards/accuracies": 0.5, "rewards/chosen": -32.1329345703125, "rewards/margins": 2.453115940093994, "rewards/rejected": -34.5860481262207, "step": 28450 }, { "epoch": 0.9590818699652837, "grad_norm": 31.823118209838867, "learning_rate": 5.091562571528485e-09, "logits/chosen": -2.086142063140869, "logits/rejected": -2.2255921363830566, "logps/chosen": -2.3477437496185303, "logps/rejected": -2.6771368980407715, "loss": 2.753, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.47743797302246, "rewards/margins": 3.293931484222412, "rewards/rejected": -26.7713680267334, "step": 28455 }, { "epoch": 0.9592503960362668, "grad_norm": 33.51295852661133, "learning_rate": 5.049779150566036e-09, "logits/chosen": -1.6997692584991455, "logits/rejected": -2.0853042602539062, "logps/chosen": -2.2463183403015137, "logps/rejected": -3.2169253826141357, "loss": 2.017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.463180541992188, "rewards/margins": 9.706072807312012, "rewards/rejected": -32.169254302978516, "step": 28460 }, { "epoch": 0.95941892210725, "grad_norm": 19.38419532775879, "learning_rate": 5.008167012374831e-09, "logits/chosen": -1.8482658863067627, "logits/rejected": -2.0040698051452637, "logps/chosen": -3.1203415393829346, "logps/rejected": -3.2832634449005127, "loss": 2.8735, "rewards/accuracies": 0.5, "rewards/chosen": -31.203411102294922, "rewards/margins": 1.6292169094085693, "rewards/rejected": -32.8326301574707, "step": 28465 }, { "epoch": 0.9595874481782332, "grad_norm": 28.08884620666504, "learning_rate": 4.966726171355129e-09, "logits/chosen": -1.5639275312423706, "logits/rejected": -1.9470984935760498, "logps/chosen": -2.2116024494171143, "logps/rejected": -2.476243734359741, "loss": 2.7705, "rewards/accuracies": 0.5, "rewards/chosen": -22.116024017333984, "rewards/margins": 2.646414279937744, "rewards/rejected": -24.76243782043457, "step": 28470 }, { "epoch": 0.9597559742492163, "grad_norm": 31.719369888305664, "learning_rate": 4.925456641847903e-09, "logits/chosen": -1.650435209274292, "logits/rejected": -1.4346725940704346, "logps/chosen": -2.046463966369629, "logps/rejected": -2.1387128829956055, "loss": 2.9857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.46463966369629, "rewards/margins": 0.9224891662597656, "rewards/rejected": -21.387126922607422, "step": 28475 }, { "epoch": 0.9599245003201995, "grad_norm": 25.16029930114746, "learning_rate": 4.884358438135006e-09, "logits/chosen": -1.7380361557006836, "logits/rejected": -2.3135714530944824, "logps/chosen": -1.9699513912200928, "logps/rejected": -2.640869140625, "loss": 2.9684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.699514389038086, "rewards/margins": 6.709176540374756, "rewards/rejected": -26.40869140625, "step": 28480 }, { "epoch": 0.9600930263911828, "grad_norm": 17.687870025634766, "learning_rate": 4.843431574438839e-09, "logits/chosen": -2.181513547897339, "logits/rejected": -2.259943962097168, "logps/chosen": -2.7528865337371826, "logps/rejected": -3.143580198287964, "loss": 4.391, "rewards/accuracies": 0.5, "rewards/chosen": -27.52886390686035, "rewards/margins": 3.906938076019287, "rewards/rejected": -31.435800552368164, "step": 28485 }, { "epoch": 0.9602615524621659, "grad_norm": 170.2515869140625, "learning_rate": 4.802676064922684e-09, "logits/chosen": -1.9429032802581787, "logits/rejected": -1.7839523553848267, "logps/chosen": -3.687286853790283, "logps/rejected": -3.8839850425720215, "loss": 4.0808, "rewards/accuracies": 0.5, "rewards/chosen": -36.872867584228516, "rewards/margins": 1.9669809341430664, "rewards/rejected": -38.83985137939453, "step": 28490 }, { "epoch": 0.9604300785331491, "grad_norm": 29.234834671020508, "learning_rate": 4.762091923690315e-09, "logits/chosen": -1.4997103214263916, "logits/rejected": -1.640759825706482, "logps/chosen": -3.094761371612549, "logps/rejected": -2.9256839752197266, "loss": 5.9159, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -30.94761085510254, "rewards/margins": -1.6907707452774048, "rewards/rejected": -29.256839752197266, "step": 28495 }, { "epoch": 0.9605986046041323, "grad_norm": 48.139503479003906, "learning_rate": 4.721679164786329e-09, "logits/chosen": -1.8838342428207397, "logits/rejected": -1.9751946926116943, "logps/chosen": -2.078104257583618, "logps/rejected": -2.1125640869140625, "loss": 2.8742, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.781042098999023, "rewards/margins": 0.3445958197116852, "rewards/rejected": -21.125638961791992, "step": 28500 }, { "epoch": 0.9607671306751154, "grad_norm": 52.75870132446289, "learning_rate": 4.681437802196042e-09, "logits/chosen": -1.1743929386138916, "logits/rejected": -1.440617322921753, "logps/chosen": -2.2910919189453125, "logps/rejected": -2.7413864135742188, "loss": 1.6422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.910917282104492, "rewards/margins": 4.502947807312012, "rewards/rejected": -27.413867950439453, "step": 28505 }, { "epoch": 0.9609356567460986, "grad_norm": 81.56078338623047, "learning_rate": 4.641367849845312e-09, "logits/chosen": -2.3156750202178955, "logits/rejected": -2.689465045928955, "logps/chosen": -2.853290557861328, "logps/rejected": -3.101311683654785, "loss": 3.3697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.53290367126465, "rewards/margins": 2.480211019515991, "rewards/rejected": -31.01311683654785, "step": 28510 }, { "epoch": 0.9611041828170818, "grad_norm": 6.894532680511475, "learning_rate": 4.601469321600826e-09, "logits/chosen": -1.5325663089752197, "logits/rejected": -1.9350004196166992, "logps/chosen": -2.1749038696289062, "logps/rejected": -3.0541293621063232, "loss": 2.1314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.749038696289062, "rewards/margins": 8.792255401611328, "rewards/rejected": -30.541296005249023, "step": 28515 }, { "epoch": 0.9612727088880649, "grad_norm": 34.375911712646484, "learning_rate": 4.561742231269872e-09, "logits/chosen": -1.6561189889907837, "logits/rejected": -1.647509217262268, "logps/chosen": -1.992790937423706, "logps/rejected": -2.0441665649414062, "loss": 3.1003, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -19.92790985107422, "rewards/margins": 0.5137545466423035, "rewards/rejected": -20.441665649414062, "step": 28520 }, { "epoch": 0.9614412349590482, "grad_norm": 56.41557693481445, "learning_rate": 4.522186592600452e-09, "logits/chosen": -2.0667316913604736, "logits/rejected": -2.038256883621216, "logps/chosen": -2.5181336402893066, "logps/rejected": -2.4041855335235596, "loss": 4.1961, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.18133544921875, "rewards/margins": -1.139478325843811, "rewards/rejected": -24.041854858398438, "step": 28525 }, { "epoch": 0.9616097610300314, "grad_norm": 29.580322265625, "learning_rate": 4.482802419281229e-09, "logits/chosen": -1.4627254009246826, "logits/rejected": -1.1754380464553833, "logps/chosen": -1.9528896808624268, "logps/rejected": -1.9384548664093018, "loss": 3.9845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.52889633178711, "rewards/margins": -0.14434775710105896, "rewards/rejected": -19.38454818725586, "step": 28530 }, { "epoch": 0.9617782871010145, "grad_norm": 56.96183776855469, "learning_rate": 4.443589724941466e-09, "logits/chosen": -1.8529266119003296, "logits/rejected": -2.2691307067871094, "logps/chosen": -2.944936513900757, "logps/rejected": -3.49541974067688, "loss": 1.8121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.449365615844727, "rewards/margins": 5.504830837249756, "rewards/rejected": -34.95419692993164, "step": 28535 }, { "epoch": 0.9619468131719977, "grad_norm": 22.926856994628906, "learning_rate": 4.404548523151197e-09, "logits/chosen": -2.0277888774871826, "logits/rejected": -2.1954684257507324, "logps/chosen": -2.326841354370117, "logps/rejected": -2.5182197093963623, "loss": 2.5164, "rewards/accuracies": 0.5, "rewards/chosen": -23.268413543701172, "rewards/margins": 1.9137840270996094, "rewards/rejected": -25.18219566345215, "step": 28540 }, { "epoch": 0.9621153392429809, "grad_norm": 224.8534393310547, "learning_rate": 4.365678827420949e-09, "logits/chosen": -1.8096294403076172, "logits/rejected": -2.36387300491333, "logps/chosen": -2.040325164794922, "logps/rejected": -2.2415404319763184, "loss": 3.5418, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.40325164794922, "rewards/margins": 2.0121548175811768, "rewards/rejected": -22.4154052734375, "step": 28545 }, { "epoch": 0.962283865313964, "grad_norm": 60.661651611328125, "learning_rate": 4.326980651202072e-09, "logits/chosen": -2.061427354812622, "logits/rejected": -2.287824869155884, "logps/chosen": -2.8698368072509766, "logps/rejected": -3.1925554275512695, "loss": 1.8822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.6983642578125, "rewards/margins": 3.227189540863037, "rewards/rejected": -31.925556182861328, "step": 28550 }, { "epoch": 0.9624523913849472, "grad_norm": 38.10822677612305, "learning_rate": 4.28845400788641e-09, "logits/chosen": -1.6126525402069092, "logits/rejected": -1.8027045726776123, "logps/chosen": -3.1499922275543213, "logps/rejected": -3.1586296558380127, "loss": 5.7095, "rewards/accuracies": 0.5, "rewards/chosen": -31.499919891357422, "rewards/margins": 0.08637352287769318, "rewards/rejected": -31.5862979888916, "step": 28555 }, { "epoch": 0.9626209174559305, "grad_norm": 28.21145248413086, "learning_rate": 4.250098910806632e-09, "logits/chosen": -2.2559456825256348, "logits/rejected": -2.324834108352661, "logps/chosen": -2.5185751914978027, "logps/rejected": -2.8438072204589844, "loss": 3.1164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.18575096130371, "rewards/margins": 3.2523179054260254, "rewards/rejected": -28.43807029724121, "step": 28560 }, { "epoch": 0.9627894435269136, "grad_norm": 24.09324073791504, "learning_rate": 4.211915373235841e-09, "logits/chosen": -1.9755699634552002, "logits/rejected": -1.9022626876831055, "logps/chosen": -2.221282720565796, "logps/rejected": -2.3595638275146484, "loss": 3.3347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.21282958984375, "rewards/margins": 1.3828084468841553, "rewards/rejected": -23.59563636779785, "step": 28565 }, { "epoch": 0.9629579695978968, "grad_norm": 32.821205139160156, "learning_rate": 4.173903408387802e-09, "logits/chosen": -2.4099087715148926, "logits/rejected": -2.6462645530700684, "logps/chosen": -2.4751827716827393, "logps/rejected": -2.603869915008545, "loss": 2.7741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.7518310546875, "rewards/margins": 1.2868703603744507, "rewards/rejected": -26.0387020111084, "step": 28570 }, { "epoch": 0.96312649566888, "grad_norm": 38.95467758178711, "learning_rate": 4.136063029417103e-09, "logits/chosen": -1.8634698390960693, "logits/rejected": -2.089498996734619, "logps/chosen": -1.9672338962554932, "logps/rejected": -2.2207417488098145, "loss": 2.5031, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.672338485717773, "rewards/margins": 2.5350780487060547, "rewards/rejected": -22.207416534423828, "step": 28575 }, { "epoch": 0.9632950217398631, "grad_norm": 33.86354446411133, "learning_rate": 4.098394249418657e-09, "logits/chosen": -1.2706377506256104, "logits/rejected": -1.239839792251587, "logps/chosen": -1.8439422845840454, "logps/rejected": -1.8992751836776733, "loss": 2.6658, "rewards/accuracies": 0.5, "rewards/chosen": -18.439422607421875, "rewards/margins": 0.5533312559127808, "rewards/rejected": -18.992752075195312, "step": 28580 }, { "epoch": 0.9634635478108463, "grad_norm": 31.960468292236328, "learning_rate": 4.06089708142826e-09, "logits/chosen": -1.8972434997558594, "logits/rejected": -2.0411770343780518, "logps/chosen": -2.1596405506134033, "logps/rejected": -2.309891939163208, "loss": 2.1328, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.596405029296875, "rewards/margins": 1.5025140047073364, "rewards/rejected": -23.098918914794922, "step": 28585 }, { "epoch": 0.9636320738818295, "grad_norm": 15.974693298339844, "learning_rate": 4.023571538422199e-09, "logits/chosen": -1.7587133646011353, "logits/rejected": -1.7669118642807007, "logps/chosen": -2.5280303955078125, "logps/rejected": -2.8322720527648926, "loss": 1.4144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.280302047729492, "rewards/margins": 3.0424180030822754, "rewards/rejected": -28.32271957397461, "step": 28590 }, { "epoch": 0.9638005999528128, "grad_norm": 12.441394805908203, "learning_rate": 3.986417633317307e-09, "logits/chosen": -1.434709906578064, "logits/rejected": -1.4066721200942993, "logps/chosen": -2.8106179237365723, "logps/rejected": -2.5945558547973633, "loss": 5.7322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.106182098388672, "rewards/margins": -2.1606221199035645, "rewards/rejected": -25.945560455322266, "step": 28595 }, { "epoch": 0.9639691260237959, "grad_norm": 55.572017669677734, "learning_rate": 3.949435378971078e-09, "logits/chosen": -2.209183692932129, "logits/rejected": -2.1617417335510254, "logps/chosen": -2.475675106048584, "logps/rejected": -2.499035358428955, "loss": 3.7538, "rewards/accuracies": 0.5, "rewards/chosen": -24.75674819946289, "rewards/margins": 0.2336008995771408, "rewards/rejected": -24.9903507232666, "step": 28600 }, { "epoch": 0.9641376520947791, "grad_norm": 26.91092872619629, "learning_rate": 3.912624788181718e-09, "logits/chosen": -1.8822380304336548, "logits/rejected": -2.017254590988159, "logps/chosen": -2.8590264320373535, "logps/rejected": -3.2963809967041016, "loss": 1.8247, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.59026527404785, "rewards/margins": 4.373543739318848, "rewards/rejected": -32.96380615234375, "step": 28605 }, { "epoch": 0.9643061781657623, "grad_norm": 26.15085220336914, "learning_rate": 3.875985873687815e-09, "logits/chosen": -1.9373159408569336, "logits/rejected": -2.321394443511963, "logps/chosen": -2.242321491241455, "logps/rejected": -3.103455066680908, "loss": 2.3169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.423213958740234, "rewards/margins": 8.611337661743164, "rewards/rejected": -31.0345516204834, "step": 28610 }, { "epoch": 0.9644747042367454, "grad_norm": 37.95960235595703, "learning_rate": 3.839518648168727e-09, "logits/chosen": -1.6489719152450562, "logits/rejected": -1.6028999090194702, "logps/chosen": -2.2515532970428467, "logps/rejected": -2.245664119720459, "loss": 3.6217, "rewards/accuracies": 0.5, "rewards/chosen": -22.515533447265625, "rewards/margins": -0.05889282375574112, "rewards/rejected": -22.456640243530273, "step": 28615 }, { "epoch": 0.9646432303077286, "grad_norm": 19.891639709472656, "learning_rate": 3.803223124244248e-09, "logits/chosen": -1.575360894203186, "logits/rejected": -1.8237911462783813, "logps/chosen": -2.147665500640869, "logps/rejected": -2.6151740550994873, "loss": 2.6801, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.476654052734375, "rewards/margins": 4.675088405609131, "rewards/rejected": -26.151742935180664, "step": 28620 }, { "epoch": 0.9648117563787117, "grad_norm": 59.24190902709961, "learning_rate": 3.767099314474887e-09, "logits/chosen": -2.1034979820251465, "logits/rejected": -2.959904670715332, "logps/chosen": -3.063356637954712, "logps/rejected": -5.073480129241943, "loss": 3.1896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.633569717407227, "rewards/margins": 20.10123062133789, "rewards/rejected": -50.73479461669922, "step": 28625 }, { "epoch": 0.9649802824496949, "grad_norm": 19.20398712158203, "learning_rate": 3.731147231361698e-09, "logits/chosen": -1.7048972845077515, "logits/rejected": -1.8632335662841797, "logps/chosen": -2.008641481399536, "logps/rejected": -2.1975979804992676, "loss": 1.9952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.086414337158203, "rewards/margins": 1.8895690441131592, "rewards/rejected": -21.975982666015625, "step": 28630 }, { "epoch": 0.9651488085206782, "grad_norm": 31.66425895690918, "learning_rate": 3.695366887346174e-09, "logits/chosen": -1.8439382314682007, "logits/rejected": -2.2968287467956543, "logps/chosen": -1.9383208751678467, "logps/rejected": -2.144949436187744, "loss": 3.0275, "rewards/accuracies": 0.5, "rewards/chosen": -19.383209228515625, "rewards/margins": 2.066281795501709, "rewards/rejected": -21.44948959350586, "step": 28635 }, { "epoch": 0.9653173345916614, "grad_norm": 39.83387756347656, "learning_rate": 3.6597582948105774e-09, "logits/chosen": -1.7409656047821045, "logits/rejected": -1.9152864217758179, "logps/chosen": -2.375166654586792, "logps/rejected": -2.440739393234253, "loss": 3.1421, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.75166893005371, "rewards/margins": 0.6557281613349915, "rewards/rejected": -24.407394409179688, "step": 28640 }, { "epoch": 0.9654858606626445, "grad_norm": 90.26930236816406, "learning_rate": 3.624321466077662e-09, "logits/chosen": -1.8725887537002563, "logits/rejected": -2.0449306964874268, "logps/chosen": -3.7120277881622314, "logps/rejected": -3.5196595191955566, "loss": 6.2949, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -37.120277404785156, "rewards/margins": -1.9236809015274048, "rewards/rejected": -35.19659423828125, "step": 28645 }, { "epoch": 0.9656543867336277, "grad_norm": 33.77092361450195, "learning_rate": 3.589056413410563e-09, "logits/chosen": -1.4964176416397095, "logits/rejected": -1.619513750076294, "logps/chosen": -1.9811350107192993, "logps/rejected": -2.0898003578186035, "loss": 2.4862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.811349868774414, "rewards/margins": 1.0866527557373047, "rewards/rejected": -20.898000717163086, "step": 28650 }, { "epoch": 0.9658229128046109, "grad_norm": 25.501209259033203, "learning_rate": 3.553963149013295e-09, "logits/chosen": -1.4640041589736938, "logits/rejected": -1.5587005615234375, "logps/chosen": -2.191490650177002, "logps/rejected": -2.5473177433013916, "loss": 1.2411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.914907455444336, "rewards/margins": 3.558271884918213, "rewards/rejected": -25.47317886352539, "step": 28655 }, { "epoch": 0.965991438875594, "grad_norm": 25.410991668701172, "learning_rate": 3.5190416850301998e-09, "logits/chosen": -1.6539732217788696, "logits/rejected": -1.6497859954833984, "logps/chosen": -1.9782453775405884, "logps/rejected": -2.0479655265808105, "loss": 2.602, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.782453536987305, "rewards/margins": 0.697202205657959, "rewards/rejected": -20.479656219482422, "step": 28660 }, { "epoch": 0.9661599649465772, "grad_norm": 18.554931640625, "learning_rate": 3.484292033546166e-09, "logits/chosen": -1.9503523111343384, "logits/rejected": -2.0218005180358887, "logps/chosen": -2.2136752605438232, "logps/rejected": -2.3350918292999268, "loss": 2.4669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.13675308227539, "rewards/margins": 1.2141668796539307, "rewards/rejected": -23.35091781616211, "step": 28665 }, { "epoch": 0.9663284910175605, "grad_norm": 29.587387084960938, "learning_rate": 3.44971420658674e-09, "logits/chosen": -2.254612684249878, "logits/rejected": -2.438527822494507, "logps/chosen": -1.7682807445526123, "logps/rejected": -2.1252379417419434, "loss": 2.1373, "rewards/accuracies": 0.5, "rewards/chosen": -17.68280601501465, "rewards/margins": 3.569573163986206, "rewards/rejected": -21.25238037109375, "step": 28670 }, { "epoch": 0.9664970170885436, "grad_norm": 41.48356628417969, "learning_rate": 3.415308216117907e-09, "logits/chosen": -2.4520583152770996, "logits/rejected": -2.6164937019348145, "logps/chosen": -3.1903624534606934, "logps/rejected": -3.697064161300659, "loss": 1.6011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.90362548828125, "rewards/margins": 5.067018985748291, "rewards/rejected": -36.970645904541016, "step": 28675 }, { "epoch": 0.9666655431595268, "grad_norm": 29.71332359313965, "learning_rate": 3.3810740740463086e-09, "logits/chosen": -1.7559601068496704, "logits/rejected": -1.8875644207000732, "logps/chosen": -2.5920846462249756, "logps/rejected": -2.9399256706237793, "loss": 2.3224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.920846939086914, "rewards/margins": 3.4784131050109863, "rewards/rejected": -29.39925765991211, "step": 28680 }, { "epoch": 0.96683406923051, "grad_norm": 21.529844284057617, "learning_rate": 3.3470117922189123e-09, "logits/chosen": -1.950628638267517, "logits/rejected": -2.0394814014434814, "logps/chosen": -2.423513412475586, "logps/rejected": -2.692171096801758, "loss": 1.7308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.23513412475586, "rewards/margins": 2.6865768432617188, "rewards/rejected": -26.921710968017578, "step": 28685 }, { "epoch": 0.9670025953014931, "grad_norm": 113.19538879394531, "learning_rate": 3.3131213824234007e-09, "logits/chosen": -1.4651386737823486, "logits/rejected": -1.5366795063018799, "logps/chosen": -2.404541492462158, "logps/rejected": -2.637510299682617, "loss": 2.2841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.0454158782959, "rewards/margins": 2.3296852111816406, "rewards/rejected": -26.37510108947754, "step": 28690 }, { "epoch": 0.9671711213724763, "grad_norm": 50.14772033691406, "learning_rate": 3.2794028563878917e-09, "logits/chosen": -1.2284538745880127, "logits/rejected": -1.4384486675262451, "logps/chosen": -2.580862522125244, "logps/rejected": -2.5931923389434814, "loss": 3.6783, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.808624267578125, "rewards/margins": 0.1232973113656044, "rewards/rejected": -25.93192481994629, "step": 28695 }, { "epoch": 0.9673396474434595, "grad_norm": 31.067855834960938, "learning_rate": 3.245856225781052e-09, "logits/chosen": -2.0800235271453857, "logits/rejected": -2.189945936203003, "logps/chosen": -2.929348945617676, "logps/rejected": -3.384255886077881, "loss": 2.2898, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.293487548828125, "rewards/margins": 4.549074649810791, "rewards/rejected": -33.84256362915039, "step": 28700 }, { "epoch": 0.9675081735144427, "grad_norm": 37.61033248901367, "learning_rate": 3.2124815022120387e-09, "logits/chosen": -2.1509222984313965, "logits/rejected": -2.2263705730438232, "logps/chosen": -2.11362361907959, "logps/rejected": -2.359367609024048, "loss": 2.7692, "rewards/accuracies": 0.5, "rewards/chosen": -21.13623809814453, "rewards/margins": 2.457437038421631, "rewards/rejected": -23.593673706054688, "step": 28705 }, { "epoch": 0.9676766995854259, "grad_norm": 31.55821418762207, "learning_rate": 3.179278697230503e-09, "logits/chosen": -2.073500156402588, "logits/rejected": -2.0467491149902344, "logps/chosen": -2.6881165504455566, "logps/rejected": -2.769103527069092, "loss": 3.0882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.88116455078125, "rewards/margins": 0.8098711967468262, "rewards/rejected": -27.691036224365234, "step": 28710 }, { "epoch": 0.9678452256564091, "grad_norm": 29.740568161010742, "learning_rate": 3.1462478223266975e-09, "logits/chosen": -2.016167640686035, "logits/rejected": -2.078772783279419, "logps/chosen": -2.57477068901062, "logps/rejected": -2.529498338699341, "loss": 4.3984, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -25.747705459594727, "rewards/margins": -0.45272406935691833, "rewards/rejected": -25.29498291015625, "step": 28715 }, { "epoch": 0.9680137517273922, "grad_norm": 98.16922760009766, "learning_rate": 3.1133888889312565e-09, "logits/chosen": -2.303128242492676, "logits/rejected": -2.5696189403533936, "logps/chosen": -2.2590153217315674, "logps/rejected": -2.7877578735351562, "loss": 2.3962, "rewards/accuracies": 0.5, "rewards/chosen": -22.590152740478516, "rewards/margins": 5.287428855895996, "rewards/rejected": -27.877578735351562, "step": 28720 }, { "epoch": 0.9681822777983754, "grad_norm": 15.801095008850098, "learning_rate": 3.0807019084153618e-09, "logits/chosen": -1.862961769104004, "logits/rejected": -2.02888822555542, "logps/chosen": -1.8569800853729248, "logps/rejected": -2.0848309993743896, "loss": 3.0561, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.569801330566406, "rewards/margins": 2.2785086631774902, "rewards/rejected": -20.848312377929688, "step": 28725 }, { "epoch": 0.9683508038693586, "grad_norm": 54.87903594970703, "learning_rate": 3.0481868920906874e-09, "logits/chosen": -1.3197087049484253, "logits/rejected": -1.2992570400238037, "logps/chosen": -2.3507628440856934, "logps/rejected": -2.3117618560791016, "loss": 3.8642, "rewards/accuracies": 0.5, "rewards/chosen": -23.507625579833984, "rewards/margins": -0.3900091052055359, "rewards/rejected": -23.117618560791016, "step": 28730 }, { "epoch": 0.9685193299403417, "grad_norm": 109.8236083984375, "learning_rate": 3.0158438512093986e-09, "logits/chosen": -2.0474331378936768, "logits/rejected": -2.081048011779785, "logps/chosen": -3.522554397583008, "logps/rejected": -3.243844509124756, "loss": 6.6535, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -35.225547790527344, "rewards/margins": -2.7871007919311523, "rewards/rejected": -32.438446044921875, "step": 28735 }, { "epoch": 0.9686878560113249, "grad_norm": 56.84428405761719, "learning_rate": 2.9836727969642095e-09, "logits/chosen": -2.045356273651123, "logits/rejected": -2.2447898387908936, "logps/chosen": -3.090925693511963, "logps/rejected": -3.5039238929748535, "loss": 2.3891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.909255981445312, "rewards/margins": 4.12998104095459, "rewards/rejected": -35.03923797607422, "step": 28740 }, { "epoch": 0.9688563820823082, "grad_norm": 34.671470642089844, "learning_rate": 2.9516737404881587e-09, "logits/chosen": -1.768585443496704, "logits/rejected": -2.076643228530884, "logps/chosen": -2.617316484451294, "logps/rejected": -3.1697921752929688, "loss": 4.1929, "rewards/accuracies": 0.5, "rewards/chosen": -26.17316246032715, "rewards/margins": 5.5247578620910645, "rewards/rejected": -31.697921752929688, "step": 28745 }, { "epoch": 0.9690249081532913, "grad_norm": 25.47473907470703, "learning_rate": 2.9198466928549435e-09, "logits/chosen": -2.3101115226745605, "logits/rejected": -2.5386404991149902, "logps/chosen": -1.9861423969268799, "logps/rejected": -2.017178773880005, "loss": 3.0888, "rewards/accuracies": 0.5, "rewards/chosen": -19.861421585083008, "rewards/margins": 0.3103656768798828, "rewards/rejected": -20.171789169311523, "step": 28750 }, { "epoch": 0.9691934342242745, "grad_norm": 30.15863037109375, "learning_rate": 2.8881916650785875e-09, "logits/chosen": -2.482518434524536, "logits/rejected": -2.915527582168579, "logps/chosen": -3.200958251953125, "logps/rejected": -5.268540382385254, "loss": 1.2083, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -32.00957489013672, "rewards/margins": 20.67582130432129, "rewards/rejected": -52.685401916503906, "step": 28755 }, { "epoch": 0.9693619602952577, "grad_norm": 36.95034408569336, "learning_rate": 2.8567086681136608e-09, "logits/chosen": -1.4024853706359863, "logits/rejected": -1.4760109186172485, "logps/chosen": -2.4063594341278076, "logps/rejected": -2.446986436843872, "loss": 3.5985, "rewards/accuracies": 0.5, "rewards/chosen": -24.063594818115234, "rewards/margins": 0.40627098083496094, "rewards/rejected": -24.469867706298828, "step": 28760 }, { "epoch": 0.9695304863662408, "grad_norm": 5.878857612609863, "learning_rate": 2.8253977128551708e-09, "logits/chosen": -1.9031862020492554, "logits/rejected": -1.948831558227539, "logps/chosen": -2.9409420490264893, "logps/rejected": -3.4585976600646973, "loss": 2.0746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.409423828125, "rewards/margins": 5.176552772521973, "rewards/rejected": -34.585975646972656, "step": 28765 }, { "epoch": 0.969699012437224, "grad_norm": 45.4705924987793, "learning_rate": 2.794258810138728e-09, "logits/chosen": -1.9813125133514404, "logits/rejected": -2.1614718437194824, "logps/chosen": -2.020968198776245, "logps/rejected": -2.2112536430358887, "loss": 2.5303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -20.20968246459961, "rewards/margins": 1.9028533697128296, "rewards/rejected": -22.11253547668457, "step": 28770 }, { "epoch": 0.9698675385082072, "grad_norm": 58.42421340942383, "learning_rate": 2.7632919707401e-09, "logits/chosen": -1.7921464443206787, "logits/rejected": -1.8928537368774414, "logps/chosen": -3.405698299407959, "logps/rejected": -3.547631025314331, "loss": 3.6025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.056983947753906, "rewards/margins": 1.4193235635757446, "rewards/rejected": -35.47631072998047, "step": 28775 }, { "epoch": 0.9700360645791904, "grad_norm": 31.0329647064209, "learning_rate": 2.7324972053758275e-09, "logits/chosen": -2.084688663482666, "logits/rejected": -2.337444305419922, "logps/chosen": -2.1967928409576416, "logps/rejected": -2.5291614532470703, "loss": 2.0453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.967926025390625, "rewards/margins": 3.3236873149871826, "rewards/rejected": -25.291614532470703, "step": 28780 }, { "epoch": 0.9702045906501736, "grad_norm": 17.989961624145508, "learning_rate": 2.7018745247027184e-09, "logits/chosen": -1.533870816230774, "logits/rejected": -1.8206316232681274, "logps/chosen": -1.893083930015564, "logps/rejected": -2.1794352531433105, "loss": 1.4881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.93084144592285, "rewards/margins": 2.863513946533203, "rewards/rejected": -21.794353485107422, "step": 28785 }, { "epoch": 0.9703731167211568, "grad_norm": 43.55194854736328, "learning_rate": 2.671423939318018e-09, "logits/chosen": -1.1976906061172485, "logits/rejected": -1.5607186555862427, "logps/chosen": -2.5054843425750732, "logps/rejected": -2.798884391784668, "loss": 2.1093, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.05484390258789, "rewards/margins": 2.933999538421631, "rewards/rejected": -27.988842010498047, "step": 28790 }, { "epoch": 0.97054164279214, "grad_norm": 86.71769714355469, "learning_rate": 2.641145459759575e-09, "logits/chosen": -1.8374137878417969, "logits/rejected": -2.4237678050994873, "logps/chosen": -2.241248369216919, "logps/rejected": -2.8495514392852783, "loss": 2.2861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.4124813079834, "rewards/margins": 6.083029747009277, "rewards/rejected": -28.495513916015625, "step": 28795 }, { "epoch": 0.9707101688631231, "grad_norm": 11.737354278564453, "learning_rate": 2.611039096505563e-09, "logits/chosen": -1.5608351230621338, "logits/rejected": -2.000445604324341, "logps/chosen": -3.0214154720306396, "logps/rejected": -3.182666540145874, "loss": 4.4413, "rewards/accuracies": 0.5, "rewards/chosen": -30.214153289794922, "rewards/margins": 1.612510323524475, "rewards/rejected": -31.8266658782959, "step": 28800 }, { "epoch": 0.9707101688631231, "eval_logits/chosen": -2.3131000995635986, "eval_logits/rejected": -2.4913644790649414, "eval_logps/chosen": -2.2895238399505615, "eval_logps/rejected": -2.44382905960083, "eval_loss": 3.089254140853882, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.89523696899414, "eval_rewards/margins": 1.5430537462234497, "eval_rewards/rejected": -24.438291549682617, "eval_runtime": 12.9148, "eval_samples_per_second": 7.743, "eval_steps_per_second": 1.936, "step": 28800 }, { "epoch": 0.9708786949341063, "grad_norm": 4.901158332824707, "learning_rate": 2.5811048599744813e-09, "logits/chosen": -1.9958289861679077, "logits/rejected": -2.1583971977233887, "logps/chosen": -2.2538743019104004, "logps/rejected": -2.5135021209716797, "loss": 3.2677, "rewards/accuracies": 0.5, "rewards/chosen": -22.53874397277832, "rewards/margins": 2.5962772369384766, "rewards/rejected": -25.135021209716797, "step": 28805 }, { "epoch": 0.9710472210050894, "grad_norm": 38.69203567504883, "learning_rate": 2.5513427605255433e-09, "logits/chosen": -1.3330328464508057, "logits/rejected": -2.0836803913116455, "logps/chosen": -2.295102596282959, "logps/rejected": -3.4632301330566406, "loss": 1.5787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.95102882385254, "rewards/margins": 11.681272506713867, "rewards/rejected": -34.632301330566406, "step": 28810 }, { "epoch": 0.9712157470760727, "grad_norm": 64.19732666015625, "learning_rate": 2.5217528084581773e-09, "logits/chosen": -2.233541965484619, "logits/rejected": -2.275514602661133, "logps/chosen": -2.8877837657928467, "logps/rejected": -2.8634886741638184, "loss": 3.5775, "rewards/accuracies": 0.5, "rewards/chosen": -28.877838134765625, "rewards/margins": -0.24294976890087128, "rewards/rejected": -28.6348876953125, "step": 28815 }, { "epoch": 0.9713842731470559, "grad_norm": 28.017070770263672, "learning_rate": 2.4923350140123033e-09, "logits/chosen": -1.9607566595077515, "logits/rejected": -1.9388478994369507, "logps/chosen": -2.181363344192505, "logps/rejected": -2.456298828125, "loss": 3.5988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.81363296508789, "rewards/margins": 2.7493553161621094, "rewards/rejected": -24.562986373901367, "step": 28820 }, { "epoch": 0.971552799218039, "grad_norm": 40.535179138183594, "learning_rate": 2.4630893873682777e-09, "logits/chosen": -1.5083204507827759, "logits/rejected": -1.7215535640716553, "logps/chosen": -3.0454647541046143, "logps/rejected": -3.433953046798706, "loss": 1.6007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -30.45464515686035, "rewards/margins": 3.8848869800567627, "rewards/rejected": -34.33953094482422, "step": 28825 }, { "epoch": 0.9717213252890222, "grad_norm": 45.41533660888672, "learning_rate": 2.4340159386468383e-09, "logits/chosen": -1.9942636489868164, "logits/rejected": -2.528942108154297, "logps/chosen": -2.4124343395233154, "logps/rejected": -2.7206547260284424, "loss": 1.6811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.124343872070312, "rewards/margins": 3.0822019577026367, "rewards/rejected": -27.206546783447266, "step": 28830 }, { "epoch": 0.9718898513600054, "grad_norm": 22.677080154418945, "learning_rate": 2.405114677909159e-09, "logits/chosen": -1.1437983512878418, "logits/rejected": -1.6425338983535767, "logps/chosen": -2.4738216400146484, "logps/rejected": -3.083200693130493, "loss": 2.4923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.738216400146484, "rewards/margins": 6.0937910079956055, "rewards/rejected": -30.832006454467773, "step": 28835 }, { "epoch": 0.9720583774309886, "grad_norm": 18.422693252563477, "learning_rate": 2.3763856151567953e-09, "logits/chosen": -1.6711835861206055, "logits/rejected": -2.03035831451416, "logps/chosen": -1.7528514862060547, "logps/rejected": -2.049534797668457, "loss": 2.1549, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -17.528514862060547, "rewards/margins": 2.966834306716919, "rewards/rejected": -20.49534797668457, "step": 28840 }, { "epoch": 0.9722269035019717, "grad_norm": 38.746925354003906, "learning_rate": 2.347828760331849e-09, "logits/chosen": -2.295783519744873, "logits/rejected": -2.0634446144104004, "logps/chosen": -2.789072036743164, "logps/rejected": -2.251504898071289, "loss": 8.5075, "rewards/accuracies": 0.5, "rewards/chosen": -27.890722274780273, "rewards/margins": -5.375671863555908, "rewards/rejected": -22.51504898071289, "step": 28845 }, { "epoch": 0.9723954295729549, "grad_norm": 29.222929000854492, "learning_rate": 2.319444123316583e-09, "logits/chosen": -1.7320674657821655, "logits/rejected": -2.0177040100097656, "logps/chosen": -1.969813585281372, "logps/rejected": -2.4684014320373535, "loss": 1.4651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.698135375976562, "rewards/margins": 4.985878944396973, "rewards/rejected": -24.684011459350586, "step": 28850 }, { "epoch": 0.9725639556439382, "grad_norm": 23.3667049407959, "learning_rate": 2.2912317139339164e-09, "logits/chosen": -1.6429636478424072, "logits/rejected": -1.873246192932129, "logps/chosen": -3.437391757965088, "logps/rejected": -3.674466609954834, "loss": 2.725, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -34.3739128112793, "rewards/margins": 2.3707499504089355, "rewards/rejected": -36.744667053222656, "step": 28855 }, { "epoch": 0.9727324817149213, "grad_norm": 167.22389221191406, "learning_rate": 2.2631915419470406e-09, "logits/chosen": -1.5777119398117065, "logits/rejected": -1.7945228815078735, "logps/chosen": -2.8456039428710938, "logps/rejected": -2.8768460750579834, "loss": 3.7524, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -28.456035614013672, "rewards/margins": 0.31242045760154724, "rewards/rejected": -28.76845932006836, "step": 28860 }, { "epoch": 0.9729010077859045, "grad_norm": 23.845905303955078, "learning_rate": 2.23532361705947e-09, "logits/chosen": -1.9737046957015991, "logits/rejected": -1.9449818134307861, "logps/chosen": -2.9338698387145996, "logps/rejected": -3.416442394256592, "loss": 1.9032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.338695526123047, "rewards/margins": 4.82572603225708, "rewards/rejected": -34.16442108154297, "step": 28865 }, { "epoch": 0.9730695338568877, "grad_norm": 305.1143798828125, "learning_rate": 2.207627948915269e-09, "logits/chosen": -1.7003231048583984, "logits/rejected": -1.7136930227279663, "logps/chosen": -3.029107093811035, "logps/rejected": -3.0569915771484375, "loss": 3.7145, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -30.29107093811035, "rewards/margins": 0.27884674072265625, "rewards/rejected": -30.569915771484375, "step": 28870 }, { "epoch": 0.9732380599278708, "grad_norm": 28.607589721679688, "learning_rate": 2.1801045470987713e-09, "logits/chosen": -1.466975450515747, "logits/rejected": -1.8123632669448853, "logps/chosen": -2.1416099071502686, "logps/rejected": -2.387000322341919, "loss": 2.759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.41609764099121, "rewards/margins": 2.453904628753662, "rewards/rejected": -23.870004653930664, "step": 28875 }, { "epoch": 0.973406585998854, "grad_norm": 76.04690551757812, "learning_rate": 2.1527534211348008e-09, "logits/chosen": -1.8390899896621704, "logits/rejected": -2.3541603088378906, "logps/chosen": -2.2660720348358154, "logps/rejected": -2.8816139698028564, "loss": 2.0392, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.660724639892578, "rewards/margins": 6.155418872833252, "rewards/rejected": -28.81614112854004, "step": 28880 }, { "epoch": 0.9735751120698372, "grad_norm": 23.925153732299805, "learning_rate": 2.1255745804885096e-09, "logits/chosen": -2.8247294425964355, "logits/rejected": -2.893087148666382, "logps/chosen": -3.386502742767334, "logps/rejected": -3.850271224975586, "loss": 1.8654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -33.865028381347656, "rewards/margins": 4.637683868408203, "rewards/rejected": -38.502708435058594, "step": 28885 }, { "epoch": 0.9737436381408204, "grad_norm": 37.15006637573242, "learning_rate": 2.098568034565318e-09, "logits/chosen": -1.4200313091278076, "logits/rejected": -1.75543212890625, "logps/chosen": -1.8023878335952759, "logps/rejected": -2.0486152172088623, "loss": 2.5868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.02387809753418, "rewards/margins": 2.462275743484497, "rewards/rejected": -20.48615264892578, "step": 28890 }, { "epoch": 0.9739121642118036, "grad_norm": 7.579281806945801, "learning_rate": 2.07173379271125e-09, "logits/chosen": -0.8189151883125305, "logits/rejected": -1.8859783411026, "logps/chosen": -2.5505759716033936, "logps/rejected": -3.3479816913604736, "loss": 1.2665, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.505762100219727, "rewards/margins": 7.974055290222168, "rewards/rejected": -33.47981643676758, "step": 28895 }, { "epoch": 0.9740806902827868, "grad_norm": 40.954010009765625, "learning_rate": 2.0450718642124887e-09, "logits/chosen": -1.6798347234725952, "logits/rejected": -1.7447267770767212, "logps/chosen": -2.070929527282715, "logps/rejected": -2.3259315490722656, "loss": 2.3437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.709293365478516, "rewards/margins": 2.5500221252441406, "rewards/rejected": -23.25931739807129, "step": 28900 }, { "epoch": 0.9742492163537699, "grad_norm": 29.78924560546875, "learning_rate": 2.0185822582957648e-09, "logits/chosen": -2.079418659210205, "logits/rejected": -2.615996837615967, "logps/chosen": -2.4528346061706543, "logps/rejected": -3.0139706134796143, "loss": 2.5714, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.528343200683594, "rewards/margins": 5.611359119415283, "rewards/rejected": -30.13970375061035, "step": 28905 }, { "epoch": 0.9744177424247531, "grad_norm": 101.4603500366211, "learning_rate": 1.9922649841279673e-09, "logits/chosen": -2.2764244079589844, "logits/rejected": -2.7363200187683105, "logps/chosen": -3.0192959308624268, "logps/rejected": -3.1520752906799316, "loss": 5.4982, "rewards/accuracies": 0.5, "rewards/chosen": -30.192956924438477, "rewards/margins": 1.3277934789657593, "rewards/rejected": -31.520751953125, "step": 28910 }, { "epoch": 0.9745862684957363, "grad_norm": 57.92829513549805, "learning_rate": 1.966120050816589e-09, "logits/chosen": -1.9717071056365967, "logits/rejected": -2.3834729194641113, "logps/chosen": -2.6838412284851074, "logps/rejected": -3.0372207164764404, "loss": 2.1416, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.83841323852539, "rewards/margins": 3.533792495727539, "rewards/rejected": -30.372207641601562, "step": 28915 }, { "epoch": 0.9747547945667194, "grad_norm": 0.024026213213801384, "learning_rate": 1.940147467409281e-09, "logits/chosen": -1.569778561592102, "logits/rejected": -1.7317079305648804, "logps/chosen": -3.2188212871551514, "logps/rejected": -4.365044593811035, "loss": 1.1098, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -32.18821334838867, "rewards/margins": 11.462237358093262, "rewards/rejected": -43.65044403076172, "step": 28920 }, { "epoch": 0.9749233206377027, "grad_norm": 30.91973876953125, "learning_rate": 1.9143472428941877e-09, "logits/chosen": -1.6828441619873047, "logits/rejected": -2.0210423469543457, "logps/chosen": -2.875429391860962, "logps/rejected": -3.3311076164245605, "loss": 3.7751, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.754297256469727, "rewards/margins": 4.55678129196167, "rewards/rejected": -33.31107711791992, "step": 28925 }, { "epoch": 0.9750918467086859, "grad_norm": 39.773719787597656, "learning_rate": 1.8887193861996664e-09, "logits/chosen": -1.3712999820709229, "logits/rejected": -1.5209099054336548, "logps/chosen": -2.148160457611084, "logps/rejected": -2.1714203357696533, "loss": 3.2071, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -21.48160743713379, "rewards/margins": 0.23259706795215607, "rewards/rejected": -21.714202880859375, "step": 28930 }, { "epoch": 0.975260372779669, "grad_norm": 38.75511169433594, "learning_rate": 1.8632639061946233e-09, "logits/chosen": -1.8781366348266602, "logits/rejected": -1.9841234683990479, "logps/chosen": -2.3052010536193848, "logps/rejected": -2.2915608882904053, "loss": 3.5765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.052011489868164, "rewards/margins": -0.13640041649341583, "rewards/rejected": -22.91560935974121, "step": 28935 }, { "epoch": 0.9754288988506522, "grad_norm": 27.88527488708496, "learning_rate": 1.8379808116881224e-09, "logits/chosen": -1.8556627035140991, "logits/rejected": -2.137606620788574, "logps/chosen": -2.3696718215942383, "logps/rejected": -2.8112268447875977, "loss": 2.1553, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.69671630859375, "rewards/margins": 4.415551662445068, "rewards/rejected": -28.112268447875977, "step": 28940 }, { "epoch": 0.9755974249216354, "grad_norm": 48.13548278808594, "learning_rate": 1.812870111429665e-09, "logits/chosen": -2.6025211811065674, "logits/rejected": -3.094240665435791, "logps/chosen": -3.035796642303467, "logps/rejected": -4.208906650543213, "loss": 1.0567, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -30.357967376708984, "rewards/margins": 11.73109245300293, "rewards/rejected": -42.08905792236328, "step": 28945 }, { "epoch": 0.9757659509926185, "grad_norm": 62.21755599975586, "learning_rate": 1.7879318141090226e-09, "logits/chosen": -2.0555005073547363, "logits/rejected": -2.551521062850952, "logps/chosen": -2.7381367683410645, "logps/rejected": -2.8648579120635986, "loss": 3.8195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.38136863708496, "rewards/margins": 1.2672100067138672, "rewards/rejected": -28.648578643798828, "step": 28950 }, { "epoch": 0.9759344770636017, "grad_norm": 114.9220199584961, "learning_rate": 1.7631659283564582e-09, "logits/chosen": -1.478084683418274, "logits/rejected": -1.5832620859146118, "logps/chosen": -2.6809325218200684, "logps/rejected": -2.6643710136413574, "loss": 3.6116, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.809326171875, "rewards/margins": -0.1656126081943512, "rewards/rejected": -26.643712997436523, "step": 28955 }, { "epoch": 0.9761030031345849, "grad_norm": 56.121726989746094, "learning_rate": 1.7385724627423936e-09, "logits/chosen": -1.7954374551773071, "logits/rejected": -2.2343225479125977, "logps/chosen": -2.2366280555725098, "logps/rejected": -2.4855599403381348, "loss": 2.2053, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.366281509399414, "rewards/margins": 2.48931622505188, "rewards/rejected": -24.8555965423584, "step": 28960 }, { "epoch": 0.9762715292055681, "grad_norm": 20.632129669189453, "learning_rate": 1.7141514257777435e-09, "logits/chosen": -1.7457069158554077, "logits/rejected": -1.9348461627960205, "logps/chosen": -2.1216189861297607, "logps/rejected": -2.219635486602783, "loss": 2.7346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.216190338134766, "rewards/margins": 0.9801637530326843, "rewards/rejected": -22.19635581970215, "step": 28965 }, { "epoch": 0.9764400552765513, "grad_norm": 20.116281509399414, "learning_rate": 1.689902825913525e-09, "logits/chosen": -1.2659189701080322, "logits/rejected": -2.0253186225891113, "logps/chosen": -2.4162893295288086, "logps/rejected": -3.086562395095825, "loss": 2.2569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.162893295288086, "rewards/margins": 6.702728271484375, "rewards/rejected": -30.865625381469727, "step": 28970 }, { "epoch": 0.9766085813475345, "grad_norm": 45.286468505859375, "learning_rate": 1.6658266715413593e-09, "logits/chosen": -1.519742727279663, "logits/rejected": -1.9302390813827515, "logps/chosen": -3.0658726692199707, "logps/rejected": -3.5045742988586426, "loss": 4.8525, "rewards/accuracies": 0.5, "rewards/chosen": -30.65872573852539, "rewards/margins": 4.387016296386719, "rewards/rejected": -35.04574203491211, "step": 28975 }, { "epoch": 0.9767771074185176, "grad_norm": 46.37701416015625, "learning_rate": 1.6419229709929704e-09, "logits/chosen": -2.17651104927063, "logits/rejected": -2.3157334327697754, "logps/chosen": -2.608999729156494, "logps/rejected": -2.539881467819214, "loss": 4.01, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.089996337890625, "rewards/margins": -0.6911813020706177, "rewards/rejected": -25.398815155029297, "step": 28980 }, { "epoch": 0.9769456334895008, "grad_norm": 22.79644012451172, "learning_rate": 1.6181917325405192e-09, "logits/chosen": -1.9644426107406616, "logits/rejected": -2.2710633277893066, "logps/chosen": -2.450073480606079, "logps/rejected": -2.7313647270202637, "loss": 2.3456, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.500734329223633, "rewards/margins": 2.81291127204895, "rewards/rejected": -27.313644409179688, "step": 28985 }, { "epoch": 0.977114159560484, "grad_norm": 2.411163568496704, "learning_rate": 1.5946329643964363e-09, "logits/chosen": -1.7838443517684937, "logits/rejected": -1.734575867652893, "logps/chosen": -2.378779172897339, "logps/rejected": -2.4997496604919434, "loss": 2.8712, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.78779411315918, "rewards/margins": 1.2097065448760986, "rewards/rejected": -24.997501373291016, "step": 28990 }, { "epoch": 0.9772826856314671, "grad_norm": 46.94289016723633, "learning_rate": 1.5712466747135334e-09, "logits/chosen": -1.7236160039901733, "logits/rejected": -2.0851263999938965, "logps/chosen": -2.2219066619873047, "logps/rejected": -2.5512471199035645, "loss": 2.763, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -22.219064712524414, "rewards/margins": 3.293402910232544, "rewards/rejected": -25.512470245361328, "step": 28995 }, { "epoch": 0.9774512117024504, "grad_norm": 49.3720817565918, "learning_rate": 1.5480328715848367e-09, "logits/chosen": -1.7591028213500977, "logits/rejected": -1.8149206638336182, "logps/chosen": -2.1563258171081543, "logps/rejected": -3.1621527671813965, "loss": 1.3768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.563257217407227, "rewards/margins": 10.058270454406738, "rewards/rejected": -31.62152671813965, "step": 29000 }, { "epoch": 0.9776197377734336, "grad_norm": 0.0798095315694809, "learning_rate": 1.5249915630437538e-09, "logits/chosen": -1.4440138339996338, "logits/rejected": -2.5432896614074707, "logps/chosen": -2.615081787109375, "logps/rejected": -4.234910011291504, "loss": 1.5559, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.15081787109375, "rewards/margins": 16.19828224182129, "rewards/rejected": -42.34910202026367, "step": 29005 }, { "epoch": 0.9777882638444167, "grad_norm": 36.522396087646484, "learning_rate": 1.5021227570639062e-09, "logits/chosen": -1.3796061277389526, "logits/rejected": -1.505127191543579, "logps/chosen": -2.3741424083709717, "logps/rejected": -2.817383289337158, "loss": 1.7147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -23.741424560546875, "rewards/margins": 4.432408332824707, "rewards/rejected": -28.1738338470459, "step": 29010 }, { "epoch": 0.9779567899153999, "grad_norm": 20.012441635131836, "learning_rate": 1.4794264615594076e-09, "logits/chosen": -1.6143648624420166, "logits/rejected": -1.552825689315796, "logps/chosen": -2.325551748275757, "logps/rejected": -2.6102821826934814, "loss": 2.0325, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.25551986694336, "rewards/margins": 2.8473024368286133, "rewards/rejected": -26.10282325744629, "step": 29015 }, { "epoch": 0.9781253159863831, "grad_norm": 29.94846534729004, "learning_rate": 1.4569026843844201e-09, "logits/chosen": -1.814994215965271, "logits/rejected": -1.8052419424057007, "logps/chosen": -1.8165454864501953, "logps/rejected": -1.75994074344635, "loss": 3.7696, "rewards/accuracies": 0.5, "rewards/chosen": -18.165454864501953, "rewards/margins": -0.5660479664802551, "rewards/rejected": -17.599407196044922, "step": 29020 }, { "epoch": 0.9782938420573662, "grad_norm": 42.956748962402344, "learning_rate": 1.4345514333336528e-09, "logits/chosen": -2.1387548446655273, "logits/rejected": -2.604097366333008, "logps/chosen": -2.7845778465270996, "logps/rejected": -3.264005661010742, "loss": 3.4512, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -27.845779418945312, "rewards/margins": 4.794276237487793, "rewards/rejected": -32.640052795410156, "step": 29025 }, { "epoch": 0.9784623681283494, "grad_norm": 24.933717727661133, "learning_rate": 1.4123727161419186e-09, "logits/chosen": -2.3789522647857666, "logits/rejected": -2.5313992500305176, "logps/chosen": -3.1476669311523438, "logps/rejected": -4.168401718139648, "loss": 1.8028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -31.476673126220703, "rewards/margins": 10.20734691619873, "rewards/rejected": -41.68401336669922, "step": 29030 }, { "epoch": 0.9786308941993327, "grad_norm": 172.79562377929688, "learning_rate": 1.3903665404844112e-09, "logits/chosen": -1.3795078992843628, "logits/rejected": -1.259374737739563, "logps/chosen": -2.9650959968566895, "logps/rejected": -3.208503007888794, "loss": 2.6839, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.650955200195312, "rewards/margins": 2.4340739250183105, "rewards/rejected": -32.08502960205078, "step": 29035 }, { "epoch": 0.9787994202703159, "grad_norm": 87.01050567626953, "learning_rate": 1.3685329139765945e-09, "logits/chosen": -1.9595003128051758, "logits/rejected": -1.6780097484588623, "logps/chosen": -2.65761137008667, "logps/rejected": -2.8279170989990234, "loss": 3.2983, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.576114654541016, "rewards/margins": 1.703057885169983, "rewards/rejected": -28.279170989990234, "step": 29040 }, { "epoch": 0.978967946341299, "grad_norm": 44.545223236083984, "learning_rate": 1.3468718441743132e-09, "logits/chosen": -1.9111160039901733, "logits/rejected": -2.1547746658325195, "logps/chosen": -2.1006617546081543, "logps/rejected": -2.54248046875, "loss": 1.6358, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.006620407104492, "rewards/margins": 4.418184757232666, "rewards/rejected": -25.424802780151367, "step": 29045 }, { "epoch": 0.9791364724122822, "grad_norm": 23.81278419494629, "learning_rate": 1.3253833385734603e-09, "logits/chosen": -1.5132275819778442, "logits/rejected": -1.3265550136566162, "logps/chosen": -3.042940616607666, "logps/rejected": -3.4171204566955566, "loss": 1.3111, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -30.429407119750977, "rewards/margins": 3.7417988777160645, "rewards/rejected": -34.17120361328125, "step": 29050 }, { "epoch": 0.9793049984832654, "grad_norm": 27.37566375732422, "learning_rate": 1.304067404610476e-09, "logits/chosen": -1.7547746896743774, "logits/rejected": -2.023449659347534, "logps/chosen": -2.1776890754699707, "logps/rejected": -2.5270471572875977, "loss": 3.4206, "rewards/accuracies": 0.5, "rewards/chosen": -21.776891708374023, "rewards/margins": 3.4935803413391113, "rewards/rejected": -25.270471572875977, "step": 29055 }, { "epoch": 0.9794735245542485, "grad_norm": 35.15890121459961, "learning_rate": 1.2829240496619042e-09, "logits/chosen": -1.609505295753479, "logits/rejected": -1.6805378198623657, "logps/chosen": -2.2529454231262207, "logps/rejected": -2.4146313667297363, "loss": 2.7068, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.52945327758789, "rewards/margins": 1.6168590784072876, "rewards/rejected": -24.146312713623047, "step": 29060 }, { "epoch": 0.9796420506252317, "grad_norm": 48.223201751708984, "learning_rate": 1.2619532810446699e-09, "logits/chosen": -1.2492077350616455, "logits/rejected": -1.2873347997665405, "logps/chosen": -2.0145344734191895, "logps/rejected": -2.223071813583374, "loss": 2.6329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.14534568786621, "rewards/margins": 2.0853724479675293, "rewards/rejected": -22.2307186126709, "step": 29065 }, { "epoch": 0.9798105766962149, "grad_norm": 43.44717025756836, "learning_rate": 1.241155106015912e-09, "logits/chosen": -1.802374243736267, "logits/rejected": -1.7998616695404053, "logps/chosen": -2.245756149291992, "logps/rejected": -2.369556427001953, "loss": 3.1205, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.457561492919922, "rewards/margins": 1.2380021810531616, "rewards/rejected": -23.695566177368164, "step": 29070 }, { "epoch": 0.9799791027671981, "grad_norm": 112.0615463256836, "learning_rate": 1.2205295317730402e-09, "logits/chosen": -1.9436088800430298, "logits/rejected": -2.0945136547088623, "logps/chosen": -2.564476490020752, "logps/rejected": -3.054882049560547, "loss": 3.0998, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.644763946533203, "rewards/margins": 4.904057502746582, "rewards/rejected": -30.5488224029541, "step": 29075 }, { "epoch": 0.9801476288381813, "grad_norm": 42.39691162109375, "learning_rate": 1.2000765654537892e-09, "logits/chosen": -2.0819265842437744, "logits/rejected": -1.9295778274536133, "logps/chosen": -2.2024292945861816, "logps/rejected": -2.439565896987915, "loss": 2.701, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.0242919921875, "rewards/margins": 2.371366500854492, "rewards/rejected": -24.39565658569336, "step": 29080 }, { "epoch": 0.9803161549091645, "grad_norm": 54.22341537475586, "learning_rate": 1.1797962141360529e-09, "logits/chosen": -1.668026328086853, "logits/rejected": -1.8890268802642822, "logps/chosen": -2.1706669330596924, "logps/rejected": -2.2109477519989014, "loss": 2.8692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.7066707611084, "rewards/margins": 0.4028078019618988, "rewards/rejected": -22.109477996826172, "step": 29085 }, { "epoch": 0.9804846809801476, "grad_norm": 0.008076355792582035, "learning_rate": 1.1596884848381616e-09, "logits/chosen": -1.7775027751922607, "logits/rejected": -1.9149770736694336, "logps/chosen": -2.447822093963623, "logps/rejected": -2.7496438026428223, "loss": 2.0461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.47822380065918, "rewards/margins": 3.018213987350464, "rewards/rejected": -27.496435165405273, "step": 29090 }, { "epoch": 0.9806532070511308, "grad_norm": 66.91258239746094, "learning_rate": 1.1397533845185492e-09, "logits/chosen": -1.7012237310409546, "logits/rejected": -1.7801170349121094, "logps/chosen": -2.3985514640808105, "logps/rejected": -2.5203640460968018, "loss": 2.4471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.985515594482422, "rewards/margins": 1.218124508857727, "rewards/rejected": -25.203641891479492, "step": 29095 }, { "epoch": 0.980821733122114, "grad_norm": 67.51605987548828, "learning_rate": 1.1199909200760305e-09, "logits/chosen": -2.3086938858032227, "logits/rejected": -2.628457546234131, "logps/chosen": -1.8733421564102173, "logps/rejected": -2.0515007972717285, "loss": 2.1875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.733423233032227, "rewards/margins": 1.7815853357315063, "rewards/rejected": -20.51500701904297, "step": 29100 }, { "epoch": 0.9809902591930971, "grad_norm": 16.39108657836914, "learning_rate": 1.1004010983495238e-09, "logits/chosen": -1.6252338886260986, "logits/rejected": -1.9250046014785767, "logps/chosen": -2.4929986000061035, "logps/rejected": -2.7562291622161865, "loss": 1.9, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.92998695373535, "rewards/margins": 2.6323070526123047, "rewards/rejected": -27.562292098999023, "step": 29105 }, { "epoch": 0.9811587852640804, "grad_norm": 37.90568161010742, "learning_rate": 1.0809839261183285e-09, "logits/chosen": -1.688126564025879, "logits/rejected": -1.9933946132659912, "logps/chosen": -2.792940616607666, "logps/rejected": -3.1035900115966797, "loss": 3.0784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -27.92940330505371, "rewards/margins": 3.106494903564453, "rewards/rejected": -31.035900115966797, "step": 29110 }, { "epoch": 0.9813273113350636, "grad_norm": 28.15143585205078, "learning_rate": 1.0617394101020139e-09, "logits/chosen": -2.4952352046966553, "logits/rejected": -2.438091278076172, "logps/chosen": -2.3946175575256348, "logps/rejected": -2.601440668106079, "loss": 2.2339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.9461727142334, "rewards/margins": 2.0682334899902344, "rewards/rejected": -26.014408111572266, "step": 29115 }, { "epoch": 0.9814958374060467, "grad_norm": 29.762697219848633, "learning_rate": 1.0426675569602529e-09, "logits/chosen": -1.716699242591858, "logits/rejected": -2.109680652618408, "logps/chosen": -2.18900728225708, "logps/rejected": -2.372774600982666, "loss": 2.4536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.890071868896484, "rewards/margins": 1.8376758098602295, "rewards/rejected": -23.727746963500977, "step": 29120 }, { "epoch": 0.9816643634770299, "grad_norm": 117.76435852050781, "learning_rate": 1.0237683732931545e-09, "logits/chosen": -2.146617889404297, "logits/rejected": -2.143404960632324, "logps/chosen": -2.951927423477173, "logps/rejected": -3.0596113204956055, "loss": 3.7283, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.519275665283203, "rewards/margins": 1.0768405199050903, "rewards/rejected": -30.596111297607422, "step": 29125 }, { "epoch": 0.9818328895480131, "grad_norm": 72.63237762451172, "learning_rate": 1.0050418656408766e-09, "logits/chosen": -1.8240457773208618, "logits/rejected": -2.367785930633545, "logps/chosen": -3.1584181785583496, "logps/rejected": -3.3102378845214844, "loss": 3.8649, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -31.584178924560547, "rewards/margins": 1.5181951522827148, "rewards/rejected": -33.102378845214844, "step": 29130 }, { "epoch": 0.9820014156189962, "grad_norm": 37.19519805908203, "learning_rate": 9.86488040484068e-10, "logits/chosen": -1.5405725240707397, "logits/rejected": -1.7350629568099976, "logps/chosen": -2.063495635986328, "logps/rejected": -2.0626368522644043, "loss": 3.511, "rewards/accuracies": 0.5, "rewards/chosen": -20.63495445251465, "rewards/margins": -0.008587169460952282, "rewards/rejected": -20.62636947631836, "step": 29135 }, { "epoch": 0.9821699416899794, "grad_norm": 40.8488883972168, "learning_rate": 9.68106904243371e-10, "logits/chosen": -1.2401533126831055, "logits/rejected": -1.1598286628723145, "logps/chosen": -2.5394511222839355, "logps/rejected": -2.641644239425659, "loss": 3.0189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.39451026916504, "rewards/margins": 1.0219334363937378, "rewards/rejected": -26.41644287109375, "step": 29140 }, { "epoch": 0.9823384677609627, "grad_norm": 27.47707176208496, "learning_rate": 9.49898463279808e-10, "logits/chosen": -1.5628474950790405, "logits/rejected": -1.6683374643325806, "logps/chosen": -2.1274776458740234, "logps/rejected": -2.3868026733398438, "loss": 1.7491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.274776458740234, "rewards/margins": 2.5932505130767822, "rewards/rejected": -23.868024826049805, "step": 29145 }, { "epoch": 0.9825069938319458, "grad_norm": 26.376081466674805, "learning_rate": 9.318627238946164e-10, "logits/chosen": -1.6443513631820679, "logits/rejected": -1.7434518337249756, "logps/chosen": -3.311771869659424, "logps/rejected": -3.490668535232544, "loss": 2.9818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -33.11771774291992, "rewards/margins": 1.788968801498413, "rewards/rejected": -34.90668487548828, "step": 29150 }, { "epoch": 0.982675519902929, "grad_norm": 0.5067969560623169, "learning_rate": 9.139996923291927e-10, "logits/chosen": -1.6530559062957764, "logits/rejected": -2.6780498027801514, "logps/chosen": -2.3142178058624268, "logps/rejected": -3.273839235305786, "loss": 2.5638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.14217758178711, "rewards/margins": 9.59621524810791, "rewards/rejected": -32.7383918762207, "step": 29155 }, { "epoch": 0.9828440459739122, "grad_norm": 1.6404162645339966, "learning_rate": 8.963093747653139e-10, "logits/chosen": -1.8647918701171875, "logits/rejected": -2.108612537384033, "logps/chosen": -2.119544267654419, "logps/rejected": -2.551068067550659, "loss": 1.0067, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.195444107055664, "rewards/margins": 4.315237998962402, "rewards/rejected": -25.510677337646484, "step": 29160 }, { "epoch": 0.9830125720448953, "grad_norm": 24.310962677001953, "learning_rate": 8.78791777324861e-10, "logits/chosen": -2.083103895187378, "logits/rejected": -2.228764772415161, "logps/chosen": -3.090188503265381, "logps/rejected": -3.515840530395508, "loss": 1.6981, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -30.90188980102539, "rewards/margins": 4.256514549255371, "rewards/rejected": -35.15840530395508, "step": 29165 }, { "epoch": 0.9831810981158785, "grad_norm": 43.69054412841797, "learning_rate": 8.614469060699292e-10, "logits/chosen": -2.039569854736328, "logits/rejected": -2.212449550628662, "logps/chosen": -2.5203347206115723, "logps/rejected": -2.6209521293640137, "loss": 2.8296, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.203344345092773, "rewards/margins": 1.0061757564544678, "rewards/rejected": -26.209522247314453, "step": 29170 }, { "epoch": 0.9833496241868617, "grad_norm": 114.72516632080078, "learning_rate": 8.442747670029948e-10, "logits/chosen": -1.3245737552642822, "logits/rejected": -1.4342682361602783, "logps/chosen": -2.268794298171997, "logps/rejected": -2.3118577003479004, "loss": 4.0943, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.687946319580078, "rewards/margins": 0.43063393235206604, "rewards/rejected": -23.118576049804688, "step": 29175 }, { "epoch": 0.9835181502578448, "grad_norm": 0.09160160273313522, "learning_rate": 8.272753660665821e-10, "logits/chosen": -1.489280104637146, "logits/rejected": -2.2636759281158447, "logps/chosen": -2.1328532695770264, "logps/rejected": -3.0796000957489014, "loss": 1.7, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.328535079956055, "rewards/margins": 9.467466354370117, "rewards/rejected": -30.796001434326172, "step": 29180 }, { "epoch": 0.9836866763288281, "grad_norm": 32.43025207519531, "learning_rate": 8.104487091435963e-10, "logits/chosen": -2.016549825668335, "logits/rejected": -2.1013665199279785, "logps/chosen": -2.4078688621520996, "logps/rejected": -2.9210119247436523, "loss": 2.4298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.078689575195312, "rewards/margins": 5.131430149078369, "rewards/rejected": -29.210119247436523, "step": 29185 }, { "epoch": 0.9838552023998113, "grad_norm": 39.57857894897461, "learning_rate": 7.937948020569906e-10, "logits/chosen": -1.3796770572662354, "logits/rejected": -1.4084769487380981, "logps/chosen": -2.1895060539245605, "logps/rejected": -2.352719306945801, "loss": 2.0924, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.895061492919922, "rewards/margins": 1.6321327686309814, "rewards/rejected": -23.527191162109375, "step": 29190 }, { "epoch": 0.9840237284707944, "grad_norm": 18.390674591064453, "learning_rate": 7.773136505700995e-10, "logits/chosen": -1.5435973405838013, "logits/rejected": -1.9853636026382446, "logps/chosen": -2.088843822479248, "logps/rejected": -2.8570563793182373, "loss": 1.6811, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.8884334564209, "rewards/margins": 7.682130336761475, "rewards/rejected": -28.570566177368164, "step": 29195 }, { "epoch": 0.9841922545417776, "grad_norm": 13.768671989440918, "learning_rate": 7.610052603863048e-10, "logits/chosen": -1.6334747076034546, "logits/rejected": -1.6783907413482666, "logps/chosen": -1.8387682437896729, "logps/rejected": -1.905200719833374, "loss": 2.7584, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -18.387680053710938, "rewards/margins": 0.664326012134552, "rewards/rejected": -19.0520076751709, "step": 29200 }, { "epoch": 0.9841922545417776, "eval_logits/chosen": -2.3112363815307617, "eval_logits/rejected": -2.489377975463867, "eval_logps/chosen": -2.289456605911255, "eval_logps/rejected": -2.444096088409424, "eval_loss": 3.0873899459838867, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.89456558227539, "eval_rewards/margins": 1.5463964939117432, "eval_rewards/rejected": -24.440961837768555, "eval_runtime": 12.8936, "eval_samples_per_second": 7.756, "eval_steps_per_second": 1.939, "step": 29200 }, { "epoch": 0.9843607806127608, "grad_norm": 59.42336654663086, "learning_rate": 7.448696371494257e-10, "logits/chosen": -1.8782780170440674, "logits/rejected": -2.575622797012329, "logps/chosen": -2.4771718978881836, "logps/rejected": -3.015079975128174, "loss": 1.4728, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.771717071533203, "rewards/margins": 5.379080772399902, "rewards/rejected": -30.150802612304688, "step": 29205 }, { "epoch": 0.9845293066837439, "grad_norm": 8.903990745544434, "learning_rate": 7.28906786443273e-10, "logits/chosen": -2.6060993671417236, "logits/rejected": -2.4315786361694336, "logps/chosen": -3.1309814453125, "logps/rejected": -3.394001007080078, "loss": 2.6579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -31.309810638427734, "rewards/margins": 2.630197763442993, "rewards/rejected": -33.94001007080078, "step": 29210 }, { "epoch": 0.9846978327547271, "grad_norm": 35.52606964111328, "learning_rate": 7.13116713791928e-10, "logits/chosen": -1.7815643548965454, "logits/rejected": -1.951751470565796, "logps/chosen": -2.3580946922302246, "logps/rejected": -2.5867106914520264, "loss": 2.2759, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.580947875976562, "rewards/margins": 2.2861576080322266, "rewards/rejected": -25.867107391357422, "step": 29215 }, { "epoch": 0.9848663588257104, "grad_norm": 33.73111343383789, "learning_rate": 6.974994246598531e-10, "logits/chosen": -2.0735392570495605, "logits/rejected": -2.174109697341919, "logps/chosen": -2.5979247093200684, "logps/rejected": -2.739593267440796, "loss": 3.0291, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.979248046875, "rewards/margins": 1.4166853427886963, "rewards/rejected": -27.395931243896484, "step": 29220 }, { "epoch": 0.9850348848966936, "grad_norm": 32.62689208984375, "learning_rate": 6.820549244514473e-10, "logits/chosen": -2.2399845123291016, "logits/rejected": -1.980328917503357, "logps/chosen": -2.616978645324707, "logps/rejected": -2.76224684715271, "loss": 3.4192, "rewards/accuracies": 0.5, "rewards/chosen": -26.169788360595703, "rewards/margins": 1.4526822566986084, "rewards/rejected": -27.62247085571289, "step": 29225 }, { "epoch": 0.9852034109676767, "grad_norm": 34.097007751464844, "learning_rate": 6.667832185114908e-10, "logits/chosen": -1.56089186668396, "logits/rejected": -1.7227294445037842, "logps/chosen": -1.9335496425628662, "logps/rejected": -2.214301824569702, "loss": 2.5257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.335498809814453, "rewards/margins": 2.807521343231201, "rewards/rejected": -22.14301872253418, "step": 29230 }, { "epoch": 0.9853719370386599, "grad_norm": 31.763954162597656, "learning_rate": 6.516843121249227e-10, "logits/chosen": -1.6636956930160522, "logits/rejected": -1.8226375579833984, "logps/chosen": -1.736010193824768, "logps/rejected": -1.8142486810684204, "loss": 2.6185, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.3601016998291, "rewards/margins": 0.7823851108551025, "rewards/rejected": -18.142486572265625, "step": 29235 }, { "epoch": 0.985540463109643, "grad_norm": 32.31121826171875, "learning_rate": 6.367582105168968e-10, "logits/chosen": -1.8201602697372437, "logits/rejected": -2.7413697242736816, "logps/chosen": -2.2536308765411377, "logps/rejected": -3.829385280609131, "loss": 1.3365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.536306381225586, "rewards/margins": 15.757547378540039, "rewards/rejected": -38.293853759765625, "step": 29240 }, { "epoch": 0.9857089891806262, "grad_norm": 27.129085540771484, "learning_rate": 6.220049188527254e-10, "logits/chosen": -1.7569191455841064, "logits/rejected": -1.9579541683197021, "logps/chosen": -2.5340628623962402, "logps/rejected": -2.6569466590881348, "loss": 3.4591, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.340627670288086, "rewards/margins": 1.228839635848999, "rewards/rejected": -26.5694637298584, "step": 29245 }, { "epoch": 0.9858775152516094, "grad_norm": 7.850180149078369, "learning_rate": 6.07424442237936e-10, "logits/chosen": -2.442378044128418, "logits/rejected": -2.476134777069092, "logps/chosen": -2.5148308277130127, "logps/rejected": -3.0705695152282715, "loss": 2.3982, "rewards/accuracies": 0.5, "rewards/chosen": -25.148305892944336, "rewards/margins": 5.557389736175537, "rewards/rejected": -30.705698013305664, "step": 29250 }, { "epoch": 0.9860460413225927, "grad_norm": 56.99378967285156, "learning_rate": 5.930167857182699e-10, "logits/chosen": -1.0735199451446533, "logits/rejected": -1.1086907386779785, "logps/chosen": -2.445570468902588, "logps/rejected": -2.5016674995422363, "loss": 3.7414, "rewards/accuracies": 0.5, "rewards/chosen": -24.455707550048828, "rewards/margins": 0.560967743396759, "rewards/rejected": -25.016674041748047, "step": 29255 }, { "epoch": 0.9862145673935758, "grad_norm": 30.35076904296875, "learning_rate": 5.787819542796279e-10, "logits/chosen": -1.972487211227417, "logits/rejected": -2.2727303504943848, "logps/chosen": -2.4433388710021973, "logps/rejected": -2.5713579654693604, "loss": 4.8165, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -24.433391571044922, "rewards/margins": 1.2801861763000488, "rewards/rejected": -25.713577270507812, "step": 29260 }, { "epoch": 0.986383093464559, "grad_norm": 22.402549743652344, "learning_rate": 5.647199528481805e-10, "logits/chosen": -2.089200496673584, "logits/rejected": -2.1892621517181396, "logps/chosen": -2.8049428462982178, "logps/rejected": -3.1378207206726074, "loss": 2.0068, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.049428939819336, "rewards/margins": 3.328777313232422, "rewards/rejected": -31.37820816040039, "step": 29265 }, { "epoch": 0.9865516195355422, "grad_norm": 719.610107421875, "learning_rate": 5.508307862901462e-10, "logits/chosen": -2.50829815864563, "logits/rejected": -2.8846817016601562, "logps/chosen": -3.515585422515869, "logps/rejected": -3.9776313304901123, "loss": 3.2065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -35.15585708618164, "rewards/margins": 4.620454788208008, "rewards/rejected": -39.77631378173828, "step": 29270 }, { "epoch": 0.9867201456065253, "grad_norm": 65.99314880371094, "learning_rate": 5.371144594120691e-10, "logits/chosen": -1.5672378540039062, "logits/rejected": -1.792138695716858, "logps/chosen": -3.856194257736206, "logps/rejected": -3.839482069015503, "loss": 5.3038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -38.56194305419922, "rewards/margins": -0.16712017357349396, "rewards/rejected": -38.39482116699219, "step": 29275 }, { "epoch": 0.9868886716775085, "grad_norm": 50.995548248291016, "learning_rate": 5.235709769606522e-10, "logits/chosen": -1.8027080297470093, "logits/rejected": -2.1330089569091797, "logps/chosen": -2.413428544998169, "logps/rejected": -2.9030966758728027, "loss": 2.2273, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -24.13428497314453, "rewards/margins": 4.896681785583496, "rewards/rejected": -29.030963897705078, "step": 29280 }, { "epoch": 0.9870571977484917, "grad_norm": 51.32760238647461, "learning_rate": 5.102003436227576e-10, "logits/chosen": -2.102128267288208, "logits/rejected": -2.396106004714966, "logps/chosen": -2.6976327896118164, "logps/rejected": -3.4498488903045654, "loss": 1.5823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.976327896118164, "rewards/margins": 7.522161960601807, "rewards/rejected": -34.49848937988281, "step": 29285 }, { "epoch": 0.9872257238194748, "grad_norm": 41.277225494384766, "learning_rate": 4.970025640253505e-10, "logits/chosen": -1.7478545904159546, "logits/rejected": -2.046114206314087, "logps/chosen": -2.7258543968200684, "logps/rejected": -2.9497528076171875, "loss": 3.0146, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.258544921875, "rewards/margins": 2.238983154296875, "rewards/rejected": -29.497528076171875, "step": 29290 }, { "epoch": 0.9873942498904581, "grad_norm": 11.74426555633545, "learning_rate": 4.839776427357778e-10, "logits/chosen": -1.8929609060287476, "logits/rejected": -2.212101459503174, "logps/chosen": -2.844020366668701, "logps/rejected": -3.1261801719665527, "loss": 2.3292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.440200805664062, "rewards/margins": 2.8215994834899902, "rewards/rejected": -31.261804580688477, "step": 29295 }, { "epoch": 0.9875627759614413, "grad_norm": 21.692180633544922, "learning_rate": 4.711255842613226e-10, "logits/chosen": -1.4589345455169678, "logits/rejected": -2.142324209213257, "logps/chosen": -1.9915090799331665, "logps/rejected": -2.128383159637451, "loss": 2.2874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.915088653564453, "rewards/margins": 1.368740200996399, "rewards/rejected": -21.283828735351562, "step": 29300 }, { "epoch": 0.9877313020324244, "grad_norm": 21.955440521240234, "learning_rate": 4.584463930497051e-10, "logits/chosen": -1.9233735799789429, "logits/rejected": -2.112452268600464, "logps/chosen": -2.1158950328826904, "logps/rejected": -2.686612606048584, "loss": 2.1764, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.158950805664062, "rewards/margins": 5.707176208496094, "rewards/rejected": -26.866125106811523, "step": 29305 }, { "epoch": 0.9878998281034076, "grad_norm": 20.328765869140625, "learning_rate": 4.459400734886376e-10, "logits/chosen": -1.9143705368041992, "logits/rejected": -2.199479341506958, "logps/chosen": -1.9987812042236328, "logps/rejected": -2.205310344696045, "loss": 1.8434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.987812042236328, "rewards/margins": 2.0652928352355957, "rewards/rejected": -22.053104400634766, "step": 29310 }, { "epoch": 0.9880683541743908, "grad_norm": 25.061670303344727, "learning_rate": 4.33606629906047e-10, "logits/chosen": -1.4500936269760132, "logits/rejected": -1.5468348264694214, "logps/chosen": -2.3813271522521973, "logps/rejected": -2.4723310470581055, "loss": 2.542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -23.81327247619629, "rewards/margins": 0.910036563873291, "rewards/rejected": -24.723308563232422, "step": 29315 }, { "epoch": 0.9882368802453739, "grad_norm": 110.3287582397461, "learning_rate": 4.2144606657007475e-10, "logits/chosen": -1.7592315673828125, "logits/rejected": -1.8345882892608643, "logps/chosen": -2.7503910064697266, "logps/rejected": -2.7181968688964844, "loss": 4.7156, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -27.5039119720459, "rewards/margins": -0.3219425082206726, "rewards/rejected": -27.18196678161621, "step": 29320 }, { "epoch": 0.9884054063163571, "grad_norm": 69.42078399658203, "learning_rate": 4.0945838768902116e-10, "logits/chosen": -1.7482601404190063, "logits/rejected": -1.8381834030151367, "logps/chosen": -2.808565378189087, "logps/rejected": -3.3611602783203125, "loss": 1.7605, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -28.085657119750977, "rewards/margins": 5.525949954986572, "rewards/rejected": -33.611602783203125, "step": 29325 }, { "epoch": 0.9885739323873404, "grad_norm": 56.57253646850586, "learning_rate": 3.9764359741134566e-10, "logits/chosen": -1.9182945489883423, "logits/rejected": -2.220074415206909, "logps/chosen": -2.5628058910369873, "logps/rejected": -3.0394446849823, "loss": 1.4884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.628061294555664, "rewards/margins": 4.766386985778809, "rewards/rejected": -30.394445419311523, "step": 29330 }, { "epoch": 0.9887424584583235, "grad_norm": 28.18037986755371, "learning_rate": 3.8600169982566655e-10, "logits/chosen": -1.6121898889541626, "logits/rejected": -1.843125581741333, "logps/chosen": -2.0145015716552734, "logps/rejected": -2.2432212829589844, "loss": 2.1496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.1450138092041, "rewards/margins": 2.2871999740600586, "rewards/rejected": -22.432212829589844, "step": 29335 }, { "epoch": 0.9889109845293067, "grad_norm": 26.318395614624023, "learning_rate": 3.7453269896081665e-10, "logits/chosen": -2.023275852203369, "logits/rejected": -2.5470919609069824, "logps/chosen": -2.5232348442077637, "logps/rejected": -3.547163486480713, "loss": 0.7872, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -25.232349395751953, "rewards/margins": 10.239288330078125, "rewards/rejected": -35.47163391113281, "step": 29340 }, { "epoch": 0.9890795106002899, "grad_norm": 68.82476043701172, "learning_rate": 3.632365987856767e-10, "logits/chosen": -2.064157009124756, "logits/rejected": -1.9872633218765259, "logps/chosen": -2.234658718109131, "logps/rejected": -2.367302179336548, "loss": 2.9236, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.34658432006836, "rewards/margins": 1.3264367580413818, "rewards/rejected": -23.673023223876953, "step": 29345 }, { "epoch": 0.989248036671273, "grad_norm": 175.67710876464844, "learning_rate": 3.5211340320950853e-10, "logits/chosen": -2.1267590522766113, "logits/rejected": -2.370877742767334, "logps/chosen": -3.2157340049743652, "logps/rejected": -3.601935625076294, "loss": 1.9471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.15734100341797, "rewards/margins": 3.862014055252075, "rewards/rejected": -36.01935577392578, "step": 29350 }, { "epoch": 0.9894165627422562, "grad_norm": 13.162681579589844, "learning_rate": 3.4116311608151095e-10, "logits/chosen": -1.7573215961456299, "logits/rejected": -1.9320533275604248, "logps/chosen": -3.3882813453674316, "logps/rejected": -3.6124320030212402, "loss": 2.8952, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -33.8828125, "rewards/margins": 2.2415099143981934, "rewards/rejected": -36.12432098388672, "step": 29355 }, { "epoch": 0.9895850888132394, "grad_norm": 49.257545471191406, "learning_rate": 3.303857411912081e-10, "logits/chosen": -1.8998920917510986, "logits/rejected": -2.0867068767547607, "logps/chosen": -1.901677131652832, "logps/rejected": -2.135617971420288, "loss": 2.2306, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.01677131652832, "rewards/margins": 2.339409351348877, "rewards/rejected": -21.356182098388672, "step": 29360 }, { "epoch": 0.9897536148842226, "grad_norm": 73.01632690429688, "learning_rate": 3.1978128226822775e-10, "logits/chosen": -1.3546922206878662, "logits/rejected": -1.2406705617904663, "logps/chosen": -2.1406056880950928, "logps/rejected": -2.3482110500335693, "loss": 2.5606, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -21.406057357788086, "rewards/margins": 2.0760552883148193, "rewards/rejected": -23.482112884521484, "step": 29365 }, { "epoch": 0.9899221409552058, "grad_norm": 51.609500885009766, "learning_rate": 3.093497429823011e-10, "logits/chosen": -2.147397518157959, "logits/rejected": -2.0322537422180176, "logps/chosen": -2.272533893585205, "logps/rejected": -2.137698173522949, "loss": 5.1356, "rewards/accuracies": 0.5, "rewards/chosen": -22.725337982177734, "rewards/margins": -1.3483564853668213, "rewards/rejected": -21.37697982788086, "step": 29370 }, { "epoch": 0.990090667026189, "grad_norm": 15.991497039794922, "learning_rate": 2.990911269433738e-10, "logits/chosen": -2.087343215942383, "logits/rejected": -2.145076036453247, "logps/chosen": -3.2033817768096924, "logps/rejected": -3.4729628562927246, "loss": 2.1612, "rewards/accuracies": 0.5, "rewards/chosen": -32.0338134765625, "rewards/margins": 2.6958134174346924, "rewards/rejected": -34.7296257019043, "step": 29375 }, { "epoch": 0.9902591930971721, "grad_norm": 35.58505630493164, "learning_rate": 2.89005437701606e-10, "logits/chosen": -2.4574391841888428, "logits/rejected": -2.5140743255615234, "logps/chosen": -2.106220245361328, "logps/rejected": -2.2607998847961426, "loss": 1.9083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.06220245361328, "rewards/margins": 1.5457961559295654, "rewards/rejected": -22.60799789428711, "step": 29380 }, { "epoch": 0.9904277191681553, "grad_norm": 127.40261840820312, "learning_rate": 2.790926787472614e-10, "logits/chosen": -2.1903467178344727, "logits/rejected": -2.4454219341278076, "logps/chosen": -3.0538299083709717, "logps/rejected": -3.1230578422546387, "loss": 2.8361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -30.538299560546875, "rewards/margins": 0.6922758221626282, "rewards/rejected": -31.230575561523438, "step": 29385 }, { "epoch": 0.9905962452391385, "grad_norm": 290.842041015625, "learning_rate": 2.693528535106515e-10, "logits/chosen": -2.323547840118408, "logits/rejected": -2.200070858001709, "logps/chosen": -2.8393075466156006, "logps/rejected": -2.5815980434417725, "loss": 6.0147, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -28.393077850341797, "rewards/margins": -2.5770952701568604, "rewards/rejected": -25.815982818603516, "step": 29390 }, { "epoch": 0.9907647713101216, "grad_norm": 0.32132697105407715, "learning_rate": 2.59785965362469e-10, "logits/chosen": -1.2735278606414795, "logits/rejected": -1.618552803993225, "logps/chosen": -2.191002368927002, "logps/rejected": -2.9995617866516113, "loss": 2.9313, "rewards/accuracies": 0.5, "rewards/chosen": -21.910022735595703, "rewards/margins": 8.085596084594727, "rewards/rejected": -29.995616912841797, "step": 29395 }, { "epoch": 0.9909332973811048, "grad_norm": 15.46959114074707, "learning_rate": 2.503920176133989e-10, "logits/chosen": -2.1506848335266113, "logits/rejected": -2.733182191848755, "logps/chosen": -2.5749802589416504, "logps/rejected": -3.1757876873016357, "loss": 1.575, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -25.749805450439453, "rewards/margins": 6.008072853088379, "rewards/rejected": -31.75787353515625, "step": 29400 }, { "epoch": 0.9911018234520881, "grad_norm": 41.7327880859375, "learning_rate": 2.4117101351428527e-10, "logits/chosen": -1.9427173137664795, "logits/rejected": -2.579338550567627, "logps/chosen": -2.8326809406280518, "logps/rejected": -4.368477821350098, "loss": 1.543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.32680892944336, "rewards/margins": 15.357976913452148, "rewards/rejected": -43.684783935546875, "step": 29405 }, { "epoch": 0.9912703495230712, "grad_norm": 112.20866394042969, "learning_rate": 2.321229562561311e-10, "logits/chosen": -1.885607123374939, "logits/rejected": -2.2688915729522705, "logps/chosen": -2.0797476768493652, "logps/rejected": -2.6279759407043457, "loss": 2.9359, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.7974796295166, "rewards/margins": 5.482276916503906, "rewards/rejected": -26.279754638671875, "step": 29410 }, { "epoch": 0.9914388755940544, "grad_norm": 33.625770568847656, "learning_rate": 2.2324784897020942e-10, "logits/chosen": -2.2554001808166504, "logits/rejected": -2.609884738922119, "logps/chosen": -2.615712881088257, "logps/rejected": -3.316114902496338, "loss": 2.0842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -26.157129287719727, "rewards/margins": 7.0040178298950195, "rewards/rejected": -33.16114807128906, "step": 29415 }, { "epoch": 0.9916074016650376, "grad_norm": 21.583099365234375, "learning_rate": 2.1454569472773022e-10, "logits/chosen": -1.9407627582550049, "logits/rejected": -2.017627000808716, "logps/chosen": -2.4730026721954346, "logps/rejected": -3.1498286724090576, "loss": 2.4309, "rewards/accuracies": 0.5, "rewards/chosen": -24.730026245117188, "rewards/margins": 6.7682600021362305, "rewards/rejected": -31.4982852935791, "step": 29420 }, { "epoch": 0.9917759277360207, "grad_norm": 14.413912773132324, "learning_rate": 2.0601649654028441e-10, "logits/chosen": -1.7489020824432373, "logits/rejected": -2.184823513031006, "logps/chosen": -2.741649627685547, "logps/rejected": -3.141798734664917, "loss": 2.2382, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -27.4164981842041, "rewards/margins": 4.001489162445068, "rewards/rejected": -31.417987823486328, "step": 29425 }, { "epoch": 0.9919444538070039, "grad_norm": 20.478744506835938, "learning_rate": 1.9766025735939995e-10, "logits/chosen": -2.1394152641296387, "logits/rejected": -2.261610507965088, "logps/chosen": -1.9039617776870728, "logps/rejected": -2.0586116313934326, "loss": 2.9164, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.03961753845215, "rewards/margins": 1.5464990139007568, "rewards/rejected": -20.58611488342285, "step": 29430 }, { "epoch": 0.9921129798779871, "grad_norm": 31.20186996459961, "learning_rate": 1.8947698007687474e-10, "logits/chosen": -1.5419042110443115, "logits/rejected": -1.47139310836792, "logps/chosen": -2.033275842666626, "logps/rejected": -2.1540489196777344, "loss": 2.9035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.33275604248047, "rewards/margins": 1.207731008529663, "rewards/rejected": -21.54048728942871, "step": 29435 }, { "epoch": 0.9922815059489704, "grad_norm": 79.59695434570312, "learning_rate": 1.8146666752466566e-10, "logits/chosen": -1.8107490539550781, "logits/rejected": -1.939801812171936, "logps/chosen": -2.415051221847534, "logps/rejected": -2.6897196769714355, "loss": 2.4452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.150510787963867, "rewards/margins": 2.7466835975646973, "rewards/rejected": -26.897192001342773, "step": 29440 }, { "epoch": 0.9924500320199535, "grad_norm": 7.891360291978344e-05, "learning_rate": 1.7362932247472206e-10, "logits/chosen": -2.1820032596588135, "logits/rejected": -2.3591179847717285, "logps/chosen": -2.8081321716308594, "logps/rejected": -3.666566848754883, "loss": 1.7237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.081323623657227, "rewards/margins": 8.584344863891602, "rewards/rejected": -36.66566848754883, "step": 29445 }, { "epoch": 0.9926185580909367, "grad_norm": 48.53298568725586, "learning_rate": 1.6596494763931878e-10, "logits/chosen": -1.581608772277832, "logits/rejected": -1.462424635887146, "logps/chosen": -2.1671245098114014, "logps/rejected": -2.1732068061828613, "loss": 3.3712, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -21.671245574951172, "rewards/margins": 0.06082124635577202, "rewards/rejected": -21.732067108154297, "step": 29450 }, { "epoch": 0.9927870841619199, "grad_norm": 71.53341674804688, "learning_rate": 1.5847354567077864e-10, "logits/chosen": -1.604501485824585, "logits/rejected": -2.0805764198303223, "logps/chosen": -2.781949043273926, "logps/rejected": -2.9773471355438232, "loss": 3.1445, "rewards/accuracies": 0.5, "rewards/chosen": -27.81949234008789, "rewards/margins": 1.9539794921875, "rewards/rejected": -29.77347183227539, "step": 29455 }, { "epoch": 0.992955610232903, "grad_norm": 38.53224563598633, "learning_rate": 1.511551191615279e-10, "logits/chosen": -2.089249849319458, "logits/rejected": -2.2512056827545166, "logps/chosen": -2.8309149742126465, "logps/rejected": -2.986532688140869, "loss": 2.5773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -28.30914878845215, "rewards/margins": 1.5561754703521729, "rewards/rejected": -29.86532211303711, "step": 29460 }, { "epoch": 0.9931241363038862, "grad_norm": 34.39546585083008, "learning_rate": 1.4400967064426283e-10, "logits/chosen": -1.3842085599899292, "logits/rejected": -1.924599051475525, "logps/chosen": -1.9771686792373657, "logps/rejected": -2.347097396850586, "loss": 1.884, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -19.771686553955078, "rewards/margins": 3.6992874145507812, "rewards/rejected": -23.47097396850586, "step": 29465 }, { "epoch": 0.9932926623748694, "grad_norm": 88.5250244140625, "learning_rate": 1.3703720259172768e-10, "logits/chosen": -1.921378493309021, "logits/rejected": -1.913988471031189, "logps/chosen": -2.352660655975342, "logps/rejected": -2.473912000656128, "loss": 2.6431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.52660369873047, "rewards/margins": 1.2125155925750732, "rewards/rejected": -24.739120483398438, "step": 29470 }, { "epoch": 0.9934611884458526, "grad_norm": 87.77112579345703, "learning_rate": 1.3023771741682564e-10, "logits/chosen": -1.5295897722244263, "logits/rejected": -1.5386664867401123, "logps/chosen": -2.8793563842773438, "logps/rejected": -2.8281455039978027, "loss": 4.8123, "rewards/accuracies": 0.5, "rewards/chosen": -28.793567657470703, "rewards/margins": -0.5121095776557922, "rewards/rejected": -28.28145408630371, "step": 29475 }, { "epoch": 0.9936297145168358, "grad_norm": 35.174774169921875, "learning_rate": 1.2361121747250792e-10, "logits/chosen": -1.5465962886810303, "logits/rejected": -1.622862458229065, "logps/chosen": -2.6694183349609375, "logps/rejected": -2.7877554893493652, "loss": 2.9192, "rewards/accuracies": 0.5, "rewards/chosen": -26.69417953491211, "rewards/margins": 1.1833751201629639, "rewards/rejected": -27.877553939819336, "step": 29480 }, { "epoch": 0.993798240587819, "grad_norm": 38.90205383300781, "learning_rate": 1.1715770505205114e-10, "logits/chosen": -1.9135345220565796, "logits/rejected": -2.611912250518799, "logps/chosen": -2.255214214324951, "logps/rejected": -2.5369668006896973, "loss": 2.9454, "rewards/accuracies": 0.5, "rewards/chosen": -22.552143096923828, "rewards/margins": 2.8175246715545654, "rewards/rejected": -25.369667053222656, "step": 29485 }, { "epoch": 0.9939667666588021, "grad_norm": 38.99119186401367, "learning_rate": 1.1087718238866894e-10, "logits/chosen": -2.2368454933166504, "logits/rejected": -2.5548629760742188, "logps/chosen": -2.6624879837036133, "logps/rejected": -2.9628915786743164, "loss": 2.3557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -26.6248779296875, "rewards/margins": 3.0040369033813477, "rewards/rejected": -29.628917694091797, "step": 29490 }, { "epoch": 0.9941352927297853, "grad_norm": 28.901277542114258, "learning_rate": 1.0476965165590046e-10, "logits/chosen": -2.244162082672119, "logits/rejected": -2.3326351642608643, "logps/chosen": -2.6673943996429443, "logps/rejected": -2.514676332473755, "loss": 6.1204, "rewards/accuracies": 0.5, "rewards/chosen": -26.6739444732666, "rewards/margins": -1.527183175086975, "rewards/rejected": -25.14676284790039, "step": 29495 }, { "epoch": 0.9943038188007685, "grad_norm": 31.86191177368164, "learning_rate": 9.883511496722175e-11, "logits/chosen": -2.0590789318084717, "logits/rejected": -2.631744623184204, "logps/chosen": -2.57993745803833, "logps/rejected": -3.5227248668670654, "loss": 1.5666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -25.79937744140625, "rewards/margins": 9.427871704101562, "rewards/rejected": -35.22725296020508, "step": 29500 }, { "epoch": 0.9944723448717516, "grad_norm": 28.257728576660156, "learning_rate": 9.307357437637887e-11, "logits/chosen": -1.8757565021514893, "logits/rejected": -1.8568928241729736, "logps/chosen": -2.9727933406829834, "logps/rejected": -3.244004011154175, "loss": 1.5322, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -29.72793197631836, "rewards/margins": 2.7121071815490723, "rewards/rejected": -32.440040588378906, "step": 29505 }, { "epoch": 0.9946408709427348, "grad_norm": 2.5099925994873047, "learning_rate": 8.748503187727685e-11, "logits/chosen": -2.2586395740509033, "logits/rejected": -2.446408748626709, "logps/chosen": -3.2831978797912598, "logps/rejected": -3.4535508155822754, "loss": 2.634, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -32.83197784423828, "rewards/margins": 1.7035328149795532, "rewards/rejected": -34.53550720214844, "step": 29510 }, { "epoch": 0.9948093970137181, "grad_norm": 145.2407684326172, "learning_rate": 8.20694894038132e-11, "logits/chosen": -2.0817415714263916, "logits/rejected": -1.9775947332382202, "logps/chosen": -2.9662461280822754, "logps/rejected": -3.018073797225952, "loss": 4.6757, "rewards/accuracies": 0.5, "rewards/chosen": -29.662464141845703, "rewards/margins": 0.5182735323905945, "rewards/rejected": -30.180736541748047, "step": 29515 }, { "epoch": 0.9949779230847012, "grad_norm": 35.678321838378906, "learning_rate": 7.682694883015539e-11, "logits/chosen": -1.6773369312286377, "logits/rejected": -2.1161258220672607, "logps/chosen": -1.8673717975616455, "logps/rejected": -2.8105673789978027, "loss": 2.3088, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.673717498779297, "rewards/margins": 9.431957244873047, "rewards/rejected": -28.105676651000977, "step": 29520 }, { "epoch": 0.9951464491556844, "grad_norm": 12.716200828552246, "learning_rate": 7.175741197046337e-11, "logits/chosen": -2.0986926555633545, "logits/rejected": -2.707505702972412, "logps/chosen": -2.657029390335083, "logps/rejected": -3.285545825958252, "loss": 2.1085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.57029151916504, "rewards/margins": 6.285164833068848, "rewards/rejected": -32.8554573059082, "step": 29525 }, { "epoch": 0.9953149752266676, "grad_norm": 44.7720947265625, "learning_rate": 6.686088057916706e-11, "logits/chosen": -1.5420719385147095, "logits/rejected": -1.6112966537475586, "logps/chosen": -1.7459132671356201, "logps/rejected": -1.9257084131240845, "loss": 2.3536, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -17.45913314819336, "rewards/margins": 1.7979503870010376, "rewards/rejected": -19.257083892822266, "step": 29530 }, { "epoch": 0.9954835012976507, "grad_norm": 55.11979293823242, "learning_rate": 6.213735635068885e-11, "logits/chosen": -1.4349676370620728, "logits/rejected": -1.5553808212280273, "logps/chosen": -2.4499545097351074, "logps/rejected": -2.442242383956909, "loss": 3.2346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -24.499547958374023, "rewards/margins": -0.0771210640668869, "rewards/rejected": -24.422426223754883, "step": 29535 }, { "epoch": 0.9956520273686339, "grad_norm": 38.8819465637207, "learning_rate": 5.7586840919776616e-11, "logits/chosen": -1.542608618736267, "logits/rejected": -1.7981573343276978, "logps/chosen": -2.311901569366455, "logps/rejected": -2.3454556465148926, "loss": 3.6785, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -23.119014739990234, "rewards/margins": 0.3355420231819153, "rewards/rejected": -23.454559326171875, "step": 29540 }, { "epoch": 0.9958205534396171, "grad_norm": 30.439922332763672, "learning_rate": 5.320933586105969e-11, "logits/chosen": -1.348572850227356, "logits/rejected": -1.7277705669403076, "logps/chosen": -2.084947109222412, "logps/rejected": -2.9177541732788086, "loss": 1.8665, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -20.849472045898438, "rewards/margins": 8.328069686889648, "rewards/rejected": -29.177541732788086, "step": 29545 }, { "epoch": 0.9959890795106003, "grad_norm": 35.98751449584961, "learning_rate": 4.90048426894929e-11, "logits/chosen": -1.7604057788848877, "logits/rejected": -1.8795154094696045, "logps/chosen": -1.927356481552124, "logps/rejected": -2.0210258960723877, "loss": 2.453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.2735652923584, "rewards/margins": 0.9366942644119263, "rewards/rejected": -20.21026039123535, "step": 29550 }, { "epoch": 0.9961576055815835, "grad_norm": 23.52623176574707, "learning_rate": 4.497336286007902e-11, "logits/chosen": -2.0633692741394043, "logits/rejected": -2.2805099487304688, "logps/chosen": -2.636157989501953, "logps/rejected": -2.6414108276367188, "loss": 3.6446, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.361581802368164, "rewards/margins": 0.05252895504236221, "rewards/rejected": -26.414112091064453, "step": 29555 }, { "epoch": 0.9963261316525667, "grad_norm": 0.07981903105974197, "learning_rate": 4.111489776792432e-11, "logits/chosen": -1.992034912109375, "logits/rejected": -1.8614925146102905, "logps/chosen": -2.9912405014038086, "logps/rejected": -3.2475357055664062, "loss": 2.7499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -29.912405014038086, "rewards/margins": 2.5629496574401855, "rewards/rejected": -32.4753532409668, "step": 29560 }, { "epoch": 0.9964946577235498, "grad_norm": 29.34018325805664, "learning_rate": 3.742944874829401e-11, "logits/chosen": -1.7337758541107178, "logits/rejected": -1.7979612350463867, "logps/chosen": -2.685328722000122, "logps/rejected": -2.908069133758545, "loss": 2.1848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.853282928466797, "rewards/margins": 2.2274093627929688, "rewards/rejected": -29.0806941986084, "step": 29565 }, { "epoch": 0.996663183794533, "grad_norm": 31.588529586791992, "learning_rate": 3.391701707666783e-11, "logits/chosen": -1.919046401977539, "logits/rejected": -2.470398426055908, "logps/chosen": -2.288696050643921, "logps/rejected": -3.090512752532959, "loss": 1.9905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.886960983276367, "rewards/margins": 8.018167495727539, "rewards/rejected": -30.90513038635254, "step": 29570 }, { "epoch": 0.9968317098655162, "grad_norm": 36.931217193603516, "learning_rate": 3.0577603968406915e-11, "logits/chosen": -1.0361428260803223, "logits/rejected": -1.8651416301727295, "logps/chosen": -2.3662960529327393, "logps/rejected": -3.014408588409424, "loss": 2.6455, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.662960052490234, "rewards/margins": 6.481126308441162, "rewards/rejected": -30.144084930419922, "step": 29575 }, { "epoch": 0.9970002359364993, "grad_norm": 77.14857482910156, "learning_rate": 2.741121057925344e-11, "logits/chosen": -2.14117169380188, "logits/rejected": -2.7788901329040527, "logps/chosen": -3.382768154144287, "logps/rejected": -4.087404727935791, "loss": 1.3519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -33.82768249511719, "rewards/margins": 7.046361446380615, "rewards/rejected": -40.874046325683594, "step": 29580 }, { "epoch": 0.9971687620074826, "grad_norm": 13.677254676818848, "learning_rate": 2.4417838004942014e-11, "logits/chosen": -1.6798818111419678, "logits/rejected": -1.7711633443832397, "logps/chosen": -2.3940412998199463, "logps/rejected": -2.2626118659973145, "loss": 4.6407, "rewards/accuracies": 0.5, "rewards/chosen": -23.940410614013672, "rewards/margins": -1.3142937421798706, "rewards/rejected": -22.626117706298828, "step": 29585 }, { "epoch": 0.9973372880784658, "grad_norm": 25.177274703979492, "learning_rate": 2.1597487281366234e-11, "logits/chosen": -1.2978934049606323, "logits/rejected": -1.5820646286010742, "logps/chosen": -2.278775691986084, "logps/rejected": -2.6102938652038574, "loss": 1.3518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.78775405883789, "rewards/margins": 3.315180540084839, "rewards/rejected": -26.102935791015625, "step": 29590 }, { "epoch": 0.9975058141494489, "grad_norm": 17.82845115661621, "learning_rate": 1.8950159384578666e-11, "logits/chosen": -1.8338546752929688, "logits/rejected": -1.8769657611846924, "logps/chosen": -3.4636001586914062, "logps/rejected": -3.8861083984375, "loss": 2.4416, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -34.6359977722168, "rewards/margins": 4.225088596343994, "rewards/rejected": -38.861087799072266, "step": 29595 }, { "epoch": 0.9976743402204321, "grad_norm": 33.08721923828125, "learning_rate": 1.6475855230624336e-11, "logits/chosen": -2.3700110912323, "logits/rejected": -2.2387986183166504, "logps/chosen": -2.5337352752685547, "logps/rejected": -2.4349582195281982, "loss": 4.4406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -25.337352752685547, "rewards/margins": -0.9877703785896301, "rewards/rejected": -24.34958267211914, "step": 29600 }, { "epoch": 0.9976743402204321, "eval_logits/chosen": -2.3131020069122314, "eval_logits/rejected": -2.491314649581909, "eval_logps/chosen": -2.289489507675171, "eval_logps/rejected": -2.4444422721862793, "eval_loss": 3.0876708030700684, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": -22.8948917388916, "eval_rewards/margins": 1.5495290756225586, "eval_rewards/rejected": -24.44442367553711, "eval_runtime": 12.8936, "eval_samples_per_second": 7.756, "eval_steps_per_second": 1.939, "step": 29600 }, { "epoch": 0.9978428662914153, "grad_norm": 327.03582763671875, "learning_rate": 1.4174575675818256e-11, "logits/chosen": -1.5383819341659546, "logits/rejected": -1.8430954217910767, "logps/chosen": -2.5591301918029785, "logps/rejected": -2.5529518127441406, "loss": 3.5187, "rewards/accuracies": 0.5, "rewards/chosen": -25.591299057006836, "rewards/margins": -0.061784934252500534, "rewards/rejected": -25.529516220092773, "step": 29605 }, { "epoch": 0.9980113923623984, "grad_norm": 165.06675720214844, "learning_rate": 1.2046321516523405e-11, "logits/chosen": -2.3325538635253906, "logits/rejected": -2.551406145095825, "logps/chosen": -3.6366772651672363, "logps/rejected": -4.427830696105957, "loss": 2.3673, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -36.36676788330078, "rewards/margins": 7.911537170410156, "rewards/rejected": -44.2783088684082, "step": 29610 }, { "epoch": 0.9981799184333816, "grad_norm": 43.59071350097656, "learning_rate": 1.0091093489317249e-11, "logits/chosen": -2.1163182258605957, "logits/rejected": -2.2026889324188232, "logps/chosen": -2.3985800743103027, "logps/rejected": -2.481600522994995, "loss": 2.8614, "rewards/accuracies": 0.5, "rewards/chosen": -23.985801696777344, "rewards/margins": 0.830204963684082, "rewards/rejected": -24.81600570678711, "step": 29615 }, { "epoch": 0.9983484445043648, "grad_norm": 26.20160484313965, "learning_rate": 8.308892270714184e-12, "logits/chosen": -1.830585241317749, "logits/rejected": -1.8308626413345337, "logps/chosen": -2.605586290359497, "logps/rejected": -3.0579631328582764, "loss": 2.5246, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -26.055866241455078, "rewards/margins": 4.52376651763916, "rewards/rejected": -30.579631805419922, "step": 29620 }, { "epoch": 0.998516970575348, "grad_norm": 40.348228454589844, "learning_rate": 6.6997184775541285e-12, "logits/chosen": -1.8791911602020264, "logits/rejected": -2.223784923553467, "logps/chosen": -2.243389368057251, "logps/rejected": -2.3758978843688965, "loss": 2.6277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -22.43389320373535, "rewards/margins": 1.3250858783721924, "rewards/rejected": -23.75897979736328, "step": 29625 }, { "epoch": 0.9986854966463312, "grad_norm": 37.30038070678711, "learning_rate": 5.263572666613925e-12, "logits/chosen": -1.5783147811889648, "logits/rejected": -1.5388597249984741, "logps/chosen": -1.9799810647964478, "logps/rejected": -2.0920217037200928, "loss": 2.8596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -19.799808502197266, "rewards/margins": 1.1204078197479248, "rewards/rejected": -20.920215606689453, "step": 29630 }, { "epoch": 0.9988540227173144, "grad_norm": 56.66443634033203, "learning_rate": 4.0004553349959335e-12, "logits/chosen": -2.0255823135375977, "logits/rejected": -2.4068846702575684, "logps/chosen": -2.280776262283325, "logps/rejected": -3.0424695014953613, "loss": 1.2858, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -22.807762145996094, "rewards/margins": 7.616931915283203, "rewards/rejected": -30.424694061279297, "step": 29635 }, { "epoch": 0.9990225487882975, "grad_norm": 34.044044494628906, "learning_rate": 2.910366919739449e-12, "logits/chosen": -2.4027655124664307, "logits/rejected": -2.703449010848999, "logps/chosen": -2.6529898643493652, "logps/rejected": -3.1543755531311035, "loss": 3.742, "rewards/accuracies": 0.5, "rewards/chosen": -26.529897689819336, "rewards/margins": 5.013855457305908, "rewards/rejected": -31.54375648498535, "step": 29640 }, { "epoch": 0.9991910748592807, "grad_norm": 37.6712646484375, "learning_rate": 1.9933077980982537e-12, "logits/chosen": -1.9718729257583618, "logits/rejected": -2.110247850418091, "logps/chosen": -2.6708006858825684, "logps/rejected": -2.7890660762786865, "loss": 2.8295, "rewards/accuracies": 0.5, "rewards/chosen": -26.7080078125, "rewards/margins": 1.1826552152633667, "rewards/rejected": -27.890661239624023, "step": 29645 }, { "epoch": 0.9993596009302639, "grad_norm": 75.10658264160156, "learning_rate": 1.2492782874851115e-12, "logits/chosen": -2.043384552001953, "logits/rejected": -2.308835744857788, "logps/chosen": -2.0701959133148193, "logps/rejected": -2.082885265350342, "loss": 3.1115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.70195960998535, "rewards/margins": 0.12689141929149628, "rewards/rejected": -20.8288516998291, "step": 29650 }, { "epoch": 0.999528127001247, "grad_norm": 14.825223922729492, "learning_rate": 6.782786453052303e-13, "logits/chosen": -1.656032919883728, "logits/rejected": -2.6159210205078125, "logps/chosen": -2.042296886444092, "logps/rejected": -2.4043121337890625, "loss": 1.6344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -20.422969818115234, "rewards/margins": 3.6201515197753906, "rewards/rejected": -24.043121337890625, "step": 29655 }, { "epoch": 0.9996966530722303, "grad_norm": 24.87315559387207, "learning_rate": 2.803090691783083e-13, "logits/chosen": -1.6609647274017334, "logits/rejected": -2.110682487487793, "logps/chosen": -2.726547956466675, "logps/rejected": -3.436779737472534, "loss": 1.4057, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.26548194885254, "rewards/margins": 7.102316856384277, "rewards/rejected": -34.3677978515625, "step": 29660 }, { "epoch": 0.9998651791432135, "grad_norm": 33.12513732910156, "learning_rate": 5.5369696827511915e-14, "logits/chosen": -2.2672488689422607, "logits/rejected": -2.2087759971618652, "logps/chosen": -2.245776414871216, "logps/rejected": -2.293640613555908, "loss": 3.5487, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -22.457765579223633, "rewards/margins": 0.47864046692848206, "rewards/rejected": -22.936405181884766, "step": 29665 } ], "logging_steps": 5, "max_steps": 29669, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }