diff --git "a/checkpoint-29669/trainer_state.json" "b/checkpoint-29669/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-29669/trainer_state.json" @@ -0,0 +1,90212 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 400, + "global_step": 29669, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001685260709831811, + "grad_norm": 20.571081161499023, + "learning_rate": 1.6852039096730705e-09, + "logits/chosen": -0.5615859031677246, + "logits/rejected": -0.5738595724105835, + "logps/chosen": -1.6699402332305908, + "logps/rejected": -1.7010023593902588, + "loss": 2.882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.699399948120117, + "rewards/margins": 0.3106228709220886, + "rewards/rejected": -17.01002311706543, + "step": 5 + }, + { + "epoch": 0.0003370521419663622, + "grad_norm": 31.532499313354492, + "learning_rate": 3.370407819346141e-09, + "logits/chosen": -0.5780839323997498, + "logits/rejected": -0.44632649421691895, + "logps/chosen": -1.7983486652374268, + "logps/rejected": -1.7575130462646484, + "loss": 4.2936, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.98348617553711, + "rewards/margins": -0.4083539843559265, + "rewards/rejected": -17.575130462646484, + "step": 10 + }, + { + "epoch": 0.0005055782129495433, + "grad_norm": 30.432205200195312, + "learning_rate": 5.055611729019211e-09, + "logits/chosen": -0.3094201982021332, + "logits/rejected": -0.33659106492996216, + "logps/chosen": -1.6785993576049805, + "logps/rejected": -1.8086090087890625, + "loss": 2.1801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.785995483398438, + "rewards/margins": 1.3000948429107666, + "rewards/rejected": -18.086088180541992, + "step": 15 + }, + { + "epoch": 0.0006741042839327244, + "grad_norm": 27.722911834716797, + "learning_rate": 6.740815638692282e-09, + "logits/chosen": -0.7963976860046387, + "logits/rejected": -0.6953271627426147, + "logps/chosen": -1.6158479452133179, + "logps/rejected": -1.637351632118225, + "loss": 2.9568, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.15848159790039, + "rewards/margins": 0.21503643691539764, + "rewards/rejected": -16.373516082763672, + "step": 20 + }, + { + "epoch": 0.0008426303549159055, + "grad_norm": 27.396446228027344, + "learning_rate": 8.426019548365353e-09, + "logits/chosen": -0.4122963547706604, + "logits/rejected": -0.23325464129447937, + "logps/chosen": -1.8151277303695679, + "logps/rejected": -1.9228709936141968, + "loss": 2.3935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.151275634765625, + "rewards/margins": 1.077433466911316, + "rewards/rejected": -19.228710174560547, + "step": 25 + }, + { + "epoch": 0.0010111564258990866, + "grad_norm": 20.264379501342773, + "learning_rate": 1.0111223458038422e-08, + "logits/chosen": -0.4613228738307953, + "logits/rejected": -0.47090595960617065, + "logps/chosen": -1.5245769023895264, + "logps/rejected": -1.5697605609893799, + "loss": 2.7916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.245768547058105, + "rewards/margins": 0.45183706283569336, + "rewards/rejected": -15.697604179382324, + "step": 30 + }, + { + "epoch": 0.0011796824968822678, + "grad_norm": 23.512556076049805, + "learning_rate": 1.1796427367711492e-08, + "logits/chosen": -0.30312925577163696, + "logits/rejected": -0.5215615630149841, + "logps/chosen": -2.0133779048919678, + "logps/rejected": -2.0369296073913574, + "loss": 3.9116, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.133779525756836, + "rewards/margins": 0.23551683127880096, + "rewards/rejected": -20.36929702758789, + "step": 35 + }, + { + "epoch": 0.0013482085678654487, + "grad_norm": 17.596803665161133, + "learning_rate": 1.3481631277384564e-08, + "logits/chosen": -0.5437296628952026, + "logits/rejected": -0.5092577934265137, + "logps/chosen": -1.5528526306152344, + "logps/rejected": -1.5792545080184937, + "loss": 2.8937, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.528528213500977, + "rewards/margins": 0.264018714427948, + "rewards/rejected": -15.7925443649292, + "step": 40 + }, + { + "epoch": 0.0015167346388486299, + "grad_norm": 18.151222229003906, + "learning_rate": 1.5166835187057634e-08, + "logits/chosen": -0.4424339234828949, + "logits/rejected": -0.5105506181716919, + "logps/chosen": -1.7674528360366821, + "logps/rejected": -1.7977365255355835, + "loss": 2.8757, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.674528121948242, + "rewards/margins": 0.30283719301223755, + "rewards/rejected": -17.977365493774414, + "step": 45 + }, + { + "epoch": 0.001685260709831811, + "grad_norm": 17.36290168762207, + "learning_rate": 1.6852039096730706e-08, + "logits/chosen": -0.6893698573112488, + "logits/rejected": -0.600393533706665, + "logps/chosen": -1.8375508785247803, + "logps/rejected": -1.839758276939392, + "loss": 3.4694, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.37550926208496, + "rewards/margins": 0.02207345888018608, + "rewards/rejected": -18.3975830078125, + "step": 50 + }, + { + "epoch": 0.0018537867808149922, + "grad_norm": 35.55210876464844, + "learning_rate": 1.8537243006403775e-08, + "logits/chosen": -0.6072413921356201, + "logits/rejected": -0.5612670183181763, + "logps/chosen": -1.688486099243164, + "logps/rejected": -1.655321717262268, + "loss": 3.4563, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.88486099243164, + "rewards/margins": -0.33164510130882263, + "rewards/rejected": -16.5532169342041, + "step": 55 + }, + { + "epoch": 0.0020223128517981733, + "grad_norm": 9.634931564331055, + "learning_rate": 2.0222446916076843e-08, + "logits/chosen": 0.009985041804611683, + "logits/rejected": 0.04499584436416626, + "logps/chosen": -1.4743859767913818, + "logps/rejected": -1.5558125972747803, + "loss": 2.491, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.743858337402344, + "rewards/margins": 0.8142659068107605, + "rewards/rejected": -15.558123588562012, + "step": 60 + }, + { + "epoch": 0.0021908389227813544, + "grad_norm": 19.739643096923828, + "learning_rate": 2.1907650825749915e-08, + "logits/chosen": -0.5582388639450073, + "logits/rejected": -0.7695599794387817, + "logps/chosen": -1.4039520025253296, + "logps/rejected": -1.3351157903671265, + "loss": 3.7285, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.039520263671875, + "rewards/margins": -0.688363254070282, + "rewards/rejected": -13.351158142089844, + "step": 65 + }, + { + "epoch": 0.0023593649937645356, + "grad_norm": 16.17816734313965, + "learning_rate": 2.3592854735422984e-08, + "logits/chosen": -0.7809473872184753, + "logits/rejected": -0.7580649852752686, + "logps/chosen": -1.7219566106796265, + "logps/rejected": -1.7046865224838257, + "loss": 3.4937, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.21956443786621, + "rewards/margins": -0.17269916832447052, + "rewards/rejected": -17.046865463256836, + "step": 70 + }, + { + "epoch": 0.0025278910647477163, + "grad_norm": 22.2804012298584, + "learning_rate": 2.5278058645096056e-08, + "logits/chosen": -0.7723641395568848, + "logits/rejected": -0.8045178651809692, + "logps/chosen": -1.4883219003677368, + "logps/rejected": -1.4072678089141846, + "loss": 3.8948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.883219718933105, + "rewards/margins": -0.8105422854423523, + "rewards/rejected": -14.072675704956055, + "step": 75 + }, + { + "epoch": 0.0026964171357308974, + "grad_norm": 25.86888313293457, + "learning_rate": 2.6963262554769128e-08, + "logits/chosen": -0.08162397891283035, + "logits/rejected": -0.007745756767690182, + "logps/chosen": -1.797745943069458, + "logps/rejected": -1.9569447040557861, + "loss": 3.0899, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.977460861206055, + "rewards/margins": 1.5919866561889648, + "rewards/rejected": -19.569446563720703, + "step": 80 + }, + { + "epoch": 0.0028649432067140786, + "grad_norm": 23.099294662475586, + "learning_rate": 2.8648466464442196e-08, + "logits/chosen": -0.7632491588592529, + "logits/rejected": -0.7715046405792236, + "logps/chosen": -1.686475396156311, + "logps/rejected": -1.7604153156280518, + "loss": 2.6555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.864755630493164, + "rewards/margins": 0.7393980026245117, + "rewards/rejected": -17.60415267944336, + "step": 85 + }, + { + "epoch": 0.0030334692776972597, + "grad_norm": 21.547155380249023, + "learning_rate": 3.033367037411527e-08, + "logits/chosen": -0.5401488542556763, + "logits/rejected": -0.46405941247940063, + "logps/chosen": -1.8458507061004639, + "logps/rejected": -1.7931216955184937, + "loss": 3.5862, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.458507537841797, + "rewards/margins": -0.5272892117500305, + "rewards/rejected": -17.931217193603516, + "step": 90 + }, + { + "epoch": 0.003201995348680441, + "grad_norm": 30.385618209838867, + "learning_rate": 3.2018874283788334e-08, + "logits/chosen": -0.24752536416053772, + "logits/rejected": -0.5087365508079529, + "logps/chosen": -1.8774124383926392, + "logps/rejected": -1.7933375835418701, + "loss": 3.9459, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.774124145507812, + "rewards/margins": -0.8407486081123352, + "rewards/rejected": -17.93337631225586, + "step": 95 + }, + { + "epoch": 0.003370521419663622, + "grad_norm": 34.13222885131836, + "learning_rate": 3.370407819346141e-08, + "logits/chosen": -0.3253437876701355, + "logits/rejected": -0.32326677441596985, + "logps/chosen": -1.559247612953186, + "logps/rejected": -1.4792327880859375, + "loss": 3.951, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.592477798461914, + "rewards/margins": -0.8001473546028137, + "rewards/rejected": -14.792327880859375, + "step": 100 + }, + { + "epoch": 0.003539047490646803, + "grad_norm": 28.58753776550293, + "learning_rate": 3.538928210313448e-08, + "logits/chosen": -0.3705151379108429, + "logits/rejected": -0.5448762774467468, + "logps/chosen": -1.6743396520614624, + "logps/rejected": -1.6100950241088867, + "loss": 3.7262, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.743396759033203, + "rewards/margins": -0.6424452662467957, + "rewards/rejected": -16.100950241088867, + "step": 105 + }, + { + "epoch": 0.0037075735616299843, + "grad_norm": 30.319379806518555, + "learning_rate": 3.707448601280755e-08, + "logits/chosen": -0.5734914541244507, + "logits/rejected": -0.795892059803009, + "logps/chosen": -1.7318475246429443, + "logps/rejected": -1.8011115789413452, + "loss": 2.4595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.3184757232666, + "rewards/margins": 0.692640483379364, + "rewards/rejected": -18.0111141204834, + "step": 110 + }, + { + "epoch": 0.0038760996326131654, + "grad_norm": 24.694068908691406, + "learning_rate": 3.8759689922480615e-08, + "logits/chosen": -0.28614291548728943, + "logits/rejected": -0.40578216314315796, + "logps/chosen": -1.639915108680725, + "logps/rejected": -1.6953051090240479, + "loss": 2.6474, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.399150848388672, + "rewards/margins": 0.5539007186889648, + "rewards/rejected": -16.953052520751953, + "step": 115 + }, + { + "epoch": 0.004044625703596347, + "grad_norm": 27.61186981201172, + "learning_rate": 4.044489383215369e-08, + "logits/chosen": -0.3833938241004944, + "logits/rejected": -0.3862255811691284, + "logps/chosen": -1.5427688360214233, + "logps/rejected": -1.6645368337631226, + "loss": 2.6032, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.427688598632812, + "rewards/margins": 1.2176802158355713, + "rewards/rejected": -16.645368576049805, + "step": 120 + }, + { + "epoch": 0.004213151774579527, + "grad_norm": 21.950868606567383, + "learning_rate": 4.213009774182676e-08, + "logits/chosen": -0.3655502200126648, + "logits/rejected": -0.46016925573349, + "logps/chosen": -1.8143894672393799, + "logps/rejected": -1.824389100074768, + "loss": 2.9648, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.143896102905273, + "rewards/margins": 0.09999465942382812, + "rewards/rejected": -18.2438907623291, + "step": 125 + }, + { + "epoch": 0.004381677845562709, + "grad_norm": 28.688859939575195, + "learning_rate": 4.381530165149983e-08, + "logits/chosen": -0.5313527584075928, + "logits/rejected": -0.577104926109314, + "logps/chosen": -1.7656917572021484, + "logps/rejected": -1.750722885131836, + "loss": 3.4855, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.656917572021484, + "rewards/margins": -0.1496877670288086, + "rewards/rejected": -17.50722885131836, + "step": 130 + }, + { + "epoch": 0.00455020391654589, + "grad_norm": 16.734312057495117, + "learning_rate": 4.55005055611729e-08, + "logits/chosen": -0.2671900689601898, + "logits/rejected": -0.3611742854118347, + "logps/chosen": -1.9485645294189453, + "logps/rejected": -1.9769279956817627, + "loss": 3.0009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.485645294189453, + "rewards/margins": 0.2836341857910156, + "rewards/rejected": -19.76927947998047, + "step": 135 + }, + { + "epoch": 0.004718729987529071, + "grad_norm": 27.427196502685547, + "learning_rate": 4.718570947084597e-08, + "logits/chosen": -0.7098779082298279, + "logits/rejected": -0.48461517691612244, + "logps/chosen": -1.6501989364624023, + "logps/rejected": -1.714129090309143, + "loss": 2.602, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.501989364624023, + "rewards/margins": 0.6392990946769714, + "rewards/rejected": -17.14128875732422, + "step": 140 + }, + { + "epoch": 0.004887256058512252, + "grad_norm": 18.88107681274414, + "learning_rate": 4.887091338051904e-08, + "logits/chosen": -0.4832366406917572, + "logits/rejected": -0.47723278403282166, + "logps/chosen": -1.741558313369751, + "logps/rejected": -1.8169167041778564, + "loss": 3.0037, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.41558265686035, + "rewards/margins": 0.753582775592804, + "rewards/rejected": -18.169164657592773, + "step": 145 + }, + { + "epoch": 0.005055782129495433, + "grad_norm": 29.208202362060547, + "learning_rate": 5.055611729019211e-08, + "logits/chosen": -0.66395503282547, + "logits/rejected": -0.6029442548751831, + "logps/chosen": -1.5494129657745361, + "logps/rejected": -1.5750149488449097, + "loss": 2.9043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.494128227233887, + "rewards/margins": 0.25602206587791443, + "rewards/rejected": -15.750149726867676, + "step": 150 + }, + { + "epoch": 0.005224308200478614, + "grad_norm": 20.492918014526367, + "learning_rate": 5.224132119986518e-08, + "logits/chosen": -0.8075466156005859, + "logits/rejected": -0.8982459306716919, + "logps/chosen": -1.5187879800796509, + "logps/rejected": -1.539340853691101, + "loss": 2.9627, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.18787956237793, + "rewards/margins": 0.20552997291088104, + "rewards/rejected": -15.393407821655273, + "step": 155 + }, + { + "epoch": 0.005392834271461795, + "grad_norm": 14.183518409729004, + "learning_rate": 5.3926525109538256e-08, + "logits/chosen": -0.32353320717811584, + "logits/rejected": -0.3807728588581085, + "logps/chosen": -1.760858178138733, + "logps/rejected": -1.8059089183807373, + "loss": 2.8887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.60858154296875, + "rewards/margins": 0.4505080282688141, + "rewards/rejected": -18.059091567993164, + "step": 160 + }, + { + "epoch": 0.0055613603424449765, + "grad_norm": 18.467872619628906, + "learning_rate": 5.561172901921132e-08, + "logits/chosen": -0.07653169333934784, + "logits/rejected": -0.14054766297340393, + "logps/chosen": -1.6342408657073975, + "logps/rejected": -1.7179641723632812, + "loss": 2.5857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.342411041259766, + "rewards/margins": 0.8372310400009155, + "rewards/rejected": -17.179641723632812, + "step": 165 + }, + { + "epoch": 0.005729886413428157, + "grad_norm": 23.680160522460938, + "learning_rate": 5.729693292888439e-08, + "logits/chosen": -0.3570849597454071, + "logits/rejected": -0.4266514778137207, + "logps/chosen": -1.7811241149902344, + "logps/rejected": -1.894479513168335, + "loss": 2.3806, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.811241149902344, + "rewards/margins": 1.1335537433624268, + "rewards/rejected": -18.944795608520508, + "step": 170 + }, + { + "epoch": 0.005898412484411339, + "grad_norm": 14.393828392028809, + "learning_rate": 5.898213683855746e-08, + "logits/chosen": -0.5269268751144409, + "logits/rejected": -0.5787986516952515, + "logps/chosen": -1.867837905883789, + "logps/rejected": -1.9020452499389648, + "loss": 3.0536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.67837905883789, + "rewards/margins": 0.34207430481910706, + "rewards/rejected": -19.020450592041016, + "step": 175 + }, + { + "epoch": 0.0060669385553945195, + "grad_norm": 28.581340789794922, + "learning_rate": 6.066734074823054e-08, + "logits/chosen": -0.4054934084415436, + "logits/rejected": -0.3458530902862549, + "logps/chosen": -1.9068748950958252, + "logps/rejected": -2.2061736583709717, + "loss": 2.2303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.06874656677246, + "rewards/margins": 2.9929919242858887, + "rewards/rejected": -22.06174087524414, + "step": 180 + }, + { + "epoch": 0.006235464626377701, + "grad_norm": 26.398284912109375, + "learning_rate": 6.235254465790361e-08, + "logits/chosen": -0.7721191048622131, + "logits/rejected": -0.6895912885665894, + "logps/chosen": -1.4460922479629517, + "logps/rejected": -1.4541490077972412, + "loss": 3.0706, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.46092414855957, + "rewards/margins": 0.08056803047657013, + "rewards/rejected": -14.541491508483887, + "step": 185 + }, + { + "epoch": 0.006403990697360882, + "grad_norm": 14.64199161529541, + "learning_rate": 6.403774856757667e-08, + "logits/chosen": -0.3238942325115204, + "logits/rejected": -0.24976284801959991, + "logps/chosen": -1.8821359872817993, + "logps/rejected": -2.101795196533203, + "loss": 2.8468, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.821361541748047, + "rewards/margins": 2.1965911388397217, + "rewards/rejected": -21.01795196533203, + "step": 190 + }, + { + "epoch": 0.0065725167683440625, + "grad_norm": 27.205289840698242, + "learning_rate": 6.572295247724974e-08, + "logits/chosen": -0.41295117139816284, + "logits/rejected": -0.22403642535209656, + "logps/chosen": -1.789427399635315, + "logps/rejected": -1.9406659603118896, + "loss": 2.2107, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.89427375793457, + "rewards/margins": 1.5123885869979858, + "rewards/rejected": -19.406661987304688, + "step": 195 + }, + { + "epoch": 0.006741042839327244, + "grad_norm": 5.611418724060059, + "learning_rate": 6.740815638692282e-08, + "logits/chosen": -0.16343602538108826, + "logits/rejected": -0.2591503858566284, + "logps/chosen": -2.1357204914093018, + "logps/rejected": -2.256701707839966, + "loss": 2.5936, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.35720443725586, + "rewards/margins": 1.2098113298416138, + "rewards/rejected": -22.5670166015625, + "step": 200 + }, + { + "epoch": 0.006909568910310425, + "grad_norm": 19.420745849609375, + "learning_rate": 6.90933602965959e-08, + "logits/chosen": -0.7015420198440552, + "logits/rejected": -0.8463886380195618, + "logps/chosen": -1.6153056621551514, + "logps/rejected": -1.5792958736419678, + "loss": 3.856, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.153057098388672, + "rewards/margins": -0.360098659992218, + "rewards/rejected": -15.792959213256836, + "step": 205 + }, + { + "epoch": 0.007078094981293606, + "grad_norm": 23.551700592041016, + "learning_rate": 7.077856420626896e-08, + "logits/chosen": -0.5431650876998901, + "logits/rejected": -0.5651072263717651, + "logps/chosen": -1.4538341760635376, + "logps/rejected": -1.4151197671890259, + "loss": 3.446, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.538342475891113, + "rewards/margins": -0.38714489340782166, + "rewards/rejected": -14.15119743347168, + "step": 210 + }, + { + "epoch": 0.007246621052276787, + "grad_norm": 33.41309356689453, + "learning_rate": 7.246376811594203e-08, + "logits/chosen": -0.85772705078125, + "logits/rejected": -0.8544243574142456, + "logps/chosen": -1.7897136211395264, + "logps/rejected": -1.7153469324111938, + "loss": 3.7949, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.897136688232422, + "rewards/margins": -0.7436662912368774, + "rewards/rejected": -17.15346908569336, + "step": 215 + }, + { + "epoch": 0.007415147123259969, + "grad_norm": 28.571849822998047, + "learning_rate": 7.41489720256151e-08, + "logits/chosen": -0.4796590805053711, + "logits/rejected": -0.5393766164779663, + "logps/chosen": -1.9003206491470337, + "logps/rejected": -2.005420446395874, + "loss": 2.9494, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.003206253051758, + "rewards/margins": 1.0509979724884033, + "rewards/rejected": -20.0542049407959, + "step": 220 + }, + { + "epoch": 0.007583673194243149, + "grad_norm": 20.83780288696289, + "learning_rate": 7.583417593528817e-08, + "logits/chosen": -0.5321765542030334, + "logits/rejected": -0.4122452735900879, + "logps/chosen": -1.856885552406311, + "logps/rejected": -1.857283592224121, + "loss": 3.1423, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.56885528564453, + "rewards/margins": 0.003980970475822687, + "rewards/rejected": -18.57283592224121, + "step": 225 + }, + { + "epoch": 0.007752199265226331, + "grad_norm": 22.060514450073242, + "learning_rate": 7.751937984496123e-08, + "logits/chosen": -0.4617394506931305, + "logits/rejected": -0.3590688109397888, + "logps/chosen": -1.579564094543457, + "logps/rejected": -1.6632616519927979, + "loss": 3.1801, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.79564094543457, + "rewards/margins": 0.8369754552841187, + "rewards/rejected": -16.63261604309082, + "step": 230 + }, + { + "epoch": 0.007920725336209512, + "grad_norm": 22.34297752380371, + "learning_rate": 7.92045837546343e-08, + "logits/chosen": -0.5775288343429565, + "logits/rejected": -0.5112261176109314, + "logps/chosen": -1.5766397714614868, + "logps/rejected": -1.7778068780899048, + "loss": 1.8744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.766397476196289, + "rewards/margins": 2.011671304702759, + "rewards/rejected": -17.7780704498291, + "step": 235 + }, + { + "epoch": 0.008089251407192693, + "grad_norm": 26.297338485717773, + "learning_rate": 8.088978766430737e-08, + "logits/chosen": -0.6276997923851013, + "logits/rejected": -0.5408580899238586, + "logps/chosen": -1.744314432144165, + "logps/rejected": -1.8765428066253662, + "loss": 2.1406, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.443145751953125, + "rewards/margins": 1.322283148765564, + "rewards/rejected": -18.765426635742188, + "step": 240 + }, + { + "epoch": 0.008257777478175873, + "grad_norm": 44.11431884765625, + "learning_rate": 8.257499157398045e-08, + "logits/chosen": -0.4730430543422699, + "logits/rejected": -0.4535676836967468, + "logps/chosen": -1.7764599323272705, + "logps/rejected": -1.780321717262268, + "loss": 3.0543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.764596939086914, + "rewards/margins": 0.038619231432676315, + "rewards/rejected": -17.8032169342041, + "step": 245 + }, + { + "epoch": 0.008426303549159055, + "grad_norm": 22.68023109436035, + "learning_rate": 8.426019548365352e-08, + "logits/chosen": -0.4147162437438965, + "logits/rejected": -0.35299405455589294, + "logps/chosen": -1.7950069904327393, + "logps/rejected": -1.8354160785675049, + "loss": 2.7634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.950069427490234, + "rewards/margins": 0.40409010648727417, + "rewards/rejected": -18.35416030883789, + "step": 250 + }, + { + "epoch": 0.008594829620142236, + "grad_norm": 23.5480899810791, + "learning_rate": 8.594539939332659e-08, + "logits/chosen": -0.6697706580162048, + "logits/rejected": -0.6132184863090515, + "logps/chosen": -1.8134784698486328, + "logps/rejected": -1.8508756160736084, + "loss": 2.7429, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.134784698486328, + "rewards/margins": 0.373971551656723, + "rewards/rejected": -18.50875473022461, + "step": 255 + }, + { + "epoch": 0.008763355691125418, + "grad_norm": 19.03260040283203, + "learning_rate": 8.763060330299966e-08, + "logits/chosen": -0.20593388378620148, + "logits/rejected": -0.37953242659568787, + "logps/chosen": -2.1354403495788574, + "logps/rejected": -2.0018649101257324, + "loss": 4.4218, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.354402542114258, + "rewards/margins": -1.3357551097869873, + "rewards/rejected": -20.01865005493164, + "step": 260 + }, + { + "epoch": 0.008931881762108598, + "grad_norm": 22.810266494750977, + "learning_rate": 8.931580721267273e-08, + "logits/chosen": -0.5022013783454895, + "logits/rejected": -0.5145460367202759, + "logps/chosen": -1.636365532875061, + "logps/rejected": -1.6282808780670166, + "loss": 3.1787, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.36365509033203, + "rewards/margins": -0.08084668964147568, + "rewards/rejected": -16.282808303833008, + "step": 265 + }, + { + "epoch": 0.00910040783309178, + "grad_norm": 21.40003776550293, + "learning_rate": 9.10010111223458e-08, + "logits/chosen": -0.42804187536239624, + "logits/rejected": -0.3837874233722687, + "logps/chosen": -1.6583467721939087, + "logps/rejected": -1.7007923126220703, + "loss": 2.7244, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.58346939086914, + "rewards/margins": 0.42445507645606995, + "rewards/rejected": -17.007923126220703, + "step": 270 + }, + { + "epoch": 0.00926893390407496, + "grad_norm": 0.5620936155319214, + "learning_rate": 9.268621503201888e-08, + "logits/chosen": -0.5316357016563416, + "logits/rejected": -0.7304517030715942, + "logps/chosen": -1.8833847045898438, + "logps/rejected": -2.030012607574463, + "loss": 3.6374, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.833847045898438, + "rewards/margins": 1.4662790298461914, + "rewards/rejected": -20.300125122070312, + "step": 275 + }, + { + "epoch": 0.009437459975058142, + "grad_norm": 20.305036544799805, + "learning_rate": 9.437141894169194e-08, + "logits/chosen": -0.5274496674537659, + "logits/rejected": -0.5468205809593201, + "logps/chosen": -1.3872991800308228, + "logps/rejected": -1.3830955028533936, + "loss": 3.1062, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.872991561889648, + "rewards/margins": -0.042035769671201706, + "rewards/rejected": -13.830957412719727, + "step": 280 + }, + { + "epoch": 0.009605986046041322, + "grad_norm": 34.91652297973633, + "learning_rate": 9.605662285136501e-08, + "logits/chosen": -0.539037823677063, + "logits/rejected": -0.3293539583683014, + "logps/chosen": -1.6226667165756226, + "logps/rejected": -1.6433026790618896, + "loss": 2.9828, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.226667404174805, + "rewards/margins": 0.2063591033220291, + "rewards/rejected": -16.433025360107422, + "step": 285 + }, + { + "epoch": 0.009774512117024504, + "grad_norm": 26.9847469329834, + "learning_rate": 9.774182676103808e-08, + "logits/chosen": -0.36581525206565857, + "logits/rejected": -0.33445021510124207, + "logps/chosen": -1.572983980178833, + "logps/rejected": -1.717125654220581, + "loss": 2.8238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.729840278625488, + "rewards/margins": 1.44141685962677, + "rewards/rejected": -17.1712589263916, + "step": 290 + }, + { + "epoch": 0.009943038188007685, + "grad_norm": 18.268667221069336, + "learning_rate": 9.942703067071115e-08, + "logits/chosen": -0.6619695425033569, + "logits/rejected": -0.5615943670272827, + "logps/chosen": -1.6950050592422485, + "logps/rejected": -1.7060911655426025, + "loss": 3.0675, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.950048446655273, + "rewards/margins": 0.11086149513721466, + "rewards/rejected": -17.060911178588867, + "step": 295 + }, + { + "epoch": 0.010111564258990865, + "grad_norm": 24.732633590698242, + "learning_rate": 1.0111223458038422e-07, + "logits/chosen": -0.42776185274124146, + "logits/rejected": -0.5683671236038208, + "logps/chosen": -1.41172194480896, + "logps/rejected": -1.4656214714050293, + "loss": 2.6678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.117219924926758, + "rewards/margins": 0.538996696472168, + "rewards/rejected": -14.656216621398926, + "step": 300 + }, + { + "epoch": 0.010280090329974047, + "grad_norm": 15.735332489013672, + "learning_rate": 1.0279743849005728e-07, + "logits/chosen": -1.0114845037460327, + "logits/rejected": -1.0291179418563843, + "logps/chosen": -1.759996771812439, + "logps/rejected": -1.738663673400879, + "loss": 3.2891, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.599966049194336, + "rewards/margins": -0.21333065629005432, + "rewards/rejected": -17.38663673400879, + "step": 305 + }, + { + "epoch": 0.010448616400957228, + "grad_norm": 31.485431671142578, + "learning_rate": 1.0448264239973035e-07, + "logits/chosen": -0.5081278085708618, + "logits/rejected": -0.5943376421928406, + "logps/chosen": -1.7860934734344482, + "logps/rejected": -1.8374830484390259, + "loss": 2.7103, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.86093521118164, + "rewards/margins": 0.5138964653015137, + "rewards/rejected": -18.374832153320312, + "step": 310 + }, + { + "epoch": 0.01061714247194041, + "grad_norm": 24.96269416809082, + "learning_rate": 1.0616784630940344e-07, + "logits/chosen": -0.4897727370262146, + "logits/rejected": -0.6077844500541687, + "logps/chosen": -1.6169993877410889, + "logps/rejected": -1.6388232707977295, + "loss": 3.4165, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.16999626159668, + "rewards/margins": 0.21823683381080627, + "rewards/rejected": -16.388233184814453, + "step": 315 + }, + { + "epoch": 0.01078566854292359, + "grad_norm": 20.947097778320312, + "learning_rate": 1.0785305021907651e-07, + "logits/chosen": -0.6524089574813843, + "logits/rejected": -0.5542722940444946, + "logps/chosen": -1.5016238689422607, + "logps/rejected": -1.4815490245819092, + "loss": 3.2804, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.01623821258545, + "rewards/margins": -0.20074859261512756, + "rewards/rejected": -14.81549072265625, + "step": 320 + }, + { + "epoch": 0.010954194613906771, + "grad_norm": 57.642093658447266, + "learning_rate": 1.0953825412874958e-07, + "logits/chosen": -0.14283767342567444, + "logits/rejected": -0.25287362933158875, + "logps/chosen": -2.091562509536743, + "logps/rejected": -2.088674545288086, + "loss": 3.2121, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.915624618530273, + "rewards/margins": -0.028880024328827858, + "rewards/rejected": -20.88674545288086, + "step": 325 + }, + { + "epoch": 0.011122720684889953, + "grad_norm": 26.221973419189453, + "learning_rate": 1.1122345803842264e-07, + "logits/chosen": -0.8916441798210144, + "logits/rejected": -0.8147870302200317, + "logps/chosen": -1.7097644805908203, + "logps/rejected": -1.969477891921997, + "loss": 2.2724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.097644805908203, + "rewards/margins": 2.597132444381714, + "rewards/rejected": -19.69477653503418, + "step": 330 + }, + { + "epoch": 0.011291246755873133, + "grad_norm": 22.461708068847656, + "learning_rate": 1.1290866194809571e-07, + "logits/chosen": -0.7663329243659973, + "logits/rejected": -0.6704310178756714, + "logps/chosen": -1.5523570775985718, + "logps/rejected": -1.4849021434783936, + "loss": 3.7766, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.52357006072998, + "rewards/margins": -0.674548327922821, + "rewards/rejected": -14.849021911621094, + "step": 335 + }, + { + "epoch": 0.011459772826856314, + "grad_norm": 32.96992874145508, + "learning_rate": 1.1459386585776879e-07, + "logits/chosen": -0.3288322389125824, + "logits/rejected": -0.2193385809659958, + "logps/chosen": -2.2772393226623535, + "logps/rejected": -2.1371006965637207, + "loss": 4.5364, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.77239418029785, + "rewards/margins": -1.4013869762420654, + "rewards/rejected": -21.37100601196289, + "step": 340 + }, + { + "epoch": 0.011628298897839496, + "grad_norm": 27.60466194152832, + "learning_rate": 1.1627906976744186e-07, + "logits/chosen": -0.18993383646011353, + "logits/rejected": -0.15279248356819153, + "logps/chosen": -1.8032690286636353, + "logps/rejected": -1.9014475345611572, + "loss": 2.575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.032690048217773, + "rewards/margins": 0.9817847013473511, + "rewards/rejected": -19.014476776123047, + "step": 345 + }, + { + "epoch": 0.011796824968822677, + "grad_norm": 20.22198486328125, + "learning_rate": 1.1796427367711492e-07, + "logits/chosen": -0.52468341588974, + "logits/rejected": -0.5473569631576538, + "logps/chosen": -1.686406135559082, + "logps/rejected": -1.6621391773223877, + "loss": 3.3808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.86406135559082, + "rewards/margins": -0.242669016122818, + "rewards/rejected": -16.62139320373535, + "step": 350 + }, + { + "epoch": 0.011965351039805857, + "grad_norm": 14.638656616210938, + "learning_rate": 1.19649477586788e-07, + "logits/chosen": -0.061285682022571564, + "logits/rejected": -0.03674466535449028, + "logps/chosen": -1.6679449081420898, + "logps/rejected": -1.850379228591919, + "loss": 3.0775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.679447174072266, + "rewards/margins": 1.8243420124053955, + "rewards/rejected": -18.5037899017334, + "step": 355 + }, + { + "epoch": 0.012133877110789039, + "grad_norm": 12.102712631225586, + "learning_rate": 1.2133468149646107e-07, + "logits/chosen": -0.5100473165512085, + "logits/rejected": -0.4110667109489441, + "logps/chosen": -1.6822515726089478, + "logps/rejected": -1.6356151103973389, + "loss": 3.7413, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.8225154876709, + "rewards/margins": -0.46636518836021423, + "rewards/rejected": -16.356151580810547, + "step": 360 + }, + { + "epoch": 0.01230240318177222, + "grad_norm": 12.664548873901367, + "learning_rate": 1.2301988540613412e-07, + "logits/chosen": -0.5096290707588196, + "logits/rejected": -0.5500877499580383, + "logps/chosen": -1.790327787399292, + "logps/rejected": -1.7157615423202515, + "loss": 4.2013, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.903278350830078, + "rewards/margins": -0.7456639409065247, + "rewards/rejected": -17.157615661621094, + "step": 365 + }, + { + "epoch": 0.012470929252755402, + "grad_norm": 24.88652229309082, + "learning_rate": 1.2470508931580722e-07, + "logits/chosen": -0.47424596548080444, + "logits/rejected": -0.36643728613853455, + "logps/chosen": -1.7648521661758423, + "logps/rejected": -1.726318120956421, + "loss": 3.5944, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.64851951599121, + "rewards/margins": -0.38534069061279297, + "rewards/rejected": -17.263179779052734, + "step": 370 + }, + { + "epoch": 0.012639455323738582, + "grad_norm": 14.096327781677246, + "learning_rate": 1.263902932254803e-07, + "logits/chosen": 0.02537798322737217, + "logits/rejected": 0.0857175663113594, + "logps/chosen": -2.335552930831909, + "logps/rejected": -2.3750240802764893, + "loss": 2.8665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.355531692504883, + "rewards/margins": 0.3947105407714844, + "rewards/rejected": -23.750240325927734, + "step": 375 + }, + { + "epoch": 0.012807981394721763, + "grad_norm": 17.82650375366211, + "learning_rate": 1.2807549713515333e-07, + "logits/chosen": -0.6803150177001953, + "logits/rejected": -0.4150848388671875, + "logps/chosen": -1.7237411737442017, + "logps/rejected": -1.6108821630477905, + "loss": 4.1818, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.237409591674805, + "rewards/margins": -1.1285897493362427, + "rewards/rejected": -16.10881996154785, + "step": 380 + }, + { + "epoch": 0.012976507465704945, + "grad_norm": 24.19916343688965, + "learning_rate": 1.2976070104482643e-07, + "logits/chosen": -0.6842484474182129, + "logits/rejected": -0.41495975852012634, + "logps/chosen": -1.7041406631469727, + "logps/rejected": -1.7983328104019165, + "loss": 2.4166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.041406631469727, + "rewards/margins": 0.9419221878051758, + "rewards/rejected": -17.983327865600586, + "step": 385 + }, + { + "epoch": 0.013145033536688125, + "grad_norm": 32.509334564208984, + "learning_rate": 1.3144590495449948e-07, + "logits/chosen": -0.1651981621980667, + "logits/rejected": -0.20989704132080078, + "logps/chosen": -1.7264354228973389, + "logps/rejected": -1.7077096700668335, + "loss": 3.39, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.264352798461914, + "rewards/margins": -0.18725700676441193, + "rewards/rejected": -17.077096939086914, + "step": 390 + }, + { + "epoch": 0.013313559607671306, + "grad_norm": 37.62208938598633, + "learning_rate": 1.3313110886417255e-07, + "logits/chosen": -0.9938071966171265, + "logits/rejected": -0.8165189027786255, + "logps/chosen": -1.909253478050232, + "logps/rejected": -1.8714126348495483, + "loss": 3.4985, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.092533111572266, + "rewards/margins": -0.37840786576271057, + "rewards/rejected": -18.714126586914062, + "step": 395 + }, + { + "epoch": 0.013482085678654488, + "grad_norm": 57.049564361572266, + "learning_rate": 1.3481631277384565e-07, + "logits/chosen": -0.3681066930294037, + "logits/rejected": -0.5049809813499451, + "logps/chosen": -1.8573124408721924, + "logps/rejected": -1.8679901361465454, + "loss": 3.2871, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.573123931884766, + "rewards/margins": 0.10677585750818253, + "rewards/rejected": -18.679901123046875, + "step": 400 + }, + { + "epoch": 0.013482085678654488, + "eval_logits/chosen": -0.700717031955719, + "eval_logits/rejected": -0.7018821835517883, + "eval_logps/chosen": -1.655368685722351, + "eval_logps/rejected": -1.65134859085083, + "eval_loss": 3.4378702640533447, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.553686141967773, + "eval_rewards/margins": -0.04019847884774208, + "eval_rewards/rejected": -16.513486862182617, + "eval_runtime": 13.2029, + "eval_samples_per_second": 7.574, + "eval_steps_per_second": 1.894, + "step": 400 + }, + { + "epoch": 0.01365061174963767, + "grad_norm": 32.858551025390625, + "learning_rate": 1.365015166835187e-07, + "logits/chosen": -0.4087739586830139, + "logits/rejected": -0.32055968046188354, + "logps/chosen": -1.6603796482086182, + "logps/rejected": -1.7441883087158203, + "loss": 2.4208, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.603796005249023, + "rewards/margins": 0.8380862474441528, + "rewards/rejected": -17.441883087158203, + "step": 405 + }, + { + "epoch": 0.01381913782062085, + "grad_norm": 12.036665916442871, + "learning_rate": 1.381867205931918e-07, + "logits/chosen": -0.3701552748680115, + "logits/rejected": -0.3905239701271057, + "logps/chosen": -1.765268325805664, + "logps/rejected": -1.789777159690857, + "loss": 2.888, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.65268325805664, + "rewards/margins": 0.24508953094482422, + "rewards/rejected": -17.89777183532715, + "step": 410 + }, + { + "epoch": 0.013987663891604031, + "grad_norm": 24.49802589416504, + "learning_rate": 1.3987192450286484e-07, + "logits/chosen": -0.40653306245803833, + "logits/rejected": -0.4059394896030426, + "logps/chosen": -1.9510490894317627, + "logps/rejected": -2.0723910331726074, + "loss": 2.9058, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.510488510131836, + "rewards/margins": 1.2134212255477905, + "rewards/rejected": -20.72391128540039, + "step": 415 + }, + { + "epoch": 0.014156189962587213, + "grad_norm": 21.609708786010742, + "learning_rate": 1.415571284125379e-07, + "logits/chosen": 0.07993341982364655, + "logits/rejected": 0.046029143035411835, + "logps/chosen": -1.7785285711288452, + "logps/rejected": -1.9162448644638062, + "loss": 3.0046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.78528594970703, + "rewards/margins": 1.3771638870239258, + "rewards/rejected": -19.16244888305664, + "step": 420 + }, + { + "epoch": 0.014324716033570392, + "grad_norm": 22.334131240844727, + "learning_rate": 1.4324233232221098e-07, + "logits/chosen": -0.1741364747285843, + "logits/rejected": -0.12129688262939453, + "logps/chosen": -1.4081411361694336, + "logps/rejected": -1.5430591106414795, + "loss": 2.1877, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.08141040802002, + "rewards/margins": 1.3491812944412231, + "rewards/rejected": -15.430593490600586, + "step": 425 + }, + { + "epoch": 0.014493242104553574, + "grad_norm": 29.417394638061523, + "learning_rate": 1.4492753623188405e-07, + "logits/chosen": -0.5014611482620239, + "logits/rejected": -0.4477139413356781, + "logps/chosen": -1.5395524501800537, + "logps/rejected": -1.6200401782989502, + "loss": 2.4931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.395523071289062, + "rewards/margins": 0.804876446723938, + "rewards/rejected": -16.20039939880371, + "step": 430 + }, + { + "epoch": 0.014661768175536756, + "grad_norm": 25.562877655029297, + "learning_rate": 1.466127401415571e-07, + "logits/chosen": -0.48289117217063904, + "logits/rejected": -0.34240299463272095, + "logps/chosen": -1.839874267578125, + "logps/rejected": -1.9911397695541382, + "loss": 2.1309, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.398744583129883, + "rewards/margins": 1.51265549659729, + "rewards/rejected": -19.91139793395996, + "step": 435 + }, + { + "epoch": 0.014830294246519937, + "grad_norm": 42.604774475097656, + "learning_rate": 1.482979440512302e-07, + "logits/chosen": -0.23516055941581726, + "logits/rejected": -0.2886582016944885, + "logps/chosen": -1.9065258502960205, + "logps/rejected": -1.8996076583862305, + "loss": 3.1934, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.065256118774414, + "rewards/margins": -0.0691794902086258, + "rewards/rejected": -18.996078491210938, + "step": 440 + }, + { + "epoch": 0.014998820317503117, + "grad_norm": 32.816471099853516, + "learning_rate": 1.4998314796090324e-07, + "logits/chosen": -0.46411070227622986, + "logits/rejected": -0.30949535965919495, + "logps/chosen": -1.802843689918518, + "logps/rejected": -1.8105186223983765, + "loss": 3.0283, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.0284366607666, + "rewards/margins": 0.07674837112426758, + "rewards/rejected": -18.10518455505371, + "step": 445 + }, + { + "epoch": 0.015167346388486299, + "grad_norm": 39.4717903137207, + "learning_rate": 1.5166835187057634e-07, + "logits/chosen": -0.12686650454998016, + "logits/rejected": -0.17431500554084778, + "logps/chosen": -2.150648593902588, + "logps/rejected": -2.0025336742401123, + "loss": 4.5375, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.506488800048828, + "rewards/margins": -1.4811471700668335, + "rewards/rejected": -20.025339126586914, + "step": 450 + }, + { + "epoch": 0.01533587245946948, + "grad_norm": 15.029488563537598, + "learning_rate": 1.5335355578024941e-07, + "logits/chosen": -0.18588756024837494, + "logits/rejected": -0.25931158661842346, + "logps/chosen": -1.945797324180603, + "logps/rejected": -1.991434097290039, + "loss": 3.2645, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.45797348022461, + "rewards/margins": 0.4563663601875305, + "rewards/rejected": -19.91434097290039, + "step": 455 + }, + { + "epoch": 0.015504398530452662, + "grad_norm": 17.134843826293945, + "learning_rate": 1.5503875968992246e-07, + "logits/chosen": -0.4510701298713684, + "logits/rejected": -0.5152049660682678, + "logps/chosen": -1.6451425552368164, + "logps/rejected": -1.7289478778839111, + "loss": 2.7484, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.451427459716797, + "rewards/margins": 0.8380520939826965, + "rewards/rejected": -17.289478302001953, + "step": 460 + }, + { + "epoch": 0.015672924601435843, + "grad_norm": 16.33306121826172, + "learning_rate": 1.5672396359959556e-07, + "logits/chosen": -0.2507234513759613, + "logits/rejected": -0.3093252182006836, + "logps/chosen": -1.4972885847091675, + "logps/rejected": -1.6501014232635498, + "loss": 2.4099, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.972885131835938, + "rewards/margins": 1.5281298160552979, + "rewards/rejected": -16.501014709472656, + "step": 465 + }, + { + "epoch": 0.015841450672419023, + "grad_norm": 24.52484703063965, + "learning_rate": 1.584091675092686e-07, + "logits/chosen": -0.47617292404174805, + "logits/rejected": -0.5188449621200562, + "logps/chosen": -1.6798864603042603, + "logps/rejected": -1.6755599975585938, + "loss": 3.1259, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.798864364624023, + "rewards/margins": -0.04326476901769638, + "rewards/rejected": -16.75560188293457, + "step": 470 + }, + { + "epoch": 0.016009976743402203, + "grad_norm": 30.124879837036133, + "learning_rate": 1.600943714189417e-07, + "logits/chosen": -0.6728774905204773, + "logits/rejected": -0.6839405298233032, + "logps/chosen": -1.6356920003890991, + "logps/rejected": -1.6939365863800049, + "loss": 2.659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.356922149658203, + "rewards/margins": 0.5824446678161621, + "rewards/rejected": -16.93936538696289, + "step": 475 + }, + { + "epoch": 0.016178502814385386, + "grad_norm": 35.19898223876953, + "learning_rate": 1.6177957532861475e-07, + "logits/chosen": -0.3229294419288635, + "logits/rejected": -0.1391395628452301, + "logps/chosen": -1.695289969444275, + "logps/rejected": -1.714638352394104, + "loss": 2.9622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.952899932861328, + "rewards/margins": 0.19348230957984924, + "rewards/rejected": -17.14638328552246, + "step": 480 + }, + { + "epoch": 0.016347028885368566, + "grad_norm": 38.96176528930664, + "learning_rate": 1.6346477923828782e-07, + "logits/chosen": -0.06022145599126816, + "logits/rejected": -0.1673354208469391, + "logps/chosen": -2.0568957328796387, + "logps/rejected": -2.0894064903259277, + "loss": 3.747, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.568958282470703, + "rewards/margins": 0.3251078724861145, + "rewards/rejected": -20.894065856933594, + "step": 485 + }, + { + "epoch": 0.016515554956351746, + "grad_norm": 30.77846908569336, + "learning_rate": 1.651499831479609e-07, + "logits/chosen": -0.6888018250465393, + "logits/rejected": -0.6225077509880066, + "logps/chosen": -1.6226558685302734, + "logps/rejected": -1.5915305614471436, + "loss": 3.6297, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.226558685302734, + "rewards/margins": -0.31125250458717346, + "rewards/rejected": -15.915306091308594, + "step": 490 + }, + { + "epoch": 0.01668408102733493, + "grad_norm": 41.58315658569336, + "learning_rate": 1.6683518705763396e-07, + "logits/chosen": -0.5833055377006531, + "logits/rejected": -0.5206926465034485, + "logps/chosen": -2.156750202178955, + "logps/rejected": -2.0980007648468018, + "loss": 3.6524, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.567501068115234, + "rewards/margins": -0.5874932408332825, + "rewards/rejected": -20.98000717163086, + "step": 495 + }, + { + "epoch": 0.01685260709831811, + "grad_norm": 19.05613136291504, + "learning_rate": 1.6852039096730703e-07, + "logits/chosen": -0.1755112111568451, + "logits/rejected": -0.277473121881485, + "logps/chosen": -2.1205787658691406, + "logps/rejected": -2.038501739501953, + "loss": 3.9988, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.205785751342773, + "rewards/margins": -0.820770263671875, + "rewards/rejected": -20.3850154876709, + "step": 500 + }, + { + "epoch": 0.017021133169301293, + "grad_norm": 32.78766632080078, + "learning_rate": 1.702055948769801e-07, + "logits/chosen": -0.697665274143219, + "logits/rejected": -0.6786226034164429, + "logps/chosen": -1.837233543395996, + "logps/rejected": -1.7147718667984009, + "loss": 4.4203, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.37233543395996, + "rewards/margins": -1.2246164083480835, + "rewards/rejected": -17.147716522216797, + "step": 505 + }, + { + "epoch": 0.017189659240284472, + "grad_norm": 19.369592666625977, + "learning_rate": 1.7189079878665318e-07, + "logits/chosen": -0.19188065826892853, + "logits/rejected": -0.16713164746761322, + "logps/chosen": -1.770496129989624, + "logps/rejected": -1.862823247909546, + "loss": 2.5088, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.7049617767334, + "rewards/margins": 0.9232719540596008, + "rewards/rejected": -18.628232955932617, + "step": 510 + }, + { + "epoch": 0.017358185311267652, + "grad_norm": 25.27272605895996, + "learning_rate": 1.7357600269632625e-07, + "logits/chosen": -0.598182201385498, + "logits/rejected": -0.4703841209411621, + "logps/chosen": -1.418545126914978, + "logps/rejected": -1.3824225664138794, + "loss": 3.4124, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.185450553894043, + "rewards/margins": -0.36122599244117737, + "rewards/rejected": -13.824226379394531, + "step": 515 + }, + { + "epoch": 0.017526711382250836, + "grad_norm": 20.890714645385742, + "learning_rate": 1.7526120660599932e-07, + "logits/chosen": -0.2779539227485657, + "logits/rejected": -0.33786919713020325, + "logps/chosen": -1.8864319324493408, + "logps/rejected": -1.8625271320343018, + "loss": 4.4328, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.86431884765625, + "rewards/margins": -0.23904666304588318, + "rewards/rejected": -18.62527084350586, + "step": 520 + }, + { + "epoch": 0.017695237453234015, + "grad_norm": 20.13123321533203, + "learning_rate": 1.769464105156724e-07, + "logits/chosen": -0.5535017848014832, + "logits/rejected": -0.607734203338623, + "logps/chosen": -1.6528222560882568, + "logps/rejected": -1.7298600673675537, + "loss": 2.7765, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.528223037719727, + "rewards/margins": 0.7703768610954285, + "rewards/rejected": -17.298601150512695, + "step": 525 + }, + { + "epoch": 0.017863763524217195, + "grad_norm": 19.041948318481445, + "learning_rate": 1.7863161442534547e-07, + "logits/chosen": -0.4681750237941742, + "logits/rejected": -0.5170144438743591, + "logps/chosen": -1.6290159225463867, + "logps/rejected": -1.7050189971923828, + "loss": 2.4945, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.290157318115234, + "rewards/margins": 0.7600309252738953, + "rewards/rejected": -17.050189971923828, + "step": 530 + }, + { + "epoch": 0.01803228959520038, + "grad_norm": 39.596923828125, + "learning_rate": 1.803168183350185e-07, + "logits/chosen": -0.44919759035110474, + "logits/rejected": -0.4530429244041443, + "logps/chosen": -2.0134143829345703, + "logps/rejected": -2.075237989425659, + "loss": 2.7457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.134143829345703, + "rewards/margins": 0.6182364225387573, + "rewards/rejected": -20.75238037109375, + "step": 535 + }, + { + "epoch": 0.01820081566618356, + "grad_norm": 19.176101684570312, + "learning_rate": 1.820020222446916e-07, + "logits/chosen": -0.6088439226150513, + "logits/rejected": -0.5793352127075195, + "logps/chosen": -1.5337207317352295, + "logps/rejected": -1.6069551706314087, + "loss": 2.4939, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.33720588684082, + "rewards/margins": 0.7323451042175293, + "rewards/rejected": -16.06955337524414, + "step": 540 + }, + { + "epoch": 0.018369341737166738, + "grad_norm": 21.026443481445312, + "learning_rate": 1.8368722615436466e-07, + "logits/chosen": -0.7524499893188477, + "logits/rejected": -0.7342022657394409, + "logps/chosen": -1.7819461822509766, + "logps/rejected": -1.3424131870269775, + "loss": 7.415, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -17.819461822509766, + "rewards/margins": -4.395329475402832, + "rewards/rejected": -13.42413330078125, + "step": 545 + }, + { + "epoch": 0.01853786780814992, + "grad_norm": 27.13228988647461, + "learning_rate": 1.8537243006403775e-07, + "logits/chosen": -0.46805500984191895, + "logits/rejected": -0.3271024823188782, + "logps/chosen": -1.8898948431015015, + "logps/rejected": -1.9199107885360718, + "loss": 2.9731, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.898948669433594, + "rewards/margins": 0.30015745759010315, + "rewards/rejected": -19.199106216430664, + "step": 550 + }, + { + "epoch": 0.0187063938791331, + "grad_norm": 20.596628189086914, + "learning_rate": 1.870576339737108e-07, + "logits/chosen": -0.4603646397590637, + "logits/rejected": -0.4007042944431305, + "logps/chosen": -2.0334296226501465, + "logps/rejected": -2.0893046855926514, + "loss": 2.9646, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.33429527282715, + "rewards/margins": 0.5587537884712219, + "rewards/rejected": -20.893047332763672, + "step": 555 + }, + { + "epoch": 0.018874919950116285, + "grad_norm": 29.820533752441406, + "learning_rate": 1.8874283788338387e-07, + "logits/chosen": -0.051391713321208954, + "logits/rejected": -0.12424926459789276, + "logps/chosen": -1.8017486333847046, + "logps/rejected": -1.9026470184326172, + "loss": 2.8019, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.017486572265625, + "rewards/margins": 1.0089836120605469, + "rewards/rejected": -19.026470184326172, + "step": 560 + }, + { + "epoch": 0.019043446021099465, + "grad_norm": 16.212018966674805, + "learning_rate": 1.9042804179305697e-07, + "logits/chosen": -0.1161131039261818, + "logits/rejected": -0.12281863391399384, + "logps/chosen": -1.7073466777801514, + "logps/rejected": -1.6138560771942139, + "loss": 4.0608, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.073467254638672, + "rewards/margins": -0.9349047541618347, + "rewards/rejected": -16.13856315612793, + "step": 565 + }, + { + "epoch": 0.019211972092082644, + "grad_norm": 27.620121002197266, + "learning_rate": 1.9211324570273002e-07, + "logits/chosen": -0.34033095836639404, + "logits/rejected": -0.3796425759792328, + "logps/chosen": -1.9775043725967407, + "logps/rejected": -1.8988311290740967, + "loss": 3.9004, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.775043487548828, + "rewards/margins": -0.7867323756217957, + "rewards/rejected": -18.98831558227539, + "step": 570 + }, + { + "epoch": 0.019380498163065828, + "grad_norm": 20.538888931274414, + "learning_rate": 1.9379844961240311e-07, + "logits/chosen": -0.5360706448554993, + "logits/rejected": -0.5650817155838013, + "logps/chosen": -1.6927764415740967, + "logps/rejected": -1.6992028951644897, + "loss": 3.0786, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.927766799926758, + "rewards/margins": 0.06426439434289932, + "rewards/rejected": -16.99203109741211, + "step": 575 + }, + { + "epoch": 0.019549024234049008, + "grad_norm": 22.677988052368164, + "learning_rate": 1.9548365352207616e-07, + "logits/chosen": -0.3704223334789276, + "logits/rejected": -0.3610188961029053, + "logps/chosen": -1.7271544933319092, + "logps/rejected": -1.7471681833267212, + "loss": 2.9617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.27154541015625, + "rewards/margins": 0.20013637840747833, + "rewards/rejected": -17.471683502197266, + "step": 580 + }, + { + "epoch": 0.019717550305032187, + "grad_norm": 17.373239517211914, + "learning_rate": 1.9716885743174923e-07, + "logits/chosen": -0.3982377350330353, + "logits/rejected": -0.4115076959133148, + "logps/chosen": -1.8473840951919556, + "logps/rejected": -1.8948602676391602, + "loss": 2.8885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.473840713500977, + "rewards/margins": 0.47476130723953247, + "rewards/rejected": -18.9486026763916, + "step": 585 + }, + { + "epoch": 0.01988607637601537, + "grad_norm": 10.618928909301758, + "learning_rate": 1.988540613414223e-07, + "logits/chosen": -0.3850679099559784, + "logits/rejected": -0.33462971448898315, + "logps/chosen": -1.780971884727478, + "logps/rejected": -1.95230233669281, + "loss": 1.987, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.80971908569336, + "rewards/margins": 1.7133023738861084, + "rewards/rejected": -19.523021697998047, + "step": 590 + }, + { + "epoch": 0.02005460244699855, + "grad_norm": 23.18230628967285, + "learning_rate": 2.0053926525109538e-07, + "logits/chosen": -0.3498408794403076, + "logits/rejected": -0.5002946257591248, + "logps/chosen": -1.680807113647461, + "logps/rejected": -1.6838855743408203, + "loss": 3.3985, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.80807113647461, + "rewards/margins": 0.030785083770751953, + "rewards/rejected": -16.838855743408203, + "step": 595 + }, + { + "epoch": 0.02022312851798173, + "grad_norm": 31.54558563232422, + "learning_rate": 2.0222446916076845e-07, + "logits/chosen": -0.11058574914932251, + "logits/rejected": -0.0773305743932724, + "logps/chosen": -1.980094313621521, + "logps/rejected": -2.0804390907287598, + "loss": 3.2886, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.800945281982422, + "rewards/margins": 1.0034451484680176, + "rewards/rejected": -20.80438804626465, + "step": 600 + }, + { + "epoch": 0.020391654588964914, + "grad_norm": 12.026350975036621, + "learning_rate": 2.0390967307044152e-07, + "logits/chosen": -0.24122457206249237, + "logits/rejected": -0.2786351144313812, + "logps/chosen": -1.7854111194610596, + "logps/rejected": -1.8579390048980713, + "loss": 2.8281, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.854110717773438, + "rewards/margins": 0.7252796292304993, + "rewards/rejected": -18.579389572143555, + "step": 605 + }, + { + "epoch": 0.020560180659948094, + "grad_norm": 22.978633880615234, + "learning_rate": 2.0559487698011456e-07, + "logits/chosen": -0.18546470999717712, + "logits/rejected": -0.1565871238708496, + "logps/chosen": -1.7760133743286133, + "logps/rejected": -1.8380794525146484, + "loss": 3.3063, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.760133743286133, + "rewards/margins": 0.6206603050231934, + "rewards/rejected": -18.38079261779785, + "step": 610 + }, + { + "epoch": 0.020728706730931273, + "grad_norm": 38.687129974365234, + "learning_rate": 2.0728008088978766e-07, + "logits/chosen": -0.3419579863548279, + "logits/rejected": -0.43344956636428833, + "logps/chosen": -1.891229271888733, + "logps/rejected": -1.7788646221160889, + "loss": 4.1865, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.91229248046875, + "rewards/margins": -1.1236467361450195, + "rewards/rejected": -17.788646697998047, + "step": 615 + }, + { + "epoch": 0.020897232801914457, + "grad_norm": 23.251344680786133, + "learning_rate": 2.089652847994607e-07, + "logits/chosen": -0.46370410919189453, + "logits/rejected": -0.518548846244812, + "logps/chosen": -1.6470798254013062, + "logps/rejected": -1.640175461769104, + "loss": 3.2648, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.47079849243164, + "rewards/margins": -0.06904516369104385, + "rewards/rejected": -16.401752471923828, + "step": 620 + }, + { + "epoch": 0.021065758872897637, + "grad_norm": 8.000481605529785, + "learning_rate": 2.106504887091338e-07, + "logits/chosen": -0.19095095992088318, + "logits/rejected": -0.2789108157157898, + "logps/chosen": -1.883286476135254, + "logps/rejected": -1.9219729900360107, + "loss": 3.0147, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.832866668701172, + "rewards/margins": 0.38686609268188477, + "rewards/rejected": -19.219730377197266, + "step": 625 + }, + { + "epoch": 0.02123428494388082, + "grad_norm": 17.710752487182617, + "learning_rate": 2.1233569261880688e-07, + "logits/chosen": -0.6913865804672241, + "logits/rejected": -0.6803420782089233, + "logps/chosen": -1.4992603063583374, + "logps/rejected": -1.6658436059951782, + "loss": 2.1558, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.99260425567627, + "rewards/margins": 1.6658321619033813, + "rewards/rejected": -16.658435821533203, + "step": 630 + }, + { + "epoch": 0.021402811014864, + "grad_norm": 14.574746131896973, + "learning_rate": 2.1402089652847992e-07, + "logits/chosen": -0.7683529853820801, + "logits/rejected": -0.8330462574958801, + "logps/chosen": -1.4146801233291626, + "logps/rejected": -1.6268253326416016, + "loss": 2.1639, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.146801948547363, + "rewards/margins": 2.1214513778686523, + "rewards/rejected": -16.268253326416016, + "step": 635 + }, + { + "epoch": 0.02157133708584718, + "grad_norm": 28.91573143005371, + "learning_rate": 2.1570610043815302e-07, + "logits/chosen": -0.5444644689559937, + "logits/rejected": -0.41493433713912964, + "logps/chosen": -1.7037174701690674, + "logps/rejected": -1.7247931957244873, + "loss": 3.0659, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.03717613220215, + "rewards/margins": 0.2107563018798828, + "rewards/rejected": -17.2479305267334, + "step": 640 + }, + { + "epoch": 0.021739863156830363, + "grad_norm": 24.06241798400879, + "learning_rate": 2.1739130434782607e-07, + "logits/chosen": -0.527267575263977, + "logits/rejected": -0.42481595277786255, + "logps/chosen": -1.6632928848266602, + "logps/rejected": -1.5876281261444092, + "loss": 3.8596, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.6329288482666, + "rewards/margins": -0.7566463351249695, + "rewards/rejected": -15.87628173828125, + "step": 645 + }, + { + "epoch": 0.021908389227813543, + "grad_norm": 33.575355529785156, + "learning_rate": 2.1907650825749917e-07, + "logits/chosen": -0.5085206627845764, + "logits/rejected": -0.6477267146110535, + "logps/chosen": -1.398726224899292, + "logps/rejected": -1.6070148944854736, + "loss": 2.3809, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -13.987261772155762, + "rewards/margins": 2.0828864574432373, + "rewards/rejected": -16.070148468017578, + "step": 650 + }, + { + "epoch": 0.022076915298796723, + "grad_norm": 21.62416648864746, + "learning_rate": 2.207617121671722e-07, + "logits/chosen": -1.0992919206619263, + "logits/rejected": -1.0706102848052979, + "logps/chosen": -1.7128111124038696, + "logps/rejected": -1.6248939037322998, + "loss": 3.942, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.128108978271484, + "rewards/margins": -0.8791699409484863, + "rewards/rejected": -16.248939514160156, + "step": 655 + }, + { + "epoch": 0.022245441369779906, + "grad_norm": 16.11137580871582, + "learning_rate": 2.2244691607684528e-07, + "logits/chosen": -0.6853328347206116, + "logits/rejected": -0.6241937279701233, + "logps/chosen": -2.0717978477478027, + "logps/rejected": -2.090998888015747, + "loss": 2.999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.71797752380371, + "rewards/margins": 0.1920129805803299, + "rewards/rejected": -20.909992218017578, + "step": 660 + }, + { + "epoch": 0.022413967440763086, + "grad_norm": 22.234249114990234, + "learning_rate": 2.2413211998651836e-07, + "logits/chosen": -0.2508560121059418, + "logits/rejected": -0.28406912088394165, + "logps/chosen": -1.9362938404083252, + "logps/rejected": -1.8561948537826538, + "loss": 3.9404, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.362937927246094, + "rewards/margins": -0.8009899258613586, + "rewards/rejected": -18.561946868896484, + "step": 665 + }, + { + "epoch": 0.022582493511746266, + "grad_norm": 48.30265426635742, + "learning_rate": 2.2581732389619143e-07, + "logits/chosen": -0.8998171091079712, + "logits/rejected": -0.8238614797592163, + "logps/chosen": -1.8364791870117188, + "logps/rejected": -1.8647804260253906, + "loss": 2.9906, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.36479377746582, + "rewards/margins": 0.28301066160202026, + "rewards/rejected": -18.647804260253906, + "step": 670 + }, + { + "epoch": 0.02275101958272945, + "grad_norm": 24.514257431030273, + "learning_rate": 2.2750252780586447e-07, + "logits/chosen": -0.317034512758255, + "logits/rejected": -0.35245281457901, + "logps/chosen": -1.6648200750350952, + "logps/rejected": -1.6886028051376343, + "loss": 2.9444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.6481990814209, + "rewards/margins": 0.23782816529273987, + "rewards/rejected": -16.886028289794922, + "step": 675 + }, + { + "epoch": 0.02291954565371263, + "grad_norm": 39.15069580078125, + "learning_rate": 2.2918773171553757e-07, + "logits/chosen": -0.4383459985256195, + "logits/rejected": -0.3607695400714874, + "logps/chosen": -1.6028152704238892, + "logps/rejected": -1.6575685739517212, + "loss": 2.7703, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.028154373168945, + "rewards/margins": 0.5475319623947144, + "rewards/rejected": -16.575685501098633, + "step": 680 + }, + { + "epoch": 0.023088071724695812, + "grad_norm": 41.797752380371094, + "learning_rate": 2.3087293562521064e-07, + "logits/chosen": -0.5070951581001282, + "logits/rejected": -0.5197767019271851, + "logps/chosen": -1.8675521612167358, + "logps/rejected": -1.805352807044983, + "loss": 3.7902, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.675521850585938, + "rewards/margins": -0.6219925880432129, + "rewards/rejected": -18.053529739379883, + "step": 685 + }, + { + "epoch": 0.023256597795678992, + "grad_norm": 38.495574951171875, + "learning_rate": 2.3255813953488372e-07, + "logits/chosen": -0.41856566071510315, + "logits/rejected": -0.5050365328788757, + "logps/chosen": -1.7272249460220337, + "logps/rejected": -1.64120352268219, + "loss": 4.1383, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.272247314453125, + "rewards/margins": -0.8602131605148315, + "rewards/rejected": -16.41203498840332, + "step": 690 + }, + { + "epoch": 0.02342512386666217, + "grad_norm": 27.242050170898438, + "learning_rate": 2.342433434445568e-07, + "logits/chosen": -0.49871888756752014, + "logits/rejected": -0.436892032623291, + "logps/chosen": -1.7822635173797607, + "logps/rejected": -1.804446816444397, + "loss": 2.9828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.822635650634766, + "rewards/margins": 0.22183457016944885, + "rewards/rejected": -18.044469833374023, + "step": 695 + }, + { + "epoch": 0.023593649937645355, + "grad_norm": 24.042818069458008, + "learning_rate": 2.3592854735422983e-07, + "logits/chosen": -0.28581100702285767, + "logits/rejected": -0.38166743516921997, + "logps/chosen": -1.5768885612487793, + "logps/rejected": -1.607193946838379, + "loss": 3.0823, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.768885612487793, + "rewards/margins": 0.30305296182632446, + "rewards/rejected": -16.07193946838379, + "step": 700 + }, + { + "epoch": 0.023762176008628535, + "grad_norm": 32.76372146606445, + "learning_rate": 2.3761375126390293e-07, + "logits/chosen": -0.428058922290802, + "logits/rejected": -0.39834824204444885, + "logps/chosen": -1.963168740272522, + "logps/rejected": -1.8921149969100952, + "loss": 3.8068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.631689071655273, + "rewards/margins": -0.7105369567871094, + "rewards/rejected": -18.92115020751953, + "step": 705 + }, + { + "epoch": 0.023930702079611715, + "grad_norm": 19.744258880615234, + "learning_rate": 2.39298955173576e-07, + "logits/chosen": -0.2931436002254486, + "logits/rejected": -0.24489137530326843, + "logps/chosen": -1.5167903900146484, + "logps/rejected": -1.6117607355117798, + "loss": 2.4292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.1679048538208, + "rewards/margins": 0.9497024416923523, + "rewards/rejected": -16.11760711669922, + "step": 710 + }, + { + "epoch": 0.024099228150594898, + "grad_norm": 21.923982620239258, + "learning_rate": 2.409841590832491e-07, + "logits/chosen": -0.4506607949733734, + "logits/rejected": -0.5256415605545044, + "logps/chosen": -1.996930480003357, + "logps/rejected": -1.7969143390655518, + "loss": 5.0751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.96930503845215, + "rewards/margins": -2.0001626014709473, + "rewards/rejected": -17.96914291381836, + "step": 715 + }, + { + "epoch": 0.024267754221578078, + "grad_norm": 23.204833984375, + "learning_rate": 2.4266936299292215e-07, + "logits/chosen": -0.43543902039527893, + "logits/rejected": -0.37989646196365356, + "logps/chosen": -1.9207398891448975, + "logps/rejected": -1.9233181476593018, + "loss": 3.4542, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.207401275634766, + "rewards/margins": 0.02578134462237358, + "rewards/rejected": -19.23318099975586, + "step": 720 + }, + { + "epoch": 0.024436280292561258, + "grad_norm": 24.56550407409668, + "learning_rate": 2.443545669025952e-07, + "logits/chosen": -0.6575881838798523, + "logits/rejected": -0.5963281989097595, + "logps/chosen": -1.7525399923324585, + "logps/rejected": -1.6743013858795166, + "loss": 4.0134, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.525400161743164, + "rewards/margins": -0.7823885679244995, + "rewards/rejected": -16.743013381958008, + "step": 725 + }, + { + "epoch": 0.02460480636354444, + "grad_norm": 19.73300552368164, + "learning_rate": 2.4603977081226824e-07, + "logits/chosen": -0.24325819313526154, + "logits/rejected": -0.35052576661109924, + "logps/chosen": -1.7987921237945557, + "logps/rejected": -1.9659143686294556, + "loss": 2.6491, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.9879207611084, + "rewards/margins": 1.6712220907211304, + "rewards/rejected": -19.659143447875977, + "step": 730 + }, + { + "epoch": 0.02477333243452762, + "grad_norm": 41.61115646362305, + "learning_rate": 2.4772497472194136e-07, + "logits/chosen": -0.7007697224617004, + "logits/rejected": -0.6481397747993469, + "logps/chosen": -1.6374019384384155, + "logps/rejected": -1.7201652526855469, + "loss": 2.6239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.374019622802734, + "rewards/margins": 0.8276341557502747, + "rewards/rejected": -17.2016544342041, + "step": 735 + }, + { + "epoch": 0.024941858505510804, + "grad_norm": 14.423554420471191, + "learning_rate": 2.4941017863161443e-07, + "logits/chosen": -0.5714423656463623, + "logits/rejected": -0.5921608209609985, + "logps/chosen": -1.9647853374481201, + "logps/rejected": -2.303321123123169, + "loss": 2.7816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.64785385131836, + "rewards/margins": 3.385359525680542, + "rewards/rejected": -23.033212661743164, + "step": 740 + }, + { + "epoch": 0.025110384576493984, + "grad_norm": 82.18790435791016, + "learning_rate": 2.510953825412875e-07, + "logits/chosen": -0.40886393189430237, + "logits/rejected": -0.2524716258049011, + "logps/chosen": -1.6332025527954102, + "logps/rejected": -1.7254012823104858, + "loss": 2.8306, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.332027435302734, + "rewards/margins": 0.9219877123832703, + "rewards/rejected": -17.254013061523438, + "step": 745 + }, + { + "epoch": 0.025278910647477164, + "grad_norm": 25.27329444885254, + "learning_rate": 2.527805864509606e-07, + "logits/chosen": -0.6217483878135681, + "logits/rejected": -0.5722722411155701, + "logps/chosen": -1.817664384841919, + "logps/rejected": -1.8501255512237549, + "loss": 2.7603, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.17664337158203, + "rewards/margins": 0.3246099352836609, + "rewards/rejected": -18.50125503540039, + "step": 750 + }, + { + "epoch": 0.025447436718460347, + "grad_norm": 38.237152099609375, + "learning_rate": 2.5446579036063365e-07, + "logits/chosen": -0.52363520860672, + "logits/rejected": -0.5313522219657898, + "logps/chosen": -1.905940294265747, + "logps/rejected": -1.8410999774932861, + "loss": 3.7816, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.059402465820312, + "rewards/margins": -0.6484056711196899, + "rewards/rejected": -18.41099739074707, + "step": 755 + }, + { + "epoch": 0.025615962789443527, + "grad_norm": 37.8472785949707, + "learning_rate": 2.5615099427030667e-07, + "logits/chosen": -0.5234827399253845, + "logits/rejected": -0.5807468891143799, + "logps/chosen": -1.763085961341858, + "logps/rejected": -1.7279046773910522, + "loss": 3.4781, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.630859375, + "rewards/margins": -0.35181283950805664, + "rewards/rejected": -17.2790470123291, + "step": 760 + }, + { + "epoch": 0.025784488860426707, + "grad_norm": 29.356088638305664, + "learning_rate": 2.5783619817997974e-07, + "logits/chosen": -0.5783195495605469, + "logits/rejected": -0.5610502362251282, + "logps/chosen": -1.7727210521697998, + "logps/rejected": -1.6591606140136719, + "loss": 4.158, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -17.72721290588379, + "rewards/margins": -1.1356046199798584, + "rewards/rejected": -16.59160804748535, + "step": 765 + }, + { + "epoch": 0.02595301493140989, + "grad_norm": 22.93600845336914, + "learning_rate": 2.5952140208965287e-07, + "logits/chosen": -0.31732669472694397, + "logits/rejected": -0.3070994019508362, + "logps/chosen": -1.8145532608032227, + "logps/rejected": -1.844628095626831, + "loss": 2.8821, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.14553451538086, + "rewards/margins": 0.3007470965385437, + "rewards/rejected": -18.446279525756836, + "step": 770 + }, + { + "epoch": 0.02612154100239307, + "grad_norm": 36.81005859375, + "learning_rate": 2.6120660599932594e-07, + "logits/chosen": -0.6823596954345703, + "logits/rejected": -0.8584939241409302, + "logps/chosen": -1.685476303100586, + "logps/rejected": -1.749103307723999, + "loss": 2.5587, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.85476303100586, + "rewards/margins": 0.6362693905830383, + "rewards/rejected": -17.491031646728516, + "step": 775 + }, + { + "epoch": 0.02629006707337625, + "grad_norm": 11.87728214263916, + "learning_rate": 2.6289180990899896e-07, + "logits/chosen": -0.33666494488716125, + "logits/rejected": -0.2485232651233673, + "logps/chosen": -1.8858550786972046, + "logps/rejected": -1.8398468494415283, + "loss": 3.6989, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.85854721069336, + "rewards/margins": -0.46007975935935974, + "rewards/rejected": -18.398468017578125, + "step": 780 + }, + { + "epoch": 0.026458593144359433, + "grad_norm": 34.59413528442383, + "learning_rate": 2.6457701381867203e-07, + "logits/chosen": -0.678062915802002, + "logits/rejected": -0.703747034072876, + "logps/chosen": -1.4804130792617798, + "logps/rejected": -1.481335997581482, + "loss": 3.1245, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.804130554199219, + "rewards/margins": 0.009229278191924095, + "rewards/rejected": -14.813360214233398, + "step": 785 + }, + { + "epoch": 0.026627119215342613, + "grad_norm": 13.038679122924805, + "learning_rate": 2.662622177283451e-07, + "logits/chosen": -0.43355339765548706, + "logits/rejected": -0.26748722791671753, + "logps/chosen": -1.9919134378433228, + "logps/rejected": -2.074113368988037, + "loss": 2.6707, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.919132232666016, + "rewards/margins": 0.8219999074935913, + "rewards/rejected": -20.741134643554688, + "step": 790 + }, + { + "epoch": 0.026795645286325796, + "grad_norm": 38.65684509277344, + "learning_rate": 2.679474216380182e-07, + "logits/chosen": -0.5652610063552856, + "logits/rejected": -0.3666650950908661, + "logps/chosen": -1.6639525890350342, + "logps/rejected": -1.6299728155136108, + "loss": 3.7574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.639524459838867, + "rewards/margins": -0.33979684114456177, + "rewards/rejected": -16.299728393554688, + "step": 795 + }, + { + "epoch": 0.026964171357308976, + "grad_norm": 21.291852951049805, + "learning_rate": 2.696326255476913e-07, + "logits/chosen": -0.5117262005805969, + "logits/rejected": -0.47947850823402405, + "logps/chosen": -2.0055885314941406, + "logps/rejected": -1.9679181575775146, + "loss": 3.4746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.055883407592773, + "rewards/margins": -0.3767028748989105, + "rewards/rejected": -19.679183959960938, + "step": 800 + }, + { + "epoch": 0.026964171357308976, + "eval_logits/chosen": -0.6988369822502136, + "eval_logits/rejected": -0.7001645565032959, + "eval_logps/chosen": -1.655611276626587, + "eval_logps/rejected": -1.6514620780944824, + "eval_loss": 3.436960458755493, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.55611228942871, + "eval_rewards/margins": -0.041492920368909836, + "eval_rewards/rejected": -16.514619827270508, + "eval_runtime": 12.9309, + "eval_samples_per_second": 7.733, + "eval_steps_per_second": 1.933, + "step": 800 + }, + { + "epoch": 0.027132697428292156, + "grad_norm": 18.55634117126465, + "learning_rate": 2.713178294573643e-07, + "logits/chosen": -0.5753890872001648, + "logits/rejected": -0.4462040364742279, + "logps/chosen": -1.4152113199234009, + "logps/rejected": -1.5044399499893188, + "loss": 2.6021, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.152114868164062, + "rewards/margins": 0.8922847509384155, + "rewards/rejected": -15.044398307800293, + "step": 805 + }, + { + "epoch": 0.02730122349927534, + "grad_norm": 35.19956588745117, + "learning_rate": 2.730030333670374e-07, + "logits/chosen": -0.5496922731399536, + "logits/rejected": -0.417860746383667, + "logps/chosen": -1.7335258722305298, + "logps/rejected": -1.7158492803573608, + "loss": 3.4416, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.33525848388672, + "rewards/margins": -0.176764577627182, + "rewards/rejected": -17.158493041992188, + "step": 810 + }, + { + "epoch": 0.02746974957025852, + "grad_norm": 21.941349029541016, + "learning_rate": 2.7468823727671046e-07, + "logits/chosen": -0.3967406749725342, + "logits/rejected": -0.5216356515884399, + "logps/chosen": -1.9088962078094482, + "logps/rejected": -1.7400329113006592, + "loss": 4.7099, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -19.08896255493164, + "rewards/margins": -1.6886341571807861, + "rewards/rejected": -17.400327682495117, + "step": 815 + }, + { + "epoch": 0.0276382756412417, + "grad_norm": 17.868234634399414, + "learning_rate": 2.763734411863836e-07, + "logits/chosen": -0.37519901990890503, + "logits/rejected": -0.4539732336997986, + "logps/chosen": -1.8600950241088867, + "logps/rejected": -1.8098268508911133, + "loss": 3.6029, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.600950241088867, + "rewards/margins": -0.5026828050613403, + "rewards/rejected": -18.098268508911133, + "step": 820 + }, + { + "epoch": 0.027806801712224882, + "grad_norm": 19.673343658447266, + "learning_rate": 2.780586450960566e-07, + "logits/chosen": -0.7934955954551697, + "logits/rejected": -0.8090070486068726, + "logps/chosen": -1.6202653646469116, + "logps/rejected": -1.5587866306304932, + "loss": 3.6798, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.202655792236328, + "rewards/margins": -0.6147867441177368, + "rewards/rejected": -15.587865829467773, + "step": 825 + }, + { + "epoch": 0.027975327783208062, + "grad_norm": 30.37026023864746, + "learning_rate": 2.797438490057297e-07, + "logits/chosen": -0.5915518999099731, + "logits/rejected": -0.6104450225830078, + "logps/chosen": -1.6193252801895142, + "logps/rejected": -1.656368613243103, + "loss": 3.0309, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.19325065612793, + "rewards/margins": 0.3704357147216797, + "rewards/rejected": -16.56368637084961, + "step": 830 + }, + { + "epoch": 0.028143853854191242, + "grad_norm": 28.94577980041504, + "learning_rate": 2.8142905291540275e-07, + "logits/chosen": -0.3376855254173279, + "logits/rejected": -0.36098232865333557, + "logps/chosen": -1.6270509958267212, + "logps/rejected": -1.7100646495819092, + "loss": 2.4789, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.270511627197266, + "rewards/margins": 0.8301382064819336, + "rewards/rejected": -17.10064697265625, + "step": 835 + }, + { + "epoch": 0.028312379925174425, + "grad_norm": 23.16632652282715, + "learning_rate": 2.831142568250758e-07, + "logits/chosen": -0.10355281829833984, + "logits/rejected": -0.10854457318782806, + "logps/chosen": -1.6649757623672485, + "logps/rejected": -2.3399498462677, + "loss": 1.6261, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.649757385253906, + "rewards/margins": 6.7497406005859375, + "rewards/rejected": -23.39949607849121, + "step": 840 + }, + { + "epoch": 0.028480905996157605, + "grad_norm": 31.516361236572266, + "learning_rate": 2.8479946073474884e-07, + "logits/chosen": -0.29107311367988586, + "logits/rejected": -0.15227220952510834, + "logps/chosen": -1.724825143814087, + "logps/rejected": -1.6380412578582764, + "loss": 4.0866, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.24825096130371, + "rewards/margins": -0.8678376078605652, + "rewards/rejected": -16.380413055419922, + "step": 845 + }, + { + "epoch": 0.028649432067140785, + "grad_norm": 21.751916885375977, + "learning_rate": 2.8648466464442196e-07, + "logits/chosen": -0.3685319125652313, + "logits/rejected": -0.47823366522789, + "logps/chosen": -1.6271034479141235, + "logps/rejected": -2.1975488662719727, + "loss": 2.3549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.271034240722656, + "rewards/margins": 5.7044548988342285, + "rewards/rejected": -21.97549057006836, + "step": 850 + }, + { + "epoch": 0.02881795813812397, + "grad_norm": 18.084596633911133, + "learning_rate": 2.8816986855409504e-07, + "logits/chosen": -0.28694620728492737, + "logits/rejected": -0.3606267273426056, + "logps/chosen": -1.6423877477645874, + "logps/rejected": -1.6014961004257202, + "loss": 3.7834, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.42387580871582, + "rewards/margins": -0.4089130461215973, + "rewards/rejected": -16.014963150024414, + "step": 855 + }, + { + "epoch": 0.028986484209107148, + "grad_norm": 18.191057205200195, + "learning_rate": 2.898550724637681e-07, + "logits/chosen": -0.40083274245262146, + "logits/rejected": -0.41338223218917847, + "logps/chosen": -1.865255355834961, + "logps/rejected": -2.0431108474731445, + "loss": 1.8863, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.652551651000977, + "rewards/margins": 1.7785552740097046, + "rewards/rejected": -20.431108474731445, + "step": 860 + }, + { + "epoch": 0.02915501028009033, + "grad_norm": 36.193058013916016, + "learning_rate": 2.915402763734412e-07, + "logits/chosen": -0.34367144107818604, + "logits/rejected": -0.35139140486717224, + "logps/chosen": -1.5019428730010986, + "logps/rejected": -1.6563999652862549, + "loss": 2.3754, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.019430160522461, + "rewards/margins": 1.544569969177246, + "rewards/rejected": -16.56399917602539, + "step": 865 + }, + { + "epoch": 0.02932353635107351, + "grad_norm": 29.971036911010742, + "learning_rate": 2.932254802831142e-07, + "logits/chosen": -0.4394436776638031, + "logits/rejected": -0.489290326833725, + "logps/chosen": -1.6216356754302979, + "logps/rejected": -1.5399792194366455, + "loss": 3.8889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.21635627746582, + "rewards/margins": -0.8165642619132996, + "rewards/rejected": -15.39979076385498, + "step": 870 + }, + { + "epoch": 0.02949206242205669, + "grad_norm": 29.058055877685547, + "learning_rate": 2.949106841927873e-07, + "logits/chosen": -0.6171309351921082, + "logits/rejected": -0.5241347551345825, + "logps/chosen": -1.6407206058502197, + "logps/rejected": -1.7146087884902954, + "loss": 2.4908, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.407207489013672, + "rewards/margins": 0.7388814091682434, + "rewards/rejected": -17.146087646484375, + "step": 875 + }, + { + "epoch": 0.029660588493039874, + "grad_norm": 36.95437240600586, + "learning_rate": 2.965958881024604e-07, + "logits/chosen": -0.3627752661705017, + "logits/rejected": -0.3334648907184601, + "logps/chosen": -1.6400692462921143, + "logps/rejected": -1.598304033279419, + "loss": 3.5085, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.400691986083984, + "rewards/margins": -0.417651891708374, + "rewards/rejected": -15.983039855957031, + "step": 880 + }, + { + "epoch": 0.029829114564023054, + "grad_norm": 30.360219955444336, + "learning_rate": 2.9828109201213347e-07, + "logits/chosen": -0.4603070616722107, + "logits/rejected": -0.37161141633987427, + "logps/chosen": -1.752171277999878, + "logps/rejected": -1.8547074794769287, + "loss": 2.3261, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.521711349487305, + "rewards/margins": 1.0253627300262451, + "rewards/rejected": -18.547075271606445, + "step": 885 + }, + { + "epoch": 0.029997640635006234, + "grad_norm": 24.124099731445312, + "learning_rate": 2.999662959218065e-07, + "logits/chosen": -0.32377538084983826, + "logits/rejected": -0.4409562051296234, + "logps/chosen": -1.6169246435165405, + "logps/rejected": -1.5861228704452515, + "loss": 3.446, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.169246673583984, + "rewards/margins": -0.30801552534103394, + "rewards/rejected": -15.861230850219727, + "step": 890 + }, + { + "epoch": 0.030166166705989417, + "grad_norm": 22.953449249267578, + "learning_rate": 3.0165149983147956e-07, + "logits/chosen": -0.6455360651016235, + "logits/rejected": -0.6651984453201294, + "logps/chosen": -1.7256028652191162, + "logps/rejected": -1.8558244705200195, + "loss": 2.8954, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.256031036376953, + "rewards/margins": 1.302215337753296, + "rewards/rejected": -18.558244705200195, + "step": 895 + }, + { + "epoch": 0.030334692776972597, + "grad_norm": 22.631338119506836, + "learning_rate": 3.033367037411527e-07, + "logits/chosen": -0.47631579637527466, + "logits/rejected": -0.5697728395462036, + "logps/chosen": -1.7963521480560303, + "logps/rejected": -1.802026391029358, + "loss": 3.335, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.963520050048828, + "rewards/margins": 0.05674257129430771, + "rewards/rejected": -18.020263671875, + "step": 900 + }, + { + "epoch": 0.030503218847955777, + "grad_norm": 13.581130981445312, + "learning_rate": 3.0502190765082576e-07, + "logits/chosen": -0.5964738726615906, + "logits/rejected": -0.6222713589668274, + "logps/chosen": -1.6977436542510986, + "logps/rejected": -1.6556882858276367, + "loss": 3.6774, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.977436065673828, + "rewards/margins": -0.42055463790893555, + "rewards/rejected": -16.556880950927734, + "step": 905 + }, + { + "epoch": 0.03067174491893896, + "grad_norm": 24.475252151489258, + "learning_rate": 3.0670711156049883e-07, + "logits/chosen": -0.2089192122220993, + "logits/rejected": -0.14673969149589539, + "logps/chosen": -1.7369495630264282, + "logps/rejected": -1.7116025686264038, + "loss": 3.3661, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.369495391845703, + "rewards/margins": -0.253470242023468, + "rewards/rejected": -17.116024017333984, + "step": 910 + }, + { + "epoch": 0.03084027098992214, + "grad_norm": 15.064949989318848, + "learning_rate": 3.0839231547017185e-07, + "logits/chosen": -0.41629713773727417, + "logits/rejected": -0.28385329246520996, + "logps/chosen": -1.7120001316070557, + "logps/rejected": -1.910131812095642, + "loss": 2.2545, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.1200008392334, + "rewards/margins": 1.9813172817230225, + "rewards/rejected": -19.101320266723633, + "step": 915 + }, + { + "epoch": 0.031008797060905324, + "grad_norm": 18.73622703552246, + "learning_rate": 3.100775193798449e-07, + "logits/chosen": -0.5003215074539185, + "logits/rejected": -0.4684736132621765, + "logps/chosen": -1.7935521602630615, + "logps/rejected": -2.2938485145568848, + "loss": 2.5871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.935522079467773, + "rewards/margins": 5.002964973449707, + "rewards/rejected": -22.938486099243164, + "step": 920 + }, + { + "epoch": 0.031177323131888503, + "grad_norm": 23.802066802978516, + "learning_rate": 3.1176272328951804e-07, + "logits/chosen": -0.6484017968177795, + "logits/rejected": -0.8259338140487671, + "logps/chosen": -1.4363877773284912, + "logps/rejected": -1.5680042505264282, + "loss": 2.2103, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.36387825012207, + "rewards/margins": 1.316164255142212, + "rewards/rejected": -15.680041313171387, + "step": 925 + }, + { + "epoch": 0.03134584920287169, + "grad_norm": 20.22801399230957, + "learning_rate": 3.134479271991911e-07, + "logits/chosen": -0.6938272714614868, + "logits/rejected": -0.5847961902618408, + "logps/chosen": -1.6226203441619873, + "logps/rejected": -1.6188541650772095, + "loss": 3.099, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.2262020111084, + "rewards/margins": -0.03766317293047905, + "rewards/rejected": -16.188541412353516, + "step": 930 + }, + { + "epoch": 0.03151437527385487, + "grad_norm": 36.12754821777344, + "learning_rate": 3.1513313110886413e-07, + "logits/chosen": -0.3781919777393341, + "logits/rejected": -0.21472349762916565, + "logps/chosen": -1.8906118869781494, + "logps/rejected": -1.7393276691436768, + "loss": 4.7426, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.906116485595703, + "rewards/margins": -1.512840986251831, + "rewards/rejected": -17.39327621459961, + "step": 935 + }, + { + "epoch": 0.031682901344838046, + "grad_norm": 169.67495727539062, + "learning_rate": 3.168183350185372e-07, + "logits/chosen": -0.3850443661212921, + "logits/rejected": -0.31055304408073425, + "logps/chosen": -2.1397993564605713, + "logps/rejected": -2.0641467571258545, + "loss": 4.6265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.397991180419922, + "rewards/margins": -0.7565252184867859, + "rewards/rejected": -20.641468048095703, + "step": 940 + }, + { + "epoch": 0.031851427415821226, + "grad_norm": 51.70778274536133, + "learning_rate": 3.185035389282103e-07, + "logits/chosen": -0.5556879043579102, + "logits/rejected": -0.6957587599754333, + "logps/chosen": -1.6177966594696045, + "logps/rejected": -1.445115327835083, + "loss": 4.8937, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.177967071533203, + "rewards/margins": -1.7268126010894775, + "rewards/rejected": -14.451153755187988, + "step": 945 + }, + { + "epoch": 0.032019953486804406, + "grad_norm": 41.51219940185547, + "learning_rate": 3.201887428378834e-07, + "logits/chosen": -0.553308367729187, + "logits/rejected": -0.2694825530052185, + "logps/chosen": -1.7990858554840088, + "logps/rejected": -1.7950258255004883, + "loss": 3.4404, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.990856170654297, + "rewards/margins": -0.04059934616088867, + "rewards/rejected": -17.950258255004883, + "step": 950 + }, + { + "epoch": 0.03218847955778759, + "grad_norm": 39.060516357421875, + "learning_rate": 3.218739467475564e-07, + "logits/chosen": -0.21345441043376923, + "logits/rejected": -0.2154117077589035, + "logps/chosen": -2.0603363513946533, + "logps/rejected": -1.9187877178192139, + "loss": 4.559, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.603363037109375, + "rewards/margins": -1.4154869318008423, + "rewards/rejected": -19.187877655029297, + "step": 955 + }, + { + "epoch": 0.03235700562877077, + "grad_norm": 31.9559268951416, + "learning_rate": 3.235591506572295e-07, + "logits/chosen": -0.5925837755203247, + "logits/rejected": -0.4998060166835785, + "logps/chosen": -1.7358916997909546, + "logps/rejected": -1.7558130025863647, + "loss": 3.0011, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.358917236328125, + "rewards/margins": 0.19921168684959412, + "rewards/rejected": -17.558130264282227, + "step": 960 + }, + { + "epoch": 0.03252553169975395, + "grad_norm": 22.82416534423828, + "learning_rate": 3.2524435456690257e-07, + "logits/chosen": -0.28516143560409546, + "logits/rejected": -0.2573995888233185, + "logps/chosen": -1.8022711277008057, + "logps/rejected": -1.7835382223129272, + "loss": 3.5213, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.022708892822266, + "rewards/margins": -0.18732872605323792, + "rewards/rejected": -17.83538246154785, + "step": 965 + }, + { + "epoch": 0.03269405777073713, + "grad_norm": 28.213504791259766, + "learning_rate": 3.2692955847657564e-07, + "logits/chosen": -0.5727102756500244, + "logits/rejected": -0.5985099077224731, + "logps/chosen": -1.98666512966156, + "logps/rejected": -1.7860243320465088, + "loss": 5.0562, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.86665153503418, + "rewards/margins": -2.00640869140625, + "rewards/rejected": -17.86024284362793, + "step": 970 + }, + { + "epoch": 0.03286258384172031, + "grad_norm": 20.811389923095703, + "learning_rate": 3.2861476238624876e-07, + "logits/chosen": -0.3460441827774048, + "logits/rejected": -0.4190608561038971, + "logps/chosen": -1.6150789260864258, + "logps/rejected": -1.7487542629241943, + "loss": 2.3329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.150787353515625, + "rewards/margins": 1.3367559909820557, + "rewards/rejected": -17.487545013427734, + "step": 975 + }, + { + "epoch": 0.03303110991270349, + "grad_norm": 27.982589721679688, + "learning_rate": 3.302999662959218e-07, + "logits/chosen": -0.6669625043869019, + "logits/rejected": -0.5034026503562927, + "logps/chosen": -1.5034055709838867, + "logps/rejected": -1.5552198886871338, + "loss": 2.7122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.034055709838867, + "rewards/margins": 0.518143892288208, + "rewards/rejected": -15.55219841003418, + "step": 980 + }, + { + "epoch": 0.03319963598368668, + "grad_norm": 19.364490509033203, + "learning_rate": 3.3198517020559485e-07, + "logits/chosen": -0.3645666539669037, + "logits/rejected": -0.3036833703517914, + "logps/chosen": -1.6654491424560547, + "logps/rejected": -1.8591690063476562, + "loss": 2.1034, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.654491424560547, + "rewards/margins": 1.9371992349624634, + "rewards/rejected": -18.591690063476562, + "step": 985 + }, + { + "epoch": 0.03336816205466986, + "grad_norm": 16.458690643310547, + "learning_rate": 3.336703741152679e-07, + "logits/chosen": -0.6402947306632996, + "logits/rejected": -0.7181426286697388, + "logps/chosen": -1.6192500591278076, + "logps/rejected": -1.620171308517456, + "loss": 3.1025, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.192501068115234, + "rewards/margins": 0.009212017059326172, + "rewards/rejected": -16.20171356201172, + "step": 990 + }, + { + "epoch": 0.03353668812565304, + "grad_norm": 22.21959686279297, + "learning_rate": 3.35355578024941e-07, + "logits/chosen": -0.4840850830078125, + "logits/rejected": -0.41178077459335327, + "logps/chosen": -1.736339807510376, + "logps/rejected": -1.763511300086975, + "loss": 2.9488, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.3633975982666, + "rewards/margins": 0.27171534299850464, + "rewards/rejected": -17.635112762451172, + "step": 995 + }, + { + "epoch": 0.03370521419663622, + "grad_norm": 23.295127868652344, + "learning_rate": 3.3704078193461407e-07, + "logits/chosen": -0.09787406027317047, + "logits/rejected": -0.16445288062095642, + "logps/chosen": -1.9367231130599976, + "logps/rejected": -1.9507242441177368, + "loss": 3.0651, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.367233276367188, + "rewards/margins": 0.14000901579856873, + "rewards/rejected": -19.50724220275879, + "step": 1000 + }, + { + "epoch": 0.0338737402676194, + "grad_norm": 15.762697219848633, + "learning_rate": 3.3872598584428714e-07, + "logits/chosen": -0.35486823320388794, + "logits/rejected": -0.24091534316539764, + "logps/chosen": -1.7885253429412842, + "logps/rejected": -1.9451637268066406, + "loss": 2.5787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.885255813598633, + "rewards/margins": 1.5663820505142212, + "rewards/rejected": -19.451635360717773, + "step": 1005 + }, + { + "epoch": 0.034042266338602585, + "grad_norm": 20.35264778137207, + "learning_rate": 3.404111897539602e-07, + "logits/chosen": -0.7031236886978149, + "logits/rejected": -0.7049790620803833, + "logps/chosen": -1.589521884918213, + "logps/rejected": -1.5903050899505615, + "loss": 3.4214, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.895219802856445, + "rewards/margins": 0.007831478491425514, + "rewards/rejected": -15.903048515319824, + "step": 1010 + }, + { + "epoch": 0.034210792409585765, + "grad_norm": 26.860595703125, + "learning_rate": 3.420963936636333e-07, + "logits/chosen": -0.5545657873153687, + "logits/rejected": -0.47453317046165466, + "logps/chosen": -2.0342624187469482, + "logps/rejected": -2.24369478225708, + "loss": 2.2526, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.34262466430664, + "rewards/margins": 2.0943217277526855, + "rewards/rejected": -22.436946868896484, + "step": 1015 + }, + { + "epoch": 0.034379318480568945, + "grad_norm": 19.55118751525879, + "learning_rate": 3.4378159757330636e-07, + "logits/chosen": -0.237474724650383, + "logits/rejected": -0.09069846570491791, + "logps/chosen": -1.5036439895629883, + "logps/rejected": -1.4696455001831055, + "loss": 3.5962, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.0364408493042, + "rewards/margins": -0.3399861454963684, + "rewards/rejected": -14.696454048156738, + "step": 1020 + }, + { + "epoch": 0.034547844551552125, + "grad_norm": 24.427839279174805, + "learning_rate": 3.4546680148297943e-07, + "logits/chosen": -0.029159266501665115, + "logits/rejected": 0.004416605923324823, + "logps/chosen": -1.6479896306991577, + "logps/rejected": -1.6794894933700562, + "loss": 2.8159, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.479896545410156, + "rewards/margins": 0.3149985373020172, + "rewards/rejected": -16.79489517211914, + "step": 1025 + }, + { + "epoch": 0.034716370622535304, + "grad_norm": 35.8406867980957, + "learning_rate": 3.471520053926525e-07, + "logits/chosen": -0.5084778666496277, + "logits/rejected": -0.47411975264549255, + "logps/chosen": -1.8330333232879639, + "logps/rejected": -1.8400707244873047, + "loss": 3.1579, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.330333709716797, + "rewards/margins": 0.07037486881017685, + "rewards/rejected": -18.400707244873047, + "step": 1030 + }, + { + "epoch": 0.034884896693518484, + "grad_norm": 12.815260887145996, + "learning_rate": 3.4883720930232557e-07, + "logits/chosen": -0.44474729895591736, + "logits/rejected": -0.5331543684005737, + "logps/chosen": -1.9221336841583252, + "logps/rejected": -2.0161516666412354, + "loss": 2.7604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.221338272094727, + "rewards/margins": 0.9401793479919434, + "rewards/rejected": -20.161518096923828, + "step": 1035 + }, + { + "epoch": 0.03505342276450167, + "grad_norm": 19.394817352294922, + "learning_rate": 3.5052241321199864e-07, + "logits/chosen": -0.2036978304386139, + "logits/rejected": -0.1269582062959671, + "logps/chosen": -2.039153814315796, + "logps/rejected": -1.904292345046997, + "loss": 4.7773, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.391536712646484, + "rewards/margins": -1.3486132621765137, + "rewards/rejected": -19.042922973632812, + "step": 1040 + }, + { + "epoch": 0.03522194883548485, + "grad_norm": 31.41037368774414, + "learning_rate": 3.5220761712167166e-07, + "logits/chosen": -0.42470335960388184, + "logits/rejected": -0.5246855616569519, + "logps/chosen": -2.123039722442627, + "logps/rejected": -2.208981990814209, + "loss": 3.0471, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.230396270751953, + "rewards/margins": 0.8594244122505188, + "rewards/rejected": -22.089818954467773, + "step": 1045 + }, + { + "epoch": 0.03539047490646803, + "grad_norm": 16.57583236694336, + "learning_rate": 3.538928210313448e-07, + "logits/chosen": -0.26511508226394653, + "logits/rejected": -0.28311991691589355, + "logps/chosen": -1.609574317932129, + "logps/rejected": -1.6120364665985107, + "loss": 3.1126, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.09574317932129, + "rewards/margins": 0.02462158165872097, + "rewards/rejected": -16.120365142822266, + "step": 1050 + }, + { + "epoch": 0.03555900097745121, + "grad_norm": 19.813968658447266, + "learning_rate": 3.5557802494101786e-07, + "logits/chosen": -0.7508059144020081, + "logits/rejected": -0.8045485615730286, + "logps/chosen": -1.5364524126052856, + "logps/rejected": -1.5765130519866943, + "loss": 3.1092, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.364524841308594, + "rewards/margins": 0.40060538053512573, + "rewards/rejected": -15.765130996704102, + "step": 1055 + }, + { + "epoch": 0.03572752704843439, + "grad_norm": 25.874378204345703, + "learning_rate": 3.5726322885069093e-07, + "logits/chosen": -0.4468226432800293, + "logits/rejected": -0.39583808183670044, + "logps/chosen": -1.6582987308502197, + "logps/rejected": -1.787592887878418, + "loss": 2.8136, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.58298683166504, + "rewards/margins": 1.2929418087005615, + "rewards/rejected": -17.875926971435547, + "step": 1060 + }, + { + "epoch": 0.03589605311941758, + "grad_norm": 23.56908416748047, + "learning_rate": 3.5894843276036395e-07, + "logits/chosen": -0.50593101978302, + "logits/rejected": -0.5206912755966187, + "logps/chosen": -1.942335844039917, + "logps/rejected": -1.8451474905014038, + "loss": 4.047, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.423358917236328, + "rewards/margins": -0.9718831181526184, + "rewards/rejected": -18.451473236083984, + "step": 1065 + }, + { + "epoch": 0.03606457919040076, + "grad_norm": 25.075082778930664, + "learning_rate": 3.60633636670037e-07, + "logits/chosen": -0.34409254789352417, + "logits/rejected": -0.5635167956352234, + "logps/chosen": -1.6613903045654297, + "logps/rejected": -1.6975667476654053, + "loss": 2.8626, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.613903045654297, + "rewards/margins": 0.3617649972438812, + "rewards/rejected": -16.97566795349121, + "step": 1070 + }, + { + "epoch": 0.03623310526138394, + "grad_norm": 24.480600357055664, + "learning_rate": 3.6231884057971015e-07, + "logits/chosen": -0.5511472821235657, + "logits/rejected": -0.5344496965408325, + "logps/chosen": -1.5300863981246948, + "logps/rejected": -1.600724458694458, + "loss": 2.7084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.300865173339844, + "rewards/margins": 0.7063789367675781, + "rewards/rejected": -16.007244110107422, + "step": 1075 + }, + { + "epoch": 0.03640163133236712, + "grad_norm": 26.33382797241211, + "learning_rate": 3.640040444893832e-07, + "logits/chosen": -0.22453565895557404, + "logits/rejected": -0.2891274094581604, + "logps/chosen": -1.901058554649353, + "logps/rejected": -1.7239421606063843, + "loss": 4.8905, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.01058578491211, + "rewards/margins": -1.7711639404296875, + "rewards/rejected": -17.239421844482422, + "step": 1080 + }, + { + "epoch": 0.0365701574033503, + "grad_norm": 25.77227210998535, + "learning_rate": 3.656892483990563e-07, + "logits/chosen": -0.506012499332428, + "logits/rejected": -0.5107889175415039, + "logps/chosen": -1.7310975790023804, + "logps/rejected": -1.6403017044067383, + "loss": 3.967, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.310977935791016, + "rewards/margins": -0.9079577326774597, + "rewards/rejected": -16.403018951416016, + "step": 1085 + }, + { + "epoch": 0.036738683474333476, + "grad_norm": 20.275495529174805, + "learning_rate": 3.673744523087293e-07, + "logits/chosen": -0.48249855637550354, + "logits/rejected": -0.4290506839752197, + "logps/chosen": -1.869666337966919, + "logps/rejected": -1.8338606357574463, + "loss": 3.5558, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.696664810180664, + "rewards/margins": -0.35805749893188477, + "rewards/rejected": -18.338603973388672, + "step": 1090 + }, + { + "epoch": 0.03690720954531666, + "grad_norm": 21.183048248291016, + "learning_rate": 3.690596562184024e-07, + "logits/chosen": -0.44681963324546814, + "logits/rejected": -0.3580966293811798, + "logps/chosen": -1.7726942300796509, + "logps/rejected": -1.85599684715271, + "loss": 2.7325, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.72694206237793, + "rewards/margins": 0.8330275416374207, + "rewards/rejected": -18.55997085571289, + "step": 1095 + }, + { + "epoch": 0.03707573561629984, + "grad_norm": 27.373397827148438, + "learning_rate": 3.707448601280755e-07, + "logits/chosen": -0.4769948422908783, + "logits/rejected": -0.33051571249961853, + "logps/chosen": -1.8934491872787476, + "logps/rejected": -1.8802127838134766, + "loss": 4.1084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.934494018554688, + "rewards/margins": -0.13236340880393982, + "rewards/rejected": -18.802127838134766, + "step": 1100 + }, + { + "epoch": 0.03724426168728302, + "grad_norm": 33.653560638427734, + "learning_rate": 3.724300640377486e-07, + "logits/chosen": -0.5165948867797852, + "logits/rejected": -0.4417875409126282, + "logps/chosen": -1.9007021188735962, + "logps/rejected": -1.9125080108642578, + "loss": 3.0777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.007020950317383, + "rewards/margins": 0.11805801093578339, + "rewards/rejected": -19.125080108642578, + "step": 1105 + }, + { + "epoch": 0.0374127877582662, + "grad_norm": 64.54642486572266, + "learning_rate": 3.741152679474216e-07, + "logits/chosen": -0.16624772548675537, + "logits/rejected": -0.4547352194786072, + "logps/chosen": -1.6494159698486328, + "logps/rejected": -1.7214076519012451, + "loss": 2.7371, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.494159698486328, + "rewards/margins": 0.7199157476425171, + "rewards/rejected": -17.21407699584961, + "step": 1110 + }, + { + "epoch": 0.03758131382924938, + "grad_norm": 24.6578369140625, + "learning_rate": 3.7580047185709467e-07, + "logits/chosen": -0.014136564917862415, + "logits/rejected": -0.16067324578762054, + "logps/chosen": -2.145573139190674, + "logps/rejected": -2.0913286209106445, + "loss": 3.6231, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.455732345581055, + "rewards/margins": -0.5424462556838989, + "rewards/rejected": -20.913288116455078, + "step": 1115 + }, + { + "epoch": 0.03774983990023257, + "grad_norm": 23.080047607421875, + "learning_rate": 3.7748567576676774e-07, + "logits/chosen": -0.7539829015731812, + "logits/rejected": -0.7695743441581726, + "logps/chosen": -1.6319653987884521, + "logps/rejected": -1.5968726873397827, + "loss": 3.553, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.31965446472168, + "rewards/margins": -0.35092657804489136, + "rewards/rejected": -15.968729019165039, + "step": 1120 + }, + { + "epoch": 0.03791836597121575, + "grad_norm": 43.620174407958984, + "learning_rate": 3.7917087967644087e-07, + "logits/chosen": -0.5959217548370361, + "logits/rejected": -0.5997222065925598, + "logps/chosen": -1.7780771255493164, + "logps/rejected": -1.747106909751892, + "loss": 3.3741, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.780773162841797, + "rewards/margins": -0.30970388650894165, + "rewards/rejected": -17.4710693359375, + "step": 1125 + }, + { + "epoch": 0.03808689204219893, + "grad_norm": 28.987207412719727, + "learning_rate": 3.8085608358611394e-07, + "logits/chosen": -0.10104711353778839, + "logits/rejected": -0.13577064871788025, + "logps/chosen": -2.0339839458465576, + "logps/rejected": -2.018089771270752, + "loss": 3.2938, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.339839935302734, + "rewards/margins": -0.15894070267677307, + "rewards/rejected": -20.180898666381836, + "step": 1130 + }, + { + "epoch": 0.03825541811318211, + "grad_norm": 27.349302291870117, + "learning_rate": 3.8254128749578696e-07, + "logits/chosen": -0.4686538279056549, + "logits/rejected": -0.33619728684425354, + "logps/chosen": -1.8348045349121094, + "logps/rejected": -1.895215630531311, + "loss": 2.9007, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.348045349121094, + "rewards/margins": 0.6041110754013062, + "rewards/rejected": -18.952157974243164, + "step": 1135 + }, + { + "epoch": 0.03842394418416529, + "grad_norm": 49.29142761230469, + "learning_rate": 3.8422649140546003e-07, + "logits/chosen": -0.6437035799026489, + "logits/rejected": -0.6373649835586548, + "logps/chosen": -1.6333469152450562, + "logps/rejected": -1.700933814048767, + "loss": 2.5222, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.33346939086914, + "rewards/margins": 0.6758671998977661, + "rewards/rejected": -17.009336471557617, + "step": 1140 + }, + { + "epoch": 0.03859247025514847, + "grad_norm": 27.445465087890625, + "learning_rate": 3.859116953151331e-07, + "logits/chosen": -0.03426782041788101, + "logits/rejected": -0.028365587815642357, + "logps/chosen": -2.383859395980835, + "logps/rejected": -2.457791805267334, + "loss": 3.2568, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.838594436645508, + "rewards/margins": 0.7393231391906738, + "rewards/rejected": -24.577917098999023, + "step": 1145 + }, + { + "epoch": 0.038760996326131655, + "grad_norm": 22.15361213684082, + "learning_rate": 3.8759689922480623e-07, + "logits/chosen": -0.561955988407135, + "logits/rejected": -0.6235911250114441, + "logps/chosen": -1.4886285066604614, + "logps/rejected": -1.5125375986099243, + "loss": 2.915, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.886285781860352, + "rewards/margins": 0.239091157913208, + "rewards/rejected": -15.12537670135498, + "step": 1150 + }, + { + "epoch": 0.038929522397114835, + "grad_norm": 19.871843338012695, + "learning_rate": 3.8928210313447925e-07, + "logits/chosen": -0.6531854867935181, + "logits/rejected": -0.6476200819015503, + "logps/chosen": -1.7495161294937134, + "logps/rejected": -1.8894214630126953, + "loss": 2.1886, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.495162963867188, + "rewards/margins": 1.399052381515503, + "rewards/rejected": -18.894214630126953, + "step": 1155 + }, + { + "epoch": 0.039098048468098015, + "grad_norm": 38.3976936340332, + "learning_rate": 3.909673070441523e-07, + "logits/chosen": -0.5150366425514221, + "logits/rejected": -0.512414813041687, + "logps/chosen": -1.4856387376785278, + "logps/rejected": -1.4582234621047974, + "loss": 3.3862, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.8563871383667, + "rewards/margins": -0.2741526663303375, + "rewards/rejected": -14.582234382629395, + "step": 1160 + }, + { + "epoch": 0.039266574539081195, + "grad_norm": 21.68328285217285, + "learning_rate": 3.926525109538254e-07, + "logits/chosen": -0.2962788939476013, + "logits/rejected": -0.34172508120536804, + "logps/chosen": -1.905922293663025, + "logps/rejected": -1.9026237726211548, + "loss": 3.358, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.059223175048828, + "rewards/margins": -0.03298645094037056, + "rewards/rejected": -19.02623748779297, + "step": 1165 + }, + { + "epoch": 0.039435100610064375, + "grad_norm": 24.6955623626709, + "learning_rate": 3.9433771486349846e-07, + "logits/chosen": -0.401714026927948, + "logits/rejected": -0.4989330768585205, + "logps/chosen": -1.7247121334075928, + "logps/rejected": -1.6620609760284424, + "loss": 3.7205, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.247121810913086, + "rewards/margins": -0.6265131235122681, + "rewards/rejected": -16.620609283447266, + "step": 1170 + }, + { + "epoch": 0.03960362668104756, + "grad_norm": 22.365659713745117, + "learning_rate": 3.9602291877317153e-07, + "logits/chosen": -0.5538057088851929, + "logits/rejected": -0.3844769597053528, + "logps/chosen": -1.7564191818237305, + "logps/rejected": -1.7836357355117798, + "loss": 2.8724, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.564189910888672, + "rewards/margins": 0.2721673548221588, + "rewards/rejected": -17.83635902404785, + "step": 1175 + }, + { + "epoch": 0.03977215275203074, + "grad_norm": 19.269901275634766, + "learning_rate": 3.977081226828446e-07, + "logits/chosen": -0.432064950466156, + "logits/rejected": -0.31601718068122864, + "logps/chosen": -1.9622234106063843, + "logps/rejected": -2.001495122909546, + "loss": 3.1216, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.622234344482422, + "rewards/margins": 0.3927184045314789, + "rewards/rejected": -20.01495361328125, + "step": 1180 + }, + { + "epoch": 0.03994067882301392, + "grad_norm": 25.248796463012695, + "learning_rate": 3.993933265925177e-07, + "logits/chosen": -0.47508859634399414, + "logits/rejected": -0.5017315149307251, + "logps/chosen": -1.7752971649169922, + "logps/rejected": -2.049936294555664, + "loss": 3.234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.752971649169922, + "rewards/margins": 2.7463924884796143, + "rewards/rejected": -20.49936294555664, + "step": 1185 + }, + { + "epoch": 0.0401092048939971, + "grad_norm": 20.06141471862793, + "learning_rate": 4.0107853050219075e-07, + "logits/chosen": -0.17705413699150085, + "logits/rejected": -0.25445953011512756, + "logps/chosen": -1.946390151977539, + "logps/rejected": -1.9433777332305908, + "loss": 3.1987, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.46390151977539, + "rewards/margins": -0.03012552298605442, + "rewards/rejected": -19.43377685546875, + "step": 1190 + }, + { + "epoch": 0.04027773096498028, + "grad_norm": 33.42091751098633, + "learning_rate": 4.027637344118638e-07, + "logits/chosen": -0.5977298021316528, + "logits/rejected": -0.6200628280639648, + "logps/chosen": -1.7825301885604858, + "logps/rejected": -1.7205543518066406, + "loss": 3.6795, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.825302124023438, + "rewards/margins": -0.6197582483291626, + "rewards/rejected": -17.20554542541504, + "step": 1195 + }, + { + "epoch": 0.04044625703596346, + "grad_norm": 18.302885055541992, + "learning_rate": 4.044489383215369e-07, + "logits/chosen": -0.6540176868438721, + "logits/rejected": -0.6137186884880066, + "logps/chosen": -1.7022262811660767, + "logps/rejected": -1.7463051080703735, + "loss": 2.8856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.02226448059082, + "rewards/margins": 0.4407869279384613, + "rewards/rejected": -17.463048934936523, + "step": 1200 + }, + { + "epoch": 0.04044625703596346, + "eval_logits/chosen": -0.6984472870826721, + "eval_logits/rejected": -0.6997342109680176, + "eval_logps/chosen": -1.656233549118042, + "eval_logps/rejected": -1.6515949964523315, + "eval_loss": 3.4398796558380127, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.562335968017578, + "eval_rewards/margins": -0.04638513922691345, + "eval_rewards/rejected": -16.51595115661621, + "eval_runtime": 12.9174, + "eval_samples_per_second": 7.741, + "eval_steps_per_second": 1.935, + "step": 1200 + }, + { + "epoch": 0.04061478310694665, + "grad_norm": 23.862524032592773, + "learning_rate": 4.0613414223120997e-07, + "logits/chosen": -0.5462231040000916, + "logits/rejected": -0.5446901917457581, + "logps/chosen": -1.6794078350067139, + "logps/rejected": -1.7836143970489502, + "loss": 2.4901, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.794078826904297, + "rewards/margins": 1.0420640707015991, + "rewards/rejected": -17.83614158630371, + "step": 1205 + }, + { + "epoch": 0.04078330917792983, + "grad_norm": 18.685237884521484, + "learning_rate": 4.0781934614088304e-07, + "logits/chosen": -0.36270827054977417, + "logits/rejected": -0.520888090133667, + "logps/chosen": -1.7202644348144531, + "logps/rejected": -1.7153618335723877, + "loss": 3.3016, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.2026424407959, + "rewards/margins": -0.049027156084775925, + "rewards/rejected": -17.15361785888672, + "step": 1210 + }, + { + "epoch": 0.04095183524891301, + "grad_norm": 27.324533462524414, + "learning_rate": 4.095045500505561e-07, + "logits/chosen": -0.521334707736969, + "logits/rejected": -0.5759360194206238, + "logps/chosen": -1.778804063796997, + "logps/rejected": -1.7410389184951782, + "loss": 3.4761, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.788042068481445, + "rewards/margins": -0.37765082716941833, + "rewards/rejected": -17.410388946533203, + "step": 1215 + }, + { + "epoch": 0.04112036131989619, + "grad_norm": 13.48217487335205, + "learning_rate": 4.1118975396022913e-07, + "logits/chosen": 0.02969430759549141, + "logits/rejected": -0.014116739854216576, + "logps/chosen": -1.8857414722442627, + "logps/rejected": -1.967960000038147, + "loss": 3.302, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.8574161529541, + "rewards/margins": 0.8221861720085144, + "rewards/rejected": -19.679601669311523, + "step": 1220 + }, + { + "epoch": 0.04128888739087937, + "grad_norm": 58.38434600830078, + "learning_rate": 4.1287495786990225e-07, + "logits/chosen": -0.4893341660499573, + "logits/rejected": -0.4025818705558777, + "logps/chosen": -1.5990569591522217, + "logps/rejected": -1.6483962535858154, + "loss": 2.7695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.990570068359375, + "rewards/margins": 0.4933937191963196, + "rewards/rejected": -16.483963012695312, + "step": 1225 + }, + { + "epoch": 0.04145741346186255, + "grad_norm": 53.46418380737305, + "learning_rate": 4.145601617795753e-07, + "logits/chosen": -0.5396580696105957, + "logits/rejected": -0.6176207661628723, + "logps/chosen": -1.6050565242767334, + "logps/rejected": -1.5374106168746948, + "loss": 3.7522, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.05056381225586, + "rewards/margins": -0.6764588356018066, + "rewards/rejected": -15.374105453491211, + "step": 1230 + }, + { + "epoch": 0.041625939532845734, + "grad_norm": 23.273683547973633, + "learning_rate": 4.162453656892484e-07, + "logits/chosen": -0.517733097076416, + "logits/rejected": -0.46530431509017944, + "logps/chosen": -1.717034101486206, + "logps/rejected": -1.7212120294570923, + "loss": 3.0328, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.17034149169922, + "rewards/margins": 0.041781235486269, + "rewards/rejected": -17.21212387084961, + "step": 1235 + }, + { + "epoch": 0.04179446560382891, + "grad_norm": 32.8110237121582, + "learning_rate": 4.179305695989214e-07, + "logits/chosen": -0.5621334314346313, + "logits/rejected": -0.4572630524635315, + "logps/chosen": -1.8699407577514648, + "logps/rejected": -1.951944351196289, + "loss": 2.7422, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.699405670166016, + "rewards/margins": 0.8200358152389526, + "rewards/rejected": -19.519441604614258, + "step": 1240 + }, + { + "epoch": 0.04196299167481209, + "grad_norm": 21.079713821411133, + "learning_rate": 4.196157735085945e-07, + "logits/chosen": -0.32299450039863586, + "logits/rejected": -0.2742319703102112, + "logps/chosen": -2.0296037197113037, + "logps/rejected": -1.9701045751571655, + "loss": 3.8158, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.296039581298828, + "rewards/margins": -0.5949932932853699, + "rewards/rejected": -19.701045989990234, + "step": 1245 + }, + { + "epoch": 0.04213151774579527, + "grad_norm": 26.673555374145508, + "learning_rate": 4.213009774182676e-07, + "logits/chosen": -0.41463375091552734, + "logits/rejected": -0.4377075135707855, + "logps/chosen": -1.6575113534927368, + "logps/rejected": -1.534987449645996, + "loss": 4.3934, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.57511329650879, + "rewards/margins": -1.2252376079559326, + "rewards/rejected": -15.349874496459961, + "step": 1250 + }, + { + "epoch": 0.04230004381677845, + "grad_norm": 17.89075469970703, + "learning_rate": 4.229861813279407e-07, + "logits/chosen": -0.6064554452896118, + "logits/rejected": -0.6212650537490845, + "logps/chosen": -1.851464867591858, + "logps/rejected": -1.6988131999969482, + "loss": 4.5567, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.514650344848633, + "rewards/margins": -1.526517391204834, + "rewards/rejected": -16.98813247680664, + "step": 1255 + }, + { + "epoch": 0.04246856988776164, + "grad_norm": 33.90618896484375, + "learning_rate": 4.2467138523761376e-07, + "logits/chosen": -0.2495536506175995, + "logits/rejected": -0.25900721549987793, + "logps/chosen": -1.6897900104522705, + "logps/rejected": -1.7224485874176025, + "loss": 3.0004, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.897899627685547, + "rewards/margins": 0.3265857696533203, + "rewards/rejected": -17.224485397338867, + "step": 1260 + }, + { + "epoch": 0.04263709595874482, + "grad_norm": 15.698116302490234, + "learning_rate": 4.263565891472868e-07, + "logits/chosen": -0.5363945364952087, + "logits/rejected": -0.5787938237190247, + "logps/chosen": -1.9835312366485596, + "logps/rejected": -1.9302680492401123, + "loss": 3.981, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.835309982299805, + "rewards/margins": -0.5326288342475891, + "rewards/rejected": -19.302682876586914, + "step": 1265 + }, + { + "epoch": 0.042805622029728, + "grad_norm": 25.389694213867188, + "learning_rate": 4.2804179305695985e-07, + "logits/chosen": -0.29593202471733093, + "logits/rejected": -0.221342995762825, + "logps/chosen": -1.9105651378631592, + "logps/rejected": -1.9219735860824585, + "loss": 3.4243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.105648040771484, + "rewards/margins": 0.1140863448381424, + "rewards/rejected": -19.21973419189453, + "step": 1270 + }, + { + "epoch": 0.04297414810071118, + "grad_norm": 20.01345443725586, + "learning_rate": 4.2972699696663297e-07, + "logits/chosen": -0.6382346153259277, + "logits/rejected": -0.5160123705863953, + "logps/chosen": -1.6651241779327393, + "logps/rejected": -1.7844934463500977, + "loss": 2.8522, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.651241302490234, + "rewards/margins": 1.193691372871399, + "rewards/rejected": -17.844934463500977, + "step": 1275 + }, + { + "epoch": 0.04314267417169436, + "grad_norm": 30.08690071105957, + "learning_rate": 4.3141220087630604e-07, + "logits/chosen": -0.8173457980155945, + "logits/rejected": -0.7372242212295532, + "logps/chosen": -1.5411250591278076, + "logps/rejected": -1.540610432624817, + "loss": 3.3094, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.41125202178955, + "rewards/margins": -0.005145835690200329, + "rewards/rejected": -15.406105041503906, + "step": 1280 + }, + { + "epoch": 0.04331120024267754, + "grad_norm": 24.03042221069336, + "learning_rate": 4.3309740478597906e-07, + "logits/chosen": 0.08485229313373566, + "logits/rejected": 0.023478638380765915, + "logps/chosen": -1.7684332132339478, + "logps/rejected": -1.9061079025268555, + "loss": 2.5292, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.6843318939209, + "rewards/margins": 1.3767480850219727, + "rewards/rejected": -19.061079025268555, + "step": 1285 + }, + { + "epoch": 0.043479726313660726, + "grad_norm": 0.18441037833690643, + "learning_rate": 4.3478260869565214e-07, + "logits/chosen": -0.6562118530273438, + "logits/rejected": -0.7641373872756958, + "logps/chosen": -1.5792770385742188, + "logps/rejected": -1.6253446340560913, + "loss": 3.1678, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.792770385742188, + "rewards/margins": 0.4606761932373047, + "rewards/rejected": -16.25344467163086, + "step": 1290 + }, + { + "epoch": 0.043648252384643905, + "grad_norm": 21.094350814819336, + "learning_rate": 4.364678126053252e-07, + "logits/chosen": -0.5818384289741516, + "logits/rejected": -0.6034550666809082, + "logps/chosen": -1.4294414520263672, + "logps/rejected": -1.4413141012191772, + "loss": 2.9525, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.294413566589355, + "rewards/margins": 0.11872673034667969, + "rewards/rejected": -14.413141250610352, + "step": 1295 + }, + { + "epoch": 0.043816778455627085, + "grad_norm": 9.890064239501953, + "learning_rate": 4.3815301651499833e-07, + "logits/chosen": -0.5558933019638062, + "logits/rejected": -0.4949572682380676, + "logps/chosen": -1.7029212713241577, + "logps/rejected": -1.8398288488388062, + "loss": 2.477, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.02921485900879, + "rewards/margins": 1.369077205657959, + "rewards/rejected": -18.398290634155273, + "step": 1300 + }, + { + "epoch": 0.043985304526610265, + "grad_norm": 24.280580520629883, + "learning_rate": 4.398382204246714e-07, + "logits/chosen": -0.4872204661369324, + "logits/rejected": -0.4315074384212494, + "logps/chosen": -1.6111892461776733, + "logps/rejected": -1.5915868282318115, + "loss": 3.3073, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.111894607543945, + "rewards/margins": -0.19602584838867188, + "rewards/rejected": -15.915868759155273, + "step": 1305 + }, + { + "epoch": 0.044153830597593445, + "grad_norm": 25.741281509399414, + "learning_rate": 4.415234243343444e-07, + "logits/chosen": -0.5934125781059265, + "logits/rejected": -0.7145117521286011, + "logps/chosen": -1.6144893169403076, + "logps/rejected": -1.635259985923767, + "loss": 2.9124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.144893646240234, + "rewards/margins": 0.20770521461963654, + "rewards/rejected": -16.35260009765625, + "step": 1310 + }, + { + "epoch": 0.04432235666857663, + "grad_norm": 38.52206039428711, + "learning_rate": 4.432086282440175e-07, + "logits/chosen": -0.493429958820343, + "logits/rejected": -0.44628891348838806, + "logps/chosen": -2.13350248336792, + "logps/rejected": -2.1447107791900635, + "loss": 3.1856, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.33502769470215, + "rewards/margins": 0.11208000034093857, + "rewards/rejected": -21.44710922241211, + "step": 1315 + }, + { + "epoch": 0.04449088273955981, + "grad_norm": 31.843582153320312, + "learning_rate": 4.4489383215369057e-07, + "logits/chosen": -0.5034832954406738, + "logits/rejected": -0.6175475716590881, + "logps/chosen": -1.6708362102508545, + "logps/rejected": -1.7281715869903564, + "loss": 2.6639, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.708362579345703, + "rewards/margins": 0.5733525156974792, + "rewards/rejected": -17.281715393066406, + "step": 1320 + }, + { + "epoch": 0.04465940881054299, + "grad_norm": 25.711498260498047, + "learning_rate": 4.465790360633637e-07, + "logits/chosen": -0.5433965921401978, + "logits/rejected": -0.5722803473472595, + "logps/chosen": -1.6870956420898438, + "logps/rejected": -1.6297203302383423, + "loss": 3.6554, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.870956420898438, + "rewards/margins": -0.5737524032592773, + "rewards/rejected": -16.297204971313477, + "step": 1325 + }, + { + "epoch": 0.04482793488152617, + "grad_norm": 148.03579711914062, + "learning_rate": 4.482642399730367e-07, + "logits/chosen": -0.4677700996398926, + "logits/rejected": -0.35929447412490845, + "logps/chosen": -1.8165819644927979, + "logps/rejected": -1.690076470375061, + "loss": 4.6109, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.16581916809082, + "rewards/margins": -1.26505446434021, + "rewards/rejected": -16.90076446533203, + "step": 1330 + }, + { + "epoch": 0.04499646095250935, + "grad_norm": 14.833139419555664, + "learning_rate": 4.499494438827098e-07, + "logits/chosen": -0.06567313522100449, + "logits/rejected": 0.02431054599583149, + "logps/chosen": -1.7260477542877197, + "logps/rejected": -1.953141450881958, + "loss": 1.5616, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.260478973388672, + "rewards/margins": 2.270934581756592, + "rewards/rejected": -19.531414031982422, + "step": 1335 + }, + { + "epoch": 0.04516498702349253, + "grad_norm": 31.50690460205078, + "learning_rate": 4.5163464779238286e-07, + "logits/chosen": -0.36952149868011475, + "logits/rejected": -0.3253156542778015, + "logps/chosen": -2.0293240547180176, + "logps/rejected": -1.9255653619766235, + "loss": 4.171, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.293243408203125, + "rewards/margins": -1.037587285041809, + "rewards/rejected": -19.255653381347656, + "step": 1340 + }, + { + "epoch": 0.04533351309447572, + "grad_norm": 25.02866554260254, + "learning_rate": 4.5331985170205593e-07, + "logits/chosen": -0.2764541804790497, + "logits/rejected": -0.4271532893180847, + "logps/chosen": -1.822595238685608, + "logps/rejected": -1.819495439529419, + "loss": 3.1072, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.2259521484375, + "rewards/margins": -0.03099794313311577, + "rewards/rejected": -18.19495391845703, + "step": 1345 + }, + { + "epoch": 0.0455020391654589, + "grad_norm": 73.92284393310547, + "learning_rate": 4.5500505561172895e-07, + "logits/chosen": -0.3259121775627136, + "logits/rejected": -0.3415904641151428, + "logps/chosen": -2.656127691268921, + "logps/rejected": -2.435351848602295, + "loss": 5.274, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.561279296875, + "rewards/margins": -2.20776104927063, + "rewards/rejected": -24.353517532348633, + "step": 1350 + }, + { + "epoch": 0.04567056523644208, + "grad_norm": 21.903491973876953, + "learning_rate": 4.5669025952140207e-07, + "logits/chosen": -0.5971258878707886, + "logits/rejected": -0.49006539583206177, + "logps/chosen": -1.649746298789978, + "logps/rejected": -1.5837621688842773, + "loss": 3.7002, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.497465133666992, + "rewards/margins": -0.6598426699638367, + "rewards/rejected": -15.837620735168457, + "step": 1355 + }, + { + "epoch": 0.04583909130742526, + "grad_norm": 23.044069290161133, + "learning_rate": 4.5837546343107514e-07, + "logits/chosen": -0.43621835112571716, + "logits/rejected": -0.32460182905197144, + "logps/chosen": -1.6573737859725952, + "logps/rejected": -1.6105083227157593, + "loss": 3.5635, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.5737361907959, + "rewards/margins": -0.46865320205688477, + "rewards/rejected": -16.105083465576172, + "step": 1360 + }, + { + "epoch": 0.04600761737840844, + "grad_norm": 26.895612716674805, + "learning_rate": 4.600606673407482e-07, + "logits/chosen": -0.6109335422515869, + "logits/rejected": -0.5404512286186218, + "logps/chosen": -1.8257896900177002, + "logps/rejected": -1.864885687828064, + "loss": 2.8298, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.257898330688477, + "rewards/margins": 0.39095717668533325, + "rewards/rejected": -18.648855209350586, + "step": 1365 + }, + { + "epoch": 0.046176143449391624, + "grad_norm": 29.45931625366211, + "learning_rate": 4.617458712504213e-07, + "logits/chosen": -0.6883367896080017, + "logits/rejected": -0.4843037724494934, + "logps/chosen": -1.4697327613830566, + "logps/rejected": -1.4948680400848389, + "loss": 3.034, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.69732666015625, + "rewards/margins": 0.25135332345962524, + "rewards/rejected": -14.94867992401123, + "step": 1370 + }, + { + "epoch": 0.046344669520374804, + "grad_norm": 26.856428146362305, + "learning_rate": 4.634310751600943e-07, + "logits/chosen": -0.1802286058664322, + "logits/rejected": -0.13944736123085022, + "logps/chosen": -1.8919188976287842, + "logps/rejected": -2.034350872039795, + "loss": 1.9954, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.919189453125, + "rewards/margins": 1.4243175983428955, + "rewards/rejected": -20.343509674072266, + "step": 1375 + }, + { + "epoch": 0.046513195591357984, + "grad_norm": 28.34373664855957, + "learning_rate": 4.6511627906976743e-07, + "logits/chosen": -0.4101434350013733, + "logits/rejected": -0.2598617672920227, + "logps/chosen": -1.6901744604110718, + "logps/rejected": -1.7066800594329834, + "loss": 3.0128, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.901744842529297, + "rewards/margins": 0.16505737602710724, + "rewards/rejected": -17.066801071166992, + "step": 1380 + }, + { + "epoch": 0.046681721662341163, + "grad_norm": 32.586944580078125, + "learning_rate": 4.668014829794405e-07, + "logits/chosen": -0.8744746446609497, + "logits/rejected": -0.8278564214706421, + "logps/chosen": -1.6448156833648682, + "logps/rejected": -1.6291271448135376, + "loss": 3.2346, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.448158264160156, + "rewards/margins": -0.1568845808506012, + "rewards/rejected": -16.291271209716797, + "step": 1385 + }, + { + "epoch": 0.04685024773332434, + "grad_norm": 57.97466278076172, + "learning_rate": 4.684866868891136e-07, + "logits/chosen": -0.4052852690219879, + "logits/rejected": -0.2951774001121521, + "logps/chosen": -1.8488044738769531, + "logps/rejected": -1.7702134847640991, + "loss": 3.8409, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.4880428314209, + "rewards/margins": -0.7859078645706177, + "rewards/rejected": -17.70213508605957, + "step": 1390 + }, + { + "epoch": 0.04701877380430752, + "grad_norm": 25.0904483795166, + "learning_rate": 4.701718907987866e-07, + "logits/chosen": -0.4210020899772644, + "logits/rejected": -0.41034144163131714, + "logps/chosen": -1.842725396156311, + "logps/rejected": -1.8793871402740479, + "loss": 3.0764, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.4272518157959, + "rewards/margins": 0.3666171133518219, + "rewards/rejected": -18.793869018554688, + "step": 1395 + }, + { + "epoch": 0.04718729987529071, + "grad_norm": 11.349648475646973, + "learning_rate": 4.7185709470845967e-07, + "logits/chosen": -0.46164339780807495, + "logits/rejected": -0.28323012590408325, + "logps/chosen": -2.108422040939331, + "logps/rejected": -2.2248573303222656, + "loss": 3.1837, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.084218978881836, + "rewards/margins": 1.1643527746200562, + "rewards/rejected": -22.248571395874023, + "step": 1400 + }, + { + "epoch": 0.04735582594627389, + "grad_norm": 24.339231491088867, + "learning_rate": 4.735422986181328e-07, + "logits/chosen": -0.7064308524131775, + "logits/rejected": -0.6853176951408386, + "logps/chosen": -1.5951899290084839, + "logps/rejected": -1.4471842050552368, + "loss": 4.5243, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.951898574829102, + "rewards/margins": -1.4800574779510498, + "rewards/rejected": -14.471841812133789, + "step": 1405 + }, + { + "epoch": 0.04752435201725707, + "grad_norm": 24.074268341064453, + "learning_rate": 4.7522750252780586e-07, + "logits/chosen": -0.5154116749763489, + "logits/rejected": -0.473574161529541, + "logps/chosen": -1.5804609060287476, + "logps/rejected": -1.4201406240463257, + "loss": 4.6391, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -15.804609298706055, + "rewards/margins": -1.60320246219635, + "rewards/rejected": -14.201406478881836, + "step": 1410 + }, + { + "epoch": 0.04769287808824025, + "grad_norm": 21.47679328918457, + "learning_rate": 4.769127064374789e-07, + "logits/chosen": -0.3036695420742035, + "logits/rejected": -0.40201884508132935, + "logps/chosen": -1.8438358306884766, + "logps/rejected": -1.8053996562957764, + "loss": 3.4989, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.438358306884766, + "rewards/margins": -0.38436126708984375, + "rewards/rejected": -18.053997039794922, + "step": 1415 + }, + { + "epoch": 0.04786140415922343, + "grad_norm": 28.6302490234375, + "learning_rate": 4.78597910347152e-07, + "logits/chosen": -0.28868794441223145, + "logits/rejected": -0.14128902554512024, + "logps/chosen": -1.936479926109314, + "logps/rejected": -1.8393840789794922, + "loss": 4.0585, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.364797592163086, + "rewards/margins": -0.9709571003913879, + "rewards/rejected": -18.393840789794922, + "step": 1420 + }, + { + "epoch": 0.048029930230206616, + "grad_norm": 30.24493980407715, + "learning_rate": 4.802831142568251e-07, + "logits/chosen": -0.2036716639995575, + "logits/rejected": -0.17143428325653076, + "logps/chosen": -1.869917631149292, + "logps/rejected": -1.8757200241088867, + "loss": 3.0552, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.69917869567871, + "rewards/margins": 0.058022309094667435, + "rewards/rejected": -18.757200241088867, + "step": 1425 + }, + { + "epoch": 0.048198456301189796, + "grad_norm": 30.839862823486328, + "learning_rate": 4.819683181664982e-07, + "logits/chosen": -0.43438920378685, + "logits/rejected": -0.5123583078384399, + "logps/chosen": -1.749098777770996, + "logps/rejected": -1.6257498264312744, + "loss": 4.7179, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.49098777770996, + "rewards/margins": -1.2334905862808228, + "rewards/rejected": -16.257495880126953, + "step": 1430 + }, + { + "epoch": 0.048366982372172976, + "grad_norm": 8.07622241973877, + "learning_rate": 4.836535220761712e-07, + "logits/chosen": -0.3356507420539856, + "logits/rejected": -0.2890998423099518, + "logps/chosen": -1.912122130393982, + "logps/rejected": -2.059704303741455, + "loss": 2.745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.1212215423584, + "rewards/margins": 1.4758189916610718, + "rewards/rejected": -20.5970401763916, + "step": 1435 + }, + { + "epoch": 0.048535508443156156, + "grad_norm": 20.30443000793457, + "learning_rate": 4.853387259858443e-07, + "logits/chosen": 0.08023197948932648, + "logits/rejected": -0.002189111663028598, + "logps/chosen": -1.799748420715332, + "logps/rejected": -1.933651328086853, + "loss": 2.7519, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.99748420715332, + "rewards/margins": 1.339029312133789, + "rewards/rejected": -19.336511611938477, + "step": 1440 + }, + { + "epoch": 0.048704034514139335, + "grad_norm": 30.887516021728516, + "learning_rate": 4.870239298955174e-07, + "logits/chosen": -0.39526060223579407, + "logits/rejected": -0.2602062225341797, + "logps/chosen": -1.5550906658172607, + "logps/rejected": -1.580798864364624, + "loss": 2.9367, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.550908088684082, + "rewards/margins": 0.2570803761482239, + "rewards/rejected": -15.807989120483398, + "step": 1445 + }, + { + "epoch": 0.048872560585122515, + "grad_norm": 17.38456153869629, + "learning_rate": 4.887091338051904e-07, + "logits/chosen": -0.3647865653038025, + "logits/rejected": -0.30020421743392944, + "logps/chosen": -1.8236277103424072, + "logps/rejected": -1.9944959878921509, + "loss": 2.3455, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.236276626586914, + "rewards/margins": 1.708682656288147, + "rewards/rejected": -19.94495964050293, + "step": 1450 + }, + { + "epoch": 0.0490410866561057, + "grad_norm": 128.50694274902344, + "learning_rate": 4.903943377148635e-07, + "logits/chosen": -0.3505763113498688, + "logits/rejected": -0.4260808527469635, + "logps/chosen": -2.0881869792938232, + "logps/rejected": -1.8180824518203735, + "loss": 5.8073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.88187026977539, + "rewards/margins": -2.701045513153076, + "rewards/rejected": -18.180824279785156, + "step": 1455 + }, + { + "epoch": 0.04920961272708888, + "grad_norm": 31.025297164916992, + "learning_rate": 4.920795416245365e-07, + "logits/chosen": -0.5189875364303589, + "logits/rejected": -0.5705165863037109, + "logps/chosen": -1.635557770729065, + "logps/rejected": -1.7688162326812744, + "loss": 2.7251, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.35557746887207, + "rewards/margins": 1.3325845003128052, + "rewards/rejected": -17.688159942626953, + "step": 1460 + }, + { + "epoch": 0.04937813879807206, + "grad_norm": 24.771587371826172, + "learning_rate": 4.937647455342097e-07, + "logits/chosen": -0.3771246373653412, + "logits/rejected": -0.35342922806739807, + "logps/chosen": -1.6810089349746704, + "logps/rejected": -1.7020902633666992, + "loss": 3.2912, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.810089111328125, + "rewards/margins": 0.2108127623796463, + "rewards/rejected": -17.020902633666992, + "step": 1465 + }, + { + "epoch": 0.04954666486905524, + "grad_norm": 12.7250337600708, + "learning_rate": 4.954499494438827e-07, + "logits/chosen": -0.4494267404079437, + "logits/rejected": -0.5832349061965942, + "logps/chosen": -1.6091959476470947, + "logps/rejected": -1.7226126194000244, + "loss": 3.0505, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.09195899963379, + "rewards/margins": 1.1341665983200073, + "rewards/rejected": -17.226125717163086, + "step": 1470 + }, + { + "epoch": 0.04971519094003842, + "grad_norm": 23.972063064575195, + "learning_rate": 4.971351533535558e-07, + "logits/chosen": -0.18479518592357635, + "logits/rejected": -0.12936873733997345, + "logps/chosen": -1.9910461902618408, + "logps/rejected": -1.9244306087493896, + "loss": 3.9929, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.910459518432617, + "rewards/margins": -0.6661556363105774, + "rewards/rejected": -19.244304656982422, + "step": 1475 + }, + { + "epoch": 0.04988371701102161, + "grad_norm": 23.053617477416992, + "learning_rate": 4.988203572632289e-07, + "logits/chosen": -0.29251593351364136, + "logits/rejected": -0.23342540860176086, + "logps/chosen": -1.8383159637451172, + "logps/rejected": -1.888374924659729, + "loss": 2.906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.383159637451172, + "rewards/margins": 0.5005893707275391, + "rewards/rejected": -18.88374900817871, + "step": 1480 + }, + { + "epoch": 0.05005224308200479, + "grad_norm": 30.78038215637207, + "learning_rate": 5.005055611729018e-07, + "logits/chosen": -0.4969760477542877, + "logits/rejected": -0.5444768667221069, + "logps/chosen": -1.74346923828125, + "logps/rejected": -1.8779300451278687, + "loss": 2.1338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.4346923828125, + "rewards/margins": 1.3446089029312134, + "rewards/rejected": -18.7793025970459, + "step": 1485 + }, + { + "epoch": 0.05022076915298797, + "grad_norm": 25.886037826538086, + "learning_rate": 5.02190765082575e-07, + "logits/chosen": -0.4277273118495941, + "logits/rejected": -0.38578924536705017, + "logps/chosen": -1.7436481714248657, + "logps/rejected": -1.703674077987671, + "loss": 3.8824, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.436481475830078, + "rewards/margins": -0.3997390866279602, + "rewards/rejected": -17.036739349365234, + "step": 1490 + }, + { + "epoch": 0.05038929522397115, + "grad_norm": 20.68552589416504, + "learning_rate": 5.038759689922481e-07, + "logits/chosen": -0.5360159873962402, + "logits/rejected": -0.4824953079223633, + "logps/chosen": -1.9363552331924438, + "logps/rejected": -1.9381564855575562, + "loss": 3.1114, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.36355209350586, + "rewards/margins": 0.018012618646025658, + "rewards/rejected": -19.38156509399414, + "step": 1495 + }, + { + "epoch": 0.05055782129495433, + "grad_norm": 25.067201614379883, + "learning_rate": 5.055611729019212e-07, + "logits/chosen": -0.5183056592941284, + "logits/rejected": -0.5793188810348511, + "logps/chosen": -1.6892459392547607, + "logps/rejected": -1.6137710809707642, + "loss": 3.7882, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.8924617767334, + "rewards/margins": -0.7547513842582703, + "rewards/rejected": -16.13770866394043, + "step": 1500 + }, + { + "epoch": 0.05072634736593751, + "grad_norm": 48.83390808105469, + "learning_rate": 5.072463768115942e-07, + "logits/chosen": -0.128324493765831, + "logits/rejected": -0.15247969329357147, + "logps/chosen": -1.816828966140747, + "logps/rejected": -1.7542918920516968, + "loss": 3.7735, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.168289184570312, + "rewards/margins": -0.625369668006897, + "rewards/rejected": -17.542919158935547, + "step": 1505 + }, + { + "epoch": 0.050894873436920694, + "grad_norm": 38.07307815551758, + "learning_rate": 5.089315807212673e-07, + "logits/chosen": -0.3632165193557739, + "logits/rejected": -0.287231981754303, + "logps/chosen": -1.9052129983901978, + "logps/rejected": -1.9243885278701782, + "loss": 2.9336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.0521297454834, + "rewards/margins": 0.1917535811662674, + "rewards/rejected": -19.243885040283203, + "step": 1510 + }, + { + "epoch": 0.051063399507903874, + "grad_norm": 25.32839584350586, + "learning_rate": 5.106167846309403e-07, + "logits/chosen": -0.5327574014663696, + "logits/rejected": -0.46464890241622925, + "logps/chosen": -1.7534282207489014, + "logps/rejected": -1.8104603290557861, + "loss": 2.6527, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.53428077697754, + "rewards/margins": 0.5703199505805969, + "rewards/rejected": -18.10460090637207, + "step": 1515 + }, + { + "epoch": 0.051231925578887054, + "grad_norm": 24.18781852722168, + "learning_rate": 5.123019885406133e-07, + "logits/chosen": -0.4070916771888733, + "logits/rejected": -0.3911053538322449, + "logps/chosen": -1.787366509437561, + "logps/rejected": -1.853560209274292, + "loss": 2.4961, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.873666763305664, + "rewards/margins": 0.6619375944137573, + "rewards/rejected": -18.53560447692871, + "step": 1520 + }, + { + "epoch": 0.051400451649870234, + "grad_norm": 26.94209861755371, + "learning_rate": 5.139871924502864e-07, + "logits/chosen": -0.2895117402076721, + "logits/rejected": -0.3377618193626404, + "logps/chosen": -1.7159216403961182, + "logps/rejected": -1.7107264995574951, + "loss": 3.3023, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.159215927124023, + "rewards/margins": -0.05195007473230362, + "rewards/rejected": -17.10726547241211, + "step": 1525 + }, + { + "epoch": 0.051568977720853414, + "grad_norm": 12.506376266479492, + "learning_rate": 5.156723963599595e-07, + "logits/chosen": -0.6083391308784485, + "logits/rejected": -0.6645565032958984, + "logps/chosen": -1.7214155197143555, + "logps/rejected": -1.7124853134155273, + "loss": 3.2891, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.214157104492188, + "rewards/margins": -0.08930368721485138, + "rewards/rejected": -17.12485122680664, + "step": 1530 + }, + { + "epoch": 0.0517375037918366, + "grad_norm": 21.57737159729004, + "learning_rate": 5.173576002696326e-07, + "logits/chosen": -0.5799289345741272, + "logits/rejected": -0.6960101127624512, + "logps/chosen": -2.1023707389831543, + "logps/rejected": -2.1636109352111816, + "loss": 3.9, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.02370834350586, + "rewards/margins": 0.6123997569084167, + "rewards/rejected": -21.636106491088867, + "step": 1535 + }, + { + "epoch": 0.05190602986281978, + "grad_norm": 6.66773796081543, + "learning_rate": 5.190428041793057e-07, + "logits/chosen": -0.2898945212364197, + "logits/rejected": -0.2669333815574646, + "logps/chosen": -1.9524368047714233, + "logps/rejected": -2.0663256645202637, + "loss": 2.6577, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.52436637878418, + "rewards/margins": 1.1388882398605347, + "rewards/rejected": -20.663257598876953, + "step": 1540 + }, + { + "epoch": 0.05207455593380296, + "grad_norm": 23.368228912353516, + "learning_rate": 5.207280080889788e-07, + "logits/chosen": -0.34807825088500977, + "logits/rejected": -0.37616461515426636, + "logps/chosen": -1.7619297504425049, + "logps/rejected": -1.7687091827392578, + "loss": 3.0278, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.61929702758789, + "rewards/margins": 0.06779269874095917, + "rewards/rejected": -17.687091827392578, + "step": 1545 + }, + { + "epoch": 0.05224308200478614, + "grad_norm": 29.27812957763672, + "learning_rate": 5.224132119986519e-07, + "logits/chosen": -0.4002048373222351, + "logits/rejected": -0.21160908043384552, + "logps/chosen": -1.8722518682479858, + "logps/rejected": -1.9842402935028076, + "loss": 2.6542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.722518920898438, + "rewards/margins": 1.1198838949203491, + "rewards/rejected": -19.842403411865234, + "step": 1550 + }, + { + "epoch": 0.05241160807576932, + "grad_norm": 26.955427169799805, + "learning_rate": 5.24098415908325e-07, + "logits/chosen": -0.6399390697479248, + "logits/rejected": -0.5415032505989075, + "logps/chosen": -1.7122814655303955, + "logps/rejected": -1.7477025985717773, + "loss": 2.773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.122814178466797, + "rewards/margins": 0.35421285033226013, + "rewards/rejected": -17.477027893066406, + "step": 1555 + }, + { + "epoch": 0.0525801341467525, + "grad_norm": 19.174209594726562, + "learning_rate": 5.257836198179979e-07, + "logits/chosen": -0.601922869682312, + "logits/rejected": -0.47701844573020935, + "logps/chosen": -1.6277345418930054, + "logps/rejected": -1.6898586750030518, + "loss": 2.5356, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.277345657348633, + "rewards/margins": 0.6212414503097534, + "rewards/rejected": -16.89858627319336, + "step": 1560 + }, + { + "epoch": 0.052748660217735686, + "grad_norm": 21.741432189941406, + "learning_rate": 5.27468823727671e-07, + "logits/chosen": -0.5087687969207764, + "logits/rejected": -0.6106857061386108, + "logps/chosen": -1.5809760093688965, + "logps/rejected": -1.6826480627059937, + "loss": 2.7595, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.809759140014648, + "rewards/margins": 1.0167211294174194, + "rewards/rejected": -16.826480865478516, + "step": 1565 + }, + { + "epoch": 0.052917186288718866, + "grad_norm": 23.948646545410156, + "learning_rate": 5.291540276373441e-07, + "logits/chosen": -0.2296111136674881, + "logits/rejected": -0.31996551156044006, + "logps/chosen": -1.6906408071517944, + "logps/rejected": -1.7543413639068604, + "loss": 2.9589, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.906408309936523, + "rewards/margins": 0.6370050311088562, + "rewards/rejected": -17.543415069580078, + "step": 1570 + }, + { + "epoch": 0.053085712359702046, + "grad_norm": 29.292556762695312, + "learning_rate": 5.308392315470171e-07, + "logits/chosen": -0.6534808874130249, + "logits/rejected": -0.7786849141120911, + "logps/chosen": -1.802342414855957, + "logps/rejected": -1.7604506015777588, + "loss": 3.5204, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.02342414855957, + "rewards/margins": -0.41891756653785706, + "rewards/rejected": -17.604507446289062, + "step": 1575 + }, + { + "epoch": 0.053254238430685226, + "grad_norm": 29.834030151367188, + "learning_rate": 5.325244354566902e-07, + "logits/chosen": -0.2010071724653244, + "logits/rejected": -0.2582705020904541, + "logps/chosen": -1.875382661819458, + "logps/rejected": -1.7762857675552368, + "loss": 4.0972, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.753826141357422, + "rewards/margins": -0.9909681081771851, + "rewards/rejected": -17.76285743713379, + "step": 1580 + }, + { + "epoch": 0.053422764501668406, + "grad_norm": 24.455078125, + "learning_rate": 5.342096393663633e-07, + "logits/chosen": -0.4177281856536865, + "logits/rejected": -0.2973349094390869, + "logps/chosen": -1.7950809001922607, + "logps/rejected": -1.8943901062011719, + "loss": 2.5308, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.950809478759766, + "rewards/margins": 0.9930933117866516, + "rewards/rejected": -18.94390296936035, + "step": 1585 + }, + { + "epoch": 0.05359129057265159, + "grad_norm": 19.05913543701172, + "learning_rate": 5.358948432760365e-07, + "logits/chosen": -0.2933509945869446, + "logits/rejected": -0.3594059348106384, + "logps/chosen": -1.8748451471328735, + "logps/rejected": -1.921567678451538, + "loss": 2.712, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.748451232910156, + "rewards/margins": 0.46722546219825745, + "rewards/rejected": -19.215679168701172, + "step": 1590 + }, + { + "epoch": 0.05375981664363477, + "grad_norm": 16.711894989013672, + "learning_rate": 5.375800471857095e-07, + "logits/chosen": -0.42561930418014526, + "logits/rejected": -0.3419100344181061, + "logps/chosen": -1.541689157485962, + "logps/rejected": -1.7792189121246338, + "loss": 2.7164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.416891098022461, + "rewards/margins": 2.375296115875244, + "rewards/rejected": -17.79218864440918, + "step": 1595 + }, + { + "epoch": 0.05392834271461795, + "grad_norm": 29.211706161499023, + "learning_rate": 5.392652510953826e-07, + "logits/chosen": -0.6977416276931763, + "logits/rejected": -0.6029377579689026, + "logps/chosen": -1.619699239730835, + "logps/rejected": -1.546464204788208, + "loss": 3.8819, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.19699478149414, + "rewards/margins": -0.732352077960968, + "rewards/rejected": -15.464642524719238, + "step": 1600 + }, + { + "epoch": 0.05392834271461795, + "eval_logits/chosen": -0.699831485748291, + "eval_logits/rejected": -0.7011949419975281, + "eval_logps/chosen": -1.656389594078064, + "eval_logps/rejected": -1.65248441696167, + "eval_loss": 3.437371253967285, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.56389617919922, + "eval_rewards/margins": -0.039052486419677734, + "eval_rewards/rejected": -16.524843215942383, + "eval_runtime": 12.9021, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 1600 + }, + { + "epoch": 0.05409686878560113, + "grad_norm": 48.776004791259766, + "learning_rate": 5.409504550050556e-07, + "logits/chosen": -0.21524472534656525, + "logits/rejected": -0.1732257902622223, + "logps/chosen": -2.1390466690063477, + "logps/rejected": -2.1870763301849365, + "loss": 2.7335, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.39046859741211, + "rewards/margins": 0.4802955687046051, + "rewards/rejected": -21.870765686035156, + "step": 1605 + }, + { + "epoch": 0.05426539485658431, + "grad_norm": 39.24583053588867, + "learning_rate": 5.426356589147286e-07, + "logits/chosen": -0.5967484712600708, + "logits/rejected": -0.7126034498214722, + "logps/chosen": -1.4969217777252197, + "logps/rejected": -1.545467734336853, + "loss": 2.7097, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.969217300415039, + "rewards/margins": 0.4854598939418793, + "rewards/rejected": -15.454675674438477, + "step": 1610 + }, + { + "epoch": 0.05443392092756749, + "grad_norm": 30.210620880126953, + "learning_rate": 5.443208628244017e-07, + "logits/chosen": -0.4601069390773773, + "logits/rejected": -0.41530901193618774, + "logps/chosen": -2.0412802696228027, + "logps/rejected": -2.0564990043640137, + "loss": 3.1674, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.41280174255371, + "rewards/margins": 0.15218643844127655, + "rewards/rejected": -20.56498908996582, + "step": 1615 + }, + { + "epoch": 0.05460244699855068, + "grad_norm": 17.723539352416992, + "learning_rate": 5.460060667340748e-07, + "logits/chosen": -0.4109547734260559, + "logits/rejected": -0.4085041880607605, + "logps/chosen": -1.9298667907714844, + "logps/rejected": -1.9639778137207031, + "loss": 3.0299, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.298669815063477, + "rewards/margins": 0.341108500957489, + "rewards/rejected": -19.6397762298584, + "step": 1620 + }, + { + "epoch": 0.05477097306953386, + "grad_norm": 19.268251419067383, + "learning_rate": 5.476912706437478e-07, + "logits/chosen": -0.05240452289581299, + "logits/rejected": -0.03201603889465332, + "logps/chosen": -1.9599721431732178, + "logps/rejected": -1.8682050704956055, + "loss": 3.9744, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.599721908569336, + "rewards/margins": -0.917669951915741, + "rewards/rejected": -18.682048797607422, + "step": 1625 + }, + { + "epoch": 0.05493949914051704, + "grad_norm": 28.24390411376953, + "learning_rate": 5.493764745534209e-07, + "logits/chosen": -0.46913594007492065, + "logits/rejected": -0.4315093159675598, + "logps/chosen": -1.75238037109375, + "logps/rejected": -1.8159980773925781, + "loss": 2.6752, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.5238037109375, + "rewards/margins": 0.6361768841743469, + "rewards/rejected": -18.15998077392578, + "step": 1630 + }, + { + "epoch": 0.05510802521150022, + "grad_norm": 22.484506607055664, + "learning_rate": 5.51061678463094e-07, + "logits/chosen": 0.1574796438217163, + "logits/rejected": 0.19055981934070587, + "logps/chosen": -2.093717098236084, + "logps/rejected": -2.0308218002319336, + "loss": 3.6801, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.937170028686523, + "rewards/margins": -0.6289529800415039, + "rewards/rejected": -20.308218002319336, + "step": 1635 + }, + { + "epoch": 0.0552765512824834, + "grad_norm": 24.651714324951172, + "learning_rate": 5.527468823727672e-07, + "logits/chosen": -0.4641755521297455, + "logits/rejected": -0.5991306900978088, + "logps/chosen": -1.6150715351104736, + "logps/rejected": -1.6226387023925781, + "loss": 3.1799, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.15071678161621, + "rewards/margins": 0.07567119598388672, + "rewards/rejected": -16.22638702392578, + "step": 1640 + }, + { + "epoch": 0.055445077353466585, + "grad_norm": 30.90976333618164, + "learning_rate": 5.544320862824402e-07, + "logits/chosen": -0.3300759196281433, + "logits/rejected": -0.39983344078063965, + "logps/chosen": -1.6464049816131592, + "logps/rejected": -1.761338472366333, + "loss": 2.1055, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.46405029296875, + "rewards/margins": 1.1493333578109741, + "rewards/rejected": -17.613384246826172, + "step": 1645 + }, + { + "epoch": 0.055613603424449765, + "grad_norm": 26.457569122314453, + "learning_rate": 5.561172901921132e-07, + "logits/chosen": -0.38823699951171875, + "logits/rejected": -0.365914523601532, + "logps/chosen": -1.955287218093872, + "logps/rejected": -1.883247971534729, + "loss": 4.172, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.55286979675293, + "rewards/margins": -0.7203909158706665, + "rewards/rejected": -18.83247947692871, + "step": 1650 + }, + { + "epoch": 0.055782129495432944, + "grad_norm": 15.721268653869629, + "learning_rate": 5.578024941017863e-07, + "logits/chosen": -0.48343658447265625, + "logits/rejected": -0.4833255708217621, + "logps/chosen": -1.5667378902435303, + "logps/rejected": -1.7361778020858765, + "loss": 2.0005, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.667379379272461, + "rewards/margins": 1.694397211074829, + "rewards/rejected": -17.361778259277344, + "step": 1655 + }, + { + "epoch": 0.055950655566416124, + "grad_norm": 36.59139633178711, + "learning_rate": 5.594876980114594e-07, + "logits/chosen": -0.18844670057296753, + "logits/rejected": -0.28556400537490845, + "logps/chosen": -1.668357491493225, + "logps/rejected": -1.7187904119491577, + "loss": 2.7758, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.68357276916504, + "rewards/margins": 0.5043299794197083, + "rewards/rejected": -17.187904357910156, + "step": 1660 + }, + { + "epoch": 0.056119181637399304, + "grad_norm": 18.01271629333496, + "learning_rate": 5.611729019211324e-07, + "logits/chosen": -0.5732772946357727, + "logits/rejected": -0.5180662870407104, + "logps/chosen": -1.6921463012695312, + "logps/rejected": -1.879349708557129, + "loss": 2.4687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.921463012695312, + "rewards/margins": 1.872035264968872, + "rewards/rejected": -18.793498992919922, + "step": 1665 + }, + { + "epoch": 0.056287707708382484, + "grad_norm": 29.73081398010254, + "learning_rate": 5.628581058308055e-07, + "logits/chosen": -0.41807693243026733, + "logits/rejected": -0.4663594663143158, + "logps/chosen": -1.612532377243042, + "logps/rejected": -1.6978442668914795, + "loss": 2.4525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.125324249267578, + "rewards/margins": 0.8531206250190735, + "rewards/rejected": -16.978443145751953, + "step": 1670 + }, + { + "epoch": 0.05645623377936567, + "grad_norm": 17.574676513671875, + "learning_rate": 5.645433097404786e-07, + "logits/chosen": -0.6845382452011108, + "logits/rejected": -0.6192032098770142, + "logps/chosen": -1.545688271522522, + "logps/rejected": -1.5205169916152954, + "loss": 3.3514, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.456881523132324, + "rewards/margins": -0.25171154737472534, + "rewards/rejected": -15.205171585083008, + "step": 1675 + }, + { + "epoch": 0.05662475985034885, + "grad_norm": 27.954343795776367, + "learning_rate": 5.662285136501516e-07, + "logits/chosen": -0.24558699131011963, + "logits/rejected": -0.13401418924331665, + "logps/chosen": -1.796062707901001, + "logps/rejected": -1.9593982696533203, + "loss": 2.1787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.96062660217285, + "rewards/margins": 1.6333551406860352, + "rewards/rejected": -19.593982696533203, + "step": 1680 + }, + { + "epoch": 0.05679328592133203, + "grad_norm": 19.399494171142578, + "learning_rate": 5.679137175598247e-07, + "logits/chosen": -0.742358386516571, + "logits/rejected": -0.7818718552589417, + "logps/chosen": -1.5913759469985962, + "logps/rejected": -1.6343498229980469, + "loss": 2.7957, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.913759231567383, + "rewards/margins": 0.42973804473876953, + "rewards/rejected": -16.343496322631836, + "step": 1685 + }, + { + "epoch": 0.05696181199231521, + "grad_norm": 26.784698486328125, + "learning_rate": 5.695989214694977e-07, + "logits/chosen": -0.04572455957531929, + "logits/rejected": -0.2053622305393219, + "logps/chosen": -1.710603952407837, + "logps/rejected": -1.750200629234314, + "loss": 2.8564, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.106037139892578, + "rewards/margins": 0.3959696888923645, + "rewards/rejected": -17.50200843811035, + "step": 1690 + }, + { + "epoch": 0.05713033806329839, + "grad_norm": 10.891070365905762, + "learning_rate": 5.712841253791709e-07, + "logits/chosen": -0.4375430941581726, + "logits/rejected": -0.27901238203048706, + "logps/chosen": -1.8508228063583374, + "logps/rejected": -1.9515966176986694, + "loss": 2.7353, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.508228302001953, + "rewards/margins": 1.0077383518218994, + "rewards/rejected": -19.515966415405273, + "step": 1695 + }, + { + "epoch": 0.05729886413428157, + "grad_norm": 28.085920333862305, + "learning_rate": 5.729693292888439e-07, + "logits/chosen": -0.37988463044166565, + "logits/rejected": -0.3089436888694763, + "logps/chosen": -1.7150055170059204, + "logps/rejected": -1.8164036273956299, + "loss": 2.4916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.150054931640625, + "rewards/margins": 1.0139801502227783, + "rewards/rejected": -18.16403579711914, + "step": 1700 + }, + { + "epoch": 0.05746739020526476, + "grad_norm": 27.796403884887695, + "learning_rate": 5.74654533198517e-07, + "logits/chosen": -0.14855363965034485, + "logits/rejected": -0.055423758924007416, + "logps/chosen": -1.8769832849502563, + "logps/rejected": -1.7882697582244873, + "loss": 3.9854, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.769832611083984, + "rewards/margins": -0.887133777141571, + "rewards/rejected": -17.8826961517334, + "step": 1705 + }, + { + "epoch": 0.05763591627624794, + "grad_norm": 19.106117248535156, + "learning_rate": 5.763397371081901e-07, + "logits/chosen": -0.6356021165847778, + "logits/rejected": -0.5874723196029663, + "logps/chosen": -1.445164680480957, + "logps/rejected": -1.4936878681182861, + "loss": 2.7015, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.451647758483887, + "rewards/margins": 0.4852313995361328, + "rewards/rejected": -14.936877250671387, + "step": 1710 + }, + { + "epoch": 0.057804442347231116, + "grad_norm": 21.671457290649414, + "learning_rate": 5.780249410178631e-07, + "logits/chosen": -0.45297783613204956, + "logits/rejected": -0.38992589712142944, + "logps/chosen": -1.5569000244140625, + "logps/rejected": -1.5796066522598267, + "loss": 3.0231, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.569000244140625, + "rewards/margins": 0.2270650863647461, + "rewards/rejected": -15.796066284179688, + "step": 1715 + }, + { + "epoch": 0.057972968418214296, + "grad_norm": 29.682052612304688, + "learning_rate": 5.797101449275362e-07, + "logits/chosen": -0.5061969757080078, + "logits/rejected": -0.40872421860694885, + "logps/chosen": -1.8883237838745117, + "logps/rejected": -1.8492504358291626, + "loss": 3.4416, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.883237838745117, + "rewards/margins": -0.3907338082790375, + "rewards/rejected": -18.492504119873047, + "step": 1720 + }, + { + "epoch": 0.058141494489197476, + "grad_norm": 31.583568572998047, + "learning_rate": 5.813953488372093e-07, + "logits/chosen": -0.2520661950111389, + "logits/rejected": -0.23302459716796875, + "logps/chosen": -1.6508252620697021, + "logps/rejected": -1.8004897832870483, + "loss": 1.9262, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.508251190185547, + "rewards/margins": 1.4966458082199097, + "rewards/rejected": -18.004898071289062, + "step": 1725 + }, + { + "epoch": 0.05831002056018066, + "grad_norm": 18.796131134033203, + "learning_rate": 5.830805527468824e-07, + "logits/chosen": -0.3362746834754944, + "logits/rejected": -0.3569917678833008, + "logps/chosen": -1.7772108316421509, + "logps/rejected": -1.8702561855316162, + "loss": 2.8101, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.772109985351562, + "rewards/margins": 0.9304534792900085, + "rewards/rejected": -18.70256233215332, + "step": 1730 + }, + { + "epoch": 0.05847854663116384, + "grad_norm": 20.094972610473633, + "learning_rate": 5.847657566565553e-07, + "logits/chosen": -0.8289566040039062, + "logits/rejected": -0.6430098414421082, + "logps/chosen": -1.6072938442230225, + "logps/rejected": -1.583827257156372, + "loss": 3.3512, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.07293701171875, + "rewards/margins": -0.23466500639915466, + "rewards/rejected": -15.838272094726562, + "step": 1735 + }, + { + "epoch": 0.05864707270214702, + "grad_norm": 25.6748104095459, + "learning_rate": 5.864509605662284e-07, + "logits/chosen": -0.17799155414104462, + "logits/rejected": -0.09096328914165497, + "logps/chosen": -1.725823998451233, + "logps/rejected": -1.8445842266082764, + "loss": 2.6412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.25823974609375, + "rewards/margins": 1.1876022815704346, + "rewards/rejected": -18.445842742919922, + "step": 1740 + }, + { + "epoch": 0.0588155987731302, + "grad_norm": 43.991390228271484, + "learning_rate": 5.881361644759016e-07, + "logits/chosen": -0.5916566252708435, + "logits/rejected": -0.5461825132369995, + "logps/chosen": -1.7417824268341064, + "logps/rejected": -2.0078158378601074, + "loss": 2.34, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.417823791503906, + "rewards/margins": 2.6603331565856934, + "rewards/rejected": -20.078155517578125, + "step": 1745 + }, + { + "epoch": 0.05898412484411338, + "grad_norm": 26.747943878173828, + "learning_rate": 5.898213683855746e-07, + "logits/chosen": -0.9465047717094421, + "logits/rejected": -0.9426982998847961, + "logps/chosen": -1.5433447360992432, + "logps/rejected": -1.5770765542984009, + "loss": 2.7903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.433446884155273, + "rewards/margins": 0.33731889724731445, + "rewards/rejected": -15.77076530456543, + "step": 1750 + }, + { + "epoch": 0.05915265091509656, + "grad_norm": 16.108659744262695, + "learning_rate": 5.915065722952477e-07, + "logits/chosen": -0.7509289979934692, + "logits/rejected": -0.6715911030769348, + "logps/chosen": -1.8838880062103271, + "logps/rejected": -1.9311984777450562, + "loss": 2.7285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.838882446289062, + "rewards/margins": 0.47310376167297363, + "rewards/rejected": -19.31198501586914, + "step": 1755 + }, + { + "epoch": 0.05932117698607975, + "grad_norm": 25.781511306762695, + "learning_rate": 5.931917762049208e-07, + "logits/chosen": 0.09684916585683823, + "logits/rejected": -0.03794277831912041, + "logps/chosen": -1.651545524597168, + "logps/rejected": -1.5341824293136597, + "loss": 4.218, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -16.515457153320312, + "rewards/margins": -1.1736314296722412, + "rewards/rejected": -15.341824531555176, + "step": 1760 + }, + { + "epoch": 0.05948970305706293, + "grad_norm": 19.869956970214844, + "learning_rate": 5.948769801145939e-07, + "logits/chosen": -0.5614740252494812, + "logits/rejected": -0.6934599876403809, + "logps/chosen": -1.4802380800247192, + "logps/rejected": -1.4893232583999634, + "loss": 3.0454, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.80238151550293, + "rewards/margins": 0.09085007011890411, + "rewards/rejected": -14.893231391906738, + "step": 1765 + }, + { + "epoch": 0.05965822912804611, + "grad_norm": 38.33526611328125, + "learning_rate": 5.965621840242669e-07, + "logits/chosen": -0.3522099554538727, + "logits/rejected": -0.42708688974380493, + "logps/chosen": -1.9067974090576172, + "logps/rejected": -1.9417225122451782, + "loss": 3.4045, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.067974090576172, + "rewards/margins": 0.34925180673599243, + "rewards/rejected": -19.417224884033203, + "step": 1770 + }, + { + "epoch": 0.05982675519902929, + "grad_norm": 21.137075424194336, + "learning_rate": 5.9824738793394e-07, + "logits/chosen": -0.25959745049476624, + "logits/rejected": -0.24761705100536346, + "logps/chosen": -1.875977873802185, + "logps/rejected": -1.9285907745361328, + "loss": 3.4363, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.759777069091797, + "rewards/margins": 0.5261300206184387, + "rewards/rejected": -19.28590965270996, + "step": 1775 + }, + { + "epoch": 0.05999528127001247, + "grad_norm": 20.143699645996094, + "learning_rate": 5.99932591843613e-07, + "logits/chosen": -0.3771205544471741, + "logits/rejected": -0.3303254246711731, + "logps/chosen": -1.5552126169204712, + "logps/rejected": -1.814523458480835, + "loss": 2.1268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.552125930786133, + "rewards/margins": 2.593108654022217, + "rewards/rejected": -18.145235061645508, + "step": 1780 + }, + { + "epoch": 0.060163807340995655, + "grad_norm": 48.40658950805664, + "learning_rate": 6.01617795753286e-07, + "logits/chosen": -0.14805591106414795, + "logits/rejected": -0.24271002411842346, + "logps/chosen": -1.7285377979278564, + "logps/rejected": -1.9390876293182373, + "loss": 2.9948, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.285375595092773, + "rewards/margins": 2.105499267578125, + "rewards/rejected": -19.3908748626709, + "step": 1785 + }, + { + "epoch": 0.060332333411978835, + "grad_norm": 25.84428596496582, + "learning_rate": 6.033029996629591e-07, + "logits/chosen": -0.013548873364925385, + "logits/rejected": 0.018930787220597267, + "logps/chosen": -1.738650918006897, + "logps/rejected": -1.6590766906738281, + "loss": 3.9597, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.38650894165039, + "rewards/margins": -0.7957417368888855, + "rewards/rejected": -16.59076690673828, + "step": 1790 + }, + { + "epoch": 0.060500859482962015, + "grad_norm": 73.66495513916016, + "learning_rate": 6.049882035726323e-07, + "logits/chosen": -0.5159806609153748, + "logits/rejected": -0.3327074646949768, + "logps/chosen": -1.820635437965393, + "logps/rejected": -1.9370348453521729, + "loss": 2.6383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.20635414123535, + "rewards/margins": 1.163994550704956, + "rewards/rejected": -19.37034797668457, + "step": 1795 + }, + { + "epoch": 0.060669385553945195, + "grad_norm": 22.028491973876953, + "learning_rate": 6.066734074823054e-07, + "logits/chosen": -0.2898898124694824, + "logits/rejected": -0.2503661513328552, + "logps/chosen": -1.6143338680267334, + "logps/rejected": -1.5903599262237549, + "loss": 3.4173, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.143339157104492, + "rewards/margins": -0.23974084854125977, + "rewards/rejected": -15.903597831726074, + "step": 1800 + }, + { + "epoch": 0.060837911624928374, + "grad_norm": 24.202577590942383, + "learning_rate": 6.083586113919784e-07, + "logits/chosen": 0.024017006158828735, + "logits/rejected": 0.04643130302429199, + "logps/chosen": -1.527374267578125, + "logps/rejected": -1.5289933681488037, + "loss": 3.619, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.27374267578125, + "rewards/margins": 0.01618986204266548, + "rewards/rejected": -15.289934158325195, + "step": 1805 + }, + { + "epoch": 0.061006437695911554, + "grad_norm": 89.72640991210938, + "learning_rate": 6.100438153016515e-07, + "logits/chosen": -0.5288889408111572, + "logits/rejected": -0.5065113306045532, + "logps/chosen": -1.7922258377075195, + "logps/rejected": -1.803261399269104, + "loss": 3.1556, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.922256469726562, + "rewards/margins": 0.11035575717687607, + "rewards/rejected": -18.032611846923828, + "step": 1810 + }, + { + "epoch": 0.06117496376689474, + "grad_norm": 26.971994400024414, + "learning_rate": 6.117290192113246e-07, + "logits/chosen": -0.7729194760322571, + "logits/rejected": -0.8629820942878723, + "logps/chosen": -1.9254817962646484, + "logps/rejected": -1.95050048828125, + "loss": 3.7236, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.254819869995117, + "rewards/margins": 0.25018566846847534, + "rewards/rejected": -19.5050048828125, + "step": 1815 + }, + { + "epoch": 0.06134348983787792, + "grad_norm": 23.856813430786133, + "learning_rate": 6.134142231209977e-07, + "logits/chosen": -0.33167606592178345, + "logits/rejected": -0.3584614396095276, + "logps/chosen": -1.486135721206665, + "logps/rejected": -1.5292284488677979, + "loss": 2.9592, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.861356735229492, + "rewards/margins": 0.43092602491378784, + "rewards/rejected": -15.29228401184082, + "step": 1820 + }, + { + "epoch": 0.0615120159088611, + "grad_norm": 21.96845245361328, + "learning_rate": 6.150994270306706e-07, + "logits/chosen": -0.30157405138015747, + "logits/rejected": -0.32445019483566284, + "logps/chosen": -1.9414207935333252, + "logps/rejected": -1.929107904434204, + "loss": 3.3149, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.414207458496094, + "rewards/margins": -0.12312869727611542, + "rewards/rejected": -19.291080474853516, + "step": 1825 + }, + { + "epoch": 0.06168054197984428, + "grad_norm": 42.928070068359375, + "learning_rate": 6.167846309403437e-07, + "logits/chosen": -0.10247864574193954, + "logits/rejected": -0.16112163662910461, + "logps/chosen": -1.736191987991333, + "logps/rejected": -1.667532205581665, + "loss": 3.8427, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.361919403076172, + "rewards/margins": -0.6865975260734558, + "rewards/rejected": -16.675321578979492, + "step": 1830 + }, + { + "epoch": 0.06184906805082746, + "grad_norm": 23.188669204711914, + "learning_rate": 6.184698348500168e-07, + "logits/chosen": -0.5849705934524536, + "logits/rejected": -0.3921450972557068, + "logps/chosen": -1.8506797552108765, + "logps/rejected": -1.881422758102417, + "loss": 3.0305, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.506797790527344, + "rewards/margins": 0.30743035674095154, + "rewards/rejected": -18.814228057861328, + "step": 1835 + }, + { + "epoch": 0.06201759412181065, + "grad_norm": 77.21211242675781, + "learning_rate": 6.201550387596898e-07, + "logits/chosen": -0.5807980298995972, + "logits/rejected": -0.5246071815490723, + "logps/chosen": -2.000711679458618, + "logps/rejected": -1.9282617568969727, + "loss": 3.775, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.007116317749023, + "rewards/margins": -0.7245005369186401, + "rewards/rejected": -19.282617568969727, + "step": 1840 + }, + { + "epoch": 0.06218612019279383, + "grad_norm": 22.97465705871582, + "learning_rate": 6.21840242669363e-07, + "logits/chosen": -0.5278009176254272, + "logits/rejected": -0.48332375288009644, + "logps/chosen": -1.8859955072402954, + "logps/rejected": -1.8935880661010742, + "loss": 3.1492, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.859954833984375, + "rewards/margins": 0.07592477649450302, + "rewards/rejected": -18.93587875366211, + "step": 1845 + }, + { + "epoch": 0.06235464626377701, + "grad_norm": 21.464570999145508, + "learning_rate": 6.235254465790361e-07, + "logits/chosen": -0.4647518992424011, + "logits/rejected": -0.35028940439224243, + "logps/chosen": -1.6057485342025757, + "logps/rejected": -1.7640788555145264, + "loss": 2.3478, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.057485580444336, + "rewards/margins": 1.5833041667938232, + "rewards/rejected": -17.640789031982422, + "step": 1850 + }, + { + "epoch": 0.0625231723347602, + "grad_norm": 23.270641326904297, + "learning_rate": 6.252106504887092e-07, + "logits/chosen": -0.626205563545227, + "logits/rejected": -0.5367187261581421, + "logps/chosen": -1.6747545003890991, + "logps/rejected": -1.808241605758667, + "loss": 2.8049, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.74754524230957, + "rewards/margins": 1.3348705768585205, + "rewards/rejected": -18.082416534423828, + "step": 1855 + }, + { + "epoch": 0.06269169840574337, + "grad_norm": 20.01548194885254, + "learning_rate": 6.268958543983822e-07, + "logits/chosen": -0.6003610491752625, + "logits/rejected": -0.46292710304260254, + "logps/chosen": -2.0454294681549072, + "logps/rejected": -2.074065685272217, + "loss": 3.3159, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.45429229736328, + "rewards/margins": 0.286365807056427, + "rewards/rejected": -20.740657806396484, + "step": 1860 + }, + { + "epoch": 0.06286022447672655, + "grad_norm": 21.655704498291016, + "learning_rate": 6.285810583080553e-07, + "logits/chosen": -0.24826118350028992, + "logits/rejected": -0.25482481718063354, + "logps/chosen": -2.4187121391296387, + "logps/rejected": -2.6595332622528076, + "loss": 2.1698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.187122344970703, + "rewards/margins": 2.4082117080688477, + "rewards/rejected": -26.595333099365234, + "step": 1865 + }, + { + "epoch": 0.06302875054770973, + "grad_norm": 12.449923515319824, + "learning_rate": 6.302662622177283e-07, + "logits/chosen": -0.27865132689476013, + "logits/rejected": -0.1912352293729782, + "logps/chosen": -1.7426397800445557, + "logps/rejected": -1.803223967552185, + "loss": 2.8243, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.4263973236084, + "rewards/margins": 0.6058410406112671, + "rewards/rejected": -18.032238006591797, + "step": 1870 + }, + { + "epoch": 0.06319727661869291, + "grad_norm": 18.938154220581055, + "learning_rate": 6.319514661274013e-07, + "logits/chosen": -0.9526360630989075, + "logits/rejected": -0.8011028170585632, + "logps/chosen": -1.36483895778656, + "logps/rejected": -1.439701795578003, + "loss": 2.9105, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -13.64838981628418, + "rewards/margins": 0.7486263513565063, + "rewards/rejected": -14.397016525268555, + "step": 1875 + }, + { + "epoch": 0.06336580268967609, + "grad_norm": 13.122438430786133, + "learning_rate": 6.336366700370744e-07, + "logits/chosen": -0.7261112332344055, + "logits/rejected": -0.6287399530410767, + "logps/chosen": -1.5695728063583374, + "logps/rejected": -1.7251754999160767, + "loss": 2.6971, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.695727348327637, + "rewards/margins": 1.5560270547866821, + "rewards/rejected": -17.25175666809082, + "step": 1880 + }, + { + "epoch": 0.06353432876065927, + "grad_norm": 27.177547454833984, + "learning_rate": 6.353218739467475e-07, + "logits/chosen": -0.3480846583843231, + "logits/rejected": -0.2502368092536926, + "logps/chosen": -1.6377151012420654, + "logps/rejected": -1.578667163848877, + "loss": 3.8532, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.377151489257812, + "rewards/margins": -0.5904794931411743, + "rewards/rejected": -15.786672592163086, + "step": 1885 + }, + { + "epoch": 0.06370285483164245, + "grad_norm": 32.058998107910156, + "learning_rate": 6.370070778564206e-07, + "logits/chosen": -0.15475639700889587, + "logits/rejected": -0.08114627748727798, + "logps/chosen": -1.704306960105896, + "logps/rejected": -1.788220763206482, + "loss": 3.0792, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.04306983947754, + "rewards/margins": 0.8391389846801758, + "rewards/rejected": -17.8822078704834, + "step": 1890 + }, + { + "epoch": 0.06387138090262563, + "grad_norm": 21.426652908325195, + "learning_rate": 6.386922817660937e-07, + "logits/chosen": -0.4754219949245453, + "logits/rejected": -0.44864320755004883, + "logps/chosen": -1.8361284732818604, + "logps/rejected": -1.8722747564315796, + "loss": 2.8451, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.361284255981445, + "rewards/margins": 0.3614630699157715, + "rewards/rejected": -18.722747802734375, + "step": 1895 + }, + { + "epoch": 0.06403990697360881, + "grad_norm": 24.273914337158203, + "learning_rate": 6.403774856757668e-07, + "logits/chosen": -0.02922775410115719, + "logits/rejected": -0.1809547245502472, + "logps/chosen": -1.8290389776229858, + "logps/rejected": -1.9153051376342773, + "loss": 2.7326, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.290390014648438, + "rewards/margins": 0.8626611828804016, + "rewards/rejected": -19.15304946899414, + "step": 1900 + }, + { + "epoch": 0.06420843304459199, + "grad_norm": 29.323482513427734, + "learning_rate": 6.420626895854399e-07, + "logits/chosen": -0.12777681648731232, + "logits/rejected": -0.08850021660327911, + "logps/chosen": -1.8227113485336304, + "logps/rejected": -1.8100782632827759, + "loss": 3.2956, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.227113723754883, + "rewards/margins": -0.12633152306079865, + "rewards/rejected": -18.10078239440918, + "step": 1905 + }, + { + "epoch": 0.06437695911557519, + "grad_norm": 20.20551300048828, + "learning_rate": 6.437478934951128e-07, + "logits/chosen": -0.6324241757392883, + "logits/rejected": -0.6633267998695374, + "logps/chosen": -1.5772778987884521, + "logps/rejected": -1.6632354259490967, + "loss": 2.3262, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.772778511047363, + "rewards/margins": 0.8595759272575378, + "rewards/rejected": -16.632354736328125, + "step": 1910 + }, + { + "epoch": 0.06454548518655837, + "grad_norm": 27.31475830078125, + "learning_rate": 6.454330974047859e-07, + "logits/chosen": -0.5674937963485718, + "logits/rejected": -0.3717323839664459, + "logps/chosen": -1.6537988185882568, + "logps/rejected": -1.6325750350952148, + "loss": 3.3598, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.537988662719727, + "rewards/margins": -0.2122385948896408, + "rewards/rejected": -16.325748443603516, + "step": 1915 + }, + { + "epoch": 0.06471401125754155, + "grad_norm": 21.887195587158203, + "learning_rate": 6.47118301314459e-07, + "logits/chosen": -0.030434776097536087, + "logits/rejected": -0.04068700224161148, + "logps/chosen": -1.930794358253479, + "logps/rejected": -2.0131115913391113, + "loss": 2.4377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.307941436767578, + "rewards/margins": 0.8231736421585083, + "rewards/rejected": -20.131114959716797, + "step": 1920 + }, + { + "epoch": 0.06488253732852473, + "grad_norm": 47.787200927734375, + "learning_rate": 6.488035052241321e-07, + "logits/chosen": -0.6423458456993103, + "logits/rejected": -0.5267582535743713, + "logps/chosen": -1.8432353734970093, + "logps/rejected": -1.9428443908691406, + "loss": 2.2613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.432353973388672, + "rewards/margins": 0.9960900545120239, + "rewards/rejected": -19.428442001342773, + "step": 1925 + }, + { + "epoch": 0.0650510633995079, + "grad_norm": 18.88721466064453, + "learning_rate": 6.504887091338051e-07, + "logits/chosen": -0.30531203746795654, + "logits/rejected": -0.4015568196773529, + "logps/chosen": -1.5693740844726562, + "logps/rejected": -1.6434333324432373, + "loss": 2.4761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.693740844726562, + "rewards/margins": 0.7405935525894165, + "rewards/rejected": -16.434335708618164, + "step": 1930 + }, + { + "epoch": 0.06521958947049109, + "grad_norm": 30.243013381958008, + "learning_rate": 6.521739130434782e-07, + "logits/chosen": -0.2444632351398468, + "logits/rejected": -0.2351723164319992, + "logps/chosen": -1.8687137365341187, + "logps/rejected": -2.091553211212158, + "loss": 2.259, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.687137603759766, + "rewards/margins": 2.2283949851989746, + "rewards/rejected": -20.9155330657959, + "step": 1935 + }, + { + "epoch": 0.06538811554147426, + "grad_norm": 21.0958309173584, + "learning_rate": 6.538591169531513e-07, + "logits/chosen": -0.5276403427124023, + "logits/rejected": -0.5364550352096558, + "logps/chosen": -1.705955147743225, + "logps/rejected": -1.6914780139923096, + "loss": 3.2859, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.059551239013672, + "rewards/margins": -0.14477066695690155, + "rewards/rejected": -16.914779663085938, + "step": 1940 + }, + { + "epoch": 0.06555664161245744, + "grad_norm": 32.9719352722168, + "learning_rate": 6.555443208628245e-07, + "logits/chosen": -0.17250430583953857, + "logits/rejected": -0.22453102469444275, + "logps/chosen": -1.782339096069336, + "logps/rejected": -1.7523667812347412, + "loss": 3.4177, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.82339096069336, + "rewards/margins": -0.2997213304042816, + "rewards/rejected": -17.52366828918457, + "step": 1945 + }, + { + "epoch": 0.06572516768344062, + "grad_norm": 21.734193801879883, + "learning_rate": 6.572295247724975e-07, + "logits/chosen": 0.016134237870573997, + "logits/rejected": -0.03876941278576851, + "logps/chosen": -1.5835492610931396, + "logps/rejected": -1.5337642431259155, + "loss": 3.5641, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.835492134094238, + "rewards/margins": -0.49785009026527405, + "rewards/rejected": -15.337640762329102, + "step": 1950 + }, + { + "epoch": 0.0658936937544238, + "grad_norm": 21.90297508239746, + "learning_rate": 6.589147286821705e-07, + "logits/chosen": -0.46315592527389526, + "logits/rejected": -0.44253987073898315, + "logps/chosen": -1.4891859292984009, + "logps/rejected": -1.679639220237732, + "loss": 2.9178, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -14.89185905456543, + "rewards/margins": 1.904531478881836, + "rewards/rejected": -16.796390533447266, + "step": 1955 + }, + { + "epoch": 0.06606221982540698, + "grad_norm": 47.062469482421875, + "learning_rate": 6.605999325918436e-07, + "logits/chosen": -0.5072580575942993, + "logits/rejected": -0.6388121843338013, + "logps/chosen": -1.9043185710906982, + "logps/rejected": -1.834246039390564, + "loss": 3.8808, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.04318618774414, + "rewards/margins": -0.7007244229316711, + "rewards/rejected": -18.34246063232422, + "step": 1960 + }, + { + "epoch": 0.06623074589639018, + "grad_norm": 30.098649978637695, + "learning_rate": 6.622851365015166e-07, + "logits/chosen": -0.48198550939559937, + "logits/rejected": -0.4838237166404724, + "logps/chosen": -1.7086031436920166, + "logps/rejected": -1.7821567058563232, + "loss": 2.8094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.086029052734375, + "rewards/margins": 0.73553866147995, + "rewards/rejected": -17.82156753540039, + "step": 1965 + }, + { + "epoch": 0.06639927196737336, + "grad_norm": 0.15973490476608276, + "learning_rate": 6.639703404111897e-07, + "logits/chosen": -0.28459540009498596, + "logits/rejected": -0.26555758714675903, + "logps/chosen": -1.542386531829834, + "logps/rejected": -1.799584150314331, + "loss": 2.2961, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.423864364624023, + "rewards/margins": 2.57197642326355, + "rewards/rejected": -17.9958438873291, + "step": 1970 + }, + { + "epoch": 0.06656779803835654, + "grad_norm": 51.54502487182617, + "learning_rate": 6.656555443208628e-07, + "logits/chosen": -0.7996042966842651, + "logits/rejected": -0.7479974031448364, + "logps/chosen": -2.016369342803955, + "logps/rejected": -1.9065616130828857, + "loss": 4.158, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.163692474365234, + "rewards/margins": -1.0980759859085083, + "rewards/rejected": -19.065616607666016, + "step": 1975 + }, + { + "epoch": 0.06673632410933972, + "grad_norm": 17.913270950317383, + "learning_rate": 6.673407482305359e-07, + "logits/chosen": -0.33906176686286926, + "logits/rejected": -0.29040712118148804, + "logps/chosen": -1.7747215032577515, + "logps/rejected": -1.872317910194397, + "loss": 2.4515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.74721336364746, + "rewards/margins": 0.9759650230407715, + "rewards/rejected": -18.723180770874023, + "step": 1980 + }, + { + "epoch": 0.0669048501803229, + "grad_norm": 21.00700569152832, + "learning_rate": 6.690259521402089e-07, + "logits/chosen": -0.4829220771789551, + "logits/rejected": -0.4446737766265869, + "logps/chosen": -1.4653489589691162, + "logps/rejected": -1.5274522304534912, + "loss": 2.7453, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.653491020202637, + "rewards/margins": 0.6210311651229858, + "rewards/rejected": -15.274523735046387, + "step": 1985 + }, + { + "epoch": 0.06707337625130608, + "grad_norm": 40.25058364868164, + "learning_rate": 6.70711156049882e-07, + "logits/chosen": -0.11107297241687775, + "logits/rejected": 0.057940077036619186, + "logps/chosen": -1.9686638116836548, + "logps/rejected": -2.0639851093292236, + "loss": 2.3266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.6866397857666, + "rewards/margins": 0.9532124400138855, + "rewards/rejected": -20.639850616455078, + "step": 1990 + }, + { + "epoch": 0.06724190232228926, + "grad_norm": 16.159786224365234, + "learning_rate": 6.723963599595552e-07, + "logits/chosen": -0.7763963937759399, + "logits/rejected": -0.6344070434570312, + "logps/chosen": -1.7498953342437744, + "logps/rejected": -1.845721960067749, + "loss": 2.9993, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.498952865600586, + "rewards/margins": 0.9582692980766296, + "rewards/rejected": -18.457223892211914, + "step": 1995 + }, + { + "epoch": 0.06741042839327244, + "grad_norm": 20.94877815246582, + "learning_rate": 6.740815638692281e-07, + "logits/chosen": -0.45790061354637146, + "logits/rejected": -0.5162444710731506, + "logps/chosen": -1.5169246196746826, + "logps/rejected": -1.4949506521224976, + "loss": 3.622, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.1692476272583, + "rewards/margins": -0.2197399139404297, + "rewards/rejected": -14.949508666992188, + "step": 2000 + }, + { + "epoch": 0.06741042839327244, + "eval_logits/chosen": -0.7068748474121094, + "eval_logits/rejected": -0.7089285254478455, + "eval_logps/chosen": -1.6583834886550903, + "eval_logps/rejected": -1.6555068492889404, + "eval_loss": 3.4318957328796387, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.583833694458008, + "eval_rewards/margins": -0.02876390889286995, + "eval_rewards/rejected": -16.555068969726562, + "eval_runtime": 12.8917, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 2000 + }, + { + "epoch": 0.06757895446425562, + "grad_norm": 40.18283462524414, + "learning_rate": 6.757667677789012e-07, + "logits/chosen": -0.33664292097091675, + "logits/rejected": -0.08164303004741669, + "logps/chosen": -1.5835665464401245, + "logps/rejected": -1.6931190490722656, + "loss": 2.8106, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.835665702819824, + "rewards/margins": 1.0955229997634888, + "rewards/rejected": -16.931188583374023, + "step": 2005 + }, + { + "epoch": 0.0677474805352388, + "grad_norm": 26.831192016601562, + "learning_rate": 6.774519716885743e-07, + "logits/chosen": -0.3066862225532532, + "logits/rejected": -0.5098311901092529, + "logps/chosen": -1.628483772277832, + "logps/rejected": -1.6336151361465454, + "loss": 3.0836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.284839630126953, + "rewards/margins": 0.05131196975708008, + "rewards/rejected": -16.336151123046875, + "step": 2010 + }, + { + "epoch": 0.06791600660622198, + "grad_norm": 16.26115608215332, + "learning_rate": 6.791371755982474e-07, + "logits/chosen": -0.723468005657196, + "logits/rejected": -0.6189843416213989, + "logps/chosen": -1.680509328842163, + "logps/rejected": -1.7051823139190674, + "loss": 2.9784, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.80509376525879, + "rewards/margins": 0.2467300444841385, + "rewards/rejected": -17.05182456970215, + "step": 2015 + }, + { + "epoch": 0.06808453267720517, + "grad_norm": 29.91892433166504, + "learning_rate": 6.808223795079204e-07, + "logits/chosen": -0.7424842119216919, + "logits/rejected": -0.704288125038147, + "logps/chosen": -1.6226580142974854, + "logps/rejected": -1.633644700050354, + "loss": 3.1707, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.226581573486328, + "rewards/margins": 0.10986528545618057, + "rewards/rejected": -16.336444854736328, + "step": 2020 + }, + { + "epoch": 0.06825305874818835, + "grad_norm": 28.981691360473633, + "learning_rate": 6.825075834175935e-07, + "logits/chosen": 0.017427653074264526, + "logits/rejected": -0.05734679102897644, + "logps/chosen": -2.1242425441741943, + "logps/rejected": -1.984273910522461, + "loss": 4.4866, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.2424259185791, + "rewards/margins": -1.399685263633728, + "rewards/rejected": -19.84273910522461, + "step": 2025 + }, + { + "epoch": 0.06842158481917153, + "grad_norm": 56.0091552734375, + "learning_rate": 6.841927873272666e-07, + "logits/chosen": -0.20901520550251007, + "logits/rejected": -0.23270806670188904, + "logps/chosen": -2.094710111618042, + "logps/rejected": -2.2162601947784424, + "loss": 2.2359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.94710350036621, + "rewards/margins": 1.215496301651001, + "rewards/rejected": -22.162599563598633, + "step": 2030 + }, + { + "epoch": 0.06859011089015471, + "grad_norm": 33.509132385253906, + "learning_rate": 6.858779912369396e-07, + "logits/chosen": -0.5838386416435242, + "logits/rejected": -0.43091145157814026, + "logps/chosen": -1.8723928928375244, + "logps/rejected": -1.8803253173828125, + "loss": 3.061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.723926544189453, + "rewards/margins": 0.07932768017053604, + "rewards/rejected": -18.80325698852539, + "step": 2035 + }, + { + "epoch": 0.06875863696113789, + "grad_norm": 21.0834903717041, + "learning_rate": 6.875631951466127e-07, + "logits/chosen": -0.9346014857292175, + "logits/rejected": -0.7744470834732056, + "logps/chosen": -1.6558029651641846, + "logps/rejected": -1.6898488998413086, + "loss": 2.8263, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.55803108215332, + "rewards/margins": 0.34045690298080444, + "rewards/rejected": -16.898487091064453, + "step": 2040 + }, + { + "epoch": 0.06892716303212107, + "grad_norm": 21.175643920898438, + "learning_rate": 6.892483990562858e-07, + "logits/chosen": -0.3706910312175751, + "logits/rejected": -0.28410759568214417, + "logps/chosen": -2.399585723876953, + "logps/rejected": -1.9964946508407593, + "loss": 7.2358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.995859146118164, + "rewards/margins": -4.030909538269043, + "rewards/rejected": -19.964946746826172, + "step": 2045 + }, + { + "epoch": 0.06909568910310425, + "grad_norm": 26.842979431152344, + "learning_rate": 6.909336029659589e-07, + "logits/chosen": -0.5333553552627563, + "logits/rejected": -0.4201357960700989, + "logps/chosen": -1.720887541770935, + "logps/rejected": -1.797654390335083, + "loss": 3.268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.208877563476562, + "rewards/margins": 0.7676678895950317, + "rewards/rejected": -17.976543426513672, + "step": 2050 + }, + { + "epoch": 0.06926421517408743, + "grad_norm": 20.48428726196289, + "learning_rate": 6.926188068756319e-07, + "logits/chosen": -0.584562361240387, + "logits/rejected": -0.6016994714736938, + "logps/chosen": -1.502666711807251, + "logps/rejected": -1.5560954809188843, + "loss": 2.8615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.026666641235352, + "rewards/margins": 0.534288227558136, + "rewards/rejected": -15.560954093933105, + "step": 2055 + }, + { + "epoch": 0.06943274124507061, + "grad_norm": 23.584716796875, + "learning_rate": 6.94304010785305e-07, + "logits/chosen": -0.6470240354537964, + "logits/rejected": -0.6648738384246826, + "logps/chosen": -1.733741044998169, + "logps/rejected": -1.7135350704193115, + "loss": 3.3818, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.337411880493164, + "rewards/margins": -0.20206137001514435, + "rewards/rejected": -17.13534927368164, + "step": 2060 + }, + { + "epoch": 0.06960126731605379, + "grad_norm": 32.5343132019043, + "learning_rate": 6.959892146949781e-07, + "logits/chosen": -0.13596948981285095, + "logits/rejected": -0.12064089626073837, + "logps/chosen": -1.509447693824768, + "logps/rejected": -1.6260335445404053, + "loss": 2.5401, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.094476699829102, + "rewards/margins": 1.1658592224121094, + "rewards/rejected": -16.26033592224121, + "step": 2065 + }, + { + "epoch": 0.06976979338703697, + "grad_norm": 15.475911140441895, + "learning_rate": 6.976744186046511e-07, + "logits/chosen": -0.6770817041397095, + "logits/rejected": -0.6954010725021362, + "logps/chosen": -1.5978891849517822, + "logps/rejected": -1.6195186376571655, + "loss": 2.9897, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.978894233703613, + "rewards/margins": 0.21629361808300018, + "rewards/rejected": -16.195186614990234, + "step": 2070 + }, + { + "epoch": 0.06993831945802016, + "grad_norm": 28.1234130859375, + "learning_rate": 6.993596225143242e-07, + "logits/chosen": -0.5654903054237366, + "logits/rejected": -0.6328141093254089, + "logps/chosen": -1.7364234924316406, + "logps/rejected": -1.6471540927886963, + "loss": 4.1059, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.36423683166504, + "rewards/margins": -0.892695426940918, + "rewards/rejected": -16.471540451049805, + "step": 2075 + }, + { + "epoch": 0.07010684552900334, + "grad_norm": 35.210113525390625, + "learning_rate": 7.010448264239973e-07, + "logits/chosen": -0.052232611924409866, + "logits/rejected": -0.19137360155582428, + "logps/chosen": -1.8225181102752686, + "logps/rejected": -1.6726758480072021, + "loss": 4.59, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.225181579589844, + "rewards/margins": -1.49842369556427, + "rewards/rejected": -16.726757049560547, + "step": 2080 + }, + { + "epoch": 0.07027537159998652, + "grad_norm": 22.082977294921875, + "learning_rate": 7.027300303336703e-07, + "logits/chosen": -0.1036457046866417, + "logits/rejected": -0.22812290489673615, + "logps/chosen": -1.4879047870635986, + "logps/rejected": -1.4071712493896484, + "loss": 3.868, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.879046440124512, + "rewards/margins": -0.8073347210884094, + "rewards/rejected": -14.0717134475708, + "step": 2085 + }, + { + "epoch": 0.0704438976709697, + "grad_norm": 18.9871883392334, + "learning_rate": 7.044152342433433e-07, + "logits/chosen": -0.5369516015052795, + "logits/rejected": -0.5570724606513977, + "logps/chosen": -1.5926361083984375, + "logps/rejected": -1.8241342306137085, + "loss": 3.1885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.926362991333008, + "rewards/margins": 2.3149805068969727, + "rewards/rejected": -18.241342544555664, + "step": 2090 + }, + { + "epoch": 0.07061242374195288, + "grad_norm": 19.876121520996094, + "learning_rate": 7.061004381530165e-07, + "logits/chosen": -0.648863673210144, + "logits/rejected": -0.6402491331100464, + "logps/chosen": -1.9981971979141235, + "logps/rejected": -1.8780739307403564, + "loss": 4.3948, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.98197364807129, + "rewards/margins": -1.2012332677841187, + "rewards/rejected": -18.780738830566406, + "step": 2095 + }, + { + "epoch": 0.07078094981293606, + "grad_norm": 21.34756851196289, + "learning_rate": 7.077856420626896e-07, + "logits/chosen": -0.4101831316947937, + "logits/rejected": -0.41583624482154846, + "logps/chosen": -1.7545570135116577, + "logps/rejected": -1.691300392150879, + "loss": 3.6937, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.54557228088379, + "rewards/margins": -0.6325671076774597, + "rewards/rejected": -16.91300392150879, + "step": 2100 + }, + { + "epoch": 0.07094947588391924, + "grad_norm": 30.001667022705078, + "learning_rate": 7.094708459723626e-07, + "logits/chosen": -0.3208427131175995, + "logits/rejected": -0.16100385785102844, + "logps/chosen": -1.696240782737732, + "logps/rejected": -1.7646242380142212, + "loss": 2.6131, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.9624080657959, + "rewards/margins": 0.6838338971138, + "rewards/rejected": -17.646244049072266, + "step": 2105 + }, + { + "epoch": 0.07111800195490242, + "grad_norm": 35.3121337890625, + "learning_rate": 7.111560498820357e-07, + "logits/chosen": -0.6093899607658386, + "logits/rejected": -0.5890295505523682, + "logps/chosen": -1.7065532207489014, + "logps/rejected": -1.719430685043335, + "loss": 3.4051, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.065532684326172, + "rewards/margins": 0.12877540290355682, + "rewards/rejected": -17.19430923461914, + "step": 2110 + }, + { + "epoch": 0.0712865280258856, + "grad_norm": 22.868438720703125, + "learning_rate": 7.128412537917088e-07, + "logits/chosen": -0.7410895824432373, + "logits/rejected": -0.7691564559936523, + "logps/chosen": -1.4438047409057617, + "logps/rejected": -1.487786054611206, + "loss": 2.9583, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.4380464553833, + "rewards/margins": 0.4398147463798523, + "rewards/rejected": -14.877861022949219, + "step": 2115 + }, + { + "epoch": 0.07145505409686878, + "grad_norm": 30.30385971069336, + "learning_rate": 7.145264577013819e-07, + "logits/chosen": -0.4199215769767761, + "logits/rejected": -0.35399970412254333, + "logps/chosen": -1.6688213348388672, + "logps/rejected": -1.6876872777938843, + "loss": 3.0365, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.688213348388672, + "rewards/margins": 0.18865904211997986, + "rewards/rejected": -16.876873016357422, + "step": 2120 + }, + { + "epoch": 0.07162358016785196, + "grad_norm": 20.555082321166992, + "learning_rate": 7.162116616110549e-07, + "logits/chosen": -0.33508139848709106, + "logits/rejected": -0.26377108693122864, + "logps/chosen": -1.7805957794189453, + "logps/rejected": -1.8432705402374268, + "loss": 2.5923, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.805957794189453, + "rewards/margins": 0.6267482042312622, + "rewards/rejected": -18.43270492553711, + "step": 2125 + }, + { + "epoch": 0.07179210623883515, + "grad_norm": 22.803489685058594, + "learning_rate": 7.178968655207279e-07, + "logits/chosen": -0.7927559018135071, + "logits/rejected": -0.7616699934005737, + "logps/chosen": -1.4907985925674438, + "logps/rejected": -1.5001600980758667, + "loss": 2.9985, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.907986640930176, + "rewards/margins": 0.09361562877893448, + "rewards/rejected": -15.001602172851562, + "step": 2130 + }, + { + "epoch": 0.07196063230981833, + "grad_norm": 31.1895751953125, + "learning_rate": 7.19582069430401e-07, + "logits/chosen": -0.5055627822875977, + "logits/rejected": -0.5777538418769836, + "logps/chosen": -2.0089962482452393, + "logps/rejected": -2.064162492752075, + "loss": 2.9047, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.0899658203125, + "rewards/margins": 0.5516608953475952, + "rewards/rejected": -20.64162254333496, + "step": 2135 + }, + { + "epoch": 0.07212915838080151, + "grad_norm": 21.80877113342285, + "learning_rate": 7.21267273340074e-07, + "logits/chosen": -0.8962277173995972, + "logits/rejected": -1.062765121459961, + "logps/chosen": -1.6682827472686768, + "logps/rejected": -1.6122701168060303, + "loss": 3.6308, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.68282699584961, + "rewards/margins": -0.5601242184638977, + "rewards/rejected": -16.12270164489746, + "step": 2140 + }, + { + "epoch": 0.0722976844517847, + "grad_norm": 39.813602447509766, + "learning_rate": 7.229524772497472e-07, + "logits/chosen": -0.6865358352661133, + "logits/rejected": -0.6879181265830994, + "logps/chosen": -1.5924547910690308, + "logps/rejected": -1.5645772218704224, + "loss": 3.3997, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.924548149108887, + "rewards/margins": -0.2787768244743347, + "rewards/rejected": -15.645771980285645, + "step": 2145 + }, + { + "epoch": 0.07246621052276787, + "grad_norm": 34.804351806640625, + "learning_rate": 7.246376811594203e-07, + "logits/chosen": -0.37989914417266846, + "logits/rejected": -0.31740498542785645, + "logps/chosen": -2.073552370071411, + "logps/rejected": -2.0728116035461426, + "loss": 3.5797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.735519409179688, + "rewards/margins": -0.0074065206572413445, + "rewards/rejected": -20.72811508178711, + "step": 2150 + }, + { + "epoch": 0.07263473659375105, + "grad_norm": 32.999820709228516, + "learning_rate": 7.263228850690934e-07, + "logits/chosen": -0.4067623019218445, + "logits/rejected": -0.2200227677822113, + "logps/chosen": -1.6104068756103516, + "logps/rejected": -1.7444422245025635, + "loss": 2.4908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.104068756103516, + "rewards/margins": 1.3403522968292236, + "rewards/rejected": -17.444419860839844, + "step": 2155 + }, + { + "epoch": 0.07280326266473423, + "grad_norm": 21.049962997436523, + "learning_rate": 7.280080889787664e-07, + "logits/chosen": -0.9572398066520691, + "logits/rejected": -0.894806981086731, + "logps/chosen": -1.5985901355743408, + "logps/rejected": -1.6197038888931274, + "loss": 2.984, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.985898971557617, + "rewards/margins": 0.2111394852399826, + "rewards/rejected": -16.197040557861328, + "step": 2160 + }, + { + "epoch": 0.07297178873571741, + "grad_norm": 19.7977294921875, + "learning_rate": 7.296932928884395e-07, + "logits/chosen": -0.7495409846305847, + "logits/rejected": -0.7707471251487732, + "logps/chosen": -1.5134512186050415, + "logps/rejected": -1.604833960533142, + "loss": 2.4129, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.134511947631836, + "rewards/margins": 0.9138285517692566, + "rewards/rejected": -16.048341751098633, + "step": 2165 + }, + { + "epoch": 0.0731403148067006, + "grad_norm": 26.598833084106445, + "learning_rate": 7.313784967981126e-07, + "logits/chosen": -0.4947318434715271, + "logits/rejected": -0.4180319905281067, + "logps/chosen": -1.8544307947158813, + "logps/rejected": -1.9303770065307617, + "loss": 2.6067, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.5443058013916, + "rewards/margins": 0.7594637870788574, + "rewards/rejected": -19.303770065307617, + "step": 2170 + }, + { + "epoch": 0.07330884087768377, + "grad_norm": 22.23002052307129, + "learning_rate": 7.330637007077856e-07, + "logits/chosen": -0.3323080837726593, + "logits/rejected": -0.32338953018188477, + "logps/chosen": -1.6461750268936157, + "logps/rejected": -1.8945128917694092, + "loss": 1.8916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.461750030517578, + "rewards/margins": 2.483377695083618, + "rewards/rejected": -18.945127487182617, + "step": 2175 + }, + { + "epoch": 0.07347736694866695, + "grad_norm": 28.355215072631836, + "learning_rate": 7.347489046174586e-07, + "logits/chosen": -0.5796966552734375, + "logits/rejected": -0.5349574089050293, + "logps/chosen": -1.4876244068145752, + "logps/rejected": -1.5375049114227295, + "loss": 2.7552, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.876243591308594, + "rewards/margins": 0.4988061487674713, + "rewards/rejected": -15.375048637390137, + "step": 2180 + }, + { + "epoch": 0.07364589301965015, + "grad_norm": 27.98024559020996, + "learning_rate": 7.364341085271317e-07, + "logits/chosen": -0.6341951489448547, + "logits/rejected": -0.6173728108406067, + "logps/chosen": -1.6965017318725586, + "logps/rejected": -1.7058916091918945, + "loss": 3.2173, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.965015411376953, + "rewards/margins": 0.09389963001012802, + "rewards/rejected": -17.058916091918945, + "step": 2185 + }, + { + "epoch": 0.07381441909063333, + "grad_norm": 17.731689453125, + "learning_rate": 7.381193124368048e-07, + "logits/chosen": -0.5357145667076111, + "logits/rejected": -0.5386208295822144, + "logps/chosen": -1.4506524801254272, + "logps/rejected": -1.4157730340957642, + "loss": 3.4912, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -14.506525039672852, + "rewards/margins": -0.34879380464553833, + "rewards/rejected": -14.157732009887695, + "step": 2190 + }, + { + "epoch": 0.0739829451616165, + "grad_norm": 26.12714958190918, + "learning_rate": 7.398045163464779e-07, + "logits/chosen": -0.2705962061882019, + "logits/rejected": -0.18878893554210663, + "logps/chosen": -1.7146854400634766, + "logps/rejected": -1.6282618045806885, + "loss": 3.967, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.146854400634766, + "rewards/margins": -0.8642366528511047, + "rewards/rejected": -16.282617568969727, + "step": 2195 + }, + { + "epoch": 0.07415147123259969, + "grad_norm": 21.842453002929688, + "learning_rate": 7.41489720256151e-07, + "logits/chosen": -0.1653115302324295, + "logits/rejected": -0.073493592441082, + "logps/chosen": -1.6875333786010742, + "logps/rejected": -1.7379028797149658, + "loss": 2.7714, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.875333786010742, + "rewards/margins": 0.5036935806274414, + "rewards/rejected": -17.3790283203125, + "step": 2200 + }, + { + "epoch": 0.07431999730358287, + "grad_norm": 17.958499908447266, + "learning_rate": 7.431749241658241e-07, + "logits/chosen": -0.4340514540672302, + "logits/rejected": -0.4271577000617981, + "logps/chosen": -1.8904931545257568, + "logps/rejected": -2.2419497966766357, + "loss": 1.6577, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.90493392944336, + "rewards/margins": 3.5145657062530518, + "rewards/rejected": -22.419498443603516, + "step": 2205 + }, + { + "epoch": 0.07448852337456605, + "grad_norm": 21.014781951904297, + "learning_rate": 7.448601280754972e-07, + "logits/chosen": -0.85997474193573, + "logits/rejected": -0.6709158420562744, + "logps/chosen": -1.753631591796875, + "logps/rejected": -1.5993316173553467, + "loss": 4.6344, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.53631591796875, + "rewards/margins": -1.5429986715316772, + "rewards/rejected": -15.993316650390625, + "step": 2210 + }, + { + "epoch": 0.07465704944554923, + "grad_norm": 17.678613662719727, + "learning_rate": 7.465453319851702e-07, + "logits/chosen": -0.5139130353927612, + "logits/rejected": -0.6146605610847473, + "logps/chosen": -1.7065128087997437, + "logps/rejected": -1.9963042736053467, + "loss": 2.515, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.06513023376465, + "rewards/margins": 2.8979153633117676, + "rewards/rejected": -19.963045120239258, + "step": 2215 + }, + { + "epoch": 0.0748255755165324, + "grad_norm": 24.807268142700195, + "learning_rate": 7.482305358948432e-07, + "logits/chosen": -0.3597901463508606, + "logits/rejected": -0.4648984372615814, + "logps/chosen": -1.699566125869751, + "logps/rejected": -1.7271579504013062, + "loss": 2.9498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.99566078186035, + "rewards/margins": 0.2759190499782562, + "rewards/rejected": -17.27157974243164, + "step": 2220 + }, + { + "epoch": 0.07499410158751559, + "grad_norm": 37.47063446044922, + "learning_rate": 7.499157398045163e-07, + "logits/chosen": 0.019718538969755173, + "logits/rejected": 0.006109035108238459, + "logps/chosen": -2.5828468799591064, + "logps/rejected": -2.2230963706970215, + "loss": 6.7239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.828466415405273, + "rewards/margins": -3.597503662109375, + "rewards/rejected": -22.23096466064453, + "step": 2225 + }, + { + "epoch": 0.07516262765849877, + "grad_norm": 32.3070182800293, + "learning_rate": 7.516009437141893e-07, + "logits/chosen": -0.6688810586929321, + "logits/rejected": -0.5590790510177612, + "logps/chosen": -1.5791361331939697, + "logps/rejected": -1.5508191585540771, + "loss": 3.4132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.791361808776855, + "rewards/margins": -0.28317031264305115, + "rewards/rejected": -15.508191108703613, + "step": 2230 + }, + { + "epoch": 0.07533115372948194, + "grad_norm": 12.327166557312012, + "learning_rate": 7.532861476238624e-07, + "logits/chosen": -0.2609061598777771, + "logits/rejected": -0.25732770562171936, + "logps/chosen": -2.010161876678467, + "logps/rejected": -1.8172937631607056, + "loss": 5.3264, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.10161590576172, + "rewards/margins": -1.9286785125732422, + "rewards/rejected": -18.172937393188477, + "step": 2235 + }, + { + "epoch": 0.07549967980046514, + "grad_norm": 17.71880340576172, + "learning_rate": 7.549713515335355e-07, + "logits/chosen": -0.7396351099014282, + "logits/rejected": -0.7504865527153015, + "logps/chosen": -1.7640094757080078, + "logps/rejected": -1.7561099529266357, + "loss": 3.2278, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.640094757080078, + "rewards/margins": -0.07899437099695206, + "rewards/rejected": -17.561100006103516, + "step": 2240 + }, + { + "epoch": 0.07566820587144832, + "grad_norm": 12.440887451171875, + "learning_rate": 7.566565554432086e-07, + "logits/chosen": -0.15917307138442993, + "logits/rejected": -0.1628536731004715, + "logps/chosen": -2.0435426235198975, + "logps/rejected": -2.197415590286255, + "loss": 2.1454, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.435426712036133, + "rewards/margins": 1.5387285947799683, + "rewards/rejected": -21.97415542602539, + "step": 2245 + }, + { + "epoch": 0.0758367319424315, + "grad_norm": 24.053064346313477, + "learning_rate": 7.583417593528817e-07, + "logits/chosen": -0.79632967710495, + "logits/rejected": -0.5563432574272156, + "logps/chosen": -1.7878577709197998, + "logps/rejected": -1.9376850128173828, + "loss": 1.9539, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.878576278686523, + "rewards/margins": 1.498272180557251, + "rewards/rejected": -19.376850128173828, + "step": 2250 + }, + { + "epoch": 0.07600525801341468, + "grad_norm": 25.67947006225586, + "learning_rate": 7.600269632625548e-07, + "logits/chosen": -0.7355546951293945, + "logits/rejected": -0.5501964688301086, + "logps/chosen": -1.9016857147216797, + "logps/rejected": -1.9401962757110596, + "loss": 2.7661, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.016857147216797, + "rewards/margins": 0.3851049542427063, + "rewards/rejected": -19.401962280273438, + "step": 2255 + }, + { + "epoch": 0.07617378408439786, + "grad_norm": 48.06303405761719, + "learning_rate": 7.617121671722279e-07, + "logits/chosen": -0.40767064690589905, + "logits/rejected": -0.3229612708091736, + "logps/chosen": -2.4324240684509277, + "logps/rejected": -2.4284210205078125, + "loss": 3.1582, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.324243545532227, + "rewards/margins": -0.04003305360674858, + "rewards/rejected": -24.284210205078125, + "step": 2260 + }, + { + "epoch": 0.07634231015538104, + "grad_norm": 22.984275817871094, + "learning_rate": 7.633973710819008e-07, + "logits/chosen": -0.2127332240343094, + "logits/rejected": -0.05123148113489151, + "logps/chosen": -1.8632936477661133, + "logps/rejected": -1.8484798669815063, + "loss": 3.2724, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.632936477661133, + "rewards/margins": -0.1481371819972992, + "rewards/rejected": -18.484798431396484, + "step": 2265 + }, + { + "epoch": 0.07651083622636422, + "grad_norm": 226.9107208251953, + "learning_rate": 7.650825749915739e-07, + "logits/chosen": -0.2728140652179718, + "logits/rejected": -0.3592303395271301, + "logps/chosen": -2.038437604904175, + "logps/rejected": -2.0359554290771484, + "loss": 5.825, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.38437843322754, + "rewards/margins": -0.024822425097227097, + "rewards/rejected": -20.359554290771484, + "step": 2270 + }, + { + "epoch": 0.0766793622973474, + "grad_norm": 32.962181091308594, + "learning_rate": 7.66767778901247e-07, + "logits/chosen": -0.39705973863601685, + "logits/rejected": -0.31574827432632446, + "logps/chosen": -1.697000503540039, + "logps/rejected": -1.711599349975586, + "loss": 3.405, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.97000503540039, + "rewards/margins": 0.1459902822971344, + "rewards/rejected": -17.11599349975586, + "step": 2275 + }, + { + "epoch": 0.07684788836833058, + "grad_norm": 23.07267951965332, + "learning_rate": 7.684529828109201e-07, + "logits/chosen": -0.17579427361488342, + "logits/rejected": -0.27825185656547546, + "logps/chosen": -2.1279714107513428, + "logps/rejected": -1.9978828430175781, + "loss": 4.3433, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -21.279714584350586, + "rewards/margins": -1.3008840084075928, + "rewards/rejected": -19.978830337524414, + "step": 2280 + }, + { + "epoch": 0.07701641443931376, + "grad_norm": 35.53074264526367, + "learning_rate": 7.701381867205931e-07, + "logits/chosen": -0.4603014588356018, + "logits/rejected": -0.49287882447242737, + "logps/chosen": -2.0541510581970215, + "logps/rejected": -2.232800006866455, + "loss": 3.401, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.5415096282959, + "rewards/margins": 1.7864891290664673, + "rewards/rejected": -22.327999114990234, + "step": 2285 + }, + { + "epoch": 0.07718494051029694, + "grad_norm": 27.62151527404785, + "learning_rate": 7.718233906302662e-07, + "logits/chosen": -0.2474125325679779, + "logits/rejected": -0.15923914313316345, + "logps/chosen": -1.755563735961914, + "logps/rejected": -1.8253600597381592, + "loss": 2.6478, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.555635452270508, + "rewards/margins": 0.6979645490646362, + "rewards/rejected": -18.253599166870117, + "step": 2290 + }, + { + "epoch": 0.07735346658128013, + "grad_norm": 34.95461654663086, + "learning_rate": 7.735085945399393e-07, + "logits/chosen": -0.6580768823623657, + "logits/rejected": -0.7368906140327454, + "logps/chosen": -1.8308817148208618, + "logps/rejected": -1.9212795495986938, + "loss": 2.3617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.30881690979004, + "rewards/margins": 0.90397709608078, + "rewards/rejected": -19.212793350219727, + "step": 2295 + }, + { + "epoch": 0.07752199265226331, + "grad_norm": 16.449514389038086, + "learning_rate": 7.751937984496125e-07, + "logits/chosen": -0.49866050481796265, + "logits/rejected": -0.44423356652259827, + "logps/chosen": -1.5588819980621338, + "logps/rejected": -1.574965238571167, + "loss": 3.1077, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.58881950378418, + "rewards/margins": 0.16083398461341858, + "rewards/rejected": -15.749651908874512, + "step": 2300 + }, + { + "epoch": 0.07769051872324649, + "grad_norm": 33.75349807739258, + "learning_rate": 7.768790023592854e-07, + "logits/chosen": -0.35698094964027405, + "logits/rejected": -0.31498563289642334, + "logps/chosen": -1.8009259700775146, + "logps/rejected": -1.8306411504745483, + "loss": 3.5954, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.009258270263672, + "rewards/margins": 0.29715123772621155, + "rewards/rejected": -18.306411743164062, + "step": 2305 + }, + { + "epoch": 0.07785904479422967, + "grad_norm": 27.460735321044922, + "learning_rate": 7.785642062689585e-07, + "logits/chosen": -0.5881357789039612, + "logits/rejected": -0.5689517855644226, + "logps/chosen": -1.458878755569458, + "logps/rejected": -1.4286056756973267, + "loss": 3.3968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.588786125183105, + "rewards/margins": -0.30273109674453735, + "rewards/rejected": -14.286054611206055, + "step": 2310 + }, + { + "epoch": 0.07802757086521285, + "grad_norm": 9.67859172821045, + "learning_rate": 7.802494101786316e-07, + "logits/chosen": -0.3839682340621948, + "logits/rejected": -0.4018523097038269, + "logps/chosen": -1.3250752687454224, + "logps/rejected": -1.469162940979004, + "loss": 2.5192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -13.250753402709961, + "rewards/margins": 1.4408762454986572, + "rewards/rejected": -14.691629409790039, + "step": 2315 + }, + { + "epoch": 0.07819609693619603, + "grad_norm": 53.11182403564453, + "learning_rate": 7.819346140883046e-07, + "logits/chosen": -0.25247180461883545, + "logits/rejected": -0.2038680762052536, + "logps/chosen": -2.0797410011291504, + "logps/rejected": -1.9495025873184204, + "loss": 4.3848, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.79741096496582, + "rewards/margins": -1.302383542060852, + "rewards/rejected": -19.495027542114258, + "step": 2320 + }, + { + "epoch": 0.07836462300717921, + "grad_norm": 14.444879531860352, + "learning_rate": 7.836198179979777e-07, + "logits/chosen": -0.5390850305557251, + "logits/rejected": -0.5718969106674194, + "logps/chosen": -2.016746759414673, + "logps/rejected": -1.9924167394638062, + "loss": 3.7065, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.16746711730957, + "rewards/margins": -0.24329786002635956, + "rewards/rejected": -19.92416763305664, + "step": 2325 + }, + { + "epoch": 0.07853314907816239, + "grad_norm": 24.698184967041016, + "learning_rate": 7.853050219076508e-07, + "logits/chosen": -0.4092784523963928, + "logits/rejected": -0.1485476940870285, + "logps/chosen": -1.8159494400024414, + "logps/rejected": -1.8311035633087158, + "loss": 3.4358, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.159494400024414, + "rewards/margins": 0.15154066681861877, + "rewards/rejected": -18.31103515625, + "step": 2330 + }, + { + "epoch": 0.07870167514914557, + "grad_norm": 28.049543380737305, + "learning_rate": 7.869902258173239e-07, + "logits/chosen": -0.4887131154537201, + "logits/rejected": -0.2594299614429474, + "logps/chosen": -1.7859032154083252, + "logps/rejected": -1.8176990747451782, + "loss": 2.9338, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.859033584594727, + "rewards/margins": 0.3179585337638855, + "rewards/rejected": -18.176990509033203, + "step": 2335 + }, + { + "epoch": 0.07887020122012875, + "grad_norm": 24.492351531982422, + "learning_rate": 7.886754297269969e-07, + "logits/chosen": -0.21818354725837708, + "logits/rejected": -0.2571013271808624, + "logps/chosen": -1.9694633483886719, + "logps/rejected": -2.051032781600952, + "loss": 2.5502, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.694631576538086, + "rewards/margins": 0.8156954050064087, + "rewards/rejected": -20.510326385498047, + "step": 2340 + }, + { + "epoch": 0.07903872729111193, + "grad_norm": 26.16986656188965, + "learning_rate": 7.9036063363667e-07, + "logits/chosen": -0.6751791834831238, + "logits/rejected": -0.6740162372589111, + "logps/chosen": -1.5954325199127197, + "logps/rejected": -1.6436774730682373, + "loss": 2.6107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.954325675964355, + "rewards/margins": 0.482450008392334, + "rewards/rejected": -16.43677520751953, + "step": 2345 + }, + { + "epoch": 0.07920725336209512, + "grad_norm": 26.028491973876953, + "learning_rate": 7.920458375463431e-07, + "logits/chosen": -0.6816641092300415, + "logits/rejected": -0.6692745089530945, + "logps/chosen": -1.7431175708770752, + "logps/rejected": -1.6779407262802124, + "loss": 3.7004, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.43117332458496, + "rewards/margins": -0.6517672538757324, + "rewards/rejected": -16.779407501220703, + "step": 2350 + }, + { + "epoch": 0.0793757794330783, + "grad_norm": 17.893020629882812, + "learning_rate": 7.937310414560161e-07, + "logits/chosen": -0.3664388060569763, + "logits/rejected": -0.5129767656326294, + "logps/chosen": -1.8353245258331299, + "logps/rejected": -1.8356059789657593, + "loss": 3.1945, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.35324478149414, + "rewards/margins": 0.0028172493912279606, + "rewards/rejected": -18.356060028076172, + "step": 2355 + }, + { + "epoch": 0.07954430550406148, + "grad_norm": 22.861780166625977, + "learning_rate": 7.954162453656892e-07, + "logits/chosen": -0.3994244337081909, + "logits/rejected": -0.5220701098442078, + "logps/chosen": -1.808762550354004, + "logps/rejected": -1.820603609085083, + "loss": 3.1259, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.08762550354004, + "rewards/margins": 0.11841030418872833, + "rewards/rejected": -18.206035614013672, + "step": 2360 + }, + { + "epoch": 0.07971283157504466, + "grad_norm": 31.54043960571289, + "learning_rate": 7.971014492753623e-07, + "logits/chosen": -0.059911616146564484, + "logits/rejected": -0.0022819482255727053, + "logps/chosen": -2.044661521911621, + "logps/rejected": -2.0498645305633545, + "loss": 3.2518, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.44661521911621, + "rewards/margins": 0.05202770233154297, + "rewards/rejected": -20.498641967773438, + "step": 2365 + }, + { + "epoch": 0.07988135764602784, + "grad_norm": 18.20899772644043, + "learning_rate": 7.987866531850354e-07, + "logits/chosen": -0.6337564587593079, + "logits/rejected": -0.6478831171989441, + "logps/chosen": -1.5746173858642578, + "logps/rejected": -1.8139946460723877, + "loss": 1.8241, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.746172904968262, + "rewards/margins": 2.393773317337036, + "rewards/rejected": -18.13994789123535, + "step": 2370 + }, + { + "epoch": 0.08004988371701102, + "grad_norm": 25.54546356201172, + "learning_rate": 8.004718570947084e-07, + "logits/chosen": -0.7622129321098328, + "logits/rejected": -0.7510684728622437, + "logps/chosen": -1.621252417564392, + "logps/rejected": -1.6405149698257446, + "loss": 3.0707, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.212522506713867, + "rewards/margins": 0.19262532889842987, + "rewards/rejected": -16.405147552490234, + "step": 2375 + }, + { + "epoch": 0.0802184097879942, + "grad_norm": 22.381973266601562, + "learning_rate": 8.021570610043815e-07, + "logits/chosen": -0.29075971245765686, + "logits/rejected": -0.2355223149061203, + "logps/chosen": -1.7290035486221313, + "logps/rejected": -1.7951444387435913, + "loss": 2.585, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.290037155151367, + "rewards/margins": 0.6614087224006653, + "rewards/rejected": -17.951444625854492, + "step": 2380 + }, + { + "epoch": 0.08038693585897738, + "grad_norm": 15.249910354614258, + "learning_rate": 8.038422649140546e-07, + "logits/chosen": -0.6698895692825317, + "logits/rejected": -0.5626107454299927, + "logps/chosen": -1.3483240604400635, + "logps/rejected": -1.4742333889007568, + "loss": 2.2219, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -13.483240127563477, + "rewards/margins": 1.259093999862671, + "rewards/rejected": -14.742334365844727, + "step": 2385 + }, + { + "epoch": 0.08055546192996056, + "grad_norm": 9.537259101867676, + "learning_rate": 8.055274688237276e-07, + "logits/chosen": -0.38138988614082336, + "logits/rejected": -0.39942893385887146, + "logps/chosen": -2.068192958831787, + "logps/rejected": -2.230304479598999, + "loss": 2.7261, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.681930541992188, + "rewards/margins": 1.6211124658584595, + "rewards/rejected": -22.30304527282715, + "step": 2390 + }, + { + "epoch": 0.08072398800094374, + "grad_norm": 42.501583099365234, + "learning_rate": 8.072126727334006e-07, + "logits/chosen": -0.6858797669410706, + "logits/rejected": -0.6615114808082581, + "logps/chosen": -1.8533143997192383, + "logps/rejected": -1.8078025579452515, + "loss": 3.6226, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.533143997192383, + "rewards/margins": -0.4551170766353607, + "rewards/rejected": -18.078027725219727, + "step": 2395 + }, + { + "epoch": 0.08089251407192692, + "grad_norm": 24.91890525817871, + "learning_rate": 8.088978766430738e-07, + "logits/chosen": -0.36171025037765503, + "logits/rejected": -0.43480342626571655, + "logps/chosen": -1.681675910949707, + "logps/rejected": -1.6178693771362305, + "loss": 3.6924, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.81675910949707, + "rewards/margins": -0.6380661129951477, + "rewards/rejected": -16.178691864013672, + "step": 2400 + }, + { + "epoch": 0.08089251407192692, + "eval_logits/chosen": -0.7006931900978088, + "eval_logits/rejected": -0.703230619430542, + "eval_logps/chosen": -1.6610907316207886, + "eval_logps/rejected": -1.6590111255645752, + "eval_loss": 3.4272820949554443, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.61090850830078, + "eval_rewards/margins": -0.020797576755285263, + "eval_rewards/rejected": -16.590110778808594, + "eval_runtime": 12.8899, + "eval_samples_per_second": 7.758, + "eval_steps_per_second": 1.939, + "step": 2400 + }, + { + "epoch": 0.0810610401429101, + "grad_norm": 23.806867599487305, + "learning_rate": 8.105830805527469e-07, + "logits/chosen": -0.5742610692977905, + "logits/rejected": -0.6479157209396362, + "logps/chosen": -1.6960289478302002, + "logps/rejected": -1.9552192687988281, + "loss": 2.0351, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.960285186767578, + "rewards/margins": 2.591905355453491, + "rewards/rejected": -19.55219268798828, + "step": 2405 + }, + { + "epoch": 0.0812295662138933, + "grad_norm": 18.037389755249023, + "learning_rate": 8.122682844624199e-07, + "logits/chosen": -0.6523152589797974, + "logits/rejected": -0.6970680356025696, + "logps/chosen": -1.4654817581176758, + "logps/rejected": -1.5350319147109985, + "loss": 2.6353, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.654818534851074, + "rewards/margins": 0.6954997777938843, + "rewards/rejected": -15.350318908691406, + "step": 2410 + }, + { + "epoch": 0.08139809228487647, + "grad_norm": 16.91983413696289, + "learning_rate": 8.13953488372093e-07, + "logits/chosen": -0.35806578397750854, + "logits/rejected": -0.34750303626060486, + "logps/chosen": -1.6740829944610596, + "logps/rejected": -1.7585744857788086, + "loss": 2.3532, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.740829467773438, + "rewards/margins": 0.8449147343635559, + "rewards/rejected": -17.585744857788086, + "step": 2415 + }, + { + "epoch": 0.08156661835585965, + "grad_norm": 40.20941162109375, + "learning_rate": 8.156386922817661e-07, + "logits/chosen": -0.37193426489830017, + "logits/rejected": -0.3277415633201599, + "logps/chosen": -1.804286003112793, + "logps/rejected": -1.8441333770751953, + "loss": 2.9108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.042861938476562, + "rewards/margins": 0.398474782705307, + "rewards/rejected": -18.441335678100586, + "step": 2420 + }, + { + "epoch": 0.08173514442684283, + "grad_norm": 28.02414321899414, + "learning_rate": 8.173238961914391e-07, + "logits/chosen": -0.6131106615066528, + "logits/rejected": -0.5054728984832764, + "logps/chosen": -1.6783838272094727, + "logps/rejected": -1.5606751441955566, + "loss": 4.2124, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.78384017944336, + "rewards/margins": -1.177086591720581, + "rewards/rejected": -15.606752395629883, + "step": 2425 + }, + { + "epoch": 0.08190367049782601, + "grad_norm": 29.80112648010254, + "learning_rate": 8.190091001011122e-07, + "logits/chosen": -0.5313662886619568, + "logits/rejected": -0.38392385840415955, + "logps/chosen": -1.7252247333526611, + "logps/rejected": -1.7433083057403564, + "loss": 3.2085, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.252246856689453, + "rewards/margins": 0.1808355301618576, + "rewards/rejected": -17.43308448791504, + "step": 2430 + }, + { + "epoch": 0.0820721965688092, + "grad_norm": 53.98869705200195, + "learning_rate": 8.206943040107853e-07, + "logits/chosen": -0.26283207535743713, + "logits/rejected": -0.2831880748271942, + "logps/chosen": -1.9631812572479248, + "logps/rejected": -2.0056893825531006, + "loss": 3.3863, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.631813049316406, + "rewards/margins": 0.4250810742378235, + "rewards/rejected": -20.05689239501953, + "step": 2435 + }, + { + "epoch": 0.08224072263979237, + "grad_norm": 50.21210861206055, + "learning_rate": 8.223795079204583e-07, + "logits/chosen": -0.20228877663612366, + "logits/rejected": -0.15569323301315308, + "logps/chosen": -1.7776590585708618, + "logps/rejected": -1.6473630666732788, + "loss": 4.4394, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.776588439941406, + "rewards/margins": -1.3029576539993286, + "rewards/rejected": -16.473630905151367, + "step": 2440 + }, + { + "epoch": 0.08240924871077555, + "grad_norm": 21.28071403503418, + "learning_rate": 8.240647118301313e-07, + "logits/chosen": 0.09300395101308823, + "logits/rejected": 0.14108344912528992, + "logps/chosen": -2.510117530822754, + "logps/rejected": -2.6373324394226074, + "loss": 1.9585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.101173400878906, + "rewards/margins": 1.2721501588821411, + "rewards/rejected": -26.37332534790039, + "step": 2445 + }, + { + "epoch": 0.08257777478175873, + "grad_norm": 31.054330825805664, + "learning_rate": 8.257499157398045e-07, + "logits/chosen": -0.8290077447891235, + "logits/rejected": -0.6386555433273315, + "logps/chosen": -1.528441071510315, + "logps/rejected": -1.553753137588501, + "loss": 2.8956, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.284411430358887, + "rewards/margins": 0.253119558095932, + "rewards/rejected": -15.537530899047852, + "step": 2450 + }, + { + "epoch": 0.08274630085274191, + "grad_norm": 21.822750091552734, + "learning_rate": 8.274351196494776e-07, + "logits/chosen": -0.4548490643501282, + "logits/rejected": -0.5014796257019043, + "logps/chosen": -1.5887835025787354, + "logps/rejected": -1.537469506263733, + "loss": 3.5735, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.887834548950195, + "rewards/margins": -0.5131396055221558, + "rewards/rejected": -15.37469482421875, + "step": 2455 + }, + { + "epoch": 0.0829148269237251, + "grad_norm": 31.784954071044922, + "learning_rate": 8.291203235591507e-07, + "logits/chosen": -0.22424478828907013, + "logits/rejected": -0.25032782554626465, + "logps/chosen": -1.8386729955673218, + "logps/rejected": -1.7788835763931274, + "loss": 3.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.386730194091797, + "rewards/margins": -0.5978950262069702, + "rewards/rejected": -17.788835525512695, + "step": 2460 + }, + { + "epoch": 0.08308335299470829, + "grad_norm": 16.605253219604492, + "learning_rate": 8.308055274688237e-07, + "logits/chosen": -0.3429441452026367, + "logits/rejected": -0.3958896994590759, + "logps/chosen": -1.5618867874145508, + "logps/rejected": -1.6103063821792603, + "loss": 2.7523, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.618867874145508, + "rewards/margins": 0.4841957986354828, + "rewards/rejected": -16.103063583374023, + "step": 2465 + }, + { + "epoch": 0.08325187906569147, + "grad_norm": 23.05854606628418, + "learning_rate": 8.324907313784968e-07, + "logits/chosen": -0.5046225786209106, + "logits/rejected": -0.3628554344177246, + "logps/chosen": -1.6668421030044556, + "logps/rejected": -1.7868725061416626, + "loss": 3.2138, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.66842269897461, + "rewards/margins": 1.2003029584884644, + "rewards/rejected": -17.868724822998047, + "step": 2470 + }, + { + "epoch": 0.08342040513667465, + "grad_norm": 20.510766983032227, + "learning_rate": 8.341759352881699e-07, + "logits/chosen": -0.028436947613954544, + "logits/rejected": -0.0018680095672607422, + "logps/chosen": -1.6618999242782593, + "logps/rejected": -1.6381727457046509, + "loss": 3.3843, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.618999481201172, + "rewards/margins": -0.23727349936962128, + "rewards/rejected": -16.38172721862793, + "step": 2475 + }, + { + "epoch": 0.08358893120765783, + "grad_norm": 29.016969680786133, + "learning_rate": 8.358611391978428e-07, + "logits/chosen": -0.5356850624084473, + "logits/rejected": -0.5231121182441711, + "logps/chosen": -1.4784767627716064, + "logps/rejected": -1.6118614673614502, + "loss": 1.9429, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.784765243530273, + "rewards/margins": 1.333849310874939, + "rewards/rejected": -16.118616104125977, + "step": 2480 + }, + { + "epoch": 0.083757457278641, + "grad_norm": 24.966922760009766, + "learning_rate": 8.375463431075159e-07, + "logits/chosen": -0.5175787210464478, + "logits/rejected": -0.5980731248855591, + "logps/chosen": -1.721757173538208, + "logps/rejected": -1.6184993982315063, + "loss": 4.1047, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.217571258544922, + "rewards/margins": -1.0325804948806763, + "rewards/rejected": -16.184993743896484, + "step": 2485 + }, + { + "epoch": 0.08392598334962419, + "grad_norm": 23.180864334106445, + "learning_rate": 8.39231547017189e-07, + "logits/chosen": -0.14530567824840546, + "logits/rejected": -0.19354048371315002, + "logps/chosen": -2.2405307292938232, + "logps/rejected": -2.4060721397399902, + "loss": 2.3143, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.40530776977539, + "rewards/margins": 1.655413031578064, + "rewards/rejected": -24.060718536376953, + "step": 2490 + }, + { + "epoch": 0.08409450942060737, + "grad_norm": 10.979439735412598, + "learning_rate": 8.40916750926862e-07, + "logits/chosen": -0.44204673171043396, + "logits/rejected": -0.440899521112442, + "logps/chosen": -1.6555083990097046, + "logps/rejected": -1.7864110469818115, + "loss": 2.7175, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.555084228515625, + "rewards/margins": 1.309026837348938, + "rewards/rejected": -17.864110946655273, + "step": 2495 + }, + { + "epoch": 0.08426303549159055, + "grad_norm": 23.828073501586914, + "learning_rate": 8.426019548365352e-07, + "logits/chosen": -0.6088763475418091, + "logits/rejected": -0.49291929602622986, + "logps/chosen": -1.8682682514190674, + "logps/rejected": -1.9699296951293945, + "loss": 2.2446, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.68268394470215, + "rewards/margins": 1.01661217212677, + "rewards/rejected": -19.699295043945312, + "step": 2500 + }, + { + "epoch": 0.08443156156257373, + "grad_norm": 20.06534194946289, + "learning_rate": 8.442871587462083e-07, + "logits/chosen": -0.7816548347473145, + "logits/rejected": -0.6558118462562561, + "logps/chosen": -1.7117058038711548, + "logps/rejected": -1.7417329549789429, + "loss": 2.8461, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.11705780029297, + "rewards/margins": 0.30027294158935547, + "rewards/rejected": -17.417329788208008, + "step": 2505 + }, + { + "epoch": 0.0846000876335569, + "grad_norm": 14.00971794128418, + "learning_rate": 8.459723626558814e-07, + "logits/chosen": -0.3416903018951416, + "logits/rejected": -0.3518048822879791, + "logps/chosen": -1.8263881206512451, + "logps/rejected": -2.0474984645843506, + "loss": 2.6428, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.263883590698242, + "rewards/margins": 2.2111012935638428, + "rewards/rejected": -20.474987030029297, + "step": 2510 + }, + { + "epoch": 0.08476861370454009, + "grad_norm": 28.76343536376953, + "learning_rate": 8.476575665655544e-07, + "logits/chosen": -0.5365201234817505, + "logits/rejected": -0.3862677216529846, + "logps/chosen": -1.7432008981704712, + "logps/rejected": -1.638911485671997, + "loss": 4.1144, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.432008743286133, + "rewards/margins": -1.0428931713104248, + "rewards/rejected": -16.389114379882812, + "step": 2515 + }, + { + "epoch": 0.08493713977552328, + "grad_norm": 32.587223052978516, + "learning_rate": 8.493427704752275e-07, + "logits/chosen": -0.852696418762207, + "logits/rejected": -0.8922135233879089, + "logps/chosen": -1.7877833843231201, + "logps/rejected": -1.7858728170394897, + "loss": 3.1803, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.87783432006836, + "rewards/margins": -0.019106198102235794, + "rewards/rejected": -17.85873031616211, + "step": 2520 + }, + { + "epoch": 0.08510566584650646, + "grad_norm": 17.61363983154297, + "learning_rate": 8.510279743849005e-07, + "logits/chosen": -0.689334511756897, + "logits/rejected": -0.60210782289505, + "logps/chosen": -1.4714277982711792, + "logps/rejected": -1.419995903968811, + "loss": 3.6069, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.714277267456055, + "rewards/margins": -0.5143192410469055, + "rewards/rejected": -14.199956893920898, + "step": 2525 + }, + { + "epoch": 0.08527419191748964, + "grad_norm": 14.594451904296875, + "learning_rate": 8.527131782945736e-07, + "logits/chosen": -0.3180989623069763, + "logits/rejected": -0.17765101790428162, + "logps/chosen": -1.7926902770996094, + "logps/rejected": -1.7263141870498657, + "loss": 3.9356, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.926902770996094, + "rewards/margins": -0.6637606024742126, + "rewards/rejected": -17.263141632080078, + "step": 2530 + }, + { + "epoch": 0.08544271798847282, + "grad_norm": 34.54674530029297, + "learning_rate": 8.543983822042466e-07, + "logits/chosen": -0.6042408347129822, + "logits/rejected": -0.649804413318634, + "logps/chosen": -1.699530005455017, + "logps/rejected": -1.7469911575317383, + "loss": 2.9274, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.99530029296875, + "rewards/margins": 0.47461146116256714, + "rewards/rejected": -17.469911575317383, + "step": 2535 + }, + { + "epoch": 0.085611244059456, + "grad_norm": 27.885591506958008, + "learning_rate": 8.560835861139197e-07, + "logits/chosen": -0.45336025953292847, + "logits/rejected": -0.5436762571334839, + "logps/chosen": -1.7844282388687134, + "logps/rejected": -1.680147409439087, + "loss": 4.1847, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.844282150268555, + "rewards/margins": -1.0428078174591064, + "rewards/rejected": -16.80147361755371, + "step": 2540 + }, + { + "epoch": 0.08577977013043918, + "grad_norm": 25.23313331604004, + "learning_rate": 8.577687900235928e-07, + "logits/chosen": -0.7203700542449951, + "logits/rejected": -0.5824630260467529, + "logps/chosen": -1.8401525020599365, + "logps/rejected": -1.9509315490722656, + "loss": 2.2659, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.40152359008789, + "rewards/margins": 1.1077923774719238, + "rewards/rejected": -19.50931739807129, + "step": 2545 + }, + { + "epoch": 0.08594829620142236, + "grad_norm": 44.29021072387695, + "learning_rate": 8.594539939332659e-07, + "logits/chosen": -0.1677735149860382, + "logits/rejected": -0.12232518196105957, + "logps/chosen": -2.243950605392456, + "logps/rejected": -2.3047237396240234, + "loss": 2.9511, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.439504623413086, + "rewards/margins": 0.6077313423156738, + "rewards/rejected": -23.047237396240234, + "step": 2550 + }, + { + "epoch": 0.08611682227240554, + "grad_norm": 21.166263580322266, + "learning_rate": 8.61139197842939e-07, + "logits/chosen": -0.16569176316261292, + "logits/rejected": -0.1610005795955658, + "logps/chosen": -1.6874868869781494, + "logps/rejected": -1.731498122215271, + "loss": 2.7449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.87487030029297, + "rewards/margins": 0.44011297821998596, + "rewards/rejected": -17.314983367919922, + "step": 2555 + }, + { + "epoch": 0.08628534834338872, + "grad_norm": 60.169769287109375, + "learning_rate": 8.628244017526121e-07, + "logits/chosen": -0.5680924654006958, + "logits/rejected": -0.5643773078918457, + "logps/chosen": -1.5531455278396606, + "logps/rejected": -1.451002597808838, + "loss": 4.0851, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -15.531454086303711, + "rewards/margins": -1.0214288234710693, + "rewards/rejected": -14.510026931762695, + "step": 2560 + }, + { + "epoch": 0.0864538744143719, + "grad_norm": 37.528282165527344, + "learning_rate": 8.645096056622852e-07, + "logits/chosen": -0.3817165791988373, + "logits/rejected": -0.1156226173043251, + "logps/chosen": -1.930029273033142, + "logps/rejected": -2.2233052253723145, + "loss": 2.162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.30029296875, + "rewards/margins": 2.9327609539031982, + "rewards/rejected": -22.23305320739746, + "step": 2565 + }, + { + "epoch": 0.08662240048535508, + "grad_norm": 21.0026798248291, + "learning_rate": 8.661948095719581e-07, + "logits/chosen": -0.400460422039032, + "logits/rejected": -0.40357083082199097, + "logps/chosen": -1.6324317455291748, + "logps/rejected": -1.6684566736221313, + "loss": 2.8871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.324317932128906, + "rewards/margins": 0.3602485656738281, + "rewards/rejected": -16.684566497802734, + "step": 2570 + }, + { + "epoch": 0.08679092655633827, + "grad_norm": 25.365571975708008, + "learning_rate": 8.678800134816312e-07, + "logits/chosen": -0.7402045130729675, + "logits/rejected": -0.7104997038841248, + "logps/chosen": -1.5982532501220703, + "logps/rejected": -1.785348653793335, + "loss": 2.21, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.98253345489502, + "rewards/margins": 1.8709545135498047, + "rewards/rejected": -17.85348892211914, + "step": 2575 + }, + { + "epoch": 0.08695945262732145, + "grad_norm": 16.19428253173828, + "learning_rate": 8.695652173913043e-07, + "logits/chosen": -0.4783341884613037, + "logits/rejected": -0.4398323893547058, + "logps/chosen": -1.4235103130340576, + "logps/rejected": -1.4861732721328735, + "loss": 2.5501, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.235102653503418, + "rewards/margins": 0.6266301870346069, + "rewards/rejected": -14.861734390258789, + "step": 2580 + }, + { + "epoch": 0.08712797869830463, + "grad_norm": 13.780372619628906, + "learning_rate": 8.712504213009773e-07, + "logits/chosen": -0.5036159753799438, + "logits/rejected": -0.4957659840583801, + "logps/chosen": -1.6749789714813232, + "logps/rejected": -1.7661399841308594, + "loss": 2.2914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.74979019165039, + "rewards/margins": 0.911608874797821, + "rewards/rejected": -17.66139793395996, + "step": 2585 + }, + { + "epoch": 0.08729650476928781, + "grad_norm": 25.284282684326172, + "learning_rate": 8.729356252106504e-07, + "logits/chosen": -0.19299769401550293, + "logits/rejected": -0.13180485367774963, + "logps/chosen": -1.554852843284607, + "logps/rejected": -1.6819696426391602, + "loss": 2.5059, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.548528671264648, + "rewards/margins": 1.271166443824768, + "rewards/rejected": -16.8196964263916, + "step": 2590 + }, + { + "epoch": 0.08746503084027099, + "grad_norm": 61.71205520629883, + "learning_rate": 8.746208291203235e-07, + "logits/chosen": -0.7792420387268066, + "logits/rejected": -0.8336542248725891, + "logps/chosen": -1.8205082416534424, + "logps/rejected": -1.7581212520599365, + "loss": 3.703, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.205081939697266, + "rewards/margins": -0.623867392539978, + "rewards/rejected": -17.581214904785156, + "step": 2595 + }, + { + "epoch": 0.08763355691125417, + "grad_norm": 27.993122100830078, + "learning_rate": 8.763060330299967e-07, + "logits/chosen": -0.2036883383989334, + "logits/rejected": -0.21279895305633545, + "logps/chosen": -1.7165559530258179, + "logps/rejected": -1.8176734447479248, + "loss": 2.4624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.165559768676758, + "rewards/margins": 1.0111745595932007, + "rewards/rejected": -18.176733016967773, + "step": 2600 + }, + { + "epoch": 0.08780208298223735, + "grad_norm": 16.836488723754883, + "learning_rate": 8.779912369396697e-07, + "logits/chosen": -0.32075661420822144, + "logits/rejected": -0.32733437418937683, + "logps/chosen": -1.8509960174560547, + "logps/rejected": -1.8140252828598022, + "loss": 3.6891, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.509960174560547, + "rewards/margins": -0.3697075843811035, + "rewards/rejected": -18.1402530670166, + "step": 2605 + }, + { + "epoch": 0.08797060905322053, + "grad_norm": 29.089462280273438, + "learning_rate": 8.796764408493428e-07, + "logits/chosen": -0.4722573757171631, + "logits/rejected": -0.3574286699295044, + "logps/chosen": -1.879150629043579, + "logps/rejected": -1.8658632040023804, + "loss": 3.4009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.791505813598633, + "rewards/margins": -0.13287362456321716, + "rewards/rejected": -18.658634185791016, + "step": 2610 + }, + { + "epoch": 0.08813913512420371, + "grad_norm": 22.84649658203125, + "learning_rate": 8.813616447590158e-07, + "logits/chosen": -0.1808585822582245, + "logits/rejected": -0.3681219816207886, + "logps/chosen": -1.8490365743637085, + "logps/rejected": -1.804755449295044, + "loss": 3.6758, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.490365982055664, + "rewards/margins": -0.44281005859375, + "rewards/rejected": -18.047555923461914, + "step": 2615 + }, + { + "epoch": 0.08830766119518689, + "grad_norm": 29.813987731933594, + "learning_rate": 8.830468486686888e-07, + "logits/chosen": -0.5773710012435913, + "logits/rejected": -0.556043267250061, + "logps/chosen": -1.7120014429092407, + "logps/rejected": -1.75547194480896, + "loss": 3.0912, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.120014190673828, + "rewards/margins": 0.43470603227615356, + "rewards/rejected": -17.554719924926758, + "step": 2620 + }, + { + "epoch": 0.08847618726617007, + "grad_norm": 23.193584442138672, + "learning_rate": 8.847320525783619e-07, + "logits/chosen": -0.6107980012893677, + "logits/rejected": -0.587156355381012, + "logps/chosen": -1.7202411890029907, + "logps/rejected": -1.7343101501464844, + "loss": 3.0331, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.202411651611328, + "rewards/margins": 0.14069156348705292, + "rewards/rejected": -17.343101501464844, + "step": 2625 + }, + { + "epoch": 0.08864471333715326, + "grad_norm": 22.517454147338867, + "learning_rate": 8.86417256488035e-07, + "logits/chosen": -0.5417401790618896, + "logits/rejected": -0.4637575149536133, + "logps/chosen": -1.7523746490478516, + "logps/rejected": -1.8010832071304321, + "loss": 2.9477, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.523746490478516, + "rewards/margins": 0.48708638548851013, + "rewards/rejected": -18.010833740234375, + "step": 2630 + }, + { + "epoch": 0.08881323940813644, + "grad_norm": 22.720060348510742, + "learning_rate": 8.881024603977081e-07, + "logits/chosen": -0.11410139501094818, + "logits/rejected": -0.21287024021148682, + "logps/chosen": -1.6555503606796265, + "logps/rejected": -1.6463634967803955, + "loss": 3.539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.555505752563477, + "rewards/margins": -0.09186983108520508, + "rewards/rejected": -16.463634490966797, + "step": 2635 + }, + { + "epoch": 0.08898176547911962, + "grad_norm": 12.026870727539062, + "learning_rate": 8.897876643073811e-07, + "logits/chosen": -0.39031511545181274, + "logits/rejected": -0.45363712310791016, + "logps/chosen": -1.5173307657241821, + "logps/rejected": -1.8569914102554321, + "loss": 2.4449, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.173307418823242, + "rewards/margins": 3.3966071605682373, + "rewards/rejected": -18.56991195678711, + "step": 2640 + }, + { + "epoch": 0.0891502915501028, + "grad_norm": 19.74846649169922, + "learning_rate": 8.914728682170542e-07, + "logits/chosen": -0.796288788318634, + "logits/rejected": -0.7678640484809875, + "logps/chosen": -1.6136983633041382, + "logps/rejected": -1.503281831741333, + "loss": 4.334, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.136981964111328, + "rewards/margins": -1.1041651964187622, + "rewards/rejected": -15.032818794250488, + "step": 2645 + }, + { + "epoch": 0.08931881762108598, + "grad_norm": 10.176506042480469, + "learning_rate": 8.931580721267274e-07, + "logits/chosen": -0.4079221189022064, + "logits/rejected": -0.4823921322822571, + "logps/chosen": -1.8552573919296265, + "logps/rejected": -1.789438247680664, + "loss": 3.9795, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.552574157714844, + "rewards/margins": -0.6581940650939941, + "rewards/rejected": -17.89438247680664, + "step": 2650 + }, + { + "epoch": 0.08948734369206916, + "grad_norm": 21.36243438720703, + "learning_rate": 8.948432760364005e-07, + "logits/chosen": -0.15099112689495087, + "logits/rejected": -0.23671016097068787, + "logps/chosen": -1.9283297061920166, + "logps/rejected": -1.9521408081054688, + "loss": 3.5779, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.283296585083008, + "rewards/margins": 0.23811152577400208, + "rewards/rejected": -19.521408081054688, + "step": 2655 + }, + { + "epoch": 0.08965586976305234, + "grad_norm": 29.926528930664062, + "learning_rate": 8.965284799460734e-07, + "logits/chosen": -0.3068148195743561, + "logits/rejected": -0.235337495803833, + "logps/chosen": -1.8573440313339233, + "logps/rejected": -1.7485700845718384, + "loss": 4.1967, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.573440551757812, + "rewards/margins": -1.0877423286437988, + "rewards/rejected": -17.485698699951172, + "step": 2660 + }, + { + "epoch": 0.08982439583403552, + "grad_norm": 43.94549560546875, + "learning_rate": 8.982136838557465e-07, + "logits/chosen": -0.5529682040214539, + "logits/rejected": -0.49506306648254395, + "logps/chosen": -1.6662704944610596, + "logps/rejected": -1.6164556741714478, + "loss": 3.7571, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.662704467773438, + "rewards/margins": -0.498150497674942, + "rewards/rejected": -16.164554595947266, + "step": 2665 + }, + { + "epoch": 0.0899929219050187, + "grad_norm": 22.449846267700195, + "learning_rate": 8.998988877654196e-07, + "logits/chosen": -0.536016583442688, + "logits/rejected": -0.5582794547080994, + "logps/chosen": -1.6905359029769897, + "logps/rejected": -1.8122714757919312, + "loss": 2.5696, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.905357360839844, + "rewards/margins": 1.2173573970794678, + "rewards/rejected": -18.12271499633789, + "step": 2670 + }, + { + "epoch": 0.09016144797600188, + "grad_norm": 24.964977264404297, + "learning_rate": 9.015840916750926e-07, + "logits/chosen": -0.44670265913009644, + "logits/rejected": -0.3518041670322418, + "logps/chosen": -1.6536505222320557, + "logps/rejected": -1.6681900024414062, + "loss": 3.0175, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.5365047454834, + "rewards/margins": 0.14539547264575958, + "rewards/rejected": -16.681900024414062, + "step": 2675 + }, + { + "epoch": 0.09032997404698506, + "grad_norm": 21.91883087158203, + "learning_rate": 9.032692955847657e-07, + "logits/chosen": -0.5816112756729126, + "logits/rejected": -0.5414116978645325, + "logps/chosen": -1.976906180381775, + "logps/rejected": -1.8742444515228271, + "loss": 4.4431, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.769062042236328, + "rewards/margins": -1.0266164541244507, + "rewards/rejected": -18.74244499206543, + "step": 2680 + }, + { + "epoch": 0.09049850011796826, + "grad_norm": 26.940380096435547, + "learning_rate": 9.049544994944388e-07, + "logits/chosen": -0.17546026408672333, + "logits/rejected": -0.1966692954301834, + "logps/chosen": -1.948767066001892, + "logps/rejected": -2.047886610031128, + "loss": 2.2436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.4876708984375, + "rewards/margins": 0.9911966323852539, + "rewards/rejected": -20.478870391845703, + "step": 2685 + }, + { + "epoch": 0.09066702618895144, + "grad_norm": 42.87982940673828, + "learning_rate": 9.066397034041119e-07, + "logits/chosen": -0.42629021406173706, + "logits/rejected": -0.31089669466018677, + "logps/chosen": -1.7324479818344116, + "logps/rejected": -1.7612009048461914, + "loss": 3.0846, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.324478149414062, + "rewards/margins": 0.2875285744667053, + "rewards/rejected": -17.612009048461914, + "step": 2690 + }, + { + "epoch": 0.09083555225993462, + "grad_norm": 26.106229782104492, + "learning_rate": 9.083249073137849e-07, + "logits/chosen": -0.6283289194107056, + "logits/rejected": -0.5349553823471069, + "logps/chosen": -1.7475484609603882, + "logps/rejected": -1.8252407312393188, + "loss": 2.7924, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.47548484802246, + "rewards/margins": 0.77692049741745, + "rewards/rejected": -18.25240707397461, + "step": 2695 + }, + { + "epoch": 0.0910040783309178, + "grad_norm": 16.301986694335938, + "learning_rate": 9.100101112234579e-07, + "logits/chosen": -0.37398990988731384, + "logits/rejected": -0.3726533353328705, + "logps/chosen": -1.613258719444275, + "logps/rejected": -1.5538930892944336, + "loss": 3.9193, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.132587432861328, + "rewards/margins": -0.5936561822891235, + "rewards/rejected": -15.538930892944336, + "step": 2700 + }, + { + "epoch": 0.09117260440190098, + "grad_norm": 25.090686798095703, + "learning_rate": 9.116953151331311e-07, + "logits/chosen": -0.5178991556167603, + "logits/rejected": -0.3978433907032013, + "logps/chosen": -1.6178112030029297, + "logps/rejected": -1.6006628274917603, + "loss": 3.3633, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.17811393737793, + "rewards/margins": -0.17148585617542267, + "rewards/rejected": -16.006628036499023, + "step": 2705 + }, + { + "epoch": 0.09134113047288415, + "grad_norm": 49.08612060546875, + "learning_rate": 9.133805190428041e-07, + "logits/chosen": -0.1910863071680069, + "logits/rejected": -0.22724132239818573, + "logps/chosen": -2.229879140853882, + "logps/rejected": -2.327415943145752, + "loss": 2.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.29879379272461, + "rewards/margins": 0.9753679037094116, + "rewards/rejected": -23.274160385131836, + "step": 2710 + }, + { + "epoch": 0.09150965654386733, + "grad_norm": 11.361995697021484, + "learning_rate": 9.150657229524772e-07, + "logits/chosen": -0.43085527420043945, + "logits/rejected": -0.2916302978992462, + "logps/chosen": -1.3857858180999756, + "logps/rejected": -1.6563870906829834, + "loss": 2.1313, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -13.857858657836914, + "rewards/margins": 2.7060129642486572, + "rewards/rejected": -16.563871383666992, + "step": 2715 + }, + { + "epoch": 0.09167818261485051, + "grad_norm": 23.483665466308594, + "learning_rate": 9.167509268621503e-07, + "logits/chosen": -0.5284978151321411, + "logits/rejected": -0.5280933380126953, + "logps/chosen": -1.9413951635360718, + "logps/rejected": -2.0041792392730713, + "loss": 3.5537, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.413949966430664, + "rewards/margins": 0.6278434991836548, + "rewards/rejected": -20.041791915893555, + "step": 2720 + }, + { + "epoch": 0.0918467086858337, + "grad_norm": 21.85501480102539, + "learning_rate": 9.184361307718234e-07, + "logits/chosen": -0.0768141895532608, + "logits/rejected": -0.00690958509221673, + "logps/chosen": -1.6710220575332642, + "logps/rejected": -1.7174562215805054, + "loss": 2.6894, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.710220336914062, + "rewards/margins": 0.4643425941467285, + "rewards/rejected": -17.174564361572266, + "step": 2725 + }, + { + "epoch": 0.09201523475681687, + "grad_norm": 19.08591651916504, + "learning_rate": 9.201213346814964e-07, + "logits/chosen": -0.6823151707649231, + "logits/rejected": -0.5807539224624634, + "logps/chosen": -1.6391655206680298, + "logps/rejected": -1.7414779663085938, + "loss": 2.4353, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.39165687561035, + "rewards/margins": 1.0231223106384277, + "rewards/rejected": -17.414779663085938, + "step": 2730 + }, + { + "epoch": 0.09218376082780005, + "grad_norm": 17.085020065307617, + "learning_rate": 9.218065385911695e-07, + "logits/chosen": -0.1736678183078766, + "logits/rejected": -0.0709773451089859, + "logps/chosen": -1.9339990615844727, + "logps/rejected": -2.009704113006592, + "loss": 2.4459, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.339990615844727, + "rewards/margins": 0.7570520639419556, + "rewards/rejected": -20.097042083740234, + "step": 2735 + }, + { + "epoch": 0.09235228689878325, + "grad_norm": 21.93718147277832, + "learning_rate": 9.234917425008426e-07, + "logits/chosen": -0.708516001701355, + "logits/rejected": -0.536871612071991, + "logps/chosen": -1.5185356140136719, + "logps/rejected": -1.6370971202850342, + "loss": 3.115, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.185358047485352, + "rewards/margins": 1.185613989830017, + "rewards/rejected": -16.3709716796875, + "step": 2740 + }, + { + "epoch": 0.09252081296976643, + "grad_norm": 48.83224868774414, + "learning_rate": 9.251769464105155e-07, + "logits/chosen": -0.39072203636169434, + "logits/rejected": -0.432451069355011, + "logps/chosen": -1.755963921546936, + "logps/rejected": -1.6103007793426514, + "loss": 4.5056, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.559640884399414, + "rewards/margins": -1.4566314220428467, + "rewards/rejected": -16.103008270263672, + "step": 2745 + }, + { + "epoch": 0.09268933904074961, + "grad_norm": 23.23576545715332, + "learning_rate": 9.268621503201886e-07, + "logits/chosen": 0.031300973147153854, + "logits/rejected": -0.01834370568394661, + "logps/chosen": -2.059108257293701, + "logps/rejected": -2.0446953773498535, + "loss": 3.2485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.591083526611328, + "rewards/margins": -0.14412669837474823, + "rewards/rejected": -20.44695472717285, + "step": 2750 + }, + { + "epoch": 0.09285786511173279, + "grad_norm": 11.359932899475098, + "learning_rate": 9.285473542298618e-07, + "logits/chosen": -0.04536425322294235, + "logits/rejected": -0.004798299167305231, + "logps/chosen": -1.8216663599014282, + "logps/rejected": -1.8901094198226929, + "loss": 3.0188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.216663360595703, + "rewards/margins": 0.684432327747345, + "rewards/rejected": -18.90109634399414, + "step": 2755 + }, + { + "epoch": 0.09302639118271597, + "grad_norm": 22.040935516357422, + "learning_rate": 9.302325581395349e-07, + "logits/chosen": -0.3122491240501404, + "logits/rejected": -0.3268435597419739, + "logps/chosen": -1.8650197982788086, + "logps/rejected": -1.7798147201538086, + "loss": 3.9723, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.650197982788086, + "rewards/margins": -0.852049708366394, + "rewards/rejected": -17.798147201538086, + "step": 2760 + }, + { + "epoch": 0.09319491725369915, + "grad_norm": 22.49321746826172, + "learning_rate": 9.319177620492079e-07, + "logits/chosen": -0.507947564125061, + "logits/rejected": -0.5531247854232788, + "logps/chosen": -1.662655234336853, + "logps/rejected": -1.6844278573989868, + "loss": 3.5314, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.62655258178711, + "rewards/margins": 0.21772536635398865, + "rewards/rejected": -16.84427833557129, + "step": 2765 + }, + { + "epoch": 0.09336344332468233, + "grad_norm": 16.84610366821289, + "learning_rate": 9.33602965958881e-07, + "logits/chosen": -0.6405504941940308, + "logits/rejected": -0.5383267402648926, + "logps/chosen": -1.588544487953186, + "logps/rejected": -1.6625516414642334, + "loss": 2.5218, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.885442733764648, + "rewards/margins": 0.7400724291801453, + "rewards/rejected": -16.625516891479492, + "step": 2770 + }, + { + "epoch": 0.0935319693956655, + "grad_norm": 29.287565231323242, + "learning_rate": 9.352881698685541e-07, + "logits/chosen": -0.10549436509609222, + "logits/rejected": -0.18245017528533936, + "logps/chosen": -1.8676893711090088, + "logps/rejected": -1.814923882484436, + "loss": 3.5887, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.676895141601562, + "rewards/margins": -0.5276561975479126, + "rewards/rejected": -18.14923667907715, + "step": 2775 + }, + { + "epoch": 0.09370049546664869, + "grad_norm": 35.53804397583008, + "learning_rate": 9.369733737782271e-07, + "logits/chosen": -0.9606320261955261, + "logits/rejected": -0.9500244855880737, + "logps/chosen": -1.4875165224075317, + "logps/rejected": -1.488067865371704, + "loss": 3.1073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.875162124633789, + "rewards/margins": 0.00551605224609375, + "rewards/rejected": -14.8806791305542, + "step": 2780 + }, + { + "epoch": 0.09386902153763187, + "grad_norm": 24.10531997680664, + "learning_rate": 9.386585776879002e-07, + "logits/chosen": -0.43831610679626465, + "logits/rejected": -0.43352681398391724, + "logps/chosen": -1.8498563766479492, + "logps/rejected": -1.868779182434082, + "loss": 3.2661, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.498563766479492, + "rewards/margins": 0.1892286241054535, + "rewards/rejected": -18.687793731689453, + "step": 2785 + }, + { + "epoch": 0.09403754760861505, + "grad_norm": 25.974620819091797, + "learning_rate": 9.403437815975732e-07, + "logits/chosen": -0.5580755472183228, + "logits/rejected": -0.46703624725341797, + "logps/chosen": -1.6518490314483643, + "logps/rejected": -1.5979589223861694, + "loss": 3.5922, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.518489837646484, + "rewards/margins": -0.5388990640640259, + "rewards/rejected": -15.979589462280273, + "step": 2790 + }, + { + "epoch": 0.09420607367959824, + "grad_norm": 38.141876220703125, + "learning_rate": 9.420289855072463e-07, + "logits/chosen": -0.687545657157898, + "logits/rejected": -0.7488195896148682, + "logps/chosen": -1.834449052810669, + "logps/rejected": -1.693974256515503, + "loss": 4.435, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.344486236572266, + "rewards/margins": -1.4047452211380005, + "rewards/rejected": -16.939743041992188, + "step": 2795 + }, + { + "epoch": 0.09437459975058142, + "grad_norm": 33.917057037353516, + "learning_rate": 9.437141894169193e-07, + "logits/chosen": -0.48264870047569275, + "logits/rejected": -0.4806605279445648, + "logps/chosen": -1.8548141717910767, + "logps/rejected": -1.8633720874786377, + "loss": 3.0591, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.548139572143555, + "rewards/margins": 0.08558006584644318, + "rewards/rejected": -18.633718490600586, + "step": 2800 + }, + { + "epoch": 0.09437459975058142, + "eval_logits/chosen": -0.725313127040863, + "eval_logits/rejected": -0.7294741272926331, + "eval_logps/chosen": -1.668626070022583, + "eval_logps/rejected": -1.6697936058044434, + "eval_loss": 3.416093349456787, + "eval_rewards/accuracies": 0.46000000834465027, + "eval_rewards/chosen": -16.686260223388672, + "eval_rewards/margins": 0.01167456153780222, + "eval_rewards/rejected": -16.69793701171875, + "eval_runtime": 12.897, + "eval_samples_per_second": 7.754, + "eval_steps_per_second": 1.938, + "step": 2800 + }, + { + "epoch": 0.0945431258215646, + "grad_norm": 25.618377685546875, + "learning_rate": 9.453993933265925e-07, + "logits/chosen": -0.6152405738830566, + "logits/rejected": -0.6342421174049377, + "logps/chosen": -1.9061603546142578, + "logps/rejected": -2.1016743183135986, + "loss": 1.5141, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.061603546142578, + "rewards/margins": 1.9551403522491455, + "rewards/rejected": -21.016742706298828, + "step": 2805 + }, + { + "epoch": 0.09471165189254778, + "grad_norm": 20.979740142822266, + "learning_rate": 9.470845972362656e-07, + "logits/chosen": -0.17542439699172974, + "logits/rejected": -0.19594234228134155, + "logps/chosen": -1.8021949529647827, + "logps/rejected": -1.939035415649414, + "loss": 2.3183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.02195167541504, + "rewards/margins": 1.3684017658233643, + "rewards/rejected": -19.390352249145508, + "step": 2810 + }, + { + "epoch": 0.09488017796353096, + "grad_norm": 37.135013580322266, + "learning_rate": 9.487698011459387e-07, + "logits/chosen": -0.8760625123977661, + "logits/rejected": -0.9745880365371704, + "logps/chosen": -1.827471137046814, + "logps/rejected": -1.8833777904510498, + "loss": 2.6297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.27471160888672, + "rewards/margins": 0.5590680837631226, + "rewards/rejected": -18.83378028869629, + "step": 2815 + }, + { + "epoch": 0.09504870403451414, + "grad_norm": 16.353891372680664, + "learning_rate": 9.504550050556117e-07, + "logits/chosen": -0.4122091233730316, + "logits/rejected": -0.5152057409286499, + "logps/chosen": -1.6709582805633545, + "logps/rejected": -1.8144867420196533, + "loss": 2.1599, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.709585189819336, + "rewards/margins": 1.4352830648422241, + "rewards/rejected": -18.144866943359375, + "step": 2820 + }, + { + "epoch": 0.09521723010549732, + "grad_norm": 36.70476531982422, + "learning_rate": 9.521402089652848e-07, + "logits/chosen": -0.32984694838523865, + "logits/rejected": -0.23229286074638367, + "logps/chosen": -2.0945968627929688, + "logps/rejected": -2.1234512329101562, + "loss": 3.7133, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.945966720581055, + "rewards/margins": 0.2885432243347168, + "rewards/rejected": -21.234512329101562, + "step": 2825 + }, + { + "epoch": 0.0953857561764805, + "grad_norm": 33.7508544921875, + "learning_rate": 9.538254128749579e-07, + "logits/chosen": -0.28869864344596863, + "logits/rejected": -0.32969799637794495, + "logps/chosen": -1.8106577396392822, + "logps/rejected": -1.9228843450546265, + "loss": 2.9404, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.106576919555664, + "rewards/margins": 1.1222679615020752, + "rewards/rejected": -19.228845596313477, + "step": 2830 + }, + { + "epoch": 0.09555428224746368, + "grad_norm": 23.599029541015625, + "learning_rate": 9.55510616784631e-07, + "logits/chosen": -0.3256445527076721, + "logits/rejected": -0.3391716778278351, + "logps/chosen": -1.7188100814819336, + "logps/rejected": -1.7335550785064697, + "loss": 3.0343, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.188098907470703, + "rewards/margins": 0.14744806289672852, + "rewards/rejected": -17.335548400878906, + "step": 2835 + }, + { + "epoch": 0.09572280831844686, + "grad_norm": 38.509002685546875, + "learning_rate": 9.57195820694304e-07, + "logits/chosen": -0.38658618927001953, + "logits/rejected": -0.15633238852024078, + "logps/chosen": -2.1311075687408447, + "logps/rejected": -2.3841655254364014, + "loss": 3.1981, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.31107521057129, + "rewards/margins": 2.530580997467041, + "rewards/rejected": -23.841655731201172, + "step": 2840 + }, + { + "epoch": 0.09589133438943004, + "grad_norm": 27.05527114868164, + "learning_rate": 9.58881024603977e-07, + "logits/chosen": -0.818171501159668, + "logits/rejected": -0.6529834866523743, + "logps/chosen": -1.4707438945770264, + "logps/rejected": -1.6013038158416748, + "loss": 3.1145, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.707438468933105, + "rewards/margins": 1.3056000471115112, + "rewards/rejected": -16.01304054260254, + "step": 2845 + }, + { + "epoch": 0.09605986046041323, + "grad_norm": 24.46522331237793, + "learning_rate": 9.605662285136502e-07, + "logits/chosen": -0.5285354852676392, + "logits/rejected": -0.6483272314071655, + "logps/chosen": -1.71415114402771, + "logps/rejected": -1.738390326499939, + "loss": 2.9818, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.141511917114258, + "rewards/margins": 0.24239292740821838, + "rewards/rejected": -17.3839054107666, + "step": 2850 + }, + { + "epoch": 0.09622838653139641, + "grad_norm": 26.20121955871582, + "learning_rate": 9.622514324233232e-07, + "logits/chosen": -0.48643478751182556, + "logits/rejected": -0.45796099305152893, + "logps/chosen": -1.6284525394439697, + "logps/rejected": -1.654547929763794, + "loss": 3.0792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.284526824951172, + "rewards/margins": 0.2609531283378601, + "rewards/rejected": -16.54547691345215, + "step": 2855 + }, + { + "epoch": 0.09639691260237959, + "grad_norm": 38.97121047973633, + "learning_rate": 9.639366363329963e-07, + "logits/chosen": -0.18868876993656158, + "logits/rejected": -0.110798180103302, + "logps/chosen": -2.010000705718994, + "logps/rejected": -1.9542725086212158, + "loss": 3.6253, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.100006103515625, + "rewards/margins": -0.5572818517684937, + "rewards/rejected": -19.542726516723633, + "step": 2860 + }, + { + "epoch": 0.09656543867336277, + "grad_norm": 24.709421157836914, + "learning_rate": 9.656218402426694e-07, + "logits/chosen": -0.7445005178451538, + "logits/rejected": -0.7172940373420715, + "logps/chosen": -1.8106021881103516, + "logps/rejected": -1.9134094715118408, + "loss": 2.2706, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.106021881103516, + "rewards/margins": 1.0280708074569702, + "rewards/rejected": -19.134092330932617, + "step": 2865 + }, + { + "epoch": 0.09673396474434595, + "grad_norm": 13.824490547180176, + "learning_rate": 9.673070441523424e-07, + "logits/chosen": -0.7275068163871765, + "logits/rejected": -0.7243115305900574, + "logps/chosen": -1.5358262062072754, + "logps/rejected": -1.5569114685058594, + "loss": 2.9848, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.35826301574707, + "rewards/margins": 0.21085242927074432, + "rewards/rejected": -15.569114685058594, + "step": 2870 + }, + { + "epoch": 0.09690249081532913, + "grad_norm": 22.991600036621094, + "learning_rate": 9.689922480620153e-07, + "logits/chosen": -0.4551324248313904, + "logits/rejected": -0.4542100429534912, + "logps/chosen": -1.6309089660644531, + "logps/rejected": -1.6141412258148193, + "loss": 3.4348, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.30908966064453, + "rewards/margins": -0.16767773032188416, + "rewards/rejected": -16.14141273498535, + "step": 2875 + }, + { + "epoch": 0.09707101688631231, + "grad_norm": 40.251258850097656, + "learning_rate": 9.706774519716886e-07, + "logits/chosen": -0.30819210410118103, + "logits/rejected": -0.15154561400413513, + "logps/chosen": -1.7786144018173218, + "logps/rejected": -1.8767932653427124, + "loss": 2.6402, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.786144256591797, + "rewards/margins": 0.9817888140678406, + "rewards/rejected": -18.767932891845703, + "step": 2880 + }, + { + "epoch": 0.09723954295729549, + "grad_norm": 16.79170799255371, + "learning_rate": 9.723626558813617e-07, + "logits/chosen": -0.3624119758605957, + "logits/rejected": -0.5596259832382202, + "logps/chosen": -1.7782881259918213, + "logps/rejected": -1.7136462926864624, + "loss": 3.736, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.782880783081055, + "rewards/margins": -0.6464160680770874, + "rewards/rejected": -17.136465072631836, + "step": 2885 + }, + { + "epoch": 0.09740806902827867, + "grad_norm": 53.098182678222656, + "learning_rate": 9.740478597910347e-07, + "logits/chosen": -0.5692394375801086, + "logits/rejected": -0.6051616072654724, + "logps/chosen": -2.1128344535827637, + "logps/rejected": -2.038572311401367, + "loss": 3.8688, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.128345489501953, + "rewards/margins": -0.742621123790741, + "rewards/rejected": -20.385725021362305, + "step": 2890 + }, + { + "epoch": 0.09757659509926185, + "grad_norm": 2.4433932304382324, + "learning_rate": 9.757330637007078e-07, + "logits/chosen": -0.10106615722179413, + "logits/rejected": -0.12207716703414917, + "logps/chosen": -2.044761896133423, + "logps/rejected": -2.2555930614471436, + "loss": 2.3954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.447616577148438, + "rewards/margins": 2.1083126068115234, + "rewards/rejected": -22.55592918395996, + "step": 2895 + }, + { + "epoch": 0.09774512117024503, + "grad_norm": 43.901123046875, + "learning_rate": 9.774182676103809e-07, + "logits/chosen": -0.17931941151618958, + "logits/rejected": -0.30949029326438904, + "logps/chosen": -2.0980095863342285, + "logps/rejected": -2.07295823097229, + "loss": 3.3587, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.980093002319336, + "rewards/margins": -0.2505127787590027, + "rewards/rejected": -20.72957992553711, + "step": 2900 + }, + { + "epoch": 0.09791364724122822, + "grad_norm": 14.931760787963867, + "learning_rate": 9.79103471520054e-07, + "logits/chosen": -0.5947299003601074, + "logits/rejected": -0.7893961668014526, + "logps/chosen": -1.727439522743225, + "logps/rejected": -1.5623410940170288, + "loss": 4.8803, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.27439308166504, + "rewards/margins": -1.6509826183319092, + "rewards/rejected": -15.623411178588867, + "step": 2905 + }, + { + "epoch": 0.0980821733122114, + "grad_norm": 19.381126403808594, + "learning_rate": 9.80788675429727e-07, + "logits/chosen": -0.5605853796005249, + "logits/rejected": -0.49684804677963257, + "logps/chosen": -1.5796529054641724, + "logps/rejected": -1.7275539636611938, + "loss": 2.1057, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.796528816223145, + "rewards/margins": 1.4790114164352417, + "rewards/rejected": -17.27553939819336, + "step": 2910 + }, + { + "epoch": 0.09825069938319458, + "grad_norm": 28.500883102416992, + "learning_rate": 9.824738793394e-07, + "logits/chosen": -0.6147949695587158, + "logits/rejected": -0.8111165165901184, + "logps/chosen": -1.7637847661972046, + "logps/rejected": -1.633888840675354, + "loss": 4.3361, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -17.637847900390625, + "rewards/margins": -1.2989604473114014, + "rewards/rejected": -16.338886260986328, + "step": 2915 + }, + { + "epoch": 0.09841922545417776, + "grad_norm": 20.428529739379883, + "learning_rate": 9.84159083249073e-07, + "logits/chosen": -0.2776259779930115, + "logits/rejected": -0.18497855961322784, + "logps/chosen": -2.1334004402160645, + "logps/rejected": -2.049607992172241, + "loss": 4.0409, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.334003448486328, + "rewards/margins": -0.8379223942756653, + "rewards/rejected": -20.49608039855957, + "step": 2920 + }, + { + "epoch": 0.09858775152516094, + "grad_norm": 26.86744499206543, + "learning_rate": 9.85844287158746e-07, + "logits/chosen": -0.6018735766410828, + "logits/rejected": -0.670646071434021, + "logps/chosen": -1.6858047246932983, + "logps/rejected": -1.7887957096099854, + "loss": 2.9202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.858049392700195, + "rewards/margins": 1.0299079418182373, + "rewards/rejected": -17.887958526611328, + "step": 2925 + }, + { + "epoch": 0.09875627759614412, + "grad_norm": 52.686256408691406, + "learning_rate": 9.875294910684193e-07, + "logits/chosen": -0.314828097820282, + "logits/rejected": -0.24025221168994904, + "logps/chosen": -1.9489824771881104, + "logps/rejected": -2.161886215209961, + "loss": 2.3532, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.489826202392578, + "rewards/margins": 2.1290364265441895, + "rewards/rejected": -21.61886215209961, + "step": 2930 + }, + { + "epoch": 0.0989248036671273, + "grad_norm": 25.790864944458008, + "learning_rate": 9.892146949780924e-07, + "logits/chosen": -0.5112024545669556, + "logits/rejected": -0.5521891713142395, + "logps/chosen": -1.707360029220581, + "logps/rejected": -1.6799137592315674, + "loss": 3.4054, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.073598861694336, + "rewards/margins": -0.27446356415748596, + "rewards/rejected": -16.799137115478516, + "step": 2935 + }, + { + "epoch": 0.09909332973811048, + "grad_norm": 35.845943450927734, + "learning_rate": 9.908998988877655e-07, + "logits/chosen": -0.46860605478286743, + "logits/rejected": -0.6842837929725647, + "logps/chosen": -1.6268116235733032, + "logps/rejected": -1.5220801830291748, + "loss": 4.1136, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.268115997314453, + "rewards/margins": -1.047314167022705, + "rewards/rejected": -15.220802307128906, + "step": 2940 + }, + { + "epoch": 0.09926185580909366, + "grad_norm": 19.779817581176758, + "learning_rate": 9.925851027974385e-07, + "logits/chosen": -0.53152996301651, + "logits/rejected": -0.4204404950141907, + "logps/chosen": -1.6282031536102295, + "logps/rejected": -1.6569904088974, + "loss": 3.1771, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.282032012939453, + "rewards/margins": 0.28787460923194885, + "rewards/rejected": -16.569904327392578, + "step": 2945 + }, + { + "epoch": 0.09943038188007684, + "grad_norm": 23.035404205322266, + "learning_rate": 9.942703067071116e-07, + "logits/chosen": -0.6475009322166443, + "logits/rejected": -0.4998777508735657, + "logps/chosen": -2.263774871826172, + "logps/rejected": -2.4253833293914795, + "loss": 2.4489, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.63774871826172, + "rewards/margins": 1.6160846948623657, + "rewards/rejected": -24.253833770751953, + "step": 2950 + }, + { + "epoch": 0.09959890795106002, + "grad_norm": 15.458049774169922, + "learning_rate": 9.959555106167847e-07, + "logits/chosen": -0.44105878472328186, + "logits/rejected": -0.5367931127548218, + "logps/chosen": -1.7834441661834717, + "logps/rejected": -1.888159990310669, + "loss": 2.3057, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.834440231323242, + "rewards/margins": 1.04715895652771, + "rewards/rejected": -18.8815975189209, + "step": 2955 + }, + { + "epoch": 0.09976743402204322, + "grad_norm": 30.391836166381836, + "learning_rate": 9.976407145264577e-07, + "logits/chosen": -0.3933469355106354, + "logits/rejected": -0.35893380641937256, + "logps/chosen": -1.749251365661621, + "logps/rejected": -1.7169334888458252, + "loss": 3.4411, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.49251365661621, + "rewards/margins": -0.32318076491355896, + "rewards/rejected": -17.16933250427246, + "step": 2960 + }, + { + "epoch": 0.0999359600930264, + "grad_norm": 26.183565139770508, + "learning_rate": 9.993259184361306e-07, + "logits/chosen": -0.5088056325912476, + "logits/rejected": -0.49937066435813904, + "logps/chosen": -1.6994832754135132, + "logps/rejected": -1.613231897354126, + "loss": 4.2055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.99483299255371, + "rewards/margins": -0.8625134229660034, + "rewards/rejected": -16.132320404052734, + "step": 2965 + }, + { + "epoch": 0.10010448616400958, + "grad_norm": 32.271846771240234, + "learning_rate": 9.999999688545453e-07, + "logits/chosen": -0.33978405594825745, + "logits/rejected": -0.1545068919658661, + "logps/chosen": -1.9441852569580078, + "logps/rejected": -1.9940448999404907, + "loss": 2.7681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.441852569580078, + "rewards/margins": 0.4985966682434082, + "rewards/rejected": -19.940448760986328, + "step": 2970 + }, + { + "epoch": 0.10027301223499276, + "grad_norm": 24.9840145111084, + "learning_rate": 9.99999778521225e-07, + "logits/chosen": -1.09787118434906, + "logits/rejected": -1.0583521127700806, + "logps/chosen": -1.7992699146270752, + "logps/rejected": -1.7420806884765625, + "loss": 3.9549, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.992698669433594, + "rewards/margins": -0.5718933939933777, + "rewards/rejected": -17.420804977416992, + "step": 2975 + }, + { + "epoch": 0.10044153830597594, + "grad_norm": 14.769031524658203, + "learning_rate": 9.999994151576805e-07, + "logits/chosen": -1.1787911653518677, + "logits/rejected": -1.0617015361785889, + "logps/chosen": -1.6846939325332642, + "logps/rejected": -1.791347861289978, + "loss": 2.3642, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.84693717956543, + "rewards/margins": 1.0665404796600342, + "rewards/rejected": -17.91347885131836, + "step": 2980 + }, + { + "epoch": 0.10061006437695912, + "grad_norm": 31.43277359008789, + "learning_rate": 9.999988787640376e-07, + "logits/chosen": -0.07004846632480621, + "logits/rejected": -0.003652901854366064, + "logps/chosen": -1.5951316356658936, + "logps/rejected": -1.7138687372207642, + "loss": 2.2606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.951314926147461, + "rewards/margins": 1.1873712539672852, + "rewards/rejected": -17.138687133789062, + "step": 2985 + }, + { + "epoch": 0.1007785904479423, + "grad_norm": 17.785526275634766, + "learning_rate": 9.99998169340482e-07, + "logits/chosen": 0.04601895064115524, + "logits/rejected": -0.05084504559636116, + "logps/chosen": -1.6214570999145508, + "logps/rejected": -1.5676156282424927, + "loss": 3.6376, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.21457290649414, + "rewards/margins": -0.5384153127670288, + "rewards/rejected": -15.676156997680664, + "step": 2990 + }, + { + "epoch": 0.10094711651892548, + "grad_norm": 24.750946044921875, + "learning_rate": 9.99997286887259e-07, + "logits/chosen": -0.40296635031700134, + "logits/rejected": -0.32598623633384705, + "logps/chosen": -1.5744359493255615, + "logps/rejected": -1.5301647186279297, + "loss": 3.6044, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.744359970092773, + "rewards/margins": -0.44271326065063477, + "rewards/rejected": -15.30164623260498, + "step": 2995 + }, + { + "epoch": 0.10111564258990866, + "grad_norm": 22.85517692565918, + "learning_rate": 9.999962314046742e-07, + "logits/chosen": -0.6072363257408142, + "logits/rejected": -0.6490095853805542, + "logps/chosen": -1.840370535850525, + "logps/rejected": -1.7861398458480835, + "loss": 3.6313, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.403705596923828, + "rewards/margins": -0.5423071980476379, + "rewards/rejected": -17.861400604248047, + "step": 3000 + }, + { + "epoch": 0.10128416866089184, + "grad_norm": 16.391801834106445, + "learning_rate": 9.999950028930927e-07, + "logits/chosen": -0.6177361011505127, + "logits/rejected": -0.44297394156455994, + "logps/chosen": -1.480201005935669, + "logps/rejected": -1.643811821937561, + "loss": 3.065, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.802009582519531, + "rewards/margins": 1.6361064910888672, + "rewards/rejected": -16.43811798095703, + "step": 3005 + }, + { + "epoch": 0.10145269473187501, + "grad_norm": 7.250625133514404, + "learning_rate": 9.9999360135294e-07, + "logits/chosen": -0.35616278648376465, + "logits/rejected": -0.4137846529483795, + "logps/chosen": -2.107487678527832, + "logps/rejected": -2.161532163619995, + "loss": 3.3089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.074878692626953, + "rewards/margins": 0.5404449701309204, + "rewards/rejected": -21.61532211303711, + "step": 3010 + }, + { + "epoch": 0.10162122080285821, + "grad_norm": 25.29311752319336, + "learning_rate": 9.999920267847007e-07, + "logits/chosen": -0.0703842043876648, + "logits/rejected": -0.06832405179738998, + "logps/chosen": -2.167907238006592, + "logps/rejected": -1.8903629779815674, + "loss": 5.8812, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.679073333740234, + "rewards/margins": -2.775441884994507, + "rewards/rejected": -18.90363121032715, + "step": 3015 + }, + { + "epoch": 0.10178974687384139, + "grad_norm": 17.77655792236328, + "learning_rate": 9.999902791889196e-07, + "logits/chosen": -0.3733082711696625, + "logits/rejected": -0.39694738388061523, + "logps/chosen": -1.7270221710205078, + "logps/rejected": -1.724534273147583, + "loss": 3.1331, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.270221710205078, + "rewards/margins": -0.02487936057150364, + "rewards/rejected": -17.245342254638672, + "step": 3020 + }, + { + "epoch": 0.10195827294482457, + "grad_norm": 15.479154586791992, + "learning_rate": 9.999883585662018e-07, + "logits/chosen": -0.6984944343566895, + "logits/rejected": -0.6338313817977905, + "logps/chosen": -1.5714912414550781, + "logps/rejected": -1.6366016864776611, + "loss": 2.7082, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.714914321899414, + "rewards/margins": 0.6511033177375793, + "rewards/rejected": -16.366016387939453, + "step": 3025 + }, + { + "epoch": 0.10212679901580775, + "grad_norm": 20.5145263671875, + "learning_rate": 9.99986264917212e-07, + "logits/chosen": -0.4067594110965729, + "logits/rejected": -0.3656831383705139, + "logps/chosen": -1.598494529724121, + "logps/rejected": -1.6863136291503906, + "loss": 2.3492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.984945297241211, + "rewards/margins": 0.878189206123352, + "rewards/rejected": -16.863134384155273, + "step": 3030 + }, + { + "epoch": 0.10229532508679093, + "grad_norm": 22.588560104370117, + "learning_rate": 9.999839982426744e-07, + "logits/chosen": -0.7191510796546936, + "logits/rejected": -0.6838647127151489, + "logps/chosen": -1.9661766290664673, + "logps/rejected": -2.049471616744995, + "loss": 2.5507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.661766052246094, + "rewards/margins": 0.832950234413147, + "rewards/rejected": -20.49471664428711, + "step": 3035 + }, + { + "epoch": 0.10246385115777411, + "grad_norm": 19.967512130737305, + "learning_rate": 9.99981558543374e-07, + "logits/chosen": -0.6296879053115845, + "logits/rejected": -0.5854636430740356, + "logps/chosen": -2.0275909900665283, + "logps/rejected": -1.940290093421936, + "loss": 3.9449, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.275909423828125, + "rewards/margins": -0.8730102777481079, + "rewards/rejected": -19.40289878845215, + "step": 3040 + }, + { + "epoch": 0.10263237722875729, + "grad_norm": 27.079069137573242, + "learning_rate": 9.999789458201542e-07, + "logits/chosen": -0.19687362015247345, + "logits/rejected": -0.28982409834861755, + "logps/chosen": -1.9375187158584595, + "logps/rejected": -1.863250494003296, + "loss": 3.814, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.375186920166016, + "rewards/margins": -0.7426818013191223, + "rewards/rejected": -18.632503509521484, + "step": 3045 + }, + { + "epoch": 0.10280090329974047, + "grad_norm": 89.50406646728516, + "learning_rate": 9.999761600739198e-07, + "logits/chosen": -0.19124934077262878, + "logits/rejected": -0.32821229100227356, + "logps/chosen": -2.2497265338897705, + "logps/rejected": -2.3240628242492676, + "loss": 2.759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.49726676940918, + "rewards/margins": 0.743362545967102, + "rewards/rejected": -23.24062728881836, + "step": 3050 + }, + { + "epoch": 0.10296942937072365, + "grad_norm": 35.753177642822266, + "learning_rate": 9.999732013056347e-07, + "logits/chosen": -0.2083957940340042, + "logits/rejected": -0.24470682442188263, + "logps/chosen": -1.7198501825332642, + "logps/rejected": -1.783496618270874, + "loss": 2.8293, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.198501586914062, + "rewards/margins": 0.6364647746086121, + "rewards/rejected": -17.8349666595459, + "step": 3055 + }, + { + "epoch": 0.10313795544170683, + "grad_norm": 24.368762969970703, + "learning_rate": 9.999700695163228e-07, + "logits/chosen": -0.6474219560623169, + "logits/rejected": -0.7258163690567017, + "logps/chosen": -1.804152488708496, + "logps/rejected": -1.6962175369262695, + "loss": 4.1681, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.04152488708496, + "rewards/margins": -1.0793492794036865, + "rewards/rejected": -16.962177276611328, + "step": 3060 + }, + { + "epoch": 0.10330648151269001, + "grad_norm": 23.3294677734375, + "learning_rate": 9.999667647070678e-07, + "logits/chosen": -0.4061599671840668, + "logits/rejected": -0.3327089846134186, + "logps/chosen": -1.9971929788589478, + "logps/rejected": -1.8557159900665283, + "loss": 4.6338, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.971927642822266, + "rewards/margins": -1.4147707223892212, + "rewards/rejected": -18.557159423828125, + "step": 3065 + }, + { + "epoch": 0.1034750075836732, + "grad_norm": 15.371095657348633, + "learning_rate": 9.999632868790135e-07, + "logits/chosen": -0.7014227509498596, + "logits/rejected": -0.7013593316078186, + "logps/chosen": -1.5081361532211304, + "logps/rejected": -1.695508360862732, + "loss": 2.0241, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.0813627243042, + "rewards/margins": 1.8737218379974365, + "rewards/rejected": -16.9550838470459, + "step": 3070 + }, + { + "epoch": 0.10364353365465638, + "grad_norm": 24.77459144592285, + "learning_rate": 9.999596360333634e-07, + "logits/chosen": -0.39553630352020264, + "logits/rejected": -0.31712251901626587, + "logps/chosen": -1.7771854400634766, + "logps/rejected": -1.7988437414169312, + "loss": 2.9031, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.7718563079834, + "rewards/margins": 0.21658353507518768, + "rewards/rejected": -17.988439559936523, + "step": 3075 + }, + { + "epoch": 0.10381205972563956, + "grad_norm": 22.489713668823242, + "learning_rate": 9.99955812171381e-07, + "logits/chosen": -0.37975651025772095, + "logits/rejected": -0.3504462242126465, + "logps/chosen": -1.5920625925064087, + "logps/rejected": -1.608381986618042, + "loss": 3.0138, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.920623779296875, + "rewards/margins": 0.16319552063941956, + "rewards/rejected": -16.083820343017578, + "step": 3080 + }, + { + "epoch": 0.10398058579662274, + "grad_norm": 21.4431095123291, + "learning_rate": 9.999518152943892e-07, + "logits/chosen": -0.3899223804473877, + "logits/rejected": -0.5776330232620239, + "logps/chosen": -1.7637507915496826, + "logps/rejected": -1.8772176504135132, + "loss": 2.3654, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.637508392333984, + "rewards/margins": 1.1346690654754639, + "rewards/rejected": -18.772174835205078, + "step": 3085 + }, + { + "epoch": 0.10414911186760592, + "grad_norm": 35.68021774291992, + "learning_rate": 9.999476454037716e-07, + "logits/chosen": -0.23362183570861816, + "logits/rejected": -0.2220069169998169, + "logps/chosen": -1.9831234216690063, + "logps/rejected": -1.9658180475234985, + "loss": 3.2693, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.83123207092285, + "rewards/margins": -0.17305298149585724, + "rewards/rejected": -19.658180236816406, + "step": 3090 + }, + { + "epoch": 0.1043176379385891, + "grad_norm": 13.99560546875, + "learning_rate": 9.99943302500971e-07, + "logits/chosen": -0.1623246967792511, + "logits/rejected": -0.0977829098701477, + "logps/chosen": -1.9267008304595947, + "logps/rejected": -2.038973093032837, + "loss": 2.4041, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.26700782775879, + "rewards/margins": 1.1227233409881592, + "rewards/rejected": -20.389732360839844, + "step": 3095 + }, + { + "epoch": 0.10448616400957228, + "grad_norm": 34.67253494262695, + "learning_rate": 9.999387865874904e-07, + "logits/chosen": -0.31366387009620667, + "logits/rejected": -0.18402309715747833, + "logps/chosen": -1.8098551034927368, + "logps/rejected": -1.8108694553375244, + "loss": 3.2052, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.09855079650879, + "rewards/margins": 0.010143804363906384, + "rewards/rejected": -18.10869598388672, + "step": 3100 + }, + { + "epoch": 0.10465469008055546, + "grad_norm": 24.195720672607422, + "learning_rate": 9.999340976648928e-07, + "logits/chosen": -0.8281705975532532, + "logits/rejected": -0.8121271133422852, + "logps/chosen": -1.3327033519744873, + "logps/rejected": -1.316989541053772, + "loss": 3.2535, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -13.327033996582031, + "rewards/margins": -0.1571389138698578, + "rewards/rejected": -13.169894218444824, + "step": 3105 + }, + { + "epoch": 0.10482321615153864, + "grad_norm": 6.956542491912842, + "learning_rate": 9.999292357348005e-07, + "logits/chosen": -0.477001428604126, + "logits/rejected": -0.32476919889450073, + "logps/chosen": -1.6057878732681274, + "logps/rejected": -1.7301912307739258, + "loss": 2.7141, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.057878494262695, + "rewards/margins": 1.2440357208251953, + "rewards/rejected": -17.30191421508789, + "step": 3110 + }, + { + "epoch": 0.10499174222252182, + "grad_norm": 15.775235176086426, + "learning_rate": 9.99924200798896e-07, + "logits/chosen": -0.29664766788482666, + "logits/rejected": -0.24033299088478088, + "logps/chosen": -1.8023223876953125, + "logps/rejected": -1.7808622121810913, + "loss": 3.3402, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.023223876953125, + "rewards/margins": -0.21460160613059998, + "rewards/rejected": -17.808622360229492, + "step": 3115 + }, + { + "epoch": 0.105160268293505, + "grad_norm": 27.72269058227539, + "learning_rate": 9.999189928589217e-07, + "logits/chosen": -0.45769166946411133, + "logits/rejected": -0.4425771236419678, + "logps/chosen": -1.8008079528808594, + "logps/rejected": -1.8088890314102173, + "loss": 3.0318, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.008079528808594, + "rewards/margins": 0.0808115005493164, + "rewards/rejected": -18.08888816833496, + "step": 3120 + }, + { + "epoch": 0.10532879436448819, + "grad_norm": 32.695072174072266, + "learning_rate": 9.999136119166803e-07, + "logits/chosen": -0.2397170513868332, + "logits/rejected": -0.1902095526456833, + "logps/chosen": -1.861707091331482, + "logps/rejected": -1.9670108556747437, + "loss": 2.7405, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.617069244384766, + "rewards/margins": 1.0530370473861694, + "rewards/rejected": -19.670108795166016, + "step": 3125 + }, + { + "epoch": 0.10549732043547137, + "grad_norm": 32.09098815917969, + "learning_rate": 9.999080579740335e-07, + "logits/chosen": -0.2794944643974304, + "logits/rejected": -0.15733040869235992, + "logps/chosen": -1.518586277961731, + "logps/rejected": -1.4439319372177124, + "loss": 3.7903, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.18586254119873, + "rewards/margins": -0.7465431094169617, + "rewards/rejected": -14.43932056427002, + "step": 3130 + }, + { + "epoch": 0.10566584650645455, + "grad_norm": 35.430084228515625, + "learning_rate": 9.999023310329032e-07, + "logits/chosen": -0.39965033531188965, + "logits/rejected": -0.32974153757095337, + "logps/chosen": -1.5488475561141968, + "logps/rejected": -1.5709179639816284, + "loss": 2.9906, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.48847484588623, + "rewards/margins": 0.22070512175559998, + "rewards/rejected": -15.709179878234863, + "step": 3135 + }, + { + "epoch": 0.10583437257743773, + "grad_norm": 19.31629753112793, + "learning_rate": 9.99896431095272e-07, + "logits/chosen": -0.6975013613700867, + "logits/rejected": -0.6075304746627808, + "logps/chosen": -1.9307944774627686, + "logps/rejected": -1.9646923542022705, + "loss": 2.9467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.307945251464844, + "rewards/margins": 0.3389785885810852, + "rewards/rejected": -19.646923065185547, + "step": 3140 + }, + { + "epoch": 0.10600289864842091, + "grad_norm": 23.149045944213867, + "learning_rate": 9.998903581631808e-07, + "logits/chosen": -0.17336881160736084, + "logits/rejected": -0.12885281443595886, + "logps/chosen": -1.6452945470809937, + "logps/rejected": -1.8046413660049438, + "loss": 2.4754, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.452945709228516, + "rewards/margins": 1.5934689044952393, + "rewards/rejected": -18.04641342163086, + "step": 3145 + }, + { + "epoch": 0.10617142471940409, + "grad_norm": 33.11212158203125, + "learning_rate": 9.998841122387315e-07, + "logits/chosen": -0.3954317569732666, + "logits/rejected": -0.5562535524368286, + "logps/chosen": -1.827657699584961, + "logps/rejected": -1.8005775213241577, + "loss": 3.4283, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.276575088500977, + "rewards/margins": -0.27080029249191284, + "rewards/rejected": -18.00577735900879, + "step": 3150 + }, + { + "epoch": 0.10633995079038727, + "grad_norm": 24.7961368560791, + "learning_rate": 9.998776933240858e-07, + "logits/chosen": -0.6464129686355591, + "logits/rejected": -0.6087260842323303, + "logps/chosen": -1.6729981899261475, + "logps/rejected": -1.562254786491394, + "loss": 4.1478, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -16.729984283447266, + "rewards/margins": -1.1074340343475342, + "rewards/rejected": -15.62254810333252, + "step": 3155 + }, + { + "epoch": 0.10650847686137045, + "grad_norm": 49.34171676635742, + "learning_rate": 9.998711014214648e-07, + "logits/chosen": -0.1933070868253708, + "logits/rejected": -0.15686890482902527, + "logps/chosen": -1.7761684656143188, + "logps/rejected": -2.0039217472076416, + "loss": 2.2653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.761682510375977, + "rewards/margins": 2.277535915374756, + "rewards/rejected": -20.03921890258789, + "step": 3160 + }, + { + "epoch": 0.10667700293235363, + "grad_norm": 22.142282485961914, + "learning_rate": 9.998643365331496e-07, + "logits/chosen": -0.3458858132362366, + "logits/rejected": -0.3292901813983917, + "logps/chosen": -1.9010028839111328, + "logps/rejected": -2.0131144523620605, + "loss": 2.3761, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.010028839111328, + "rewards/margins": 1.1211156845092773, + "rewards/rejected": -20.131145477294922, + "step": 3165 + }, + { + "epoch": 0.10684552900333681, + "grad_norm": 20.39383316040039, + "learning_rate": 9.998573986614815e-07, + "logits/chosen": -0.5390158891677856, + "logits/rejected": -0.5976846218109131, + "logps/chosen": -1.7556896209716797, + "logps/rejected": -1.7425487041473389, + "loss": 3.2772, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.556896209716797, + "rewards/margins": -0.13141116499900818, + "rewards/rejected": -17.425487518310547, + "step": 3170 + }, + { + "epoch": 0.10701405507431999, + "grad_norm": 27.439123153686523, + "learning_rate": 9.998502878088613e-07, + "logits/chosen": -0.18560490012168884, + "logits/rejected": -0.11901037395000458, + "logps/chosen": -1.8787009716033936, + "logps/rejected": -1.9589424133300781, + "loss": 2.5655, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.787012100219727, + "rewards/margins": 0.8024119138717651, + "rewards/rejected": -19.58942413330078, + "step": 3175 + }, + { + "epoch": 0.10718258114530319, + "grad_norm": 47.90141296386719, + "learning_rate": 9.998430039777496e-07, + "logits/chosen": -0.8242633938789368, + "logits/rejected": -0.5837022662162781, + "logps/chosen": -1.5141726732254028, + "logps/rejected": -1.659218430519104, + "loss": 2.7919, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.14172649383545, + "rewards/margins": 1.4504566192626953, + "rewards/rejected": -16.59218406677246, + "step": 3180 + }, + { + "epoch": 0.10735110721628636, + "grad_norm": 11.335456848144531, + "learning_rate": 9.998355471706676e-07, + "logits/chosen": -0.09614237397909164, + "logits/rejected": -0.008962017484009266, + "logps/chosen": -1.9023325443267822, + "logps/rejected": -2.03184175491333, + "loss": 3.2201, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.023326873779297, + "rewards/margins": 1.2950923442840576, + "rewards/rejected": -20.318416595458984, + "step": 3185 + }, + { + "epoch": 0.10751963328726954, + "grad_norm": 26.510276794433594, + "learning_rate": 9.998279173901951e-07, + "logits/chosen": -0.27121829986572266, + "logits/rejected": -0.274336576461792, + "logps/chosen": -1.76497483253479, + "logps/rejected": -1.8142725229263306, + "loss": 3.2245, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.64974594116211, + "rewards/margins": 0.492978572845459, + "rewards/rejected": -18.14272689819336, + "step": 3190 + }, + { + "epoch": 0.10768815935825272, + "grad_norm": 36.92559051513672, + "learning_rate": 9.998201146389731e-07, + "logits/chosen": -0.08655179291963577, + "logits/rejected": -0.14043311774730682, + "logps/chosen": -2.083996534347534, + "logps/rejected": -2.0735764503479004, + "loss": 3.3045, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.8399658203125, + "rewards/margins": -0.1042022705078125, + "rewards/rejected": -20.735763549804688, + "step": 3195 + }, + { + "epoch": 0.1078566854292359, + "grad_norm": 24.98342514038086, + "learning_rate": 9.998121389197015e-07, + "logits/chosen": -0.5542726516723633, + "logits/rejected": -0.5774582624435425, + "logps/chosen": -1.83783757686615, + "logps/rejected": -1.8029735088348389, + "loss": 3.4937, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.378376007080078, + "rewards/margins": -0.34863871335983276, + "rewards/rejected": -18.029735565185547, + "step": 3200 + }, + { + "epoch": 0.1078566854292359, + "eval_logits/chosen": -0.7412148118019104, + "eval_logits/rejected": -0.7483264207839966, + "eval_logps/chosen": -1.679823637008667, + "eval_logps/rejected": -1.6859045028686523, + "eval_loss": 3.401301383972168, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.798236846923828, + "eval_rewards/margins": 0.06080753356218338, + "eval_rewards/rejected": -16.859045028686523, + "eval_runtime": 12.9013, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 3200 + }, + { + "epoch": 0.10802521150021908, + "grad_norm": 22.075878143310547, + "learning_rate": 9.998039902351404e-07, + "logits/chosen": -0.5936040878295898, + "logits/rejected": -0.6071931719779968, + "logps/chosen": -2.6149399280548096, + "logps/rejected": -2.601722002029419, + "loss": 3.6117, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.149398803710938, + "rewards/margins": -0.13217754662036896, + "rewards/rejected": -26.017221450805664, + "step": 3205 + }, + { + "epoch": 0.10819373757120226, + "grad_norm": 14.17717170715332, + "learning_rate": 9.997956685881097e-07, + "logits/chosen": -0.8582628965377808, + "logits/rejected": -0.9205999374389648, + "logps/chosen": -1.6408593654632568, + "logps/rejected": -1.5776455402374268, + "loss": 3.7241, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.408592224121094, + "rewards/margins": -0.6321369409561157, + "rewards/rejected": -15.776455879211426, + "step": 3210 + }, + { + "epoch": 0.10836226364218544, + "grad_norm": 41.10568618774414, + "learning_rate": 9.997871739814894e-07, + "logits/chosen": -0.6507139801979065, + "logits/rejected": -0.6784830689430237, + "logps/chosen": -1.9885571002960205, + "logps/rejected": -1.9365136623382568, + "loss": 3.6387, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.885570526123047, + "rewards/margins": -0.5204324722290039, + "rewards/rejected": -19.365137100219727, + "step": 3215 + }, + { + "epoch": 0.10853078971316862, + "grad_norm": 28.58173942565918, + "learning_rate": 9.99778506418219e-07, + "logits/chosen": -0.4071117043495178, + "logits/rejected": -0.44211989641189575, + "logps/chosen": -1.6532456874847412, + "logps/rejected": -1.737221121788025, + "loss": 3.0016, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.532459259033203, + "rewards/margins": 0.8397525548934937, + "rewards/rejected": -17.372211456298828, + "step": 3220 + }, + { + "epoch": 0.1086993157841518, + "grad_norm": 19.28890609741211, + "learning_rate": 9.99769665901298e-07, + "logits/chosen": -0.320314884185791, + "logits/rejected": -0.27917996048927307, + "logps/chosen": -1.9284579753875732, + "logps/rejected": -2.031317949295044, + "loss": 2.9486, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.28458023071289, + "rewards/margins": 1.0285985469818115, + "rewards/rejected": -20.313180923461914, + "step": 3225 + }, + { + "epoch": 0.10886784185513498, + "grad_norm": 22.410377502441406, + "learning_rate": 9.997606524337856e-07, + "logits/chosen": -0.5617813467979431, + "logits/rejected": -0.7193830013275146, + "logps/chosen": -1.606142282485962, + "logps/rejected": -1.7297741174697876, + "loss": 2.3398, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.06142234802246, + "rewards/margins": 1.23631751537323, + "rewards/rejected": -17.297740936279297, + "step": 3230 + }, + { + "epoch": 0.10903636792611818, + "grad_norm": 16.280595779418945, + "learning_rate": 9.997514660188012e-07, + "logits/chosen": -0.2999853193759918, + "logits/rejected": -0.3209920823574066, + "logps/chosen": -2.043691635131836, + "logps/rejected": -1.9882628917694092, + "loss": 3.8689, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.43691635131836, + "rewards/margins": -0.5542882084846497, + "rewards/rejected": -19.88262939453125, + "step": 3235 + }, + { + "epoch": 0.10920489399710136, + "grad_norm": 47.62866973876953, + "learning_rate": 9.997421066595242e-07, + "logits/chosen": -0.27927201986312866, + "logits/rejected": -0.3500203490257263, + "logps/chosen": -1.8980588912963867, + "logps/rejected": -1.8838756084442139, + "loss": 3.2512, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.980587005615234, + "rewards/margins": -0.1418312042951584, + "rewards/rejected": -18.838756561279297, + "step": 3240 + }, + { + "epoch": 0.10937342006808454, + "grad_norm": 23.319583892822266, + "learning_rate": 9.997325743591927e-07, + "logits/chosen": -0.4203563332557678, + "logits/rejected": -0.24051852524280548, + "logps/chosen": -1.784014344215393, + "logps/rejected": -1.7746086120605469, + "loss": 3.2181, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.840145111083984, + "rewards/margins": -0.09405937045812607, + "rewards/rejected": -17.746084213256836, + "step": 3245 + }, + { + "epoch": 0.10954194613906772, + "grad_norm": 40.517086029052734, + "learning_rate": 9.997228691211062e-07, + "logits/chosen": -0.6908475160598755, + "logits/rejected": -0.6859616041183472, + "logps/chosen": -1.6905533075332642, + "logps/rejected": -1.7253916263580322, + "loss": 2.771, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.905534744262695, + "rewards/margins": 0.3483811318874359, + "rewards/rejected": -17.253915786743164, + "step": 3250 + }, + { + "epoch": 0.1097104722100509, + "grad_norm": 20.207984924316406, + "learning_rate": 9.997129909486227e-07, + "logits/chosen": 0.0059540183283388615, + "logits/rejected": 0.033363353461027145, + "logps/chosen": -2.2267398834228516, + "logps/rejected": -2.2308731079101562, + "loss": 3.2782, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.267398834228516, + "rewards/margins": 0.04133415222167969, + "rewards/rejected": -22.308731079101562, + "step": 3255 + }, + { + "epoch": 0.10987899828103408, + "grad_norm": 21.74619483947754, + "learning_rate": 9.997029398451613e-07, + "logits/chosen": -0.6611747741699219, + "logits/rejected": -0.828205406665802, + "logps/chosen": -1.661211371421814, + "logps/rejected": -1.7203048467636108, + "loss": 2.9658, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.61211585998535, + "rewards/margins": 0.5909349322319031, + "rewards/rejected": -17.20305061340332, + "step": 3260 + }, + { + "epoch": 0.11004752435201726, + "grad_norm": 22.0983829498291, + "learning_rate": 9.996927158141997e-07, + "logits/chosen": -0.2499655932188034, + "logits/rejected": -0.25793296098709106, + "logps/chosen": -1.9067351818084717, + "logps/rejected": -1.9466044902801514, + "loss": 2.8391, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.067352294921875, + "rewards/margins": 0.3986912667751312, + "rewards/rejected": -19.466045379638672, + "step": 3265 + }, + { + "epoch": 0.11021605042300044, + "grad_norm": 30.887603759765625, + "learning_rate": 9.996823188592761e-07, + "logits/chosen": -0.48815393447875977, + "logits/rejected": -0.4621101915836334, + "logps/chosen": -1.9688169956207275, + "logps/rejected": -2.039149761199951, + "loss": 3.3813, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.688167572021484, + "rewards/margins": 0.7033309936523438, + "rewards/rejected": -20.39150047302246, + "step": 3270 + }, + { + "epoch": 0.11038457649398362, + "grad_norm": 18.655275344848633, + "learning_rate": 9.99671748983989e-07, + "logits/chosen": -0.6428505778312683, + "logits/rejected": -0.6691970229148865, + "logps/chosen": -1.804451584815979, + "logps/rejected": -1.76565682888031, + "loss": 3.4545, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.04451560974121, + "rewards/margins": -0.3879469037055969, + "rewards/rejected": -17.65656852722168, + "step": 3275 + }, + { + "epoch": 0.1105531025649668, + "grad_norm": 35.781402587890625, + "learning_rate": 9.996610061919956e-07, + "logits/chosen": -0.07583383470773697, + "logits/rejected": 0.12737944722175598, + "logps/chosen": -2.226592540740967, + "logps/rejected": -2.279959201812744, + "loss": 2.6481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.26592445373535, + "rewards/margins": 0.533666729927063, + "rewards/rejected": -22.799591064453125, + "step": 3280 + }, + { + "epoch": 0.11072162863594998, + "grad_norm": 30.42848777770996, + "learning_rate": 9.99650090487014e-07, + "logits/chosen": -0.32865971326828003, + "logits/rejected": -0.2108878195285797, + "logps/chosen": -1.6149228811264038, + "logps/rejected": -1.7221686840057373, + "loss": 2.7874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.149229049682617, + "rewards/margins": 1.0724586248397827, + "rewards/rejected": -17.221689224243164, + "step": 3285 + }, + { + "epoch": 0.11089015470693317, + "grad_norm": 42.69295883178711, + "learning_rate": 9.996390018728216e-07, + "logits/chosen": -0.1308925449848175, + "logits/rejected": -0.10833205282688141, + "logps/chosen": -1.7586466073989868, + "logps/rejected": -1.8226900100708008, + "loss": 2.7789, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.58646583557129, + "rewards/margins": 0.6404353380203247, + "rewards/rejected": -18.226900100708008, + "step": 3290 + }, + { + "epoch": 0.11105868077791635, + "grad_norm": 32.807212829589844, + "learning_rate": 9.996277403532553e-07, + "logits/chosen": -0.46557608246803284, + "logits/rejected": -0.3887875974178314, + "logps/chosen": -1.9230026006698608, + "logps/rejected": -1.7927436828613281, + "loss": 4.3985, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.230026245117188, + "rewards/margins": -1.302587628364563, + "rewards/rejected": -17.927440643310547, + "step": 3295 + }, + { + "epoch": 0.11122720684889953, + "grad_norm": 20.832653045654297, + "learning_rate": 9.996163059322128e-07, + "logits/chosen": -0.4223001003265381, + "logits/rejected": -0.36276328563690186, + "logps/chosen": -1.6754577159881592, + "logps/rejected": -1.7316392660140991, + "loss": 2.6598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.75457763671875, + "rewards/margins": 0.5618153810501099, + "rewards/rejected": -17.31639289855957, + "step": 3300 + }, + { + "epoch": 0.11139573291988271, + "grad_norm": 16.177337646484375, + "learning_rate": 9.996046986136508e-07, + "logits/chosen": -0.5084297060966492, + "logits/rejected": -0.6135154962539673, + "logps/chosen": -1.4946218729019165, + "logps/rejected": -1.5074679851531982, + "loss": 3.029, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.946218490600586, + "rewards/margins": 0.12846069037914276, + "rewards/rejected": -15.074679374694824, + "step": 3305 + }, + { + "epoch": 0.11156425899086589, + "grad_norm": 25.017414093017578, + "learning_rate": 9.995929184015864e-07, + "logits/chosen": -0.3279469311237335, + "logits/rejected": -0.25481417775154114, + "logps/chosen": -2.0133399963378906, + "logps/rejected": -2.0342857837677, + "loss": 3.1221, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.133398056030273, + "rewards/margins": 0.2094583511352539, + "rewards/rejected": -20.342859268188477, + "step": 3310 + }, + { + "epoch": 0.11173278506184907, + "grad_norm": 17.220426559448242, + "learning_rate": 9.99580965300096e-07, + "logits/chosen": -0.7247230410575867, + "logits/rejected": -0.6492441296577454, + "logps/chosen": -1.6587985754013062, + "logps/rejected": -1.6609020233154297, + "loss": 3.0976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.58798599243164, + "rewards/margins": 0.021033382043242455, + "rewards/rejected": -16.609020233154297, + "step": 3315 + }, + { + "epoch": 0.11190131113283225, + "grad_norm": 18.606319427490234, + "learning_rate": 9.995688393133163e-07, + "logits/chosen": -0.3558014929294586, + "logits/rejected": -0.3948605954647064, + "logps/chosen": -2.1518683433532715, + "logps/rejected": -2.4243855476379395, + "loss": 2.6067, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.5186824798584, + "rewards/margins": 2.725175142288208, + "rewards/rejected": -24.24385643005371, + "step": 3320 + }, + { + "epoch": 0.11206983720381543, + "grad_norm": 23.35149383544922, + "learning_rate": 9.995565404454436e-07, + "logits/chosen": -0.3345261216163635, + "logits/rejected": -0.4478190541267395, + "logps/chosen": -1.6862157583236694, + "logps/rejected": -1.6972332000732422, + "loss": 3.1233, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.862157821655273, + "rewards/margins": 0.11017484962940216, + "rewards/rejected": -16.972332000732422, + "step": 3325 + }, + { + "epoch": 0.11223836327479861, + "grad_norm": 21.12904167175293, + "learning_rate": 9.99544068700734e-07, + "logits/chosen": -0.47232285141944885, + "logits/rejected": -0.43678373098373413, + "logps/chosen": -1.9338699579238892, + "logps/rejected": -1.812021255493164, + "loss": 4.6974, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.338699340820312, + "rewards/margins": -1.218487024307251, + "rewards/rejected": -18.120214462280273, + "step": 3330 + }, + { + "epoch": 0.11240688934578179, + "grad_norm": 47.94718551635742, + "learning_rate": 9.995314240835032e-07, + "logits/chosen": -0.37584561109542847, + "logits/rejected": -0.2552175521850586, + "logps/chosen": -1.8831770420074463, + "logps/rejected": -1.8070863485336304, + "loss": 3.9093, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.831768035888672, + "rewards/margins": -0.7609058618545532, + "rewards/rejected": -18.070865631103516, + "step": 3335 + }, + { + "epoch": 0.11257541541676497, + "grad_norm": 6.487846374511719, + "learning_rate": 9.995186065981275e-07, + "logits/chosen": -0.3062947392463684, + "logits/rejected": -0.21386337280273438, + "logps/chosen": -2.0181260108947754, + "logps/rejected": -2.0927810668945312, + "loss": 3.1407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.181259155273438, + "rewards/margins": 0.7465518712997437, + "rewards/rejected": -20.927810668945312, + "step": 3340 + }, + { + "epoch": 0.11274394148774815, + "grad_norm": 13.073860168457031, + "learning_rate": 9.995056162490423e-07, + "logits/chosen": -0.64457768201828, + "logits/rejected": -0.6320878267288208, + "logps/chosen": -1.70615553855896, + "logps/rejected": -1.741113305091858, + "loss": 3.1045, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.061555862426758, + "rewards/margins": 0.3495769500732422, + "rewards/rejected": -17.4111328125, + "step": 3345 + }, + { + "epoch": 0.11291246755873134, + "grad_norm": 22.207111358642578, + "learning_rate": 9.994924530407429e-07, + "logits/chosen": -0.6318604946136475, + "logits/rejected": -0.6339292526245117, + "logps/chosen": -1.6218618154525757, + "logps/rejected": -1.507359266281128, + "loss": 4.2157, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.218618392944336, + "rewards/margins": -1.145025610923767, + "rewards/rejected": -15.073593139648438, + "step": 3350 + }, + { + "epoch": 0.11308099362971452, + "grad_norm": 53.56328582763672, + "learning_rate": 9.99479116977785e-07, + "logits/chosen": -0.2964858412742615, + "logits/rejected": -0.2377942055463791, + "logps/chosen": -2.008469581604004, + "logps/rejected": -2.0366320610046387, + "loss": 2.9689, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.08469581604004, + "rewards/margins": 0.28162336349487305, + "rewards/rejected": -20.36631965637207, + "step": 3355 + }, + { + "epoch": 0.1132495197006977, + "grad_norm": 21.440587997436523, + "learning_rate": 9.994656080647833e-07, + "logits/chosen": -0.11374900490045547, + "logits/rejected": -0.1913887858390808, + "logps/chosen": -2.431771755218506, + "logps/rejected": -2.641162872314453, + "loss": 2.6706, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.317718505859375, + "rewards/margins": 2.0939087867736816, + "rewards/rejected": -26.4116268157959, + "step": 3360 + }, + { + "epoch": 0.11341804577168088, + "grad_norm": 47.10682678222656, + "learning_rate": 9.994519263064125e-07, + "logits/chosen": -0.5051501393318176, + "logits/rejected": -0.5939575433731079, + "logps/chosen": -1.783546805381775, + "logps/rejected": -1.7367546558380127, + "loss": 3.549, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.835468292236328, + "rewards/margins": -0.4679209589958191, + "rewards/rejected": -17.3675479888916, + "step": 3365 + }, + { + "epoch": 0.11358657184266406, + "grad_norm": 22.752422332763672, + "learning_rate": 9.99438071707408e-07, + "logits/chosen": -0.7217915654182434, + "logits/rejected": -0.6456397771835327, + "logps/chosen": -1.6618263721466064, + "logps/rejected": -1.5785537958145142, + "loss": 3.8696, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -16.61826515197754, + "rewards/margins": -0.8327254056930542, + "rewards/rejected": -15.785539627075195, + "step": 3370 + }, + { + "epoch": 0.11375509791364724, + "grad_norm": 28.850156784057617, + "learning_rate": 9.994240442725639e-07, + "logits/chosen": -0.28035932779312134, + "logits/rejected": -0.38660159707069397, + "logps/chosen": -1.8262609243392944, + "logps/rejected": -1.949873685836792, + "loss": 2.7329, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.262609481811523, + "rewards/margins": 1.2361291646957397, + "rewards/rejected": -19.498737335205078, + "step": 3375 + }, + { + "epoch": 0.11392362398463042, + "grad_norm": 20.6746883392334, + "learning_rate": 9.994098440067344e-07, + "logits/chosen": -0.3793238699436188, + "logits/rejected": -0.5065209269523621, + "logps/chosen": -1.8463551998138428, + "logps/rejected": -1.7899481058120728, + "loss": 3.791, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.463550567626953, + "rewards/margins": -0.5640712976455688, + "rewards/rejected": -17.899478912353516, + "step": 3380 + }, + { + "epoch": 0.1140921500556136, + "grad_norm": 24.47591781616211, + "learning_rate": 9.99395470914834e-07, + "logits/chosen": -0.5719932317733765, + "logits/rejected": -0.38576334714889526, + "logps/chosen": -2.1512346267700195, + "logps/rejected": -2.1791083812713623, + "loss": 3.1521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.512344360351562, + "rewards/margins": 0.2787378430366516, + "rewards/rejected": -21.79108238220215, + "step": 3385 + }, + { + "epoch": 0.11426067612659678, + "grad_norm": 27.42087745666504, + "learning_rate": 9.993809250018364e-07, + "logits/chosen": -0.687567412853241, + "logits/rejected": -0.7501705884933472, + "logps/chosen": -1.8461263179779053, + "logps/rejected": -1.9768394231796265, + "loss": 3.1181, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.461265563964844, + "rewards/margins": 1.3071308135986328, + "rewards/rejected": -19.768394470214844, + "step": 3390 + }, + { + "epoch": 0.11442920219757996, + "grad_norm": 16.828157424926758, + "learning_rate": 9.993662062727757e-07, + "logits/chosen": -0.6422561407089233, + "logits/rejected": -0.6002863645553589, + "logps/chosen": -1.6547744274139404, + "logps/rejected": -1.6610130071640015, + "loss": 3.0731, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.547744750976562, + "rewards/margins": 0.06238384172320366, + "rewards/rejected": -16.61012840270996, + "step": 3395 + }, + { + "epoch": 0.11459772826856314, + "grad_norm": 26.950260162353516, + "learning_rate": 9.99351314732745e-07, + "logits/chosen": -0.11768583953380585, + "logits/rejected": -0.1222616657614708, + "logps/chosen": -2.5058677196502686, + "logps/rejected": -2.431462287902832, + "loss": 4.0213, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.058677673339844, + "rewards/margins": -0.7440546154975891, + "rewards/rejected": -24.31462287902832, + "step": 3400 + }, + { + "epoch": 0.11476625433954633, + "grad_norm": 25.076194763183594, + "learning_rate": 9.99336250386898e-07, + "logits/chosen": -0.4412030577659607, + "logits/rejected": -0.3538280725479126, + "logps/chosen": -2.0328128337860107, + "logps/rejected": -2.137967586517334, + "loss": 2.6655, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.3281307220459, + "rewards/margins": 1.051546335220337, + "rewards/rejected": -21.379674911499023, + "step": 3405 + }, + { + "epoch": 0.11493478041052951, + "grad_norm": 15.957403182983398, + "learning_rate": 9.993210132404479e-07, + "logits/chosen": -0.4786578118801117, + "logits/rejected": -0.5396562814712524, + "logps/chosen": -2.0582754611968994, + "logps/rejected": -2.204559087753296, + "loss": 2.1681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.582754135131836, + "rewards/margins": 1.4628350734710693, + "rewards/rejected": -22.045589447021484, + "step": 3410 + }, + { + "epoch": 0.1151033064815127, + "grad_norm": 43.187171936035156, + "learning_rate": 9.993056032986676e-07, + "logits/chosen": -0.6285707950592041, + "logits/rejected": -0.4946421682834625, + "logps/chosen": -1.5473029613494873, + "logps/rejected": -1.574300765991211, + "loss": 2.8268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.473031997680664, + "rewards/margins": 0.269977867603302, + "rewards/rejected": -15.743008613586426, + "step": 3415 + }, + { + "epoch": 0.11527183255249587, + "grad_norm": 16.885690689086914, + "learning_rate": 9.992900205668896e-07, + "logits/chosen": -0.3438575267791748, + "logits/rejected": -0.3555828928947449, + "logps/chosen": -1.5282278060913086, + "logps/rejected": -1.643481969833374, + "loss": 2.6082, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.28227710723877, + "rewards/margins": 1.1525436639785767, + "rewards/rejected": -16.43482208251953, + "step": 3420 + }, + { + "epoch": 0.11544035862347905, + "grad_norm": 101.62150573730469, + "learning_rate": 9.992742650505071e-07, + "logits/chosen": -0.37929654121398926, + "logits/rejected": -0.45227640867233276, + "logps/chosen": -2.4064865112304688, + "logps/rejected": -2.1600139141082764, + "loss": 6.0675, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.064863204956055, + "rewards/margins": -2.464722156524658, + "rewards/rejected": -21.600141525268555, + "step": 3425 + }, + { + "epoch": 0.11560888469446223, + "grad_norm": 24.92697525024414, + "learning_rate": 9.992583367549719e-07, + "logits/chosen": -0.5041292905807495, + "logits/rejected": -0.6150269508361816, + "logps/chosen": -1.5577831268310547, + "logps/rejected": -1.5323947668075562, + "loss": 3.3247, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.577832221984863, + "rewards/margins": -0.2538827955722809, + "rewards/rejected": -15.323948860168457, + "step": 3430 + }, + { + "epoch": 0.11577741076544541, + "grad_norm": 26.776927947998047, + "learning_rate": 9.992422356857963e-07, + "logits/chosen": -0.6362167596817017, + "logits/rejected": -0.6052538752555847, + "logps/chosen": -1.5442123413085938, + "logps/rejected": -1.5520304441452026, + "loss": 3.1438, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.442123413085938, + "rewards/margins": 0.07818098366260529, + "rewards/rejected": -15.520304679870605, + "step": 3435 + }, + { + "epoch": 0.11594593683642859, + "grad_norm": 23.41965675354004, + "learning_rate": 9.992259618485523e-07, + "logits/chosen": 0.030453210696578026, + "logits/rejected": 0.03214035555720329, + "logps/chosen": -2.0576891899108887, + "logps/rejected": -2.1275794506073, + "loss": 2.723, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.576892852783203, + "rewards/margins": 0.6989046335220337, + "rewards/rejected": -21.27579689025879, + "step": 3440 + }, + { + "epoch": 0.11611446290741177, + "grad_norm": 23.502748489379883, + "learning_rate": 9.992095152488718e-07, + "logits/chosen": -0.40521278977394104, + "logits/rejected": -0.27406761050224304, + "logps/chosen": -1.9019501209259033, + "logps/rejected": -2.0432791709899902, + "loss": 2.512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.019500732421875, + "rewards/margins": 1.4132897853851318, + "rewards/rejected": -20.432790756225586, + "step": 3445 + }, + { + "epoch": 0.11628298897839495, + "grad_norm": 21.05483627319336, + "learning_rate": 9.991928958924458e-07, + "logits/chosen": -0.36097151041030884, + "logits/rejected": -0.35222867131233215, + "logps/chosen": -1.7462047338485718, + "logps/rejected": -1.74808669090271, + "loss": 3.1059, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.462047576904297, + "rewards/margins": 0.01881999894976616, + "rewards/rejected": -17.48086929321289, + "step": 3450 + }, + { + "epoch": 0.11645151504937813, + "grad_norm": 19.621593475341797, + "learning_rate": 9.991761037850262e-07, + "logits/chosen": -0.4645184874534607, + "logits/rejected": -0.47621220350265503, + "logps/chosen": -1.7795826196670532, + "logps/rejected": -1.776755928993225, + "loss": 3.2258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.795825958251953, + "rewards/margins": -0.028265666216611862, + "rewards/rejected": -17.767559051513672, + "step": 3455 + }, + { + "epoch": 0.11662004112036133, + "grad_norm": 30.127208709716797, + "learning_rate": 9.99159138932424e-07, + "logits/chosen": -0.4851096570491791, + "logits/rejected": -0.47763925790786743, + "logps/chosen": -1.8681427240371704, + "logps/rejected": -1.7204406261444092, + "loss": 4.5361, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.681425094604492, + "rewards/margins": -1.4770203828811646, + "rewards/rejected": -17.20440673828125, + "step": 3460 + }, + { + "epoch": 0.1167885671913445, + "grad_norm": 27.600656509399414, + "learning_rate": 9.991420013405095e-07, + "logits/chosen": -0.4824017584323883, + "logits/rejected": -0.43301159143447876, + "logps/chosen": -1.8536741733551025, + "logps/rejected": -1.9033809900283813, + "loss": 2.9276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.536741256713867, + "rewards/margins": 0.49706658720970154, + "rewards/rejected": -19.033809661865234, + "step": 3465 + }, + { + "epoch": 0.11695709326232769, + "grad_norm": 15.61844253540039, + "learning_rate": 9.99124691015214e-07, + "logits/chosen": -0.7837399840354919, + "logits/rejected": -0.675243079662323, + "logps/chosen": -1.6042782068252563, + "logps/rejected": -1.7261016368865967, + "loss": 2.5794, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.042781829833984, + "rewards/margins": 1.2182351350784302, + "rewards/rejected": -17.261016845703125, + "step": 3470 + }, + { + "epoch": 0.11712561933331087, + "grad_norm": 18.784343719482422, + "learning_rate": 9.991072079625275e-07, + "logits/chosen": -0.08538699895143509, + "logits/rejected": -0.10807951539754868, + "logps/chosen": -1.6151269674301147, + "logps/rejected": -1.8042793273925781, + "loss": 2.1694, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.151269912719727, + "rewards/margins": 1.8915237188339233, + "rewards/rejected": -18.04279327392578, + "step": 3475 + }, + { + "epoch": 0.11729414540429405, + "grad_norm": 24.512720108032227, + "learning_rate": 9.990895521885005e-07, + "logits/chosen": 0.02145923301577568, + "logits/rejected": -0.07628260552883148, + "logps/chosen": -1.8797547817230225, + "logps/rejected": -1.8971583843231201, + "loss": 3.3072, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.797550201416016, + "rewards/margins": 0.17403507232666016, + "rewards/rejected": -18.97158432006836, + "step": 3480 + }, + { + "epoch": 0.11746267147527722, + "grad_norm": 13.824844360351562, + "learning_rate": 9.990717236992428e-07, + "logits/chosen": -0.40924936532974243, + "logits/rejected": -0.3904082477092743, + "logps/chosen": -1.5942176580429077, + "logps/rejected": -1.5907622575759888, + "loss": 3.1973, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.942178726196289, + "rewards/margins": -0.03455467149615288, + "rewards/rejected": -15.907621383666992, + "step": 3485 + }, + { + "epoch": 0.1176311975462604, + "grad_norm": 21.68479347229004, + "learning_rate": 9.990537225009242e-07, + "logits/chosen": -0.3777514100074768, + "logits/rejected": -0.34054070711135864, + "logps/chosen": -1.5628550052642822, + "logps/rejected": -1.6658351421356201, + "loss": 2.6825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.62855052947998, + "rewards/margins": 1.0298006534576416, + "rewards/rejected": -16.65835189819336, + "step": 3490 + }, + { + "epoch": 0.11779972361724358, + "grad_norm": 40.326507568359375, + "learning_rate": 9.99035548599774e-07, + "logits/chosen": -0.5707781910896301, + "logits/rejected": -0.5124248266220093, + "logps/chosen": -1.8460489511489868, + "logps/rejected": -1.740540862083435, + "loss": 4.0993, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.46048927307129, + "rewards/margins": -1.0550801753997803, + "rewards/rejected": -17.405406951904297, + "step": 3495 + }, + { + "epoch": 0.11796824968822676, + "grad_norm": 76.25070190429688, + "learning_rate": 9.990172020020818e-07, + "logits/chosen": -0.30215927958488464, + "logits/rejected": -0.23703379929065704, + "logps/chosen": -1.8280613422393799, + "logps/rejected": -1.7260147333145142, + "loss": 4.21, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.280614852905273, + "rewards/margins": -1.020465612411499, + "rewards/rejected": -17.260149002075195, + "step": 3500 + }, + { + "epoch": 0.11813677575920994, + "grad_norm": 26.626041412353516, + "learning_rate": 9.989986827141963e-07, + "logits/chosen": -0.4228217601776123, + "logits/rejected": -0.4215395450592041, + "logps/chosen": -1.7155015468597412, + "logps/rejected": -1.7580169439315796, + "loss": 2.7408, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.15501594543457, + "rewards/margins": 0.4251536428928375, + "rewards/rejected": -17.580167770385742, + "step": 3505 + }, + { + "epoch": 0.11830530183019312, + "grad_norm": 25.78866195678711, + "learning_rate": 9.989799907425268e-07, + "logits/chosen": -0.3247675597667694, + "logits/rejected": -0.3234170079231262, + "logps/chosen": -1.9856287240982056, + "logps/rejected": -2.078611373901367, + "loss": 3.0682, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.856287002563477, + "rewards/margins": 0.9298248291015625, + "rewards/rejected": -20.786113739013672, + "step": 3510 + }, + { + "epoch": 0.11847382790117632, + "grad_norm": 27.890932083129883, + "learning_rate": 9.989611260935413e-07, + "logits/chosen": -0.3991270959377289, + "logits/rejected": -0.38465866446495056, + "logps/chosen": -2.2957253456115723, + "logps/rejected": -2.220700740814209, + "loss": 3.9668, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.957252502441406, + "rewards/margins": -0.750244140625, + "rewards/rejected": -22.207008361816406, + "step": 3515 + }, + { + "epoch": 0.1186423539721595, + "grad_norm": 62.70400619506836, + "learning_rate": 9.989420887737683e-07, + "logits/chosen": -0.32912951707839966, + "logits/rejected": -0.4093703329563141, + "logps/chosen": -2.0264673233032227, + "logps/rejected": -1.9524818658828735, + "loss": 3.9945, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.264673233032227, + "rewards/margins": -0.7398567199707031, + "rewards/rejected": -19.52481460571289, + "step": 3520 + }, + { + "epoch": 0.11881088004314268, + "grad_norm": 31.33793067932129, + "learning_rate": 9.98922878789796e-07, + "logits/chosen": -0.12958894670009613, + "logits/rejected": -0.21700029075145721, + "logps/chosen": -2.0517940521240234, + "logps/rejected": -2.1143057346343994, + "loss": 2.6628, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.517940521240234, + "rewards/margins": 0.6251155734062195, + "rewards/rejected": -21.143054962158203, + "step": 3525 + }, + { + "epoch": 0.11897940611412586, + "grad_norm": 32.016998291015625, + "learning_rate": 9.989034961482721e-07, + "logits/chosen": -0.3710765242576599, + "logits/rejected": -0.35672563314437866, + "logps/chosen": -2.2033329010009766, + "logps/rejected": -2.3549094200134277, + "loss": 2.7372, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.0333309173584, + "rewards/margins": 1.5157625675201416, + "rewards/rejected": -23.54909324645996, + "step": 3530 + }, + { + "epoch": 0.11914793218510904, + "grad_norm": 31.52229881286621, + "learning_rate": 9.988839408559044e-07, + "logits/chosen": -0.5221918225288391, + "logits/rejected": -0.5441256761550903, + "logps/chosen": -1.7516603469848633, + "logps/rejected": -1.7036716938018799, + "loss": 3.6209, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.5166015625, + "rewards/margins": -0.4798874855041504, + "rewards/rejected": -17.03671646118164, + "step": 3535 + }, + { + "epoch": 0.11931645825609222, + "grad_norm": 23.19977569580078, + "learning_rate": 9.988642129194598e-07, + "logits/chosen": -0.23060889542102814, + "logits/rejected": -0.27095380425453186, + "logps/chosen": -1.9323209524154663, + "logps/rejected": -1.8174877166748047, + "loss": 4.1969, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.32320785522461, + "rewards/margins": -1.1483336687088013, + "rewards/rejected": -18.174877166748047, + "step": 3540 + }, + { + "epoch": 0.1194849843270754, + "grad_norm": 25.622182846069336, + "learning_rate": 9.988443123457655e-07, + "logits/chosen": -0.320268452167511, + "logits/rejected": -0.28582894802093506, + "logps/chosen": -1.6403043270111084, + "logps/rejected": -1.7142482995986938, + "loss": 2.5097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.403045654296875, + "rewards/margins": 0.7394381761550903, + "rewards/rejected": -17.14248275756836, + "step": 3545 + }, + { + "epoch": 0.11965351039805858, + "grad_norm": 30.17853355407715, + "learning_rate": 9.988242391417086e-07, + "logits/chosen": -0.7527714967727661, + "logits/rejected": -0.7969571352005005, + "logps/chosen": -1.7517030239105225, + "logps/rejected": -1.8085277080535889, + "loss": 3.1445, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.517030715942383, + "rewards/margins": 0.5682462453842163, + "rewards/rejected": -18.085277557373047, + "step": 3550 + }, + { + "epoch": 0.11982203646904176, + "grad_norm": 34.78050994873047, + "learning_rate": 9.988039933142353e-07, + "logits/chosen": -0.6721317768096924, + "logits/rejected": -0.6552478075027466, + "logps/chosen": -1.6741306781768799, + "logps/rejected": -1.7111762762069702, + "loss": 2.8275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.74130630493164, + "rewards/margins": 0.37045803666114807, + "rewards/rejected": -17.11176300048828, + "step": 3555 + }, + { + "epoch": 0.11999056254002494, + "grad_norm": 39.94550323486328, + "learning_rate": 9.98783574870352e-07, + "logits/chosen": -0.21335110068321228, + "logits/rejected": -0.20708951354026794, + "logps/chosen": -1.988205909729004, + "logps/rejected": -2.133162021636963, + "loss": 3.0632, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.882061004638672, + "rewards/margins": 1.4495601654052734, + "rewards/rejected": -21.331623077392578, + "step": 3560 + }, + { + "epoch": 0.12015908861100812, + "grad_norm": 88.8822021484375, + "learning_rate": 9.987629838171248e-07, + "logits/chosen": -0.6824513673782349, + "logits/rejected": -0.7000759840011597, + "logps/chosen": -1.8915393352508545, + "logps/rejected": -1.930729866027832, + "loss": 2.9006, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.915393829345703, + "rewards/margins": 0.39190226793289185, + "rewards/rejected": -19.307296752929688, + "step": 3565 + }, + { + "epoch": 0.12032761468199131, + "grad_norm": 24.55368995666504, + "learning_rate": 9.987422201616792e-07, + "logits/chosen": -0.585277259349823, + "logits/rejected": -0.6046692132949829, + "logps/chosen": -2.3041303157806396, + "logps/rejected": -2.2990269660949707, + "loss": 3.2232, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.041301727294922, + "rewards/margins": -0.05103177949786186, + "rewards/rejected": -22.990270614624023, + "step": 3570 + }, + { + "epoch": 0.12049614075297449, + "grad_norm": 18.27791976928711, + "learning_rate": 9.98721283911201e-07, + "logits/chosen": -0.28167563676834106, + "logits/rejected": -0.320262610912323, + "logps/chosen": -1.744283676147461, + "logps/rejected": -1.791398286819458, + "loss": 2.734, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.442834854125977, + "rewards/margins": 0.4711475372314453, + "rewards/rejected": -17.913982391357422, + "step": 3575 + }, + { + "epoch": 0.12066466682395767, + "grad_norm": 16.623876571655273, + "learning_rate": 9.987001750729354e-07, + "logits/chosen": -0.7061062455177307, + "logits/rejected": -0.8444076776504517, + "logps/chosen": -1.6471954584121704, + "logps/rejected": -1.7329185009002686, + "loss": 2.4762, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.471954345703125, + "rewards/margins": 0.8572310209274292, + "rewards/rejected": -17.329185485839844, + "step": 3580 + }, + { + "epoch": 0.12083319289494085, + "grad_norm": 57.20680236816406, + "learning_rate": 9.98678893654187e-07, + "logits/chosen": -0.08321056514978409, + "logits/rejected": -0.08359815180301666, + "logps/chosen": -2.018667221069336, + "logps/rejected": -1.961265206336975, + "loss": 3.642, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.18667221069336, + "rewards/margins": -0.5740194320678711, + "rewards/rejected": -19.612651824951172, + "step": 3585 + }, + { + "epoch": 0.12100171896592403, + "grad_norm": 17.29876708984375, + "learning_rate": 9.986574396623208e-07, + "logits/chosen": -0.3871995806694031, + "logits/rejected": -0.45271673798561096, + "logps/chosen": -2.271118640899658, + "logps/rejected": -2.2195260524749756, + "loss": 5.4421, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.711185455322266, + "rewards/margins": -0.5159248113632202, + "rewards/rejected": -22.19525909423828, + "step": 3590 + }, + { + "epoch": 0.12117024503690721, + "grad_norm": 22.723844528198242, + "learning_rate": 9.986358131047609e-07, + "logits/chosen": -0.4757024645805359, + "logits/rejected": -0.35431593656539917, + "logps/chosen": -1.4331119060516357, + "logps/rejected": -1.6594613790512085, + "loss": 1.8193, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.3311185836792, + "rewards/margins": 2.2634944915771484, + "rewards/rejected": -16.594614028930664, + "step": 3595 + }, + { + "epoch": 0.12133877110789039, + "grad_norm": 77.45389556884766, + "learning_rate": 9.986140139889916e-07, + "logits/chosen": -0.04966864734888077, + "logits/rejected": 0.07870599627494812, + "logps/chosen": -2.0236928462982178, + "logps/rejected": -2.0264344215393066, + "loss": 3.1565, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.236928939819336, + "rewards/margins": 0.027413081377744675, + "rewards/rejected": -20.26434326171875, + "step": 3600 + }, + { + "epoch": 0.12133877110789039, + "eval_logits/chosen": -0.7526273131370544, + "eval_logits/rejected": -0.7618316411972046, + "eval_logps/chosen": -1.6854208707809448, + "eval_logps/rejected": -1.6938505172729492, + "eval_loss": 3.3851799964904785, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": -16.854209899902344, + "eval_rewards/margins": 0.08429720997810364, + "eval_rewards/rejected": -16.938507080078125, + "eval_runtime": 12.8944, + "eval_samples_per_second": 7.755, + "eval_steps_per_second": 1.939, + "step": 3600 + }, + { + "epoch": 0.12150729717887357, + "grad_norm": 22.01042938232422, + "learning_rate": 9.98592042322557e-07, + "logits/chosen": -0.6339259147644043, + "logits/rejected": -0.670158326625824, + "logps/chosen": -1.7541553974151611, + "logps/rejected": -1.723435640335083, + "loss": 3.4452, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.541553497314453, + "rewards/margins": -0.3071993887424469, + "rewards/rejected": -17.23435401916504, + "step": 3605 + }, + { + "epoch": 0.12167582324985675, + "grad_norm": 87.37193298339844, + "learning_rate": 9.9856989811306e-07, + "logits/chosen": -0.28349608182907104, + "logits/rejected": -0.31893137097358704, + "logps/chosen": -2.0025875568389893, + "logps/rejected": -1.8682279586791992, + "loss": 4.5711, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.025875091552734, + "rewards/margins": -1.3435958623886108, + "rewards/rejected": -18.682279586791992, + "step": 3610 + }, + { + "epoch": 0.12184434932083993, + "grad_norm": 22.39496421813965, + "learning_rate": 9.985475813681639e-07, + "logits/chosen": -0.6315996050834656, + "logits/rejected": -0.7218677401542664, + "logps/chosen": -1.6861488819122314, + "logps/rejected": -1.6557300090789795, + "loss": 3.4551, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.86149024963379, + "rewards/margins": -0.3041892945766449, + "rewards/rejected": -16.557300567626953, + "step": 3615 + }, + { + "epoch": 0.12201287539182311, + "grad_norm": 24.289531707763672, + "learning_rate": 9.985250920955921e-07, + "logits/chosen": -0.590740978717804, + "logits/rejected": -0.4106348156929016, + "logps/chosen": -1.9113991260528564, + "logps/rejected": -1.8914527893066406, + "loss": 3.4557, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.113990783691406, + "rewards/margins": -0.19946375489234924, + "rewards/rejected": -18.914525985717773, + "step": 3620 + }, + { + "epoch": 0.1221814014628063, + "grad_norm": 31.299972534179688, + "learning_rate": 9.98502430303127e-07, + "logits/chosen": -0.2667599022388458, + "logits/rejected": -0.2786843180656433, + "logps/chosen": -2.008376359939575, + "logps/rejected": -2.3521313667297363, + "loss": 2.2606, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.08376121520996, + "rewards/margins": 3.437549591064453, + "rewards/rejected": -23.521310806274414, + "step": 3625 + }, + { + "epoch": 0.12234992753378948, + "grad_norm": 31.534088134765625, + "learning_rate": 9.984795959986112e-07, + "logits/chosen": 0.22582361102104187, + "logits/rejected": 0.14064531028270721, + "logps/chosen": -2.0859715938568115, + "logps/rejected": -1.983786940574646, + "loss": 4.2662, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.859716415405273, + "rewards/margins": -1.0218474864959717, + "rewards/rejected": -19.83786964416504, + "step": 3630 + }, + { + "epoch": 0.12251845360477266, + "grad_norm": 32.42240905761719, + "learning_rate": 9.984565891899463e-07, + "logits/chosen": -0.1923142522573471, + "logits/rejected": -0.25374284386634827, + "logps/chosen": -1.7470464706420898, + "logps/rejected": -1.6832809448242188, + "loss": 3.7209, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.47046661376953, + "rewards/margins": -0.6376568675041199, + "rewards/rejected": -16.832809448242188, + "step": 3635 + }, + { + "epoch": 0.12268697967575584, + "grad_norm": 35.28072738647461, + "learning_rate": 9.984334098850944e-07, + "logits/chosen": -0.3545475900173187, + "logits/rejected": -0.23502996563911438, + "logps/chosen": -2.015934944152832, + "logps/rejected": -1.9783185720443726, + "loss": 3.5999, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.159351348876953, + "rewards/margins": -0.37616491317749023, + "rewards/rejected": -19.783187866210938, + "step": 3640 + }, + { + "epoch": 0.12285550574673902, + "grad_norm": 27.003463745117188, + "learning_rate": 9.984100580920768e-07, + "logits/chosen": -0.27759090065956116, + "logits/rejected": -0.2839242219924927, + "logps/chosen": -1.5591684579849243, + "logps/rejected": -1.6653550863265991, + "loss": 3.5842, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.591684341430664, + "rewards/margins": 1.0618667602539062, + "rewards/rejected": -16.65355110168457, + "step": 3645 + }, + { + "epoch": 0.1230240318177222, + "grad_norm": 27.013710021972656, + "learning_rate": 9.983865338189746e-07, + "logits/chosen": -0.5425965189933777, + "logits/rejected": -0.5991086959838867, + "logps/chosen": -1.6972814798355103, + "logps/rejected": -1.698444128036499, + "loss": 3.2068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.97281265258789, + "rewards/margins": 0.011627817526459694, + "rewards/rejected": -16.984439849853516, + "step": 3650 + }, + { + "epoch": 0.12319255788870538, + "grad_norm": 12.66113567352295, + "learning_rate": 9.983628370739288e-07, + "logits/chosen": -0.6093885898590088, + "logits/rejected": -0.48226800560951233, + "logps/chosen": -1.6346813440322876, + "logps/rejected": -1.6976230144500732, + "loss": 3.1563, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.346811294555664, + "rewards/margins": 0.6294196844100952, + "rewards/rejected": -16.976232528686523, + "step": 3655 + }, + { + "epoch": 0.12336108395968856, + "grad_norm": 66.68254089355469, + "learning_rate": 9.983389678651398e-07, + "logits/chosen": -0.32299280166625977, + "logits/rejected": -0.2723495662212372, + "logps/chosen": -2.3520588874816895, + "logps/rejected": -2.3130524158477783, + "loss": 4.0511, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.520587921142578, + "rewards/margins": -0.39006367325782776, + "rewards/rejected": -23.130521774291992, + "step": 3660 + }, + { + "epoch": 0.12352961003067174, + "grad_norm": 18.499338150024414, + "learning_rate": 9.983149262008677e-07, + "logits/chosen": -0.7030686736106873, + "logits/rejected": -0.6500356793403625, + "logps/chosen": -1.80410635471344, + "logps/rejected": -1.8138717412948608, + "loss": 2.9992, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.041065216064453, + "rewards/margins": 0.09765472263097763, + "rewards/rejected": -18.138717651367188, + "step": 3665 + }, + { + "epoch": 0.12369813610165492, + "grad_norm": 22.921619415283203, + "learning_rate": 9.982907120894325e-07, + "logits/chosen": -0.4772499203681946, + "logits/rejected": -0.6954213380813599, + "logps/chosen": -1.6093018054962158, + "logps/rejected": -1.5932279825210571, + "loss": 3.2344, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.093017578125, + "rewards/margins": -0.16073980927467346, + "rewards/rejected": -15.932279586791992, + "step": 3670 + }, + { + "epoch": 0.1238666621726381, + "grad_norm": 21.458614349365234, + "learning_rate": 9.982663255392137e-07, + "logits/chosen": -0.34770479798316956, + "logits/rejected": -0.39969635009765625, + "logps/chosen": -1.571874976158142, + "logps/rejected": -1.5835822820663452, + "loss": 3.1312, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.71875, + "rewards/margins": 0.11707315593957901, + "rewards/rejected": -15.835823059082031, + "step": 3675 + }, + { + "epoch": 0.1240351882436213, + "grad_norm": 29.32381820678711, + "learning_rate": 9.982417665586508e-07, + "logits/chosen": -0.4228406846523285, + "logits/rejected": -0.45245176553726196, + "logps/chosen": -2.108222007751465, + "logps/rejected": -1.8850529193878174, + "loss": 6.1308, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.08222007751465, + "rewards/margins": -2.2316925525665283, + "rewards/rejected": -18.850528717041016, + "step": 3680 + }, + { + "epoch": 0.12420371431460447, + "grad_norm": 25.706727981567383, + "learning_rate": 9.98217035156242e-07, + "logits/chosen": -0.32080286741256714, + "logits/rejected": -0.32200899720191956, + "logps/chosen": -2.022761583328247, + "logps/rejected": -2.0280632972717285, + "loss": 3.2838, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.227615356445312, + "rewards/margins": 0.05301866680383682, + "rewards/rejected": -20.2806339263916, + "step": 3685 + }, + { + "epoch": 0.12437224038558765, + "grad_norm": 37.784912109375, + "learning_rate": 9.981921313405464e-07, + "logits/chosen": -0.3293726444244385, + "logits/rejected": -0.42785710096359253, + "logps/chosen": -1.616093635559082, + "logps/rejected": -1.6466785669326782, + "loss": 2.855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.160934448242188, + "rewards/margins": 0.3058490753173828, + "rewards/rejected": -16.466785430908203, + "step": 3690 + }, + { + "epoch": 0.12454076645657083, + "grad_norm": 22.215789794921875, + "learning_rate": 9.981670551201824e-07, + "logits/chosen": -0.7644900679588318, + "logits/rejected": -0.6727944016456604, + "logps/chosen": -1.6866724491119385, + "logps/rejected": -1.7234609127044678, + "loss": 2.886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.866724014282227, + "rewards/margins": 0.3678857684135437, + "rewards/rejected": -17.234609603881836, + "step": 3695 + }, + { + "epoch": 0.12470929252755401, + "grad_norm": 32.08245849609375, + "learning_rate": 9.981418065038273e-07, + "logits/chosen": -0.30399376153945923, + "logits/rejected": -0.19136790931224823, + "logps/chosen": -1.7521512508392334, + "logps/rejected": -1.924407958984375, + "loss": 1.9513, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.521514892578125, + "rewards/margins": 1.7225669622421265, + "rewards/rejected": -19.24407958984375, + "step": 3700 + }, + { + "epoch": 0.1248778185985372, + "grad_norm": 35.04029846191406, + "learning_rate": 9.981163855002192e-07, + "logits/chosen": -0.35477086901664734, + "logits/rejected": -0.43373337388038635, + "logps/chosen": -1.8812839984893799, + "logps/rejected": -1.8338260650634766, + "loss": 3.5299, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.812841415405273, + "rewards/margins": -0.4745791554450989, + "rewards/rejected": -18.338260650634766, + "step": 3705 + }, + { + "epoch": 0.1250463446695204, + "grad_norm": 18.448402404785156, + "learning_rate": 9.98090792118155e-07, + "logits/chosen": -0.2764646112918854, + "logits/rejected": -0.3987608850002289, + "logps/chosen": -1.8961387872695923, + "logps/rejected": -2.0002424716949463, + "loss": 2.5696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.961389541625977, + "rewards/margins": 1.041035532951355, + "rewards/rejected": -20.002424240112305, + "step": 3710 + }, + { + "epoch": 0.12521487074050355, + "grad_norm": 20.717815399169922, + "learning_rate": 9.980650263664917e-07, + "logits/chosen": -0.2851320803165436, + "logits/rejected": -0.3245907723903656, + "logps/chosen": -1.7553514242172241, + "logps/rejected": -1.8028484582901, + "loss": 2.8536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.553516387939453, + "rewards/margins": 0.4749675691127777, + "rewards/rejected": -18.02848243713379, + "step": 3715 + }, + { + "epoch": 0.12538339681148675, + "grad_norm": 17.02474021911621, + "learning_rate": 9.980390882541456e-07, + "logits/chosen": -0.7261825799942017, + "logits/rejected": -0.675274670124054, + "logps/chosen": -1.6618716716766357, + "logps/rejected": -1.7756210565567017, + "loss": 2.3753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.618717193603516, + "rewards/margins": 1.137491226196289, + "rewards/rejected": -17.756208419799805, + "step": 3720 + }, + { + "epoch": 0.1255519228824699, + "grad_norm": 22.09320640563965, + "learning_rate": 9.980129777900932e-07, + "logits/chosen": -0.6453119516372681, + "logits/rejected": -0.6601329445838928, + "logps/chosen": -1.3661413192749023, + "logps/rejected": -1.289612889289856, + "loss": 3.8121, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -13.661413192749023, + "rewards/margins": -0.7652846574783325, + "rewards/rejected": -12.896130561828613, + "step": 3725 + }, + { + "epoch": 0.1257204489534531, + "grad_norm": 19.057912826538086, + "learning_rate": 9.9798669498337e-07, + "logits/chosen": -0.4333661198616028, + "logits/rejected": -0.4165084958076477, + "logps/chosen": -1.9831759929656982, + "logps/rejected": -2.0759761333465576, + "loss": 2.2843, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.83176040649414, + "rewards/margins": 0.9280030131340027, + "rewards/rejected": -20.759763717651367, + "step": 3730 + }, + { + "epoch": 0.12588897502443627, + "grad_norm": 25.03388786315918, + "learning_rate": 9.979602398430715e-07, + "logits/chosen": -0.6505326628684998, + "logits/rejected": -0.5560685396194458, + "logps/chosen": -1.5252478122711182, + "logps/rejected": -1.583319067955017, + "loss": 2.7461, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.252477645874023, + "rewards/margins": 0.5807129144668579, + "rewards/rejected": -15.83319091796875, + "step": 3735 + }, + { + "epoch": 0.12605750109541947, + "grad_norm": 23.37642478942871, + "learning_rate": 9.97933612378353e-07, + "logits/chosen": -0.3261653780937195, + "logits/rejected": -0.32249951362609863, + "logps/chosen": -1.7047618627548218, + "logps/rejected": -1.8726396560668945, + "loss": 2.5351, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.04762077331543, + "rewards/margins": 1.6787769794464111, + "rewards/rejected": -18.726398468017578, + "step": 3740 + }, + { + "epoch": 0.12622602716640263, + "grad_norm": 21.604785919189453, + "learning_rate": 9.97906812598429e-07, + "logits/chosen": -0.5837305784225464, + "logits/rejected": -0.6095893979072571, + "logps/chosen": -1.9071247577667236, + "logps/rejected": -1.909767746925354, + "loss": 3.1111, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.071247100830078, + "rewards/margins": 0.026428794488310814, + "rewards/rejected": -19.09767723083496, + "step": 3745 + }, + { + "epoch": 0.12639455323738583, + "grad_norm": 31.987625122070312, + "learning_rate": 9.978798405125739e-07, + "logits/chosen": -0.37430623173713684, + "logits/rejected": -0.5127814412117004, + "logps/chosen": -1.7623825073242188, + "logps/rejected": -1.9457428455352783, + "loss": 2.3911, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.623825073242188, + "rewards/margins": 1.8336032629013062, + "rewards/rejected": -19.457427978515625, + "step": 3750 + }, + { + "epoch": 0.126563079308369, + "grad_norm": 22.882980346679688, + "learning_rate": 9.978526961301218e-07, + "logits/chosen": -0.28497645258903503, + "logits/rejected": -0.4219549298286438, + "logps/chosen": -1.6226459741592407, + "logps/rejected": -1.6232162714004517, + "loss": 3.2443, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.226459503173828, + "rewards/margins": 0.0057045938447117805, + "rewards/rejected": -16.23216438293457, + "step": 3755 + }, + { + "epoch": 0.12673160537935219, + "grad_norm": 21.146074295043945, + "learning_rate": 9.978253794604658e-07, + "logits/chosen": -0.5370095372200012, + "logits/rejected": -0.7515683174133301, + "logps/chosen": -1.5996499061584473, + "logps/rejected": -1.6215238571166992, + "loss": 2.9973, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.996500015258789, + "rewards/margins": 0.218739315867424, + "rewards/rejected": -16.21523666381836, + "step": 3760 + }, + { + "epoch": 0.12690013145033538, + "grad_norm": 21.315244674682617, + "learning_rate": 9.9779789051306e-07, + "logits/chosen": -0.44618409872055054, + "logits/rejected": -0.40568017959594727, + "logps/chosen": -1.8859344720840454, + "logps/rejected": -1.7330694198608398, + "loss": 4.5439, + "rewards/accuracies": 0.0, + "rewards/chosen": -18.859344482421875, + "rewards/margins": -1.5286482572555542, + "rewards/rejected": -17.3306941986084, + "step": 3765 + }, + { + "epoch": 0.12706865752131855, + "grad_norm": 98.00025939941406, + "learning_rate": 9.977702292974165e-07, + "logits/chosen": -0.5471475720405579, + "logits/rejected": -0.5172755122184753, + "logps/chosen": -1.957866907119751, + "logps/rejected": -1.828743577003479, + "loss": 4.3705, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.578670501708984, + "rewards/margins": -1.291232705116272, + "rewards/rejected": -18.28743553161621, + "step": 3770 + }, + { + "epoch": 0.12723718359230174, + "grad_norm": 22.819908142089844, + "learning_rate": 9.97742395823108e-07, + "logits/chosen": -0.5692895650863647, + "logits/rejected": -0.4863424301147461, + "logps/chosen": -1.811985731124878, + "logps/rejected": -1.6893869638442993, + "loss": 4.2602, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.119857788085938, + "rewards/margins": -1.2259877920150757, + "rewards/rejected": -16.893869400024414, + "step": 3775 + }, + { + "epoch": 0.1274057096632849, + "grad_norm": 28.10711097717285, + "learning_rate": 9.977143900997664e-07, + "logits/chosen": -0.550839900970459, + "logits/rejected": -0.6493675112724304, + "logps/chosen": -2.099008560180664, + "logps/rejected": -2.022428512573242, + "loss": 3.8717, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.99008560180664, + "rewards/margins": -0.7658005952835083, + "rewards/rejected": -20.224285125732422, + "step": 3780 + }, + { + "epoch": 0.1275742357342681, + "grad_norm": 57.06027603149414, + "learning_rate": 9.976862121370838e-07, + "logits/chosen": -0.37889954447746277, + "logits/rejected": -0.48400768637657166, + "logps/chosen": -1.8434131145477295, + "logps/rejected": -1.9132566452026367, + "loss": 2.4397, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.434131622314453, + "rewards/margins": 0.6984332203865051, + "rewards/rejected": -19.132564544677734, + "step": 3785 + }, + { + "epoch": 0.12774276180525126, + "grad_norm": 29.508501052856445, + "learning_rate": 9.976578619448112e-07, + "logits/chosen": -0.6355900764465332, + "logits/rejected": -0.6713854074478149, + "logps/chosen": -1.7098877429962158, + "logps/rejected": -1.7345527410507202, + "loss": 2.9836, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.098876953125, + "rewards/margins": 0.24665145576000214, + "rewards/rejected": -17.34552764892578, + "step": 3790 + }, + { + "epoch": 0.12791128787623446, + "grad_norm": 23.302797317504883, + "learning_rate": 9.976293395327596e-07, + "logits/chosen": -0.6434440016746521, + "logits/rejected": -0.4357782304286957, + "logps/chosen": -1.9776248931884766, + "logps/rejected": -2.018247604370117, + "loss": 3.053, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.776248931884766, + "rewards/margins": 0.40622633695602417, + "rewards/rejected": -20.18247413635254, + "step": 3795 + }, + { + "epoch": 0.12807981394721762, + "grad_norm": 76.14665985107422, + "learning_rate": 9.976006449107993e-07, + "logits/chosen": -0.30134084820747375, + "logits/rejected": -0.23310093581676483, + "logps/chosen": -2.2058663368225098, + "logps/rejected": -2.109875440597534, + "loss": 4.0036, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.05866241455078, + "rewards/margins": -0.9599083065986633, + "rewards/rejected": -21.098752975463867, + "step": 3800 + }, + { + "epoch": 0.12824834001820082, + "grad_norm": 23.990066528320312, + "learning_rate": 9.975717780888602e-07, + "logits/chosen": -0.4945443272590637, + "logits/rejected": -0.5459513068199158, + "logps/chosen": -1.6550929546356201, + "logps/rejected": -1.7307243347167969, + "loss": 2.461, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.55093002319336, + "rewards/margins": 0.7563133239746094, + "rewards/rejected": -17.307241439819336, + "step": 3805 + }, + { + "epoch": 0.12841686608918398, + "grad_norm": 23.518524169921875, + "learning_rate": 9.975427390769327e-07, + "logits/chosen": -0.7974092364311218, + "logits/rejected": -0.8891481161117554, + "logps/chosen": -1.5475536584854126, + "logps/rejected": -1.5537341833114624, + "loss": 3.0766, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.475537300109863, + "rewards/margins": 0.061804674565792084, + "rewards/rejected": -15.53734302520752, + "step": 3810 + }, + { + "epoch": 0.12858539216016718, + "grad_norm": 25.759145736694336, + "learning_rate": 9.975135278850652e-07, + "logits/chosen": -0.46052369475364685, + "logits/rejected": -0.4665374159812927, + "logps/chosen": -2.109952449798584, + "logps/rejected": -2.2452311515808105, + "loss": 2.0272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.099523544311523, + "rewards/margins": 1.352790355682373, + "rewards/rejected": -22.452314376831055, + "step": 3815 + }, + { + "epoch": 0.12875391823115037, + "grad_norm": 22.549362182617188, + "learning_rate": 9.974841445233673e-07, + "logits/chosen": -0.767123818397522, + "logits/rejected": -0.804367184638977, + "logps/chosen": -1.5822185277938843, + "logps/rejected": -1.6508678197860718, + "loss": 2.7948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.822186470031738, + "rewards/margins": 0.6864920854568481, + "rewards/rejected": -16.508676528930664, + "step": 3820 + }, + { + "epoch": 0.12892244430213354, + "grad_norm": 19.02248764038086, + "learning_rate": 9.97454589002007e-07, + "logits/chosen": -0.48755064606666565, + "logits/rejected": -0.7069557309150696, + "logps/chosen": -1.649071455001831, + "logps/rejected": -1.7350966930389404, + "loss": 2.5954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.49071502685547, + "rewards/margins": 0.8602536916732788, + "rewards/rejected": -17.350969314575195, + "step": 3825 + }, + { + "epoch": 0.12909097037311673, + "grad_norm": 17.452266693115234, + "learning_rate": 9.974248613312122e-07, + "logits/chosen": -0.5228408575057983, + "logits/rejected": -0.5327178239822388, + "logps/chosen": -1.8645496368408203, + "logps/rejected": -1.6770515441894531, + "loss": 4.9626, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.64549446105957, + "rewards/margins": -1.874979019165039, + "rewards/rejected": -16.77051544189453, + "step": 3830 + }, + { + "epoch": 0.1292594964440999, + "grad_norm": 20.079130172729492, + "learning_rate": 9.973949615212709e-07, + "logits/chosen": -0.11785700172185898, + "logits/rejected": -0.11308972537517548, + "logps/chosen": -2.2112364768981934, + "logps/rejected": -2.1767048835754395, + "loss": 3.9099, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.112361907958984, + "rewards/margins": -0.34531623125076294, + "rewards/rejected": -21.767047882080078, + "step": 3835 + }, + { + "epoch": 0.1294280225150831, + "grad_norm": 29.872507095336914, + "learning_rate": 9.973648895825297e-07, + "logits/chosen": -0.5825292468070984, + "logits/rejected": -0.6343661546707153, + "logps/chosen": -1.552022933959961, + "logps/rejected": -1.5541592836380005, + "loss": 3.1794, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.520230293273926, + "rewards/margins": 0.021363258361816406, + "rewards/rejected": -15.541592597961426, + "step": 3840 + }, + { + "epoch": 0.12959654858606626, + "grad_norm": 28.075502395629883, + "learning_rate": 9.973346455253959e-07, + "logits/chosen": -0.5383496284484863, + "logits/rejected": -0.4658065736293793, + "logps/chosen": -1.51207435131073, + "logps/rejected": -1.427404522895813, + "loss": 3.9488, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.120744705200195, + "rewards/margins": -0.8466998934745789, + "rewards/rejected": -14.27404499053955, + "step": 3845 + }, + { + "epoch": 0.12976507465704945, + "grad_norm": 33.11229705810547, + "learning_rate": 9.973042293603354e-07, + "logits/chosen": -0.24757274985313416, + "logits/rejected": -0.4024467468261719, + "logps/chosen": -1.6025587320327759, + "logps/rejected": -1.7867257595062256, + "loss": 2.643, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.025588989257812, + "rewards/margins": 1.8416721820831299, + "rewards/rejected": -17.867259979248047, + "step": 3850 + }, + { + "epoch": 0.12993360072803262, + "grad_norm": 13.463517189025879, + "learning_rate": 9.97273641097874e-07, + "logits/chosen": -0.21165132522583008, + "logits/rejected": -0.36822745203971863, + "logps/chosen": -1.5910065174102783, + "logps/rejected": -1.629421591758728, + "loss": 2.9384, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.910066604614258, + "rewards/margins": 0.38414907455444336, + "rewards/rejected": -16.29421615600586, + "step": 3855 + }, + { + "epoch": 0.1301021267990158, + "grad_norm": 17.587718963623047, + "learning_rate": 9.972428807485972e-07, + "logits/chosen": -0.4675527513027191, + "logits/rejected": -0.412848562002182, + "logps/chosen": -1.6668990850448608, + "logps/rejected": -1.8437143564224243, + "loss": 1.9073, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.668991088867188, + "rewards/margins": 1.7681528329849243, + "rewards/rejected": -18.437143325805664, + "step": 3860 + }, + { + "epoch": 0.13027065286999898, + "grad_norm": 23.6627197265625, + "learning_rate": 9.972119483231502e-07, + "logits/chosen": -0.6713553667068481, + "logits/rejected": -0.6638098955154419, + "logps/chosen": -1.6672265529632568, + "logps/rejected": -1.875038504600525, + "loss": 2.3677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.67226791381836, + "rewards/margins": 2.0781185626983643, + "rewards/rejected": -18.750385284423828, + "step": 3865 + }, + { + "epoch": 0.13043917894098217, + "grad_norm": 27.069753646850586, + "learning_rate": 9.97180843832237e-07, + "logits/chosen": -0.6440941095352173, + "logits/rejected": -0.5108510851860046, + "logps/chosen": -1.7878713607788086, + "logps/rejected": -1.7104899883270264, + "loss": 3.883, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.878713607788086, + "rewards/margins": -0.773812472820282, + "rewards/rejected": -17.104900360107422, + "step": 3870 + }, + { + "epoch": 0.13060770501196536, + "grad_norm": 19.128957748413086, + "learning_rate": 9.97149567286622e-07, + "logits/chosen": -0.5943403840065002, + "logits/rejected": -0.6030054688453674, + "logps/chosen": -1.5599461793899536, + "logps/rejected": -1.617856740951538, + "loss": 2.6472, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.599462509155273, + "rewards/margins": 0.5791065096855164, + "rewards/rejected": -16.178569793701172, + "step": 3875 + }, + { + "epoch": 0.13077623108294853, + "grad_norm": 20.993228912353516, + "learning_rate": 9.97118118697129e-07, + "logits/chosen": -0.6433027982711792, + "logits/rejected": -0.5695537328720093, + "logps/chosen": -1.9984300136566162, + "logps/rejected": -2.169884204864502, + "loss": 2.2335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.98430061340332, + "rewards/margins": 1.7145410776138306, + "rewards/rejected": -21.698841094970703, + "step": 3880 + }, + { + "epoch": 0.13094475715393172, + "grad_norm": 23.70119285583496, + "learning_rate": 9.970864980746402e-07, + "logits/chosen": -0.7846357226371765, + "logits/rejected": -0.6926722526550293, + "logps/chosen": -1.5518462657928467, + "logps/rejected": -1.6256211996078491, + "loss": 2.7388, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.518463134765625, + "rewards/margins": 0.7377495765686035, + "rewards/rejected": -16.25621223449707, + "step": 3885 + }, + { + "epoch": 0.1311132832249149, + "grad_norm": 20.39211654663086, + "learning_rate": 9.970547054300993e-07, + "logits/chosen": -0.6103144884109497, + "logits/rejected": -0.4634561538696289, + "logps/chosen": -1.693368911743164, + "logps/rejected": -1.6804195642471313, + "loss": 3.4093, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.93368911743164, + "rewards/margins": -0.12949447333812714, + "rewards/rejected": -16.8041934967041, + "step": 3890 + }, + { + "epoch": 0.13128180929589808, + "grad_norm": 23.615053176879883, + "learning_rate": 9.970227407745077e-07, + "logits/chosen": -0.7008322477340698, + "logits/rejected": -0.63578861951828, + "logps/chosen": -1.8769195079803467, + "logps/rejected": -1.9181302785873413, + "loss": 3.0259, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.769195556640625, + "rewards/margins": 0.41210755705833435, + "rewards/rejected": -19.181303024291992, + "step": 3895 + }, + { + "epoch": 0.13145033536688125, + "grad_norm": 20.815834045410156, + "learning_rate": 9.969906041189276e-07, + "logits/chosen": -0.6005369424819946, + "logits/rejected": -0.4755684733390808, + "logps/chosen": -1.6404476165771484, + "logps/rejected": -1.680262804031372, + "loss": 2.9446, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.404476165771484, + "rewards/margins": 0.3981497883796692, + "rewards/rejected": -16.80262565612793, + "step": 3900 + }, + { + "epoch": 0.13161886143786444, + "grad_norm": 23.32740592956543, + "learning_rate": 9.969582954744799e-07, + "logits/chosen": -0.4026781916618347, + "logits/rejected": -0.37207791209220886, + "logps/chosen": -1.8272647857666016, + "logps/rejected": -1.9074573516845703, + "loss": 2.4612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.27264976501465, + "rewards/margins": 0.8019250631332397, + "rewards/rejected": -19.074573516845703, + "step": 3905 + }, + { + "epoch": 0.1317873875088476, + "grad_norm": 37.29978561401367, + "learning_rate": 9.96925814852346e-07, + "logits/chosen": -0.4581456184387207, + "logits/rejected": -0.5759280920028687, + "logps/chosen": -1.7276592254638672, + "logps/rejected": -1.7996619939804077, + "loss": 2.7312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.27659034729004, + "rewards/margins": 0.720028817653656, + "rewards/rejected": -17.996620178222656, + "step": 3910 + }, + { + "epoch": 0.1319559135798308, + "grad_norm": 29.896190643310547, + "learning_rate": 9.968931622637651e-07, + "logits/chosen": -0.23583588004112244, + "logits/rejected": -0.2508379817008972, + "logps/chosen": -2.083098888397217, + "logps/rejected": -2.0896639823913574, + "loss": 3.1457, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.830989837646484, + "rewards/margins": 0.06565189361572266, + "rewards/rejected": -20.89664077758789, + "step": 3915 + }, + { + "epoch": 0.13212443965081397, + "grad_norm": 18.37214469909668, + "learning_rate": 9.968603377200377e-07, + "logits/chosen": -0.54749995470047, + "logits/rejected": -0.3493199348449707, + "logps/chosen": -1.6087677478790283, + "logps/rejected": -1.9329464435577393, + "loss": 1.8455, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.087677001953125, + "rewards/margins": 3.241788387298584, + "rewards/rejected": -19.329465866088867, + "step": 3920 + }, + { + "epoch": 0.13229296572179716, + "grad_norm": 25.379499435424805, + "learning_rate": 9.96827341232523e-07, + "logits/chosen": -0.473417192697525, + "logits/rejected": -0.4509311616420746, + "logps/chosen": -1.8252029418945312, + "logps/rejected": -1.9195716381072998, + "loss": 2.9775, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.252029418945312, + "rewards/margins": 0.9436872601509094, + "rewards/rejected": -19.195714950561523, + "step": 3925 + }, + { + "epoch": 0.13246149179278036, + "grad_norm": 23.761089324951172, + "learning_rate": 9.967941728126398e-07, + "logits/chosen": -0.5376905202865601, + "logits/rejected": -0.43829545378685, + "logps/chosen": -1.7135887145996094, + "logps/rejected": -1.6739921569824219, + "loss": 3.5778, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.135887145996094, + "rewards/margins": -0.39596787095069885, + "rewards/rejected": -16.739919662475586, + "step": 3930 + }, + { + "epoch": 0.13263001786376352, + "grad_norm": 38.03128433227539, + "learning_rate": 9.967608324718661e-07, + "logits/chosen": -0.5036236047744751, + "logits/rejected": -0.6714465618133545, + "logps/chosen": -1.942854642868042, + "logps/rejected": -2.249541759490967, + "loss": 2.6183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.428546905517578, + "rewards/margins": 3.0668723583221436, + "rewards/rejected": -22.495418548583984, + "step": 3935 + }, + { + "epoch": 0.13279854393474672, + "grad_norm": 15.746185302734375, + "learning_rate": 9.9672732022174e-07, + "logits/chosen": -0.12133710086345673, + "logits/rejected": -0.2733135521411896, + "logps/chosen": -1.9632943868637085, + "logps/rejected": -2.0086188316345215, + "loss": 3.6416, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.632946014404297, + "rewards/margins": 0.45324498414993286, + "rewards/rejected": -20.08618927001953, + "step": 3940 + }, + { + "epoch": 0.13296707000572988, + "grad_norm": 26.305767059326172, + "learning_rate": 9.966936360738586e-07, + "logits/chosen": -0.7777701616287231, + "logits/rejected": -0.9092004895210266, + "logps/chosen": -1.8936526775360107, + "logps/rejected": -1.915820837020874, + "loss": 3.3031, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.936527252197266, + "rewards/margins": 0.2216823548078537, + "rewards/rejected": -19.1582088470459, + "step": 3945 + }, + { + "epoch": 0.13313559607671308, + "grad_norm": 21.544748306274414, + "learning_rate": 9.966597800398789e-07, + "logits/chosen": -0.5004564523696899, + "logits/rejected": -0.49012812972068787, + "logps/chosen": -1.747496247291565, + "logps/rejected": -1.7987169027328491, + "loss": 2.7442, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.474964141845703, + "rewards/margins": 0.5122075080871582, + "rewards/rejected": -17.987171173095703, + "step": 3950 + }, + { + "epoch": 0.13330412214769624, + "grad_norm": 17.286357879638672, + "learning_rate": 9.966257521315166e-07, + "logits/chosen": -0.20877251029014587, + "logits/rejected": -0.23092889785766602, + "logps/chosen": -2.1308019161224365, + "logps/rejected": -2.1815826892852783, + "loss": 2.6274, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.308019638061523, + "rewards/margins": 0.5078089833259583, + "rewards/rejected": -21.815828323364258, + "step": 3955 + }, + { + "epoch": 0.13347264821867943, + "grad_norm": 18.512609481811523, + "learning_rate": 9.965915523605482e-07, + "logits/chosen": -0.5298896431922913, + "logits/rejected": -0.37859925627708435, + "logps/chosen": -1.8345773220062256, + "logps/rejected": -1.9792572259902954, + "loss": 2.3234, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.34577178955078, + "rewards/margins": 1.446800947189331, + "rewards/rejected": -19.792572021484375, + "step": 3960 + }, + { + "epoch": 0.1336411742896626, + "grad_norm": 19.06651496887207, + "learning_rate": 9.965571807388082e-07, + "logits/chosen": -0.41732341051101685, + "logits/rejected": -0.3108731508255005, + "logps/chosen": -1.5117712020874023, + "logps/rejected": -1.74689519405365, + "loss": 2.5905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.117711067199707, + "rewards/margins": 2.3512415885925293, + "rewards/rejected": -17.468952178955078, + "step": 3965 + }, + { + "epoch": 0.1338097003606458, + "grad_norm": 21.32099723815918, + "learning_rate": 9.965226372781914e-07, + "logits/chosen": -0.7179542779922485, + "logits/rejected": -0.7836523056030273, + "logps/chosen": -1.6507011651992798, + "logps/rejected": -1.714941382408142, + "loss": 2.5683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.50701332092285, + "rewards/margins": 0.6424006223678589, + "rewards/rejected": -17.1494140625, + "step": 3970 + }, + { + "epoch": 0.13397822643162896, + "grad_norm": 15.45632266998291, + "learning_rate": 9.96487921990652e-07, + "logits/chosen": -0.6785871386528015, + "logits/rejected": -0.824084460735321, + "logps/chosen": -1.446620225906372, + "logps/rejected": -1.482995629310608, + "loss": 2.9628, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.466203689575195, + "rewards/margins": 0.36375388503074646, + "rewards/rejected": -14.8299560546875, + "step": 3975 + }, + { + "epoch": 0.13414675250261215, + "grad_norm": 18.907337188720703, + "learning_rate": 9.96453034888204e-07, + "logits/chosen": -0.6268816590309143, + "logits/rejected": -0.5451667904853821, + "logps/chosen": -1.8741906881332397, + "logps/rejected": -1.7744128704071045, + "loss": 4.0882, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.741907119750977, + "rewards/margins": -0.9977784156799316, + "rewards/rejected": -17.744129180908203, + "step": 3980 + }, + { + "epoch": 0.13431527857359535, + "grad_norm": 29.47572135925293, + "learning_rate": 9.964179759829199e-07, + "logits/chosen": -0.13526254892349243, + "logits/rejected": -0.24333901703357697, + "logps/chosen": -1.850494146347046, + "logps/rejected": -1.8943946361541748, + "loss": 3.1622, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.504940032958984, + "rewards/margins": 0.4390055537223816, + "rewards/rejected": -18.943946838378906, + "step": 3985 + }, + { + "epoch": 0.13448380464457851, + "grad_norm": 45.685768127441406, + "learning_rate": 9.963827452869325e-07, + "logits/chosen": -0.32695698738098145, + "logits/rejected": -0.3549268841743469, + "logps/chosen": -1.690030813217163, + "logps/rejected": -1.7497482299804688, + "loss": 3.099, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.90030860900879, + "rewards/margins": 0.5971736907958984, + "rewards/rejected": -17.497482299804688, + "step": 3990 + }, + { + "epoch": 0.1346523307155617, + "grad_norm": 38.33502960205078, + "learning_rate": 9.963473428124334e-07, + "logits/chosen": -0.4510404169559479, + "logits/rejected": -0.5716635584831238, + "logps/chosen": -1.9731696844100952, + "logps/rejected": -1.9337685108184814, + "loss": 3.5215, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.731698989868164, + "rewards/margins": -0.39401358366012573, + "rewards/rejected": -19.337682723999023, + "step": 3995 + }, + { + "epoch": 0.13482085678654487, + "grad_norm": 18.996416091918945, + "learning_rate": 9.963117685716744e-07, + "logits/chosen": -0.7336264848709106, + "logits/rejected": -0.6646067500114441, + "logps/chosen": -1.6362497806549072, + "logps/rejected": -1.7025432586669922, + "loss": 2.7504, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.362497329711914, + "rewards/margins": 0.6629348993301392, + "rewards/rejected": -17.025432586669922, + "step": 4000 + }, + { + "epoch": 0.13482085678654487, + "eval_logits/chosen": -0.7574084997177124, + "eval_logits/rejected": -0.7684286236763, + "eval_logps/chosen": -1.6912825107574463, + "eval_logps/rejected": -1.7017545700073242, + "eval_loss": 3.3711276054382324, + "eval_rewards/accuracies": 0.47999998927116394, + "eval_rewards/chosen": -16.912826538085938, + "eval_rewards/margins": 0.10472100228071213, + "eval_rewards/rejected": -17.017545700073242, + "eval_runtime": 12.8943, + "eval_samples_per_second": 7.755, + "eval_steps_per_second": 1.939, + "step": 4000 + }, + { + "epoch": 0.13498938285752807, + "grad_norm": 44.0849723815918, + "learning_rate": 9.962760225769664e-07, + "logits/chosen": -0.2051403522491455, + "logits/rejected": -0.19655892252922058, + "logps/chosen": -2.0785844326019287, + "logps/rejected": -2.2632524967193604, + "loss": 1.9564, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.785846710205078, + "rewards/margins": 1.8466781377792358, + "rewards/rejected": -22.632522583007812, + "step": 4005 + }, + { + "epoch": 0.13515790892851123, + "grad_norm": 29.675466537475586, + "learning_rate": 9.962401048406792e-07, + "logits/chosen": -0.1515626162290573, + "logits/rejected": -0.11530622094869614, + "logps/chosen": -2.379957675933838, + "logps/rejected": -2.8324062824249268, + "loss": 3.4024, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.799577713012695, + "rewards/margins": 4.524485111236572, + "rewards/rejected": -28.32406234741211, + "step": 4010 + }, + { + "epoch": 0.13532643499949443, + "grad_norm": 13.641871452331543, + "learning_rate": 9.96204015375243e-07, + "logits/chosen": -0.8857007026672363, + "logits/rejected": -0.8562878370285034, + "logps/chosen": -1.5614138841629028, + "logps/rejected": -1.7495664358139038, + "loss": 1.9699, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.614137649536133, + "rewards/margins": 1.881527304649353, + "rewards/rejected": -17.495664596557617, + "step": 4015 + }, + { + "epoch": 0.1354949610704776, + "grad_norm": 22.9551944732666, + "learning_rate": 9.961677541931466e-07, + "logits/chosen": -0.5161920785903931, + "logits/rejected": -0.43604689836502075, + "logps/chosen": -1.6171886920928955, + "logps/rejected": -1.686789870262146, + "loss": 2.6197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.171884536743164, + "rewards/margins": 0.6960126757621765, + "rewards/rejected": -16.86789894104004, + "step": 4020 + }, + { + "epoch": 0.1356634871414608, + "grad_norm": 21.6877384185791, + "learning_rate": 9.961313213069386e-07, + "logits/chosen": -0.6385021209716797, + "logits/rejected": -0.441770076751709, + "logps/chosen": -1.7623170614242554, + "logps/rejected": -1.9857242107391357, + "loss": 1.9666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.623170852661133, + "rewards/margins": 2.2340712547302246, + "rewards/rejected": -19.857242584228516, + "step": 4025 + }, + { + "epoch": 0.13583201321244395, + "grad_norm": 33.36929702758789, + "learning_rate": 9.960947167292274e-07, + "logits/chosen": -0.7597072720527649, + "logits/rejected": -0.5987949371337891, + "logps/chosen": -1.963201880455017, + "logps/rejected": -2.146829605102539, + "loss": 2.8322, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.632022857666016, + "rewards/margins": 1.8362756967544556, + "rewards/rejected": -21.46829605102539, + "step": 4030 + }, + { + "epoch": 0.13600053928342715, + "grad_norm": 22.11895751953125, + "learning_rate": 9.960579404726797e-07, + "logits/chosen": -0.36422428488731384, + "logits/rejected": -0.222096249461174, + "logps/chosen": -1.9910913705825806, + "logps/rejected": -2.0189363956451416, + "loss": 3.3419, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.910913467407227, + "rewards/margins": 0.2784494459629059, + "rewards/rejected": -20.189363479614258, + "step": 4035 + }, + { + "epoch": 0.13616906535441034, + "grad_norm": 31.01476287841797, + "learning_rate": 9.96020992550023e-07, + "logits/chosen": -0.3307796120643616, + "logits/rejected": -0.5669467449188232, + "logps/chosen": -1.5970932245254517, + "logps/rejected": -1.6248077154159546, + "loss": 2.8625, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.970930099487305, + "rewards/margins": 0.27714595198631287, + "rewards/rejected": -16.248077392578125, + "step": 4040 + }, + { + "epoch": 0.1363375914253935, + "grad_norm": 24.931251525878906, + "learning_rate": 9.95983872974043e-07, + "logits/chosen": -0.6774837970733643, + "logits/rejected": -0.595493495464325, + "logps/chosen": -1.5050591230392456, + "logps/rejected": -1.4872527122497559, + "loss": 3.2411, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.050592422485352, + "rewards/margins": -0.17806454002857208, + "rewards/rejected": -14.872526168823242, + "step": 4045 + }, + { + "epoch": 0.1365061174963767, + "grad_norm": 33.974796295166016, + "learning_rate": 9.959465817575858e-07, + "logits/chosen": -0.5587460398674011, + "logits/rejected": -0.32978394627571106, + "logps/chosen": -1.6928952932357788, + "logps/rejected": -1.8271658420562744, + "loss": 2.4119, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.928955078125, + "rewards/margins": 1.3427047729492188, + "rewards/rejected": -18.271657943725586, + "step": 4050 + }, + { + "epoch": 0.13667464356735987, + "grad_norm": 16.821455001831055, + "learning_rate": 9.95909118913556e-07, + "logits/chosen": -0.8507031202316284, + "logits/rejected": -0.7523888349533081, + "logps/chosen": -1.645453691482544, + "logps/rejected": -1.8627036809921265, + "loss": 1.3962, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.45453453063965, + "rewards/margins": 2.1725001335144043, + "rewards/rejected": -18.62703514099121, + "step": 4055 + }, + { + "epoch": 0.13684316963834306, + "grad_norm": 23.652524948120117, + "learning_rate": 9.958714844549183e-07, + "logits/chosen": -0.6848211288452148, + "logits/rejected": -0.7481715083122253, + "logps/chosen": -1.671247124671936, + "logps/rejected": -1.6934289932250977, + "loss": 3.0554, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.712472915649414, + "rewards/margins": 0.22181835770606995, + "rewards/rejected": -16.934289932250977, + "step": 4060 + }, + { + "epoch": 0.13701169570932623, + "grad_norm": 8.807697296142578, + "learning_rate": 9.958336783946964e-07, + "logits/chosen": -0.10141198337078094, + "logits/rejected": -0.18145883083343506, + "logps/chosen": -1.7855380773544312, + "logps/rejected": -1.9229755401611328, + "loss": 2.2009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.85538101196289, + "rewards/margins": 1.3743743896484375, + "rewards/rejected": -19.229755401611328, + "step": 4065 + }, + { + "epoch": 0.13718022178030942, + "grad_norm": 10.74984359741211, + "learning_rate": 9.957957007459734e-07, + "logits/chosen": -0.5784136056900024, + "logits/rejected": -0.5899661779403687, + "logps/chosen": -1.6685832738876343, + "logps/rejected": -1.7646796703338623, + "loss": 2.4155, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.685832977294922, + "rewards/margins": 0.9609651565551758, + "rewards/rejected": -17.64679527282715, + "step": 4070 + }, + { + "epoch": 0.13734874785129259, + "grad_norm": 15.270707130432129, + "learning_rate": 9.957575515218919e-07, + "logits/chosen": -0.7281454205513, + "logits/rejected": -0.6709175109863281, + "logps/chosen": -1.7470428943634033, + "logps/rejected": -1.9406992197036743, + "loss": 3.2565, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.470428466796875, + "rewards/margins": 1.9365642070770264, + "rewards/rejected": -19.406991958618164, + "step": 4075 + }, + { + "epoch": 0.13751727392227578, + "grad_norm": 39.64847183227539, + "learning_rate": 9.95719230735654e-07, + "logits/chosen": -0.1662284880876541, + "logits/rejected": -0.023158108815550804, + "logps/chosen": -1.5711307525634766, + "logps/rejected": -1.640824556350708, + "loss": 2.6556, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.71130657196045, + "rewards/margins": 0.6969379186630249, + "rewards/rejected": -16.408245086669922, + "step": 4080 + }, + { + "epoch": 0.13768579999325894, + "grad_norm": 55.75159454345703, + "learning_rate": 9.956807384005209e-07, + "logits/chosen": -0.5080665946006775, + "logits/rejected": -0.5409480333328247, + "logps/chosen": -1.77492356300354, + "logps/rejected": -1.7267024517059326, + "loss": 3.8746, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.749237060546875, + "rewards/margins": -0.4822106957435608, + "rewards/rejected": -17.267024993896484, + "step": 4085 + }, + { + "epoch": 0.13785432606424214, + "grad_norm": 44.708839416503906, + "learning_rate": 9.956420745298132e-07, + "logits/chosen": -0.5287455320358276, + "logits/rejected": -0.45763248205184937, + "logps/chosen": -1.798195242881775, + "logps/rejected": -1.77925705909729, + "loss": 3.2948, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.981952667236328, + "rewards/margins": -0.18938274681568146, + "rewards/rejected": -17.792570114135742, + "step": 4090 + }, + { + "epoch": 0.13802285213522533, + "grad_norm": 25.557100296020508, + "learning_rate": 9.956032391369109e-07, + "logits/chosen": -0.2953604757785797, + "logits/rejected": -0.36451655626296997, + "logps/chosen": -2.143887758255005, + "logps/rejected": -1.942565679550171, + "loss": 5.0524, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.43887710571289, + "rewards/margins": -2.0132193565368652, + "rewards/rejected": -19.425655364990234, + "step": 4095 + }, + { + "epoch": 0.1381913782062085, + "grad_norm": 26.615631103515625, + "learning_rate": 9.955642322352538e-07, + "logits/chosen": -0.5584558844566345, + "logits/rejected": -0.5644673705101013, + "logps/chosen": -1.7165279388427734, + "logps/rejected": -1.680544137954712, + "loss": 3.7856, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.165279388427734, + "rewards/margins": -0.35983896255493164, + "rewards/rejected": -16.80544090270996, + "step": 4100 + }, + { + "epoch": 0.1383599042771917, + "grad_norm": 17.09918975830078, + "learning_rate": 9.955250538383402e-07, + "logits/chosen": -0.4320560097694397, + "logits/rejected": -0.5020492076873779, + "logps/chosen": -1.728809118270874, + "logps/rejected": -1.7111473083496094, + "loss": 3.5286, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.2880916595459, + "rewards/margins": -0.1766192466020584, + "rewards/rejected": -17.111474990844727, + "step": 4105 + }, + { + "epoch": 0.13852843034817486, + "grad_norm": 32.29576110839844, + "learning_rate": 9.954857039597285e-07, + "logits/chosen": -0.5975824594497681, + "logits/rejected": -0.6915780305862427, + "logps/chosen": -1.9237430095672607, + "logps/rejected": -1.9363324642181396, + "loss": 3.0509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.237430572509766, + "rewards/margins": 0.12589678168296814, + "rewards/rejected": -19.363325119018555, + "step": 4110 + }, + { + "epoch": 0.13869695641915805, + "grad_norm": 15.100337028503418, + "learning_rate": 9.95446182613036e-07, + "logits/chosen": -0.8075372576713562, + "logits/rejected": -0.6522815227508545, + "logps/chosen": -1.5407874584197998, + "logps/rejected": -1.5490734577178955, + "loss": 3.0917, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.407875061035156, + "rewards/margins": 0.08286003768444061, + "rewards/rejected": -15.490735054016113, + "step": 4115 + }, + { + "epoch": 0.13886548249014122, + "grad_norm": 12.08738899230957, + "learning_rate": 9.954064898119393e-07, + "logits/chosen": -0.5792916417121887, + "logits/rejected": -0.7377229332923889, + "logps/chosen": -1.6446812152862549, + "logps/rejected": -1.7030906677246094, + "loss": 2.9207, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.44681167602539, + "rewards/margins": 0.5840937495231628, + "rewards/rejected": -17.03090476989746, + "step": 4120 + }, + { + "epoch": 0.1390340085611244, + "grad_norm": 20.199708938598633, + "learning_rate": 9.953666255701747e-07, + "logits/chosen": -0.4421865940093994, + "logits/rejected": -0.34469595551490784, + "logps/chosen": -2.231015682220459, + "logps/rejected": -1.954992651939392, + "loss": 7.4907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.31015968322754, + "rewards/margins": -2.7602322101593018, + "rewards/rejected": -19.5499267578125, + "step": 4125 + }, + { + "epoch": 0.13920253463210758, + "grad_norm": 24.32143211364746, + "learning_rate": 9.953265899015378e-07, + "logits/chosen": -0.45347872376441956, + "logits/rejected": -0.5372802019119263, + "logps/chosen": -2.0228564739227295, + "logps/rejected": -2.0596730709075928, + "loss": 3.7307, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.228565216064453, + "rewards/margins": 0.36816471815109253, + "rewards/rejected": -20.596731185913086, + "step": 4130 + }, + { + "epoch": 0.13937106070309077, + "grad_norm": 18.27474594116211, + "learning_rate": 9.952863828198832e-07, + "logits/chosen": -0.5868837833404541, + "logits/rejected": -0.8388587236404419, + "logps/chosen": -1.9159198999404907, + "logps/rejected": -1.8486074209213257, + "loss": 3.7772, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.159198760986328, + "rewards/margins": -0.6731247901916504, + "rewards/rejected": -18.486074447631836, + "step": 4135 + }, + { + "epoch": 0.13953958677407394, + "grad_norm": 29.658910751342773, + "learning_rate": 9.952460043391251e-07, + "logits/chosen": -0.4052742123603821, + "logits/rejected": -0.30263233184814453, + "logps/chosen": -1.8732227087020874, + "logps/rejected": -1.9779338836669922, + "loss": 2.7132, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.732229232788086, + "rewards/margins": 1.0471103191375732, + "rewards/rejected": -19.779338836669922, + "step": 4140 + }, + { + "epoch": 0.13970811284505713, + "grad_norm": 18.792110443115234, + "learning_rate": 9.952054544732366e-07, + "logits/chosen": -0.7875449657440186, + "logits/rejected": -0.8418458700180054, + "logps/chosen": -1.5930900573730469, + "logps/rejected": -1.4902503490447998, + "loss": 4.2111, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.930900573730469, + "rewards/margins": -1.0283968448638916, + "rewards/rejected": -14.902502059936523, + "step": 4145 + }, + { + "epoch": 0.13987663891604032, + "grad_norm": 15.064722061157227, + "learning_rate": 9.95164733236251e-07, + "logits/chosen": -0.45020851492881775, + "logits/rejected": -0.5293843746185303, + "logps/chosen": -1.3041831254959106, + "logps/rejected": -1.4093153476715088, + "loss": 2.4878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.041831970214844, + "rewards/margins": 1.0513209104537964, + "rewards/rejected": -14.093152046203613, + "step": 4150 + }, + { + "epoch": 0.1400451649870235, + "grad_norm": 21.02431869506836, + "learning_rate": 9.951238406422594e-07, + "logits/chosen": -0.4772763252258301, + "logits/rejected": -0.358822226524353, + "logps/chosen": -1.6318118572235107, + "logps/rejected": -1.6670395135879517, + "loss": 2.9819, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.318119049072266, + "rewards/margins": 0.35227519273757935, + "rewards/rejected": -16.670392990112305, + "step": 4155 + }, + { + "epoch": 0.14021369105800668, + "grad_norm": 12.805978775024414, + "learning_rate": 9.950827767054141e-07, + "logits/chosen": -0.5042682886123657, + "logits/rejected": -0.36263027787208557, + "logps/chosen": -1.785143494606018, + "logps/rejected": -1.8810393810272217, + "loss": 2.3962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.851436614990234, + "rewards/margins": 0.9589592814445496, + "rewards/rejected": -18.810396194458008, + "step": 4160 + }, + { + "epoch": 0.14038221712898985, + "grad_norm": 24.619632720947266, + "learning_rate": 9.950415414399252e-07, + "logits/chosen": 0.04073786735534668, + "logits/rejected": -0.05679405480623245, + "logps/chosen": -2.54649019241333, + "logps/rejected": -3.144075632095337, + "loss": 3.1133, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.464900970458984, + "rewards/margins": 5.975854873657227, + "rewards/rejected": -31.44075584411621, + "step": 4165 + }, + { + "epoch": 0.14055074319997304, + "grad_norm": 15.252907752990723, + "learning_rate": 9.950001348600625e-07, + "logits/chosen": -0.5035009384155273, + "logits/rejected": -0.5782333016395569, + "logps/chosen": -1.7510089874267578, + "logps/rejected": -1.777282953262329, + "loss": 2.9862, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.510089874267578, + "rewards/margins": 0.2627398371696472, + "rewards/rejected": -17.772830963134766, + "step": 4170 + }, + { + "epoch": 0.1407192692709562, + "grad_norm": 19.329147338867188, + "learning_rate": 9.949585569801554e-07, + "logits/chosen": -0.24519672989845276, + "logits/rejected": -0.30350321531295776, + "logps/chosen": -1.5690886974334717, + "logps/rejected": -1.8500550985336304, + "loss": 2.8123, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.690889358520508, + "rewards/margins": 2.809664249420166, + "rewards/rejected": -18.500551223754883, + "step": 4175 + }, + { + "epoch": 0.1408877953419394, + "grad_norm": 18.852506637573242, + "learning_rate": 9.949168078145925e-07, + "logits/chosen": -0.7113882899284363, + "logits/rejected": -0.7007073163986206, + "logps/chosen": -1.8825536966323853, + "logps/rejected": -1.8443183898925781, + "loss": 3.4646, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.82553482055664, + "rewards/margins": -0.38235053420066833, + "rewards/rejected": -18.44318389892578, + "step": 4180 + }, + { + "epoch": 0.14105632141292257, + "grad_norm": 22.75662612915039, + "learning_rate": 9.948748873778212e-07, + "logits/chosen": -0.22483432292938232, + "logits/rejected": -0.10999743640422821, + "logps/chosen": -1.646648645401001, + "logps/rejected": -1.7723115682601929, + "loss": 3.2349, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.46648597717285, + "rewards/margins": 1.2566298246383667, + "rewards/rejected": -17.723115921020508, + "step": 4185 + }, + { + "epoch": 0.14122484748390576, + "grad_norm": 25.911178588867188, + "learning_rate": 9.948327956843487e-07, + "logits/chosen": -0.24501433968544006, + "logits/rejected": -0.2834986746311188, + "logps/chosen": -1.7771613597869873, + "logps/rejected": -1.839350700378418, + "loss": 2.9625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.7716121673584, + "rewards/margins": 0.6218937635421753, + "rewards/rejected": -18.39350700378418, + "step": 4190 + }, + { + "epoch": 0.14139337355488893, + "grad_norm": 66.31412506103516, + "learning_rate": 9.94790532748741e-07, + "logits/chosen": -0.2734326720237732, + "logits/rejected": -0.2607944905757904, + "logps/chosen": -2.068535566329956, + "logps/rejected": -2.099501848220825, + "loss": 2.8103, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.685354232788086, + "rewards/margins": 0.3096626400947571, + "rewards/rejected": -20.995018005371094, + "step": 4195 + }, + { + "epoch": 0.14156189962587212, + "grad_norm": 33.704647064208984, + "learning_rate": 9.947480985856241e-07, + "logits/chosen": -0.107094407081604, + "logits/rejected": -0.1890457272529602, + "logps/chosen": -2.6533761024475098, + "logps/rejected": -2.707219123840332, + "loss": 2.9024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.533761978149414, + "rewards/margins": 0.5384283065795898, + "rewards/rejected": -27.072193145751953, + "step": 4200 + }, + { + "epoch": 0.14173042569685532, + "grad_norm": 26.672903060913086, + "learning_rate": 9.947054932096827e-07, + "logits/chosen": -0.48258861899375916, + "logits/rejected": -0.7191343903541565, + "logps/chosen": -1.6313819885253906, + "logps/rejected": -1.6558310985565186, + "loss": 2.8408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.31382179260254, + "rewards/margins": 0.24449129402637482, + "rewards/rejected": -16.558313369750977, + "step": 4205 + }, + { + "epoch": 0.14189895176783848, + "grad_norm": 52.22471618652344, + "learning_rate": 9.946627166356608e-07, + "logits/chosen": -0.7462440729141235, + "logits/rejected": -0.7692978978157043, + "logps/chosen": -1.5429750680923462, + "logps/rejected": -1.4933751821517944, + "loss": 3.5855, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.429750442504883, + "rewards/margins": -0.4959982931613922, + "rewards/rejected": -14.933751106262207, + "step": 4210 + }, + { + "epoch": 0.14206747783882168, + "grad_norm": 16.94681167602539, + "learning_rate": 9.946197688783612e-07, + "logits/chosen": -0.8591279983520508, + "logits/rejected": -0.7224575877189636, + "logps/chosen": -1.5940550565719604, + "logps/rejected": -1.6403049230575562, + "loss": 3.1158, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.94054889678955, + "rewards/margins": 0.4624989926815033, + "rewards/rejected": -16.40304946899414, + "step": 4215 + }, + { + "epoch": 0.14223600390980484, + "grad_norm": 35.62173080444336, + "learning_rate": 9.945766499526472e-07, + "logits/chosen": -0.22328560054302216, + "logits/rejected": -0.25591030716896057, + "logps/chosen": -1.9984073638916016, + "logps/rejected": -2.045516014099121, + "loss": 2.7188, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.984073638916016, + "rewards/margins": 0.47108450531959534, + "rewards/rejected": -20.455158233642578, + "step": 4220 + }, + { + "epoch": 0.14240452998078804, + "grad_norm": 37.213661193847656, + "learning_rate": 9.9453335987344e-07, + "logits/chosen": -0.6606870889663696, + "logits/rejected": -0.7392102479934692, + "logps/chosen": -2.145911693572998, + "logps/rejected": -2.1802453994750977, + "loss": 3.5793, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.459117889404297, + "rewards/margins": 0.3433364927768707, + "rewards/rejected": -21.802453994750977, + "step": 4225 + }, + { + "epoch": 0.1425730560517712, + "grad_norm": 23.552959442138672, + "learning_rate": 9.944898986557208e-07, + "logits/chosen": -0.5075179934501648, + "logits/rejected": -0.5485433340072632, + "logps/chosen": -1.5870262384414673, + "logps/rejected": -1.6476974487304688, + "loss": 2.639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.870260238647461, + "rewards/margins": 0.6067121624946594, + "rewards/rejected": -16.476974487304688, + "step": 4230 + }, + { + "epoch": 0.1427415821227544, + "grad_norm": 19.936710357666016, + "learning_rate": 9.944462663145299e-07, + "logits/chosen": -0.6237483024597168, + "logits/rejected": -0.7083162069320679, + "logps/chosen": -1.8480708599090576, + "logps/rejected": -1.78692626953125, + "loss": 3.6742, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.4807071685791, + "rewards/margins": -0.6114450693130493, + "rewards/rejected": -17.8692626953125, + "step": 4235 + }, + { + "epoch": 0.14291010819373756, + "grad_norm": 40.25701904296875, + "learning_rate": 9.944024628649665e-07, + "logits/chosen": -0.4969102740287781, + "logits/rejected": -0.6931605339050293, + "logps/chosen": -1.6680597066879272, + "logps/rejected": -1.658609390258789, + "loss": 3.2996, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.68059730529785, + "rewards/margins": -0.09450320899486542, + "rewards/rejected": -16.58609390258789, + "step": 4240 + }, + { + "epoch": 0.14307863426472076, + "grad_norm": 30.906673431396484, + "learning_rate": 9.943584883221897e-07, + "logits/chosen": -0.3676472306251526, + "logits/rejected": -0.4649665355682373, + "logps/chosen": -1.7316913604736328, + "logps/rejected": -1.6490224599838257, + "loss": 3.8978, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.31691551208496, + "rewards/margins": -0.8266892433166504, + "rewards/rejected": -16.490224838256836, + "step": 4245 + }, + { + "epoch": 0.14324716033570392, + "grad_norm": 22.4572696685791, + "learning_rate": 9.943143427014166e-07, + "logits/chosen": -0.330500066280365, + "logits/rejected": -0.4424312710762024, + "logps/chosen": -1.9481405019760132, + "logps/rejected": -1.9094657897949219, + "loss": 3.5611, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.481403350830078, + "rewards/margins": -0.3867475390434265, + "rewards/rejected": -19.094655990600586, + "step": 4250 + }, + { + "epoch": 0.14341568640668712, + "grad_norm": 27.16339683532715, + "learning_rate": 9.942700260179248e-07, + "logits/chosen": -0.7029052376747131, + "logits/rejected": -0.5835430026054382, + "logps/chosen": -1.7974525690078735, + "logps/rejected": -1.9083143472671509, + "loss": 3.1116, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.974523544311523, + "rewards/margins": 1.1086170673370361, + "rewards/rejected": -19.083141326904297, + "step": 4255 + }, + { + "epoch": 0.1435842124776703, + "grad_norm": 20.374530792236328, + "learning_rate": 9.942255382870506e-07, + "logits/chosen": -0.6019352674484253, + "logits/rejected": -0.5517061352729797, + "logps/chosen": -1.4612500667572021, + "logps/rejected": -1.5348877906799316, + "loss": 2.6918, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.612500190734863, + "rewards/margins": 0.736376941204071, + "rewards/rejected": -15.348876953125, + "step": 4260 + }, + { + "epoch": 0.14375273854865347, + "grad_norm": 36.474327087402344, + "learning_rate": 9.941808795241892e-07, + "logits/chosen": -0.4818621575832367, + "logits/rejected": -0.39904358983039856, + "logps/chosen": -1.5083070993423462, + "logps/rejected": -1.5565204620361328, + "loss": 2.742, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.08306884765625, + "rewards/margins": 0.48213452100753784, + "rewards/rejected": -15.565203666687012, + "step": 4265 + }, + { + "epoch": 0.14392126461963667, + "grad_norm": 19.787578582763672, + "learning_rate": 9.941360497447954e-07, + "logits/chosen": -0.3252061605453491, + "logits/rejected": -0.22330248355865479, + "logps/chosen": -1.7723124027252197, + "logps/rejected": -1.6830450296401978, + "loss": 4.0528, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.723125457763672, + "rewards/margins": -0.8926737904548645, + "rewards/rejected": -16.830448150634766, + "step": 4270 + }, + { + "epoch": 0.14408979069061983, + "grad_norm": 16.231342315673828, + "learning_rate": 9.94091048964383e-07, + "logits/chosen": -0.7457118034362793, + "logits/rejected": -0.5876340866088867, + "logps/chosen": -2.191019058227539, + "logps/rejected": -2.25704026222229, + "loss": 3.2538, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.91019058227539, + "rewards/margins": 0.6602120399475098, + "rewards/rejected": -22.570402145385742, + "step": 4275 + }, + { + "epoch": 0.14425831676160303, + "grad_norm": 18.32788848876953, + "learning_rate": 9.94045877198525e-07, + "logits/chosen": -0.18542389571666718, + "logits/rejected": -0.06351794302463531, + "logps/chosen": -1.9895687103271484, + "logps/rejected": -2.033998966217041, + "loss": 2.8715, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.895687103271484, + "rewards/margins": 0.44430312514305115, + "rewards/rejected": -20.339988708496094, + "step": 4280 + }, + { + "epoch": 0.1444268428325862, + "grad_norm": 39.58668518066406, + "learning_rate": 9.940005344628535e-07, + "logits/chosen": -0.24566316604614258, + "logits/rejected": -0.20698890089988708, + "logps/chosen": -1.7219035625457764, + "logps/rejected": -1.6719424724578857, + "loss": 3.6078, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.219036102294922, + "rewards/margins": -0.49961042404174805, + "rewards/rejected": -16.719425201416016, + "step": 4285 + }, + { + "epoch": 0.1445953689035694, + "grad_norm": 21.891944885253906, + "learning_rate": 9.9395502077306e-07, + "logits/chosen": -0.48740309476852417, + "logits/rejected": -0.5501845479011536, + "logps/chosen": -1.685595154762268, + "logps/rejected": -1.7272504568099976, + "loss": 2.7276, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.8559513092041, + "rewards/margins": 0.4165545403957367, + "rewards/rejected": -17.272504806518555, + "step": 4290 + }, + { + "epoch": 0.14476389497455255, + "grad_norm": 30.494169235229492, + "learning_rate": 9.939093361448944e-07, + "logits/chosen": -0.3153546452522278, + "logits/rejected": -0.32003530859947205, + "logps/chosen": -1.6743957996368408, + "logps/rejected": -1.6838239431381226, + "loss": 3.0992, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.743959426879883, + "rewards/margins": 0.09428195655345917, + "rewards/rejected": -16.838239669799805, + "step": 4295 + }, + { + "epoch": 0.14493242104553575, + "grad_norm": 22.064924240112305, + "learning_rate": 9.938634805941671e-07, + "logits/chosen": -0.9115715026855469, + "logits/rejected": -0.8669688105583191, + "logps/chosen": -1.8068641424179077, + "logps/rejected": -1.7367687225341797, + "loss": 3.8418, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.068639755249023, + "rewards/margins": -0.7009520530700684, + "rewards/rejected": -17.367687225341797, + "step": 4300 + }, + { + "epoch": 0.1451009471165189, + "grad_norm": 21.508249282836914, + "learning_rate": 9.938174541367466e-07, + "logits/chosen": -0.44647008180618286, + "logits/rejected": -0.6526331901550293, + "logps/chosen": -1.685974359512329, + "logps/rejected": -1.668164849281311, + "loss": 3.2786, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.8597412109375, + "rewards/margins": -0.17809438705444336, + "rewards/rejected": -16.6816463470459, + "step": 4305 + }, + { + "epoch": 0.1452694731875021, + "grad_norm": 31.063007354736328, + "learning_rate": 9.937712567885608e-07, + "logits/chosen": -0.49208030104637146, + "logits/rejected": -0.5488861203193665, + "logps/chosen": -2.050914764404297, + "logps/rejected": -1.9329230785369873, + "loss": 4.2608, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.509145736694336, + "rewards/margins": -1.1799161434173584, + "rewards/rejected": -19.3292293548584, + "step": 4310 + }, + { + "epoch": 0.1454379992584853, + "grad_norm": 29.91889762878418, + "learning_rate": 9.93724888565597e-07, + "logits/chosen": -0.3852444887161255, + "logits/rejected": -0.3011249005794525, + "logps/chosen": -2.0186514854431152, + "logps/rejected": -2.0107169151306152, + "loss": 3.1377, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.18651580810547, + "rewards/margins": -0.07934770733118057, + "rewards/rejected": -20.107166290283203, + "step": 4315 + }, + { + "epoch": 0.14560652532946847, + "grad_norm": 16.69972801208496, + "learning_rate": 9.93678349483901e-07, + "logits/chosen": -0.5411044955253601, + "logits/rejected": -0.6111071705818176, + "logps/chosen": -1.8235629796981812, + "logps/rejected": -1.8869975805282593, + "loss": 2.7772, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.23563003540039, + "rewards/margins": 0.6343483924865723, + "rewards/rejected": -18.869977951049805, + "step": 4320 + }, + { + "epoch": 0.14577505140045166, + "grad_norm": 24.001726150512695, + "learning_rate": 9.936316395595788e-07, + "logits/chosen": -0.2619974613189697, + "logits/rejected": -0.3185933530330658, + "logps/chosen": -2.2614998817443848, + "logps/rejected": -2.4379124641418457, + "loss": 2.6044, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.615001678466797, + "rewards/margins": 1.7641239166259766, + "rewards/rejected": -24.379125595092773, + "step": 4325 + }, + { + "epoch": 0.14594357747143483, + "grad_norm": 33.08540725708008, + "learning_rate": 9.935847588087942e-07, + "logits/chosen": -0.2971573770046234, + "logits/rejected": -0.37692004442214966, + "logps/chosen": -1.8515352010726929, + "logps/rejected": -1.8141686916351318, + "loss": 3.5533, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.51535415649414, + "rewards/margins": -0.373665988445282, + "rewards/rejected": -18.141687393188477, + "step": 4330 + }, + { + "epoch": 0.14611210354241802, + "grad_norm": 7.031495094299316, + "learning_rate": 9.935377072477709e-07, + "logits/chosen": -0.24837207794189453, + "logits/rejected": -0.22563381493091583, + "logps/chosen": -2.002490997314453, + "logps/rejected": -2.1387100219726562, + "loss": 2.0619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.024911880493164, + "rewards/margins": 1.3621878623962402, + "rewards/rejected": -21.387096405029297, + "step": 4335 + }, + { + "epoch": 0.1462806296134012, + "grad_norm": 23.234272003173828, + "learning_rate": 9.934904848927919e-07, + "logits/chosen": -0.35107535123825073, + "logits/rejected": -0.28245970606803894, + "logps/chosen": -1.78196120262146, + "logps/rejected": -1.920299768447876, + "loss": 2.5926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.81961441040039, + "rewards/margins": 1.3833829164505005, + "rewards/rejected": -19.2029972076416, + "step": 4340 + }, + { + "epoch": 0.14644915568438438, + "grad_norm": 20.092247009277344, + "learning_rate": 9.934430917601988e-07, + "logits/chosen": -0.9359011650085449, + "logits/rejected": -0.9503633379936218, + "logps/chosen": -1.5813624858856201, + "logps/rejected": -1.5645041465759277, + "loss": 3.2902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.813626289367676, + "rewards/margins": -0.16858339309692383, + "rewards/rejected": -15.645042419433594, + "step": 4345 + }, + { + "epoch": 0.14661768175536755, + "grad_norm": 28.749696731567383, + "learning_rate": 9.933955278663926e-07, + "logits/chosen": -0.5206044912338257, + "logits/rejected": -0.6252120733261108, + "logps/chosen": -1.46475088596344, + "logps/rejected": -1.4322856664657593, + "loss": 3.5735, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.647509574890137, + "rewards/margins": -0.32465142011642456, + "rewards/rejected": -14.322857856750488, + "step": 4350 + }, + { + "epoch": 0.14678620782635074, + "grad_norm": 35.41548538208008, + "learning_rate": 9.933477932278331e-07, + "logits/chosen": -0.2737705707550049, + "logits/rejected": -0.11229648441076279, + "logps/chosen": -1.6251589059829712, + "logps/rejected": -1.7368186712265015, + "loss": 3.4879, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.2515869140625, + "rewards/margins": 1.1165990829467773, + "rewards/rejected": -17.368188858032227, + "step": 4355 + }, + { + "epoch": 0.1469547338973339, + "grad_norm": 16.393056869506836, + "learning_rate": 9.932998878610395e-07, + "logits/chosen": -0.6516093015670776, + "logits/rejected": -0.7561215758323669, + "logps/chosen": -1.7601429224014282, + "logps/rejected": -1.7828855514526367, + "loss": 3.2031, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.601428985595703, + "rewards/margins": 0.2274264395236969, + "rewards/rejected": -17.828855514526367, + "step": 4360 + }, + { + "epoch": 0.1471232599683171, + "grad_norm": 26.61617660522461, + "learning_rate": 9.9325181178259e-07, + "logits/chosen": -0.7604036331176758, + "logits/rejected": -0.7403522729873657, + "logps/chosen": -1.8788295984268188, + "logps/rejected": -1.847124695777893, + "loss": 3.4314, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.78829574584961, + "rewards/margins": -0.3170498013496399, + "rewards/rejected": -18.47124671936035, + "step": 4365 + }, + { + "epoch": 0.1472917860393003, + "grad_norm": 26.9691162109375, + "learning_rate": 9.932035650091217e-07, + "logits/chosen": -0.3129528760910034, + "logits/rejected": -0.03679082915186882, + "logps/chosen": -2.2325453758239746, + "logps/rejected": -2.3919172286987305, + "loss": 2.0888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.325454711914062, + "rewards/margins": 1.5937201976776123, + "rewards/rejected": -23.919174194335938, + "step": 4370 + }, + { + "epoch": 0.14746031211028346, + "grad_norm": 12.038168907165527, + "learning_rate": 9.93155147557331e-07, + "logits/chosen": -0.3520078659057617, + "logits/rejected": -0.44000229239463806, + "logps/chosen": -1.4542829990386963, + "logps/rejected": -1.5576039552688599, + "loss": 2.5207, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.542831420898438, + "rewards/margins": 1.0332090854644775, + "rewards/rejected": -15.57603931427002, + "step": 4375 + }, + { + "epoch": 0.14762883818126665, + "grad_norm": 15.351378440856934, + "learning_rate": 9.931065594439734e-07, + "logits/chosen": -0.7100510001182556, + "logits/rejected": -0.6339886784553528, + "logps/chosen": -1.542508840560913, + "logps/rejected": -1.4648298025131226, + "loss": 3.8282, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.425088882446289, + "rewards/margins": -0.7767902612686157, + "rewards/rejected": -14.648298263549805, + "step": 4380 + }, + { + "epoch": 0.14779736425224982, + "grad_norm": 26.496545791625977, + "learning_rate": 9.930578006858632e-07, + "logits/chosen": -0.7247037887573242, + "logits/rejected": -0.8406316041946411, + "logps/chosen": -1.6559617519378662, + "logps/rejected": -1.729272484779358, + "loss": 2.5924, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.559616088867188, + "rewards/margins": 0.7331069707870483, + "rewards/rejected": -17.292724609375, + "step": 4385 + }, + { + "epoch": 0.147965890323233, + "grad_norm": 32.76643371582031, + "learning_rate": 9.930088712998738e-07, + "logits/chosen": -0.6651844382286072, + "logits/rejected": -0.4605945646762848, + "logps/chosen": -2.0005507469177246, + "logps/rejected": -2.0000672340393066, + "loss": 3.2982, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.00550651550293, + "rewards/margins": -0.004834270570427179, + "rewards/rejected": -20.000673294067383, + "step": 4390 + }, + { + "epoch": 0.14813441639421618, + "grad_norm": 32.53934097290039, + "learning_rate": 9.929597713029379e-07, + "logits/chosen": -0.44316577911376953, + "logits/rejected": -0.5748022198677063, + "logps/chosen": -2.264150619506836, + "logps/rejected": -2.81333327293396, + "loss": 2.6056, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.64150619506836, + "rewards/margins": 5.491827011108398, + "rewards/rejected": -28.13333511352539, + "step": 4395 + }, + { + "epoch": 0.14830294246519937, + "grad_norm": 32.589378356933594, + "learning_rate": 9.929105007120468e-07, + "logits/chosen": -0.6730566024780273, + "logits/rejected": -0.6370115280151367, + "logps/chosen": -1.741729497909546, + "logps/rejected": -1.756996512413025, + "loss": 3.0312, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.417295455932617, + "rewards/margins": 0.15266962349414825, + "rewards/rejected": -17.569965362548828, + "step": 4400 + }, + { + "epoch": 0.14830294246519937, + "eval_logits/chosen": -0.7628591060638428, + "eval_logits/rejected": -0.7754251956939697, + "eval_logps/chosen": -1.6971991062164307, + "eval_logps/rejected": -1.709100604057312, + "eval_loss": 3.360646963119507, + "eval_rewards/accuracies": 0.49000000953674316, + "eval_rewards/chosen": -16.97199249267578, + "eval_rewards/margins": 0.11901436001062393, + "eval_rewards/rejected": -17.091007232666016, + "eval_runtime": 12.8869, + "eval_samples_per_second": 7.76, + "eval_steps_per_second": 1.94, + "step": 4400 + }, + { + "epoch": 0.14847146853618254, + "grad_norm": 24.610149383544922, + "learning_rate": 9.928610595442514e-07, + "logits/chosen": -0.3652550280094147, + "logits/rejected": -0.32519787549972534, + "logps/chosen": -2.136669635772705, + "logps/rejected": -2.183389902114868, + "loss": 3.8469, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.366697311401367, + "rewards/margins": 0.4672008454799652, + "rewards/rejected": -21.833898544311523, + "step": 4405 + }, + { + "epoch": 0.14863999460716573, + "grad_norm": 27.064762115478516, + "learning_rate": 9.928114478166613e-07, + "logits/chosen": -0.3917112648487091, + "logits/rejected": -0.38873490691185, + "logps/chosen": -1.883427381515503, + "logps/rejected": -1.9321670532226562, + "loss": 2.6834, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.834274291992188, + "rewards/margins": 0.48739489912986755, + "rewards/rejected": -19.321670532226562, + "step": 4410 + }, + { + "epoch": 0.1488085206781489, + "grad_norm": 17.143781661987305, + "learning_rate": 9.92761665546445e-07, + "logits/chosen": -0.6784085035324097, + "logits/rejected": -0.777459979057312, + "logps/chosen": -1.6531413793563843, + "logps/rejected": -1.7106910943984985, + "loss": 2.9767, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.531414031982422, + "rewards/margins": 0.5754953622817993, + "rewards/rejected": -17.106908798217773, + "step": 4415 + }, + { + "epoch": 0.1489770467491321, + "grad_norm": 20.776880264282227, + "learning_rate": 9.927117127508305e-07, + "logits/chosen": -0.9443651437759399, + "logits/rejected": -0.7738694548606873, + "logps/chosen": -1.633569359779358, + "logps/rejected": -1.733229398727417, + "loss": 2.2787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.335697174072266, + "rewards/margins": 0.996599018573761, + "rewards/rejected": -17.332294464111328, + "step": 4420 + }, + { + "epoch": 0.14914557282011529, + "grad_norm": 76.14692687988281, + "learning_rate": 9.926615894471042e-07, + "logits/chosen": -0.2984052896499634, + "logits/rejected": -0.23642554879188538, + "logps/chosen": -1.9602696895599365, + "logps/rejected": -1.781272530555725, + "loss": 4.897, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.60269546508789, + "rewards/margins": -1.7899707555770874, + "rewards/rejected": -17.812725067138672, + "step": 4425 + }, + { + "epoch": 0.14931409889109845, + "grad_norm": 22.077974319458008, + "learning_rate": 9.926112956526118e-07, + "logits/chosen": -0.7455543279647827, + "logits/rejected": -0.7593054175376892, + "logps/chosen": -1.776155710220337, + "logps/rejected": -1.761704683303833, + "loss": 3.2266, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.76155662536621, + "rewards/margins": -0.14451150596141815, + "rewards/rejected": -17.617046356201172, + "step": 4430 + }, + { + "epoch": 0.14948262496208164, + "grad_norm": 30.44013214111328, + "learning_rate": 9.92560831384758e-07, + "logits/chosen": -0.28689366579055786, + "logits/rejected": -0.458076536655426, + "logps/chosen": -2.103567361831665, + "logps/rejected": -2.0760843753814697, + "loss": 3.5442, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.03567123413086, + "rewards/margins": -0.27482956647872925, + "rewards/rejected": -20.760845184326172, + "step": 4435 + }, + { + "epoch": 0.1496511510330648, + "grad_norm": 41.20957565307617, + "learning_rate": 9.925101966610067e-07, + "logits/chosen": -0.2810547947883606, + "logits/rejected": -0.3384184241294861, + "logps/chosen": -2.0202267169952393, + "logps/rejected": -2.081272602081299, + "loss": 2.7238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.202266693115234, + "rewards/margins": 0.610458254814148, + "rewards/rejected": -20.812725067138672, + "step": 4440 + }, + { + "epoch": 0.149819677104048, + "grad_norm": 41.3414421081543, + "learning_rate": 9.924593914988806e-07, + "logits/chosen": -0.29100021719932556, + "logits/rejected": -0.4603959918022156, + "logps/chosen": -1.6702888011932373, + "logps/rejected": -1.7332197427749634, + "loss": 2.5224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.70288848876953, + "rewards/margins": 0.6293088793754578, + "rewards/rejected": -17.332195281982422, + "step": 4445 + }, + { + "epoch": 0.14998820317503117, + "grad_norm": 13.796420097351074, + "learning_rate": 9.924084159159608e-07, + "logits/chosen": -0.5338420271873474, + "logits/rejected": -0.6065437197685242, + "logps/chosen": -1.4859281778335571, + "logps/rejected": -1.7795900106430054, + "loss": 1.2324, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.859281539916992, + "rewards/margins": 2.9366185665130615, + "rewards/rejected": -17.795900344848633, + "step": 4450 + }, + { + "epoch": 0.15015672924601436, + "grad_norm": 20.278915405273438, + "learning_rate": 9.923572699298888e-07, + "logits/chosen": -0.6260601282119751, + "logits/rejected": -0.5569266676902771, + "logps/chosen": -1.9350582361221313, + "logps/rejected": -1.8638197183609009, + "loss": 3.845, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.350582122802734, + "rewards/margins": -0.7123873233795166, + "rewards/rejected": -18.638195037841797, + "step": 4455 + }, + { + "epoch": 0.15032525531699753, + "grad_norm": 32.74846267700195, + "learning_rate": 9.923059535583636e-07, + "logits/chosen": -0.3299594223499298, + "logits/rejected": -0.18386907875537872, + "logps/chosen": -1.8987400531768799, + "logps/rejected": -1.8763782978057861, + "loss": 3.5642, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.98740005493164, + "rewards/margins": -0.22361735999584198, + "rewards/rejected": -18.763782501220703, + "step": 4460 + }, + { + "epoch": 0.15049378138798072, + "grad_norm": 36.09501266479492, + "learning_rate": 9.92254466819144e-07, + "logits/chosen": -0.3556022644042969, + "logits/rejected": -0.31961601972579956, + "logps/chosen": -1.8345237970352173, + "logps/rejected": -1.9695651531219482, + "loss": 1.9562, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.345237731933594, + "rewards/margins": 1.3504129648208618, + "rewards/rejected": -19.69565200805664, + "step": 4465 + }, + { + "epoch": 0.1506623074589639, + "grad_norm": 16.155614852905273, + "learning_rate": 9.922028097300475e-07, + "logits/chosen": -0.503342866897583, + "logits/rejected": -0.38056522607803345, + "logps/chosen": -1.684818983078003, + "logps/rejected": -1.8295650482177734, + "loss": 2.0349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.848190307617188, + "rewards/margins": 1.4474587440490723, + "rewards/rejected": -18.295650482177734, + "step": 4470 + }, + { + "epoch": 0.15083083352994708, + "grad_norm": 25.365110397338867, + "learning_rate": 9.921509823089505e-07, + "logits/chosen": -0.8112923502922058, + "logits/rejected": -0.8188997507095337, + "logps/chosen": -1.9288915395736694, + "logps/rejected": -1.8551056385040283, + "loss": 3.8075, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.28891944885254, + "rewards/margins": -0.7378617525100708, + "rewards/rejected": -18.55105209350586, + "step": 4475 + }, + { + "epoch": 0.15099935960093028, + "grad_norm": 31.704790115356445, + "learning_rate": 9.920989845737885e-07, + "logits/chosen": -0.4111596941947937, + "logits/rejected": -0.4327424466609955, + "logps/chosen": -1.8056867122650146, + "logps/rejected": -1.8885425329208374, + "loss": 2.938, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.056869506835938, + "rewards/margins": 0.8285573124885559, + "rewards/rejected": -18.885425567626953, + "step": 4480 + }, + { + "epoch": 0.15116788567191344, + "grad_norm": 66.93142700195312, + "learning_rate": 9.92046816542556e-07, + "logits/chosen": -0.4983634054660797, + "logits/rejected": -0.44021081924438477, + "logps/chosen": -1.9229497909545898, + "logps/rejected": -1.9702413082122803, + "loss": 3.5794, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.2294979095459, + "rewards/margins": 0.47291526198387146, + "rewards/rejected": -19.70241355895996, + "step": 4485 + }, + { + "epoch": 0.15133641174289664, + "grad_norm": 27.996192932128906, + "learning_rate": 9.91994478233306e-07, + "logits/chosen": -0.21563634276390076, + "logits/rejected": -0.08831771463155746, + "logps/chosen": -1.7543987035751343, + "logps/rejected": -1.7821855545043945, + "loss": 3.1206, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.543987274169922, + "rewards/margins": 0.27786731719970703, + "rewards/rejected": -17.821855545043945, + "step": 4490 + }, + { + "epoch": 0.1515049378138798, + "grad_norm": 31.757884979248047, + "learning_rate": 9.919419696641512e-07, + "logits/chosen": -0.6939610838890076, + "logits/rejected": -0.7058770060539246, + "logps/chosen": -1.8848850727081299, + "logps/rejected": -1.9081329107284546, + "loss": 2.8662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.84885025024414, + "rewards/margins": 0.23247957229614258, + "rewards/rejected": -19.081329345703125, + "step": 4495 + }, + { + "epoch": 0.151673463884863, + "grad_norm": 18.024682998657227, + "learning_rate": 9.918892908532621e-07, + "logits/chosen": -0.6620336174964905, + "logits/rejected": -0.6601449847221375, + "logps/chosen": -2.105888605117798, + "logps/rejected": -2.041553020477295, + "loss": 3.884, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.058883666992188, + "rewards/margins": -0.6433547735214233, + "rewards/rejected": -20.415531158447266, + "step": 4500 + }, + { + "epoch": 0.15184198995584616, + "grad_norm": 21.89647674560547, + "learning_rate": 9.918364418188692e-07, + "logits/chosen": -0.20931684970855713, + "logits/rejected": -0.26949331164360046, + "logps/chosen": -2.057158946990967, + "logps/rejected": -2.0671579837799072, + "loss": 3.5455, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.571590423583984, + "rewards/margins": 0.0999903678894043, + "rewards/rejected": -20.671581268310547, + "step": 4505 + }, + { + "epoch": 0.15201051602682936, + "grad_norm": 22.97340965270996, + "learning_rate": 9.917834225792615e-07, + "logits/chosen": -0.25889870524406433, + "logits/rejected": -0.5221267938613892, + "logps/chosen": -1.8550924062728882, + "logps/rejected": -1.8294522762298584, + "loss": 3.3505, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.55092430114746, + "rewards/margins": -0.2563992440700531, + "rewards/rejected": -18.294525146484375, + "step": 4510 + }, + { + "epoch": 0.15217904209781252, + "grad_norm": 28.872962951660156, + "learning_rate": 9.917302331527864e-07, + "logits/chosen": -0.4469106197357178, + "logits/rejected": -0.43801528215408325, + "logps/chosen": -1.8834269046783447, + "logps/rejected": -1.8372375965118408, + "loss": 3.8107, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.83426856994629, + "rewards/margins": -0.461892694234848, + "rewards/rejected": -18.37237548828125, + "step": 4515 + }, + { + "epoch": 0.15234756816879572, + "grad_norm": 22.910249710083008, + "learning_rate": 9.916768735578513e-07, + "logits/chosen": -0.40119099617004395, + "logits/rejected": -0.46237850189208984, + "logps/chosen": -1.7076635360717773, + "logps/rejected": -1.8835132122039795, + "loss": 2.4283, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.076635360717773, + "rewards/margins": 1.7584972381591797, + "rewards/rejected": -18.835132598876953, + "step": 4520 + }, + { + "epoch": 0.15251609423977888, + "grad_norm": 16.087495803833008, + "learning_rate": 9.916233438129213e-07, + "logits/chosen": -0.5997047424316406, + "logits/rejected": -0.5393815040588379, + "logps/chosen": -1.5532910823822021, + "logps/rejected": -1.5561127662658691, + "loss": 3.1255, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.53291130065918, + "rewards/margins": 0.0282150749117136, + "rewards/rejected": -15.561126708984375, + "step": 4525 + }, + { + "epoch": 0.15268462031076208, + "grad_norm": 19.14484405517578, + "learning_rate": 9.915696439365216e-07, + "logits/chosen": -0.553629457950592, + "logits/rejected": -0.4729720950126648, + "logps/chosen": -1.8226730823516846, + "logps/rejected": -1.6980514526367188, + "loss": 4.3086, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.226730346679688, + "rewards/margins": -1.2462149858474731, + "rewards/rejected": -16.980514526367188, + "step": 4530 + }, + { + "epoch": 0.15285314638174527, + "grad_norm": 33.58027648925781, + "learning_rate": 9.91515773947235e-07, + "logits/chosen": -0.2996135354042053, + "logits/rejected": -0.30325740575790405, + "logps/chosen": -1.7898555994033813, + "logps/rejected": -1.7090318202972412, + "loss": 3.8958, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.898555755615234, + "rewards/margins": -0.8082362413406372, + "rewards/rejected": -17.090320587158203, + "step": 4535 + }, + { + "epoch": 0.15302167245272844, + "grad_norm": 41.30805587768555, + "learning_rate": 9.914617338637038e-07, + "logits/chosen": -0.2837643027305603, + "logits/rejected": -0.28739625215530396, + "logps/chosen": -1.7426464557647705, + "logps/rejected": -1.8160514831542969, + "loss": 2.6695, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.426464080810547, + "rewards/margins": 0.7340496778488159, + "rewards/rejected": -18.16051483154297, + "step": 4540 + }, + { + "epoch": 0.15319019852371163, + "grad_norm": 169.09719848632812, + "learning_rate": 9.914075237046296e-07, + "logits/chosen": -0.5721148252487183, + "logits/rejected": -0.7339398860931396, + "logps/chosen": -1.804178237915039, + "logps/rejected": -1.5889647006988525, + "loss": 5.3903, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.04178237915039, + "rewards/margins": -2.1521360874176025, + "rewards/rejected": -15.88964557647705, + "step": 4545 + }, + { + "epoch": 0.1533587245946948, + "grad_norm": 20.485097885131836, + "learning_rate": 9.913531434887718e-07, + "logits/chosen": -0.6761281490325928, + "logits/rejected": -0.627028226852417, + "logps/chosen": -1.6126229763031006, + "logps/rejected": -1.644484281539917, + "loss": 2.8009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.126230239868164, + "rewards/margins": 0.31861066818237305, + "rewards/rejected": -16.444841384887695, + "step": 4550 + }, + { + "epoch": 0.153527250665678, + "grad_norm": 14.091863632202148, + "learning_rate": 9.912985932349498e-07, + "logits/chosen": -0.5259329080581665, + "logits/rejected": -0.3877810835838318, + "logps/chosen": -1.4369596242904663, + "logps/rejected": -1.5198132991790771, + "loss": 2.5432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.369596481323242, + "rewards/margins": 0.8285359144210815, + "rewards/rejected": -15.198132514953613, + "step": 4555 + }, + { + "epoch": 0.15369577673666115, + "grad_norm": 25.25543975830078, + "learning_rate": 9.912438729620412e-07, + "logits/chosen": -0.4410991072654724, + "logits/rejected": -0.6500669121742249, + "logps/chosen": -1.6194860935211182, + "logps/rejected": -1.7692973613739014, + "loss": 2.1913, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.19485855102539, + "rewards/margins": 1.4981143474578857, + "rewards/rejected": -17.692974090576172, + "step": 4560 + }, + { + "epoch": 0.15386430280764435, + "grad_norm": 12.284860610961914, + "learning_rate": 9.911889826889823e-07, + "logits/chosen": -0.3081240952014923, + "logits/rejected": -0.3586021959781647, + "logps/chosen": -1.7863963842391968, + "logps/rejected": -2.0202338695526123, + "loss": 2.0076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.863964080810547, + "rewards/margins": 2.3383731842041016, + "rewards/rejected": -20.20233726501465, + "step": 4565 + }, + { + "epoch": 0.15403282887862751, + "grad_norm": 28.293598175048828, + "learning_rate": 9.911339224347684e-07, + "logits/chosen": -0.48678913712501526, + "logits/rejected": -0.541092038154602, + "logps/chosen": -1.9596703052520752, + "logps/rejected": -1.844909429550171, + "loss": 4.2267, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.59670066833496, + "rewards/margins": -1.147606372833252, + "rewards/rejected": -18.4490966796875, + "step": 4570 + }, + { + "epoch": 0.1542013549496107, + "grad_norm": 19.478225708007812, + "learning_rate": 9.91078692218454e-07, + "logits/chosen": -0.8977234959602356, + "logits/rejected": -0.8771381378173828, + "logps/chosen": -1.830026626586914, + "logps/rejected": -1.9383594989776611, + "loss": 2.5465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.30026626586914, + "rewards/margins": 1.083329677581787, + "rewards/rejected": -19.383596420288086, + "step": 4575 + }, + { + "epoch": 0.15436988102059387, + "grad_norm": 25.026344299316406, + "learning_rate": 9.910232920591518e-07, + "logits/chosen": -0.702392578125, + "logits/rejected": -0.49518975615501404, + "logps/chosen": -1.558858871459961, + "logps/rejected": -1.669965386390686, + "loss": 2.4797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.588589668273926, + "rewards/margins": 1.1110646724700928, + "rewards/rejected": -16.69965362548828, + "step": 4580 + }, + { + "epoch": 0.15453840709157707, + "grad_norm": 20.98350715637207, + "learning_rate": 9.90967721976034e-07, + "logits/chosen": -0.4868387281894684, + "logits/rejected": -0.3116268813610077, + "logps/chosen": -1.7529996633529663, + "logps/rejected": -1.7481448650360107, + "loss": 3.2614, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.529998779296875, + "rewards/margins": -0.04854869842529297, + "rewards/rejected": -17.481449127197266, + "step": 4585 + }, + { + "epoch": 0.15470693316256026, + "grad_norm": 24.198535919189453, + "learning_rate": 9.90911981988331e-07, + "logits/chosen": -0.8397296667098999, + "logits/rejected": -0.8432148098945618, + "logps/chosen": -1.7114646434783936, + "logps/rejected": -1.6712682247161865, + "loss": 3.5441, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.11464500427246, + "rewards/margins": -0.40196236968040466, + "rewards/rejected": -16.71268081665039, + "step": 4590 + }, + { + "epoch": 0.15487545923354343, + "grad_norm": 13.411236763000488, + "learning_rate": 9.90856072115332e-07, + "logits/chosen": -0.35893386602401733, + "logits/rejected": -0.3036018908023834, + "logps/chosen": -1.6867424249649048, + "logps/rejected": -1.7620487213134766, + "loss": 2.993, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.86742401123047, + "rewards/margins": 0.7530642747879028, + "rewards/rejected": -17.620487213134766, + "step": 4595 + }, + { + "epoch": 0.15504398530452662, + "grad_norm": 46.425350189208984, + "learning_rate": 9.907999923763855e-07, + "logits/chosen": -0.5545027852058411, + "logits/rejected": -0.5339478254318237, + "logps/chosen": -1.9723618030548096, + "logps/rejected": -1.987330436706543, + "loss": 3.1193, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.723617553710938, + "rewards/margins": 0.1496877670288086, + "rewards/rejected": -19.873302459716797, + "step": 4600 + }, + { + "epoch": 0.1552125113755098, + "grad_norm": 20.02104377746582, + "learning_rate": 9.907437427908983e-07, + "logits/chosen": -0.4415665566921234, + "logits/rejected": -0.5307387113571167, + "logps/chosen": -1.5795601606369019, + "logps/rejected": -1.679488182067871, + "loss": 2.3314, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.795602798461914, + "rewards/margins": 0.9992785453796387, + "rewards/rejected": -16.794879913330078, + "step": 4605 + }, + { + "epoch": 0.15538103744649298, + "grad_norm": 26.12117576599121, + "learning_rate": 9.906873233783363e-07, + "logits/chosen": -0.814510703086853, + "logits/rejected": -0.5907190442085266, + "logps/chosen": -1.5405075550079346, + "logps/rejected": -1.7465384006500244, + "loss": 2.3861, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.405075073242188, + "rewards/margins": 2.0603084564208984, + "rewards/rejected": -17.465383529663086, + "step": 4610 + }, + { + "epoch": 0.15554956351747615, + "grad_norm": 19.746633529663086, + "learning_rate": 9.90630734158224e-07, + "logits/chosen": -0.43650826811790466, + "logits/rejected": -0.686555027961731, + "logps/chosen": -1.5947755575180054, + "logps/rejected": -1.760087251663208, + "loss": 2.3027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.947755813598633, + "rewards/margins": 1.6531174182891846, + "rewards/rejected": -17.600873947143555, + "step": 4615 + }, + { + "epoch": 0.15571808958845934, + "grad_norm": 17.813255310058594, + "learning_rate": 9.905739751501447e-07, + "logits/chosen": -0.5361579656600952, + "logits/rejected": -0.4771784842014313, + "logps/chosen": -1.6195627450942993, + "logps/rejected": -1.671136498451233, + "loss": 2.7585, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.195627212524414, + "rewards/margins": 0.5157370567321777, + "rewards/rejected": -16.71136474609375, + "step": 4620 + }, + { + "epoch": 0.1558866156594425, + "grad_norm": 27.102527618408203, + "learning_rate": 9.905170463737405e-07, + "logits/chosen": -0.6031167507171631, + "logits/rejected": -0.5590722560882568, + "logps/chosen": -1.8172991275787354, + "logps/rejected": -1.7973215579986572, + "loss": 3.2828, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.172992706298828, + "rewards/margins": -0.19977673888206482, + "rewards/rejected": -17.973215103149414, + "step": 4625 + }, + { + "epoch": 0.1560551417304257, + "grad_norm": 69.08274841308594, + "learning_rate": 9.904599478487121e-07, + "logits/chosen": -0.30473047494888306, + "logits/rejected": -0.28297197818756104, + "logps/chosen": -2.3783857822418213, + "logps/rejected": -2.160964012145996, + "loss": 5.4019, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.783857345581055, + "rewards/margins": -2.1742167472839355, + "rewards/rejected": -21.609642028808594, + "step": 4630 + }, + { + "epoch": 0.15622366780140887, + "grad_norm": 19.284488677978516, + "learning_rate": 9.90402679594819e-07, + "logits/chosen": -0.8089929819107056, + "logits/rejected": -0.8168126344680786, + "logps/chosen": -1.6827844381332397, + "logps/rejected": -1.726898193359375, + "loss": 2.802, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.827844619750977, + "rewards/margins": 0.4411369264125824, + "rewards/rejected": -17.26898193359375, + "step": 4635 + }, + { + "epoch": 0.15639219387239206, + "grad_norm": 25.2092342376709, + "learning_rate": 9.903452416318796e-07, + "logits/chosen": -0.4793424606323242, + "logits/rejected": -0.42350155115127563, + "logps/chosen": -1.631096601486206, + "logps/rejected": -1.5376965999603271, + "loss": 3.9991, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.31096649169922, + "rewards/margins": -0.934001088142395, + "rewards/rejected": -15.376965522766113, + "step": 4640 + }, + { + "epoch": 0.15656071994337525, + "grad_norm": 25.0135498046875, + "learning_rate": 9.90287633979771e-07, + "logits/chosen": -0.27238425612449646, + "logits/rejected": -0.3240829408168793, + "logps/chosen": -2.0996146202087402, + "logps/rejected": -2.365563154220581, + "loss": 1.3909, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.99614906311035, + "rewards/margins": 2.6594841480255127, + "rewards/rejected": -23.6556339263916, + "step": 4645 + }, + { + "epoch": 0.15672924601435842, + "grad_norm": 22.960716247558594, + "learning_rate": 9.90229856658429e-07, + "logits/chosen": -0.21965241432189941, + "logits/rejected": -0.4097241461277008, + "logps/chosen": -1.7522590160369873, + "logps/rejected": -1.758040189743042, + "loss": 3.5186, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.5225887298584, + "rewards/margins": 0.057814598083496094, + "rewards/rejected": -17.580402374267578, + "step": 4650 + }, + { + "epoch": 0.1568977720853416, + "grad_norm": 25.19063377380371, + "learning_rate": 9.901719096878476e-07, + "logits/chosen": -0.6338127255439758, + "logits/rejected": -0.5322138071060181, + "logps/chosen": -1.6390702724456787, + "logps/rejected": -1.6156508922576904, + "loss": 3.5505, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.390703201293945, + "rewards/margins": -0.23419399559497833, + "rewards/rejected": -16.156509399414062, + "step": 4655 + }, + { + "epoch": 0.15706629815632478, + "grad_norm": 21.21099090576172, + "learning_rate": 9.901137930880802e-07, + "logits/chosen": -0.6699775457382202, + "logits/rejected": -0.7210027575492859, + "logps/chosen": -1.574340581893921, + "logps/rejected": -1.5693788528442383, + "loss": 3.1195, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.743408203125, + "rewards/margins": -0.04962043836712837, + "rewards/rejected": -15.69378662109375, + "step": 4660 + }, + { + "epoch": 0.15723482422730797, + "grad_norm": 15.227494239807129, + "learning_rate": 9.90055506879239e-07, + "logits/chosen": -0.5711018443107605, + "logits/rejected": -0.4353240430355072, + "logps/chosen": -1.8625409603118896, + "logps/rejected": -2.0394296646118164, + "loss": 1.9554, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.625408172607422, + "rewards/margins": 1.7688881158828735, + "rewards/rejected": -20.394298553466797, + "step": 4665 + }, + { + "epoch": 0.15740335029829114, + "grad_norm": 22.745201110839844, + "learning_rate": 9.899970510814941e-07, + "logits/chosen": -0.5177310705184937, + "logits/rejected": -0.5119687914848328, + "logps/chosen": -1.8094947338104248, + "logps/rejected": -1.8350107669830322, + "loss": 3.8939, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.094947814941406, + "rewards/margins": 0.25515851378440857, + "rewards/rejected": -18.350107192993164, + "step": 4670 + }, + { + "epoch": 0.15757187636927433, + "grad_norm": 30.20716667175293, + "learning_rate": 9.899384257150752e-07, + "logits/chosen": -0.2176884412765503, + "logits/rejected": -0.2588900625705719, + "logps/chosen": -1.8750667572021484, + "logps/rejected": -2.0222976207733154, + "loss": 2.3665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.750667572021484, + "rewards/margins": 1.4723093509674072, + "rewards/rejected": -20.22297477722168, + "step": 4675 + }, + { + "epoch": 0.1577404024402575, + "grad_norm": 44.010440826416016, + "learning_rate": 9.898796308002698e-07, + "logits/chosen": -0.7645952701568604, + "logits/rejected": -0.5875598192214966, + "logps/chosen": -1.7068369388580322, + "logps/rejected": -1.7132556438446045, + "loss": 3.0981, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.068368911743164, + "rewards/margins": 0.06418828666210175, + "rewards/rejected": -17.132556915283203, + "step": 4680 + }, + { + "epoch": 0.1579089285112407, + "grad_norm": 37.04304885864258, + "learning_rate": 9.898206663574244e-07, + "logits/chosen": -0.6298079490661621, + "logits/rejected": -0.5319895148277283, + "logps/chosen": -1.7374365329742432, + "logps/rejected": -1.611778974533081, + "loss": 4.3514, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.374366760253906, + "rewards/margins": -1.2565762996673584, + "rewards/rejected": -16.11779022216797, + "step": 4685 + }, + { + "epoch": 0.15807745458222386, + "grad_norm": 16.909269332885742, + "learning_rate": 9.897615324069447e-07, + "logits/chosen": -0.4190613627433777, + "logits/rejected": -0.3520964980125427, + "logps/chosen": -1.878212332725525, + "logps/rejected": -2.0033230781555176, + "loss": 2.2772, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.782123565673828, + "rewards/margins": 1.2511035203933716, + "rewards/rejected": -20.03322982788086, + "step": 4690 + }, + { + "epoch": 0.15824598065320705, + "grad_norm": 28.17768096923828, + "learning_rate": 9.897022289692946e-07, + "logits/chosen": -0.217799574136734, + "logits/rejected": -0.20938460528850555, + "logps/chosen": -2.18269681930542, + "logps/rejected": -2.183464527130127, + "loss": 3.3206, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.826969146728516, + "rewards/margins": 0.007677173707634211, + "rewards/rejected": -21.834646224975586, + "step": 4695 + }, + { + "epoch": 0.15841450672419025, + "grad_norm": 20.799894332885742, + "learning_rate": 9.896427560649965e-07, + "logits/chosen": -0.2771221399307251, + "logits/rejected": -0.3630429804325104, + "logps/chosen": -1.9276962280273438, + "logps/rejected": -1.8691962957382202, + "loss": 3.7554, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.276962280273438, + "rewards/margins": -0.5849997401237488, + "rewards/rejected": -18.69196128845215, + "step": 4700 + }, + { + "epoch": 0.1585830327951734, + "grad_norm": 26.0824031829834, + "learning_rate": 9.895831137146318e-07, + "logits/chosen": -0.4591868817806244, + "logits/rejected": -0.49390801787376404, + "logps/chosen": -1.8923221826553345, + "logps/rejected": -1.9390461444854736, + "loss": 2.8115, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.923221588134766, + "rewards/margins": 0.4672381281852722, + "rewards/rejected": -19.390460968017578, + "step": 4705 + }, + { + "epoch": 0.1587515588661566, + "grad_norm": 17.884014129638672, + "learning_rate": 9.8952330193884e-07, + "logits/chosen": -0.7502976059913635, + "logits/rejected": -0.45593467354774475, + "logps/chosen": -1.6914688348770142, + "logps/rejected": -1.630499243736267, + "loss": 3.7795, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.914688110351562, + "rewards/margins": -0.609697163105011, + "rewards/rejected": -16.304988861083984, + "step": 4710 + }, + { + "epoch": 0.15892008493713977, + "grad_norm": 21.209936141967773, + "learning_rate": 9.894633207583202e-07, + "logits/chosen": -0.7454978227615356, + "logits/rejected": -0.7061313390731812, + "logps/chosen": -1.8046905994415283, + "logps/rejected": -1.8066291809082031, + "loss": 3.2366, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.046905517578125, + "rewards/margins": 0.019386673346161842, + "rewards/rejected": -18.066293716430664, + "step": 4715 + }, + { + "epoch": 0.15908861100812297, + "grad_norm": 23.273921966552734, + "learning_rate": 9.894031701938287e-07, + "logits/chosen": -0.47708067297935486, + "logits/rejected": -0.5105594396591187, + "logps/chosen": -1.8260581493377686, + "logps/rejected": -1.7911970615386963, + "loss": 4.0011, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.260583877563477, + "rewards/margins": -0.34861230850219727, + "rewards/rejected": -17.911970138549805, + "step": 4720 + }, + { + "epoch": 0.15925713707910613, + "grad_norm": 31.692359924316406, + "learning_rate": 9.89342850266182e-07, + "logits/chosen": -0.05979665368795395, + "logits/rejected": -0.1203019842505455, + "logps/chosen": -2.0891809463500977, + "logps/rejected": -2.055260181427002, + "loss": 3.5457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.89181137084961, + "rewards/margins": -0.3392105996608734, + "rewards/rejected": -20.552600860595703, + "step": 4725 + }, + { + "epoch": 0.15942566315008933, + "grad_norm": 29.511369705200195, + "learning_rate": 9.892823609962543e-07, + "logits/chosen": -0.5264648199081421, + "logits/rejected": -0.5226877331733704, + "logps/chosen": -1.7901853322982788, + "logps/rejected": -1.7577362060546875, + "loss": 3.4421, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.90185546875, + "rewards/margins": -0.32449159026145935, + "rewards/rejected": -17.577362060546875, + "step": 4730 + }, + { + "epoch": 0.1595941892210725, + "grad_norm": 69.38639068603516, + "learning_rate": 9.89221702404978e-07, + "logits/chosen": -0.338792622089386, + "logits/rejected": -0.20237763226032257, + "logps/chosen": -1.7657020092010498, + "logps/rejected": -1.7486886978149414, + "loss": 3.3845, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.65702247619629, + "rewards/margins": -0.1701340675354004, + "rewards/rejected": -17.486886978149414, + "step": 4735 + }, + { + "epoch": 0.15976271529205568, + "grad_norm": 18.09201431274414, + "learning_rate": 9.891608745133453e-07, + "logits/chosen": -0.45816025137901306, + "logits/rejected": -0.4524189829826355, + "logps/chosen": -1.8241355419158936, + "logps/rejected": -1.8072960376739502, + "loss": 3.4098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.241355895996094, + "rewards/margins": -0.16839809715747833, + "rewards/rejected": -18.07295799255371, + "step": 4740 + }, + { + "epoch": 0.15993124136303885, + "grad_norm": 12.629457473754883, + "learning_rate": 9.890998773424061e-07, + "logits/chosen": -0.7667060494422913, + "logits/rejected": -0.8384987711906433, + "logps/chosen": -1.699568510055542, + "logps/rejected": -1.9147104024887085, + "loss": 1.3524, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.995685577392578, + "rewards/margins": 2.1514182090759277, + "rewards/rejected": -19.147104263305664, + "step": 4745 + }, + { + "epoch": 0.16009976743402204, + "grad_norm": 25.00735092163086, + "learning_rate": 9.890387109132692e-07, + "logits/chosen": -0.6868584752082825, + "logits/rejected": -0.7392014265060425, + "logps/chosen": -1.9946911334991455, + "logps/rejected": -2.007953643798828, + "loss": 3.0638, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.946910858154297, + "rewards/margins": 0.13262434303760529, + "rewards/rejected": -20.07953453063965, + "step": 4750 + }, + { + "epoch": 0.1602682935050052, + "grad_norm": 26.735124588012695, + "learning_rate": 9.889773752471017e-07, + "logits/chosen": -0.16993948817253113, + "logits/rejected": -0.21659204363822937, + "logps/chosen": -1.8886897563934326, + "logps/rejected": -1.9271786212921143, + "loss": 3.3185, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.886898040771484, + "rewards/margins": 0.3848879933357239, + "rewards/rejected": -19.271785736083984, + "step": 4755 + }, + { + "epoch": 0.1604368195759884, + "grad_norm": 21.052892684936523, + "learning_rate": 9.889158703651296e-07, + "logits/chosen": -0.460991233587265, + "logits/rejected": -0.5180394649505615, + "logps/chosen": -1.4519951343536377, + "logps/rejected": -1.507206678390503, + "loss": 2.5809, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.519950866699219, + "rewards/margins": 0.552115797996521, + "rewards/rejected": -15.072067260742188, + "step": 4760 + }, + { + "epoch": 0.1606053456469716, + "grad_norm": 38.17951965332031, + "learning_rate": 9.888541962886371e-07, + "logits/chosen": -0.2759491503238678, + "logits/rejected": -0.2980247735977173, + "logps/chosen": -1.7075881958007812, + "logps/rejected": -1.8610565662384033, + "loss": 2.0582, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.075881958007812, + "rewards/margins": 1.5346832275390625, + "rewards/rejected": -18.610565185546875, + "step": 4765 + }, + { + "epoch": 0.16077387171795476, + "grad_norm": 27.016674041748047, + "learning_rate": 9.887923530389676e-07, + "logits/chosen": -0.5326557159423828, + "logits/rejected": -0.6520851850509644, + "logps/chosen": -2.017056941986084, + "logps/rejected": -1.8724479675292969, + "loss": 4.602, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.17057228088379, + "rewards/margins": -1.4460914134979248, + "rewards/rejected": -18.7244815826416, + "step": 4770 + }, + { + "epoch": 0.16094239778893796, + "grad_norm": 21.01801300048828, + "learning_rate": 9.887303406375224e-07, + "logits/chosen": -0.3734387159347534, + "logits/rejected": -0.3648655116558075, + "logps/chosen": -1.9372440576553345, + "logps/rejected": -1.8856014013290405, + "loss": 3.5991, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.3724422454834, + "rewards/margins": -0.5164254903793335, + "rewards/rejected": -18.856014251708984, + "step": 4775 + }, + { + "epoch": 0.16111092385992112, + "grad_norm": 21.466033935546875, + "learning_rate": 9.886681591057613e-07, + "logits/chosen": -0.06261344254016876, + "logits/rejected": -0.1325574368238449, + "logps/chosen": -2.264781951904297, + "logps/rejected": -2.4676618576049805, + "loss": 1.8788, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.647817611694336, + "rewards/margins": 2.028799533843994, + "rewards/rejected": -24.676618576049805, + "step": 4780 + }, + { + "epoch": 0.16127944993090432, + "grad_norm": 24.781084060668945, + "learning_rate": 9.886058084652032e-07, + "logits/chosen": -0.6147192716598511, + "logits/rejected": -0.6076455116271973, + "logps/chosen": -1.492680549621582, + "logps/rejected": -1.502992868423462, + "loss": 3.0557, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -14.92680549621582, + "rewards/margins": 0.10312385857105255, + "rewards/rejected": -15.029928207397461, + "step": 4785 + }, + { + "epoch": 0.16144797600188748, + "grad_norm": 276.31402587890625, + "learning_rate": 9.885432887374252e-07, + "logits/chosen": -0.6703779101371765, + "logits/rejected": -0.6189125180244446, + "logps/chosen": -2.240828275680542, + "logps/rejected": -2.2110419273376465, + "loss": 3.3497, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.408281326293945, + "rewards/margins": -0.2978610098361969, + "rewards/rejected": -22.110422134399414, + "step": 4790 + }, + { + "epoch": 0.16161650207287068, + "grad_norm": 24.567428588867188, + "learning_rate": 9.884805999440627e-07, + "logits/chosen": -0.40758800506591797, + "logits/rejected": -0.35104599595069885, + "logps/chosen": -2.059055805206299, + "logps/rejected": -1.9487988948822021, + "loss": 4.1751, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.59055519104004, + "rewards/margins": -1.1025663614273071, + "rewards/rejected": -19.487987518310547, + "step": 4795 + }, + { + "epoch": 0.16178502814385384, + "grad_norm": 36.26649856567383, + "learning_rate": 9.8841774210681e-07, + "logits/chosen": -0.3545742928981781, + "logits/rejected": -0.39434343576431274, + "logps/chosen": -1.829400658607483, + "logps/rejected": -1.7184016704559326, + "loss": 4.145, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.294008255004883, + "rewards/margins": -1.1099907159805298, + "rewards/rejected": -17.184017181396484, + "step": 4800 + }, + { + "epoch": 0.16178502814385384, + "eval_logits/chosen": -0.7746235728263855, + "eval_logits/rejected": -0.7902061939239502, + "eval_logps/chosen": -1.708162784576416, + "eval_logps/rejected": -1.7237504720687866, + "eval_loss": 3.340670585632324, + "eval_rewards/accuracies": 0.5099999904632568, + "eval_rewards/chosen": -17.081628799438477, + "eval_rewards/margins": 0.15587686002254486, + "eval_rewards/rejected": -17.237504959106445, + "eval_runtime": 12.8967, + "eval_samples_per_second": 7.754, + "eval_steps_per_second": 1.938, + "step": 4800 + }, + { + "epoch": 0.16195355421483704, + "grad_norm": 27.378616333007812, + "learning_rate": 9.883547152474195e-07, + "logits/chosen": -0.25638216733932495, + "logits/rejected": -0.1468086540699005, + "logps/chosen": -1.7783082723617554, + "logps/rejected": -1.8178768157958984, + "loss": 3.2298, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.783084869384766, + "rewards/margins": 0.3956846594810486, + "rewards/rejected": -18.178768157958984, + "step": 4805 + }, + { + "epoch": 0.1621220802858202, + "grad_norm": 20.546716690063477, + "learning_rate": 9.882915193877024e-07, + "logits/chosen": -0.921650767326355, + "logits/rejected": -0.7422033548355103, + "logps/chosen": -1.7827374935150146, + "logps/rejected": -1.8927743434906006, + "loss": 2.4601, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.827375411987305, + "rewards/margins": 1.1003668308258057, + "rewards/rejected": -18.927743911743164, + "step": 4810 + }, + { + "epoch": 0.1622906063568034, + "grad_norm": 27.94754409790039, + "learning_rate": 9.882281545495285e-07, + "logits/chosen": -0.07513687759637833, + "logits/rejected": -0.22319336235523224, + "logps/chosen": -1.5725983381271362, + "logps/rejected": -1.6626056432724, + "loss": 2.6133, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.725984573364258, + "rewards/margins": 0.9000707864761353, + "rewards/rejected": -16.626056671142578, + "step": 4815 + }, + { + "epoch": 0.1624591324277866, + "grad_norm": 28.80483055114746, + "learning_rate": 9.881646207548257e-07, + "logits/chosen": -0.7495092153549194, + "logits/rejected": -0.6132189631462097, + "logps/chosen": -1.7083728313446045, + "logps/rejected": -1.913587212562561, + "loss": 3.0995, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.083728790283203, + "rewards/margins": 2.052143096923828, + "rewards/rejected": -19.1358699798584, + "step": 4820 + }, + { + "epoch": 0.16262765849876976, + "grad_norm": 20.03661346435547, + "learning_rate": 9.881009180255807e-07, + "logits/chosen": -0.4534785747528076, + "logits/rejected": -0.32503554224967957, + "logps/chosen": -1.5937612056732178, + "logps/rejected": -1.6601520776748657, + "loss": 2.6911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.937612533569336, + "rewards/margins": 0.6639073491096497, + "rewards/rejected": -16.601520538330078, + "step": 4825 + }, + { + "epoch": 0.16279618456975295, + "grad_norm": 26.125856399536133, + "learning_rate": 9.88037046383838e-07, + "logits/chosen": -0.3081539571285248, + "logits/rejected": -0.33038654923439026, + "logps/chosen": -1.4090408086776733, + "logps/rejected": -1.528576135635376, + "loss": 2.7093, + "rewards/accuracies": 0.5, + "rewards/chosen": -14.090408325195312, + "rewards/margins": 1.1953526735305786, + "rewards/rejected": -15.285760879516602, + "step": 4830 + }, + { + "epoch": 0.16296471064073612, + "grad_norm": 28.715280532836914, + "learning_rate": 9.879730058517017e-07, + "logits/chosen": -0.3715924322605133, + "logits/rejected": -0.35126742720603943, + "logps/chosen": -1.8122129440307617, + "logps/rejected": -1.9514148235321045, + "loss": 2.1753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.122127532958984, + "rewards/margins": 1.3920185565948486, + "rewards/rejected": -19.514148712158203, + "step": 4835 + }, + { + "epoch": 0.1631332367117193, + "grad_norm": 34.09306716918945, + "learning_rate": 9.879087964513335e-07, + "logits/chosen": -0.27698105573654175, + "logits/rejected": -0.3473663926124573, + "logps/chosen": -2.0653817653656006, + "logps/rejected": -2.1154792308807373, + "loss": 2.7881, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.653818130493164, + "rewards/margins": 0.5009748339653015, + "rewards/rejected": -21.15479278564453, + "step": 4840 + }, + { + "epoch": 0.16330176278270248, + "grad_norm": 24.596439361572266, + "learning_rate": 9.878444182049537e-07, + "logits/chosen": -0.7519720792770386, + "logits/rejected": -0.7053220868110657, + "logps/chosen": -1.832969069480896, + "logps/rejected": -1.7468926906585693, + "loss": 3.9439, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.32969093322754, + "rewards/margins": -0.8607624173164368, + "rewards/rejected": -17.468929290771484, + "step": 4845 + }, + { + "epoch": 0.16347028885368567, + "grad_norm": 21.327348709106445, + "learning_rate": 9.87779871134841e-07, + "logits/chosen": -0.3140432834625244, + "logits/rejected": -0.38714686036109924, + "logps/chosen": -1.9222043752670288, + "logps/rejected": -1.9501903057098389, + "loss": 3.0331, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.222043991088867, + "rewards/margins": 0.27986058592796326, + "rewards/rejected": -19.501903533935547, + "step": 4850 + }, + { + "epoch": 0.16363881492466884, + "grad_norm": 17.42647933959961, + "learning_rate": 9.877151552633327e-07, + "logits/chosen": -0.388538658618927, + "logits/rejected": -0.17918941378593445, + "logps/chosen": -1.743137001991272, + "logps/rejected": -1.9893146753311157, + "loss": 3.62, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.43136978149414, + "rewards/margins": 2.4617762565612793, + "rewards/rejected": -19.893146514892578, + "step": 4855 + }, + { + "epoch": 0.16380734099565203, + "grad_norm": 51.683448791503906, + "learning_rate": 9.876502706128242e-07, + "logits/chosen": -0.4844183325767517, + "logits/rejected": -0.7024241089820862, + "logps/chosen": -1.7798906564712524, + "logps/rejected": -1.8352922201156616, + "loss": 3.1281, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.798908233642578, + "rewards/margins": 0.5540148019790649, + "rewards/rejected": -18.352920532226562, + "step": 4860 + }, + { + "epoch": 0.1639758670666352, + "grad_norm": 24.889793395996094, + "learning_rate": 9.875852172057699e-07, + "logits/chosen": -0.8120881915092468, + "logits/rejected": -0.7175842523574829, + "logps/chosen": -1.6519649028778076, + "logps/rejected": -1.7745163440704346, + "loss": 2.6253, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.519649505615234, + "rewards/margins": 1.2255139350891113, + "rewards/rejected": -17.745162963867188, + "step": 4865 + }, + { + "epoch": 0.1641443931376184, + "grad_norm": 37.9599609375, + "learning_rate": 9.87519995064682e-07, + "logits/chosen": -0.24003906548023224, + "logits/rejected": -0.4899943470954895, + "logps/chosen": -2.0363316535949707, + "logps/rejected": -1.8726288080215454, + "loss": 4.9414, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.36331558227539, + "rewards/margins": -1.637028455734253, + "rewards/rejected": -18.726289749145508, + "step": 4870 + }, + { + "epoch": 0.16431291920860158, + "grad_norm": 14.678876876831055, + "learning_rate": 9.874546042121313e-07, + "logits/chosen": -0.6595714688301086, + "logits/rejected": -0.593315601348877, + "logps/chosen": -1.868060827255249, + "logps/rejected": -1.9109785556793213, + "loss": 3.0881, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.68060874938965, + "rewards/margins": 0.429177850484848, + "rewards/rejected": -19.109785079956055, + "step": 4875 + }, + { + "epoch": 0.16448144527958475, + "grad_norm": 36.223548889160156, + "learning_rate": 9.873890446707469e-07, + "logits/chosen": -0.40664395689964294, + "logits/rejected": -0.4028412699699402, + "logps/chosen": -1.7218236923217773, + "logps/rejected": -1.8740679025650024, + "loss": 2.015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.218236923217773, + "rewards/margins": 1.5224418640136719, + "rewards/rejected": -18.740680694580078, + "step": 4880 + }, + { + "epoch": 0.16464997135056794, + "grad_norm": 29.82371711730957, + "learning_rate": 9.873233164632166e-07, + "logits/chosen": -0.4903503954410553, + "logits/rejected": -0.4249725341796875, + "logps/chosen": -1.972914457321167, + "logps/rejected": -2.142824649810791, + "loss": 2.1319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.729143142700195, + "rewards/margins": 1.6991031169891357, + "rewards/rejected": -21.428245544433594, + "step": 4885 + }, + { + "epoch": 0.1648184974215511, + "grad_norm": 18.373544692993164, + "learning_rate": 9.872574196122863e-07, + "logits/chosen": -0.42714181542396545, + "logits/rejected": -0.5435231924057007, + "logps/chosen": -1.7797390222549438, + "logps/rejected": -1.795607328414917, + "loss": 3.3615, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.79738998413086, + "rewards/margins": 0.1586824357509613, + "rewards/rejected": -17.956073760986328, + "step": 4890 + }, + { + "epoch": 0.1649870234925343, + "grad_norm": 25.699377059936523, + "learning_rate": 9.871913541407602e-07, + "logits/chosen": -0.8359493017196655, + "logits/rejected": -1.0766808986663818, + "logps/chosen": -1.8503162860870361, + "logps/rejected": -1.7836980819702148, + "loss": 3.7392, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.503162384033203, + "rewards/margins": -0.6661826968193054, + "rewards/rejected": -17.83698272705078, + "step": 4895 + }, + { + "epoch": 0.16515554956351747, + "grad_norm": 24.661052703857422, + "learning_rate": 9.87125120071501e-07, + "logits/chosen": -0.6496170163154602, + "logits/rejected": -0.6843874454498291, + "logps/chosen": -1.588417410850525, + "logps/rejected": -1.6372764110565186, + "loss": 2.6524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.884173393249512, + "rewards/margins": 0.48858919739723206, + "rewards/rejected": -16.37276268005371, + "step": 4900 + }, + { + "epoch": 0.16532407563450066, + "grad_norm": 32.23655319213867, + "learning_rate": 9.870587174274297e-07, + "logits/chosen": -0.5077248811721802, + "logits/rejected": -0.4583125114440918, + "logps/chosen": -1.8581647872924805, + "logps/rejected": -1.850717544555664, + "loss": 3.3088, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.581645965576172, + "rewards/margins": -0.07447147369384766, + "rewards/rejected": -18.50717544555664, + "step": 4905 + }, + { + "epoch": 0.16549260170548383, + "grad_norm": 20.224699020385742, + "learning_rate": 9.869921462315256e-07, + "logits/chosen": -0.49347972869873047, + "logits/rejected": -0.45034274458885193, + "logps/chosen": -1.4449265003204346, + "logps/rejected": -1.6118764877319336, + "loss": 1.9775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.449264526367188, + "rewards/margins": 1.6694999933242798, + "rewards/rejected": -16.118764877319336, + "step": 4910 + }, + { + "epoch": 0.16566112777646702, + "grad_norm": 20.77910041809082, + "learning_rate": 9.869254065068265e-07, + "logits/chosen": -0.5645862817764282, + "logits/rejected": -0.7167826294898987, + "logps/chosen": -1.7465883493423462, + "logps/rejected": -1.704671859741211, + "loss": 3.5355, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.465885162353516, + "rewards/margins": -0.41916388273239136, + "rewards/rejected": -17.04671859741211, + "step": 4915 + }, + { + "epoch": 0.1658296538474502, + "grad_norm": 33.12343978881836, + "learning_rate": 9.868584982764282e-07, + "logits/chosen": -0.4147886335849762, + "logits/rejected": -0.5667712092399597, + "logps/chosen": -1.6357110738754272, + "logps/rejected": -1.6382662057876587, + "loss": 3.2923, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.35711097717285, + "rewards/margins": 0.025551462545990944, + "rewards/rejected": -16.38266372680664, + "step": 4920 + }, + { + "epoch": 0.16599817991843338, + "grad_norm": 99.80087280273438, + "learning_rate": 9.867914215634852e-07, + "logits/chosen": -0.45811495184898376, + "logits/rejected": -0.39101505279541016, + "logps/chosen": -2.020596981048584, + "logps/rejected": -2.055602550506592, + "loss": 2.8389, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.20596694946289, + "rewards/margins": 0.35005730390548706, + "rewards/rejected": -20.5560245513916, + "step": 4925 + }, + { + "epoch": 0.16616670598941657, + "grad_norm": 25.863933563232422, + "learning_rate": 9.867241763912098e-07, + "logits/chosen": -0.879712700843811, + "logits/rejected": -0.911721408367157, + "logps/chosen": -1.6381375789642334, + "logps/rejected": -1.5870459079742432, + "loss": 3.6379, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.38137435913086, + "rewards/margins": -0.5109177827835083, + "rewards/rejected": -15.870458602905273, + "step": 4930 + }, + { + "epoch": 0.16633523206039974, + "grad_norm": 78.96031188964844, + "learning_rate": 9.866567627828735e-07, + "logits/chosen": -0.7218554615974426, + "logits/rejected": -0.7632007598876953, + "logps/chosen": -2.043527841567993, + "logps/rejected": -1.9857170581817627, + "loss": 3.6717, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.43527603149414, + "rewards/margins": -0.5781074166297913, + "rewards/rejected": -19.85717010498047, + "step": 4935 + }, + { + "epoch": 0.16650375813138293, + "grad_norm": 21.739521026611328, + "learning_rate": 9.865891807618048e-07, + "logits/chosen": -0.5816560983657837, + "logits/rejected": -0.528420090675354, + "logps/chosen": -1.5282984972000122, + "logps/rejected": -1.672467589378357, + "loss": 1.9666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.282984733581543, + "rewards/margins": 1.4416911602020264, + "rewards/rejected": -16.72467613220215, + "step": 4940 + }, + { + "epoch": 0.1666722842023661, + "grad_norm": 36.299537658691406, + "learning_rate": 9.865214303513916e-07, + "logits/chosen": -0.32008111476898193, + "logits/rejected": -0.10507240146398544, + "logps/chosen": -2.123836040496826, + "logps/rejected": -2.2200088500976562, + "loss": 2.7891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.238357543945312, + "rewards/margins": 0.9617301821708679, + "rewards/rejected": -22.200088500976562, + "step": 4945 + }, + { + "epoch": 0.1668408102733493, + "grad_norm": 56.05241012573242, + "learning_rate": 9.864535115750795e-07, + "logits/chosen": -0.21717092394828796, + "logits/rejected": -0.2897348403930664, + "logps/chosen": -1.9129555225372314, + "logps/rejected": -1.926944375038147, + "loss": 3.1466, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.12955665588379, + "rewards/margins": 0.13988880813121796, + "rewards/rejected": -19.26944351196289, + "step": 4950 + }, + { + "epoch": 0.16700933634433246, + "grad_norm": 21.449539184570312, + "learning_rate": 9.863854244563725e-07, + "logits/chosen": -0.43917790055274963, + "logits/rejected": -0.44615238904953003, + "logps/chosen": -1.9873645305633545, + "logps/rejected": -2.0402169227600098, + "loss": 2.9082, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.873645782470703, + "rewards/margins": 0.5285249948501587, + "rewards/rejected": -20.402172088623047, + "step": 4955 + }, + { + "epoch": 0.16717786241531565, + "grad_norm": 37.11687469482422, + "learning_rate": 9.86317169018833e-07, + "logits/chosen": -0.5457051396369934, + "logits/rejected": -0.5516785979270935, + "logps/chosen": -1.803820252418518, + "logps/rejected": -1.8196312189102173, + "loss": 3.1856, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.038204193115234, + "rewards/margins": 0.1581093817949295, + "rewards/rejected": -18.19631004333496, + "step": 4960 + }, + { + "epoch": 0.16734638848629882, + "grad_norm": 25.299577713012695, + "learning_rate": 9.862487452860814e-07, + "logits/chosen": -0.3777625262737274, + "logits/rejected": -0.3700116276741028, + "logps/chosen": -1.6833274364471436, + "logps/rejected": -1.6934551000595093, + "loss": 3.1463, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.833276748657227, + "rewards/margins": 0.10127668082714081, + "rewards/rejected": -16.934551239013672, + "step": 4965 + }, + { + "epoch": 0.167514914557282, + "grad_norm": 25.97972297668457, + "learning_rate": 9.861801532817965e-07, + "logits/chosen": -0.7144454717636108, + "logits/rejected": -0.647177517414093, + "logps/chosen": -1.6258128881454468, + "logps/rejected": -1.7660753726959229, + "loss": 2.6313, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.258129119873047, + "rewards/margins": 1.4026236534118652, + "rewards/rejected": -17.660751342773438, + "step": 4970 + }, + { + "epoch": 0.16768344062826518, + "grad_norm": 16.2631778717041, + "learning_rate": 9.861113930297155e-07, + "logits/chosen": -0.34379979968070984, + "logits/rejected": -0.5184779763221741, + "logps/chosen": -1.8947486877441406, + "logps/rejected": -2.0445353984832764, + "loss": 3.0566, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.947486877441406, + "rewards/margins": 1.4978677034378052, + "rewards/rejected": -20.445354461669922, + "step": 4975 + }, + { + "epoch": 0.16785196669924837, + "grad_norm": 15.502371788024902, + "learning_rate": 9.86042464553633e-07, + "logits/chosen": -0.37498247623443604, + "logits/rejected": -0.42046207189559937, + "logps/chosen": -1.5052076578140259, + "logps/rejected": -1.6121822595596313, + "loss": 2.1299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.05207633972168, + "rewards/margins": 1.0697453022003174, + "rewards/rejected": -16.121822357177734, + "step": 4980 + }, + { + "epoch": 0.16802049277023157, + "grad_norm": 22.429950714111328, + "learning_rate": 9.859733678774031e-07, + "logits/chosen": -0.5181079506874084, + "logits/rejected": -0.4065426290035248, + "logps/chosen": -1.9371169805526733, + "logps/rejected": -2.3126235008239746, + "loss": 1.6999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.371170043945312, + "rewards/margins": 3.7550644874572754, + "rewards/rejected": -23.12623405456543, + "step": 4985 + }, + { + "epoch": 0.16818901884121473, + "grad_norm": 36.0225944519043, + "learning_rate": 9.859041030249372e-07, + "logits/chosen": -0.22574977576732635, + "logits/rejected": -0.2761825621128082, + "logps/chosen": -2.0685524940490723, + "logps/rejected": -1.9672033786773682, + "loss": 4.0854, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.685522079467773, + "rewards/margins": -1.0134881734848022, + "rewards/rejected": -19.672035217285156, + "step": 4990 + }, + { + "epoch": 0.16835754491219793, + "grad_norm": 78.74285125732422, + "learning_rate": 9.858346700202048e-07, + "logits/chosen": -0.24602890014648438, + "logits/rejected": -0.36132198572158813, + "logps/chosen": -2.4286651611328125, + "logps/rejected": -2.3484768867492676, + "loss": 3.8898, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.28664779663086, + "rewards/margins": -0.8018797636032104, + "rewards/rejected": -23.48476791381836, + "step": 4995 + }, + { + "epoch": 0.1685260709831811, + "grad_norm": 29.308990478515625, + "learning_rate": 9.857650688872345e-07, + "logits/chosen": -0.6732075214385986, + "logits/rejected": -0.6380990147590637, + "logps/chosen": -1.8012161254882812, + "logps/rejected": -1.7454330921173096, + "loss": 3.6902, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.012157440185547, + "rewards/margins": -0.5578286051750183, + "rewards/rejected": -17.454328536987305, + "step": 5000 + }, + { + "epoch": 0.16869459705416429, + "grad_norm": 19.303850173950195, + "learning_rate": 9.856952996501121e-07, + "logits/chosen": -0.5144712924957275, + "logits/rejected": -0.6592981219291687, + "logps/chosen": -1.752929925918579, + "logps/rejected": -1.8932254314422607, + "loss": 3.3654, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.529298782348633, + "rewards/margins": 1.402956247329712, + "rewards/rejected": -18.932254791259766, + "step": 5005 + }, + { + "epoch": 0.16886312312514745, + "grad_norm": 21.755300521850586, + "learning_rate": 9.856253623329822e-07, + "logits/chosen": -0.38438963890075684, + "logits/rejected": -0.384308397769928, + "logps/chosen": -1.6376289129257202, + "logps/rejected": -1.8839448690414429, + "loss": 1.7633, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.37628936767578, + "rewards/margins": 2.4631593227386475, + "rewards/rejected": -18.83945083618164, + "step": 5010 + }, + { + "epoch": 0.16903164919613065, + "grad_norm": 29.190107345581055, + "learning_rate": 9.855552569600473e-07, + "logits/chosen": -0.35856691002845764, + "logits/rejected": -0.40881720185279846, + "logps/chosen": -1.7899494171142578, + "logps/rejected": -1.7577186822891235, + "loss": 3.5591, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.899494171142578, + "rewards/margins": -0.3223080635070801, + "rewards/rejected": -17.577186584472656, + "step": 5015 + }, + { + "epoch": 0.1692001752671138, + "grad_norm": 10.551178932189941, + "learning_rate": 9.85484983555568e-07, + "logits/chosen": -0.6378435492515564, + "logits/rejected": -0.5291840434074402, + "logps/chosen": -1.5740575790405273, + "logps/rejected": -1.7817277908325195, + "loss": 1.8232, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.740574836730957, + "rewards/margins": 2.076702833175659, + "rewards/rejected": -17.817277908325195, + "step": 5020 + }, + { + "epoch": 0.169368701338097, + "grad_norm": 39.82679748535156, + "learning_rate": 9.854145421438634e-07, + "logits/chosen": -0.4193429946899414, + "logits/rejected": -0.6050506830215454, + "logps/chosen": -1.467621088027954, + "logps/rejected": -1.443719506263733, + "loss": 3.3306, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -14.676210403442383, + "rewards/margins": -0.2390153855085373, + "rewards/rejected": -14.43719482421875, + "step": 5025 + }, + { + "epoch": 0.16953722740908017, + "grad_norm": 37.75446701049805, + "learning_rate": 9.853439327493102e-07, + "logits/chosen": -0.5865007638931274, + "logits/rejected": -0.5626325607299805, + "logps/chosen": -1.9477989673614502, + "logps/rejected": -2.027604818344116, + "loss": 2.6758, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.477991104125977, + "rewards/margins": 0.7980586290359497, + "rewards/rejected": -20.276050567626953, + "step": 5030 + }, + { + "epoch": 0.16970575348006336, + "grad_norm": 18.804014205932617, + "learning_rate": 9.852731553963435e-07, + "logits/chosen": -0.3155723512172699, + "logits/rejected": -0.3093792200088501, + "logps/chosen": -2.113027572631836, + "logps/rejected": -1.9566431045532227, + "loss": 4.6724, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.130273818969727, + "rewards/margins": -1.5638428926467896, + "rewards/rejected": -19.56643295288086, + "step": 5035 + }, + { + "epoch": 0.16987427955104656, + "grad_norm": 47.76856231689453, + "learning_rate": 9.85202210109457e-07, + "logits/chosen": -0.4034988284111023, + "logits/rejected": -0.4541633725166321, + "logps/chosen": -1.8395111560821533, + "logps/rejected": -1.839477300643921, + "loss": 3.3137, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.395111083984375, + "rewards/margins": -0.00033893584623001516, + "rewards/rejected": -18.394771575927734, + "step": 5040 + }, + { + "epoch": 0.17004280562202972, + "grad_norm": 27.453750610351562, + "learning_rate": 9.851310969132017e-07, + "logits/chosen": -0.2835858464241028, + "logits/rejected": -0.3324558734893799, + "logps/chosen": -1.7421255111694336, + "logps/rejected": -1.6985328197479248, + "loss": 3.5587, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.421253204345703, + "rewards/margins": -0.4359270930290222, + "rewards/rejected": -16.985326766967773, + "step": 5045 + }, + { + "epoch": 0.17021133169301292, + "grad_norm": 24.93321418762207, + "learning_rate": 9.850598158321871e-07, + "logits/chosen": -0.6215084195137024, + "logits/rejected": -0.5685603618621826, + "logps/chosen": -1.6707217693328857, + "logps/rejected": -1.7257225513458252, + "loss": 2.6856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.707218170166016, + "rewards/margins": 0.5500091314315796, + "rewards/rejected": -17.257226943969727, + "step": 5050 + }, + { + "epoch": 0.17037985776399608, + "grad_norm": 11.612754821777344, + "learning_rate": 9.849883668910808e-07, + "logits/chosen": -0.5886046290397644, + "logits/rejected": -0.5103198289871216, + "logps/chosen": -1.879009485244751, + "logps/rejected": -1.9458684921264648, + "loss": 2.9753, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.790096282958984, + "rewards/margins": 0.6685900688171387, + "rewards/rejected": -19.45868492126465, + "step": 5055 + }, + { + "epoch": 0.17054838383497928, + "grad_norm": 37.67578887939453, + "learning_rate": 9.849167501146087e-07, + "logits/chosen": -0.46718257665634155, + "logits/rejected": -0.6271299123764038, + "logps/chosen": -1.9356199502944946, + "logps/rejected": -1.833788514137268, + "loss": 4.194, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.356197357177734, + "rewards/margins": -1.0183136463165283, + "rewards/rejected": -18.3378849029541, + "step": 5060 + }, + { + "epoch": 0.17071690990596244, + "grad_norm": 90.10015106201172, + "learning_rate": 9.848449655275542e-07, + "logits/chosen": -0.508590817451477, + "logits/rejected": -0.489675909280777, + "logps/chosen": -2.6385538578033447, + "logps/rejected": -2.5385284423828125, + "loss": 4.5381, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.385540008544922, + "rewards/margins": -1.0002546310424805, + "rewards/rejected": -25.385284423828125, + "step": 5065 + }, + { + "epoch": 0.17088543597694564, + "grad_norm": 18.061155319213867, + "learning_rate": 9.847730131547592e-07, + "logits/chosen": -0.7201946973800659, + "logits/rejected": -0.6724601984024048, + "logps/chosen": -1.8912174701690674, + "logps/rejected": -1.9988510608673096, + "loss": 2.908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.912174224853516, + "rewards/margins": 1.0763366222381592, + "rewards/rejected": -19.988510131835938, + "step": 5070 + }, + { + "epoch": 0.1710539620479288, + "grad_norm": 18.397972106933594, + "learning_rate": 9.847008930211238e-07, + "logits/chosen": -0.6509224772453308, + "logits/rejected": -0.5626755952835083, + "logps/chosen": -1.779077172279358, + "logps/rejected": -1.9971240758895874, + "loss": 2.4914, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.790771484375, + "rewards/margins": 2.180471420288086, + "rewards/rejected": -19.971242904663086, + "step": 5075 + }, + { + "epoch": 0.171222488118912, + "grad_norm": 43.98146057128906, + "learning_rate": 9.846286051516055e-07, + "logits/chosen": -0.7050510048866272, + "logits/rejected": -0.6167488098144531, + "logps/chosen": -1.776510238647461, + "logps/rejected": -1.702623724937439, + "loss": 3.8085, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.765100479125977, + "rewards/margins": -0.7388646006584167, + "rewards/rejected": -17.02623748779297, + "step": 5080 + }, + { + "epoch": 0.17139101418989516, + "grad_norm": 32.1152458190918, + "learning_rate": 9.84556149571221e-07, + "logits/chosen": -0.5005149841308594, + "logits/rejected": -0.6876319050788879, + "logps/chosen": -1.667649507522583, + "logps/rejected": -1.7641193866729736, + "loss": 2.361, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.676494598388672, + "rewards/margins": 0.9646992683410645, + "rewards/rejected": -17.641193389892578, + "step": 5085 + }, + { + "epoch": 0.17155954026087836, + "grad_norm": 58.48193359375, + "learning_rate": 9.844835263050435e-07, + "logits/chosen": -0.5897291898727417, + "logits/rejected": -0.6116207242012024, + "logps/chosen": -1.7926311492919922, + "logps/rejected": -1.6217567920684814, + "loss": 4.739, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.926311492919922, + "rewards/margins": -1.7087459564208984, + "rewards/rejected": -16.217565536499023, + "step": 5090 + }, + { + "epoch": 0.17172806633186155, + "grad_norm": 87.55854797363281, + "learning_rate": 9.844107353782054e-07, + "logits/chosen": -0.3844572901725769, + "logits/rejected": -0.7152290940284729, + "logps/chosen": -1.977447509765625, + "logps/rejected": -1.8635094165802002, + "loss": 4.4243, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.77447509765625, + "rewards/margins": -1.139381766319275, + "rewards/rejected": -18.63509178161621, + "step": 5095 + }, + { + "epoch": 0.17189659240284472, + "grad_norm": 30.744888305664062, + "learning_rate": 9.843377768158971e-07, + "logits/chosen": -0.12349516153335571, + "logits/rejected": -0.04538143798708916, + "logps/chosen": -2.1029043197631836, + "logps/rejected": -2.2611899375915527, + "loss": 2.5362, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.029043197631836, + "rewards/margins": 1.5828584432601929, + "rewards/rejected": -22.611900329589844, + "step": 5100 + }, + { + "epoch": 0.1720651184738279, + "grad_norm": 16.7409610748291, + "learning_rate": 9.842646506433663e-07, + "logits/chosen": -0.7018550634384155, + "logits/rejected": -0.5086437463760376, + "logps/chosen": -1.8820219039916992, + "logps/rejected": -1.940272331237793, + "loss": 2.9036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.820220947265625, + "rewards/margins": 0.5825031995773315, + "rewards/rejected": -19.402721405029297, + "step": 5105 + }, + { + "epoch": 0.17223364454481108, + "grad_norm": 24.791820526123047, + "learning_rate": 9.84191356885919e-07, + "logits/chosen": -0.44258204102516174, + "logits/rejected": -0.49366721510887146, + "logps/chosen": -1.5188804864883423, + "logps/rejected": -1.4613652229309082, + "loss": 3.6738, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.188802719116211, + "rewards/margins": -0.5751511454582214, + "rewards/rejected": -14.613652229309082, + "step": 5110 + }, + { + "epoch": 0.17240217061579427, + "grad_norm": 38.88006591796875, + "learning_rate": 9.841178955689197e-07, + "logits/chosen": -0.4519910216331482, + "logits/rejected": -0.45148682594299316, + "logps/chosen": -1.992713212966919, + "logps/rejected": -2.0434980392456055, + "loss": 3.3871, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.92713165283203, + "rewards/margins": 0.5078484416007996, + "rewards/rejected": -20.434978485107422, + "step": 5115 + }, + { + "epoch": 0.17257069668677744, + "grad_norm": 20.674013137817383, + "learning_rate": 9.840442667177902e-07, + "logits/chosen": -0.4431692957878113, + "logits/rejected": -0.42282018065452576, + "logps/chosen": -1.7422406673431396, + "logps/rejected": -1.968392014503479, + "loss": 1.8286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.422407150268555, + "rewards/margins": 2.2615127563476562, + "rewards/rejected": -19.68391990661621, + "step": 5120 + }, + { + "epoch": 0.17273922275776063, + "grad_norm": 22.77712631225586, + "learning_rate": 9.839704703580104e-07, + "logits/chosen": -0.5449277758598328, + "logits/rejected": -0.4195954203605652, + "logps/chosen": -1.7473876476287842, + "logps/rejected": -1.7471768856048584, + "loss": 3.135, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.473878860473633, + "rewards/margins": -0.002109623048454523, + "rewards/rejected": -17.47176742553711, + "step": 5125 + }, + { + "epoch": 0.1729077488287438, + "grad_norm": 27.39888572692871, + "learning_rate": 9.838965065151185e-07, + "logits/chosen": -0.5343486070632935, + "logits/rejected": -0.5076795816421509, + "logps/chosen": -1.7901744842529297, + "logps/rejected": -1.8608520030975342, + "loss": 2.9082, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.901744842529297, + "rewards/margins": 0.7067748308181763, + "rewards/rejected": -18.608518600463867, + "step": 5130 + }, + { + "epoch": 0.173076274899727, + "grad_norm": 17.52597427368164, + "learning_rate": 9.838223752147105e-07, + "logits/chosen": -0.1843159943819046, + "logits/rejected": -0.2545137405395508, + "logps/chosen": -1.8627382516860962, + "logps/rejected": -1.9470560550689697, + "loss": 2.4678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.627382278442383, + "rewards/margins": 0.8431784510612488, + "rewards/rejected": -19.47056007385254, + "step": 5135 + }, + { + "epoch": 0.17324480097071016, + "grad_norm": 35.71249008178711, + "learning_rate": 9.837480764824404e-07, + "logits/chosen": -0.557886004447937, + "logits/rejected": -0.3560100197792053, + "logps/chosen": -1.9558770656585693, + "logps/rejected": -1.8917385339736938, + "loss": 3.773, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.55877113342285, + "rewards/margins": -0.641386091709137, + "rewards/rejected": -18.91738510131836, + "step": 5140 + }, + { + "epoch": 0.17341332704169335, + "grad_norm": 7.512803077697754, + "learning_rate": 9.836736103440199e-07, + "logits/chosen": -0.22889885306358337, + "logits/rejected": -0.15423983335494995, + "logps/chosen": -2.0584750175476074, + "logps/rejected": -2.3373608589172363, + "loss": 2.4755, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.584747314453125, + "rewards/margins": 2.788860559463501, + "rewards/rejected": -23.373611450195312, + "step": 5145 + }, + { + "epoch": 0.17358185311267654, + "grad_norm": 26.877891540527344, + "learning_rate": 9.835989768252188e-07, + "logits/chosen": -0.718030571937561, + "logits/rejected": -0.7842522859573364, + "logps/chosen": -1.863952875137329, + "logps/rejected": -1.868159532546997, + "loss": 3.3168, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.639530181884766, + "rewards/margins": 0.04206543043255806, + "rewards/rejected": -18.68159294128418, + "step": 5150 + }, + { + "epoch": 0.1737503791836597, + "grad_norm": 13.796395301818848, + "learning_rate": 9.835241759518648e-07, + "logits/chosen": -0.6250889301300049, + "logits/rejected": -0.5135878324508667, + "logps/chosen": -2.1139869689941406, + "logps/rejected": -2.2198023796081543, + "loss": 2.6164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.139867782592773, + "rewards/margins": 1.0581531524658203, + "rewards/rejected": -22.198020935058594, + "step": 5155 + }, + { + "epoch": 0.1739189052546429, + "grad_norm": 32.31558609008789, + "learning_rate": 9.834492077498438e-07, + "logits/chosen": -0.4362711012363434, + "logits/rejected": -0.2519915997982025, + "logps/chosen": -2.14894437789917, + "logps/rejected": -2.1211888790130615, + "loss": 3.382, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.489444732666016, + "rewards/margins": -0.27755576372146606, + "rewards/rejected": -21.21188735961914, + "step": 5160 + }, + { + "epoch": 0.17408743132562607, + "grad_norm": 39.9125862121582, + "learning_rate": 9.833740722450989e-07, + "logits/chosen": -0.18216952681541443, + "logits/rejected": -0.32020363211631775, + "logps/chosen": -1.8249847888946533, + "logps/rejected": -2.1250081062316895, + "loss": 2.6139, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.249849319458008, + "rewards/margins": 3.0002331733703613, + "rewards/rejected": -21.250080108642578, + "step": 5165 + }, + { + "epoch": 0.17425595739660926, + "grad_norm": 61.76107406616211, + "learning_rate": 9.832987694636318e-07, + "logits/chosen": -0.7317668795585632, + "logits/rejected": -0.8237543106079102, + "logps/chosen": -1.6339073181152344, + "logps/rejected": -1.7707321643829346, + "loss": 2.3452, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.339075088500977, + "rewards/margins": 1.3682477474212646, + "rewards/rejected": -17.707321166992188, + "step": 5170 + }, + { + "epoch": 0.17442448346759243, + "grad_norm": 15.688084602355957, + "learning_rate": 9.83223299431502e-07, + "logits/chosen": -0.45724186301231384, + "logits/rejected": -0.6557691693305969, + "logps/chosen": -1.8950207233428955, + "logps/rejected": -1.9983152151107788, + "loss": 2.846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.950206756591797, + "rewards/margins": 1.032945990562439, + "rewards/rejected": -19.983150482177734, + "step": 5175 + }, + { + "epoch": 0.17459300953857562, + "grad_norm": 163.08804321289062, + "learning_rate": 9.831476621748262e-07, + "logits/chosen": -0.27674490213394165, + "logits/rejected": -0.11143366992473602, + "logps/chosen": -2.5093579292297363, + "logps/rejected": -2.5167133808135986, + "loss": 3.8225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.093576431274414, + "rewards/margins": 0.07355757057666779, + "rewards/rejected": -25.16713523864746, + "step": 5180 + }, + { + "epoch": 0.1747615356095588, + "grad_norm": 39.82172393798828, + "learning_rate": 9.8307185771978e-07, + "logits/chosen": -0.4417055547237396, + "logits/rejected": -0.5631288886070251, + "logps/chosen": -1.964228630065918, + "logps/rejected": -1.9234275817871094, + "loss": 3.6418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.642284393310547, + "rewards/margins": -0.408011257648468, + "rewards/rejected": -19.23427391052246, + "step": 5185 + }, + { + "epoch": 0.17493006168054198, + "grad_norm": 16.453245162963867, + "learning_rate": 9.82995886092596e-07, + "logits/chosen": -0.4573501944541931, + "logits/rejected": -0.3422870934009552, + "logps/chosen": -2.0120418071746826, + "logps/rejected": -2.1436915397644043, + "loss": 2.3553, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.120418548583984, + "rewards/margins": 1.316498041152954, + "rewards/rejected": -21.43691635131836, + "step": 5190 + }, + { + "epoch": 0.17509858775152515, + "grad_norm": 18.16073989868164, + "learning_rate": 9.829197473195653e-07, + "logits/chosen": -0.5349117517471313, + "logits/rejected": -0.5821327567100525, + "logps/chosen": -1.5197559595108032, + "logps/rejected": -1.6109358072280884, + "loss": 2.6405, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.197558403015137, + "rewards/margins": 0.9117996096611023, + "rewards/rejected": -16.109357833862305, + "step": 5195 + }, + { + "epoch": 0.17526711382250834, + "grad_norm": 20.79674530029297, + "learning_rate": 9.828434414270362e-07, + "logits/chosen": -0.48446908593177795, + "logits/rejected": -0.4432446360588074, + "logps/chosen": -2.0608952045440674, + "logps/rejected": -1.973127007484436, + "loss": 3.9514, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.608951568603516, + "rewards/margins": -0.8776818513870239, + "rewards/rejected": -19.73126983642578, + "step": 5200 + }, + { + "epoch": 0.17526711382250834, + "eval_logits/chosen": -0.8001312017440796, + "eval_logits/rejected": -0.8201078772544861, + "eval_logps/chosen": -1.7195191383361816, + "eval_logps/rejected": -1.7392371892929077, + "eval_loss": 3.3126471042633057, + "eval_rewards/accuracies": 0.5099999904632568, + "eval_rewards/chosen": -17.195192337036133, + "eval_rewards/margins": 0.1971810758113861, + "eval_rewards/rejected": -17.392372131347656, + "eval_runtime": 12.9075, + "eval_samples_per_second": 7.747, + "eval_steps_per_second": 1.937, + "step": 5200 + }, + { + "epoch": 0.17543563989349154, + "grad_norm": 46.21132278442383, + "learning_rate": 9.827669684414153e-07, + "logits/chosen": -0.9259397387504578, + "logits/rejected": -0.6422527432441711, + "logps/chosen": -1.5217444896697998, + "logps/rejected": -1.620764970779419, + "loss": 2.2411, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.217447280883789, + "rewards/margins": 0.9902023077011108, + "rewards/rejected": -16.2076473236084, + "step": 5205 + }, + { + "epoch": 0.1756041659644747, + "grad_norm": 50.1036491394043, + "learning_rate": 9.826903283891667e-07, + "logits/chosen": -0.6379965543746948, + "logits/rejected": -0.6837188005447388, + "logps/chosen": -1.9009840488433838, + "logps/rejected": -1.8843845129013062, + "loss": 3.2654, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.009838104248047, + "rewards/margins": -0.16599377989768982, + "rewards/rejected": -18.84384536743164, + "step": 5210 + }, + { + "epoch": 0.1757726920354579, + "grad_norm": 32.01786804199219, + "learning_rate": 9.82613521296813e-07, + "logits/chosen": -0.43797287344932556, + "logits/rejected": -0.4878421723842621, + "logps/chosen": -1.6981594562530518, + "logps/rejected": -1.7703397274017334, + "loss": 2.5659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.98159408569336, + "rewards/margins": 0.7218042612075806, + "rewards/rejected": -17.703399658203125, + "step": 5215 + }, + { + "epoch": 0.17594121810644106, + "grad_norm": 24.32670021057129, + "learning_rate": 9.825365471909337e-07, + "logits/chosen": -0.22852332890033722, + "logits/rejected": -0.220924973487854, + "logps/chosen": -1.610447883605957, + "logps/rejected": -1.6706794500350952, + "loss": 2.923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.104480743408203, + "rewards/margins": 0.6023159027099609, + "rewards/rejected": -16.7067928314209, + "step": 5220 + }, + { + "epoch": 0.17610974417742425, + "grad_norm": 19.045787811279297, + "learning_rate": 9.824594060981665e-07, + "logits/chosen": -0.2582782208919525, + "logits/rejected": -0.3515141010284424, + "logps/chosen": -1.7661092281341553, + "logps/rejected": -1.8709022998809814, + "loss": 2.6033, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.66109275817871, + "rewards/margins": 1.0479302406311035, + "rewards/rejected": -18.70902442932129, + "step": 5225 + }, + { + "epoch": 0.17627827024840742, + "grad_norm": 26.933544158935547, + "learning_rate": 9.823820980452072e-07, + "logits/chosen": -0.20558574795722961, + "logits/rejected": -0.15755276381969452, + "logps/chosen": -1.7019073963165283, + "logps/rejected": -1.833353042602539, + "loss": 2.3367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.01907730102539, + "rewards/margins": 1.3144561052322388, + "rewards/rejected": -18.33353042602539, + "step": 5230 + }, + { + "epoch": 0.17644679631939061, + "grad_norm": 21.980003356933594, + "learning_rate": 9.823046230588085e-07, + "logits/chosen": -0.3728945851325989, + "logits/rejected": -0.17971554398536682, + "logps/chosen": -2.131380558013916, + "logps/rejected": -2.4322307109832764, + "loss": 1.9342, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.313804626464844, + "rewards/margins": 3.00850248336792, + "rewards/rejected": -24.322307586669922, + "step": 5235 + }, + { + "epoch": 0.17661532239037378, + "grad_norm": 22.891450881958008, + "learning_rate": 9.82226981165782e-07, + "logits/chosen": -0.4645145833492279, + "logits/rejected": -0.3906251788139343, + "logps/chosen": -2.064272165298462, + "logps/rejected": -2.2810354232788086, + "loss": 3.3253, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.64272117614746, + "rewards/margins": 2.1676318645477295, + "rewards/rejected": -22.810354232788086, + "step": 5240 + }, + { + "epoch": 0.17678384846135697, + "grad_norm": 34.193443298339844, + "learning_rate": 9.821491723929963e-07, + "logits/chosen": 0.022101493552327156, + "logits/rejected": -0.01094393152743578, + "logps/chosen": -1.9077723026275635, + "logps/rejected": -1.9567878246307373, + "loss": 3.0994, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.077722549438477, + "rewards/margins": 0.49015599489212036, + "rewards/rejected": -19.567880630493164, + "step": 5245 + }, + { + "epoch": 0.17695237453234014, + "grad_norm": 19.244543075561523, + "learning_rate": 9.82071196767378e-07, + "logits/chosen": -0.7040097117424011, + "logits/rejected": -0.5882058143615723, + "logps/chosen": -1.7580782175064087, + "logps/rejected": -1.6740825176239014, + "loss": 3.8844, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.580781936645508, + "rewards/margins": -0.8399547338485718, + "rewards/rejected": -16.740825653076172, + "step": 5250 + }, + { + "epoch": 0.17712090060332333, + "grad_norm": 23.584217071533203, + "learning_rate": 9.819930543159112e-07, + "logits/chosen": -0.47853463888168335, + "logits/rejected": -0.4111636281013489, + "logps/chosen": -1.689819097518921, + "logps/rejected": -1.7779957056045532, + "loss": 2.4064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.898193359375, + "rewards/margins": 0.8817658424377441, + "rewards/rejected": -17.779958724975586, + "step": 5255 + }, + { + "epoch": 0.17728942667430653, + "grad_norm": 60.129798889160156, + "learning_rate": 9.819147450656382e-07, + "logits/chosen": -0.3128248155117035, + "logits/rejected": -0.3781060576438904, + "logps/chosen": -1.551443338394165, + "logps/rejected": -1.5790199041366577, + "loss": 2.8277, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.514434814453125, + "rewards/margins": 0.27576375007629395, + "rewards/rejected": -15.790199279785156, + "step": 5260 + }, + { + "epoch": 0.1774579527452897, + "grad_norm": 26.094388961791992, + "learning_rate": 9.818362690436586e-07, + "logits/chosen": -0.7319404482841492, + "logits/rejected": -0.6763112545013428, + "logps/chosen": -1.6520391702651978, + "logps/rejected": -1.6662318706512451, + "loss": 2.9878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.5203914642334, + "rewards/margins": 0.14192715287208557, + "rewards/rejected": -16.66231918334961, + "step": 5265 + }, + { + "epoch": 0.1776264788162729, + "grad_norm": 82.54247283935547, + "learning_rate": 9.817576262771298e-07, + "logits/chosen": -0.11548449099063873, + "logits/rejected": -0.031751085072755814, + "logps/chosen": -2.201295852661133, + "logps/rejected": -2.252549648284912, + "loss": 2.7633, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.012958526611328, + "rewards/margins": 0.5125373601913452, + "rewards/rejected": -22.525497436523438, + "step": 5270 + }, + { + "epoch": 0.17779500488725605, + "grad_norm": 44.67779541015625, + "learning_rate": 9.816788167932672e-07, + "logits/chosen": -0.6120609045028687, + "logits/rejected": -0.6190296411514282, + "logps/chosen": -2.088254451751709, + "logps/rejected": -2.0191900730133057, + "loss": 4.0392, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.882543563842773, + "rewards/margins": -0.6906436681747437, + "rewards/rejected": -20.1919002532959, + "step": 5275 + }, + { + "epoch": 0.17796353095823925, + "grad_norm": 31.414321899414062, + "learning_rate": 9.815998406193436e-07, + "logits/chosen": -0.32458242774009705, + "logits/rejected": -0.31162434816360474, + "logps/chosen": -2.0117292404174805, + "logps/rejected": -1.9058940410614014, + "loss": 4.1566, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.117290496826172, + "rewards/margins": -1.058351755142212, + "rewards/rejected": -19.058940887451172, + "step": 5280 + }, + { + "epoch": 0.1781320570292224, + "grad_norm": 47.64741134643555, + "learning_rate": 9.81520697782689e-07, + "logits/chosen": -0.3747056722640991, + "logits/rejected": -0.3660200238227844, + "logps/chosen": -2.0151686668395996, + "logps/rejected": -2.171931505203247, + "loss": 2.8612, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.15168571472168, + "rewards/margins": 1.5676277875900269, + "rewards/rejected": -21.719314575195312, + "step": 5285 + }, + { + "epoch": 0.1783005831002056, + "grad_norm": 37.78845977783203, + "learning_rate": 9.814413883106924e-07, + "logits/chosen": -0.2810427248477936, + "logits/rejected": -0.30334943532943726, + "logps/chosen": -2.2405996322631836, + "logps/rejected": -2.317270517349243, + "loss": 2.7636, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.405996322631836, + "rewards/margins": 0.7667078971862793, + "rewards/rejected": -23.172704696655273, + "step": 5290 + }, + { + "epoch": 0.17846910917118877, + "grad_norm": 24.239185333251953, + "learning_rate": 9.813619122307993e-07, + "logits/chosen": -0.2296961098909378, + "logits/rejected": -0.23331353068351746, + "logps/chosen": -1.9587428569793701, + "logps/rejected": -1.9045612812042236, + "loss": 3.8117, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.587427139282227, + "rewards/margins": -0.5418151617050171, + "rewards/rejected": -19.045612335205078, + "step": 5295 + }, + { + "epoch": 0.17863763524217197, + "grad_norm": 116.19218444824219, + "learning_rate": 9.81282269570513e-07, + "logits/chosen": -0.6433163285255432, + "logits/rejected": -0.4264778196811676, + "logps/chosen": -2.0715627670288086, + "logps/rejected": -2.044890880584717, + "loss": 3.5245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.715627670288086, + "rewards/margins": -0.266719251871109, + "rewards/rejected": -20.448909759521484, + "step": 5300 + }, + { + "epoch": 0.17880616131315513, + "grad_norm": 21.32752227783203, + "learning_rate": 9.812024603573954e-07, + "logits/chosen": -0.4090822637081146, + "logits/rejected": -0.46003809571266174, + "logps/chosen": -1.727638840675354, + "logps/rejected": -1.7646032571792603, + "loss": 2.8879, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.27638816833496, + "rewards/margins": 0.36964479088783264, + "rewards/rejected": -17.646032333374023, + "step": 5305 + }, + { + "epoch": 0.17897468738413833, + "grad_norm": 23.98275375366211, + "learning_rate": 9.811224846190647e-07, + "logits/chosen": -0.5765715837478638, + "logits/rejected": -0.6701818704605103, + "logps/chosen": -1.867069959640503, + "logps/rejected": -1.9343897104263306, + "loss": 2.684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.67070198059082, + "rewards/margins": 0.6731952428817749, + "rewards/rejected": -19.343896865844727, + "step": 5310 + }, + { + "epoch": 0.17914321345512152, + "grad_norm": 15.5569429397583, + "learning_rate": 9.810423423831974e-07, + "logits/chosen": -0.8625133633613586, + "logits/rejected": -0.7863910794258118, + "logps/chosen": -1.8105857372283936, + "logps/rejected": -1.9400856494903564, + "loss": 2.0151, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.105857849121094, + "rewards/margins": 1.2949975728988647, + "rewards/rejected": -19.400856018066406, + "step": 5315 + }, + { + "epoch": 0.17931173952610469, + "grad_norm": 32.97955322265625, + "learning_rate": 9.80962033677528e-07, + "logits/chosen": -0.32209211587905884, + "logits/rejected": -0.5163687467575073, + "logps/chosen": -1.8706210851669312, + "logps/rejected": -1.6956441402435303, + "loss": 4.881, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.70621109008789, + "rewards/margins": -1.7497684955596924, + "rewards/rejected": -16.95644187927246, + "step": 5320 + }, + { + "epoch": 0.17948026559708788, + "grad_norm": 24.574478149414062, + "learning_rate": 9.808815585298475e-07, + "logits/chosen": -0.28291743993759155, + "logits/rejected": -0.28594347834587097, + "logps/chosen": -1.8679603338241577, + "logps/rejected": -1.9361158609390259, + "loss": 2.8133, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.679601669311523, + "rewards/margins": 0.6815553903579712, + "rewards/rejected": -19.361156463623047, + "step": 5325 + }, + { + "epoch": 0.17964879166807105, + "grad_norm": 14.93967342376709, + "learning_rate": 9.80800916968006e-07, + "logits/chosen": -0.755969762802124, + "logits/rejected": -0.7597614526748657, + "logps/chosen": -1.9039316177368164, + "logps/rejected": -2.2054519653320312, + "loss": 2.0603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.039318084716797, + "rewards/margins": 3.015202045440674, + "rewards/rejected": -22.05451774597168, + "step": 5330 + }, + { + "epoch": 0.17981731773905424, + "grad_norm": 15.54341983795166, + "learning_rate": 9.807201090199095e-07, + "logits/chosen": -0.4750305116176605, + "logits/rejected": -0.5239988565444946, + "logps/chosen": -1.9736168384552002, + "logps/rejected": -1.8567231893539429, + "loss": 4.2195, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.736167907714844, + "rewards/margins": -1.1689354181289673, + "rewards/rejected": -18.567232131958008, + "step": 5335 + }, + { + "epoch": 0.1799858438100374, + "grad_norm": 26.10828399658203, + "learning_rate": 9.806391347135233e-07, + "logits/chosen": -1.0066381692886353, + "logits/rejected": -1.0506072044372559, + "logps/chosen": -1.6646168231964111, + "logps/rejected": -1.6354296207427979, + "loss": 3.5164, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.646167755126953, + "rewards/margins": -0.29187268018722534, + "rewards/rejected": -16.354293823242188, + "step": 5340 + }, + { + "epoch": 0.1801543698810206, + "grad_norm": 22.388622283935547, + "learning_rate": 9.805579940768687e-07, + "logits/chosen": -0.6220000982284546, + "logits/rejected": -0.43516239523887634, + "logps/chosen": -1.761366605758667, + "logps/rejected": -1.8666290044784546, + "loss": 2.7263, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.613666534423828, + "rewards/margins": 1.0526221990585327, + "rewards/rejected": -18.66628646850586, + "step": 5345 + }, + { + "epoch": 0.18032289595200376, + "grad_norm": 25.53797721862793, + "learning_rate": 9.804766871380257e-07, + "logits/chosen": -0.4475626051425934, + "logits/rejected": -0.5037604570388794, + "logps/chosen": -1.771932601928711, + "logps/rejected": -1.826242446899414, + "loss": 2.6043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.719324111938477, + "rewards/margins": 0.543099045753479, + "rewards/rejected": -18.26242446899414, + "step": 5350 + }, + { + "epoch": 0.18049142202298696, + "grad_norm": 22.603837966918945, + "learning_rate": 9.803952139251311e-07, + "logits/chosen": -1.0026981830596924, + "logits/rejected": -1.0315004587173462, + "logps/chosen": -1.718327522277832, + "logps/rejected": -1.6823968887329102, + "loss": 3.4208, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.183277130126953, + "rewards/margins": -0.35930928587913513, + "rewards/rejected": -16.8239688873291, + "step": 5355 + }, + { + "epoch": 0.18065994809397012, + "grad_norm": 18.36005401611328, + "learning_rate": 9.803135744663802e-07, + "logits/chosen": -0.5656792521476746, + "logits/rejected": -0.49610671401023865, + "logps/chosen": -1.738930344581604, + "logps/rejected": -1.7509796619415283, + "loss": 3.1428, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.38930320739746, + "rewards/margins": 0.12049512565135956, + "rewards/rejected": -17.509796142578125, + "step": 5360 + }, + { + "epoch": 0.18082847416495332, + "grad_norm": 25.456018447875977, + "learning_rate": 9.802317687900247e-07, + "logits/chosen": -0.6400080323219299, + "logits/rejected": -0.5408387184143066, + "logps/chosen": -1.99357008934021, + "logps/rejected": -1.9206161499023438, + "loss": 3.8065, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.935699462890625, + "rewards/margins": -0.72953861951828, + "rewards/rejected": -19.206161499023438, + "step": 5365 + }, + { + "epoch": 0.1809970002359365, + "grad_norm": 26.897117614746094, + "learning_rate": 9.80149796924374e-07, + "logits/chosen": -0.41049957275390625, + "logits/rejected": -0.5050762295722961, + "logps/chosen": -2.407325267791748, + "logps/rejected": -2.060502529144287, + "loss": 6.8161, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.073257446289062, + "rewards/margins": -3.4682304859161377, + "rewards/rejected": -20.605026245117188, + "step": 5370 + }, + { + "epoch": 0.18116552630691968, + "grad_norm": 22.910654067993164, + "learning_rate": 9.80067658897796e-07, + "logits/chosen": -0.7714192271232605, + "logits/rejected": -0.8598917126655579, + "logps/chosen": -1.7116810083389282, + "logps/rejected": -1.7139278650283813, + "loss": 3.0925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.11680793762207, + "rewards/margins": 0.02247037924826145, + "rewards/rejected": -17.139278411865234, + "step": 5375 + }, + { + "epoch": 0.18133405237790287, + "grad_norm": 24.104690551757812, + "learning_rate": 9.799853547387152e-07, + "logits/chosen": -0.5268293023109436, + "logits/rejected": -0.4755741059780121, + "logps/chosen": -2.100019931793213, + "logps/rejected": -2.20682430267334, + "loss": 2.3812, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.000202178955078, + "rewards/margins": 1.068042516708374, + "rewards/rejected": -22.0682430267334, + "step": 5380 + }, + { + "epoch": 0.18150257844888604, + "grad_norm": 24.671052932739258, + "learning_rate": 9.799028844756137e-07, + "logits/chosen": -0.4852059781551361, + "logits/rejected": -0.4492795467376709, + "logps/chosen": -2.029736042022705, + "logps/rejected": -2.0513997077941895, + "loss": 2.9847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.297359466552734, + "rewards/margins": 0.21663923561573029, + "rewards/rejected": -20.513999938964844, + "step": 5385 + }, + { + "epoch": 0.18167110451986923, + "grad_norm": 29.028913497924805, + "learning_rate": 9.798202481370314e-07, + "logits/chosen": -0.779220700263977, + "logits/rejected": -0.7618386149406433, + "logps/chosen": -1.896601676940918, + "logps/rejected": -1.668404221534729, + "loss": 5.3947, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.966014862060547, + "rewards/margins": -2.281973361968994, + "rewards/rejected": -16.68404197692871, + "step": 5390 + }, + { + "epoch": 0.1818396305908524, + "grad_norm": 28.980819702148438, + "learning_rate": 9.797374457515652e-07, + "logits/chosen": -0.45707112550735474, + "logits/rejected": -0.532502293586731, + "logps/chosen": -1.9134578704833984, + "logps/rejected": -1.8541061878204346, + "loss": 3.8084, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -19.134578704833984, + "rewards/margins": -0.5935171842575073, + "rewards/rejected": -18.541059494018555, + "step": 5395 + }, + { + "epoch": 0.1820081566618356, + "grad_norm": 39.4348030090332, + "learning_rate": 9.796544773478701e-07, + "logits/chosen": -0.3773210644721985, + "logits/rejected": -0.3709460198879242, + "logps/chosen": -2.3887171745300293, + "logps/rejected": -2.5245351791381836, + "loss": 1.9783, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.88717269897461, + "rewards/margins": 1.3581793308258057, + "rewards/rejected": -25.245349884033203, + "step": 5400 + }, + { + "epoch": 0.18217668273281876, + "grad_norm": 25.09897232055664, + "learning_rate": 9.79571342954658e-07, + "logits/chosen": -0.42378631234169006, + "logits/rejected": -0.3182446360588074, + "logps/chosen": -2.159898281097412, + "logps/rejected": -2.1635546684265137, + "loss": 4.122, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.598981857299805, + "rewards/margins": 0.03656425327062607, + "rewards/rejected": -21.63554573059082, + "step": 5405 + }, + { + "epoch": 0.18234520880380195, + "grad_norm": 40.908599853515625, + "learning_rate": 9.794880426006983e-07, + "logits/chosen": -0.7844399809837341, + "logits/rejected": -0.6861599087715149, + "logps/chosen": -1.6490141153335571, + "logps/rejected": -1.6898380517959595, + "loss": 2.8132, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.49013900756836, + "rewards/margins": 0.40823841094970703, + "rewards/rejected": -16.898380279541016, + "step": 5410 + }, + { + "epoch": 0.18251373487478512, + "grad_norm": 39.26100540161133, + "learning_rate": 9.794045763148184e-07, + "logits/chosen": -0.6198582053184509, + "logits/rejected": -0.6737528443336487, + "logps/chosen": -1.8701423406600952, + "logps/rejected": -1.860345482826233, + "loss": 3.2316, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.70142364501953, + "rewards/margins": -0.0979672446846962, + "rewards/rejected": -18.603458404541016, + "step": 5415 + }, + { + "epoch": 0.1826822609457683, + "grad_norm": 32.086490631103516, + "learning_rate": 9.793209441259022e-07, + "logits/chosen": -0.5078805685043335, + "logits/rejected": -0.5676628947257996, + "logps/chosen": -1.7885487079620361, + "logps/rejected": -1.8656467199325562, + "loss": 2.5302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.885486602783203, + "rewards/margins": 0.7709810137748718, + "rewards/rejected": -18.65646743774414, + "step": 5420 + }, + { + "epoch": 0.1828507870167515, + "grad_norm": 19.564838409423828, + "learning_rate": 9.79237146062892e-07, + "logits/chosen": -0.26721253991127014, + "logits/rejected": -0.35675540566444397, + "logps/chosen": -1.4722105264663696, + "logps/rejected": -1.6488120555877686, + "loss": 2.0667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.7221040725708, + "rewards/margins": 1.7660157680511475, + "rewards/rejected": -16.488121032714844, + "step": 5425 + }, + { + "epoch": 0.18301931308773467, + "grad_norm": 34.31207275390625, + "learning_rate": 9.791531821547865e-07, + "logits/chosen": -0.23346984386444092, + "logits/rejected": -0.258556604385376, + "logps/chosen": -2.2937192916870117, + "logps/rejected": -2.3578433990478516, + "loss": 3.7761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.937192916870117, + "rewards/margins": 0.6412402391433716, + "rewards/rejected": -23.578433990478516, + "step": 5430 + }, + { + "epoch": 0.18318783915871786, + "grad_norm": 25.44965934753418, + "learning_rate": 9.790690524306426e-07, + "logits/chosen": -0.47591328620910645, + "logits/rejected": -0.679408609867096, + "logps/chosen": -1.8982775211334229, + "logps/rejected": -1.9024616479873657, + "loss": 3.0692, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.98277473449707, + "rewards/margins": 0.0418427474796772, + "rewards/rejected": -19.024616241455078, + "step": 5435 + }, + { + "epoch": 0.18335636522970103, + "grad_norm": 28.910072326660156, + "learning_rate": 9.789847569195742e-07, + "logits/chosen": -0.5068162083625793, + "logits/rejected": -0.3690851330757141, + "logps/chosen": -2.17146635055542, + "logps/rejected": -2.3016185760498047, + "loss": 1.9669, + "rewards/accuracies": 1.0, + "rewards/chosen": -21.714664459228516, + "rewards/margins": 1.3015224933624268, + "rewards/rejected": -23.016185760498047, + "step": 5440 + }, + { + "epoch": 0.18352489130068422, + "grad_norm": 27.728517532348633, + "learning_rate": 9.789002956507525e-07, + "logits/chosen": -0.4048040509223938, + "logits/rejected": -0.4588392376899719, + "logps/chosen": -1.8538213968276978, + "logps/rejected": -2.2220959663391113, + "loss": 2.2989, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.5382137298584, + "rewards/margins": 3.6827454566955566, + "rewards/rejected": -22.220958709716797, + "step": 5445 + }, + { + "epoch": 0.1836934173716674, + "grad_norm": 60.68367385864258, + "learning_rate": 9.788156686534069e-07, + "logits/chosen": -0.3275999426841736, + "logits/rejected": -0.19864805042743683, + "logps/chosen": -2.4291739463806152, + "logps/rejected": -2.6376471519470215, + "loss": 2.9337, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.29174041748047, + "rewards/margins": 2.084731101989746, + "rewards/rejected": -26.3764705657959, + "step": 5450 + }, + { + "epoch": 0.18386194344265058, + "grad_norm": 24.850337982177734, + "learning_rate": 9.787308759568225e-07, + "logits/chosen": -0.6782561540603638, + "logits/rejected": -0.5187098979949951, + "logps/chosen": -1.7515907287597656, + "logps/rejected": -2.032395839691162, + "loss": 1.6244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.515907287597656, + "rewards/margins": 2.8080544471740723, + "rewards/rejected": -20.32396125793457, + "step": 5455 + }, + { + "epoch": 0.18403046951363375, + "grad_norm": 44.323890686035156, + "learning_rate": 9.786459175903433e-07, + "logits/chosen": -0.6140015125274658, + "logits/rejected": -0.4879150986671448, + "logps/chosen": -2.3758678436279297, + "logps/rejected": -2.6171932220458984, + "loss": 2.4341, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.758678436279297, + "rewards/margins": 2.4132542610168457, + "rewards/rejected": -26.17193031311035, + "step": 5460 + }, + { + "epoch": 0.18419899558461694, + "grad_norm": 20.042804718017578, + "learning_rate": 9.7856079358337e-07, + "logits/chosen": -0.587514340877533, + "logits/rejected": -0.5617462396621704, + "logps/chosen": -1.873822808265686, + "logps/rejected": -1.8781769275665283, + "loss": 3.4026, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.73822784423828, + "rewards/margins": 0.04354066774249077, + "rewards/rejected": -18.781768798828125, + "step": 5465 + }, + { + "epoch": 0.1843675216556001, + "grad_norm": 16.114381790161133, + "learning_rate": 9.784755039653605e-07, + "logits/chosen": -0.44803792238235474, + "logits/rejected": -0.6248019337654114, + "logps/chosen": -1.5381447076797485, + "logps/rejected": -1.7500909566879272, + "loss": 2.2383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.381446838378906, + "rewards/margins": 2.1194636821746826, + "rewards/rejected": -17.50090980529785, + "step": 5470 + }, + { + "epoch": 0.1845360477265833, + "grad_norm": 184.44888305664062, + "learning_rate": 9.783900487658304e-07, + "logits/chosen": -0.37028566002845764, + "logits/rejected": -0.23744535446166992, + "logps/chosen": -2.110156536102295, + "logps/rejected": -1.7765562534332275, + "loss": 7.1675, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.1015682220459, + "rewards/margins": -3.3360047340393066, + "rewards/rejected": -17.76556396484375, + "step": 5475 + }, + { + "epoch": 0.1847045737975665, + "grad_norm": 22.66653060913086, + "learning_rate": 9.78304428014352e-07, + "logits/chosen": -0.8866994976997375, + "logits/rejected": -0.7490772008895874, + "logps/chosen": -1.6607614755630493, + "logps/rejected": -1.740256905555725, + "loss": 2.4509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.60761260986328, + "rewards/margins": 0.7949555516242981, + "rewards/rejected": -17.402568817138672, + "step": 5480 + }, + { + "epoch": 0.18487309986854966, + "grad_norm": 29.36440658569336, + "learning_rate": 9.782186417405556e-07, + "logits/chosen": -0.21898670494556427, + "logits/rejected": -0.17396847903728485, + "logps/chosen": -2.279313564300537, + "logps/rejected": -2.278048515319824, + "loss": 3.2844, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.793132781982422, + "rewards/margins": -0.012647914700210094, + "rewards/rejected": -22.78048324584961, + "step": 5485 + }, + { + "epoch": 0.18504162593953286, + "grad_norm": 25.005599975585938, + "learning_rate": 9.781326899741284e-07, + "logits/chosen": -0.6242612600326538, + "logits/rejected": -0.6651891469955444, + "logps/chosen": -1.8814970254898071, + "logps/rejected": -1.8976047039031982, + "loss": 3.0298, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.814970016479492, + "rewards/margins": 0.16107892990112305, + "rewards/rejected": -18.976049423217773, + "step": 5490 + }, + { + "epoch": 0.18521015201051602, + "grad_norm": 32.897987365722656, + "learning_rate": 9.780465727448149e-07, + "logits/chosen": -0.5068541169166565, + "logits/rejected": -0.54652339220047, + "logps/chosen": -1.840574860572815, + "logps/rejected": -1.9388887882232666, + "loss": 3.0219, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.405750274658203, + "rewards/margins": 0.983138918876648, + "rewards/rejected": -19.388887405395508, + "step": 5495 + }, + { + "epoch": 0.18537867808149922, + "grad_norm": 19.555273056030273, + "learning_rate": 9.779602900824167e-07, + "logits/chosen": -0.5434045791625977, + "logits/rejected": -0.7178775668144226, + "logps/chosen": -1.7472089529037476, + "logps/rejected": -1.7612006664276123, + "loss": 2.9711, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.472087860107422, + "rewards/margins": 0.13991737365722656, + "rewards/rejected": -17.61200714111328, + "step": 5500 + }, + { + "epoch": 0.18554720415248238, + "grad_norm": 41.695682525634766, + "learning_rate": 9.77873842016793e-07, + "logits/chosen": -0.681435227394104, + "logits/rejected": -0.6236013174057007, + "logps/chosen": -1.890181541442871, + "logps/rejected": -1.9118419885635376, + "loss": 3.0724, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.901817321777344, + "rewards/margins": 0.21660356223583221, + "rewards/rejected": -19.118419647216797, + "step": 5505 + }, + { + "epoch": 0.18571573022346557, + "grad_norm": 21.719018936157227, + "learning_rate": 9.777872285778603e-07, + "logits/chosen": -0.6559665203094482, + "logits/rejected": -0.5704790949821472, + "logps/chosen": -1.7349132299423218, + "logps/rejected": -1.7577717304229736, + "loss": 2.9269, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.349132537841797, + "rewards/margins": 0.22858504951000214, + "rewards/rejected": -17.577716827392578, + "step": 5510 + }, + { + "epoch": 0.18588425629444874, + "grad_norm": 30.6837215423584, + "learning_rate": 9.777004497955918e-07, + "logits/chosen": -0.9211034774780273, + "logits/rejected": -0.9266023635864258, + "logps/chosen": -1.7248684167861938, + "logps/rejected": -1.8038078546524048, + "loss": 2.5282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.24868392944336, + "rewards/margins": 0.7893939018249512, + "rewards/rejected": -18.03807830810547, + "step": 5515 + }, + { + "epoch": 0.18605278236543193, + "grad_norm": 30.61154556274414, + "learning_rate": 9.77613505700018e-07, + "logits/chosen": -0.4484230875968933, + "logits/rejected": -0.28866034746170044, + "logps/chosen": -1.8092586994171143, + "logps/rejected": -1.8665987253189087, + "loss": 3.0392, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.092586517333984, + "rewards/margins": 0.5734013319015503, + "rewards/rejected": -18.66598892211914, + "step": 5520 + }, + { + "epoch": 0.1862213084364151, + "grad_norm": 24.71278190612793, + "learning_rate": 9.775263963212275e-07, + "logits/chosen": -0.9451042413711548, + "logits/rejected": -0.9575036764144897, + "logps/chosen": -1.615065336227417, + "logps/rejected": -1.6316133737564087, + "loss": 3.1126, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.150653839111328, + "rewards/margins": 0.16548070311546326, + "rewards/rejected": -16.31613540649414, + "step": 5525 + }, + { + "epoch": 0.1863898345073983, + "grad_norm": 18.097076416015625, + "learning_rate": 9.774391216893646e-07, + "logits/chosen": -0.41714709997177124, + "logits/rejected": -0.2718644142150879, + "logps/chosen": -1.9782493114471436, + "logps/rejected": -2.1051318645477295, + "loss": 2.3445, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.78249168395996, + "rewards/margins": 1.2688255310058594, + "rewards/rejected": -21.051319122314453, + "step": 5530 + }, + { + "epoch": 0.1865583605783815, + "grad_norm": 20.818796157836914, + "learning_rate": 9.773516818346323e-07, + "logits/chosen": -0.8004224896430969, + "logits/rejected": -0.694173276424408, + "logps/chosen": -1.8741105794906616, + "logps/rejected": -1.9920237064361572, + "loss": 2.3865, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.741107940673828, + "rewards/margins": 1.1791307926177979, + "rewards/rejected": -19.920238494873047, + "step": 5535 + }, + { + "epoch": 0.18672688664936465, + "grad_norm": 23.69890594482422, + "learning_rate": 9.772640767872899e-07, + "logits/chosen": -0.4144554138183594, + "logits/rejected": -0.4145297110080719, + "logps/chosen": -1.9912002086639404, + "logps/rejected": -1.9969688653945923, + "loss": 4.3266, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.912002563476562, + "rewards/margins": 0.05768384784460068, + "rewards/rejected": -19.96968650817871, + "step": 5540 + }, + { + "epoch": 0.18689541272034785, + "grad_norm": 24.523786544799805, + "learning_rate": 9.771763065776538e-07, + "logits/chosen": -0.5034445524215698, + "logits/rejected": -0.6185353994369507, + "logps/chosen": -1.7700822353363037, + "logps/rejected": -1.7892067432403564, + "loss": 2.9114, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.700820922851562, + "rewards/margins": 0.1912446916103363, + "rewards/rejected": -17.892066955566406, + "step": 5545 + }, + { + "epoch": 0.187063938791331, + "grad_norm": 18.63873863220215, + "learning_rate": 9.77088371236098e-07, + "logits/chosen": -0.5159907937049866, + "logits/rejected": -0.5340949296951294, + "logps/chosen": -1.7579313516616821, + "logps/rejected": -1.73773992061615, + "loss": 3.3439, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.57931137084961, + "rewards/margins": -0.2019151747226715, + "rewards/rejected": -17.377399444580078, + "step": 5550 + }, + { + "epoch": 0.1872324648623142, + "grad_norm": 41.46942138671875, + "learning_rate": 9.770002707930535e-07, + "logits/chosen": -0.37359291315078735, + "logits/rejected": -0.4222946763038635, + "logps/chosen": -1.71608567237854, + "logps/rejected": -1.8431408405303955, + "loss": 2.3919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.160858154296875, + "rewards/margins": 1.2705483436584473, + "rewards/rejected": -18.431407928466797, + "step": 5555 + }, + { + "epoch": 0.18740099093329737, + "grad_norm": 30.87368392944336, + "learning_rate": 9.769120052790084e-07, + "logits/chosen": -0.4397002160549164, + "logits/rejected": -0.3921523094177246, + "logps/chosen": -1.8612377643585205, + "logps/rejected": -1.8991447687149048, + "loss": 2.8729, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.612377166748047, + "rewards/margins": 0.3790697157382965, + "rewards/rejected": -18.99144744873047, + "step": 5560 + }, + { + "epoch": 0.18756951700428057, + "grad_norm": 23.553661346435547, + "learning_rate": 9.768235747245078e-07, + "logits/chosen": -0.25023049116134644, + "logits/rejected": -0.2950226366519928, + "logps/chosen": -1.9554675817489624, + "logps/rejected": -2.0229415893554688, + "loss": 2.6882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.554676055908203, + "rewards/margins": 0.6747380495071411, + "rewards/rejected": -20.229412078857422, + "step": 5565 + }, + { + "epoch": 0.18773804307526373, + "grad_norm": 24.0847110748291, + "learning_rate": 9.767349791601539e-07, + "logits/chosen": -0.7085575461387634, + "logits/rejected": -0.7902520895004272, + "logps/chosen": -1.752450942993164, + "logps/rejected": -1.6067931652069092, + "loss": 4.5458, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.52450942993164, + "rewards/margins": -1.456578016281128, + "rewards/rejected": -16.06793212890625, + "step": 5570 + }, + { + "epoch": 0.18790656914624693, + "grad_norm": 25.039995193481445, + "learning_rate": 9.766462186166064e-07, + "logits/chosen": -0.3297646939754486, + "logits/rejected": -0.5008463263511658, + "logps/chosen": -1.6626918315887451, + "logps/rejected": -1.8057029247283936, + "loss": 2.5971, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.62691879272461, + "rewards/margins": 1.430110216140747, + "rewards/rejected": -18.057031631469727, + "step": 5575 + }, + { + "epoch": 0.1880750952172301, + "grad_norm": 36.930419921875, + "learning_rate": 9.76557293124582e-07, + "logits/chosen": -0.6175158023834229, + "logits/rejected": -0.5426197648048401, + "logps/chosen": -1.818953275680542, + "logps/rejected": -1.803902268409729, + "loss": 3.4941, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.189531326293945, + "rewards/margins": -0.15051087737083435, + "rewards/rejected": -18.039020538330078, + "step": 5580 + }, + { + "epoch": 0.1882436212882133, + "grad_norm": 17.54623794555664, + "learning_rate": 9.764682027148538e-07, + "logits/chosen": -0.9951989054679871, + "logits/rejected": -0.990329921245575, + "logps/chosen": -1.5430244207382202, + "logps/rejected": -1.5838948488235474, + "loss": 2.7648, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.430246353149414, + "rewards/margins": 0.4087028503417969, + "rewards/rejected": -15.838948249816895, + "step": 5585 + }, + { + "epoch": 0.18841214735919648, + "grad_norm": 22.05076026916504, + "learning_rate": 9.763789474182529e-07, + "logits/chosen": -0.5766128301620483, + "logits/rejected": -0.6728538870811462, + "logps/chosen": -1.7131710052490234, + "logps/rejected": -1.6873975992202759, + "loss": 3.4466, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.131710052490234, + "rewards/margins": -0.2577347755432129, + "rewards/rejected": -16.87397575378418, + "step": 5590 + }, + { + "epoch": 0.18858067343017965, + "grad_norm": 0.013031297363340855, + "learning_rate": 9.762895272656667e-07, + "logits/chosen": -0.33102065324783325, + "logits/rejected": -0.30809807777404785, + "logps/chosen": -2.0100135803222656, + "logps/rejected": -2.431529998779297, + "loss": 2.0247, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.10013771057129, + "rewards/margins": 4.215163230895996, + "rewards/rejected": -24.3153018951416, + "step": 5595 + }, + { + "epoch": 0.18874919950116284, + "grad_norm": 18.23565101623535, + "learning_rate": 9.761999422880402e-07, + "logits/chosen": -0.45090073347091675, + "logits/rejected": -0.39279770851135254, + "logps/chosen": -1.7720234394073486, + "logps/rejected": -1.8577537536621094, + "loss": 2.4942, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.720233917236328, + "rewards/margins": 0.8573040962219238, + "rewards/rejected": -18.577539443969727, + "step": 5600 + }, + { + "epoch": 0.18874919950116284, + "eval_logits/chosen": -0.7959946990013123, + "eval_logits/rejected": -0.818673312664032, + "eval_logps/chosen": -1.7273130416870117, + "eval_logps/rejected": -1.749546766281128, + "eval_loss": 3.2864151000976562, + "eval_rewards/accuracies": 0.5099999904632568, + "eval_rewards/chosen": -17.273130416870117, + "eval_rewards/margins": 0.22233757376670837, + "eval_rewards/rejected": -17.495468139648438, + "eval_runtime": 12.912, + "eval_samples_per_second": 7.745, + "eval_steps_per_second": 1.936, + "step": 5600 + }, + { + "epoch": 0.188917725572146, + "grad_norm": 26.624677658081055, + "learning_rate": 9.761101925163752e-07, + "logits/chosen": -0.595131516456604, + "logits/rejected": -0.687487781047821, + "logps/chosen": -1.9345438480377197, + "logps/rejected": -2.0004289150238037, + "loss": 2.9438, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.34543800354004, + "rewards/margins": 0.6588513255119324, + "rewards/rejected": -20.004289627075195, + "step": 5605 + }, + { + "epoch": 0.1890862516431292, + "grad_norm": 32.13505172729492, + "learning_rate": 9.76020277981731e-07, + "logits/chosen": -0.4904160499572754, + "logits/rejected": -0.33541202545166016, + "logps/chosen": -1.7449373006820679, + "logps/rejected": -1.8215818405151367, + "loss": 2.8797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.44937515258789, + "rewards/margins": 0.7664445042610168, + "rewards/rejected": -18.215818405151367, + "step": 5610 + }, + { + "epoch": 0.18925477771411237, + "grad_norm": 9.597542762756348, + "learning_rate": 9.759301987152225e-07, + "logits/chosen": -0.3985956609249115, + "logits/rejected": -0.4843437075614929, + "logps/chosen": -1.9301046133041382, + "logps/rejected": -1.9960343837738037, + "loss": 2.5725, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.30104637145996, + "rewards/margins": 0.6592990756034851, + "rewards/rejected": -19.960346221923828, + "step": 5615 + }, + { + "epoch": 0.18942330378509556, + "grad_norm": 22.30307388305664, + "learning_rate": 9.758399547480232e-07, + "logits/chosen": -0.4911147654056549, + "logits/rejected": -0.5110594034194946, + "logps/chosen": -1.7245635986328125, + "logps/rejected": -1.575623869895935, + "loss": 4.65, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -17.245634078979492, + "rewards/margins": -1.4893945455551147, + "rewards/rejected": -15.75623893737793, + "step": 5620 + }, + { + "epoch": 0.18959182985607873, + "grad_norm": 18.47562599182129, + "learning_rate": 9.757495461113632e-07, + "logits/chosen": -0.432157427072525, + "logits/rejected": -0.41354498267173767, + "logps/chosen": -1.7506507635116577, + "logps/rejected": -1.8003524541854858, + "loss": 2.9609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.506507873535156, + "rewards/margins": 0.49701786041259766, + "rewards/rejected": -18.00352668762207, + "step": 5625 + }, + { + "epoch": 0.18976035592706192, + "grad_norm": 27.529369354248047, + "learning_rate": 9.756589728365288e-07, + "logits/chosen": -0.569477379322052, + "logits/rejected": -0.5368244051933289, + "logps/chosen": -1.5696738958358765, + "logps/rejected": -1.6201614141464233, + "loss": 2.7874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.696739196777344, + "rewards/margins": 0.5048743486404419, + "rewards/rejected": -16.201614379882812, + "step": 5630 + }, + { + "epoch": 0.18992888199804508, + "grad_norm": 25.479154586791992, + "learning_rate": 9.755682349548643e-07, + "logits/chosen": -0.36692845821380615, + "logits/rejected": -0.32827451825141907, + "logps/chosen": -1.7392432689666748, + "logps/rejected": -1.812835693359375, + "loss": 2.8218, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.39243507385254, + "rewards/margins": 0.7359241247177124, + "rewards/rejected": -18.12835693359375, + "step": 5635 + }, + { + "epoch": 0.19009740806902828, + "grad_norm": 18.921920776367188, + "learning_rate": 9.7547733249777e-07, + "logits/chosen": -0.7113291025161743, + "logits/rejected": -0.6769397258758545, + "logps/chosen": -1.5992504358291626, + "logps/rejected": -1.6348832845687866, + "loss": 3.048, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.992505073547363, + "rewards/margins": 0.356327623128891, + "rewards/rejected": -16.348833084106445, + "step": 5640 + }, + { + "epoch": 0.19026593414001147, + "grad_norm": 24.759885787963867, + "learning_rate": 9.753862654967044e-07, + "logits/chosen": -0.6169870495796204, + "logits/rejected": -0.9548047184944153, + "logps/chosen": -1.5832029581069946, + "logps/rejected": -1.6658849716186523, + "loss": 2.3219, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.83202838897705, + "rewards/margins": 0.8268213272094727, + "rewards/rejected": -16.658849716186523, + "step": 5645 + }, + { + "epoch": 0.19043446021099464, + "grad_norm": 23.502939224243164, + "learning_rate": 9.752950339831815e-07, + "logits/chosen": -0.6806584000587463, + "logits/rejected": -0.6301476955413818, + "logps/chosen": -1.8754730224609375, + "logps/rejected": -1.8923496007919312, + "loss": 3.2034, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.754730224609375, + "rewards/margins": 0.1687663048505783, + "rewards/rejected": -18.92349624633789, + "step": 5650 + }, + { + "epoch": 0.19060298628197783, + "grad_norm": 41.726806640625, + "learning_rate": 9.752036379887733e-07, + "logits/chosen": -0.5170332789421082, + "logits/rejected": -0.577129065990448, + "logps/chosen": -1.943185806274414, + "logps/rejected": -1.9628463983535767, + "loss": 4.2989, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.43185806274414, + "rewards/margins": 0.1966075897216797, + "rewards/rejected": -19.628467559814453, + "step": 5655 + }, + { + "epoch": 0.190771512352961, + "grad_norm": 16.944015502929688, + "learning_rate": 9.751120775451083e-07, + "logits/chosen": -0.3677743077278137, + "logits/rejected": -0.34167230129241943, + "logps/chosen": -2.053145170211792, + "logps/rejected": -2.0245869159698486, + "loss": 3.7197, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.531452178955078, + "rewards/margins": -0.2855833172798157, + "rewards/rejected": -20.245868682861328, + "step": 5660 + }, + { + "epoch": 0.1909400384239442, + "grad_norm": 31.120407104492188, + "learning_rate": 9.750203526838719e-07, + "logits/chosen": -0.3285277783870697, + "logits/rejected": -0.3090980052947998, + "logps/chosen": -1.9437472820281982, + "logps/rejected": -1.775830626487732, + "loss": 4.7381, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.437475204467773, + "rewards/margins": -1.6791677474975586, + "rewards/rejected": -17.7583065032959, + "step": 5665 + }, + { + "epoch": 0.19110856449492736, + "grad_norm": 27.374696731567383, + "learning_rate": 9.749284634368064e-07, + "logits/chosen": -0.2821030020713806, + "logits/rejected": -0.4119594097137451, + "logps/chosen": -1.6939541101455688, + "logps/rejected": -1.643109917640686, + "loss": 3.578, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.93954086303711, + "rewards/margins": -0.5084422826766968, + "rewards/rejected": -16.43109893798828, + "step": 5670 + }, + { + "epoch": 0.19127709056591055, + "grad_norm": 27.388362884521484, + "learning_rate": 9.748364098357113e-07, + "logits/chosen": -0.5865864157676697, + "logits/rejected": -0.6384282112121582, + "logps/chosen": -1.9883333444595337, + "logps/rejected": -1.9986881017684937, + "loss": 3.3969, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.88333511352539, + "rewards/margins": 0.10354681313037872, + "rewards/rejected": -19.986881256103516, + "step": 5675 + }, + { + "epoch": 0.19144561663689372, + "grad_norm": 28.64921760559082, + "learning_rate": 9.747441919124426e-07, + "logits/chosen": -0.3893504738807678, + "logits/rejected": -0.5681343078613281, + "logps/chosen": -2.015651226043701, + "logps/rejected": -2.0036251544952393, + "loss": 3.2282, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.156513214111328, + "rewards/margins": -0.12026166915893555, + "rewards/rejected": -20.036251068115234, + "step": 5680 + }, + { + "epoch": 0.1916141427078769, + "grad_norm": 39.19678497314453, + "learning_rate": 9.74651809698913e-07, + "logits/chosen": -0.34206151962280273, + "logits/rejected": -0.2204916775226593, + "logps/chosen": -1.8198713064193726, + "logps/rejected": -1.842043161392212, + "loss": 2.984, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.198715209960938, + "rewards/margins": 0.22171735763549805, + "rewards/rejected": -18.42043113708496, + "step": 5685 + }, + { + "epoch": 0.19178266877886008, + "grad_norm": 52.62766647338867, + "learning_rate": 9.74559263227093e-07, + "logits/chosen": -0.25368112325668335, + "logits/rejected": -0.28468552231788635, + "logps/chosen": -1.864861249923706, + "logps/rejected": -1.778679609298706, + "loss": 3.9123, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.648611068725586, + "rewards/margins": -0.8618162870407104, + "rewards/rejected": -17.78679656982422, + "step": 5690 + }, + { + "epoch": 0.19195119484984327, + "grad_norm": 20.276491165161133, + "learning_rate": 9.744665525290087e-07, + "logits/chosen": -0.9554246068000793, + "logits/rejected": -0.8531731367111206, + "logps/chosen": -1.6874557733535767, + "logps/rejected": -1.726912260055542, + "loss": 2.9321, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.874557495117188, + "rewards/margins": 0.3945651054382324, + "rewards/rejected": -17.269123077392578, + "step": 5695 + }, + { + "epoch": 0.19211972092082646, + "grad_norm": 16.296485900878906, + "learning_rate": 9.743736776367435e-07, + "logits/chosen": -0.747841477394104, + "logits/rejected": -0.6805245876312256, + "logps/chosen": -1.3950637578964233, + "logps/rejected": -1.4789271354675293, + "loss": 2.6315, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -13.950637817382812, + "rewards/margins": 0.8386334180831909, + "rewards/rejected": -14.789271354675293, + "step": 5700 + }, + { + "epoch": 0.19228824699180963, + "grad_norm": 29.865983963012695, + "learning_rate": 9.742806385824383e-07, + "logits/chosen": -0.7948347330093384, + "logits/rejected": -0.76251620054245, + "logps/chosen": -1.8513984680175781, + "logps/rejected": -1.9355262517929077, + "loss": 2.8024, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.513986587524414, + "rewards/margins": 0.8412765264511108, + "rewards/rejected": -19.355260848999023, + "step": 5705 + }, + { + "epoch": 0.19245677306279282, + "grad_norm": 38.51240921020508, + "learning_rate": 9.7418743539829e-07, + "logits/chosen": -0.20339004695415497, + "logits/rejected": -0.27671149373054504, + "logps/chosen": -1.9387633800506592, + "logps/rejected": -1.8538455963134766, + "loss": 3.995, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.38763427734375, + "rewards/margins": -0.8491789698600769, + "rewards/rejected": -18.538455963134766, + "step": 5710 + }, + { + "epoch": 0.192625299133776, + "grad_norm": 26.83456039428711, + "learning_rate": 9.740940681165526e-07, + "logits/chosen": -0.6951231956481934, + "logits/rejected": -0.5157309770584106, + "logps/chosen": -1.8345329761505127, + "logps/rejected": -1.8946685791015625, + "loss": 2.8783, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.345333099365234, + "rewards/margins": 0.6013542413711548, + "rewards/rejected": -18.946683883666992, + "step": 5715 + }, + { + "epoch": 0.19279382520475918, + "grad_norm": 11.871299743652344, + "learning_rate": 9.740005367695368e-07, + "logits/chosen": -0.7048253417015076, + "logits/rejected": -0.7177507281303406, + "logps/chosen": -1.4334383010864258, + "logps/rejected": -1.6110057830810547, + "loss": 1.9898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.334383964538574, + "rewards/margins": 1.7756729125976562, + "rewards/rejected": -16.110057830810547, + "step": 5720 + }, + { + "epoch": 0.19296235127574235, + "grad_norm": 22.618762969970703, + "learning_rate": 9.739068413896098e-07, + "logits/chosen": -0.7570010423660278, + "logits/rejected": -0.6863754987716675, + "logps/chosen": -1.5401397943496704, + "logps/rejected": -1.59748113155365, + "loss": 2.7378, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.401399612426758, + "rewards/margins": 0.5734131932258606, + "rewards/rejected": -15.974810600280762, + "step": 5725 + }, + { + "epoch": 0.19313087734672554, + "grad_norm": 16.779813766479492, + "learning_rate": 9.738129820091964e-07, + "logits/chosen": -0.5011672973632812, + "logits/rejected": -0.5614322423934937, + "logps/chosen": -1.8755191564559937, + "logps/rejected": -1.8646976947784424, + "loss": 3.2224, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.755191802978516, + "rewards/margins": -0.10821404308080673, + "rewards/rejected": -18.646976470947266, + "step": 5730 + }, + { + "epoch": 0.1932994034177087, + "grad_norm": 30.153682708740234, + "learning_rate": 9.737189586607774e-07, + "logits/chosen": -0.40961700677871704, + "logits/rejected": -0.6002104878425598, + "logps/chosen": -2.0768322944641113, + "logps/rejected": -2.105672836303711, + "loss": 3.5266, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.768325805664062, + "rewards/margins": 0.2884038984775543, + "rewards/rejected": -21.05672836303711, + "step": 5735 + }, + { + "epoch": 0.1934679294886919, + "grad_norm": 27.910282135009766, + "learning_rate": 9.736247713768908e-07, + "logits/chosen": -0.23275843262672424, + "logits/rejected": -0.2791540026664734, + "logps/chosen": -1.5457642078399658, + "logps/rejected": -1.623756766319275, + "loss": 2.9391, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.4576416015625, + "rewards/margins": 0.7799254655838013, + "rewards/rejected": -16.237567901611328, + "step": 5740 + }, + { + "epoch": 0.19363645555967507, + "grad_norm": 19.33202362060547, + "learning_rate": 9.735304201901306e-07, + "logits/chosen": -0.877076268196106, + "logits/rejected": -0.9000272750854492, + "logps/chosen": -1.7524656057357788, + "logps/rejected": -1.6089779138565063, + "loss": 4.5017, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.524654388427734, + "rewards/margins": -1.4348783493041992, + "rewards/rejected": -16.089778900146484, + "step": 5745 + }, + { + "epoch": 0.19380498163065826, + "grad_norm": 25.68280601501465, + "learning_rate": 9.734359051331485e-07, + "logits/chosen": -0.2692585587501526, + "logits/rejected": -0.3588291108608246, + "logps/chosen": -1.8761327266693115, + "logps/rejected": -1.8713629245758057, + "loss": 3.3386, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.76132583618164, + "rewards/margins": -0.047698307782411575, + "rewards/rejected": -18.713626861572266, + "step": 5750 + }, + { + "epoch": 0.19397350770164146, + "grad_norm": 32.249664306640625, + "learning_rate": 9.73341226238652e-07, + "logits/chosen": -0.250613272190094, + "logits/rejected": -0.2793341875076294, + "logps/chosen": -1.8432165384292603, + "logps/rejected": -1.846566915512085, + "loss": 3.6394, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.43216323852539, + "rewards/margins": 0.03350649029016495, + "rewards/rejected": -18.46567153930664, + "step": 5755 + }, + { + "epoch": 0.19414203377262462, + "grad_norm": 44.116703033447266, + "learning_rate": 9.732463835394063e-07, + "logits/chosen": -0.4042670726776123, + "logits/rejected": -0.23007135093212128, + "logps/chosen": -1.9827651977539062, + "logps/rejected": -1.97702157497406, + "loss": 3.736, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.827651977539062, + "rewards/margins": -0.05743579939007759, + "rewards/rejected": -19.770214080810547, + "step": 5760 + }, + { + "epoch": 0.19431055984360782, + "grad_norm": 28.332536697387695, + "learning_rate": 9.731513770682323e-07, + "logits/chosen": -0.5446762442588806, + "logits/rejected": -0.5711333155632019, + "logps/chosen": -1.74869704246521, + "logps/rejected": -1.832109808921814, + "loss": 2.508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.48697280883789, + "rewards/margins": 0.8341274261474609, + "rewards/rejected": -18.32110023498535, + "step": 5765 + }, + { + "epoch": 0.19447908591459098, + "grad_norm": 6.460046291351318, + "learning_rate": 9.730562068580082e-07, + "logits/chosen": -0.8367756009101868, + "logits/rejected": -0.9457358121871948, + "logps/chosen": -2.0421996116638184, + "logps/rejected": -2.199319362640381, + "loss": 2.4971, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.421993255615234, + "rewards/margins": 1.5711987018585205, + "rewards/rejected": -21.993194580078125, + "step": 5770 + }, + { + "epoch": 0.19464761198557418, + "grad_norm": 28.444000244140625, + "learning_rate": 9.729608729416685e-07, + "logits/chosen": -0.5505703687667847, + "logits/rejected": -0.7946529388427734, + "logps/chosen": -1.8401174545288086, + "logps/rejected": -1.7754510641098022, + "loss": 3.7309, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.401174545288086, + "rewards/margins": -0.6466630697250366, + "rewards/rejected": -17.754512786865234, + "step": 5775 + }, + { + "epoch": 0.19481613805655734, + "grad_norm": 37.49628448486328, + "learning_rate": 9.728653753522045e-07, + "logits/chosen": -0.5423851013183594, + "logits/rejected": -0.5040590167045593, + "logps/chosen": -1.6697317361831665, + "logps/rejected": -1.6786043643951416, + "loss": 3.2312, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.697317123413086, + "rewards/margins": 0.08872584998607635, + "rewards/rejected": -16.786041259765625, + "step": 5780 + }, + { + "epoch": 0.19498466412754054, + "grad_norm": 35.8685188293457, + "learning_rate": 9.727697141226644e-07, + "logits/chosen": -0.5237818360328674, + "logits/rejected": -0.6760590672492981, + "logps/chosen": -1.9653745889663696, + "logps/rejected": -1.8444054126739502, + "loss": 4.2802, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.653745651245117, + "rewards/margins": -1.209693193435669, + "rewards/rejected": -18.444053649902344, + "step": 5785 + }, + { + "epoch": 0.1951531901985237, + "grad_norm": 34.6473388671875, + "learning_rate": 9.726738892861526e-07, + "logits/chosen": -0.37371161580085754, + "logits/rejected": -0.44602465629577637, + "logps/chosen": -1.9422938823699951, + "logps/rejected": -2.0096755027770996, + "loss": 3.0281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.42293930053711, + "rewards/margins": 0.67381751537323, + "rewards/rejected": -20.096755981445312, + "step": 5790 + }, + { + "epoch": 0.1953217162695069, + "grad_norm": 25.640531539916992, + "learning_rate": 9.725779008758303e-07, + "logits/chosen": -0.4616813659667969, + "logits/rejected": -0.475913941860199, + "logps/chosen": -1.3816555738449097, + "logps/rejected": -1.4525748491287231, + "loss": 3.0528, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -13.816555976867676, + "rewards/margins": 0.7091928720474243, + "rewards/rejected": -14.525749206542969, + "step": 5795 + }, + { + "epoch": 0.19549024234049006, + "grad_norm": 18.854049682617188, + "learning_rate": 9.724817489249154e-07, + "logits/chosen": -0.24698737263679504, + "logits/rejected": -0.237580806016922, + "logps/chosen": -1.6802804470062256, + "logps/rejected": -1.8639495372772217, + "loss": 2.637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.802804946899414, + "rewards/margins": 1.8366916179656982, + "rewards/rejected": -18.639495849609375, + "step": 5800 + }, + { + "epoch": 0.19565876841147326, + "grad_norm": 139.14752197265625, + "learning_rate": 9.72385433466682e-07, + "logits/chosen": -0.6270781755447388, + "logits/rejected": -0.7254256010055542, + "logps/chosen": -2.0985748767852783, + "logps/rejected": -1.8484370708465576, + "loss": 5.6863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.985748291015625, + "rewards/margins": -2.5013768672943115, + "rewards/rejected": -18.484371185302734, + "step": 5805 + }, + { + "epoch": 0.19582729448245645, + "grad_norm": 19.971731185913086, + "learning_rate": 9.722889545344614e-07, + "logits/chosen": -0.4281018376350403, + "logits/rejected": -0.37164705991744995, + "logps/chosen": -1.734190583229065, + "logps/rejected": -1.7541513442993164, + "loss": 3.3521, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.34190559387207, + "rewards/margins": 0.19960804283618927, + "rewards/rejected": -17.541513442993164, + "step": 5810 + }, + { + "epoch": 0.19599582055343961, + "grad_norm": 60.791259765625, + "learning_rate": 9.721923121616413e-07, + "logits/chosen": -0.07752398401498795, + "logits/rejected": -0.12650911509990692, + "logps/chosen": -1.9717010259628296, + "logps/rejected": -2.07768177986145, + "loss": 3.2981, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.71701431274414, + "rewards/margins": 1.0598057508468628, + "rewards/rejected": -20.776817321777344, + "step": 5815 + }, + { + "epoch": 0.1961643466244228, + "grad_norm": 30.976852416992188, + "learning_rate": 9.720955063816654e-07, + "logits/chosen": -0.6050688624382019, + "logits/rejected": -0.6339203119277954, + "logps/chosen": -1.7570356130599976, + "logps/rejected": -1.7321546077728271, + "loss": 3.3348, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.570356369018555, + "rewards/margins": -0.24881133437156677, + "rewards/rejected": -17.32154655456543, + "step": 5820 + }, + { + "epoch": 0.19633287269540597, + "grad_norm": 20.5854434967041, + "learning_rate": 9.719985372280347e-07, + "logits/chosen": -0.4038739800453186, + "logits/rejected": -0.3884919583797455, + "logps/chosen": -2.019855499267578, + "logps/rejected": -2.1005451679229736, + "loss": 2.6202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.19855308532715, + "rewards/margins": 0.8068978190422058, + "rewards/rejected": -21.005451202392578, + "step": 5825 + }, + { + "epoch": 0.19650139876638917, + "grad_norm": 18.702695846557617, + "learning_rate": 9.71901404734306e-07, + "logits/chosen": -0.8625004887580872, + "logits/rejected": -0.6800850033760071, + "logps/chosen": -1.7070497274398804, + "logps/rejected": -1.8050239086151123, + "loss": 2.3985, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.070499420166016, + "rewards/margins": 0.9797409176826477, + "rewards/rejected": -18.05023956298828, + "step": 5830 + }, + { + "epoch": 0.19666992483737233, + "grad_norm": 16.789480209350586, + "learning_rate": 9.718041089340936e-07, + "logits/chosen": -0.6353279948234558, + "logits/rejected": -0.6146889328956604, + "logps/chosen": -1.8607136011123657, + "logps/rejected": -1.9272617101669312, + "loss": 2.6298, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.60713768005371, + "rewards/margins": 0.665479302406311, + "rewards/rejected": -19.27261734008789, + "step": 5835 + }, + { + "epoch": 0.19683845090835553, + "grad_norm": 37.87785720825195, + "learning_rate": 9.717066498610673e-07, + "logits/chosen": -0.5720896124839783, + "logits/rejected": -0.7081347107887268, + "logps/chosen": -1.609519600868225, + "logps/rejected": -1.6527904272079468, + "loss": 2.8625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.095195770263672, + "rewards/margins": 0.43270936608314514, + "rewards/rejected": -16.527904510498047, + "step": 5840 + }, + { + "epoch": 0.1970069769793387, + "grad_norm": 10.778923034667969, + "learning_rate": 9.71609027548954e-07, + "logits/chosen": -0.35430818796157837, + "logits/rejected": -0.2933744192123413, + "logps/chosen": -1.6635167598724365, + "logps/rejected": -1.7969443798065186, + "loss": 2.5076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.635168075561523, + "rewards/margins": 1.3342747688293457, + "rewards/rejected": -17.96944236755371, + "step": 5845 + }, + { + "epoch": 0.1971755030503219, + "grad_norm": 23.068889617919922, + "learning_rate": 9.715112420315368e-07, + "logits/chosen": -1.0444772243499756, + "logits/rejected": -0.9749389886856079, + "logps/chosen": -1.5760352611541748, + "logps/rejected": -1.6423890590667725, + "loss": 2.6107, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.760353088378906, + "rewards/margins": 0.66353839635849, + "rewards/rejected": -16.423892974853516, + "step": 5850 + }, + { + "epoch": 0.19734402912130505, + "grad_norm": 30.91828727722168, + "learning_rate": 9.714132933426557e-07, + "logits/chosen": -0.43545252084732056, + "logits/rejected": -0.40154963731765747, + "logps/chosen": -1.805863618850708, + "logps/rejected": -1.7854446172714233, + "loss": 3.3247, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.058637619018555, + "rewards/margins": -0.2041909247636795, + "rewards/rejected": -17.85444450378418, + "step": 5855 + }, + { + "epoch": 0.19751255519228825, + "grad_norm": 31.091777801513672, + "learning_rate": 9.713151815162067e-07, + "logits/chosen": -0.4769212603569031, + "logits/rejected": -0.5193304419517517, + "logps/chosen": -1.664804220199585, + "logps/rejected": -1.700951337814331, + "loss": 2.8362, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.648040771484375, + "rewards/margins": 0.36147135496139526, + "rewards/rejected": -17.00951385498047, + "step": 5860 + }, + { + "epoch": 0.19768108126327144, + "grad_norm": 25.882474899291992, + "learning_rate": 9.712169065861424e-07, + "logits/chosen": -0.49053654074668884, + "logits/rejected": -0.2957160472869873, + "logps/chosen": -2.1436679363250732, + "logps/rejected": -2.3108649253845215, + "loss": 2.587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.43667984008789, + "rewards/margins": 1.6719691753387451, + "rewards/rejected": -23.10865020751953, + "step": 5865 + }, + { + "epoch": 0.1978496073342546, + "grad_norm": 20.540668487548828, + "learning_rate": 9.71118468586472e-07, + "logits/chosen": -0.26937225461006165, + "logits/rejected": -0.18302848935127258, + "logps/chosen": -1.7931854724884033, + "logps/rejected": -1.7600908279418945, + "loss": 3.406, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.931856155395508, + "rewards/margins": -0.33094778656959534, + "rewards/rejected": -17.600908279418945, + "step": 5870 + }, + { + "epoch": 0.1980181334052378, + "grad_norm": 49.5175666809082, + "learning_rate": 9.710198675512608e-07, + "logits/chosen": -0.4052053391933441, + "logits/rejected": -0.4798244535923004, + "logps/chosen": -1.8708423376083374, + "logps/rejected": -1.916273832321167, + "loss": 2.8076, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.708423614501953, + "rewards/margins": 0.45431557297706604, + "rewards/rejected": -19.162738800048828, + "step": 5875 + }, + { + "epoch": 0.19818665947622097, + "grad_norm": 29.265352249145508, + "learning_rate": 9.70921103514631e-07, + "logits/chosen": -0.641699492931366, + "logits/rejected": -0.7690142393112183, + "logps/chosen": -1.9759972095489502, + "logps/rejected": -1.9683644771575928, + "loss": 3.2972, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.759973526000977, + "rewards/margins": -0.07632637023925781, + "rewards/rejected": -19.683645248413086, + "step": 5880 + }, + { + "epoch": 0.19835518554720416, + "grad_norm": 35.96839904785156, + "learning_rate": 9.708221765107607e-07, + "logits/chosen": -0.45869994163513184, + "logits/rejected": -0.4570987820625305, + "logps/chosen": -2.0931758880615234, + "logps/rejected": -2.1789519786834717, + "loss": 2.7377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.931758880615234, + "rewards/margins": 0.8577610850334167, + "rewards/rejected": -21.789520263671875, + "step": 5885 + }, + { + "epoch": 0.19852371161818733, + "grad_norm": 23.893787384033203, + "learning_rate": 9.70723086573885e-07, + "logits/chosen": -0.6110178828239441, + "logits/rejected": -0.6918951869010925, + "logps/chosen": -1.456732988357544, + "logps/rejected": -1.639822244644165, + "loss": 2.0513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.567329406738281, + "rewards/margins": 1.8308935165405273, + "rewards/rejected": -16.398223876953125, + "step": 5890 + }, + { + "epoch": 0.19869223768917052, + "grad_norm": 15.295425415039062, + "learning_rate": 9.706238337382947e-07, + "logits/chosen": -0.45297136902809143, + "logits/rejected": -0.5232487916946411, + "logps/chosen": -1.600191354751587, + "logps/rejected": -1.6955150365829468, + "loss": 2.8533, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.00191307067871, + "rewards/margins": 0.9532366991043091, + "rewards/rejected": -16.955150604248047, + "step": 5895 + }, + { + "epoch": 0.19886076376015369, + "grad_norm": 24.45920753479004, + "learning_rate": 9.705244180383373e-07, + "logits/chosen": -0.3062538504600525, + "logits/rejected": -0.4098590910434723, + "logps/chosen": -1.7645599842071533, + "logps/rejected": -1.7392189502716064, + "loss": 3.3747, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.645599365234375, + "rewards/margins": -0.2534084916114807, + "rewards/rejected": -17.39219093322754, + "step": 5900 + }, + { + "epoch": 0.19902928983113688, + "grad_norm": 19.98879623413086, + "learning_rate": 9.704248395084168e-07, + "logits/chosen": -0.4781894087791443, + "logits/rejected": -0.3514857888221741, + "logps/chosen": -2.0645909309387207, + "logps/rejected": -2.069446325302124, + "loss": 3.1028, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.64590835571289, + "rewards/margins": 0.048557378351688385, + "rewards/rejected": -20.6944637298584, + "step": 5905 + }, + { + "epoch": 0.19919781590212005, + "grad_norm": 28.52830696105957, + "learning_rate": 9.703250981829932e-07, + "logits/chosen": -0.09460530430078506, + "logits/rejected": -0.139235720038414, + "logps/chosen": -2.3103229999542236, + "logps/rejected": -2.6174914836883545, + "loss": 1.5726, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.103229522705078, + "rewards/margins": 3.0716869831085205, + "rewards/rejected": -26.174917221069336, + "step": 5910 + }, + { + "epoch": 0.19936634197310324, + "grad_norm": 49.13142395019531, + "learning_rate": 9.702251940965833e-07, + "logits/chosen": -0.5760871171951294, + "logits/rejected": -0.335256963968277, + "logps/chosen": -1.875337839126587, + "logps/rejected": -1.816159963607788, + "loss": 3.7141, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.75337791442871, + "rewards/margins": -0.5917772054672241, + "rewards/rejected": -18.16160011291504, + "step": 5915 + }, + { + "epoch": 0.19953486804408643, + "grad_norm": 23.02393341064453, + "learning_rate": 9.701251272837599e-07, + "logits/chosen": -0.46277540922164917, + "logits/rejected": -0.30417174100875854, + "logps/chosen": -1.7341015338897705, + "logps/rejected": -1.7499128580093384, + "loss": 3.1858, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.341014862060547, + "rewards/margins": 0.15811166167259216, + "rewards/rejected": -17.499130249023438, + "step": 5920 + }, + { + "epoch": 0.1997033941150696, + "grad_norm": 43.57001495361328, + "learning_rate": 9.700248977791522e-07, + "logits/chosen": -0.6107692122459412, + "logits/rejected": -0.6478714346885681, + "logps/chosen": -1.9078069925308228, + "logps/rejected": -2.158576488494873, + "loss": 2.1498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.07806968688965, + "rewards/margins": 2.5076966285705566, + "rewards/rejected": -21.585765838623047, + "step": 5925 + }, + { + "epoch": 0.1998719201860528, + "grad_norm": 25.969892501831055, + "learning_rate": 9.699245056174454e-07, + "logits/chosen": -0.5042437314987183, + "logits/rejected": -0.5279260873794556, + "logps/chosen": -2.0870537757873535, + "logps/rejected": -2.097421169281006, + "loss": 3.393, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.87053680419922, + "rewards/margins": 0.10367707908153534, + "rewards/rejected": -20.974214553833008, + "step": 5930 + }, + { + "epoch": 0.20004044625703596, + "grad_norm": 26.384300231933594, + "learning_rate": 9.698239508333816e-07, + "logits/chosen": -0.7063272595405579, + "logits/rejected": -0.7647172808647156, + "logps/chosen": -1.8058459758758545, + "logps/rejected": -1.8751310110092163, + "loss": 2.4807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.058460235595703, + "rewards/margins": 0.6928521990776062, + "rewards/rejected": -18.751310348510742, + "step": 5935 + }, + { + "epoch": 0.20020897232801915, + "grad_norm": 34.54125213623047, + "learning_rate": 9.697232334617589e-07, + "logits/chosen": -0.6371638774871826, + "logits/rejected": -0.6679813861846924, + "logps/chosen": -1.9524176120758057, + "logps/rejected": -2.2252604961395264, + "loss": 2.9359, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.5241756439209, + "rewards/margins": 2.7284293174743652, + "rewards/rejected": -22.252605438232422, + "step": 5940 + }, + { + "epoch": 0.20037749839900232, + "grad_norm": 115.66785430908203, + "learning_rate": 9.696223535374313e-07, + "logits/chosen": -0.2789526879787445, + "logits/rejected": -0.33569225668907166, + "logps/chosen": -2.0028605461120605, + "logps/rejected": -2.0199637413024902, + "loss": 3.5454, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.02860450744629, + "rewards/margins": 0.1710345298051834, + "rewards/rejected": -20.19964027404785, + "step": 5945 + }, + { + "epoch": 0.2005460244699855, + "grad_norm": 20.133207321166992, + "learning_rate": 9.695213110953095e-07, + "logits/chosen": -0.24441027641296387, + "logits/rejected": -0.00073289277497679, + "logps/chosen": -1.8092113733291626, + "logps/rejected": -2.1856415271759033, + "loss": 1.4174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.092111587524414, + "rewards/margins": 3.764303684234619, + "rewards/rejected": -21.856416702270508, + "step": 5950 + }, + { + "epoch": 0.20071455054096868, + "grad_norm": 31.918554306030273, + "learning_rate": 9.694201061703604e-07, + "logits/chosen": -0.16023916006088257, + "logits/rejected": -0.14087925851345062, + "logps/chosen": -2.2564046382904053, + "logps/rejected": -2.180935859680176, + "loss": 3.9491, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.564044952392578, + "rewards/margins": -0.7546874284744263, + "rewards/rejected": -21.80936050415039, + "step": 5955 + }, + { + "epoch": 0.20088307661195187, + "grad_norm": 23.612600326538086, + "learning_rate": 9.693187387976069e-07, + "logits/chosen": -0.7719990015029907, + "logits/rejected": -0.7834844589233398, + "logps/chosen": -1.8583654165267944, + "logps/rejected": -1.797662377357483, + "loss": 3.6691, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.583656311035156, + "rewards/margins": -0.6070324778556824, + "rewards/rejected": -17.97662353515625, + "step": 5960 + }, + { + "epoch": 0.20105160268293504, + "grad_norm": 22.211774826049805, + "learning_rate": 9.692172090121283e-07, + "logits/chosen": -0.6716977953910828, + "logits/rejected": -0.650750994682312, + "logps/chosen": -2.168424606323242, + "logps/rejected": -2.2529549598693848, + "loss": 2.8945, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.684246063232422, + "rewards/margins": 0.8453027009963989, + "rewards/rejected": -22.52954864501953, + "step": 5965 + }, + { + "epoch": 0.20122012875391823, + "grad_norm": 26.79001235961914, + "learning_rate": 9.6911551684906e-07, + "logits/chosen": -0.2786385416984558, + "logits/rejected": -0.3720299303531647, + "logps/chosen": -1.8238327503204346, + "logps/rejected": -1.905925989151001, + "loss": 2.5022, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.238325119018555, + "rewards/margins": 0.8209331631660461, + "rewards/rejected": -19.05925941467285, + "step": 5970 + }, + { + "epoch": 0.20138865482490143, + "grad_norm": 33.01688766479492, + "learning_rate": 9.69013662343594e-07, + "logits/chosen": -0.5326055288314819, + "logits/rejected": -0.5187439322471619, + "logps/chosen": -1.7838671207427979, + "logps/rejected": -1.8846409320831299, + "loss": 2.8234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.83867073059082, + "rewards/margins": 1.0077372789382935, + "rewards/rejected": -18.84640884399414, + "step": 5975 + }, + { + "epoch": 0.2015571808958846, + "grad_norm": 52.05866241455078, + "learning_rate": 9.689116455309778e-07, + "logits/chosen": 0.0032115548383444548, + "logits/rejected": -0.10644103586673737, + "logps/chosen": -1.8666794300079346, + "logps/rejected": -2.0400755405426025, + "loss": 2.6665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.666791915893555, + "rewards/margins": 1.7339622974395752, + "rewards/rejected": -20.400753021240234, + "step": 5980 + }, + { + "epoch": 0.20172570696686778, + "grad_norm": 26.484529495239258, + "learning_rate": 9.688094664465153e-07, + "logits/chosen": -0.3309114873409271, + "logits/rejected": -0.4025183320045471, + "logps/chosen": -1.8796402215957642, + "logps/rejected": -1.9859651327133179, + "loss": 3.4244, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.79640007019043, + "rewards/margins": 1.0632522106170654, + "rewards/rejected": -19.85965347290039, + "step": 5985 + }, + { + "epoch": 0.20189423303785095, + "grad_norm": 21.167022705078125, + "learning_rate": 9.68707125125567e-07, + "logits/chosen": -0.1683267056941986, + "logits/rejected": -0.21480047702789307, + "logps/chosen": -1.8829946517944336, + "logps/rejected": -1.7001540660858154, + "loss": 5.0036, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.829946517944336, + "rewards/margins": -1.8284060955047607, + "rewards/rejected": -17.001541137695312, + "step": 5990 + }, + { + "epoch": 0.20206275910883414, + "grad_norm": 33.96486282348633, + "learning_rate": 9.68604621603549e-07, + "logits/chosen": -0.4873688220977783, + "logits/rejected": -0.45059436559677124, + "logps/chosen": -1.7515493631362915, + "logps/rejected": -1.759034514427185, + "loss": 3.4861, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.515491485595703, + "rewards/margins": 0.0748540386557579, + "rewards/rejected": -17.59034538269043, + "step": 5995 + }, + { + "epoch": 0.2022312851798173, + "grad_norm": 14.7745943069458, + "learning_rate": 9.685019559159335e-07, + "logits/chosen": -0.7040256261825562, + "logits/rejected": -0.678537130355835, + "logps/chosen": -1.5471678972244263, + "logps/rejected": -1.618430495262146, + "loss": 2.6757, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.4716796875, + "rewards/margins": 0.7126253247261047, + "rewards/rejected": -16.18430519104004, + "step": 6000 + }, + { + "epoch": 0.2022312851798173, + "eval_logits/chosen": -0.7735000848770142, + "eval_logits/rejected": -0.7976916432380676, + "eval_logps/chosen": -1.7360255718231201, + "eval_logps/rejected": -1.7606263160705566, + "eval_loss": 3.2614545822143555, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": -17.36025619506836, + "eval_rewards/margins": 0.24600711464881897, + "eval_rewards/rejected": -17.60626220703125, + "eval_runtime": 12.9219, + "eval_samples_per_second": 7.739, + "eval_steps_per_second": 1.935, + "step": 6000 + }, + { + "epoch": 0.2023998112508005, + "grad_norm": 24.49705696105957, + "learning_rate": 9.683991280982496e-07, + "logits/chosen": -0.6119563579559326, + "logits/rejected": -0.612609326839447, + "logps/chosen": -2.0236449241638184, + "logps/rejected": -2.0016684532165527, + "loss": 3.3791, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.2364501953125, + "rewards/margins": -0.21976280212402344, + "rewards/rejected": -20.016687393188477, + "step": 6005 + }, + { + "epoch": 0.20256833732178367, + "grad_norm": 24.686538696289062, + "learning_rate": 9.682961381860816e-07, + "logits/chosen": -0.5340906381607056, + "logits/rejected": -0.5442155599594116, + "logps/chosen": -1.488397479057312, + "logps/rejected": -1.5215156078338623, + "loss": 2.8946, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.883974075317383, + "rewards/margins": 0.33118313550949097, + "rewards/rejected": -15.215158462524414, + "step": 6010 + }, + { + "epoch": 0.20273686339276686, + "grad_norm": 11.82690715789795, + "learning_rate": 9.681929862150702e-07, + "logits/chosen": -0.6329524517059326, + "logits/rejected": -0.5246933698654175, + "logps/chosen": -2.3101534843444824, + "logps/rejected": -2.4467110633850098, + "loss": 2.6406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.10153579711914, + "rewards/margins": 1.3655731678009033, + "rewards/rejected": -24.46710968017578, + "step": 6015 + }, + { + "epoch": 0.20290538946375003, + "grad_norm": 29.439022064208984, + "learning_rate": 9.680896722209122e-07, + "logits/chosen": -0.5506235361099243, + "logits/rejected": -0.4831174910068512, + "logps/chosen": -1.7477543354034424, + "logps/rejected": -1.8734235763549805, + "loss": 2.246, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.477542877197266, + "rewards/margins": 1.2566947937011719, + "rewards/rejected": -18.734233856201172, + "step": 6020 + }, + { + "epoch": 0.20307391553473322, + "grad_norm": 15.599130630493164, + "learning_rate": 9.67986196239361e-07, + "logits/chosen": -0.0006256193155422807, + "logits/rejected": 0.09851661324501038, + "logps/chosen": -2.0748696327209473, + "logps/rejected": -2.4163742065429688, + "loss": 2.5649, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.748693466186523, + "rewards/margins": 3.4150452613830566, + "rewards/rejected": -24.163740158081055, + "step": 6025 + }, + { + "epoch": 0.20324244160571642, + "grad_norm": 23.84718894958496, + "learning_rate": 9.67882558306225e-07, + "logits/chosen": -0.45076194405555725, + "logits/rejected": -0.6076852679252625, + "logps/chosen": -1.6174421310424805, + "logps/rejected": -1.6184570789337158, + "loss": 3.1546, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.174421310424805, + "rewards/margins": 0.010150337591767311, + "rewards/rejected": -16.184572219848633, + "step": 6030 + }, + { + "epoch": 0.20341096767669958, + "grad_norm": 24.911104202270508, + "learning_rate": 9.677787584573693e-07, + "logits/chosen": -0.6882834434509277, + "logits/rejected": -0.7800687551498413, + "logps/chosen": -1.8254387378692627, + "logps/rejected": -1.8409912586212158, + "loss": 3.1078, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.254384994506836, + "rewards/margins": 0.1555270254611969, + "rewards/rejected": -18.409914016723633, + "step": 6035 + }, + { + "epoch": 0.20357949374768278, + "grad_norm": 19.878774642944336, + "learning_rate": 9.676747967287153e-07, + "logits/chosen": -0.21769729256629944, + "logits/rejected": -0.21601931750774384, + "logps/chosen": -2.1479928493499756, + "logps/rejected": -2.5670371055603027, + "loss": 2.5418, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.479928970336914, + "rewards/margins": 4.190441131591797, + "rewards/rejected": -25.67037010192871, + "step": 6040 + }, + { + "epoch": 0.20374801981866594, + "grad_norm": 26.25444793701172, + "learning_rate": 9.675706731562395e-07, + "logits/chosen": -0.12167356163263321, + "logits/rejected": -0.1530594676733017, + "logps/chosen": -2.234086275100708, + "logps/rejected": -2.3034989833831787, + "loss": 2.8796, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.340862274169922, + "rewards/margins": 0.6941286325454712, + "rewards/rejected": -23.034990310668945, + "step": 6045 + }, + { + "epoch": 0.20391654588964914, + "grad_norm": 33.240116119384766, + "learning_rate": 9.674663877759758e-07, + "logits/chosen": -0.15784066915512085, + "logits/rejected": -0.27910029888153076, + "logps/chosen": -1.6888351440429688, + "logps/rejected": -1.777195692062378, + "loss": 2.5451, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.888351440429688, + "rewards/margins": 0.8836051225662231, + "rewards/rejected": -17.771955490112305, + "step": 6050 + }, + { + "epoch": 0.2040850719606323, + "grad_norm": 37.929264068603516, + "learning_rate": 9.673619406240122e-07, + "logits/chosen": -0.8239002227783203, + "logits/rejected": -0.8135004043579102, + "logps/chosen": -1.9584920406341553, + "logps/rejected": -1.9400832653045654, + "loss": 3.2595, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.58492088317871, + "rewards/margins": -0.1840866059064865, + "rewards/rejected": -19.400833129882812, + "step": 6055 + }, + { + "epoch": 0.2042535980316155, + "grad_norm": 15.64293098449707, + "learning_rate": 9.672573317364945e-07, + "logits/chosen": -0.4086344838142395, + "logits/rejected": -0.49510034918785095, + "logps/chosen": -1.9705301523208618, + "logps/rejected": -1.976130485534668, + "loss": 3.5653, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.705303192138672, + "rewards/margins": 0.056003473699092865, + "rewards/rejected": -19.76130485534668, + "step": 6060 + }, + { + "epoch": 0.20442212410259866, + "grad_norm": 99.170654296875, + "learning_rate": 9.671525611496235e-07, + "logits/chosen": -0.6581277847290039, + "logits/rejected": -0.6759839653968811, + "logps/chosen": -2.240769147872925, + "logps/rejected": -2.044039011001587, + "loss": 5.0291, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.407690048217773, + "rewards/margins": -1.9673011302947998, + "rewards/rejected": -20.440387725830078, + "step": 6065 + }, + { + "epoch": 0.20459065017358186, + "grad_norm": 28.03131675720215, + "learning_rate": 9.67047628899656e-07, + "logits/chosen": -0.5428565740585327, + "logits/rejected": -0.6084557175636292, + "logps/chosen": -1.6421184539794922, + "logps/rejected": -1.7033920288085938, + "loss": 2.8575, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.421184539794922, + "rewards/margins": 0.6127360463142395, + "rewards/rejected": -17.033920288085938, + "step": 6070 + }, + { + "epoch": 0.20475917624456502, + "grad_norm": 18.674060821533203, + "learning_rate": 9.66942535022905e-07, + "logits/chosen": -0.46286773681640625, + "logits/rejected": -0.45752209424972534, + "logps/chosen": -1.7388484477996826, + "logps/rejected": -1.9831173419952393, + "loss": 2.4786, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.388484954833984, + "rewards/margins": 2.4426865577697754, + "rewards/rejected": -19.8311710357666, + "step": 6075 + }, + { + "epoch": 0.20492770231554822, + "grad_norm": 19.101802825927734, + "learning_rate": 9.668372795557398e-07, + "logits/chosen": -0.734752357006073, + "logits/rejected": -0.9037183523178101, + "logps/chosen": -1.6903842687606812, + "logps/rejected": -1.6715500354766846, + "loss": 3.2694, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.90384292602539, + "rewards/margins": -0.1883423775434494, + "rewards/rejected": -16.71550178527832, + "step": 6080 + }, + { + "epoch": 0.2050962283865314, + "grad_norm": 38.23634719848633, + "learning_rate": 9.667318625345847e-07, + "logits/chosen": -0.4373010993003845, + "logits/rejected": -0.30279842019081116, + "logps/chosen": -1.723619818687439, + "logps/rejected": -1.8654381036758423, + "loss": 2.4254, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.2362003326416, + "rewards/margins": 1.4181811809539795, + "rewards/rejected": -18.65437889099121, + "step": 6085 + }, + { + "epoch": 0.20526475445751458, + "grad_norm": 15.726313591003418, + "learning_rate": 9.666262839959203e-07, + "logits/chosen": -0.5284382700920105, + "logits/rejected": -0.5790424346923828, + "logps/chosen": -1.8899242877960205, + "logps/rejected": -2.023179531097412, + "loss": 2.2101, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.899242401123047, + "rewards/margins": 1.3325533866882324, + "rewards/rejected": -20.231792449951172, + "step": 6090 + }, + { + "epoch": 0.20543328052849777, + "grad_norm": 25.096664428710938, + "learning_rate": 9.665205439762833e-07, + "logits/chosen": -0.3775356709957123, + "logits/rejected": -0.22368212044239044, + "logps/chosen": -2.0895133018493652, + "logps/rejected": -2.1160058975219727, + "loss": 3.4497, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.89513397216797, + "rewards/margins": 0.2649245262145996, + "rewards/rejected": -21.160058975219727, + "step": 6095 + }, + { + "epoch": 0.20560180659948094, + "grad_norm": 86.66022491455078, + "learning_rate": 9.664146425122664e-07, + "logits/chosen": -0.624174952507019, + "logits/rejected": -0.5643637776374817, + "logps/chosen": -2.0391335487365723, + "logps/rejected": -2.046374797821045, + "loss": 3.3601, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.391334533691406, + "rewards/margins": 0.0724119171500206, + "rewards/rejected": -20.4637451171875, + "step": 6100 + }, + { + "epoch": 0.20577033267046413, + "grad_norm": 29.608823776245117, + "learning_rate": 9.663085796405177e-07, + "logits/chosen": -0.8779303431510925, + "logits/rejected": -0.7524505257606506, + "logps/chosen": -1.580993413925171, + "logps/rejected": -1.547828197479248, + "loss": 3.3839, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.809934616088867, + "rewards/margins": -0.33165159821510315, + "rewards/rejected": -15.47828197479248, + "step": 6105 + }, + { + "epoch": 0.2059388587414473, + "grad_norm": 27.97531509399414, + "learning_rate": 9.662023553977414e-07, + "logits/chosen": -0.4506549835205078, + "logits/rejected": -0.5821264386177063, + "logps/chosen": -1.843197226524353, + "logps/rejected": -1.8881915807724, + "loss": 3.2041, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.431970596313477, + "rewards/margins": 0.449946790933609, + "rewards/rejected": -18.88191795349121, + "step": 6110 + }, + { + "epoch": 0.2061073848124305, + "grad_norm": 36.354248046875, + "learning_rate": 9.660959698206977e-07, + "logits/chosen": -0.21553261578083038, + "logits/rejected": -0.3534262776374817, + "logps/chosen": -2.1483542919158936, + "logps/rejected": -2.0875895023345947, + "loss": 3.7702, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.48354148864746, + "rewards/margins": -0.6076488494873047, + "rewards/rejected": -20.87589454650879, + "step": 6115 + }, + { + "epoch": 0.20627591088341365, + "grad_norm": 26.448593139648438, + "learning_rate": 9.65989422946202e-07, + "logits/chosen": -0.507719099521637, + "logits/rejected": -0.4437492787837982, + "logps/chosen": -1.8520978689193726, + "logps/rejected": -2.002349853515625, + "loss": 3.574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.520977020263672, + "rewards/margins": 1.5025217533111572, + "rewards/rejected": -20.02349853515625, + "step": 6120 + }, + { + "epoch": 0.20644443695439685, + "grad_norm": 18.775527954101562, + "learning_rate": 9.658827148111263e-07, + "logits/chosen": -0.5065479278564453, + "logits/rejected": -0.434520959854126, + "logps/chosen": -2.0218491554260254, + "logps/rejected": -2.132096767425537, + "loss": 2.2438, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.218490600585938, + "rewards/margins": 1.102478265762329, + "rewards/rejected": -21.320964813232422, + "step": 6125 + }, + { + "epoch": 0.20661296302538001, + "grad_norm": 23.01225471496582, + "learning_rate": 9.657758454523983e-07, + "logits/chosen": -0.2993611991405487, + "logits/rejected": -0.33458638191223145, + "logps/chosen": -1.8482387065887451, + "logps/rejected": -1.9364979267120361, + "loss": 2.8185, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.48238754272461, + "rewards/margins": 0.8825904726982117, + "rewards/rejected": -19.364978790283203, + "step": 6130 + }, + { + "epoch": 0.2067814890963632, + "grad_norm": 35.34732437133789, + "learning_rate": 9.656688149070006e-07, + "logits/chosen": -0.7763724327087402, + "logits/rejected": -0.8895975947380066, + "logps/chosen": -1.8610140085220337, + "logps/rejected": -1.8075506687164307, + "loss": 3.7522, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.610139846801758, + "rewards/margins": -0.5346325635910034, + "rewards/rejected": -18.075504302978516, + "step": 6135 + }, + { + "epoch": 0.2069500151673464, + "grad_norm": 33.043312072753906, + "learning_rate": 9.65561623211973e-07, + "logits/chosen": -0.7509564757347107, + "logits/rejected": -0.6484511494636536, + "logps/chosen": -1.8732191324234009, + "logps/rejected": -1.766880989074707, + "loss": 4.2271, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.732189178466797, + "rewards/margins": -1.063381552696228, + "rewards/rejected": -17.66880989074707, + "step": 6140 + }, + { + "epoch": 0.20711854123832957, + "grad_norm": 21.4063663482666, + "learning_rate": 9.6545427040441e-07, + "logits/chosen": -0.6801129579544067, + "logits/rejected": -0.6815675497055054, + "logps/chosen": -2.0237112045288086, + "logps/rejected": -1.9940065145492554, + "loss": 3.3582, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.237112045288086, + "rewards/margins": -0.2970461845397949, + "rewards/rejected": -19.940067291259766, + "step": 6145 + }, + { + "epoch": 0.20728706730931276, + "grad_norm": 29.270660400390625, + "learning_rate": 9.653467565214622e-07, + "logits/chosen": -0.7824691534042358, + "logits/rejected": -0.8962273597717285, + "logps/chosen": -1.485214114189148, + "logps/rejected": -1.6048316955566406, + "loss": 2.268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.852142333984375, + "rewards/margins": 1.1961743831634521, + "rewards/rejected": -16.048315048217773, + "step": 6150 + }, + { + "epoch": 0.20745559338029593, + "grad_norm": 19.40476417541504, + "learning_rate": 9.652390816003357e-07, + "logits/chosen": -0.6924790143966675, + "logits/rejected": -0.7991618514060974, + "logps/chosen": -1.395509123802185, + "logps/rejected": -1.5696979761123657, + "loss": 1.7101, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -13.95509147644043, + "rewards/margins": 1.741887092590332, + "rewards/rejected": -15.696978569030762, + "step": 6155 + }, + { + "epoch": 0.20762411945127912, + "grad_norm": 15.866957664489746, + "learning_rate": 9.65131245678293e-07, + "logits/chosen": -0.7179878354072571, + "logits/rejected": -0.5750179290771484, + "logps/chosen": -1.5896713733673096, + "logps/rejected": -1.6341173648834229, + "loss": 3.1135, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.89671516418457, + "rewards/margins": 0.4444583058357239, + "rewards/rejected": -16.341171264648438, + "step": 6160 + }, + { + "epoch": 0.2077926455222623, + "grad_norm": 19.111637115478516, + "learning_rate": 9.650232487926514e-07, + "logits/chosen": -0.7711466550827026, + "logits/rejected": -0.8609915971755981, + "logps/chosen": -1.733947515487671, + "logps/rejected": -1.926256775856018, + "loss": 1.9845, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.3394775390625, + "rewards/margins": 1.9230899810791016, + "rewards/rejected": -19.262563705444336, + "step": 6165 + }, + { + "epoch": 0.20796117159324548, + "grad_norm": 20.010040283203125, + "learning_rate": 9.649150909807847e-07, + "logits/chosen": -0.8053653836250305, + "logits/rejected": -0.7153798341751099, + "logps/chosen": -2.1477370262145996, + "logps/rejected": -2.236959934234619, + "loss": 2.813, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.477371215820312, + "rewards/margins": 0.892225444316864, + "rewards/rejected": -22.369596481323242, + "step": 6170 + }, + { + "epoch": 0.20812969766422865, + "grad_norm": 14.432764053344727, + "learning_rate": 9.64806772280122e-07, + "logits/chosen": -0.39327144622802734, + "logits/rejected": -0.4320458769798279, + "logps/chosen": -1.6436790227890015, + "logps/rejected": -1.7568355798721313, + "loss": 2.5333, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.43678855895996, + "rewards/margins": 1.1315654516220093, + "rewards/rejected": -17.568355560302734, + "step": 6175 + }, + { + "epoch": 0.20829822373521184, + "grad_norm": 30.609670639038086, + "learning_rate": 9.646982927281479e-07, + "logits/chosen": -0.6090951561927795, + "logits/rejected": -0.634852945804596, + "logps/chosen": -1.7177422046661377, + "logps/rejected": -1.7539911270141602, + "loss": 3.2011, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.177419662475586, + "rewards/margins": 0.36249056458473206, + "rewards/rejected": -17.5399112701416, + "step": 6180 + }, + { + "epoch": 0.208466749806195, + "grad_norm": 29.538747787475586, + "learning_rate": 9.64589652362403e-07, + "logits/chosen": -0.7170549631118774, + "logits/rejected": -0.6587635278701782, + "logps/chosen": -1.4641923904418945, + "logps/rejected": -1.5876529216766357, + "loss": 2.7722, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.641921997070312, + "rewards/margins": 1.2346062660217285, + "rewards/rejected": -15.8765287399292, + "step": 6185 + }, + { + "epoch": 0.2086352758771782, + "grad_norm": 37.199344635009766, + "learning_rate": 9.644808512204837e-07, + "logits/chosen": -0.5983039140701294, + "logits/rejected": -0.7115954756736755, + "logps/chosen": -1.6693763732910156, + "logps/rejected": -1.6038440465927124, + "loss": 3.7817, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.693761825561523, + "rewards/margins": -0.6553219556808472, + "rewards/rejected": -16.038440704345703, + "step": 6190 + }, + { + "epoch": 0.2088038019481614, + "grad_norm": 21.11469841003418, + "learning_rate": 9.643718893400416e-07, + "logits/chosen": -0.6305486559867859, + "logits/rejected": -0.46515828371047974, + "logps/chosen": -1.977230429649353, + "logps/rejected": -2.1082019805908203, + "loss": 2.644, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.77230453491211, + "rewards/margins": 1.3097164630889893, + "rewards/rejected": -21.082019805908203, + "step": 6195 + }, + { + "epoch": 0.20897232801914456, + "grad_norm": 25.193111419677734, + "learning_rate": 9.642627667587842e-07, + "logits/chosen": -0.38711774349212646, + "logits/rejected": -0.2904582619667053, + "logps/chosen": -1.7007389068603516, + "logps/rejected": -1.7201976776123047, + "loss": 3.6035, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.007389068603516, + "rewards/margins": 0.19458922743797302, + "rewards/rejected": -17.201976776123047, + "step": 6200 + }, + { + "epoch": 0.20914085409012775, + "grad_norm": 22.27320098876953, + "learning_rate": 9.641534835144742e-07, + "logits/chosen": -0.6792038679122925, + "logits/rejected": -0.6867347955703735, + "logps/chosen": -1.9890989065170288, + "logps/rejected": -1.9915831089019775, + "loss": 3.3895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.890989303588867, + "rewards/margins": 0.024839973077178, + "rewards/rejected": -19.915828704833984, + "step": 6205 + }, + { + "epoch": 0.20930938016111092, + "grad_norm": 18.266559600830078, + "learning_rate": 9.640440396449304e-07, + "logits/chosen": -0.7111789584159851, + "logits/rejected": -0.691353440284729, + "logps/chosen": -1.796891212463379, + "logps/rejected": -1.7614777088165283, + "loss": 3.5483, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.968910217285156, + "rewards/margins": -0.35413503646850586, + "rewards/rejected": -17.614776611328125, + "step": 6210 + }, + { + "epoch": 0.2094779062320941, + "grad_norm": 29.486312866210938, + "learning_rate": 9.639344351880276e-07, + "logits/chosen": -0.19421645998954773, + "logits/rejected": -0.12176599353551865, + "logps/chosen": -1.9066638946533203, + "logps/rejected": -2.329479694366455, + "loss": 2.2476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.066638946533203, + "rewards/margins": 4.228161334991455, + "rewards/rejected": -23.294797897338867, + "step": 6215 + }, + { + "epoch": 0.20964643230307728, + "grad_norm": 14.711030006408691, + "learning_rate": 9.638246701816946e-07, + "logits/chosen": -0.7750159502029419, + "logits/rejected": -0.7665907740592957, + "logps/chosen": -1.6569950580596924, + "logps/rejected": -1.7536866664886475, + "loss": 2.292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.569950103759766, + "rewards/margins": 0.966915488243103, + "rewards/rejected": -17.536867141723633, + "step": 6220 + }, + { + "epoch": 0.20981495837406047, + "grad_norm": 19.998607635498047, + "learning_rate": 9.637147446639172e-07, + "logits/chosen": -0.7628619074821472, + "logits/rejected": -0.639403223991394, + "logps/chosen": -1.9863265752792358, + "logps/rejected": -2.026005268096924, + "loss": 2.8586, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.863265991210938, + "rewards/margins": 0.3967866003513336, + "rewards/rejected": -20.260051727294922, + "step": 6225 + }, + { + "epoch": 0.20998348444504364, + "grad_norm": 23.68602752685547, + "learning_rate": 9.636046586727366e-07, + "logits/chosen": -0.6806201934814453, + "logits/rejected": -0.46249714493751526, + "logps/chosen": -1.8867038488388062, + "logps/rejected": -1.9447044134140015, + "loss": 3.3294, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.86703872680664, + "rewards/margins": 0.5800049901008606, + "rewards/rejected": -19.44704246520996, + "step": 6230 + }, + { + "epoch": 0.21015201051602683, + "grad_norm": 18.60115623474121, + "learning_rate": 9.63494412246249e-07, + "logits/chosen": -0.2512991428375244, + "logits/rejected": -0.20353391766548157, + "logps/chosen": -2.148930549621582, + "logps/rejected": -2.354224920272827, + "loss": 1.8338, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.489307403564453, + "rewards/margins": 2.052943706512451, + "rewards/rejected": -23.542251586914062, + "step": 6235 + }, + { + "epoch": 0.21032053658701, + "grad_norm": 21.855871200561523, + "learning_rate": 9.63384005422606e-07, + "logits/chosen": -0.365181028842926, + "logits/rejected": -0.33115094900131226, + "logps/chosen": -2.059521436691284, + "logps/rejected": -2.308326005935669, + "loss": 2.1597, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.595216751098633, + "rewards/margins": 2.4880428314208984, + "rewards/rejected": -23.0832576751709, + "step": 6240 + }, + { + "epoch": 0.2104890626579932, + "grad_norm": 43.752532958984375, + "learning_rate": 9.632734382400154e-07, + "logits/chosen": -0.21305397152900696, + "logits/rejected": -0.3376748859882355, + "logps/chosen": -1.9535162448883057, + "logps/rejected": -1.8234503269195557, + "loss": 4.429, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.5351619720459, + "rewards/margins": -1.300659418106079, + "rewards/rejected": -18.234500885009766, + "step": 6245 + }, + { + "epoch": 0.21065758872897639, + "grad_norm": 18.581083297729492, + "learning_rate": 9.6316271073674e-07, + "logits/chosen": -0.9776862263679504, + "logits/rejected": -1.0578795671463013, + "logps/chosen": -1.5479357242584229, + "logps/rejected": -1.4699079990386963, + "loss": 3.9844, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.479357719421387, + "rewards/margins": -0.7802785634994507, + "rewards/rejected": -14.699081420898438, + "step": 6250 + }, + { + "epoch": 0.21082611479995955, + "grad_norm": 42.386260986328125, + "learning_rate": 9.630518229510984e-07, + "logits/chosen": -0.4895743727684021, + "logits/rejected": -0.546970784664154, + "logps/chosen": -1.7075220346450806, + "logps/rejected": -1.6165168285369873, + "loss": 4.2254, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.075220108032227, + "rewards/margins": -0.9100503921508789, + "rewards/rejected": -16.16516876220703, + "step": 6255 + }, + { + "epoch": 0.21099464087094275, + "grad_norm": 30.77920913696289, + "learning_rate": 9.629407749214643e-07, + "logits/chosen": -0.13082917034626007, + "logits/rejected": -0.14675593376159668, + "logps/chosen": -2.299105167388916, + "logps/rejected": -2.578866958618164, + "loss": 2.8197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.991052627563477, + "rewards/margins": 2.797618865966797, + "rewards/rejected": -25.788671493530273, + "step": 6260 + }, + { + "epoch": 0.2111631669419259, + "grad_norm": 26.253183364868164, + "learning_rate": 9.628295666862672e-07, + "logits/chosen": -0.48517459630966187, + "logits/rejected": -0.4192652702331543, + "logps/chosen": -1.9712200164794922, + "logps/rejected": -1.9250516891479492, + "loss": 3.5636, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.712200164794922, + "rewards/margins": -0.4616851806640625, + "rewards/rejected": -19.25051498413086, + "step": 6265 + }, + { + "epoch": 0.2113316930129091, + "grad_norm": 45.756656646728516, + "learning_rate": 9.627181982839918e-07, + "logits/chosen": -0.760116696357727, + "logits/rejected": -0.814267635345459, + "logps/chosen": -1.9276320934295654, + "logps/rejected": -2.051849842071533, + "loss": 2.378, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.276317596435547, + "rewards/margins": 1.242180347442627, + "rewards/rejected": -20.51849937438965, + "step": 6270 + }, + { + "epoch": 0.21150021908389227, + "grad_norm": 21.75693702697754, + "learning_rate": 9.626066697531784e-07, + "logits/chosen": -0.4948394298553467, + "logits/rejected": -0.5552079081535339, + "logps/chosen": -1.6278167963027954, + "logps/rejected": -1.7882741689682007, + "loss": 2.2152, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.278165817260742, + "rewards/margins": 1.6045728921890259, + "rewards/rejected": -17.882740020751953, + "step": 6275 + }, + { + "epoch": 0.21166874515487547, + "grad_norm": 28.453824996948242, + "learning_rate": 9.624949811324226e-07, + "logits/chosen": -0.07232952117919922, + "logits/rejected": -0.025306105613708496, + "logps/chosen": -2.205857515335083, + "logps/rejected": -2.5525448322296143, + "loss": 2.6162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.058574676513672, + "rewards/margins": 3.4668731689453125, + "rewards/rejected": -25.525447845458984, + "step": 6280 + }, + { + "epoch": 0.21183727122585863, + "grad_norm": 21.348684310913086, + "learning_rate": 9.623831324603752e-07, + "logits/chosen": -0.3630000054836273, + "logits/rejected": -0.35038089752197266, + "logps/chosen": -1.8361036777496338, + "logps/rejected": -1.7641780376434326, + "loss": 3.8496, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.361034393310547, + "rewards/margins": -0.7192561030387878, + "rewards/rejected": -17.641780853271484, + "step": 6285 + }, + { + "epoch": 0.21200579729684182, + "grad_norm": 47.256561279296875, + "learning_rate": 9.62271123775743e-07, + "logits/chosen": -0.39444655179977417, + "logits/rejected": -0.3994792401790619, + "logps/chosen": -2.297093152999878, + "logps/rejected": -2.3859825134277344, + "loss": 2.4176, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.970930099487305, + "rewards/margins": 0.8888934850692749, + "rewards/rejected": -23.859825134277344, + "step": 6290 + }, + { + "epoch": 0.212174323367825, + "grad_norm": 37.113338470458984, + "learning_rate": 9.621589551172875e-07, + "logits/chosen": -0.5275182723999023, + "logits/rejected": -0.49484142661094666, + "logps/chosen": -1.9655338525772095, + "logps/rejected": -1.872588872909546, + "loss": 4.0064, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.655338287353516, + "rewards/margins": -0.9294483065605164, + "rewards/rejected": -18.725889205932617, + "step": 6295 + }, + { + "epoch": 0.21234284943880818, + "grad_norm": 33.60499572753906, + "learning_rate": 9.620466265238261e-07, + "logits/chosen": -0.40655916929244995, + "logits/rejected": -0.3757340908050537, + "logps/chosen": -1.8657829761505127, + "logps/rejected": -1.9118738174438477, + "loss": 2.9708, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.65782928466797, + "rewards/margins": 0.4609087109565735, + "rewards/rejected": -19.118738174438477, + "step": 6300 + }, + { + "epoch": 0.21251137550979138, + "grad_norm": 19.811283111572266, + "learning_rate": 9.619341380342312e-07, + "logits/chosen": -0.7586280107498169, + "logits/rejected": -0.7591967582702637, + "logps/chosen": -1.701525092124939, + "logps/rejected": -1.781437635421753, + "loss": 2.4697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.0152530670166, + "rewards/margins": 0.7991257905960083, + "rewards/rejected": -17.814376831054688, + "step": 6305 + }, + { + "epoch": 0.21267990158077454, + "grad_norm": 38.60699462890625, + "learning_rate": 9.618214896874305e-07, + "logits/chosen": -0.4628763794898987, + "logits/rejected": -0.6883228421211243, + "logps/chosen": -1.6286872625350952, + "logps/rejected": -1.568355917930603, + "loss": 3.7782, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.2868709564209, + "rewards/margins": -0.6033118367195129, + "rewards/rejected": -15.683561325073242, + "step": 6310 + }, + { + "epoch": 0.21284842765175774, + "grad_norm": 42.60224151611328, + "learning_rate": 9.617086815224072e-07, + "logits/chosen": -0.17860253155231476, + "logits/rejected": -0.10489163547754288, + "logps/chosen": -2.4953255653381348, + "logps/rejected": -2.519314765930176, + "loss": 4.2209, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.953256607055664, + "rewards/margins": 0.23989124596118927, + "rewards/rejected": -25.19314956665039, + "step": 6315 + }, + { + "epoch": 0.2130169537227409, + "grad_norm": 39.443023681640625, + "learning_rate": 9.615957135782e-07, + "logits/chosen": -0.7516977190971375, + "logits/rejected": -0.752848744392395, + "logps/chosen": -1.9006750583648682, + "logps/rejected": -1.9128259420394897, + "loss": 3.3723, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.00674819946289, + "rewards/margins": 0.12151069939136505, + "rewards/rejected": -19.12826156616211, + "step": 6320 + }, + { + "epoch": 0.2131854797937241, + "grad_norm": 111.87989044189453, + "learning_rate": 9.614825858939023e-07, + "logits/chosen": -0.3990008533000946, + "logits/rejected": -0.5169636607170105, + "logps/chosen": -1.9904636144638062, + "logps/rejected": -2.0598185062408447, + "loss": 2.5614, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.90463638305664, + "rewards/margins": 0.6935473680496216, + "rewards/rejected": -20.59818458557129, + "step": 6325 + }, + { + "epoch": 0.21335400586470726, + "grad_norm": 56.99147415161133, + "learning_rate": 9.613692985086634e-07, + "logits/chosen": -0.044111065566539764, + "logits/rejected": -0.09141120314598083, + "logps/chosen": -2.312798023223877, + "logps/rejected": -2.367541790008545, + "loss": 2.8679, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.127979278564453, + "rewards/margins": 0.5474358797073364, + "rewards/rejected": -23.6754150390625, + "step": 6330 + }, + { + "epoch": 0.21352253193569046, + "grad_norm": 17.781930923461914, + "learning_rate": 9.612558514616874e-07, + "logits/chosen": -0.6025907397270203, + "logits/rejected": -0.6184767484664917, + "logps/chosen": -2.0852742195129395, + "logps/rejected": -2.31107497215271, + "loss": 2.4546, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.85274314880371, + "rewards/margins": 2.258007049560547, + "rewards/rejected": -23.110750198364258, + "step": 6335 + }, + { + "epoch": 0.21369105800667362, + "grad_norm": 38.529624938964844, + "learning_rate": 9.61142244792234e-07, + "logits/chosen": -0.5689278841018677, + "logits/rejected": -0.6688274145126343, + "logps/chosen": -1.6294472217559814, + "logps/rejected": -1.7487682104110718, + "loss": 2.5781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.29447364807129, + "rewards/margins": 1.1932106018066406, + "rewards/rejected": -17.487682342529297, + "step": 6340 + }, + { + "epoch": 0.21385958407765682, + "grad_norm": 13.591431617736816, + "learning_rate": 9.610284785396182e-07, + "logits/chosen": -0.7865332365036011, + "logits/rejected": -0.7709859609603882, + "logps/chosen": -1.7249629497528076, + "logps/rejected": -1.8565632104873657, + "loss": 2.1548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.249629974365234, + "rewards/margins": 1.3160030841827393, + "rewards/rejected": -18.56563377380371, + "step": 6345 + }, + { + "epoch": 0.21402811014863998, + "grad_norm": 25.202449798583984, + "learning_rate": 9.609145527432096e-07, + "logits/chosen": -0.5416828393936157, + "logits/rejected": -0.5098170042037964, + "logps/chosen": -1.8249015808105469, + "logps/rejected": -1.9837112426757812, + "loss": 2.8382, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.249013900756836, + "rewards/margins": 1.5880978107452393, + "rewards/rejected": -19.837112426757812, + "step": 6350 + }, + { + "epoch": 0.21419663621962318, + "grad_norm": 32.652381896972656, + "learning_rate": 9.608004674424336e-07, + "logits/chosen": -0.4759892523288727, + "logits/rejected": -0.26175767183303833, + "logps/chosen": -1.7728564739227295, + "logps/rejected": -1.8391892910003662, + "loss": 2.9791, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.728567123413086, + "rewards/margins": 0.6633265614509583, + "rewards/rejected": -18.391895294189453, + "step": 6355 + }, + { + "epoch": 0.21436516229060637, + "grad_norm": 16.489791870117188, + "learning_rate": 9.606862226767706e-07, + "logits/chosen": -0.32876458764076233, + "logits/rejected": -0.36927470564842224, + "logps/chosen": -2.0482099056243896, + "logps/rejected": -2.105473279953003, + "loss": 2.6771, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.482101440429688, + "rewards/margins": 0.5726326107978821, + "rewards/rejected": -21.05473518371582, + "step": 6360 + }, + { + "epoch": 0.21453368836158954, + "grad_norm": 23.093629837036133, + "learning_rate": 9.605718184857563e-07, + "logits/chosen": -0.4692727029323578, + "logits/rejected": -0.586158275604248, + "logps/chosen": -1.8002586364746094, + "logps/rejected": -1.9701658487319946, + "loss": 1.7385, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.002586364746094, + "rewards/margins": 1.6990737915039062, + "rewards/rejected": -19.70166015625, + "step": 6365 + }, + { + "epoch": 0.21470221443257273, + "grad_norm": 34.142677307128906, + "learning_rate": 9.604572549089812e-07, + "logits/chosen": -0.4840649664402008, + "logits/rejected": -0.4249550700187683, + "logps/chosen": -1.9199146032333374, + "logps/rejected": -2.041658878326416, + "loss": 3.2644, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.19914436340332, + "rewards/margins": 1.2174437046051025, + "rewards/rejected": -20.416587829589844, + "step": 6370 + }, + { + "epoch": 0.2148707405035559, + "grad_norm": 28.089157104492188, + "learning_rate": 9.603425319860918e-07, + "logits/chosen": -0.504118800163269, + "logits/rejected": -0.5168687105178833, + "logps/chosen": -2.0993189811706543, + "logps/rejected": -1.6906509399414062, + "loss": 7.128, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.993188858032227, + "rewards/margins": -4.086681365966797, + "rewards/rejected": -16.90650749206543, + "step": 6375 + }, + { + "epoch": 0.2150392665745391, + "grad_norm": 27.385347366333008, + "learning_rate": 9.602276497567887e-07, + "logits/chosen": -0.053630925714969635, + "logits/rejected": -0.29481998085975647, + "logps/chosen": -1.8843891620635986, + "logps/rejected": -2.11942982673645, + "loss": 2.0263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.84389305114746, + "rewards/margins": 2.35040545463562, + "rewards/rejected": -21.19429588317871, + "step": 6380 + }, + { + "epoch": 0.21520779264552226, + "grad_norm": 22.69534683227539, + "learning_rate": 9.601126082608285e-07, + "logits/chosen": -0.4440391957759857, + "logits/rejected": -0.5650304555892944, + "logps/chosen": -1.7570676803588867, + "logps/rejected": -1.7968946695327759, + "loss": 3.0257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.570674896240234, + "rewards/margins": 0.39827051758766174, + "rewards/rejected": -17.968944549560547, + "step": 6385 + }, + { + "epoch": 0.21537631871650545, + "grad_norm": 28.62639617919922, + "learning_rate": 9.59997407538022e-07, + "logits/chosen": -0.6663795113563538, + "logits/rejected": -0.7694646120071411, + "logps/chosen": -1.6902217864990234, + "logps/rejected": -1.7215359210968018, + "loss": 2.8422, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.902217864990234, + "rewards/margins": 0.3131416440010071, + "rewards/rejected": -17.21535873413086, + "step": 6390 + }, + { + "epoch": 0.21554484478748862, + "grad_norm": 23.405054092407227, + "learning_rate": 9.59882047628236e-07, + "logits/chosen": -0.22101497650146484, + "logits/rejected": -0.30498817563056946, + "logps/chosen": -2.058715581893921, + "logps/rejected": -2.188988447189331, + "loss": 2.2117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.587156295776367, + "rewards/margins": 1.3027280569076538, + "rewards/rejected": -21.889883041381836, + "step": 6395 + }, + { + "epoch": 0.2157133708584718, + "grad_norm": 20.8488712310791, + "learning_rate": 9.59766528571392e-07, + "logits/chosen": -0.7016115188598633, + "logits/rejected": -0.8231936693191528, + "logps/chosen": -1.582484483718872, + "logps/rejected": -1.6375694274902344, + "loss": 2.8576, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.824844360351562, + "rewards/margins": 0.5508493185043335, + "rewards/rejected": -16.375694274902344, + "step": 6400 + }, + { + "epoch": 0.2157133708584718, + "eval_logits/chosen": -0.8260197639465332, + "eval_logits/rejected": -0.8562415838241577, + "eval_logps/chosen": -1.7505967617034912, + "eval_logps/rejected": -1.7813204526901245, + "eval_loss": 3.2381811141967773, + "eval_rewards/accuracies": 0.550000011920929, + "eval_rewards/chosen": -17.505966186523438, + "eval_rewards/margins": 0.3072388768196106, + "eval_rewards/rejected": -17.813203811645508, + "eval_runtime": 12.902, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 6400 + }, + { + "epoch": 0.21588189692945497, + "grad_norm": 49.400997161865234, + "learning_rate": 9.596508504074664e-07, + "logits/chosen": -0.20146696269512177, + "logits/rejected": -0.20826086401939392, + "logps/chosen": -2.2696011066436768, + "logps/rejected": -2.6526265144348145, + "loss": 2.9121, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.696012496948242, + "rewards/margins": 3.8302536010742188, + "rewards/rejected": -26.52626609802246, + "step": 6405 + }, + { + "epoch": 0.21605042300043817, + "grad_norm": 24.17193603515625, + "learning_rate": 9.595350131764911e-07, + "logits/chosen": -0.8560646176338196, + "logits/rejected": -0.9106782674789429, + "logps/chosen": -1.775774359703064, + "logps/rejected": -1.9019801616668701, + "loss": 2.6313, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.75774383544922, + "rewards/margins": 1.262058973312378, + "rewards/rejected": -19.01980209350586, + "step": 6410 + }, + { + "epoch": 0.21621894907142136, + "grad_norm": 22.19492530822754, + "learning_rate": 9.594190169185528e-07, + "logits/chosen": -0.9066619873046875, + "logits/rejected": -0.865528404712677, + "logps/chosen": -1.8104702234268188, + "logps/rejected": -1.7503440380096436, + "loss": 3.6651, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.104700088500977, + "rewards/margins": -0.6012603640556335, + "rewards/rejected": -17.503442764282227, + "step": 6415 + }, + { + "epoch": 0.21638747514240453, + "grad_norm": 14.114916801452637, + "learning_rate": 9.593028616737929e-07, + "logits/chosen": -0.731952965259552, + "logits/rejected": -0.7744450569152832, + "logps/chosen": -1.712438941001892, + "logps/rejected": -1.7795616388320923, + "loss": 2.6212, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.1243896484375, + "rewards/margins": 0.6712266206741333, + "rewards/rejected": -17.795616149902344, + "step": 6420 + }, + { + "epoch": 0.21655600121338772, + "grad_norm": 19.506546020507812, + "learning_rate": 9.591865474824084e-07, + "logits/chosen": -0.6614322662353516, + "logits/rejected": -0.6236995458602905, + "logps/chosen": -1.8650777339935303, + "logps/rejected": -1.9538198709487915, + "loss": 2.4345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.650775909423828, + "rewards/margins": 0.887421727180481, + "rewards/rejected": -19.538198471069336, + "step": 6425 + }, + { + "epoch": 0.2167245272843709, + "grad_norm": 21.893081665039062, + "learning_rate": 9.590700743846511e-07, + "logits/chosen": -0.4665060043334961, + "logits/rejected": -0.4667798578739166, + "logps/chosen": -1.7320483922958374, + "logps/rejected": -1.7750571966171265, + "loss": 2.7378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.320484161376953, + "rewards/margins": 0.4300875663757324, + "rewards/rejected": -17.75057029724121, + "step": 6430 + }, + { + "epoch": 0.21689305335535408, + "grad_norm": 33.71809005737305, + "learning_rate": 9.58953442420828e-07, + "logits/chosen": -0.6123358607292175, + "logits/rejected": -0.5208547711372375, + "logps/chosen": -2.1607580184936523, + "logps/rejected": -2.6098697185516357, + "loss": 3.0716, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.60757827758789, + "rewards/margins": 4.491117477416992, + "rewards/rejected": -26.098697662353516, + "step": 6435 + }, + { + "epoch": 0.21706157942633725, + "grad_norm": 29.654769897460938, + "learning_rate": 9.588366516313001e-07, + "logits/chosen": -0.4644528329372406, + "logits/rejected": -0.5957885980606079, + "logps/chosen": -1.9361892938613892, + "logps/rejected": -2.192762613296509, + "loss": 2.9682, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.361894607543945, + "rewards/margins": 2.5657315254211426, + "rewards/rejected": -21.92762565612793, + "step": 6440 + }, + { + "epoch": 0.21723010549732044, + "grad_norm": 33.55850601196289, + "learning_rate": 9.587197020564847e-07, + "logits/chosen": -0.3282471299171448, + "logits/rejected": -0.34332892298698425, + "logps/chosen": -1.893535852432251, + "logps/rejected": -1.8809051513671875, + "loss": 3.262, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.93535804748535, + "rewards/margins": -0.12630634009838104, + "rewards/rejected": -18.809051513671875, + "step": 6445 + }, + { + "epoch": 0.2173986315683036, + "grad_norm": 18.890377044677734, + "learning_rate": 9.586025937368532e-07, + "logits/chosen": -0.27933672070503235, + "logits/rejected": -0.37898606061935425, + "logps/chosen": -1.7535426616668701, + "logps/rejected": -1.7873185873031616, + "loss": 3.133, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.53542709350586, + "rewards/margins": 0.3377595841884613, + "rewards/rejected": -17.873186111450195, + "step": 6450 + }, + { + "epoch": 0.2175671576392868, + "grad_norm": 23.26448631286621, + "learning_rate": 9.584853267129323e-07, + "logits/chosen": -0.8444086313247681, + "logits/rejected": -0.8294426202774048, + "logps/chosen": -1.8335950374603271, + "logps/rejected": -1.8612003326416016, + "loss": 2.8761, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.335948944091797, + "rewards/margins": 0.2760535180568695, + "rewards/rejected": -18.612003326416016, + "step": 6455 + }, + { + "epoch": 0.21773568371026997, + "grad_norm": 34.03776168823242, + "learning_rate": 9.583679010253033e-07, + "logits/chosen": -0.5135722160339355, + "logits/rejected": -0.32302290201187134, + "logps/chosen": -2.0652201175689697, + "logps/rejected": -2.1298694610595703, + "loss": 2.619, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.652198791503906, + "rewards/margins": 0.6464970707893372, + "rewards/rejected": -21.298694610595703, + "step": 6460 + }, + { + "epoch": 0.21790420978125316, + "grad_norm": 32.695228576660156, + "learning_rate": 9.582503167146027e-07, + "logits/chosen": -0.5547333359718323, + "logits/rejected": -0.47261205315589905, + "logps/chosen": -1.9371652603149414, + "logps/rejected": -1.691992998123169, + "loss": 5.5003, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -19.371652603149414, + "rewards/margins": -2.451723098754883, + "rewards/rejected": -16.91992950439453, + "step": 6465 + }, + { + "epoch": 0.21807273585223635, + "grad_norm": 19.64496421813965, + "learning_rate": 9.58132573821522e-07, + "logits/chosen": -0.7506909370422363, + "logits/rejected": -0.6311949491500854, + "logps/chosen": -1.596892237663269, + "logps/rejected": -1.7580207586288452, + "loss": 2.1849, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -15.96892261505127, + "rewards/margins": 1.6112867593765259, + "rewards/rejected": -17.580209732055664, + "step": 6470 + }, + { + "epoch": 0.21824126192321952, + "grad_norm": 25.59090805053711, + "learning_rate": 9.580146723868072e-07, + "logits/chosen": -0.5404247045516968, + "logits/rejected": -0.5816585421562195, + "logps/chosen": -1.8674030303955078, + "logps/rejected": -1.8501733541488647, + "loss": 3.4942, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.67403221130371, + "rewards/margins": -0.1722976714372635, + "rewards/rejected": -18.501733779907227, + "step": 6475 + }, + { + "epoch": 0.21840978799420271, + "grad_norm": 20.649513244628906, + "learning_rate": 9.578966124512593e-07, + "logits/chosen": -0.2291760891675949, + "logits/rejected": -0.20900988578796387, + "logps/chosen": -2.7134552001953125, + "logps/rejected": -2.463433027267456, + "loss": 5.9579, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.134552001953125, + "rewards/margins": -2.500222682952881, + "rewards/rejected": -24.63433074951172, + "step": 6480 + }, + { + "epoch": 0.21857831406518588, + "grad_norm": 24.989622116088867, + "learning_rate": 9.577783940557343e-07, + "logits/chosen": -0.5035347938537598, + "logits/rejected": -0.5523974299430847, + "logps/chosen": -1.8505939245224, + "logps/rejected": -2.091245412826538, + "loss": 2.3623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.505939483642578, + "rewards/margins": 2.406515598297119, + "rewards/rejected": -20.91245460510254, + "step": 6485 + }, + { + "epoch": 0.21874684013616907, + "grad_norm": 25.5734920501709, + "learning_rate": 9.576600172411427e-07, + "logits/chosen": -0.12298359721899033, + "logits/rejected": -0.15590530633926392, + "logps/chosen": -2.0219905376434326, + "logps/rejected": -2.2334604263305664, + "loss": 2.2344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.219905853271484, + "rewards/margins": 2.1146976947784424, + "rewards/rejected": -22.334606170654297, + "step": 6490 + }, + { + "epoch": 0.21891536620715224, + "grad_norm": 17.484527587890625, + "learning_rate": 9.575414820484504e-07, + "logits/chosen": -0.9196332097053528, + "logits/rejected": -1.0346037149429321, + "logps/chosen": -1.782769799232483, + "logps/rejected": -1.728493094444275, + "loss": 3.6021, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.827695846557617, + "rewards/margins": -0.5427675247192383, + "rewards/rejected": -17.284927368164062, + "step": 6495 + }, + { + "epoch": 0.21908389227813543, + "grad_norm": 18.694116592407227, + "learning_rate": 9.574227885186775e-07, + "logits/chosen": -0.8279164433479309, + "logits/rejected": -0.8473072052001953, + "logps/chosen": -1.5809705257415771, + "logps/rejected": -1.5579888820648193, + "loss": 3.3854, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.80970573425293, + "rewards/margins": -0.22981634736061096, + "rewards/rejected": -15.579889297485352, + "step": 6500 + }, + { + "epoch": 0.2192524183491186, + "grad_norm": 20.692136764526367, + "learning_rate": 9.57303936692899e-07, + "logits/chosen": -0.5701172351837158, + "logits/rejected": -0.6228208541870117, + "logps/chosen": -1.9674314260482788, + "logps/rejected": -1.919097900390625, + "loss": 3.6805, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.674312591552734, + "rewards/margins": -0.48333635926246643, + "rewards/rejected": -19.190977096557617, + "step": 6505 + }, + { + "epoch": 0.2194209444201018, + "grad_norm": 26.013273239135742, + "learning_rate": 9.571849266122454e-07, + "logits/chosen": -0.7226217985153198, + "logits/rejected": -0.8986842036247253, + "logps/chosen": -1.7132012844085693, + "logps/rejected": -1.7211544513702393, + "loss": 3.2907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.13201332092285, + "rewards/margins": 0.07953214645385742, + "rewards/rejected": -17.211544036865234, + "step": 6510 + }, + { + "epoch": 0.21958947049108496, + "grad_norm": 58.09033203125, + "learning_rate": 9.57065758317901e-07, + "logits/chosen": -0.2748204171657562, + "logits/rejected": -0.2713713049888611, + "logps/chosen": -1.971434235572815, + "logps/rejected": -1.9171030521392822, + "loss": 3.6719, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.714340209960938, + "rewards/margins": -0.5433112978935242, + "rewards/rejected": -19.171030044555664, + "step": 6515 + }, + { + "epoch": 0.21975799656206815, + "grad_norm": 18.163524627685547, + "learning_rate": 9.569464318511051e-07, + "logits/chosen": -0.5663856863975525, + "logits/rejected": -0.741828203201294, + "logps/chosen": -1.770307183265686, + "logps/rejected": -1.7896112203598022, + "loss": 3.1223, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.70307159423828, + "rewards/margins": 0.1930416077375412, + "rewards/rejected": -17.8961124420166, + "step": 6520 + }, + { + "epoch": 0.21992652263305135, + "grad_norm": 25.333982467651367, + "learning_rate": 9.568269472531524e-07, + "logits/chosen": -0.7503092288970947, + "logits/rejected": -0.8162568211555481, + "logps/chosen": -1.926618218421936, + "logps/rejected": -2.043292760848999, + "loss": 3.1717, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.266183853149414, + "rewards/margins": 1.166744589805603, + "rewards/rejected": -20.432926177978516, + "step": 6525 + }, + { + "epoch": 0.2200950487040345, + "grad_norm": 20.86461067199707, + "learning_rate": 9.567073045653914e-07, + "logits/chosen": -0.6699740290641785, + "logits/rejected": -0.5963459610939026, + "logps/chosen": -1.864882469177246, + "logps/rejected": -2.050961494445801, + "loss": 2.8193, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.64882469177246, + "rewards/margins": 1.8607914447784424, + "rewards/rejected": -20.50961685180664, + "step": 6530 + }, + { + "epoch": 0.2202635747750177, + "grad_norm": 25.337141036987305, + "learning_rate": 9.565875038292257e-07, + "logits/chosen": -0.5631424784660339, + "logits/rejected": -0.6421005129814148, + "logps/chosen": -1.7564456462860107, + "logps/rejected": -1.802941083908081, + "loss": 2.7187, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.564455032348633, + "rewards/margins": 0.4649575352668762, + "rewards/rejected": -18.0294132232666, + "step": 6535 + }, + { + "epoch": 0.22043210084600087, + "grad_norm": 29.222047805786133, + "learning_rate": 9.56467545086114e-07, + "logits/chosen": -0.6572110652923584, + "logits/rejected": -0.6429082155227661, + "logps/chosen": -1.6367639303207397, + "logps/rejected": -1.780846357345581, + "loss": 1.9368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.367639541625977, + "rewards/margins": 1.440824031829834, + "rewards/rejected": -17.808462142944336, + "step": 6540 + }, + { + "epoch": 0.22060062691698407, + "grad_norm": 28.995386123657227, + "learning_rate": 9.56347428377569e-07, + "logits/chosen": -0.42840996384620667, + "logits/rejected": -0.5102885365486145, + "logps/chosen": -1.6626837253570557, + "logps/rejected": -1.6840680837631226, + "loss": 3.0719, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.626834869384766, + "rewards/margins": 0.2138429582118988, + "rewards/rejected": -16.840679168701172, + "step": 6545 + }, + { + "epoch": 0.22076915298796723, + "grad_norm": 35.35552978515625, + "learning_rate": 9.562271537451584e-07, + "logits/chosen": -1.028755784034729, + "logits/rejected": -0.9808349609375, + "logps/chosen": -1.923678994178772, + "logps/rejected": -1.8900012969970703, + "loss": 3.4336, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.23678970336914, + "rewards/margins": -0.33677586913108826, + "rewards/rejected": -18.900012969970703, + "step": 6550 + }, + { + "epoch": 0.22093767905895043, + "grad_norm": 26.18325424194336, + "learning_rate": 9.561067212305043e-07, + "logits/chosen": -0.445654958486557, + "logits/rejected": -0.5217684507369995, + "logps/chosen": -1.6793758869171143, + "logps/rejected": -1.6027400493621826, + "loss": 3.9224, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.793758392333984, + "rewards/margins": -0.7663576006889343, + "rewards/rejected": -16.027400970458984, + "step": 6555 + }, + { + "epoch": 0.2211062051299336, + "grad_norm": 23.301103591918945, + "learning_rate": 9.559861308752842e-07, + "logits/chosen": -0.7921528816223145, + "logits/rejected": -0.7277683019638062, + "logps/chosen": -1.7608333826065063, + "logps/rejected": -1.758272409439087, + "loss": 3.2054, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.608333587646484, + "rewards/margins": -0.025607967749238014, + "rewards/rejected": -17.58272361755371, + "step": 6560 + }, + { + "epoch": 0.22127473120091679, + "grad_norm": 24.503135681152344, + "learning_rate": 9.55865382721229e-07, + "logits/chosen": -0.4001065194606781, + "logits/rejected": -0.45027488470077515, + "logps/chosen": -1.9991649389266968, + "logps/rejected": -2.029761791229248, + "loss": 3.766, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.991649627685547, + "rewards/margins": 0.3059673309326172, + "rewards/rejected": -20.297616958618164, + "step": 6565 + }, + { + "epoch": 0.22144325727189995, + "grad_norm": 22.448015213012695, + "learning_rate": 9.557444768101254e-07, + "logits/chosen": -0.19503983855247498, + "logits/rejected": -0.15284790098667145, + "logps/chosen": -1.850487470626831, + "logps/rejected": -1.883581519126892, + "loss": 2.8386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.50487518310547, + "rewards/margins": 0.3309392035007477, + "rewards/rejected": -18.835817337036133, + "step": 6570 + }, + { + "epoch": 0.22161178334288315, + "grad_norm": 29.75132179260254, + "learning_rate": 9.556234131838141e-07, + "logits/chosen": -0.5109794735908508, + "logits/rejected": -0.4620528221130371, + "logps/chosen": -1.8948596715927124, + "logps/rejected": -1.8566793203353882, + "loss": 3.462, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.948598861694336, + "rewards/margins": -0.3818041682243347, + "rewards/rejected": -18.56679344177246, + "step": 6575 + }, + { + "epoch": 0.22178030941386634, + "grad_norm": 21.997732162475586, + "learning_rate": 9.555021918841902e-07, + "logits/chosen": -0.2893935441970825, + "logits/rejected": -0.3516172766685486, + "logps/chosen": -2.042057991027832, + "logps/rejected": -2.02874755859375, + "loss": 3.2381, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.420581817626953, + "rewards/margins": -0.1331055611371994, + "rewards/rejected": -20.287473678588867, + "step": 6580 + }, + { + "epoch": 0.2219488354848495, + "grad_norm": 42.73481750488281, + "learning_rate": 9.553808129532037e-07, + "logits/chosen": -0.6087731122970581, + "logits/rejected": -0.4572841227054596, + "logps/chosen": -1.7725818157196045, + "logps/rejected": -1.8231289386749268, + "loss": 2.8295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.725818634033203, + "rewards/margins": 0.5054678320884705, + "rewards/rejected": -18.231287002563477, + "step": 6585 + }, + { + "epoch": 0.2221173615558327, + "grad_norm": 25.13774299621582, + "learning_rate": 9.552592764328593e-07, + "logits/chosen": -0.6387229561805725, + "logits/rejected": -0.5615711808204651, + "logps/chosen": -1.724822759628296, + "logps/rejected": -1.750415563583374, + "loss": 2.9788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.248228073120117, + "rewards/margins": 0.255929172039032, + "rewards/rejected": -17.5041561126709, + "step": 6590 + }, + { + "epoch": 0.22228588762681586, + "grad_norm": 22.05751609802246, + "learning_rate": 9.551375823652158e-07, + "logits/chosen": -0.3565082848072052, + "logits/rejected": -0.34892910718917847, + "logps/chosen": -1.891668677330017, + "logps/rejected": -1.9782425165176392, + "loss": 2.4094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.91668701171875, + "rewards/margins": 0.8657382130622864, + "rewards/rejected": -19.782424926757812, + "step": 6595 + }, + { + "epoch": 0.22245441369779906, + "grad_norm": 13.150750160217285, + "learning_rate": 9.550157307923865e-07, + "logits/chosen": -0.4826990067958832, + "logits/rejected": -0.4953466057777405, + "logps/chosen": -1.9391247034072876, + "logps/rejected": -2.0218353271484375, + "loss": 2.6082, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.391246795654297, + "rewards/margins": 0.8271061778068542, + "rewards/rejected": -20.218353271484375, + "step": 6600 + }, + { + "epoch": 0.22262293976878222, + "grad_norm": 71.07939910888672, + "learning_rate": 9.5489372175654e-07, + "logits/chosen": -0.11356012523174286, + "logits/rejected": -0.12396962940692902, + "logps/chosen": -2.3725666999816895, + "logps/rejected": -2.422454833984375, + "loss": 2.8288, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.72566795349121, + "rewards/margins": 0.498879998922348, + "rewards/rejected": -24.22454833984375, + "step": 6605 + }, + { + "epoch": 0.22279146583976542, + "grad_norm": 29.017131805419922, + "learning_rate": 9.547715552998984e-07, + "logits/chosen": -0.6171203851699829, + "logits/rejected": -0.587684690952301, + "logps/chosen": -2.0196220874786377, + "logps/rejected": -1.9217265844345093, + "loss": 4.0861, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.19622230529785, + "rewards/margins": -0.9789560437202454, + "rewards/rejected": -19.217266082763672, + "step": 6610 + }, + { + "epoch": 0.22295999191074858, + "grad_norm": 25.13987159729004, + "learning_rate": 9.546492314647387e-07, + "logits/chosen": -0.8641460537910461, + "logits/rejected": -0.7396403551101685, + "logps/chosen": -1.8659296035766602, + "logps/rejected": -2.2930824756622314, + "loss": 1.7524, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.6592960357666, + "rewards/margins": 4.271526336669922, + "rewards/rejected": -22.930822372436523, + "step": 6615 + }, + { + "epoch": 0.22312851798173178, + "grad_norm": 15.366714477539062, + "learning_rate": 9.545267502933925e-07, + "logits/chosen": -0.7649020552635193, + "logits/rejected": -0.9529761075973511, + "logps/chosen": -1.4214767217636108, + "logps/rejected": -1.4711424112319946, + "loss": 2.9393, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.214767456054688, + "rewards/margins": 0.4966561198234558, + "rewards/rejected": -14.71142292022705, + "step": 6620 + }, + { + "epoch": 0.22329704405271494, + "grad_norm": 28.1103458404541, + "learning_rate": 9.544041118282457e-07, + "logits/chosen": -0.8209296464920044, + "logits/rejected": -0.8028782606124878, + "logps/chosen": -1.7018072605133057, + "logps/rejected": -1.8131166696548462, + "loss": 2.1291, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.0180721282959, + "rewards/margins": 1.1130938529968262, + "rewards/rejected": -18.131168365478516, + "step": 6625 + }, + { + "epoch": 0.22346557012369814, + "grad_norm": 121.37602233886719, + "learning_rate": 9.542813161117384e-07, + "logits/chosen": -0.11982444673776627, + "logits/rejected": -0.2019883692264557, + "logps/chosen": -2.4919440746307373, + "logps/rejected": -2.4208712577819824, + "loss": 3.8158, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -24.9194393157959, + "rewards/margins": -0.7107278108596802, + "rewards/rejected": -24.20871353149414, + "step": 6630 + }, + { + "epoch": 0.22363409619468133, + "grad_norm": 15.064230918884277, + "learning_rate": 9.541583631863658e-07, + "logits/chosen": -0.36297905445098877, + "logits/rejected": -0.26934993267059326, + "logps/chosen": -1.9586620330810547, + "logps/rejected": -2.617410659790039, + "loss": 1.8796, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.586620330810547, + "rewards/margins": 6.587483882904053, + "rewards/rejected": -26.174102783203125, + "step": 6635 + }, + { + "epoch": 0.2238026222656645, + "grad_norm": 26.91686248779297, + "learning_rate": 9.540352530946769e-07, + "logits/chosen": -0.43593844771385193, + "logits/rejected": -0.3484894633293152, + "logps/chosen": -2.200504779815674, + "logps/rejected": -2.132821559906006, + "loss": 3.8118, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.005050659179688, + "rewards/margins": -0.6768323183059692, + "rewards/rejected": -21.328218460083008, + "step": 6640 + }, + { + "epoch": 0.2239711483366477, + "grad_norm": 16.408367156982422, + "learning_rate": 9.53911985879275e-07, + "logits/chosen": -0.34586840867996216, + "logits/rejected": -0.5225354433059692, + "logps/chosen": -1.722537636756897, + "logps/rejected": -1.7929697036743164, + "loss": 3.1201, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.22537612915039, + "rewards/margins": 0.7043231129646301, + "rewards/rejected": -17.929698944091797, + "step": 6645 + }, + { + "epoch": 0.22413967440763086, + "grad_norm": 22.123409271240234, + "learning_rate": 9.537885615828184e-07, + "logits/chosen": -0.6946117281913757, + "logits/rejected": -0.698863685131073, + "logps/chosen": -1.8962112665176392, + "logps/rejected": -1.6883169412612915, + "loss": 5.1022, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -18.962108612060547, + "rewards/margins": -2.0789437294006348, + "rewards/rejected": -16.883167266845703, + "step": 6650 + }, + { + "epoch": 0.22430820047861405, + "grad_norm": 21.01542854309082, + "learning_rate": 9.536649802480189e-07, + "logits/chosen": -0.6982828974723816, + "logits/rejected": -0.6379950046539307, + "logps/chosen": -1.8179126977920532, + "logps/rejected": -1.7372678518295288, + "loss": 3.8856, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.179126739501953, + "rewards/margins": -0.8064476251602173, + "rewards/rejected": -17.3726806640625, + "step": 6655 + }, + { + "epoch": 0.22447672654959722, + "grad_norm": 14.440115928649902, + "learning_rate": 9.535412419176436e-07, + "logits/chosen": -0.5513002276420593, + "logits/rejected": -0.43792086839675903, + "logps/chosen": -2.1794581413269043, + "logps/rejected": -2.4261536598205566, + "loss": 2.6097, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.79458236694336, + "rewards/margins": 2.4669549465179443, + "rewards/rejected": -24.261539459228516, + "step": 6660 + }, + { + "epoch": 0.2246452526205804, + "grad_norm": 26.166810989379883, + "learning_rate": 9.534173466345132e-07, + "logits/chosen": -0.4692150950431824, + "logits/rejected": -0.33260539174079895, + "logps/chosen": -1.8202335834503174, + "logps/rejected": -1.7992265224456787, + "loss": 4.2495, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.202335357666016, + "rewards/margins": -0.21007022261619568, + "rewards/rejected": -17.992265701293945, + "step": 6665 + }, + { + "epoch": 0.22481377869156358, + "grad_norm": 65.70748901367188, + "learning_rate": 9.532932944415031e-07, + "logits/chosen": -0.6528714299201965, + "logits/rejected": -0.49211350083351135, + "logps/chosen": -2.224884033203125, + "logps/rejected": -2.3405163288116455, + "loss": 3.6286, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.248838424682617, + "rewards/margins": 1.1563222408294678, + "rewards/rejected": -23.405162811279297, + "step": 6670 + }, + { + "epoch": 0.22498230476254677, + "grad_norm": 25.11475944519043, + "learning_rate": 9.531690853815428e-07, + "logits/chosen": -0.7747845649719238, + "logits/rejected": -0.8052785992622375, + "logps/chosen": -1.8111671209335327, + "logps/rejected": -1.9030259847640991, + "loss": 2.2895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.111669540405273, + "rewards/margins": 0.9185881614685059, + "rewards/rejected": -19.03026008605957, + "step": 6675 + }, + { + "epoch": 0.22515083083352994, + "grad_norm": 25.83588409423828, + "learning_rate": 9.530447194976163e-07, + "logits/chosen": -0.6666157841682434, + "logits/rejected": -0.648045539855957, + "logps/chosen": -1.7558119297027588, + "logps/rejected": -1.7752193212509155, + "loss": 3.098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.558116912841797, + "rewards/margins": 0.1940757781267166, + "rewards/rejected": -17.752193450927734, + "step": 6680 + }, + { + "epoch": 0.22531935690451313, + "grad_norm": 16.22815704345703, + "learning_rate": 9.529201968327616e-07, + "logits/chosen": -0.42409926652908325, + "logits/rejected": -0.47118502855300903, + "logps/chosen": -2.16229248046875, + "logps/rejected": -2.263599395751953, + "loss": 2.5547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.6229248046875, + "rewards/margins": 1.0130705833435059, + "rewards/rejected": -22.635995864868164, + "step": 6685 + }, + { + "epoch": 0.2254878829754963, + "grad_norm": 211.99465942382812, + "learning_rate": 9.527955174300711e-07, + "logits/chosen": -0.7075196504592896, + "logits/rejected": -0.8274857401847839, + "logps/chosen": -2.0796940326690674, + "logps/rejected": -2.0456149578094482, + "loss": 3.6199, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.796939849853516, + "rewards/margins": -0.34078893065452576, + "rewards/rejected": -20.45615005493164, + "step": 6690 + }, + { + "epoch": 0.2256564090464795, + "grad_norm": 26.797080993652344, + "learning_rate": 9.526706813326914e-07, + "logits/chosen": -0.2986915409564972, + "logits/rejected": -0.4110233187675476, + "logps/chosen": -2.0323266983032227, + "logps/rejected": -2.0355358123779297, + "loss": 3.17, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.323266983032227, + "rewards/margins": 0.03209175914525986, + "rewards/rejected": -20.355358123779297, + "step": 6695 + }, + { + "epoch": 0.22582493511746268, + "grad_norm": 37.4174690246582, + "learning_rate": 9.525456885838234e-07, + "logits/chosen": -0.497040331363678, + "logits/rejected": -0.5556106567382812, + "logps/chosen": -2.1126370429992676, + "logps/rejected": -1.7659200429916382, + "loss": 6.574, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.12636947631836, + "rewards/margins": -3.4671692848205566, + "rewards/rejected": -17.65920066833496, + "step": 6700 + }, + { + "epoch": 0.22599346118844585, + "grad_norm": 31.655685424804688, + "learning_rate": 9.524205392267223e-07, + "logits/chosen": -0.6221505999565125, + "logits/rejected": -0.524019181728363, + "logps/chosen": -1.5004098415374756, + "logps/rejected": -1.6298938989639282, + "loss": 2.1264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.004098892211914, + "rewards/margins": 1.2948402166366577, + "rewards/rejected": -16.298938751220703, + "step": 6705 + }, + { + "epoch": 0.22616198725942904, + "grad_norm": 25.590959548950195, + "learning_rate": 9.522952333046972e-07, + "logits/chosen": -0.6376134157180786, + "logits/rejected": -0.6879085302352905, + "logps/chosen": -1.544236660003662, + "logps/rejected": -1.590477705001831, + "loss": 2.8925, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.442367553710938, + "rewards/margins": 0.462411105632782, + "rewards/rejected": -15.904777526855469, + "step": 6710 + }, + { + "epoch": 0.2263305133304122, + "grad_norm": 32.62403106689453, + "learning_rate": 9.521697708611114e-07, + "logits/chosen": -0.38372719287872314, + "logits/rejected": -0.29491403698921204, + "logps/chosen": -2.097902774810791, + "logps/rejected": -1.9758249521255493, + "loss": 4.3138, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.979028701782227, + "rewards/margins": -1.2207807302474976, + "rewards/rejected": -19.75824737548828, + "step": 6715 + }, + { + "epoch": 0.2264990394013954, + "grad_norm": 14.913753509521484, + "learning_rate": 9.52044151939383e-07, + "logits/chosen": -0.3649575710296631, + "logits/rejected": -0.3394049108028412, + "logps/chosen": -2.1916496753692627, + "logps/rejected": -2.3644070625305176, + "loss": 1.9326, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.9164981842041, + "rewards/margins": 1.7275749444961548, + "rewards/rejected": -23.64406967163086, + "step": 6720 + }, + { + "epoch": 0.22666756547237857, + "grad_norm": 29.087324142456055, + "learning_rate": 9.519183765829831e-07, + "logits/chosen": -0.71019047498703, + "logits/rejected": -0.7044280171394348, + "logps/chosen": -1.7740542888641357, + "logps/rejected": -1.9014533758163452, + "loss": 2.9555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.74054527282715, + "rewards/margins": 1.2739917039871216, + "rewards/rejected": -19.014535903930664, + "step": 6725 + }, + { + "epoch": 0.22683609154336176, + "grad_norm": 30.07399559020996, + "learning_rate": 9.517924448354381e-07, + "logits/chosen": -0.474402517080307, + "logits/rejected": -0.5710434913635254, + "logps/chosen": -1.7302747964859009, + "logps/rejected": -1.8411529064178467, + "loss": 2.2038, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.302745819091797, + "rewards/margins": 1.1087833642959595, + "rewards/rejected": -18.411531448364258, + "step": 6730 + }, + { + "epoch": 0.22700461761434493, + "grad_norm": 27.725431442260742, + "learning_rate": 9.516663567403278e-07, + "logits/chosen": -0.6341021060943604, + "logits/rejected": -0.7254477739334106, + "logps/chosen": -1.5502922534942627, + "logps/rejected": -1.6045587062835693, + "loss": 2.736, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.502920150756836, + "rewards/margins": 0.5426663160324097, + "rewards/rejected": -16.045589447021484, + "step": 6735 + }, + { + "epoch": 0.22717314368532812, + "grad_norm": 28.309858322143555, + "learning_rate": 9.515401123412865e-07, + "logits/chosen": -0.5554046630859375, + "logits/rejected": -0.5442392826080322, + "logps/chosen": -1.5706939697265625, + "logps/rejected": -1.4859893321990967, + "loss": 3.9188, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.706939697265625, + "rewards/margins": -0.8470472097396851, + "rewards/rejected": -14.859891891479492, + "step": 6740 + }, + { + "epoch": 0.2273416697563113, + "grad_norm": 23.21711540222168, + "learning_rate": 9.514137116820022e-07, + "logits/chosen": -0.4454229772090912, + "logits/rejected": -0.4550401568412781, + "logps/chosen": -1.8525142669677734, + "logps/rejected": -1.8814477920532227, + "loss": 2.8648, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.525142669677734, + "rewards/margins": 0.2893357276916504, + "rewards/rejected": -18.81447982788086, + "step": 6745 + }, + { + "epoch": 0.22751019582729448, + "grad_norm": 41.267127990722656, + "learning_rate": 9.512871548062173e-07, + "logits/chosen": -0.43118929862976074, + "logits/rejected": -0.45497363805770874, + "logps/chosen": -1.9234278202056885, + "logps/rejected": -2.0509819984436035, + "loss": 2.7819, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.234275817871094, + "rewards/margins": 1.2755451202392578, + "rewards/rejected": -20.509822845458984, + "step": 6750 + }, + { + "epoch": 0.22767872189827768, + "grad_norm": 42.087406158447266, + "learning_rate": 9.51160441757728e-07, + "logits/chosen": -0.42311835289001465, + "logits/rejected": -0.4611433148384094, + "logps/chosen": -1.8028514385223389, + "logps/rejected": -1.7913545370101929, + "loss": 3.3435, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.028514862060547, + "rewards/margins": -0.11496935039758682, + "rewards/rejected": -17.91354751586914, + "step": 6755 + }, + { + "epoch": 0.22784724796926084, + "grad_norm": 17.753387451171875, + "learning_rate": 9.51033572580385e-07, + "logits/chosen": -0.8366772532463074, + "logits/rejected": -0.8449716567993164, + "logps/chosen": -1.6344677209854126, + "logps/rejected": -1.6162865161895752, + "loss": 3.3434, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.344676971435547, + "rewards/margins": -0.1818103790283203, + "rewards/rejected": -16.162866592407227, + "step": 6760 + }, + { + "epoch": 0.22801577404024403, + "grad_norm": 15.194600105285645, + "learning_rate": 9.509065473180924e-07, + "logits/chosen": -0.6022413372993469, + "logits/rejected": -0.6520088911056519, + "logps/chosen": -1.9417146444320679, + "logps/rejected": -2.4450767040252686, + "loss": 2.5999, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.417146682739258, + "rewards/margins": 5.033621311187744, + "rewards/rejected": -24.450769424438477, + "step": 6765 + }, + { + "epoch": 0.2281843001112272, + "grad_norm": 16.05148696899414, + "learning_rate": 9.507793660148089e-07, + "logits/chosen": -0.5362441539764404, + "logits/rejected": -0.5728567242622375, + "logps/chosen": -2.0258445739746094, + "logps/rejected": -2.310606002807617, + "loss": 1.6216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.25844383239746, + "rewards/margins": 2.8476147651672363, + "rewards/rejected": -23.106060028076172, + "step": 6770 + }, + { + "epoch": 0.2283528261822104, + "grad_norm": 42.07566452026367, + "learning_rate": 9.506520287145467e-07, + "logits/chosen": -0.5904275178909302, + "logits/rejected": -0.6621605157852173, + "logps/chosen": -2.118563175201416, + "logps/rejected": -2.208606004714966, + "loss": 3.1772, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.18562889099121, + "rewards/margins": 0.9004287719726562, + "rewards/rejected": -22.086057662963867, + "step": 6775 + }, + { + "epoch": 0.22852135225319356, + "grad_norm": 14.864330291748047, + "learning_rate": 9.505245354613725e-07, + "logits/chosen": -0.4866722524166107, + "logits/rejected": -0.5212317705154419, + "logps/chosen": -1.8210203647613525, + "logps/rejected": -2.2158994674682617, + "loss": 1.2505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.210201263427734, + "rewards/margins": 3.948791980743408, + "rewards/rejected": -22.158994674682617, + "step": 6780 + }, + { + "epoch": 0.22868987832417675, + "grad_norm": 23.20281410217285, + "learning_rate": 9.503968862994065e-07, + "logits/chosen": -0.5525738000869751, + "logits/rejected": -0.7386514544487, + "logps/chosen": -2.000377655029297, + "logps/rejected": -2.2927498817443848, + "loss": 1.8016, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.0037784576416, + "rewards/margins": 2.923720598220825, + "rewards/rejected": -22.9274959564209, + "step": 6785 + }, + { + "epoch": 0.22885840439515992, + "grad_norm": 11.40327262878418, + "learning_rate": 9.502690812728229e-07, + "logits/chosen": -0.6232748627662659, + "logits/rejected": -0.5543674230575562, + "logps/chosen": -1.6506311893463135, + "logps/rejected": -1.6903736591339111, + "loss": 3.0261, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.506309509277344, + "rewards/margins": 0.3974243104457855, + "rewards/rejected": -16.903736114501953, + "step": 6790 + }, + { + "epoch": 0.2290269304661431, + "grad_norm": 22.759078979492188, + "learning_rate": 9.501411204258504e-07, + "logits/chosen": -0.48999086022377014, + "logits/rejected": -0.45051321387290955, + "logps/chosen": -1.6978733539581299, + "logps/rejected": -1.5963754653930664, + "loss": 4.0642, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -16.97873306274414, + "rewards/margins": -1.0149786472320557, + "rewards/rejected": -15.96375560760498, + "step": 6795 + }, + { + "epoch": 0.22919545653712628, + "grad_norm": 29.907974243164062, + "learning_rate": 9.500130038027709e-07, + "logits/chosen": -0.6095082759857178, + "logits/rejected": -0.4460016191005707, + "logps/chosen": -2.117158889770508, + "logps/rejected": -2.062587261199951, + "loss": 3.7483, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.171588897705078, + "rewards/margins": -0.545714259147644, + "rewards/rejected": -20.625873565673828, + "step": 6800 + }, + { + "epoch": 0.22919545653712628, + "eval_logits/chosen": -0.8406579494476318, + "eval_logits/rejected": -0.8750758171081543, + "eval_logps/chosen": -1.7596479654312134, + "eval_logps/rejected": -1.7937562465667725, + "eval_loss": 3.213974714279175, + "eval_rewards/accuracies": 0.5699999928474426, + "eval_rewards/chosen": -17.596479415893555, + "eval_rewards/margins": 0.3410845100879669, + "eval_rewards/rejected": -17.937564849853516, + "eval_runtime": 12.9165, + "eval_samples_per_second": 7.742, + "eval_steps_per_second": 1.936, + "step": 6800 + }, + { + "epoch": 0.22936398260810947, + "grad_norm": 21.820558547973633, + "learning_rate": 9.498847314479205e-07, + "logits/chosen": -0.6138108968734741, + "logits/rejected": -0.6479529738426208, + "logps/chosen": -2.047436475753784, + "logps/rejected": -1.9390960931777954, + "loss": 4.1537, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.474365234375, + "rewards/margins": -1.0834046602249146, + "rewards/rejected": -19.390958786010742, + "step": 6805 + }, + { + "epoch": 0.22953250867909267, + "grad_norm": 12.025269508361816, + "learning_rate": 9.497563034056894e-07, + "logits/chosen": -0.5252547264099121, + "logits/rejected": -0.572632908821106, + "logps/chosen": -2.4980757236480713, + "logps/rejected": -2.691744565963745, + "loss": 2.5268, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.980758666992188, + "rewards/margins": 1.9366881847381592, + "rewards/rejected": -26.917444229125977, + "step": 6810 + }, + { + "epoch": 0.22970103475007583, + "grad_norm": 23.271806716918945, + "learning_rate": 9.496277197205213e-07, + "logits/chosen": -0.867204487323761, + "logits/rejected": -0.9149467349052429, + "logps/chosen": -1.4222182035446167, + "logps/rejected": -1.5333744287490845, + "loss": 2.2384, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.222180366516113, + "rewards/margins": 1.1115612983703613, + "rewards/rejected": -15.33374309539795, + "step": 6815 + }, + { + "epoch": 0.22986956082105903, + "grad_norm": 42.697227478027344, + "learning_rate": 9.49498980436914e-07, + "logits/chosen": -0.2986542582511902, + "logits/rejected": -0.32819774746894836, + "logps/chosen": -1.8275251388549805, + "logps/rejected": -1.9412891864776611, + "loss": 2.1117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.275253295898438, + "rewards/margins": 1.137637734413147, + "rewards/rejected": -19.412891387939453, + "step": 6820 + }, + { + "epoch": 0.2300380868920422, + "grad_norm": 17.321014404296875, + "learning_rate": 9.493700855994194e-07, + "logits/chosen": -0.7194541692733765, + "logits/rejected": -0.7003971338272095, + "logps/chosen": -1.5345981121063232, + "logps/rejected": -1.6274287700653076, + "loss": 2.4023, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.345980644226074, + "rewards/margins": 0.9283071756362915, + "rewards/rejected": -16.274288177490234, + "step": 6825 + }, + { + "epoch": 0.2302066129630254, + "grad_norm": 15.599539756774902, + "learning_rate": 9.492410352526423e-07, + "logits/chosen": -0.49592137336730957, + "logits/rejected": -0.650071918964386, + "logps/chosen": -2.09785795211792, + "logps/rejected": -1.9914875030517578, + "loss": 4.4761, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.978580474853516, + "rewards/margins": -1.063704252243042, + "rewards/rejected": -19.914875030517578, + "step": 6830 + }, + { + "epoch": 0.23037513903400855, + "grad_norm": 22.581743240356445, + "learning_rate": 9.491118294412423e-07, + "logits/chosen": -0.7487185597419739, + "logits/rejected": -0.6236433982849121, + "logps/chosen": -1.9754031896591187, + "logps/rejected": -2.0959200859069824, + "loss": 3.6634, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.754032135009766, + "rewards/margins": 1.2051713466644287, + "rewards/rejected": -20.959203720092773, + "step": 6835 + }, + { + "epoch": 0.23054366510499175, + "grad_norm": 22.873687744140625, + "learning_rate": 9.489824682099327e-07, + "logits/chosen": -0.5404466390609741, + "logits/rejected": -0.8129202127456665, + "logps/chosen": -2.1067965030670166, + "logps/rejected": -2.1464054584503174, + "loss": 3.3768, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.067964553833008, + "rewards/margins": 0.3960878252983093, + "rewards/rejected": -21.464054107666016, + "step": 6840 + }, + { + "epoch": 0.2307121911759749, + "grad_norm": 23.130495071411133, + "learning_rate": 9.488529516034799e-07, + "logits/chosen": -0.5961964130401611, + "logits/rejected": -0.6887432336807251, + "logps/chosen": -1.951284408569336, + "logps/rejected": -2.1935133934020996, + "loss": 2.2539, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.51284408569336, + "rewards/margins": 2.4222893714904785, + "rewards/rejected": -21.935134887695312, + "step": 6845 + }, + { + "epoch": 0.2308807172469581, + "grad_norm": 25.29384422302246, + "learning_rate": 9.487232796667046e-07, + "logits/chosen": -0.7350292205810547, + "logits/rejected": -0.6262162327766418, + "logps/chosen": -2.2591564655303955, + "logps/rejected": -2.370372772216797, + "loss": 2.8626, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.591564178466797, + "rewards/margins": 1.112162709236145, + "rewards/rejected": -23.703725814819336, + "step": 6850 + }, + { + "epoch": 0.23104924331794127, + "grad_norm": 54.073036193847656, + "learning_rate": 9.485934524444814e-07, + "logits/chosen": -0.3840603828430176, + "logits/rejected": -0.47770124673843384, + "logps/chosen": -2.075129270553589, + "logps/rejected": -2.033393383026123, + "loss": 3.5247, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.751293182373047, + "rewards/margins": -0.41735896468162537, + "rewards/rejected": -20.333934783935547, + "step": 6855 + }, + { + "epoch": 0.23121776938892447, + "grad_norm": 15.75258731842041, + "learning_rate": 9.48463469981738e-07, + "logits/chosen": -0.636978268623352, + "logits/rejected": -0.5728309750556946, + "logps/chosen": -1.620154619216919, + "logps/rejected": -1.6520893573760986, + "loss": 3.1537, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.201547622680664, + "rewards/margins": 0.31934672594070435, + "rewards/rejected": -16.520893096923828, + "step": 6860 + }, + { + "epoch": 0.23138629545990766, + "grad_norm": 20.73732566833496, + "learning_rate": 9.483333323234564e-07, + "logits/chosen": -0.29280218482017517, + "logits/rejected": -0.3819066882133484, + "logps/chosen": -2.0080299377441406, + "logps/rejected": -2.111577272415161, + "loss": 2.4741, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.08030128479004, + "rewards/margins": 1.0354706048965454, + "rewards/rejected": -21.11577033996582, + "step": 6865 + }, + { + "epoch": 0.23155482153089083, + "grad_norm": 39.25248336791992, + "learning_rate": 9.482030395146721e-07, + "logits/chosen": -0.1386869251728058, + "logits/rejected": -0.34750866889953613, + "logps/chosen": -1.9895210266113281, + "logps/rejected": -1.9648908376693726, + "loss": 3.6539, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.89521026611328, + "rewards/margins": -0.24630098044872284, + "rewards/rejected": -19.648908615112305, + "step": 6870 + }, + { + "epoch": 0.23172334760187402, + "grad_norm": 39.787757873535156, + "learning_rate": 9.480725916004744e-07, + "logits/chosen": -0.5389952659606934, + "logits/rejected": -0.626449465751648, + "logps/chosen": -1.7958948612213135, + "logps/rejected": -1.9659671783447266, + "loss": 2.1301, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.958948135375977, + "rewards/margins": 1.7007251977920532, + "rewards/rejected": -19.659671783447266, + "step": 6875 + }, + { + "epoch": 0.23189187367285718, + "grad_norm": 27.43910789489746, + "learning_rate": 9.479419886260062e-07, + "logits/chosen": -0.7904404997825623, + "logits/rejected": -0.6668750047683716, + "logps/chosen": -1.8657734394073486, + "logps/rejected": -2.2539117336273193, + "loss": 1.8542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.65773582458496, + "rewards/margins": 3.8813834190368652, + "rewards/rejected": -22.53911781311035, + "step": 6880 + }, + { + "epoch": 0.23206039974384038, + "grad_norm": 27.898033142089844, + "learning_rate": 9.478112306364639e-07, + "logits/chosen": -0.7889149785041809, + "logits/rejected": -0.6062323451042175, + "logps/chosen": -2.0559146404266357, + "logps/rejected": -2.083693265914917, + "loss": 4.3504, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.559146881103516, + "rewards/margins": 0.2777865529060364, + "rewards/rejected": -20.836933135986328, + "step": 6885 + }, + { + "epoch": 0.23222892581482354, + "grad_norm": 28.48827362060547, + "learning_rate": 9.476803176770975e-07, + "logits/chosen": -0.7070311307907104, + "logits/rejected": -0.6959258913993835, + "logps/chosen": -1.712421178817749, + "logps/rejected": -1.7417049407958984, + "loss": 2.8571, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.12421226501465, + "rewards/margins": 0.2928358018398285, + "rewards/rejected": -17.417049407958984, + "step": 6890 + }, + { + "epoch": 0.23239745188580674, + "grad_norm": 43.162017822265625, + "learning_rate": 9.475492497932113e-07, + "logits/chosen": -0.1779576987028122, + "logits/rejected": -0.1989670693874359, + "logps/chosen": -2.1419949531555176, + "logps/rejected": -2.124372959136963, + "loss": 4.1412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.41994857788086, + "rewards/margins": -0.1762198507785797, + "rewards/rejected": -21.243728637695312, + "step": 6895 + }, + { + "epoch": 0.2325659779567899, + "grad_norm": 31.642684936523438, + "learning_rate": 9.474180270301624e-07, + "logits/chosen": -0.42405325174331665, + "logits/rejected": -0.5832849740982056, + "logps/chosen": -1.8670570850372314, + "logps/rejected": -2.023833990097046, + "loss": 1.9224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.67057228088379, + "rewards/margins": 1.56776762008667, + "rewards/rejected": -20.238338470458984, + "step": 6900 + }, + { + "epoch": 0.2327345040277731, + "grad_norm": 30.18916130065918, + "learning_rate": 9.472866494333618e-07, + "logits/chosen": -0.18663057684898376, + "logits/rejected": -0.2984737455844879, + "logps/chosen": -2.2856342792510986, + "logps/rejected": -2.303093910217285, + "loss": 3.2669, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.856340408325195, + "rewards/margins": 0.17459754645824432, + "rewards/rejected": -23.03093719482422, + "step": 6905 + }, + { + "epoch": 0.23290303009875626, + "grad_norm": 19.05219078063965, + "learning_rate": 9.471551170482744e-07, + "logits/chosen": -0.4543988108634949, + "logits/rejected": -0.5632656812667847, + "logps/chosen": -1.767507791519165, + "logps/rejected": -2.026975393295288, + "loss": 2.1611, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.67507553100586, + "rewards/margins": 2.5946767330169678, + "rewards/rejected": -20.26975440979004, + "step": 6910 + }, + { + "epoch": 0.23307155616973946, + "grad_norm": 22.81329345703125, + "learning_rate": 9.47023429920418e-07, + "logits/chosen": -0.5807046890258789, + "logits/rejected": -0.6248041987419128, + "logps/chosen": -1.697127103805542, + "logps/rejected": -1.7438815832138062, + "loss": 2.669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.97127342224121, + "rewards/margins": 0.46754246950149536, + "rewards/rejected": -17.43881607055664, + "step": 6915 + }, + { + "epoch": 0.23324008224072265, + "grad_norm": 30.798070907592773, + "learning_rate": 9.468915880953648e-07, + "logits/chosen": -0.5077857971191406, + "logits/rejected": -0.5084593296051025, + "logps/chosen": -1.7491766214370728, + "logps/rejected": -1.8958772420883179, + "loss": 1.9738, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.49176597595215, + "rewards/margins": 1.4670063257217407, + "rewards/rejected": -18.958772659301758, + "step": 6920 + }, + { + "epoch": 0.23340860831170582, + "grad_norm": 24.726884841918945, + "learning_rate": 9.467595916187396e-07, + "logits/chosen": -0.27916693687438965, + "logits/rejected": -0.2978662848472595, + "logps/chosen": -2.0224666595458984, + "logps/rejected": -2.0680315494537354, + "loss": 2.9168, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.224666595458984, + "rewards/margins": 0.4556505084037781, + "rewards/rejected": -20.68031883239746, + "step": 6925 + }, + { + "epoch": 0.233577134382689, + "grad_norm": 18.920076370239258, + "learning_rate": 9.466274405362214e-07, + "logits/chosen": -0.8002731204032898, + "logits/rejected": -0.8154155015945435, + "logps/chosen": -1.664515733718872, + "logps/rejected": -1.9041354656219482, + "loss": 2.3318, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.645156860351562, + "rewards/margins": 2.396198272705078, + "rewards/rejected": -19.04135513305664, + "step": 6930 + }, + { + "epoch": 0.23374566045367218, + "grad_norm": 25.981199264526367, + "learning_rate": 9.464951348935424e-07, + "logits/chosen": -0.7253482937812805, + "logits/rejected": -0.8367172479629517, + "logps/chosen": -1.6983642578125, + "logps/rejected": -1.812461495399475, + "loss": 2.4775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.983642578125, + "rewards/margins": 1.1409722566604614, + "rewards/rejected": -18.124614715576172, + "step": 6935 + }, + { + "epoch": 0.23391418652465537, + "grad_norm": 28.854867935180664, + "learning_rate": 9.463626747364886e-07, + "logits/chosen": -0.6369872689247131, + "logits/rejected": -0.6354336142539978, + "logps/chosen": -1.8397117853164673, + "logps/rejected": -2.0742554664611816, + "loss": 1.956, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.397119522094727, + "rewards/margins": 2.345435619354248, + "rewards/rejected": -20.7425537109375, + "step": 6940 + }, + { + "epoch": 0.23408271259563854, + "grad_norm": 16.387676239013672, + "learning_rate": 9.462300601108988e-07, + "logits/chosen": -0.6813743114471436, + "logits/rejected": -0.682715117931366, + "logps/chosen": -1.463041067123413, + "logps/rejected": -1.5665823221206665, + "loss": 2.3658, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.630411148071289, + "rewards/margins": 1.0354135036468506, + "rewards/rejected": -15.665822982788086, + "step": 6945 + }, + { + "epoch": 0.23425123866662173, + "grad_norm": 20.10076141357422, + "learning_rate": 9.460972910626661e-07, + "logits/chosen": -0.474844366312027, + "logits/rejected": -0.5014239549636841, + "logps/chosen": -1.6983531713485718, + "logps/rejected": -2.114022731781006, + "loss": 2.2609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.983531951904297, + "rewards/margins": 4.15669584274292, + "rewards/rejected": -21.140228271484375, + "step": 6950 + }, + { + "epoch": 0.2344197647376049, + "grad_norm": 53.66566467285156, + "learning_rate": 9.459643676377364e-07, + "logits/chosen": 0.0645643025636673, + "logits/rejected": -0.021223559975624084, + "logps/chosen": -2.0207462310791016, + "logps/rejected": -1.9168494939804077, + "loss": 4.0967, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.207462310791016, + "rewards/margins": -1.0389666557312012, + "rewards/rejected": -19.168495178222656, + "step": 6955 + }, + { + "epoch": 0.2345882908085881, + "grad_norm": 15.13321304321289, + "learning_rate": 9.458312898821095e-07, + "logits/chosen": -0.8532525897026062, + "logits/rejected": -0.8107415437698364, + "logps/chosen": -1.4715559482574463, + "logps/rejected": -1.6997382640838623, + "loss": 1.9906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.715560913085938, + "rewards/margins": 2.281822681427002, + "rewards/rejected": -16.99738121032715, + "step": 6960 + }, + { + "epoch": 0.23475681687957126, + "grad_norm": 20.282747268676758, + "learning_rate": 9.456980578418384e-07, + "logits/chosen": -0.2857280969619751, + "logits/rejected": -0.3203180730342865, + "logps/chosen": -1.5712473392486572, + "logps/rejected": -1.6090400218963623, + "loss": 2.907, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.71247386932373, + "rewards/margins": 0.3779268264770508, + "rewards/rejected": -16.09040069580078, + "step": 6965 + }, + { + "epoch": 0.23492534295055445, + "grad_norm": 29.665863037109375, + "learning_rate": 9.455646715630289e-07, + "logits/chosen": -0.34003663063049316, + "logits/rejected": -0.46588826179504395, + "logps/chosen": -1.9911762475967407, + "logps/rejected": -1.999943494796753, + "loss": 3.2851, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.911760330200195, + "rewards/margins": 0.08767547458410263, + "rewards/rejected": -19.999435424804688, + "step": 6970 + }, + { + "epoch": 0.23509386902153764, + "grad_norm": 15.373470306396484, + "learning_rate": 9.454311310918413e-07, + "logits/chosen": -0.715580403804779, + "logits/rejected": -0.6359925270080566, + "logps/chosen": -1.8380359411239624, + "logps/rejected": -1.8280082941055298, + "loss": 3.3255, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.38035774230957, + "rewards/margins": -0.10027551651000977, + "rewards/rejected": -18.28008460998535, + "step": 6975 + }, + { + "epoch": 0.2352623950925208, + "grad_norm": 39.4725341796875, + "learning_rate": 9.452974364744884e-07, + "logits/chosen": -0.8144010305404663, + "logits/rejected": -0.9141764640808105, + "logps/chosen": -1.7286930084228516, + "logps/rejected": -1.7226985692977905, + "loss": 3.2424, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.286928176879883, + "rewards/margins": -0.059942055493593216, + "rewards/rejected": -17.226985931396484, + "step": 6980 + }, + { + "epoch": 0.235430921163504, + "grad_norm": 204.15028381347656, + "learning_rate": 9.451635877572368e-07, + "logits/chosen": -0.797519326210022, + "logits/rejected": -0.7118474841117859, + "logps/chosen": -2.037544012069702, + "logps/rejected": -2.163386344909668, + "loss": 2.3279, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.37544059753418, + "rewards/margins": 1.2584235668182373, + "rewards/rejected": -21.63386344909668, + "step": 6985 + }, + { + "epoch": 0.23559944723448717, + "grad_norm": 15.090885162353516, + "learning_rate": 9.450295849864063e-07, + "logits/chosen": -0.5462337136268616, + "logits/rejected": -0.5176796317100525, + "logps/chosen": -1.770716667175293, + "logps/rejected": -1.8121637105941772, + "loss": 2.8383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.70716667175293, + "rewards/margins": 0.4144694209098816, + "rewards/rejected": -18.12163734436035, + "step": 6990 + }, + { + "epoch": 0.23576797330547036, + "grad_norm": 21.92974090576172, + "learning_rate": 9.448954282083699e-07, + "logits/chosen": -0.7090522050857544, + "logits/rejected": -0.6703694462776184, + "logps/chosen": -1.772658109664917, + "logps/rejected": -1.7615941762924194, + "loss": 3.6834, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.726581573486328, + "rewards/margins": -0.11064042896032333, + "rewards/rejected": -17.615942001342773, + "step": 6995 + }, + { + "epoch": 0.23593649937645353, + "grad_norm": 29.28874397277832, + "learning_rate": 9.44761117469554e-07, + "logits/chosen": -0.44356757402420044, + "logits/rejected": -0.4245205819606781, + "logps/chosen": -1.6671860218048096, + "logps/rejected": -1.8381659984588623, + "loss": 2.5147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.671859741210938, + "rewards/margins": 1.7098020315170288, + "rewards/rejected": -18.381662368774414, + "step": 7000 + }, + { + "epoch": 0.23610502544743672, + "grad_norm": 22.89272117614746, + "learning_rate": 9.446266528164382e-07, + "logits/chosen": -0.7267704010009766, + "logits/rejected": -0.8729580640792847, + "logps/chosen": -1.7048250436782837, + "logps/rejected": -1.7571985721588135, + "loss": 2.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.048248291015625, + "rewards/margins": 0.5237363576889038, + "rewards/rejected": -17.571985244750977, + "step": 7005 + }, + { + "epoch": 0.2362735515184199, + "grad_norm": 12.386322975158691, + "learning_rate": 9.444920342955553e-07, + "logits/chosen": -0.6881308555603027, + "logits/rejected": -0.7025818824768066, + "logps/chosen": -2.3889026641845703, + "logps/rejected": -2.4722933769226074, + "loss": 3.3052, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.889026641845703, + "rewards/margins": 0.8339089155197144, + "rewards/rejected": -24.72293472290039, + "step": 7010 + }, + { + "epoch": 0.23644207758940308, + "grad_norm": 29.82985496520996, + "learning_rate": 9.443572619534917e-07, + "logits/chosen": -0.46417030692100525, + "logits/rejected": -0.5751253366470337, + "logps/chosen": -1.8959985971450806, + "logps/rejected": -2.1164956092834473, + "loss": 2.7809, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.959985733032227, + "rewards/margins": 2.204969882965088, + "rewards/rejected": -21.16495704650879, + "step": 7015 + }, + { + "epoch": 0.23661060366038625, + "grad_norm": 28.779356002807617, + "learning_rate": 9.442223358368868e-07, + "logits/chosen": -0.9354928135871887, + "logits/rejected": -0.8245365023612976, + "logps/chosen": -1.950887680053711, + "logps/rejected": -2.124640703201294, + "loss": 2.5568, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.50887680053711, + "rewards/margins": 1.7375301122665405, + "rewards/rejected": -21.24640655517578, + "step": 7020 + }, + { + "epoch": 0.23677912973136944, + "grad_norm": 22.374792098999023, + "learning_rate": 9.440872559924331e-07, + "logits/chosen": -0.19462314248085022, + "logits/rejected": -0.1797332465648651, + "logps/chosen": -2.0091001987457275, + "logps/rejected": -1.94009530544281, + "loss": 3.8656, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.090999603271484, + "rewards/margins": -0.6900471448898315, + "rewards/rejected": -19.40095329284668, + "step": 7025 + }, + { + "epoch": 0.23694765580235264, + "grad_norm": 79.91705322265625, + "learning_rate": 9.439520224668764e-07, + "logits/chosen": -0.597054660320282, + "logits/rejected": -0.6568640470504761, + "logps/chosen": -1.9609358310699463, + "logps/rejected": -2.1057090759277344, + "loss": 2.5551, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.609355926513672, + "rewards/margins": 1.4477331638336182, + "rewards/rejected": -21.057090759277344, + "step": 7030 + }, + { + "epoch": 0.2371161818733358, + "grad_norm": 20.779041290283203, + "learning_rate": 9.438166353070158e-07, + "logits/chosen": -0.8121329545974731, + "logits/rejected": -0.8105417490005493, + "logps/chosen": -1.6662849187850952, + "logps/rejected": -1.689305305480957, + "loss": 2.9107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.6628475189209, + "rewards/margins": 0.23020382225513458, + "rewards/rejected": -16.893051147460938, + "step": 7035 + }, + { + "epoch": 0.237284707944319, + "grad_norm": 30.07358741760254, + "learning_rate": 9.436810945597034e-07, + "logits/chosen": -0.8572260141372681, + "logits/rejected": -0.9229756593704224, + "logps/chosen": -1.8019100427627563, + "logps/rejected": -1.8127281665802002, + "loss": 3.3009, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.019100189208984, + "rewards/margins": 0.10818271338939667, + "rewards/rejected": -18.127283096313477, + "step": 7040 + }, + { + "epoch": 0.23745323401530216, + "grad_norm": 18.86367416381836, + "learning_rate": 9.435454002718444e-07, + "logits/chosen": -0.6191602349281311, + "logits/rejected": -0.5740264654159546, + "logps/chosen": -1.707728624343872, + "logps/rejected": -1.9487249851226807, + "loss": 2.1382, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.077285766601562, + "rewards/margins": 2.4099647998809814, + "rewards/rejected": -19.48724937438965, + "step": 7045 + }, + { + "epoch": 0.23762176008628536, + "grad_norm": 17.491254806518555, + "learning_rate": 9.434095524903974e-07, + "logits/chosen": -0.5991761088371277, + "logits/rejected": -0.7022507786750793, + "logps/chosen": -1.9639873504638672, + "logps/rejected": -1.9689788818359375, + "loss": 3.1378, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.639873504638672, + "rewards/margins": 0.04991502687335014, + "rewards/rejected": -19.689788818359375, + "step": 7050 + }, + { + "epoch": 0.23779028615726852, + "grad_norm": 36.60987854003906, + "learning_rate": 9.43273551262374e-07, + "logits/chosen": -0.8516243696212769, + "logits/rejected": -0.8240333795547485, + "logps/chosen": -1.865692138671875, + "logps/rejected": -1.9466756582260132, + "loss": 3.1045, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.656917572021484, + "rewards/margins": 0.8098365068435669, + "rewards/rejected": -19.46675682067871, + "step": 7055 + }, + { + "epoch": 0.23795881222825171, + "grad_norm": 16.094783782958984, + "learning_rate": 9.431373966348387e-07, + "logits/chosen": -0.38345012068748474, + "logits/rejected": -0.4076191484928131, + "logps/chosen": -2.2534797191619873, + "logps/rejected": -2.212144613265991, + "loss": 3.5651, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.53479766845703, + "rewards/margins": -0.413351833820343, + "rewards/rejected": -22.121448516845703, + "step": 7060 + }, + { + "epoch": 0.23812733829923488, + "grad_norm": 26.482032775878906, + "learning_rate": 9.430010886549094e-07, + "logits/chosen": -0.7364832162857056, + "logits/rejected": -0.6639753580093384, + "logps/chosen": -2.0670037269592285, + "logps/rejected": -1.7953437566757202, + "loss": 5.8885, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.670034408569336, + "rewards/margins": -2.7165987491607666, + "rewards/rejected": -17.95343589782715, + "step": 7065 + }, + { + "epoch": 0.23829586437021807, + "grad_norm": 23.36323356628418, + "learning_rate": 9.428646273697568e-07, + "logits/chosen": -0.4462354779243469, + "logits/rejected": -0.5697998404502869, + "logps/chosen": -2.318197727203369, + "logps/rejected": -2.4804301261901855, + "loss": 2.7402, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.181976318359375, + "rewards/margins": 1.6223284006118774, + "rewards/rejected": -24.804302215576172, + "step": 7070 + }, + { + "epoch": 0.23846439044120124, + "grad_norm": 29.695451736450195, + "learning_rate": 9.427280128266049e-07, + "logits/chosen": -0.09391313791275024, + "logits/rejected": -0.21296796202659607, + "logps/chosen": -1.7834405899047852, + "logps/rejected": -1.8914865255355835, + "loss": 2.8719, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.83440589904785, + "rewards/margins": 1.0804593563079834, + "rewards/rejected": -18.91486358642578, + "step": 7075 + }, + { + "epoch": 0.23863291651218443, + "grad_norm": 18.212629318237305, + "learning_rate": 9.425912450727305e-07, + "logits/chosen": -0.32436639070510864, + "logits/rejected": -0.4176466464996338, + "logps/chosen": -1.747227668762207, + "logps/rejected": -2.081019639968872, + "loss": 1.5811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.472274780273438, + "rewards/margins": 3.3379178047180176, + "rewards/rejected": -20.810192108154297, + "step": 7080 + }, + { + "epoch": 0.23880144258316763, + "grad_norm": 21.501232147216797, + "learning_rate": 9.424543241554637e-07, + "logits/chosen": -0.5840815901756287, + "logits/rejected": -0.45451974868774414, + "logps/chosen": -2.1316933631896973, + "logps/rejected": -2.056612014770508, + "loss": 3.951, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.316936492919922, + "rewards/margins": -0.7508147954940796, + "rewards/rejected": -20.566120147705078, + "step": 7085 + }, + { + "epoch": 0.2389699686541508, + "grad_norm": 41.77083969116211, + "learning_rate": 9.423172501221872e-07, + "logits/chosen": -0.40182381868362427, + "logits/rejected": -0.4287623465061188, + "logps/chosen": -1.449648141860962, + "logps/rejected": -1.610764741897583, + "loss": 2.4708, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.496480941772461, + "rewards/margins": 1.6111669540405273, + "rewards/rejected": -16.107648849487305, + "step": 7090 + }, + { + "epoch": 0.239138494725134, + "grad_norm": 21.541324615478516, + "learning_rate": 9.42180023020337e-07, + "logits/chosen": -0.6269460320472717, + "logits/rejected": -0.5908786058425903, + "logps/chosen": -1.9286348819732666, + "logps/rejected": -1.9429540634155273, + "loss": 3.1871, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.286346435546875, + "rewards/margins": 0.14319276809692383, + "rewards/rejected": -19.429540634155273, + "step": 7095 + }, + { + "epoch": 0.23930702079611715, + "grad_norm": 40.71235275268555, + "learning_rate": 9.420426428974021e-07, + "logits/chosen": -0.45769214630126953, + "logits/rejected": -0.7178536653518677, + "logps/chosen": -2.998274326324463, + "logps/rejected": -2.8907580375671387, + "loss": 5.4313, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.982742309570312, + "rewards/margins": -1.0751609802246094, + "rewards/rejected": -28.907581329345703, + "step": 7100 + }, + { + "epoch": 0.23947554686710035, + "grad_norm": 21.48967933654785, + "learning_rate": 9.419051098009243e-07, + "logits/chosen": -0.5916844606399536, + "logits/rejected": -0.6824017763137817, + "logps/chosen": -1.5144556760787964, + "logps/rejected": -1.598915457725525, + "loss": 2.7017, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.144556999206543, + "rewards/margins": 0.844598650932312, + "rewards/rejected": -15.989156723022461, + "step": 7105 + }, + { + "epoch": 0.2396440729380835, + "grad_norm": 25.29039764404297, + "learning_rate": 9.417674237784983e-07, + "logits/chosen": -0.4818636476993561, + "logits/rejected": -0.5085574984550476, + "logps/chosen": -1.6022964715957642, + "logps/rejected": -1.549843668937683, + "loss": 3.662, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.02296257019043, + "rewards/margins": -0.5245256423950195, + "rewards/rejected": -15.498437881469727, + "step": 7110 + }, + { + "epoch": 0.2398125990090667, + "grad_norm": 23.890718460083008, + "learning_rate": 9.416295848777718e-07, + "logits/chosen": -0.8053399920463562, + "logits/rejected": -0.7373215556144714, + "logps/chosen": -1.9440422058105469, + "logps/rejected": -1.8284790515899658, + "loss": 4.4745, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.440420150756836, + "rewards/margins": -1.1556302309036255, + "rewards/rejected": -18.2847900390625, + "step": 7115 + }, + { + "epoch": 0.23998112508004987, + "grad_norm": 26.433320999145508, + "learning_rate": 9.414915931464456e-07, + "logits/chosen": -0.8600558042526245, + "logits/rejected": -0.8631598353385925, + "logps/chosen": -1.7601732015609741, + "logps/rejected": -1.8001441955566406, + "loss": 2.8108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.601734161376953, + "rewards/margins": 0.3997109532356262, + "rewards/rejected": -18.00144386291504, + "step": 7120 + }, + { + "epoch": 0.24014965115103307, + "grad_norm": 26.806915283203125, + "learning_rate": 9.413534486322732e-07, + "logits/chosen": -0.5686289072036743, + "logits/rejected": -0.6157848834991455, + "logps/chosen": -1.79929518699646, + "logps/rejected": -1.975182294845581, + "loss": 2.1733, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.992952346801758, + "rewards/margins": 1.7588691711425781, + "rewards/rejected": -19.751821517944336, + "step": 7125 + }, + { + "epoch": 0.24031817722201623, + "grad_norm": 41.423851013183594, + "learning_rate": 9.412151513830606e-07, + "logits/chosen": -0.6805993914604187, + "logits/rejected": -0.9111586809158325, + "logps/chosen": -2.1881282329559326, + "logps/rejected": -2.279981851577759, + "loss": 2.4858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.881282806396484, + "rewards/margins": 0.9185339212417603, + "rewards/rejected": -22.799816131591797, + "step": 7130 + }, + { + "epoch": 0.24048670329299943, + "grad_norm": 23.80078887939453, + "learning_rate": 9.410767014466675e-07, + "logits/chosen": -0.514999508857727, + "logits/rejected": -0.4333207607269287, + "logps/chosen": -1.9776651859283447, + "logps/rejected": -2.026716709136963, + "loss": 2.8135, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.77665138244629, + "rewards/margins": 0.490518182516098, + "rewards/rejected": -20.267169952392578, + "step": 7135 + }, + { + "epoch": 0.24065522936398262, + "grad_norm": 22.437442779541016, + "learning_rate": 9.409380988710057e-07, + "logits/chosen": -0.6936476230621338, + "logits/rejected": -0.7516869306564331, + "logps/chosen": -1.657705545425415, + "logps/rejected": -1.6317991018295288, + "loss": 3.6723, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.577054977416992, + "rewards/margins": -0.25906410813331604, + "rewards/rejected": -16.317991256713867, + "step": 7140 + }, + { + "epoch": 0.24082375543496579, + "grad_norm": 29.402507781982422, + "learning_rate": 9.4079934370404e-07, + "logits/chosen": -0.44884634017944336, + "logits/rejected": -0.40613657236099243, + "logps/chosen": -2.073878049850464, + "logps/rejected": -2.1649553775787354, + "loss": 2.7853, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.738779067993164, + "rewards/margins": 0.9107732772827148, + "rewards/rejected": -21.649551391601562, + "step": 7145 + }, + { + "epoch": 0.24099228150594898, + "grad_norm": 25.87431526184082, + "learning_rate": 9.406604359937884e-07, + "logits/chosen": -0.9141045808792114, + "logits/rejected": -0.8093926310539246, + "logps/chosen": -1.623300552368164, + "logps/rejected": -1.6307185888290405, + "loss": 3.0879, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.23300552368164, + "rewards/margins": 0.07417931407690048, + "rewards/rejected": -16.307186126708984, + "step": 7150 + }, + { + "epoch": 0.24116080757693215, + "grad_norm": 17.400171279907227, + "learning_rate": 9.405213757883212e-07, + "logits/chosen": -0.9052945375442505, + "logits/rejected": -0.8418493270874023, + "logps/chosen": -1.7425527572631836, + "logps/rejected": -2.0263311862945557, + "loss": 2.0622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.425527572631836, + "rewards/margins": 2.837782621383667, + "rewards/rejected": -20.2633113861084, + "step": 7155 + }, + { + "epoch": 0.24132933364791534, + "grad_norm": 18.482860565185547, + "learning_rate": 9.403821631357618e-07, + "logits/chosen": -0.7080804109573364, + "logits/rejected": -0.7618001103401184, + "logps/chosen": -1.8975080251693726, + "logps/rejected": -2.0444533824920654, + "loss": 2.5719, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.975080490112305, + "rewards/margins": 1.4694522619247437, + "rewards/rejected": -20.444534301757812, + "step": 7160 + }, + { + "epoch": 0.2414978597188985, + "grad_norm": 16.333810806274414, + "learning_rate": 9.40242798084286e-07, + "logits/chosen": -1.0082132816314697, + "logits/rejected": -0.8503907322883606, + "logps/chosen": -1.8380146026611328, + "logps/rejected": -1.8924537897109985, + "loss": 3.4038, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.380146026611328, + "rewards/margins": 0.5443927049636841, + "rewards/rejected": -18.924537658691406, + "step": 7165 + }, + { + "epoch": 0.2416663857898817, + "grad_norm": 25.957456588745117, + "learning_rate": 9.401032806821227e-07, + "logits/chosen": -0.6452234387397766, + "logits/rejected": -0.5962679982185364, + "logps/chosen": -1.7202945947647095, + "logps/rejected": -1.86736261844635, + "loss": 2.2991, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.20294761657715, + "rewards/margins": 1.4706783294677734, + "rewards/rejected": -18.67362403869629, + "step": 7170 + }, + { + "epoch": 0.24183491186086487, + "grad_norm": 27.938039779663086, + "learning_rate": 9.399636109775531e-07, + "logits/chosen": -0.5970734357833862, + "logits/rejected": -0.6353007555007935, + "logps/chosen": -1.7174265384674072, + "logps/rejected": -1.934348702430725, + "loss": 1.799, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.174264907836914, + "rewards/margins": 2.169222354888916, + "rewards/rejected": -19.343486785888672, + "step": 7175 + }, + { + "epoch": 0.24200343793184806, + "grad_norm": 32.18524932861328, + "learning_rate": 9.398237890189119e-07, + "logits/chosen": -0.6640039682388306, + "logits/rejected": -0.6827796697616577, + "logps/chosen": -1.8866550922393799, + "logps/rejected": -1.8781245946884155, + "loss": 3.2777, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.866552352905273, + "rewards/margins": -0.08530483394861221, + "rewards/rejected": -18.781246185302734, + "step": 7180 + }, + { + "epoch": 0.24217196400283122, + "grad_norm": 17.34172821044922, + "learning_rate": 9.396838148545855e-07, + "logits/chosen": -0.5536460876464844, + "logits/rejected": -0.5025007724761963, + "logps/chosen": -2.2877159118652344, + "logps/rejected": -2.4163801670074463, + "loss": 2.5845, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.877161026000977, + "rewards/margins": 1.2866411209106445, + "rewards/rejected": -24.163801193237305, + "step": 7185 + }, + { + "epoch": 0.24234049007381442, + "grad_norm": 28.588186264038086, + "learning_rate": 9.395436885330138e-07, + "logits/chosen": -0.38710886240005493, + "logits/rejected": -0.3867741525173187, + "logps/chosen": -2.279740810394287, + "logps/rejected": -2.194554090499878, + "loss": 4.1465, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.797407150268555, + "rewards/margins": -0.8518630862236023, + "rewards/rejected": -21.945545196533203, + "step": 7190 + }, + { + "epoch": 0.2425090161447976, + "grad_norm": 20.487680435180664, + "learning_rate": 9.394034101026887e-07, + "logits/chosen": -0.5616172552108765, + "logits/rejected": -0.6295533776283264, + "logps/chosen": -1.7545499801635742, + "logps/rejected": -2.3229641914367676, + "loss": 2.2103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.54549789428711, + "rewards/margins": 5.684144020080566, + "rewards/rejected": -23.22964096069336, + "step": 7195 + }, + { + "epoch": 0.24267754221578078, + "grad_norm": 25.60300064086914, + "learning_rate": 9.392629796121552e-07, + "logits/chosen": -0.5492648482322693, + "logits/rejected": -0.5753589272499084, + "logps/chosen": -1.7553179264068604, + "logps/rejected": -1.741642951965332, + "loss": 3.5349, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.553180694580078, + "rewards/margins": -0.13674993813037872, + "rewards/rejected": -17.41642951965332, + "step": 7200 + }, + { + "epoch": 0.24267754221578078, + "eval_logits/chosen": -0.8416627645492554, + "eval_logits/rejected": -0.8779700994491577, + "eval_logps/chosen": -1.766628384590149, + "eval_logps/rejected": -1.8019263744354248, + "eval_loss": 3.2034659385681152, + "eval_rewards/accuracies": 0.5799999833106995, + "eval_rewards/chosen": -17.666284561157227, + "eval_rewards/margins": 0.35297882556915283, + "eval_rewards/rejected": -18.019264221191406, + "eval_runtime": 12.9006, + "eval_samples_per_second": 7.752, + "eval_steps_per_second": 1.938, + "step": 7200 + }, + { + "epoch": 0.24284606828676397, + "grad_norm": 25.423349380493164, + "learning_rate": 9.391223971100108e-07, + "logits/chosen": -0.3729293942451477, + "logits/rejected": -0.44401878118515015, + "logps/chosen": -1.965648889541626, + "logps/rejected": -2.294408082962036, + "loss": 2.6272, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.6564884185791, + "rewards/margins": 3.287592649459839, + "rewards/rejected": -22.944080352783203, + "step": 7205 + }, + { + "epoch": 0.24301459435774714, + "grad_norm": 15.547274589538574, + "learning_rate": 9.389816626449054e-07, + "logits/chosen": -0.8222540020942688, + "logits/rejected": -0.9664362072944641, + "logps/chosen": -1.8262207508087158, + "logps/rejected": -1.713092565536499, + "loss": 4.6239, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.26220703125, + "rewards/margins": -1.1312811374664307, + "rewards/rejected": -17.13092613220215, + "step": 7210 + }, + { + "epoch": 0.24318312042873033, + "grad_norm": 28.496145248413086, + "learning_rate": 9.388407762655418e-07, + "logits/chosen": -0.5452624559402466, + "logits/rejected": -0.4920075833797455, + "logps/chosen": -1.8025963306427002, + "logps/rejected": -1.8232835531234741, + "loss": 3.112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.025964736938477, + "rewards/margins": 0.20687207579612732, + "rewards/rejected": -18.23283576965332, + "step": 7215 + }, + { + "epoch": 0.2433516464997135, + "grad_norm": 29.377939224243164, + "learning_rate": 9.386997380206751e-07, + "logits/chosen": -0.7790455222129822, + "logits/rejected": -0.8049925565719604, + "logps/chosen": -1.6588413715362549, + "logps/rejected": -1.706597089767456, + "loss": 2.6962, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.58841323852539, + "rewards/margins": 0.4775591790676117, + "rewards/rejected": -17.06597328186035, + "step": 7220 + }, + { + "epoch": 0.2435201725706967, + "grad_norm": 24.955659866333008, + "learning_rate": 9.385585479591133e-07, + "logits/chosen": -0.3855653405189514, + "logits/rejected": -0.3951939046382904, + "logps/chosen": -1.7210830450057983, + "logps/rejected": -1.7640886306762695, + "loss": 2.962, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.210830688476562, + "rewards/margins": 0.4300549626350403, + "rewards/rejected": -17.640888214111328, + "step": 7225 + }, + { + "epoch": 0.24368869864167986, + "grad_norm": 24.871198654174805, + "learning_rate": 9.384172061297165e-07, + "logits/chosen": -1.087990403175354, + "logits/rejected": -1.0538965463638306, + "logps/chosen": -1.6797294616699219, + "logps/rejected": -1.7016398906707764, + "loss": 2.9103, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.79729652404785, + "rewards/margins": 0.21910543739795685, + "rewards/rejected": -17.016399383544922, + "step": 7230 + }, + { + "epoch": 0.24385722471266305, + "grad_norm": 47.272247314453125, + "learning_rate": 9.382757125813975e-07, + "logits/chosen": -0.5247252583503723, + "logits/rejected": -0.6660190224647522, + "logps/chosen": -1.898813247680664, + "logps/rejected": -1.8705341815948486, + "loss": 3.3691, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.98813247680664, + "rewards/margins": -0.28279104828834534, + "rewards/rejected": -18.705341339111328, + "step": 7235 + }, + { + "epoch": 0.24402575078364622, + "grad_norm": 28.096302032470703, + "learning_rate": 9.381340673631217e-07, + "logits/chosen": -0.7041982412338257, + "logits/rejected": -0.887669563293457, + "logps/chosen": -1.9297678470611572, + "logps/rejected": -1.8985267877578735, + "loss": 3.6096, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.297677993774414, + "rewards/margins": -0.312410831451416, + "rewards/rejected": -18.985267639160156, + "step": 7240 + }, + { + "epoch": 0.2441942768546294, + "grad_norm": 29.46830940246582, + "learning_rate": 9.379922705239072e-07, + "logits/chosen": -0.7096506357192993, + "logits/rejected": -0.776728630065918, + "logps/chosen": -1.73779296875, + "logps/rejected": -1.6395982503890991, + "loss": 4.1988, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.3779296875, + "rewards/margins": -0.9819458723068237, + "rewards/rejected": -16.395984649658203, + "step": 7245 + }, + { + "epoch": 0.2443628029256126, + "grad_norm": 34.96768569946289, + "learning_rate": 9.37850322112824e-07, + "logits/chosen": -0.23200814425945282, + "logits/rejected": -0.40445584058761597, + "logps/chosen": -1.738954782485962, + "logps/rejected": -1.8703186511993408, + "loss": 2.2738, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.38954734802246, + "rewards/margins": 1.313637137413025, + "rewards/rejected": -18.703184127807617, + "step": 7250 + }, + { + "epoch": 0.24453132899659577, + "grad_norm": 16.488719940185547, + "learning_rate": 9.377082221789949e-07, + "logits/chosen": -0.9548704028129578, + "logits/rejected": -1.0706945657730103, + "logps/chosen": -1.6227245330810547, + "logps/rejected": -1.7374531030654907, + "loss": 2.6642, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.227245330810547, + "rewards/margins": 1.1472835540771484, + "rewards/rejected": -17.374530792236328, + "step": 7255 + }, + { + "epoch": 0.24469985506757896, + "grad_norm": 19.77877426147461, + "learning_rate": 9.375659707715951e-07, + "logits/chosen": -0.20007792115211487, + "logits/rejected": -0.41266050934791565, + "logps/chosen": -2.046832323074341, + "logps/rejected": -2.0882670879364014, + "loss": 3.0815, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.46832275390625, + "rewards/margins": 0.4143500328063965, + "rewards/rejected": -20.882671356201172, + "step": 7260 + }, + { + "epoch": 0.24486838113856213, + "grad_norm": 18.866714477539062, + "learning_rate": 9.374235679398524e-07, + "logits/chosen": -0.24752330780029297, + "logits/rejected": -0.22193972766399384, + "logps/chosen": -2.1437008380889893, + "logps/rejected": -2.275897264480591, + "loss": 2.8594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.437007904052734, + "rewards/margins": 1.321963906288147, + "rewards/rejected": -22.75897216796875, + "step": 7265 + }, + { + "epoch": 0.24503690720954532, + "grad_norm": 16.798664093017578, + "learning_rate": 9.372810137330464e-07, + "logits/chosen": -0.7686780691146851, + "logits/rejected": -0.9035130739212036, + "logps/chosen": -1.7028411626815796, + "logps/rejected": -2.389897108078003, + "loss": 1.6245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.028413772583008, + "rewards/margins": 6.8705573081970215, + "rewards/rejected": -23.898971557617188, + "step": 7270 + }, + { + "epoch": 0.2452054332805285, + "grad_norm": 24.184377670288086, + "learning_rate": 9.371383082005098e-07, + "logits/chosen": -0.8139625787734985, + "logits/rejected": -0.7616298794746399, + "logps/chosen": -1.6962127685546875, + "logps/rejected": -1.8044872283935547, + "loss": 2.5592, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.962127685546875, + "rewards/margins": 1.0827454328536987, + "rewards/rejected": -18.044872283935547, + "step": 7275 + }, + { + "epoch": 0.24537395935151168, + "grad_norm": 24.054651260375977, + "learning_rate": 9.369954513916273e-07, + "logits/chosen": -0.4652763903141022, + "logits/rejected": -0.42502039670944214, + "logps/chosen": -2.0171265602111816, + "logps/rejected": -1.992876410484314, + "loss": 3.6302, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.1712646484375, + "rewards/margins": -0.24250011146068573, + "rewards/rejected": -19.92876625061035, + "step": 7280 + }, + { + "epoch": 0.24554248542249485, + "grad_norm": 19.809432983398438, + "learning_rate": 9.36852443355836e-07, + "logits/chosen": -0.7408286333084106, + "logits/rejected": -0.7723020911216736, + "logps/chosen": -1.9411401748657227, + "logps/rejected": -2.0283544063568115, + "loss": 2.6034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.41140365600586, + "rewards/margins": 0.8721408843994141, + "rewards/rejected": -20.283544540405273, + "step": 7285 + }, + { + "epoch": 0.24571101149347804, + "grad_norm": 26.219873428344727, + "learning_rate": 9.367092841426254e-07, + "logits/chosen": -0.5848680138587952, + "logits/rejected": -0.6235638856887817, + "logps/chosen": -1.9324791431427002, + "logps/rejected": -1.8378928899765015, + "loss": 4.3249, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.324790954589844, + "rewards/margins": -0.94586181640625, + "rewards/rejected": -18.37892723083496, + "step": 7290 + }, + { + "epoch": 0.2458795375644612, + "grad_norm": 17.715272903442383, + "learning_rate": 9.365659738015372e-07, + "logits/chosen": -0.855682373046875, + "logits/rejected": -0.9267571568489075, + "logps/chosen": -1.8638238906860352, + "logps/rejected": -1.7686694860458374, + "loss": 4.0473, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.63823890686035, + "rewards/margins": -0.9515427350997925, + "rewards/rejected": -17.686695098876953, + "step": 7295 + }, + { + "epoch": 0.2460480636354444, + "grad_norm": 43.57025146484375, + "learning_rate": 9.364225123821655e-07, + "logits/chosen": -0.5702222585678101, + "logits/rejected": -0.4446120262145996, + "logps/chosen": -1.705815076828003, + "logps/rejected": -1.676786184310913, + "loss": 3.5881, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.058151245117188, + "rewards/margins": -0.29028749465942383, + "rewards/rejected": -16.767864227294922, + "step": 7300 + }, + { + "epoch": 0.2462165897064276, + "grad_norm": 81.14810180664062, + "learning_rate": 9.362788999341567e-07, + "logits/chosen": -0.5665368437767029, + "logits/rejected": -0.5875300168991089, + "logps/chosen": -1.9316835403442383, + "logps/rejected": -1.9984298944473267, + "loss": 2.5229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.31683349609375, + "rewards/margins": 0.66746586561203, + "rewards/rejected": -19.984302520751953, + "step": 7305 + }, + { + "epoch": 0.24638511577741076, + "grad_norm": 24.982755661010742, + "learning_rate": 9.36135136507209e-07, + "logits/chosen": -0.6017649173736572, + "logits/rejected": -0.5996742248535156, + "logps/chosen": -2.1955275535583496, + "logps/rejected": -2.2533159255981445, + "loss": 3.0001, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.955278396606445, + "rewards/margins": 0.5778809785842896, + "rewards/rejected": -22.533157348632812, + "step": 7310 + }, + { + "epoch": 0.24655364184839396, + "grad_norm": 36.10542678833008, + "learning_rate": 9.35991222151074e-07, + "logits/chosen": -0.430186927318573, + "logits/rejected": -0.6433888673782349, + "logps/chosen": -1.7533395290374756, + "logps/rejected": -1.7290500402450562, + "loss": 3.5198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.533395767211914, + "rewards/margins": -0.242896169424057, + "rewards/rejected": -17.29050064086914, + "step": 7315 + }, + { + "epoch": 0.24672216791937712, + "grad_norm": 25.019153594970703, + "learning_rate": 9.358471569155542e-07, + "logits/chosen": -0.6420100331306458, + "logits/rejected": -0.5819183588027954, + "logps/chosen": -1.7961498498916626, + "logps/rejected": -1.8753328323364258, + "loss": 2.7737, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.961498260498047, + "rewards/margins": 0.7918294072151184, + "rewards/rejected": -18.753326416015625, + "step": 7320 + }, + { + "epoch": 0.24689069399036032, + "grad_norm": 24.0214900970459, + "learning_rate": 9.35702940850505e-07, + "logits/chosen": -0.6717097163200378, + "logits/rejected": -0.6161051392555237, + "logps/chosen": -1.6413822174072266, + "logps/rejected": -1.6599630117416382, + "loss": 2.8914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.413822174072266, + "rewards/margins": 0.18580922484397888, + "rewards/rejected": -16.599628448486328, + "step": 7325 + }, + { + "epoch": 0.24705922006134348, + "grad_norm": 16.337671279907227, + "learning_rate": 9.355585740058341e-07, + "logits/chosen": -0.6781617999076843, + "logits/rejected": -0.8230530023574829, + "logps/chosen": -1.4555952548980713, + "logps/rejected": -1.6636173725128174, + "loss": 1.8262, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.555951118469238, + "rewards/margins": 2.0802226066589355, + "rewards/rejected": -16.636173248291016, + "step": 7330 + }, + { + "epoch": 0.24722774613232668, + "grad_norm": 43.3533935546875, + "learning_rate": 9.354140564315011e-07, + "logits/chosen": -0.8407415151596069, + "logits/rejected": -0.846244215965271, + "logps/chosen": -1.916285514831543, + "logps/rejected": -1.8819057941436768, + "loss": 3.4865, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.162857055664062, + "rewards/margins": -0.34379902482032776, + "rewards/rejected": -18.81905746459961, + "step": 7335 + }, + { + "epoch": 0.24739627220330984, + "grad_norm": 19.098432540893555, + "learning_rate": 9.352693881775178e-07, + "logits/chosen": -0.7666581273078918, + "logits/rejected": -0.7992604970932007, + "logps/chosen": -1.715882658958435, + "logps/rejected": -1.7032171487808228, + "loss": 3.3118, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.15882682800293, + "rewards/margins": -0.1266559660434723, + "rewards/rejected": -17.03217124938965, + "step": 7340 + }, + { + "epoch": 0.24756479827429304, + "grad_norm": 28.776378631591797, + "learning_rate": 9.35124569293948e-07, + "logits/chosen": -0.25755545496940613, + "logits/rejected": -0.4848417341709137, + "logps/chosen": -1.7341407537460327, + "logps/rejected": -1.7520534992218018, + "loss": 2.9939, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.34140396118164, + "rewards/margins": 0.179127499461174, + "rewards/rejected": -17.52053451538086, + "step": 7345 + }, + { + "epoch": 0.2477333243452762, + "grad_norm": 45.91712951660156, + "learning_rate": 9.349795998309081e-07, + "logits/chosen": -0.44464340806007385, + "logits/rejected": -0.4558509290218353, + "logps/chosen": -1.9467509984970093, + "logps/rejected": -1.8426557779312134, + "loss": 4.5344, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.46750831604004, + "rewards/margins": -1.0409516096115112, + "rewards/rejected": -18.426557540893555, + "step": 7350 + }, + { + "epoch": 0.2479018504162594, + "grad_norm": 42.15274429321289, + "learning_rate": 9.348344798385662e-07, + "logits/chosen": -0.3427899479866028, + "logits/rejected": -0.3019963800907135, + "logps/chosen": -2.4352622032165527, + "logps/rejected": -2.2316198348999023, + "loss": 5.0965, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.35262107849121, + "rewards/margins": -2.036423921585083, + "rewards/rejected": -22.31619644165039, + "step": 7355 + }, + { + "epoch": 0.2480703764872426, + "grad_norm": 71.61480712890625, + "learning_rate": 9.346892093671427e-07, + "logits/chosen": -0.32045572996139526, + "logits/rejected": -0.4350900650024414, + "logps/chosen": -2.1125540733337402, + "logps/rejected": -2.0484910011291504, + "loss": 4.0104, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.125539779663086, + "rewards/margins": -0.6406329274177551, + "rewards/rejected": -20.484909057617188, + "step": 7360 + }, + { + "epoch": 0.24823890255822575, + "grad_norm": 78.812255859375, + "learning_rate": 9.345437884669098e-07, + "logits/chosen": -0.6491799354553223, + "logits/rejected": -0.7181687355041504, + "logps/chosen": -2.0276429653167725, + "logps/rejected": -2.0618278980255127, + "loss": 2.8536, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.276430130004883, + "rewards/margins": 0.341848760843277, + "rewards/rejected": -20.61827850341797, + "step": 7365 + }, + { + "epoch": 0.24840742862920895, + "grad_norm": 25.033784866333008, + "learning_rate": 9.343982171881921e-07, + "logits/chosen": -0.6499834060668945, + "logits/rejected": -0.6727867126464844, + "logps/chosen": -2.126075029373169, + "logps/rejected": -2.216430425643921, + "loss": 2.5102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.26074981689453, + "rewards/margins": 0.903557300567627, + "rewards/rejected": -22.164308547973633, + "step": 7370 + }, + { + "epoch": 0.24857595470019211, + "grad_norm": 16.618209838867188, + "learning_rate": 9.342524955813661e-07, + "logits/chosen": -0.6488254070281982, + "logits/rejected": -0.5306954979896545, + "logps/chosen": -1.7417656183242798, + "logps/rejected": -1.8435453176498413, + "loss": 2.5985, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.41765785217285, + "rewards/margins": 1.0177967548370361, + "rewards/rejected": -18.43545150756836, + "step": 7375 + }, + { + "epoch": 0.2487444807711753, + "grad_norm": 25.25537872314453, + "learning_rate": 9.341066236968602e-07, + "logits/chosen": -0.6297262907028198, + "logits/rejected": -0.6211252212524414, + "logps/chosen": -1.7245155572891235, + "logps/rejected": -1.6116546392440796, + "loss": 4.2853, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.24515724182129, + "rewards/margins": -1.128609299659729, + "rewards/rejected": -16.116546630859375, + "step": 7380 + }, + { + "epoch": 0.24891300684215847, + "grad_norm": 36.48454284667969, + "learning_rate": 9.339606015851549e-07, + "logits/chosen": -0.4730660319328308, + "logits/rejected": -0.5225784778594971, + "logps/chosen": -1.9237180948257446, + "logps/rejected": -2.076869249343872, + "loss": 3.0223, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.237178802490234, + "rewards/margins": 1.5315120220184326, + "rewards/rejected": -20.768692016601562, + "step": 7385 + }, + { + "epoch": 0.24908153291314167, + "grad_norm": 33.16250228881836, + "learning_rate": 9.338144292967829e-07, + "logits/chosen": -0.3939630091190338, + "logits/rejected": -0.3019777834415436, + "logps/chosen": -1.8183307647705078, + "logps/rejected": -1.838861107826233, + "loss": 3.0173, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.18330955505371, + "rewards/margins": 0.2053002417087555, + "rewards/rejected": -18.38861083984375, + "step": 7390 + }, + { + "epoch": 0.24925005898412483, + "grad_norm": 36.6594123840332, + "learning_rate": 9.336681068823284e-07, + "logits/chosen": -0.38314300775527954, + "logits/rejected": -0.47642940282821655, + "logps/chosen": -2.0626749992370605, + "logps/rejected": -2.094438076019287, + "loss": 3.0801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.626750946044922, + "rewards/margins": 0.31763094663619995, + "rewards/rejected": -20.944381713867188, + "step": 7395 + }, + { + "epoch": 0.24941858505510803, + "grad_norm": 25.774776458740234, + "learning_rate": 9.335216343924279e-07, + "logits/chosen": -0.3303954005241394, + "logits/rejected": -0.4347568154335022, + "logps/chosen": -1.9623749256134033, + "logps/rejected": -2.0461456775665283, + "loss": 2.6242, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.623748779296875, + "rewards/margins": 0.8377087712287903, + "rewards/rejected": -20.461456298828125, + "step": 7400 + }, + { + "epoch": 0.2495871111260912, + "grad_norm": 25.02663230895996, + "learning_rate": 9.333750118777699e-07, + "logits/chosen": -0.5133193731307983, + "logits/rejected": -0.3330710828304291, + "logps/chosen": -1.7310879230499268, + "logps/rejected": -1.8869367837905884, + "loss": 2.0172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.31087875366211, + "rewards/margins": 1.558487057685852, + "rewards/rejected": -18.869367599487305, + "step": 7405 + }, + { + "epoch": 0.2497556371970744, + "grad_norm": 24.98771095275879, + "learning_rate": 9.332282393890946e-07, + "logits/chosen": -0.007177996449172497, + "logits/rejected": -0.031101590022444725, + "logps/chosen": -2.019897937774658, + "logps/rejected": -2.3106188774108887, + "loss": 1.9254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.1989803314209, + "rewards/margins": 2.907209873199463, + "rewards/rejected": -23.106189727783203, + "step": 7410 + }, + { + "epoch": 0.24992416326805758, + "grad_norm": 30.926401138305664, + "learning_rate": 9.330813169771941e-07, + "logits/chosen": -0.8511411547660828, + "logits/rejected": -0.7760568857192993, + "logps/chosen": -1.6532011032104492, + "logps/rejected": -1.6798311471939087, + "loss": 3.0094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.532011032104492, + "rewards/margins": 0.2663000524044037, + "rewards/rejected": -16.79831314086914, + "step": 7415 + }, + { + "epoch": 0.2500926893390408, + "grad_norm": 16.393407821655273, + "learning_rate": 9.329342446929125e-07, + "logits/chosen": -0.7259313464164734, + "logits/rejected": -0.7014585137367249, + "logps/chosen": -1.7854305505752563, + "logps/rejected": -1.9603378772735596, + "loss": 1.9886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.854305267333984, + "rewards/margins": 1.749071478843689, + "rewards/rejected": -19.603378295898438, + "step": 7420 + }, + { + "epoch": 0.25026121541002394, + "grad_norm": 34.89096450805664, + "learning_rate": 9.327870225871458e-07, + "logits/chosen": -0.8024528622627258, + "logits/rejected": -0.690179705619812, + "logps/chosen": -1.7187427282333374, + "logps/rejected": -1.8699325323104858, + "loss": 2.1882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.187427520751953, + "rewards/margins": 1.511897087097168, + "rewards/rejected": -18.699325561523438, + "step": 7425 + }, + { + "epoch": 0.2504297414810071, + "grad_norm": 23.38494873046875, + "learning_rate": 9.326396507108417e-07, + "logits/chosen": -0.7363360524177551, + "logits/rejected": -0.7402480840682983, + "logps/chosen": -1.6879494190216064, + "logps/rejected": -1.7465381622314453, + "loss": 2.6665, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.87949562072754, + "rewards/margins": 0.5858873128890991, + "rewards/rejected": -17.465381622314453, + "step": 7430 + }, + { + "epoch": 0.2505982675519903, + "grad_norm": 29.959455490112305, + "learning_rate": 9.324921291149999e-07, + "logits/chosen": -0.7818613648414612, + "logits/rejected": -0.8807746171951294, + "logps/chosen": -1.4602100849151611, + "logps/rejected": -1.6767206192016602, + "loss": 1.5994, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.60210132598877, + "rewards/margins": 2.1651058197021484, + "rewards/rejected": -16.7672061920166, + "step": 7435 + }, + { + "epoch": 0.2507667936229735, + "grad_norm": 33.429073333740234, + "learning_rate": 9.323444578506716e-07, + "logits/chosen": -0.5882720947265625, + "logits/rejected": -0.756925642490387, + "logps/chosen": -1.9938703775405884, + "logps/rejected": -2.0285696983337402, + "loss": 2.953, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.938703536987305, + "rewards/margins": 0.3469921946525574, + "rewards/rejected": -20.285696029663086, + "step": 7440 + }, + { + "epoch": 0.25093531969395666, + "grad_norm": 35.908668518066406, + "learning_rate": 9.3219663696896e-07, + "logits/chosen": -0.32356154918670654, + "logits/rejected": -0.2778133749961853, + "logps/chosen": -1.959058403968811, + "logps/rejected": -1.9171994924545288, + "loss": 3.755, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.5905818939209, + "rewards/margins": -0.4185875952243805, + "rewards/rejected": -19.171995162963867, + "step": 7445 + }, + { + "epoch": 0.2511038457649398, + "grad_norm": 19.67608642578125, + "learning_rate": 9.320486665210204e-07, + "logits/chosen": -0.3477417528629303, + "logits/rejected": -0.430941641330719, + "logps/chosen": -1.936028242111206, + "logps/rejected": -2.15446400642395, + "loss": 2.2592, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.360280990600586, + "rewards/margins": 2.1843581199645996, + "rewards/rejected": -21.544641494750977, + "step": 7450 + }, + { + "epoch": 0.251272371835923, + "grad_norm": 17.752864837646484, + "learning_rate": 9.319005465580594e-07, + "logits/chosen": -0.6723566651344299, + "logits/rejected": -0.7199611067771912, + "logps/chosen": -1.8637107610702515, + "logps/rejected": -2.097921371459961, + "loss": 1.4462, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.637107849121094, + "rewards/margins": 2.3421072959899902, + "rewards/rejected": -20.979215621948242, + "step": 7455 + }, + { + "epoch": 0.2514408979069062, + "grad_norm": 29.509733200073242, + "learning_rate": 9.317522771313353e-07, + "logits/chosen": -0.3042893409729004, + "logits/rejected": -0.39669641852378845, + "logps/chosen": -1.7221676111221313, + "logps/rejected": -1.6911754608154297, + "loss": 3.5054, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.221675872802734, + "rewards/margins": -0.30992332100868225, + "rewards/rejected": -16.911754608154297, + "step": 7460 + }, + { + "epoch": 0.2516094239778894, + "grad_norm": 90.46784973144531, + "learning_rate": 9.316038582921586e-07, + "logits/chosen": -0.5147503614425659, + "logits/rejected": -0.5856298208236694, + "logps/chosen": -2.3417937755584717, + "logps/rejected": -2.436811923980713, + "loss": 2.7598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.41794204711914, + "rewards/margins": 0.950177788734436, + "rewards/rejected": -24.368114471435547, + "step": 7465 + }, + { + "epoch": 0.25177795004887255, + "grad_norm": 36.988059997558594, + "learning_rate": 9.314552900918908e-07, + "logits/chosen": -0.6642917394638062, + "logits/rejected": -0.7818363308906555, + "logps/chosen": -1.3997938632965088, + "logps/rejected": -1.5536857843399048, + "loss": 2.2515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.99793815612793, + "rewards/margins": 1.5389198064804077, + "rewards/rejected": -15.536859512329102, + "step": 7470 + }, + { + "epoch": 0.25194647611985577, + "grad_norm": 24.968826293945312, + "learning_rate": 9.31306572581946e-07, + "logits/chosen": -0.3435518741607666, + "logits/rejected": -0.3793199360370636, + "logps/chosen": -1.8419605493545532, + "logps/rejected": -1.8113027811050415, + "loss": 3.7557, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.419605255126953, + "rewards/margins": -0.30657798051834106, + "rewards/rejected": -18.113027572631836, + "step": 7475 + }, + { + "epoch": 0.25211500219083893, + "grad_norm": 22.026336669921875, + "learning_rate": 9.311577058137892e-07, + "logits/chosen": -0.9103565216064453, + "logits/rejected": -0.6073547601699829, + "logps/chosen": -1.4468992948532104, + "logps/rejected": -1.6920430660247803, + "loss": 1.7871, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -14.468992233276367, + "rewards/margins": 2.451439380645752, + "rewards/rejected": -16.92043113708496, + "step": 7480 + }, + { + "epoch": 0.2522835282618221, + "grad_norm": 30.010156631469727, + "learning_rate": 9.310086898389374e-07, + "logits/chosen": -0.632883608341217, + "logits/rejected": -0.7742358446121216, + "logps/chosen": -1.770172357559204, + "logps/rejected": -1.7545216083526611, + "loss": 3.3847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.701723098754883, + "rewards/margins": -0.1565081626176834, + "rewards/rejected": -17.545215606689453, + "step": 7485 + }, + { + "epoch": 0.25245205433280526, + "grad_norm": 9.875432968139648, + "learning_rate": 9.30859524708959e-07, + "logits/chosen": -0.6751956343650818, + "logits/rejected": -0.7236267328262329, + "logps/chosen": -1.6921708583831787, + "logps/rejected": -2.151487350463867, + "loss": 1.9065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.921709060668945, + "rewards/margins": 4.593166351318359, + "rewards/rejected": -21.514873504638672, + "step": 7490 + }, + { + "epoch": 0.2526205804037885, + "grad_norm": 8.505745887756348, + "learning_rate": 9.307102104754742e-07, + "logits/chosen": -0.5317801237106323, + "logits/rejected": -0.3818480968475342, + "logps/chosen": -1.8155962228775024, + "logps/rejected": -2.12412166595459, + "loss": 2.6481, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.155963897705078, + "rewards/margins": 3.0852532386779785, + "rewards/rejected": -21.2412166595459, + "step": 7495 + }, + { + "epoch": 0.25278910647477165, + "grad_norm": 29.946495056152344, + "learning_rate": 9.30560747190155e-07, + "logits/chosen": -0.6961748003959656, + "logits/rejected": -0.7425335049629211, + "logps/chosen": -2.139946460723877, + "logps/rejected": -1.9574038982391357, + "loss": 4.8539, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.399463653564453, + "rewards/margins": -1.8254258632659912, + "rewards/rejected": -19.574039459228516, + "step": 7500 + }, + { + "epoch": 0.2529576325457548, + "grad_norm": 8.821472147246823e-05, + "learning_rate": 9.304111349047245e-07, + "logits/chosen": -0.3789060115814209, + "logits/rejected": -0.48966988921165466, + "logps/chosen": -2.3770015239715576, + "logps/rejected": -2.9544246196746826, + "loss": 2.4364, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.770015716552734, + "rewards/margins": 5.774231433868408, + "rewards/rejected": -29.54425048828125, + "step": 7505 + }, + { + "epoch": 0.253126158616738, + "grad_norm": 23.108131408691406, + "learning_rate": 9.30261373670958e-07, + "logits/chosen": -0.3439405858516693, + "logits/rejected": -0.3222549259662628, + "logps/chosen": -1.766324758529663, + "logps/rejected": -1.7195783853530884, + "loss": 3.6566, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.66324806213379, + "rewards/margins": -0.46746331453323364, + "rewards/rejected": -17.195783615112305, + "step": 7510 + }, + { + "epoch": 0.2532946846877212, + "grad_norm": 32.8663444519043, + "learning_rate": 9.301114635406813e-07, + "logits/chosen": -0.43090614676475525, + "logits/rejected": -0.40088552236557007, + "logps/chosen": -1.8684027194976807, + "logps/rejected": -2.1092865467071533, + "loss": 3.1633, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.684024810791016, + "rewards/margins": 2.4088387489318848, + "rewards/rejected": -21.09286880493164, + "step": 7515 + }, + { + "epoch": 0.25346321075870437, + "grad_norm": 3.117332935333252, + "learning_rate": 9.299614045657731e-07, + "logits/chosen": -0.3704325258731842, + "logits/rejected": -0.44101667404174805, + "logps/chosen": -1.7551378011703491, + "logps/rejected": -1.8985016345977783, + "loss": 2.3468, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.55137825012207, + "rewards/margins": 1.4336389303207397, + "rewards/rejected": -18.985015869140625, + "step": 7520 + }, + { + "epoch": 0.25363173682968754, + "grad_norm": 30.372228622436523, + "learning_rate": 9.298111967981625e-07, + "logits/chosen": -0.24413923919200897, + "logits/rejected": -0.28805142641067505, + "logps/chosen": -2.1313605308532715, + "logps/rejected": -2.2983944416046143, + "loss": 2.7782, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.313602447509766, + "rewards/margins": 1.67034113407135, + "rewards/rejected": -22.983945846557617, + "step": 7525 + }, + { + "epoch": 0.25380026290067076, + "grad_norm": 32.955474853515625, + "learning_rate": 9.296608402898305e-07, + "logits/chosen": -0.6186766624450684, + "logits/rejected": -0.6145849823951721, + "logps/chosen": -1.6524741649627686, + "logps/rejected": -1.6749063730239868, + "loss": 3.0369, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.524742126464844, + "rewards/margins": 0.2243211716413498, + "rewards/rejected": -16.74906349182129, + "step": 7530 + }, + { + "epoch": 0.2539687889716539, + "grad_norm": 28.929431915283203, + "learning_rate": 9.295103350928098e-07, + "logits/chosen": -0.7780159115791321, + "logits/rejected": -0.6910878419876099, + "logps/chosen": -1.8065674304962158, + "logps/rejected": -1.903062105178833, + "loss": 2.889, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.065673828125, + "rewards/margins": 0.9649454951286316, + "rewards/rejected": -19.030620574951172, + "step": 7535 + }, + { + "epoch": 0.2541373150426371, + "grad_norm": 18.927898406982422, + "learning_rate": 9.293596812591839e-07, + "logits/chosen": -0.23738765716552734, + "logits/rejected": -0.3145070970058441, + "logps/chosen": -2.188699722290039, + "logps/rejected": -2.463549852371216, + "loss": 3.5589, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.886999130249023, + "rewards/margins": 2.7484989166259766, + "rewards/rejected": -24.635496139526367, + "step": 7540 + }, + { + "epoch": 0.25430584111362026, + "grad_norm": 11.518166542053223, + "learning_rate": 9.292088788410885e-07, + "logits/chosen": -0.27805063128471375, + "logits/rejected": -0.3325250744819641, + "logps/chosen": -2.21008563041687, + "logps/rejected": -2.4034438133239746, + "loss": 3.4056, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.10085678100586, + "rewards/margins": 1.9335815906524658, + "rewards/rejected": -24.034439086914062, + "step": 7545 + }, + { + "epoch": 0.2544743671846035, + "grad_norm": 32.053321838378906, + "learning_rate": 9.290579278907104e-07, + "logits/chosen": -0.6328898072242737, + "logits/rejected": -0.5089461207389832, + "logps/chosen": -1.902991533279419, + "logps/rejected": -2.017669200897217, + "loss": 2.6123, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.02991485595703, + "rewards/margins": 1.1467796564102173, + "rewards/rejected": -20.176692962646484, + "step": 7550 + }, + { + "epoch": 0.25464289325558664, + "grad_norm": 42.185794830322266, + "learning_rate": 9.289068284602877e-07, + "logits/chosen": -0.43494969606399536, + "logits/rejected": -0.5104081034660339, + "logps/chosen": -2.2199254035949707, + "logps/rejected": -2.1992757320404053, + "loss": 3.3121, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.199254989624023, + "rewards/margins": -0.20649738609790802, + "rewards/rejected": -21.992755889892578, + "step": 7555 + }, + { + "epoch": 0.2548114193265698, + "grad_norm": 27.1168270111084, + "learning_rate": 9.287555806021097e-07, + "logits/chosen": -0.9154708981513977, + "logits/rejected": -0.902691662311554, + "logps/chosen": -1.6451492309570312, + "logps/rejected": -1.5507086515426636, + "loss": 4.0632, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.451494216918945, + "rewards/margins": -0.9444063901901245, + "rewards/rejected": -15.507085800170898, + "step": 7560 + }, + { + "epoch": 0.254979945397553, + "grad_norm": 10.494613647460938, + "learning_rate": 9.286041843685177e-07, + "logits/chosen": -0.3919174075126648, + "logits/rejected": -0.5906190872192383, + "logps/chosen": -2.21108341217041, + "logps/rejected": -2.351998805999756, + "loss": 3.1334, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.11083221435547, + "rewards/margins": 1.409155011177063, + "rewards/rejected": -23.519987106323242, + "step": 7565 + }, + { + "epoch": 0.2551484714685362, + "grad_norm": 15.32865047454834, + "learning_rate": 9.284526398119038e-07, + "logits/chosen": -0.23844066262245178, + "logits/rejected": -0.24300916492938995, + "logps/chosen": -1.9033092260360718, + "logps/rejected": -2.0004773139953613, + "loss": 2.6859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.033092498779297, + "rewards/margins": 0.9716783761978149, + "rewards/rejected": -20.004772186279297, + "step": 7570 + }, + { + "epoch": 0.25531699753951936, + "grad_norm": 18.072689056396484, + "learning_rate": 9.283009469847116e-07, + "logits/chosen": -0.6887549161911011, + "logits/rejected": -0.7601548433303833, + "logps/chosen": -1.7088873386383057, + "logps/rejected": -1.8237041234970093, + "loss": 2.1205, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.0888729095459, + "rewards/margins": 1.1481688022613525, + "rewards/rejected": -18.237041473388672, + "step": 7575 + }, + { + "epoch": 0.25548552361050253, + "grad_norm": 31.30813980102539, + "learning_rate": 9.281491059394361e-07, + "logits/chosen": -0.7471412420272827, + "logits/rejected": -0.6275310516357422, + "logps/chosen": -1.9791702032089233, + "logps/rejected": -2.046593189239502, + "loss": 2.6766, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.791702270507812, + "rewards/margins": 0.6742300987243652, + "rewards/rejected": -20.465932846069336, + "step": 7580 + }, + { + "epoch": 0.25565404968148575, + "grad_norm": 23.66309356689453, + "learning_rate": 9.279971167286233e-07, + "logits/chosen": -0.44159945845603943, + "logits/rejected": -0.5053187608718872, + "logps/chosen": -2.107053518295288, + "logps/rejected": -2.204468250274658, + "loss": 2.3342, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.070533752441406, + "rewards/margins": 0.9741487503051758, + "rewards/rejected": -22.0446834564209, + "step": 7585 + }, + { + "epoch": 0.2558225757524689, + "grad_norm": 40.30325698852539, + "learning_rate": 9.27844979404871e-07, + "logits/chosen": 0.027735818177461624, + "logits/rejected": -0.025748800486326218, + "logps/chosen": -2.3189496994018555, + "logps/rejected": -2.194973945617676, + "loss": 4.4813, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.189496994018555, + "rewards/margins": -1.239757776260376, + "rewards/rejected": -21.949739456176758, + "step": 7590 + }, + { + "epoch": 0.2559911018234521, + "grad_norm": 15.760374069213867, + "learning_rate": 9.276926940208276e-07, + "logits/chosen": -0.43298858404159546, + "logits/rejected": -0.38209548592567444, + "logps/chosen": -1.8992574214935303, + "logps/rejected": -1.923172950744629, + "loss": 4.0977, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.992572784423828, + "rewards/margins": 0.23915652930736542, + "rewards/rejected": -19.231731414794922, + "step": 7595 + }, + { + "epoch": 0.25615962789443525, + "grad_norm": 14.925925254821777, + "learning_rate": 9.275402606291933e-07, + "logits/chosen": -0.8036720156669617, + "logits/rejected": -0.803006649017334, + "logps/chosen": -1.5346969366073608, + "logps/rejected": -1.8176238536834717, + "loss": 2.0604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.346969604492188, + "rewards/margins": 2.829270839691162, + "rewards/rejected": -18.176240921020508, + "step": 7600 + }, + { + "epoch": 0.25615962789443525, + "eval_logits/chosen": -0.8601851463317871, + "eval_logits/rejected": -0.9017350077629089, + "eval_logps/chosen": -1.7739288806915283, + "eval_logps/rejected": -1.8104459047317505, + "eval_loss": 3.192545175552368, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -17.739290237426758, + "eval_rewards/margins": 0.3651680648326874, + "eval_rewards/rejected": -18.10445785522461, + "eval_runtime": 12.9145, + "eval_samples_per_second": 7.743, + "eval_steps_per_second": 1.936, + "step": 7600 + }, + { + "epoch": 0.25632815396541847, + "grad_norm": 26.527164459228516, + "learning_rate": 9.273876792827192e-07, + "logits/chosen": -0.717302143573761, + "logits/rejected": -0.7854380011558533, + "logps/chosen": -1.8394664525985718, + "logps/rejected": -1.8153263330459595, + "loss": 3.3416, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.394664764404297, + "rewards/margins": -0.24140223860740662, + "rewards/rejected": -18.153263092041016, + "step": 7605 + }, + { + "epoch": 0.25649668003640164, + "grad_norm": 17.837858200073242, + "learning_rate": 9.272349500342076e-07, + "logits/chosen": -0.6437766551971436, + "logits/rejected": -0.7148224115371704, + "logps/chosen": -1.7915229797363281, + "logps/rejected": -2.0384459495544434, + "loss": 2.5635, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.91522789001465, + "rewards/margins": 2.4692318439483643, + "rewards/rejected": -20.384458541870117, + "step": 7610 + }, + { + "epoch": 0.2566652061073848, + "grad_norm": 9.811715126037598, + "learning_rate": 9.270820729365123e-07, + "logits/chosen": -0.4629322588443756, + "logits/rejected": -0.4842115044593811, + "logps/chosen": -2.0855185985565186, + "logps/rejected": -2.273648977279663, + "loss": 2.1479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.855188369750977, + "rewards/margins": 1.8813034296035767, + "rewards/rejected": -22.736492156982422, + "step": 7615 + }, + { + "epoch": 0.25683373217836797, + "grad_norm": 20.732784271240234, + "learning_rate": 9.269290480425378e-07, + "logits/chosen": -0.7600752115249634, + "logits/rejected": -0.8454673886299133, + "logps/chosen": -1.9267241954803467, + "logps/rejected": -2.1829326152801514, + "loss": 2.5112, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.267242431640625, + "rewards/margins": 2.562086582183838, + "rewards/rejected": -21.829326629638672, + "step": 7620 + }, + { + "epoch": 0.2570022582493512, + "grad_norm": 27.150493621826172, + "learning_rate": 9.267758754052402e-07, + "logits/chosen": -0.5359091758728027, + "logits/rejected": -0.4138420522212982, + "logps/chosen": -2.203834056854248, + "logps/rejected": -2.302135467529297, + "loss": 2.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.038341522216797, + "rewards/margins": 0.9830153584480286, + "rewards/rejected": -23.02135467529297, + "step": 7625 + }, + { + "epoch": 0.25717078432033436, + "grad_norm": 31.828556060791016, + "learning_rate": 9.266225550776265e-07, + "logits/chosen": -0.3831477761268616, + "logits/rejected": -0.4036986827850342, + "logps/chosen": -1.6559679508209229, + "logps/rejected": -1.6691372394561768, + "loss": 3.626, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.55967903137207, + "rewards/margins": 0.13169364631175995, + "rewards/rejected": -16.69137191772461, + "step": 7630 + }, + { + "epoch": 0.2573393103913175, + "grad_norm": 40.279781341552734, + "learning_rate": 9.264690871127545e-07, + "logits/chosen": -0.5447670817375183, + "logits/rejected": -0.37773579359054565, + "logps/chosen": -1.839734673500061, + "logps/rejected": -1.8037055730819702, + "loss": 3.8751, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.3973445892334, + "rewards/margins": -0.3602902889251709, + "rewards/rejected": -18.03705596923828, + "step": 7635 + }, + { + "epoch": 0.25750783646230074, + "grad_norm": 30.85480499267578, + "learning_rate": 9.263154715637339e-07, + "logits/chosen": -0.8868627548217773, + "logits/rejected": -0.7809014916419983, + "logps/chosen": -1.909232497215271, + "logps/rejected": -2.0275964736938477, + "loss": 2.5388, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.092327117919922, + "rewards/margins": 1.1836379766464233, + "rewards/rejected": -20.275962829589844, + "step": 7640 + }, + { + "epoch": 0.2576763625332839, + "grad_norm": 36.895179748535156, + "learning_rate": 9.261617084837247e-07, + "logits/chosen": -0.2057121992111206, + "logits/rejected": -0.4458581805229187, + "logps/chosen": -2.2100377082824707, + "logps/rejected": -2.46073842048645, + "loss": 2.4705, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.100372314453125, + "rewards/margins": 2.5070078372955322, + "rewards/rejected": -24.60738182067871, + "step": 7645 + }, + { + "epoch": 0.2578448886042671, + "grad_norm": 0.7452271580696106, + "learning_rate": 9.260077979259382e-07, + "logits/chosen": -0.33283504843711853, + "logits/rejected": -0.3346695303916931, + "logps/chosen": -2.141392469406128, + "logps/rejected": -2.394256114959717, + "loss": 2.3332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.413925170898438, + "rewards/margins": 2.528637170791626, + "rewards/rejected": -23.942562103271484, + "step": 7650 + }, + { + "epoch": 0.25801341467525024, + "grad_norm": 32.01772689819336, + "learning_rate": 9.25853739943637e-07, + "logits/chosen": -0.7218912243843079, + "logits/rejected": -0.799573540687561, + "logps/chosen": -1.6060640811920166, + "logps/rejected": -1.6765620708465576, + "loss": 2.5264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.06064224243164, + "rewards/margins": 0.7049804925918579, + "rewards/rejected": -16.765621185302734, + "step": 7655 + }, + { + "epoch": 0.25818194074623346, + "grad_norm": 37.131343841552734, + "learning_rate": 9.256995345901342e-07, + "logits/chosen": -0.6877612471580505, + "logits/rejected": -0.8829323649406433, + "logps/chosen": -1.8432331085205078, + "logps/rejected": -1.828413724899292, + "loss": 3.4766, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.432331085205078, + "rewards/margins": -0.1481925994157791, + "rewards/rejected": -18.284137725830078, + "step": 7660 + }, + { + "epoch": 0.25835046681721663, + "grad_norm": 18.985092163085938, + "learning_rate": 9.255451819187945e-07, + "logits/chosen": -0.45237255096435547, + "logits/rejected": -0.4141133427619934, + "logps/chosen": -1.8477073907852173, + "logps/rejected": -1.9039417505264282, + "loss": 2.9308, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.47707176208496, + "rewards/margins": 0.5623448491096497, + "rewards/rejected": -19.039417266845703, + "step": 7665 + }, + { + "epoch": 0.2585189928881998, + "grad_norm": 28.19369125366211, + "learning_rate": 9.25390681983033e-07, + "logits/chosen": -1.010534644126892, + "logits/rejected": -0.9251688718795776, + "logps/chosen": -1.7543470859527588, + "logps/rejected": -1.7675797939300537, + "loss": 3.1437, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.54347038269043, + "rewards/margins": 0.13232669234275818, + "rewards/rejected": -17.675798416137695, + "step": 7670 + }, + { + "epoch": 0.25868751895918296, + "grad_norm": 23.96495246887207, + "learning_rate": 9.252360348363164e-07, + "logits/chosen": -0.4807816445827484, + "logits/rejected": -0.5013648271560669, + "logps/chosen": -2.0067787170410156, + "logps/rejected": -2.1580376625061035, + "loss": 2.3734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.067790985107422, + "rewards/margins": 1.5125861167907715, + "rewards/rejected": -21.58037757873535, + "step": 7675 + }, + { + "epoch": 0.2588560450301662, + "grad_norm": 20.593505859375, + "learning_rate": 9.250812405321618e-07, + "logits/chosen": -0.423076868057251, + "logits/rejected": -0.5166851878166199, + "logps/chosen": -1.464730978012085, + "logps/rejected": -1.5135142803192139, + "loss": 3.1644, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -14.647310256958008, + "rewards/margins": 0.4878327250480652, + "rewards/rejected": -15.135144233703613, + "step": 7680 + }, + { + "epoch": 0.25902457110114935, + "grad_norm": 20.095569610595703, + "learning_rate": 9.249262991241372e-07, + "logits/chosen": -0.059383898973464966, + "logits/rejected": -0.39212357997894287, + "logps/chosen": -2.0744400024414062, + "logps/rejected": -2.1283066272735596, + "loss": 3.3333, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.744401931762695, + "rewards/margins": 0.5386648178100586, + "rewards/rejected": -21.283065795898438, + "step": 7685 + }, + { + "epoch": 0.2591930971721325, + "grad_norm": 41.47001266479492, + "learning_rate": 9.247712106658619e-07, + "logits/chosen": -0.4025161862373352, + "logits/rejected": -0.42906612157821655, + "logps/chosen": -2.040505886077881, + "logps/rejected": -2.0720152854919434, + "loss": 2.847, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.405057907104492, + "rewards/margins": 0.3150941729545593, + "rewards/rejected": -20.72015380859375, + "step": 7690 + }, + { + "epoch": 0.25936162324311574, + "grad_norm": 6.932709693908691, + "learning_rate": 9.246159752110061e-07, + "logits/chosen": -0.5451598167419434, + "logits/rejected": -0.5768376588821411, + "logps/chosen": -2.0141749382019043, + "logps/rejected": -2.4219484329223633, + "loss": 2.8068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.141748428344727, + "rewards/margins": 4.077737808227539, + "rewards/rejected": -24.219484329223633, + "step": 7695 + }, + { + "epoch": 0.2595301493140989, + "grad_norm": 6.739269733428955, + "learning_rate": 9.244605928132902e-07, + "logits/chosen": -0.8408918380737305, + "logits/rejected": -0.8135308027267456, + "logps/chosen": -1.4772506952285767, + "logps/rejected": -1.5503901243209839, + "loss": 2.8249, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -14.772506713867188, + "rewards/margins": 0.7313947677612305, + "rewards/rejected": -15.503901481628418, + "step": 7700 + }, + { + "epoch": 0.25969867538508207, + "grad_norm": 13.87769603729248, + "learning_rate": 9.243050635264864e-07, + "logits/chosen": -0.9550352096557617, + "logits/rejected": -1.0311752557754517, + "logps/chosen": -1.6748530864715576, + "logps/rejected": -1.7567179203033447, + "loss": 3.0043, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.748531341552734, + "rewards/margins": 0.8186489939689636, + "rewards/rejected": -17.56717872619629, + "step": 7705 + }, + { + "epoch": 0.25986720145606523, + "grad_norm": 26.09083366394043, + "learning_rate": 9.24149387404417e-07, + "logits/chosen": -0.7877703905105591, + "logits/rejected": -0.8280628323554993, + "logps/chosen": -1.6516129970550537, + "logps/rejected": -1.7163331508636475, + "loss": 3.0976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.516132354736328, + "rewards/margins": 0.6472002267837524, + "rewards/rejected": -17.163331985473633, + "step": 7710 + }, + { + "epoch": 0.26003572752704845, + "grad_norm": 16.435352325439453, + "learning_rate": 9.239935645009555e-07, + "logits/chosen": -0.7832959890365601, + "logits/rejected": -0.8045759201049805, + "logps/chosen": -1.9323409795761108, + "logps/rejected": -2.1384482383728027, + "loss": 1.8541, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.323410034179688, + "rewards/margins": 2.0610733032226562, + "rewards/rejected": -21.384485244750977, + "step": 7715 + }, + { + "epoch": 0.2602042535980316, + "grad_norm": 40.79289245605469, + "learning_rate": 9.238375948700261e-07, + "logits/chosen": -0.6069063544273376, + "logits/rejected": -0.6482208967208862, + "logps/chosen": -1.9219573736190796, + "logps/rejected": -2.137608289718628, + "loss": 1.7378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.219573974609375, + "rewards/margins": 2.156508207321167, + "rewards/rejected": -21.376083374023438, + "step": 7720 + }, + { + "epoch": 0.2603727796690148, + "grad_norm": 34.74724197387695, + "learning_rate": 9.236814785656035e-07, + "logits/chosen": 0.003743249224498868, + "logits/rejected": -0.06489133089780807, + "logps/chosen": -2.0553712844848633, + "logps/rejected": -2.0764975547790527, + "loss": 3.0784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.553712844848633, + "rewards/margins": 0.21126461029052734, + "rewards/rejected": -20.764976501464844, + "step": 7725 + }, + { + "epoch": 0.26054130573999795, + "grad_norm": 50.483341217041016, + "learning_rate": 9.235252156417134e-07, + "logits/chosen": -0.6431624293327332, + "logits/rejected": -0.7084658741950989, + "logps/chosen": -1.9164985418319702, + "logps/rejected": -1.9143717288970947, + "loss": 3.113, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.16498565673828, + "rewards/margins": -0.021271228790283203, + "rewards/rejected": -19.14371681213379, + "step": 7730 + }, + { + "epoch": 0.2607098318109812, + "grad_norm": 72.169677734375, + "learning_rate": 9.233688061524327e-07, + "logits/chosen": 0.04161912947893143, + "logits/rejected": -0.1001749038696289, + "logps/chosen": -1.9186267852783203, + "logps/rejected": -1.9041011333465576, + "loss": 3.2578, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.186267852783203, + "rewards/margins": -0.14525547623634338, + "rewards/rejected": -19.041011810302734, + "step": 7735 + }, + { + "epoch": 0.26087835788196434, + "grad_norm": 50.393035888671875, + "learning_rate": 9.232122501518882e-07, + "logits/chosen": -0.4775795042514801, + "logits/rejected": -0.5555458068847656, + "logps/chosen": -2.081469774246216, + "logps/rejected": -2.207404375076294, + "loss": 1.9899, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.814697265625, + "rewards/margins": 1.2593481540679932, + "rewards/rejected": -22.074045181274414, + "step": 7740 + }, + { + "epoch": 0.2610468839529475, + "grad_norm": 24.568172454833984, + "learning_rate": 9.230555476942576e-07, + "logits/chosen": -0.5699299573898315, + "logits/rejected": -0.5548152923583984, + "logps/chosen": -1.547564148902893, + "logps/rejected": -1.6397268772125244, + "loss": 2.4438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.475641250610352, + "rewards/margins": 0.9216279983520508, + "rewards/rejected": -16.397268295288086, + "step": 7745 + }, + { + "epoch": 0.2612154100239307, + "grad_norm": 40.764381408691406, + "learning_rate": 9.228986988337699e-07, + "logits/chosen": -0.6876112222671509, + "logits/rejected": -0.6586848497390747, + "logps/chosen": -1.8404690027236938, + "logps/rejected": -1.828752875328064, + "loss": 3.371, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.40468978881836, + "rewards/margins": -0.11716127395629883, + "rewards/rejected": -18.28752899169922, + "step": 7750 + }, + { + "epoch": 0.2613839360949139, + "grad_norm": 13.969160079956055, + "learning_rate": 9.22741703624704e-07, + "logits/chosen": -0.6685362458229065, + "logits/rejected": -0.5314013361930847, + "logps/chosen": -1.958852767944336, + "logps/rejected": -2.212700366973877, + "loss": 2.2484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.58852767944336, + "rewards/margins": 2.5384745597839355, + "rewards/rejected": -22.127002716064453, + "step": 7755 + }, + { + "epoch": 0.26155246216589706, + "grad_norm": 23.888172149658203, + "learning_rate": 9.225845621213897e-07, + "logits/chosen": -0.657477855682373, + "logits/rejected": -0.6711796522140503, + "logps/chosen": -1.9453691244125366, + "logps/rejected": -2.1049752235412598, + "loss": 2.3569, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.453693389892578, + "rewards/margins": 1.5960614681243896, + "rewards/rejected": -21.049753189086914, + "step": 7760 + }, + { + "epoch": 0.2617209882368802, + "grad_norm": 28.694900512695312, + "learning_rate": 9.224272743782078e-07, + "logits/chosen": -0.7226378321647644, + "logits/rejected": -0.7690739631652832, + "logps/chosen": -2.0600342750549316, + "logps/rejected": -1.965648889541626, + "loss": 4.342, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.600341796875, + "rewards/margins": -0.9438526034355164, + "rewards/rejected": -19.6564884185791, + "step": 7765 + }, + { + "epoch": 0.26188951430786345, + "grad_norm": 20.68834114074707, + "learning_rate": 9.222698404495892e-07, + "logits/chosen": -0.5737382173538208, + "logits/rejected": -0.554766833782196, + "logps/chosen": -1.6142475605010986, + "logps/rejected": -1.7195327281951904, + "loss": 2.5591, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.142475128173828, + "rewards/margins": 1.052852749824524, + "rewards/rejected": -17.195327758789062, + "step": 7770 + }, + { + "epoch": 0.2620580403788466, + "grad_norm": 57.733306884765625, + "learning_rate": 9.221122603900155e-07, + "logits/chosen": -0.7775768041610718, + "logits/rejected": -0.6437594294548035, + "logps/chosen": -2.050947427749634, + "logps/rejected": -2.2061009407043457, + "loss": 2.7095, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.50947380065918, + "rewards/margins": 1.5515353679656982, + "rewards/rejected": -22.06100845336914, + "step": 7775 + }, + { + "epoch": 0.2622265664498298, + "grad_norm": 16.664154052734375, + "learning_rate": 9.219545342540191e-07, + "logits/chosen": -0.2868829369544983, + "logits/rejected": -0.33900654315948486, + "logps/chosen": -1.7464549541473389, + "logps/rejected": -1.9309101104736328, + "loss": 1.904, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.464550018310547, + "rewards/margins": 1.8445497751235962, + "rewards/rejected": -19.309101104736328, + "step": 7780 + }, + { + "epoch": 0.26239509252081294, + "grad_norm": 16.679758071899414, + "learning_rate": 9.217966620961828e-07, + "logits/chosen": -0.6830729842185974, + "logits/rejected": -0.7631603479385376, + "logps/chosen": -1.570847988128662, + "logps/rejected": -1.589646577835083, + "loss": 3.0564, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.708480834960938, + "rewards/margins": 0.18798570334911346, + "rewards/rejected": -15.896464347839355, + "step": 7785 + }, + { + "epoch": 0.26256361859179617, + "grad_norm": 23.36970329284668, + "learning_rate": 9.216386439711397e-07, + "logits/chosen": -0.6877197027206421, + "logits/rejected": -0.5568257570266724, + "logps/chosen": -1.8986759185791016, + "logps/rejected": -1.9676154851913452, + "loss": 2.9447, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.986759185791016, + "rewards/margins": 0.689396858215332, + "rewards/rejected": -19.676156997680664, + "step": 7790 + }, + { + "epoch": 0.26273214466277933, + "grad_norm": 17.951892852783203, + "learning_rate": 9.214804799335739e-07, + "logits/chosen": -0.6987979412078857, + "logits/rejected": -0.8650094866752625, + "logps/chosen": -1.6295093297958374, + "logps/rejected": -1.909968614578247, + "loss": 1.5949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.295093536376953, + "rewards/margins": 2.804591417312622, + "rewards/rejected": -19.09968376159668, + "step": 7795 + }, + { + "epoch": 0.2629006707337625, + "grad_norm": 17.47931480407715, + "learning_rate": 9.213221700382196e-07, + "logits/chosen": -0.49302539229393005, + "logits/rejected": -0.5437559485435486, + "logps/chosen": -1.695380449295044, + "logps/rejected": -1.804037094116211, + "loss": 2.9106, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.95380401611328, + "rewards/margins": 1.0865659713745117, + "rewards/rejected": -18.04037094116211, + "step": 7800 + }, + { + "epoch": 0.2630691968047457, + "grad_norm": 40.34727478027344, + "learning_rate": 9.211637143398619e-07, + "logits/chosen": -0.4341478943824768, + "logits/rejected": -0.4721830487251282, + "logps/chosen": -1.685331106185913, + "logps/rejected": -1.7352886199951172, + "loss": 3.0491, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.85331153869629, + "rewards/margins": 0.49957332015037537, + "rewards/rejected": -17.352886199951172, + "step": 7805 + }, + { + "epoch": 0.2632377228757289, + "grad_norm": 17.324604034423828, + "learning_rate": 9.210051128933356e-07, + "logits/chosen": -0.8900884389877319, + "logits/rejected": -0.9262346029281616, + "logps/chosen": -2.093942403793335, + "logps/rejected": -2.1410953998565674, + "loss": 2.9911, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.939424514770508, + "rewards/margins": 0.4715285301208496, + "rewards/rejected": -21.410953521728516, + "step": 7810 + }, + { + "epoch": 0.26340624894671205, + "grad_norm": 9.978482246398926, + "learning_rate": 9.208463657535268e-07, + "logits/chosen": -0.5291486382484436, + "logits/rejected": -0.6421756148338318, + "logps/chosen": -1.926701307296753, + "logps/rejected": -1.9434372186660767, + "loss": 3.3581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.26701545715332, + "rewards/margins": 0.1673574447631836, + "rewards/rejected": -19.434371948242188, + "step": 7815 + }, + { + "epoch": 0.2635747750176952, + "grad_norm": 43.69354248046875, + "learning_rate": 9.206874729753716e-07, + "logits/chosen": -0.21648378670215607, + "logits/rejected": -0.14429767429828644, + "logps/chosen": -2.1134257316589355, + "logps/rejected": -1.7602497339248657, + "loss": 6.6086, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.134258270263672, + "rewards/margins": -3.5317604541778564, + "rewards/rejected": -17.60249900817871, + "step": 7820 + }, + { + "epoch": 0.26374330108867844, + "grad_norm": 20.53978157043457, + "learning_rate": 9.205284346138562e-07, + "logits/chosen": -0.4210183024406433, + "logits/rejected": -0.4092690944671631, + "logps/chosen": -1.7413660287857056, + "logps/rejected": -1.9055869579315186, + "loss": 2.033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.413660049438477, + "rewards/margins": 1.6422086954116821, + "rewards/rejected": -19.05586814880371, + "step": 7825 + }, + { + "epoch": 0.2639118271596616, + "grad_norm": 19.872541427612305, + "learning_rate": 9.203692507240179e-07, + "logits/chosen": -0.39160478115081787, + "logits/rejected": -0.48597049713134766, + "logps/chosen": -2.0400238037109375, + "logps/rejected": -2.3305420875549316, + "loss": 2.3703, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.40023422241211, + "rewards/margins": 2.9051859378814697, + "rewards/rejected": -23.305421829223633, + "step": 7830 + }, + { + "epoch": 0.26408035323064477, + "grad_norm": 44.66335678100586, + "learning_rate": 9.202099213609437e-07, + "logits/chosen": -0.7690407037734985, + "logits/rejected": -0.6451536417007446, + "logps/chosen": -1.8670456409454346, + "logps/rejected": -1.8136584758758545, + "loss": 3.6153, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.670455932617188, + "rewards/margins": -0.5338695645332336, + "rewards/rejected": -18.136585235595703, + "step": 7835 + }, + { + "epoch": 0.26424887930162794, + "grad_norm": 41.984840393066406, + "learning_rate": 9.200504465797714e-07, + "logits/chosen": -0.5631991624832153, + "logits/rejected": -0.6159490346908569, + "logps/chosen": -2.0934665203094482, + "logps/rejected": -1.8932098150253296, + "loss": 5.0476, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.93466567993164, + "rewards/margins": -2.002568006515503, + "rewards/rejected": -18.932098388671875, + "step": 7840 + }, + { + "epoch": 0.26441740537261116, + "grad_norm": 33.509124755859375, + "learning_rate": 9.198908264356888e-07, + "logits/chosen": -0.4450080394744873, + "logits/rejected": -0.5047804713249207, + "logps/chosen": -1.6830122470855713, + "logps/rejected": -1.7845993041992188, + "loss": 3.0891, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.830120086669922, + "rewards/margins": 1.0158723592758179, + "rewards/rejected": -17.845993041992188, + "step": 7845 + }, + { + "epoch": 0.2645859314435943, + "grad_norm": 23.490671157836914, + "learning_rate": 9.197310609839343e-07, + "logits/chosen": -0.8265358209609985, + "logits/rejected": -0.6917954683303833, + "logps/chosen": -1.649942398071289, + "logps/rejected": -1.774391770362854, + "loss": 2.2419, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.49942398071289, + "rewards/margins": 1.2444937229156494, + "rewards/rejected": -17.74391746520996, + "step": 7850 + }, + { + "epoch": 0.2647544575145775, + "grad_norm": 20.921653747558594, + "learning_rate": 9.195711502797963e-07, + "logits/chosen": -0.2501029074192047, + "logits/rejected": -0.3494180142879486, + "logps/chosen": -2.1978306770324707, + "logps/rejected": -2.097285509109497, + "loss": 4.5838, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.97830581665039, + "rewards/margins": -1.0054512023925781, + "rewards/rejected": -20.972856521606445, + "step": 7855 + }, + { + "epoch": 0.2649229835855607, + "grad_norm": 28.807878494262695, + "learning_rate": 9.194110943786135e-07, + "logits/chosen": -0.7602871656417847, + "logits/rejected": -0.9272792935371399, + "logps/chosen": -1.496128797531128, + "logps/rejected": -1.7382854223251343, + "loss": 2.3124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.96129035949707, + "rewards/margins": 2.4215664863586426, + "rewards/rejected": -17.382854461669922, + "step": 7860 + }, + { + "epoch": 0.2650915096565439, + "grad_norm": 21.33911895751953, + "learning_rate": 9.192508933357752e-07, + "logits/chosen": -0.4943917393684387, + "logits/rejected": -0.4705166816711426, + "logps/chosen": -1.9024235010147095, + "logps/rejected": -2.0955557823181152, + "loss": 2.4284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.024234771728516, + "rewards/margins": 1.9313228130340576, + "rewards/rejected": -20.955556869506836, + "step": 7865 + }, + { + "epoch": 0.26526003572752704, + "grad_norm": 20.33489990234375, + "learning_rate": 9.190905472067205e-07, + "logits/chosen": -0.8497546911239624, + "logits/rejected": -0.8793678283691406, + "logps/chosen": -1.8554210662841797, + "logps/rejected": -1.9822323322296143, + "loss": 2.3817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.554210662841797, + "rewards/margins": 1.268110990524292, + "rewards/rejected": -19.82232093811035, + "step": 7870 + }, + { + "epoch": 0.2654285617985102, + "grad_norm": 26.243898391723633, + "learning_rate": 9.18930056046939e-07, + "logits/chosen": -0.8190891146659851, + "logits/rejected": -0.6624525785446167, + "logps/chosen": -2.0691378116607666, + "logps/rejected": -2.048070192337036, + "loss": 3.4637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.69137954711914, + "rewards/margins": -0.21067848801612854, + "rewards/rejected": -20.480701446533203, + "step": 7875 + }, + { + "epoch": 0.26559708786949343, + "grad_norm": 19.79037094116211, + "learning_rate": 9.187694199119703e-07, + "logits/chosen": -0.8644890785217285, + "logits/rejected": -0.8535453677177429, + "logps/chosen": -1.686231017112732, + "logps/rejected": -1.907679557800293, + "loss": 2.1817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.862308502197266, + "rewards/margins": 2.2144851684570312, + "rewards/rejected": -19.076793670654297, + "step": 7880 + }, + { + "epoch": 0.2657656139404766, + "grad_norm": 20.234148025512695, + "learning_rate": 9.186086388574041e-07, + "logits/chosen": -0.870225727558136, + "logits/rejected": -0.7963360548019409, + "logps/chosen": -1.719887375831604, + "logps/rejected": -1.9151662588119507, + "loss": 2.0666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.19887351989746, + "rewards/margins": 1.952789068222046, + "rewards/rejected": -19.151662826538086, + "step": 7885 + }, + { + "epoch": 0.26593414001145976, + "grad_norm": 14.168811798095703, + "learning_rate": 9.184477129388807e-07, + "logits/chosen": -0.7177601456642151, + "logits/rejected": -0.7062331438064575, + "logps/chosen": -1.7758731842041016, + "logps/rejected": -1.938987374305725, + "loss": 2.1635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.758731842041016, + "rewards/margins": 1.6311410665512085, + "rewards/rejected": -19.389873504638672, + "step": 7890 + }, + { + "epoch": 0.26610266608244293, + "grad_norm": 14.62055778503418, + "learning_rate": 9.182866422120898e-07, + "logits/chosen": -0.8328273892402649, + "logits/rejected": -1.0201135873794556, + "logps/chosen": -1.7414734363555908, + "logps/rejected": -1.9945809841156006, + "loss": 2.0646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.414735794067383, + "rewards/margins": 2.531074047088623, + "rewards/rejected": -19.945810317993164, + "step": 7895 + }, + { + "epoch": 0.26627119215342615, + "grad_norm": 29.412940979003906, + "learning_rate": 9.181254267327721e-07, + "logits/chosen": -0.6478652358055115, + "logits/rejected": -0.6217209100723267, + "logps/chosen": -2.1247923374176025, + "logps/rejected": -2.0956790447235107, + "loss": 3.4741, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.247920989990234, + "rewards/margins": -0.29112958908081055, + "rewards/rejected": -20.9567928314209, + "step": 7900 + }, + { + "epoch": 0.2664397182244093, + "grad_norm": 15.633121490478516, + "learning_rate": 9.179640665567175e-07, + "logits/chosen": -0.42172449827194214, + "logits/rejected": -0.4099927544593811, + "logps/chosen": -2.567204236984253, + "logps/rejected": -2.4554595947265625, + "loss": 4.6659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.672039031982422, + "rewards/margins": -1.117445945739746, + "rewards/rejected": -24.554594039916992, + "step": 7905 + }, + { + "epoch": 0.2666082442953925, + "grad_norm": 37.013118743896484, + "learning_rate": 9.178025617397667e-07, + "logits/chosen": -0.572817325592041, + "logits/rejected": -0.5794605016708374, + "logps/chosen": -2.0265133380889893, + "logps/rejected": -2.1147801876068115, + "loss": 2.3479, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.2651309967041, + "rewards/margins": 0.8826696276664734, + "rewards/rejected": -21.147802352905273, + "step": 7910 + }, + { + "epoch": 0.2667767703663757, + "grad_norm": 0.1122620701789856, + "learning_rate": 9.1764091233781e-07, + "logits/chosen": -0.15012314915657043, + "logits/rejected": -0.27648302912712097, + "logps/chosen": -1.822546362876892, + "logps/rejected": -2.272954225540161, + "loss": 1.6367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.2254638671875, + "rewards/margins": 4.504077911376953, + "rewards/rejected": -22.729541778564453, + "step": 7915 + }, + { + "epoch": 0.26694529643735887, + "grad_norm": 43.50534439086914, + "learning_rate": 9.174791184067881e-07, + "logits/chosen": -0.7817455530166626, + "logits/rejected": -0.9152389764785767, + "logps/chosen": -2.034762144088745, + "logps/rejected": -2.0368587970733643, + "loss": 3.5936, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.347620010375977, + "rewards/margins": 0.020967865362763405, + "rewards/rejected": -20.368587493896484, + "step": 7920 + }, + { + "epoch": 0.26711382250834204, + "grad_norm": 19.68915367126465, + "learning_rate": 9.173171800026911e-07, + "logits/chosen": -0.7624861001968384, + "logits/rejected": -0.6545786261558533, + "logps/chosen": -2.075883388519287, + "logps/rejected": -2.0948750972747803, + "loss": 3.1003, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.758832931518555, + "rewards/margins": 0.18991927802562714, + "rewards/rejected": -20.94875144958496, + "step": 7925 + }, + { + "epoch": 0.2672823485793252, + "grad_norm": 19.582759857177734, + "learning_rate": 9.171550971815599e-07, + "logits/chosen": -0.6269463300704956, + "logits/rejected": -0.5425857305526733, + "logps/chosen": -1.6861941814422607, + "logps/rejected": -1.9470901489257812, + "loss": 2.2631, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.861942291259766, + "rewards/margins": 2.6089584827423096, + "rewards/rejected": -19.470901489257812, + "step": 7930 + }, + { + "epoch": 0.2674508746503084, + "grad_norm": 67.3069839477539, + "learning_rate": 9.169928699994846e-07, + "logits/chosen": -0.4286138117313385, + "logits/rejected": -0.6139780282974243, + "logps/chosen": -1.865778923034668, + "logps/rejected": -1.8274714946746826, + "loss": 3.497, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.65778923034668, + "rewards/margins": -0.38307541608810425, + "rewards/rejected": -18.274715423583984, + "step": 7935 + }, + { + "epoch": 0.2676194007212916, + "grad_norm": 24.0504093170166, + "learning_rate": 9.168304985126061e-07, + "logits/chosen": -0.5239379405975342, + "logits/rejected": -0.7206992506980896, + "logps/chosen": -2.006237745285034, + "logps/rejected": -2.1122491359710693, + "loss": 3.175, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.062381744384766, + "rewards/margins": 1.060112714767456, + "rewards/rejected": -21.12249183654785, + "step": 7940 + }, + { + "epoch": 0.26778792679227476, + "grad_norm": 24.669153213500977, + "learning_rate": 9.166679827771145e-07, + "logits/chosen": -0.5793864130973816, + "logits/rejected": -0.6483038663864136, + "logps/chosen": -2.009474754333496, + "logps/rejected": -1.9184348583221436, + "loss": 4.2305, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.09474754333496, + "rewards/margins": -0.9103986024856567, + "rewards/rejected": -19.184350967407227, + "step": 7945 + }, + { + "epoch": 0.2679564528632579, + "grad_norm": 53.88877487182617, + "learning_rate": 9.165053228492499e-07, + "logits/chosen": -0.7628545165061951, + "logits/rejected": -0.7562915086746216, + "logps/chosen": -2.2420852184295654, + "logps/rejected": -2.3469042778015137, + "loss": 2.4504, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.42085075378418, + "rewards/margins": 1.0481895208358765, + "rewards/rejected": -23.46904182434082, + "step": 7950 + }, + { + "epoch": 0.26812497893424114, + "grad_norm": 15.133487701416016, + "learning_rate": 9.163425187853029e-07, + "logits/chosen": -0.5153997540473938, + "logits/rejected": -0.5927127599716187, + "logps/chosen": -1.7608623504638672, + "logps/rejected": -1.9573513269424438, + "loss": 2.3498, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.608623504638672, + "rewards/margins": 1.964890718460083, + "rewards/rejected": -19.57351303100586, + "step": 7955 + }, + { + "epoch": 0.2682935050052243, + "grad_norm": 31.434738159179688, + "learning_rate": 9.161795706416133e-07, + "logits/chosen": -0.4737616181373596, + "logits/rejected": -0.5601701140403748, + "logps/chosen": -1.8284581899642944, + "logps/rejected": -1.917937994003296, + "loss": 3.3058, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.284582138061523, + "rewards/margins": 0.894799530506134, + "rewards/rejected": -19.179380416870117, + "step": 7960 + }, + { + "epoch": 0.2684620310762075, + "grad_norm": 21.656169891357422, + "learning_rate": 9.160164784745713e-07, + "logits/chosen": -1.0133836269378662, + "logits/rejected": -0.9854210615158081, + "logps/chosen": -1.6828429698944092, + "logps/rejected": -1.8976166248321533, + "loss": 2.4476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.828426361083984, + "rewards/margins": 2.147737503051758, + "rewards/rejected": -18.976165771484375, + "step": 7965 + }, + { + "epoch": 0.2686305571471907, + "grad_norm": 19.28142547607422, + "learning_rate": 9.158532423406164e-07, + "logits/chosen": -0.8909885287284851, + "logits/rejected": -0.9013813138008118, + "logps/chosen": -1.8956714868545532, + "logps/rejected": -2.063211679458618, + "loss": 2.8542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.956714630126953, + "rewards/margins": 1.6754035949707031, + "rewards/rejected": -20.632118225097656, + "step": 7970 + }, + { + "epoch": 0.26879908321817386, + "grad_norm": 18.054901123046875, + "learning_rate": 9.156898622962383e-07, + "logits/chosen": -0.4297906756401062, + "logits/rejected": -0.3117820620536804, + "logps/chosen": -1.7339808940887451, + "logps/rejected": -2.3593122959136963, + "loss": 2.0759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.339807510375977, + "rewards/margins": 6.253314018249512, + "rewards/rejected": -23.593122482299805, + "step": 7975 + }, + { + "epoch": 0.26896760928915703, + "grad_norm": 16.134092330932617, + "learning_rate": 9.155263383979763e-07, + "logits/chosen": -1.0646306276321411, + "logits/rejected": -1.2803940773010254, + "logps/chosen": -1.7984859943389893, + "logps/rejected": -1.9528639316558838, + "loss": 1.8858, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.984859466552734, + "rewards/margins": 1.5437793731689453, + "rewards/rejected": -19.528636932373047, + "step": 7980 + }, + { + "epoch": 0.2691361353601402, + "grad_norm": 12.436636924743652, + "learning_rate": 9.153626707024197e-07, + "logits/chosen": -0.7420053482055664, + "logits/rejected": -0.8438242077827454, + "logps/chosen": -1.503025770187378, + "logps/rejected": -1.6121635437011719, + "loss": 2.7156, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.030258178710938, + "rewards/margins": 1.0913773775100708, + "rewards/rejected": -16.12163543701172, + "step": 7985 + }, + { + "epoch": 0.2693046614311234, + "grad_norm": 4.8486328125, + "learning_rate": 9.151988592662075e-07, + "logits/chosen": -0.7318106293678284, + "logits/rejected": -0.8289157152175903, + "logps/chosen": -2.7210354804992676, + "logps/rejected": -2.8618569374084473, + "loss": 2.2072, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.21035385131836, + "rewards/margins": 1.4082170724868774, + "rewards/rejected": -28.618572235107422, + "step": 7990 + }, + { + "epoch": 0.2694731875021066, + "grad_norm": 11.79561996459961, + "learning_rate": 9.150349041460282e-07, + "logits/chosen": -0.6596136093139648, + "logits/rejected": -0.6863908767700195, + "logps/chosen": -1.6224896907806396, + "logps/rejected": -1.7435400485992432, + "loss": 3.6967, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.224897384643555, + "rewards/margins": 1.2105019092559814, + "rewards/rejected": -17.435400009155273, + "step": 7995 + }, + { + "epoch": 0.26964171357308975, + "grad_norm": 18.109264373779297, + "learning_rate": 9.148708053986203e-07, + "logits/chosen": -0.6359624862670898, + "logits/rejected": -0.4724443554878235, + "logps/chosen": -2.2945563793182373, + "logps/rejected": -2.0365214347839355, + "loss": 5.7031, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.945566177368164, + "rewards/margins": -2.580352783203125, + "rewards/rejected": -20.36521339416504, + "step": 8000 + }, + { + "epoch": 0.26964171357308975, + "eval_logits/chosen": -0.9466900825500488, + "eval_logits/rejected": -0.9982383847236633, + "eval_logps/chosen": -1.8017518520355225, + "eval_logps/rejected": -1.8493553400039673, + "eval_loss": 3.167187452316284, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.01751708984375, + "eval_rewards/margins": 0.4760337769985199, + "eval_rewards/rejected": -18.493553161621094, + "eval_runtime": 12.8998, + "eval_samples_per_second": 7.752, + "eval_steps_per_second": 1.938, + "step": 8000 + }, + { + "epoch": 0.2698102396440729, + "grad_norm": 21.768617630004883, + "learning_rate": 9.14706563080772e-07, + "logits/chosen": -0.5055543184280396, + "logits/rejected": -0.4700976014137268, + "logps/chosen": -1.7952054738998413, + "logps/rejected": -1.823185682296753, + "loss": 3.8035, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.95205307006836, + "rewards/margins": 0.2798027992248535, + "rewards/rejected": -18.231857299804688, + "step": 8005 + }, + { + "epoch": 0.26997876571505613, + "grad_norm": 6.784413814544678, + "learning_rate": 9.14542177249321e-07, + "logits/chosen": -0.5981645584106445, + "logits/rejected": -0.6344643831253052, + "logps/chosen": -2.039539098739624, + "logps/rejected": -2.182983875274658, + "loss": 2.6472, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.395395278930664, + "rewards/margins": 1.4344440698623657, + "rewards/rejected": -21.829837799072266, + "step": 8010 + }, + { + "epoch": 0.2701472917860393, + "grad_norm": 108.63782501220703, + "learning_rate": 9.143776479611544e-07, + "logits/chosen": -0.3831273913383484, + "logits/rejected": -0.5212526917457581, + "logps/chosen": -2.725125551223755, + "logps/rejected": -2.3707118034362793, + "loss": 6.8674, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.25125503540039, + "rewards/margins": -3.5441346168518066, + "rewards/rejected": -23.70711898803711, + "step": 8015 + }, + { + "epoch": 0.27031581785702247, + "grad_norm": 69.21826934814453, + "learning_rate": 9.142129752732101e-07, + "logits/chosen": -0.6095398664474487, + "logits/rejected": -0.7510106563568115, + "logps/chosen": -1.9469678401947021, + "logps/rejected": -2.0188910961151123, + "loss": 2.5247, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.46967887878418, + "rewards/margins": 0.7192336320877075, + "rewards/rejected": -20.188913345336914, + "step": 8020 + }, + { + "epoch": 0.2704843439280057, + "grad_norm": 14.318962097167969, + "learning_rate": 9.140481592424742e-07, + "logits/chosen": -0.5337401628494263, + "logits/rejected": -0.4288802146911621, + "logps/chosen": -2.1296792030334473, + "logps/rejected": -2.3022007942199707, + "loss": 2.3135, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.29679298400879, + "rewards/margins": 1.7252171039581299, + "rewards/rejected": -23.022008895874023, + "step": 8025 + }, + { + "epoch": 0.27065286999898885, + "grad_norm": 13.43238639831543, + "learning_rate": 9.138831999259833e-07, + "logits/chosen": -0.7914996147155762, + "logits/rejected": -0.7920703291893005, + "logps/chosen": -1.9322484731674194, + "logps/rejected": -2.026156425476074, + "loss": 2.7437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.322486877441406, + "rewards/margins": 0.9390776753425598, + "rewards/rejected": -20.26156234741211, + "step": 8030 + }, + { + "epoch": 0.270821396069972, + "grad_norm": 34.501399993896484, + "learning_rate": 9.137180973808233e-07, + "logits/chosen": -0.5773485898971558, + "logits/rejected": -0.7042641639709473, + "logps/chosen": -2.2767586708068848, + "logps/rejected": -2.022218942642212, + "loss": 5.6654, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.767587661743164, + "rewards/margins": -2.545400619506836, + "rewards/rejected": -20.222187042236328, + "step": 8035 + }, + { + "epoch": 0.2709899221409552, + "grad_norm": 44.062347412109375, + "learning_rate": 9.135528516641295e-07, + "logits/chosen": -0.6731947660446167, + "logits/rejected": -0.8957898020744324, + "logps/chosen": -1.9581935405731201, + "logps/rejected": -2.2106902599334717, + "loss": 2.4807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.58193588256836, + "rewards/margins": 2.524967908859253, + "rewards/rejected": -22.106903076171875, + "step": 8040 + }, + { + "epoch": 0.2711584482119384, + "grad_norm": 21.43016242980957, + "learning_rate": 9.133874628330874e-07, + "logits/chosen": -0.6238225102424622, + "logits/rejected": -0.711872935295105, + "logps/chosen": -2.185377597808838, + "logps/rejected": -2.101410388946533, + "loss": 4.0274, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.853775024414062, + "rewards/margins": -0.8396707773208618, + "rewards/rejected": -21.01410484313965, + "step": 8045 + }, + { + "epoch": 0.2713269742829216, + "grad_norm": 23.42485809326172, + "learning_rate": 9.132219309449307e-07, + "logits/chosen": -0.6938878297805786, + "logits/rejected": -0.846452534198761, + "logps/chosen": -1.8809760808944702, + "logps/rejected": -2.4321799278259277, + "loss": 1.8875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.80975914001465, + "rewards/margins": 5.512040615081787, + "rewards/rejected": -24.321800231933594, + "step": 8050 + }, + { + "epoch": 0.27149550035390474, + "grad_norm": 28.485620498657227, + "learning_rate": 9.130562560569444e-07, + "logits/chosen": -0.7439653873443604, + "logits/rejected": -0.7892839312553406, + "logps/chosen": -1.7448148727416992, + "logps/rejected": -1.8274883031845093, + "loss": 2.3475, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.44814682006836, + "rewards/margins": 0.8267344236373901, + "rewards/rejected": -18.274883270263672, + "step": 8055 + }, + { + "epoch": 0.2716640264248879, + "grad_norm": 12.987263679504395, + "learning_rate": 9.128904382264615e-07, + "logits/chosen": -0.2975326478481293, + "logits/rejected": -0.33172541856765747, + "logps/chosen": -2.0645947456359863, + "logps/rejected": -2.379493474960327, + "loss": 1.9203, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.645946502685547, + "rewards/margins": 3.148991823196411, + "rewards/rejected": -23.79493522644043, + "step": 8060 + }, + { + "epoch": 0.2718325524958711, + "grad_norm": 7.247758388519287, + "learning_rate": 9.127244775108652e-07, + "logits/chosen": -0.9111455082893372, + "logits/rejected": -1.0679799318313599, + "logps/chosen": -1.9295141696929932, + "logps/rejected": -2.3214521408081055, + "loss": 0.8627, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.295141220092773, + "rewards/margins": 3.9193801879882812, + "rewards/rejected": -23.214523315429688, + "step": 8065 + }, + { + "epoch": 0.2720010785668543, + "grad_norm": 31.42644500732422, + "learning_rate": 9.125583739675879e-07, + "logits/chosen": -1.1115130186080933, + "logits/rejected": -1.1000540256500244, + "logps/chosen": -1.689842939376831, + "logps/rejected": -1.5708680152893066, + "loss": 4.2923, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.8984317779541, + "rewards/margins": -1.1897509098052979, + "rewards/rejected": -15.708681106567383, + "step": 8070 + }, + { + "epoch": 0.27216960463783746, + "grad_norm": 32.55793380737305, + "learning_rate": 9.123921276541115e-07, + "logits/chosen": -0.6790528893470764, + "logits/rejected": -0.4730163514614105, + "logps/chosen": -2.0601210594177246, + "logps/rejected": -2.2196249961853027, + "loss": 2.9702, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.601207733154297, + "rewards/margins": 1.5950422286987305, + "rewards/rejected": -22.196250915527344, + "step": 8075 + }, + { + "epoch": 0.2723381307088207, + "grad_norm": 19.02665138244629, + "learning_rate": 9.122257386279675e-07, + "logits/chosen": -0.8409290313720703, + "logits/rejected": -0.8434259295463562, + "logps/chosen": -1.7004365921020508, + "logps/rejected": -1.6659228801727295, + "loss": 3.484, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.004365921020508, + "rewards/margins": -0.34513577818870544, + "rewards/rejected": -16.659229278564453, + "step": 8080 + }, + { + "epoch": 0.27250665677980385, + "grad_norm": 39.294979095458984, + "learning_rate": 9.120592069467361e-07, + "logits/chosen": -0.4286056458950043, + "logits/rejected": -0.5703016519546509, + "logps/chosen": -1.7742544412612915, + "logps/rejected": -1.78805673122406, + "loss": 3.1235, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.74254608154297, + "rewards/margins": 0.13802233338356018, + "rewards/rejected": -17.88056755065918, + "step": 8085 + }, + { + "epoch": 0.272675182850787, + "grad_norm": 27.521507263183594, + "learning_rate": 9.118925326680479e-07, + "logits/chosen": -0.5573610663414001, + "logits/rejected": -0.5069311857223511, + "logps/chosen": -2.0201663970947266, + "logps/rejected": -1.9709510803222656, + "loss": 3.749, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.201663970947266, + "rewards/margins": -0.4921538233757019, + "rewards/rejected": -19.709510803222656, + "step": 8090 + }, + { + "epoch": 0.2728437089217702, + "grad_norm": 108.85688018798828, + "learning_rate": 9.117257158495819e-07, + "logits/chosen": -0.11293216049671173, + "logits/rejected": -0.2181539237499237, + "logps/chosen": -2.149608612060547, + "logps/rejected": -2.3219380378723145, + "loss": 2.284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.496084213256836, + "rewards/margins": 1.7232919931411743, + "rewards/rejected": -23.219379425048828, + "step": 8095 + }, + { + "epoch": 0.2730122349927534, + "grad_norm": 22.91443634033203, + "learning_rate": 9.115587565490672e-07, + "logits/chosen": -0.36285096406936646, + "logits/rejected": -0.35819101333618164, + "logps/chosen": -2.12048602104187, + "logps/rejected": -2.111513614654541, + "loss": 3.5268, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.20486068725586, + "rewards/margins": -0.08972187340259552, + "rewards/rejected": -21.11513900756836, + "step": 8100 + }, + { + "epoch": 0.27318076106373657, + "grad_norm": 20.997974395751953, + "learning_rate": 9.113916548242815e-07, + "logits/chosen": -0.5678842067718506, + "logits/rejected": -0.507403552532196, + "logps/chosen": -1.812657117843628, + "logps/rejected": -2.1203980445861816, + "loss": 2.1092, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.126571655273438, + "rewards/margins": 3.0774083137512207, + "rewards/rejected": -21.2039794921875, + "step": 8105 + }, + { + "epoch": 0.27334928713471973, + "grad_norm": 22.05033302307129, + "learning_rate": 9.112244107330523e-07, + "logits/chosen": -0.9710659980773926, + "logits/rejected": -0.8690118789672852, + "logps/chosen": -1.7730506658554077, + "logps/rejected": -1.861966848373413, + "loss": 3.4844, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.730506896972656, + "rewards/margins": 0.8891617655754089, + "rewards/rejected": -18.619670867919922, + "step": 8110 + }, + { + "epoch": 0.2735178132057029, + "grad_norm": 18.96816635131836, + "learning_rate": 9.11057024333256e-07, + "logits/chosen": -0.9982341527938843, + "logits/rejected": -1.0075794458389282, + "logps/chosen": -1.9207165241241455, + "logps/rejected": -1.8805665969848633, + "loss": 3.9492, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.207164764404297, + "rewards/margins": -0.4014988839626312, + "rewards/rejected": -18.805665969848633, + "step": 8115 + }, + { + "epoch": 0.2736863392766861, + "grad_norm": 16.318857192993164, + "learning_rate": 9.108894956828187e-07, + "logits/chosen": -0.7426223754882812, + "logits/rejected": -0.9075484275817871, + "logps/chosen": -1.800121545791626, + "logps/rejected": -1.9071872234344482, + "loss": 2.8309, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.0012149810791, + "rewards/margins": 1.070657730102539, + "rewards/rejected": -19.07187271118164, + "step": 8120 + }, + { + "epoch": 0.2738548653476693, + "grad_norm": 28.79190444946289, + "learning_rate": 9.107218248397153e-07, + "logits/chosen": -0.6248952150344849, + "logits/rejected": -0.6566001772880554, + "logps/chosen": -1.7045423984527588, + "logps/rejected": -1.808816909790039, + "loss": 2.3869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.045421600341797, + "rewards/margins": 1.042743444442749, + "rewards/rejected": -18.088167190551758, + "step": 8125 + }, + { + "epoch": 0.27402339141865245, + "grad_norm": 25.37258529663086, + "learning_rate": 9.105540118619701e-07, + "logits/chosen": -0.9384697079658508, + "logits/rejected": -0.9719937443733215, + "logps/chosen": -1.7226083278656006, + "logps/rejected": -1.7185615301132202, + "loss": 3.3347, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.22608184814453, + "rewards/margins": -0.04046592861413956, + "rewards/rejected": -17.18561553955078, + "step": 8130 + }, + { + "epoch": 0.2741919174896357, + "grad_norm": 30.92608070373535, + "learning_rate": 9.103860568076566e-07, + "logits/chosen": -0.8808475732803345, + "logits/rejected": -0.9752508997917175, + "logps/chosen": -1.8235105276107788, + "logps/rejected": -1.8683273792266846, + "loss": 3.0119, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.235103607177734, + "rewards/margins": 0.4481666684150696, + "rewards/rejected": -18.683271408081055, + "step": 8135 + }, + { + "epoch": 0.27436044356061884, + "grad_norm": 30.845779418945312, + "learning_rate": 9.102179597348974e-07, + "logits/chosen": -0.5595682859420776, + "logits/rejected": -0.828567385673523, + "logps/chosen": -1.7682873010635376, + "logps/rejected": -1.7338556051254272, + "loss": 3.4405, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.682872772216797, + "rewards/margins": -0.3443172574043274, + "rewards/rejected": -17.33855628967285, + "step": 8140 + }, + { + "epoch": 0.274528969631602, + "grad_norm": 119.611328125, + "learning_rate": 9.100497207018643e-07, + "logits/chosen": -0.4314725995063782, + "logits/rejected": -0.2558334767818451, + "logps/chosen": -1.8698337078094482, + "logps/rejected": -1.995448112487793, + "loss": 2.6223, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.69833755493164, + "rewards/margins": 1.2561436891555786, + "rewards/rejected": -19.954483032226562, + "step": 8145 + }, + { + "epoch": 0.27469749570258517, + "grad_norm": 22.87830924987793, + "learning_rate": 9.098813397667782e-07, + "logits/chosen": -0.7702856063842773, + "logits/rejected": -0.7954924702644348, + "logps/chosen": -1.871756911277771, + "logps/rejected": -2.1178066730499268, + "loss": 2.0098, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.71756935119629, + "rewards/margins": 2.4604969024658203, + "rewards/rejected": -21.17806625366211, + "step": 8150 + }, + { + "epoch": 0.2748660217735684, + "grad_norm": 21.176847457885742, + "learning_rate": 9.097128169879091e-07, + "logits/chosen": -0.6883500814437866, + "logits/rejected": -0.6603747010231018, + "logps/chosen": -1.5661739110946655, + "logps/rejected": -1.6865043640136719, + "loss": 2.4503, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.661738395690918, + "rewards/margins": 1.2033051252365112, + "rewards/rejected": -16.86504364013672, + "step": 8155 + }, + { + "epoch": 0.27503454784455156, + "grad_norm": 29.86301612854004, + "learning_rate": 9.095441524235761e-07, + "logits/chosen": -0.6998110413551331, + "logits/rejected": -0.7533560991287231, + "logps/chosen": -2.0183091163635254, + "logps/rejected": -1.887945532798767, + "loss": 4.3634, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -20.183090209960938, + "rewards/margins": -1.3036348819732666, + "rewards/rejected": -18.879453659057617, + "step": 8160 + }, + { + "epoch": 0.2752030739155347, + "grad_norm": 27.845413208007812, + "learning_rate": 9.093753461321472e-07, + "logits/chosen": -0.7750387787818909, + "logits/rejected": -0.7488040924072266, + "logps/chosen": -2.1097846031188965, + "logps/rejected": -2.3540987968444824, + "loss": 2.8014, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.09784507751465, + "rewards/margins": 2.4431440830230713, + "rewards/rejected": -23.54098892211914, + "step": 8165 + }, + { + "epoch": 0.2753715999865179, + "grad_norm": 14.149901390075684, + "learning_rate": 9.092063981720398e-07, + "logits/chosen": -0.6205824017524719, + "logits/rejected": -0.5347954630851746, + "logps/chosen": -1.62399423122406, + "logps/rejected": -1.7922455072402954, + "loss": 2.2484, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.239940643310547, + "rewards/margins": 1.682511568069458, + "rewards/rejected": -17.922454833984375, + "step": 8170 + }, + { + "epoch": 0.2755401260575011, + "grad_norm": 22.775360107421875, + "learning_rate": 9.090373086017202e-07, + "logits/chosen": -0.35927271842956543, + "logits/rejected": -0.5841717720031738, + "logps/chosen": -2.026437759399414, + "logps/rejected": -2.075422763824463, + "loss": 2.9589, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.26437759399414, + "rewards/margins": 0.4898509979248047, + "rewards/rejected": -20.754228591918945, + "step": 8175 + }, + { + "epoch": 0.2757086521284843, + "grad_norm": 42.60033416748047, + "learning_rate": 9.088680774797033e-07, + "logits/chosen": -0.6615483164787292, + "logits/rejected": -0.6774601340293884, + "logps/chosen": -1.9017183780670166, + "logps/rejected": -2.0788064002990723, + "loss": 1.8902, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.01718521118164, + "rewards/margins": 1.7708823680877686, + "rewards/rejected": -20.788066864013672, + "step": 8180 + }, + { + "epoch": 0.27587717819946744, + "grad_norm": 16.84940528869629, + "learning_rate": 9.086987048645538e-07, + "logits/chosen": -0.7287185788154602, + "logits/rejected": -0.7646733522415161, + "logps/chosen": -2.029722213745117, + "logps/rejected": -2.120607852935791, + "loss": 2.6955, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.297222137451172, + "rewards/margins": 0.908857524394989, + "rewards/rejected": -21.206079483032227, + "step": 8185 + }, + { + "epoch": 0.27604570427045066, + "grad_norm": 78.0438232421875, + "learning_rate": 9.085291908148844e-07, + "logits/chosen": -0.566503643989563, + "logits/rejected": -0.5320937037467957, + "logps/chosen": -2.180643081665039, + "logps/rejected": -2.3785202503204346, + "loss": 2.7018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.806428909301758, + "rewards/margins": 1.9787718057632446, + "rewards/rejected": -23.785200119018555, + "step": 8190 + }, + { + "epoch": 0.27621423034143383, + "grad_norm": 40.340904235839844, + "learning_rate": 9.083595353893576e-07, + "logits/chosen": -0.5297061800956726, + "logits/rejected": -0.6772249937057495, + "logps/chosen": -1.8808778524398804, + "logps/rejected": -2.027780055999756, + "loss": 4.0922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.808780670166016, + "rewards/margins": 1.4690202474594116, + "rewards/rejected": -20.27779769897461, + "step": 8195 + }, + { + "epoch": 0.276382756412417, + "grad_norm": 16.951213836669922, + "learning_rate": 9.081897386466843e-07, + "logits/chosen": -0.7171338200569153, + "logits/rejected": -0.7746341824531555, + "logps/chosen": -1.764413833618164, + "logps/rejected": -1.7617387771606445, + "loss": 3.2047, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.64413833618164, + "rewards/margins": -0.026749800890684128, + "rewards/rejected": -17.617389678955078, + "step": 8200 + }, + { + "epoch": 0.27655128248340016, + "grad_norm": 43.04359817504883, + "learning_rate": 9.080198006456246e-07, + "logits/chosen": -0.5838262438774109, + "logits/rejected": -0.6805842518806458, + "logps/chosen": -1.8306487798690796, + "logps/rejected": -1.8248965740203857, + "loss": 3.1635, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.306488037109375, + "rewards/margins": -0.057521723210811615, + "rewards/rejected": -18.248966217041016, + "step": 8205 + }, + { + "epoch": 0.2767198085543834, + "grad_norm": 19.069856643676758, + "learning_rate": 9.078497214449869e-07, + "logits/chosen": -0.8335992693901062, + "logits/rejected": -0.9426866769790649, + "logps/chosen": -1.739052176475525, + "logps/rejected": -1.7812341451644897, + "loss": 3.1077, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.390522003173828, + "rewards/margins": 0.4218207001686096, + "rewards/rejected": -17.81234359741211, + "step": 8210 + }, + { + "epoch": 0.27688833462536655, + "grad_norm": 33.81036376953125, + "learning_rate": 9.076795011036296e-07, + "logits/chosen": -0.4566499590873718, + "logits/rejected": -0.7345161437988281, + "logps/chosen": -1.6366084814071655, + "logps/rejected": -1.8738971948623657, + "loss": 2.7152, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.366085052490234, + "rewards/margins": 2.372885227203369, + "rewards/rejected": -18.738971710205078, + "step": 8215 + }, + { + "epoch": 0.2770568606963497, + "grad_norm": 38.866764068603516, + "learning_rate": 9.075091396804587e-07, + "logits/chosen": -0.29251110553741455, + "logits/rejected": -0.1948651373386383, + "logps/chosen": -2.141052722930908, + "logps/rejected": -2.3842930793762207, + "loss": 2.8919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.410524368286133, + "rewards/margins": 2.432407855987549, + "rewards/rejected": -23.842931747436523, + "step": 8220 + }, + { + "epoch": 0.2772253867673329, + "grad_norm": 22.157766342163086, + "learning_rate": 9.073386372344299e-07, + "logits/chosen": -0.616371750831604, + "logits/rejected": -0.7910966277122498, + "logps/chosen": -1.8958488702774048, + "logps/rejected": -2.1439311504364014, + "loss": 1.7345, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.95848846435547, + "rewards/margins": 2.4808223247528076, + "rewards/rejected": -21.439311981201172, + "step": 8225 + }, + { + "epoch": 0.2773939128383161, + "grad_norm": 49.546382904052734, + "learning_rate": 9.071679938245471e-07, + "logits/chosen": -0.6186679005622864, + "logits/rejected": -0.7885586023330688, + "logps/chosen": -2.076205015182495, + "logps/rejected": -2.2669005393981934, + "loss": 2.3493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.76205062866211, + "rewards/margins": 1.9069541692733765, + "rewards/rejected": -22.66900634765625, + "step": 8230 + }, + { + "epoch": 0.27756243890929927, + "grad_norm": 15.524943351745605, + "learning_rate": 9.069972095098635e-07, + "logits/chosen": -0.5857125520706177, + "logits/rejected": -0.4242025911808014, + "logps/chosen": -1.6623932123184204, + "logps/rejected": -2.17319917678833, + "loss": 2.5676, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.623931884765625, + "rewards/margins": 5.108059406280518, + "rewards/rejected": -21.731990814208984, + "step": 8235 + }, + { + "epoch": 0.27773096498028244, + "grad_norm": 15.611465454101562, + "learning_rate": 9.068262843494808e-07, + "logits/chosen": -0.5052198767662048, + "logits/rejected": -0.632614016532898, + "logps/chosen": -1.7403032779693604, + "logps/rejected": -1.9178276062011719, + "loss": 1.8667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.403034210205078, + "rewards/margins": 1.7752418518066406, + "rewards/rejected": -19.178274154663086, + "step": 8240 + }, + { + "epoch": 0.27789949105126566, + "grad_norm": 48.774776458740234, + "learning_rate": 9.066552184025493e-07, + "logits/chosen": -0.7690288424491882, + "logits/rejected": -0.7475972175598145, + "logps/chosen": -1.7967151403427124, + "logps/rejected": -1.9638046026229858, + "loss": 1.8465, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.967151641845703, + "rewards/margins": 1.670892357826233, + "rewards/rejected": -19.638042449951172, + "step": 8245 + }, + { + "epoch": 0.2780680171222488, + "grad_norm": 16.428930282592773, + "learning_rate": 9.064840117282684e-07, + "logits/chosen": 0.04140068218111992, + "logits/rejected": 0.00973301101475954, + "logps/chosen": -2.419886350631714, + "logps/rejected": -3.2392563819885254, + "loss": 1.9439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.198863983154297, + "rewards/margins": 8.193696975708008, + "rewards/rejected": -32.39256286621094, + "step": 8250 + }, + { + "epoch": 0.278236543193232, + "grad_norm": 19.519775390625, + "learning_rate": 9.063126643858859e-07, + "logits/chosen": -0.4937060475349426, + "logits/rejected": -0.558300793170929, + "logps/chosen": -1.9107048511505127, + "logps/rejected": -2.1082000732421875, + "loss": 1.8862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.107044219970703, + "rewards/margins": 1.9749542474746704, + "rewards/rejected": -21.082000732421875, + "step": 8255 + }, + { + "epoch": 0.27840506926421515, + "grad_norm": 48.95004653930664, + "learning_rate": 9.061411764346982e-07, + "logits/chosen": -0.8905462026596069, + "logits/rejected": -0.8621233105659485, + "logps/chosen": -1.711071252822876, + "logps/rejected": -1.7398639917373657, + "loss": 2.9241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.1107120513916, + "rewards/margins": 0.2879270613193512, + "rewards/rejected": -17.398639678955078, + "step": 8260 + }, + { + "epoch": 0.2785735953351984, + "grad_norm": 19.999202728271484, + "learning_rate": 9.059695479340507e-07, + "logits/chosen": -0.5921692252159119, + "logits/rejected": -0.7616919279098511, + "logps/chosen": -1.5218065977096558, + "logps/rejected": -1.737052321434021, + "loss": 2.2126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.218066215515137, + "rewards/margins": 2.1524569988250732, + "rewards/rejected": -17.370525360107422, + "step": 8265 + }, + { + "epoch": 0.27874212140618154, + "grad_norm": 17.218103408813477, + "learning_rate": 9.057977789433372e-07, + "logits/chosen": -0.5395227670669556, + "logits/rejected": -0.4856560230255127, + "logps/chosen": -2.105602502822876, + "logps/rejected": -2.3897926807403564, + "loss": 2.2021, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.056026458740234, + "rewards/margins": 2.841902732849121, + "rewards/rejected": -23.89792823791504, + "step": 8270 + }, + { + "epoch": 0.2789106474771647, + "grad_norm": 16.424165725708008, + "learning_rate": 9.056258695220002e-07, + "logits/chosen": -0.8902799487113953, + "logits/rejected": -0.8830171823501587, + "logps/chosen": -1.8373076915740967, + "logps/rejected": -2.084036350250244, + "loss": 2.2416, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.373077392578125, + "rewards/margins": 2.4672863483428955, + "rewards/rejected": -20.840364456176758, + "step": 8275 + }, + { + "epoch": 0.2790791735481479, + "grad_norm": 39.2321662902832, + "learning_rate": 9.05453819729531e-07, + "logits/chosen": -0.8157766461372375, + "logits/rejected": -0.8567003011703491, + "logps/chosen": -1.738638162612915, + "logps/rejected": -1.8060699701309204, + "loss": 3.035, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.386381149291992, + "rewards/margins": 0.6743199229240417, + "rewards/rejected": -18.060701370239258, + "step": 8280 + }, + { + "epoch": 0.2792476996191311, + "grad_norm": 0.01929272711277008, + "learning_rate": 9.052816296254687e-07, + "logits/chosen": -0.6749808192253113, + "logits/rejected": -0.5758073925971985, + "logps/chosen": -1.931583046913147, + "logps/rejected": -2.3056092262268066, + "loss": 2.0042, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.315832138061523, + "rewards/margins": 3.740262985229492, + "rewards/rejected": -23.056095123291016, + "step": 8285 + }, + { + "epoch": 0.27941622569011426, + "grad_norm": 17.415260314941406, + "learning_rate": 9.051092992694021e-07, + "logits/chosen": -0.6588484048843384, + "logits/rejected": -0.6199295520782471, + "logps/chosen": -1.7081390619277954, + "logps/rejected": -1.96379816532135, + "loss": 2.4417, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.081390380859375, + "rewards/margins": 2.5565922260284424, + "rewards/rejected": -19.637985229492188, + "step": 8290 + }, + { + "epoch": 0.2795847517610974, + "grad_norm": 21.187219619750977, + "learning_rate": 9.049368287209675e-07, + "logits/chosen": -0.38697052001953125, + "logits/rejected": -0.5481768250465393, + "logps/chosen": -2.061035633087158, + "logps/rejected": -2.188666820526123, + "loss": 2.4915, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.6103572845459, + "rewards/margins": 1.2763102054595947, + "rewards/rejected": -21.886669158935547, + "step": 8295 + }, + { + "epoch": 0.27975327783208065, + "grad_norm": 17.981689453125, + "learning_rate": 9.047642180398505e-07, + "logits/chosen": -0.8721598386764526, + "logits/rejected": -0.9671875238418579, + "logps/chosen": -1.6943117380142212, + "logps/rejected": -1.8319746255874634, + "loss": 2.6523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.943117141723633, + "rewards/margins": 1.376629114151001, + "rewards/rejected": -18.319746017456055, + "step": 8300 + }, + { + "epoch": 0.2799218039030638, + "grad_norm": 19.146921157836914, + "learning_rate": 9.045914672857846e-07, + "logits/chosen": -0.874261200428009, + "logits/rejected": -0.9750015139579773, + "logps/chosen": -1.7600091695785522, + "logps/rejected": -1.7663971185684204, + "loss": 3.6973, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.6000919342041, + "rewards/margins": 0.06387872993946075, + "rewards/rejected": -17.663970947265625, + "step": 8305 + }, + { + "epoch": 0.280090329974047, + "grad_norm": 58.41259002685547, + "learning_rate": 9.044185765185521e-07, + "logits/chosen": -0.6827090382575989, + "logits/rejected": -0.5883959531784058, + "logps/chosen": -2.0557332038879395, + "logps/rejected": -2.0246777534484863, + "loss": 3.4819, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.557331085205078, + "rewards/margins": -0.3105539381504059, + "rewards/rejected": -20.246776580810547, + "step": 8310 + }, + { + "epoch": 0.28025885604503015, + "grad_norm": 30.77134895324707, + "learning_rate": 9.042455457979838e-07, + "logits/chosen": -1.1021406650543213, + "logits/rejected": -0.9775190353393555, + "logps/chosen": -1.8120008707046509, + "logps/rejected": -1.7186940908432007, + "loss": 4.0694, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.12000846862793, + "rewards/margins": -0.9330673217773438, + "rewards/rejected": -17.186941146850586, + "step": 8315 + }, + { + "epoch": 0.28042738211601337, + "grad_norm": 21.78514862060547, + "learning_rate": 9.040723751839587e-07, + "logits/chosen": -0.830939769744873, + "logits/rejected": -0.9109653234481812, + "logps/chosen": -1.8959213495254517, + "logps/rejected": -2.0493202209472656, + "loss": 2.7228, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.959213256835938, + "rewards/margins": 1.5339890718460083, + "rewards/rejected": -20.493200302124023, + "step": 8320 + }, + { + "epoch": 0.28059590818699653, + "grad_norm": 19.907915115356445, + "learning_rate": 9.038990647364045e-07, + "logits/chosen": -0.5402109622955322, + "logits/rejected": -0.5689557790756226, + "logps/chosen": -1.5069522857666016, + "logps/rejected": -1.602616548538208, + "loss": 2.2936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.0695219039917, + "rewards/margins": 0.9566418528556824, + "rewards/rejected": -16.026165008544922, + "step": 8325 + }, + { + "epoch": 0.2807644342579797, + "grad_norm": 33.659568786621094, + "learning_rate": 9.037256145152969e-07, + "logits/chosen": -0.530125617980957, + "logits/rejected": -0.769835352897644, + "logps/chosen": -2.094918727874756, + "logps/rejected": -1.9430668354034424, + "loss": 4.7906, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.949188232421875, + "rewards/margins": -1.5185197591781616, + "rewards/rejected": -19.430667877197266, + "step": 8330 + }, + { + "epoch": 0.28093296032896287, + "grad_norm": 42.06294250488281, + "learning_rate": 9.035520245806603e-07, + "logits/chosen": -0.5174092650413513, + "logits/rejected": -0.6184954643249512, + "logps/chosen": -2.0975887775421143, + "logps/rejected": -2.228912830352783, + "loss": 2.7851, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.975887298583984, + "rewards/margins": 1.3132425546646118, + "rewards/rejected": -22.28913116455078, + "step": 8335 + }, + { + "epoch": 0.2811014863999461, + "grad_norm": 23.34773063659668, + "learning_rate": 9.033782949925672e-07, + "logits/chosen": -0.9857513308525085, + "logits/rejected": -1.022640585899353, + "logps/chosen": -1.6867862939834595, + "logps/rejected": -1.6855357885360718, + "loss": 3.1505, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.867862701416016, + "rewards/margins": -0.012502431869506836, + "rewards/rejected": -16.85536003112793, + "step": 8340 + }, + { + "epoch": 0.28127001247092925, + "grad_norm": 35.15296936035156, + "learning_rate": 9.032044258111389e-07, + "logits/chosen": -0.4666837751865387, + "logits/rejected": -0.5388206243515015, + "logps/chosen": -1.867395043373108, + "logps/rejected": -1.9092090129852295, + "loss": 3.6015, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.673952102661133, + "rewards/margins": 0.4181400239467621, + "rewards/rejected": -19.092092514038086, + "step": 8345 + }, + { + "epoch": 0.2814385385419124, + "grad_norm": 29.71284294128418, + "learning_rate": 9.030304170965442e-07, + "logits/chosen": -1.0103025436401367, + "logits/rejected": -1.0144057273864746, + "logps/chosen": -1.6153638362884521, + "logps/rejected": -1.7078787088394165, + "loss": 2.2607, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.15363883972168, + "rewards/margins": 0.9251474142074585, + "rewards/rejected": -17.078784942626953, + "step": 8350 + }, + { + "epoch": 0.28160706461289564, + "grad_norm": 44.523921966552734, + "learning_rate": 9.02856268909001e-07, + "logits/chosen": -0.970793604850769, + "logits/rejected": -1.0736042261123657, + "logps/chosen": -2.353757619857788, + "logps/rejected": -2.1389412879943848, + "loss": 5.235, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.537578582763672, + "rewards/margins": -2.148162603378296, + "rewards/rejected": -21.389413833618164, + "step": 8355 + }, + { + "epoch": 0.2817755906838788, + "grad_norm": 19.944168090820312, + "learning_rate": 9.026819813087751e-07, + "logits/chosen": -0.6036643385887146, + "logits/rejected": -0.5563797354698181, + "logps/chosen": -1.8941113948822021, + "logps/rejected": -2.0597589015960693, + "loss": 3.5323, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.941116333007812, + "rewards/margins": 1.6564744710922241, + "rewards/rejected": -20.597591400146484, + "step": 8360 + }, + { + "epoch": 0.281944116754862, + "grad_norm": 31.779321670532227, + "learning_rate": 9.025075543561804e-07, + "logits/chosen": -0.2636929154396057, + "logits/rejected": -0.3036728501319885, + "logps/chosen": -2.436537981033325, + "logps/rejected": -2.284217119216919, + "loss": 5.2804, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.365379333496094, + "rewards/margins": -1.5232096910476685, + "rewards/rejected": -22.842172622680664, + "step": 8365 + }, + { + "epoch": 0.28211264282584514, + "grad_norm": 61.62702178955078, + "learning_rate": 9.023329881115793e-07, + "logits/chosen": -0.6866958737373352, + "logits/rejected": -0.7670364379882812, + "logps/chosen": -2.037566661834717, + "logps/rejected": -2.0160598754882812, + "loss": 3.31, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.375667572021484, + "rewards/margins": -0.21506690979003906, + "rewards/rejected": -20.160600662231445, + "step": 8370 + }, + { + "epoch": 0.28228116889682836, + "grad_norm": 13.388713836669922, + "learning_rate": 9.021582826353824e-07, + "logits/chosen": -0.3811209499835968, + "logits/rejected": -0.28033262491226196, + "logps/chosen": -2.2232699394226074, + "logps/rejected": -2.5159478187561035, + "loss": 2.3813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.232698440551758, + "rewards/margins": 2.9267804622650146, + "rewards/rejected": -25.15947914123535, + "step": 8375 + }, + { + "epoch": 0.2824496949678115, + "grad_norm": 71.75244140625, + "learning_rate": 9.019834379880482e-07, + "logits/chosen": -0.7619329690933228, + "logits/rejected": -0.7269800901412964, + "logps/chosen": -2.1262454986572266, + "logps/rejected": -1.9746198654174805, + "loss": 4.6105, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.262454986572266, + "rewards/margins": -1.5162568092346191, + "rewards/rejected": -19.746196746826172, + "step": 8380 + }, + { + "epoch": 0.2826182210387947, + "grad_norm": 47.1468505859375, + "learning_rate": 9.018084542300836e-07, + "logits/chosen": -0.6928752660751343, + "logits/rejected": -0.7898589372634888, + "logps/chosen": -1.8955409526824951, + "logps/rejected": -1.9372844696044922, + "loss": 3.0123, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.95541000366211, + "rewards/margins": 0.4174327850341797, + "rewards/rejected": -19.372844696044922, + "step": 8385 + }, + { + "epoch": 0.28278674710977786, + "grad_norm": 85.1002197265625, + "learning_rate": 9.016333314220437e-07, + "logits/chosen": -0.5826826691627502, + "logits/rejected": -0.6400490999221802, + "logps/chosen": -2.334876775741577, + "logps/rejected": -2.1809592247009277, + "loss": 4.7307, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.348770141601562, + "rewards/margins": -1.5391769409179688, + "rewards/rejected": -21.809593200683594, + "step": 8390 + }, + { + "epoch": 0.2829552731807611, + "grad_norm": 12.693016052246094, + "learning_rate": 9.014580696245315e-07, + "logits/chosen": -0.6832539439201355, + "logits/rejected": -0.8097559809684753, + "logps/chosen": -1.5608079433441162, + "logps/rejected": -2.0265145301818848, + "loss": 2.1457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.60807991027832, + "rewards/margins": 4.657065391540527, + "rewards/rejected": -20.265146255493164, + "step": 8395 + }, + { + "epoch": 0.28312379925174425, + "grad_norm": 25.155517578125, + "learning_rate": 9.012826688981983e-07, + "logits/chosen": -0.7850767970085144, + "logits/rejected": -0.847511887550354, + "logps/chosen": -1.9985544681549072, + "logps/rejected": -2.099179744720459, + "loss": 2.6005, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.985544204711914, + "rewards/margins": 1.0062540769577026, + "rewards/rejected": -20.991796493530273, + "step": 8400 + }, + { + "epoch": 0.28312379925174425, + "eval_logits/chosen": -1.0161163806915283, + "eval_logits/rejected": -1.0731902122497559, + "eval_logps/chosen": -1.8116202354431152, + "eval_logps/rejected": -1.8628262281417847, + "eval_loss": 3.1475327014923096, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.116201400756836, + "eval_rewards/margins": 0.512061357498169, + "eval_rewards/rejected": -18.628263473510742, + "eval_runtime": 12.9025, + "eval_samples_per_second": 7.75, + "eval_steps_per_second": 1.938, + "step": 8400 + }, + { + "epoch": 0.2832923253227274, + "grad_norm": 24.380800247192383, + "learning_rate": 9.011071293037431e-07, + "logits/chosen": -0.6405395269393921, + "logits/rejected": -0.7416598796844482, + "logps/chosen": -1.801579236984253, + "logps/rejected": -1.8673263788223267, + "loss": 2.5371, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.015789031982422, + "rewards/margins": 0.6574710607528687, + "rewards/rejected": -18.673263549804688, + "step": 8405 + }, + { + "epoch": 0.28346085139371063, + "grad_norm": 42.704307556152344, + "learning_rate": 9.009314509019136e-07, + "logits/chosen": -0.5548028349876404, + "logits/rejected": -0.3818223178386688, + "logps/chosen": -2.210094451904297, + "logps/rejected": -2.4619388580322266, + "loss": 2.1946, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.1009464263916, + "rewards/margins": 2.5184431076049805, + "rewards/rejected": -24.619388580322266, + "step": 8410 + }, + { + "epoch": 0.2836293774646938, + "grad_norm": 19.076499938964844, + "learning_rate": 9.00755633753505e-07, + "logits/chosen": -0.3302133083343506, + "logits/rejected": -0.5427986979484558, + "logps/chosen": -1.8025833368301392, + "logps/rejected": -1.8589093685150146, + "loss": 2.9957, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.025833129882812, + "rewards/margins": 0.5632610321044922, + "rewards/rejected": -18.589094161987305, + "step": 8415 + }, + { + "epoch": 0.28379790353567697, + "grad_norm": 44.3154296875, + "learning_rate": 9.005796779193606e-07, + "logits/chosen": -0.6408971548080444, + "logits/rejected": -0.7122513055801392, + "logps/chosen": -1.502657175064087, + "logps/rejected": -1.5073373317718506, + "loss": 3.3831, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.026571273803711, + "rewards/margins": 0.046801090240478516, + "rewards/rejected": -15.073373794555664, + "step": 8420 + }, + { + "epoch": 0.28396642960666013, + "grad_norm": 26.614643096923828, + "learning_rate": 9.004035834603718e-07, + "logits/chosen": -0.7186806797981262, + "logits/rejected": -0.5692561864852905, + "logps/chosen": -1.7262153625488281, + "logps/rejected": -1.7688636779785156, + "loss": 2.8119, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.26215362548828, + "rewards/margins": 0.42648276686668396, + "rewards/rejected": -17.688634872436523, + "step": 8425 + }, + { + "epoch": 0.28413495567764335, + "grad_norm": 21.044157028198242, + "learning_rate": 9.002273504374782e-07, + "logits/chosen": -0.5849789381027222, + "logits/rejected": -0.7305509448051453, + "logps/chosen": -1.8201496601104736, + "logps/rejected": -1.939295768737793, + "loss": 2.5125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.20149803161621, + "rewards/margins": 1.1914616823196411, + "rewards/rejected": -19.392959594726562, + "step": 8430 + }, + { + "epoch": 0.2843034817486265, + "grad_norm": 25.133769989013672, + "learning_rate": 9.000509789116671e-07, + "logits/chosen": -0.8678895831108093, + "logits/rejected": -0.9390038251876831, + "logps/chosen": -2.053654909133911, + "logps/rejected": -1.8398149013519287, + "loss": 5.6367, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.536548614501953, + "rewards/margins": -2.138397693634033, + "rewards/rejected": -18.398149490356445, + "step": 8435 + }, + { + "epoch": 0.2844720078196097, + "grad_norm": 17.994701385498047, + "learning_rate": 8.998744689439732e-07, + "logits/chosen": -0.610783040523529, + "logits/rejected": -0.5971266627311707, + "logps/chosen": -1.8423837423324585, + "logps/rejected": -1.8631229400634766, + "loss": 4.3969, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.423837661743164, + "rewards/margins": 0.20739364624023438, + "rewards/rejected": -18.631229400634766, + "step": 8440 + }, + { + "epoch": 0.28464053389059285, + "grad_norm": 18.32319450378418, + "learning_rate": 8.996978205954802e-07, + "logits/chosen": -0.5855869054794312, + "logits/rejected": -0.6118898391723633, + "logps/chosen": -1.760204553604126, + "logps/rejected": -1.9514306783676147, + "loss": 2.2484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.6020450592041, + "rewards/margins": 1.9122610092163086, + "rewards/rejected": -19.514307022094727, + "step": 8445 + }, + { + "epoch": 0.28480905996157607, + "grad_norm": 84.37977600097656, + "learning_rate": 8.995210339273192e-07, + "logits/chosen": -0.46147075295448303, + "logits/rejected": -0.48088377714157104, + "logps/chosen": -2.0352015495300293, + "logps/rejected": -2.0512282848358154, + "loss": 3.5981, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.35201644897461, + "rewards/margins": 0.16026830673217773, + "rewards/rejected": -20.512283325195312, + "step": 8450 + }, + { + "epoch": 0.28497758603255924, + "grad_norm": 18.52257537841797, + "learning_rate": 8.993441090006684e-07, + "logits/chosen": -0.5983000993728638, + "logits/rejected": -0.6808794736862183, + "logps/chosen": -2.0714492797851562, + "logps/rejected": -2.1377134323120117, + "loss": 2.9054, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.714492797851562, + "rewards/margins": 0.6626425981521606, + "rewards/rejected": -21.377134323120117, + "step": 8455 + }, + { + "epoch": 0.2851461121035424, + "grad_norm": 37.097259521484375, + "learning_rate": 8.991670458767553e-07, + "logits/chosen": -1.0555012226104736, + "logits/rejected": -0.8454039692878723, + "logps/chosen": -2.041461944580078, + "logps/rejected": -1.99785578250885, + "loss": 3.5113, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.41461944580078, + "rewards/margins": -0.43606311082839966, + "rewards/rejected": -19.97855567932129, + "step": 8460 + }, + { + "epoch": 0.2853146381745256, + "grad_norm": 102.19123077392578, + "learning_rate": 8.989898446168541e-07, + "logits/chosen": -0.4955880045890808, + "logits/rejected": -0.45627278089523315, + "logps/chosen": -1.9363250732421875, + "logps/rejected": -2.056997776031494, + "loss": 2.476, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.363250732421875, + "rewards/margins": 1.206725835800171, + "rewards/rejected": -20.569976806640625, + "step": 8465 + }, + { + "epoch": 0.2854831642455088, + "grad_norm": 30.65717315673828, + "learning_rate": 8.988125052822872e-07, + "logits/chosen": -0.3843476176261902, + "logits/rejected": -0.24830541014671326, + "logps/chosen": -1.781465768814087, + "logps/rejected": -1.8322114944458008, + "loss": 2.7073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.81465721130371, + "rewards/margins": 0.5074566602706909, + "rewards/rejected": -18.322113037109375, + "step": 8470 + }, + { + "epoch": 0.28565169031649196, + "grad_norm": 14.486468315124512, + "learning_rate": 8.98635027934425e-07, + "logits/chosen": -0.3739572763442993, + "logits/rejected": -0.5670051574707031, + "logps/chosen": -2.1897037029266357, + "logps/rejected": -2.430849552154541, + "loss": 3.5944, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.897037506103516, + "rewards/margins": 2.411459445953369, + "rewards/rejected": -24.308496475219727, + "step": 8475 + }, + { + "epoch": 0.2858202163874751, + "grad_norm": 10.307511329650879, + "learning_rate": 8.984574126346851e-07, + "logits/chosen": -0.6641756296157837, + "logits/rejected": -0.633230984210968, + "logps/chosen": -1.8256546258926392, + "logps/rejected": -2.082393169403076, + "loss": 1.5023, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.25654411315918, + "rewards/margins": 2.567385196685791, + "rewards/rejected": -20.823930740356445, + "step": 8480 + }, + { + "epoch": 0.28598874245845834, + "grad_norm": 15.988293647766113, + "learning_rate": 8.982796594445332e-07, + "logits/chosen": -0.8362739682197571, + "logits/rejected": -0.9717584848403931, + "logps/chosen": -1.600237250328064, + "logps/rejected": -1.7573835849761963, + "loss": 2.2579, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.00237464904785, + "rewards/margins": 1.5714609622955322, + "rewards/rejected": -17.573835372924805, + "step": 8485 + }, + { + "epoch": 0.2861572685294415, + "grad_norm": 33.097137451171875, + "learning_rate": 8.981017684254828e-07, + "logits/chosen": -0.42199596762657166, + "logits/rejected": -0.43986397981643677, + "logps/chosen": -1.9197683334350586, + "logps/rejected": -1.9568519592285156, + "loss": 3.3897, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.197681427001953, + "rewards/margins": 0.3708377778530121, + "rewards/rejected": -19.56852149963379, + "step": 8490 + }, + { + "epoch": 0.2863257946004247, + "grad_norm": 22.73672866821289, + "learning_rate": 8.979237396390951e-07, + "logits/chosen": -1.106762409210205, + "logits/rejected": -0.9645527601242065, + "logps/chosen": -1.9312461614608765, + "logps/rejected": -1.8773243427276611, + "loss": 3.5936, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.312461853027344, + "rewards/margins": -0.5392182469367981, + "rewards/rejected": -18.773242950439453, + "step": 8495 + }, + { + "epoch": 0.28649432067140784, + "grad_norm": 33.734256744384766, + "learning_rate": 8.977455731469786e-07, + "logits/chosen": -0.6991892457008362, + "logits/rejected": -0.5479412078857422, + "logps/chosen": -1.7963107824325562, + "logps/rejected": -1.7532427310943604, + "loss": 3.5209, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.963109970092773, + "rewards/margins": -0.4306826591491699, + "rewards/rejected": -17.532426834106445, + "step": 8500 + }, + { + "epoch": 0.28666284674239106, + "grad_norm": 23.050457000732422, + "learning_rate": 8.975672690107896e-07, + "logits/chosen": -0.6764280200004578, + "logits/rejected": -0.5734367370605469, + "logps/chosen": -2.08373761177063, + "logps/rejected": -2.144780397415161, + "loss": 2.9743, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.83737564086914, + "rewards/margins": 0.6104259490966797, + "rewards/rejected": -21.447803497314453, + "step": 8505 + }, + { + "epoch": 0.28683137281337423, + "grad_norm": 30.37391471862793, + "learning_rate": 8.973888272922325e-07, + "logits/chosen": -0.7625263929367065, + "logits/rejected": -0.8862202763557434, + "logps/chosen": -2.0527968406677246, + "logps/rejected": -2.016758441925049, + "loss": 4.0696, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.527965545654297, + "rewards/margins": -0.3603835999965668, + "rewards/rejected": -20.16758155822754, + "step": 8510 + }, + { + "epoch": 0.2869998988843574, + "grad_norm": 26.156654357910156, + "learning_rate": 8.972102480530586e-07, + "logits/chosen": -0.8353249430656433, + "logits/rejected": -0.942841649055481, + "logps/chosen": -2.201547861099243, + "logps/rejected": -2.3174633979797363, + "loss": 3.7065, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.015478134155273, + "rewards/margins": 1.159156322479248, + "rewards/rejected": -23.174633026123047, + "step": 8515 + }, + { + "epoch": 0.2871684249553406, + "grad_norm": 13.300694465637207, + "learning_rate": 8.970315313550676e-07, + "logits/chosen": -1.0247485637664795, + "logits/rejected": -0.9068101048469543, + "logps/chosen": -1.7162628173828125, + "logps/rejected": -1.7405004501342773, + "loss": 3.0056, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.162628173828125, + "rewards/margins": 0.24237728118896484, + "rewards/rejected": -17.405006408691406, + "step": 8520 + }, + { + "epoch": 0.2873369510263238, + "grad_norm": 61.11632537841797, + "learning_rate": 8.968526772601057e-07, + "logits/chosen": -0.758571445941925, + "logits/rejected": -0.6705536246299744, + "logps/chosen": -2.1682193279266357, + "logps/rejected": -2.228081703186035, + "loss": 2.8205, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.682193756103516, + "rewards/margins": 0.5986258387565613, + "rewards/rejected": -22.280818939208984, + "step": 8525 + }, + { + "epoch": 0.28750547709730695, + "grad_norm": 23.13623046875, + "learning_rate": 8.966736858300678e-07, + "logits/chosen": -0.735704779624939, + "logits/rejected": -0.6865785717964172, + "logps/chosen": -1.7772157192230225, + "logps/rejected": -1.7628653049468994, + "loss": 3.7543, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.77215576171875, + "rewards/margins": -0.1435052901506424, + "rewards/rejected": -17.628652572631836, + "step": 8530 + }, + { + "epoch": 0.2876740031682901, + "grad_norm": 18.340370178222656, + "learning_rate": 8.964945571268953e-07, + "logits/chosen": -0.6413258910179138, + "logits/rejected": -0.5849345922470093, + "logps/chosen": -1.7927738428115845, + "logps/rejected": -2.061357021331787, + "loss": 2.2013, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.927738189697266, + "rewards/margins": 2.6858315467834473, + "rewards/rejected": -20.613569259643555, + "step": 8535 + }, + { + "epoch": 0.28784252923927334, + "grad_norm": 17.296632766723633, + "learning_rate": 8.963152912125783e-07, + "logits/chosen": -0.7454960942268372, + "logits/rejected": -0.6678019762039185, + "logps/chosen": -2.1356372833251953, + "logps/rejected": -2.2526679039001465, + "loss": 2.5295, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.356372833251953, + "rewards/margins": 1.1703062057495117, + "rewards/rejected": -22.52667808532715, + "step": 8540 + }, + { + "epoch": 0.2880110553102565, + "grad_norm": 50.47782516479492, + "learning_rate": 8.961358881491528e-07, + "logits/chosen": -0.777079164981842, + "logits/rejected": -0.8767390251159668, + "logps/chosen": -1.8275363445281982, + "logps/rejected": -1.8506664037704468, + "loss": 3.4138, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.27536392211914, + "rewards/margins": 0.2313002645969391, + "rewards/rejected": -18.506664276123047, + "step": 8545 + }, + { + "epoch": 0.28817958138123967, + "grad_norm": 35.43614959716797, + "learning_rate": 8.959563479987035e-07, + "logits/chosen": -0.8436108827590942, + "logits/rejected": -0.870860755443573, + "logps/chosen": -1.8714988231658936, + "logps/rejected": -1.936994194984436, + "loss": 2.837, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.714990615844727, + "rewards/margins": 0.6549515724182129, + "rewards/rejected": -19.36993980407715, + "step": 8550 + }, + { + "epoch": 0.28834810745222283, + "grad_norm": 17.51824188232422, + "learning_rate": 8.957766708233625e-07, + "logits/chosen": -1.1280739307403564, + "logits/rejected": -1.240715742111206, + "logps/chosen": -1.6478168964385986, + "logps/rejected": -1.7780081033706665, + "loss": 2.7749, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.478168487548828, + "rewards/margins": 1.3019111156463623, + "rewards/rejected": -17.780078887939453, + "step": 8555 + }, + { + "epoch": 0.28851663352320606, + "grad_norm": 18.251188278198242, + "learning_rate": 8.955968566853086e-07, + "logits/chosen": -0.8560983538627625, + "logits/rejected": -0.7794613242149353, + "logps/chosen": -1.753149390220642, + "logps/rejected": -1.9621002674102783, + "loss": 2.089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.531494140625, + "rewards/margins": 2.089508056640625, + "rewards/rejected": -19.621002197265625, + "step": 8560 + }, + { + "epoch": 0.2886851595941892, + "grad_norm": 32.65653610229492, + "learning_rate": 8.954169056467684e-07, + "logits/chosen": -0.7626281380653381, + "logits/rejected": -0.7990679740905762, + "logps/chosen": -1.5963528156280518, + "logps/rejected": -1.5640228986740112, + "loss": 3.388, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.963528633117676, + "rewards/margins": -0.3232990801334381, + "rewards/rejected": -15.640230178833008, + "step": 8565 + }, + { + "epoch": 0.2888536856651724, + "grad_norm": 7.690257549285889, + "learning_rate": 8.95236817770016e-07, + "logits/chosen": -0.9705495834350586, + "logits/rejected": -0.8679699897766113, + "logps/chosen": -1.9340327978134155, + "logps/rejected": -2.223405122756958, + "loss": 1.6523, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.340328216552734, + "rewards/margins": 2.8937225341796875, + "rewards/rejected": -22.234050750732422, + "step": 8570 + }, + { + "epoch": 0.2890222117361556, + "grad_norm": 33.865028381347656, + "learning_rate": 8.950565931173728e-07, + "logits/chosen": -0.6105413436889648, + "logits/rejected": -0.6946656107902527, + "logps/chosen": -2.086103916168213, + "logps/rejected": -2.1015257835388184, + "loss": 3.2424, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.861034393310547, + "rewards/margins": 0.1542188674211502, + "rewards/rejected": -21.015254974365234, + "step": 8575 + }, + { + "epoch": 0.2891907378071388, + "grad_norm": 15.458596229553223, + "learning_rate": 8.948762317512073e-07, + "logits/chosen": -0.504424512386322, + "logits/rejected": -0.6665661931037903, + "logps/chosen": -1.8025707006454468, + "logps/rejected": -1.9153648614883423, + "loss": 2.3811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.025705337524414, + "rewards/margins": 1.1279430389404297, + "rewards/rejected": -19.15364646911621, + "step": 8580 + }, + { + "epoch": 0.28935926387812194, + "grad_norm": 21.86128044128418, + "learning_rate": 8.946957337339354e-07, + "logits/chosen": -0.5030733346939087, + "logits/rejected": -0.4260841906070709, + "logps/chosen": -1.9623653888702393, + "logps/rejected": -2.001736879348755, + "loss": 3.0308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.623653411865234, + "rewards/margins": 0.39371395111083984, + "rewards/rejected": -20.01736831665039, + "step": 8585 + }, + { + "epoch": 0.2895277899491051, + "grad_norm": 30.289661407470703, + "learning_rate": 8.945150991280205e-07, + "logits/chosen": -0.7599642872810364, + "logits/rejected": -0.7587991952896118, + "logps/chosen": -1.845049262046814, + "logps/rejected": -1.9118587970733643, + "loss": 2.5643, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.45049285888672, + "rewards/margins": 0.6680957078933716, + "rewards/rejected": -19.118587493896484, + "step": 8590 + }, + { + "epoch": 0.28969631602008833, + "grad_norm": 34.044654846191406, + "learning_rate": 8.94334327995973e-07, + "logits/chosen": -0.6147949695587158, + "logits/rejected": -0.6777374744415283, + "logps/chosen": -1.9771358966827393, + "logps/rejected": -2.1955230236053467, + "loss": 1.9538, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.7713565826416, + "rewards/margins": 2.1838738918304443, + "rewards/rejected": -21.955232620239258, + "step": 8595 + }, + { + "epoch": 0.2898648420910715, + "grad_norm": 25.871217727661133, + "learning_rate": 8.941534204003509e-07, + "logits/chosen": -0.9726818799972534, + "logits/rejected": -1.0019123554229736, + "logps/chosen": -1.7473583221435547, + "logps/rejected": -1.9080984592437744, + "loss": 2.3828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.473581314086914, + "rewards/margins": 1.6074016094207764, + "rewards/rejected": -19.080984115600586, + "step": 8600 + }, + { + "epoch": 0.29003336816205466, + "grad_norm": 25.060718536376953, + "learning_rate": 8.939723764037588e-07, + "logits/chosen": -0.9149066805839539, + "logits/rejected": -0.8806743621826172, + "logps/chosen": -2.042038679122925, + "logps/rejected": -2.366466999053955, + "loss": 2.2901, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.420385360717773, + "rewards/margins": 3.24428129196167, + "rewards/rejected": -23.6646671295166, + "step": 8605 + }, + { + "epoch": 0.2902018942330378, + "grad_norm": 87.30838012695312, + "learning_rate": 8.937911960688493e-07, + "logits/chosen": -0.8675652742385864, + "logits/rejected": -1.1229223012924194, + "logps/chosen": -2.310178279876709, + "logps/rejected": -2.154825448989868, + "loss": 4.7692, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.101781845092773, + "rewards/margins": -1.55352783203125, + "rewards/rejected": -21.548254013061523, + "step": 8610 + }, + { + "epoch": 0.29037042030402105, + "grad_norm": 18.950347900390625, + "learning_rate": 8.936098794583215e-07, + "logits/chosen": -0.554070770740509, + "logits/rejected": -0.8680068850517273, + "logps/chosen": -1.8547636270523071, + "logps/rejected": -2.017946720123291, + "loss": 2.1449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.54763412475586, + "rewards/margins": 1.631829857826233, + "rewards/rejected": -20.17946434020996, + "step": 8615 + }, + { + "epoch": 0.2905389463750042, + "grad_norm": 20.925893783569336, + "learning_rate": 8.934284266349221e-07, + "logits/chosen": -0.45976686477661133, + "logits/rejected": -0.5486811399459839, + "logps/chosen": -1.9799697399139404, + "logps/rejected": -2.120398998260498, + "loss": 2.1415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.799697875976562, + "rewards/margins": 1.4042949676513672, + "rewards/rejected": -21.20399284362793, + "step": 8620 + }, + { + "epoch": 0.2907074724459874, + "grad_norm": 24.982271194458008, + "learning_rate": 8.932468376614446e-07, + "logits/chosen": -1.134254813194275, + "logits/rejected": -1.2710667848587036, + "logps/chosen": -1.7824151515960693, + "logps/rejected": -2.1663620471954346, + "loss": 2.336, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.82415199279785, + "rewards/margins": 3.839470386505127, + "rewards/rejected": -21.663619995117188, + "step": 8625 + }, + { + "epoch": 0.2908759985169706, + "grad_norm": 50.53211975097656, + "learning_rate": 8.9306511260073e-07, + "logits/chosen": -0.9046827554702759, + "logits/rejected": -1.054692268371582, + "logps/chosen": -1.7211036682128906, + "logps/rejected": -1.7873337268829346, + "loss": 2.6263, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.211036682128906, + "rewards/margins": 0.6622999906539917, + "rewards/rejected": -17.873336791992188, + "step": 8630 + }, + { + "epoch": 0.29104452458795377, + "grad_norm": 49.24147033691406, + "learning_rate": 8.92883251515666e-07, + "logits/chosen": -0.8043139576911926, + "logits/rejected": -0.834795355796814, + "logps/chosen": -1.8925583362579346, + "logps/rejected": -1.9303007125854492, + "loss": 2.8508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.925580978393555, + "rewards/margins": 0.37742501497268677, + "rewards/rejected": -19.30300521850586, + "step": 8635 + }, + { + "epoch": 0.29121305065893693, + "grad_norm": 8.035228729248047, + "learning_rate": 8.927012544691877e-07, + "logits/chosen": -0.7065514326095581, + "logits/rejected": -0.7449557781219482, + "logps/chosen": -1.6677768230438232, + "logps/rejected": -2.148942470550537, + "loss": 1.9738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.67776870727539, + "rewards/margins": 4.811653137207031, + "rewards/rejected": -21.489421844482422, + "step": 8640 + }, + { + "epoch": 0.2913815767299201, + "grad_norm": 19.720861434936523, + "learning_rate": 8.925191215242769e-07, + "logits/chosen": -0.6974073648452759, + "logits/rejected": -0.8620367050170898, + "logps/chosen": -1.8329426050186157, + "logps/rejected": -2.1030852794647217, + "loss": 1.5808, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.329425811767578, + "rewards/margins": 2.701427936553955, + "rewards/rejected": -21.030853271484375, + "step": 8645 + }, + { + "epoch": 0.2915501028009033, + "grad_norm": 96.2696533203125, + "learning_rate": 8.92336852743963e-07, + "logits/chosen": -0.535683274269104, + "logits/rejected": -0.6992945671081543, + "logps/chosen": -2.1274380683898926, + "logps/rejected": -2.254375696182251, + "loss": 3.5109, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.27437973022461, + "rewards/margins": 1.2693758010864258, + "rewards/rejected": -22.54375648498535, + "step": 8650 + }, + { + "epoch": 0.2917186288718865, + "grad_norm": 21.258485794067383, + "learning_rate": 8.921544481913217e-07, + "logits/chosen": -0.9834004640579224, + "logits/rejected": -1.0161277055740356, + "logps/chosen": -1.942801833152771, + "logps/rejected": -1.9483106136322021, + "loss": 4.0447, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.42801856994629, + "rewards/margins": 0.05508852005004883, + "rewards/rejected": -19.483104705810547, + "step": 8655 + }, + { + "epoch": 0.29188715494286965, + "grad_norm": 40.73654556274414, + "learning_rate": 8.919719079294761e-07, + "logits/chosen": -0.8954814672470093, + "logits/rejected": -0.9347285032272339, + "logps/chosen": -1.9151986837387085, + "logps/rejected": -2.066523313522339, + "loss": 2.6765, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.151987075805664, + "rewards/margins": 1.5132462978363037, + "rewards/rejected": -20.665233612060547, + "step": 8660 + }, + { + "epoch": 0.2920556810138528, + "grad_norm": 28.10308265686035, + "learning_rate": 8.917892320215963e-07, + "logits/chosen": -0.5667335987091064, + "logits/rejected": -0.6796129941940308, + "logps/chosen": -2.036616563796997, + "logps/rejected": -2.078256607055664, + "loss": 3.0412, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.36616325378418, + "rewards/margins": 0.41640299558639526, + "rewards/rejected": -20.782567977905273, + "step": 8665 + }, + { + "epoch": 0.29222420708483604, + "grad_norm": 11.453448295593262, + "learning_rate": 8.91606420530899e-07, + "logits/chosen": -0.4996485710144043, + "logits/rejected": -0.6699423789978027, + "logps/chosen": -2.383340358734131, + "logps/rejected": -2.6928350925445557, + "loss": 1.4457, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.833402633666992, + "rewards/margins": 3.0949490070343018, + "rewards/rejected": -26.9283504486084, + "step": 8670 + }, + { + "epoch": 0.2923927331558192, + "grad_norm": 21.407791137695312, + "learning_rate": 8.914234735206485e-07, + "logits/chosen": -1.0669759511947632, + "logits/rejected": -1.0711863040924072, + "logps/chosen": -1.4989492893218994, + "logps/rejected": -1.5646870136260986, + "loss": 2.4623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.98949146270752, + "rewards/margins": 0.6573789715766907, + "rewards/rejected": -15.646868705749512, + "step": 8675 + }, + { + "epoch": 0.2925612592268024, + "grad_norm": 40.112220764160156, + "learning_rate": 8.912403910541552e-07, + "logits/chosen": -1.0745022296905518, + "logits/rejected": -0.9828779101371765, + "logps/chosen": -1.8560600280761719, + "logps/rejected": -1.7852586507797241, + "loss": 3.7889, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.56060028076172, + "rewards/margins": -0.7080144882202148, + "rewards/rejected": -17.85258674621582, + "step": 8680 + }, + { + "epoch": 0.2927297852977856, + "grad_norm": 21.407392501831055, + "learning_rate": 8.910571731947767e-07, + "logits/chosen": -0.4315149784088135, + "logits/rejected": -0.5765363574028015, + "logps/chosen": -1.6076252460479736, + "logps/rejected": -1.6742461919784546, + "loss": 3.3513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.07625389099121, + "rewards/margins": 0.6662089228630066, + "rewards/rejected": -16.742462158203125, + "step": 8685 + }, + { + "epoch": 0.29289831136876876, + "grad_norm": 24.01103973388672, + "learning_rate": 8.908738200059178e-07, + "logits/chosen": -1.1414722204208374, + "logits/rejected": -1.230464220046997, + "logps/chosen": -1.833152413368225, + "logps/rejected": -1.9002851247787476, + "loss": 2.791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.331523895263672, + "rewards/margins": 0.6713264584541321, + "rewards/rejected": -19.002851486206055, + "step": 8690 + }, + { + "epoch": 0.2930668374397519, + "grad_norm": 22.16900634765625, + "learning_rate": 8.906903315510294e-07, + "logits/chosen": -0.7110240459442139, + "logits/rejected": -0.8365713953971863, + "logps/chosen": -1.6791126728057861, + "logps/rejected": -1.6718724966049194, + "loss": 3.5858, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.791126251220703, + "rewards/margins": -0.0724002867937088, + "rewards/rejected": -16.71872329711914, + "step": 8695 + }, + { + "epoch": 0.2932353635107351, + "grad_norm": 40.03135681152344, + "learning_rate": 8.9050670789361e-07, + "logits/chosen": -0.6402872204780579, + "logits/rejected": -0.814786434173584, + "logps/chosen": -1.8824033737182617, + "logps/rejected": -1.8985248804092407, + "loss": 3.2084, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.824033737182617, + "rewards/margins": 0.1612163484096527, + "rewards/rejected": -18.985248565673828, + "step": 8700 + }, + { + "epoch": 0.2934038895817183, + "grad_norm": 19.727540969848633, + "learning_rate": 8.903229490972042e-07, + "logits/chosen": -0.3936053514480591, + "logits/rejected": -0.47466689348220825, + "logps/chosen": -2.7040302753448486, + "logps/rejected": -2.9196763038635254, + "loss": 2.9614, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.040302276611328, + "rewards/margins": 2.156456708908081, + "rewards/rejected": -29.196758270263672, + "step": 8705 + }, + { + "epoch": 0.2935724156527015, + "grad_norm": 35.2152099609375, + "learning_rate": 8.90139055225404e-07, + "logits/chosen": -0.3665235936641693, + "logits/rejected": -0.44224101305007935, + "logps/chosen": -2.0081772804260254, + "logps/rejected": -2.328625202178955, + "loss": 2.1435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.081769943237305, + "rewards/margins": 3.2044830322265625, + "rewards/rejected": -23.286252975463867, + "step": 8710 + }, + { + "epoch": 0.29374094172368465, + "grad_norm": 26.51418685913086, + "learning_rate": 8.899550263418475e-07, + "logits/chosen": -0.670992374420166, + "logits/rejected": -0.7386754155158997, + "logps/chosen": -1.8449127674102783, + "logps/rejected": -1.9972244501113892, + "loss": 1.7944, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.449127197265625, + "rewards/margins": 1.5231168270111084, + "rewards/rejected": -19.97224235534668, + "step": 8715 + }, + { + "epoch": 0.2939094677946678, + "grad_norm": 22.333667755126953, + "learning_rate": 8.8977086251022e-07, + "logits/chosen": -0.81922447681427, + "logits/rejected": -0.7545638084411621, + "logps/chosen": -2.3857390880584717, + "logps/rejected": -2.6660449504852295, + "loss": 3.0081, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.857393264770508, + "rewards/margins": 2.803058385848999, + "rewards/rejected": -26.660449981689453, + "step": 8720 + }, + { + "epoch": 0.29407799386565103, + "grad_norm": 18.825889587402344, + "learning_rate": 8.895865637942535e-07, + "logits/chosen": -1.0190632343292236, + "logits/rejected": -1.0829589366912842, + "logps/chosen": -1.6110626459121704, + "logps/rejected": -1.6284345388412476, + "loss": 3.3191, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.110624313354492, + "rewards/margins": 0.1737208366394043, + "rewards/rejected": -16.284343719482422, + "step": 8725 + }, + { + "epoch": 0.2942465199366342, + "grad_norm": 49.59651184082031, + "learning_rate": 8.894021302577263e-07, + "logits/chosen": -0.7957747578620911, + "logits/rejected": -0.8983270525932312, + "logps/chosen": -2.2004895210266113, + "logps/rejected": -2.2305634021759033, + "loss": 3.6281, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.00489616394043, + "rewards/margins": 0.30073651671409607, + "rewards/rejected": -22.305633544921875, + "step": 8730 + }, + { + "epoch": 0.29441504600761736, + "grad_norm": 62.021484375, + "learning_rate": 8.892175619644635e-07, + "logits/chosen": -0.9021312594413757, + "logits/rejected": -0.9147001504898071, + "logps/chosen": -2.173342227935791, + "logps/rejected": -2.4811558723449707, + "loss": 2.1825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.733421325683594, + "rewards/margins": 3.0781381130218506, + "rewards/rejected": -24.811559677124023, + "step": 8735 + }, + { + "epoch": 0.2945835720786006, + "grad_norm": 11.684818267822266, + "learning_rate": 8.890328589783373e-07, + "logits/chosen": -1.1126207113265991, + "logits/rejected": -0.9916723966598511, + "logps/chosen": -2.0568923950195312, + "logps/rejected": -3.0319085121154785, + "loss": 1.1855, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.568927764892578, + "rewards/margins": 9.750160217285156, + "rewards/rejected": -30.3190860748291, + "step": 8740 + }, + { + "epoch": 0.29475209814958375, + "grad_norm": 17.79829216003418, + "learning_rate": 8.888480213632656e-07, + "logits/chosen": -1.0284509658813477, + "logits/rejected": -0.9819244146347046, + "logps/chosen": -1.696070671081543, + "logps/rejected": -1.8273723125457764, + "loss": 2.7422, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.960704803466797, + "rewards/margins": 1.3130178451538086, + "rewards/rejected": -18.273723602294922, + "step": 8745 + }, + { + "epoch": 0.2949206242205669, + "grad_norm": 20.419384002685547, + "learning_rate": 8.88663049183214e-07, + "logits/chosen": -0.7981722950935364, + "logits/rejected": -0.8218636512756348, + "logps/chosen": -2.0747475624084473, + "logps/rejected": -1.9626662731170654, + "loss": 4.1684, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.74747657775879, + "rewards/margins": -1.1208148002624512, + "rewards/rejected": -19.62666130065918, + "step": 8750 + }, + { + "epoch": 0.2950891502915501, + "grad_norm": 23.614099502563477, + "learning_rate": 8.884779425021936e-07, + "logits/chosen": -0.9454744458198547, + "logits/rejected": -1.0266331434249878, + "logps/chosen": -1.5546374320983887, + "logps/rejected": -1.7335621118545532, + "loss": 1.8324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.546374320983887, + "rewards/margins": 1.7892467975616455, + "rewards/rejected": -17.335620880126953, + "step": 8755 + }, + { + "epoch": 0.2952576763625333, + "grad_norm": 30.09710121154785, + "learning_rate": 8.882927013842628e-07, + "logits/chosen": -1.1429169178009033, + "logits/rejected": -1.2618019580841064, + "logps/chosen": -1.8678550720214844, + "logps/rejected": -2.0443532466888428, + "loss": 2.179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.678550720214844, + "rewards/margins": 1.7649834156036377, + "rewards/rejected": -20.44353485107422, + "step": 8760 + }, + { + "epoch": 0.29542620243351647, + "grad_norm": 23.52537727355957, + "learning_rate": 8.881073258935262e-07, + "logits/chosen": -0.8685327768325806, + "logits/rejected": -0.7556779384613037, + "logps/chosen": -1.8107166290283203, + "logps/rejected": -1.8469966650009155, + "loss": 2.8891, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.107166290283203, + "rewards/margins": 0.362802118062973, + "rewards/rejected": -18.469966888427734, + "step": 8765 + }, + { + "epoch": 0.29559472850449964, + "grad_norm": 23.21879005432129, + "learning_rate": 8.879218160941348e-07, + "logits/chosen": -1.1114360094070435, + "logits/rejected": -1.1394498348236084, + "logps/chosen": -1.627173662185669, + "logps/rejected": -1.7318960428237915, + "loss": 2.4583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.27173614501953, + "rewards/margins": 1.0472228527069092, + "rewards/rejected": -17.318958282470703, + "step": 8770 + }, + { + "epoch": 0.2957632545754828, + "grad_norm": 17.88161849975586, + "learning_rate": 8.877361720502865e-07, + "logits/chosen": -0.787623941898346, + "logits/rejected": -0.8592535257339478, + "logps/chosen": -1.8772789239883423, + "logps/rejected": -2.0536484718322754, + "loss": 2.2936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.772790908813477, + "rewards/margins": 1.7636915445327759, + "rewards/rejected": -20.536481857299805, + "step": 8775 + }, + { + "epoch": 0.295931780646466, + "grad_norm": 27.691944122314453, + "learning_rate": 8.875503938262252e-07, + "logits/chosen": -0.6448401808738708, + "logits/rejected": -0.6703423261642456, + "logps/chosen": -2.0140814781188965, + "logps/rejected": -1.8916003704071045, + "loss": 5.0234, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.14081382751465, + "rewards/margins": -1.22481107711792, + "rewards/rejected": -18.91600227355957, + "step": 8780 + }, + { + "epoch": 0.2961003067174492, + "grad_norm": 27.60009002685547, + "learning_rate": 8.873644814862416e-07, + "logits/chosen": -0.45917612314224243, + "logits/rejected": -0.7504684925079346, + "logps/chosen": -2.220410108566284, + "logps/rejected": -2.457399845123291, + "loss": 4.0908, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.2041015625, + "rewards/margins": 2.3698973655700684, + "rewards/rejected": -24.573999404907227, + "step": 8785 + }, + { + "epoch": 0.29626883278843236, + "grad_norm": 18.960968017578125, + "learning_rate": 8.871784350946723e-07, + "logits/chosen": -1.1864864826202393, + "logits/rejected": -1.3105027675628662, + "logps/chosen": -1.658342719078064, + "logps/rejected": -1.9156605005264282, + "loss": 1.7573, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.58342933654785, + "rewards/margins": 2.5731749534606934, + "rewards/rejected": -19.156604766845703, + "step": 8790 + }, + { + "epoch": 0.2964373588594156, + "grad_norm": 18.231367111206055, + "learning_rate": 8.869922547159009e-07, + "logits/chosen": -0.4016094207763672, + "logits/rejected": -0.5172806978225708, + "logps/chosen": -2.007896900177002, + "logps/rejected": -2.092552661895752, + "loss": 2.8234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.078968048095703, + "rewards/margins": 0.8465590476989746, + "rewards/rejected": -20.925525665283203, + "step": 8795 + }, + { + "epoch": 0.29660588493039874, + "grad_norm": 21.657655715942383, + "learning_rate": 8.868059404143571e-07, + "logits/chosen": -0.8904246091842651, + "logits/rejected": -0.9135071039199829, + "logps/chosen": -1.8589338064193726, + "logps/rejected": -2.0225725173950195, + "loss": 1.9787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.589340209960938, + "rewards/margins": 1.6363856792449951, + "rewards/rejected": -20.225723266601562, + "step": 8800 + }, + { + "epoch": 0.29660588493039874, + "eval_logits/chosen": -1.1061619520187378, + "eval_logits/rejected": -1.1691018342971802, + "eval_logps/chosen": -1.8326040506362915, + "eval_logps/rejected": -1.8919801712036133, + "eval_loss": 3.122637987136841, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.32604217529297, + "eval_rewards/margins": 0.5937601923942566, + "eval_rewards/rejected": -18.919801712036133, + "eval_runtime": 12.9076, + "eval_samples_per_second": 7.747, + "eval_steps_per_second": 1.937, + "step": 8800 + }, + { + "epoch": 0.2967744110013819, + "grad_norm": 14.822147369384766, + "learning_rate": 8.866194922545167e-07, + "logits/chosen": -0.9613476991653442, + "logits/rejected": -0.8759132623672485, + "logps/chosen": -2.081514835357666, + "logps/rejected": -2.1302268505096436, + "loss": 3.6599, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.815149307250977, + "rewards/margins": 0.4871188998222351, + "rewards/rejected": -21.302268981933594, + "step": 8805 + }, + { + "epoch": 0.2969429370723651, + "grad_norm": 58.875946044921875, + "learning_rate": 8.864329103009025e-07, + "logits/chosen": -0.7224435210227966, + "logits/rejected": -0.6182790994644165, + "logps/chosen": -2.059049129486084, + "logps/rejected": -2.1806259155273438, + "loss": 3.2944, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.590492248535156, + "rewards/margins": 1.2157670259475708, + "rewards/rejected": -21.806259155273438, + "step": 8810 + }, + { + "epoch": 0.2971114631433483, + "grad_norm": 25.18557357788086, + "learning_rate": 8.862461946180826e-07, + "logits/chosen": -0.873163104057312, + "logits/rejected": -0.8473867177963257, + "logps/chosen": -2.5954298973083496, + "logps/rejected": -2.170555591583252, + "loss": 7.2803, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -25.954299926757812, + "rewards/margins": -4.248744010925293, + "rewards/rejected": -21.705556869506836, + "step": 8815 + }, + { + "epoch": 0.29727998921433146, + "grad_norm": 10.100975036621094, + "learning_rate": 8.860593452706724e-07, + "logits/chosen": -0.07160119712352753, + "logits/rejected": -0.18310347199440002, + "logps/chosen": -1.888536810874939, + "logps/rejected": -2.037360429763794, + "loss": 2.3866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.88536834716797, + "rewards/margins": 1.4882361888885498, + "rewards/rejected": -20.373605728149414, + "step": 8820 + }, + { + "epoch": 0.29744851528531463, + "grad_norm": 85.96381378173828, + "learning_rate": 8.858723623233329e-07, + "logits/chosen": -0.6304708123207092, + "logits/rejected": -0.7319132685661316, + "logps/chosen": -2.1153464317321777, + "logps/rejected": -2.3846354484558105, + "loss": 2.1868, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.153467178344727, + "rewards/margins": 2.6928863525390625, + "rewards/rejected": -23.846351623535156, + "step": 8825 + }, + { + "epoch": 0.2976170413562978, + "grad_norm": 27.138891220092773, + "learning_rate": 8.856852458407716e-07, + "logits/chosen": -0.7233554720878601, + "logits/rejected": -0.8463021516799927, + "logps/chosen": -1.82294499874115, + "logps/rejected": -1.8261346817016602, + "loss": 3.2798, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.229450225830078, + "rewards/margins": 0.031897544860839844, + "rewards/rejected": -18.2613468170166, + "step": 8830 + }, + { + "epoch": 0.297785567427281, + "grad_norm": 36.38093566894531, + "learning_rate": 8.854979958877421e-07, + "logits/chosen": -0.2902129590511322, + "logits/rejected": -0.41110771894454956, + "logps/chosen": -2.67460298538208, + "logps/rejected": -2.487720012664795, + "loss": 5.4603, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.74602699279785, + "rewards/margins": -1.8688280582427979, + "rewards/rejected": -24.877201080322266, + "step": 8835 + }, + { + "epoch": 0.2979540934982642, + "grad_norm": 27.867528915405273, + "learning_rate": 8.853106125290442e-07, + "logits/chosen": -0.8619322776794434, + "logits/rejected": -0.8854168057441711, + "logps/chosen": -1.8924545049667358, + "logps/rejected": -1.848910927772522, + "loss": 3.5393, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.924543380737305, + "rewards/margins": -0.43543368577957153, + "rewards/rejected": -18.48910903930664, + "step": 8840 + }, + { + "epoch": 0.29812261956924735, + "grad_norm": 31.812618255615234, + "learning_rate": 8.85123095829524e-07, + "logits/chosen": -1.0855190753936768, + "logits/rejected": -0.99981290102005, + "logps/chosen": -2.0037198066711426, + "logps/rejected": -2.1378657817840576, + "loss": 2.3159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.03719711303711, + "rewards/margins": 1.3414623737335205, + "rewards/rejected": -21.378658294677734, + "step": 8845 + }, + { + "epoch": 0.29829114564023057, + "grad_norm": 20.495399475097656, + "learning_rate": 8.849354458540734e-07, + "logits/chosen": -0.6814324855804443, + "logits/rejected": -0.8289278745651245, + "logps/chosen": -1.9179836511611938, + "logps/rejected": -1.9481815099716187, + "loss": 3.3473, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.17983627319336, + "rewards/margins": 0.3019787669181824, + "rewards/rejected": -19.481815338134766, + "step": 8850 + }, + { + "epoch": 0.29845967171121374, + "grad_norm": 10.193831443786621, + "learning_rate": 8.84747662667631e-07, + "logits/chosen": -0.7345752716064453, + "logits/rejected": -0.8806995153427124, + "logps/chosen": -1.7043479681015015, + "logps/rejected": -1.8931198120117188, + "loss": 1.7613, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.04347801208496, + "rewards/margins": 1.887718915939331, + "rewards/rejected": -18.931198120117188, + "step": 8855 + }, + { + "epoch": 0.2986281977821969, + "grad_norm": 19.934276580810547, + "learning_rate": 8.845597463351811e-07, + "logits/chosen": -0.8036659955978394, + "logits/rejected": -0.7760517001152039, + "logps/chosen": -1.3290200233459473, + "logps/rejected": -1.3339701890945435, + "loss": 3.0572, + "rewards/accuracies": 0.5, + "rewards/chosen": -13.290201187133789, + "rewards/margins": 0.04950146749615669, + "rewards/rejected": -13.339701652526855, + "step": 8860 + }, + { + "epoch": 0.29879672385318007, + "grad_norm": 16.844425201416016, + "learning_rate": 8.843716969217538e-07, + "logits/chosen": -0.7272054553031921, + "logits/rejected": -0.9063571691513062, + "logps/chosen": -1.646959900856018, + "logps/rejected": -1.6387298107147217, + "loss": 3.6634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.469600677490234, + "rewards/margins": -0.08229970932006836, + "rewards/rejected": -16.387298583984375, + "step": 8865 + }, + { + "epoch": 0.2989652499241633, + "grad_norm": 24.358808517456055, + "learning_rate": 8.84183514492426e-07, + "logits/chosen": -0.8251449465751648, + "logits/rejected": -1.1270743608474731, + "logps/chosen": -2.065498113632202, + "logps/rejected": -2.5702450275421143, + "loss": 3.3374, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.654979705810547, + "rewards/margins": 5.047468662261963, + "rewards/rejected": -25.70244789123535, + "step": 8870 + }, + { + "epoch": 0.29913377599514646, + "grad_norm": 42.65480041503906, + "learning_rate": 8.8399519911232e-07, + "logits/chosen": -0.8278477787971497, + "logits/rejected": -0.7521225214004517, + "logps/chosen": -2.142841100692749, + "logps/rejected": -2.3546812534332275, + "loss": 2.1474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.42841148376465, + "rewards/margins": 2.1184000968933105, + "rewards/rejected": -23.546810150146484, + "step": 8875 + }, + { + "epoch": 0.2993023020661296, + "grad_norm": 14.598514556884766, + "learning_rate": 8.838067508466044e-07, + "logits/chosen": -0.8302278518676758, + "logits/rejected": -1.1150890588760376, + "logps/chosen": -1.5165884494781494, + "logps/rejected": -1.7599170207977295, + "loss": 1.9003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.165884017944336, + "rewards/margins": 2.43328595161438, + "rewards/rejected": -17.599170684814453, + "step": 8880 + }, + { + "epoch": 0.2994708281371128, + "grad_norm": 18.158178329467773, + "learning_rate": 8.836181697604937e-07, + "logits/chosen": -0.7643877863883972, + "logits/rejected": -0.7783080339431763, + "logps/chosen": -2.0373730659484863, + "logps/rejected": -2.215522289276123, + "loss": 2.5642, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.37373161315918, + "rewards/margins": 1.781489372253418, + "rewards/rejected": -22.155221939086914, + "step": 8885 + }, + { + "epoch": 0.299639354208096, + "grad_norm": 22.55415153503418, + "learning_rate": 8.834294559192483e-07, + "logits/chosen": -0.9910376667976379, + "logits/rejected": -1.1309711933135986, + "logps/chosen": -1.6215696334838867, + "logps/rejected": -1.7703845500946045, + "loss": 2.8646, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.215696334838867, + "rewards/margins": 1.4881494045257568, + "rewards/rejected": -17.703845977783203, + "step": 8890 + }, + { + "epoch": 0.2998078802790792, + "grad_norm": 28.628612518310547, + "learning_rate": 8.832406093881749e-07, + "logits/chosen": -0.7218562364578247, + "logits/rejected": -0.8594409823417664, + "logps/chosen": -2.144453763961792, + "logps/rejected": -2.02648663520813, + "loss": 4.5639, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.444538116455078, + "rewards/margins": -1.1796700954437256, + "rewards/rejected": -20.264867782592773, + "step": 8895 + }, + { + "epoch": 0.29997640635006234, + "grad_norm": 23.925495147705078, + "learning_rate": 8.830516302326257e-07, + "logits/chosen": -0.8188495635986328, + "logits/rejected": -0.9341527223587036, + "logps/chosen": -1.8960866928100586, + "logps/rejected": -1.8661048412322998, + "loss": 3.4802, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.960866928100586, + "rewards/margins": -0.2998184263706207, + "rewards/rejected": -18.661048889160156, + "step": 8900 + }, + { + "epoch": 0.30014493242104556, + "grad_norm": 16.280384063720703, + "learning_rate": 8.828625185179988e-07, + "logits/chosen": -0.48790502548217773, + "logits/rejected": -0.6944053769111633, + "logps/chosen": -1.6479403972625732, + "logps/rejected": -1.7591689825057983, + "loss": 2.2314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.47940444946289, + "rewards/margins": 1.1122846603393555, + "rewards/rejected": -17.591690063476562, + "step": 8905 + }, + { + "epoch": 0.30031345849202873, + "grad_norm": 22.790857315063477, + "learning_rate": 8.826732743097385e-07, + "logits/chosen": -0.9420550465583801, + "logits/rejected": -1.0960060358047485, + "logps/chosen": -1.7473942041397095, + "logps/rejected": -1.692716360092163, + "loss": 3.9335, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.473941802978516, + "rewards/margins": -0.5467766523361206, + "rewards/rejected": -16.927165985107422, + "step": 8910 + }, + { + "epoch": 0.3004819845630119, + "grad_norm": 24.296518325805664, + "learning_rate": 8.824838976733345e-07, + "logits/chosen": -0.6912819147109985, + "logits/rejected": -1.0317418575286865, + "logps/chosen": -2.033799409866333, + "logps/rejected": -2.1355299949645996, + "loss": 2.7434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.337993621826172, + "rewards/margins": 1.0173046588897705, + "rewards/rejected": -21.355297088623047, + "step": 8915 + }, + { + "epoch": 0.30065051063399506, + "grad_norm": 10.289129257202148, + "learning_rate": 8.822943886743229e-07, + "logits/chosen": -1.1432678699493408, + "logits/rejected": -1.317195177078247, + "logps/chosen": -1.761526107788086, + "logps/rejected": -2.0710625648498535, + "loss": 1.9022, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.61526107788086, + "rewards/margins": 3.0953640937805176, + "rewards/rejected": -20.710622787475586, + "step": 8920 + }, + { + "epoch": 0.3008190367049783, + "grad_norm": 27.289064407348633, + "learning_rate": 8.821047473782852e-07, + "logits/chosen": -0.5983952879905701, + "logits/rejected": -0.35027188062667847, + "logps/chosen": -2.5783469676971436, + "logps/rejected": -2.4220213890075684, + "loss": 5.0843, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.783472061157227, + "rewards/margins": -1.563256025314331, + "rewards/rejected": -24.22021484375, + "step": 8925 + }, + { + "epoch": 0.30098756277596145, + "grad_norm": 73.90116882324219, + "learning_rate": 8.819149738508488e-07, + "logits/chosen": -0.7550094127655029, + "logits/rejected": -0.8008283376693726, + "logps/chosen": -2.168519973754883, + "logps/rejected": -2.512540102005005, + "loss": 2.612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.685199737548828, + "rewards/margins": 3.4402008056640625, + "rewards/rejected": -25.12540054321289, + "step": 8930 + }, + { + "epoch": 0.3011560888469446, + "grad_norm": 20.581180572509766, + "learning_rate": 8.817250681576867e-07, + "logits/chosen": -1.3998351097106934, + "logits/rejected": -1.3207590579986572, + "logps/chosen": -1.6048024892807007, + "logps/rejected": -1.691149353981018, + "loss": 2.514, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.048025131225586, + "rewards/margins": 0.8634673357009888, + "rewards/rejected": -16.9114933013916, + "step": 8935 + }, + { + "epoch": 0.3013246149179278, + "grad_norm": 12.386482238769531, + "learning_rate": 8.815350303645179e-07, + "logits/chosen": -0.7268382906913757, + "logits/rejected": -0.9646919369697571, + "logps/chosen": -1.9220527410507202, + "logps/rejected": -2.1499226093292236, + "loss": 2.1157, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.22052574157715, + "rewards/margins": 2.278700113296509, + "rewards/rejected": -21.49922752380371, + "step": 8940 + }, + { + "epoch": 0.301493140988911, + "grad_norm": 48.431854248046875, + "learning_rate": 8.81344860537107e-07, + "logits/chosen": -0.8490222692489624, + "logits/rejected": -0.8382167816162109, + "logps/chosen": -2.010575532913208, + "logps/rejected": -2.0359785556793213, + "loss": 3.4625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.10575294494629, + "rewards/margins": 0.2540292739868164, + "rewards/rejected": -20.359783172607422, + "step": 8945 + }, + { + "epoch": 0.30166166705989417, + "grad_norm": 26.83896255493164, + "learning_rate": 8.811545587412643e-07, + "logits/chosen": -0.7713761329650879, + "logits/rejected": -0.9512740969657898, + "logps/chosen": -1.7185767889022827, + "logps/rejected": -1.8028638362884521, + "loss": 3.4062, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.185766220092773, + "rewards/margins": 0.8428729176521301, + "rewards/rejected": -18.028640747070312, + "step": 8950 + }, + { + "epoch": 0.30183019313087733, + "grad_norm": 22.699316024780273, + "learning_rate": 8.809641250428457e-07, + "logits/chosen": -0.7982644438743591, + "logits/rejected": -0.800157368183136, + "logps/chosen": -1.7635858058929443, + "logps/rejected": -1.7425504922866821, + "loss": 3.3254, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.6358585357666, + "rewards/margins": -0.21035441756248474, + "rewards/rejected": -17.425504684448242, + "step": 8955 + }, + { + "epoch": 0.30199871920186055, + "grad_norm": 16.492294311523438, + "learning_rate": 8.807735595077526e-07, + "logits/chosen": -0.8586235046386719, + "logits/rejected": -0.8732374310493469, + "logps/chosen": -2.2647202014923096, + "logps/rejected": -2.4003055095672607, + "loss": 2.5696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.647199630737305, + "rewards/margins": 1.3558542728424072, + "rewards/rejected": -24.003055572509766, + "step": 8960 + }, + { + "epoch": 0.3021672452728437, + "grad_norm": 32.511531829833984, + "learning_rate": 8.805828622019326e-07, + "logits/chosen": -0.8290532827377319, + "logits/rejected": -0.8825815320014954, + "logps/chosen": -1.861469030380249, + "logps/rejected": -1.8529653549194336, + "loss": 3.225, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.61469078063965, + "rewards/margins": -0.08503618091344833, + "rewards/rejected": -18.529653549194336, + "step": 8965 + }, + { + "epoch": 0.3023357713438269, + "grad_norm": 72.5047378540039, + "learning_rate": 8.803920331913785e-07, + "logits/chosen": -0.8660562634468079, + "logits/rejected": -0.984213650226593, + "logps/chosen": -2.007239580154419, + "logps/rejected": -2.131221055984497, + "loss": 2.8056, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.07239532470703, + "rewards/margins": 1.2398135662078857, + "rewards/rejected": -21.312210083007812, + "step": 8970 + }, + { + "epoch": 0.30250429741481005, + "grad_norm": 26.04867172241211, + "learning_rate": 8.802010725421283e-07, + "logits/chosen": -0.2826697826385498, + "logits/rejected": -0.4146658778190613, + "logps/chosen": -1.6923812627792358, + "logps/rejected": -1.7552837133407593, + "loss": 2.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.923812866210938, + "rewards/margins": 0.6290236711502075, + "rewards/rejected": -17.552837371826172, + "step": 8975 + }, + { + "epoch": 0.3026728234857933, + "grad_norm": 58.66663360595703, + "learning_rate": 8.800099803202663e-07, + "logits/chosen": -0.7005506753921509, + "logits/rejected": -0.7806217670440674, + "logps/chosen": -1.9371159076690674, + "logps/rejected": -2.365093946456909, + "loss": 1.8148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.371158599853516, + "rewards/margins": 4.279781818389893, + "rewards/rejected": -23.65093994140625, + "step": 8980 + }, + { + "epoch": 0.30284134955677644, + "grad_norm": 32.66402816772461, + "learning_rate": 8.79818756591922e-07, + "logits/chosen": -0.546657919883728, + "logits/rejected": -0.6938589811325073, + "logps/chosen": -1.9407997131347656, + "logps/rejected": -2.485011577606201, + "loss": 2.8969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.40799903869629, + "rewards/margins": 5.4421186447143555, + "rewards/rejected": -24.850116729736328, + "step": 8985 + }, + { + "epoch": 0.3030098756277596, + "grad_norm": 25.154067993164062, + "learning_rate": 8.796274014232703e-07, + "logits/chosen": -1.053919792175293, + "logits/rejected": -1.0685478448867798, + "logps/chosen": -1.861169457435608, + "logps/rejected": -1.8835694789886475, + "loss": 3.0481, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.611690521240234, + "rewards/margins": 0.22400188446044922, + "rewards/rejected": -18.835693359375, + "step": 8990 + }, + { + "epoch": 0.30317840169874277, + "grad_norm": 42.444305419921875, + "learning_rate": 8.794359148805316e-07, + "logits/chosen": -0.8252202868461609, + "logits/rejected": -0.9097492098808289, + "logps/chosen": -1.8112564086914062, + "logps/rejected": -1.9504534006118774, + "loss": 2.1677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.112564086914062, + "rewards/margins": 1.391969084739685, + "rewards/rejected": -19.504533767700195, + "step": 8995 + }, + { + "epoch": 0.303346927769726, + "grad_norm": 21.111835479736328, + "learning_rate": 8.79244297029972e-07, + "logits/chosen": -1.1233201026916504, + "logits/rejected": -1.2216030359268188, + "logps/chosen": -1.5746749639511108, + "logps/rejected": -1.7738206386566162, + "loss": 1.8468, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.746749877929688, + "rewards/margins": 1.9914575815200806, + "rewards/rejected": -17.738208770751953, + "step": 9000 + }, + { + "epoch": 0.30351545384070916, + "grad_norm": 41.230464935302734, + "learning_rate": 8.790525479379027e-07, + "logits/chosen": -0.6682386994361877, + "logits/rejected": -0.846937358379364, + "logps/chosen": -1.9128681421279907, + "logps/rejected": -1.9116519689559937, + "loss": 3.1896, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.128681182861328, + "rewards/margins": -0.0121612548828125, + "rewards/rejected": -19.11652183532715, + "step": 9005 + }, + { + "epoch": 0.3036839799116923, + "grad_norm": 21.60496711730957, + "learning_rate": 8.788606676706808e-07, + "logits/chosen": -0.7037097215652466, + "logits/rejected": -0.7241389751434326, + "logps/chosen": -1.754116415977478, + "logps/rejected": -1.8600555658340454, + "loss": 2.4375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.54116439819336, + "rewards/margins": 1.0593905448913574, + "rewards/rejected": -18.600553512573242, + "step": 9010 + }, + { + "epoch": 0.30385250598267555, + "grad_norm": 29.23500633239746, + "learning_rate": 8.786686562947083e-07, + "logits/chosen": -0.9285395741462708, + "logits/rejected": -0.8670550584793091, + "logps/chosen": -1.9415439367294312, + "logps/rejected": -2.041487216949463, + "loss": 2.8543, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.41543960571289, + "rewards/margins": 0.9994330406188965, + "rewards/rejected": -20.414873123168945, + "step": 9015 + }, + { + "epoch": 0.3040210320536587, + "grad_norm": 107.34830474853516, + "learning_rate": 8.784765138764327e-07, + "logits/chosen": -0.4858947694301605, + "logits/rejected": -0.3548693060874939, + "logps/chosen": -2.049286365509033, + "logps/rejected": -2.0892035961151123, + "loss": 3.5124, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.492860794067383, + "rewards/margins": 0.3991745114326477, + "rewards/rejected": -20.89203453063965, + "step": 9020 + }, + { + "epoch": 0.3041895581246419, + "grad_norm": 39.668636322021484, + "learning_rate": 8.782842404823472e-07, + "logits/chosen": -0.9904440641403198, + "logits/rejected": -0.9966761469841003, + "logps/chosen": -2.305406332015991, + "logps/rejected": -2.2075486183166504, + "loss": 4.0946, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.05406379699707, + "rewards/margins": -0.9785760045051575, + "rewards/rejected": -22.075489044189453, + "step": 9025 + }, + { + "epoch": 0.30435808419562504, + "grad_norm": 14.98969554901123, + "learning_rate": 8.780918361789897e-07, + "logits/chosen": -0.7242355942726135, + "logits/rejected": -0.6773864030838013, + "logps/chosen": -2.184032440185547, + "logps/rejected": -2.7087905406951904, + "loss": 2.0529, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.8403263092041, + "rewards/margins": 5.24758243560791, + "rewards/rejected": -27.087909698486328, + "step": 9030 + }, + { + "epoch": 0.30452661026660827, + "grad_norm": 18.60861587524414, + "learning_rate": 8.778993010329441e-07, + "logits/chosen": -0.8874000310897827, + "logits/rejected": -1.0219591856002808, + "logps/chosen": -2.139561176300049, + "logps/rejected": -2.437269926071167, + "loss": 2.2432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.395610809326172, + "rewards/margins": 2.977090358734131, + "rewards/rejected": -24.37270164489746, + "step": 9035 + }, + { + "epoch": 0.30469513633759143, + "grad_norm": 16.233108520507812, + "learning_rate": 8.777066351108388e-07, + "logits/chosen": -0.7690008878707886, + "logits/rejected": -1.0270230770111084, + "logps/chosen": -1.7827821969985962, + "logps/rejected": -1.8636226654052734, + "loss": 2.5837, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.827821731567383, + "rewards/margins": 0.8084052205085754, + "rewards/rejected": -18.636226654052734, + "step": 9040 + }, + { + "epoch": 0.3048636624085746, + "grad_norm": 42.710716247558594, + "learning_rate": 8.775138384793483e-07, + "logits/chosen": -0.7992104887962341, + "logits/rejected": -0.7307273149490356, + "logps/chosen": -1.7060863971710205, + "logps/rejected": -1.6231950521469116, + "loss": 4.0294, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.060863494873047, + "rewards/margins": -0.8289132118225098, + "rewards/rejected": -16.231948852539062, + "step": 9045 + }, + { + "epoch": 0.30503218847955776, + "grad_norm": 22.898052215576172, + "learning_rate": 8.773209112051918e-07, + "logits/chosen": -1.1620280742645264, + "logits/rejected": -1.0759422779083252, + "logps/chosen": -1.6986467838287354, + "logps/rejected": -1.8278793096542358, + "loss": 2.1036, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.986465454101562, + "rewards/margins": 1.2923271656036377, + "rewards/rejected": -18.27879524230957, + "step": 9050 + }, + { + "epoch": 0.305200714550541, + "grad_norm": 30.709197998046875, + "learning_rate": 8.771278533551338e-07, + "logits/chosen": -0.6249741315841675, + "logits/rejected": -0.6515600085258484, + "logps/chosen": -1.6547349691390991, + "logps/rejected": -1.5639965534210205, + "loss": 4.2284, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.547351837158203, + "rewards/margins": -0.9073851704597473, + "rewards/rejected": -15.639966011047363, + "step": 9055 + }, + { + "epoch": 0.30536924062152415, + "grad_norm": 21.73700714111328, + "learning_rate": 8.769346649959839e-07, + "logits/chosen": -0.7830342650413513, + "logits/rejected": -0.6824518442153931, + "logps/chosen": -1.5401135683059692, + "logps/rejected": -1.622176170349121, + "loss": 2.4441, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.401135444641113, + "rewards/margins": 0.820626437664032, + "rewards/rejected": -16.22176170349121, + "step": 9060 + }, + { + "epoch": 0.3055377666925073, + "grad_norm": 29.96711540222168, + "learning_rate": 8.76741346194597e-07, + "logits/chosen": -1.2152740955352783, + "logits/rejected": -1.201468586921692, + "logps/chosen": -1.8606659173965454, + "logps/rejected": -1.9004096984863281, + "loss": 2.8449, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.606660842895508, + "rewards/margins": 0.39743900299072266, + "rewards/rejected": -19.004098892211914, + "step": 9065 + }, + { + "epoch": 0.30570629276349054, + "grad_norm": 38.4486083984375, + "learning_rate": 8.765478970178733e-07, + "logits/chosen": -0.9709786176681519, + "logits/rejected": -1.066765546798706, + "logps/chosen": -1.6975494623184204, + "logps/rejected": -1.7299559116363525, + "loss": 2.8402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.975492477416992, + "rewards/margins": 0.3240653872489929, + "rewards/rejected": -17.299556732177734, + "step": 9070 + }, + { + "epoch": 0.3058748188344737, + "grad_norm": 0.006397194694727659, + "learning_rate": 8.763543175327579e-07, + "logits/chosen": -0.5968400239944458, + "logits/rejected": -0.7210376858711243, + "logps/chosen": -1.7113231420516968, + "logps/rejected": -1.9671258926391602, + "loss": 2.5714, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.113231658935547, + "rewards/margins": 2.55802583694458, + "rewards/rejected": -19.6712589263916, + "step": 9075 + }, + { + "epoch": 0.30604334490545687, + "grad_norm": 34.90524673461914, + "learning_rate": 8.761606078062409e-07, + "logits/chosen": -0.5262940526008606, + "logits/rejected": -0.9337191581726074, + "logps/chosen": -1.9339056015014648, + "logps/rejected": -2.1920578479766846, + "loss": 2.1257, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.33905601501465, + "rewards/margins": 2.5815205574035645, + "rewards/rejected": -21.920576095581055, + "step": 9080 + }, + { + "epoch": 0.30621187097644004, + "grad_norm": 20.485763549804688, + "learning_rate": 8.759667679053576e-07, + "logits/chosen": -0.9563091397285461, + "logits/rejected": -1.1899265050888062, + "logps/chosen": -1.9684795141220093, + "logps/rejected": -2.0848593711853027, + "loss": 2.2998, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.684795379638672, + "rewards/margins": 1.1637986898422241, + "rewards/rejected": -20.848594665527344, + "step": 9085 + }, + { + "epoch": 0.30638039704742326, + "grad_norm": 18.3884220123291, + "learning_rate": 8.757727978971885e-07, + "logits/chosen": -0.585478663444519, + "logits/rejected": -0.7393139600753784, + "logps/chosen": -1.8395030498504639, + "logps/rejected": -1.911931037902832, + "loss": 2.6764, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.395029067993164, + "rewards/margins": 0.7242798805236816, + "rewards/rejected": -19.11931037902832, + "step": 9090 + }, + { + "epoch": 0.3065489231184064, + "grad_norm": 23.539100646972656, + "learning_rate": 8.755786978488589e-07, + "logits/chosen": -0.5933682918548584, + "logits/rejected": -0.8263611793518066, + "logps/chosen": -1.4465597867965698, + "logps/rejected": -1.3904832601547241, + "loss": 3.7162, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -14.465597152709961, + "rewards/margins": -0.5607655644416809, + "rewards/rejected": -13.904830932617188, + "step": 9095 + }, + { + "epoch": 0.3067174491893896, + "grad_norm": 20.3573055267334, + "learning_rate": 8.753844678275392e-07, + "logits/chosen": -1.2448726892471313, + "logits/rejected": -1.4484832286834717, + "logps/chosen": -1.6800048351287842, + "logps/rejected": -1.8209593296051025, + "loss": 3.3054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.800048828125, + "rewards/margins": 1.4095432758331299, + "rewards/rejected": -18.209592819213867, + "step": 9100 + }, + { + "epoch": 0.30688597526037276, + "grad_norm": 4.400402545928955, + "learning_rate": 8.751901079004447e-07, + "logits/chosen": -0.7810710072517395, + "logits/rejected": -0.879501223564148, + "logps/chosen": -1.8006995916366577, + "logps/rejected": -2.0738296508789062, + "loss": 1.9631, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.006996154785156, + "rewards/margins": 2.7313015460968018, + "rewards/rejected": -20.738298416137695, + "step": 9105 + }, + { + "epoch": 0.307054501331356, + "grad_norm": 51.79671096801758, + "learning_rate": 8.749956181348359e-07, + "logits/chosen": -0.6710806488990784, + "logits/rejected": -0.8282930254936218, + "logps/chosen": -2.1655375957489014, + "logps/rejected": -2.0684962272644043, + "loss": 4.0727, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.655378341674805, + "rewards/margins": -0.970413327217102, + "rewards/rejected": -20.68496322631836, + "step": 9110 + }, + { + "epoch": 0.30722302740233914, + "grad_norm": 15.096877098083496, + "learning_rate": 8.748009985980177e-07, + "logits/chosen": -1.044159173965454, + "logits/rejected": -1.160940408706665, + "logps/chosen": -1.7486212253570557, + "logps/rejected": -1.8441343307495117, + "loss": 3.0722, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.4862117767334, + "rewards/margins": 0.9551332592964172, + "rewards/rejected": -18.44134521484375, + "step": 9115 + }, + { + "epoch": 0.3073915534733223, + "grad_norm": 12.892077445983887, + "learning_rate": 8.746062493573405e-07, + "logits/chosen": -0.6979146599769592, + "logits/rejected": -0.6973038911819458, + "logps/chosen": -1.865674614906311, + "logps/rejected": -2.052708625793457, + "loss": 1.7233, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.65674591064453, + "rewards/margins": 1.8703396320343018, + "rewards/rejected": -20.527084350585938, + "step": 9120 + }, + { + "epoch": 0.30756007954430553, + "grad_norm": 18.94183349609375, + "learning_rate": 8.744113704801994e-07, + "logits/chosen": -0.970826268196106, + "logits/rejected": -1.3497257232666016, + "logps/chosen": -1.6942193508148193, + "logps/rejected": -1.9070132970809937, + "loss": 2.5195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.94219398498535, + "rewards/margins": 2.127938747406006, + "rewards/rejected": -19.070133209228516, + "step": 9125 + }, + { + "epoch": 0.3077286056152887, + "grad_norm": 76.88709259033203, + "learning_rate": 8.742163620340342e-07, + "logits/chosen": -0.3977760374546051, + "logits/rejected": -0.5327490568161011, + "logps/chosen": -2.3799424171447754, + "logps/rejected": -2.4491336345672607, + "loss": 3.1359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.799423217773438, + "rewards/margins": 0.6919113397598267, + "rewards/rejected": -24.4913330078125, + "step": 9130 + }, + { + "epoch": 0.30789713168627186, + "grad_norm": 17.748449325561523, + "learning_rate": 8.740212240863295e-07, + "logits/chosen": -1.0044649839401245, + "logits/rejected": -1.0892714262008667, + "logps/chosen": -1.5947548151016235, + "logps/rejected": -1.6854908466339111, + "loss": 3.3036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.947547912597656, + "rewards/margins": 0.9073607325553894, + "rewards/rejected": -16.854907989501953, + "step": 9135 + }, + { + "epoch": 0.30806565775725503, + "grad_norm": 20.412609100341797, + "learning_rate": 8.738259567046151e-07, + "logits/chosen": -0.45204153656959534, + "logits/rejected": -0.5009742975234985, + "logps/chosen": -1.8742802143096924, + "logps/rejected": -1.9355961084365845, + "loss": 2.7849, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.742801666259766, + "rewards/margins": 0.6131598353385925, + "rewards/rejected": -19.355960845947266, + "step": 9140 + }, + { + "epoch": 0.30823418382823825, + "grad_norm": 16.282869338989258, + "learning_rate": 8.736305599564652e-07, + "logits/chosen": -0.6326123476028442, + "logits/rejected": -0.582168459892273, + "logps/chosen": -1.7905687093734741, + "logps/rejected": -2.0051612854003906, + "loss": 1.6708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.90568733215332, + "rewards/margins": 2.1459248065948486, + "rewards/rejected": -20.051612854003906, + "step": 9145 + }, + { + "epoch": 0.3084027098992214, + "grad_norm": 22.959726333618164, + "learning_rate": 8.734350339094989e-07, + "logits/chosen": -0.7373453378677368, + "logits/rejected": -0.9396425485610962, + "logps/chosen": -1.7323005199432373, + "logps/rejected": -1.8902448415756226, + "loss": 1.8278, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.3230037689209, + "rewards/margins": 1.579443097114563, + "rewards/rejected": -18.902446746826172, + "step": 9150 + }, + { + "epoch": 0.3085712359702046, + "grad_norm": 26.068431854248047, + "learning_rate": 8.732393786313803e-07, + "logits/chosen": -0.8269859552383423, + "logits/rejected": -0.8830499649047852, + "logps/chosen": -1.9429603815078735, + "logps/rejected": -1.9990386962890625, + "loss": 3.1581, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.42960548400879, + "rewards/margins": 0.5607836842536926, + "rewards/rejected": -19.990388870239258, + "step": 9155 + }, + { + "epoch": 0.30873976204118775, + "grad_norm": 14.898977279663086, + "learning_rate": 8.730435941898175e-07, + "logits/chosen": -1.2051137685775757, + "logits/rejected": -1.2535730600357056, + "logps/chosen": -2.182587146759033, + "logps/rejected": -2.2140278816223145, + "loss": 3.71, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.825870513916016, + "rewards/margins": 0.31440791487693787, + "rewards/rejected": -22.140277862548828, + "step": 9160 + }, + { + "epoch": 0.30890828811217097, + "grad_norm": 32.99822998046875, + "learning_rate": 8.728476806525644e-07, + "logits/chosen": -0.8135523796081543, + "logits/rejected": -0.8619807362556458, + "logps/chosen": -1.740747094154358, + "logps/rejected": -1.7622601985931396, + "loss": 2.9138, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.407470703125, + "rewards/margins": 0.21513119339942932, + "rewards/rejected": -17.622602462768555, + "step": 9165 + }, + { + "epoch": 0.30907681418315414, + "grad_norm": 29.103960037231445, + "learning_rate": 8.726516380874185e-07, + "logits/chosen": -0.764143705368042, + "logits/rejected": -0.7308832406997681, + "logps/chosen": -2.05553936958313, + "logps/rejected": -1.9418214559555054, + "loss": 4.1841, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.55539321899414, + "rewards/margins": -1.1371777057647705, + "rewards/rejected": -19.418216705322266, + "step": 9170 + }, + { + "epoch": 0.3092453402541373, + "grad_norm": 28.63543128967285, + "learning_rate": 8.724554665622226e-07, + "logits/chosen": -0.6835757493972778, + "logits/rejected": -0.8575420379638672, + "logps/chosen": -1.7750861644744873, + "logps/rejected": -1.815028429031372, + "loss": 3.2284, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.750864028930664, + "rewards/margins": 0.3994219899177551, + "rewards/rejected": -18.150283813476562, + "step": 9175 + }, + { + "epoch": 0.3094138663251205, + "grad_norm": 0.2285619080066681, + "learning_rate": 8.722591661448637e-07, + "logits/chosen": -0.49821311235427856, + "logits/rejected": -0.590837836265564, + "logps/chosen": -1.7573721408843994, + "logps/rejected": -2.0172266960144043, + "loss": 2.2316, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.573719024658203, + "rewards/margins": 2.5985465049743652, + "rewards/rejected": -20.17226791381836, + "step": 9180 + }, + { + "epoch": 0.3095823923961037, + "grad_norm": 41.559940338134766, + "learning_rate": 8.72062736903274e-07, + "logits/chosen": -0.7645959854125977, + "logits/rejected": -0.7926516532897949, + "logps/chosen": -1.973306655883789, + "logps/rejected": -2.1764540672302246, + "loss": 2.8765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.73306655883789, + "rewards/margins": 2.0314762592315674, + "rewards/rejected": -21.764545440673828, + "step": 9185 + }, + { + "epoch": 0.30975091846708686, + "grad_norm": 18.116069793701172, + "learning_rate": 8.718661789054297e-07, + "logits/chosen": -0.525948703289032, + "logits/rejected": -0.5813354253768921, + "logps/chosen": -1.958701491355896, + "logps/rejected": -2.065213203430176, + "loss": 2.2873, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.587017059326172, + "rewards/margins": 1.0651161670684814, + "rewards/rejected": -20.65213394165039, + "step": 9190 + }, + { + "epoch": 0.30991944453807, + "grad_norm": 27.396312713623047, + "learning_rate": 8.716694922193517e-07, + "logits/chosen": -1.0580114126205444, + "logits/rejected": -1.1286802291870117, + "logps/chosen": -1.9167063236236572, + "logps/rejected": -1.9518272876739502, + "loss": 3.0418, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.167064666748047, + "rewards/margins": 0.35120925307273865, + "rewards/rejected": -19.518274307250977, + "step": 9195 + }, + { + "epoch": 0.31008797060905324, + "grad_norm": 26.48879623413086, + "learning_rate": 8.714726769131058e-07, + "logits/chosen": -0.952828049659729, + "logits/rejected": -1.0840023756027222, + "logps/chosen": -1.8382370471954346, + "logps/rejected": -2.317875385284424, + "loss": 2.8347, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.382369995117188, + "rewards/margins": 4.796382904052734, + "rewards/rejected": -23.178752899169922, + "step": 9200 + }, + { + "epoch": 0.31008797060905324, + "eval_logits/chosen": -1.1909996271133423, + "eval_logits/rejected": -1.25919508934021, + "eval_logps/chosen": -1.8463243246078491, + "eval_logps/rejected": -1.9093353748321533, + "eval_loss": 3.115633010864258, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.46324348449707, + "eval_rewards/margins": 0.6301097273826599, + "eval_rewards/rejected": -19.093353271484375, + "eval_runtime": 12.9029, + "eval_samples_per_second": 7.75, + "eval_steps_per_second": 1.938, + "step": 9200 + }, + { + "epoch": 0.3102564966800364, + "grad_norm": 1.0767076015472412, + "learning_rate": 8.71275733054802e-07, + "logits/chosen": -0.5575070381164551, + "logits/rejected": -0.8997724652290344, + "logps/chosen": -2.1544461250305176, + "logps/rejected": -2.622394561767578, + "loss": 1.0911, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.54446029663086, + "rewards/margins": 4.679484844207764, + "rewards/rejected": -26.22394371032715, + "step": 9205 + }, + { + "epoch": 0.3104250227510196, + "grad_norm": 21.940515518188477, + "learning_rate": 8.710786607125944e-07, + "logits/chosen": -0.535169780254364, + "logits/rejected": -0.6544603109359741, + "logps/chosen": -2.1633615493774414, + "logps/rejected": -2.1183650493621826, + "loss": 3.796, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.633617401123047, + "rewards/margins": -0.44996652007102966, + "rewards/rejected": -21.18364906311035, + "step": 9210 + }, + { + "epoch": 0.31059354882200274, + "grad_norm": 20.01294708251953, + "learning_rate": 8.708814599546823e-07, + "logits/chosen": -0.9173457026481628, + "logits/rejected": -0.8900350332260132, + "logps/chosen": -1.5918538570404053, + "logps/rejected": -1.5473552942276, + "loss": 3.5273, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.918538093566895, + "rewards/margins": -0.4449850916862488, + "rewards/rejected": -15.473551750183105, + "step": 9215 + }, + { + "epoch": 0.31076207489298596, + "grad_norm": 31.564699172973633, + "learning_rate": 8.706841308493091e-07, + "logits/chosen": -0.7942282557487488, + "logits/rejected": -0.8029106855392456, + "logps/chosen": -2.029221773147583, + "logps/rejected": -2.0451905727386475, + "loss": 3.3308, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.292217254638672, + "rewards/margins": 0.1596868485212326, + "rewards/rejected": -20.451906204223633, + "step": 9220 + }, + { + "epoch": 0.31093060096396913, + "grad_norm": 22.814817428588867, + "learning_rate": 8.704866734647624e-07, + "logits/chosen": -1.0109196901321411, + "logits/rejected": -0.7166153788566589, + "logps/chosen": -1.6865415573120117, + "logps/rejected": -1.6409651041030884, + "loss": 3.6558, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.865415573120117, + "rewards/margins": -0.45576468110084534, + "rewards/rejected": -16.409652709960938, + "step": 9225 + }, + { + "epoch": 0.3110991270349523, + "grad_norm": 21.77043342590332, + "learning_rate": 8.702890878693749e-07, + "logits/chosen": -0.66042160987854, + "logits/rejected": -0.6365646123886108, + "logps/chosen": -1.6145178079605103, + "logps/rejected": -1.8034298419952393, + "loss": 2.3972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.145177841186523, + "rewards/margins": 1.8891197443008423, + "rewards/rejected": -18.034297943115234, + "step": 9230 + }, + { + "epoch": 0.3112676531059355, + "grad_norm": 43.32010269165039, + "learning_rate": 8.700913741315228e-07, + "logits/chosen": -0.5597115755081177, + "logits/rejected": -0.6904144883155823, + "logps/chosen": -2.2811083793640137, + "logps/rejected": -2.2438998222351074, + "loss": 3.6817, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.811084747314453, + "rewards/margins": -0.37208643555641174, + "rewards/rejected": -22.438995361328125, + "step": 9235 + }, + { + "epoch": 0.3114361791769187, + "grad_norm": 28.244930267333984, + "learning_rate": 8.69893532319627e-07, + "logits/chosen": -0.62388676404953, + "logits/rejected": -0.6791104078292847, + "logps/chosen": -1.8690904378890991, + "logps/rejected": -1.922142744064331, + "loss": 2.6813, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.690906524658203, + "rewards/margins": 0.5305234789848328, + "rewards/rejected": -19.22142791748047, + "step": 9240 + }, + { + "epoch": 0.31160470524790185, + "grad_norm": 18.90542984008789, + "learning_rate": 8.696955625021531e-07, + "logits/chosen": -0.8645426034927368, + "logits/rejected": -1.0093305110931396, + "logps/chosen": -1.7112518548965454, + "logps/rejected": -1.8172565698623657, + "loss": 3.6309, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.112520217895508, + "rewards/margins": 1.060046672821045, + "rewards/rejected": -18.172565460205078, + "step": 9245 + }, + { + "epoch": 0.311773231318885, + "grad_norm": 22.444063186645508, + "learning_rate": 8.694974647476103e-07, + "logits/chosen": -0.9879194498062134, + "logits/rejected": -1.0465924739837646, + "logps/chosen": -1.9798351526260376, + "logps/rejected": -2.0943007469177246, + "loss": 2.4644, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.798351287841797, + "rewards/margins": 1.1446588039398193, + "rewards/rejected": -20.943008422851562, + "step": 9250 + }, + { + "epoch": 0.31194175738986823, + "grad_norm": 19.90350341796875, + "learning_rate": 8.692992391245526e-07, + "logits/chosen": -0.7082827091217041, + "logits/rejected": -0.8607162237167358, + "logps/chosen": -1.9293746948242188, + "logps/rejected": -2.0852489471435547, + "loss": 3.0405, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.293745040893555, + "rewards/margins": 1.5587440729141235, + "rewards/rejected": -20.852489471435547, + "step": 9255 + }, + { + "epoch": 0.3121102834608514, + "grad_norm": 40.98049545288086, + "learning_rate": 8.69100885701578e-07, + "logits/chosen": -0.8298488855361938, + "logits/rejected": -0.9287115931510925, + "logps/chosen": -2.2203915119171143, + "logps/rejected": -2.493387460708618, + "loss": 1.8449, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.20391082763672, + "rewards/margins": 2.7299604415893555, + "rewards/rejected": -24.93387222290039, + "step": 9260 + }, + { + "epoch": 0.31227880953183457, + "grad_norm": 45.79103088378906, + "learning_rate": 8.689024045473289e-07, + "logits/chosen": -0.9122930765151978, + "logits/rejected": -0.9798024296760559, + "logps/chosen": -1.878292441368103, + "logps/rejected": -1.9426196813583374, + "loss": 3.3604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.78292465209961, + "rewards/margins": 0.643271803855896, + "rewards/rejected": -19.426197052001953, + "step": 9265 + }, + { + "epoch": 0.31244733560281773, + "grad_norm": 98.69233703613281, + "learning_rate": 8.687037957304916e-07, + "logits/chosen": -0.6158634424209595, + "logits/rejected": -0.8452926874160767, + "logps/chosen": -2.1620185375213623, + "logps/rejected": -2.1422224044799805, + "loss": 4.0814, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.620187759399414, + "rewards/margins": -0.19796237349510193, + "rewards/rejected": -21.422225952148438, + "step": 9270 + }, + { + "epoch": 0.31261586167380095, + "grad_norm": 27.064868927001953, + "learning_rate": 8.685050593197974e-07, + "logits/chosen": -0.3698921203613281, + "logits/rejected": -0.5817358493804932, + "logps/chosen": -1.7380319833755493, + "logps/rejected": -1.8976764678955078, + "loss": 2.137, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.380319595336914, + "rewards/margins": 1.596444010734558, + "rewards/rejected": -18.976764678955078, + "step": 9275 + }, + { + "epoch": 0.3127843877447841, + "grad_norm": 46.59791564941406, + "learning_rate": 8.683061953840203e-07, + "logits/chosen": -1.0455105304718018, + "logits/rejected": -1.0464041233062744, + "logps/chosen": -1.9930102825164795, + "logps/rejected": -1.9345165491104126, + "loss": 3.6931, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.930103302001953, + "rewards/margins": -0.5849382281303406, + "rewards/rejected": -19.345165252685547, + "step": 9280 + }, + { + "epoch": 0.3129529138157673, + "grad_norm": 17.088973999023438, + "learning_rate": 8.681072039919797e-07, + "logits/chosen": -1.1266381740570068, + "logits/rejected": -0.9821329116821289, + "logps/chosen": -2.0811820030212402, + "logps/rejected": -2.1210179328918457, + "loss": 3.321, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.811819076538086, + "rewards/margins": 0.39835816621780396, + "rewards/rejected": -21.210176467895508, + "step": 9285 + }, + { + "epoch": 0.3131214398867505, + "grad_norm": 27.47848129272461, + "learning_rate": 8.679080852125388e-07, + "logits/chosen": -0.7783876657485962, + "logits/rejected": -0.9243799448013306, + "logps/chosen": -2.048316240310669, + "logps/rejected": -2.2565720081329346, + "loss": 2.8541, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.48316192626953, + "rewards/margins": 2.08255934715271, + "rewards/rejected": -22.565723419189453, + "step": 9290 + }, + { + "epoch": 0.3132899659577337, + "grad_norm": 36.77070617675781, + "learning_rate": 8.677088391146045e-07, + "logits/chosen": -0.9123884439468384, + "logits/rejected": -1.1769945621490479, + "logps/chosen": -1.8474833965301514, + "logps/rejected": -1.9363610744476318, + "loss": 2.6286, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.474834442138672, + "rewards/margins": 0.8887761831283569, + "rewards/rejected": -19.363611221313477, + "step": 9295 + }, + { + "epoch": 0.31345849202871684, + "grad_norm": 20.16952896118164, + "learning_rate": 8.675094657671281e-07, + "logits/chosen": -0.8153167963027954, + "logits/rejected": -0.8752968907356262, + "logps/chosen": -2.131622791290283, + "logps/rejected": -2.1669955253601074, + "loss": 3.2583, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.316226959228516, + "rewards/margins": 0.3537294268608093, + "rewards/rejected": -21.66995620727539, + "step": 9300 + }, + { + "epoch": 0.3136270180997, + "grad_norm": 73.12217712402344, + "learning_rate": 8.673099652391049e-07, + "logits/chosen": -0.43089723587036133, + "logits/rejected": -0.40491873025894165, + "logps/chosen": -2.151824474334717, + "logps/rejected": -2.3143529891967773, + "loss": 2.2401, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.51824378967285, + "rewards/margins": 1.6252834796905518, + "rewards/rejected": -23.143529891967773, + "step": 9305 + }, + { + "epoch": 0.3137955441706832, + "grad_norm": 27.147113800048828, + "learning_rate": 8.671103375995743e-07, + "logits/chosen": -0.7571858167648315, + "logits/rejected": -0.8392313122749329, + "logps/chosen": -1.9569320678710938, + "logps/rejected": -1.8534408807754517, + "loss": 4.0802, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.569320678710938, + "rewards/margins": -1.0349119901657104, + "rewards/rejected": -18.534408569335938, + "step": 9310 + }, + { + "epoch": 0.3139640702416664, + "grad_norm": 29.20941734313965, + "learning_rate": 8.669105829176193e-07, + "logits/chosen": -1.1783753633499146, + "logits/rejected": -1.204984188079834, + "logps/chosen": -1.939965844154358, + "logps/rejected": -2.06992506980896, + "loss": 2.4982, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.399660110473633, + "rewards/margins": 1.2995918989181519, + "rewards/rejected": -20.69925308227539, + "step": 9315 + }, + { + "epoch": 0.31413259631264956, + "grad_norm": 24.12032699584961, + "learning_rate": 8.667107012623674e-07, + "logits/chosen": -0.9502077102661133, + "logits/rejected": -1.0443466901779175, + "logps/chosen": -2.350555896759033, + "logps/rejected": -2.3637421131134033, + "loss": 3.0449, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.50555992126465, + "rewards/margins": 0.1318587362766266, + "rewards/rejected": -23.637420654296875, + "step": 9320 + }, + { + "epoch": 0.3143011223836327, + "grad_norm": 31.85713768005371, + "learning_rate": 8.665106927029894e-07, + "logits/chosen": -0.6978214383125305, + "logits/rejected": -0.8441342115402222, + "logps/chosen": -1.7820132970809937, + "logps/rejected": -2.2420763969421387, + "loss": 2.1663, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.820133209228516, + "rewards/margins": 4.6006340980529785, + "rewards/rejected": -22.420766830444336, + "step": 9325 + }, + { + "epoch": 0.31446964845461595, + "grad_norm": 1.8817801475524902, + "learning_rate": 8.663105573087007e-07, + "logits/chosen": -0.695899486541748, + "logits/rejected": -0.8433302044868469, + "logps/chosen": -1.9872623682022095, + "logps/rejected": -2.4130704402923584, + "loss": 1.7234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.872623443603516, + "rewards/margins": 4.258082866668701, + "rewards/rejected": -24.130704879760742, + "step": 9330 + }, + { + "epoch": 0.3146381745255991, + "grad_norm": 15.969996452331543, + "learning_rate": 8.661102951487601e-07, + "logits/chosen": -1.011237382888794, + "logits/rejected": -1.1954885721206665, + "logps/chosen": -1.8208835124969482, + "logps/rejected": -1.9458087682724, + "loss": 2.7697, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.208837509155273, + "rewards/margins": 1.2492512464523315, + "rewards/rejected": -19.458087921142578, + "step": 9335 + }, + { + "epoch": 0.3148067005965823, + "grad_norm": 44.60512161254883, + "learning_rate": 8.659099062924706e-07, + "logits/chosen": -0.6627892851829529, + "logits/rejected": -0.7953187227249146, + "logps/chosen": -1.8654075860977173, + "logps/rejected": -1.9347187280654907, + "loss": 2.8071, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.65407371520996, + "rewards/margins": 0.6931111216545105, + "rewards/rejected": -19.347187042236328, + "step": 9340 + }, + { + "epoch": 0.3149752266675655, + "grad_norm": 82.3418197631836, + "learning_rate": 8.657093908091788e-07, + "logits/chosen": -1.0475388765335083, + "logits/rejected": -1.3520863056182861, + "logps/chosen": -1.7849271297454834, + "logps/rejected": -1.921607255935669, + "loss": 2.2732, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.84926986694336, + "rewards/margins": 1.3668019771575928, + "rewards/rejected": -19.216073989868164, + "step": 9345 + }, + { + "epoch": 0.31514375273854867, + "grad_norm": 42.31816864013672, + "learning_rate": 8.655087487682753e-07, + "logits/chosen": -0.971020519733429, + "logits/rejected": -0.9450328946113586, + "logps/chosen": -1.6906359195709229, + "logps/rejected": -1.7991920709609985, + "loss": 2.3246, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.906360626220703, + "rewards/margins": 1.0855603218078613, + "rewards/rejected": -17.991918563842773, + "step": 9350 + }, + { + "epoch": 0.31531227880953183, + "grad_norm": 25.02703857421875, + "learning_rate": 8.653079802391943e-07, + "logits/chosen": -1.0409469604492188, + "logits/rejected": -1.2706401348114014, + "logps/chosen": -1.9263372421264648, + "logps/rejected": -1.8309767246246338, + "loss": 4.0889, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.26337242126465, + "rewards/margins": -0.9536054730415344, + "rewards/rejected": -18.30976676940918, + "step": 9355 + }, + { + "epoch": 0.315480804880515, + "grad_norm": 31.79754066467285, + "learning_rate": 8.651070852914137e-07, + "logits/chosen": -0.7451499104499817, + "logits/rejected": -0.7770187258720398, + "logps/chosen": -1.7138770818710327, + "logps/rejected": -1.7701056003570557, + "loss": 2.5513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.13877296447754, + "rewards/margins": 0.5622828602790833, + "rewards/rejected": -17.7010555267334, + "step": 9360 + }, + { + "epoch": 0.3156493309514982, + "grad_norm": 12.896461486816406, + "learning_rate": 8.649060639944557e-07, + "logits/chosen": -0.5657856464385986, + "logits/rejected": -0.6135199069976807, + "logps/chosen": -2.1283740997314453, + "logps/rejected": -2.3975729942321777, + "loss": 1.6793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.283740997314453, + "rewards/margins": 2.691988945007324, + "rewards/rejected": -23.97572898864746, + "step": 9365 + }, + { + "epoch": 0.3158178570224814, + "grad_norm": 27.075576782226562, + "learning_rate": 8.647049164178857e-07, + "logits/chosen": -1.006801724433899, + "logits/rejected": -1.1020275354385376, + "logps/chosen": -1.8209720849990845, + "logps/rejected": -1.8261454105377197, + "loss": 3.1815, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.209720611572266, + "rewards/margins": 0.05173378065228462, + "rewards/rejected": -18.26145362854004, + "step": 9370 + }, + { + "epoch": 0.31598638309346455, + "grad_norm": 40.327659606933594, + "learning_rate": 8.645036426313128e-07, + "logits/chosen": -1.0180786848068237, + "logits/rejected": -1.0732612609863281, + "logps/chosen": -2.100003719329834, + "logps/rejected": -2.201460599899292, + "loss": 2.8745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.000036239624023, + "rewards/margins": 1.0145716667175293, + "rewards/rejected": -22.014606475830078, + "step": 9375 + }, + { + "epoch": 0.3161549091644477, + "grad_norm": 33.58378219604492, + "learning_rate": 8.643022427043901e-07, + "logits/chosen": -0.5249465703964233, + "logits/rejected": -0.6569808125495911, + "logps/chosen": -2.062403440475464, + "logps/rejected": -2.0120742321014404, + "loss": 3.6936, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.624034881591797, + "rewards/margins": -0.5032904744148254, + "rewards/rejected": -20.120744705200195, + "step": 9380 + }, + { + "epoch": 0.31632343523543094, + "grad_norm": 22.796710968017578, + "learning_rate": 8.641007167068141e-07, + "logits/chosen": -0.9827069044113159, + "logits/rejected": -1.0087189674377441, + "logps/chosen": -2.1850533485412598, + "logps/rejected": -2.171093225479126, + "loss": 4.1346, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.85053062438965, + "rewards/margins": -0.1395985633134842, + "rewards/rejected": -21.7109317779541, + "step": 9385 + }, + { + "epoch": 0.3164919613064141, + "grad_norm": 16.15544319152832, + "learning_rate": 8.638990647083252e-07, + "logits/chosen": -0.7128480672836304, + "logits/rejected": -0.888879120349884, + "logps/chosen": -2.0813021659851074, + "logps/rejected": -2.189892530441284, + "loss": 2.7087, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.81302261352539, + "rewards/margins": 1.085901141166687, + "rewards/rejected": -21.89892578125, + "step": 9390 + }, + { + "epoch": 0.31666048737739727, + "grad_norm": 20.231306076049805, + "learning_rate": 8.636972867787069e-07, + "logits/chosen": -0.823818027973175, + "logits/rejected": -0.8593767881393433, + "logps/chosen": -2.1330342292785645, + "logps/rejected": -2.3957901000976562, + "loss": 1.9103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.330341339111328, + "rewards/margins": 2.627556324005127, + "rewards/rejected": -23.957897186279297, + "step": 9395 + }, + { + "epoch": 0.3168290134483805, + "grad_norm": 18.477651596069336, + "learning_rate": 8.634953829877869e-07, + "logits/chosen": -0.855760395526886, + "logits/rejected": -0.9541324377059937, + "logps/chosen": -2.114394187927246, + "logps/rejected": -2.291926860809326, + "loss": 2.7066, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.143939971923828, + "rewards/margins": 1.7753299474716187, + "rewards/rejected": -22.919269561767578, + "step": 9400 + }, + { + "epoch": 0.31699753951936366, + "grad_norm": 38.60859298706055, + "learning_rate": 8.632933534054359e-07, + "logits/chosen": -0.571567177772522, + "logits/rejected": -0.5827969312667847, + "logps/chosen": -1.69004225730896, + "logps/rejected": -1.6473404169082642, + "loss": 3.5833, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.90042495727539, + "rewards/margins": -0.4270210862159729, + "rewards/rejected": -16.473403930664062, + "step": 9405 + }, + { + "epoch": 0.3171660655903468, + "grad_norm": 39.37350082397461, + "learning_rate": 8.630911981015683e-07, + "logits/chosen": -0.3582506775856018, + "logits/rejected": -0.49621373414993286, + "logps/chosen": -2.3286869525909424, + "logps/rejected": -2.4957258701324463, + "loss": 2.3083, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.286869049072266, + "rewards/margins": 1.670389175415039, + "rewards/rejected": -24.957258224487305, + "step": 9410 + }, + { + "epoch": 0.31733459166133, + "grad_norm": 34.105995178222656, + "learning_rate": 8.628889171461426e-07, + "logits/chosen": -1.1902964115142822, + "logits/rejected": -1.1787660121917725, + "logps/chosen": -1.9384952783584595, + "logps/rejected": -1.8969110250473022, + "loss": 4.4437, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.384952545166016, + "rewards/margins": -0.4158410131931305, + "rewards/rejected": -18.9691104888916, + "step": 9415 + }, + { + "epoch": 0.3175031177323132, + "grad_norm": 29.657711029052734, + "learning_rate": 8.626865106091596e-07, + "logits/chosen": -1.00395929813385, + "logits/rejected": -1.1669594049453735, + "logps/chosen": -1.771984338760376, + "logps/rejected": -1.697410225868225, + "loss": 3.8024, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.7198429107666, + "rewards/margins": -0.7457417249679565, + "rewards/rejected": -16.974102020263672, + "step": 9420 + }, + { + "epoch": 0.3176716438032964, + "grad_norm": 22.412580490112305, + "learning_rate": 8.624839785606648e-07, + "logits/chosen": -1.3314557075500488, + "logits/rejected": -1.3555439710617065, + "logps/chosen": -1.5902462005615234, + "logps/rejected": -1.7207386493682861, + "loss": 2.1892, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.902461051940918, + "rewards/margins": 1.3049246072769165, + "rewards/rejected": -17.207386016845703, + "step": 9425 + }, + { + "epoch": 0.31784016987427954, + "grad_norm": 121.15837097167969, + "learning_rate": 8.622813210707463e-07, + "logits/chosen": -0.8635732531547546, + "logits/rejected": -0.7936872243881226, + "logps/chosen": -2.18469500541687, + "logps/rejected": -2.245011568069458, + "loss": 3.3173, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.84695053100586, + "rewards/margins": 0.6031640768051147, + "rewards/rejected": -22.450115203857422, + "step": 9430 + }, + { + "epoch": 0.3180086959452627, + "grad_norm": 12.949443817138672, + "learning_rate": 8.620785382095357e-07, + "logits/chosen": -0.571534276008606, + "logits/rejected": -0.7806876301765442, + "logps/chosen": -2.23927640914917, + "logps/rejected": -2.2879207134246826, + "loss": 4.3462, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.392765045166016, + "rewards/margins": 0.4864432215690613, + "rewards/rejected": -22.879207611083984, + "step": 9435 + }, + { + "epoch": 0.31817722201624593, + "grad_norm": 53.13159942626953, + "learning_rate": 8.618756300472085e-07, + "logits/chosen": -0.9639241099357605, + "logits/rejected": -1.171112060546875, + "logps/chosen": -1.884033441543579, + "logps/rejected": -1.9627354145050049, + "loss": 2.5174, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.840335845947266, + "rewards/margins": 0.7870213389396667, + "rewards/rejected": -19.627357482910156, + "step": 9440 + }, + { + "epoch": 0.3183457480872291, + "grad_norm": 29.785985946655273, + "learning_rate": 8.616725966539831e-07, + "logits/chosen": -0.48238930106163025, + "logits/rejected": -0.6622364521026611, + "logps/chosen": -1.9177591800689697, + "logps/rejected": -2.15956449508667, + "loss": 2.3485, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.17759132385254, + "rewards/margins": 2.4180550575256348, + "rewards/rejected": -21.595645904541016, + "step": 9445 + }, + { + "epoch": 0.31851427415821226, + "grad_norm": 40.153404235839844, + "learning_rate": 8.614694381001213e-07, + "logits/chosen": -0.8885830044746399, + "logits/rejected": -0.7523598670959473, + "logps/chosen": -1.8252578973770142, + "logps/rejected": -1.7716503143310547, + "loss": 3.7513, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.252578735351562, + "rewards/margins": -0.5360754132270813, + "rewards/rejected": -17.716503143310547, + "step": 9450 + }, + { + "epoch": 0.3186828002291955, + "grad_norm": 28.715364456176758, + "learning_rate": 8.612661544559284e-07, + "logits/chosen": -1.2380956411361694, + "logits/rejected": -1.2025935649871826, + "logps/chosen": -1.6656516790390015, + "logps/rejected": -1.7289981842041016, + "loss": 2.9496, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.656518936157227, + "rewards/margins": 0.633465588092804, + "rewards/rejected": -17.289981842041016, + "step": 9455 + }, + { + "epoch": 0.31885132630017865, + "grad_norm": 35.392513275146484, + "learning_rate": 8.610627457917526e-07, + "logits/chosen": -0.8869184255599976, + "logits/rejected": -0.8394335508346558, + "logps/chosen": -2.1466517448425293, + "logps/rejected": -2.15181827545166, + "loss": 3.3505, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.46651840209961, + "rewards/margins": 0.05166482925415039, + "rewards/rejected": -21.5181827545166, + "step": 9460 + }, + { + "epoch": 0.3190198523711618, + "grad_norm": 24.403085708618164, + "learning_rate": 8.608592121779856e-07, + "logits/chosen": -0.5069986581802368, + "logits/rejected": -0.3841266632080078, + "logps/chosen": -2.139840602874756, + "logps/rejected": -2.1221654415130615, + "loss": 3.2549, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.398408889770508, + "rewards/margins": -0.17675228416919708, + "rewards/rejected": -21.221654891967773, + "step": 9465 + }, + { + "epoch": 0.319188378442145, + "grad_norm": 34.315101623535156, + "learning_rate": 8.606555536850628e-07, + "logits/chosen": -0.9683068990707397, + "logits/rejected": -0.9776903986930847, + "logps/chosen": -2.0443432331085205, + "logps/rejected": -2.006239414215088, + "loss": 3.6543, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.443431854248047, + "rewards/margins": -0.3810390532016754, + "rewards/rejected": -20.062393188476562, + "step": 9470 + }, + { + "epoch": 0.3193569045131282, + "grad_norm": 18.212900161743164, + "learning_rate": 8.604517703834622e-07, + "logits/chosen": -0.9584037661552429, + "logits/rejected": -0.8274585008621216, + "logps/chosen": -1.603839635848999, + "logps/rejected": -1.6559810638427734, + "loss": 2.941, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.03839683532715, + "rewards/margins": 0.5214144587516785, + "rewards/rejected": -16.559810638427734, + "step": 9475 + }, + { + "epoch": 0.31952543058411137, + "grad_norm": 31.893938064575195, + "learning_rate": 8.60247862343705e-07, + "logits/chosen": -0.7184762954711914, + "logits/rejected": -0.7728734016418457, + "logps/chosen": -1.8432371616363525, + "logps/rejected": -1.8536771535873413, + "loss": 3.3165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.432369232177734, + "rewards/margins": 0.10440144687891006, + "rewards/rejected": -18.536771774291992, + "step": 9480 + }, + { + "epoch": 0.31969395665509454, + "grad_norm": 25.503551483154297, + "learning_rate": 8.600438296363559e-07, + "logits/chosen": -1.151992917060852, + "logits/rejected": -1.1667964458465576, + "logps/chosen": -1.5828298330307007, + "logps/rejected": -1.6267452239990234, + "loss": 2.7009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.828298568725586, + "rewards/margins": 0.43915247917175293, + "rewards/rejected": -16.267452239990234, + "step": 9485 + }, + { + "epoch": 0.3198624827260777, + "grad_norm": 31.356788635253906, + "learning_rate": 8.598396723320224e-07, + "logits/chosen": -0.9881758689880371, + "logits/rejected": -1.132152795791626, + "logps/chosen": -1.8627265691757202, + "logps/rejected": -1.8474178314208984, + "loss": 3.6102, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.62726402282715, + "rewards/margins": -0.15308618545532227, + "rewards/rejected": -18.474178314208984, + "step": 9490 + }, + { + "epoch": 0.3200310087970609, + "grad_norm": 119.1594467163086, + "learning_rate": 8.596353905013556e-07, + "logits/chosen": -0.33203741908073425, + "logits/rejected": -0.49453800916671753, + "logps/chosen": -2.566382884979248, + "logps/rejected": -2.5018601417541504, + "loss": 3.7362, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.663827896118164, + "rewards/margins": -0.6452277302742004, + "rewards/rejected": -25.018598556518555, + "step": 9495 + }, + { + "epoch": 0.3201995348680441, + "grad_norm": 100.49848937988281, + "learning_rate": 8.594309842150491e-07, + "logits/chosen": -1.0113314390182495, + "logits/rejected": -0.9509127736091614, + "logps/chosen": -1.7100799083709717, + "logps/rejected": -1.7621523141860962, + "loss": 2.6916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.100797653198242, + "rewards/margins": 0.5207257866859436, + "rewards/rejected": -17.621524810791016, + "step": 9500 + }, + { + "epoch": 0.32036806093902725, + "grad_norm": 28.509531021118164, + "learning_rate": 8.592264535438399e-07, + "logits/chosen": -0.9975595474243164, + "logits/rejected": -1.1303162574768066, + "logps/chosen": -1.623504400253296, + "logps/rejected": -1.6738468408584595, + "loss": 2.7432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.235044479370117, + "rewards/margins": 0.5034238696098328, + "rewards/rejected": -16.738468170166016, + "step": 9505 + }, + { + "epoch": 0.3205365870100104, + "grad_norm": 17.786806106567383, + "learning_rate": 8.590217985585083e-07, + "logits/chosen": -0.43555235862731934, + "logits/rejected": -0.549434244632721, + "logps/chosen": -2.0390753746032715, + "logps/rejected": -2.358064889907837, + "loss": 2.2292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.39075469970703, + "rewards/margins": 3.1898932456970215, + "rewards/rejected": -23.58064842224121, + "step": 9510 + }, + { + "epoch": 0.32070511308099364, + "grad_norm": 69.22200012207031, + "learning_rate": 8.588170193298769e-07, + "logits/chosen": -0.5065039396286011, + "logits/rejected": -0.5373364686965942, + "logps/chosen": -2.0895285606384277, + "logps/rejected": -2.2371814250946045, + "loss": 2.788, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.895288467407227, + "rewards/margins": 1.4765275716781616, + "rewards/rejected": -22.371814727783203, + "step": 9515 + }, + { + "epoch": 0.3208736391519768, + "grad_norm": 14.257323265075684, + "learning_rate": 8.58612115928812e-07, + "logits/chosen": -1.1542446613311768, + "logits/rejected": -1.0949870347976685, + "logps/chosen": -1.9531142711639404, + "logps/rejected": -2.2141025066375732, + "loss": 2.0862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.531143188476562, + "rewards/margins": 2.609881639480591, + "rewards/rejected": -22.14102554321289, + "step": 9520 + }, + { + "epoch": 0.32104216522296, + "grad_norm": 37.42106628417969, + "learning_rate": 8.584070884262225e-07, + "logits/chosen": -1.0090105533599854, + "logits/rejected": -1.0935182571411133, + "logps/chosen": -1.9097763299942017, + "logps/rejected": -2.0177340507507324, + "loss": 2.1412, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.09776496887207, + "rewards/margins": 1.0795772075653076, + "rewards/rejected": -20.17734146118164, + "step": 9525 + }, + { + "epoch": 0.3212106912939432, + "grad_norm": 24.182695388793945, + "learning_rate": 8.582019368930605e-07, + "logits/chosen": -1.008709192276001, + "logits/rejected": -1.2963372468948364, + "logps/chosen": -1.8214937448501587, + "logps/rejected": -2.3255906105041504, + "loss": 2.9947, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.21493911743164, + "rewards/margins": 5.0409698486328125, + "rewards/rejected": -23.25590705871582, + "step": 9530 + }, + { + "epoch": 0.32137921736492636, + "grad_norm": 48.76929473876953, + "learning_rate": 8.579966614003206e-07, + "logits/chosen": -0.8978897929191589, + "logits/rejected": -0.8626217842102051, + "logps/chosen": -1.9985179901123047, + "logps/rejected": -2.1461033821105957, + "loss": 2.7831, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.985179901123047, + "rewards/margins": 1.4758514165878296, + "rewards/rejected": -21.461029052734375, + "step": 9535 + }, + { + "epoch": 0.3215477434359095, + "grad_norm": 17.92789649963379, + "learning_rate": 8.577912620190408e-07, + "logits/chosen": -1.0495909452438354, + "logits/rejected": -1.0890016555786133, + "logps/chosen": -1.8421592712402344, + "logps/rejected": -2.0374155044555664, + "loss": 2.5219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.42159080505371, + "rewards/margins": 1.9525625705718994, + "rewards/rejected": -20.374156951904297, + "step": 9540 + }, + { + "epoch": 0.3217162695068927, + "grad_norm": 19.91221046447754, + "learning_rate": 8.575857388203016e-07, + "logits/chosen": -0.7675440907478333, + "logits/rejected": -0.9447044134140015, + "logps/chosen": -1.7784792184829712, + "logps/rejected": -2.116765260696411, + "loss": 2.1139, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.784793853759766, + "rewards/margins": 3.382856845855713, + "rewards/rejected": -21.16765022277832, + "step": 9545 + }, + { + "epoch": 0.3218847955778759, + "grad_norm": 17.915637969970703, + "learning_rate": 8.573800918752266e-07, + "logits/chosen": -1.1638829708099365, + "logits/rejected": -1.052834391593933, + "logps/chosen": -1.5051783323287964, + "logps/rejected": -1.4815104007720947, + "loss": 3.3848, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.051783561706543, + "rewards/margins": -0.23667888343334198, + "rewards/rejected": -14.815104484558105, + "step": 9550 + }, + { + "epoch": 0.3220533216488591, + "grad_norm": 22.767057418823242, + "learning_rate": 8.571743212549817e-07, + "logits/chosen": -0.7604719400405884, + "logits/rejected": -1.0411133766174316, + "logps/chosen": -1.8910396099090576, + "logps/rejected": -1.7851155996322632, + "loss": 4.1678, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.910396575927734, + "rewards/margins": -1.0592386722564697, + "rewards/rejected": -17.85115623474121, + "step": 9555 + }, + { + "epoch": 0.32222184771984225, + "grad_norm": 199.3408966064453, + "learning_rate": 8.569684270307767e-07, + "logits/chosen": -0.7701305150985718, + "logits/rejected": -0.8080227971076965, + "logps/chosen": -2.486893892288208, + "logps/rejected": -2.411651134490967, + "loss": 4.5925, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.868938446044922, + "rewards/margins": -0.7524264454841614, + "rewards/rejected": -24.11651039123535, + "step": 9560 + }, + { + "epoch": 0.3223903737908254, + "grad_norm": 28.15599822998047, + "learning_rate": 8.567624092738629e-07, + "logits/chosen": -0.9503934979438782, + "logits/rejected": -1.0396947860717773, + "logps/chosen": -1.709754228591919, + "logps/rejected": -1.7016417980194092, + "loss": 3.2697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.09754180908203, + "rewards/margins": -0.08112458884716034, + "rewards/rejected": -17.01641845703125, + "step": 9565 + }, + { + "epoch": 0.32255889986180863, + "grad_norm": 25.845806121826172, + "learning_rate": 8.565562680555351e-07, + "logits/chosen": -0.9343695640563965, + "logits/rejected": -0.6249425411224365, + "logps/chosen": -1.81558096408844, + "logps/rejected": -1.869520902633667, + "loss": 3.1009, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.155807495117188, + "rewards/margins": 0.5394006967544556, + "rewards/rejected": -18.695209503173828, + "step": 9570 + }, + { + "epoch": 0.3227274259327918, + "grad_norm": 30.309011459350586, + "learning_rate": 8.563500034471308e-07, + "logits/chosen": -0.9814979434013367, + "logits/rejected": -1.0041558742523193, + "logps/chosen": -1.9443080425262451, + "logps/rejected": -1.9487826824188232, + "loss": 3.2169, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.44308090209961, + "rewards/margins": 0.04474801942706108, + "rewards/rejected": -19.48782730102539, + "step": 9575 + }, + { + "epoch": 0.32289595200377497, + "grad_norm": 27.450937271118164, + "learning_rate": 8.561436155200299e-07, + "logits/chosen": -0.8652593493461609, + "logits/rejected": -0.8776264190673828, + "logps/chosen": -2.157226800918579, + "logps/rejected": -2.30631947517395, + "loss": 2.236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.572269439697266, + "rewards/margins": 1.4909271001815796, + "rewards/rejected": -23.063194274902344, + "step": 9580 + }, + { + "epoch": 0.3230644780747582, + "grad_norm": 25.80462646484375, + "learning_rate": 8.559371043456551e-07, + "logits/chosen": -0.7589584589004517, + "logits/rejected": -0.8819445371627808, + "logps/chosen": -2.0037195682525635, + "logps/rejected": -2.218597173690796, + "loss": 1.9919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.037195205688477, + "rewards/margins": 2.1487772464752197, + "rewards/rejected": -22.185970306396484, + "step": 9585 + }, + { + "epoch": 0.32323300414574135, + "grad_norm": 23.218080520629883, + "learning_rate": 8.55730469995472e-07, + "logits/chosen": -0.8827985525131226, + "logits/rejected": -0.8832064867019653, + "logps/chosen": -2.133117198944092, + "logps/rejected": -2.1974258422851562, + "loss": 2.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.331172943115234, + "rewards/margins": 0.6430840492248535, + "rewards/rejected": -21.974258422851562, + "step": 9590 + }, + { + "epoch": 0.3234015302167245, + "grad_norm": 28.35801887512207, + "learning_rate": 8.555237125409882e-07, + "logits/chosen": -0.9255178570747375, + "logits/rejected": -0.8860748410224915, + "logps/chosen": -2.0652050971984863, + "logps/rejected": -2.2760536670684814, + "loss": 2.1149, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.652048110961914, + "rewards/margins": 2.1084868907928467, + "rewards/rejected": -22.76053810119629, + "step": 9595 + }, + { + "epoch": 0.3235700562877077, + "grad_norm": 14.770458221435547, + "learning_rate": 8.553168320537547e-07, + "logits/chosen": -1.5179839134216309, + "logits/rejected": -1.4965096712112427, + "logps/chosen": -2.1413462162017822, + "logps/rejected": -2.4134342670440674, + "loss": 2.701, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.413461685180664, + "rewards/margins": 2.7208831310272217, + "rewards/rejected": -24.13434410095215, + "step": 9600 + }, + { + "epoch": 0.3235700562877077, + "eval_logits/chosen": -1.2073123455047607, + "eval_logits/rejected": -1.278466820716858, + "eval_logps/chosen": -1.8508267402648926, + "eval_logps/rejected": -1.9134628772735596, + "eval_loss": 3.102193593978882, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.50826644897461, + "eval_rewards/margins": 0.6263617873191833, + "eval_rewards/rejected": -19.134628295898438, + "eval_runtime": 12.9203, + "eval_samples_per_second": 7.74, + "eval_steps_per_second": 1.935, + "step": 9600 + }, + { + "epoch": 0.3237385823586909, + "grad_norm": 40.8096923828125, + "learning_rate": 8.551098286053647e-07, + "logits/chosen": -1.0502384901046753, + "logits/rejected": -0.9606598019599915, + "logps/chosen": -1.8660523891448975, + "logps/rejected": -1.9286441802978516, + "loss": 2.6769, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.660526275634766, + "rewards/margins": 0.6259174346923828, + "rewards/rejected": -19.286441802978516, + "step": 9605 + }, + { + "epoch": 0.3239071084296741, + "grad_norm": 31.944400787353516, + "learning_rate": 8.549027022674536e-07, + "logits/chosen": -1.1471023559570312, + "logits/rejected": -1.282149314880371, + "logps/chosen": -1.9713191986083984, + "logps/rejected": -2.035529851913452, + "loss": 2.892, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.713191986083984, + "rewards/margins": 0.642108142375946, + "rewards/rejected": -20.35529899597168, + "step": 9610 + }, + { + "epoch": 0.32407563450065724, + "grad_norm": 12.34630012512207, + "learning_rate": 8.546954531116999e-07, + "logits/chosen": -0.7726519703865051, + "logits/rejected": -1.0031630992889404, + "logps/chosen": -1.6736605167388916, + "logps/rejected": -1.8040440082550049, + "loss": 2.2425, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.73660659790039, + "rewards/margins": 1.3038337230682373, + "rewards/rejected": -18.04043960571289, + "step": 9615 + }, + { + "epoch": 0.3242441605716404, + "grad_norm": 47.13421630859375, + "learning_rate": 8.544880812098242e-07, + "logits/chosen": -1.16587495803833, + "logits/rejected": -0.9911531209945679, + "logps/chosen": -1.9475829601287842, + "logps/rejected": -1.936532974243164, + "loss": 3.2225, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.475828170776367, + "rewards/margins": -0.11049928516149521, + "rewards/rejected": -19.36532974243164, + "step": 9620 + }, + { + "epoch": 0.3244126866426236, + "grad_norm": 21.326196670532227, + "learning_rate": 8.542805866335902e-07, + "logits/chosen": -0.9639849662780762, + "logits/rejected": -1.0168709754943848, + "logps/chosen": -1.6442312002182007, + "logps/rejected": -1.623727560043335, + "loss": 3.3254, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.442312240600586, + "rewards/margins": -0.20503464341163635, + "rewards/rejected": -16.23727798461914, + "step": 9625 + }, + { + "epoch": 0.3245812127136068, + "grad_norm": 22.18870735168457, + "learning_rate": 8.54072969454803e-07, + "logits/chosen": -0.5071839094161987, + "logits/rejected": -0.3684050142765045, + "logps/chosen": -2.0935750007629395, + "logps/rejected": -2.424656391143799, + "loss": 2.7616, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.935749053955078, + "rewards/margins": 3.3108131885528564, + "rewards/rejected": -24.246562957763672, + "step": 9630 + }, + { + "epoch": 0.32474973878458996, + "grad_norm": 21.057998657226562, + "learning_rate": 8.53865229745311e-07, + "logits/chosen": -0.9777098894119263, + "logits/rejected": -1.1507208347320557, + "logps/chosen": -1.7663494348526, + "logps/rejected": -1.8777844905853271, + "loss": 3.0791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.663494110107422, + "rewards/margins": 1.114350438117981, + "rewards/rejected": -18.777843475341797, + "step": 9635 + }, + { + "epoch": 0.3249182648555732, + "grad_norm": 26.480030059814453, + "learning_rate": 8.536573675770048e-07, + "logits/chosen": -0.8730767965316772, + "logits/rejected": -0.8443295359611511, + "logps/chosen": -1.9790958166122437, + "logps/rejected": -2.0051825046539307, + "loss": 3.1889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.790958404541016, + "rewards/margins": 0.2608667314052582, + "rewards/rejected": -20.05182456970215, + "step": 9640 + }, + { + "epoch": 0.32508679092655635, + "grad_norm": 27.223636627197266, + "learning_rate": 8.534493830218173e-07, + "logits/chosen": -0.8448840379714966, + "logits/rejected": -0.9053970575332642, + "logps/chosen": -1.7872326374053955, + "logps/rejected": -1.7945051193237305, + "loss": 3.2698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.872325897216797, + "rewards/margins": 0.07272644340991974, + "rewards/rejected": -17.945053100585938, + "step": 9645 + }, + { + "epoch": 0.3252553169975395, + "grad_norm": 22.850114822387695, + "learning_rate": 8.532412761517236e-07, + "logits/chosen": -0.45379573106765747, + "logits/rejected": -0.5692591071128845, + "logps/chosen": -1.8840221166610718, + "logps/rejected": -1.8490978479385376, + "loss": 3.6867, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.840221405029297, + "rewards/margins": -0.3492446839809418, + "rewards/rejected": -18.490978240966797, + "step": 9650 + }, + { + "epoch": 0.3254238430685227, + "grad_norm": 30.58220672607422, + "learning_rate": 8.530330470387412e-07, + "logits/chosen": -1.219278335571289, + "logits/rejected": -1.2407810688018799, + "logps/chosen": -1.8869062662124634, + "logps/rejected": -1.8954986333847046, + "loss": 3.1748, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.869060516357422, + "rewards/margins": 0.08592300117015839, + "rewards/rejected": -18.954986572265625, + "step": 9655 + }, + { + "epoch": 0.3255923691395059, + "grad_norm": 23.84304428100586, + "learning_rate": 8.528246957549303e-07, + "logits/chosen": -1.1590244770050049, + "logits/rejected": -1.387596607208252, + "logps/chosen": -1.9695241451263428, + "logps/rejected": -2.313225269317627, + "loss": 2.8357, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.69524383544922, + "rewards/margins": 3.4370105266571045, + "rewards/rejected": -23.132251739501953, + "step": 9660 + }, + { + "epoch": 0.32576089521048907, + "grad_norm": 25.04132080078125, + "learning_rate": 8.52616222372393e-07, + "logits/chosen": -0.4334254264831543, + "logits/rejected": -0.39630335569381714, + "logps/chosen": -1.6619259119033813, + "logps/rejected": -1.7265924215316772, + "loss": 2.4997, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.619258880615234, + "rewards/margins": 0.6466663479804993, + "rewards/rejected": -17.265926361083984, + "step": 9665 + }, + { + "epoch": 0.32592942128147223, + "grad_norm": 34.72787857055664, + "learning_rate": 8.524076269632736e-07, + "logits/chosen": -0.741960346698761, + "logits/rejected": -0.7212079763412476, + "logps/chosen": -1.7507514953613281, + "logps/rejected": -1.8867127895355225, + "loss": 2.0226, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.50751304626465, + "rewards/margins": 1.3596150875091553, + "rewards/rejected": -18.867128372192383, + "step": 9670 + }, + { + "epoch": 0.3260979473524554, + "grad_norm": 41.89925003051758, + "learning_rate": 8.521989095997589e-07, + "logits/chosen": -0.8195021748542786, + "logits/rejected": -0.7949432730674744, + "logps/chosen": -1.8670480251312256, + "logps/rejected": -2.0466132164001465, + "loss": 2.6577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.670482635498047, + "rewards/margins": 1.7956523895263672, + "rewards/rejected": -20.46613311767578, + "step": 9675 + }, + { + "epoch": 0.3262664734234386, + "grad_norm": 21.182729721069336, + "learning_rate": 8.519900703540776e-07, + "logits/chosen": -1.038873553276062, + "logits/rejected": -0.9915347099304199, + "logps/chosen": -1.9196319580078125, + "logps/rejected": -2.232133150100708, + "loss": 2.1505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.196319580078125, + "rewards/margins": 3.1250100135803223, + "rewards/rejected": -22.32132911682129, + "step": 9680 + }, + { + "epoch": 0.3264349994944218, + "grad_norm": 23.9052677154541, + "learning_rate": 8.517811092985008e-07, + "logits/chosen": -0.7712268829345703, + "logits/rejected": -0.8451194763183594, + "logps/chosen": -2.0270209312438965, + "logps/rejected": -2.146897792816162, + "loss": 2.9616, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.27021026611328, + "rewards/margins": 1.1987684965133667, + "rewards/rejected": -21.468978881835938, + "step": 9685 + }, + { + "epoch": 0.32660352556540495, + "grad_norm": 30.760866165161133, + "learning_rate": 8.515720265053416e-07, + "logits/chosen": -0.8517478704452515, + "logits/rejected": -0.6468242406845093, + "logps/chosen": -1.8079086542129517, + "logps/rejected": -1.719451665878296, + "loss": 3.981, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.079084396362305, + "rewards/margins": -0.8845663070678711, + "rewards/rejected": -17.19451904296875, + "step": 9690 + }, + { + "epoch": 0.32677205163638817, + "grad_norm": 2.7800254821777344, + "learning_rate": 8.513628220469556e-07, + "logits/chosen": -0.8500580787658691, + "logits/rejected": -0.9404166340827942, + "logps/chosen": -1.8277915716171265, + "logps/rejected": -2.126081705093384, + "loss": 1.8373, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.277915954589844, + "rewards/margins": 2.982900619506836, + "rewards/rejected": -21.26081657409668, + "step": 9695 + }, + { + "epoch": 0.32694057770737134, + "grad_norm": 104.6100845336914, + "learning_rate": 8.5115349599574e-07, + "logits/chosen": -0.7318485975265503, + "logits/rejected": -0.7622694373130798, + "logps/chosen": -2.2147345542907715, + "logps/rejected": -2.025134801864624, + "loss": 5.1087, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.1473445892334, + "rewards/margins": -1.895997405052185, + "rewards/rejected": -20.2513484954834, + "step": 9700 + }, + { + "epoch": 0.3271091037783545, + "grad_norm": 17.52537727355957, + "learning_rate": 8.509440484241342e-07, + "logits/chosen": -1.0226285457611084, + "logits/rejected": -1.0693570375442505, + "logps/chosen": -2.1967062950134277, + "logps/rejected": -2.1093969345092773, + "loss": 4.3936, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.967063903808594, + "rewards/margins": -0.8730939030647278, + "rewards/rejected": -21.09396743774414, + "step": 9705 + }, + { + "epoch": 0.32727762984933767, + "grad_norm": 19.541439056396484, + "learning_rate": 8.507344794046201e-07, + "logits/chosen": -1.3186284303665161, + "logits/rejected": -1.2661542892456055, + "logps/chosen": -1.856690764427185, + "logps/rejected": -1.9217170476913452, + "loss": 2.7054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.566905975341797, + "rewards/margins": 0.6502623558044434, + "rewards/rejected": -19.21717071533203, + "step": 9710 + }, + { + "epoch": 0.3274461559203209, + "grad_norm": 22.625598907470703, + "learning_rate": 8.505247890097208e-07, + "logits/chosen": -0.7529505491256714, + "logits/rejected": -0.941428005695343, + "logps/chosen": -1.9435609579086304, + "logps/rejected": -2.5068881511688232, + "loss": 1.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.435611724853516, + "rewards/margins": 5.633272647857666, + "rewards/rejected": -25.06888198852539, + "step": 9715 + }, + { + "epoch": 0.32761468199130406, + "grad_norm": 16.12258529663086, + "learning_rate": 8.503149773120023e-07, + "logits/chosen": -0.8368858098983765, + "logits/rejected": -0.9379183053970337, + "logps/chosen": -2.060410976409912, + "logps/rejected": -2.178067445755005, + "loss": 3.0949, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.604108810424805, + "rewards/margins": 1.1765660047531128, + "rewards/rejected": -21.78067398071289, + "step": 9720 + }, + { + "epoch": 0.3277832080622872, + "grad_norm": 23.239463806152344, + "learning_rate": 8.501050443840721e-07, + "logits/chosen": -1.0896296501159668, + "logits/rejected": -1.2426843643188477, + "logps/chosen": -1.9288963079452515, + "logps/rejected": -2.2791318893432617, + "loss": 1.6746, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.288963317871094, + "rewards/margins": 3.502357006072998, + "rewards/rejected": -22.791318893432617, + "step": 9725 + }, + { + "epoch": 0.3279517341332704, + "grad_norm": 29.32204246520996, + "learning_rate": 8.498949902985795e-07, + "logits/chosen": -0.5991629362106323, + "logits/rejected": -0.5655861496925354, + "logps/chosen": -2.132098436355591, + "logps/rejected": -2.0694327354431152, + "loss": 4.0641, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.32098388671875, + "rewards/margins": -0.6266528367996216, + "rewards/rejected": -20.6943302154541, + "step": 9730 + }, + { + "epoch": 0.3281202602042536, + "grad_norm": 43.981815338134766, + "learning_rate": 8.49684815128216e-07, + "logits/chosen": -0.7278081178665161, + "logits/rejected": -0.7373084425926208, + "logps/chosen": -2.299593448638916, + "logps/rejected": -2.4935460090637207, + "loss": 1.894, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.99593734741211, + "rewards/margins": 1.9395256042480469, + "rewards/rejected": -24.935461044311523, + "step": 9735 + }, + { + "epoch": 0.3282887862752368, + "grad_norm": 18.560821533203125, + "learning_rate": 8.494745189457151e-07, + "logits/chosen": -1.152789831161499, + "logits/rejected": -1.0454437732696533, + "logps/chosen": -1.897719144821167, + "logps/rejected": -2.0982089042663574, + "loss": 3.2824, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.977191925048828, + "rewards/margins": 2.0048956871032715, + "rewards/rejected": -20.98208999633789, + "step": 9740 + }, + { + "epoch": 0.32845731234621994, + "grad_norm": 114.25135040283203, + "learning_rate": 8.49264101823852e-07, + "logits/chosen": -0.6349064111709595, + "logits/rejected": -0.6722576022148132, + "logps/chosen": -2.2356173992156982, + "logps/rejected": -2.3247010707855225, + "loss": 2.9115, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.35617446899414, + "rewards/margins": 0.8908360600471497, + "rewards/rejected": -23.24700927734375, + "step": 9745 + }, + { + "epoch": 0.32862583841720316, + "grad_norm": 45.583343505859375, + "learning_rate": 8.490535638354436e-07, + "logits/chosen": -0.9422609210014343, + "logits/rejected": -0.7428984045982361, + "logps/chosen": -1.7766081094741821, + "logps/rejected": -1.7448434829711914, + "loss": 3.4315, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.766080856323242, + "rewards/margins": -0.31764650344848633, + "rewards/rejected": -17.448434829711914, + "step": 9750 + }, + { + "epoch": 0.32879436448818633, + "grad_norm": 26.040782928466797, + "learning_rate": 8.48842905053349e-07, + "logits/chosen": -0.9188686609268188, + "logits/rejected": -1.0052945613861084, + "logps/chosen": -2.036522626876831, + "logps/rejected": -2.0704965591430664, + "loss": 2.982, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.36522674560547, + "rewards/margins": 0.33973854780197144, + "rewards/rejected": -20.70496368408203, + "step": 9755 + }, + { + "epoch": 0.3289628905591695, + "grad_norm": 27.022668838500977, + "learning_rate": 8.486321255504687e-07, + "logits/chosen": -1.0087183713912964, + "logits/rejected": -1.115375280380249, + "logps/chosen": -1.6923017501831055, + "logps/rejected": -1.6801421642303467, + "loss": 3.1842, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.923015594482422, + "rewards/margins": -0.12159526348114014, + "rewards/rejected": -16.801422119140625, + "step": 9760 + }, + { + "epoch": 0.32913141663015266, + "grad_norm": 40.464080810546875, + "learning_rate": 8.484212253997455e-07, + "logits/chosen": -0.6966105103492737, + "logits/rejected": -0.9393747448921204, + "logps/chosen": -2.2360169887542725, + "logps/rejected": -1.9587417840957642, + "loss": 5.9142, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.360172271728516, + "rewards/margins": -2.772754430770874, + "rewards/rejected": -19.587417602539062, + "step": 9765 + }, + { + "epoch": 0.3292999427011359, + "grad_norm": 45.77811050415039, + "learning_rate": 8.482102046741633e-07, + "logits/chosen": -0.8145803213119507, + "logits/rejected": -0.793364405632019, + "logps/chosen": -2.1728711128234863, + "logps/rejected": -2.2482926845550537, + "loss": 3.1594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.728710174560547, + "rewards/margins": 0.7542173266410828, + "rewards/rejected": -22.482927322387695, + "step": 9770 + }, + { + "epoch": 0.32946846877211905, + "grad_norm": 53.0908203125, + "learning_rate": 8.479990634467482e-07, + "logits/chosen": -0.9707155227661133, + "logits/rejected": -0.947056770324707, + "logps/chosen": -2.244295120239258, + "logps/rejected": -2.3716988563537598, + "loss": 2.0424, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.442951202392578, + "rewards/margins": 1.2740370035171509, + "rewards/rejected": -23.71698760986328, + "step": 9775 + }, + { + "epoch": 0.3296369948431022, + "grad_norm": 23.31206512451172, + "learning_rate": 8.47787801790568e-07, + "logits/chosen": -0.6974349021911621, + "logits/rejected": -0.9545847177505493, + "logps/chosen": -1.9938684701919556, + "logps/rejected": -2.0778064727783203, + "loss": 2.9787, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.93868637084961, + "rewards/margins": 0.8393799662590027, + "rewards/rejected": -20.778064727783203, + "step": 9780 + }, + { + "epoch": 0.3298055209140854, + "grad_norm": 23.06899070739746, + "learning_rate": 8.475764197787317e-07, + "logits/chosen": -1.0806128978729248, + "logits/rejected": -0.8885553479194641, + "logps/chosen": -1.8322607278823853, + "logps/rejected": -1.838618516921997, + "loss": 3.6789, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.322607040405273, + "rewards/margins": 0.0635797530412674, + "rewards/rejected": -18.386188507080078, + "step": 9785 + }, + { + "epoch": 0.3299740469850686, + "grad_norm": 22.257450103759766, + "learning_rate": 8.473649174843906e-07, + "logits/chosen": -1.033005952835083, + "logits/rejected": -1.2247979640960693, + "logps/chosen": -1.7999452352523804, + "logps/rejected": -1.8376827239990234, + "loss": 3.2559, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.999454498291016, + "rewards/margins": 0.37737494707107544, + "rewards/rejected": -18.376827239990234, + "step": 9790 + }, + { + "epoch": 0.33014257305605177, + "grad_norm": 38.79618835449219, + "learning_rate": 8.471532949807372e-07, + "logits/chosen": -0.8862309455871582, + "logits/rejected": -0.8179659843444824, + "logps/chosen": -1.777960181236267, + "logps/rejected": -1.7753187417984009, + "loss": 3.6499, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.779598236083984, + "rewards/margins": -0.026413727551698685, + "rewards/rejected": -17.75318717956543, + "step": 9795 + }, + { + "epoch": 0.33031109912703494, + "grad_norm": 15.938300132751465, + "learning_rate": 8.469415523410056e-07, + "logits/chosen": -0.6858940124511719, + "logits/rejected": -0.6337902545928955, + "logps/chosen": -1.9199140071868896, + "logps/rejected": -2.024050235748291, + "loss": 2.3887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.199140548706055, + "rewards/margins": 1.0413639545440674, + "rewards/rejected": -20.240503311157227, + "step": 9800 + }, + { + "epoch": 0.33047962519801816, + "grad_norm": 27.558385848999023, + "learning_rate": 8.467296896384717e-07, + "logits/chosen": -0.9122729301452637, + "logits/rejected": -0.9424211382865906, + "logps/chosen": -1.8392369747161865, + "logps/rejected": -2.293322801589966, + "loss": 2.3289, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.39236831665039, + "rewards/margins": 4.540858268737793, + "rewards/rejected": -22.9332275390625, + "step": 9805 + }, + { + "epoch": 0.3306481512690013, + "grad_norm": 17.154170989990234, + "learning_rate": 8.465177069464528e-07, + "logits/chosen": -0.7819756269454956, + "logits/rejected": -0.9240609407424927, + "logps/chosen": -1.9444135427474976, + "logps/rejected": -2.0653367042541504, + "loss": 2.3107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.444137573242188, + "rewards/margins": 1.209228754043579, + "rewards/rejected": -20.653366088867188, + "step": 9810 + }, + { + "epoch": 0.3308166773399845, + "grad_norm": 19.14472198486328, + "learning_rate": 8.463056043383079e-07, + "logits/chosen": -0.9145916700363159, + "logits/rejected": -0.8885605931282043, + "logps/chosen": -2.2163727283477783, + "logps/rejected": -2.3537745475769043, + "loss": 3.1827, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.163726806640625, + "rewards/margins": 1.3740198612213135, + "rewards/rejected": -23.53774642944336, + "step": 9815 + }, + { + "epoch": 0.33098520341096765, + "grad_norm": 22.862253189086914, + "learning_rate": 8.460933818874372e-07, + "logits/chosen": -1.0414626598358154, + "logits/rejected": -1.1654443740844727, + "logps/chosen": -1.8751583099365234, + "logps/rejected": -1.9087566137313843, + "loss": 2.9526, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.751583099365234, + "rewards/margins": 0.3359828591346741, + "rewards/rejected": -19.08756446838379, + "step": 9820 + }, + { + "epoch": 0.3311537294819509, + "grad_norm": 18.878591537475586, + "learning_rate": 8.458810396672827e-07, + "logits/chosen": -0.9540130496025085, + "logits/rejected": -1.0682528018951416, + "logps/chosen": -2.067431688308716, + "logps/rejected": -2.05672025680542, + "loss": 3.8316, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.674314498901367, + "rewards/margins": -0.10711526870727539, + "rewards/rejected": -20.56719970703125, + "step": 9825 + }, + { + "epoch": 0.33132225555293404, + "grad_norm": 20.707855224609375, + "learning_rate": 8.456685777513273e-07, + "logits/chosen": -1.0735938549041748, + "logits/rejected": -1.0637882947921753, + "logps/chosen": -1.8889278173446655, + "logps/rejected": -2.131657600402832, + "loss": 2.0185, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.8892765045166, + "rewards/margins": 2.4273009300231934, + "rewards/rejected": -21.316577911376953, + "step": 9830 + }, + { + "epoch": 0.3314907816239172, + "grad_norm": 50.769004821777344, + "learning_rate": 8.45455996213096e-07, + "logits/chosen": -1.1742569208145142, + "logits/rejected": -1.2606443166732788, + "logps/chosen": -1.8508479595184326, + "logps/rejected": -1.8263981342315674, + "loss": 3.3138, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.508480072021484, + "rewards/margins": -0.24450120329856873, + "rewards/rejected": -18.263980865478516, + "step": 9835 + }, + { + "epoch": 0.3316593076949004, + "grad_norm": 19.067001342773438, + "learning_rate": 8.452432951261548e-07, + "logits/chosen": -0.9642957448959351, + "logits/rejected": -1.1399331092834473, + "logps/chosen": -2.040045738220215, + "logps/rejected": -2.4999804496765137, + "loss": 3.3077, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.400455474853516, + "rewards/margins": 4.599349021911621, + "rewards/rejected": -24.99980354309082, + "step": 9840 + }, + { + "epoch": 0.3318278337658836, + "grad_norm": 26.52146339416504, + "learning_rate": 8.450304745641112e-07, + "logits/chosen": -1.0799713134765625, + "logits/rejected": -0.9734416007995605, + "logps/chosen": -1.9313831329345703, + "logps/rejected": -1.8945175409317017, + "loss": 3.4691, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.313831329345703, + "rewards/margins": -0.3686564564704895, + "rewards/rejected": -18.945175170898438, + "step": 9845 + }, + { + "epoch": 0.33199635983686676, + "grad_norm": 19.426136016845703, + "learning_rate": 8.448175346006141e-07, + "logits/chosen": -0.675517737865448, + "logits/rejected": -0.7951253056526184, + "logps/chosen": -1.6109631061553955, + "logps/rejected": -1.7086893320083618, + "loss": 2.5614, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.109630584716797, + "rewards/margins": 0.9772618412971497, + "rewards/rejected": -17.08689308166504, + "step": 9850 + }, + { + "epoch": 0.3321648859078499, + "grad_norm": 29.161775588989258, + "learning_rate": 8.446044753093535e-07, + "logits/chosen": -0.7901461720466614, + "logits/rejected": -0.745838463306427, + "logps/chosen": -2.0752806663513184, + "logps/rejected": -2.0568737983703613, + "loss": 3.6689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.752811431884766, + "rewards/margins": -0.1840725839138031, + "rewards/rejected": -20.568737030029297, + "step": 9855 + }, + { + "epoch": 0.33233341197883315, + "grad_norm": 2.2476565837860107, + "learning_rate": 8.44391296764061e-07, + "logits/chosen": -0.9421189427375793, + "logits/rejected": -1.1054116487503052, + "logps/chosen": -2.5878076553344727, + "logps/rejected": -2.8351638317108154, + "loss": 2.177, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.878076553344727, + "rewards/margins": 2.4735639095306396, + "rewards/rejected": -28.351642608642578, + "step": 9860 + }, + { + "epoch": 0.3325019380498163, + "grad_norm": 29.23499870300293, + "learning_rate": 8.441779990385089e-07, + "logits/chosen": -0.7637979388237, + "logits/rejected": -0.8095385432243347, + "logps/chosen": -2.1008362770080566, + "logps/rejected": -2.135150909423828, + "loss": 3.0109, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.008363723754883, + "rewards/margins": 0.34314537048339844, + "rewards/rejected": -21.35150909423828, + "step": 9865 + }, + { + "epoch": 0.3326704641207995, + "grad_norm": 39.963294982910156, + "learning_rate": 8.439645822065115e-07, + "logits/chosen": -0.7813414335250854, + "logits/rejected": -0.8605804443359375, + "logps/chosen": -2.0555100440979004, + "logps/rejected": -2.060969829559326, + "loss": 3.9695, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.555099487304688, + "rewards/margins": 0.05459785461425781, + "rewards/rejected": -20.609697341918945, + "step": 9870 + }, + { + "epoch": 0.33283899019178265, + "grad_norm": 59.06498336791992, + "learning_rate": 8.43751046341924e-07, + "logits/chosen": -0.9045795202255249, + "logits/rejected": -1.0206382274627686, + "logps/chosen": -2.1519813537597656, + "logps/rejected": -2.279313564300537, + "loss": 2.3572, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.519811630249023, + "rewards/margins": 1.2733227014541626, + "rewards/rejected": -22.793134689331055, + "step": 9875 + }, + { + "epoch": 0.33300751626276587, + "grad_norm": 65.2740249633789, + "learning_rate": 8.435373915186426e-07, + "logits/chosen": -0.8387653231620789, + "logits/rejected": -0.8890382647514343, + "logps/chosen": -2.2679905891418457, + "logps/rejected": -2.2393336296081543, + "loss": 3.4295, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.67991065979004, + "rewards/margins": -0.2865728437900543, + "rewards/rejected": -22.393335342407227, + "step": 9880 + }, + { + "epoch": 0.33317604233374903, + "grad_norm": 24.272924423217773, + "learning_rate": 8.433236178106047e-07, + "logits/chosen": -0.8641761541366577, + "logits/rejected": -0.8543815612792969, + "logps/chosen": -2.308032512664795, + "logps/rejected": -2.3229072093963623, + "loss": 3.3651, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.080322265625, + "rewards/margins": 0.14874887466430664, + "rewards/rejected": -23.22907257080078, + "step": 9885 + }, + { + "epoch": 0.3333445684047322, + "grad_norm": 25.492050170898438, + "learning_rate": 8.43109725291789e-07, + "logits/chosen": -0.8823873400688171, + "logits/rejected": -0.9587462544441223, + "logps/chosen": -2.126710891723633, + "logps/rejected": -2.0421059131622314, + "loss": 4.2964, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.267108917236328, + "rewards/margins": -0.8460475206375122, + "rewards/rejected": -20.42106056213379, + "step": 9890 + }, + { + "epoch": 0.33351309447571537, + "grad_norm": 33.2651252746582, + "learning_rate": 8.428957140362157e-07, + "logits/chosen": -1.146289587020874, + "logits/rejected": -1.347427487373352, + "logps/chosen": -1.861358404159546, + "logps/rejected": -1.9180434942245483, + "loss": 2.7072, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.613584518432617, + "rewards/margins": 0.5668505430221558, + "rewards/rejected": -19.180435180664062, + "step": 9895 + }, + { + "epoch": 0.3336816205466986, + "grad_norm": 20.580039978027344, + "learning_rate": 8.426815841179451e-07, + "logits/chosen": -0.49518918991088867, + "logits/rejected": -0.4970271587371826, + "logps/chosen": -2.2172446250915527, + "logps/rejected": -2.412372589111328, + "loss": 3.1679, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.172447204589844, + "rewards/margins": 1.9512779712677002, + "rewards/rejected": -24.12372589111328, + "step": 9900 + }, + { + "epoch": 0.33385014661768175, + "grad_norm": 55.29900360107422, + "learning_rate": 8.424673356110792e-07, + "logits/chosen": -1.1283732652664185, + "logits/rejected": -1.0785002708435059, + "logps/chosen": -1.8419973850250244, + "logps/rejected": -1.9012470245361328, + "loss": 2.6125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.419971466064453, + "rewards/margins": 0.5924980044364929, + "rewards/rejected": -19.012470245361328, + "step": 9905 + }, + { + "epoch": 0.3340186726886649, + "grad_norm": 22.117595672607422, + "learning_rate": 8.422529685897614e-07, + "logits/chosen": -0.9310518503189087, + "logits/rejected": -0.8916823267936707, + "logps/chosen": -2.5464320182800293, + "logps/rejected": -2.720090389251709, + "loss": 1.6402, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.46432113647461, + "rewards/margins": 1.7365844249725342, + "rewards/rejected": -27.20090675354004, + "step": 9910 + }, + { + "epoch": 0.33418719875964814, + "grad_norm": 21.287263870239258, + "learning_rate": 8.420384831281752e-07, + "logits/chosen": -1.1691551208496094, + "logits/rejected": -1.24275541305542, + "logps/chosen": -2.466435670852661, + "logps/rejected": -2.7765564918518066, + "loss": 3.6018, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.664356231689453, + "rewards/margins": 3.1012115478515625, + "rewards/rejected": -27.765567779541016, + "step": 9915 + }, + { + "epoch": 0.3343557248306313, + "grad_norm": 48.62921905517578, + "learning_rate": 8.418238793005459e-07, + "logits/chosen": -1.2796119451522827, + "logits/rejected": -1.1867414712905884, + "logps/chosen": -1.770689606666565, + "logps/rejected": -1.7434495687484741, + "loss": 3.3813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.706897735595703, + "rewards/margins": -0.27240076661109924, + "rewards/rejected": -17.43449592590332, + "step": 9920 + }, + { + "epoch": 0.3345242509016145, + "grad_norm": 28.96337127685547, + "learning_rate": 8.416091571811393e-07, + "logits/chosen": -1.2204170227050781, + "logits/rejected": -1.3906219005584717, + "logps/chosen": -1.966515302658081, + "logps/rejected": -1.9123204946517944, + "loss": 4.1011, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.665151596069336, + "rewards/margins": -0.5419479608535767, + "rewards/rejected": -19.123207092285156, + "step": 9925 + }, + { + "epoch": 0.33469277697259764, + "grad_norm": 40.10865783691406, + "learning_rate": 8.413943168442621e-07, + "logits/chosen": -0.6399518251419067, + "logits/rejected": -0.7417412996292114, + "logps/chosen": -1.9978796243667603, + "logps/rejected": -2.1743671894073486, + "loss": 1.7883, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.97879409790039, + "rewards/margins": 1.7648769617080688, + "rewards/rejected": -21.74367332458496, + "step": 9930 + }, + { + "epoch": 0.33486130304358086, + "grad_norm": 12.934712409973145, + "learning_rate": 8.411793583642625e-07, + "logits/chosen": -0.8698463439941406, + "logits/rejected": -1.0204404592514038, + "logps/chosen": -1.9658260345458984, + "logps/rejected": -2.275739908218384, + "loss": 3.0921, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.658262252807617, + "rewards/margins": 3.0991365909576416, + "rewards/rejected": -22.757396697998047, + "step": 9935 + }, + { + "epoch": 0.335029829114564, + "grad_norm": 28.698816299438477, + "learning_rate": 8.409642818155287e-07, + "logits/chosen": -0.9460894465446472, + "logits/rejected": -0.9732720255851746, + "logps/chosen": -1.5314778089523315, + "logps/rejected": -1.5256506204605103, + "loss": 3.1376, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.314778327941895, + "rewards/margins": -0.05827188491821289, + "rewards/rejected": -15.256505012512207, + "step": 9940 + }, + { + "epoch": 0.3351983551855472, + "grad_norm": 24.51972198486328, + "learning_rate": 8.407490872724905e-07, + "logits/chosen": -1.241775393486023, + "logits/rejected": -1.3540207147598267, + "logps/chosen": -1.8327884674072266, + "logps/rejected": -1.8153759241104126, + "loss": 3.6465, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.327884674072266, + "rewards/margins": -0.17412586510181427, + "rewards/rejected": -18.153759002685547, + "step": 9945 + }, + { + "epoch": 0.33536688125653036, + "grad_norm": 38.081138610839844, + "learning_rate": 8.405337748096182e-07, + "logits/chosen": -0.9224987030029297, + "logits/rejected": -1.4343881607055664, + "logps/chosen": -1.85904860496521, + "logps/rejected": -2.256868839263916, + "loss": 1.9235, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.590484619140625, + "rewards/margins": 3.978203535079956, + "rewards/rejected": -22.56869125366211, + "step": 9950 + }, + { + "epoch": 0.3355354073275136, + "grad_norm": 12.562773704528809, + "learning_rate": 8.403183445014228e-07, + "logits/chosen": -0.7454361319541931, + "logits/rejected": -0.9337083101272583, + "logps/chosen": -2.4363226890563965, + "logps/rejected": -2.676544666290283, + "loss": 2.0228, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.36322784423828, + "rewards/margins": 2.402218818664551, + "rewards/rejected": -26.765445709228516, + "step": 9955 + }, + { + "epoch": 0.33570393339849675, + "grad_norm": 24.086362838745117, + "learning_rate": 8.401027964224565e-07, + "logits/chosen": -0.8464582562446594, + "logits/rejected": -0.9272781610488892, + "logps/chosen": -1.9356921911239624, + "logps/rejected": -1.8562870025634766, + "loss": 4.2157, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.356922149658203, + "rewards/margins": -0.7940529584884644, + "rewards/rejected": -18.562870025634766, + "step": 9960 + }, + { + "epoch": 0.3358724594694799, + "grad_norm": 26.327152252197266, + "learning_rate": 8.398871306473118e-07, + "logits/chosen": -0.979174017906189, + "logits/rejected": -0.9731283187866211, + "logps/chosen": -2.057375192642212, + "logps/rejected": -2.0879650115966797, + "loss": 3.4316, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.573749542236328, + "rewards/margins": 0.30589810013771057, + "rewards/rejected": -20.879650115966797, + "step": 9965 + }, + { + "epoch": 0.33604098554046313, + "grad_norm": 20.45331382751465, + "learning_rate": 8.396713472506222e-07, + "logits/chosen": -0.8873946070671082, + "logits/rejected": -0.948246955871582, + "logps/chosen": -1.953375220298767, + "logps/rejected": -2.0850324630737305, + "loss": 2.0067, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.53375244140625, + "rewards/margins": 1.3165717124938965, + "rewards/rejected": -20.850322723388672, + "step": 9970 + }, + { + "epoch": 0.3362095116114463, + "grad_norm": 27.921266555786133, + "learning_rate": 8.394554463070619e-07, + "logits/chosen": -0.7784280776977539, + "logits/rejected": -0.8730652928352356, + "logps/chosen": -2.3976194858551025, + "logps/rejected": -2.5944883823394775, + "loss": 2.3079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.9761962890625, + "rewards/margins": 1.9686893224716187, + "rewards/rejected": -25.94488525390625, + "step": 9975 + }, + { + "epoch": 0.33637803768242946, + "grad_norm": 12.803922653198242, + "learning_rate": 8.392394278913456e-07, + "logits/chosen": -0.8921037912368774, + "logits/rejected": -1.1823089122772217, + "logps/chosen": -2.6166586875915527, + "logps/rejected": -2.4037137031555176, + "loss": 6.1695, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.166589736938477, + "rewards/margins": -2.1294519901275635, + "rewards/rejected": -24.03713607788086, + "step": 9980 + }, + { + "epoch": 0.33654656375341263, + "grad_norm": 44.06391143798828, + "learning_rate": 8.390232920782287e-07, + "logits/chosen": -1.0536354780197144, + "logits/rejected": -1.010083556175232, + "logps/chosen": -1.964163064956665, + "logps/rejected": -2.0906193256378174, + "loss": 2.6179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.641630172729492, + "rewards/margins": 1.264561414718628, + "rewards/rejected": -20.906192779541016, + "step": 9985 + }, + { + "epoch": 0.33671508982439585, + "grad_norm": 39.75322723388672, + "learning_rate": 8.388070389425077e-07, + "logits/chosen": -1.345232605934143, + "logits/rejected": -1.3481756448745728, + "logps/chosen": -1.900757074356079, + "logps/rejected": -1.9350669384002686, + "loss": 3.1176, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.007572174072266, + "rewards/margins": 0.3430987298488617, + "rewards/rejected": -19.350669860839844, + "step": 9990 + }, + { + "epoch": 0.336883615895379, + "grad_norm": 16.268253326416016, + "learning_rate": 8.385906685590187e-07, + "logits/chosen": -0.9428138732910156, + "logits/rejected": -0.8706213235855103, + "logps/chosen": -1.9050085544586182, + "logps/rejected": -2.324378728866577, + "loss": 2.3956, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.050085067749023, + "rewards/margins": 4.193702220916748, + "rewards/rejected": -23.243785858154297, + "step": 9995 + }, + { + "epoch": 0.3370521419663622, + "grad_norm": 28.86384391784668, + "learning_rate": 8.383741810026395e-07, + "logits/chosen": -1.1897058486938477, + "logits/rejected": -1.244680643081665, + "logps/chosen": -1.8175055980682373, + "logps/rejected": -1.7724707126617432, + "loss": 3.772, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.175058364868164, + "rewards/margins": -0.45034971833229065, + "rewards/rejected": -17.724706649780273, + "step": 10000 + }, + { + "epoch": 0.3370521419663622, + "eval_logits/chosen": -1.2586745023727417, + "eval_logits/rejected": -1.3345402479171753, + "eval_logps/chosen": -1.8584271669387817, + "eval_logps/rejected": -1.9249132871627808, + "eval_loss": 3.0772247314453125, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.584270477294922, + "eval_rewards/margins": 0.6648635268211365, + "eval_rewards/rejected": -19.249135971069336, + "eval_runtime": 12.9088, + "eval_samples_per_second": 7.747, + "eval_steps_per_second": 1.937, + "step": 10000 + }, + { + "epoch": 0.33722066803734535, + "grad_norm": 37.60383605957031, + "learning_rate": 8.381575763482875e-07, + "logits/chosen": -0.7130342721939087, + "logits/rejected": -0.5373210906982422, + "logps/chosen": -2.3927738666534424, + "logps/rejected": -2.6733829975128174, + "loss": 2.5409, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.927738189697266, + "rewards/margins": 2.8060927391052246, + "rewards/rejected": -26.733829498291016, + "step": 10005 + }, + { + "epoch": 0.33738919410832857, + "grad_norm": 18.94394874572754, + "learning_rate": 8.379408546709212e-07, + "logits/chosen": -1.180837631225586, + "logits/rejected": -1.2834656238555908, + "logps/chosen": -1.5497967004776, + "logps/rejected": -1.8647041320800781, + "loss": 2.0521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.497967720031738, + "rewards/margins": 3.1490752696990967, + "rewards/rejected": -18.647043228149414, + "step": 10010 + }, + { + "epoch": 0.33755772017931174, + "grad_norm": 27.073068618774414, + "learning_rate": 8.377240160455395e-07, + "logits/chosen": -0.5144739151000977, + "logits/rejected": -0.6853980422019958, + "logps/chosen": -2.464611053466797, + "logps/rejected": -2.574533224105835, + "loss": 2.9053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.64611053466797, + "rewards/margins": 1.099221110343933, + "rewards/rejected": -25.745330810546875, + "step": 10015 + }, + { + "epoch": 0.3377262462502949, + "grad_norm": 22.88643455505371, + "learning_rate": 8.375070605471815e-07, + "logits/chosen": -0.9767011404037476, + "logits/rejected": -0.978759765625, + "logps/chosen": -1.664846658706665, + "logps/rejected": -1.8161695003509521, + "loss": 2.3565, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.648466110229492, + "rewards/margins": 1.5132266283035278, + "rewards/rejected": -18.161693572998047, + "step": 10020 + }, + { + "epoch": 0.3378947723212781, + "grad_norm": 37.77303695678711, + "learning_rate": 8.372899882509273e-07, + "logits/chosen": -0.8196894526481628, + "logits/rejected": -0.847124457359314, + "logps/chosen": -2.192286968231201, + "logps/rejected": -2.450854778289795, + "loss": 2.3799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.922870635986328, + "rewards/margins": 2.5856778621673584, + "rewards/rejected": -24.508548736572266, + "step": 10025 + }, + { + "epoch": 0.3380632983922613, + "grad_norm": 75.76078033447266, + "learning_rate": 8.370727992318967e-07, + "logits/chosen": -1.073168396949768, + "logits/rejected": -1.253846526145935, + "logps/chosen": -2.0942280292510986, + "logps/rejected": -2.1867103576660156, + "loss": 2.9323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.942279815673828, + "rewards/margins": 0.924824059009552, + "rewards/rejected": -21.867103576660156, + "step": 10030 + }, + { + "epoch": 0.33823182446324446, + "grad_norm": 35.56973648071289, + "learning_rate": 8.368554935652503e-07, + "logits/chosen": -0.6315957903862, + "logits/rejected": -0.7099935412406921, + "logps/chosen": -2.129772663116455, + "logps/rejected": -2.2371087074279785, + "loss": 2.8981, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.297725677490234, + "rewards/margins": 1.0733586549758911, + "rewards/rejected": -22.371084213256836, + "step": 10035 + }, + { + "epoch": 0.3384003505342276, + "grad_norm": 41.625614166259766, + "learning_rate": 8.366380713261894e-07, + "logits/chosen": -1.1311523914337158, + "logits/rejected": -1.1663687229156494, + "logps/chosen": -2.106232166290283, + "logps/rejected": -2.3760056495666504, + "loss": 2.496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.062320709228516, + "rewards/margins": 2.69773006439209, + "rewards/rejected": -23.760051727294922, + "step": 10040 + }, + { + "epoch": 0.33856887660521084, + "grad_norm": 56.50038146972656, + "learning_rate": 8.364205325899549e-07, + "logits/chosen": -1.2110131978988647, + "logits/rejected": -1.2723443508148193, + "logps/chosen": -2.363647937774658, + "logps/rejected": -2.497826337814331, + "loss": 2.4119, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.63648223876953, + "rewards/margins": 1.341783881187439, + "rewards/rejected": -24.978261947631836, + "step": 10045 + }, + { + "epoch": 0.338737402676194, + "grad_norm": 22.00423240661621, + "learning_rate": 8.362028774318286e-07, + "logits/chosen": -0.8191970586776733, + "logits/rejected": -0.9766160845756531, + "logps/chosen": -1.750830888748169, + "logps/rejected": -1.8567674160003662, + "loss": 2.2368, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.50830841064453, + "rewards/margins": 1.0593667030334473, + "rewards/rejected": -18.567676544189453, + "step": 10050 + }, + { + "epoch": 0.3389059287471772, + "grad_norm": 29.267391204833984, + "learning_rate": 8.359851059271323e-07, + "logits/chosen": -0.760384738445282, + "logits/rejected": -0.8759638667106628, + "logps/chosen": -2.0324978828430176, + "logps/rejected": -2.2271060943603516, + "loss": 2.3612, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.324981689453125, + "rewards/margins": 1.9460808038711548, + "rewards/rejected": -22.271060943603516, + "step": 10055 + }, + { + "epoch": 0.33907445481816034, + "grad_norm": 24.558929443359375, + "learning_rate": 8.357672181512281e-07, + "logits/chosen": -1.1694022417068481, + "logits/rejected": -1.303847312927246, + "logps/chosen": -1.7414251565933228, + "logps/rejected": -1.8605835437774658, + "loss": 2.5338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.41425132751465, + "rewards/margins": 1.1915841102600098, + "rewards/rejected": -18.6058349609375, + "step": 10060 + }, + { + "epoch": 0.33924298088914356, + "grad_norm": 21.290231704711914, + "learning_rate": 8.355492141795184e-07, + "logits/chosen": -0.7563185691833496, + "logits/rejected": -0.895641028881073, + "logps/chosen": -1.799481749534607, + "logps/rejected": -2.1289761066436768, + "loss": 1.8406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.994815826416016, + "rewards/margins": 3.2949440479278564, + "rewards/rejected": -21.289762496948242, + "step": 10065 + }, + { + "epoch": 0.33941150696012673, + "grad_norm": 19.94627571105957, + "learning_rate": 8.353310940874457e-07, + "logits/chosen": -0.45717424154281616, + "logits/rejected": -0.6762049794197083, + "logps/chosen": -1.7784864902496338, + "logps/rejected": -2.0194575786590576, + "loss": 2.3624, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.784862518310547, + "rewards/margins": 2.4097115993499756, + "rewards/rejected": -20.194576263427734, + "step": 10070 + }, + { + "epoch": 0.3395800330311099, + "grad_norm": 12.628251075744629, + "learning_rate": 8.351128579504929e-07, + "logits/chosen": -1.2598450183868408, + "logits/rejected": -1.4129188060760498, + "logps/chosen": -2.335139036178589, + "logps/rejected": -2.369058609008789, + "loss": 3.2621, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.351390838623047, + "rewards/margins": 0.3391936421394348, + "rewards/rejected": -23.690584182739258, + "step": 10075 + }, + { + "epoch": 0.3397485591020931, + "grad_norm": 25.803693771362305, + "learning_rate": 8.34894505844183e-07, + "logits/chosen": -0.8495704531669617, + "logits/rejected": -0.9760378003120422, + "logps/chosen": -1.7889350652694702, + "logps/rejected": -1.8187023401260376, + "loss": 3.0082, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.88934898376465, + "rewards/margins": 0.2976733148097992, + "rewards/rejected": -18.187023162841797, + "step": 10080 + }, + { + "epoch": 0.3399170851730763, + "grad_norm": 17.753204345703125, + "learning_rate": 8.346760378440787e-07, + "logits/chosen": -1.2412251234054565, + "logits/rejected": -1.4337639808654785, + "logps/chosen": -1.5392677783966064, + "logps/rejected": -1.7446925640106201, + "loss": 1.9918, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.392675399780273, + "rewards/margins": 2.054250955581665, + "rewards/rejected": -17.44692611694336, + "step": 10085 + }, + { + "epoch": 0.34008561124405945, + "grad_norm": 24.789297103881836, + "learning_rate": 8.344574540257836e-07, + "logits/chosen": -1.016185998916626, + "logits/rejected": -0.9818147420883179, + "logps/chosen": -1.6750415563583374, + "logps/rejected": -1.9193319082260132, + "loss": 3.0797, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.750415802001953, + "rewards/margins": 2.442903995513916, + "rewards/rejected": -19.19331932067871, + "step": 10090 + }, + { + "epoch": 0.3402541373150426, + "grad_norm": 5.1002421379089355, + "learning_rate": 8.342387544649407e-07, + "logits/chosen": -0.885150134563446, + "logits/rejected": -1.1269336938858032, + "logps/chosen": -1.7850160598754883, + "logps/rejected": -2.032360792160034, + "loss": 4.2058, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.850162506103516, + "rewards/margins": 2.4734485149383545, + "rewards/rejected": -20.3236083984375, + "step": 10095 + }, + { + "epoch": 0.34042266338602584, + "grad_norm": 33.8403205871582, + "learning_rate": 8.340199392372334e-07, + "logits/chosen": -0.7171911001205444, + "logits/rejected": -0.8975852727890015, + "logps/chosen": -2.01006817817688, + "logps/rejected": -2.1210741996765137, + "loss": 2.4229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.100679397583008, + "rewards/margins": 1.110063910484314, + "rewards/rejected": -21.210742950439453, + "step": 10100 + }, + { + "epoch": 0.340591189457009, + "grad_norm": 26.41214370727539, + "learning_rate": 8.338010084183848e-07, + "logits/chosen": -0.6185327768325806, + "logits/rejected": -0.8199454545974731, + "logps/chosen": -2.1546084880828857, + "logps/rejected": -2.167876720428467, + "loss": 3.3486, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.546085357666016, + "rewards/margins": 0.13268089294433594, + "rewards/rejected": -21.67876625061035, + "step": 10105 + }, + { + "epoch": 0.34075971552799217, + "grad_norm": 92.04045104980469, + "learning_rate": 8.335819620841588e-07, + "logits/chosen": -1.1275532245635986, + "logits/rejected": -0.9211037755012512, + "logps/chosen": -2.427985668182373, + "logps/rejected": -2.0251243114471436, + "loss": 7.0349, + "rewards/accuracies": 0.0, + "rewards/chosen": -24.279855728149414, + "rewards/margins": -4.0286126136779785, + "rewards/rejected": -20.251245498657227, + "step": 10110 + }, + { + "epoch": 0.34092824159897533, + "grad_norm": 55.04663848876953, + "learning_rate": 8.33362800310358e-07, + "logits/chosen": -0.7637117505073547, + "logits/rejected": -0.8645876049995422, + "logps/chosen": -1.7918641567230225, + "logps/rejected": -1.942285180091858, + "loss": 2.2968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.918643951416016, + "rewards/margins": 1.5042095184326172, + "rewards/rejected": -19.4228515625, + "step": 10115 + }, + { + "epoch": 0.34109676766995856, + "grad_norm": 57.4473991394043, + "learning_rate": 8.331435231728261e-07, + "logits/chosen": -0.8943387269973755, + "logits/rejected": -1.140529751777649, + "logps/chosen": -1.9503597021102905, + "logps/rejected": -1.9956696033477783, + "loss": 3.2865, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.503597259521484, + "rewards/margins": 0.4531001150608063, + "rewards/rejected": -19.956695556640625, + "step": 10120 + }, + { + "epoch": 0.3412652937409417, + "grad_norm": 30.372905731201172, + "learning_rate": 8.329241307474462e-07, + "logits/chosen": -0.9090366363525391, + "logits/rejected": -0.9714096784591675, + "logps/chosen": -1.5740294456481934, + "logps/rejected": -1.6546818017959595, + "loss": 2.545, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.740293502807617, + "rewards/margins": 0.8065251111984253, + "rewards/rejected": -16.546817779541016, + "step": 10125 + }, + { + "epoch": 0.3414338198119249, + "grad_norm": 35.968505859375, + "learning_rate": 8.327046231101413e-07, + "logits/chosen": -0.9563786387443542, + "logits/rejected": -1.0514451265335083, + "logps/chosen": -2.135432720184326, + "logps/rejected": -2.0912060737609863, + "loss": 3.5679, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.354328155517578, + "rewards/margins": -0.4422665536403656, + "rewards/rejected": -20.91206169128418, + "step": 10130 + }, + { + "epoch": 0.3416023458829081, + "grad_norm": 27.77577018737793, + "learning_rate": 8.324850003368744e-07, + "logits/chosen": -1.0580577850341797, + "logits/rejected": -1.0771617889404297, + "logps/chosen": -2.0225207805633545, + "logps/rejected": -1.999882698059082, + "loss": 3.5039, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.225208282470703, + "rewards/margins": -0.22638091444969177, + "rewards/rejected": -19.998825073242188, + "step": 10135 + }, + { + "epoch": 0.3417708719538913, + "grad_norm": 76.16617584228516, + "learning_rate": 8.322652625036482e-07, + "logits/chosen": -0.9272229075431824, + "logits/rejected": -0.9870445132255554, + "logps/chosen": -1.9856401681900024, + "logps/rejected": -2.3070406913757324, + "loss": 1.934, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.856403350830078, + "rewards/margins": 3.2140049934387207, + "rewards/rejected": -23.07040786743164, + "step": 10140 + }, + { + "epoch": 0.34193939802487444, + "grad_norm": 23.884069442749023, + "learning_rate": 8.320454096865054e-07, + "logits/chosen": -0.785285472869873, + "logits/rejected": -0.854631245136261, + "logps/chosen": -1.846387505531311, + "logps/rejected": -1.9219926595687866, + "loss": 2.7894, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.46387481689453, + "rewards/margins": 0.7560516595840454, + "rewards/rejected": -19.219926834106445, + "step": 10145 + }, + { + "epoch": 0.3421079240958576, + "grad_norm": 32.95330047607422, + "learning_rate": 8.318254419615283e-07, + "logits/chosen": -0.9430086016654968, + "logits/rejected": -1.0663788318634033, + "logps/chosen": -1.7976980209350586, + "logps/rejected": -2.0233747959136963, + "loss": 1.4286, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.976980209350586, + "rewards/margins": 2.2567665576934814, + "rewards/rejected": -20.233745574951172, + "step": 10150 + }, + { + "epoch": 0.34227645016684083, + "grad_norm": 25.350078582763672, + "learning_rate": 8.316053594048394e-07, + "logits/chosen": -0.994820237159729, + "logits/rejected": -1.3681151866912842, + "logps/chosen": -2.111388683319092, + "logps/rejected": -2.4443917274475098, + "loss": 1.6322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.1138858795166, + "rewards/margins": 3.3300297260284424, + "rewards/rejected": -24.44391632080078, + "step": 10155 + }, + { + "epoch": 0.342444976237824, + "grad_norm": 19.32894515991211, + "learning_rate": 8.313851620926e-07, + "logits/chosen": -0.9013049006462097, + "logits/rejected": -0.9923794865608215, + "logps/chosen": -1.902090072631836, + "logps/rejected": -1.9696989059448242, + "loss": 2.9091, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.020898818969727, + "rewards/margins": 0.6760891079902649, + "rewards/rejected": -19.696989059448242, + "step": 10160 + }, + { + "epoch": 0.34261350230880716, + "grad_norm": 24.06064224243164, + "learning_rate": 8.311648501010122e-07, + "logits/chosen": -1.1217981576919556, + "logits/rejected": -1.0748631954193115, + "logps/chosen": -2.2157540321350098, + "logps/rejected": -2.2380218505859375, + "loss": 3.0986, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.15753936767578, + "rewards/margins": 0.22267866134643555, + "rewards/rejected": -22.380218505859375, + "step": 10165 + }, + { + "epoch": 0.3427820283797903, + "grad_norm": 26.289478302001953, + "learning_rate": 8.309444235063172e-07, + "logits/chosen": -0.9130669832229614, + "logits/rejected": -0.7826474905014038, + "logps/chosen": -1.8189846277236938, + "logps/rejected": -1.9442589282989502, + "loss": 2.3744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.18984603881836, + "rewards/margins": 1.2527434825897217, + "rewards/rejected": -19.442588806152344, + "step": 10170 + }, + { + "epoch": 0.34295055445077355, + "grad_norm": 28.468158721923828, + "learning_rate": 8.307238823847959e-07, + "logits/chosen": -0.6284016370773315, + "logits/rejected": -0.5836378931999207, + "logps/chosen": -1.7535285949707031, + "logps/rejected": -1.9830653667449951, + "loss": 1.7572, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.53528594970703, + "rewards/margins": 2.295367956161499, + "rewards/rejected": -19.83065414428711, + "step": 10175 + }, + { + "epoch": 0.3431190805217567, + "grad_norm": 14.106034278869629, + "learning_rate": 8.30503226812769e-07, + "logits/chosen": -0.8923094868659973, + "logits/rejected": -0.9873468279838562, + "logps/chosen": -2.0170390605926514, + "logps/rejected": -2.203902006149292, + "loss": 2.0167, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.17038917541504, + "rewards/margins": 1.8686323165893555, + "rewards/rejected": -22.03902244567871, + "step": 10180 + }, + { + "epoch": 0.3432876065927399, + "grad_norm": 32.9178352355957, + "learning_rate": 8.302824568665965e-07, + "logits/chosen": -1.0593280792236328, + "logits/rejected": -1.00070321559906, + "logps/chosen": -2.128171920776367, + "logps/rejected": -1.895475149154663, + "loss": 5.4176, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.281719207763672, + "rewards/margins": -2.326968193054199, + "rewards/rejected": -18.95475196838379, + "step": 10185 + }, + { + "epoch": 0.3434561326637231, + "grad_norm": 26.191192626953125, + "learning_rate": 8.300615726226783e-07, + "logits/chosen": -1.1325823068618774, + "logits/rejected": -0.9920172691345215, + "logps/chosen": -1.9885280132293701, + "logps/rejected": -2.0667898654937744, + "loss": 3.2633, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.88528060913086, + "rewards/margins": 0.7826164960861206, + "rewards/rejected": -20.667896270751953, + "step": 10190 + }, + { + "epoch": 0.34362465873470627, + "grad_norm": 18.76409149169922, + "learning_rate": 8.298405741574537e-07, + "logits/chosen": -1.216739296913147, + "logits/rejected": -1.285988211631775, + "logps/chosen": -1.7787593603134155, + "logps/rejected": -1.7824738025665283, + "loss": 3.2125, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.787593841552734, + "rewards/margins": 0.037142276763916016, + "rewards/rejected": -17.824735641479492, + "step": 10195 + }, + { + "epoch": 0.34379318480568943, + "grad_norm": 37.59312057495117, + "learning_rate": 8.296194615474014e-07, + "logits/chosen": -1.0172761678695679, + "logits/rejected": -1.1041208505630493, + "logps/chosen": -2.025296926498413, + "logps/rejected": -2.2218832969665527, + "loss": 2.0428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.252967834472656, + "rewards/margins": 1.9658634662628174, + "rewards/rejected": -22.218830108642578, + "step": 10200 + }, + { + "epoch": 0.3439617108766726, + "grad_norm": 26.07461929321289, + "learning_rate": 8.293982348690402e-07, + "logits/chosen": -1.3663597106933594, + "logits/rejected": -1.3732165098190308, + "logps/chosen": -1.9943389892578125, + "logps/rejected": -1.8807373046875, + "loss": 4.3672, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.943389892578125, + "rewards/margins": -1.1360156536102295, + "rewards/rejected": -18.807373046875, + "step": 10205 + }, + { + "epoch": 0.3441302369476558, + "grad_norm": 16.750768661499023, + "learning_rate": 8.291768941989277e-07, + "logits/chosen": -1.353293776512146, + "logits/rejected": -1.2948862314224243, + "logps/chosen": -1.8148410320281982, + "logps/rejected": -1.8855838775634766, + "loss": 2.4624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.14841079711914, + "rewards/margins": 0.7074286341667175, + "rewards/rejected": -18.855838775634766, + "step": 10210 + }, + { + "epoch": 0.344298763018639, + "grad_norm": 50.30863571166992, + "learning_rate": 8.289554396136611e-07, + "logits/chosen": -0.7885207533836365, + "logits/rejected": -0.789328932762146, + "logps/chosen": -2.4931399822235107, + "logps/rejected": -2.1422553062438965, + "loss": 6.6146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.9314022064209, + "rewards/margins": -3.508845567703247, + "rewards/rejected": -21.422555923461914, + "step": 10215 + }, + { + "epoch": 0.34446728908962215, + "grad_norm": 38.200897216796875, + "learning_rate": 8.287338711898771e-07, + "logits/chosen": -0.6832276582717896, + "logits/rejected": -0.6595412492752075, + "logps/chosen": -2.279621124267578, + "logps/rejected": -2.7458064556121826, + "loss": 3.0489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.79621124267578, + "rewards/margins": 4.661856651306152, + "rewards/rejected": -27.45806884765625, + "step": 10220 + }, + { + "epoch": 0.3446358151606053, + "grad_norm": 24.22657585144043, + "learning_rate": 8.28512189004252e-07, + "logits/chosen": -0.6188753843307495, + "logits/rejected": -0.6037112474441528, + "logps/chosen": -1.9316953420639038, + "logps/rejected": -2.0164954662323, + "loss": 2.4917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.316951751708984, + "rewards/margins": 0.8480027318000793, + "rewards/rejected": -20.16495704650879, + "step": 10225 + }, + { + "epoch": 0.34480434123158854, + "grad_norm": 46.43810272216797, + "learning_rate": 8.28290393133501e-07, + "logits/chosen": -0.9024465680122375, + "logits/rejected": -1.0171552896499634, + "logps/chosen": -2.0227932929992676, + "logps/rejected": -2.0649027824401855, + "loss": 3.14, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.22793197631836, + "rewards/margins": 0.4210955500602722, + "rewards/rejected": -20.649028778076172, + "step": 10230 + }, + { + "epoch": 0.3449728673025717, + "grad_norm": 22.429914474487305, + "learning_rate": 8.280684836543793e-07, + "logits/chosen": -1.074040412902832, + "logits/rejected": -1.2552762031555176, + "logps/chosen": -1.6524499654769897, + "logps/rejected": -1.741180419921875, + "loss": 2.6131, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.524497985839844, + "rewards/margins": 0.8873060941696167, + "rewards/rejected": -17.41180419921875, + "step": 10235 + }, + { + "epoch": 0.3451413933735549, + "grad_norm": 75.95999908447266, + "learning_rate": 8.278464606436807e-07, + "logits/chosen": -0.4927440285682678, + "logits/rejected": -0.4743828773498535, + "logps/chosen": -2.1160855293273926, + "logps/rejected": -2.2202095985412598, + "loss": 2.5375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.16085433959961, + "rewards/margins": 1.0412417650222778, + "rewards/rejected": -22.20209503173828, + "step": 10240 + }, + { + "epoch": 0.3453099194445381, + "grad_norm": 30.984689712524414, + "learning_rate": 8.276243241782386e-07, + "logits/chosen": -0.9812358617782593, + "logits/rejected": -1.0138499736785889, + "logps/chosen": -1.5530610084533691, + "logps/rejected": -1.5287425518035889, + "loss": 3.5052, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.530611038208008, + "rewards/margins": -0.24318504333496094, + "rewards/rejected": -15.28742504119873, + "step": 10245 + }, + { + "epoch": 0.34547844551552126, + "grad_norm": 55.070709228515625, + "learning_rate": 8.27402074334926e-07, + "logits/chosen": -0.8449915051460266, + "logits/rejected": -0.9934304356575012, + "logps/chosen": -1.7992223501205444, + "logps/rejected": -1.8660894632339478, + "loss": 2.9444, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.992223739624023, + "rewards/margins": 0.6686684489250183, + "rewards/rejected": -18.660892486572266, + "step": 10250 + }, + { + "epoch": 0.3456469715865044, + "grad_norm": 34.87515640258789, + "learning_rate": 8.271797111906542e-07, + "logits/chosen": -0.8565098643302917, + "logits/rejected": -0.9601860046386719, + "logps/chosen": -2.121169328689575, + "logps/rejected": -2.3623504638671875, + "loss": 1.9052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.21169090270996, + "rewards/margins": 2.411813974380493, + "rewards/rejected": -23.62350845336914, + "step": 10255 + }, + { + "epoch": 0.3458154976574876, + "grad_norm": 24.82640838623047, + "learning_rate": 8.26957234822375e-07, + "logits/chosen": -0.8380298614501953, + "logits/rejected": -1.002062439918518, + "logps/chosen": -1.8750407695770264, + "logps/rejected": -2.041203022003174, + "loss": 1.8186, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.750408172607422, + "rewards/margins": 1.6616241931915283, + "rewards/rejected": -20.412031173706055, + "step": 10260 + }, + { + "epoch": 0.3459840237284708, + "grad_norm": 25.13413429260254, + "learning_rate": 8.267346453070785e-07, + "logits/chosen": -1.0303947925567627, + "logits/rejected": -0.9507439732551575, + "logps/chosen": -1.8449833393096924, + "logps/rejected": -1.8964245319366455, + "loss": 3.5502, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.449832916259766, + "rewards/margins": 0.5144118070602417, + "rewards/rejected": -18.964242935180664, + "step": 10265 + }, + { + "epoch": 0.346152549799454, + "grad_norm": 33.180355072021484, + "learning_rate": 8.265119427217939e-07, + "logits/chosen": -1.1469266414642334, + "logits/rejected": -1.0809723138809204, + "logps/chosen": -1.9312407970428467, + "logps/rejected": -1.9858585596084595, + "loss": 3.6962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.312410354614258, + "rewards/margins": 0.5461755990982056, + "rewards/rejected": -19.858585357666016, + "step": 10270 + }, + { + "epoch": 0.34632107587043715, + "grad_norm": 27.01608657836914, + "learning_rate": 8.262891271435901e-07, + "logits/chosen": -0.7008494138717651, + "logits/rejected": -0.694682240486145, + "logps/chosen": -1.4841798543930054, + "logps/rejected": -1.623469591140747, + "loss": 2.1888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.841798782348633, + "rewards/margins": 1.3928959369659424, + "rewards/rejected": -16.234695434570312, + "step": 10275 + }, + { + "epoch": 0.3464896019414203, + "grad_norm": 27.898277282714844, + "learning_rate": 8.260661986495748e-07, + "logits/chosen": -0.9827602505683899, + "logits/rejected": -1.0116498470306396, + "logps/chosen": -1.5244865417480469, + "logps/rejected": -1.416467308998108, + "loss": 4.1803, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.244865417480469, + "rewards/margins": -1.0801918506622314, + "rewards/rejected": -14.1646728515625, + "step": 10280 + }, + { + "epoch": 0.34665812801240353, + "grad_norm": 16.065555572509766, + "learning_rate": 8.258431573168944e-07, + "logits/chosen": -0.8114410638809204, + "logits/rejected": -0.6855028867721558, + "logps/chosen": -2.205500841140747, + "logps/rejected": -2.2870655059814453, + "loss": 3.2703, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.055007934570312, + "rewards/margins": 0.8156498670578003, + "rewards/rejected": -22.870656967163086, + "step": 10285 + }, + { + "epoch": 0.3468266540833867, + "grad_norm": 29.678882598876953, + "learning_rate": 8.25620003222735e-07, + "logits/chosen": -1.0882136821746826, + "logits/rejected": -1.0546444654464722, + "logps/chosen": -1.7505064010620117, + "logps/rejected": -1.8048797845840454, + "loss": 2.6905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.505064010620117, + "rewards/margins": 0.5437334179878235, + "rewards/rejected": -18.048799514770508, + "step": 10290 + }, + { + "epoch": 0.34699518015436986, + "grad_norm": 17.689411163330078, + "learning_rate": 8.253967364443214e-07, + "logits/chosen": -0.49934762716293335, + "logits/rejected": -0.5486842393875122, + "logps/chosen": -2.119231939315796, + "logps/rejected": -2.325207233428955, + "loss": 2.5515, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.192317962646484, + "rewards/margins": 2.0597527027130127, + "rewards/rejected": -23.252071380615234, + "step": 10295 + }, + { + "epoch": 0.3471637062253531, + "grad_norm": 30.368303298950195, + "learning_rate": 8.251733570589176e-07, + "logits/chosen": -0.9879854321479797, + "logits/rejected": -1.0501902103424072, + "logps/chosen": -2.1748950481414795, + "logps/rejected": -2.3603451251983643, + "loss": 2.5822, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.748950958251953, + "rewards/margins": 1.8544994592666626, + "rewards/rejected": -23.603452682495117, + "step": 10300 + }, + { + "epoch": 0.34733223229633625, + "grad_norm": 62.947410583496094, + "learning_rate": 8.249498651438261e-07, + "logits/chosen": -1.065384864807129, + "logits/rejected": -1.0049973726272583, + "logps/chosen": -2.1836864948272705, + "logps/rejected": -2.3289408683776855, + "loss": 2.5892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.836864471435547, + "rewards/margins": 1.452542781829834, + "rewards/rejected": -23.289409637451172, + "step": 10305 + }, + { + "epoch": 0.3475007583673194, + "grad_norm": 17.733478546142578, + "learning_rate": 8.247262607763887e-07, + "logits/chosen": -1.1448280811309814, + "logits/rejected": -1.433410882949829, + "logps/chosen": -1.7068363428115845, + "logps/rejected": -1.804264783859253, + "loss": 2.2979, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.068363189697266, + "rewards/margins": 0.9742859601974487, + "rewards/rejected": -18.042648315429688, + "step": 10310 + }, + { + "epoch": 0.3476692844383026, + "grad_norm": 24.782636642456055, + "learning_rate": 8.245025440339864e-07, + "logits/chosen": -0.9603859186172485, + "logits/rejected": -0.9498102068901062, + "logps/chosen": -2.2697396278381348, + "logps/rejected": -2.155381441116333, + "loss": 4.5072, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.69739532470703, + "rewards/margins": -1.1435810327529907, + "rewards/rejected": -21.553813934326172, + "step": 10315 + }, + { + "epoch": 0.3478378105092858, + "grad_norm": 27.691980361938477, + "learning_rate": 8.242787149940382e-07, + "logits/chosen": -1.1349749565124512, + "logits/rejected": -1.1333445310592651, + "logps/chosen": -2.019230365753174, + "logps/rejected": -1.935373067855835, + "loss": 3.9671, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.192302703857422, + "rewards/margins": -0.8385698199272156, + "rewards/rejected": -19.35373306274414, + "step": 10320 + }, + { + "epoch": 0.34800633658026897, + "grad_norm": 15.423532485961914, + "learning_rate": 8.24054773734003e-07, + "logits/chosen": -0.9372714757919312, + "logits/rejected": -0.927697479724884, + "logps/chosen": -2.0240514278411865, + "logps/rejected": -2.0457513332366943, + "loss": 3.1761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.240514755249023, + "rewards/margins": 0.21699972450733185, + "rewards/rejected": -20.4575138092041, + "step": 10325 + }, + { + "epoch": 0.34817486265125214, + "grad_norm": 23.208126068115234, + "learning_rate": 8.238307203313779e-07, + "logits/chosen": -1.2076337337493896, + "logits/rejected": -1.4667056798934937, + "logps/chosen": -1.6836729049682617, + "logps/rejected": -1.6917756795883179, + "loss": 3.1361, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.836727142333984, + "rewards/margins": 0.08102655410766602, + "rewards/rejected": -16.91775894165039, + "step": 10330 + }, + { + "epoch": 0.3483433887222353, + "grad_norm": 40.44734191894531, + "learning_rate": 8.236065548636987e-07, + "logits/chosen": -0.9224785566329956, + "logits/rejected": -1.0374819040298462, + "logps/chosen": -2.0995991230010986, + "logps/rejected": -2.18255877494812, + "loss": 2.4162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.995990753173828, + "rewards/margins": 0.8295975923538208, + "rewards/rejected": -21.82558822631836, + "step": 10335 + }, + { + "epoch": 0.3485119147932185, + "grad_norm": 45.3128662109375, + "learning_rate": 8.233822774085406e-07, + "logits/chosen": -1.0633924007415771, + "logits/rejected": -1.2170779705047607, + "logps/chosen": -1.716073751449585, + "logps/rejected": -1.9018266201019287, + "loss": 2.2667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.16073989868164, + "rewards/margins": 1.8575260639190674, + "rewards/rejected": -19.018264770507812, + "step": 10340 + }, + { + "epoch": 0.3486804408642017, + "grad_norm": 25.652551651000977, + "learning_rate": 8.231578880435172e-07, + "logits/chosen": -0.7540196180343628, + "logits/rejected": -1.001956582069397, + "logps/chosen": -1.909753441810608, + "logps/rejected": -1.8614799976348877, + "loss": 3.7268, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.0975341796875, + "rewards/margins": -0.4827335476875305, + "rewards/rejected": -18.61480140686035, + "step": 10345 + }, + { + "epoch": 0.34884896693518486, + "grad_norm": 19.225238800048828, + "learning_rate": 8.229333868462804e-07, + "logits/chosen": -0.4429520070552826, + "logits/rejected": -0.4661738872528076, + "logps/chosen": -1.8548179864883423, + "logps/rejected": -2.0106232166290283, + "loss": 2.276, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.548179626464844, + "rewards/margins": 1.5580523014068604, + "rewards/rejected": -20.106231689453125, + "step": 10350 + }, + { + "epoch": 0.3490174930061681, + "grad_norm": 40.73187255859375, + "learning_rate": 8.227087738945216e-07, + "logits/chosen": -0.9383414387702942, + "logits/rejected": -1.0112953186035156, + "logps/chosen": -2.1667134761810303, + "logps/rejected": -2.1280548572540283, + "loss": 3.9253, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.66713523864746, + "rewards/margins": -0.3865896165370941, + "rewards/rejected": -21.280548095703125, + "step": 10355 + }, + { + "epoch": 0.34918601907715124, + "grad_norm": 25.78534698486328, + "learning_rate": 8.224840492659704e-07, + "logits/chosen": -1.1125036478042603, + "logits/rejected": -0.9585012197494507, + "logps/chosen": -2.114788770675659, + "logps/rejected": -2.162379503250122, + "loss": 3.0587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.14788818359375, + "rewards/margins": 0.4759071469306946, + "rewards/rejected": -21.623794555664062, + "step": 10360 + }, + { + "epoch": 0.3493545451481344, + "grad_norm": 33.07319259643555, + "learning_rate": 8.22259213038395e-07, + "logits/chosen": -1.2760889530181885, + "logits/rejected": -1.3612782955169678, + "logps/chosen": -1.783482551574707, + "logps/rejected": -1.891667366027832, + "loss": 2.4565, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.834823608398438, + "rewards/margins": 1.0818490982055664, + "rewards/rejected": -18.91667366027832, + "step": 10365 + }, + { + "epoch": 0.3495230712191176, + "grad_norm": 60.872222900390625, + "learning_rate": 8.220342652896026e-07, + "logits/chosen": -1.4350082874298096, + "logits/rejected": -1.29402756690979, + "logps/chosen": -2.09328031539917, + "logps/rejected": -2.333141803741455, + "loss": 2.3236, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.932804107666016, + "rewards/margins": 2.398613691329956, + "rewards/rejected": -23.331417083740234, + "step": 10370 + }, + { + "epoch": 0.3496915972901008, + "grad_norm": 27.59153175354004, + "learning_rate": 8.218092060974385e-07, + "logits/chosen": -1.290093183517456, + "logits/rejected": -0.9906972050666809, + "logps/chosen": -2.023043394088745, + "logps/rejected": -2.148740768432617, + "loss": 2.9719, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.23043441772461, + "rewards/margins": 1.256973385810852, + "rewards/rejected": -21.487407684326172, + "step": 10375 + }, + { + "epoch": 0.34986012336108396, + "grad_norm": 34.23496627807617, + "learning_rate": 8.215840355397871e-07, + "logits/chosen": -0.91291743516922, + "logits/rejected": -1.5867294073104858, + "logps/chosen": -1.6097930669784546, + "logps/rejected": -2.0119924545288086, + "loss": 2.237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.097929000854492, + "rewards/margins": 4.021994113922119, + "rewards/rejected": -20.119922637939453, + "step": 10380 + }, + { + "epoch": 0.35002864943206713, + "grad_norm": 23.3467960357666, + "learning_rate": 8.213587536945708e-07, + "logits/chosen": -0.9239813685417175, + "logits/rejected": -0.8876129984855652, + "logps/chosen": -1.8807910680770874, + "logps/rejected": -2.263803243637085, + "loss": 2.5773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.80790901184082, + "rewards/margins": 3.8301239013671875, + "rewards/rejected": -22.638032913208008, + "step": 10385 + }, + { + "epoch": 0.3501971755030503, + "grad_norm": 21.498138427734375, + "learning_rate": 8.211333606397508e-07, + "logits/chosen": -1.2303471565246582, + "logits/rejected": -1.2403428554534912, + "logps/chosen": -1.7818397283554077, + "logps/rejected": -1.784547209739685, + "loss": 3.0743, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.81839942932129, + "rewards/margins": 0.027074432000517845, + "rewards/rejected": -17.845470428466797, + "step": 10390 + }, + { + "epoch": 0.3503657015740335, + "grad_norm": 33.57012939453125, + "learning_rate": 8.209078564533269e-07, + "logits/chosen": -0.9230928421020508, + "logits/rejected": -1.0170118808746338, + "logps/chosen": -1.7714271545410156, + "logps/rejected": -1.8767017126083374, + "loss": 2.7308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.71427345275879, + "rewards/margins": 1.0527466535568237, + "rewards/rejected": -18.767019271850586, + "step": 10395 + }, + { + "epoch": 0.3505342276450167, + "grad_norm": 25.153078079223633, + "learning_rate": 8.206822412133372e-07, + "logits/chosen": -0.9156519174575806, + "logits/rejected": -1.0805760622024536, + "logps/chosen": -1.826174020767212, + "logps/rejected": -1.9233871698379517, + "loss": 2.7414, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.261737823486328, + "rewards/margins": 0.9721338152885437, + "rewards/rejected": -19.233871459960938, + "step": 10400 + }, + { + "epoch": 0.3505342276450167, + "eval_logits/chosen": -1.300429105758667, + "eval_logits/rejected": -1.3824095726013184, + "eval_logps/chosen": -1.8830461502075195, + "eval_logps/rejected": -1.9594608545303345, + "eval_loss": 3.055062770843506, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -18.830459594726562, + "eval_rewards/margins": 0.7641494870185852, + "eval_rewards/rejected": -19.594608306884766, + "eval_runtime": 12.9087, + "eval_samples_per_second": 7.747, + "eval_steps_per_second": 1.937, + "step": 10400 + }, + { + "epoch": 0.35070275371599985, + "grad_norm": 25.646238327026367, + "learning_rate": 8.204565149978582e-07, + "logits/chosen": -0.9649211168289185, + "logits/rejected": -1.0713859796524048, + "logps/chosen": -2.2169041633605957, + "logps/rejected": -2.3779962062835693, + "loss": 3.2058, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.16904067993164, + "rewards/margins": 1.61092209815979, + "rewards/rejected": -23.779964447021484, + "step": 10405 + }, + { + "epoch": 0.35087127978698307, + "grad_norm": 26.233304977416992, + "learning_rate": 8.202306778850048e-07, + "logits/chosen": -0.3807659447193146, + "logits/rejected": -0.40060725808143616, + "logps/chosen": -2.0837881565093994, + "logps/rejected": -2.195038080215454, + "loss": 3.0738, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.837881088256836, + "rewards/margins": 1.1124993562698364, + "rewards/rejected": -21.950382232666016, + "step": 10410 + }, + { + "epoch": 0.35103980585796624, + "grad_norm": 54.93354034423828, + "learning_rate": 8.200047299529305e-07, + "logits/chosen": -0.7190951108932495, + "logits/rejected": -0.6860246658325195, + "logps/chosen": -2.145395278930664, + "logps/rejected": -2.186631679534912, + "loss": 3.2817, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.45395278930664, + "rewards/margins": 0.41236335039138794, + "rewards/rejected": -21.866313934326172, + "step": 10415 + }, + { + "epoch": 0.3512083319289494, + "grad_norm": 23.91092872619629, + "learning_rate": 8.197786712798265e-07, + "logits/chosen": -0.7775014638900757, + "logits/rejected": -0.8118532299995422, + "logps/chosen": -1.7293260097503662, + "logps/rejected": -1.9149547815322876, + "loss": 2.2801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.293262481689453, + "rewards/margins": 1.8562860488891602, + "rewards/rejected": -19.149547576904297, + "step": 10420 + }, + { + "epoch": 0.35137685799993257, + "grad_norm": 16.439292907714844, + "learning_rate": 8.195525019439236e-07, + "logits/chosen": -1.0097054243087769, + "logits/rejected": -1.0496337413787842, + "logps/chosen": -1.8205091953277588, + "logps/rejected": -1.7729756832122803, + "loss": 3.6208, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.205089569091797, + "rewards/margins": -0.4753352999687195, + "rewards/rejected": -17.729755401611328, + "step": 10425 + }, + { + "epoch": 0.3515453840709158, + "grad_norm": 98.25023651123047, + "learning_rate": 8.193262220234894e-07, + "logits/chosen": -1.0923794507980347, + "logits/rejected": -0.9958206415176392, + "logps/chosen": -2.3235385417938232, + "logps/rejected": -2.1468660831451416, + "loss": 4.7969, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -23.23538589477539, + "rewards/margins": -1.7667248249053955, + "rewards/rejected": -21.46866226196289, + "step": 10430 + }, + { + "epoch": 0.35171391014189896, + "grad_norm": 16.97013282775879, + "learning_rate": 8.190998315968306e-07, + "logits/chosen": -1.2971888780593872, + "logits/rejected": -1.257934808731079, + "logps/chosen": -1.5942026376724243, + "logps/rejected": -1.7968189716339111, + "loss": 1.7453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.942026138305664, + "rewards/margins": 2.026162624359131, + "rewards/rejected": -17.968189239501953, + "step": 10435 + }, + { + "epoch": 0.3518824362128821, + "grad_norm": 31.861385345458984, + "learning_rate": 8.188733307422923e-07, + "logits/chosen": -1.2741501331329346, + "logits/rejected": -1.0171701908111572, + "logps/chosen": -2.095390796661377, + "logps/rejected": -2.1119773387908936, + "loss": 3.6509, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.953907012939453, + "rewards/margins": 0.16586685180664062, + "rewards/rejected": -21.11977195739746, + "step": 10440 + }, + { + "epoch": 0.3520509622838653, + "grad_norm": 46.652584075927734, + "learning_rate": 8.186467195382572e-07, + "logits/chosen": -1.0171384811401367, + "logits/rejected": -1.2291090488433838, + "logps/chosen": -1.9784443378448486, + "logps/rejected": -2.19016695022583, + "loss": 2.1259, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.784442901611328, + "rewards/margins": 2.1172266006469727, + "rewards/rejected": -21.901668548583984, + "step": 10445 + }, + { + "epoch": 0.3522194883548485, + "grad_norm": 24.156171798706055, + "learning_rate": 8.184199980631467e-07, + "logits/chosen": -0.7162508964538574, + "logits/rejected": -0.8358560800552368, + "logps/chosen": -2.1988096237182617, + "logps/rejected": -2.6243538856506348, + "loss": 1.7779, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.988094329833984, + "rewards/margins": 4.255441188812256, + "rewards/rejected": -26.2435359954834, + "step": 10450 + }, + { + "epoch": 0.3523880144258317, + "grad_norm": 32.38210678100586, + "learning_rate": 8.181931663954201e-07, + "logits/chosen": -0.7578636407852173, + "logits/rejected": -0.8377211689949036, + "logps/chosen": -1.8838396072387695, + "logps/rejected": -2.02470064163208, + "loss": 1.9536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.838396072387695, + "rewards/margins": 1.4086089134216309, + "rewards/rejected": -20.24700355529785, + "step": 10455 + }, + { + "epoch": 0.35255654049681484, + "grad_norm": 31.619720458984375, + "learning_rate": 8.17966224613575e-07, + "logits/chosen": -0.8775178790092468, + "logits/rejected": -1.0011556148529053, + "logps/chosen": -1.9142124652862549, + "logps/rejected": -2.068910598754883, + "loss": 2.953, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.142126083374023, + "rewards/margins": 1.5469805002212524, + "rewards/rejected": -20.689105987548828, + "step": 10460 + }, + { + "epoch": 0.35272506656779806, + "grad_norm": 13.446866989135742, + "learning_rate": 8.177391727961469e-07, + "logits/chosen": -0.8765958547592163, + "logits/rejected": -1.1049778461456299, + "logps/chosen": -2.0977909564971924, + "logps/rejected": -2.3150532245635986, + "loss": 2.0321, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.977909088134766, + "rewards/margins": 2.172624349594116, + "rewards/rejected": -23.150531768798828, + "step": 10465 + }, + { + "epoch": 0.35289359263878123, + "grad_norm": 21.661466598510742, + "learning_rate": 8.175120110217095e-07, + "logits/chosen": -0.9677863121032715, + "logits/rejected": -1.0532230138778687, + "logps/chosen": -2.0553691387176514, + "logps/rejected": -2.2560195922851562, + "loss": 2.1423, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.553691864013672, + "rewards/margins": 2.0065035820007324, + "rewards/rejected": -22.560195922851562, + "step": 10470 + }, + { + "epoch": 0.3530621187097644, + "grad_norm": 10.768431663513184, + "learning_rate": 8.172847393688747e-07, + "logits/chosen": -0.6174628734588623, + "logits/rejected": -0.8380719423294067, + "logps/chosen": -1.5090444087982178, + "logps/rejected": -1.721604347229004, + "loss": 3.0297, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.090444564819336, + "rewards/margins": 2.1255993843078613, + "rewards/rejected": -17.216045379638672, + "step": 10475 + }, + { + "epoch": 0.35323064478074756, + "grad_norm": 27.460527420043945, + "learning_rate": 8.170573579162918e-07, + "logits/chosen": -0.5143523812294006, + "logits/rejected": -0.6044338941574097, + "logps/chosen": -1.9238542318344116, + "logps/rejected": -2.391577959060669, + "loss": 1.76, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.238540649414062, + "rewards/margins": 4.6772356033325195, + "rewards/rejected": -23.9157772064209, + "step": 10480 + }, + { + "epoch": 0.3533991708517308, + "grad_norm": 17.713415145874023, + "learning_rate": 8.168298667426492e-07, + "logits/chosen": -0.8852537870407104, + "logits/rejected": -0.9979242086410522, + "logps/chosen": -2.6422157287597656, + "logps/rejected": -2.943582534790039, + "loss": 2.0315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.42215919494629, + "rewards/margins": 3.0136685371398926, + "rewards/rejected": -29.435827255249023, + "step": 10485 + }, + { + "epoch": 0.35356769692271395, + "grad_norm": 39.909637451171875, + "learning_rate": 8.166022659266722e-07, + "logits/chosen": -1.2096792459487915, + "logits/rejected": -1.0006765127182007, + "logps/chosen": -1.7474887371063232, + "logps/rejected": -1.59381103515625, + "loss": 5.1208, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.47488784790039, + "rewards/margins": -1.5367769002914429, + "rewards/rejected": -15.9381103515625, + "step": 10490 + }, + { + "epoch": 0.3537362229936971, + "grad_norm": 13.468647956848145, + "learning_rate": 8.163745555471246e-07, + "logits/chosen": -1.0280694961547852, + "logits/rejected": -1.1094509363174438, + "logps/chosen": -1.7760365009307861, + "logps/rejected": -2.0079100131988525, + "loss": 1.5435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.760364532470703, + "rewards/margins": 2.3187355995178223, + "rewards/rejected": -20.079099655151367, + "step": 10495 + }, + { + "epoch": 0.3539047490646803, + "grad_norm": 33.24937438964844, + "learning_rate": 8.161467356828079e-07, + "logits/chosen": -0.4880562424659729, + "logits/rejected": -0.7934033870697021, + "logps/chosen": -2.284480571746826, + "logps/rejected": -2.696446180343628, + "loss": 1.5306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.844804763793945, + "rewards/margins": 4.119656562805176, + "rewards/rejected": -26.964462280273438, + "step": 10500 + }, + { + "epoch": 0.3540732751356635, + "grad_norm": 27.55710792541504, + "learning_rate": 8.159188064125617e-07, + "logits/chosen": -0.588653028011322, + "logits/rejected": -0.5736607313156128, + "logps/chosen": -1.9094724655151367, + "logps/rejected": -1.8087133169174194, + "loss": 4.103, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.094724655151367, + "rewards/margins": -1.007592797279358, + "rewards/rejected": -18.087133407592773, + "step": 10505 + }, + { + "epoch": 0.35424180120664667, + "grad_norm": 31.66336441040039, + "learning_rate": 8.156907678152633e-07, + "logits/chosen": -0.8797151446342468, + "logits/rejected": -1.1634795665740967, + "logps/chosen": -1.987494707107544, + "logps/rejected": -2.1331000328063965, + "loss": 2.3538, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.87494468688965, + "rewards/margins": 1.4560561180114746, + "rewards/rejected": -21.33099937438965, + "step": 10510 + }, + { + "epoch": 0.35441032727762983, + "grad_norm": 24.7913875579834, + "learning_rate": 8.15462619969828e-07, + "logits/chosen": -1.3399163484573364, + "logits/rejected": -1.513481616973877, + "logps/chosen": -1.6611442565917969, + "logps/rejected": -1.8118454217910767, + "loss": 2.4241, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.61144256591797, + "rewards/margins": 1.507012128829956, + "rewards/rejected": -18.11845588684082, + "step": 10515 + }, + { + "epoch": 0.35457885334861305, + "grad_norm": 28.988832473754883, + "learning_rate": 8.152343629552086e-07, + "logits/chosen": -1.3251217603683472, + "logits/rejected": -1.4001457691192627, + "logps/chosen": -2.0002996921539307, + "logps/rejected": -2.1876602172851562, + "loss": 3.1388, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.002994537353516, + "rewards/margins": 1.873605728149414, + "rewards/rejected": -21.876602172851562, + "step": 10520 + }, + { + "epoch": 0.3547473794195962, + "grad_norm": 23.213520050048828, + "learning_rate": 8.15005996850396e-07, + "logits/chosen": -0.7930396795272827, + "logits/rejected": -0.8237592577934265, + "logps/chosen": -2.3455262184143066, + "logps/rejected": -2.8883988857269287, + "loss": 2.1257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.455265045166016, + "rewards/margins": 5.428727149963379, + "rewards/rejected": -28.883987426757812, + "step": 10525 + }, + { + "epoch": 0.3549159054905794, + "grad_norm": 18.53113555908203, + "learning_rate": 8.147775217344183e-07, + "logits/chosen": -0.570213258266449, + "logits/rejected": -0.6492749452590942, + "logps/chosen": -1.9311374425888062, + "logps/rejected": -1.984156847000122, + "loss": 2.9684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.31137466430664, + "rewards/margins": 0.5301931500434875, + "rewards/rejected": -19.841569900512695, + "step": 10530 + }, + { + "epoch": 0.35508443156156255, + "grad_norm": 18.7517147064209, + "learning_rate": 8.145489376863424e-07, + "logits/chosen": -1.0552548170089722, + "logits/rejected": -1.1530416011810303, + "logps/chosen": -2.1529173851013184, + "logps/rejected": -2.5861310958862305, + "loss": 1.6314, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.529176712036133, + "rewards/margins": 4.332135200500488, + "rewards/rejected": -25.861309051513672, + "step": 10535 + }, + { + "epoch": 0.3552529576325458, + "grad_norm": 19.308006286621094, + "learning_rate": 8.143202447852718e-07, + "logits/chosen": -0.7028933763504028, + "logits/rejected": -0.7312060594558716, + "logps/chosen": -1.9514005184173584, + "logps/rejected": -1.9244524240493774, + "loss": 3.4887, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.514005661010742, + "rewards/margins": -0.26948195695877075, + "rewards/rejected": -19.244525909423828, + "step": 10540 + }, + { + "epoch": 0.35542148370352894, + "grad_norm": 18.863069534301758, + "learning_rate": 8.140914431103482e-07, + "logits/chosen": -1.2888405323028564, + "logits/rejected": -1.218519926071167, + "logps/chosen": -1.8028770685195923, + "logps/rejected": -1.8393287658691406, + "loss": 2.804, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.02876853942871, + "rewards/margins": 0.36451882123947144, + "rewards/rejected": -18.39328956604004, + "step": 10545 + }, + { + "epoch": 0.3555900097745121, + "grad_norm": 95.41343688964844, + "learning_rate": 8.138625327407509e-07, + "logits/chosen": -1.0824968814849854, + "logits/rejected": -1.0274932384490967, + "logps/chosen": -2.0919995307922363, + "logps/rejected": -2.3005242347717285, + "loss": 2.4623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.919994354248047, + "rewards/margins": 2.085249185562134, + "rewards/rejected": -23.005245208740234, + "step": 10550 + }, + { + "epoch": 0.35575853584549527, + "grad_norm": 39.427024841308594, + "learning_rate": 8.136335137556967e-07, + "logits/chosen": -1.059185266494751, + "logits/rejected": -1.1593902111053467, + "logps/chosen": -2.3896114826202393, + "logps/rejected": -2.362490653991699, + "loss": 3.4528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.896114349365234, + "rewards/margins": -0.2712062895298004, + "rewards/rejected": -23.624908447265625, + "step": 10555 + }, + { + "epoch": 0.3559270619164785, + "grad_norm": 28.08155059814453, + "learning_rate": 8.134043862344399e-07, + "logits/chosen": -0.863645076751709, + "logits/rejected": -0.8519765138626099, + "logps/chosen": -1.8194077014923096, + "logps/rejected": -1.9381601810455322, + "loss": 2.887, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.194076538085938, + "rewards/margins": 1.1875252723693848, + "rewards/rejected": -19.381603240966797, + "step": 10560 + }, + { + "epoch": 0.35609558798746166, + "grad_norm": 35.566463470458984, + "learning_rate": 8.13175150256273e-07, + "logits/chosen": -0.351532518863678, + "logits/rejected": -0.5769934058189392, + "logps/chosen": -1.795641303062439, + "logps/rejected": -2.0936331748962402, + "loss": 2.3132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.9564151763916, + "rewards/margins": 2.97991681098938, + "rewards/rejected": -20.936330795288086, + "step": 10565 + }, + { + "epoch": 0.3562641140584448, + "grad_norm": 18.616701126098633, + "learning_rate": 8.129458059005249e-07, + "logits/chosen": -0.9681928753852844, + "logits/rejected": -1.2117507457733154, + "logps/chosen": -2.4990522861480713, + "logps/rejected": -2.418123722076416, + "loss": 4.0166, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.990520477294922, + "rewards/margins": -0.8092843294143677, + "rewards/rejected": -24.181236267089844, + "step": 10570 + }, + { + "epoch": 0.35643264012942805, + "grad_norm": 13.77556037902832, + "learning_rate": 8.127163532465629e-07, + "logits/chosen": -0.9583713412284851, + "logits/rejected": -0.7233438491821289, + "logps/chosen": -2.600524425506592, + "logps/rejected": -2.3895583152770996, + "loss": 5.9725, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.0052433013916, + "rewards/margins": -2.1096599102020264, + "rewards/rejected": -23.895580291748047, + "step": 10575 + }, + { + "epoch": 0.3566011662004112, + "grad_norm": 22.821674346923828, + "learning_rate": 8.124867923737918e-07, + "logits/chosen": -0.8695154190063477, + "logits/rejected": -0.8962182998657227, + "logps/chosen": -2.6580350399017334, + "logps/rejected": -2.571289300918579, + "loss": 4.253, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.580352783203125, + "rewards/margins": -0.8674640655517578, + "rewards/rejected": -25.712890625, + "step": 10580 + }, + { + "epoch": 0.3567696922713944, + "grad_norm": 37.85147476196289, + "learning_rate": 8.122571233616531e-07, + "logits/chosen": -0.7850193977355957, + "logits/rejected": -0.8626729846000671, + "logps/chosen": -1.9868669509887695, + "logps/rejected": -2.2017486095428467, + "loss": 3.1236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.868671417236328, + "rewards/margins": 2.1488170623779297, + "rewards/rejected": -22.017486572265625, + "step": 10585 + }, + { + "epoch": 0.35693821834237754, + "grad_norm": 25.867671966552734, + "learning_rate": 8.120273462896267e-07, + "logits/chosen": -0.9501420259475708, + "logits/rejected": -1.3043444156646729, + "logps/chosen": -1.6017944812774658, + "logps/rejected": -1.914813756942749, + "loss": 2.0398, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.0179443359375, + "rewards/margins": 3.1301941871643066, + "rewards/rejected": -19.14813804626465, + "step": 10590 + }, + { + "epoch": 0.35710674441336077, + "grad_norm": 26.94976806640625, + "learning_rate": 8.11797461237229e-07, + "logits/chosen": -0.9019731283187866, + "logits/rejected": -1.0319669246673584, + "logps/chosen": -2.006082057952881, + "logps/rejected": -2.0699872970581055, + "loss": 2.8598, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.060821533203125, + "rewards/margins": 0.6390496492385864, + "rewards/rejected": -20.699871063232422, + "step": 10595 + }, + { + "epoch": 0.35727527048434393, + "grad_norm": 61.693450927734375, + "learning_rate": 8.115674682840143e-07, + "logits/chosen": -0.7386834025382996, + "logits/rejected": -0.798917293548584, + "logps/chosen": -2.1820013523101807, + "logps/rejected": -2.5649349689483643, + "loss": 1.9702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.820011138916016, + "rewards/margins": 3.8293373584747314, + "rewards/rejected": -25.64935302734375, + "step": 10600 + }, + { + "epoch": 0.3574437965553271, + "grad_norm": 35.622066497802734, + "learning_rate": 8.113373675095743e-07, + "logits/chosen": -0.8943923711776733, + "logits/rejected": -0.7858074903488159, + "logps/chosen": -2.3475801944732666, + "logps/rejected": -2.061145305633545, + "loss": 6.0813, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.475801467895508, + "rewards/margins": -2.864348888397217, + "rewards/rejected": -20.611452102661133, + "step": 10605 + }, + { + "epoch": 0.35761232262631026, + "grad_norm": 21.826406478881836, + "learning_rate": 8.111071589935374e-07, + "logits/chosen": -0.8130962252616882, + "logits/rejected": -1.0828945636749268, + "logps/chosen": -1.6612212657928467, + "logps/rejected": -2.0095667839050293, + "loss": 1.5615, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.612215042114258, + "rewards/margins": 3.4834537506103516, + "rewards/rejected": -20.09566879272461, + "step": 10610 + }, + { + "epoch": 0.3577808486972935, + "grad_norm": 23.42716407775879, + "learning_rate": 8.108768428155699e-07, + "logits/chosen": -1.146475911140442, + "logits/rejected": -1.3742198944091797, + "logps/chosen": -1.9793484210968018, + "logps/rejected": -2.097473382949829, + "loss": 2.7614, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.79348373413086, + "rewards/margins": 1.181250810623169, + "rewards/rejected": -20.974733352661133, + "step": 10615 + }, + { + "epoch": 0.35794937476827665, + "grad_norm": 19.213890075683594, + "learning_rate": 8.106464190553753e-07, + "logits/chosen": -0.6341241002082825, + "logits/rejected": -0.7922481298446655, + "logps/chosen": -2.2231078147888184, + "logps/rejected": -1.9456430673599243, + "loss": 6.9201, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.231077194213867, + "rewards/margins": -2.7746474742889404, + "rewards/rejected": -19.45642852783203, + "step": 10620 + }, + { + "epoch": 0.3581179008392598, + "grad_norm": 20.00115203857422, + "learning_rate": 8.104158877926939e-07, + "logits/chosen": -0.8993018865585327, + "logits/rejected": -1.1128456592559814, + "logps/chosen": -2.005645751953125, + "logps/rejected": -2.5759527683258057, + "loss": 2.2756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.056455612182617, + "rewards/margins": 5.70306921005249, + "rewards/rejected": -25.7595272064209, + "step": 10625 + }, + { + "epoch": 0.35828642691024304, + "grad_norm": 84.15482330322266, + "learning_rate": 8.101852491073036e-07, + "logits/chosen": -1.1985838413238525, + "logits/rejected": -1.105023980140686, + "logps/chosen": -2.0047457218170166, + "logps/rejected": -2.3635776042938232, + "loss": 1.906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.047454833984375, + "rewards/margins": 3.588322401046753, + "rewards/rejected": -23.635778427124023, + "step": 10630 + }, + { + "epoch": 0.3584549529812262, + "grad_norm": 23.04952621459961, + "learning_rate": 8.099545030790196e-07, + "logits/chosen": -0.5161810517311096, + "logits/rejected": -0.5834816694259644, + "logps/chosen": -1.7309818267822266, + "logps/rejected": -1.7283589839935303, + "loss": 3.5396, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.3098201751709, + "rewards/margins": -0.02622966840863228, + "rewards/rejected": -17.28359031677246, + "step": 10635 + }, + { + "epoch": 0.35862347905220937, + "grad_norm": 17.653745651245117, + "learning_rate": 8.097236497876936e-07, + "logits/chosen": -0.992956817150116, + "logits/rejected": -1.181114912033081, + "logps/chosen": -2.4386329650878906, + "logps/rejected": -2.3491318225860596, + "loss": 4.2635, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.386327743530273, + "rewards/margins": -0.8950119018554688, + "rewards/rejected": -23.491315841674805, + "step": 10640 + }, + { + "epoch": 0.35879200512319254, + "grad_norm": 19.865116119384766, + "learning_rate": 8.094926893132151e-07, + "logits/chosen": -1.1021289825439453, + "logits/rejected": -1.2707788944244385, + "logps/chosen": -1.718862771987915, + "logps/rejected": -1.920997977256775, + "loss": 2.7385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.188629150390625, + "rewards/margins": 2.0213510990142822, + "rewards/rejected": -19.209980010986328, + "step": 10645 + }, + { + "epoch": 0.35896053119417576, + "grad_norm": 26.011329650878906, + "learning_rate": 8.092616217355104e-07, + "logits/chosen": -0.8622426986694336, + "logits/rejected": -0.7457947731018066, + "logps/chosen": -2.4235494136810303, + "logps/rejected": -2.853537082672119, + "loss": 2.9544, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.23549461364746, + "rewards/margins": 4.2998762130737305, + "rewards/rejected": -28.535369873046875, + "step": 10650 + }, + { + "epoch": 0.3591290572651589, + "grad_norm": 39.413116455078125, + "learning_rate": 8.090304471345428e-07, + "logits/chosen": -0.4998112618923187, + "logits/rejected": -0.43951162695884705, + "logps/chosen": -2.3564367294311523, + "logps/rejected": -2.374368190765381, + "loss": 2.935, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.56436538696289, + "rewards/margins": 0.1793135702610016, + "rewards/rejected": -23.743680953979492, + "step": 10655 + }, + { + "epoch": 0.3592975833361421, + "grad_norm": 23.182138442993164, + "learning_rate": 8.087991655903129e-07, + "logits/chosen": -0.576167106628418, + "logits/rejected": -0.7032621502876282, + "logps/chosen": -2.637622833251953, + "logps/rejected": -2.7747979164123535, + "loss": 3.8412, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.376230239868164, + "rewards/margins": 1.3717488050460815, + "rewards/rejected": -27.74798011779785, + "step": 10660 + }, + { + "epoch": 0.35946610940712526, + "grad_norm": 15.814130783081055, + "learning_rate": 8.085677771828577e-07, + "logits/chosen": -0.6864426136016846, + "logits/rejected": -0.8167473077774048, + "logps/chosen": -1.9061853885650635, + "logps/rejected": -2.3845667839050293, + "loss": 1.5243, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.061853408813477, + "rewards/margins": 4.783814430236816, + "rewards/rejected": -23.84566879272461, + "step": 10665 + }, + { + "epoch": 0.3596346354781085, + "grad_norm": 14.93944263458252, + "learning_rate": 8.083362819922521e-07, + "logits/chosen": -0.6831316351890564, + "logits/rejected": -0.7957452535629272, + "logps/chosen": -2.8043792247772217, + "logps/rejected": -3.365492582321167, + "loss": 1.2971, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.043792724609375, + "rewards/margins": 5.611135005950928, + "rewards/rejected": -33.65492630004883, + "step": 10670 + }, + { + "epoch": 0.35980316154909164, + "grad_norm": 21.99008560180664, + "learning_rate": 8.081046800986072e-07, + "logits/chosen": -0.8613970875740051, + "logits/rejected": -0.8172779083251953, + "logps/chosen": -2.1606273651123047, + "logps/rejected": -1.9760665893554688, + "loss": 5.3347, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.606273651123047, + "rewards/margins": -1.8456090688705444, + "rewards/rejected": -19.760665893554688, + "step": 10675 + }, + { + "epoch": 0.3599716876200748, + "grad_norm": 2.4829165935516357, + "learning_rate": 8.078729715820713e-07, + "logits/chosen": -1.1459004878997803, + "logits/rejected": -1.173221230506897, + "logps/chosen": -1.861358880996704, + "logps/rejected": -2.1315152645111084, + "loss": 2.0619, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.613590240478516, + "rewards/margins": 2.7015633583068848, + "rewards/rejected": -21.315153121948242, + "step": 10680 + }, + { + "epoch": 0.36014021369105803, + "grad_norm": 22.07672119140625, + "learning_rate": 8.076411565228298e-07, + "logits/chosen": -1.134795904159546, + "logits/rejected": -1.1395275592803955, + "logps/chosen": -1.8481004238128662, + "logps/rejected": -2.072533130645752, + "loss": 2.1095, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.481006622314453, + "rewards/margins": 2.2443251609802246, + "rewards/rejected": -20.725330352783203, + "step": 10685 + }, + { + "epoch": 0.3603087397620412, + "grad_norm": 34.374366760253906, + "learning_rate": 8.074092350011046e-07, + "logits/chosen": -1.1068211793899536, + "logits/rejected": -1.222532033920288, + "logps/chosen": -1.869600534439087, + "logps/rejected": -1.9473581314086914, + "loss": 2.7607, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.69600486755371, + "rewards/margins": 0.7775775790214539, + "rewards/rejected": -19.473583221435547, + "step": 10690 + }, + { + "epoch": 0.36047726583302436, + "grad_norm": 149.23831176757812, + "learning_rate": 8.071772070971546e-07, + "logits/chosen": -0.8026615977287292, + "logits/rejected": -0.8282175064086914, + "logps/chosen": -2.3600168228149414, + "logps/rejected": -2.2999606132507324, + "loss": 4.2298, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.600168228149414, + "rewards/margins": -0.6005603671073914, + "rewards/rejected": -22.99960708618164, + "step": 10695 + }, + { + "epoch": 0.36064579190400753, + "grad_norm": 19.745399475097656, + "learning_rate": 8.069450728912753e-07, + "logits/chosen": -0.7332836389541626, + "logits/rejected": -0.6940861940383911, + "logps/chosen": -2.3434970378875732, + "logps/rejected": -2.887530565261841, + "loss": 3.0107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.43497085571289, + "rewards/margins": 5.440334320068359, + "rewards/rejected": -28.87530517578125, + "step": 10700 + }, + { + "epoch": 0.36081431797499075, + "grad_norm": 28.678794860839844, + "learning_rate": 8.067128324637997e-07, + "logits/chosen": -1.017822504043579, + "logits/rejected": -1.1551496982574463, + "logps/chosen": -2.1272358894348145, + "logps/rejected": -2.23545503616333, + "loss": 2.3345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.272357940673828, + "rewards/margins": 1.0821908712387085, + "rewards/rejected": -22.354549407958984, + "step": 10705 + }, + { + "epoch": 0.3609828440459739, + "grad_norm": 17.994518280029297, + "learning_rate": 8.064804858950966e-07, + "logits/chosen": -1.330047845840454, + "logits/rejected": -1.540175199508667, + "logps/chosen": -1.7342472076416016, + "logps/rejected": -1.9005409479141235, + "loss": 2.1613, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.342472076416016, + "rewards/margins": 1.6629356145858765, + "rewards/rejected": -19.005409240722656, + "step": 10710 + }, + { + "epoch": 0.3611513701169571, + "grad_norm": 21.73023223876953, + "learning_rate": 8.062480332655722e-07, + "logits/chosen": -0.590639591217041, + "logits/rejected": -0.5785341858863831, + "logps/chosen": -2.293099880218506, + "logps/rejected": -2.547441005706787, + "loss": 1.9951, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.93099594116211, + "rewards/margins": 2.5434110164642334, + "rewards/rejected": -25.474407196044922, + "step": 10715 + }, + { + "epoch": 0.36131989618794025, + "grad_norm": 44.752376556396484, + "learning_rate": 8.060154746556694e-07, + "logits/chosen": -1.0894025564193726, + "logits/rejected": -0.9643278121948242, + "logps/chosen": -2.7998452186584473, + "logps/rejected": -2.6930184364318848, + "loss": 4.1691, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.99845314025879, + "rewards/margins": -1.0682717561721802, + "rewards/rejected": -26.9301815032959, + "step": 10720 + }, + { + "epoch": 0.36148842225892347, + "grad_norm": 22.641786575317383, + "learning_rate": 8.05782810145867e-07, + "logits/chosen": -0.69547039270401, + "logits/rejected": -0.9306710958480835, + "logps/chosen": -1.8364391326904297, + "logps/rejected": -2.280531167984009, + "loss": 2.5626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.364391326904297, + "rewards/margins": 4.440920829772949, + "rewards/rejected": -22.805309295654297, + "step": 10725 + }, + { + "epoch": 0.36165694832990664, + "grad_norm": 19.32879066467285, + "learning_rate": 8.055500398166816e-07, + "logits/chosen": -0.8459224700927734, + "logits/rejected": -0.7579227685928345, + "logps/chosen": -2.598602056503296, + "logps/rejected": -2.1134042739868164, + "loss": 7.9173, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -25.986019134521484, + "rewards/margins": -4.851977348327637, + "rewards/rejected": -21.134042739868164, + "step": 10730 + }, + { + "epoch": 0.3618254744008898, + "grad_norm": 27.421846389770508, + "learning_rate": 8.053171637486656e-07, + "logits/chosen": -0.9135136604309082, + "logits/rejected": -0.8715440630912781, + "logps/chosen": -2.0788276195526123, + "logps/rejected": -2.096433162689209, + "loss": 3.4743, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.78827667236328, + "rewards/margins": 0.1760571449995041, + "rewards/rejected": -20.96433448791504, + "step": 10735 + }, + { + "epoch": 0.361994000471873, + "grad_norm": 16.79775047302246, + "learning_rate": 8.050841820224081e-07, + "logits/chosen": -0.8370095491409302, + "logits/rejected": -0.9754121899604797, + "logps/chosen": -1.9215948581695557, + "logps/rejected": -2.382934331893921, + "loss": 2.0463, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.2159481048584, + "rewards/margins": 4.613394737243652, + "rewards/rejected": -23.829343795776367, + "step": 10740 + }, + { + "epoch": 0.3621625265428562, + "grad_norm": 9.726906776428223, + "learning_rate": 8.048510947185353e-07, + "logits/chosen": -1.0751714706420898, + "logits/rejected": -1.2355901002883911, + "logps/chosen": -1.956291913986206, + "logps/rejected": -2.3752856254577637, + "loss": 1.4613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.56292152404785, + "rewards/margins": 4.189937591552734, + "rewards/rejected": -23.752857208251953, + "step": 10745 + }, + { + "epoch": 0.36233105261383936, + "grad_norm": 16.294376373291016, + "learning_rate": 8.046179019177091e-07, + "logits/chosen": -0.8132011294364929, + "logits/rejected": -0.9101356267929077, + "logps/chosen": -2.291696786880493, + "logps/rejected": -2.310281753540039, + "loss": 3.8084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.91696548461914, + "rewards/margins": 0.1858508139848709, + "rewards/rejected": -23.102819442749023, + "step": 10750 + }, + { + "epoch": 0.3624995786848225, + "grad_norm": 43.041324615478516, + "learning_rate": 8.043846037006285e-07, + "logits/chosen": -0.9655378460884094, + "logits/rejected": -0.837437629699707, + "logps/chosen": -2.0782408714294434, + "logps/rejected": -2.021411180496216, + "loss": 3.6769, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.78240966796875, + "rewards/margins": -0.5682979822158813, + "rewards/rejected": -20.214109420776367, + "step": 10755 + }, + { + "epoch": 0.36266810475580574, + "grad_norm": 23.52166175842285, + "learning_rate": 8.041512001480288e-07, + "logits/chosen": -1.079542875289917, + "logits/rejected": -1.1581979990005493, + "logps/chosen": -1.7527449131011963, + "logps/rejected": -2.023618221282959, + "loss": 1.5697, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.527446746826172, + "rewards/margins": 2.7087342739105225, + "rewards/rejected": -20.236183166503906, + "step": 10760 + }, + { + "epoch": 0.3628366308267889, + "grad_norm": 7.493854522705078, + "learning_rate": 8.03917691340682e-07, + "logits/chosen": -0.7019214034080505, + "logits/rejected": -0.9974255561828613, + "logps/chosen": -2.165797472000122, + "logps/rejected": -2.412288188934326, + "loss": 2.6236, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.657974243164062, + "rewards/margins": 2.4649100303649902, + "rewards/rejected": -24.12288475036621, + "step": 10765 + }, + { + "epoch": 0.3630051568977721, + "grad_norm": 0.5345020890235901, + "learning_rate": 8.036840773593958e-07, + "logits/chosen": -1.065690279006958, + "logits/rejected": -1.0886703729629517, + "logps/chosen": -1.682474136352539, + "logps/rejected": -2.0966532230377197, + "loss": 2.0629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.824743270874023, + "rewards/margins": 4.141788482666016, + "rewards/rejected": -20.966529846191406, + "step": 10770 + }, + { + "epoch": 0.36317368296875524, + "grad_norm": 49.67851638793945, + "learning_rate": 8.034503582850154e-07, + "logits/chosen": -1.3109495639801025, + "logits/rejected": -1.2142508029937744, + "logps/chosen": -1.5672677755355835, + "logps/rejected": -1.5252450704574585, + "loss": 3.6729, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.672677993774414, + "rewards/margins": -0.42022705078125, + "rewards/rejected": -15.252450942993164, + "step": 10775 + }, + { + "epoch": 0.36334220903973846, + "grad_norm": 29.32314109802246, + "learning_rate": 8.032165341984214e-07, + "logits/chosen": -1.1105209589004517, + "logits/rejected": -1.3988972902297974, + "logps/chosen": -1.9345782995224, + "logps/rejected": -1.8571125268936157, + "loss": 3.8116, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.345783233642578, + "rewards/margins": -0.7746579051017761, + "rewards/rejected": -18.571125030517578, + "step": 10780 + }, + { + "epoch": 0.36351073511072163, + "grad_norm": 24.98900604248047, + "learning_rate": 8.029826051805311e-07, + "logits/chosen": -0.8778678178787231, + "logits/rejected": -1.202693223953247, + "logps/chosen": -2.0306835174560547, + "logps/rejected": -2.4914937019348145, + "loss": 2.0412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.30683708190918, + "rewards/margins": 4.608098983764648, + "rewards/rejected": -24.914936065673828, + "step": 10785 + }, + { + "epoch": 0.3636792611817048, + "grad_norm": 51.06911087036133, + "learning_rate": 8.027485713122982e-07, + "logits/chosen": -0.7300389409065247, + "logits/rejected": -0.7390624284744263, + "logps/chosen": -2.4046947956085205, + "logps/rejected": -2.5226998329162598, + "loss": 3.5424, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.046947479248047, + "rewards/margins": 1.1800497770309448, + "rewards/rejected": -25.226999282836914, + "step": 10790 + }, + { + "epoch": 0.363847787252688, + "grad_norm": 45.48428726196289, + "learning_rate": 8.025144326747126e-07, + "logits/chosen": -1.0668237209320068, + "logits/rejected": -1.1761319637298584, + "logps/chosen": -2.078740358352661, + "logps/rejected": -2.179811477661133, + "loss": 3.0571, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.787403106689453, + "rewards/margins": 1.010711908340454, + "rewards/rejected": -21.798114776611328, + "step": 10795 + }, + { + "epoch": 0.3640163133236712, + "grad_norm": 26.712162017822266, + "learning_rate": 8.022801893488003e-07, + "logits/chosen": -1.1064934730529785, + "logits/rejected": -1.1405363082885742, + "logps/chosen": -1.9824740886688232, + "logps/rejected": -2.1991167068481445, + "loss": 2.0287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.82474136352539, + "rewards/margins": 2.166426420211792, + "rewards/rejected": -21.991167068481445, + "step": 10800 + }, + { + "epoch": 0.3640163133236712, + "eval_logits/chosen": -1.346745491027832, + "eval_logits/rejected": -1.4354705810546875, + "eval_logps/chosen": -1.899340033531189, + "eval_logps/rejected": -1.9798485040664673, + "eval_loss": 3.053359270095825, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -18.99340057373047, + "eval_rewards/margins": 0.8050832152366638, + "eval_rewards/rejected": -19.798484802246094, + "eval_runtime": 12.9258, + "eval_samples_per_second": 7.736, + "eval_steps_per_second": 1.934, + "step": 10800 + }, + { + "epoch": 0.36418483939465435, + "grad_norm": 30.801185607910156, + "learning_rate": 8.020458414156239e-07, + "logits/chosen": -1.2585346698760986, + "logits/rejected": -1.40419602394104, + "logps/chosen": -2.2337536811828613, + "logps/rejected": -2.4760305881500244, + "loss": 2.0292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.337535858154297, + "rewards/margins": 2.422769546508789, + "rewards/rejected": -24.760305404663086, + "step": 10805 + }, + { + "epoch": 0.3643533654656375, + "grad_norm": 48.36637496948242, + "learning_rate": 8.018113889562821e-07, + "logits/chosen": -1.1833237409591675, + "logits/rejected": -1.37998366355896, + "logps/chosen": -1.620487928390503, + "logps/rejected": -1.7583506107330322, + "loss": 2.9234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.20488166809082, + "rewards/margins": 1.378624677658081, + "rewards/rejected": -17.583505630493164, + "step": 10810 + }, + { + "epoch": 0.36452189153662073, + "grad_norm": 20.77741813659668, + "learning_rate": 8.015768320519094e-07, + "logits/chosen": -0.5633755326271057, + "logits/rejected": -0.5698398351669312, + "logps/chosen": -2.248947858810425, + "logps/rejected": -2.2062184810638428, + "loss": 3.8129, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.489479064941406, + "rewards/margins": -0.4272943437099457, + "rewards/rejected": -22.062185287475586, + "step": 10815 + }, + { + "epoch": 0.3646904176076039, + "grad_norm": 24.723190307617188, + "learning_rate": 8.013421707836767e-07, + "logits/chosen": -0.9400695562362671, + "logits/rejected": -1.1459122896194458, + "logps/chosen": -2.1460351943969727, + "logps/rejected": -2.315514087677002, + "loss": 2.2302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.46035385131836, + "rewards/margins": 1.6947863101959229, + "rewards/rejected": -23.155139923095703, + "step": 10820 + }, + { + "epoch": 0.36485894367858707, + "grad_norm": 32.624755859375, + "learning_rate": 8.01107405232791e-07, + "logits/chosen": -1.087473750114441, + "logits/rejected": -1.2249068021774292, + "logps/chosen": -1.879091501235962, + "logps/rejected": -2.0642178058624268, + "loss": 1.6607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.79091453552246, + "rewards/margins": 1.8512637615203857, + "rewards/rejected": -20.64217758178711, + "step": 10825 + }, + { + "epoch": 0.36502746974957023, + "grad_norm": 34.93103790283203, + "learning_rate": 8.008725354804957e-07, + "logits/chosen": -0.7595319747924805, + "logits/rejected": -0.9362818598747253, + "logps/chosen": -1.7449228763580322, + "logps/rejected": -1.6130717992782593, + "loss": 5.0697, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.449230194091797, + "rewards/margins": -1.318509817123413, + "rewards/rejected": -16.130718231201172, + "step": 10830 + }, + { + "epoch": 0.36519599582055345, + "grad_norm": 16.051925659179688, + "learning_rate": 8.006375616080697e-07, + "logits/chosen": -1.0025599002838135, + "logits/rejected": -1.1287715435028076, + "logps/chosen": -2.0526137351989746, + "logps/rejected": -2.1133031845092773, + "loss": 2.8868, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.526134490966797, + "rewards/margins": 0.6068953275680542, + "rewards/rejected": -21.13302993774414, + "step": 10835 + }, + { + "epoch": 0.3653645218915366, + "grad_norm": 31.391122817993164, + "learning_rate": 8.004024836968284e-07, + "logits/chosen": -1.0124971866607666, + "logits/rejected": -1.1490424871444702, + "logps/chosen": -2.1096863746643066, + "logps/rejected": -2.255751132965088, + "loss": 2.6028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.09686279296875, + "rewards/margins": 1.4606473445892334, + "rewards/rejected": -22.557512283325195, + "step": 10840 + }, + { + "epoch": 0.3655330479625198, + "grad_norm": 30.329187393188477, + "learning_rate": 8.001673018281228e-07, + "logits/chosen": -0.9520799517631531, + "logits/rejected": -1.063399076461792, + "logps/chosen": -2.088949203491211, + "logps/rejected": -2.2793190479278564, + "loss": 2.4518, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.88949203491211, + "rewards/margins": 1.9036967754364014, + "rewards/rejected": -22.793188095092773, + "step": 10845 + }, + { + "epoch": 0.365701574033503, + "grad_norm": 37.581329345703125, + "learning_rate": 7.9993201608334e-07, + "logits/chosen": -0.3347877562046051, + "logits/rejected": -0.5965047478675842, + "logps/chosen": -1.8404756784439087, + "logps/rejected": -2.2952017784118652, + "loss": 1.913, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.404754638671875, + "rewards/margins": 4.547261714935303, + "rewards/rejected": -22.952016830444336, + "step": 10850 + }, + { + "epoch": 0.3658701001044862, + "grad_norm": 21.93905258178711, + "learning_rate": 7.996966265439033e-07, + "logits/chosen": -0.9020354151725769, + "logits/rejected": -0.9424558877944946, + "logps/chosen": -1.7665306329727173, + "logps/rejected": -1.5817607641220093, + "loss": 5.0133, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.665306091308594, + "rewards/margins": -1.8476982116699219, + "rewards/rejected": -15.817608833312988, + "step": 10855 + }, + { + "epoch": 0.36603862617546934, + "grad_norm": 125.510986328125, + "learning_rate": 7.994611332912719e-07, + "logits/chosen": -1.2232359647750854, + "logits/rejected": -1.100111722946167, + "logps/chosen": -2.0492358207702637, + "logps/rejected": -2.1210994720458984, + "loss": 2.5122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.492359161376953, + "rewards/margins": 0.7186365127563477, + "rewards/rejected": -21.210994720458984, + "step": 10860 + }, + { + "epoch": 0.3662071522464525, + "grad_norm": 26.81747817993164, + "learning_rate": 7.992255364069406e-07, + "logits/chosen": -0.8770850896835327, + "logits/rejected": -0.8681109547615051, + "logps/chosen": -1.8699661493301392, + "logps/rejected": -1.9965813159942627, + "loss": 2.6638, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.699661254882812, + "rewards/margins": 1.2661510705947876, + "rewards/rejected": -19.96581268310547, + "step": 10865 + }, + { + "epoch": 0.3663756783174357, + "grad_norm": 12.97459888458252, + "learning_rate": 7.989898359724401e-07, + "logits/chosen": -1.3153835535049438, + "logits/rejected": -1.5715057849884033, + "logps/chosen": -1.7097463607788086, + "logps/rejected": -1.838587999343872, + "loss": 2.3408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.097463607788086, + "rewards/margins": 1.288414716720581, + "rewards/rejected": -18.38587760925293, + "step": 10870 + }, + { + "epoch": 0.3665442043884189, + "grad_norm": 24.906919479370117, + "learning_rate": 7.98754032069337e-07, + "logits/chosen": -1.2778717279434204, + "logits/rejected": -1.3702975511550903, + "logps/chosen": -1.830116629600525, + "logps/rejected": -1.8268821239471436, + "loss": 3.5659, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.301166534423828, + "rewards/margins": -0.03234300762414932, + "rewards/rejected": -18.268823623657227, + "step": 10875 + }, + { + "epoch": 0.36671273045940206, + "grad_norm": 35.55815887451172, + "learning_rate": 7.985181247792338e-07, + "logits/chosen": -0.7316654324531555, + "logits/rejected": -0.8771049380302429, + "logps/chosen": -1.9307162761688232, + "logps/rejected": -1.8940412998199463, + "loss": 3.4595, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.30716323852539, + "rewards/margins": -0.36675119400024414, + "rewards/rejected": -18.940412521362305, + "step": 10880 + }, + { + "epoch": 0.3668812565303852, + "grad_norm": 28.160144805908203, + "learning_rate": 7.982821141837691e-07, + "logits/chosen": -1.1204853057861328, + "logits/rejected": -1.2265549898147583, + "logps/chosen": -2.3023176193237305, + "logps/rejected": -2.135511636734009, + "loss": 5.0299, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.023174285888672, + "rewards/margins": -1.6680587530136108, + "rewards/rejected": -21.355113983154297, + "step": 10885 + }, + { + "epoch": 0.36704978260136845, + "grad_norm": 24.9565486907959, + "learning_rate": 7.980460003646162e-07, + "logits/chosen": -0.9870179295539856, + "logits/rejected": -1.0818120241165161, + "logps/chosen": -1.8075840473175049, + "logps/rejected": -1.8251380920410156, + "loss": 3.1207, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.07583999633789, + "rewards/margins": 0.17554087936878204, + "rewards/rejected": -18.251379013061523, + "step": 10890 + }, + { + "epoch": 0.3672183086723516, + "grad_norm": 45.430294036865234, + "learning_rate": 7.978097834034851e-07, + "logits/chosen": -1.2092764377593994, + "logits/rejected": -1.349082350730896, + "logps/chosen": -2.282064199447632, + "logps/rejected": -2.6417107582092285, + "loss": 3.4773, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.820642471313477, + "rewards/margins": 3.596468448638916, + "rewards/rejected": -26.417110443115234, + "step": 10895 + }, + { + "epoch": 0.3673868347433348, + "grad_norm": 33.20343017578125, + "learning_rate": 7.975734633821214e-07, + "logits/chosen": -1.1706898212432861, + "logits/rejected": -1.1777961254119873, + "logps/chosen": -2.0728137493133545, + "logps/rejected": -1.885846734046936, + "loss": 4.9354, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.728137969970703, + "rewards/margins": -1.8696720600128174, + "rewards/rejected": -18.85846519470215, + "step": 10900 + }, + { + "epoch": 0.367555360814318, + "grad_norm": 14.910902976989746, + "learning_rate": 7.973370403823059e-07, + "logits/chosen": -1.4453526735305786, + "logits/rejected": -1.3824089765548706, + "logps/chosen": -1.74431574344635, + "logps/rejected": -1.8275190591812134, + "loss": 2.6178, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.443157196044922, + "rewards/margins": 0.832032322883606, + "rewards/rejected": -18.275188446044922, + "step": 10905 + }, + { + "epoch": 0.36772388688530117, + "grad_norm": 23.91060447692871, + "learning_rate": 7.971005144858553e-07, + "logits/chosen": -1.0507118701934814, + "logits/rejected": -1.1027783155441284, + "logps/chosen": -2.1079440116882324, + "logps/rejected": -2.2358126640319824, + "loss": 3.259, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.07944107055664, + "rewards/margins": 1.2786868810653687, + "rewards/rejected": -22.35812759399414, + "step": 10910 + }, + { + "epoch": 0.36789241295628433, + "grad_norm": 18.480350494384766, + "learning_rate": 7.968638857746218e-07, + "logits/chosen": -0.8534606099128723, + "logits/rejected": -0.8906230926513672, + "logps/chosen": -2.049269437789917, + "logps/rejected": -2.2661845684051514, + "loss": 3.0378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.49269676208496, + "rewards/margins": 2.169149875640869, + "rewards/rejected": -22.661846160888672, + "step": 10915 + }, + { + "epoch": 0.3680609390272675, + "grad_norm": 20.608287811279297, + "learning_rate": 7.966271543304937e-07, + "logits/chosen": -0.8241097331047058, + "logits/rejected": -0.7778711915016174, + "logps/chosen": -2.1182360649108887, + "logps/rejected": -2.205749034881592, + "loss": 2.646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.182361602783203, + "rewards/margins": 0.8751300573348999, + "rewards/rejected": -22.057491302490234, + "step": 10920 + }, + { + "epoch": 0.3682294650982507, + "grad_norm": 22.974990844726562, + "learning_rate": 7.963903202353939e-07, + "logits/chosen": -1.1302852630615234, + "logits/rejected": -1.2610663175582886, + "logps/chosen": -1.9656798839569092, + "logps/rejected": -2.0262506008148193, + "loss": 3.0751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.65679931640625, + "rewards/margins": 0.605708122253418, + "rewards/rejected": -20.26250648498535, + "step": 10925 + }, + { + "epoch": 0.3683979911692339, + "grad_norm": 20.506927490234375, + "learning_rate": 7.961533835712816e-07, + "logits/chosen": -0.9316355586051941, + "logits/rejected": -1.1611944437026978, + "logps/chosen": -1.856085181236267, + "logps/rejected": -1.9529523849487305, + "loss": 2.8926, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.56085205078125, + "rewards/margins": 0.9686723947525024, + "rewards/rejected": -19.529521942138672, + "step": 10930 + }, + { + "epoch": 0.36856651724021705, + "grad_norm": 28.772199630737305, + "learning_rate": 7.959163444201512e-07, + "logits/chosen": -1.3307853937149048, + "logits/rejected": -1.1167147159576416, + "logps/chosen": -2.1241683959960938, + "logps/rejected": -2.3638079166412354, + "loss": 2.7786, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.241682052612305, + "rewards/margins": 2.396395444869995, + "rewards/rejected": -23.638076782226562, + "step": 10935 + }, + { + "epoch": 0.3687350433112002, + "grad_norm": 9.803357124328613, + "learning_rate": 7.956792028640327e-07, + "logits/chosen": -1.039208173751831, + "logits/rejected": -0.9944947361946106, + "logps/chosen": -2.582061290740967, + "logps/rejected": -2.6542296409606934, + "loss": 2.7741, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.82061195373535, + "rewards/margins": 0.7216871976852417, + "rewards/rejected": -26.54229736328125, + "step": 10940 + }, + { + "epoch": 0.36890356938218344, + "grad_norm": 25.190404891967773, + "learning_rate": 7.954419589849914e-07, + "logits/chosen": -1.2668789625167847, + "logits/rejected": -1.4306660890579224, + "logps/chosen": -1.7445892095565796, + "logps/rejected": -1.6929042339324951, + "loss": 3.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.445892333984375, + "rewards/margins": -0.5168499946594238, + "rewards/rejected": -16.92904281616211, + "step": 10945 + }, + { + "epoch": 0.3690720954531666, + "grad_norm": 30.531227111816406, + "learning_rate": 7.952046128651279e-07, + "logits/chosen": -0.958761990070343, + "logits/rejected": -0.9972974061965942, + "logps/chosen": -2.2063372135162354, + "logps/rejected": -2.076390266418457, + "loss": 4.3272, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.063373565673828, + "rewards/margins": -1.2994682788848877, + "rewards/rejected": -20.763904571533203, + "step": 10950 + }, + { + "epoch": 0.36924062152414977, + "grad_norm": 43.369598388671875, + "learning_rate": 7.949671645865788e-07, + "logits/chosen": -1.0231273174285889, + "logits/rejected": -1.2699315547943115, + "logps/chosen": -1.7658660411834717, + "logps/rejected": -1.9867603778839111, + "loss": 2.4551, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.658658981323242, + "rewards/margins": 2.208942413330078, + "rewards/rejected": -19.867603302001953, + "step": 10955 + }, + { + "epoch": 0.369409147595133, + "grad_norm": 25.24970817565918, + "learning_rate": 7.94729614231515e-07, + "logits/chosen": -1.1766541004180908, + "logits/rejected": -1.4894263744354248, + "logps/chosen": -1.7530008554458618, + "logps/rejected": -1.8725624084472656, + "loss": 2.419, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.53000831604004, + "rewards/margins": 1.1956161260604858, + "rewards/rejected": -18.725622177124023, + "step": 10960 + }, + { + "epoch": 0.36957767366611616, + "grad_norm": 4.202431678771973, + "learning_rate": 7.944919618821438e-07, + "logits/chosen": -1.3009445667266846, + "logits/rejected": -1.4404528141021729, + "logps/chosen": -2.084996461868286, + "logps/rejected": -2.300058126449585, + "loss": 2.1089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.849964141845703, + "rewards/margins": 2.150618553161621, + "rewards/rejected": -23.00058364868164, + "step": 10965 + }, + { + "epoch": 0.3697461997370993, + "grad_norm": 23.584318161010742, + "learning_rate": 7.942542076207069e-07, + "logits/chosen": -0.710057258605957, + "logits/rejected": -0.7822506427764893, + "logps/chosen": -2.7446389198303223, + "logps/rejected": -3.029550075531006, + "loss": 2.1846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.446392059326172, + "rewards/margins": 2.849109172821045, + "rewards/rejected": -30.295501708984375, + "step": 10970 + }, + { + "epoch": 0.3699147258080825, + "grad_norm": 22.66236114501953, + "learning_rate": 7.940163515294819e-07, + "logits/chosen": -0.9495447278022766, + "logits/rejected": -1.0309523344039917, + "logps/chosen": -2.10927677154541, + "logps/rejected": -2.3697586059570312, + "loss": 2.7297, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.0927677154541, + "rewards/margins": 2.6048176288604736, + "rewards/rejected": -23.697586059570312, + "step": 10975 + }, + { + "epoch": 0.3700832518790657, + "grad_norm": 32.25993347167969, + "learning_rate": 7.937783936907816e-07, + "logits/chosen": -0.9410039782524109, + "logits/rejected": -0.9836063385009766, + "logps/chosen": -2.0231223106384277, + "logps/rejected": -2.137643337249756, + "loss": 2.2719, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.23122215270996, + "rewards/margins": 1.1452109813690186, + "rewards/rejected": -21.376434326171875, + "step": 10980 + }, + { + "epoch": 0.3702517779500489, + "grad_norm": 45.95429229736328, + "learning_rate": 7.935403341869535e-07, + "logits/chosen": -0.5016958713531494, + "logits/rejected": -0.5743024945259094, + "logps/chosen": -2.0376670360565186, + "logps/rejected": -2.1427981853485107, + "loss": 2.3295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.37666893005371, + "rewards/margins": 1.0513111352920532, + "rewards/rejected": -21.427982330322266, + "step": 10985 + }, + { + "epoch": 0.37042030402103204, + "grad_norm": 9.918193817138672, + "learning_rate": 7.933021731003809e-07, + "logits/chosen": -1.3090450763702393, + "logits/rejected": -1.3329724073410034, + "logps/chosen": -2.461430788040161, + "logps/rejected": -2.8623244762420654, + "loss": 1.8669, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.614307403564453, + "rewards/margins": 4.008938789367676, + "rewards/rejected": -28.623245239257812, + "step": 10990 + }, + { + "epoch": 0.3705888300920152, + "grad_norm": 22.17966079711914, + "learning_rate": 7.930639105134818e-07, + "logits/chosen": -1.0726702213287354, + "logits/rejected": -1.2153971195220947, + "logps/chosen": -2.1161656379699707, + "logps/rejected": -2.127230405807495, + "loss": 3.2105, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.161657333374023, + "rewards/margins": 0.11064748466014862, + "rewards/rejected": -21.27230453491211, + "step": 10995 + }, + { + "epoch": 0.37075735616299843, + "grad_norm": 39.328758239746094, + "learning_rate": 7.928255465087094e-07, + "logits/chosen": -1.2356667518615723, + "logits/rejected": -1.152830719947815, + "logps/chosen": -1.9329249858856201, + "logps/rejected": -1.9360370635986328, + "loss": 3.0736, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.32925033569336, + "rewards/margins": 0.03111877478659153, + "rewards/rejected": -19.360370635986328, + "step": 11000 + }, + { + "epoch": 0.3709258822339816, + "grad_norm": 9.758732795715332, + "learning_rate": 7.925870811685523e-07, + "logits/chosen": -0.9007709622383118, + "logits/rejected": -1.2970943450927734, + "logps/chosen": -1.790388822555542, + "logps/rejected": -2.073038101196289, + "loss": 1.8041, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.903888702392578, + "rewards/margins": 2.8264918327331543, + "rewards/rejected": -20.73038101196289, + "step": 11005 + }, + { + "epoch": 0.37109440830496476, + "grad_norm": 24.037540435791016, + "learning_rate": 7.923485145755339e-07, + "logits/chosen": -0.6779791712760925, + "logits/rejected": -0.953299880027771, + "logps/chosen": -2.328507661819458, + "logps/rejected": -2.587146043777466, + "loss": 1.678, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.285076141357422, + "rewards/margins": 2.586381435394287, + "rewards/rejected": -25.871456146240234, + "step": 11010 + }, + { + "epoch": 0.371262934375948, + "grad_norm": 28.73473358154297, + "learning_rate": 7.921098468122127e-07, + "logits/chosen": -1.0912433862686157, + "logits/rejected": -1.1714380979537964, + "logps/chosen": -1.9897487163543701, + "logps/rejected": -2.1489391326904297, + "loss": 3.4827, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.89748764038086, + "rewards/margins": 1.5919034481048584, + "rewards/rejected": -21.489391326904297, + "step": 11015 + }, + { + "epoch": 0.37143146044693115, + "grad_norm": 41.63922119140625, + "learning_rate": 7.918710779611822e-07, + "logits/chosen": -1.0436906814575195, + "logits/rejected": -1.1562167406082153, + "logps/chosen": -1.9038293361663818, + "logps/rejected": -1.8470821380615234, + "loss": 3.6442, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.038293838500977, + "rewards/margins": -0.5674721002578735, + "rewards/rejected": -18.470823287963867, + "step": 11020 + }, + { + "epoch": 0.3715999865179143, + "grad_norm": 14.585823059082031, + "learning_rate": 7.916322081050709e-07, + "logits/chosen": -0.8105791211128235, + "logits/rejected": -0.9758694767951965, + "logps/chosen": -2.1387295722961426, + "logps/rejected": -2.2473695278167725, + "loss": 2.674, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.38729476928711, + "rewards/margins": 1.0864002704620361, + "rewards/rejected": -22.47369384765625, + "step": 11025 + }, + { + "epoch": 0.3717685125888975, + "grad_norm": 56.67366409301758, + "learning_rate": 7.91393237326542e-07, + "logits/chosen": -1.2288243770599365, + "logits/rejected": -1.2605135440826416, + "logps/chosen": -2.3369829654693604, + "logps/rejected": -2.5689830780029297, + "loss": 3.9939, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.369831085205078, + "rewards/margins": 2.320000171661377, + "rewards/rejected": -25.689828872680664, + "step": 11030 + }, + { + "epoch": 0.3719370386598807, + "grad_norm": 39.05540466308594, + "learning_rate": 7.911541657082943e-07, + "logits/chosen": -0.34584683179855347, + "logits/rejected": -0.3778868615627289, + "logps/chosen": -1.9564568996429443, + "logps/rejected": -2.009974241256714, + "loss": 2.9104, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.56456756591797, + "rewards/margins": 0.5351727604866028, + "rewards/rejected": -20.099742889404297, + "step": 11035 + }, + { + "epoch": 0.37210556473086387, + "grad_norm": 32.99935531616211, + "learning_rate": 7.909149933330608e-07, + "logits/chosen": -0.9975587129592896, + "logits/rejected": -1.0851614475250244, + "logps/chosen": -1.80816650390625, + "logps/rejected": -1.9597301483154297, + "loss": 2.5019, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.081666946411133, + "rewards/margins": 1.5156358480453491, + "rewards/rejected": -19.597301483154297, + "step": 11040 + }, + { + "epoch": 0.37227409080184704, + "grad_norm": 30.022205352783203, + "learning_rate": 7.906757202836097e-07, + "logits/chosen": -1.1794826984405518, + "logits/rejected": -1.2481104135513306, + "logps/chosen": -2.236422300338745, + "logps/rejected": -2.309372663497925, + "loss": 3.417, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.36422348022461, + "rewards/margins": 0.7295053601264954, + "rewards/rejected": -23.09372901916504, + "step": 11045 + }, + { + "epoch": 0.3724426168728302, + "grad_norm": 24.919965744018555, + "learning_rate": 7.90436346642744e-07, + "logits/chosen": -1.1151336431503296, + "logits/rejected": -1.1000896692276, + "logps/chosen": -2.002037286758423, + "logps/rejected": -1.9458297491073608, + "loss": 3.7713, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.020374298095703, + "rewards/margins": -0.5620753169059753, + "rewards/rejected": -19.458297729492188, + "step": 11050 + }, + { + "epoch": 0.3726111429438134, + "grad_norm": 17.231094360351562, + "learning_rate": 7.901968724933015e-07, + "logits/chosen": -0.9927698969841003, + "logits/rejected": -1.0535566806793213, + "logps/chosen": -1.9820178747177124, + "logps/rejected": -2.3538105487823486, + "loss": 2.1248, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.820178985595703, + "rewards/margins": 3.7179248332977295, + "rewards/rejected": -23.538105010986328, + "step": 11055 + }, + { + "epoch": 0.3727796690147966, + "grad_norm": 118.672607421875, + "learning_rate": 7.899572979181545e-07, + "logits/chosen": -1.281635046005249, + "logits/rejected": -1.35465407371521, + "logps/chosen": -1.8064937591552734, + "logps/rejected": -1.8414586782455444, + "loss": 3.1491, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.064937591552734, + "rewards/margins": 0.34965038299560547, + "rewards/rejected": -18.414587020874023, + "step": 11060 + }, + { + "epoch": 0.37294819508577975, + "grad_norm": 27.215576171875, + "learning_rate": 7.897176230002108e-07, + "logits/chosen": -1.197999358177185, + "logits/rejected": -1.0447529554367065, + "logps/chosen": -2.089306592941284, + "logps/rejected": -2.06302547454834, + "loss": 3.4649, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.89306640625, + "rewards/margins": -0.26281100511550903, + "rewards/rejected": -20.630252838134766, + "step": 11065 + }, + { + "epoch": 0.373116721156763, + "grad_norm": 65.9457778930664, + "learning_rate": 7.894778478224123e-07, + "logits/chosen": -1.1507130861282349, + "logits/rejected": -1.225836992263794, + "logps/chosen": -2.147380828857422, + "logps/rejected": -2.0534119606018066, + "loss": 4.3662, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.47380828857422, + "rewards/margins": -0.9396876096725464, + "rewards/rejected": -20.53411865234375, + "step": 11070 + }, + { + "epoch": 0.37328524722774614, + "grad_norm": 22.707504272460938, + "learning_rate": 7.892379724677354e-07, + "logits/chosen": -0.7473500370979309, + "logits/rejected": -0.9139529466629028, + "logps/chosen": -2.0047521591186523, + "logps/rejected": -2.5226516723632812, + "loss": 1.0853, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.04751968383789, + "rewards/margins": 5.178994655609131, + "rewards/rejected": -25.226516723632812, + "step": 11075 + }, + { + "epoch": 0.3734537732987293, + "grad_norm": 24.477022171020508, + "learning_rate": 7.889979970191918e-07, + "logits/chosen": -1.004990816116333, + "logits/rejected": -0.9029370546340942, + "logps/chosen": -1.7345755100250244, + "logps/rejected": -1.6503496170043945, + "loss": 3.9531, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.345754623413086, + "rewards/margins": -0.8422587513923645, + "rewards/rejected": -16.503498077392578, + "step": 11080 + }, + { + "epoch": 0.3736222993697125, + "grad_norm": 50.63272476196289, + "learning_rate": 7.887579215598277e-07, + "logits/chosen": -0.6778031587600708, + "logits/rejected": -0.8914194107055664, + "logps/chosen": -2.030156373977661, + "logps/rejected": -2.2473902702331543, + "loss": 2.6024, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.301563262939453, + "rewards/margins": 2.172337293624878, + "rewards/rejected": -22.473901748657227, + "step": 11085 + }, + { + "epoch": 0.3737908254406957, + "grad_norm": 6.910362720489502, + "learning_rate": 7.885177461727233e-07, + "logits/chosen": -0.8329635858535767, + "logits/rejected": -0.8338441848754883, + "logps/chosen": -1.9738490581512451, + "logps/rejected": -2.2185306549072266, + "loss": 3.3751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.73849105834961, + "rewards/margins": 2.4468133449554443, + "rewards/rejected": -22.185306549072266, + "step": 11090 + }, + { + "epoch": 0.37395935151167886, + "grad_norm": 15.434812545776367, + "learning_rate": 7.88277470940994e-07, + "logits/chosen": -1.3695753812789917, + "logits/rejected": -1.4795372486114502, + "logps/chosen": -2.154623031616211, + "logps/rejected": -2.047244071960449, + "loss": 4.2477, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.54623031616211, + "rewards/margins": -1.0737907886505127, + "rewards/rejected": -20.47243881225586, + "step": 11095 + }, + { + "epoch": 0.374127877582662, + "grad_norm": 20.866865158081055, + "learning_rate": 7.8803709594779e-07, + "logits/chosen": -1.5691171884536743, + "logits/rejected": -1.4664461612701416, + "logps/chosen": -1.8162921667099, + "logps/rejected": -1.9663314819335938, + "loss": 2.1468, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.162921905517578, + "rewards/margins": 1.5003941059112549, + "rewards/rejected": -19.663318634033203, + "step": 11100 + }, + { + "epoch": 0.3742964036536452, + "grad_norm": 18.494287490844727, + "learning_rate": 7.877966212762952e-07, + "logits/chosen": -1.304776906967163, + "logits/rejected": -1.2949206829071045, + "logps/chosen": -1.9437240362167358, + "logps/rejected": -2.265573740005493, + "loss": 1.9874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.43724250793457, + "rewards/margins": 3.218493938446045, + "rewards/rejected": -22.655736923217773, + "step": 11105 + }, + { + "epoch": 0.3744649297246284, + "grad_norm": 122.1553955078125, + "learning_rate": 7.875560470097285e-07, + "logits/chosen": -1.307988166809082, + "logits/rejected": -1.3546937704086304, + "logps/chosen": -2.1325485706329346, + "logps/rejected": -2.3272907733917236, + "loss": 2.4457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.325485229492188, + "rewards/margins": 1.9474217891693115, + "rewards/rejected": -23.27290916442871, + "step": 11110 + }, + { + "epoch": 0.3746334557956116, + "grad_norm": 25.68359375, + "learning_rate": 7.873153732313432e-07, + "logits/chosen": -1.16599440574646, + "logits/rejected": -1.28875732421875, + "logps/chosen": -1.7075920104980469, + "logps/rejected": -1.694820761680603, + "loss": 3.3776, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.07592010498047, + "rewards/margins": -0.1277121603488922, + "rewards/rejected": -16.948205947875977, + "step": 11115 + }, + { + "epoch": 0.37480198186659475, + "grad_norm": 100.46998596191406, + "learning_rate": 7.870746000244269e-07, + "logits/chosen": -0.9557684063911438, + "logits/rejected": -0.9845932126045227, + "logps/chosen": -2.7109880447387695, + "logps/rejected": -3.0132102966308594, + "loss": 3.0756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.109878540039062, + "rewards/margins": 3.0222253799438477, + "rewards/rejected": -30.132104873657227, + "step": 11120 + }, + { + "epoch": 0.37497050793757797, + "grad_norm": 20.687875747680664, + "learning_rate": 7.868337274723018e-07, + "logits/chosen": -1.0712049007415771, + "logits/rejected": -1.2100828886032104, + "logps/chosen": -1.7976467609405518, + "logps/rejected": -2.0642523765563965, + "loss": 2.1613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.976465225219727, + "rewards/margins": 2.666057586669922, + "rewards/rejected": -20.64252471923828, + "step": 11125 + }, + { + "epoch": 0.37513903400856113, + "grad_norm": 19.09157371520996, + "learning_rate": 7.865927556583245e-07, + "logits/chosen": -0.9747500419616699, + "logits/rejected": -1.2264236211776733, + "logps/chosen": -2.065001964569092, + "logps/rejected": -2.3609440326690674, + "loss": 1.7971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.6500186920166, + "rewards/margins": 2.959421396255493, + "rewards/rejected": -23.609439849853516, + "step": 11130 + }, + { + "epoch": 0.3753075600795443, + "grad_norm": 21.92704963684082, + "learning_rate": 7.863516846658857e-07, + "logits/chosen": -0.8536840677261353, + "logits/rejected": -1.1422303915023804, + "logps/chosen": -1.911171317100525, + "logps/rejected": -1.7837629318237305, + "loss": 4.7212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.111713409423828, + "rewards/margins": -1.2740838527679443, + "rewards/rejected": -17.837631225585938, + "step": 11135 + }, + { + "epoch": 0.37547608615052747, + "grad_norm": 20.51827621459961, + "learning_rate": 7.861105145784108e-07, + "logits/chosen": -0.713087260723114, + "logits/rejected": -0.7482129335403442, + "logps/chosen": -2.575368881225586, + "logps/rejected": -2.7310588359832764, + "loss": 2.7725, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.75368881225586, + "rewards/margins": 1.556899070739746, + "rewards/rejected": -27.31058692932129, + "step": 11140 + }, + { + "epoch": 0.3756446122215107, + "grad_norm": 18.282073974609375, + "learning_rate": 7.858692454793589e-07, + "logits/chosen": -0.7038525938987732, + "logits/rejected": -0.787943959236145, + "logps/chosen": -2.6329092979431152, + "logps/rejected": -2.9737820625305176, + "loss": 2.7294, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.3290958404541, + "rewards/margins": 3.408724308013916, + "rewards/rejected": -29.737817764282227, + "step": 11145 + }, + { + "epoch": 0.37581313829249385, + "grad_norm": 13.00042724609375, + "learning_rate": 7.856278774522242e-07, + "logits/chosen": -0.9289695024490356, + "logits/rejected": -1.0384521484375, + "logps/chosen": -1.6467241048812866, + "logps/rejected": -1.785021185874939, + "loss": 1.9632, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.467241287231445, + "rewards/margins": 1.3829692602157593, + "rewards/rejected": -17.850210189819336, + "step": 11150 + }, + { + "epoch": 0.375981664363477, + "grad_norm": 18.52608299255371, + "learning_rate": 7.853864105805342e-07, + "logits/chosen": -1.4320456981658936, + "logits/rejected": -1.6049312353134155, + "logps/chosen": -1.8154075145721436, + "logps/rejected": -1.932138204574585, + "loss": 2.4999, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.154075622558594, + "rewards/margins": 1.1673085689544678, + "rewards/rejected": -19.32138442993164, + "step": 11155 + }, + { + "epoch": 0.3761501904344602, + "grad_norm": 16.68703842163086, + "learning_rate": 7.851448449478513e-07, + "logits/chosen": -0.8950377702713013, + "logits/rejected": -0.6943656206130981, + "logps/chosen": -1.9260505437850952, + "logps/rejected": -2.0210115909576416, + "loss": 3.0619, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.26050567626953, + "rewards/margins": 0.949609100818634, + "rewards/rejected": -20.21011734008789, + "step": 11160 + }, + { + "epoch": 0.3763187165054434, + "grad_norm": 23.949373245239258, + "learning_rate": 7.84903180637772e-07, + "logits/chosen": -1.0606566667556763, + "logits/rejected": -0.9929370880126953, + "logps/chosen": -1.6432468891143799, + "logps/rejected": -1.7798480987548828, + "loss": 2.4106, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.43246841430664, + "rewards/margins": 1.366010308265686, + "rewards/rejected": -17.798480987548828, + "step": 11165 + }, + { + "epoch": 0.3764872425764266, + "grad_norm": 39.09119415283203, + "learning_rate": 7.846614177339264e-07, + "logits/chosen": -0.5430514812469482, + "logits/rejected": -0.8053268194198608, + "logps/chosen": -2.0910189151763916, + "logps/rejected": -2.1563382148742676, + "loss": 2.8637, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.910188674926758, + "rewards/margins": 0.6531929969787598, + "rewards/rejected": -21.563379287719727, + "step": 11170 + }, + { + "epoch": 0.37665576864740974, + "grad_norm": 14.804924964904785, + "learning_rate": 7.844195563199794e-07, + "logits/chosen": -0.9070854187011719, + "logits/rejected": -1.100773572921753, + "logps/chosen": -2.2509732246398926, + "logps/rejected": -2.2927937507629395, + "loss": 3.488, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.50973129272461, + "rewards/margins": 0.4182073473930359, + "rewards/rejected": -22.92793846130371, + "step": 11175 + }, + { + "epoch": 0.37682429471839296, + "grad_norm": 65.736083984375, + "learning_rate": 7.841775964796296e-07, + "logits/chosen": -0.9328392148017883, + "logits/rejected": -1.3494576215744019, + "logps/chosen": -2.0066604614257812, + "logps/rejected": -2.2377495765686035, + "loss": 2.3296, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.066606521606445, + "rewards/margins": 2.3108911514282227, + "rewards/rejected": -22.377498626708984, + "step": 11180 + }, + { + "epoch": 0.3769928207893761, + "grad_norm": 51.13811492919922, + "learning_rate": 7.8393553829661e-07, + "logits/chosen": -0.9025689959526062, + "logits/rejected": -1.1935182809829712, + "logps/chosen": -1.9853508472442627, + "logps/rejected": -2.0312693119049072, + "loss": 3.7426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.85350799560547, + "rewards/margins": 0.4591858983039856, + "rewards/rejected": -20.312694549560547, + "step": 11185 + }, + { + "epoch": 0.3771613468603593, + "grad_norm": 23.337675094604492, + "learning_rate": 7.83693381854687e-07, + "logits/chosen": -0.9358808398246765, + "logits/rejected": -1.1639865636825562, + "logps/chosen": -1.9129003286361694, + "logps/rejected": -2.0121288299560547, + "loss": 3.1006, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.129003524780273, + "rewards/margins": 0.992284893989563, + "rewards/rejected": -20.121288299560547, + "step": 11190 + }, + { + "epoch": 0.37732987293134246, + "grad_norm": 22.129470825195312, + "learning_rate": 7.834511272376616e-07, + "logits/chosen": -1.1202242374420166, + "logits/rejected": -1.1665992736816406, + "logps/chosen": -1.6695282459259033, + "logps/rejected": -1.8295204639434814, + "loss": 1.887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.695281982421875, + "rewards/margins": 1.5999218225479126, + "rewards/rejected": -18.295204162597656, + "step": 11195 + }, + { + "epoch": 0.3774983990023257, + "grad_norm": 19.949989318847656, + "learning_rate": 7.832087745293687e-07, + "logits/chosen": -1.039284348487854, + "logits/rejected": -1.0724128484725952, + "logps/chosen": -1.8849290609359741, + "logps/rejected": -2.1825642585754395, + "loss": 1.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.849288940429688, + "rewards/margins": 2.9763526916503906, + "rewards/rejected": -21.82564353942871, + "step": 11200 + }, + { + "epoch": 0.3774983990023257, + "eval_logits/chosen": -1.4173237085342407, + "eval_logits/rejected": -1.5109046697616577, + "eval_logps/chosen": -1.9158105850219727, + "eval_logps/rejected": -1.9985766410827637, + "eval_loss": 3.0528249740600586, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -19.158105850219727, + "eval_rewards/margins": 0.8276617527008057, + "eval_rewards/rejected": -19.985767364501953, + "eval_runtime": 12.8962, + "eval_samples_per_second": 7.754, + "eval_steps_per_second": 1.939, + "step": 11200 + }, + { + "epoch": 0.37766692507330885, + "grad_norm": 90.48750305175781, + "learning_rate": 7.829663238136769e-07, + "logits/chosen": -0.6559727787971497, + "logits/rejected": -0.6415562629699707, + "logps/chosen": -2.196298122406006, + "logps/rejected": -2.155961275100708, + "loss": 3.5922, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.96297836303711, + "rewards/margins": -0.40336617827415466, + "rewards/rejected": -21.559612274169922, + "step": 11205 + }, + { + "epoch": 0.377835451144292, + "grad_norm": 19.182260513305664, + "learning_rate": 7.827237751744889e-07, + "logits/chosen": -0.9353266954421997, + "logits/rejected": -1.0023210048675537, + "logps/chosen": -1.7862240076065063, + "logps/rejected": -1.8538305759429932, + "loss": 2.629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.862239837646484, + "rewards/margins": 0.6760631799697876, + "rewards/rejected": -18.53830337524414, + "step": 11210 + }, + { + "epoch": 0.3780039772152752, + "grad_norm": 37.58772659301758, + "learning_rate": 7.824811286957411e-07, + "logits/chosen": -0.5556604862213135, + "logits/rejected": -0.8810178637504578, + "logps/chosen": -2.6270499229431152, + "logps/rejected": -2.7347140312194824, + "loss": 4.833, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.2705020904541, + "rewards/margins": 1.0766421556472778, + "rewards/rejected": -27.347143173217773, + "step": 11215 + }, + { + "epoch": 0.3781725032862584, + "grad_norm": 25.184528350830078, + "learning_rate": 7.82238384461404e-07, + "logits/chosen": -1.3108174800872803, + "logits/rejected": -1.623822808265686, + "logps/chosen": -2.8811333179473877, + "logps/rejected": -3.0212135314941406, + "loss": 4.4953, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.811330795288086, + "rewards/margins": 1.400800347328186, + "rewards/rejected": -30.212133407592773, + "step": 11220 + }, + { + "epoch": 0.37834102935724157, + "grad_norm": 97.33984375, + "learning_rate": 7.819955425554818e-07, + "logits/chosen": -1.1681185960769653, + "logits/rejected": -1.3820760250091553, + "logps/chosen": -2.2876718044281006, + "logps/rejected": -2.4516873359680176, + "loss": 3.3757, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.876718521118164, + "rewards/margins": 1.6401538848876953, + "rewards/rejected": -24.51687240600586, + "step": 11225 + }, + { + "epoch": 0.37850955542822473, + "grad_norm": 24.39796257019043, + "learning_rate": 7.817526030620125e-07, + "logits/chosen": -0.8352615237236023, + "logits/rejected": -0.9630219340324402, + "logps/chosen": -1.8876619338989258, + "logps/rejected": -1.981488823890686, + "loss": 2.2906, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.876617431640625, + "rewards/margins": 0.9382714033126831, + "rewards/rejected": -19.814889907836914, + "step": 11230 + }, + { + "epoch": 0.37867808149920795, + "grad_norm": 38.77871322631836, + "learning_rate": 7.815095660650679e-07, + "logits/chosen": -1.0813668966293335, + "logits/rejected": -1.2330832481384277, + "logps/chosen": -1.8022918701171875, + "logps/rejected": -1.9867006540298462, + "loss": 1.755, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.022918701171875, + "rewards/margins": 1.84408700466156, + "rewards/rejected": -19.867008209228516, + "step": 11235 + }, + { + "epoch": 0.3788466075701911, + "grad_norm": 39.55854034423828, + "learning_rate": 7.812664316487534e-07, + "logits/chosen": -1.3786113262176514, + "logits/rejected": -1.407435655593872, + "logps/chosen": -1.9177662134170532, + "logps/rejected": -2.2436928749084473, + "loss": 1.6302, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.177661895751953, + "rewards/margins": 3.2592673301696777, + "rewards/rejected": -22.43692970275879, + "step": 11240 + }, + { + "epoch": 0.3790151336411743, + "grad_norm": 14.872421264648438, + "learning_rate": 7.810231998972085e-07, + "logits/chosen": -0.9885151982307434, + "logits/rejected": -1.0631240606307983, + "logps/chosen": -1.8154971599578857, + "logps/rejected": -2.111203670501709, + "loss": 1.7458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.15497398376465, + "rewards/margins": 2.957064151763916, + "rewards/rejected": -21.112035751342773, + "step": 11245 + }, + { + "epoch": 0.37918365971215745, + "grad_norm": 20.53430938720703, + "learning_rate": 7.80779870894606e-07, + "logits/chosen": -0.9703305959701538, + "logits/rejected": -1.236297369003296, + "logps/chosen": -1.994520902633667, + "logps/rejected": -2.0560126304626465, + "loss": 2.6613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.945209503173828, + "rewards/margins": 0.6149150729179382, + "rewards/rejected": -20.560123443603516, + "step": 11250 + }, + { + "epoch": 0.37935218578314067, + "grad_norm": 30.372365951538086, + "learning_rate": 7.805364447251524e-07, + "logits/chosen": -0.8832541704177856, + "logits/rejected": -0.8103952407836914, + "logps/chosen": -1.8758599758148193, + "logps/rejected": -1.8299274444580078, + "loss": 3.6637, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.758602142333984, + "rewards/margins": -0.45932644605636597, + "rewards/rejected": -18.29927635192871, + "step": 11255 + }, + { + "epoch": 0.37952071185412384, + "grad_norm": 24.143922805786133, + "learning_rate": 7.80292921473088e-07, + "logits/chosen": -0.9485553503036499, + "logits/rejected": -1.2246477603912354, + "logps/chosen": -1.9528968334197998, + "logps/rejected": -1.9845561981201172, + "loss": 2.8398, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.528968811035156, + "rewards/margins": 0.31659451127052307, + "rewards/rejected": -19.845561981201172, + "step": 11260 + }, + { + "epoch": 0.379689237925107, + "grad_norm": 19.331287384033203, + "learning_rate": 7.800493012226865e-07, + "logits/chosen": -1.026592493057251, + "logits/rejected": -1.1320085525512695, + "logps/chosen": -1.7972173690795898, + "logps/rejected": -1.9342625141143799, + "loss": 2.1796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.9721736907959, + "rewards/margins": 1.370451807975769, + "rewards/rejected": -19.34262466430664, + "step": 11265 + }, + { + "epoch": 0.37985776399609017, + "grad_norm": 45.51408767700195, + "learning_rate": 7.798055840582555e-07, + "logits/chosen": -0.6429299116134644, + "logits/rejected": -0.831339955329895, + "logps/chosen": -2.9012444019317627, + "logps/rejected": -2.4655685424804688, + "loss": 7.7611, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.0124454498291, + "rewards/margins": -4.356759548187256, + "rewards/rejected": -24.655685424804688, + "step": 11270 + }, + { + "epoch": 0.3800262900670734, + "grad_norm": 75.51453399658203, + "learning_rate": 7.795617700641356e-07, + "logits/chosen": -0.9120687246322632, + "logits/rejected": -0.9755932688713074, + "logps/chosen": -2.571845769882202, + "logps/rejected": -2.770507574081421, + "loss": 2.619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.718456268310547, + "rewards/margins": 1.986619234085083, + "rewards/rejected": -27.705074310302734, + "step": 11275 + }, + { + "epoch": 0.38019481613805656, + "grad_norm": 8.264686584472656, + "learning_rate": 7.793178593247014e-07, + "logits/chosen": -1.3956291675567627, + "logits/rejected": -1.326690435409546, + "logps/chosen": -2.0324623584747314, + "logps/rejected": -2.5627822875976562, + "loss": 1.8257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.324626922607422, + "rewards/margins": 5.303196430206299, + "rewards/rejected": -25.627822875976562, + "step": 11280 + }, + { + "epoch": 0.3803633422090397, + "grad_norm": 24.20513916015625, + "learning_rate": 7.790738519243609e-07, + "logits/chosen": -1.0150988101959229, + "logits/rejected": -1.3087393045425415, + "logps/chosen": -1.8914064168930054, + "logps/rejected": -2.2175402641296387, + "loss": 2.9225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.914064407348633, + "rewards/margins": 3.2613399028778076, + "rewards/rejected": -22.175403594970703, + "step": 11285 + }, + { + "epoch": 0.38053186828002294, + "grad_norm": 31.742076873779297, + "learning_rate": 7.788297479475552e-07, + "logits/chosen": -1.392856240272522, + "logits/rejected": -1.4201936721801758, + "logps/chosen": -1.9414689540863037, + "logps/rejected": -2.150960922241211, + "loss": 2.3635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.414690017700195, + "rewards/margins": 2.094921112060547, + "rewards/rejected": -21.509611129760742, + "step": 11290 + }, + { + "epoch": 0.3807003943510061, + "grad_norm": 24.79743194580078, + "learning_rate": 7.785855474787593e-07, + "logits/chosen": -0.5785871744155884, + "logits/rejected": -0.6766026020050049, + "logps/chosen": -2.581320285797119, + "logps/rejected": -2.841749668121338, + "loss": 3.0005, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.813201904296875, + "rewards/margins": 2.604294538497925, + "rewards/rejected": -28.417495727539062, + "step": 11295 + }, + { + "epoch": 0.3808689204219893, + "grad_norm": 33.3655891418457, + "learning_rate": 7.783412506024811e-07, + "logits/chosen": -1.11223566532135, + "logits/rejected": -1.057945966720581, + "logps/chosen": -2.075849771499634, + "logps/rejected": -2.0295917987823486, + "loss": 3.6025, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.75849723815918, + "rewards/margins": -0.46258020401000977, + "rewards/rejected": -20.295917510986328, + "step": 11300 + }, + { + "epoch": 0.38103744649297244, + "grad_norm": 21.387447357177734, + "learning_rate": 7.780968574032625e-07, + "logits/chosen": -1.4144738912582397, + "logits/rejected": -1.3412866592407227, + "logps/chosen": -1.6566816568374634, + "logps/rejected": -1.8312675952911377, + "loss": 2.6812, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.566818237304688, + "rewards/margins": 1.7458572387695312, + "rewards/rejected": -18.312673568725586, + "step": 11305 + }, + { + "epoch": 0.38120597256395566, + "grad_norm": 24.164539337158203, + "learning_rate": 7.778523679656779e-07, + "logits/chosen": -1.141021490097046, + "logits/rejected": -1.0232211351394653, + "logps/chosen": -2.0743701457977295, + "logps/rejected": -2.0848662853240967, + "loss": 3.68, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.743701934814453, + "rewards/margins": 0.10496292263269424, + "rewards/rejected": -20.848665237426758, + "step": 11310 + }, + { + "epoch": 0.38137449863493883, + "grad_norm": 21.93690299987793, + "learning_rate": 7.776077823743357e-07, + "logits/chosen": -1.4113761186599731, + "logits/rejected": -1.4901399612426758, + "logps/chosen": -1.8558003902435303, + "logps/rejected": -1.9158865213394165, + "loss": 3.2107, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.55800437927246, + "rewards/margins": 0.6008610725402832, + "rewards/rejected": -19.158864974975586, + "step": 11315 + }, + { + "epoch": 0.381543024705922, + "grad_norm": 23.51809310913086, + "learning_rate": 7.773631007138774e-07, + "logits/chosen": -1.0101337432861328, + "logits/rejected": -1.177026391029358, + "logps/chosen": -1.8464853763580322, + "logps/rejected": -1.8554184436798096, + "loss": 3.04, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.464853286743164, + "rewards/margins": 0.08933134377002716, + "rewards/rejected": -18.554183959960938, + "step": 11320 + }, + { + "epoch": 0.38171155077690516, + "grad_norm": 17.446304321289062, + "learning_rate": 7.771183230689777e-07, + "logits/chosen": -0.7555183172225952, + "logits/rejected": -0.8380460739135742, + "logps/chosen": -1.8387556076049805, + "logps/rejected": -2.0808956623077393, + "loss": 1.905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.387554168701172, + "rewards/margins": 2.421400547027588, + "rewards/rejected": -20.808956146240234, + "step": 11325 + }, + { + "epoch": 0.3818800768478884, + "grad_norm": 85.44268035888672, + "learning_rate": 7.768734495243443e-07, + "logits/chosen": -1.2770483493804932, + "logits/rejected": -1.438614845275879, + "logps/chosen": -2.207958698272705, + "logps/rejected": -2.1826369762420654, + "loss": 3.6512, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.079586029052734, + "rewards/margins": -0.25321730971336365, + "rewards/rejected": -21.826370239257812, + "step": 11330 + }, + { + "epoch": 0.38204860291887155, + "grad_norm": 24.92569351196289, + "learning_rate": 7.766284801647185e-07, + "logits/chosen": -0.9356335401535034, + "logits/rejected": -0.9447436332702637, + "logps/chosen": -1.8274080753326416, + "logps/rejected": -2.022212028503418, + "loss": 2.2281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.274080276489258, + "rewards/margins": 1.948040246963501, + "rewards/rejected": -20.22212028503418, + "step": 11335 + }, + { + "epoch": 0.3822171289898547, + "grad_norm": 37.96074295043945, + "learning_rate": 7.763834150748744e-07, + "logits/chosen": -0.9241237640380859, + "logits/rejected": -1.1175363063812256, + "logps/chosen": -1.5214978456497192, + "logps/rejected": -1.6261125802993774, + "loss": 3.0458, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.21497917175293, + "rewards/margins": 1.046147108078003, + "rewards/rejected": -16.261127471923828, + "step": 11340 + }, + { + "epoch": 0.38238565506083794, + "grad_norm": 30.894428253173828, + "learning_rate": 7.761382543396194e-07, + "logits/chosen": -1.182045340538025, + "logits/rejected": -1.2811795473098755, + "logps/chosen": -1.7931095361709595, + "logps/rejected": -1.8102737665176392, + "loss": 2.9517, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.931095123291016, + "rewards/margins": 0.17164134979248047, + "rewards/rejected": -18.102737426757812, + "step": 11345 + }, + { + "epoch": 0.3825541811318211, + "grad_norm": 55.25584030151367, + "learning_rate": 7.758929980437938e-07, + "logits/chosen": -0.961665153503418, + "logits/rejected": -1.0043509006500244, + "logps/chosen": -1.9752800464630127, + "logps/rejected": -2.2029144763946533, + "loss": 2.7367, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.752798080444336, + "rewards/margins": 2.276346206665039, + "rewards/rejected": -22.029144287109375, + "step": 11350 + }, + { + "epoch": 0.38272270720280427, + "grad_norm": 17.4051513671875, + "learning_rate": 7.756476462722716e-07, + "logits/chosen": -0.5869064331054688, + "logits/rejected": -0.6633475422859192, + "logps/chosen": -2.171950578689575, + "logps/rejected": -2.3819758892059326, + "loss": 2.1582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.719507217407227, + "rewards/margins": 2.1002535820007324, + "rewards/rejected": -23.819759368896484, + "step": 11355 + }, + { + "epoch": 0.38289123327378743, + "grad_norm": 29.156442642211914, + "learning_rate": 7.75402199109959e-07, + "logits/chosen": -1.1302787065505981, + "logits/rejected": -1.266941785812378, + "logps/chosen": -2.077214002609253, + "logps/rejected": -2.3803865909576416, + "loss": 2.6726, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.772140502929688, + "rewards/margins": 3.0317249298095703, + "rewards/rejected": -23.80386734008789, + "step": 11360 + }, + { + "epoch": 0.38305975934477066, + "grad_norm": 17.059425354003906, + "learning_rate": 7.751566566417957e-07, + "logits/chosen": -0.5787457823753357, + "logits/rejected": -0.9798294305801392, + "logps/chosen": -1.8301093578338623, + "logps/rejected": -2.0004382133483887, + "loss": 2.7989, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.301095962524414, + "rewards/margins": 1.7032867670059204, + "rewards/rejected": -20.00438117980957, + "step": 11365 + }, + { + "epoch": 0.3832282854157538, + "grad_norm": 47.650630950927734, + "learning_rate": 7.749110189527543e-07, + "logits/chosen": -1.153541088104248, + "logits/rejected": -1.209975242614746, + "logps/chosen": -1.8833907842636108, + "logps/rejected": -2.014538288116455, + "loss": 3.4553, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -18.833908081054688, + "rewards/margins": 1.3114765882492065, + "rewards/rejected": -20.145383834838867, + "step": 11370 + }, + { + "epoch": 0.383396811486737, + "grad_norm": 19.723957061767578, + "learning_rate": 7.746652861278403e-07, + "logits/chosen": -0.9713567495346069, + "logits/rejected": -1.041154384613037, + "logps/chosen": -2.3281097412109375, + "logps/rejected": -2.4205708503723145, + "loss": 3.5797, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.281097412109375, + "rewards/margins": 0.9246100187301636, + "rewards/rejected": -24.205707550048828, + "step": 11375 + }, + { + "epoch": 0.38356533755772015, + "grad_norm": 32.127601623535156, + "learning_rate": 7.744194582520922e-07, + "logits/chosen": -1.2071858644485474, + "logits/rejected": -1.1609976291656494, + "logps/chosen": -2.1086649894714355, + "logps/rejected": -2.12287974357605, + "loss": 3.3836, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.086650848388672, + "rewards/margins": 0.14214667677879333, + "rewards/rejected": -21.228797912597656, + "step": 11380 + }, + { + "epoch": 0.3837338636287034, + "grad_norm": 28.18889045715332, + "learning_rate": 7.741735354105812e-07, + "logits/chosen": -0.9639381170272827, + "logits/rejected": -0.873441219329834, + "logps/chosen": -1.969211220741272, + "logps/rejected": -2.263988733291626, + "loss": 2.5078, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.69211196899414, + "rewards/margins": 2.9477756023406982, + "rewards/rejected": -22.639888763427734, + "step": 11385 + }, + { + "epoch": 0.38390238969968654, + "grad_norm": 19.48329734802246, + "learning_rate": 7.739275176884117e-07, + "logits/chosen": -1.3636186122894287, + "logits/rejected": -1.2289994955062866, + "logps/chosen": -2.4750638008117676, + "logps/rejected": -2.5021533966064453, + "loss": 3.0527, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.75063705444336, + "rewards/margins": 0.2708970010280609, + "rewards/rejected": -25.021533966064453, + "step": 11390 + }, + { + "epoch": 0.3840709157706697, + "grad_norm": 13.585640907287598, + "learning_rate": 7.736814051707204e-07, + "logits/chosen": -1.298323392868042, + "logits/rejected": -1.3097971677780151, + "logps/chosen": -1.9604408740997314, + "logps/rejected": -2.163069486618042, + "loss": 2.75, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.604406356811523, + "rewards/margins": 2.026287794113159, + "rewards/rejected": -21.630695343017578, + "step": 11395 + }, + { + "epoch": 0.38423944184165293, + "grad_norm": 19.65452003479004, + "learning_rate": 7.734351979426776e-07, + "logits/chosen": -0.9546276330947876, + "logits/rejected": -1.0646032094955444, + "logps/chosen": -1.8960742950439453, + "logps/rejected": -2.45497465133667, + "loss": 2.2316, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.960744857788086, + "rewards/margins": 5.5890021324157715, + "rewards/rejected": -24.549747467041016, + "step": 11400 + }, + { + "epoch": 0.3844079679126361, + "grad_norm": 19.066781997680664, + "learning_rate": 7.731888960894857e-07, + "logits/chosen": -1.4005488157272339, + "logits/rejected": -1.523158311843872, + "logps/chosen": -2.310615301132202, + "logps/rejected": -2.218581438064575, + "loss": 4.5198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.106151580810547, + "rewards/margins": -0.92033851146698, + "rewards/rejected": -22.185815811157227, + "step": 11405 + }, + { + "epoch": 0.38457649398361926, + "grad_norm": 22.03188705444336, + "learning_rate": 7.7294249969638e-07, + "logits/chosen": -1.5152934789657593, + "logits/rejected": -1.56234610080719, + "logps/chosen": -2.118349313735962, + "logps/rejected": -2.1849474906921387, + "loss": 2.6433, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.18349266052246, + "rewards/margins": 0.665981650352478, + "rewards/rejected": -21.84947395324707, + "step": 11410 + }, + { + "epoch": 0.3847450200546024, + "grad_norm": 33.175025939941406, + "learning_rate": 7.726960088486288e-07, + "logits/chosen": -0.6917494535446167, + "logits/rejected": -0.6354162693023682, + "logps/chosen": -2.822929859161377, + "logps/rejected": -2.600625514984131, + "loss": 5.3058, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -28.229299545288086, + "rewards/margins": -2.2230448722839355, + "rewards/rejected": -26.006256103515625, + "step": 11415 + }, + { + "epoch": 0.38491354612558565, + "grad_norm": 41.19036102294922, + "learning_rate": 7.724494236315327e-07, + "logits/chosen": -1.2923592329025269, + "logits/rejected": -1.0900766849517822, + "logps/chosen": -2.182579755783081, + "logps/rejected": -2.3524978160858154, + "loss": 3.9432, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.8257999420166, + "rewards/margins": 1.6991780996322632, + "rewards/rejected": -23.524974822998047, + "step": 11420 + }, + { + "epoch": 0.3850820721965688, + "grad_norm": 23.587520599365234, + "learning_rate": 7.722027441304251e-07, + "logits/chosen": -1.6018108129501343, + "logits/rejected": -1.6550190448760986, + "logps/chosen": -1.9862916469573975, + "logps/rejected": -2.0539584159851074, + "loss": 2.6674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.862918853759766, + "rewards/margins": 0.6766681671142578, + "rewards/rejected": -20.53958511352539, + "step": 11425 + }, + { + "epoch": 0.385250598267552, + "grad_norm": 23.030256271362305, + "learning_rate": 7.719559704306719e-07, + "logits/chosen": -0.8091068267822266, + "logits/rejected": -0.777170717716217, + "logps/chosen": -2.224095582962036, + "logps/rejected": -2.261972665786743, + "loss": 3.4069, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.240955352783203, + "rewards/margins": 0.3787725865840912, + "rewards/rejected": -22.61972427368164, + "step": 11430 + }, + { + "epoch": 0.38541912433853515, + "grad_norm": 23.55622100830078, + "learning_rate": 7.717091026176724e-07, + "logits/chosen": -1.5814132690429688, + "logits/rejected": -1.5999425649642944, + "logps/chosen": -2.2364578247070312, + "logps/rejected": -2.3260645866394043, + "loss": 3.1145, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.364580154418945, + "rewards/margins": 0.8960673213005066, + "rewards/rejected": -23.26064682006836, + "step": 11435 + }, + { + "epoch": 0.38558765040951837, + "grad_norm": 24.407276153564453, + "learning_rate": 7.714621407768571e-07, + "logits/chosen": -1.0618751049041748, + "logits/rejected": -1.2050310373306274, + "logps/chosen": -1.7198562622070312, + "logps/rejected": -1.9511902332305908, + "loss": 2.3609, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.198562622070312, + "rewards/margins": 2.3133392333984375, + "rewards/rejected": -19.51190185546875, + "step": 11440 + }, + { + "epoch": 0.38575617648050153, + "grad_norm": 33.41534423828125, + "learning_rate": 7.712150849936902e-07, + "logits/chosen": -1.1403298377990723, + "logits/rejected": -0.9351509213447571, + "logps/chosen": -2.2158753871917725, + "logps/rejected": -2.1572763919830322, + "loss": 3.8557, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.158754348754883, + "rewards/margins": -0.5859910249710083, + "rewards/rejected": -21.572763442993164, + "step": 11445 + }, + { + "epoch": 0.3859247025514847, + "grad_norm": 25.274030685424805, + "learning_rate": 7.709679353536678e-07, + "logits/chosen": -1.1853322982788086, + "logits/rejected": -0.947810173034668, + "logps/chosen": -2.2994515895843506, + "logps/rejected": -2.129556894302368, + "loss": 5.0149, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.994516372680664, + "rewards/margins": -1.6989485025405884, + "rewards/rejected": -21.295568466186523, + "step": 11450 + }, + { + "epoch": 0.3860932286224679, + "grad_norm": 26.36060905456543, + "learning_rate": 7.707206919423186e-07, + "logits/chosen": -1.122575044631958, + "logits/rejected": -1.1204006671905518, + "logps/chosen": -2.4006028175354004, + "logps/rejected": -2.2933762073516846, + "loss": 4.2671, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.006031036376953, + "rewards/margins": -1.0722652673721313, + "rewards/rejected": -22.93376350402832, + "step": 11455 + }, + { + "epoch": 0.3862617546934511, + "grad_norm": 19.837413787841797, + "learning_rate": 7.704733548452041e-07, + "logits/chosen": -1.243242859840393, + "logits/rejected": -1.3733158111572266, + "logps/chosen": -1.903545618057251, + "logps/rejected": -1.9424865245819092, + "loss": 3.0499, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.03545570373535, + "rewards/margins": 0.38941067457199097, + "rewards/rejected": -19.42486572265625, + "step": 11460 + }, + { + "epoch": 0.38643028076443425, + "grad_norm": 20.840614318847656, + "learning_rate": 7.702259241479174e-07, + "logits/chosen": -1.3639594316482544, + "logits/rejected": -1.418534517288208, + "logps/chosen": -1.9076082706451416, + "logps/rejected": -2.043626308441162, + "loss": 2.1244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.076082229614258, + "rewards/margins": 1.3601806163787842, + "rewards/rejected": -20.436264038085938, + "step": 11465 + }, + { + "epoch": 0.3865988068354174, + "grad_norm": 22.049299240112305, + "learning_rate": 7.69978399936085e-07, + "logits/chosen": -1.047729730606079, + "logits/rejected": -1.1679027080535889, + "logps/chosen": -1.8407936096191406, + "logps/rejected": -2.011669397354126, + "loss": 2.3953, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.407936096191406, + "rewards/margins": 1.7087585926055908, + "rewards/rejected": -20.116695404052734, + "step": 11470 + }, + { + "epoch": 0.38676733290640064, + "grad_norm": 19.621408462524414, + "learning_rate": 7.697307822953651e-07, + "logits/chosen": -0.9885059595108032, + "logits/rejected": -0.8718851208686829, + "logps/chosen": -2.2795028686523438, + "logps/rejected": -2.277373790740967, + "loss": 3.3437, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.795028686523438, + "rewards/margins": -0.021291160956025124, + "rewards/rejected": -22.773738861083984, + "step": 11475 + }, + { + "epoch": 0.3869358589773838, + "grad_norm": 77.31005859375, + "learning_rate": 7.694830713114484e-07, + "logits/chosen": -0.5937200784683228, + "logits/rejected": -0.6495085954666138, + "logps/chosen": -3.7535088062286377, + "logps/rejected": -3.6097118854522705, + "loss": 5.3228, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -37.53508758544922, + "rewards/margins": -1.4379686117172241, + "rewards/rejected": -36.09711837768555, + "step": 11480 + }, + { + "epoch": 0.387104385048367, + "grad_norm": 95.22547912597656, + "learning_rate": 7.69235267070058e-07, + "logits/chosen": -0.9418071508407593, + "logits/rejected": -0.9999884366989136, + "logps/chosen": -2.5587856769561768, + "logps/rejected": -2.812378406524658, + "loss": 2.48, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.587852478027344, + "rewards/margins": 2.535930871963501, + "rewards/rejected": -28.1237850189209, + "step": 11485 + }, + { + "epoch": 0.38727291111935014, + "grad_norm": 20.330053329467773, + "learning_rate": 7.689873696569491e-07, + "logits/chosen": -1.4741865396499634, + "logits/rejected": -1.4686228036880493, + "logps/chosen": -1.855538010597229, + "logps/rejected": -2.09028959274292, + "loss": 1.9487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.555377960205078, + "rewards/margins": 2.347515821456909, + "rewards/rejected": -20.902894973754883, + "step": 11490 + }, + { + "epoch": 0.38744143719033336, + "grad_norm": 26.7628231048584, + "learning_rate": 7.687393791579092e-07, + "logits/chosen": -0.7385457754135132, + "logits/rejected": -0.8334264755249023, + "logps/chosen": -1.7570949792861938, + "logps/rejected": -2.526315212249756, + "loss": 2.4006, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.570947647094727, + "rewards/margins": 7.692204475402832, + "rewards/rejected": -25.263153076171875, + "step": 11495 + }, + { + "epoch": 0.3876099632613165, + "grad_norm": 24.974245071411133, + "learning_rate": 7.684912956587581e-07, + "logits/chosen": -0.8599656224250793, + "logits/rejected": -1.045178771018982, + "logps/chosen": -1.721909761428833, + "logps/rejected": -1.6989558935165405, + "loss": 3.4335, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.219097137451172, + "rewards/margins": -0.22953709959983826, + "rewards/rejected": -16.989561080932617, + "step": 11500 + }, + { + "epoch": 0.3877784893322997, + "grad_norm": 36.48255157470703, + "learning_rate": 7.682431192453476e-07, + "logits/chosen": -1.283911943435669, + "logits/rejected": -1.2216994762420654, + "logps/chosen": -1.7604789733886719, + "logps/rejected": -1.7060072422027588, + "loss": 3.5949, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.60478973388672, + "rewards/margins": -0.5447174310684204, + "rewards/rejected": -17.06007194519043, + "step": 11505 + }, + { + "epoch": 0.3879470154032829, + "grad_norm": 24.400299072265625, + "learning_rate": 7.67994850003562e-07, + "logits/chosen": -1.1598316431045532, + "logits/rejected": -1.2359213829040527, + "logps/chosen": -1.6177875995635986, + "logps/rejected": -1.597001075744629, + "loss": 4.4583, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.177875518798828, + "rewards/margins": -0.20786476135253906, + "rewards/rejected": -15.970010757446289, + "step": 11510 + }, + { + "epoch": 0.3881155414742661, + "grad_norm": 19.435991287231445, + "learning_rate": 7.677464880193173e-07, + "logits/chosen": -1.1459027528762817, + "logits/rejected": -1.52254319190979, + "logps/chosen": -1.9466493129730225, + "logps/rejected": -2.2855138778686523, + "loss": 2.229, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.466493606567383, + "rewards/margins": 3.388643980026245, + "rewards/rejected": -22.85513687133789, + "step": 11515 + }, + { + "epoch": 0.38828406754524925, + "grad_norm": 14.826272010803223, + "learning_rate": 7.67498033378562e-07, + "logits/chosen": -0.6543978452682495, + "logits/rejected": -0.8185670971870422, + "logps/chosen": -1.837378740310669, + "logps/rejected": -1.9322576522827148, + "loss": 2.5036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.37378692626953, + "rewards/margins": 0.9487897753715515, + "rewards/rejected": -19.322574615478516, + "step": 11520 + }, + { + "epoch": 0.3884525936162324, + "grad_norm": 28.684703826904297, + "learning_rate": 7.672494861672763e-07, + "logits/chosen": -0.9026684761047363, + "logits/rejected": -0.8441116213798523, + "logps/chosen": -2.0589985847473145, + "logps/rejected": -1.9286243915557861, + "loss": 4.411, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.589984893798828, + "rewards/margins": -1.303741693496704, + "rewards/rejected": -19.286243438720703, + "step": 11525 + }, + { + "epoch": 0.38862111968721563, + "grad_norm": 74.1058120727539, + "learning_rate": 7.670008464714725e-07, + "logits/chosen": -1.1678255796432495, + "logits/rejected": -1.188652753829956, + "logps/chosen": -1.9451920986175537, + "logps/rejected": -1.9647331237792969, + "loss": 3.2948, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.451923370361328, + "rewards/margins": 0.19540786743164062, + "rewards/rejected": -19.647327423095703, + "step": 11530 + }, + { + "epoch": 0.3887896457581988, + "grad_norm": 36.787261962890625, + "learning_rate": 7.667521143771954e-07, + "logits/chosen": -1.0473954677581787, + "logits/rejected": -1.2901098728179932, + "logps/chosen": -1.8299624919891357, + "logps/rejected": -2.346789598464966, + "loss": 1.9515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.299625396728516, + "rewards/margins": 5.168271064758301, + "rewards/rejected": -23.4678955078125, + "step": 11535 + }, + { + "epoch": 0.38895817182918196, + "grad_norm": 38.38254928588867, + "learning_rate": 7.665032899705211e-07, + "logits/chosen": -1.1778606176376343, + "logits/rejected": -1.1312202215194702, + "logps/chosen": -2.3744866847991943, + "logps/rejected": -2.650757312774658, + "loss": 2.4871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.7448673248291, + "rewards/margins": 2.762705087661743, + "rewards/rejected": -26.507572174072266, + "step": 11540 + }, + { + "epoch": 0.38912669790016513, + "grad_norm": 20.765256881713867, + "learning_rate": 7.662543733375577e-07, + "logits/chosen": -1.1510334014892578, + "logits/rejected": -1.017518401145935, + "logps/chosen": -1.8481746912002563, + "logps/rejected": -1.763033151626587, + "loss": 4.0422, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.481746673583984, + "rewards/margins": -0.8514149785041809, + "rewards/rejected": -17.63033103942871, + "step": 11545 + }, + { + "epoch": 0.38929522397114835, + "grad_norm": 25.51265525817871, + "learning_rate": 7.66005364564446e-07, + "logits/chosen": -1.084987759590149, + "logits/rejected": -1.028355360031128, + "logps/chosen": -2.0981199741363525, + "logps/rejected": -2.294259548187256, + "loss": 2.1252, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.981199264526367, + "rewards/margins": 1.961395263671875, + "rewards/rejected": -22.942594528198242, + "step": 11550 + }, + { + "epoch": 0.3894637500421315, + "grad_norm": 15.384166717529297, + "learning_rate": 7.657562637373577e-07, + "logits/chosen": -1.0935142040252686, + "logits/rejected": -0.9374657869338989, + "logps/chosen": -1.8439613580703735, + "logps/rejected": -1.926548719406128, + "loss": 2.879, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.439613342285156, + "rewards/margins": 0.8258728981018066, + "rewards/rejected": -19.265485763549805, + "step": 11555 + }, + { + "epoch": 0.3896322761131147, + "grad_norm": 27.608064651489258, + "learning_rate": 7.655070709424969e-07, + "logits/chosen": -1.1121046543121338, + "logits/rejected": -1.1762913465499878, + "logps/chosen": -1.8197288513183594, + "logps/rejected": -1.8982279300689697, + "loss": 3.2518, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.197288513183594, + "rewards/margins": 0.7849894762039185, + "rewards/rejected": -18.98227882385254, + "step": 11560 + }, + { + "epoch": 0.3898008021840979, + "grad_norm": 24.960180282592773, + "learning_rate": 7.652577862660994e-07, + "logits/chosen": -0.9853776097297668, + "logits/rejected": -1.296339750289917, + "logps/chosen": -1.872950553894043, + "logps/rejected": -2.0671050548553467, + "loss": 2.1694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.729503631591797, + "rewards/margins": 1.9415457248687744, + "rewards/rejected": -20.671052932739258, + "step": 11565 + }, + { + "epoch": 0.38996932825508107, + "grad_norm": 5.3271379470825195, + "learning_rate": 7.650084097944327e-07, + "logits/chosen": -1.2342129945755005, + "logits/rejected": -1.6344501972198486, + "logps/chosen": -1.9482825994491577, + "logps/rejected": -2.2002644538879395, + "loss": 3.2104, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.482826232910156, + "rewards/margins": 2.5198206901550293, + "rewards/rejected": -22.00264549255371, + "step": 11570 + }, + { + "epoch": 0.39013785432606424, + "grad_norm": 18.309335708618164, + "learning_rate": 7.647589416137965e-07, + "logits/chosen": -1.1587319374084473, + "logits/rejected": -1.1315759420394897, + "logps/chosen": -2.09301495552063, + "logps/rejected": -2.0794355869293213, + "loss": 3.4546, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.93014907836914, + "rewards/margins": -0.13579444587230682, + "rewards/rejected": -20.794353485107422, + "step": 11575 + }, + { + "epoch": 0.3903063803970474, + "grad_norm": 41.83700942993164, + "learning_rate": 7.645093818105215e-07, + "logits/chosen": -0.998862624168396, + "logits/rejected": -0.9568171501159668, + "logps/chosen": -1.9338080883026123, + "logps/rejected": -2.008169651031494, + "loss": 2.6539, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.338083267211914, + "rewards/margins": 0.7436147928237915, + "rewards/rejected": -20.081695556640625, + "step": 11580 + }, + { + "epoch": 0.3904749064680306, + "grad_norm": 32.795169830322266, + "learning_rate": 7.642597304709708e-07, + "logits/chosen": -1.2452964782714844, + "logits/rejected": -1.3635139465332031, + "logps/chosen": -2.2852675914764404, + "logps/rejected": -2.5104193687438965, + "loss": 2.4394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.852676391601562, + "rewards/margins": 2.2515177726745605, + "rewards/rejected": -25.104196548461914, + "step": 11585 + }, + { + "epoch": 0.3906434325390138, + "grad_norm": 199.022705078125, + "learning_rate": 7.640099876815388e-07, + "logits/chosen": -0.74217689037323, + "logits/rejected": -1.0585310459136963, + "logps/chosen": -2.1841442584991455, + "logps/rejected": -2.2506299018859863, + "loss": 2.7874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.841442108154297, + "rewards/margins": 0.6648585200309753, + "rewards/rejected": -22.50629997253418, + "step": 11590 + }, + { + "epoch": 0.39081195860999696, + "grad_norm": 32.575645446777344, + "learning_rate": 7.637601535286516e-07, + "logits/chosen": -0.9163684844970703, + "logits/rejected": -1.3957068920135498, + "logps/chosen": -2.022493839263916, + "logps/rejected": -2.0056674480438232, + "loss": 3.5664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.224937438964844, + "rewards/margins": -0.1682640016078949, + "rewards/rejected": -20.05667495727539, + "step": 11595 + }, + { + "epoch": 0.3909804846809801, + "grad_norm": 20.356990814208984, + "learning_rate": 7.635102280987671e-07, + "logits/chosen": -0.9212282299995422, + "logits/rejected": -0.7368286848068237, + "logps/chosen": -2.0380892753601074, + "logps/rejected": -2.2226696014404297, + "loss": 2.8106, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.38089370727539, + "rewards/margins": 1.8458038568496704, + "rewards/rejected": -22.226696014404297, + "step": 11600 + }, + { + "epoch": 0.3909804846809801, + "eval_logits/chosen": -1.420649528503418, + "eval_logits/rejected": -1.5138413906097412, + "eval_logps/chosen": -1.9176331758499146, + "eval_logps/rejected": -1.999894380569458, + "eval_loss": 3.043593406677246, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -19.176332473754883, + "eval_rewards/margins": 0.8226120471954346, + "eval_rewards/rejected": -19.998943328857422, + "eval_runtime": 12.8886, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 1.94, + "step": 11600 + }, + { + "epoch": 0.39114901075196334, + "grad_norm": 20.27068328857422, + "learning_rate": 7.632602114783744e-07, + "logits/chosen": -1.3960545063018799, + "logits/rejected": -1.362420678138733, + "logps/chosen": -1.963235855102539, + "logps/rejected": -1.9808452129364014, + "loss": 3.4921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.63235855102539, + "rewards/margins": 0.17609205842018127, + "rewards/rejected": -19.80845069885254, + "step": 11605 + }, + { + "epoch": 0.3913175368229465, + "grad_norm": 29.96604347229004, + "learning_rate": 7.630101037539947e-07, + "logits/chosen": -1.1657246351242065, + "logits/rejected": -1.2024915218353271, + "logps/chosen": -1.7683417797088623, + "logps/rejected": -1.9547497034072876, + "loss": 2.4799, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.683420181274414, + "rewards/margins": 1.864076018333435, + "rewards/rejected": -19.547494888305664, + "step": 11610 + }, + { + "epoch": 0.3914860628939297, + "grad_norm": 19.694589614868164, + "learning_rate": 7.627599050121803e-07, + "logits/chosen": -1.076236367225647, + "logits/rejected": -1.0020965337753296, + "logps/chosen": -1.7998813390731812, + "logps/rejected": -1.9322502613067627, + "loss": 2.5581, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.99881362915039, + "rewards/margins": 1.32368803024292, + "rewards/rejected": -19.322500228881836, + "step": 11615 + }, + { + "epoch": 0.3916545889649129, + "grad_norm": 29.93548011779785, + "learning_rate": 7.625096153395149e-07, + "logits/chosen": -1.3330743312835693, + "logits/rejected": -1.4957467317581177, + "logps/chosen": -1.6043914556503296, + "logps/rejected": -1.6570093631744385, + "loss": 2.6907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.043914794921875, + "rewards/margins": 0.5261794328689575, + "rewards/rejected": -16.57009506225586, + "step": 11620 + }, + { + "epoch": 0.39182311503589606, + "grad_norm": 16.487661361694336, + "learning_rate": 7.622592348226142e-07, + "logits/chosen": -1.1505143642425537, + "logits/rejected": -1.137687087059021, + "logps/chosen": -1.5276297330856323, + "logps/rejected": -1.5988701581954956, + "loss": 2.6423, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.276298522949219, + "rewards/margins": 0.7124035954475403, + "rewards/rejected": -15.988700866699219, + "step": 11625 + }, + { + "epoch": 0.39199164110687923, + "grad_norm": 27.717079162597656, + "learning_rate": 7.62008763548125e-07, + "logits/chosen": -0.8346315622329712, + "logits/rejected": -1.1313467025756836, + "logps/chosen": -1.8140102624893188, + "logps/rejected": -2.153585910797119, + "loss": 2.2458, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.140100479125977, + "rewards/margins": 3.395756483078003, + "rewards/rejected": -21.535858154296875, + "step": 11630 + }, + { + "epoch": 0.3921601671778624, + "grad_norm": 18.077486038208008, + "learning_rate": 7.617582016027253e-07, + "logits/chosen": -0.8229688405990601, + "logits/rejected": -0.9524946212768555, + "logps/chosen": -1.5641162395477295, + "logps/rejected": -1.6118942499160767, + "loss": 2.71, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.641161918640137, + "rewards/margins": 0.4777793884277344, + "rewards/rejected": -16.118942260742188, + "step": 11635 + }, + { + "epoch": 0.3923286932488456, + "grad_norm": 21.34409523010254, + "learning_rate": 7.615075490731249e-07, + "logits/chosen": -0.8470916748046875, + "logits/rejected": -1.0112509727478027, + "logps/chosen": -2.080671787261963, + "logps/rejected": -2.3466696739196777, + "loss": 3.2121, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.806716918945312, + "rewards/margins": 2.6599812507629395, + "rewards/rejected": -23.466699600219727, + "step": 11640 + }, + { + "epoch": 0.3924972193198288, + "grad_norm": 12.055615425109863, + "learning_rate": 7.612568060460649e-07, + "logits/chosen": -1.058789610862732, + "logits/rejected": -1.077118158340454, + "logps/chosen": -1.869264006614685, + "logps/rejected": -2.0772647857666016, + "loss": 2.7164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.69264030456543, + "rewards/margins": 2.080007553100586, + "rewards/rejected": -20.772647857666016, + "step": 11645 + }, + { + "epoch": 0.39266574539081195, + "grad_norm": 22.117664337158203, + "learning_rate": 7.610059726083174e-07, + "logits/chosen": -0.7841507196426392, + "logits/rejected": -0.8706964254379272, + "logps/chosen": -2.1044468879699707, + "logps/rejected": -2.599332094192505, + "loss": 2.7502, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.04446792602539, + "rewards/margins": 4.948855400085449, + "rewards/rejected": -25.993322372436523, + "step": 11650 + }, + { + "epoch": 0.3928342714617951, + "grad_norm": 30.5588321685791, + "learning_rate": 7.60755048846686e-07, + "logits/chosen": -0.9382207989692688, + "logits/rejected": -0.816710352897644, + "logps/chosen": -1.7673962116241455, + "logps/rejected": -1.7424449920654297, + "loss": 3.5536, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.673961639404297, + "rewards/margins": -0.24951085448265076, + "rewards/rejected": -17.424449920654297, + "step": 11655 + }, + { + "epoch": 0.39300279753277834, + "grad_norm": 22.62909507751465, + "learning_rate": 7.605040348480054e-07, + "logits/chosen": -1.1503394842147827, + "logits/rejected": -1.2069244384765625, + "logps/chosen": -2.2256691455841064, + "logps/rejected": -2.470478057861328, + "loss": 3.4489, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.25669288635254, + "rewards/margins": 2.448086977005005, + "rewards/rejected": -24.70477867126465, + "step": 11660 + }, + { + "epoch": 0.3931713236037615, + "grad_norm": 24.27674674987793, + "learning_rate": 7.602529306991418e-07, + "logits/chosen": -1.1156190633773804, + "logits/rejected": -1.256882905960083, + "logps/chosen": -2.3534419536590576, + "logps/rejected": -2.3057656288146973, + "loss": 3.8305, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.534420013427734, + "rewards/margins": -0.47676581144332886, + "rewards/rejected": -23.057653427124023, + "step": 11665 + }, + { + "epoch": 0.39333984967474467, + "grad_norm": 34.294864654541016, + "learning_rate": 7.600017364869926e-07, + "logits/chosen": -1.274646520614624, + "logits/rejected": -1.1494067907333374, + "logps/chosen": -2.619175672531128, + "logps/rejected": -2.60260272026062, + "loss": 3.4273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.191753387451172, + "rewards/margins": -0.1657283753156662, + "rewards/rejected": -26.026025772094727, + "step": 11670 + }, + { + "epoch": 0.3935083757457279, + "grad_norm": 45.172237396240234, + "learning_rate": 7.59750452298486e-07, + "logits/chosen": -0.9087691307067871, + "logits/rejected": -1.0363165140151978, + "logps/chosen": -2.048802375793457, + "logps/rejected": -2.136763095855713, + "loss": 2.8038, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.48802375793457, + "rewards/margins": 0.8796059489250183, + "rewards/rejected": -21.367630004882812, + "step": 11675 + }, + { + "epoch": 0.39367690181671106, + "grad_norm": 142.33331298828125, + "learning_rate": 7.594990782205817e-07, + "logits/chosen": -0.9688955545425415, + "logits/rejected": -1.3358403444290161, + "logps/chosen": -2.2000503540039062, + "logps/rejected": -2.168553352355957, + "loss": 4.0627, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.00050163269043, + "rewards/margins": -0.31496915221214294, + "rewards/rejected": -21.68553352355957, + "step": 11680 + }, + { + "epoch": 0.3938454278876942, + "grad_norm": 48.449283599853516, + "learning_rate": 7.592476143402702e-07, + "logits/chosen": -1.389552116394043, + "logits/rejected": -1.2141063213348389, + "logps/chosen": -2.1961960792541504, + "logps/rejected": -2.036904811859131, + "loss": 4.7629, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.961963653564453, + "rewards/margins": -1.5929114818572998, + "rewards/rejected": -20.369050979614258, + "step": 11685 + }, + { + "epoch": 0.3940139539586774, + "grad_norm": 15.973644256591797, + "learning_rate": 7.589960607445734e-07, + "logits/chosen": -0.7461063265800476, + "logits/rejected": -1.0605086088180542, + "logps/chosen": -2.184495449066162, + "logps/rejected": -2.7327818870544434, + "loss": 1.8069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.844953536987305, + "rewards/margins": 5.482865333557129, + "rewards/rejected": -27.32781982421875, + "step": 11690 + }, + { + "epoch": 0.3941824800296606, + "grad_norm": 15.492846488952637, + "learning_rate": 7.587444175205439e-07, + "logits/chosen": -1.1022851467132568, + "logits/rejected": -1.0290096998214722, + "logps/chosen": -2.023465633392334, + "logps/rejected": -2.073293924331665, + "loss": 2.9022, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.234655380249023, + "rewards/margins": 0.49828261137008667, + "rewards/rejected": -20.732938766479492, + "step": 11695 + }, + { + "epoch": 0.3943510061006438, + "grad_norm": 22.443706512451172, + "learning_rate": 7.584926847552656e-07, + "logits/chosen": -1.0928064584732056, + "logits/rejected": -0.9536017179489136, + "logps/chosen": -1.8068549633026123, + "logps/rejected": -1.7727839946746826, + "loss": 3.444, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.06855010986328, + "rewards/margins": -0.3407100737094879, + "rewards/rejected": -17.727840423583984, + "step": 11700 + }, + { + "epoch": 0.39451953217162694, + "grad_norm": 105.63274383544922, + "learning_rate": 7.582408625358534e-07, + "logits/chosen": -0.6659219264984131, + "logits/rejected": -0.7687760591506958, + "logps/chosen": -2.302253007888794, + "logps/rejected": -2.3893723487854004, + "loss": 3.26, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.02252960205078, + "rewards/margins": 0.8711929321289062, + "rewards/rejected": -23.893722534179688, + "step": 11705 + }, + { + "epoch": 0.3946880582426101, + "grad_norm": 9.452350616455078, + "learning_rate": 7.579889509494528e-07, + "logits/chosen": -0.9181884527206421, + "logits/rejected": -1.3316329717636108, + "logps/chosen": -1.9574193954467773, + "logps/rejected": -2.146073579788208, + "loss": 2.8546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.574193954467773, + "rewards/margins": 1.8865394592285156, + "rewards/rejected": -21.460735321044922, + "step": 11710 + }, + { + "epoch": 0.39485658431359333, + "grad_norm": 41.55025100708008, + "learning_rate": 7.577369500832408e-07, + "logits/chosen": -0.987427830696106, + "logits/rejected": -1.0094937086105347, + "logps/chosen": -2.231602907180786, + "logps/rejected": -2.1909897327423096, + "loss": 3.7208, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.316028594970703, + "rewards/margins": -0.4061313569545746, + "rewards/rejected": -21.909896850585938, + "step": 11715 + }, + { + "epoch": 0.3950251103845765, + "grad_norm": 22.197267532348633, + "learning_rate": 7.574848600244249e-07, + "logits/chosen": -1.7265431880950928, + "logits/rejected": -1.9097391366958618, + "logps/chosen": -1.936183214187622, + "logps/rejected": -2.317960262298584, + "loss": 2.1239, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.361831665039062, + "rewards/margins": 3.8177738189697266, + "rewards/rejected": -23.17960548400879, + "step": 11720 + }, + { + "epoch": 0.39519363645555966, + "grad_norm": 102.47838592529297, + "learning_rate": 7.572326808602433e-07, + "logits/chosen": -1.2377512454986572, + "logits/rejected": -1.3128893375396729, + "logps/chosen": -2.288329601287842, + "logps/rejected": -2.1615147590637207, + "loss": 4.3777, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.8832950592041, + "rewards/margins": -1.2681492567062378, + "rewards/rejected": -21.61514663696289, + "step": 11725 + }, + { + "epoch": 0.3953621625265429, + "grad_norm": 20.766952514648438, + "learning_rate": 7.569804126779653e-07, + "logits/chosen": -1.397312879562378, + "logits/rejected": -1.6373573541641235, + "logps/chosen": -1.943434476852417, + "logps/rejected": -2.0884037017822266, + "loss": 2.0901, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.43434715270996, + "rewards/margins": 1.4496896266937256, + "rewards/rejected": -20.884037017822266, + "step": 11730 + }, + { + "epoch": 0.39553068859752605, + "grad_norm": 37.036781311035156, + "learning_rate": 7.567280555648914e-07, + "logits/chosen": -1.0928919315338135, + "logits/rejected": -1.3050086498260498, + "logps/chosen": -1.9236781597137451, + "logps/rejected": -2.1296448707580566, + "loss": 1.8989, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.23678207397461, + "rewards/margins": 2.059666395187378, + "rewards/rejected": -21.296449661254883, + "step": 11735 + }, + { + "epoch": 0.3956992146685092, + "grad_norm": 29.56365966796875, + "learning_rate": 7.564756096083519e-07, + "logits/chosen": -0.8099555969238281, + "logits/rejected": -0.937663197517395, + "logps/chosen": -1.5575193166732788, + "logps/rejected": -1.8885425329208374, + "loss": 2.7032, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.575193405151367, + "rewards/margins": 3.3102316856384277, + "rewards/rejected": -18.885425567626953, + "step": 11740 + }, + { + "epoch": 0.3958677407394924, + "grad_norm": 30.615787506103516, + "learning_rate": 7.562230748957086e-07, + "logits/chosen": -1.2532612085342407, + "logits/rejected": -1.6562509536743164, + "logps/chosen": -1.9327198266983032, + "logps/rejected": -1.9758819341659546, + "loss": 3.1936, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.327198028564453, + "rewards/margins": 0.4316198229789734, + "rewards/rejected": -19.758817672729492, + "step": 11745 + }, + { + "epoch": 0.3960362668104756, + "grad_norm": 17.918991088867188, + "learning_rate": 7.559704515143541e-07, + "logits/chosen": -0.8716901540756226, + "logits/rejected": -1.0966124534606934, + "logps/chosen": -2.116118907928467, + "logps/rejected": -2.3539252281188965, + "loss": 1.909, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.161190032958984, + "rewards/margins": 2.3780627250671387, + "rewards/rejected": -23.53925132751465, + "step": 11750 + }, + { + "epoch": 0.39620479288145877, + "grad_norm": 26.732141494750977, + "learning_rate": 7.557177395517111e-07, + "logits/chosen": -0.8900313377380371, + "logits/rejected": -1.020237684249878, + "logps/chosen": -2.0022330284118652, + "logps/rejected": -1.9952160120010376, + "loss": 3.2698, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.02233123779297, + "rewards/margins": -0.07017116248607635, + "rewards/rejected": -19.952159881591797, + "step": 11755 + }, + { + "epoch": 0.39637331895244193, + "grad_norm": 17.14510726928711, + "learning_rate": 7.554649390952333e-07, + "logits/chosen": -1.1153548955917358, + "logits/rejected": -1.2645814418792725, + "logps/chosen": -2.1429028511047363, + "logps/rejected": -2.2042899131774902, + "loss": 2.8233, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.429027557373047, + "rewards/margins": 0.6138709187507629, + "rewards/rejected": -22.042898178100586, + "step": 11760 + }, + { + "epoch": 0.3965418450234251, + "grad_norm": 42.62935256958008, + "learning_rate": 7.552120502324048e-07, + "logits/chosen": -1.1245473623275757, + "logits/rejected": -1.1443026065826416, + "logps/chosen": -1.7413899898529053, + "logps/rejected": -1.7725694179534912, + "loss": 3.104, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.413898468017578, + "rewards/margins": 0.31179675459861755, + "rewards/rejected": -17.72569465637207, + "step": 11765 + }, + { + "epoch": 0.3967103710944083, + "grad_norm": 0.033805444836616516, + "learning_rate": 7.549590730507409e-07, + "logits/chosen": -1.0588490962982178, + "logits/rejected": -1.0548990964889526, + "logps/chosen": -1.8918966054916382, + "logps/rejected": -2.3487842082977295, + "loss": 1.0045, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.91896629333496, + "rewards/margins": 4.568876266479492, + "rewards/rejected": -23.487842559814453, + "step": 11770 + }, + { + "epoch": 0.3968788971653915, + "grad_norm": 16.685335159301758, + "learning_rate": 7.547060076377868e-07, + "logits/chosen": -0.6314164400100708, + "logits/rejected": -0.7681409120559692, + "logps/chosen": -1.8253930807113647, + "logps/rejected": -1.9405262470245361, + "loss": 2.4047, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.25393295288086, + "rewards/margins": 1.1513316631317139, + "rewards/rejected": -19.405263900756836, + "step": 11775 + }, + { + "epoch": 0.39704742323637465, + "grad_norm": 37.280887603759766, + "learning_rate": 7.544528540811183e-07, + "logits/chosen": -1.0419279336929321, + "logits/rejected": -0.9182440638542175, + "logps/chosen": -2.2263355255126953, + "logps/rejected": -2.186289072036743, + "loss": 3.752, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.263357162475586, + "rewards/margins": -0.40046462416648865, + "rewards/rejected": -21.862892150878906, + "step": 11780 + }, + { + "epoch": 0.3972159493073579, + "grad_norm": 13.038125991821289, + "learning_rate": 7.541996124683423e-07, + "logits/chosen": -1.2539036273956299, + "logits/rejected": -1.3483214378356934, + "logps/chosen": -2.142634153366089, + "logps/rejected": -2.682434558868408, + "loss": 1.4145, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.426342010498047, + "rewards/margins": 5.398002624511719, + "rewards/rejected": -26.824344635009766, + "step": 11785 + }, + { + "epoch": 0.39738447537834104, + "grad_norm": 36.69297790527344, + "learning_rate": 7.539462828870953e-07, + "logits/chosen": -0.9368730783462524, + "logits/rejected": -1.0406259298324585, + "logps/chosen": -1.7773082256317139, + "logps/rejected": -2.0813655853271484, + "loss": 1.8501, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.773082733154297, + "rewards/margins": 3.04057240486145, + "rewards/rejected": -20.813655853271484, + "step": 11790 + }, + { + "epoch": 0.3975530014493242, + "grad_norm": 17.663898468017578, + "learning_rate": 7.53692865425045e-07, + "logits/chosen": -0.9241794347763062, + "logits/rejected": -0.6411947011947632, + "logps/chosen": -1.8682501316070557, + "logps/rejected": -2.117549419403076, + "loss": 3.3426, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.6825008392334, + "rewards/margins": 2.4929943084716797, + "rewards/rejected": -21.175495147705078, + "step": 11795 + }, + { + "epoch": 0.39772152752030737, + "grad_norm": 17.26961898803711, + "learning_rate": 7.53439360169889e-07, + "logits/chosen": -0.8073530197143555, + "logits/rejected": -0.9707645177841187, + "logps/chosen": -2.1161324977874756, + "logps/rejected": -2.29624605178833, + "loss": 2.9547, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.161327362060547, + "rewards/margins": 1.8011353015899658, + "rewards/rejected": -22.96246337890625, + "step": 11800 + }, + { + "epoch": 0.3978900535912906, + "grad_norm": 17.637361526489258, + "learning_rate": 7.531857672093556e-07, + "logits/chosen": -1.259456992149353, + "logits/rejected": -1.3594366312026978, + "logps/chosen": -1.818107008934021, + "logps/rejected": -2.2726337909698486, + "loss": 2.2051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.18107032775879, + "rewards/margins": 4.5452680587768555, + "rewards/rejected": -22.726337432861328, + "step": 11805 + }, + { + "epoch": 0.39805857966227376, + "grad_norm": 34.132537841796875, + "learning_rate": 7.529320866312032e-07, + "logits/chosen": -1.1170294284820557, + "logits/rejected": -1.1737979650497437, + "logps/chosen": -1.8334858417510986, + "logps/rejected": -1.8769190311431885, + "loss": 4.1332, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.334856033325195, + "rewards/margins": 0.4343327581882477, + "rewards/rejected": -18.769189834594727, + "step": 11810 + }, + { + "epoch": 0.3982271057332569, + "grad_norm": 11.358888626098633, + "learning_rate": 7.526783185232207e-07, + "logits/chosen": -0.5799717307090759, + "logits/rejected": -0.7505512237548828, + "logps/chosen": -2.1831164360046387, + "logps/rejected": -2.3155195713043213, + "loss": 2.3181, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.831165313720703, + "rewards/margins": 1.3240302801132202, + "rewards/rejected": -23.155197143554688, + "step": 11815 + }, + { + "epoch": 0.3983956318042401, + "grad_norm": 31.22028923034668, + "learning_rate": 7.524244629732275e-07, + "logits/chosen": -1.0025193691253662, + "logits/rejected": -1.0711981058120728, + "logps/chosen": -1.750725507736206, + "logps/rejected": -1.9163103103637695, + "loss": 2.8004, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.50725746154785, + "rewards/margins": 1.6558473110198975, + "rewards/rejected": -19.163105010986328, + "step": 11820 + }, + { + "epoch": 0.3985641578752233, + "grad_norm": 27.10166358947754, + "learning_rate": 7.521705200690727e-07, + "logits/chosen": -1.5414470434188843, + "logits/rejected": -1.6252628564834595, + "logps/chosen": -2.156869411468506, + "logps/rejected": -2.4458861351013184, + "loss": 3.313, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.568696975708008, + "rewards/margins": 2.8901631832122803, + "rewards/rejected": -24.458858489990234, + "step": 11825 + }, + { + "epoch": 0.3987326839462065, + "grad_norm": 31.27543067932129, + "learning_rate": 7.519164898986358e-07, + "logits/chosen": -0.794817328453064, + "logits/rejected": -0.8538041114807129, + "logps/chosen": -1.9069769382476807, + "logps/rejected": -2.101755142211914, + "loss": 2.3943, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.06976890563965, + "rewards/margins": 1.9477818012237549, + "rewards/rejected": -21.01755142211914, + "step": 11830 + }, + { + "epoch": 0.39890121001718964, + "grad_norm": 61.70077896118164, + "learning_rate": 7.516623725498272e-07, + "logits/chosen": -0.8156031370162964, + "logits/rejected": -1.2082624435424805, + "logps/chosen": -2.378671884536743, + "logps/rejected": -3.244913101196289, + "loss": 1.4, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.78671646118164, + "rewards/margins": 8.662416458129883, + "rewards/rejected": -32.449134826660156, + "step": 11835 + }, + { + "epoch": 0.39906973608817287, + "grad_norm": 18.984455108642578, + "learning_rate": 7.514081681105864e-07, + "logits/chosen": -1.3033349514007568, + "logits/rejected": -1.254258394241333, + "logps/chosen": -1.8369280099868774, + "logps/rejected": -1.799774169921875, + "loss": 4.5203, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.369279861450195, + "rewards/margins": -0.3715387284755707, + "rewards/rejected": -17.997739791870117, + "step": 11840 + }, + { + "epoch": 0.39923826215915603, + "grad_norm": 24.976802825927734, + "learning_rate": 7.511538766688838e-07, + "logits/chosen": -1.1798292398452759, + "logits/rejected": -1.2531414031982422, + "logps/chosen": -1.946840524673462, + "logps/rejected": -2.2215023040771484, + "loss": 2.4009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.46840476989746, + "rewards/margins": 2.7466187477111816, + "rewards/rejected": -22.215023040771484, + "step": 11845 + }, + { + "epoch": 0.3994067882301392, + "grad_norm": 26.44159507751465, + "learning_rate": 7.508994983127194e-07, + "logits/chosen": -0.9524277448654175, + "logits/rejected": -1.117555022239685, + "logps/chosen": -2.0292279720306396, + "logps/rejected": -2.4175662994384766, + "loss": 2.1058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.292278289794922, + "rewards/margins": 3.8833839893341064, + "rewards/rejected": -24.175662994384766, + "step": 11850 + }, + { + "epoch": 0.39957531430112236, + "grad_norm": 21.922222137451172, + "learning_rate": 7.506450331301237e-07, + "logits/chosen": -1.2023600339889526, + "logits/rejected": -1.229527235031128, + "logps/chosen": -1.674254059791565, + "logps/rejected": -1.7711979150772095, + "loss": 2.8766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.742542266845703, + "rewards/margins": 0.9694374799728394, + "rewards/rejected": -17.711978912353516, + "step": 11855 + }, + { + "epoch": 0.3997438403721056, + "grad_norm": 34.8781623840332, + "learning_rate": 7.503904812091572e-07, + "logits/chosen": -1.010766863822937, + "logits/rejected": -1.1134612560272217, + "logps/chosen": -2.0631964206695557, + "logps/rejected": -1.935153603553772, + "loss": 4.7799, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -20.6319637298584, + "rewards/margins": -1.2804267406463623, + "rewards/rejected": -19.35153579711914, + "step": 11860 + }, + { + "epoch": 0.39991236644308875, + "grad_norm": 23.074020385742188, + "learning_rate": 7.501358426379101e-07, + "logits/chosen": -1.3153374195098877, + "logits/rejected": -1.2573680877685547, + "logps/chosen": -2.031836986541748, + "logps/rejected": -1.9721667766571045, + "loss": 3.8169, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.318370819091797, + "rewards/margins": -0.5966991186141968, + "rewards/rejected": -19.721668243408203, + "step": 11865 + }, + { + "epoch": 0.4000808925140719, + "grad_norm": 18.008769989013672, + "learning_rate": 7.498811175045028e-07, + "logits/chosen": -0.9966435432434082, + "logits/rejected": -0.9778487086296082, + "logps/chosen": -2.4542758464813232, + "logps/rejected": -2.3798739910125732, + "loss": 4.6834, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.54275894165039, + "rewards/margins": -0.7440201640129089, + "rewards/rejected": -23.79874038696289, + "step": 11870 + }, + { + "epoch": 0.4002494185850551, + "grad_norm": 19.766666412353516, + "learning_rate": 7.496263058970855e-07, + "logits/chosen": -1.0324809551239014, + "logits/rejected": -1.2260249853134155, + "logps/chosen": -1.6027014255523682, + "logps/rejected": -1.7053003311157227, + "loss": 2.4804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.027013778686523, + "rewards/margins": 1.025989055633545, + "rewards/rejected": -17.053003311157227, + "step": 11875 + }, + { + "epoch": 0.4004179446560383, + "grad_norm": 23.002748489379883, + "learning_rate": 7.493714079038388e-07, + "logits/chosen": -0.8100983500480652, + "logits/rejected": -1.1202142238616943, + "logps/chosen": -2.186525821685791, + "logps/rejected": -2.3713173866271973, + "loss": 2.4701, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.86526107788086, + "rewards/margins": 1.8479118347167969, + "rewards/rejected": -23.713171005249023, + "step": 11880 + }, + { + "epoch": 0.40058647072702147, + "grad_norm": 22.952287673950195, + "learning_rate": 7.491164236129726e-07, + "logits/chosen": -1.3353387117385864, + "logits/rejected": -1.3661134243011475, + "logps/chosen": -1.7970765829086304, + "logps/rejected": -2.0575413703918457, + "loss": 3.5522, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.970767974853516, + "rewards/margins": 2.6046481132507324, + "rewards/rejected": -20.57541275024414, + "step": 11885 + }, + { + "epoch": 0.40075499679800464, + "grad_norm": 14.258259773254395, + "learning_rate": 7.48861353112727e-07, + "logits/chosen": -0.8800445795059204, + "logits/rejected": -1.0589618682861328, + "logps/chosen": -1.7092921733856201, + "logps/rejected": -1.914419412612915, + "loss": 2.247, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.09292221069336, + "rewards/margins": 2.051269054412842, + "rewards/rejected": -19.14419174194336, + "step": 11890 + }, + { + "epoch": 0.40092352286898786, + "grad_norm": 26.45533561706543, + "learning_rate": 7.486061964913719e-07, + "logits/chosen": -1.2647030353546143, + "logits/rejected": -1.3658673763275146, + "logps/chosen": -1.7642624378204346, + "logps/rejected": -1.9021352529525757, + "loss": 2.7179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.642620086669922, + "rewards/margins": 1.3787298202514648, + "rewards/rejected": -19.021352767944336, + "step": 11895 + }, + { + "epoch": 0.401092048939971, + "grad_norm": 17.83465003967285, + "learning_rate": 7.483509538372067e-07, + "logits/chosen": -1.247041940689087, + "logits/rejected": -1.282518744468689, + "logps/chosen": -2.0411016941070557, + "logps/rejected": -2.189762830734253, + "loss": 2.4295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.4110164642334, + "rewards/margins": 1.486612319946289, + "rewards/rejected": -21.897628784179688, + "step": 11900 + }, + { + "epoch": 0.4012605750109542, + "grad_norm": 19.71072769165039, + "learning_rate": 7.480956252385612e-07, + "logits/chosen": -0.6247833371162415, + "logits/rejected": -0.8317705392837524, + "logps/chosen": -2.314605951309204, + "logps/rejected": -2.5528736114501953, + "loss": 1.5118, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.146060943603516, + "rewards/margins": 2.3826773166656494, + "rewards/rejected": -25.528738021850586, + "step": 11905 + }, + { + "epoch": 0.40142910108193736, + "grad_norm": 28.8355770111084, + "learning_rate": 7.478402107837942e-07, + "logits/chosen": -1.1492893695831299, + "logits/rejected": -1.1314630508422852, + "logps/chosen": -1.678655982017517, + "logps/rejected": -1.749389886856079, + "loss": 2.5565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.786558151245117, + "rewards/margins": 0.7073384523391724, + "rewards/rejected": -17.493896484375, + "step": 11910 + }, + { + "epoch": 0.4015976271529206, + "grad_norm": 20.623319625854492, + "learning_rate": 7.47584710561295e-07, + "logits/chosen": -1.0180232524871826, + "logits/rejected": -1.1039505004882812, + "logps/chosen": -1.974373459815979, + "logps/rejected": -2.00376558303833, + "loss": 3.6947, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.743732452392578, + "rewards/margins": 0.2939223349094391, + "rewards/rejected": -20.037654876708984, + "step": 11915 + }, + { + "epoch": 0.40176615322390374, + "grad_norm": 22.25886344909668, + "learning_rate": 7.473291246594819e-07, + "logits/chosen": -1.4031606912612915, + "logits/rejected": -1.4442576169967651, + "logps/chosen": -1.717380166053772, + "logps/rejected": -1.8811886310577393, + "loss": 1.8976, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.17380142211914, + "rewards/margins": 1.6380836963653564, + "rewards/rejected": -18.8118839263916, + "step": 11920 + }, + { + "epoch": 0.4019346792948869, + "grad_norm": 40.460548400878906, + "learning_rate": 7.470734531668029e-07, + "logits/chosen": -0.8854770660400391, + "logits/rejected": -0.9225967526435852, + "logps/chosen": -2.049858570098877, + "logps/rejected": -2.3521180152893066, + "loss": 1.7058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.498584747314453, + "rewards/margins": 3.0225963592529297, + "rewards/rejected": -23.521181106567383, + "step": 11925 + }, + { + "epoch": 0.4021032053658701, + "grad_norm": 26.445152282714844, + "learning_rate": 7.468176961717363e-07, + "logits/chosen": -0.9759553670883179, + "logits/rejected": -0.9978090524673462, + "logps/chosen": -1.817072868347168, + "logps/rejected": -1.8944177627563477, + "loss": 2.4529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.170730590820312, + "rewards/margins": 0.7734484672546387, + "rewards/rejected": -18.944177627563477, + "step": 11930 + }, + { + "epoch": 0.4022717314368533, + "grad_norm": 25.400487899780273, + "learning_rate": 7.465618537627891e-07, + "logits/chosen": -1.1339561939239502, + "logits/rejected": -1.1581547260284424, + "logps/chosen": -2.5781798362731934, + "logps/rejected": -2.6100564002990723, + "loss": 3.1132, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.78179931640625, + "rewards/margins": 0.31876450777053833, + "rewards/rejected": -26.100561141967773, + "step": 11935 + }, + { + "epoch": 0.40244025750783646, + "grad_norm": 90.3258285522461, + "learning_rate": 7.463059260284985e-07, + "logits/chosen": -0.9859679937362671, + "logits/rejected": -1.0732853412628174, + "logps/chosen": -2.2488198280334473, + "logps/rejected": -2.398329496383667, + "loss": 2.7178, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.48819923400879, + "rewards/margins": 1.4950984716415405, + "rewards/rejected": -23.983295440673828, + "step": 11940 + }, + { + "epoch": 0.40260878357881963, + "grad_norm": 34.835697174072266, + "learning_rate": 7.46049913057431e-07, + "logits/chosen": -0.9262846112251282, + "logits/rejected": -1.155128836631775, + "logps/chosen": -2.1297969818115234, + "logps/rejected": -2.6779632568359375, + "loss": 3.041, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.2979679107666, + "rewards/margins": 5.481663227081299, + "rewards/rejected": -26.779632568359375, + "step": 11945 + }, + { + "epoch": 0.40277730964980285, + "grad_norm": 12.751852035522461, + "learning_rate": 7.457938149381826e-07, + "logits/chosen": -1.1583601236343384, + "logits/rejected": -1.1672272682189941, + "logps/chosen": -1.930572509765625, + "logps/rejected": -1.9824788570404053, + "loss": 3.419, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.30572509765625, + "rewards/margins": 0.5190634727478027, + "rewards/rejected": -19.82478904724121, + "step": 11950 + }, + { + "epoch": 0.402945835720786, + "grad_norm": 12.714435577392578, + "learning_rate": 7.455376317593787e-07, + "logits/chosen": -1.2571144104003906, + "logits/rejected": -1.3536908626556396, + "logps/chosen": -2.014531373977661, + "logps/rejected": -2.2088634967803955, + "loss": 2.8636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.145313262939453, + "rewards/margins": 1.94331955909729, + "rewards/rejected": -22.088634490966797, + "step": 11955 + }, + { + "epoch": 0.4031143617917692, + "grad_norm": 22.396446228027344, + "learning_rate": 7.452813636096742e-07, + "logits/chosen": -1.0473554134368896, + "logits/rejected": -1.0027496814727783, + "logps/chosen": -2.0327858924865723, + "logps/rejected": -2.114520311355591, + "loss": 3.3383, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.32785987854004, + "rewards/margins": 0.8173434138298035, + "rewards/rejected": -21.14520263671875, + "step": 11960 + }, + { + "epoch": 0.40328288786275235, + "grad_norm": 16.245067596435547, + "learning_rate": 7.450250105777536e-07, + "logits/chosen": -0.6577657461166382, + "logits/rejected": -0.7657932043075562, + "logps/chosen": -2.4727909564971924, + "logps/rejected": -2.837554931640625, + "loss": 3.0212, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.727909088134766, + "rewards/margins": 3.6476409435272217, + "rewards/rejected": -28.37554931640625, + "step": 11965 + }, + { + "epoch": 0.40345141393373557, + "grad_norm": 23.989412307739258, + "learning_rate": 7.447685727523303e-07, + "logits/chosen": -0.9820396304130554, + "logits/rejected": -0.8650194406509399, + "logps/chosen": -1.8049871921539307, + "logps/rejected": -1.7340768575668335, + "loss": 3.8176, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.049869537353516, + "rewards/margins": -0.7091019749641418, + "rewards/rejected": -17.340768814086914, + "step": 11970 + }, + { + "epoch": 0.40361994000471874, + "grad_norm": 27.99403953552246, + "learning_rate": 7.445120502221475e-07, + "logits/chosen": -0.820398211479187, + "logits/rejected": -0.7926065921783447, + "logps/chosen": -1.9640032052993774, + "logps/rejected": -1.9566271305084229, + "loss": 3.3637, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.640033721923828, + "rewards/margins": -0.07375955581665039, + "rewards/rejected": -19.566272735595703, + "step": 11975 + }, + { + "epoch": 0.4037884660757019, + "grad_norm": 19.738489151000977, + "learning_rate": 7.442554430759775e-07, + "logits/chosen": -1.0407991409301758, + "logits/rejected": -1.423341989517212, + "logps/chosen": -1.6015815734863281, + "logps/rejected": -1.8600505590438843, + "loss": 2.8412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.01581382751465, + "rewards/margins": 2.5846915245056152, + "rewards/rejected": -18.600505828857422, + "step": 11980 + }, + { + "epoch": 0.40395699214668507, + "grad_norm": 31.152938842773438, + "learning_rate": 7.43998751402622e-07, + "logits/chosen": -1.1023541688919067, + "logits/rejected": -1.0939910411834717, + "logps/chosen": -2.0495901107788086, + "logps/rejected": -2.3085341453552246, + "loss": 2.8457, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.495901107788086, + "rewards/margins": 2.5894412994384766, + "rewards/rejected": -23.085342407226562, + "step": 11985 + }, + { + "epoch": 0.4041255182176683, + "grad_norm": 25.51861000061035, + "learning_rate": 7.437419752909119e-07, + "logits/chosen": -1.249093770980835, + "logits/rejected": -1.0771114826202393, + "logps/chosen": -2.0163919925689697, + "logps/rejected": -2.0079128742218018, + "loss": 4.7058, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.163917541503906, + "rewards/margins": -0.08479070663452148, + "rewards/rejected": -20.07912826538086, + "step": 11990 + }, + { + "epoch": 0.40429404428865146, + "grad_norm": 23.77742576599121, + "learning_rate": 7.43485114829707e-07, + "logits/chosen": -1.5242725610733032, + "logits/rejected": -1.4089118242263794, + "logps/chosen": -2.0811915397644043, + "logps/rejected": -2.0558857917785645, + "loss": 4.1171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.811914443969727, + "rewards/margins": -0.25305813550949097, + "rewards/rejected": -20.558856964111328, + "step": 11995 + }, + { + "epoch": 0.4044625703596346, + "grad_norm": 26.791170120239258, + "learning_rate": 7.432281701078969e-07, + "logits/chosen": -1.3711198568344116, + "logits/rejected": -1.464496374130249, + "logps/chosen": -1.9427284002304077, + "logps/rejected": -1.997841477394104, + "loss": 3.0344, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.427284240722656, + "rewards/margins": 0.5511296987533569, + "rewards/rejected": -19.97841453552246, + "step": 12000 + }, + { + "epoch": 0.4044625703596346, + "eval_logits/chosen": -1.465684175491333, + "eval_logits/rejected": -1.5627810955047607, + "eval_logps/chosen": -1.9252618551254272, + "eval_logps/rejected": -2.010789394378662, + "eval_loss": 3.0332751274108887, + "eval_rewards/accuracies": 0.6100000143051147, + "eval_rewards/chosen": -19.252620697021484, + "eval_rewards/margins": 0.855275571346283, + "eval_rewards/rejected": -20.107894897460938, + "eval_runtime": 12.907, + "eval_samples_per_second": 7.748, + "eval_steps_per_second": 1.937, + "step": 12000 + }, + { + "epoch": 0.40463109643061784, + "grad_norm": 24.16160774230957, + "learning_rate": 7.429711412143999e-07, + "logits/chosen": -1.0915724039077759, + "logits/rejected": -1.1342499256134033, + "logps/chosen": -1.7461715936660767, + "logps/rejected": -1.81634521484375, + "loss": 2.5254, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.461713790893555, + "rewards/margins": 0.7017372846603394, + "rewards/rejected": -18.1634521484375, + "step": 12005 + }, + { + "epoch": 0.404799622501601, + "grad_norm": 11.7499361038208, + "learning_rate": 7.427140282381636e-07, + "logits/chosen": -0.9528292417526245, + "logits/rejected": -1.1537973880767822, + "logps/chosen": -1.9370794296264648, + "logps/rejected": -2.340834140777588, + "loss": 1.3718, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.37079429626465, + "rewards/margins": 4.0375471115112305, + "rewards/rejected": -23.408340454101562, + "step": 12010 + }, + { + "epoch": 0.4049681485725842, + "grad_norm": 45.47167205810547, + "learning_rate": 7.424568312681647e-07, + "logits/chosen": -0.9921124577522278, + "logits/rejected": -1.0518230199813843, + "logps/chosen": -1.770341157913208, + "logps/rejected": -2.001453399658203, + "loss": 2.4102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.703411102294922, + "rewards/margins": 2.3111231327056885, + "rewards/rejected": -20.0145320892334, + "step": 12015 + }, + { + "epoch": 0.40513667464356734, + "grad_norm": 21.313961029052734, + "learning_rate": 7.421995503934088e-07, + "logits/chosen": -1.3454720973968506, + "logits/rejected": -1.3453900814056396, + "logps/chosen": -1.8339992761611938, + "logps/rejected": -1.7264811992645264, + "loss": 4.1263, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.33999252319336, + "rewards/margins": -1.075181245803833, + "rewards/rejected": -17.264812469482422, + "step": 12020 + }, + { + "epoch": 0.40530520071455056, + "grad_norm": 12.074272155761719, + "learning_rate": 7.419421857029309e-07, + "logits/chosen": -1.1453709602355957, + "logits/rejected": -1.3053287267684937, + "logps/chosen": -2.2835354804992676, + "logps/rejected": -2.569051742553711, + "loss": 1.9534, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.835357666015625, + "rewards/margins": 2.8551602363586426, + "rewards/rejected": -25.69051742553711, + "step": 12025 + }, + { + "epoch": 0.40547372678553373, + "grad_norm": 19.331594467163086, + "learning_rate": 7.416847372857946e-07, + "logits/chosen": -1.0492010116577148, + "logits/rejected": -1.131423830986023, + "logps/chosen": -2.292367458343506, + "logps/rejected": -2.3346362113952637, + "loss": 3.5757, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.923675537109375, + "rewards/margins": 0.4226861894130707, + "rewards/rejected": -23.346363067626953, + "step": 12030 + }, + { + "epoch": 0.4056422528565169, + "grad_norm": 31.929597854614258, + "learning_rate": 7.414272052310928e-07, + "logits/chosen": -1.2143497467041016, + "logits/rejected": -1.1757112741470337, + "logps/chosen": -2.2437641620635986, + "logps/rejected": -2.503958225250244, + "loss": 3.0289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.437641143798828, + "rewards/margins": 2.6019396781921387, + "rewards/rejected": -25.039579391479492, + "step": 12035 + }, + { + "epoch": 0.40581077892750006, + "grad_norm": 22.63580894470215, + "learning_rate": 7.41169589627947e-07, + "logits/chosen": -1.4139041900634766, + "logits/rejected": -1.4436061382293701, + "logps/chosen": -1.89009690284729, + "logps/rejected": -2.185490131378174, + "loss": 2.9563, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.90096664428711, + "rewards/margins": 2.953932523727417, + "rewards/rejected": -21.854900360107422, + "step": 12040 + }, + { + "epoch": 0.4059793049984833, + "grad_norm": 21.127553939819336, + "learning_rate": 7.409118905655082e-07, + "logits/chosen": -0.6792327761650085, + "logits/rejected": -0.8694466352462769, + "logps/chosen": -1.8247032165527344, + "logps/rejected": -1.8372104167938232, + "loss": 3.0029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.247034072875977, + "rewards/margins": 0.12507256865501404, + "rewards/rejected": -18.37210464477539, + "step": 12045 + }, + { + "epoch": 0.40614783106946645, + "grad_norm": 57.487403869628906, + "learning_rate": 7.406541081329554e-07, + "logits/chosen": -1.2932124137878418, + "logits/rejected": -1.4253662824630737, + "logps/chosen": -2.3572797775268555, + "logps/rejected": -2.522393226623535, + "loss": 2.8901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.572799682617188, + "rewards/margins": 1.6511356830596924, + "rewards/rejected": -25.223934173583984, + "step": 12050 + }, + { + "epoch": 0.4063163571404496, + "grad_norm": 6.55502462387085, + "learning_rate": 7.403962424194973e-07, + "logits/chosen": -1.4770417213439941, + "logits/rejected": -1.7608951330184937, + "logps/chosen": -2.6661248207092285, + "logps/rejected": -3.192875385284424, + "loss": 1.4518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.6612491607666, + "rewards/margins": 5.2675065994262695, + "rewards/rejected": -31.928752899169922, + "step": 12055 + }, + { + "epoch": 0.40648488321143283, + "grad_norm": 26.68470001220703, + "learning_rate": 7.401382935143709e-07, + "logits/chosen": -0.8198683857917786, + "logits/rejected": -0.9025104641914368, + "logps/chosen": -1.8631515502929688, + "logps/rejected": -1.8569438457489014, + "loss": 3.1761, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.63151741027832, + "rewards/margins": -0.06207876279950142, + "rewards/rejected": -18.56943702697754, + "step": 12060 + }, + { + "epoch": 0.406653409282416, + "grad_norm": 22.029172897338867, + "learning_rate": 7.398802615068421e-07, + "logits/chosen": -1.0925521850585938, + "logits/rejected": -1.1846771240234375, + "logps/chosen": -1.9000976085662842, + "logps/rejected": -1.9701240062713623, + "loss": 2.6078, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.000974655151367, + "rewards/margins": 0.7002660632133484, + "rewards/rejected": -19.70124053955078, + "step": 12065 + }, + { + "epoch": 0.40682193535339917, + "grad_norm": 52.73628234863281, + "learning_rate": 7.396221464862058e-07, + "logits/chosen": -1.0964667797088623, + "logits/rejected": -0.6679283976554871, + "logps/chosen": -2.2612481117248535, + "logps/rejected": -1.9707530736923218, + "loss": 6.3113, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.61248207092285, + "rewards/margins": -2.904949188232422, + "rewards/rejected": -19.707530975341797, + "step": 12070 + }, + { + "epoch": 0.40699046142438233, + "grad_norm": 15.641594886779785, + "learning_rate": 7.393639485417852e-07, + "logits/chosen": -0.9506348371505737, + "logits/rejected": -0.9982539415359497, + "logps/chosen": -1.7841243743896484, + "logps/rejected": -1.9900413751602173, + "loss": 1.9571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.841243743896484, + "rewards/margins": 2.0591704845428467, + "rewards/rejected": -19.900415420532227, + "step": 12075 + }, + { + "epoch": 0.40715898749536555, + "grad_norm": 23.61943817138672, + "learning_rate": 7.391056677629327e-07, + "logits/chosen": -1.5612033605575562, + "logits/rejected": -1.3875303268432617, + "logps/chosen": -2.126805543899536, + "logps/rejected": -2.2525746822357178, + "loss": 2.73, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.268054962158203, + "rewards/margins": 1.2576922178268433, + "rewards/rejected": -22.525747299194336, + "step": 12080 + }, + { + "epoch": 0.4073275135663487, + "grad_norm": 44.1659049987793, + "learning_rate": 7.388473042390289e-07, + "logits/chosen": -1.2619104385375977, + "logits/rejected": -1.3089015483856201, + "logps/chosen": -1.9846280813217163, + "logps/rejected": -2.134305953979492, + "loss": 3.0727, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.84627914428711, + "rewards/margins": 1.4967803955078125, + "rewards/rejected": -21.343059539794922, + "step": 12085 + }, + { + "epoch": 0.4074960396373319, + "grad_norm": 30.376798629760742, + "learning_rate": 7.385888580594834e-07, + "logits/chosen": -1.2105739116668701, + "logits/rejected": -1.2595316171646118, + "logps/chosen": -2.5818519592285156, + "logps/rejected": -2.8327412605285645, + "loss": 2.8547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.818517684936523, + "rewards/margins": 2.508892774581909, + "rewards/rejected": -28.327411651611328, + "step": 12090 + }, + { + "epoch": 0.40766456570831505, + "grad_norm": 36.32497787475586, + "learning_rate": 7.383303293137339e-07, + "logits/chosen": -1.0478847026824951, + "logits/rejected": -1.2231203317642212, + "logps/chosen": -2.192035675048828, + "logps/rejected": -2.384169816970825, + "loss": 2.2683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.92035484313965, + "rewards/margins": 1.9213413000106812, + "rewards/rejected": -23.84169578552246, + "step": 12095 + }, + { + "epoch": 0.4078330917792983, + "grad_norm": 35.671592712402344, + "learning_rate": 7.380717180912477e-07, + "logits/chosen": -1.0254673957824707, + "logits/rejected": -0.9650663137435913, + "logps/chosen": -2.426251173019409, + "logps/rejected": -2.793447971343994, + "loss": 4.8195, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.26251220703125, + "rewards/margins": 3.671968460083008, + "rewards/rejected": -27.93448257446289, + "step": 12100 + }, + { + "epoch": 0.40800161785028144, + "grad_norm": 46.74282455444336, + "learning_rate": 7.378130244815191e-07, + "logits/chosen": -1.1126468181610107, + "logits/rejected": -1.2673817873001099, + "logps/chosen": -1.7720956802368164, + "logps/rejected": -2.2362868785858154, + "loss": 2.1947, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.720956802368164, + "rewards/margins": 4.641912937164307, + "rewards/rejected": -22.362869262695312, + "step": 12105 + }, + { + "epoch": 0.4081701439212646, + "grad_norm": 29.57602310180664, + "learning_rate": 7.375542485740723e-07, + "logits/chosen": -1.2394063472747803, + "logits/rejected": -1.3660204410552979, + "logps/chosen": -1.8587286472320557, + "logps/rejected": -2.068803310394287, + "loss": 1.7865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.58728790283203, + "rewards/margins": 2.1007449626922607, + "rewards/rejected": -20.688030242919922, + "step": 12110 + }, + { + "epoch": 0.4083386699922478, + "grad_norm": 27.55736541748047, + "learning_rate": 7.372953904584596e-07, + "logits/chosen": -1.0895322561264038, + "logits/rejected": -0.9657597541809082, + "logps/chosen": -1.6055514812469482, + "logps/rejected": -1.532454490661621, + "loss": 3.8164, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.05551528930664, + "rewards/margins": -0.7309691309928894, + "rewards/rejected": -15.324544906616211, + "step": 12115 + }, + { + "epoch": 0.408507196063231, + "grad_norm": 30.960561752319336, + "learning_rate": 7.37036450224261e-07, + "logits/chosen": -0.8229540586471558, + "logits/rejected": -1.2388590574264526, + "logps/chosen": -1.9742721319198608, + "logps/rejected": -2.200082302093506, + "loss": 2.202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.742721557617188, + "rewards/margins": 2.2581019401550293, + "rewards/rejected": -22.000823974609375, + "step": 12120 + }, + { + "epoch": 0.40867572213421416, + "grad_norm": 33.830482482910156, + "learning_rate": 7.36777427961086e-07, + "logits/chosen": -0.8376771807670593, + "logits/rejected": -0.981291651725769, + "logps/chosen": -2.4010491371154785, + "logps/rejected": -2.5145821571350098, + "loss": 2.5353, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.010494232177734, + "rewards/margins": 1.135331630706787, + "rewards/rejected": -25.145824432373047, + "step": 12125 + }, + { + "epoch": 0.4088442482051973, + "grad_norm": 23.59634780883789, + "learning_rate": 7.365183237585718e-07, + "logits/chosen": -0.861966609954834, + "logits/rejected": -1.0989129543304443, + "logps/chosen": -1.946171522140503, + "logps/rejected": -1.9714372158050537, + "loss": 3.086, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.461715698242188, + "rewards/margins": 0.25265711545944214, + "rewards/rejected": -19.714372634887695, + "step": 12130 + }, + { + "epoch": 0.40901277427618055, + "grad_norm": 53.22785186767578, + "learning_rate": 7.362591377063841e-07, + "logits/chosen": -1.0354158878326416, + "logits/rejected": -1.0143946409225464, + "logps/chosen": -1.9543819427490234, + "logps/rejected": -1.9405418634414673, + "loss": 3.345, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.543819427490234, + "rewards/margins": -0.13840040564537048, + "rewards/rejected": -19.405418395996094, + "step": 12135 + }, + { + "epoch": 0.4091813003471637, + "grad_norm": 25.117643356323242, + "learning_rate": 7.359998698942173e-07, + "logits/chosen": -0.827044665813446, + "logits/rejected": -0.9510555267333984, + "logps/chosen": -2.408053159713745, + "logps/rejected": -2.6223647594451904, + "loss": 2.8323, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.08053207397461, + "rewards/margins": 2.1431150436401367, + "rewards/rejected": -26.223644256591797, + "step": 12140 + }, + { + "epoch": 0.4093498264181469, + "grad_norm": 32.903385162353516, + "learning_rate": 7.357405204117934e-07, + "logits/chosen": -0.9480105638504028, + "logits/rejected": -1.0420299768447876, + "logps/chosen": -1.8940540552139282, + "logps/rejected": -1.9430221319198608, + "loss": 2.7973, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.940540313720703, + "rewards/margins": 0.489682674407959, + "rewards/rejected": -19.43022346496582, + "step": 12145 + }, + { + "epoch": 0.40951835248913004, + "grad_norm": 13.586479187011719, + "learning_rate": 7.354810893488632e-07, + "logits/chosen": -1.3252044916152954, + "logits/rejected": -1.5341026782989502, + "logps/chosen": -2.290491819381714, + "logps/rejected": -2.34997296333313, + "loss": 2.9688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.904918670654297, + "rewards/margins": 0.594811737537384, + "rewards/rejected": -23.49972915649414, + "step": 12150 + }, + { + "epoch": 0.40968687856011327, + "grad_norm": 18.08561897277832, + "learning_rate": 7.352215767952056e-07, + "logits/chosen": -1.4511380195617676, + "logits/rejected": -1.7551714181900024, + "logps/chosen": -2.110844373703003, + "logps/rejected": -2.080214738845825, + "loss": 3.5353, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.108444213867188, + "rewards/margins": -0.3062978684902191, + "rewards/rejected": -20.802148818969727, + "step": 12155 + }, + { + "epoch": 0.40985540463109643, + "grad_norm": 20.255584716796875, + "learning_rate": 7.349619828406277e-07, + "logits/chosen": -0.935697078704834, + "logits/rejected": -0.9104664921760559, + "logps/chosen": -2.4696171283721924, + "logps/rejected": -2.5782546997070312, + "loss": 3.1522, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.696170806884766, + "rewards/margins": 1.0863767862319946, + "rewards/rejected": -25.782546997070312, + "step": 12160 + }, + { + "epoch": 0.4100239307020796, + "grad_norm": 22.614980697631836, + "learning_rate": 7.347023075749645e-07, + "logits/chosen": -0.9744084477424622, + "logits/rejected": -0.9399921298027039, + "logps/chosen": -1.6697473526000977, + "logps/rejected": -2.09625506401062, + "loss": 1.7474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.69747543334961, + "rewards/margins": 4.26507568359375, + "rewards/rejected": -20.962549209594727, + "step": 12165 + }, + { + "epoch": 0.4101924567730628, + "grad_norm": 55.02566909790039, + "learning_rate": 7.344425510880797e-07, + "logits/chosen": -1.2344236373901367, + "logits/rejected": -1.445399522781372, + "logps/chosen": -2.253945827484131, + "logps/rejected": -2.4210329055786133, + "loss": 2.2166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.539459228515625, + "rewards/margins": 1.6708701848983765, + "rewards/rejected": -24.210330963134766, + "step": 12170 + }, + { + "epoch": 0.410360982844046, + "grad_norm": 12.205888748168945, + "learning_rate": 7.341827134698645e-07, + "logits/chosen": -0.9991312026977539, + "logits/rejected": -1.023807168006897, + "logps/chosen": -1.743577003479004, + "logps/rejected": -1.9013125896453857, + "loss": 2.273, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.435771942138672, + "rewards/margins": 1.577357292175293, + "rewards/rejected": -19.013126373291016, + "step": 12175 + }, + { + "epoch": 0.41052950891502915, + "grad_norm": 22.69434928894043, + "learning_rate": 7.339227948102387e-07, + "logits/chosen": -1.2785236835479736, + "logits/rejected": -1.3499950170516968, + "logps/chosen": -1.9484812021255493, + "logps/rejected": -2.2043070793151855, + "loss": 2.7824, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.48480796813965, + "rewards/margins": 2.5582618713378906, + "rewards/rejected": -22.043071746826172, + "step": 12180 + }, + { + "epoch": 0.4106980349860123, + "grad_norm": 70.99961853027344, + "learning_rate": 7.336627951991497e-07, + "logits/chosen": -0.8217660188674927, + "logits/rejected": -0.8359603881835938, + "logps/chosen": -1.856405258178711, + "logps/rejected": -1.7680613994598389, + "loss": 4.2279, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.564050674438477, + "rewards/margins": -0.8834368586540222, + "rewards/rejected": -17.680614471435547, + "step": 12185 + }, + { + "epoch": 0.41086656105699554, + "grad_norm": 26.163740158081055, + "learning_rate": 7.334027147265734e-07, + "logits/chosen": -0.8019935488700867, + "logits/rejected": -1.046931505203247, + "logps/chosen": -2.2931296825408936, + "logps/rejected": -2.110200881958008, + "loss": 5.3934, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.931297302246094, + "rewards/margins": -1.8292884826660156, + "rewards/rejected": -21.102008819580078, + "step": 12190 + }, + { + "epoch": 0.4110350871279787, + "grad_norm": 22.50705909729004, + "learning_rate": 7.331425534825131e-07, + "logits/chosen": -1.3941196203231812, + "logits/rejected": -1.3452855348587036, + "logps/chosen": -2.0837740898132324, + "logps/rejected": -2.4105069637298584, + "loss": 1.954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.83774185180664, + "rewards/margins": 3.267329692840576, + "rewards/rejected": -24.105072021484375, + "step": 12195 + }, + { + "epoch": 0.41120361319896187, + "grad_norm": 19.575517654418945, + "learning_rate": 7.328823115570005e-07, + "logits/chosen": -1.1170722246170044, + "logits/rejected": -1.5405693054199219, + "logps/chosen": -1.7801272869110107, + "logps/rejected": -2.321302890777588, + "loss": 1.2349, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.801273345947266, + "rewards/margins": 5.411752700805664, + "rewards/rejected": -23.213027954101562, + "step": 12200 + }, + { + "epoch": 0.41137213926994504, + "grad_norm": 18.17782211303711, + "learning_rate": 7.326219890400951e-07, + "logits/chosen": -1.4105345010757446, + "logits/rejected": -1.3899109363555908, + "logps/chosen": -2.142193078994751, + "logps/rejected": -2.263820171356201, + "loss": 2.1952, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.42193031311035, + "rewards/margins": 1.216269612312317, + "rewards/rejected": -22.638198852539062, + "step": 12205 + }, + { + "epoch": 0.41154066534092826, + "grad_norm": 27.79196548461914, + "learning_rate": 7.323615860218842e-07, + "logits/chosen": -1.1210639476776123, + "logits/rejected": -1.2003812789916992, + "logps/chosen": -1.9306762218475342, + "logps/rejected": -1.9171720743179321, + "loss": 3.6701, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.3067626953125, + "rewards/margins": -0.13504056632518768, + "rewards/rejected": -19.171720504760742, + "step": 12210 + }, + { + "epoch": 0.4117091914119114, + "grad_norm": 11.662469863891602, + "learning_rate": 7.321011025924832e-07, + "logits/chosen": -1.377205729484558, + "logits/rejected": -1.4924352169036865, + "logps/chosen": -2.1127142906188965, + "logps/rejected": -2.2229113578796387, + "loss": 2.2694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.12714195251465, + "rewards/margins": 1.1019700765609741, + "rewards/rejected": -22.229114532470703, + "step": 12215 + }, + { + "epoch": 0.4118777174828946, + "grad_norm": 30.770193099975586, + "learning_rate": 7.318405388420349e-07, + "logits/chosen": -1.1618680953979492, + "logits/rejected": -1.2497812509536743, + "logps/chosen": -2.0236659049987793, + "logps/rejected": -2.0624430179595947, + "loss": 2.8735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.23666000366211, + "rewards/margins": 0.3877657949924469, + "rewards/rejected": -20.62442970275879, + "step": 12220 + }, + { + "epoch": 0.4120462435538778, + "grad_norm": 31.006916046142578, + "learning_rate": 7.315798948607102e-07, + "logits/chosen": -1.724689245223999, + "logits/rejected": -1.7922130823135376, + "logps/chosen": -1.9403785467147827, + "logps/rejected": -1.982122778892517, + "loss": 2.7834, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.40378761291504, + "rewards/margins": 0.41744089126586914, + "rewards/rejected": -19.82122802734375, + "step": 12225 + }, + { + "epoch": 0.412214769624861, + "grad_norm": 27.098119735717773, + "learning_rate": 7.313191707387079e-07, + "logits/chosen": -1.5360838174819946, + "logits/rejected": -1.4201209545135498, + "logps/chosen": -1.9972326755523682, + "logps/rejected": -2.020376682281494, + "loss": 3.0258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.972326278686523, + "rewards/margins": 0.23143863677978516, + "rewards/rejected": -20.203763961791992, + "step": 12230 + }, + { + "epoch": 0.41238329569584414, + "grad_norm": 37.77764892578125, + "learning_rate": 7.310583665662542e-07, + "logits/chosen": -1.0606439113616943, + "logits/rejected": -1.2241575717926025, + "logps/chosen": -2.2028839588165283, + "logps/rejected": -2.37382173538208, + "loss": 1.8302, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.028841018676758, + "rewards/margins": 1.7093784809112549, + "rewards/rejected": -23.738218307495117, + "step": 12235 + }, + { + "epoch": 0.4125518217668273, + "grad_norm": 40.7574348449707, + "learning_rate": 7.30797482433603e-07, + "logits/chosen": -1.1301488876342773, + "logits/rejected": -1.4540493488311768, + "logps/chosen": -1.9775673151016235, + "logps/rejected": -2.2723512649536133, + "loss": 2.5747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.775671005249023, + "rewards/margins": 2.9478418827056885, + "rewards/rejected": -22.723514556884766, + "step": 12240 + }, + { + "epoch": 0.41272034783781053, + "grad_norm": 14.587956428527832, + "learning_rate": 7.305365184310363e-07, + "logits/chosen": -0.8185412287712097, + "logits/rejected": -0.7269413471221924, + "logps/chosen": -1.9818493127822876, + "logps/rejected": -2.2220189571380615, + "loss": 2.2466, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.818492889404297, + "rewards/margins": 2.401698112487793, + "rewards/rejected": -22.220190048217773, + "step": 12245 + }, + { + "epoch": 0.4128888739087937, + "grad_norm": 25.93768882751465, + "learning_rate": 7.302754746488633e-07, + "logits/chosen": -1.1302485466003418, + "logits/rejected": -1.3396055698394775, + "logps/chosen": -2.0434622764587402, + "logps/rejected": -2.2037312984466553, + "loss": 2.0975, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.434621810913086, + "rewards/margins": 1.6026890277862549, + "rewards/rejected": -22.037311553955078, + "step": 12250 + }, + { + "epoch": 0.41305739997977686, + "grad_norm": 77.58814239501953, + "learning_rate": 7.300143511774211e-07, + "logits/chosen": -1.3720858097076416, + "logits/rejected": -1.4968478679656982, + "logps/chosen": -2.2451188564300537, + "logps/rejected": -2.2659029960632324, + "loss": 4.1834, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.451190948486328, + "rewards/margins": 0.20783929526805878, + "rewards/rejected": -22.659029006958008, + "step": 12255 + }, + { + "epoch": 0.41322592605076003, + "grad_norm": 26.527759552001953, + "learning_rate": 7.297531481070742e-07, + "logits/chosen": -1.1272186040878296, + "logits/rejected": -1.2402737140655518, + "logps/chosen": -1.7846702337265015, + "logps/rejected": -1.966025948524475, + "loss": 2.1777, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.846702575683594, + "rewards/margins": 1.8135563135147095, + "rewards/rejected": -19.660259246826172, + "step": 12260 + }, + { + "epoch": 0.41339445212174325, + "grad_norm": 34.187957763671875, + "learning_rate": 7.294918655282145e-07, + "logits/chosen": -1.4714405536651611, + "logits/rejected": -1.390163779258728, + "logps/chosen": -1.9039027690887451, + "logps/rejected": -2.0494437217712402, + "loss": 1.8899, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.03902816772461, + "rewards/margins": 1.4554088115692139, + "rewards/rejected": -20.494434356689453, + "step": 12265 + }, + { + "epoch": 0.4135629781927264, + "grad_norm": 22.404212951660156, + "learning_rate": 7.292305035312618e-07, + "logits/chosen": -1.2922292947769165, + "logits/rejected": -1.1275092363357544, + "logps/chosen": -2.3387680053710938, + "logps/rejected": -2.4274537563323975, + "loss": 4.3855, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.387680053710938, + "rewards/margins": 0.8868575096130371, + "rewards/rejected": -24.274539947509766, + "step": 12270 + }, + { + "epoch": 0.4137315042637096, + "grad_norm": 108.76043701171875, + "learning_rate": 7.289690622066633e-07, + "logits/chosen": -0.7196947336196899, + "logits/rejected": -0.6757172346115112, + "logps/chosen": -2.1135897636413574, + "logps/rejected": -2.088670253753662, + "loss": 3.8477, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.135900497436523, + "rewards/margins": -0.24919691681861877, + "rewards/rejected": -20.886703491210938, + "step": 12275 + }, + { + "epoch": 0.4139000303346928, + "grad_norm": 30.70173454284668, + "learning_rate": 7.287075416448932e-07, + "logits/chosen": -0.8257554173469543, + "logits/rejected": -0.939769446849823, + "logps/chosen": -2.1116161346435547, + "logps/rejected": -2.2537155151367188, + "loss": 2.8061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.116161346435547, + "rewards/margins": 1.4209929704666138, + "rewards/rejected": -22.537155151367188, + "step": 12280 + }, + { + "epoch": 0.41406855640567597, + "grad_norm": 28.507184982299805, + "learning_rate": 7.284459419364537e-07, + "logits/chosen": -1.1531884670257568, + "logits/rejected": -1.1810081005096436, + "logps/chosen": -1.9541816711425781, + "logps/rejected": -2.1569201946258545, + "loss": 2.3336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.541818618774414, + "rewards/margins": 2.0273826122283936, + "rewards/rejected": -21.56920051574707, + "step": 12285 + }, + { + "epoch": 0.41423708247665914, + "grad_norm": 29.22176742553711, + "learning_rate": 7.281842631718742e-07, + "logits/chosen": -1.2742034196853638, + "logits/rejected": -1.4763638973236084, + "logps/chosen": -1.9598357677459717, + "logps/rejected": -2.181473970413208, + "loss": 2.33, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.598360061645508, + "rewards/margins": 2.2163803577423096, + "rewards/rejected": -21.814739227294922, + "step": 12290 + }, + { + "epoch": 0.4144056085476423, + "grad_norm": 27.640893936157227, + "learning_rate": 7.279225054417113e-07, + "logits/chosen": -1.4601396322250366, + "logits/rejected": -1.4143357276916504, + "logps/chosen": -1.7320646047592163, + "logps/rejected": -2.1392834186553955, + "loss": 2.036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.320646286010742, + "rewards/margins": 4.072186470031738, + "rewards/rejected": -21.392831802368164, + "step": 12295 + }, + { + "epoch": 0.4145741346186255, + "grad_norm": 69.44369506835938, + "learning_rate": 7.27660668836549e-07, + "logits/chosen": -0.875129222869873, + "logits/rejected": -0.9124727249145508, + "logps/chosen": -1.7804782390594482, + "logps/rejected": -1.9126323461532593, + "loss": 2.0381, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.80478286743164, + "rewards/margins": 1.3215408325195312, + "rewards/rejected": -19.126323699951172, + "step": 12300 + }, + { + "epoch": 0.4147426606896087, + "grad_norm": 18.76422882080078, + "learning_rate": 7.273987534469987e-07, + "logits/chosen": -1.410447120666504, + "logits/rejected": -1.6111469268798828, + "logps/chosen": -1.6875078678131104, + "logps/rejected": -1.7965885400772095, + "loss": 2.3301, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.875080108642578, + "rewards/margins": 1.0908066034317017, + "rewards/rejected": -17.965885162353516, + "step": 12305 + }, + { + "epoch": 0.41491118676059185, + "grad_norm": 41.15472412109375, + "learning_rate": 7.27136759363699e-07, + "logits/chosen": -0.9998579025268555, + "logits/rejected": -1.189564824104309, + "logps/chosen": -1.9412838220596313, + "logps/rejected": -2.197477102279663, + "loss": 2.2405, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.412837982177734, + "rewards/margins": 2.561933994293213, + "rewards/rejected": -21.97477149963379, + "step": 12310 + }, + { + "epoch": 0.415079712831575, + "grad_norm": 25.710790634155273, + "learning_rate": 7.268746866773157e-07, + "logits/chosen": -1.1858714818954468, + "logits/rejected": -1.3653953075408936, + "logps/chosen": -1.9866607189178467, + "logps/rejected": -2.078334331512451, + "loss": 2.9149, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.866609573364258, + "rewards/margins": 0.9167356491088867, + "rewards/rejected": -20.783344268798828, + "step": 12315 + }, + { + "epoch": 0.41524823890255824, + "grad_norm": 45.082820892333984, + "learning_rate": 7.266125354785419e-07, + "logits/chosen": -0.9618284106254578, + "logits/rejected": -0.9656648635864258, + "logps/chosen": -2.5577244758605957, + "logps/rejected": -2.5045924186706543, + "loss": 4.5745, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.57724380493164, + "rewards/margins": -0.5313173532485962, + "rewards/rejected": -25.04592514038086, + "step": 12320 + }, + { + "epoch": 0.4154167649735414, + "grad_norm": 27.387304306030273, + "learning_rate": 7.263503058580975e-07, + "logits/chosen": -0.8120461702346802, + "logits/rejected": -1.0206549167633057, + "logps/chosen": -1.7548229694366455, + "logps/rejected": -1.9979721307754517, + "loss": 3.0462, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.548229217529297, + "rewards/margins": 2.4314920902252197, + "rewards/rejected": -19.979719161987305, + "step": 12325 + }, + { + "epoch": 0.4155852910445246, + "grad_norm": 58.04964065551758, + "learning_rate": 7.260879979067305e-07, + "logits/chosen": -1.2231042385101318, + "logits/rejected": -1.3412379026412964, + "logps/chosen": -2.661652088165283, + "logps/rejected": -2.8398842811584473, + "loss": 2.8371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.61652183532715, + "rewards/margins": 1.782320261001587, + "rewards/rejected": -28.39884376525879, + "step": 12330 + }, + { + "epoch": 0.4157538171155078, + "grad_norm": 31.14594268798828, + "learning_rate": 7.258256117152147e-07, + "logits/chosen": -1.147136926651001, + "logits/rejected": -1.2516025304794312, + "logps/chosen": -2.197524309158325, + "logps/rejected": -2.545668601989746, + "loss": 1.9896, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.975244522094727, + "rewards/margins": 3.4814445972442627, + "rewards/rejected": -25.45668601989746, + "step": 12335 + }, + { + "epoch": 0.41592234318649096, + "grad_norm": 27.909366607666016, + "learning_rate": 7.255631473743517e-07, + "logits/chosen": -1.7197551727294922, + "logits/rejected": -1.5631155967712402, + "logps/chosen": -1.870761513710022, + "logps/rejected": -1.8892608880996704, + "loss": 3.0922, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.70761489868164, + "rewards/margins": 0.18499116599559784, + "rewards/rejected": -18.892608642578125, + "step": 12340 + }, + { + "epoch": 0.4160908692574741, + "grad_norm": 30.327396392822266, + "learning_rate": 7.253006049749704e-07, + "logits/chosen": -0.9778448939323425, + "logits/rejected": -1.1327383518218994, + "logps/chosen": -1.968382477760315, + "logps/rejected": -2.545841693878174, + "loss": 2.3467, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.683826446533203, + "rewards/margins": 5.774592399597168, + "rewards/rejected": -25.458415985107422, + "step": 12345 + }, + { + "epoch": 0.4162593953284573, + "grad_norm": 3.7972683906555176, + "learning_rate": 7.250379846079263e-07, + "logits/chosen": -0.7681287527084351, + "logits/rejected": -0.9513195157051086, + "logps/chosen": -2.9177188873291016, + "logps/rejected": -3.2327704429626465, + "loss": 2.2224, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.177188873291016, + "rewards/margins": 3.1505179405212402, + "rewards/rejected": -32.32770538330078, + "step": 12350 + }, + { + "epoch": 0.4164279213994405, + "grad_norm": 24.724031448364258, + "learning_rate": 7.247752863641018e-07, + "logits/chosen": -1.0654830932617188, + "logits/rejected": -1.0408488512039185, + "logps/chosen": -1.9206384420394897, + "logps/rejected": -2.0448288917541504, + "loss": 2.7421, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.206384658813477, + "rewards/margins": 1.24190354347229, + "rewards/rejected": -20.448287963867188, + "step": 12355 + }, + { + "epoch": 0.4165964474704237, + "grad_norm": 34.73861312866211, + "learning_rate": 7.245125103344066e-07, + "logits/chosen": -1.2476475238800049, + "logits/rejected": -1.307342290878296, + "logps/chosen": -1.658424735069275, + "logps/rejected": -1.6547248363494873, + "loss": 3.6264, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.58424949645996, + "rewards/margins": -0.036998748779296875, + "rewards/rejected": -16.547250747680664, + "step": 12360 + }, + { + "epoch": 0.41676497354140685, + "grad_norm": 33.121646881103516, + "learning_rate": 7.242496566097769e-07, + "logits/chosen": -1.4867914915084839, + "logits/rejected": -1.541632056236267, + "logps/chosen": -1.9344894886016846, + "logps/rejected": -2.1640186309814453, + "loss": 2.4745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.344892501831055, + "rewards/margins": 2.2952919006347656, + "rewards/rejected": -21.64018440246582, + "step": 12365 + }, + { + "epoch": 0.41693349961239, + "grad_norm": 24.134441375732422, + "learning_rate": 7.23986725281176e-07, + "logits/chosen": -0.9161200523376465, + "logits/rejected": -0.9984935522079468, + "logps/chosen": -2.1371476650238037, + "logps/rejected": -2.1769909858703613, + "loss": 3.7732, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.371477127075195, + "rewards/margins": 0.3984335958957672, + "rewards/rejected": -21.769908905029297, + "step": 12370 + }, + { + "epoch": 0.41710202568337323, + "grad_norm": 16.265979766845703, + "learning_rate": 7.237237164395944e-07, + "logits/chosen": -1.368922233581543, + "logits/rejected": -1.6064525842666626, + "logps/chosen": -1.886877417564392, + "logps/rejected": -2.280475378036499, + "loss": 1.4464, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.868776321411133, + "rewards/margins": 3.9359793663024902, + "rewards/rejected": -22.80475425720215, + "step": 12375 + }, + { + "epoch": 0.4172705517543564, + "grad_norm": 14.606618881225586, + "learning_rate": 7.234606301760488e-07, + "logits/chosen": -0.9598191976547241, + "logits/rejected": -1.0696005821228027, + "logps/chosen": -1.8257827758789062, + "logps/rejected": -1.9989960193634033, + "loss": 2.0592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.25782585144043, + "rewards/margins": 1.7321323156356812, + "rewards/rejected": -19.989957809448242, + "step": 12380 + }, + { + "epoch": 0.41743907782533957, + "grad_norm": 32.96151351928711, + "learning_rate": 7.231974665815831e-07, + "logits/chosen": -1.2338770627975464, + "logits/rejected": -1.3861429691314697, + "logps/chosen": -2.4079480171203613, + "logps/rejected": -2.3635220527648926, + "loss": 3.9454, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.079477310180664, + "rewards/margins": -0.44425565004348755, + "rewards/rejected": -23.635223388671875, + "step": 12385 + }, + { + "epoch": 0.4176076038963228, + "grad_norm": 12.575274467468262, + "learning_rate": 7.229342257472678e-07, + "logits/chosen": -1.2482370138168335, + "logits/rejected": -1.3556302785873413, + "logps/chosen": -2.2981467247009277, + "logps/rejected": -2.404623508453369, + "loss": 3.5159, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.981468200683594, + "rewards/margins": 1.064767599105835, + "rewards/rejected": -24.046234130859375, + "step": 12390 + }, + { + "epoch": 0.41777612996730595, + "grad_norm": 38.49678421020508, + "learning_rate": 7.226709077642002e-07, + "logits/chosen": -0.7314721941947937, + "logits/rejected": -0.8654648065567017, + "logps/chosen": -2.2405874729156494, + "logps/rejected": -2.4284911155700684, + "loss": 3.6159, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.405872344970703, + "rewards/margins": 1.8790397644042969, + "rewards/rejected": -24.284912109375, + "step": 12395 + }, + { + "epoch": 0.4179446560382891, + "grad_norm": 20.525619506835938, + "learning_rate": 7.224075127235044e-07, + "logits/chosen": -0.9811423420906067, + "logits/rejected": -1.0781348943710327, + "logps/chosen": -2.029435157775879, + "logps/rejected": -2.214388608932495, + "loss": 2.1886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.294349670410156, + "rewards/margins": 1.849535346031189, + "rewards/rejected": -22.14388656616211, + "step": 12400 + }, + { + "epoch": 0.4179446560382891, + "eval_logits/chosen": -1.5217288732528687, + "eval_logits/rejected": -1.6246063709259033, + "eval_logps/chosen": -1.9450013637542725, + "eval_logps/rejected": -2.0381784439086914, + "eval_loss": 3.018660068511963, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -19.450016021728516, + "eval_rewards/margins": 0.9317706823348999, + "eval_rewards/rejected": -20.381784439086914, + "eval_runtime": 12.8916, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 12400 + }, + { + "epoch": 0.4181131821092723, + "grad_norm": 23.479873657226562, + "learning_rate": 7.221440407163309e-07, + "logits/chosen": -1.2906603813171387, + "logits/rejected": -1.2902982234954834, + "logps/chosen": -1.9721931219100952, + "logps/rejected": -1.8723487854003906, + "loss": 4.2176, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.7219295501709, + "rewards/margins": -0.9984428286552429, + "rewards/rejected": -18.723487854003906, + "step": 12405 + }, + { + "epoch": 0.4182817081802555, + "grad_norm": 65.4778823852539, + "learning_rate": 7.218804918338572e-07, + "logits/chosen": -1.067082166671753, + "logits/rejected": -1.1751021146774292, + "logps/chosen": -2.318246603012085, + "logps/rejected": -2.514338970184326, + "loss": 2.0274, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.18246841430664, + "rewards/margins": 1.9609248638153076, + "rewards/rejected": -25.143390655517578, + "step": 12410 + }, + { + "epoch": 0.4184502342512387, + "grad_norm": 20.164350509643555, + "learning_rate": 7.216168661672868e-07, + "logits/chosen": -1.2625491619110107, + "logits/rejected": -1.2166945934295654, + "logps/chosen": -1.9427082538604736, + "logps/rejected": -1.942220687866211, + "loss": 3.556, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.427082061767578, + "rewards/margins": -0.004874229431152344, + "rewards/rejected": -19.42220687866211, + "step": 12415 + }, + { + "epoch": 0.41861876032222184, + "grad_norm": 27.37095069885254, + "learning_rate": 7.213531638078505e-07, + "logits/chosen": -1.3327056169509888, + "logits/rejected": -1.3353521823883057, + "logps/chosen": -2.1947896480560303, + "logps/rejected": -2.3421804904937744, + "loss": 2.2908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.94789695739746, + "rewards/margins": 1.4739079475402832, + "rewards/rejected": -23.421804428100586, + "step": 12420 + }, + { + "epoch": 0.418787286393205, + "grad_norm": 27.996984481811523, + "learning_rate": 7.210893848468053e-07, + "logits/chosen": -1.1961950063705444, + "logits/rejected": -1.2545980215072632, + "logps/chosen": -1.6736834049224854, + "logps/rejected": -1.796057105064392, + "loss": 2.3931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.736835479736328, + "rewards/margins": 1.2237374782562256, + "rewards/rejected": -17.960573196411133, + "step": 12425 + }, + { + "epoch": 0.4189558124641882, + "grad_norm": 26.38707160949707, + "learning_rate": 7.208255293754342e-07, + "logits/chosen": -1.335105299949646, + "logits/rejected": -1.4457073211669922, + "logps/chosen": -1.7503960132598877, + "logps/rejected": -1.9006984233856201, + "loss": 2.7425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.503957748413086, + "rewards/margins": 1.503026008605957, + "rewards/rejected": -19.00698471069336, + "step": 12430 + }, + { + "epoch": 0.4191243385351714, + "grad_norm": 20.31515884399414, + "learning_rate": 7.20561597485048e-07, + "logits/chosen": -1.020716667175293, + "logits/rejected": -1.0862579345703125, + "logps/chosen": -1.5637071132659912, + "logps/rejected": -1.6635987758636475, + "loss": 2.2843, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.63707160949707, + "rewards/margins": 0.9989177584648132, + "rewards/rejected": -16.635990142822266, + "step": 12435 + }, + { + "epoch": 0.41929286460615456, + "grad_norm": 97.51506042480469, + "learning_rate": 7.202975892669824e-07, + "logits/chosen": -0.7687516212463379, + "logits/rejected": -0.8372823596000671, + "logps/chosen": -2.676095485687256, + "logps/rejected": -3.029672861099243, + "loss": 1.9994, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.76095962524414, + "rewards/margins": 3.5357728004455566, + "rewards/rejected": -30.296728134155273, + "step": 12440 + }, + { + "epoch": 0.4194613906771378, + "grad_norm": 22.99105453491211, + "learning_rate": 7.200335048126006e-07, + "logits/chosen": -0.9765909910202026, + "logits/rejected": -1.0797923803329468, + "logps/chosen": -2.3902230262756348, + "logps/rejected": -2.2430508136749268, + "loss": 4.7527, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.902233123779297, + "rewards/margins": -1.471724271774292, + "rewards/rejected": -22.43050765991211, + "step": 12445 + }, + { + "epoch": 0.41962991674812095, + "grad_norm": 14.573020935058594, + "learning_rate": 7.197693442132917e-07, + "logits/chosen": -1.458287000656128, + "logits/rejected": -1.5055878162384033, + "logps/chosen": -2.1899430751800537, + "logps/rejected": -2.093184471130371, + "loss": 4.5942, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.899433135986328, + "rewards/margins": -0.9675881266593933, + "rewards/rejected": -20.93184471130371, + "step": 12450 + }, + { + "epoch": 0.4197984428191041, + "grad_norm": 25.01654052734375, + "learning_rate": 7.195051075604715e-07, + "logits/chosen": -1.3347218036651611, + "logits/rejected": -1.4352153539657593, + "logps/chosen": -2.1493160724639893, + "logps/rejected": -2.3409645557403564, + "loss": 2.7931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.493160247802734, + "rewards/margins": 1.9164841175079346, + "rewards/rejected": -23.409643173217773, + "step": 12455 + }, + { + "epoch": 0.4199669688900873, + "grad_norm": 34.97426223754883, + "learning_rate": 7.192407949455816e-07, + "logits/chosen": -1.1958450078964233, + "logits/rejected": -1.3197181224822998, + "logps/chosen": -2.0296499729156494, + "logps/rejected": -2.766195774078369, + "loss": 1.828, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.296499252319336, + "rewards/margins": 7.365456581115723, + "rewards/rejected": -27.661956787109375, + "step": 12460 + }, + { + "epoch": 0.4201354949610705, + "grad_norm": 31.839874267578125, + "learning_rate": 7.189764064600904e-07, + "logits/chosen": -1.214929223060608, + "logits/rejected": -1.1214382648468018, + "logps/chosen": -1.8447256088256836, + "logps/rejected": -2.0540318489074707, + "loss": 1.8063, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.447256088256836, + "rewards/margins": 2.0930612087249756, + "rewards/rejected": -20.54031753540039, + "step": 12465 + }, + { + "epoch": 0.42030402103205367, + "grad_norm": 23.187700271606445, + "learning_rate": 7.187119421954921e-07, + "logits/chosen": -1.2923511266708374, + "logits/rejected": -1.489856481552124, + "logps/chosen": -1.7116539478302002, + "logps/rejected": -1.5989964008331299, + "loss": 4.4184, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.116540908813477, + "rewards/margins": -1.1265767812728882, + "rewards/rejected": -15.989962577819824, + "step": 12470 + }, + { + "epoch": 0.42047254710303683, + "grad_norm": 24.130979537963867, + "learning_rate": 7.184474022433075e-07, + "logits/chosen": -1.2651400566101074, + "logits/rejected": -1.326103925704956, + "logps/chosen": -1.6987262964248657, + "logps/rejected": -1.8224769830703735, + "loss": 2.1328, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.987262725830078, + "rewards/margins": 1.237506628036499, + "rewards/rejected": -18.224769592285156, + "step": 12475 + }, + { + "epoch": 0.42064107317402, + "grad_norm": 15.125301361083984, + "learning_rate": 7.181827866950837e-07, + "logits/chosen": -1.270438313484192, + "logits/rejected": -1.2889509201049805, + "logps/chosen": -1.5895674228668213, + "logps/rejected": -1.6432710886001587, + "loss": 2.6452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.895675659179688, + "rewards/margins": 0.5370348691940308, + "rewards/rejected": -16.432708740234375, + "step": 12480 + }, + { + "epoch": 0.4208095992450032, + "grad_norm": 5.707220554351807, + "learning_rate": 7.179180956423933e-07, + "logits/chosen": -1.1088390350341797, + "logits/rejected": -1.3981083631515503, + "logps/chosen": -1.8700625896453857, + "logps/rejected": -2.3774728775024414, + "loss": 2.1257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.700626373291016, + "rewards/margins": 5.07410192489624, + "rewards/rejected": -23.774728775024414, + "step": 12485 + }, + { + "epoch": 0.4209781253159864, + "grad_norm": 17.590496063232422, + "learning_rate": 7.176533291768357e-07, + "logits/chosen": -1.1576659679412842, + "logits/rejected": -1.4298789501190186, + "logps/chosen": -2.152177333831787, + "logps/rejected": -2.259253740310669, + "loss": 2.457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.521774291992188, + "rewards/margins": 1.0707619190216064, + "rewards/rejected": -22.592538833618164, + "step": 12490 + }, + { + "epoch": 0.42114665138696955, + "grad_norm": 22.579177856445312, + "learning_rate": 7.173884873900362e-07, + "logits/chosen": -1.2096421718597412, + "logits/rejected": -1.372266173362732, + "logps/chosen": -2.0245296955108643, + "logps/rejected": -2.4928340911865234, + "loss": 3.0018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.245298385620117, + "rewards/margins": 4.683045387268066, + "rewards/rejected": -24.928342819213867, + "step": 12495 + }, + { + "epoch": 0.42131517745795277, + "grad_norm": 22.653228759765625, + "learning_rate": 7.171235703736458e-07, + "logits/chosen": -0.8634752035140991, + "logits/rejected": -1.0036604404449463, + "logps/chosen": -2.2977168560028076, + "logps/rejected": -2.3925795555114746, + "loss": 2.9089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.977169036865234, + "rewards/margins": 0.9486261606216431, + "rewards/rejected": -23.92579460144043, + "step": 12500 + }, + { + "epoch": 0.42148370352893594, + "grad_norm": 28.098493576049805, + "learning_rate": 7.16858578219342e-07, + "logits/chosen": -1.356195092201233, + "logits/rejected": -1.5597960948944092, + "logps/chosen": -2.096545457839966, + "logps/rejected": -2.229048252105713, + "loss": 2.6018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.965456008911133, + "rewards/margins": 1.3250248432159424, + "rewards/rejected": -22.29047966003418, + "step": 12505 + }, + { + "epoch": 0.4216522295999191, + "grad_norm": 19.7722225189209, + "learning_rate": 7.165935110188282e-07, + "logits/chosen": -1.291948676109314, + "logits/rejected": -1.2664096355438232, + "logps/chosen": -2.1727402210235596, + "logps/rejected": -2.754978656768799, + "loss": 2.2238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.727405548095703, + "rewards/margins": 5.822384834289551, + "rewards/rejected": -27.549789428710938, + "step": 12510 + }, + { + "epoch": 0.42182075567090227, + "grad_norm": 18.146760940551758, + "learning_rate": 7.163283688638338e-07, + "logits/chosen": -0.8999283909797668, + "logits/rejected": -0.8663008809089661, + "logps/chosen": -2.0590133666992188, + "logps/rejected": -2.1434433460235596, + "loss": 2.928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.590129852294922, + "rewards/margins": 0.84429931640625, + "rewards/rejected": -21.434432983398438, + "step": 12515 + }, + { + "epoch": 0.4219892817418855, + "grad_norm": 2.633085012435913, + "learning_rate": 7.160631518461138e-07, + "logits/chosen": -0.8085635900497437, + "logits/rejected": -1.2670907974243164, + "logps/chosen": -2.0810775756835938, + "logps/rejected": -2.7740936279296875, + "loss": 1.1083, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.810775756835938, + "rewards/margins": 6.930161952972412, + "rewards/rejected": -27.740936279296875, + "step": 12520 + }, + { + "epoch": 0.42215780781286866, + "grad_norm": 33.739681243896484, + "learning_rate": 7.157978600574494e-07, + "logits/chosen": -1.1513853073120117, + "logits/rejected": -1.0155327320098877, + "logps/chosen": -2.2184622287750244, + "logps/rejected": -2.5932672023773193, + "loss": 2.3999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.184621810913086, + "rewards/margins": 3.7480499744415283, + "rewards/rejected": -25.93267250061035, + "step": 12525 + }, + { + "epoch": 0.4223263338838518, + "grad_norm": 14.900190353393555, + "learning_rate": 7.155324935896481e-07, + "logits/chosen": -1.1839789152145386, + "logits/rejected": -1.4558827877044678, + "logps/chosen": -1.641060471534729, + "logps/rejected": -1.6600325107574463, + "loss": 3.9618, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.410602569580078, + "rewards/margins": 0.18971911072731018, + "rewards/rejected": -16.600324630737305, + "step": 12530 + }, + { + "epoch": 0.422494859954835, + "grad_norm": 63.875736236572266, + "learning_rate": 7.152670525345421e-07, + "logits/chosen": -1.1068434715270996, + "logits/rejected": -1.1477311849594116, + "logps/chosen": -2.444314956665039, + "logps/rejected": -2.6108994483947754, + "loss": 3.4869, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.44314956665039, + "rewards/margins": 1.6658456325531006, + "rewards/rejected": -26.108993530273438, + "step": 12535 + }, + { + "epoch": 0.4226633860258182, + "grad_norm": 21.8823184967041, + "learning_rate": 7.150015369839903e-07, + "logits/chosen": -0.818661093711853, + "logits/rejected": -1.0668702125549316, + "logps/chosen": -2.3808517456054688, + "logps/rejected": -2.7843680381774902, + "loss": 2.1808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.808517456054688, + "rewards/margins": 4.035162925720215, + "rewards/rejected": -27.843679428100586, + "step": 12540 + }, + { + "epoch": 0.4228319120968014, + "grad_norm": 27.321157455444336, + "learning_rate": 7.147359470298776e-07, + "logits/chosen": -1.0066759586334229, + "logits/rejected": -0.9979284405708313, + "logps/chosen": -2.0069892406463623, + "logps/rejected": -1.8785688877105713, + "loss": 4.3458, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.069894790649414, + "rewards/margins": -1.2842042446136475, + "rewards/rejected": -18.785690307617188, + "step": 12545 + }, + { + "epoch": 0.42300043816778454, + "grad_norm": 21.502607345581055, + "learning_rate": 7.144702827641136e-07, + "logits/chosen": -1.2351725101470947, + "logits/rejected": -1.4520246982574463, + "logps/chosen": -1.9867042303085327, + "logps/rejected": -2.3051655292510986, + "loss": 2.2522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.867042541503906, + "rewards/margins": 3.1846134662628174, + "rewards/rejected": -23.051654815673828, + "step": 12550 + }, + { + "epoch": 0.42316896423876776, + "grad_norm": 23.776119232177734, + "learning_rate": 7.142045442786346e-07, + "logits/chosen": -0.8708661794662476, + "logits/rejected": -1.0048704147338867, + "logps/chosen": -1.9848359823226929, + "logps/rejected": -2.0720088481903076, + "loss": 2.6712, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.84836196899414, + "rewards/margins": 0.8717293739318848, + "rewards/rejected": -20.720088958740234, + "step": 12555 + }, + { + "epoch": 0.42333749030975093, + "grad_norm": 20.22873878479004, + "learning_rate": 7.139387316654024e-07, + "logits/chosen": -1.050518274307251, + "logits/rejected": -1.279454231262207, + "logps/chosen": -2.533379077911377, + "logps/rejected": -2.4677693843841553, + "loss": 4.2271, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.333789825439453, + "rewards/margins": -0.6560948491096497, + "rewards/rejected": -24.67769432067871, + "step": 12560 + }, + { + "epoch": 0.4235060163807341, + "grad_norm": 19.31144905090332, + "learning_rate": 7.136728450164038e-07, + "logits/chosen": -0.9448343515396118, + "logits/rejected": -1.2927018404006958, + "logps/chosen": -2.0609188079833984, + "logps/rejected": -2.36385440826416, + "loss": 1.8349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.609188079833984, + "rewards/margins": 3.029357671737671, + "rewards/rejected": -23.638545989990234, + "step": 12565 + }, + { + "epoch": 0.42367454245171726, + "grad_norm": 22.986955642700195, + "learning_rate": 7.134068844236518e-07, + "logits/chosen": -1.3164246082305908, + "logits/rejected": -1.5228006839752197, + "logps/chosen": -2.031179428100586, + "logps/rejected": -2.2485601902008057, + "loss": 1.9017, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.31179428100586, + "rewards/margins": 2.1738078594207764, + "rewards/rejected": -22.4856014251709, + "step": 12570 + }, + { + "epoch": 0.4238430685227005, + "grad_norm": 21.826759338378906, + "learning_rate": 7.131408499791853e-07, + "logits/chosen": -1.7282383441925049, + "logits/rejected": -2.004629135131836, + "logps/chosen": -2.011059045791626, + "logps/rejected": -2.124830722808838, + "loss": 3.4721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.1105899810791, + "rewards/margins": 1.1377174854278564, + "rewards/rejected": -21.248306274414062, + "step": 12575 + }, + { + "epoch": 0.42401159459368365, + "grad_norm": 36.31961441040039, + "learning_rate": 7.128747417750678e-07, + "logits/chosen": -1.0672346353530884, + "logits/rejected": -1.3110134601593018, + "logps/chosen": -2.252652406692505, + "logps/rejected": -2.3968546390533447, + "loss": 1.9498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.526521682739258, + "rewards/margins": 1.4420230388641357, + "rewards/rejected": -23.96854591369629, + "step": 12580 + }, + { + "epoch": 0.4241801206646668, + "grad_norm": 45.022438049316406, + "learning_rate": 7.126085599033892e-07, + "logits/chosen": -0.8541949987411499, + "logits/rejected": -1.1512668132781982, + "logps/chosen": -2.3442893028259277, + "logps/rejected": -2.7394356727600098, + "loss": 2.5909, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.442893981933594, + "rewards/margins": 3.951462507247925, + "rewards/rejected": -27.39435386657715, + "step": 12585 + }, + { + "epoch": 0.42434864673565, + "grad_norm": 17.76199722290039, + "learning_rate": 7.123423044562644e-07, + "logits/chosen": -1.2358906269073486, + "logits/rejected": -1.426826000213623, + "logps/chosen": -1.9580609798431396, + "logps/rejected": -2.097620725631714, + "loss": 2.9607, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.580608367919922, + "rewards/margins": 1.395599365234375, + "rewards/rejected": -20.976207733154297, + "step": 12590 + }, + { + "epoch": 0.4245171728066332, + "grad_norm": 15.683243751525879, + "learning_rate": 7.12075975525834e-07, + "logits/chosen": -1.072770118713379, + "logits/rejected": -1.5230886936187744, + "logps/chosen": -2.1154682636260986, + "logps/rejected": -2.5087642669677734, + "loss": 1.9563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.154682159423828, + "rewards/margins": 3.9329605102539062, + "rewards/rejected": -25.0876407623291, + "step": 12595 + }, + { + "epoch": 0.42468569887761637, + "grad_norm": 25.16454315185547, + "learning_rate": 7.118095732042641e-07, + "logits/chosen": -0.9468148350715637, + "logits/rejected": -0.978870689868927, + "logps/chosen": -2.006333589553833, + "logps/rejected": -1.969488501548767, + "loss": 3.8357, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.063335418701172, + "rewards/margins": -0.36844903230667114, + "rewards/rejected": -19.69488525390625, + "step": 12600 + }, + { + "epoch": 0.42485422494859953, + "grad_norm": 15.496404647827148, + "learning_rate": 7.115430975837456e-07, + "logits/chosen": -1.466344952583313, + "logits/rejected": -1.5798033475875854, + "logps/chosen": -2.424255847930908, + "logps/rejected": -2.755340337753296, + "loss": 2.6929, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.242555618286133, + "rewards/margins": 3.310847520828247, + "rewards/rejected": -27.55340576171875, + "step": 12605 + }, + { + "epoch": 0.42502275101958276, + "grad_norm": 43.10670471191406, + "learning_rate": 7.112765487564957e-07, + "logits/chosen": -1.076716661453247, + "logits/rejected": -1.1707000732421875, + "logps/chosen": -2.2483749389648438, + "logps/rejected": -2.3862216472625732, + "loss": 3.0182, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.483749389648438, + "rewards/margins": 1.3784677982330322, + "rewards/rejected": -23.862218856811523, + "step": 12610 + }, + { + "epoch": 0.4251912770905659, + "grad_norm": 29.91777229309082, + "learning_rate": 7.110099268147562e-07, + "logits/chosen": -0.9753861427307129, + "logits/rejected": -1.0013278722763062, + "logps/chosen": -1.9687080383300781, + "logps/rejected": -1.981389045715332, + "loss": 3.097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.68707847595215, + "rewards/margins": 0.12681102752685547, + "rewards/rejected": -19.813888549804688, + "step": 12615 + }, + { + "epoch": 0.4253598031615491, + "grad_norm": 35.663963317871094, + "learning_rate": 7.107432318507943e-07, + "logits/chosen": -1.1238733530044556, + "logits/rejected": -1.0368965864181519, + "logps/chosen": -2.0092053413391113, + "logps/rejected": -1.9546020030975342, + "loss": 4.2135, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.092052459716797, + "rewards/margins": -0.5460325479507446, + "rewards/rejected": -19.5460205078125, + "step": 12620 + }, + { + "epoch": 0.42552832923253225, + "grad_norm": 29.770458221435547, + "learning_rate": 7.10476463956903e-07, + "logits/chosen": -1.1682617664337158, + "logits/rejected": -1.2263028621673584, + "logps/chosen": -2.1611130237579346, + "logps/rejected": -2.257652997970581, + "loss": 2.6198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.61113166809082, + "rewards/margins": 0.9653974771499634, + "rewards/rejected": -22.576528549194336, + "step": 12625 + }, + { + "epoch": 0.4256968553035155, + "grad_norm": 29.46103858947754, + "learning_rate": 7.102096232253999e-07, + "logits/chosen": -1.0423948764801025, + "logits/rejected": -1.1089481115341187, + "logps/chosen": -2.0582234859466553, + "logps/rejected": -2.3573241233825684, + "loss": 1.906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.582233428955078, + "rewards/margins": 2.991006374359131, + "rewards/rejected": -23.5732421875, + "step": 12630 + }, + { + "epoch": 0.42586538137449864, + "grad_norm": 17.78944206237793, + "learning_rate": 7.099427097486283e-07, + "logits/chosen": -0.9830500483512878, + "logits/rejected": -1.443943738937378, + "logps/chosen": -2.2791683673858643, + "logps/rejected": -2.5998737812042236, + "loss": 2.1672, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.791683197021484, + "rewards/margins": 3.2070529460906982, + "rewards/rejected": -25.998737335205078, + "step": 12635 + }, + { + "epoch": 0.4260339074454818, + "grad_norm": 34.38286590576172, + "learning_rate": 7.09675723618956e-07, + "logits/chosen": -1.4862444400787354, + "logits/rejected": -1.2831571102142334, + "logps/chosen": -2.3077995777130127, + "logps/rejected": -2.3144662380218506, + "loss": 4.1222, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.07799530029297, + "rewards/margins": 0.06666803359985352, + "rewards/rejected": -23.144662857055664, + "step": 12640 + }, + { + "epoch": 0.426202433516465, + "grad_norm": 0.0135088711977005, + "learning_rate": 7.094086649287768e-07, + "logits/chosen": -1.286271333694458, + "logits/rejected": -1.4231679439544678, + "logps/chosen": -1.9492824077606201, + "logps/rejected": -2.2704503536224365, + "loss": 2.3507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.49282455444336, + "rewards/margins": 3.211678981781006, + "rewards/rejected": -22.70450210571289, + "step": 12645 + }, + { + "epoch": 0.4263709595874482, + "grad_norm": 20.299243927001953, + "learning_rate": 7.09141533770509e-07, + "logits/chosen": -1.2199132442474365, + "logits/rejected": -1.5089060068130493, + "logps/chosen": -1.748953104019165, + "logps/rejected": -1.8105674982070923, + "loss": 2.633, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.489532470703125, + "rewards/margins": 0.6161444783210754, + "rewards/rejected": -18.105676651000977, + "step": 12650 + }, + { + "epoch": 0.42653948565843136, + "grad_norm": 37.17742919921875, + "learning_rate": 7.088743302365963e-07, + "logits/chosen": -1.1301769018173218, + "logits/rejected": -1.4917234182357788, + "logps/chosen": -2.305114269256592, + "logps/rejected": -2.4861512184143066, + "loss": 3.5311, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.05113983154297, + "rewards/margins": 1.8103729486465454, + "rewards/rejected": -24.861515045166016, + "step": 12655 + }, + { + "epoch": 0.4267080117294145, + "grad_norm": 36.21525192260742, + "learning_rate": 7.086070544195071e-07, + "logits/chosen": -1.2566407918930054, + "logits/rejected": -1.1735836267471313, + "logps/chosen": -2.330984592437744, + "logps/rejected": -2.2364494800567627, + "loss": 4.3382, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.309844970703125, + "rewards/margins": -0.9453502893447876, + "rewards/rejected": -22.3644962310791, + "step": 12660 + }, + { + "epoch": 0.42687653780039775, + "grad_norm": 40.654571533203125, + "learning_rate": 7.083397064117351e-07, + "logits/chosen": -1.0398657321929932, + "logits/rejected": -1.0622133016586304, + "logps/chosen": -2.4192018508911133, + "logps/rejected": -2.477231502532959, + "loss": 2.9533, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.1920166015625, + "rewards/margins": 0.5802987813949585, + "rewards/rejected": -24.77231788635254, + "step": 12665 + }, + { + "epoch": 0.4270450638713809, + "grad_norm": 28.800556182861328, + "learning_rate": 7.080722863057992e-07, + "logits/chosen": -1.5768417119979858, + "logits/rejected": -1.5328872203826904, + "logps/chosen": -1.8163115978240967, + "logps/rejected": -1.9694408178329468, + "loss": 2.5645, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.163116455078125, + "rewards/margins": 1.53129243850708, + "rewards/rejected": -19.694408416748047, + "step": 12670 + }, + { + "epoch": 0.4272135899423641, + "grad_norm": 18.046127319335938, + "learning_rate": 7.078047941942426e-07, + "logits/chosen": -0.7132788896560669, + "logits/rejected": -0.9055215716362, + "logps/chosen": -2.2301299571990967, + "logps/rejected": -2.510833740234375, + "loss": 2.5123, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.30130386352539, + "rewards/margins": 2.8070342540740967, + "rewards/rejected": -25.108333587646484, + "step": 12675 + }, + { + "epoch": 0.42738211601334725, + "grad_norm": 33.88047409057617, + "learning_rate": 7.075372301696339e-07, + "logits/chosen": -0.7558620572090149, + "logits/rejected": -0.7717004418373108, + "logps/chosen": -2.054013967514038, + "logps/rejected": -2.0608019828796387, + "loss": 3.3235, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.54014015197754, + "rewards/margins": 0.06788072735071182, + "rewards/rejected": -20.608020782470703, + "step": 12680 + }, + { + "epoch": 0.42755064208433047, + "grad_norm": 20.51235580444336, + "learning_rate": 7.072695943245664e-07, + "logits/chosen": -0.8837175369262695, + "logits/rejected": -1.0166467428207397, + "logps/chosen": -2.6695451736450195, + "logps/rejected": -2.8811819553375244, + "loss": 3.5117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.695453643798828, + "rewards/margins": 2.116365432739258, + "rewards/rejected": -28.811819076538086, + "step": 12685 + }, + { + "epoch": 0.42771916815531363, + "grad_norm": 43.31715393066406, + "learning_rate": 7.070018867516585e-07, + "logits/chosen": -1.3836842775344849, + "logits/rejected": -1.291332721710205, + "logps/chosen": -2.0165090560913086, + "logps/rejected": -2.2391884326934814, + "loss": 2.756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.165088653564453, + "rewards/margins": 2.226795196533203, + "rewards/rejected": -22.391883850097656, + "step": 12690 + }, + { + "epoch": 0.4278876942262968, + "grad_norm": 22.55230140686035, + "learning_rate": 7.067341075435531e-07, + "logits/chosen": -0.9342721700668335, + "logits/rejected": -1.154515027999878, + "logps/chosen": -1.9569528102874756, + "logps/rejected": -2.1937878131866455, + "loss": 2.1746, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.569528579711914, + "rewards/margins": 2.3683505058288574, + "rewards/rejected": -21.937877655029297, + "step": 12695 + }, + { + "epoch": 0.42805622029727997, + "grad_norm": 26.59225845336914, + "learning_rate": 7.06466256792918e-07, + "logits/chosen": -1.6623830795288086, + "logits/rejected": -1.609900712966919, + "logps/chosen": -2.140404462814331, + "logps/rejected": -2.286125659942627, + "loss": 2.6497, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.40404510498047, + "rewards/margins": 1.4572112560272217, + "rewards/rejected": -22.861255645751953, + "step": 12700 + }, + { + "epoch": 0.4282247463682632, + "grad_norm": 18.7648868560791, + "learning_rate": 7.061983345924462e-07, + "logits/chosen": -1.6123731136322021, + "logits/rejected": -1.5295023918151855, + "logps/chosen": -2.2385873794555664, + "logps/rejected": -2.4097483158111572, + "loss": 2.9512, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.385873794555664, + "rewards/margins": 1.7116081714630127, + "rewards/rejected": -24.097482681274414, + "step": 12705 + }, + { + "epoch": 0.42839327243924635, + "grad_norm": 27.55744171142578, + "learning_rate": 7.059303410348544e-07, + "logits/chosen": -1.27418053150177, + "logits/rejected": -1.2935948371887207, + "logps/chosen": -2.2106258869171143, + "logps/rejected": -2.7528538703918457, + "loss": 3.1742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.106258392333984, + "rewards/margins": 5.4222822189331055, + "rewards/rejected": -27.528539657592773, + "step": 12710 + }, + { + "epoch": 0.4285617985102295, + "grad_norm": 149.60943603515625, + "learning_rate": 7.05662276212885e-07, + "logits/chosen": -1.3029506206512451, + "logits/rejected": -1.5379345417022705, + "logps/chosen": -2.348891019821167, + "logps/rejected": -2.646965503692627, + "loss": 3.6559, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.488908767700195, + "rewards/margins": 2.980748414993286, + "rewards/rejected": -26.469655990600586, + "step": 12715 + }, + { + "epoch": 0.42873032458121274, + "grad_norm": 84.20378112792969, + "learning_rate": 7.053941402193044e-07, + "logits/chosen": -0.8661912679672241, + "logits/rejected": -0.9065690040588379, + "logps/chosen": -1.9864723682403564, + "logps/rejected": -1.9592368602752686, + "loss": 3.4597, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.86472511291504, + "rewards/margins": -0.2723536491394043, + "rewards/rejected": -19.592369079589844, + "step": 12720 + }, + { + "epoch": 0.4288988506521959, + "grad_norm": 26.458824157714844, + "learning_rate": 7.051259331469044e-07, + "logits/chosen": -1.0588595867156982, + "logits/rejected": -1.0328760147094727, + "logps/chosen": -2.196181535720825, + "logps/rejected": -2.3460662364959717, + "loss": 3.5815, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.961816787719727, + "rewards/margins": 1.4988477230072021, + "rewards/rejected": -23.460662841796875, + "step": 12725 + }, + { + "epoch": 0.4290673767231791, + "grad_norm": 17.77597999572754, + "learning_rate": 7.048576550885004e-07, + "logits/chosen": -1.0335174798965454, + "logits/rejected": -1.2250608205795288, + "logps/chosen": -2.217999219894409, + "logps/rejected": -2.653132200241089, + "loss": 1.2628, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.179994583129883, + "rewards/margins": 4.351326942443848, + "rewards/rejected": -26.531322479248047, + "step": 12730 + }, + { + "epoch": 0.42923590279416224, + "grad_norm": 23.028162002563477, + "learning_rate": 7.04589306136933e-07, + "logits/chosen": -1.1175248622894287, + "logits/rejected": -1.1833066940307617, + "logps/chosen": -1.847046136856079, + "logps/rejected": -2.0023093223571777, + "loss": 2.6907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.470462799072266, + "rewards/margins": 1.5526316165924072, + "rewards/rejected": -20.02309226989746, + "step": 12735 + }, + { + "epoch": 0.42940442886514546, + "grad_norm": 27.370683670043945, + "learning_rate": 7.043208863850672e-07, + "logits/chosen": -0.9457284808158875, + "logits/rejected": -1.0987629890441895, + "logps/chosen": -1.9186127185821533, + "logps/rejected": -2.0391685962677, + "loss": 2.2472, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.186124801635742, + "rewards/margins": 1.2055596113204956, + "rewards/rejected": -20.39168357849121, + "step": 12740 + }, + { + "epoch": 0.4295729549361286, + "grad_norm": 20.669946670532227, + "learning_rate": 7.040523959257927e-07, + "logits/chosen": -1.6514732837677002, + "logits/rejected": -1.6349937915802002, + "logps/chosen": -1.930381417274475, + "logps/rejected": -1.892491340637207, + "loss": 3.7019, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.303813934326172, + "rewards/margins": -0.3788990080356598, + "rewards/rejected": -18.924915313720703, + "step": 12745 + }, + { + "epoch": 0.4297414810071118, + "grad_norm": 21.375263214111328, + "learning_rate": 7.037838348520233e-07, + "logits/chosen": -1.4665489196777344, + "logits/rejected": -1.5651066303253174, + "logps/chosen": -1.99209725856781, + "logps/rejected": -2.207446813583374, + "loss": 1.8538, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.920970916748047, + "rewards/margins": 2.1534957885742188, + "rewards/rejected": -22.0744686126709, + "step": 12750 + }, + { + "epoch": 0.42991000707809496, + "grad_norm": 15.512998580932617, + "learning_rate": 7.035152032566973e-07, + "logits/chosen": -1.1759886741638184, + "logits/rejected": -1.4427909851074219, + "logps/chosen": -2.1886658668518066, + "logps/rejected": -2.348357677459717, + "loss": 1.8191, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.88665771484375, + "rewards/margins": 1.5969184637069702, + "rewards/rejected": -23.48357582092285, + "step": 12755 + }, + { + "epoch": 0.4300785331490782, + "grad_norm": 35.07164001464844, + "learning_rate": 7.032465012327777e-07, + "logits/chosen": -1.2698971033096313, + "logits/rejected": -1.5194822549819946, + "logps/chosen": -2.5975723266601562, + "logps/rejected": -2.8108088970184326, + "loss": 2.5388, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.975723266601562, + "rewards/margins": 2.132366895675659, + "rewards/rejected": -28.10808753967285, + "step": 12760 + }, + { + "epoch": 0.43024705922006135, + "grad_norm": 36.83730697631836, + "learning_rate": 7.029777288732516e-07, + "logits/chosen": -0.8789815902709961, + "logits/rejected": -1.0874645709991455, + "logps/chosen": -1.9733607769012451, + "logps/rejected": -2.3463854789733887, + "loss": 2.2044, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.73360824584961, + "rewards/margins": 3.730245590209961, + "rewards/rejected": -23.46385383605957, + "step": 12765 + }, + { + "epoch": 0.4304155852910445, + "grad_norm": 13.495026588439941, + "learning_rate": 7.027088862711305e-07, + "logits/chosen": -0.9406919479370117, + "logits/rejected": -1.2175328731536865, + "logps/chosen": -2.1586251258850098, + "logps/rejected": -2.4604978561401367, + "loss": 2.6017, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.586252212524414, + "rewards/margins": 3.0187273025512695, + "rewards/rejected": -24.60498046875, + "step": 12770 + }, + { + "epoch": 0.43058411136202773, + "grad_norm": 24.654760360717773, + "learning_rate": 7.024399735194503e-07, + "logits/chosen": -1.667497992515564, + "logits/rejected": -1.6784369945526123, + "logps/chosen": -2.176253080368042, + "logps/rejected": -2.3713762760162354, + "loss": 2.694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.762531280517578, + "rewards/margins": 1.9512317180633545, + "rewards/rejected": -23.713764190673828, + "step": 12775 + }, + { + "epoch": 0.4307526374330109, + "grad_norm": 26.69776153564453, + "learning_rate": 7.021709907112711e-07, + "logits/chosen": -1.6615867614746094, + "logits/rejected": -1.6004226207733154, + "logps/chosen": -1.9908841848373413, + "logps/rejected": -2.079436779022217, + "loss": 3.5278, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.90884017944336, + "rewards/margins": 0.8855279684066772, + "rewards/rejected": -20.794368743896484, + "step": 12780 + }, + { + "epoch": 0.43092116350399406, + "grad_norm": 23.461130142211914, + "learning_rate": 7.019019379396772e-07, + "logits/chosen": -1.2154319286346436, + "logits/rejected": -1.3082120418548584, + "logps/chosen": -2.370018482208252, + "logps/rejected": -2.3778629302978516, + "loss": 3.5654, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.7001895904541, + "rewards/margins": 0.07844285666942596, + "rewards/rejected": -23.77863121032715, + "step": 12785 + }, + { + "epoch": 0.43108968957497723, + "grad_norm": 6.32010505796643e-06, + "learning_rate": 7.016328152977773e-07, + "logits/chosen": -1.21359384059906, + "logits/rejected": -1.5789474248886108, + "logps/chosen": -2.312983989715576, + "logps/rejected": -2.9869911670684814, + "loss": 1.6659, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.129838943481445, + "rewards/margins": 6.74007511138916, + "rewards/rejected": -29.869909286499023, + "step": 12790 + }, + { + "epoch": 0.43125821564596045, + "grad_norm": 23.659255981445312, + "learning_rate": 7.01363622878704e-07, + "logits/chosen": -0.9882528185844421, + "logits/rejected": -1.1198049783706665, + "logps/chosen": -1.950643539428711, + "logps/rejected": -1.860714316368103, + "loss": 4.0003, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.50643539428711, + "rewards/margins": -0.8992928266525269, + "rewards/rejected": -18.60714340209961, + "step": 12795 + }, + { + "epoch": 0.4314267417169436, + "grad_norm": 29.752731323242188, + "learning_rate": 7.010943607756142e-07, + "logits/chosen": -0.6915519833564758, + "logits/rejected": -0.7919496297836304, + "logps/chosen": -2.217557430267334, + "logps/rejected": -2.159320831298828, + "loss": 4.1181, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.175573348999023, + "rewards/margins": -0.5823682546615601, + "rewards/rejected": -21.593204498291016, + "step": 12800 + }, + { + "epoch": 0.4314267417169436, + "eval_logits/chosen": -1.581845760345459, + "eval_logits/rejected": -1.6885778903961182, + "eval_logps/chosen": -1.9620436429977417, + "eval_logps/rejected": -2.061039447784424, + "eval_loss": 3.008579969406128, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -19.62043571472168, + "eval_rewards/margins": 0.9899570345878601, + "eval_rewards/rejected": -20.610393524169922, + "eval_runtime": 12.918, + "eval_samples_per_second": 7.741, + "eval_steps_per_second": 1.935, + "step": 12800 + }, + { + "epoch": 0.4315952677879268, + "grad_norm": 38.067996978759766, + "learning_rate": 7.008250290816888e-07, + "logits/chosen": -1.0375231504440308, + "logits/rejected": -1.0609022378921509, + "logps/chosen": -2.1347129344940186, + "logps/rejected": -2.05728816986084, + "loss": 4.267, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.34712791442871, + "rewards/margins": -0.774248480796814, + "rewards/rejected": -20.5728816986084, + "step": 12805 + }, + { + "epoch": 0.43176379385890995, + "grad_norm": 59.478187561035156, + "learning_rate": 7.005556278901334e-07, + "logits/chosen": -1.0637800693511963, + "logits/rejected": -1.1035311222076416, + "logps/chosen": -2.100837469100952, + "logps/rejected": -2.370177745819092, + "loss": 2.2027, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.00837516784668, + "rewards/margins": 2.6934049129486084, + "rewards/rejected": -23.701780319213867, + "step": 12810 + }, + { + "epoch": 0.43193231992989317, + "grad_norm": 30.48736572265625, + "learning_rate": 7.002861572941764e-07, + "logits/chosen": -1.3173472881317139, + "logits/rejected": -1.440666913986206, + "logps/chosen": -1.78774094581604, + "logps/rejected": -1.9231376647949219, + "loss": 3.0321, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.87740707397461, + "rewards/margins": 1.3539693355560303, + "rewards/rejected": -19.23137664794922, + "step": 12815 + }, + { + "epoch": 0.43210084600087634, + "grad_norm": 21.779550552368164, + "learning_rate": 7.000166173870715e-07, + "logits/chosen": -1.1645904779434204, + "logits/rejected": -0.9815500974655151, + "logps/chosen": -2.5551183223724365, + "logps/rejected": -2.453338861465454, + "loss": 4.3826, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.551183700561523, + "rewards/margins": -1.017795443534851, + "rewards/rejected": -24.53338623046875, + "step": 12820 + }, + { + "epoch": 0.4322693720718595, + "grad_norm": 140.74278259277344, + "learning_rate": 6.997470082620955e-07, + "logits/chosen": -1.717813491821289, + "logits/rejected": -1.7180192470550537, + "logps/chosen": -2.6433820724487305, + "logps/rejected": -2.638429641723633, + "loss": 4.0287, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.433818817138672, + "rewards/margins": -0.04952258989214897, + "rewards/rejected": -26.384296417236328, + "step": 12825 + }, + { + "epoch": 0.4324378981428427, + "grad_norm": 59.987083435058594, + "learning_rate": 6.994773300125498e-07, + "logits/chosen": -0.9183546304702759, + "logits/rejected": -0.7480214834213257, + "logps/chosen": -3.556931257247925, + "logps/rejected": -3.59765625, + "loss": 6.2959, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -35.569313049316406, + "rewards/margins": 0.40724772214889526, + "rewards/rejected": -35.976558685302734, + "step": 12830 + }, + { + "epoch": 0.4326064242138259, + "grad_norm": 16.872873306274414, + "learning_rate": 6.992075827317593e-07, + "logits/chosen": -1.4935461282730103, + "logits/rejected": -1.6690353155136108, + "logps/chosen": -2.410545825958252, + "logps/rejected": -2.426908493041992, + "loss": 4.5857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.105457305908203, + "rewards/margins": 0.16362905502319336, + "rewards/rejected": -24.269084930419922, + "step": 12835 + }, + { + "epoch": 0.43277495028480906, + "grad_norm": 16.417036056518555, + "learning_rate": 6.989377665130727e-07, + "logits/chosen": -1.4650869369506836, + "logits/rejected": -1.4241197109222412, + "logps/chosen": -2.010990619659424, + "logps/rejected": -2.867649555206299, + "loss": 2.4715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.109905242919922, + "rewards/margins": 8.566590309143066, + "rewards/rejected": -28.676494598388672, + "step": 12840 + }, + { + "epoch": 0.4329434763557922, + "grad_norm": 151.86782836914062, + "learning_rate": 6.986678814498633e-07, + "logits/chosen": -0.7954455614089966, + "logits/rejected": -0.8803497552871704, + "logps/chosen": -2.5972073078155518, + "logps/rejected": -2.9544386863708496, + "loss": 1.3978, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.972070693969727, + "rewards/margins": 3.5723164081573486, + "rewards/rejected": -29.544384002685547, + "step": 12845 + }, + { + "epoch": 0.43311200242677544, + "grad_norm": 24.465486526489258, + "learning_rate": 6.98397927635527e-07, + "logits/chosen": -1.0608322620391846, + "logits/rejected": -1.2614036798477173, + "logps/chosen": -2.178588390350342, + "logps/rejected": -2.1134209632873535, + "loss": 3.7257, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.7858829498291, + "rewards/margins": -0.6516709327697754, + "rewards/rejected": -21.13421058654785, + "step": 12850 + }, + { + "epoch": 0.4332805284977586, + "grad_norm": 71.80946350097656, + "learning_rate": 6.981279051634845e-07, + "logits/chosen": -1.386091947555542, + "logits/rejected": -1.5041242837905884, + "logps/chosen": -2.2846813201904297, + "logps/rejected": -2.337955951690674, + "loss": 3.1754, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.846813201904297, + "rewards/margins": 0.5327471494674683, + "rewards/rejected": -23.379558563232422, + "step": 12855 + }, + { + "epoch": 0.4334490545687418, + "grad_norm": 19.060367584228516, + "learning_rate": 6.978578141271802e-07, + "logits/chosen": -1.181014895439148, + "logits/rejected": -1.188372254371643, + "logps/chosen": -2.3186752796173096, + "logps/rejected": -2.3536510467529297, + "loss": 3.1496, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.186752319335938, + "rewards/margins": 0.34975796937942505, + "rewards/rejected": -23.536510467529297, + "step": 12860 + }, + { + "epoch": 0.43361758063972494, + "grad_norm": 22.087045669555664, + "learning_rate": 6.975876546200815e-07, + "logits/chosen": -1.2001193761825562, + "logits/rejected": -1.1703085899353027, + "logps/chosen": -1.8144724369049072, + "logps/rejected": -1.942793607711792, + "loss": 3.0515, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.144723892211914, + "rewards/margins": 1.2832123041152954, + "rewards/rejected": -19.427936553955078, + "step": 12865 + }, + { + "epoch": 0.43378610671070816, + "grad_norm": 30.62040901184082, + "learning_rate": 6.973174267356804e-07, + "logits/chosen": -1.490786075592041, + "logits/rejected": -1.7894165515899658, + "logps/chosen": -2.078627109527588, + "logps/rejected": -2.2299420833587646, + "loss": 2.8385, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.786270141601562, + "rewards/margins": 1.51315176486969, + "rewards/rejected": -22.299423217773438, + "step": 12870 + }, + { + "epoch": 0.43395463278169133, + "grad_norm": 30.947246551513672, + "learning_rate": 6.970471305674917e-07, + "logits/chosen": -0.5229056477546692, + "logits/rejected": -0.6939235925674438, + "logps/chosen": -1.9918756484985352, + "logps/rejected": -2.3143186569213867, + "loss": 1.3739, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.91875457763672, + "rewards/margins": 3.224431276321411, + "rewards/rejected": -23.143184661865234, + "step": 12875 + }, + { + "epoch": 0.4341231588526745, + "grad_norm": 47.894962310791016, + "learning_rate": 6.967767662090546e-07, + "logits/chosen": -0.884885311126709, + "logits/rejected": -0.9361233711242676, + "logps/chosen": -2.259148359298706, + "logps/rejected": -2.435331344604492, + "loss": 2.0028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.59148597717285, + "rewards/margins": 1.7618297338485718, + "rewards/rejected": -24.353313446044922, + "step": 12880 + }, + { + "epoch": 0.4342916849236577, + "grad_norm": 30.30147361755371, + "learning_rate": 6.965063337539312e-07, + "logits/chosen": -1.1772197484970093, + "logits/rejected": -1.2234853506088257, + "logps/chosen": -1.6218032836914062, + "logps/rejected": -2.1243720054626465, + "loss": 1.6635, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.218032836914062, + "rewards/margins": 5.025688171386719, + "rewards/rejected": -21.24372100830078, + "step": 12885 + }, + { + "epoch": 0.4344602109946409, + "grad_norm": 32.74153137207031, + "learning_rate": 6.962358332957078e-07, + "logits/chosen": -0.8374770879745483, + "logits/rejected": -0.9318382143974304, + "logps/chosen": -2.2800254821777344, + "logps/rejected": -2.363006591796875, + "loss": 2.8854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.800256729125977, + "rewards/margins": 0.829812228679657, + "rewards/rejected": -23.630067825317383, + "step": 12890 + }, + { + "epoch": 0.43462873706562405, + "grad_norm": 17.24728775024414, + "learning_rate": 6.959652649279941e-07, + "logits/chosen": -1.3443167209625244, + "logits/rejected": -1.3858791589736938, + "logps/chosen": -2.0147061347961426, + "logps/rejected": -2.057424306869507, + "loss": 3.3514, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.147062301635742, + "rewards/margins": 0.4271821081638336, + "rewards/rejected": -20.574243545532227, + "step": 12895 + }, + { + "epoch": 0.4347972631366072, + "grad_norm": 126.15062713623047, + "learning_rate": 6.956946287444227e-07, + "logits/chosen": -1.0633189678192139, + "logits/rejected": -1.052336573600769, + "logps/chosen": -2.6452765464782715, + "logps/rejected": -2.6808838844299316, + "loss": 3.7942, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.4527645111084, + "rewards/margins": 0.3560709059238434, + "rewards/rejected": -26.808834075927734, + "step": 12900 + }, + { + "epoch": 0.43496578920759044, + "grad_norm": 19.88990020751953, + "learning_rate": 6.954239248386504e-07, + "logits/chosen": -1.5321104526519775, + "logits/rejected": -1.4127600193023682, + "logps/chosen": -1.8655850887298584, + "logps/rejected": -2.0147814750671387, + "loss": 2.5387, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.65584945678711, + "rewards/margins": 1.4919636249542236, + "rewards/rejected": -20.147815704345703, + "step": 12905 + }, + { + "epoch": 0.4351343152785736, + "grad_norm": 84.38320922851562, + "learning_rate": 6.951531533043572e-07, + "logits/chosen": -0.6111572980880737, + "logits/rejected": -0.744185745716095, + "logps/chosen": -2.453381061553955, + "logps/rejected": -3.282031536102295, + "loss": 1.5609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.533809661865234, + "rewards/margins": 8.286506652832031, + "rewards/rejected": -32.820316314697266, + "step": 12910 + }, + { + "epoch": 0.43530284134955677, + "grad_norm": 53.10908889770508, + "learning_rate": 6.948823142352463e-07, + "logits/chosen": -1.4643628597259521, + "logits/rejected": -1.636060118675232, + "logps/chosen": -1.7974869012832642, + "logps/rejected": -1.7600347995758057, + "loss": 4.2903, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -17.974870681762695, + "rewards/margins": -0.3745214343070984, + "rewards/rejected": -17.6003475189209, + "step": 12915 + }, + { + "epoch": 0.43547136742053993, + "grad_norm": 28.004077911376953, + "learning_rate": 6.946114077250445e-07, + "logits/chosen": -1.2066113948822021, + "logits/rejected": -1.1940003633499146, + "logps/chosen": -2.1136012077331543, + "logps/rejected": -2.303692579269409, + "loss": 2.6832, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.13601303100586, + "rewards/margins": 1.9009119272232056, + "rewards/rejected": -23.03692626953125, + "step": 12920 + }, + { + "epoch": 0.43563989349152316, + "grad_norm": 26.950162887573242, + "learning_rate": 6.943404338675018e-07, + "logits/chosen": -0.9007886648178101, + "logits/rejected": -0.9533143043518066, + "logps/chosen": -2.1714673042297363, + "logps/rejected": -2.1655383110046387, + "loss": 3.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.714672088623047, + "rewards/margins": -0.05929117277264595, + "rewards/rejected": -21.655384063720703, + "step": 12925 + }, + { + "epoch": 0.4358084195625063, + "grad_norm": 25.70941925048828, + "learning_rate": 6.940693927563918e-07, + "logits/chosen": -1.064649224281311, + "logits/rejected": -1.015718698501587, + "logps/chosen": -2.6266140937805176, + "logps/rejected": -2.615485668182373, + "loss": 4.4821, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.26613998413086, + "rewards/margins": -0.11128310859203339, + "rewards/rejected": -26.154857635498047, + "step": 12930 + }, + { + "epoch": 0.4359769456334895, + "grad_norm": 26.513484954833984, + "learning_rate": 6.937982844855109e-07, + "logits/chosen": -0.5383487343788147, + "logits/rejected": -0.8462270498275757, + "logps/chosen": -2.0189685821533203, + "logps/rejected": -2.3757314682006836, + "loss": 2.4246, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.189685821533203, + "rewards/margins": 3.5676300525665283, + "rewards/rejected": -23.757314682006836, + "step": 12935 + }, + { + "epoch": 0.4361454717044727, + "grad_norm": 20.81569480895996, + "learning_rate": 6.935271091486791e-07, + "logits/chosen": -1.6532917022705078, + "logits/rejected": -1.7369930744171143, + "logps/chosen": -2.276045560836792, + "logps/rejected": -2.694082736968994, + "loss": 2.3589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.760456085205078, + "rewards/margins": 4.180371284484863, + "rewards/rejected": -26.940826416015625, + "step": 12940 + }, + { + "epoch": 0.4363139977754559, + "grad_norm": 26.31812286376953, + "learning_rate": 6.932558668397395e-07, + "logits/chosen": -1.0625091791152954, + "logits/rejected": -1.0464208126068115, + "logps/chosen": -1.7008140087127686, + "logps/rejected": -1.7959041595458984, + "loss": 2.5549, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.008142471313477, + "rewards/margins": 0.9508990049362183, + "rewards/rejected": -17.959041595458984, + "step": 12945 + }, + { + "epoch": 0.43648252384643904, + "grad_norm": 38.73316192626953, + "learning_rate": 6.929845576525584e-07, + "logits/chosen": -1.0894930362701416, + "logits/rejected": -1.0746827125549316, + "logps/chosen": -2.072542190551758, + "logps/rejected": -2.346071720123291, + "loss": 1.8593, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.725421905517578, + "rewards/margins": 2.7352960109710693, + "rewards/rejected": -23.460718154907227, + "step": 12950 + }, + { + "epoch": 0.4366510499174222, + "grad_norm": 64.09503173828125, + "learning_rate": 6.927131816810251e-07, + "logits/chosen": -1.3247054815292358, + "logits/rejected": -1.6320841312408447, + "logps/chosen": -2.17695951461792, + "logps/rejected": -2.305966377258301, + "loss": 3.0175, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.769596099853516, + "rewards/margins": 1.2900705337524414, + "rewards/rejected": -23.05966567993164, + "step": 12955 + }, + { + "epoch": 0.43681957598840543, + "grad_norm": 30.350339889526367, + "learning_rate": 6.924417390190522e-07, + "logits/chosen": -0.9131641387939453, + "logits/rejected": -0.9161790013313293, + "logps/chosen": -2.3749337196350098, + "logps/rejected": -2.497476100921631, + "loss": 3.0965, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.749340057373047, + "rewards/margins": 1.22542142868042, + "rewards/rejected": -24.974760055541992, + "step": 12960 + }, + { + "epoch": 0.4369881020593886, + "grad_norm": 25.28206443786621, + "learning_rate": 6.921702297605755e-07, + "logits/chosen": -0.969762921333313, + "logits/rejected": -1.2061833143234253, + "logps/chosen": -1.714321494102478, + "logps/rejected": -2.0063693523406982, + "loss": 1.6666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.14321517944336, + "rewards/margins": 2.9204788208007812, + "rewards/rejected": -20.06369400024414, + "step": 12965 + }, + { + "epoch": 0.43715662813037176, + "grad_norm": 88.055419921875, + "learning_rate": 6.918986539995533e-07, + "logits/chosen": -0.8369825482368469, + "logits/rejected": -0.7208142876625061, + "logps/chosen": -2.2105679512023926, + "logps/rejected": -2.1658036708831787, + "loss": 4.0844, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.10567855834961, + "rewards/margins": -0.4476422667503357, + "rewards/rejected": -21.658037185668945, + "step": 12970 + }, + { + "epoch": 0.4373251542013549, + "grad_norm": 27.194414138793945, + "learning_rate": 6.916270118299677e-07, + "logits/chosen": -1.2619565725326538, + "logits/rejected": -1.45753014087677, + "logps/chosen": -1.8999649286270142, + "logps/rejected": -1.9333438873291016, + "loss": 3.1134, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.999652862548828, + "rewards/margins": 0.3337882161140442, + "rewards/rejected": -19.333438873291016, + "step": 12975 + }, + { + "epoch": 0.43749368027233815, + "grad_norm": 17.010786056518555, + "learning_rate": 6.913553033458228e-07, + "logits/chosen": -1.2112401723861694, + "logits/rejected": -1.004233717918396, + "logps/chosen": -1.9769935607910156, + "logps/rejected": -1.8977857828140259, + "loss": 4.0243, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.769933700561523, + "rewards/margins": -0.7920783758163452, + "rewards/rejected": -18.977855682373047, + "step": 12980 + }, + { + "epoch": 0.4376622063433213, + "grad_norm": 41.05767822265625, + "learning_rate": 6.910835286411466e-07, + "logits/chosen": -1.159142255783081, + "logits/rejected": -1.3187485933303833, + "logps/chosen": -2.0043880939483643, + "logps/rejected": -2.213470220565796, + "loss": 2.8533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.043880462646484, + "rewards/margins": 2.090820789337158, + "rewards/rejected": -22.134702682495117, + "step": 12985 + }, + { + "epoch": 0.4378307324143045, + "grad_norm": 42.45212936401367, + "learning_rate": 6.908116878099894e-07, + "logits/chosen": -0.8741549253463745, + "logits/rejected": -0.9256917238235474, + "logps/chosen": -1.7342628240585327, + "logps/rejected": -1.8204578161239624, + "loss": 2.5843, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.34263038635254, + "rewards/margins": 0.8619493246078491, + "rewards/rejected": -18.204578399658203, + "step": 12990 + }, + { + "epoch": 0.4379992584852877, + "grad_norm": 25.65916633605957, + "learning_rate": 6.90539780946425e-07, + "logits/chosen": -1.1083229780197144, + "logits/rejected": -1.4796850681304932, + "logps/chosen": -2.369807004928589, + "logps/rejected": -2.748922824859619, + "loss": 2.4815, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.698070526123047, + "rewards/margins": 3.7911579608917236, + "rewards/rejected": -27.489227294921875, + "step": 12995 + }, + { + "epoch": 0.43816778455627087, + "grad_norm": 46.00324249267578, + "learning_rate": 6.902678081445494e-07, + "logits/chosen": -1.6248180866241455, + "logits/rejected": -1.7508703470230103, + "logps/chosen": -2.150437831878662, + "logps/rejected": -2.212613344192505, + "loss": 3.3764, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.504375457763672, + "rewards/margins": 0.6217565536499023, + "rewards/rejected": -22.12613296508789, + "step": 13000 + }, + { + "epoch": 0.43833631062725403, + "grad_norm": 29.252960205078125, + "learning_rate": 6.899957694984815e-07, + "logits/chosen": -1.0204622745513916, + "logits/rejected": -1.358551025390625, + "logps/chosen": -1.7499244213104248, + "logps/rejected": -2.167311668395996, + "loss": 1.7655, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.49924659729004, + "rewards/margins": 4.173872947692871, + "rewards/rejected": -21.673118591308594, + "step": 13005 + }, + { + "epoch": 0.4385048366982372, + "grad_norm": 21.64922332763672, + "learning_rate": 6.897236651023633e-07, + "logits/chosen": -1.1953816413879395, + "logits/rejected": -1.064687967300415, + "logps/chosen": -2.1950154304504395, + "logps/rejected": -2.2868361473083496, + "loss": 3.2271, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.95015525817871, + "rewards/margins": 0.9182068705558777, + "rewards/rejected": -22.868358612060547, + "step": 13010 + }, + { + "epoch": 0.4386733627692204, + "grad_norm": 8.935832977294922, + "learning_rate": 6.894514950503594e-07, + "logits/chosen": -1.1418415307998657, + "logits/rejected": -1.4403326511383057, + "logps/chosen": -1.8407323360443115, + "logps/rejected": -2.124626874923706, + "loss": 2.3638, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.407323837280273, + "rewards/margins": 2.8389463424682617, + "rewards/rejected": -21.24627113342285, + "step": 13015 + }, + { + "epoch": 0.4388418888402036, + "grad_norm": 13.57795524597168, + "learning_rate": 6.891792594366573e-07, + "logits/chosen": -1.514499545097351, + "logits/rejected": -1.5799944400787354, + "logps/chosen": -1.9179500341415405, + "logps/rejected": -2.3520543575286865, + "loss": 2.3665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.179500579833984, + "rewards/margins": 4.341042995452881, + "rewards/rejected": -23.520544052124023, + "step": 13020 + }, + { + "epoch": 0.43901041491118675, + "grad_norm": 27.935443878173828, + "learning_rate": 6.889069583554667e-07, + "logits/chosen": -1.4357717037200928, + "logits/rejected": -1.4973537921905518, + "logps/chosen": -2.2571046352386475, + "logps/rejected": -2.332789659500122, + "loss": 2.6409, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.571048736572266, + "rewards/margins": 0.7568472623825073, + "rewards/rejected": -23.32789421081543, + "step": 13025 + }, + { + "epoch": 0.4391789409821699, + "grad_norm": 21.917020797729492, + "learning_rate": 6.886345919010207e-07, + "logits/chosen": -1.1875712871551514, + "logits/rejected": -1.2291758060455322, + "logps/chosen": -2.2137908935546875, + "logps/rejected": -2.2709903717041016, + "loss": 3.0548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.137910842895508, + "rewards/margins": 0.5719939470291138, + "rewards/rejected": -22.709903717041016, + "step": 13030 + }, + { + "epoch": 0.43934746705315314, + "grad_norm": 15.354704856872559, + "learning_rate": 6.883621601675743e-07, + "logits/chosen": -1.095472812652588, + "logits/rejected": -1.132777452468872, + "logps/chosen": -2.2099857330322266, + "logps/rejected": -2.343184471130371, + "loss": 2.4734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.099857330322266, + "rewards/margins": 1.3319861888885498, + "rewards/rejected": -23.431842803955078, + "step": 13035 + }, + { + "epoch": 0.4395159931241363, + "grad_norm": 27.11628532409668, + "learning_rate": 6.880896632494052e-07, + "logits/chosen": -0.7230058908462524, + "logits/rejected": -0.9955340623855591, + "logps/chosen": -3.2007107734680176, + "logps/rejected": -3.135807514190674, + "loss": 4.1483, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -32.00710678100586, + "rewards/margins": -0.6490316390991211, + "rewards/rejected": -31.358074188232422, + "step": 13040 + }, + { + "epoch": 0.43968451919511947, + "grad_norm": 17.648286819458008, + "learning_rate": 6.878171012408143e-07, + "logits/chosen": -1.3936727046966553, + "logits/rejected": -1.5251567363739014, + "logps/chosen": -1.7271394729614258, + "logps/rejected": -1.8262546062469482, + "loss": 2.3641, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.271394729614258, + "rewards/margins": 0.9911512136459351, + "rewards/rejected": -18.26254653930664, + "step": 13045 + }, + { + "epoch": 0.4398530452661027, + "grad_norm": 21.880645751953125, + "learning_rate": 6.875444742361243e-07, + "logits/chosen": -0.9139540791511536, + "logits/rejected": -1.0724513530731201, + "logps/chosen": -1.8850370645523071, + "logps/rejected": -2.1353516578674316, + "loss": 3.3787, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.85036849975586, + "rewards/margins": 2.503145933151245, + "rewards/rejected": -21.353517532348633, + "step": 13050 + }, + { + "epoch": 0.44002157133708586, + "grad_norm": 15.43720531463623, + "learning_rate": 6.872717823296806e-07, + "logits/chosen": -1.1720114946365356, + "logits/rejected": -1.1585838794708252, + "logps/chosen": -2.18475341796875, + "logps/rejected": -2.2911295890808105, + "loss": 3.6322, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.847530364990234, + "rewards/margins": 1.0637617111206055, + "rewards/rejected": -22.91129493713379, + "step": 13055 + }, + { + "epoch": 0.440190097408069, + "grad_norm": 25.999244689941406, + "learning_rate": 6.869990256158513e-07, + "logits/chosen": -1.3059625625610352, + "logits/rejected": -1.1716262102127075, + "logps/chosen": -2.208430051803589, + "logps/rejected": -1.9477787017822266, + "loss": 5.7393, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.084300994873047, + "rewards/margins": -2.6065142154693604, + "rewards/rejected": -19.477787017822266, + "step": 13060 + }, + { + "epoch": 0.4403586234790522, + "grad_norm": 93.29350280761719, + "learning_rate": 6.867262041890267e-07, + "logits/chosen": -1.0793451070785522, + "logits/rejected": -1.2775110006332397, + "logps/chosen": -2.104738712310791, + "logps/rejected": -2.279259443283081, + "loss": 2.4399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.04738426208496, + "rewards/margins": 1.7452077865600586, + "rewards/rejected": -22.792593002319336, + "step": 13065 + }, + { + "epoch": 0.4405271495500354, + "grad_norm": 32.7735710144043, + "learning_rate": 6.864533181436197e-07, + "logits/chosen": -1.1295160055160522, + "logits/rejected": -1.2696306705474854, + "logps/chosen": -1.9000717401504517, + "logps/rejected": -2.0310187339782715, + "loss": 2.4621, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.000717163085938, + "rewards/margins": 1.3094708919525146, + "rewards/rejected": -20.3101863861084, + "step": 13070 + }, + { + "epoch": 0.4406956756210186, + "grad_norm": 115.73983001708984, + "learning_rate": 6.861803675740652e-07, + "logits/chosen": -1.2367197275161743, + "logits/rejected": -1.3083772659301758, + "logps/chosen": -2.917975664138794, + "logps/rejected": -2.668013572692871, + "loss": 5.6267, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.17975425720215, + "rewards/margins": -2.4996211528778076, + "rewards/rejected": -26.680133819580078, + "step": 13075 + }, + { + "epoch": 0.44086420169200174, + "grad_norm": 25.333463668823242, + "learning_rate": 6.859073525748207e-07, + "logits/chosen": -1.2527525424957275, + "logits/rejected": -1.2299823760986328, + "logps/chosen": -2.4775257110595703, + "logps/rejected": -2.5582261085510254, + "loss": 2.7355, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.775257110595703, + "rewards/margins": 0.807004451751709, + "rewards/rejected": -25.582263946533203, + "step": 13080 + }, + { + "epoch": 0.4410327277629849, + "grad_norm": 20.128324508666992, + "learning_rate": 6.856342732403658e-07, + "logits/chosen": -1.1662752628326416, + "logits/rejected": -1.07258141040802, + "logps/chosen": -1.9656169414520264, + "logps/rejected": -2.0915415287017822, + "loss": 3.361, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.656169891357422, + "rewards/margins": 1.2592443227767944, + "rewards/rejected": -20.915414810180664, + "step": 13085 + }, + { + "epoch": 0.44120125383396813, + "grad_norm": 29.223894119262695, + "learning_rate": 6.853611296652028e-07, + "logits/chosen": -1.3792330026626587, + "logits/rejected": -1.6818948984146118, + "logps/chosen": -1.9177711009979248, + "logps/rejected": -2.044464588165283, + "loss": 2.8086, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.177711486816406, + "rewards/margins": 1.2669343948364258, + "rewards/rejected": -20.44464683532715, + "step": 13090 + }, + { + "epoch": 0.4413697799049513, + "grad_norm": 32.580604553222656, + "learning_rate": 6.850879219438555e-07, + "logits/chosen": -1.114403247833252, + "logits/rejected": -1.0609710216522217, + "logps/chosen": -1.9394359588623047, + "logps/rejected": -1.9422838687896729, + "loss": 3.1338, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.394359588623047, + "rewards/margins": 0.028481101617217064, + "rewards/rejected": -19.422840118408203, + "step": 13095 + }, + { + "epoch": 0.44153830597593446, + "grad_norm": 85.80226135253906, + "learning_rate": 6.848146501708709e-07, + "logits/chosen": -0.9836652874946594, + "logits/rejected": -1.0147119760513306, + "logps/chosen": -2.1077113151550293, + "logps/rejected": -2.0701069831848145, + "loss": 3.5635, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.077116012573242, + "rewards/margins": -0.3760454058647156, + "rewards/rejected": -20.70107078552246, + "step": 13100 + }, + { + "epoch": 0.4417068320469177, + "grad_norm": 25.502391815185547, + "learning_rate": 6.845413144408172e-07, + "logits/chosen": -1.5241189002990723, + "logits/rejected": -1.780220627784729, + "logps/chosen": -2.1454973220825195, + "logps/rejected": -2.860766887664795, + "loss": 1.6927, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.454975128173828, + "rewards/margins": 7.152696132659912, + "rewards/rejected": -28.607669830322266, + "step": 13105 + }, + { + "epoch": 0.44187535811790085, + "grad_norm": 25.432723999023438, + "learning_rate": 6.842679148482851e-07, + "logits/chosen": -1.1696789264678955, + "logits/rejected": -1.5731089115142822, + "logps/chosen": -1.8657867908477783, + "logps/rejected": -2.2207283973693848, + "loss": 1.8702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.657867431640625, + "rewards/margins": 3.5494143962860107, + "rewards/rejected": -22.20728302001953, + "step": 13110 + }, + { + "epoch": 0.442043884188884, + "grad_norm": 0.36097314953804016, + "learning_rate": 6.839944514878877e-07, + "logits/chosen": -1.2934590578079224, + "logits/rejected": -1.476231575012207, + "logps/chosen": -2.865110397338867, + "logps/rejected": -3.2158114910125732, + "loss": 2.2426, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.65110206604004, + "rewards/margins": 3.507014513015747, + "rewards/rejected": -32.158119201660156, + "step": 13115 + }, + { + "epoch": 0.4422124102598672, + "grad_norm": 27.747323989868164, + "learning_rate": 6.837209244542595e-07, + "logits/chosen": -1.120197057723999, + "logits/rejected": -1.235581398010254, + "logps/chosen": -1.9799785614013672, + "logps/rejected": -2.0634286403656006, + "loss": 2.5903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.799787521362305, + "rewards/margins": 0.8344995379447937, + "rewards/rejected": -20.634286880493164, + "step": 13120 + }, + { + "epoch": 0.4423809363308504, + "grad_norm": 90.0943374633789, + "learning_rate": 6.834473338420579e-07, + "logits/chosen": -1.396087884902954, + "logits/rejected": -1.4234291315078735, + "logps/chosen": -2.2245163917541504, + "logps/rejected": -2.019835948944092, + "loss": 5.2797, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.245161056518555, + "rewards/margins": -2.0468029975891113, + "rewards/rejected": -20.198360443115234, + "step": 13125 + }, + { + "epoch": 0.44254946240183357, + "grad_norm": 117.42842102050781, + "learning_rate": 6.831736797459614e-07, + "logits/chosen": -1.3987348079681396, + "logits/rejected": -1.3095295429229736, + "logps/chosen": -2.476482391357422, + "logps/rejected": -2.4489948749542236, + "loss": 3.766, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.76482582092285, + "rewards/margins": -0.27487725019454956, + "rewards/rejected": -24.48995018005371, + "step": 13130 + }, + { + "epoch": 0.44271798847281674, + "grad_norm": 28.931501388549805, + "learning_rate": 6.828999622606711e-07, + "logits/chosen": -0.5741135478019714, + "logits/rejected": -0.7597935199737549, + "logps/chosen": -1.8390392065048218, + "logps/rejected": -1.8604549169540405, + "loss": 2.9438, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.390392303466797, + "rewards/margins": 0.2141561508178711, + "rewards/rejected": -18.604549407958984, + "step": 13135 + }, + { + "epoch": 0.4428865145437999, + "grad_norm": 35.9239501953125, + "learning_rate": 6.8262618148091e-07, + "logits/chosen": -1.060102939605713, + "logits/rejected": -1.0475283861160278, + "logps/chosen": -1.917798638343811, + "logps/rejected": -2.2860186100006104, + "loss": 2.7481, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.17798614501953, + "rewards/margins": 3.6821999549865723, + "rewards/rejected": -22.860183715820312, + "step": 13140 + }, + { + "epoch": 0.4430550406147831, + "grad_norm": 19.328611373901367, + "learning_rate": 6.823523375014226e-07, + "logits/chosen": -0.9829468727111816, + "logits/rejected": -1.0679337978363037, + "logps/chosen": -2.110246419906616, + "logps/rejected": -2.191786527633667, + "loss": 3.3953, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.102466583251953, + "rewards/margins": 0.815396785736084, + "rewards/rejected": -21.917861938476562, + "step": 13145 + }, + { + "epoch": 0.4432235666857663, + "grad_norm": 71.68951416015625, + "learning_rate": 6.820784304169756e-07, + "logits/chosen": -1.6284596920013428, + "logits/rejected": -1.9653995037078857, + "logps/chosen": -2.408257007598877, + "logps/rejected": -3.189302921295166, + "loss": 1.2931, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.082571029663086, + "rewards/margins": 7.8104567527771, + "rewards/rejected": -31.89302635192871, + "step": 13150 + }, + { + "epoch": 0.44339209275674946, + "grad_norm": 20.27888298034668, + "learning_rate": 6.818044603223574e-07, + "logits/chosen": -1.0515351295471191, + "logits/rejected": -1.1399530172348022, + "logps/chosen": -2.0861306190490723, + "logps/rejected": -2.0385451316833496, + "loss": 3.8656, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.86130714416504, + "rewards/margins": -0.47585612535476685, + "rewards/rejected": -20.38545036315918, + "step": 13155 + }, + { + "epoch": 0.4435606188277327, + "grad_norm": 46.08041763305664, + "learning_rate": 6.815304273123783e-07, + "logits/chosen": -1.0970168113708496, + "logits/rejected": -1.1916046142578125, + "logps/chosen": -2.6436848640441895, + "logps/rejected": -2.4543864727020264, + "loss": 4.9371, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.436847686767578, + "rewards/margins": -1.8929836750030518, + "rewards/rejected": -24.543865203857422, + "step": 13160 + }, + { + "epoch": 0.44372914489871584, + "grad_norm": 37.04818344116211, + "learning_rate": 6.812563314818703e-07, + "logits/chosen": -1.206762671470642, + "logits/rejected": -1.246781349182129, + "logps/chosen": -1.8855199813842773, + "logps/rejected": -2.097358465194702, + "loss": 1.7062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.855199813842773, + "rewards/margins": 2.1183857917785645, + "rewards/rejected": -20.973587036132812, + "step": 13165 + }, + { + "epoch": 0.443897670969699, + "grad_norm": 12.917704582214355, + "learning_rate": 6.809821729256873e-07, + "logits/chosen": -1.279144287109375, + "logits/rejected": -1.3216769695281982, + "logps/chosen": -1.8531615734100342, + "logps/rejected": -2.089557409286499, + "loss": 1.7713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.5316162109375, + "rewards/margins": 2.3639566898345947, + "rewards/rejected": -20.895572662353516, + "step": 13170 + }, + { + "epoch": 0.4440661970406822, + "grad_norm": 29.798051834106445, + "learning_rate": 6.807079517387046e-07, + "logits/chosen": -1.0649659633636475, + "logits/rejected": -1.3647143840789795, + "logps/chosen": -1.8949416875839233, + "logps/rejected": -2.111315965652466, + "loss": 2.3844, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.949417114257812, + "rewards/margins": 2.163743257522583, + "rewards/rejected": -21.1131591796875, + "step": 13175 + }, + { + "epoch": 0.4442347231116654, + "grad_norm": 20.287752151489258, + "learning_rate": 6.804336680158192e-07, + "logits/chosen": -1.6447932720184326, + "logits/rejected": -2.0392098426818848, + "logps/chosen": -1.7290363311767578, + "logps/rejected": -2.252883195877075, + "loss": 1.5029, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.290363311767578, + "rewards/margins": 5.238468170166016, + "rewards/rejected": -22.528831481933594, + "step": 13180 + }, + { + "epoch": 0.44440324918264856, + "grad_norm": 36.967166900634766, + "learning_rate": 6.801593218519504e-07, + "logits/chosen": -0.7964226007461548, + "logits/rejected": -0.8987895250320435, + "logps/chosen": -2.4772469997406006, + "logps/rejected": -2.6278481483459473, + "loss": 3.3181, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.772472381591797, + "rewards/margins": 1.506009817123413, + "rewards/rejected": -26.27848243713379, + "step": 13185 + }, + { + "epoch": 0.44457177525363173, + "grad_norm": 8.808879852294922, + "learning_rate": 6.798849133420381e-07, + "logits/chosen": -1.4141619205474854, + "logits/rejected": -1.4586751461029053, + "logps/chosen": -1.8616573810577393, + "logps/rejected": -2.29601788520813, + "loss": 1.1475, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.616573333740234, + "rewards/margins": 4.34360408782959, + "rewards/rejected": -22.96017837524414, + "step": 13190 + }, + { + "epoch": 0.4447403013246149, + "grad_norm": 42.99454879760742, + "learning_rate": 6.796104425810447e-07, + "logits/chosen": -1.2810221910476685, + "logits/rejected": -1.4269440174102783, + "logps/chosen": -2.0660388469696045, + "logps/rejected": -2.1136868000030518, + "loss": 2.8976, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.660388946533203, + "rewards/margins": 0.4764803946018219, + "rewards/rejected": -21.13686752319336, + "step": 13195 + }, + { + "epoch": 0.4449088273955981, + "grad_norm": 28.964792251586914, + "learning_rate": 6.793359096639533e-07, + "logits/chosen": -0.6905937194824219, + "logits/rejected": -0.8187843561172485, + "logps/chosen": -1.9930311441421509, + "logps/rejected": -2.2221527099609375, + "loss": 1.6647, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.930309295654297, + "rewards/margins": 2.291217565536499, + "rewards/rejected": -22.221527099609375, + "step": 13200 + }, + { + "epoch": 0.4449088273955981, + "eval_logits/chosen": -1.6181302070617676, + "eval_logits/rejected": -1.7306538820266724, + "eval_logps/chosen": -1.9777320623397827, + "eval_logps/rejected": -2.079491376876831, + "eval_loss": 3.0125772953033447, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -19.77732276916504, + "eval_rewards/margins": 1.0175917148590088, + "eval_rewards/rejected": -20.79491424560547, + "eval_runtime": 12.9112, + "eval_samples_per_second": 7.745, + "eval_steps_per_second": 1.936, + "step": 13200 + }, + { + "epoch": 0.4450773534665813, + "grad_norm": 30.03069305419922, + "learning_rate": 6.790613146857691e-07, + "logits/chosen": -1.3091100454330444, + "logits/rejected": -1.6376367807388306, + "logps/chosen": -2.148491621017456, + "logps/rejected": -2.295868396759033, + "loss": 3.0759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.484914779663086, + "rewards/margins": 1.473767876625061, + "rewards/rejected": -22.958683013916016, + "step": 13205 + }, + { + "epoch": 0.44524587953756445, + "grad_norm": 137.2454071044922, + "learning_rate": 6.787866577415186e-07, + "logits/chosen": -1.1861579418182373, + "logits/rejected": -1.4750853776931763, + "logps/chosen": -2.2643580436706543, + "logps/rejected": -2.4095728397369385, + "loss": 2.4487, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.64358139038086, + "rewards/margins": 1.4521455764770508, + "rewards/rejected": -24.09572982788086, + "step": 13210 + }, + { + "epoch": 0.44541440560854767, + "grad_norm": 195.74488830566406, + "learning_rate": 6.7851193892625e-07, + "logits/chosen": -1.5892274379730225, + "logits/rejected": -1.4221677780151367, + "logps/chosen": -2.283761501312256, + "logps/rejected": -2.062800168991089, + "loss": 5.2939, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.837615966796875, + "rewards/margins": -2.2096126079559326, + "rewards/rejected": -20.628002166748047, + "step": 13215 + }, + { + "epoch": 0.44558293167953084, + "grad_norm": 31.29680633544922, + "learning_rate": 6.782371583350323e-07, + "logits/chosen": -0.9922895431518555, + "logits/rejected": -1.0052015781402588, + "logps/chosen": -1.8493363857269287, + "logps/rejected": -1.829101324081421, + "loss": 4.1612, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.493366241455078, + "rewards/margins": -0.2023508995771408, + "rewards/rejected": -18.291013717651367, + "step": 13220 + }, + { + "epoch": 0.445751457750514, + "grad_norm": 41.823753356933594, + "learning_rate": 6.779623160629563e-07, + "logits/chosen": -1.229884386062622, + "logits/rejected": -1.6847175359725952, + "logps/chosen": -1.9959089756011963, + "logps/rejected": -2.059077501296997, + "loss": 3.4638, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.959091186523438, + "rewards/margins": 0.6316847801208496, + "rewards/rejected": -20.590774536132812, + "step": 13225 + }, + { + "epoch": 0.44591998382149717, + "grad_norm": 27.8101863861084, + "learning_rate": 6.776874122051343e-07, + "logits/chosen": -1.2641267776489258, + "logits/rejected": -1.4562067985534668, + "logps/chosen": -2.0732169151306152, + "logps/rejected": -2.3724536895751953, + "loss": 2.2068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.732168197631836, + "rewards/margins": 2.9923672676086426, + "rewards/rejected": -23.724536895751953, + "step": 13230 + }, + { + "epoch": 0.4460885098924804, + "grad_norm": 14.324053764343262, + "learning_rate": 6.774124468566994e-07, + "logits/chosen": -1.0541682243347168, + "logits/rejected": -1.083929181098938, + "logps/chosen": -1.5968945026397705, + "logps/rejected": -1.8938548564910889, + "loss": 1.4582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.968945503234863, + "rewards/margins": 2.9696033000946045, + "rewards/rejected": -18.938547134399414, + "step": 13235 + }, + { + "epoch": 0.44625703596346356, + "grad_norm": 23.845109939575195, + "learning_rate": 6.771374201128064e-07, + "logits/chosen": -1.324191927909851, + "logits/rejected": -1.682186484336853, + "logps/chosen": -1.9650967121124268, + "logps/rejected": -2.437678575515747, + "loss": 1.7045, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.65096664428711, + "rewards/margins": 4.725818634033203, + "rewards/rejected": -24.376785278320312, + "step": 13240 + }, + { + "epoch": 0.4464255620344467, + "grad_norm": 38.21913528442383, + "learning_rate": 6.768623320686315e-07, + "logits/chosen": -0.9799416661262512, + "logits/rejected": -0.7952501177787781, + "logps/chosen": -2.021001100540161, + "logps/rejected": -1.8483047485351562, + "loss": 4.7785, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.210010528564453, + "rewards/margins": -1.7269624471664429, + "rewards/rejected": -18.483047485351562, + "step": 13245 + }, + { + "epoch": 0.4465940881054299, + "grad_norm": 24.604358673095703, + "learning_rate": 6.765871828193712e-07, + "logits/chosen": -1.4743516445159912, + "logits/rejected": -1.4921091794967651, + "logps/chosen": -1.9836381673812866, + "logps/rejected": -2.2177886962890625, + "loss": 2.4211, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.836380004882812, + "rewards/margins": 2.341505289077759, + "rewards/rejected": -22.177886962890625, + "step": 13250 + }, + { + "epoch": 0.4467626141764131, + "grad_norm": 25.316375732421875, + "learning_rate": 6.763119724602444e-07, + "logits/chosen": -1.157165288925171, + "logits/rejected": -1.2132501602172852, + "logps/chosen": -2.028961181640625, + "logps/rejected": -2.47636079788208, + "loss": 1.1852, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.289613723754883, + "rewards/margins": 4.473998546600342, + "rewards/rejected": -24.763607025146484, + "step": 13255 + }, + { + "epoch": 0.4469311402473963, + "grad_norm": 65.39524841308594, + "learning_rate": 6.760367010864902e-07, + "logits/chosen": -1.1245633363723755, + "logits/rejected": -1.1890214681625366, + "logps/chosen": -2.193920135498047, + "logps/rejected": -2.1093883514404297, + "loss": 4.1426, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.93920135498047, + "rewards/margins": -0.8453181385993958, + "rewards/rejected": -21.093883514404297, + "step": 13260 + }, + { + "epoch": 0.44709966631837944, + "grad_norm": 81.17794036865234, + "learning_rate": 6.757613687933694e-07, + "logits/chosen": -1.566934585571289, + "logits/rejected": -1.8760780096054077, + "logps/chosen": -2.550309419631958, + "logps/rejected": -2.6105148792266846, + "loss": 4.4707, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.503093719482422, + "rewards/margins": 0.6020570993423462, + "rewards/rejected": -26.105152130126953, + "step": 13265 + }, + { + "epoch": 0.44726819238936266, + "grad_norm": 16.7962703704834, + "learning_rate": 6.754859756761636e-07, + "logits/chosen": -0.7093914747238159, + "logits/rejected": -1.3773797750473022, + "logps/chosen": -2.6696250438690186, + "logps/rejected": -2.9403035640716553, + "loss": 2.9442, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.69624900817871, + "rewards/margins": 2.7067861557006836, + "rewards/rejected": -29.403034210205078, + "step": 13270 + }, + { + "epoch": 0.44743671846034583, + "grad_norm": 21.853174209594727, + "learning_rate": 6.752105218301756e-07, + "logits/chosen": -1.2181885242462158, + "logits/rejected": -1.489119291305542, + "logps/chosen": -2.325205087661743, + "logps/rejected": -2.50209379196167, + "loss": 2.7952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.25205421447754, + "rewards/margins": 1.7688862085342407, + "rewards/rejected": -25.020938873291016, + "step": 13275 + }, + { + "epoch": 0.447605244531329, + "grad_norm": 23.012371063232422, + "learning_rate": 6.749350073507288e-07, + "logits/chosen": -1.453213095664978, + "logits/rejected": -1.8205163478851318, + "logps/chosen": -1.930599570274353, + "logps/rejected": -2.1534509658813477, + "loss": 2.5051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.30599594116211, + "rewards/margins": 2.2285144329071045, + "rewards/rejected": -21.53451156616211, + "step": 13280 + }, + { + "epoch": 0.44777377060231216, + "grad_norm": 29.789447784423828, + "learning_rate": 6.746594323331681e-07, + "logits/chosen": -1.418830394744873, + "logits/rejected": -1.1861220598220825, + "logps/chosen": -2.4347853660583496, + "logps/rejected": -2.4367117881774902, + "loss": 3.1936, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.34785270690918, + "rewards/margins": 0.019265126436948776, + "rewards/rejected": -24.367116928100586, + "step": 13285 + }, + { + "epoch": 0.4479422966732954, + "grad_norm": 12.318198204040527, + "learning_rate": 6.743837968728594e-07, + "logits/chosen": -1.5645182132720947, + "logits/rejected": -1.6600430011749268, + "logps/chosen": -1.8471921682357788, + "logps/rejected": -1.8008241653442383, + "loss": 4.0371, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.471920013427734, + "rewards/margins": -0.46367913484573364, + "rewards/rejected": -18.008243560791016, + "step": 13290 + }, + { + "epoch": 0.44811082274427855, + "grad_norm": 16.357254028320312, + "learning_rate": 6.741081010651889e-07, + "logits/chosen": -1.1210089921951294, + "logits/rejected": -1.0778058767318726, + "logps/chosen": -1.6563653945922852, + "logps/rejected": -1.7912307977676392, + "loss": 2.2908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.56365394592285, + "rewards/margins": 1.34865403175354, + "rewards/rejected": -17.912307739257812, + "step": 13295 + }, + { + "epoch": 0.4482793488152617, + "grad_norm": 15.02835750579834, + "learning_rate": 6.738323450055643e-07, + "logits/chosen": -0.8961246609687805, + "logits/rejected": -1.0249212980270386, + "logps/chosen": -1.69499933719635, + "logps/rejected": -2.0932421684265137, + "loss": 1.6754, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.94999122619629, + "rewards/margins": 3.982431411743164, + "rewards/rejected": -20.932422637939453, + "step": 13300 + }, + { + "epoch": 0.4484478748862449, + "grad_norm": 6.8707990646362305, + "learning_rate": 6.735565287894138e-07, + "logits/chosen": -1.0673387050628662, + "logits/rejected": -1.2874033451080322, + "logps/chosen": -2.0255348682403564, + "logps/rejected": -2.2231597900390625, + "loss": 1.8753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.255346298217773, + "rewards/margins": 1.9762521982192993, + "rewards/rejected": -22.23160171508789, + "step": 13305 + }, + { + "epoch": 0.4486164009572281, + "grad_norm": 60.034698486328125, + "learning_rate": 6.732806525121865e-07, + "logits/chosen": -1.4277997016906738, + "logits/rejected": -1.6005510091781616, + "logps/chosen": -2.0236220359802246, + "logps/rejected": -2.0606486797332764, + "loss": 3.7093, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.23621940612793, + "rewards/margins": 0.3702692985534668, + "rewards/rejected": -20.606489181518555, + "step": 13310 + }, + { + "epoch": 0.44878492702821127, + "grad_norm": 22.498273849487305, + "learning_rate": 6.730047162693524e-07, + "logits/chosen": -0.6527112126350403, + "logits/rejected": -0.9839221835136414, + "logps/chosen": -1.7513904571533203, + "logps/rejected": -2.521151304244995, + "loss": 1.6066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.513904571533203, + "rewards/margins": 7.697610378265381, + "rewards/rejected": -25.211515426635742, + "step": 13315 + }, + { + "epoch": 0.44895345309919443, + "grad_norm": 17.344219207763672, + "learning_rate": 6.72728720156402e-07, + "logits/chosen": -1.0116724967956543, + "logits/rejected": -1.0222210884094238, + "logps/chosen": -1.6952499151229858, + "logps/rejected": -1.765568494796753, + "loss": 2.7574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.952499389648438, + "rewards/margins": 0.7031861543655396, + "rewards/rejected": -17.655685424804688, + "step": 13320 + }, + { + "epoch": 0.4491219791701776, + "grad_norm": 209.14768981933594, + "learning_rate": 6.724526642688469e-07, + "logits/chosen": -0.7147254943847656, + "logits/rejected": -0.6004077792167664, + "logps/chosen": -2.7620253562927246, + "logps/rejected": -2.6541154384613037, + "loss": 4.4528, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.620254516601562, + "rewards/margins": -1.079100251197815, + "rewards/rejected": -26.541156768798828, + "step": 13325 + }, + { + "epoch": 0.4492905052411608, + "grad_norm": 21.652204513549805, + "learning_rate": 6.72176548702219e-07, + "logits/chosen": -1.4568471908569336, + "logits/rejected": -1.3350114822387695, + "logps/chosen": -1.6353132724761963, + "logps/rejected": -1.6017754077911377, + "loss": 3.6454, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.353130340576172, + "rewards/margins": -0.33537864685058594, + "rewards/rejected": -16.017751693725586, + "step": 13330 + }, + { + "epoch": 0.449459031312144, + "grad_norm": 20.0999698638916, + "learning_rate": 6.719003735520711e-07, + "logits/chosen": -1.2092087268829346, + "logits/rejected": -1.0637407302856445, + "logps/chosen": -2.552417516708374, + "logps/rejected": -2.986699342727661, + "loss": 1.2213, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.5241756439209, + "rewards/margins": 4.3428168296813965, + "rewards/rejected": -29.866994857788086, + "step": 13335 + }, + { + "epoch": 0.44962755738312715, + "grad_norm": 12.592076301574707, + "learning_rate": 6.716241389139767e-07, + "logits/chosen": -1.3967036008834839, + "logits/rejected": -1.6072719097137451, + "logps/chosen": -2.0083084106445312, + "logps/rejected": -2.2066750526428223, + "loss": 2.3997, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.083087921142578, + "rewards/margins": 1.9836658239364624, + "rewards/rejected": -22.066753387451172, + "step": 13340 + }, + { + "epoch": 0.4497960834541104, + "grad_norm": 25.59599494934082, + "learning_rate": 6.713478448835292e-07, + "logits/chosen": -1.3335907459259033, + "logits/rejected": -1.4632683992385864, + "logps/chosen": -1.9616371393203735, + "logps/rejected": -2.127497434616089, + "loss": 2.5662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.616371154785156, + "rewards/margins": 1.6586040258407593, + "rewards/rejected": -21.274974822998047, + "step": 13345 + }, + { + "epoch": 0.44996460952509354, + "grad_norm": 28.009143829345703, + "learning_rate": 6.710714915563436e-07, + "logits/chosen": -0.913608729839325, + "logits/rejected": -0.8590755462646484, + "logps/chosen": -2.1531331539154053, + "logps/rejected": -2.5827205181121826, + "loss": 2.3824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.53133201599121, + "rewards/margins": 4.295873165130615, + "rewards/rejected": -25.827205657958984, + "step": 13350 + }, + { + "epoch": 0.4501331355960767, + "grad_norm": 29.852846145629883, + "learning_rate": 6.707950790280545e-07, + "logits/chosen": -1.193040370941162, + "logits/rejected": -1.324516773223877, + "logps/chosen": -2.145280599594116, + "logps/rejected": -2.0207278728485107, + "loss": 4.3902, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.45280647277832, + "rewards/margins": -1.2455263137817383, + "rewards/rejected": -20.2072811126709, + "step": 13355 + }, + { + "epoch": 0.45030166166705987, + "grad_norm": 9.560738563537598, + "learning_rate": 6.70518607394318e-07, + "logits/chosen": -0.8668038249015808, + "logits/rejected": -1.1544456481933594, + "logps/chosen": -2.8680739402770996, + "logps/rejected": -3.039849042892456, + "loss": 3.3841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.680736541748047, + "rewards/margins": 1.7177526950836182, + "rewards/rejected": -30.39849281311035, + "step": 13360 + }, + { + "epoch": 0.4504701877380431, + "grad_norm": 28.32439613342285, + "learning_rate": 6.702420767508094e-07, + "logits/chosen": -1.2721959352493286, + "logits/rejected": -1.5142534971237183, + "logps/chosen": -2.3183741569519043, + "logps/rejected": -2.365943431854248, + "loss": 4.5788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.183740615844727, + "rewards/margins": 0.47569331526756287, + "rewards/rejected": -23.659435272216797, + "step": 13365 + }, + { + "epoch": 0.45063871380902626, + "grad_norm": 24.397031784057617, + "learning_rate": 6.699654871932255e-07, + "logits/chosen": -1.5062090158462524, + "logits/rejected": -1.5440236330032349, + "logps/chosen": -1.8172622919082642, + "logps/rejected": -2.0760245323181152, + "loss": 1.6939, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.172622680664062, + "rewards/margins": 2.587623357772827, + "rewards/rejected": -20.760244369506836, + "step": 13370 + }, + { + "epoch": 0.4508072398800094, + "grad_norm": 28.845012664794922, + "learning_rate": 6.696888388172827e-07, + "logits/chosen": -1.766819715499878, + "logits/rejected": -1.8832629919052124, + "logps/chosen": -1.7977116107940674, + "logps/rejected": -2.1124632358551025, + "loss": 2.0223, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.977115631103516, + "rewards/margins": 3.147516965866089, + "rewards/rejected": -21.124631881713867, + "step": 13375 + }, + { + "epoch": 0.4509757659509926, + "grad_norm": 35.38254165649414, + "learning_rate": 6.694121317187182e-07, + "logits/chosen": -1.0718519687652588, + "logits/rejected": -1.0068604946136475, + "logps/chosen": -2.358485698699951, + "logps/rejected": -2.662508010864258, + "loss": 4.2874, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.584857940673828, + "rewards/margins": 3.0402235984802246, + "rewards/rejected": -26.625080108642578, + "step": 13380 + }, + { + "epoch": 0.4511442920219758, + "grad_norm": 58.901695251464844, + "learning_rate": 6.691353659932895e-07, + "logits/chosen": -1.763758659362793, + "logits/rejected": -1.3714280128479004, + "logps/chosen": -2.5071606636047363, + "logps/rejected": -2.5245368480682373, + "loss": 2.98, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.071605682373047, + "rewards/margins": 0.17376241087913513, + "rewards/rejected": -25.245370864868164, + "step": 13385 + }, + { + "epoch": 0.451312818092959, + "grad_norm": 41.93807601928711, + "learning_rate": 6.688585417367744e-07, + "logits/chosen": -1.47969388961792, + "logits/rejected": -1.490703821182251, + "logps/chosen": -2.10553240776062, + "logps/rejected": -1.976574182510376, + "loss": 4.7305, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.055326461791992, + "rewards/margins": -1.2895841598510742, + "rewards/rejected": -19.7657413482666, + "step": 13390 + }, + { + "epoch": 0.45148134416394214, + "grad_norm": 23.69333839416504, + "learning_rate": 6.685816590449708e-07, + "logits/chosen": -1.6030505895614624, + "logits/rejected": -1.6188846826553345, + "logps/chosen": -2.3643722534179688, + "logps/rejected": -2.304133415222168, + "loss": 3.9704, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.643722534179688, + "rewards/margins": -0.6023872494697571, + "rewards/rejected": -23.04133415222168, + "step": 13395 + }, + { + "epoch": 0.45164987023492537, + "grad_norm": 23.56180191040039, + "learning_rate": 6.683047180136968e-07, + "logits/chosen": -0.9816482663154602, + "logits/rejected": -1.2482496500015259, + "logps/chosen": -1.8885109424591064, + "logps/rejected": -2.057342052459717, + "loss": 2.3919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.885107040405273, + "rewards/margins": 1.6883150339126587, + "rewards/rejected": -20.573421478271484, + "step": 13400 + }, + { + "epoch": 0.45181839630590853, + "grad_norm": 42.30891418457031, + "learning_rate": 6.680277187387908e-07, + "logits/chosen": -1.793176293373108, + "logits/rejected": -1.7645775079727173, + "logps/chosen": -1.954549789428711, + "logps/rejected": -2.162538528442383, + "loss": 2.726, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.54549789428711, + "rewards/margins": 2.0798871517181396, + "rewards/rejected": -21.625385284423828, + "step": 13405 + }, + { + "epoch": 0.4519869223768917, + "grad_norm": 45.36151885986328, + "learning_rate": 6.677506613161115e-07, + "logits/chosen": -1.1814281940460205, + "logits/rejected": -1.3925037384033203, + "logps/chosen": -2.0957963466644287, + "logps/rejected": -2.2337698936462402, + "loss": 2.4478, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.957962036132812, + "rewards/margins": 1.3797346353530884, + "rewards/rejected": -22.33769989013672, + "step": 13410 + }, + { + "epoch": 0.45215544844787486, + "grad_norm": 30.80636978149414, + "learning_rate": 6.674735458415371e-07, + "logits/chosen": -1.3277122974395752, + "logits/rejected": -1.2999038696289062, + "logps/chosen": -1.950740098953247, + "logps/rejected": -1.9885116815567017, + "loss": 3.0388, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.507400512695312, + "rewards/margins": 0.37771645188331604, + "rewards/rejected": -19.885116577148438, + "step": 13415 + }, + { + "epoch": 0.4523239745188581, + "grad_norm": 15.257139205932617, + "learning_rate": 6.67196372410967e-07, + "logits/chosen": -1.331177830696106, + "logits/rejected": -1.614814043045044, + "logps/chosen": -1.9961341619491577, + "logps/rejected": -2.163208484649658, + "loss": 2.151, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.96134376525879, + "rewards/margins": 1.6707426309585571, + "rewards/rejected": -21.6320858001709, + "step": 13420 + }, + { + "epoch": 0.45249250058984125, + "grad_norm": 18.619733810424805, + "learning_rate": 6.669191411203195e-07, + "logits/chosen": -0.8345550298690796, + "logits/rejected": -0.8545502424240112, + "logps/chosen": -2.327765941619873, + "logps/rejected": -2.5256428718566895, + "loss": 2.3759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.277658462524414, + "rewards/margins": 1.978769302368164, + "rewards/rejected": -25.256427764892578, + "step": 13425 + }, + { + "epoch": 0.4526610266608244, + "grad_norm": 23.630504608154297, + "learning_rate": 6.666418520655337e-07, + "logits/chosen": -1.0486633777618408, + "logits/rejected": -0.9874933362007141, + "logps/chosen": -1.9027341604232788, + "logps/rejected": -1.8841438293457031, + "loss": 3.8279, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.027341842651367, + "rewards/margins": -0.1859048306941986, + "rewards/rejected": -18.8414363861084, + "step": 13430 + }, + { + "epoch": 0.4528295527318076, + "grad_norm": 21.301984786987305, + "learning_rate": 6.663645053425684e-07, + "logits/chosen": -1.3079849481582642, + "logits/rejected": -1.2970564365386963, + "logps/chosen": -1.8315374851226807, + "logps/rejected": -2.204904794692993, + "loss": 3.1383, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.315372467041016, + "rewards/margins": 3.733673095703125, + "rewards/rejected": -22.04904556274414, + "step": 13435 + }, + { + "epoch": 0.4529980788027908, + "grad_norm": 100.8381118774414, + "learning_rate": 6.660871010474022e-07, + "logits/chosen": -1.1379636526107788, + "logits/rejected": -0.9681974649429321, + "logps/chosen": -2.4161911010742188, + "logps/rejected": -2.1934218406677246, + "loss": 5.781, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.16191291809082, + "rewards/margins": -2.2276949882507324, + "rewards/rejected": -21.934215545654297, + "step": 13440 + }, + { + "epoch": 0.45316660487377397, + "grad_norm": 57.3202018737793, + "learning_rate": 6.658096392760339e-07, + "logits/chosen": -1.0857040882110596, + "logits/rejected": -1.133437991142273, + "logps/chosen": -1.7230842113494873, + "logps/rejected": -1.8146679401397705, + "loss": 2.8472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.230844497680664, + "rewards/margins": 0.9158358573913574, + "rewards/rejected": -18.146678924560547, + "step": 13445 + }, + { + "epoch": 0.45333513094475714, + "grad_norm": 11.185892105102539, + "learning_rate": 6.655321201244822e-07, + "logits/chosen": -1.4162670373916626, + "logits/rejected": -1.4772002696990967, + "logps/chosen": -1.9110603332519531, + "logps/rejected": -2.6553866863250732, + "loss": 1.822, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.1106014251709, + "rewards/margins": 7.443265438079834, + "rewards/rejected": -26.553869247436523, + "step": 13450 + }, + { + "epoch": 0.45350365701574036, + "grad_norm": 19.761932373046875, + "learning_rate": 6.652545436887853e-07, + "logits/chosen": -1.1321470737457275, + "logits/rejected": -1.3900673389434814, + "logps/chosen": -2.0201334953308105, + "logps/rejected": -2.591609477996826, + "loss": 2.73, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.201332092285156, + "rewards/margins": 5.714763641357422, + "rewards/rejected": -25.916095733642578, + "step": 13455 + }, + { + "epoch": 0.4536721830867235, + "grad_norm": 26.253297805786133, + "learning_rate": 6.649769100650016e-07, + "logits/chosen": -1.4115238189697266, + "logits/rejected": -1.5038487911224365, + "logps/chosen": -2.2541861534118652, + "logps/rejected": -2.6011769771575928, + "loss": 1.8083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.54186248779297, + "rewards/margins": 3.469909191131592, + "rewards/rejected": -26.01177406311035, + "step": 13460 + }, + { + "epoch": 0.4538407091577067, + "grad_norm": 66.70148468017578, + "learning_rate": 6.646992193492091e-07, + "logits/chosen": -1.5807290077209473, + "logits/rejected": -1.4729264974594116, + "logps/chosen": -1.8306732177734375, + "logps/rejected": -1.915564775466919, + "loss": 2.5989, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.306732177734375, + "rewards/margins": 0.8489131927490234, + "rewards/rejected": -19.1556453704834, + "step": 13465 + }, + { + "epoch": 0.45400923522868986, + "grad_norm": 21.561328887939453, + "learning_rate": 6.644214716375058e-07, + "logits/chosen": -1.6580655574798584, + "logits/rejected": -1.7166268825531006, + "logps/chosen": -1.9259936809539795, + "logps/rejected": -2.1397814750671387, + "loss": 3.2724, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.259937286376953, + "rewards/margins": 2.137877941131592, + "rewards/rejected": -21.397815704345703, + "step": 13470 + }, + { + "epoch": 0.4541777612996731, + "grad_norm": 74.84004974365234, + "learning_rate": 6.641436670260091e-07, + "logits/chosen": -0.9843997955322266, + "logits/rejected": -1.2904479503631592, + "logps/chosen": -2.055634021759033, + "logps/rejected": -2.116734504699707, + "loss": 3.042, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.55634117126465, + "rewards/margins": 0.6110035181045532, + "rewards/rejected": -21.167346954345703, + "step": 13475 + }, + { + "epoch": 0.45434628737065624, + "grad_norm": 27.720678329467773, + "learning_rate": 6.638658056108563e-07, + "logits/chosen": -1.02791166305542, + "logits/rejected": -1.3568519353866577, + "logps/chosen": -1.9348390102386475, + "logps/rejected": -1.9345744848251343, + "loss": 3.5751, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.348390579223633, + "rewards/margins": -0.0026446818374097347, + "rewards/rejected": -19.345746994018555, + "step": 13480 + }, + { + "epoch": 0.4545148134416394, + "grad_norm": 23.407005310058594, + "learning_rate": 6.63587887488204e-07, + "logits/chosen": -1.26626718044281, + "logits/rejected": -1.209788203239441, + "logps/chosen": -1.6449733972549438, + "logps/rejected": -1.5845093727111816, + "loss": 3.9153, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.449735641479492, + "rewards/margins": -0.6046417355537415, + "rewards/rejected": -15.845094680786133, + "step": 13485 + }, + { + "epoch": 0.4546833395126226, + "grad_norm": 23.646387100219727, + "learning_rate": 6.633099127542292e-07, + "logits/chosen": -1.390199899673462, + "logits/rejected": -1.3530223369598389, + "logps/chosen": -2.141261577606201, + "logps/rejected": -2.1175923347473145, + "loss": 3.3947, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.412616729736328, + "rewards/margins": -0.23669414222240448, + "rewards/rejected": -21.175920486450195, + "step": 13490 + }, + { + "epoch": 0.4548518655836058, + "grad_norm": 20.303586959838867, + "learning_rate": 6.630318815051276e-07, + "logits/chosen": -1.4647496938705444, + "logits/rejected": -1.537502408027649, + "logps/chosen": -2.4159634113311768, + "logps/rejected": -2.673832416534424, + "loss": 1.8577, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.159637451171875, + "rewards/margins": 2.5786876678466797, + "rewards/rejected": -26.738323211669922, + "step": 13495 + }, + { + "epoch": 0.45502039165458896, + "grad_norm": 29.78251075744629, + "learning_rate": 6.627537938371151e-07, + "logits/chosen": -1.3127989768981934, + "logits/rejected": -1.3036218881607056, + "logps/chosen": -1.9458658695220947, + "logps/rejected": -1.7856101989746094, + "loss": 4.6394, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.458660125732422, + "rewards/margins": -1.6025558710098267, + "rewards/rejected": -17.856103897094727, + "step": 13500 + }, + { + "epoch": 0.45518891772557213, + "grad_norm": 56.97768783569336, + "learning_rate": 6.624756498464268e-07, + "logits/chosen": -1.3639286756515503, + "logits/rejected": -1.4781858921051025, + "logps/chosen": -3.0983211994171143, + "logps/rejected": -2.691847562789917, + "loss": 7.4064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.98320960998535, + "rewards/margins": -4.064734935760498, + "rewards/rejected": -26.918476104736328, + "step": 13505 + }, + { + "epoch": 0.45535744379655535, + "grad_norm": 23.73890495300293, + "learning_rate": 6.621974496293173e-07, + "logits/chosen": -1.0127825736999512, + "logits/rejected": -1.3120650053024292, + "logps/chosen": -2.0623297691345215, + "logps/rejected": -2.40421986579895, + "loss": 4.3155, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.6232967376709, + "rewards/margins": 3.4188995361328125, + "rewards/rejected": -24.04219627380371, + "step": 13510 + }, + { + "epoch": 0.4555259698675385, + "grad_norm": 30.769384384155273, + "learning_rate": 6.619191932820608e-07, + "logits/chosen": -1.323246717453003, + "logits/rejected": -1.534246802330017, + "logps/chosen": -2.2586467266082764, + "logps/rejected": -2.7533957958221436, + "loss": 1.5861, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.586467742919922, + "rewards/margins": 4.9474921226501465, + "rewards/rejected": -27.533960342407227, + "step": 13515 + }, + { + "epoch": 0.4556944959385217, + "grad_norm": 14.726211547851562, + "learning_rate": 6.616408809009508e-07, + "logits/chosen": -1.2696958780288696, + "logits/rejected": -1.288459062576294, + "logps/chosen": -2.2325329780578613, + "logps/rejected": -2.7220871448516846, + "loss": 1.5197, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.325326919555664, + "rewards/margins": 4.895545482635498, + "rewards/rejected": -27.220874786376953, + "step": 13520 + }, + { + "epoch": 0.45586302200950485, + "grad_norm": 18.62717056274414, + "learning_rate": 6.613625125823006e-07, + "logits/chosen": -0.8771476745605469, + "logits/rejected": -1.0887469053268433, + "logps/chosen": -2.093808650970459, + "logps/rejected": -2.270981550216675, + "loss": 1.9294, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.938087463378906, + "rewards/margins": 1.7717288732528687, + "rewards/rejected": -22.709814071655273, + "step": 13525 + }, + { + "epoch": 0.45603154808048807, + "grad_norm": 32.54737091064453, + "learning_rate": 6.610840884224419e-07, + "logits/chosen": -1.5765092372894287, + "logits/rejected": -1.587710976600647, + "logps/chosen": -2.069793462753296, + "logps/rejected": -2.1981959342956543, + "loss": 2.4058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.697935104370117, + "rewards/margins": 1.2840235233306885, + "rewards/rejected": -21.981958389282227, + "step": 13530 + }, + { + "epoch": 0.45620007415147124, + "grad_norm": 43.75876998901367, + "learning_rate": 6.60805608517727e-07, + "logits/chosen": -1.2509219646453857, + "logits/rejected": -1.4516044855117798, + "logps/chosen": -1.8669036626815796, + "logps/rejected": -1.8626625537872314, + "loss": 3.7962, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.669036865234375, + "rewards/margins": -0.04241304472088814, + "rewards/rejected": -18.626625061035156, + "step": 13535 + }, + { + "epoch": 0.4563686002224544, + "grad_norm": 31.220243453979492, + "learning_rate": 6.605270729645263e-07, + "logits/chosen": -1.1909513473510742, + "logits/rejected": -1.2448523044586182, + "logps/chosen": -1.9961858987808228, + "logps/rejected": -2.35213041305542, + "loss": 2.2121, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.96185874938965, + "rewards/margins": 3.5594451427459717, + "rewards/rejected": -23.521305084228516, + "step": 13540 + }, + { + "epoch": 0.45653712629343757, + "grad_norm": 159.19329833984375, + "learning_rate": 6.6024848185923e-07, + "logits/chosen": -1.0990245342254639, + "logits/rejected": -1.2833720445632935, + "logps/chosen": -2.3984270095825195, + "logps/rejected": -2.343677520751953, + "loss": 4.4424, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.984272003173828, + "rewards/margins": -0.5474956631660461, + "rewards/rejected": -23.4367733001709, + "step": 13545 + }, + { + "epoch": 0.4567056523644208, + "grad_norm": 24.201881408691406, + "learning_rate": 6.599698352982479e-07, + "logits/chosen": -1.5832691192626953, + "logits/rejected": -1.9115978479385376, + "logps/chosen": -1.7946374416351318, + "logps/rejected": -1.907883882522583, + "loss": 2.2535, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.946374893188477, + "rewards/margins": 1.132464051246643, + "rewards/rejected": -19.078838348388672, + "step": 13550 + }, + { + "epoch": 0.45687417843540395, + "grad_norm": 28.029586791992188, + "learning_rate": 6.596911333780082e-07, + "logits/chosen": -1.019972562789917, + "logits/rejected": -0.9652736783027649, + "logps/chosen": -2.3742733001708984, + "logps/rejected": -2.439466953277588, + "loss": 2.5481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.742733001708984, + "rewards/margins": 0.6519354581832886, + "rewards/rejected": -24.394670486450195, + "step": 13555 + }, + { + "epoch": 0.4570427045063871, + "grad_norm": 20.071243286132812, + "learning_rate": 6.594123761949589e-07, + "logits/chosen": -1.3693435192108154, + "logits/rejected": -1.8643760681152344, + "logps/chosen": -1.987213373184204, + "logps/rejected": -2.7671642303466797, + "loss": 2.0481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.872133255004883, + "rewards/margins": 7.799508571624756, + "rewards/rejected": -27.671642303466797, + "step": 13560 + }, + { + "epoch": 0.45721123057737034, + "grad_norm": 30.94683837890625, + "learning_rate": 6.591335638455667e-07, + "logits/chosen": -1.4474550485610962, + "logits/rejected": -1.564068078994751, + "logps/chosen": -1.9418039321899414, + "logps/rejected": -1.8427324295043945, + "loss": 4.245, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.418039321899414, + "rewards/margins": -0.9907158613204956, + "rewards/rejected": -18.427324295043945, + "step": 13565 + }, + { + "epoch": 0.4573797566483535, + "grad_norm": 29.03971290588379, + "learning_rate": 6.588546964263178e-07, + "logits/chosen": -1.3514407873153687, + "logits/rejected": -1.6398487091064453, + "logps/chosen": -2.3361032009124756, + "logps/rejected": -2.876328468322754, + "loss": 1.5941, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.361032485961914, + "rewards/margins": 5.402250289916992, + "rewards/rejected": -28.76328468322754, + "step": 13570 + }, + { + "epoch": 0.4575482827193367, + "grad_norm": 24.678768157958984, + "learning_rate": 6.58575774033717e-07, + "logits/chosen": -1.069570779800415, + "logits/rejected": -1.5706819295883179, + "logps/chosen": -1.6676559448242188, + "logps/rejected": -2.0189266204833984, + "loss": 2.0779, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.676559448242188, + "rewards/margins": 3.5127081871032715, + "rewards/rejected": -20.189266204833984, + "step": 13575 + }, + { + "epoch": 0.45771680879031984, + "grad_norm": 23.059268951416016, + "learning_rate": 6.582967967642886e-07, + "logits/chosen": -1.284484624862671, + "logits/rejected": -1.2964885234832764, + "logps/chosen": -1.8898823261260986, + "logps/rejected": -2.063899278640747, + "loss": 2.1698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.898822784423828, + "rewards/margins": 1.7401721477508545, + "rewards/rejected": -20.638994216918945, + "step": 13580 + }, + { + "epoch": 0.45788533486130306, + "grad_norm": 4.454768180847168, + "learning_rate": 6.580177647145753e-07, + "logits/chosen": -0.8917932510375977, + "logits/rejected": -0.643142819404602, + "logps/chosen": -2.9636824131011963, + "logps/rejected": -3.6665992736816406, + "loss": 1.6265, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.636821746826172, + "rewards/margins": 7.029170036315918, + "rewards/rejected": -36.665992736816406, + "step": 13585 + }, + { + "epoch": 0.4580538609322862, + "grad_norm": 27.495288848876953, + "learning_rate": 6.577386779811393e-07, + "logits/chosen": -1.07036554813385, + "logits/rejected": -0.9880205988883972, + "logps/chosen": -1.5988483428955078, + "logps/rejected": -1.5910999774932861, + "loss": 3.2392, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.988485336303711, + "rewards/margins": -0.07748489081859589, + "rewards/rejected": -15.910998344421387, + "step": 13590 + }, + { + "epoch": 0.4582223870032694, + "grad_norm": 30.054046630859375, + "learning_rate": 6.574595366605616e-07, + "logits/chosen": -0.9607011675834656, + "logits/rejected": -0.9186193346977234, + "logps/chosen": -2.318202257156372, + "logps/rejected": -2.5237011909484863, + "loss": 3.5981, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.182022094726562, + "rewards/margins": 2.054990768432617, + "rewards/rejected": -25.237010955810547, + "step": 13595 + }, + { + "epoch": 0.45839091307425256, + "grad_norm": 30.519376754760742, + "learning_rate": 6.571803408494419e-07, + "logits/chosen": -1.5208982229232788, + "logits/rejected": -1.7436546087265015, + "logps/chosen": -2.0867679119110107, + "logps/rejected": -2.053692579269409, + "loss": 4.8533, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.867679595947266, + "rewards/margins": -0.33075445890426636, + "rewards/rejected": -20.53692626953125, + "step": 13600 + }, + { + "epoch": 0.45839091307425256, + "eval_logits/chosen": -1.6288355588912964, + "eval_logits/rejected": -1.7436861991882324, + "eval_logps/chosen": -1.9900078773498535, + "eval_logps/rejected": -2.096331834793091, + "eval_loss": 3.001215934753418, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -19.90007972717285, + "eval_rewards/margins": 1.0632379055023193, + "eval_rewards/rejected": -20.963319778442383, + "eval_runtime": 12.9241, + "eval_samples_per_second": 7.737, + "eval_steps_per_second": 1.934, + "step": 13600 + }, + { + "epoch": 0.4585594391452358, + "grad_norm": 22.9370174407959, + "learning_rate": 6.569010906443989e-07, + "logits/chosen": -1.0309830904006958, + "logits/rejected": -1.232157826423645, + "logps/chosen": -1.890729546546936, + "logps/rejected": -1.792786955833435, + "loss": 4.2338, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.90729522705078, + "rewards/margins": -0.9794248342514038, + "rewards/rejected": -17.927867889404297, + "step": 13605 + }, + { + "epoch": 0.45872796521621895, + "grad_norm": 25.70821762084961, + "learning_rate": 6.566217861420701e-07, + "logits/chosen": -1.4500412940979004, + "logits/rejected": -1.3498857021331787, + "logps/chosen": -2.291757106781006, + "logps/rejected": -2.2664847373962402, + "loss": 3.3888, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.917570114135742, + "rewards/margins": -0.2527216076850891, + "rewards/rejected": -22.66485023498535, + "step": 13610 + }, + { + "epoch": 0.4588964912872021, + "grad_norm": 50.01831817626953, + "learning_rate": 6.563424274391116e-07, + "logits/chosen": -1.510155200958252, + "logits/rejected": -1.5413382053375244, + "logps/chosen": -2.076988697052002, + "logps/rejected": -2.128782272338867, + "loss": 4.0567, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.769887924194336, + "rewards/margins": 0.5179346799850464, + "rewards/rejected": -21.287822723388672, + "step": 13615 + }, + { + "epoch": 0.45906501735818533, + "grad_norm": 24.66239356994629, + "learning_rate": 6.560630146321989e-07, + "logits/chosen": -1.018226146697998, + "logits/rejected": -0.988991916179657, + "logps/chosen": -2.174020528793335, + "logps/rejected": -2.0794548988342285, + "loss": 4.0338, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.740205764770508, + "rewards/margins": -0.9456550478935242, + "rewards/rejected": -20.7945499420166, + "step": 13620 + }, + { + "epoch": 0.4592335434291685, + "grad_norm": 19.06427764892578, + "learning_rate": 6.557835478180251e-07, + "logits/chosen": -0.6391313672065735, + "logits/rejected": -0.686683714389801, + "logps/chosen": -2.8946290016174316, + "logps/rejected": -2.7074649333953857, + "loss": 5.8085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.9462890625, + "rewards/margins": -1.8716394901275635, + "rewards/rejected": -27.07465171813965, + "step": 13625 + }, + { + "epoch": 0.45940206950015167, + "grad_norm": 8.433749198913574, + "learning_rate": 6.555040270933034e-07, + "logits/chosen": -1.2226756811141968, + "logits/rejected": -1.4951807260513306, + "logps/chosen": -2.156207323074341, + "logps/rejected": -2.4813437461853027, + "loss": 2.6955, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.56207275390625, + "rewards/margins": 3.251366376876831, + "rewards/rejected": -24.813438415527344, + "step": 13630 + }, + { + "epoch": 0.45957059557113483, + "grad_norm": 24.13922691345215, + "learning_rate": 6.552244525547643e-07, + "logits/chosen": -1.204192876815796, + "logits/rejected": -1.3513870239257812, + "logps/chosen": -1.8572742938995361, + "logps/rejected": -2.0948574542999268, + "loss": 1.8049, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.572742462158203, + "rewards/margins": 2.3758318424224854, + "rewards/rejected": -20.94857406616211, + "step": 13635 + }, + { + "epoch": 0.45973912164211805, + "grad_norm": 30.350019454956055, + "learning_rate": 6.549448242991579e-07, + "logits/chosen": -1.0757592916488647, + "logits/rejected": -1.455788016319275, + "logps/chosen": -1.5065065622329712, + "logps/rejected": -1.5797616243362427, + "loss": 2.6693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -15.06506633758545, + "rewards/margins": 0.7325505018234253, + "rewards/rejected": -15.797616958618164, + "step": 13640 + }, + { + "epoch": 0.4599076477131012, + "grad_norm": 88.70205688476562, + "learning_rate": 6.546651424232523e-07, + "logits/chosen": -1.1661994457244873, + "logits/rejected": -1.6986852884292603, + "logps/chosen": -2.092421293258667, + "logps/rejected": -2.2523179054260254, + "loss": 2.1306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.924211502075195, + "rewards/margins": 1.5989638566970825, + "rewards/rejected": -22.523174285888672, + "step": 13645 + }, + { + "epoch": 0.4600761737840844, + "grad_norm": 17.726274490356445, + "learning_rate": 6.543854070238344e-07, + "logits/chosen": -0.8371933698654175, + "logits/rejected": -1.2864792346954346, + "logps/chosen": -2.6805765628814697, + "logps/rejected": -2.8042044639587402, + "loss": 4.4959, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.80576515197754, + "rewards/margins": 1.2362817525863647, + "rewards/rejected": -28.04204750061035, + "step": 13650 + }, + { + "epoch": 0.46024469985506755, + "grad_norm": 27.410917282104492, + "learning_rate": 6.541056181977098e-07, + "logits/chosen": -1.1484122276306152, + "logits/rejected": -1.2108361721038818, + "logps/chosen": -1.7693979740142822, + "logps/rejected": -1.885839819908142, + "loss": 2.7561, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.693981170654297, + "rewards/margins": 1.164419174194336, + "rewards/rejected": -18.858400344848633, + "step": 13655 + }, + { + "epoch": 0.4604132259260508, + "grad_norm": 23.559326171875, + "learning_rate": 6.538257760417022e-07, + "logits/chosen": -1.1835523843765259, + "logits/rejected": -1.2095699310302734, + "logps/chosen": -2.0060606002807617, + "logps/rejected": -2.050452709197998, + "loss": 2.9333, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.060604095458984, + "rewards/margins": 0.4439212679862976, + "rewards/rejected": -20.504526138305664, + "step": 13660 + }, + { + "epoch": 0.46058175199703394, + "grad_norm": 35.010986328125, + "learning_rate": 6.535458806526542e-07, + "logits/chosen": -1.0575556755065918, + "logits/rejected": -1.188498854637146, + "logps/chosen": -2.679745674133301, + "logps/rejected": -2.794583559036255, + "loss": 2.235, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.797454833984375, + "rewards/margins": 1.1483802795410156, + "rewards/rejected": -27.945837020874023, + "step": 13665 + }, + { + "epoch": 0.4607502780680171, + "grad_norm": 9.957199096679688, + "learning_rate": 6.532659321274262e-07, + "logits/chosen": -1.0374212265014648, + "logits/rejected": -1.3127609491348267, + "logps/chosen": -2.2107858657836914, + "logps/rejected": -2.331003189086914, + "loss": 2.5993, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.107858657836914, + "rewards/margins": 1.2021735906600952, + "rewards/rejected": -23.31003189086914, + "step": 13670 + }, + { + "epoch": 0.4609188041390003, + "grad_norm": 26.413589477539062, + "learning_rate": 6.529859305628976e-07, + "logits/chosen": -0.9869252443313599, + "logits/rejected": -1.2702324390411377, + "logps/chosen": -1.952105164527893, + "logps/rejected": -2.1264002323150635, + "loss": 2.5447, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.52104949951172, + "rewards/margins": 1.7429507970809937, + "rewards/rejected": -21.264001846313477, + "step": 13675 + }, + { + "epoch": 0.4610873302099835, + "grad_norm": 16.234878540039062, + "learning_rate": 6.527058760559657e-07, + "logits/chosen": -0.9233641624450684, + "logits/rejected": -0.9307794570922852, + "logps/chosen": -2.134063959121704, + "logps/rejected": -2.294506072998047, + "loss": 2.1801, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.340639114379883, + "rewards/margins": 1.604425072669983, + "rewards/rejected": -22.945064544677734, + "step": 13680 + }, + { + "epoch": 0.46125585628096666, + "grad_norm": 39.736698150634766, + "learning_rate": 6.524257687035465e-07, + "logits/chosen": -1.769451379776001, + "logits/rejected": -1.9414253234863281, + "logps/chosen": -2.2306134700775146, + "logps/rejected": -2.355893611907959, + "loss": 2.6742, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.306133270263672, + "rewards/margins": 1.2528001070022583, + "rewards/rejected": -23.558935165405273, + "step": 13685 + }, + { + "epoch": 0.4614243823519498, + "grad_norm": 28.430448532104492, + "learning_rate": 6.521456086025742e-07, + "logits/chosen": -1.4746885299682617, + "logits/rejected": -1.8396384716033936, + "logps/chosen": -1.8950374126434326, + "logps/rejected": -2.66398286819458, + "loss": 1.7045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.950374603271484, + "rewards/margins": 7.689455986022949, + "rewards/rejected": -26.63983154296875, + "step": 13690 + }, + { + "epoch": 0.46159290842293305, + "grad_norm": 26.767501831054688, + "learning_rate": 6.518653958500008e-07, + "logits/chosen": -1.4325040578842163, + "logits/rejected": -1.2674458026885986, + "logps/chosen": -2.7604289054870605, + "logps/rejected": -3.158684253692627, + "loss": 4.1102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.60428810119629, + "rewards/margins": 3.9825546741485596, + "rewards/rejected": -31.586841583251953, + "step": 13695 + }, + { + "epoch": 0.4617614344939162, + "grad_norm": 40.75784683227539, + "learning_rate": 6.515851305427975e-07, + "logits/chosen": -1.3811547756195068, + "logits/rejected": -1.3409370183944702, + "logps/chosen": -2.617219924926758, + "logps/rejected": -2.4330475330352783, + "loss": 5.5949, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.172199249267578, + "rewards/margins": -1.8417232036590576, + "rewards/rejected": -24.330474853515625, + "step": 13700 + }, + { + "epoch": 0.4619299605648994, + "grad_norm": 32.274192810058594, + "learning_rate": 6.513048127779526e-07, + "logits/chosen": -1.4233167171478271, + "logits/rejected": -1.477988839149475, + "logps/chosen": -2.823249578475952, + "logps/rejected": -2.5230438709259033, + "loss": 6.2899, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.232494354248047, + "rewards/margins": -3.0020570755004883, + "rewards/rejected": -25.230438232421875, + "step": 13705 + }, + { + "epoch": 0.46209848663588254, + "grad_norm": 15.3927583694458, + "learning_rate": 6.510244426524731e-07, + "logits/chosen": -1.1456549167633057, + "logits/rejected": -1.2661025524139404, + "logps/chosen": -2.2368240356445312, + "logps/rejected": -2.704552412033081, + "loss": 1.7765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.368236541748047, + "rewards/margins": 4.677285194396973, + "rewards/rejected": -27.0455265045166, + "step": 13710 + }, + { + "epoch": 0.46226701270686577, + "grad_norm": 10.991805076599121, + "learning_rate": 6.507440202633841e-07, + "logits/chosen": -1.2727103233337402, + "logits/rejected": -1.5026662349700928, + "logps/chosen": -2.2351553440093994, + "logps/rejected": -2.759641170501709, + "loss": 1.5506, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.351551055908203, + "rewards/margins": 5.244860649108887, + "rewards/rejected": -27.59641456604004, + "step": 13715 + }, + { + "epoch": 0.46243553877784893, + "grad_norm": 226.2294921875, + "learning_rate": 6.504635457077289e-07, + "logits/chosen": -1.145394206047058, + "logits/rejected": -1.1384984254837036, + "logps/chosen": -2.7437894344329834, + "logps/rejected": -2.415273427963257, + "loss": 7.1034, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.43789291381836, + "rewards/margins": -3.2851600646972656, + "rewards/rejected": -24.152732849121094, + "step": 13720 + }, + { + "epoch": 0.4626040648488321, + "grad_norm": 31.03740119934082, + "learning_rate": 6.501830190825685e-07, + "logits/chosen": -0.7348178625106812, + "logits/rejected": -0.8843439817428589, + "logps/chosen": -2.0869638919830322, + "logps/rejected": -2.4660804271698, + "loss": 2.054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.869638442993164, + "rewards/margins": 3.7911689281463623, + "rewards/rejected": -24.66080665588379, + "step": 13725 + }, + { + "epoch": 0.4627725909198153, + "grad_norm": 17.494319915771484, + "learning_rate": 6.499024404849821e-07, + "logits/chosen": -1.1662698984146118, + "logits/rejected": -1.1467528343200684, + "logps/chosen": -1.6923586130142212, + "logps/rejected": -1.6379038095474243, + "loss": 3.7571, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.923587799072266, + "rewards/margins": -0.5445479154586792, + "rewards/rejected": -16.379037857055664, + "step": 13730 + }, + { + "epoch": 0.4629411169907985, + "grad_norm": 23.104352951049805, + "learning_rate": 6.496218100120672e-07, + "logits/chosen": -1.1973875761032104, + "logits/rejected": -1.36476731300354, + "logps/chosen": -1.6315116882324219, + "logps/rejected": -1.6547889709472656, + "loss": 3.2729, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.31511688232422, + "rewards/margins": 0.23277311027050018, + "rewards/rejected": -16.54789161682129, + "step": 13735 + }, + { + "epoch": 0.46310964306178165, + "grad_norm": 27.43927764892578, + "learning_rate": 6.493411277609385e-07, + "logits/chosen": -1.2858362197875977, + "logits/rejected": -1.3546708822250366, + "logps/chosen": -1.8596910238265991, + "logps/rejected": -2.0080785751342773, + "loss": 2.8077, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.596908569335938, + "rewards/margins": 1.4838764667510986, + "rewards/rejected": -20.08078384399414, + "step": 13740 + }, + { + "epoch": 0.4632781691327648, + "grad_norm": 30.223655700683594, + "learning_rate": 6.490603938287294e-07, + "logits/chosen": -1.3623411655426025, + "logits/rejected": -1.4095981121063232, + "logps/chosen": -2.1852593421936035, + "logps/rejected": -2.1864328384399414, + "loss": 3.8224, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.85259437561035, + "rewards/margins": 0.011735248379409313, + "rewards/rejected": -21.864328384399414, + "step": 13745 + }, + { + "epoch": 0.46344669520374804, + "grad_norm": 1.1034603118896484, + "learning_rate": 6.487796083125907e-07, + "logits/chosen": -1.5010812282562256, + "logits/rejected": -1.6568992137908936, + "logps/chosen": -1.8861267566680908, + "logps/rejected": -2.314347267150879, + "loss": 2.3827, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.861268997192383, + "rewards/margins": 4.282201290130615, + "rewards/rejected": -23.14347267150879, + "step": 13750 + }, + { + "epoch": 0.4636152212747312, + "grad_norm": 25.887685775756836, + "learning_rate": 6.484987713096911e-07, + "logits/chosen": -1.2538336515426636, + "logits/rejected": -1.1933120489120483, + "logps/chosen": -1.827857255935669, + "logps/rejected": -2.159615993499756, + "loss": 2.7857, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.278573989868164, + "rewards/margins": 3.317587375640869, + "rewards/rejected": -21.596160888671875, + "step": 13755 + }, + { + "epoch": 0.46378374734571437, + "grad_norm": 23.123455047607422, + "learning_rate": 6.482178829172175e-07, + "logits/chosen": -1.5872116088867188, + "logits/rejected": -1.8486239910125732, + "logps/chosen": -2.3375518321990967, + "logps/rejected": -2.6852965354919434, + "loss": 2.7182, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.375520706176758, + "rewards/margins": 3.4774460792541504, + "rewards/rejected": -26.85296630859375, + "step": 13760 + }, + { + "epoch": 0.46395227341669754, + "grad_norm": 30.978561401367188, + "learning_rate": 6.479369432323742e-07, + "logits/chosen": -0.8670709729194641, + "logits/rejected": -1.0293810367584229, + "logps/chosen": -1.9717384576797485, + "logps/rejected": -2.28629994392395, + "loss": 3.0605, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.71738624572754, + "rewards/margins": 3.145613193511963, + "rewards/rejected": -22.86299705505371, + "step": 13765 + }, + { + "epoch": 0.46412079948768076, + "grad_norm": 12.688581466674805, + "learning_rate": 6.476559523523833e-07, + "logits/chosen": -1.5742883682250977, + "logits/rejected": -1.7252527475357056, + "logps/chosen": -2.204754590988159, + "logps/rejected": -2.2683627605438232, + "loss": 2.7556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.04754638671875, + "rewards/margins": 0.6360834836959839, + "rewards/rejected": -22.683629989624023, + "step": 13770 + }, + { + "epoch": 0.4642893255586639, + "grad_norm": 10.192408561706543, + "learning_rate": 6.473749103744848e-07, + "logits/chosen": -1.0795964002609253, + "logits/rejected": -1.212799310684204, + "logps/chosen": -1.7061240673065186, + "logps/rejected": -1.9400373697280884, + "loss": 1.8272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.061241149902344, + "rewards/margins": 2.3391315937042236, + "rewards/rejected": -19.400375366210938, + "step": 13775 + }, + { + "epoch": 0.4644578516296471, + "grad_norm": 24.896669387817383, + "learning_rate": 6.470938173959361e-07, + "logits/chosen": -1.1159603595733643, + "logits/rejected": -1.3001426458358765, + "logps/chosen": -2.2676663398742676, + "logps/rejected": -2.2217071056365967, + "loss": 3.6825, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.67666244506836, + "rewards/margins": -0.45959025621414185, + "rewards/rejected": -22.217071533203125, + "step": 13780 + }, + { + "epoch": 0.4646263777006303, + "grad_norm": 28.015443801879883, + "learning_rate": 6.468126735140122e-07, + "logits/chosen": -1.4823081493377686, + "logits/rejected": -1.6397498846054077, + "logps/chosen": -2.4521548748016357, + "logps/rejected": -2.624523162841797, + "loss": 2.4646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.521547317504883, + "rewards/margins": 1.7236862182617188, + "rewards/rejected": -26.2452335357666, + "step": 13785 + }, + { + "epoch": 0.4647949037716135, + "grad_norm": 102.11254119873047, + "learning_rate": 6.465314788260065e-07, + "logits/chosen": -0.9647348523139954, + "logits/rejected": -1.1873705387115479, + "logps/chosen": -2.272239923477173, + "logps/rejected": -2.569958448410034, + "loss": 2.1275, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.722400665283203, + "rewards/margins": 2.9771876335144043, + "rewards/rejected": -25.6995849609375, + "step": 13790 + }, + { + "epoch": 0.46496342984259664, + "grad_norm": 17.63508415222168, + "learning_rate": 6.46250233429229e-07, + "logits/chosen": -1.3895456790924072, + "logits/rejected": -1.3987621068954468, + "logps/chosen": -2.464350938796997, + "logps/rejected": -2.446322202682495, + "loss": 4.5296, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.643508911132812, + "rewards/margins": -0.18028488755226135, + "rewards/rejected": -24.46322250366211, + "step": 13795 + }, + { + "epoch": 0.4651319559135798, + "grad_norm": 14.113638877868652, + "learning_rate": 6.459689374210078e-07, + "logits/chosen": -1.4735864400863647, + "logits/rejected": -1.517884373664856, + "logps/chosen": -1.9817434549331665, + "logps/rejected": -2.366046905517578, + "loss": 2.2756, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.817434310913086, + "rewards/margins": 3.8430354595184326, + "rewards/rejected": -23.66046714782715, + "step": 13800 + }, + { + "epoch": 0.46530048198456303, + "grad_norm": 0.7048804759979248, + "learning_rate": 6.456875908986882e-07, + "logits/chosen": -1.4789502620697021, + "logits/rejected": -1.5977550745010376, + "logps/chosen": -1.8480441570281982, + "logps/rejected": -2.0476698875427246, + "loss": 2.3848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.48044204711914, + "rewards/margins": 1.9962581396102905, + "rewards/rejected": -20.476699829101562, + "step": 13805 + }, + { + "epoch": 0.4654690080555462, + "grad_norm": 26.780628204345703, + "learning_rate": 6.454061939596334e-07, + "logits/chosen": -0.9053479433059692, + "logits/rejected": -0.8510274887084961, + "logps/chosen": -2.4856226444244385, + "logps/rejected": -2.364593982696533, + "loss": 4.3577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.85622215270996, + "rewards/margins": -1.2102824449539185, + "rewards/rejected": -23.64594268798828, + "step": 13810 + }, + { + "epoch": 0.46563753412652936, + "grad_norm": 24.32168960571289, + "learning_rate": 6.451247467012234e-07, + "logits/chosen": -1.5808491706848145, + "logits/rejected": -1.6128294467926025, + "logps/chosen": -2.07789945602417, + "logps/rejected": -2.0538277626037598, + "loss": 3.4658, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.778995513916016, + "rewards/margins": -0.24072055518627167, + "rewards/rejected": -20.53827476501465, + "step": 13815 + }, + { + "epoch": 0.46580606019751253, + "grad_norm": 61.70922088623047, + "learning_rate": 6.448432492208566e-07, + "logits/chosen": -1.8394935131072998, + "logits/rejected": -1.8253707885742188, + "logps/chosen": -2.7731966972351074, + "logps/rejected": -2.8784608840942383, + "loss": 3.0673, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.73196792602539, + "rewards/margins": 1.0526418685913086, + "rewards/rejected": -28.784610748291016, + "step": 13820 + }, + { + "epoch": 0.46597458626849575, + "grad_norm": 17.952144622802734, + "learning_rate": 6.445617016159475e-07, + "logits/chosen": -1.2500659227371216, + "logits/rejected": -1.161217212677002, + "logps/chosen": -1.7770369052886963, + "logps/rejected": -1.9865745306015015, + "loss": 3.5517, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.770368576049805, + "rewards/margins": 2.095376491546631, + "rewards/rejected": -19.865747451782227, + "step": 13825 + }, + { + "epoch": 0.4661431123394789, + "grad_norm": 15.38546085357666, + "learning_rate": 6.442801039839292e-07, + "logits/chosen": -1.3103373050689697, + "logits/rejected": -1.481055736541748, + "logps/chosen": -1.623786211013794, + "logps/rejected": -1.827819585800171, + "loss": 2.5167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.23786163330078, + "rewards/margins": 2.0403342247009277, + "rewards/rejected": -18.278194427490234, + "step": 13830 + }, + { + "epoch": 0.4663116384104621, + "grad_norm": 14.652449607849121, + "learning_rate": 6.439984564222511e-07, + "logits/chosen": -1.3200714588165283, + "logits/rejected": -1.2847893238067627, + "logps/chosen": -2.327244997024536, + "logps/rejected": -2.3881309032440186, + "loss": 3.8291, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.27244758605957, + "rewards/margins": 0.6088644862174988, + "rewards/rejected": -23.881309509277344, + "step": 13835 + }, + { + "epoch": 0.4664801644814453, + "grad_norm": 22.924610137939453, + "learning_rate": 6.437167590283808e-07, + "logits/chosen": -1.0864531993865967, + "logits/rejected": -1.2700326442718506, + "logps/chosen": -2.320392370223999, + "logps/rejected": -2.349609375, + "loss": 3.0595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.20392417907715, + "rewards/margins": 0.29216688871383667, + "rewards/rejected": -23.49609375, + "step": 13840 + }, + { + "epoch": 0.46664869055242847, + "grad_norm": 33.14518737792969, + "learning_rate": 6.434350118998024e-07, + "logits/chosen": -1.2175791263580322, + "logits/rejected": -1.4766108989715576, + "logps/chosen": -1.7124464511871338, + "logps/rejected": -2.053079605102539, + "loss": 1.7015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.124462127685547, + "rewards/margins": 3.4063332080841064, + "rewards/rejected": -20.530797958374023, + "step": 13845 + }, + { + "epoch": 0.46681721662341163, + "grad_norm": 23.546630859375, + "learning_rate": 6.431532151340172e-07, + "logits/chosen": -1.2797232866287231, + "logits/rejected": -1.1614303588867188, + "logps/chosen": -2.258538007736206, + "logps/rejected": -2.3078856468200684, + "loss": 4.2846, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.58538055419922, + "rewards/margins": 0.4934772551059723, + "rewards/rejected": -23.078855514526367, + "step": 13850 + }, + { + "epoch": 0.4669857426943948, + "grad_norm": 25.121030807495117, + "learning_rate": 6.428713688285446e-07, + "logits/chosen": -1.067025899887085, + "logits/rejected": -1.2792638540267944, + "logps/chosen": -2.191856861114502, + "logps/rejected": -2.5970585346221924, + "loss": 0.9155, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.918567657470703, + "rewards/margins": 4.052019119262695, + "rewards/rejected": -25.970584869384766, + "step": 13855 + }, + { + "epoch": 0.467154268765378, + "grad_norm": 6.469688892364502, + "learning_rate": 6.425894730809198e-07, + "logits/chosen": -1.7441765069961548, + "logits/rejected": -1.9495182037353516, + "logps/chosen": -2.265683889389038, + "logps/rejected": -2.710209608078003, + "loss": 1.1884, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.65683937072754, + "rewards/margins": 4.445258617401123, + "rewards/rejected": -27.102096557617188, + "step": 13860 + }, + { + "epoch": 0.4673227948363612, + "grad_norm": 21.148752212524414, + "learning_rate": 6.423075279886963e-07, + "logits/chosen": -1.4913420677185059, + "logits/rejected": -1.7002441883087158, + "logps/chosen": -1.9288151264190674, + "logps/rejected": -2.1135830879211426, + "loss": 2.7459, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.288150787353516, + "rewards/margins": 1.847680687904358, + "rewards/rejected": -21.13582992553711, + "step": 13865 + }, + { + "epoch": 0.46749132090734435, + "grad_norm": 18.34513282775879, + "learning_rate": 6.420255336494441e-07, + "logits/chosen": -1.4038597345352173, + "logits/rejected": -1.4210649728775024, + "logps/chosen": -2.7172553539276123, + "logps/rejected": -3.0526812076568604, + "loss": 2.7329, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.172550201416016, + "rewards/margins": 3.354259490966797, + "rewards/rejected": -30.526809692382812, + "step": 13870 + }, + { + "epoch": 0.4676598469783275, + "grad_norm": 36.14348602294922, + "learning_rate": 6.417434901607504e-07, + "logits/chosen": -1.338536262512207, + "logits/rejected": -1.4746836423873901, + "logps/chosen": -2.4771456718444824, + "logps/rejected": -2.722282886505127, + "loss": 3.0973, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.77145767211914, + "rewards/margins": 2.451371669769287, + "rewards/rejected": -27.222829818725586, + "step": 13875 + }, + { + "epoch": 0.46782837304931074, + "grad_norm": 72.72344207763672, + "learning_rate": 6.414613976202192e-07, + "logits/chosen": -0.7898836135864258, + "logits/rejected": -0.6323626637458801, + "logps/chosen": -1.9599090814590454, + "logps/rejected": -1.9197161197662354, + "loss": 3.5675, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.599090576171875, + "rewards/margins": -0.4019303321838379, + "rewards/rejected": -19.197160720825195, + "step": 13880 + }, + { + "epoch": 0.4679968991202939, + "grad_norm": 20.84559440612793, + "learning_rate": 6.411792561254715e-07, + "logits/chosen": -1.2231805324554443, + "logits/rejected": -1.2022231817245483, + "logps/chosen": -1.9406921863555908, + "logps/rejected": -2.292463541030884, + "loss": 2.1474, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.40692138671875, + "rewards/margins": 3.5177130699157715, + "rewards/rejected": -22.92463493347168, + "step": 13885 + }, + { + "epoch": 0.4681654251912771, + "grad_norm": 36.6888427734375, + "learning_rate": 6.408970657741457e-07, + "logits/chosen": -1.4471817016601562, + "logits/rejected": -1.2688713073730469, + "logps/chosen": -2.1352126598358154, + "logps/rejected": -2.2517178058624268, + "loss": 3.4562, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.352127075195312, + "rewards/margins": 1.1650497913360596, + "rewards/rejected": -22.51717758178711, + "step": 13890 + }, + { + "epoch": 0.4683339512622603, + "grad_norm": 33.751834869384766, + "learning_rate": 6.406148266638963e-07, + "logits/chosen": -1.157727599143982, + "logits/rejected": -1.255642294883728, + "logps/chosen": -2.18957781791687, + "logps/rejected": -2.665783405303955, + "loss": 1.6078, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.89577865600586, + "rewards/margins": 4.7620530128479, + "rewards/rejected": -26.657833099365234, + "step": 13895 + }, + { + "epoch": 0.46850247733324346, + "grad_norm": 43.1579475402832, + "learning_rate": 6.403325388923956e-07, + "logits/chosen": -1.3292075395584106, + "logits/rejected": -1.3559629917144775, + "logps/chosen": -1.9967514276504517, + "logps/rejected": -2.118342399597168, + "loss": 2.8668, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.967514038085938, + "rewards/margins": 1.2159093618392944, + "rewards/rejected": -21.183422088623047, + "step": 13900 + }, + { + "epoch": 0.4686710034042266, + "grad_norm": 16.178476333618164, + "learning_rate": 6.400502025573319e-07, + "logits/chosen": -1.3434818983078003, + "logits/rejected": -1.5653314590454102, + "logps/chosen": -1.8827110528945923, + "logps/rejected": -2.1971614360809326, + "loss": 3.3364, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.82710838317871, + "rewards/margins": 3.144505023956299, + "rewards/rejected": -21.971614837646484, + "step": 13905 + }, + { + "epoch": 0.4688395294752098, + "grad_norm": 27.721033096313477, + "learning_rate": 6.397678177564109e-07, + "logits/chosen": -1.3376166820526123, + "logits/rejected": -1.5822420120239258, + "logps/chosen": -1.7184581756591797, + "logps/rejected": -2.0001845359802246, + "loss": 2.1544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.184581756591797, + "rewards/margins": 2.817262649536133, + "rewards/rejected": -20.001846313476562, + "step": 13910 + }, + { + "epoch": 0.469008055546193, + "grad_norm": 50.78194808959961, + "learning_rate": 6.39485384587355e-07, + "logits/chosen": -1.2960230112075806, + "logits/rejected": -1.3487869501113892, + "logps/chosen": -1.9403388500213623, + "logps/rejected": -1.9828157424926758, + "loss": 3.7826, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.40338897705078, + "rewards/margins": 0.424767404794693, + "rewards/rejected": -19.828155517578125, + "step": 13915 + }, + { + "epoch": 0.4691765816171762, + "grad_norm": 201.55856323242188, + "learning_rate": 6.392029031479029e-07, + "logits/chosen": -0.938056468963623, + "logits/rejected": -1.2472608089447021, + "logps/chosen": -2.160804510116577, + "logps/rejected": -2.3859035968780518, + "loss": 2.4459, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.608047485351562, + "rewards/margins": 2.2509894371032715, + "rewards/rejected": -23.85903549194336, + "step": 13920 + }, + { + "epoch": 0.46934510768815935, + "grad_norm": 39.64912796020508, + "learning_rate": 6.389203735358103e-07, + "logits/chosen": -1.1982097625732422, + "logits/rejected": -1.1630780696868896, + "logps/chosen": -2.0709540843963623, + "logps/rejected": -1.9648492336273193, + "loss": 4.1928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.709543228149414, + "rewards/margins": -1.0610501766204834, + "rewards/rejected": -19.64849281311035, + "step": 13925 + }, + { + "epoch": 0.4695136337591425, + "grad_norm": 4.722727298736572, + "learning_rate": 6.386377958488497e-07, + "logits/chosen": -1.9029403924942017, + "logits/rejected": -1.9499372243881226, + "logps/chosen": -2.29158091545105, + "logps/rejected": -2.7147128582000732, + "loss": 1.8003, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.915807723999023, + "rewards/margins": 4.231320381164551, + "rewards/rejected": -27.14712905883789, + "step": 13930 + }, + { + "epoch": 0.46968215983012573, + "grad_norm": 142.9805145263672, + "learning_rate": 6.3835517018481e-07, + "logits/chosen": -1.6389110088348389, + "logits/rejected": -1.3937580585479736, + "logps/chosen": -2.1385300159454346, + "logps/rejected": -2.0082430839538574, + "loss": 4.8887, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.385299682617188, + "rewards/margins": -1.302868127822876, + "rewards/rejected": -20.08243179321289, + "step": 13935 + }, + { + "epoch": 0.4698506859011089, + "grad_norm": 29.812519073486328, + "learning_rate": 6.38072496641497e-07, + "logits/chosen": -1.1409599781036377, + "logits/rejected": -1.3570410013198853, + "logps/chosen": -2.3800430297851562, + "logps/rejected": -2.7600884437561035, + "loss": 2.6882, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.800432205200195, + "rewards/margins": 3.8004555702209473, + "rewards/rejected": -27.60088539123535, + "step": 13940 + }, + { + "epoch": 0.47001921197209207, + "grad_norm": 35.49007034301758, + "learning_rate": 6.377897753167328e-07, + "logits/chosen": -1.118445873260498, + "logits/rejected": -1.2374904155731201, + "logps/chosen": -1.9628002643585205, + "logps/rejected": -2.1348910331726074, + "loss": 2.7892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.62800407409668, + "rewards/margins": 1.7209068536758423, + "rewards/rejected": -21.34891128540039, + "step": 13945 + }, + { + "epoch": 0.4701877380430753, + "grad_norm": 27.707048416137695, + "learning_rate": 6.375070063083558e-07, + "logits/chosen": -1.0523077249526978, + "logits/rejected": -1.3618173599243164, + "logps/chosen": -2.0080909729003906, + "logps/rejected": -2.720878839492798, + "loss": 1.3578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.080907821655273, + "rewards/margins": 7.127877712249756, + "rewards/rejected": -27.208789825439453, + "step": 13950 + }, + { + "epoch": 0.47035626411405845, + "grad_norm": 26.838281631469727, + "learning_rate": 6.372241897142217e-07, + "logits/chosen": -0.7201957702636719, + "logits/rejected": -0.920723557472229, + "logps/chosen": -1.9195568561553955, + "logps/rejected": -2.176978826522827, + "loss": 2.7029, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.195566177368164, + "rewards/margins": 2.5742220878601074, + "rewards/rejected": -21.76978874206543, + "step": 13955 + }, + { + "epoch": 0.4705247901850416, + "grad_norm": 69.2751693725586, + "learning_rate": 6.36941325632202e-07, + "logits/chosen": -1.1831648349761963, + "logits/rejected": -1.8111673593521118, + "logps/chosen": -2.3680479526519775, + "logps/rejected": -2.7056612968444824, + "loss": 3.0904, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.68048095703125, + "rewards/margins": 3.3761305809020996, + "rewards/rejected": -27.056610107421875, + "step": 13960 + }, + { + "epoch": 0.4706933162560248, + "grad_norm": 9.624326705932617, + "learning_rate": 6.366584141601845e-07, + "logits/chosen": -1.4053133726119995, + "logits/rejected": -1.5715116262435913, + "logps/chosen": -3.049325466156006, + "logps/rejected": -3.178790807723999, + "loss": 4.7181, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.493255615234375, + "rewards/margins": 1.2946542501449585, + "rewards/rejected": -31.787912368774414, + "step": 13965 + }, + { + "epoch": 0.470861842327008, + "grad_norm": 24.864315032958984, + "learning_rate": 6.363754553960743e-07, + "logits/chosen": -1.1395964622497559, + "logits/rejected": -1.1456211805343628, + "logps/chosen": -1.9944721460342407, + "logps/rejected": -2.139186382293701, + "loss": 2.812, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.944721221923828, + "rewards/margins": 1.4471421241760254, + "rewards/rejected": -21.391860961914062, + "step": 13970 + }, + { + "epoch": 0.4710303683979912, + "grad_norm": 25.200536727905273, + "learning_rate": 6.360924494377918e-07, + "logits/chosen": -1.249730110168457, + "logits/rejected": -1.3911710977554321, + "logps/chosen": -2.2585055828094482, + "logps/rejected": -2.606250286102295, + "loss": 2.649, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.58505630493164, + "rewards/margins": 3.4774513244628906, + "rewards/rejected": -26.062503814697266, + "step": 13975 + }, + { + "epoch": 0.47119889446897434, + "grad_norm": 22.978961944580078, + "learning_rate": 6.358093963832745e-07, + "logits/chosen": -1.1088197231292725, + "logits/rejected": -1.2002352476119995, + "logps/chosen": -1.988402009010315, + "logps/rejected": -2.3186020851135254, + "loss": 1.9841, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.88401985168457, + "rewards/margins": 3.301999568939209, + "rewards/rejected": -23.186019897460938, + "step": 13980 + }, + { + "epoch": 0.4713674205399575, + "grad_norm": 29.248794555664062, + "learning_rate": 6.355262963304756e-07, + "logits/chosen": -2.046112298965454, + "logits/rejected": -2.1253397464752197, + "logps/chosen": -1.8194358348846436, + "logps/rejected": -1.847980260848999, + "loss": 2.9246, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.194358825683594, + "rewards/margins": 0.28544360399246216, + "rewards/rejected": -18.47980308532715, + "step": 13985 + }, + { + "epoch": 0.4715359466109407, + "grad_norm": 4.779592037200928, + "learning_rate": 6.352431493773651e-07, + "logits/chosen": -1.2561156749725342, + "logits/rejected": -1.378087043762207, + "logps/chosen": -2.207831859588623, + "logps/rejected": -2.593492031097412, + "loss": 1.459, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.078319549560547, + "rewards/margins": 3.8566012382507324, + "rewards/rejected": -25.934921264648438, + "step": 13990 + }, + { + "epoch": 0.4717044726819239, + "grad_norm": 70.30220031738281, + "learning_rate": 6.349599556219291e-07, + "logits/chosen": -1.090355634689331, + "logits/rejected": -1.3683557510375977, + "logps/chosen": -2.2222657203674316, + "logps/rejected": -2.777714252471924, + "loss": 2.9842, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.222658157348633, + "rewards/margins": 5.554482460021973, + "rewards/rejected": -27.77713966369629, + "step": 13995 + }, + { + "epoch": 0.47187299875290706, + "grad_norm": 18.473787307739258, + "learning_rate": 6.346767151621696e-07, + "logits/chosen": -1.13655686378479, + "logits/rejected": -0.9696399569511414, + "logps/chosen": -1.8745672702789307, + "logps/rejected": -1.9711214303970337, + "loss": 2.9945, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.74567222595215, + "rewards/margins": 0.9655420184135437, + "rewards/rejected": -19.71121597290039, + "step": 14000 + }, + { + "epoch": 0.47187299875290706, + "eval_logits/chosen": -1.6666821241378784, + "eval_logits/rejected": -1.7839144468307495, + "eval_logps/chosen": -1.9983142614364624, + "eval_logps/rejected": -2.1036059856414795, + "eval_loss": 3.0070505142211914, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -19.983142852783203, + "eval_rewards/margins": 1.052917242050171, + "eval_rewards/rejected": -21.036060333251953, + "eval_runtime": 12.8911, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 14000 + }, + { + "epoch": 0.4720415248238903, + "grad_norm": 80.3492660522461, + "learning_rate": 6.343934280961051e-07, + "logits/chosen": -1.4856373071670532, + "logits/rejected": -1.2735168933868408, + "logps/chosen": -2.5143871307373047, + "logps/rejected": -2.415238857269287, + "loss": 4.5798, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.143877029418945, + "rewards/margins": -0.9914867281913757, + "rewards/rejected": -24.152387619018555, + "step": 14005 + }, + { + "epoch": 0.47221005089487345, + "grad_norm": 19.168996810913086, + "learning_rate": 6.341100945217699e-07, + "logits/chosen": -1.0347862243652344, + "logits/rejected": -1.228040337562561, + "logps/chosen": -1.7857773303985596, + "logps/rejected": -2.0644729137420654, + "loss": 2.1059, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.85777473449707, + "rewards/margins": 2.786953926086426, + "rewards/rejected": -20.64472770690918, + "step": 14010 + }, + { + "epoch": 0.4723785769658566, + "grad_norm": 38.48196029663086, + "learning_rate": 6.338267145372147e-07, + "logits/chosen": -1.4567896127700806, + "logits/rejected": -1.55691659450531, + "logps/chosen": -2.0581021308898926, + "logps/rejected": -2.0358104705810547, + "loss": 3.5322, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.58102035522461, + "rewards/margins": -0.22291651368141174, + "rewards/rejected": -20.358102798461914, + "step": 14015 + }, + { + "epoch": 0.4725471030368398, + "grad_norm": 19.195449829101562, + "learning_rate": 6.335432882405062e-07, + "logits/chosen": -1.2669869661331177, + "logits/rejected": -1.4322224855422974, + "logps/chosen": -1.9909021854400635, + "logps/rejected": -2.0929489135742188, + "loss": 2.9498, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.909021377563477, + "rewards/margins": 1.0204694271087646, + "rewards/rejected": -20.92949104309082, + "step": 14020 + }, + { + "epoch": 0.472715629107823, + "grad_norm": 27.397371292114258, + "learning_rate": 6.332598157297271e-07, + "logits/chosen": -1.4022011756896973, + "logits/rejected": -1.1833802461624146, + "logps/chosen": -1.933884620666504, + "logps/rejected": -1.8614232540130615, + "loss": 3.8703, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.33884620666504, + "rewards/margins": -0.724612832069397, + "rewards/rejected": -18.61423110961914, + "step": 14025 + }, + { + "epoch": 0.47288415517880616, + "grad_norm": 30.178300857543945, + "learning_rate": 6.329762971029759e-07, + "logits/chosen": -1.2592933177947998, + "logits/rejected": -1.3360066413879395, + "logps/chosen": -2.270819902420044, + "logps/rejected": -2.3751158714294434, + "loss": 2.8132, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.70819664001465, + "rewards/margins": 1.0429630279541016, + "rewards/rejected": -23.751161575317383, + "step": 14030 + }, + { + "epoch": 0.47305268124978933, + "grad_norm": 23.464101791381836, + "learning_rate": 6.326927324583674e-07, + "logits/chosen": -1.0942169427871704, + "logits/rejected": -1.1212034225463867, + "logps/chosen": -1.8277006149291992, + "logps/rejected": -1.815222978591919, + "loss": 3.3575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.277008056640625, + "rewards/margins": -0.12477798759937286, + "rewards/rejected": -18.15222930908203, + "step": 14035 + }, + { + "epoch": 0.4732212073207725, + "grad_norm": 34.82871627807617, + "learning_rate": 6.324091218940322e-07, + "logits/chosen": -1.5310542583465576, + "logits/rejected": -1.5248639583587646, + "logps/chosen": -2.289816379547119, + "logps/rejected": -2.739499807357788, + "loss": 4.0646, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.898162841796875, + "rewards/margins": 4.4968366622924805, + "rewards/rejected": -27.39499855041504, + "step": 14040 + }, + { + "epoch": 0.4733897333917557, + "grad_norm": 25.91857147216797, + "learning_rate": 6.321254655081165e-07, + "logits/chosen": -1.0183018445968628, + "logits/rejected": -1.0823280811309814, + "logps/chosen": -1.8455663919448853, + "logps/rejected": -1.8559458255767822, + "loss": 3.2614, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.455663681030273, + "rewards/margins": 0.10379400104284286, + "rewards/rejected": -18.559459686279297, + "step": 14045 + }, + { + "epoch": 0.4735582594627389, + "grad_norm": 33.44871139526367, + "learning_rate": 6.318417633987826e-07, + "logits/chosen": -1.2877857685089111, + "logits/rejected": -1.4010345935821533, + "logps/chosen": -2.4368534088134766, + "logps/rejected": -2.478266477584839, + "loss": 2.9642, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.368532180786133, + "rewards/margins": 0.4141322076320648, + "rewards/rejected": -24.782665252685547, + "step": 14050 + }, + { + "epoch": 0.47372678553372205, + "grad_norm": 3.0363457202911377, + "learning_rate": 6.31558015664209e-07, + "logits/chosen": -1.3036028146743774, + "logits/rejected": -1.3670432567596436, + "logps/chosen": -2.2221055030822754, + "logps/rejected": -2.45412015914917, + "loss": 2.2587, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.221054077148438, + "rewards/margins": 2.320145845413208, + "rewards/rejected": -24.541200637817383, + "step": 14055 + }, + { + "epoch": 0.47389531160470527, + "grad_norm": 19.182397842407227, + "learning_rate": 6.312742224025891e-07, + "logits/chosen": -1.494376301765442, + "logits/rejected": -1.308542013168335, + "logps/chosen": -1.7802484035491943, + "logps/rejected": -1.8587958812713623, + "loss": 2.7654, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.802486419677734, + "rewards/margins": 0.7854740023612976, + "rewards/rejected": -18.58795738220215, + "step": 14060 + }, + { + "epoch": 0.47406383767568844, + "grad_norm": 24.555452346801758, + "learning_rate": 6.30990383712133e-07, + "logits/chosen": -1.552181363105774, + "logits/rejected": -1.574568510055542, + "logps/chosen": -1.7801599502563477, + "logps/rejected": -1.9626197814941406, + "loss": 2.7652, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.801597595214844, + "rewards/margins": 1.8245998620986938, + "rewards/rejected": -19.626197814941406, + "step": 14065 + }, + { + "epoch": 0.4742323637466716, + "grad_norm": 42.724327087402344, + "learning_rate": 6.307064996910658e-07, + "logits/chosen": -1.3822429180145264, + "logits/rejected": -1.4136865139007568, + "logps/chosen": -1.8683557510375977, + "logps/rejected": -1.9732444286346436, + "loss": 3.3348, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.683557510375977, + "rewards/margins": 1.0488868951797485, + "rewards/rejected": -19.732444763183594, + "step": 14070 + }, + { + "epoch": 0.47440088981765477, + "grad_norm": 26.224445343017578, + "learning_rate": 6.304225704376288e-07, + "logits/chosen": -1.298749566078186, + "logits/rejected": -1.3914250135421753, + "logps/chosen": -1.7438873052597046, + "logps/rejected": -1.8251692056655884, + "loss": 2.9275, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.438873291015625, + "rewards/margins": 0.8128176927566528, + "rewards/rejected": -18.251689910888672, + "step": 14075 + }, + { + "epoch": 0.474569415888638, + "grad_norm": 24.768003463745117, + "learning_rate": 6.301385960500784e-07, + "logits/chosen": -1.6690937280654907, + "logits/rejected": -1.5592644214630127, + "logps/chosen": -2.0394389629364014, + "logps/rejected": -2.316621780395508, + "loss": 1.6017, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.394390106201172, + "rewards/margins": 2.771827220916748, + "rewards/rejected": -23.166217803955078, + "step": 14080 + }, + { + "epoch": 0.47473794195962116, + "grad_norm": 31.17376708984375, + "learning_rate": 6.298545766266874e-07, + "logits/chosen": -1.3782024383544922, + "logits/rejected": -1.5816433429718018, + "logps/chosen": -1.900072455406189, + "logps/rejected": -2.0293757915496826, + "loss": 2.3152, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.00072479248047, + "rewards/margins": 1.2930349111557007, + "rewards/rejected": -20.293758392333984, + "step": 14085 + }, + { + "epoch": 0.4749064680306043, + "grad_norm": 18.251209259033203, + "learning_rate": 6.295705122657435e-07, + "logits/chosen": -1.1897153854370117, + "logits/rejected": -1.1693168878555298, + "logps/chosen": -2.068413257598877, + "logps/rejected": -2.2642273902893066, + "loss": 1.9694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.684133529663086, + "rewards/margins": 1.9581406116485596, + "rewards/rejected": -22.64227294921875, + "step": 14090 + }, + { + "epoch": 0.4750749941015875, + "grad_norm": 69.73340606689453, + "learning_rate": 6.2928640306555e-07, + "logits/chosen": -1.4802566766738892, + "logits/rejected": -1.614418387413025, + "logps/chosen": -2.283409357070923, + "logps/rejected": -2.8050522804260254, + "loss": 3.6148, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.834091186523438, + "rewards/margins": 5.216431617736816, + "rewards/rejected": -28.050525665283203, + "step": 14095 + }, + { + "epoch": 0.4752435201725707, + "grad_norm": 27.110261917114258, + "learning_rate": 6.290022491244262e-07, + "logits/chosen": -0.9759872555732727, + "logits/rejected": -1.0511130094528198, + "logps/chosen": -1.9783554077148438, + "logps/rejected": -2.054938316345215, + "loss": 2.7153, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.783552169799805, + "rewards/margins": 0.7658289670944214, + "rewards/rejected": -20.549381256103516, + "step": 14100 + }, + { + "epoch": 0.4754120462435539, + "grad_norm": 23.795833587646484, + "learning_rate": 6.287180505407065e-07, + "logits/chosen": -1.2092092037200928, + "logits/rejected": -1.0659422874450684, + "logps/chosen": -1.832767128944397, + "logps/rejected": -1.7705835103988647, + "loss": 3.7276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.32767105102539, + "rewards/margins": -0.6218371391296387, + "rewards/rejected": -17.705833435058594, + "step": 14105 + }, + { + "epoch": 0.47558057231453704, + "grad_norm": 14.359508514404297, + "learning_rate": 6.284338074127407e-07, + "logits/chosen": -1.1778762340545654, + "logits/rejected": -1.3249976634979248, + "logps/chosen": -1.9273595809936523, + "logps/rejected": -2.1541523933410645, + "loss": 2.5223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.27359390258789, + "rewards/margins": 2.2679269313812256, + "rewards/rejected": -21.541522979736328, + "step": 14110 + }, + { + "epoch": 0.47574909838552026, + "grad_norm": 13.926602363586426, + "learning_rate": 6.281495198388944e-07, + "logits/chosen": -1.1781768798828125, + "logits/rejected": -1.0687205791473389, + "logps/chosen": -1.4076114892959595, + "logps/rejected": -1.5143462419509888, + "loss": 2.439, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -14.0761137008667, + "rewards/margins": 1.0673487186431885, + "rewards/rejected": -15.143463134765625, + "step": 14115 + }, + { + "epoch": 0.47591762445650343, + "grad_norm": 30.59756851196289, + "learning_rate": 6.278651879175481e-07, + "logits/chosen": -1.2486302852630615, + "logits/rejected": -1.612501859664917, + "logps/chosen": -2.2371091842651367, + "logps/rejected": -2.649040937423706, + "loss": 1.8951, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.371091842651367, + "rewards/margins": 4.119317054748535, + "rewards/rejected": -26.49041175842285, + "step": 14120 + }, + { + "epoch": 0.4760861505274866, + "grad_norm": 27.569778442382812, + "learning_rate": 6.275808117470979e-07, + "logits/chosen": -1.5844205617904663, + "logits/rejected": -1.7609293460845947, + "logps/chosen": -2.280905246734619, + "logps/rejected": -2.6194040775299072, + "loss": 2.4388, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.809053421020508, + "rewards/margins": 3.384986400604248, + "rewards/rejected": -26.194040298461914, + "step": 14125 + }, + { + "epoch": 0.47625467659846976, + "grad_norm": 31.418270111083984, + "learning_rate": 6.272963914259551e-07, + "logits/chosen": -1.1242899894714355, + "logits/rejected": -1.2510685920715332, + "logps/chosen": -2.14827036857605, + "logps/rejected": -2.277985095977783, + "loss": 3.4789, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.482702255249023, + "rewards/margins": 1.2971477508544922, + "rewards/rejected": -22.77985191345215, + "step": 14130 + }, + { + "epoch": 0.476423202669453, + "grad_norm": 51.32257080078125, + "learning_rate": 6.270119270525468e-07, + "logits/chosen": -0.9274293184280396, + "logits/rejected": -1.0708013772964478, + "logps/chosen": -2.1544365882873535, + "logps/rejected": -2.3122477531433105, + "loss": 2.7029, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.544368743896484, + "rewards/margins": 1.5781086683273315, + "rewards/rejected": -23.12247657775879, + "step": 14135 + }, + { + "epoch": 0.47659172874043615, + "grad_norm": 24.99764633178711, + "learning_rate": 6.267274187253144e-07, + "logits/chosen": -1.298872947692871, + "logits/rejected": -1.4449822902679443, + "logps/chosen": -2.611276388168335, + "logps/rejected": -2.9252381324768066, + "loss": 3.4442, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.11276626586914, + "rewards/margins": 3.1396145820617676, + "rewards/rejected": -29.25238037109375, + "step": 14140 + }, + { + "epoch": 0.4767602548114193, + "grad_norm": 47.8037109375, + "learning_rate": 6.264428665427153e-07, + "logits/chosen": -1.2006722688674927, + "logits/rejected": -1.2854722738265991, + "logps/chosen": -1.9553956985473633, + "logps/rejected": -2.218827724456787, + "loss": 2.0508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.553955078125, + "rewards/margins": 2.63432240486145, + "rewards/rejected": -22.188278198242188, + "step": 14145 + }, + { + "epoch": 0.4769287808824025, + "grad_norm": 27.07879066467285, + "learning_rate": 6.261582706032218e-07, + "logits/chosen": -1.2765188217163086, + "logits/rejected": -1.3474172353744507, + "logps/chosen": -1.7571680545806885, + "logps/rejected": -1.7844899892807007, + "loss": 3.4131, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.571680068969727, + "rewards/margins": 0.27321872115135193, + "rewards/rejected": -17.844898223876953, + "step": 14150 + }, + { + "epoch": 0.4770973069533857, + "grad_norm": 10.174779891967773, + "learning_rate": 6.258736310053212e-07, + "logits/chosen": -1.4351608753204346, + "logits/rejected": -1.4350874423980713, + "logps/chosen": -2.893874406814575, + "logps/rejected": -3.1628687381744385, + "loss": 2.306, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.93874168395996, + "rewards/margins": 2.6899428367614746, + "rewards/rejected": -31.628686904907227, + "step": 14155 + }, + { + "epoch": 0.47726583302436887, + "grad_norm": 14.811348915100098, + "learning_rate": 6.255889478475161e-07, + "logits/chosen": -1.382568597793579, + "logits/rejected": -1.4855680465698242, + "logps/chosen": -1.9862836599349976, + "logps/rejected": -2.301663637161255, + "loss": 1.8944, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.862834930419922, + "rewards/margins": 3.153799533843994, + "rewards/rejected": -23.01663589477539, + "step": 14160 + }, + { + "epoch": 0.47743435909535203, + "grad_norm": 23.847272872924805, + "learning_rate": 6.253042212283241e-07, + "logits/chosen": -1.6445223093032837, + "logits/rejected": -1.3189189434051514, + "logps/chosen": -1.8566372394561768, + "logps/rejected": -1.814192771911621, + "loss": 3.7629, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.566370010375977, + "rewards/margins": -0.42444291710853577, + "rewards/rejected": -18.14192771911621, + "step": 14165 + }, + { + "epoch": 0.47760288516633526, + "grad_norm": 39.18086624145508, + "learning_rate": 6.250194512462782e-07, + "logits/chosen": -1.2912828922271729, + "logits/rejected": -1.3584439754486084, + "logps/chosen": -1.8556301593780518, + "logps/rejected": -1.8538516759872437, + "loss": 3.1592, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.55630111694336, + "rewards/margins": -0.01778392866253853, + "rewards/rejected": -18.538516998291016, + "step": 14170 + }, + { + "epoch": 0.4777714112373184, + "grad_norm": 25.342227935791016, + "learning_rate": 6.247346379999257e-07, + "logits/chosen": -1.5513664484024048, + "logits/rejected": -1.6413652896881104, + "logps/chosen": -2.6058261394500732, + "logps/rejected": -2.8640480041503906, + "loss": 2.4241, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.058263778686523, + "rewards/margins": 2.582216262817383, + "rewards/rejected": -28.640478134155273, + "step": 14175 + }, + { + "epoch": 0.4779399373083016, + "grad_norm": 10.93128490447998, + "learning_rate": 6.244497815878292e-07, + "logits/chosen": -0.7427780032157898, + "logits/rejected": -1.1368509531021118, + "logps/chosen": -2.306380033493042, + "logps/rejected": -3.289423704147339, + "loss": 1.7394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.063800811767578, + "rewards/margins": 9.830431938171387, + "rewards/rejected": -32.89423751831055, + "step": 14180 + }, + { + "epoch": 0.47810846337928475, + "grad_norm": 27.874006271362305, + "learning_rate": 6.241648821085665e-07, + "logits/chosen": -0.9375996589660645, + "logits/rejected": -1.060762643814087, + "logps/chosen": -2.2712631225585938, + "logps/rejected": -2.4879183769226074, + "loss": 1.9797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.712631225585938, + "rewards/margins": 2.1665539741516113, + "rewards/rejected": -24.879186630249023, + "step": 14185 + }, + { + "epoch": 0.478276989450268, + "grad_norm": 35.346622467041016, + "learning_rate": 6.238799396607299e-07, + "logits/chosen": -1.0190104246139526, + "logits/rejected": -1.0845071077346802, + "logps/chosen": -2.458282709121704, + "logps/rejected": -2.4083683490753174, + "loss": 3.8692, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.58282470703125, + "rewards/margins": -0.4991399645805359, + "rewards/rejected": -24.08368492126465, + "step": 14190 + }, + { + "epoch": 0.47844551552125114, + "grad_norm": 30.713102340698242, + "learning_rate": 6.235949543429271e-07, + "logits/chosen": -1.664607286453247, + "logits/rejected": -1.994667649269104, + "logps/chosen": -2.1017775535583496, + "logps/rejected": -2.7052159309387207, + "loss": 3.0572, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.01777458190918, + "rewards/margins": 6.0343828201293945, + "rewards/rejected": -27.052160263061523, + "step": 14195 + }, + { + "epoch": 0.4786140415922343, + "grad_norm": 16.882923126220703, + "learning_rate": 6.233099262537798e-07, + "logits/chosen": -1.3837839365005493, + "logits/rejected": -1.538823127746582, + "logps/chosen": -2.1491425037384033, + "logps/rejected": -2.6276285648345947, + "loss": 2.3587, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.491424560546875, + "rewards/margins": 4.784861087799072, + "rewards/rejected": -26.27628517150879, + "step": 14200 + }, + { + "epoch": 0.4787825676632175, + "grad_norm": 15.137643814086914, + "learning_rate": 6.230248554919254e-07, + "logits/chosen": -1.432544469833374, + "logits/rejected": -1.466481328010559, + "logps/chosen": -2.5068650245666504, + "logps/rejected": -2.5227551460266113, + "loss": 3.0185, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.068649291992188, + "rewards/margins": 0.1589018851518631, + "rewards/rejected": -25.227550506591797, + "step": 14205 + }, + { + "epoch": 0.4789510937342007, + "grad_norm": 21.842287063598633, + "learning_rate": 6.227397421560156e-07, + "logits/chosen": -1.2101644277572632, + "logits/rejected": -1.2983782291412354, + "logps/chosen": -2.416313648223877, + "logps/rejected": -2.7157235145568848, + "loss": 2.7991, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.163137435913086, + "rewards/margins": 2.9940972328186035, + "rewards/rejected": -27.1572322845459, + "step": 14210 + }, + { + "epoch": 0.47911961980518386, + "grad_norm": 25.819618225097656, + "learning_rate": 6.224545863447164e-07, + "logits/chosen": -1.0523632764816284, + "logits/rejected": -1.081443190574646, + "logps/chosen": -1.8073928356170654, + "logps/rejected": -1.915116548538208, + "loss": 2.2054, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.073930740356445, + "rewards/margins": 1.0772359371185303, + "rewards/rejected": -19.151165008544922, + "step": 14215 + }, + { + "epoch": 0.479288145876167, + "grad_norm": 33.45602035522461, + "learning_rate": 6.221693881567097e-07, + "logits/chosen": -1.0425139665603638, + "logits/rejected": -1.362992286682129, + "logps/chosen": -1.824812650680542, + "logps/rejected": -2.031705141067505, + "loss": 3.2246, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.248126983642578, + "rewards/margins": 2.0689239501953125, + "rewards/rejected": -20.31705093383789, + "step": 14220 + }, + { + "epoch": 0.47945667194715025, + "grad_norm": 25.50204086303711, + "learning_rate": 6.21884147690691e-07, + "logits/chosen": -1.3271222114562988, + "logits/rejected": -1.2059987783432007, + "logps/chosen": -1.9234594106674194, + "logps/rejected": -2.1173043251037598, + "loss": 1.9135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.234594345092773, + "rewards/margins": 1.9384514093399048, + "rewards/rejected": -21.173046112060547, + "step": 14225 + }, + { + "epoch": 0.4796251980181334, + "grad_norm": 0.04183439910411835, + "learning_rate": 6.215988650453707e-07, + "logits/chosen": -1.230825424194336, + "logits/rejected": -1.699907660484314, + "logps/chosen": -2.3121089935302734, + "logps/rejected": -2.669900894165039, + "loss": 2.2012, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.121089935302734, + "rewards/margins": 3.577920436859131, + "rewards/rejected": -26.699010848999023, + "step": 14230 + }, + { + "epoch": 0.4797937240891166, + "grad_norm": 103.14031982421875, + "learning_rate": 6.21313540319474e-07, + "logits/chosen": -1.5439348220825195, + "logits/rejected": -2.149160385131836, + "logps/chosen": -2.681786298751831, + "logps/rejected": -3.1404526233673096, + "loss": 3.5701, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.817861557006836, + "rewards/margins": 4.586663246154785, + "rewards/rejected": -31.404525756835938, + "step": 14235 + }, + { + "epoch": 0.47996225016009975, + "grad_norm": 24.254440307617188, + "learning_rate": 6.210281736117407e-07, + "logits/chosen": -0.8741234540939331, + "logits/rejected": -0.8805392384529114, + "logps/chosen": -2.045274019241333, + "logps/rejected": -1.999871850013733, + "loss": 3.8074, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.452739715576172, + "rewards/margins": -0.4540228843688965, + "rewards/rejected": -19.99871826171875, + "step": 14240 + }, + { + "epoch": 0.48013077623108297, + "grad_norm": 37.69783401489258, + "learning_rate": 6.20742765020925e-07, + "logits/chosen": -1.3421859741210938, + "logits/rejected": -1.4220188856124878, + "logps/chosen": -1.9036476612091064, + "logps/rejected": -1.9306730031967163, + "loss": 3.0629, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.036474227905273, + "rewards/margins": 0.2702566981315613, + "rewards/rejected": -19.306730270385742, + "step": 14245 + }, + { + "epoch": 0.48029930230206613, + "grad_norm": 20.065317153930664, + "learning_rate": 6.20457314645795e-07, + "logits/chosen": -1.2782478332519531, + "logits/rejected": -1.2565407752990723, + "logps/chosen": -2.7419238090515137, + "logps/rejected": -2.7357048988342285, + "loss": 5.0031, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.419235229492188, + "rewards/margins": -0.062186289578676224, + "rewards/rejected": -27.3570499420166, + "step": 14250 + }, + { + "epoch": 0.4804678283730493, + "grad_norm": 48.01775360107422, + "learning_rate": 6.201718225851345e-07, + "logits/chosen": -1.6276464462280273, + "logits/rejected": -1.634319543838501, + "logps/chosen": -3.138836145401001, + "logps/rejected": -3.149662494659424, + "loss": 5.2109, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.38836097717285, + "rewards/margins": 0.10826186835765839, + "rewards/rejected": -31.496623992919922, + "step": 14255 + }, + { + "epoch": 0.48063635444403247, + "grad_norm": 61.33388137817383, + "learning_rate": 6.198862889377407e-07, + "logits/chosen": -0.9420528411865234, + "logits/rejected": -1.111307144165039, + "logps/chosen": -2.386169910430908, + "logps/rejected": -2.489887237548828, + "loss": 2.4275, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.861698150634766, + "rewards/margins": 1.0371739864349365, + "rewards/rejected": -24.89887237548828, + "step": 14260 + }, + { + "epoch": 0.4808048805150157, + "grad_norm": 16.47574806213379, + "learning_rate": 6.196007138024257e-07, + "logits/chosen": -1.4349069595336914, + "logits/rejected": -1.9009329080581665, + "logps/chosen": -2.3457863330841064, + "logps/rejected": -2.69038462638855, + "loss": 2.016, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.457860946655273, + "rewards/margins": 3.4459869861602783, + "rewards/rejected": -26.90384864807129, + "step": 14265 + }, + { + "epoch": 0.48097340658599885, + "grad_norm": 29.380556106567383, + "learning_rate": 6.193150972780156e-07, + "logits/chosen": -0.8126411437988281, + "logits/rejected": -1.0067518949508667, + "logps/chosen": -2.386798620223999, + "logps/rejected": -2.546583890914917, + "loss": 2.976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.867984771728516, + "rewards/margins": 1.5978561639785767, + "rewards/rejected": -25.465839385986328, + "step": 14270 + }, + { + "epoch": 0.481141932656982, + "grad_norm": 22.0316162109375, + "learning_rate": 6.190294394633513e-07, + "logits/chosen": -1.327775001525879, + "logits/rejected": -1.6017240285873413, + "logps/chosen": -2.261784315109253, + "logps/rejected": -2.3681483268737793, + "loss": 3.1997, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.617843627929688, + "rewards/margins": 1.0636417865753174, + "rewards/rejected": -23.68148422241211, + "step": 14275 + }, + { + "epoch": 0.48131045872796524, + "grad_norm": 16.934232711791992, + "learning_rate": 6.187437404572875e-07, + "logits/chosen": -1.2591432332992554, + "logits/rejected": -1.6536871194839478, + "logps/chosen": -1.532456636428833, + "logps/rejected": -1.7124006748199463, + "loss": 2.4245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.324564933776855, + "rewards/margins": 1.7994403839111328, + "rewards/rejected": -17.124004364013672, + "step": 14280 + }, + { + "epoch": 0.4814789847989484, + "grad_norm": 25.954748153686523, + "learning_rate": 6.184580003586934e-07, + "logits/chosen": -0.8407789468765259, + "logits/rejected": -0.9403706789016724, + "logps/chosen": -2.3137755393981934, + "logps/rejected": -2.3520450592041016, + "loss": 2.9743, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.137752532958984, + "rewards/margins": 0.3826959729194641, + "rewards/rejected": -23.520448684692383, + "step": 14285 + }, + { + "epoch": 0.48164751086993157, + "grad_norm": 33.69615936279297, + "learning_rate": 6.181722192664525e-07, + "logits/chosen": -1.4487159252166748, + "logits/rejected": -1.4866324663162231, + "logps/chosen": -2.4114739894866943, + "logps/rejected": -2.2642366886138916, + "loss": 4.5777, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.11473846435547, + "rewards/margins": -1.4723711013793945, + "rewards/rejected": -22.64236831665039, + "step": 14290 + }, + { + "epoch": 0.48181603694091474, + "grad_norm": 14.018428802490234, + "learning_rate": 6.178863972794623e-07, + "logits/chosen": -1.5550386905670166, + "logits/rejected": -1.626044511795044, + "logps/chosen": -2.4448163509368896, + "logps/rejected": -2.5209197998046875, + "loss": 3.7098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.448165893554688, + "rewards/margins": 0.7610336542129517, + "rewards/rejected": -25.209197998046875, + "step": 14295 + }, + { + "epoch": 0.48198456301189796, + "grad_norm": 23.618711471557617, + "learning_rate": 6.176005344966344e-07, + "logits/chosen": -1.7064844369888306, + "logits/rejected": -1.9256782531738281, + "logps/chosen": -2.080040454864502, + "logps/rejected": -2.499323606491089, + "loss": 2.6969, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.800403594970703, + "rewards/margins": 4.1928300857543945, + "rewards/rejected": -24.993236541748047, + "step": 14300 + }, + { + "epoch": 0.4821530890828811, + "grad_norm": 67.77555847167969, + "learning_rate": 6.17314631016895e-07, + "logits/chosen": -1.3775417804718018, + "logits/rejected": -1.1704440116882324, + "logps/chosen": -2.4363372325897217, + "logps/rejected": -2.379361629486084, + "loss": 3.9586, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.363372802734375, + "rewards/margins": -0.5697550773620605, + "rewards/rejected": -23.79361915588379, + "step": 14305 + }, + { + "epoch": 0.4823216151538643, + "grad_norm": 25.39704704284668, + "learning_rate": 6.170286869391836e-07, + "logits/chosen": -1.0143911838531494, + "logits/rejected": -1.1997044086456299, + "logps/chosen": -1.80712890625, + "logps/rejected": -1.854230523109436, + "loss": 2.8305, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.0712890625, + "rewards/margins": 0.47101593017578125, + "rewards/rejected": -18.542306900024414, + "step": 14310 + }, + { + "epoch": 0.48249014122484746, + "grad_norm": 21.75940704345703, + "learning_rate": 6.167427023624547e-07, + "logits/chosen": -1.05575692653656, + "logits/rejected": -1.2112061977386475, + "logps/chosen": -1.9335291385650635, + "logps/rejected": -1.9409252405166626, + "loss": 3.2197, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.335290908813477, + "rewards/margins": 0.07396335899829865, + "rewards/rejected": -19.409252166748047, + "step": 14315 + }, + { + "epoch": 0.4826586672958307, + "grad_norm": 22.265239715576172, + "learning_rate": 6.164566773856757e-07, + "logits/chosen": -1.438683271408081, + "logits/rejected": -1.3932020664215088, + "logps/chosen": -1.8244634866714478, + "logps/rejected": -2.039867877960205, + "loss": 1.8846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.244632720947266, + "rewards/margins": 2.1540427207946777, + "rewards/rejected": -20.398677825927734, + "step": 14320 + }, + { + "epoch": 0.48282719336681384, + "grad_norm": 29.78438949584961, + "learning_rate": 6.16170612107829e-07, + "logits/chosen": -1.921547532081604, + "logits/rejected": -1.909881591796875, + "logps/chosen": -1.7852840423583984, + "logps/rejected": -2.009737014770508, + "loss": 2.2081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.852840423583984, + "rewards/margins": 2.2445316314697266, + "rewards/rejected": -20.097370147705078, + "step": 14325 + }, + { + "epoch": 0.482995719437797, + "grad_norm": 17.730634689331055, + "learning_rate": 6.158845066279103e-07, + "logits/chosen": -1.072278618812561, + "logits/rejected": -1.5491522550582886, + "logps/chosen": -2.598126173019409, + "logps/rejected": -2.719301700592041, + "loss": 3.4963, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.98126220703125, + "rewards/margins": 1.2117526531219482, + "rewards/rejected": -27.19301414489746, + "step": 14330 + }, + { + "epoch": 0.48316424550878023, + "grad_norm": 29.073835372924805, + "learning_rate": 6.155983610449298e-07, + "logits/chosen": -1.2550567388534546, + "logits/rejected": -1.3643629550933838, + "logps/chosen": -1.7392864227294922, + "logps/rejected": -1.9145896434783936, + "loss": 2.6637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.392864227294922, + "rewards/margins": 1.7530326843261719, + "rewards/rejected": -19.145896911621094, + "step": 14335 + }, + { + "epoch": 0.4833327715797634, + "grad_norm": 33.49995803833008, + "learning_rate": 6.153121754579107e-07, + "logits/chosen": -1.3938661813735962, + "logits/rejected": -1.3946878910064697, + "logps/chosen": -2.068878650665283, + "logps/rejected": -2.1564369201660156, + "loss": 2.9194, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.68878746032715, + "rewards/margins": 0.8755823373794556, + "rewards/rejected": -21.564369201660156, + "step": 14340 + }, + { + "epoch": 0.48350129765074656, + "grad_norm": 25.166501998901367, + "learning_rate": 6.150259499658909e-07, + "logits/chosen": -1.1227186918258667, + "logits/rejected": -1.2852933406829834, + "logps/chosen": -2.0173442363739014, + "logps/rejected": -2.4354665279388428, + "loss": 2.1515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.173442840576172, + "rewards/margins": 4.181223392486572, + "rewards/rejected": -24.354665756225586, + "step": 14345 + }, + { + "epoch": 0.48366982372172973, + "grad_norm": 26.04254722595215, + "learning_rate": 6.147396846679216e-07, + "logits/chosen": -1.2595160007476807, + "logits/rejected": -1.3415896892547607, + "logps/chosen": -2.605316638946533, + "logps/rejected": -2.745382785797119, + "loss": 3.355, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.05316734313965, + "rewards/margins": 1.400661587715149, + "rewards/rejected": -27.453826904296875, + "step": 14350 + }, + { + "epoch": 0.48383834979271295, + "grad_norm": 30.954938888549805, + "learning_rate": 6.144533796630678e-07, + "logits/chosen": -1.0422312021255493, + "logits/rejected": -1.0594663619995117, + "logps/chosen": -1.8631160259246826, + "logps/rejected": -1.7447509765625, + "loss": 4.2904, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.631160736083984, + "rewards/margins": -1.1836521625518799, + "rewards/rejected": -17.447509765625, + "step": 14355 + }, + { + "epoch": 0.4840068758636961, + "grad_norm": 29.922924041748047, + "learning_rate": 6.141670350504089e-07, + "logits/chosen": -1.3065602779388428, + "logits/rejected": -1.418766736984253, + "logps/chosen": -1.9556224346160889, + "logps/rejected": -2.0475106239318848, + "loss": 3.0816, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.556224822998047, + "rewards/margins": 0.9188838005065918, + "rewards/rejected": -20.475109100341797, + "step": 14360 + }, + { + "epoch": 0.4841754019346793, + "grad_norm": 34.842384338378906, + "learning_rate": 6.13880650929037e-07, + "logits/chosen": -1.3274986743927002, + "logits/rejected": -1.19040048122406, + "logps/chosen": -1.783129334449768, + "logps/rejected": -1.8191875219345093, + "loss": 2.8422, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.8312931060791, + "rewards/margins": 0.3605828285217285, + "rewards/rejected": -18.191875457763672, + "step": 14365 + }, + { + "epoch": 0.48434392800566245, + "grad_norm": 19.06740379333496, + "learning_rate": 6.135942273980586e-07, + "logits/chosen": -1.246483325958252, + "logits/rejected": -1.3938615322113037, + "logps/chosen": -2.066582202911377, + "logps/rejected": -2.1912307739257812, + "loss": 2.4974, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.665822982788086, + "rewards/margins": 1.2464841604232788, + "rewards/rejected": -21.912309646606445, + "step": 14370 + }, + { + "epoch": 0.48451245407664567, + "grad_norm": 15.381619453430176, + "learning_rate": 6.133077645565935e-07, + "logits/chosen": -1.049902319908142, + "logits/rejected": -1.299940824508667, + "logps/chosen": -1.895345687866211, + "logps/rejected": -2.2535789012908936, + "loss": 1.1043, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.95345687866211, + "rewards/margins": 3.5823326110839844, + "rewards/rejected": -22.53578758239746, + "step": 14375 + }, + { + "epoch": 0.48468098014762884, + "grad_norm": 15.546119689941406, + "learning_rate": 6.130212625037752e-07, + "logits/chosen": -1.4986063241958618, + "logits/rejected": -1.5750732421875, + "logps/chosen": -2.206378936767578, + "logps/rejected": -2.759787082672119, + "loss": 1.7383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.063793182373047, + "rewards/margins": 5.534076690673828, + "rewards/rejected": -27.59786605834961, + "step": 14380 + }, + { + "epoch": 0.484849506218612, + "grad_norm": 27.19158172607422, + "learning_rate": 6.12734721338751e-07, + "logits/chosen": -1.4182652235031128, + "logits/rejected": -1.5037392377853394, + "logps/chosen": -1.734135389328003, + "logps/rejected": -1.8829662799835205, + "loss": 1.9696, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.341354370117188, + "rewards/margins": 1.4883079528808594, + "rewards/rejected": -18.829662322998047, + "step": 14385 + }, + { + "epoch": 0.4850180322895952, + "grad_norm": 30.321714401245117, + "learning_rate": 6.12448141160681e-07, + "logits/chosen": -1.1180613040924072, + "logits/rejected": -1.4290263652801514, + "logps/chosen": -2.056044101715088, + "logps/rejected": -2.676231622695923, + "loss": 1.2935, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.560440063476562, + "rewards/margins": 6.20187520980835, + "rewards/rejected": -26.762313842773438, + "step": 14390 + }, + { + "epoch": 0.4851865583605784, + "grad_norm": 159.37966918945312, + "learning_rate": 6.121615220687398e-07, + "logits/chosen": -1.4194531440734863, + "logits/rejected": -1.2666727304458618, + "logps/chosen": -2.57710337638855, + "logps/rejected": -2.4502930641174316, + "loss": 4.6584, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.77103614807129, + "rewards/margins": -1.2681033611297607, + "rewards/rejected": -24.5029296875, + "step": 14395 + }, + { + "epoch": 0.48535508443156156, + "grad_norm": 42.41606140136719, + "learning_rate": 6.118748641621148e-07, + "logits/chosen": -1.2904958724975586, + "logits/rejected": -1.2220988273620605, + "logps/chosen": -2.0724587440490723, + "logps/rejected": -2.123455047607422, + "loss": 2.9377, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.72458839416504, + "rewards/margins": 0.5099626779556274, + "rewards/rejected": -21.234548568725586, + "step": 14400 + }, + { + "epoch": 0.48535508443156156, + "eval_logits/chosen": -1.7178480625152588, + "eval_logits/rejected": -1.8386316299438477, + "eval_logps/chosen": -2.0116519927978516, + "eval_logps/rejected": -2.1217188835144043, + "eval_loss": 2.994631290435791, + "eval_rewards/accuracies": 0.6399999856948853, + "eval_rewards/chosen": -20.11652183532715, + "eval_rewards/margins": 1.1006675958633423, + "eval_rewards/rejected": -21.217187881469727, + "eval_runtime": 12.8896, + "eval_samples_per_second": 7.758, + "eval_steps_per_second": 1.94, + "step": 14400 + }, + { + "epoch": 0.4855236105025447, + "grad_norm": 15.567980766296387, + "learning_rate": 6.11588167540007e-07, + "logits/chosen": -1.2109471559524536, + "logits/rejected": -1.536413550376892, + "logps/chosen": -1.9555637836456299, + "logps/rejected": -2.1613593101501465, + "loss": 2.1833, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.55563735961914, + "rewards/margins": 2.0579581260681152, + "rewards/rejected": -21.613595962524414, + "step": 14405 + }, + { + "epoch": 0.48569213657352794, + "grad_norm": 11.709920883178711, + "learning_rate": 6.113014323016307e-07, + "logits/chosen": -1.006519079208374, + "logits/rejected": -1.320786476135254, + "logps/chosen": -1.6655772924423218, + "logps/rejected": -1.8181434869766235, + "loss": 2.2558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.655773162841797, + "rewards/margins": 1.5256626605987549, + "rewards/rejected": -18.181434631347656, + "step": 14410 + }, + { + "epoch": 0.4858606626445111, + "grad_norm": 28.14756202697754, + "learning_rate": 6.11014658546214e-07, + "logits/chosen": -1.0801985263824463, + "logits/rejected": -1.355345368385315, + "logps/chosen": -1.7694141864776611, + "logps/rejected": -2.0815796852111816, + "loss": 1.9312, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.694141387939453, + "rewards/margins": 3.1216535568237305, + "rewards/rejected": -20.815793991088867, + "step": 14415 + }, + { + "epoch": 0.4860291887154943, + "grad_norm": 57.387203216552734, + "learning_rate": 6.107278463729977e-07, + "logits/chosen": -1.1415436267852783, + "logits/rejected": -1.7277675867080688, + "logps/chosen": -2.552786350250244, + "logps/rejected": -2.63765287399292, + "loss": 3.79, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.52786636352539, + "rewards/margins": 0.8486614227294922, + "rewards/rejected": -26.376529693603516, + "step": 14420 + }, + { + "epoch": 0.48619771478647744, + "grad_norm": 38.32411193847656, + "learning_rate": 6.104409958812362e-07, + "logits/chosen": -1.50121009349823, + "logits/rejected": -1.1826785802841187, + "logps/chosen": -1.7358640432357788, + "logps/rejected": -1.7259807586669922, + "loss": 3.3679, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.358638763427734, + "rewards/margins": -0.09883232414722443, + "rewards/rejected": -17.259807586669922, + "step": 14425 + }, + { + "epoch": 0.48636624085746066, + "grad_norm": 16.106178283691406, + "learning_rate": 6.101541071701974e-07, + "logits/chosen": -1.0993359088897705, + "logits/rejected": -1.3681131601333618, + "logps/chosen": -1.9167248010635376, + "logps/rejected": -2.063552141189575, + "loss": 2.2598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.16724967956543, + "rewards/margins": 1.468271255493164, + "rewards/rejected": -20.63551902770996, + "step": 14430 + }, + { + "epoch": 0.48653476692844383, + "grad_norm": 33.390281677246094, + "learning_rate": 6.098671803391618e-07, + "logits/chosen": -1.3295332193374634, + "logits/rejected": -1.732404351234436, + "logps/chosen": -2.3224639892578125, + "logps/rejected": -2.6696527004241943, + "loss": 2.3211, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.22464370727539, + "rewards/margins": 3.4718856811523438, + "rewards/rejected": -26.6965274810791, + "step": 14435 + }, + { + "epoch": 0.486703292999427, + "grad_norm": 38.27173614501953, + "learning_rate": 6.095802154874238e-07, + "logits/chosen": -0.702488124370575, + "logits/rejected": -0.9602154493331909, + "logps/chosen": -3.133786678314209, + "logps/rejected": -2.500781297683716, + "loss": 9.6033, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.337865829467773, + "rewards/margins": -6.33005428314209, + "rewards/rejected": -25.007810592651367, + "step": 14440 + }, + { + "epoch": 0.4868718190704102, + "grad_norm": 30.617448806762695, + "learning_rate": 6.092932127142904e-07, + "logits/chosen": -1.3489675521850586, + "logits/rejected": -1.422295331954956, + "logps/chosen": -1.9906165599822998, + "logps/rejected": -2.0119526386260986, + "loss": 3.6732, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.90616798400879, + "rewards/margins": 0.21335992217063904, + "rewards/rejected": -20.119525909423828, + "step": 14445 + }, + { + "epoch": 0.4870403451413934, + "grad_norm": 44.26568603515625, + "learning_rate": 6.09006172119082e-07, + "logits/chosen": -1.2701125144958496, + "logits/rejected": -1.8389867544174194, + "logps/chosen": -2.1882669925689697, + "logps/rejected": -2.1582720279693604, + "loss": 3.402, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.882671356201172, + "rewards/margins": -0.2999493479728699, + "rewards/rejected": -21.582721710205078, + "step": 14450 + }, + { + "epoch": 0.48720887121237655, + "grad_norm": 34.02650451660156, + "learning_rate": 6.087190938011322e-07, + "logits/chosen": -1.2723720073699951, + "logits/rejected": -1.2750638723373413, + "logps/chosen": -2.176562786102295, + "logps/rejected": -2.5528616905212402, + "loss": 2.337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.765628814697266, + "rewards/margins": 3.762988328933716, + "rewards/rejected": -25.52861785888672, + "step": 14455 + }, + { + "epoch": 0.4873773972833597, + "grad_norm": 40.86668014526367, + "learning_rate": 6.084319778597875e-07, + "logits/chosen": -0.954400897026062, + "logits/rejected": -1.1919571161270142, + "logps/chosen": -2.130120277404785, + "logps/rejected": -2.39589262008667, + "loss": 1.5986, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.30120277404785, + "rewards/margins": 2.6577227115631104, + "rewards/rejected": -23.958925247192383, + "step": 14460 + }, + { + "epoch": 0.48754592335434294, + "grad_norm": 76.33245086669922, + "learning_rate": 6.081448243944073e-07, + "logits/chosen": -1.9409711360931396, + "logits/rejected": -1.8011581897735596, + "logps/chosen": -2.597461700439453, + "logps/rejected": -2.948514223098755, + "loss": 1.7805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.9746150970459, + "rewards/margins": 3.510526180267334, + "rewards/rejected": -29.485143661499023, + "step": 14465 + }, + { + "epoch": 0.4877144494253261, + "grad_norm": 27.40862464904785, + "learning_rate": 6.07857633504364e-07, + "logits/chosen": -0.9310008883476257, + "logits/rejected": -0.8623281717300415, + "logps/chosen": -2.396544933319092, + "logps/rejected": -2.29371976852417, + "loss": 4.1105, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.9654483795166, + "rewards/margins": -1.0282517671585083, + "rewards/rejected": -22.937198638916016, + "step": 14470 + }, + { + "epoch": 0.48788297549630927, + "grad_norm": 17.24898910522461, + "learning_rate": 6.075704052890432e-07, + "logits/chosen": -1.9165115356445312, + "logits/rejected": -2.039600372314453, + "logps/chosen": -2.1480746269226074, + "logps/rejected": -2.7809290885925293, + "loss": 2.4188, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.48074722290039, + "rewards/margins": 6.328543663024902, + "rewards/rejected": -27.809289932250977, + "step": 14475 + }, + { + "epoch": 0.48805150156729243, + "grad_norm": 48.96101760864258, + "learning_rate": 6.072831398478433e-07, + "logits/chosen": -1.634894609451294, + "logits/rejected": -1.6797151565551758, + "logps/chosen": -2.297348976135254, + "logps/rejected": -2.287741184234619, + "loss": 3.8043, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.97348976135254, + "rewards/margins": -0.0960756316781044, + "rewards/rejected": -22.87741470336914, + "step": 14480 + }, + { + "epoch": 0.48822002763827566, + "grad_norm": 9.349481582641602, + "learning_rate": 6.069958372801753e-07, + "logits/chosen": -1.4112704992294312, + "logits/rejected": -1.444657564163208, + "logps/chosen": -2.316025495529175, + "logps/rejected": -2.368542432785034, + "loss": 2.9264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.160253524780273, + "rewards/margins": 0.5251716375350952, + "rewards/rejected": -23.6854248046875, + "step": 14485 + }, + { + "epoch": 0.4883885537092588, + "grad_norm": 17.727920532226562, + "learning_rate": 6.067084976854637e-07, + "logits/chosen": -1.3877828121185303, + "logits/rejected": -1.4814679622650146, + "logps/chosen": -2.2088212966918945, + "logps/rejected": -2.6402947902679443, + "loss": 3.2358, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.088214874267578, + "rewards/margins": 4.314736366271973, + "rewards/rejected": -26.4029483795166, + "step": 14490 + }, + { + "epoch": 0.488557079780242, + "grad_norm": 42.89591598510742, + "learning_rate": 6.064211211631451e-07, + "logits/chosen": -1.392857313156128, + "logits/rejected": -1.4012411832809448, + "logps/chosen": -2.6497719287872314, + "logps/rejected": -2.85886287689209, + "loss": 2.3042, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.497716903686523, + "rewards/margins": 2.090907335281372, + "rewards/rejected": -28.588626861572266, + "step": 14495 + }, + { + "epoch": 0.4887256058512252, + "grad_norm": 45.4229850769043, + "learning_rate": 6.061337078126693e-07, + "logits/chosen": -1.9174840450286865, + "logits/rejected": -1.8754726648330688, + "logps/chosen": -2.0455451011657715, + "logps/rejected": -2.170490264892578, + "loss": 3.5905, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.45545196533203, + "rewards/margins": 1.2494512796401978, + "rewards/rejected": -21.70490264892578, + "step": 14500 + }, + { + "epoch": 0.4888941319222084, + "grad_norm": 44.48363494873047, + "learning_rate": 6.058462577334987e-07, + "logits/chosen": -1.233435034751892, + "logits/rejected": -1.3299095630645752, + "logps/chosen": -1.6596050262451172, + "logps/rejected": -1.8056104183197021, + "loss": 2.2678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.596050262451172, + "rewards/margins": 1.4600555896759033, + "rewards/rejected": -18.05610466003418, + "step": 14505 + }, + { + "epoch": 0.48906265799319154, + "grad_norm": 36.73933029174805, + "learning_rate": 6.055587710251086e-07, + "logits/chosen": -1.4670766592025757, + "logits/rejected": -1.645054817199707, + "logps/chosen": -2.200580358505249, + "logps/rejected": -2.275089979171753, + "loss": 2.583, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.00580406188965, + "rewards/margins": 0.745094895362854, + "rewards/rejected": -22.750900268554688, + "step": 14510 + }, + { + "epoch": 0.4892311840641747, + "grad_norm": 14.582422256469727, + "learning_rate": 6.052712477869866e-07, + "logits/chosen": -1.8081855773925781, + "logits/rejected": -1.8254966735839844, + "logps/chosen": -1.8928664922714233, + "logps/rejected": -2.5116868019104004, + "loss": 1.3671, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.928665161132812, + "rewards/margins": 6.188204288482666, + "rewards/rejected": -25.116870880126953, + "step": 14515 + }, + { + "epoch": 0.48939971013515793, + "grad_norm": 24.57520866394043, + "learning_rate": 6.049836881186334e-07, + "logits/chosen": -1.295364499092102, + "logits/rejected": -1.5239925384521484, + "logps/chosen": -2.708310842514038, + "logps/rejected": -3.2306361198425293, + "loss": 2.9936, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.083110809326172, + "rewards/margins": 5.223249912261963, + "rewards/rejected": -32.306358337402344, + "step": 14520 + }, + { + "epoch": 0.4895682362061411, + "grad_norm": 4.418197154998779, + "learning_rate": 6.046960921195616e-07, + "logits/chosen": -1.1067047119140625, + "logits/rejected": -1.0724799633026123, + "logps/chosen": -2.7758538722991943, + "logps/rejected": -2.9014029502868652, + "loss": 2.8487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.7585391998291, + "rewards/margins": 1.2554924488067627, + "rewards/rejected": -29.0140323638916, + "step": 14525 + }, + { + "epoch": 0.48973676227712426, + "grad_norm": 82.43099975585938, + "learning_rate": 6.044084598892973e-07, + "logits/chosen": -1.6011970043182373, + "logits/rejected": -1.5154972076416016, + "logps/chosen": -2.1913952827453613, + "logps/rejected": -2.110191822052002, + "loss": 3.9255, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.913951873779297, + "rewards/margins": -0.812035083770752, + "rewards/rejected": -21.101917266845703, + "step": 14530 + }, + { + "epoch": 0.4899052883481074, + "grad_norm": 32.03125, + "learning_rate": 6.041207915273787e-07, + "logits/chosen": -1.1000322103500366, + "logits/rejected": -0.9549843072891235, + "logps/chosen": -2.2531070709228516, + "logps/rejected": -2.343667507171631, + "loss": 2.4727, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.531070709228516, + "rewards/margins": 0.9056074023246765, + "rewards/rejected": -23.436676025390625, + "step": 14535 + }, + { + "epoch": 0.49007381441909065, + "grad_norm": 48.95552062988281, + "learning_rate": 6.038330871333563e-07, + "logits/chosen": -1.446215033531189, + "logits/rejected": -1.434485912322998, + "logps/chosen": -1.998910665512085, + "logps/rejected": -1.985263466835022, + "loss": 3.2234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.98910903930664, + "rewards/margins": -0.13647422194480896, + "rewards/rejected": -19.852632522583008, + "step": 14540 + }, + { + "epoch": 0.4902423404900738, + "grad_norm": 29.8745174407959, + "learning_rate": 6.035453468067934e-07, + "logits/chosen": -1.5774548053741455, + "logits/rejected": -1.8568214178085327, + "logps/chosen": -1.9072927236557007, + "logps/rejected": -2.082167625427246, + "loss": 2.8992, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.072927474975586, + "rewards/margins": 1.7487468719482422, + "rewards/rejected": -20.821674346923828, + "step": 14545 + }, + { + "epoch": 0.490410866561057, + "grad_norm": 20.004297256469727, + "learning_rate": 6.032575706472654e-07, + "logits/chosen": -1.165950059890747, + "logits/rejected": -1.2661291360855103, + "logps/chosen": -2.0044305324554443, + "logps/rejected": -2.1626477241516113, + "loss": 3.1361, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.0443058013916, + "rewards/margins": 1.5821691751480103, + "rewards/rejected": -21.626474380493164, + "step": 14550 + }, + { + "epoch": 0.4905793926320402, + "grad_norm": 29.16197395324707, + "learning_rate": 6.029697587543603e-07, + "logits/chosen": -1.1389122009277344, + "logits/rejected": -1.1878808736801147, + "logps/chosen": -2.138899803161621, + "logps/rejected": -2.3903207778930664, + "loss": 1.5864, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.388996124267578, + "rewards/margins": 2.5142085552215576, + "rewards/rejected": -23.903209686279297, + "step": 14555 + }, + { + "epoch": 0.49074791870302337, + "grad_norm": 48.880123138427734, + "learning_rate": 6.026819112276786e-07, + "logits/chosen": -1.1174449920654297, + "logits/rejected": -1.1852917671203613, + "logps/chosen": -2.229419231414795, + "logps/rejected": -2.2317538261413574, + "loss": 3.3643, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.294193267822266, + "rewards/margins": 0.023347090929746628, + "rewards/rejected": -22.317541122436523, + "step": 14560 + }, + { + "epoch": 0.49091644477400653, + "grad_norm": 35.779170989990234, + "learning_rate": 6.02394028166833e-07, + "logits/chosen": -1.393424153327942, + "logits/rejected": -1.6953125, + "logps/chosen": -2.20637845993042, + "logps/rejected": -2.665292739868164, + "loss": 1.8522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.063785552978516, + "rewards/margins": 4.589139461517334, + "rewards/rejected": -26.652923583984375, + "step": 14565 + }, + { + "epoch": 0.4910849708449897, + "grad_norm": 22.484113693237305, + "learning_rate": 6.021061096714484e-07, + "logits/chosen": -1.3832494020462036, + "logits/rejected": -1.5571014881134033, + "logps/chosen": -2.7218856811523438, + "logps/rejected": -3.0552916526794434, + "loss": 1.4561, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.218856811523438, + "rewards/margins": 3.334063768386841, + "rewards/rejected": -30.552921295166016, + "step": 14570 + }, + { + "epoch": 0.4912534969159729, + "grad_norm": 37.85688400268555, + "learning_rate": 6.01818155841162e-07, + "logits/chosen": -1.389123558998108, + "logits/rejected": -1.6732994318008423, + "logps/chosen": -1.9779088497161865, + "logps/rejected": -2.153419017791748, + "loss": 2.1336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.779090881347656, + "rewards/margins": 1.755099892616272, + "rewards/rejected": -21.534189224243164, + "step": 14575 + }, + { + "epoch": 0.4914220229869561, + "grad_norm": 56.15266418457031, + "learning_rate": 6.015301667756233e-07, + "logits/chosen": -1.1650410890579224, + "logits/rejected": -1.157894492149353, + "logps/chosen": -1.8984973430633545, + "logps/rejected": -2.231627941131592, + "loss": 1.5791, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.984973907470703, + "rewards/margins": 3.331307888031006, + "rewards/rejected": -22.316282272338867, + "step": 14580 + }, + { + "epoch": 0.49159054905793925, + "grad_norm": 37.5297737121582, + "learning_rate": 6.012421425744941e-07, + "logits/chosen": -1.0445148944854736, + "logits/rejected": -1.4065015316009521, + "logps/chosen": -1.8271509408950806, + "logps/rejected": -1.9741817712783813, + "loss": 2.9193, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.271509170532227, + "rewards/margins": 1.4703084230422974, + "rewards/rejected": -19.741817474365234, + "step": 14585 + }, + { + "epoch": 0.4917590751289224, + "grad_norm": 39.2474365234375, + "learning_rate": 6.009540833374481e-07, + "logits/chosen": -1.191789984703064, + "logits/rejected": -1.3047010898590088, + "logps/chosen": -2.168593645095825, + "logps/rejected": -2.7140133380889893, + "loss": 1.7459, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.68593406677246, + "rewards/margins": 5.454197883605957, + "rewards/rejected": -27.140132904052734, + "step": 14590 + }, + { + "epoch": 0.49192760119990564, + "grad_norm": 24.918357849121094, + "learning_rate": 6.006659891641712e-07, + "logits/chosen": -0.8684003949165344, + "logits/rejected": -1.174020767211914, + "logps/chosen": -2.2814254760742188, + "logps/rejected": -2.5541939735412598, + "loss": 3.1475, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.814252853393555, + "rewards/margins": 2.7276878356933594, + "rewards/rejected": -25.541942596435547, + "step": 14595 + }, + { + "epoch": 0.4920961272708888, + "grad_norm": 59.820919036865234, + "learning_rate": 6.003778601543616e-07, + "logits/chosen": -1.7482401132583618, + "logits/rejected": -1.6752662658691406, + "logps/chosen": -2.2109625339508057, + "logps/rejected": -2.252445936203003, + "loss": 3.9494, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.1096248626709, + "rewards/margins": 0.4148353934288025, + "rewards/rejected": -22.524459838867188, + "step": 14600 + }, + { + "epoch": 0.49226465334187197, + "grad_norm": 34.3822135925293, + "learning_rate": 6.000896964077295e-07, + "logits/chosen": -0.9001103639602661, + "logits/rejected": -0.9175226092338562, + "logps/chosen": -2.6637980937957764, + "logps/rejected": -2.9894144535064697, + "loss": 3.8383, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.63797950744629, + "rewards/margins": 3.2561659812927246, + "rewards/rejected": -29.89414405822754, + "step": 14605 + }, + { + "epoch": 0.4924331794128552, + "grad_norm": 20.390865325927734, + "learning_rate": 5.998014980239966e-07, + "logits/chosen": -1.5274779796600342, + "logits/rejected": -1.6380846500396729, + "logps/chosen": -2.4721086025238037, + "logps/rejected": -2.516230821609497, + "loss": 3.6446, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.721084594726562, + "rewards/margins": 0.4412227272987366, + "rewards/rejected": -25.162307739257812, + "step": 14610 + }, + { + "epoch": 0.49260170548383836, + "grad_norm": 28.802907943725586, + "learning_rate": 5.995132651028973e-07, + "logits/chosen": -1.3802731037139893, + "logits/rejected": -1.5385868549346924, + "logps/chosen": -2.067274570465088, + "logps/rejected": -2.7522835731506348, + "loss": 2.2341, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.672746658325195, + "rewards/margins": 6.850091457366943, + "rewards/rejected": -27.522836685180664, + "step": 14615 + }, + { + "epoch": 0.4927702315548215, + "grad_norm": 33.500038146972656, + "learning_rate": 5.992249977441778e-07, + "logits/chosen": -1.3794944286346436, + "logits/rejected": -1.4457772970199585, + "logps/chosen": -2.42356538772583, + "logps/rejected": -2.1479406356811523, + "loss": 5.8888, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.23565673828125, + "rewards/margins": -2.7562496662139893, + "rewards/rejected": -21.47940444946289, + "step": 14620 + }, + { + "epoch": 0.4929387576258047, + "grad_norm": 25.33816909790039, + "learning_rate": 5.989366960475956e-07, + "logits/chosen": -1.3720991611480713, + "logits/rejected": -1.6218681335449219, + "logps/chosen": -1.9994127750396729, + "logps/rejected": -2.1939616203308105, + "loss": 2.91, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.99412727355957, + "rewards/margins": 1.9454885721206665, + "rewards/rejected": -21.93961524963379, + "step": 14625 + }, + { + "epoch": 0.4931072836967879, + "grad_norm": 1.7631736993789673, + "learning_rate": 5.986483601129212e-07, + "logits/chosen": -0.9612113833427429, + "logits/rejected": -1.0274362564086914, + "logps/chosen": -2.0222010612487793, + "logps/rejected": -2.2431979179382324, + "loss": 2.969, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.22201156616211, + "rewards/margins": 2.209967851638794, + "rewards/rejected": -22.43198013305664, + "step": 14630 + }, + { + "epoch": 0.4932758097677711, + "grad_norm": 23.257797241210938, + "learning_rate": 5.983599900399357e-07, + "logits/chosen": -1.47468101978302, + "logits/rejected": -1.5135730504989624, + "logps/chosen": -2.2988877296447754, + "logps/rejected": -1.9558820724487305, + "loss": 6.8303, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.988876342773438, + "rewards/margins": -3.430056095123291, + "rewards/rejected": -19.558818817138672, + "step": 14635 + }, + { + "epoch": 0.49344433583875424, + "grad_norm": 121.70762634277344, + "learning_rate": 5.98071585928433e-07, + "logits/chosen": -1.7339370250701904, + "logits/rejected": -1.6378087997436523, + "logps/chosen": -2.540347099304199, + "logps/rejected": -2.673609733581543, + "loss": 3.1187, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.40346908569336, + "rewards/margins": 1.3326267004013062, + "rewards/rejected": -26.736095428466797, + "step": 14640 + }, + { + "epoch": 0.4936128619097374, + "grad_norm": 22.278928756713867, + "learning_rate": 5.977831478782181e-07, + "logits/chosen": -1.2727556228637695, + "logits/rejected": -1.4096349477767944, + "logps/chosen": -1.9860179424285889, + "logps/rejected": -2.102168083190918, + "loss": 2.8199, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.860179901123047, + "rewards/margins": 1.1615017652511597, + "rewards/rejected": -21.02168083190918, + "step": 14645 + }, + { + "epoch": 0.49378138798072063, + "grad_norm": 12.310074806213379, + "learning_rate": 5.974946759891084e-07, + "logits/chosen": -1.1900991201400757, + "logits/rejected": -1.4266362190246582, + "logps/chosen": -2.0711121559143066, + "logps/rejected": -2.1807780265808105, + "loss": 2.5928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.711122512817383, + "rewards/margins": 1.0966581106185913, + "rewards/rejected": -21.80777931213379, + "step": 14650 + }, + { + "epoch": 0.4939499140517038, + "grad_norm": 139.086669921875, + "learning_rate": 5.972061703609326e-07, + "logits/chosen": -1.0209219455718994, + "logits/rejected": -0.8229808807373047, + "logps/chosen": -2.4855475425720215, + "logps/rejected": -2.6358425617218018, + "loss": 3.4862, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.8554744720459, + "rewards/margins": 1.502950668334961, + "rewards/rejected": -26.35842514038086, + "step": 14655 + }, + { + "epoch": 0.49411844012268696, + "grad_norm": 32.465938568115234, + "learning_rate": 5.969176310935307e-07, + "logits/chosen": -1.2797296047210693, + "logits/rejected": -1.3183832168579102, + "logps/chosen": -1.6758267879486084, + "logps/rejected": -1.6694438457489014, + "loss": 3.3985, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.758268356323242, + "rewards/margins": -0.0638284683227539, + "rewards/rejected": -16.694438934326172, + "step": 14660 + }, + { + "epoch": 0.4942869661936702, + "grad_norm": 57.931907653808594, + "learning_rate": 5.966290582867552e-07, + "logits/chosen": -1.1228911876678467, + "logits/rejected": -1.4107589721679688, + "logps/chosen": -2.399655818939209, + "logps/rejected": -2.4190433025360107, + "loss": 3.325, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.996559143066406, + "rewards/margins": 0.19387368857860565, + "rewards/rejected": -24.190433502197266, + "step": 14665 + }, + { + "epoch": 0.49445549226465335, + "grad_norm": 24.14126205444336, + "learning_rate": 5.963404520404696e-07, + "logits/chosen": -1.2213075160980225, + "logits/rejected": -1.4488023519515991, + "logps/chosen": -1.8714441061019897, + "logps/rejected": -2.1866447925567627, + "loss": 2.0998, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.714441299438477, + "rewards/margins": 3.152009963989258, + "rewards/rejected": -21.8664493560791, + "step": 14670 + }, + { + "epoch": 0.4946240183356365, + "grad_norm": 32.929386138916016, + "learning_rate": 5.960518124545492e-07, + "logits/chosen": -1.1682794094085693, + "logits/rejected": -1.2816708087921143, + "logps/chosen": -2.183274030685425, + "logps/rejected": -2.345292329788208, + "loss": 3.108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.832740783691406, + "rewards/margins": 1.6201798915863037, + "rewards/rejected": -23.45292091369629, + "step": 14675 + }, + { + "epoch": 0.4947925444066197, + "grad_norm": 19.513643264770508, + "learning_rate": 5.957631396288809e-07, + "logits/chosen": -1.9665130376815796, + "logits/rejected": -2.2521214485168457, + "logps/chosen": -2.527388334274292, + "logps/rejected": -2.6848843097686768, + "loss": 3.246, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.273883819580078, + "rewards/margins": 1.5749595165252686, + "rewards/rejected": -26.848840713500977, + "step": 14680 + }, + { + "epoch": 0.4949610704776029, + "grad_norm": 31.406166076660156, + "learning_rate": 5.954744336633629e-07, + "logits/chosen": -1.33124840259552, + "logits/rejected": -1.4813728332519531, + "logps/chosen": -2.0210530757904053, + "logps/rejected": -2.093916893005371, + "loss": 2.7094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.210533142089844, + "rewards/margins": 0.7286360859870911, + "rewards/rejected": -20.939167022705078, + "step": 14685 + }, + { + "epoch": 0.49512959654858607, + "grad_norm": 23.128408432006836, + "learning_rate": 5.95185694657905e-07, + "logits/chosen": -1.0466244220733643, + "logits/rejected": -1.5269792079925537, + "logps/chosen": -2.0125339031219482, + "logps/rejected": -2.319976329803467, + "loss": 1.7864, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.125341415405273, + "rewards/margins": 3.0744259357452393, + "rewards/rejected": -23.199764251708984, + "step": 14690 + }, + { + "epoch": 0.49529812261956924, + "grad_norm": 50.07123947143555, + "learning_rate": 5.948969227124282e-07, + "logits/chosen": -0.7739205956459045, + "logits/rejected": -1.1197071075439453, + "logps/chosen": -2.1050896644592285, + "logps/rejected": -2.4487571716308594, + "loss": 2.6332, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.050893783569336, + "rewards/margins": 3.4366748332977295, + "rewards/rejected": -24.487571716308594, + "step": 14695 + }, + { + "epoch": 0.4954666486905524, + "grad_norm": 11.936738967895508, + "learning_rate": 5.946081179268654e-07, + "logits/chosen": -1.552634596824646, + "logits/rejected": -1.8280937671661377, + "logps/chosen": -2.6402595043182373, + "logps/rejected": -2.8458423614501953, + "loss": 3.8194, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.4025936126709, + "rewards/margins": 2.0558295249938965, + "rewards/rejected": -28.458423614501953, + "step": 14700 + }, + { + "epoch": 0.4956351747615356, + "grad_norm": 71.45115661621094, + "learning_rate": 5.943192804011602e-07, + "logits/chosen": -1.7254797220230103, + "logits/rejected": -1.431593894958496, + "logps/chosen": -3.140167474746704, + "logps/rejected": -3.0458261966705322, + "loss": 4.3807, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.401676177978516, + "rewards/margins": -0.9434127807617188, + "rewards/rejected": -30.4582576751709, + "step": 14705 + }, + { + "epoch": 0.4958037008325188, + "grad_norm": 17.0635929107666, + "learning_rate": 5.940304102352682e-07, + "logits/chosen": -1.3017793893814087, + "logits/rejected": -1.3800328969955444, + "logps/chosen": -1.8271408081054688, + "logps/rejected": -1.947548508644104, + "loss": 3.4235, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.271406173706055, + "rewards/margins": 1.2040780782699585, + "rewards/rejected": -19.475482940673828, + "step": 14710 + }, + { + "epoch": 0.49597222690350196, + "grad_norm": 18.700368881225586, + "learning_rate": 5.93741507529156e-07, + "logits/chosen": -1.353197455406189, + "logits/rejected": -1.3648579120635986, + "logps/chosen": -1.6878719329833984, + "logps/rejected": -1.8210432529449463, + "loss": 2.3033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.878719329833984, + "rewards/margins": 1.3317129611968994, + "rewards/rejected": -18.210430145263672, + "step": 14715 + }, + { + "epoch": 0.4961407529744852, + "grad_norm": 20.979001998901367, + "learning_rate": 5.934525723828011e-07, + "logits/chosen": -1.2096859216690063, + "logits/rejected": -1.4101965427398682, + "logps/chosen": -2.714660167694092, + "logps/rejected": -2.635310649871826, + "loss": 3.9666, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.146602630615234, + "rewards/margins": -0.7934969663619995, + "rewards/rejected": -26.353107452392578, + "step": 14720 + }, + { + "epoch": 0.49630927904546834, + "grad_norm": 28.868972778320312, + "learning_rate": 5.931636048961928e-07, + "logits/chosen": -2.1657252311706543, + "logits/rejected": -2.1079888343811035, + "logps/chosen": -1.6346423625946045, + "logps/rejected": -1.6175251007080078, + "loss": 3.3324, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -16.346424102783203, + "rewards/margins": -0.1711721420288086, + "rewards/rejected": -16.17525291442871, + "step": 14725 + }, + { + "epoch": 0.4964778051164515, + "grad_norm": 65.39803314208984, + "learning_rate": 5.928746051693314e-07, + "logits/chosen": -1.4344148635864258, + "logits/rejected": -1.676160454750061, + "logps/chosen": -2.6490559577941895, + "logps/rejected": -2.7858047485351562, + "loss": 3.5182, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.490558624267578, + "rewards/margins": 1.367490530014038, + "rewards/rejected": -27.858051300048828, + "step": 14730 + }, + { + "epoch": 0.4966463311874347, + "grad_norm": 28.76473617553711, + "learning_rate": 5.925855733022284e-07, + "logits/chosen": -1.472895860671997, + "logits/rejected": -1.8007482290267944, + "logps/chosen": -2.340292453765869, + "logps/rejected": -2.5708696842193604, + "loss": 1.9541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.40292739868164, + "rewards/margins": 2.305771827697754, + "rewards/rejected": -25.708698272705078, + "step": 14735 + }, + { + "epoch": 0.4968148572584179, + "grad_norm": 32.8542594909668, + "learning_rate": 5.922965093949059e-07, + "logits/chosen": -1.4455643892288208, + "logits/rejected": -1.7135651111602783, + "logps/chosen": -2.043985366821289, + "logps/rejected": -2.2156598567962646, + "loss": 2.1238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.43985366821289, + "rewards/margins": 1.7167431116104126, + "rewards/rejected": -22.156597137451172, + "step": 14740 + }, + { + "epoch": 0.49698338332940106, + "grad_norm": 21.394441604614258, + "learning_rate": 5.92007413547398e-07, + "logits/chosen": -1.5994513034820557, + "logits/rejected": -1.3545200824737549, + "logps/chosen": -1.9160792827606201, + "logps/rejected": -1.9209601879119873, + "loss": 3.2191, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.16079330444336, + "rewards/margins": 0.048807524144649506, + "rewards/rejected": -19.20960235595703, + "step": 14745 + }, + { + "epoch": 0.49715190940038423, + "grad_norm": 1.6420506238937378, + "learning_rate": 5.917182858597493e-07, + "logits/chosen": -0.9913978576660156, + "logits/rejected": -1.404176950454712, + "logps/chosen": -2.147432804107666, + "logps/rejected": -2.678109645843506, + "loss": 2.0465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.47432518005371, + "rewards/margins": 5.3067708015441895, + "rewards/rejected": -26.781097412109375, + "step": 14750 + }, + { + "epoch": 0.4973204354713674, + "grad_norm": 22.916858673095703, + "learning_rate": 5.914291264320152e-07, + "logits/chosen": -1.6764614582061768, + "logits/rejected": -1.8273632526397705, + "logps/chosen": -2.4563040733337402, + "logps/rejected": -2.4414238929748535, + "loss": 4.3066, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.56304168701172, + "rewards/margins": -0.14880123734474182, + "rewards/rejected": -24.41423988342285, + "step": 14755 + }, + { + "epoch": 0.4974889615423506, + "grad_norm": 30.65839195251465, + "learning_rate": 5.911399353642629e-07, + "logits/chosen": -1.2902991771697998, + "logits/rejected": -1.3787257671356201, + "logps/chosen": -2.3649189472198486, + "logps/rejected": -2.294633388519287, + "loss": 4.6882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.649185180664062, + "rewards/margins": -0.7028514742851257, + "rewards/rejected": -22.946334838867188, + "step": 14760 + }, + { + "epoch": 0.4976574876133338, + "grad_norm": 44.71762466430664, + "learning_rate": 5.908507127565695e-07, + "logits/chosen": -1.3229894638061523, + "logits/rejected": -1.5332590341567993, + "logps/chosen": -2.423475742340088, + "logps/rejected": -2.6610312461853027, + "loss": 2.5497, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.23475456237793, + "rewards/margins": 2.375556230545044, + "rewards/rejected": -26.61031150817871, + "step": 14765 + }, + { + "epoch": 0.49782601368431695, + "grad_norm": 11.168540954589844, + "learning_rate": 5.905614587090239e-07, + "logits/chosen": -1.39534592628479, + "logits/rejected": -1.234565258026123, + "logps/chosen": -2.0755391120910645, + "logps/rejected": -2.285327434539795, + "loss": 2.1858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.755390167236328, + "rewards/margins": 2.0978846549987793, + "rewards/rejected": -22.853275299072266, + "step": 14770 + }, + { + "epoch": 0.49799453975530017, + "grad_norm": 27.3898983001709, + "learning_rate": 5.902721733217254e-07, + "logits/chosen": -1.3990356922149658, + "logits/rejected": -1.514904499053955, + "logps/chosen": -1.944977045059204, + "logps/rejected": -1.9022576808929443, + "loss": 3.5773, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.449771881103516, + "rewards/margins": -0.4271933436393738, + "rewards/rejected": -19.0225772857666, + "step": 14775 + }, + { + "epoch": 0.49816306582628334, + "grad_norm": 7.661595344543457, + "learning_rate": 5.899828566947843e-07, + "logits/chosen": -1.1851098537445068, + "logits/rejected": -1.463303565979004, + "logps/chosen": -2.121722459793091, + "logps/rejected": -2.8411059379577637, + "loss": 1.9996, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.21722412109375, + "rewards/margins": 7.193833827972412, + "rewards/rejected": -28.411062240600586, + "step": 14780 + }, + { + "epoch": 0.4983315918972665, + "grad_norm": 30.427043914794922, + "learning_rate": 5.896935089283217e-07, + "logits/chosen": -1.6385900974273682, + "logits/rejected": -1.9292678833007812, + "logps/chosen": -2.1936774253845215, + "logps/rejected": -2.4388184547424316, + "loss": 2.0809, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.9367733001709, + "rewards/margins": 2.4514102935791016, + "rewards/rejected": -24.38818359375, + "step": 14785 + }, + { + "epoch": 0.49850011796824967, + "grad_norm": 22.440723419189453, + "learning_rate": 5.894041301224694e-07, + "logits/chosen": -1.774770736694336, + "logits/rejected": -2.0120785236358643, + "logps/chosen": -2.0042014122009277, + "logps/rejected": -2.512622356414795, + "loss": 1.9292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.04201316833496, + "rewards/margins": 5.084211826324463, + "rewards/rejected": -25.126224517822266, + "step": 14790 + }, + { + "epoch": 0.4986686440392329, + "grad_norm": 32.15318298339844, + "learning_rate": 5.8911472037737e-07, + "logits/chosen": -1.5917437076568604, + "logits/rejected": -1.4512475728988647, + "logps/chosen": -2.2175662517547607, + "logps/rejected": -2.5062191486358643, + "loss": 3.2555, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.175662994384766, + "rewards/margins": 2.886528253555298, + "rewards/rejected": -25.06218910217285, + "step": 14795 + }, + { + "epoch": 0.49883717011021605, + "grad_norm": 24.423494338989258, + "learning_rate": 5.88825279793177e-07, + "logits/chosen": -1.39306640625, + "logits/rejected": -1.2912009954452515, + "logps/chosen": -2.3427510261535645, + "logps/rejected": -2.494655132293701, + "loss": 2.7856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.427509307861328, + "rewards/margins": 1.5190414190292358, + "rewards/rejected": -24.946552276611328, + "step": 14800 + }, + { + "epoch": 0.49883717011021605, + "eval_logits/chosen": -1.746840238571167, + "eval_logits/rejected": -1.8720086812973022, + "eval_logps/chosen": -2.02829647064209, + "eval_logps/rejected": -2.141511917114258, + "eval_loss": 2.990762948989868, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -20.2829647064209, + "eval_rewards/margins": 1.1321519613265991, + "eval_rewards/rejected": -21.415117263793945, + "eval_runtime": 12.8988, + "eval_samples_per_second": 7.753, + "eval_steps_per_second": 1.938, + "step": 14800 + }, + { + "epoch": 0.4990056961811992, + "grad_norm": 23.94206428527832, + "learning_rate": 5.885358084700542e-07, + "logits/chosen": -1.1922471523284912, + "logits/rejected": -1.102346658706665, + "logps/chosen": -2.3455393314361572, + "logps/rejected": -2.550114870071411, + "loss": 2.7445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.455392837524414, + "rewards/margins": 2.045755624771118, + "rewards/rejected": -25.501148223876953, + "step": 14805 + }, + { + "epoch": 0.4991742222521824, + "grad_norm": 27.79281997680664, + "learning_rate": 5.882463065081762e-07, + "logits/chosen": -1.541636347770691, + "logits/rejected": -1.4492474794387817, + "logps/chosen": -2.2527847290039062, + "logps/rejected": -2.196924924850464, + "loss": 3.8975, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.52784538269043, + "rewards/margins": -0.5585947036743164, + "rewards/rejected": -21.96925163269043, + "step": 14810 + }, + { + "epoch": 0.4993427483231656, + "grad_norm": 16.550588607788086, + "learning_rate": 5.879567740077283e-07, + "logits/chosen": -1.3408617973327637, + "logits/rejected": -1.3791230916976929, + "logps/chosen": -1.9284662008285522, + "logps/rejected": -2.4746458530426025, + "loss": 1.635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.28466033935547, + "rewards/margins": 5.461796760559082, + "rewards/rejected": -24.746456146240234, + "step": 14815 + }, + { + "epoch": 0.4995112743941488, + "grad_norm": 26.888790130615234, + "learning_rate": 5.876672110689063e-07, + "logits/chosen": -1.716398000717163, + "logits/rejected": -1.6853067874908447, + "logps/chosen": -1.8864831924438477, + "logps/rejected": -1.9089370965957642, + "loss": 2.9516, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.864831924438477, + "rewards/margins": 0.22453880310058594, + "rewards/rejected": -19.089370727539062, + "step": 14820 + }, + { + "epoch": 0.49967980046513194, + "grad_norm": 142.75662231445312, + "learning_rate": 5.873776177919163e-07, + "logits/chosen": -2.0140433311462402, + "logits/rejected": -2.0492501258850098, + "logps/chosen": -1.961703896522522, + "logps/rejected": -2.112826108932495, + "loss": 2.7826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.61703872680664, + "rewards/margins": 1.511224389076233, + "rewards/rejected": -21.12826156616211, + "step": 14825 + }, + { + "epoch": 0.49984832653611516, + "grad_norm": 12.503056526184082, + "learning_rate": 5.870879942769757e-07, + "logits/chosen": -1.238468050956726, + "logits/rejected": -1.2341539859771729, + "logps/chosen": -2.7365505695343018, + "logps/rejected": -2.764101028442383, + "loss": 3.6693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.365509033203125, + "rewards/margins": 0.2755018174648285, + "rewards/rejected": -27.641006469726562, + "step": 14830 + }, + { + "epoch": 0.5000168526070983, + "grad_norm": 52.38449478149414, + "learning_rate": 5.867983406243111e-07, + "logits/chosen": -1.2844483852386475, + "logits/rejected": -1.7105772495269775, + "logps/chosen": -2.4828011989593506, + "logps/rejected": -2.6509883403778076, + "loss": 2.5619, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.828014373779297, + "rewards/margins": 1.6818711757659912, + "rewards/rejected": -26.509883880615234, + "step": 14835 + }, + { + "epoch": 0.5001853786780815, + "grad_norm": 26.385717391967773, + "learning_rate": 5.865086569341606e-07, + "logits/chosen": -0.7888490557670593, + "logits/rejected": -0.9685935974121094, + "logps/chosen": -2.1663498878479004, + "logps/rejected": -2.4157395362854004, + "loss": 3.3503, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.663497924804688, + "rewards/margins": 2.493898391723633, + "rewards/rejected": -24.157394409179688, + "step": 14840 + }, + { + "epoch": 0.5003539047490647, + "grad_norm": 26.826446533203125, + "learning_rate": 5.862189433067722e-07, + "logits/chosen": -1.3891171216964722, + "logits/rejected": -1.4050616025924683, + "logps/chosen": -1.6742547750473022, + "logps/rejected": -1.6001243591308594, + "loss": 3.8043, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.7425479888916, + "rewards/margins": -0.7413040995597839, + "rewards/rejected": -16.001245498657227, + "step": 14845 + }, + { + "epoch": 0.5005224308200479, + "grad_norm": 291.24468994140625, + "learning_rate": 5.859291998424047e-07, + "logits/chosen": -1.1373765468597412, + "logits/rejected": -1.0353875160217285, + "logps/chosen": -2.4684436321258545, + "logps/rejected": -2.557105541229248, + "loss": 4.6681, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.684436798095703, + "rewards/margins": 0.8866220712661743, + "rewards/rejected": -25.57105827331543, + "step": 14850 + }, + { + "epoch": 0.500690956891031, + "grad_norm": 42.07600784301758, + "learning_rate": 5.856394266413264e-07, + "logits/chosen": -1.3034619092941284, + "logits/rejected": -1.3409957885742188, + "logps/chosen": -2.2791152000427246, + "logps/rejected": -2.2514889240264893, + "loss": 3.4252, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.791152954101562, + "rewards/margins": -0.27626290917396545, + "rewards/rejected": -22.514888763427734, + "step": 14855 + }, + { + "epoch": 0.5008594829620142, + "grad_norm": 2.786142110824585, + "learning_rate": 5.853496238038165e-07, + "logits/chosen": -1.2134068012237549, + "logits/rejected": -1.3753328323364258, + "logps/chosen": -2.052281141281128, + "logps/rejected": -2.3305046558380127, + "loss": 1.8951, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.522811889648438, + "rewards/margins": 2.7822327613830566, + "rewards/rejected": -23.305044174194336, + "step": 14860 + }, + { + "epoch": 0.5010280090329974, + "grad_norm": 25.388391494750977, + "learning_rate": 5.850597914301646e-07, + "logits/chosen": -1.2176518440246582, + "logits/rejected": -1.2763893604278564, + "logps/chosen": -2.0402097702026367, + "logps/rejected": -2.1866259574890137, + "loss": 2.6111, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.402097702026367, + "rewards/margins": 1.4641621112823486, + "rewards/rejected": -21.866260528564453, + "step": 14865 + }, + { + "epoch": 0.5011965351039805, + "grad_norm": 62.56755065917969, + "learning_rate": 5.847699296206699e-07, + "logits/chosen": -1.7310209274291992, + "logits/rejected": -1.6556918621063232, + "logps/chosen": -2.3317017555236816, + "logps/rejected": -2.410207748413086, + "loss": 4.2986, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.317020416259766, + "rewards/margins": 0.7850597500801086, + "rewards/rejected": -24.102079391479492, + "step": 14870 + }, + { + "epoch": 0.5013650611749638, + "grad_norm": 42.40745162963867, + "learning_rate": 5.844800384756427e-07, + "logits/chosen": -0.902258038520813, + "logits/rejected": -0.7612020373344421, + "logps/chosen": -2.677332639694214, + "logps/rejected": -3.0184645652770996, + "loss": 3.5555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.773326873779297, + "rewards/margins": 3.4113197326660156, + "rewards/rejected": -30.184650421142578, + "step": 14875 + }, + { + "epoch": 0.501533587245947, + "grad_norm": 34.04011535644531, + "learning_rate": 5.841901180954023e-07, + "logits/chosen": -1.5480183362960815, + "logits/rejected": -1.486154317855835, + "logps/chosen": -2.7318997383117676, + "logps/rejected": -2.1882028579711914, + "loss": 9.325, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.318994522094727, + "rewards/margins": -5.436966896057129, + "rewards/rejected": -21.882028579711914, + "step": 14880 + }, + { + "epoch": 0.5017021133169302, + "grad_norm": 31.812999725341797, + "learning_rate": 5.839001685802791e-07, + "logits/chosen": -1.213324785232544, + "logits/rejected": -1.4107666015625, + "logps/chosen": -2.0232436656951904, + "logps/rejected": -2.030778646469116, + "loss": 3.2473, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.232437133789062, + "rewards/margins": 0.0753483772277832, + "rewards/rejected": -20.307785034179688, + "step": 14885 + }, + { + "epoch": 0.5018706393879133, + "grad_norm": 51.604061126708984, + "learning_rate": 5.83610190030613e-07, + "logits/chosen": -1.1090004444122314, + "logits/rejected": -1.0023685693740845, + "logps/chosen": -1.8414013385772705, + "logps/rejected": -2.0963680744171143, + "loss": 3.827, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.414012908935547, + "rewards/margins": 2.5496678352355957, + "rewards/rejected": -20.963680267333984, + "step": 14890 + }, + { + "epoch": 0.5020391654588965, + "grad_norm": 6.897836685180664, + "learning_rate": 5.833201825467542e-07, + "logits/chosen": -1.3187196254730225, + "logits/rejected": -1.425402283668518, + "logps/chosen": -1.9114145040512085, + "logps/rejected": -2.122316598892212, + "loss": 2.0474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.114147186279297, + "rewards/margins": 2.109020948410034, + "rewards/rejected": -21.22316551208496, + "step": 14895 + }, + { + "epoch": 0.5022076915298797, + "grad_norm": 51.942203521728516, + "learning_rate": 5.830301462290631e-07, + "logits/chosen": -1.130313515663147, + "logits/rejected": -1.3741600513458252, + "logps/chosen": -2.0696234703063965, + "logps/rejected": -2.4011638164520264, + "loss": 2.609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.696231842041016, + "rewards/margins": 3.3154044151306152, + "rewards/rejected": -24.011638641357422, + "step": 14900 + }, + { + "epoch": 0.5023762176008628, + "grad_norm": 16.872711181640625, + "learning_rate": 5.827400811779094e-07, + "logits/chosen": -1.4417014122009277, + "logits/rejected": -1.4584705829620361, + "logps/chosen": -2.0206668376922607, + "logps/rejected": -2.132718324661255, + "loss": 2.462, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.206668853759766, + "rewards/margins": 1.120514154434204, + "rewards/rejected": -21.32718276977539, + "step": 14905 + }, + { + "epoch": 0.502544743671846, + "grad_norm": 36.745567321777344, + "learning_rate": 5.824499874936737e-07, + "logits/chosen": -1.2186622619628906, + "logits/rejected": -1.7788499593734741, + "logps/chosen": -2.42600154876709, + "logps/rejected": -2.794022560119629, + "loss": 2.5989, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.2600154876709, + "rewards/margins": 3.680208206176758, + "rewards/rejected": -27.94022560119629, + "step": 14910 + }, + { + "epoch": 0.5027132697428293, + "grad_norm": 20.291852951049805, + "learning_rate": 5.821598652767456e-07, + "logits/chosen": -1.5000197887420654, + "logits/rejected": -1.3767282962799072, + "logps/chosen": -2.5916619300842285, + "logps/rejected": -2.7663090229034424, + "loss": 2.7872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.916616439819336, + "rewards/margins": 1.7464720010757446, + "rewards/rejected": -27.663089752197266, + "step": 14915 + }, + { + "epoch": 0.5028817958138124, + "grad_norm": 39.71499252319336, + "learning_rate": 5.818697146275251e-07, + "logits/chosen": -1.3548834323883057, + "logits/rejected": -1.595643162727356, + "logps/chosen": -2.2784557342529297, + "logps/rejected": -2.455989122390747, + "loss": 1.7163, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.784555435180664, + "rewards/margins": 1.7753328084945679, + "rewards/rejected": -24.55988883972168, + "step": 14920 + }, + { + "epoch": 0.5030503218847956, + "grad_norm": 21.26605796813965, + "learning_rate": 5.815795356464219e-07, + "logits/chosen": -1.3504364490509033, + "logits/rejected": -1.2849876880645752, + "logps/chosen": -1.9238868951797485, + "logps/rejected": -1.8664134740829468, + "loss": 4.2078, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.23887062072754, + "rewards/margins": -0.5747331380844116, + "rewards/rejected": -18.664134979248047, + "step": 14925 + }, + { + "epoch": 0.5032188479557788, + "grad_norm": 54.63640213012695, + "learning_rate": 5.812893284338554e-07, + "logits/chosen": -1.5353538990020752, + "logits/rejected": -1.5873312950134277, + "logps/chosen": -2.139723300933838, + "logps/rejected": -2.3303074836730957, + "loss": 3.1369, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.397235870361328, + "rewards/margins": 1.9058374166488647, + "rewards/rejected": -23.303071975708008, + "step": 14930 + }, + { + "epoch": 0.5033873740267619, + "grad_norm": 21.923625946044922, + "learning_rate": 5.809990930902553e-07, + "logits/chosen": -1.544602870941162, + "logits/rejected": -1.413293719291687, + "logps/chosen": -1.8571140766143799, + "logps/rejected": -2.018515110015869, + "loss": 2.363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.57114028930664, + "rewards/margins": 1.614009141921997, + "rewards/rejected": -20.185152053833008, + "step": 14935 + }, + { + "epoch": 0.5035559000977451, + "grad_norm": 67.67894744873047, + "learning_rate": 5.8070882971606e-07, + "logits/chosen": -1.0476644039154053, + "logits/rejected": -0.980495274066925, + "logps/chosen": -2.2644171714782715, + "logps/rejected": -2.1518642902374268, + "loss": 4.2407, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.644174575805664, + "rewards/margins": -1.1255325078964233, + "rewards/rejected": -21.51864242553711, + "step": 14940 + }, + { + "epoch": 0.5037244261687283, + "grad_norm": 16.13987922668457, + "learning_rate": 5.804185384117189e-07, + "logits/chosen": -1.2747676372528076, + "logits/rejected": -1.362653136253357, + "logps/chosen": -1.9708786010742188, + "logps/rejected": -2.423872470855713, + "loss": 1.4325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.708786010742188, + "rewards/margins": 4.529940128326416, + "rewards/rejected": -24.238727569580078, + "step": 14945 + }, + { + "epoch": 0.5038929522397115, + "grad_norm": 15.113187789916992, + "learning_rate": 5.801282192776897e-07, + "logits/chosen": -0.9203144907951355, + "logits/rejected": -1.1155325174331665, + "logps/chosen": -1.7292630672454834, + "logps/rejected": -2.20222806930542, + "loss": 1.5978, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.29262924194336, + "rewards/margins": 4.729650974273682, + "rewards/rejected": -22.022281646728516, + "step": 14950 + }, + { + "epoch": 0.5040614783106947, + "grad_norm": 20.39295196533203, + "learning_rate": 5.798378724144408e-07, + "logits/chosen": -1.5319854021072388, + "logits/rejected": -1.7888109683990479, + "logps/chosen": -2.368319272994995, + "logps/rejected": -2.5020503997802734, + "loss": 3.2456, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.68319320678711, + "rewards/margins": 1.3373106718063354, + "rewards/rejected": -25.020505905151367, + "step": 14955 + }, + { + "epoch": 0.5042300043816779, + "grad_norm": 41.03092575073242, + "learning_rate": 5.795474979224497e-07, + "logits/chosen": -1.3623888492584229, + "logits/rejected": -1.5820858478546143, + "logps/chosen": -1.880578637123108, + "logps/rejected": -1.8766975402832031, + "loss": 3.2324, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.805784225463867, + "rewards/margins": -0.03881063312292099, + "rewards/rejected": -18.76697540283203, + "step": 14960 + }, + { + "epoch": 0.504398530452661, + "grad_norm": 28.810705184936523, + "learning_rate": 5.792570959022036e-07, + "logits/chosen": -1.0483916997909546, + "logits/rejected": -1.350998878479004, + "logps/chosen": -1.980957269668579, + "logps/rejected": -2.296393394470215, + "loss": 2.5662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.809574127197266, + "rewards/margins": 3.154359817504883, + "rewards/rejected": -22.96393394470215, + "step": 14965 + }, + { + "epoch": 0.5045670565236442, + "grad_norm": 18.993446350097656, + "learning_rate": 5.789666664541995e-07, + "logits/chosen": -0.7886873483657837, + "logits/rejected": -0.8404957056045532, + "logps/chosen": -2.4663655757904053, + "logps/rejected": -2.771146774291992, + "loss": 2.7351, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.663654327392578, + "rewards/margins": 3.0478122234344482, + "rewards/rejected": -27.711467742919922, + "step": 14970 + }, + { + "epoch": 0.5047355825946274, + "grad_norm": 61.77982711791992, + "learning_rate": 5.78676209678943e-07, + "logits/chosen": -1.4243978261947632, + "logits/rejected": -1.5058616399765015, + "logps/chosen": -3.024482250213623, + "logps/rejected": -3.1523780822753906, + "loss": 3.9404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.244823455810547, + "rewards/margins": 1.2789571285247803, + "rewards/rejected": -31.523778915405273, + "step": 14975 + }, + { + "epoch": 0.5049041086656105, + "grad_norm": 19.27959442138672, + "learning_rate": 5.783857256769503e-07, + "logits/chosen": -1.178634762763977, + "logits/rejected": -1.251468300819397, + "logps/chosen": -1.8392133712768555, + "logps/rejected": -1.937748670578003, + "loss": 2.3974, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.392135620117188, + "rewards/margins": 0.9853529930114746, + "rewards/rejected": -19.377490997314453, + "step": 14980 + }, + { + "epoch": 0.5050726347365938, + "grad_norm": 8.128096580505371, + "learning_rate": 5.78095214548746e-07, + "logits/chosen": -1.1162395477294922, + "logits/rejected": -1.6783126592636108, + "logps/chosen": -1.9752323627471924, + "logps/rejected": -2.4849510192871094, + "loss": 1.2585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.752323150634766, + "rewards/margins": 5.097184658050537, + "rewards/rejected": -24.84950828552246, + "step": 14985 + }, + { + "epoch": 0.505241160807577, + "grad_norm": 31.431013107299805, + "learning_rate": 5.778046763948649e-07, + "logits/chosen": -1.3854622840881348, + "logits/rejected": -1.2875674962997437, + "logps/chosen": -2.1900856494903564, + "logps/rejected": -1.9820547103881836, + "loss": 5.1442, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.900854110717773, + "rewards/margins": -2.080308437347412, + "rewards/rejected": -19.820547103881836, + "step": 14990 + }, + { + "epoch": 0.5054096868785601, + "grad_norm": 1.898257851600647, + "learning_rate": 5.775141113158506e-07, + "logits/chosen": -1.081305742263794, + "logits/rejected": -1.121368646621704, + "logps/chosen": -2.686314582824707, + "logps/rejected": -2.8313148021698, + "loss": 3.5432, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.863147735595703, + "rewards/margins": 1.450002670288086, + "rewards/rejected": -28.313146591186523, + "step": 14995 + }, + { + "epoch": 0.5055782129495433, + "grad_norm": 30.13627815246582, + "learning_rate": 5.772235194122564e-07, + "logits/chosen": -1.1736291646957397, + "logits/rejected": -1.7007135152816772, + "logps/chosen": -2.089254856109619, + "logps/rejected": -2.541189670562744, + "loss": 2.0636, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.892545700073242, + "rewards/margins": 4.519349098205566, + "rewards/rejected": -25.411895751953125, + "step": 15000 + }, + { + "epoch": 0.5057467390205265, + "grad_norm": 79.65396881103516, + "learning_rate": 5.769329007846445e-07, + "logits/chosen": -1.4201363325119019, + "logits/rejected": -1.2374026775360107, + "logps/chosen": -2.4222466945648193, + "logps/rejected": -2.21855092048645, + "loss": 5.2571, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.22246742248535, + "rewards/margins": -2.0369582176208496, + "rewards/rejected": -22.18550682067871, + "step": 15005 + }, + { + "epoch": 0.5059152650915096, + "grad_norm": 36.43207550048828, + "learning_rate": 5.766422555335866e-07, + "logits/chosen": -1.0224153995513916, + "logits/rejected": -1.168966293334961, + "logps/chosen": -2.2806081771850586, + "logps/rejected": -2.5962941646575928, + "loss": 2.3357, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.806079864501953, + "rewards/margins": 3.1568641662597656, + "rewards/rejected": -25.96294593811035, + "step": 15010 + }, + { + "epoch": 0.5060837911624928, + "grad_norm": 137.0742645263672, + "learning_rate": 5.763515837596638e-07, + "logits/chosen": -1.1404750347137451, + "logits/rejected": -1.0680490732192993, + "logps/chosen": -2.4310100078582764, + "logps/rejected": -2.489689350128174, + "loss": 2.8058, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.310100555419922, + "rewards/margins": 0.5867937207221985, + "rewards/rejected": -24.896894454956055, + "step": 15015 + }, + { + "epoch": 0.506252317233476, + "grad_norm": 20.5747127532959, + "learning_rate": 5.760608855634661e-07, + "logits/chosen": -1.1737111806869507, + "logits/rejected": -1.6075645685195923, + "logps/chosen": -1.9366137981414795, + "logps/rejected": -2.1530511379241943, + "loss": 1.5798, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.366138458251953, + "rewards/margins": 2.1643741130828857, + "rewards/rejected": -21.5305118560791, + "step": 15020 + }, + { + "epoch": 0.5064208433044592, + "grad_norm": 34.711265563964844, + "learning_rate": 5.757701610455924e-07, + "logits/chosen": -1.3211795091629028, + "logits/rejected": -1.266367793083191, + "logps/chosen": -1.9475847482681274, + "logps/rejected": -2.24564528465271, + "loss": 1.8382, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.475847244262695, + "rewards/margins": 2.9806065559387207, + "rewards/rejected": -22.45645523071289, + "step": 15025 + }, + { + "epoch": 0.5065893693754424, + "grad_norm": 29.361732482910156, + "learning_rate": 5.754794103066511e-07, + "logits/chosen": -1.6665149927139282, + "logits/rejected": -1.9457658529281616, + "logps/chosen": -2.569488048553467, + "logps/rejected": -2.8468360900878906, + "loss": 1.7092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.69488525390625, + "rewards/margins": 2.7734780311584473, + "rewards/rejected": -28.46836280822754, + "step": 15030 + }, + { + "epoch": 0.5067578954464256, + "grad_norm": 27.143373489379883, + "learning_rate": 5.751886334472598e-07, + "logits/chosen": -0.9758981466293335, + "logits/rejected": -1.1668593883514404, + "logps/chosen": -2.2181034088134766, + "logps/rejected": -2.323904275894165, + "loss": 3.4183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.1810359954834, + "rewards/margins": 1.0580068826675415, + "rewards/rejected": -23.23904037475586, + "step": 15035 + }, + { + "epoch": 0.5069264215174087, + "grad_norm": 35.79897689819336, + "learning_rate": 5.748978305680448e-07, + "logits/chosen": -1.2053627967834473, + "logits/rejected": -1.2655714750289917, + "logps/chosen": -1.6004425287246704, + "logps/rejected": -1.8336282968521118, + "loss": 1.7421, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.004425048828125, + "rewards/margins": 2.3318583965301514, + "rewards/rejected": -18.336284637451172, + "step": 15040 + }, + { + "epoch": 0.5070949475883919, + "grad_norm": 28.32414436340332, + "learning_rate": 5.746070017696415e-07, + "logits/chosen": -1.4866182804107666, + "logits/rejected": -1.7096736431121826, + "logps/chosen": -1.7442781925201416, + "logps/rejected": -1.6984007358551025, + "loss": 3.5889, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.442779541015625, + "rewards/margins": -0.4587737023830414, + "rewards/rejected": -16.984004974365234, + "step": 15045 + }, + { + "epoch": 0.5072634736593751, + "grad_norm": 32.17911911010742, + "learning_rate": 5.743161471526943e-07, + "logits/chosen": -1.323290467262268, + "logits/rejected": -1.168046236038208, + "logps/chosen": -1.8326724767684937, + "logps/rejected": -1.8872158527374268, + "loss": 2.6512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.326725006103516, + "rewards/margins": 0.5454355478286743, + "rewards/rejected": -18.872159957885742, + "step": 15050 + }, + { + "epoch": 0.5074319997303582, + "grad_norm": 36.145240783691406, + "learning_rate": 5.740252668178565e-07, + "logits/chosen": -1.0528347492218018, + "logits/rejected": -1.01715886592865, + "logps/chosen": -2.278759241104126, + "logps/rejected": -2.063829183578491, + "loss": 5.1952, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.78759002685547, + "rewards/margins": -2.149296522140503, + "rewards/rejected": -20.638294219970703, + "step": 15055 + }, + { + "epoch": 0.5076005258013415, + "grad_norm": 21.49053192138672, + "learning_rate": 5.737343608657903e-07, + "logits/chosen": -1.3348206281661987, + "logits/rejected": -1.3797569274902344, + "logps/chosen": -2.138988971710205, + "logps/rejected": -2.2335526943206787, + "loss": 2.9161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.389888763427734, + "rewards/margins": 0.9456375241279602, + "rewards/rejected": -22.335527420043945, + "step": 15060 + }, + { + "epoch": 0.5077690518723247, + "grad_norm": 24.10394859313965, + "learning_rate": 5.734434293971668e-07, + "logits/chosen": -1.3850148916244507, + "logits/rejected": -1.6381429433822632, + "logps/chosen": -1.8205817937850952, + "logps/rejected": -2.170335292816162, + "loss": 2.4044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.2058162689209, + "rewards/margins": 3.497534990310669, + "rewards/rejected": -21.703351974487305, + "step": 15065 + }, + { + "epoch": 0.5079375779433078, + "grad_norm": 103.56597900390625, + "learning_rate": 5.73152472512666e-07, + "logits/chosen": -1.0463709831237793, + "logits/rejected": -1.275679588317871, + "logps/chosen": -2.388948440551758, + "logps/rejected": -2.543187379837036, + "loss": 2.6494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.889484405517578, + "rewards/margins": 1.5423904657363892, + "rewards/rejected": -25.431873321533203, + "step": 15070 + }, + { + "epoch": 0.508106104014291, + "grad_norm": 124.68612670898438, + "learning_rate": 5.728614903129765e-07, + "logits/chosen": -1.4812027215957642, + "logits/rejected": -1.6524875164031982, + "logps/chosen": -2.292492628097534, + "logps/rejected": -2.298862934112549, + "loss": 3.0968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.9249267578125, + "rewards/margins": 0.06370306015014648, + "rewards/rejected": -22.988630294799805, + "step": 15075 + }, + { + "epoch": 0.5082746300852742, + "grad_norm": 26.963624954223633, + "learning_rate": 5.725704828987959e-07, + "logits/chosen": -1.2273039817810059, + "logits/rejected": -1.193224549293518, + "logps/chosen": -2.1760504245758057, + "logps/rejected": -2.2250120639801025, + "loss": 3.1869, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.7605037689209, + "rewards/margins": 0.48961561918258667, + "rewards/rejected": -22.250120162963867, + "step": 15080 + }, + { + "epoch": 0.5084431561562573, + "grad_norm": 27.633817672729492, + "learning_rate": 5.722794503708303e-07, + "logits/chosen": -1.3985062837600708, + "logits/rejected": -1.6811736822128296, + "logps/chosen": -2.1464569568634033, + "logps/rejected": -2.389500379562378, + "loss": 2.1451, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.464569091796875, + "rewards/margins": 2.4304375648498535, + "rewards/rejected": -23.89500617980957, + "step": 15085 + }, + { + "epoch": 0.5086116822272405, + "grad_norm": 30.096017837524414, + "learning_rate": 5.719883928297946e-07, + "logits/chosen": -1.3296657800674438, + "logits/rejected": -1.5122735500335693, + "logps/chosen": -2.052288293838501, + "logps/rejected": -2.173229932785034, + "loss": 2.4221, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.522884368896484, + "rewards/margins": 1.2094166278839111, + "rewards/rejected": -21.7322998046875, + "step": 15090 + }, + { + "epoch": 0.5087802082982238, + "grad_norm": 38.03456115722656, + "learning_rate": 5.716973103764123e-07, + "logits/chosen": -1.9371150732040405, + "logits/rejected": -2.0468668937683105, + "logps/chosen": -2.0087790489196777, + "logps/rejected": -2.5527255535125732, + "loss": 2.3194, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.08778953552246, + "rewards/margins": 5.439466953277588, + "rewards/rejected": -25.52725601196289, + "step": 15095 + }, + { + "epoch": 0.508948734369207, + "grad_norm": 20.301576614379883, + "learning_rate": 5.714062031114159e-07, + "logits/chosen": -1.9820003509521484, + "logits/rejected": -2.280233383178711, + "logps/chosen": -1.8788375854492188, + "logps/rejected": -2.440950870513916, + "loss": 1.7261, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.788375854492188, + "rewards/margins": 5.621129035949707, + "rewards/rejected": -24.409503936767578, + "step": 15100 + }, + { + "epoch": 0.5091172604401901, + "grad_norm": 124.19692993164062, + "learning_rate": 5.711150711355456e-07, + "logits/chosen": -1.373255968093872, + "logits/rejected": -1.834007978439331, + "logps/chosen": -2.469758987426758, + "logps/rejected": -2.4483742713928223, + "loss": 4.5018, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.697589874267578, + "rewards/margins": -0.2138504981994629, + "rewards/rejected": -24.483741760253906, + "step": 15105 + }, + { + "epoch": 0.5092857865111733, + "grad_norm": 27.15669059753418, + "learning_rate": 5.70823914549551e-07, + "logits/chosen": -1.1241658926010132, + "logits/rejected": -1.5976471900939941, + "logps/chosen": -2.844392776489258, + "logps/rejected": -3.51300311088562, + "loss": 1.8059, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.443927764892578, + "rewards/margins": 6.686103820800781, + "rewards/rejected": -35.130027770996094, + "step": 15110 + }, + { + "epoch": 0.5094543125821565, + "grad_norm": 34.46886444091797, + "learning_rate": 5.705327334541901e-07, + "logits/chosen": -1.1911742687225342, + "logits/rejected": -1.2832351922988892, + "logps/chosen": -1.9296703338623047, + "logps/rejected": -2.128652811050415, + "loss": 2.3969, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.296703338623047, + "rewards/margins": 1.9898231029510498, + "rewards/rejected": -21.28652572631836, + "step": 15115 + }, + { + "epoch": 0.5096228386531396, + "grad_norm": 19.99987030029297, + "learning_rate": 5.702415279502289e-07, + "logits/chosen": -1.6466939449310303, + "logits/rejected": -1.8893673419952393, + "logps/chosen": -2.2943835258483887, + "logps/rejected": -2.482844829559326, + "loss": 2.9223, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.94383430480957, + "rewards/margins": 1.8846137523651123, + "rewards/rejected": -24.828449249267578, + "step": 15120 + }, + { + "epoch": 0.5097913647241228, + "grad_norm": 12.42703914642334, + "learning_rate": 5.699502981384424e-07, + "logits/chosen": -1.3233040571212769, + "logits/rejected": -1.3392280340194702, + "logps/chosen": -2.143939971923828, + "logps/rejected": -2.0123233795166016, + "loss": 6.2954, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.43939781188965, + "rewards/margins": -1.3161662817001343, + "rewards/rejected": -20.123231887817383, + "step": 15125 + }, + { + "epoch": 0.509959890795106, + "grad_norm": 37.99794006347656, + "learning_rate": 5.696590441196137e-07, + "logits/chosen": -1.403928518295288, + "logits/rejected": -1.2134307622909546, + "logps/chosen": -2.2327723503112793, + "logps/rejected": -2.414923906326294, + "loss": 3.2015, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.32772445678711, + "rewards/margins": 1.8215110301971436, + "rewards/rejected": -24.14923667907715, + "step": 15130 + }, + { + "epoch": 0.5101284168660892, + "grad_norm": 32.46255111694336, + "learning_rate": 5.693677659945342e-07, + "logits/chosen": -1.5124094486236572, + "logits/rejected": -1.5291669368743896, + "logps/chosen": -1.6440776586532593, + "logps/rejected": -1.6526644229888916, + "loss": 3.0941, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.440776824951172, + "rewards/margins": 0.08586740493774414, + "rewards/rejected": -16.526643753051758, + "step": 15135 + }, + { + "epoch": 0.5102969429370724, + "grad_norm": 26.882415771484375, + "learning_rate": 5.690764638640037e-07, + "logits/chosen": -1.208224892616272, + "logits/rejected": -1.1025320291519165, + "logps/chosen": -2.1468870639801025, + "logps/rejected": -2.654031276702881, + "loss": 1.5857, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.468868255615234, + "rewards/margins": 5.071444034576416, + "rewards/rejected": -26.540313720703125, + "step": 15140 + }, + { + "epoch": 0.5104654690080556, + "grad_norm": 33.146541595458984, + "learning_rate": 5.687851378288309e-07, + "logits/chosen": -1.5718231201171875, + "logits/rejected": -1.6999132633209229, + "logps/chosen": -2.2277894020080566, + "logps/rejected": -2.578054666519165, + "loss": 3.0773, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.277894973754883, + "rewards/margins": 3.502652406692505, + "rewards/rejected": -25.78054428100586, + "step": 15145 + }, + { + "epoch": 0.5106339950790387, + "grad_norm": 25.321151733398438, + "learning_rate": 5.684937879898316e-07, + "logits/chosen": -0.8661189079284668, + "logits/rejected": -0.9531176686286926, + "logps/chosen": -2.0083203315734863, + "logps/rejected": -2.0512888431549072, + "loss": 3.563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.083200454711914, + "rewards/margins": 0.42968788743019104, + "rewards/rejected": -20.512887954711914, + "step": 15150 + }, + { + "epoch": 0.5108025211500219, + "grad_norm": 71.89868927001953, + "learning_rate": 5.68202414447831e-07, + "logits/chosen": -1.2475855350494385, + "logits/rejected": -1.1813002824783325, + "logps/chosen": -2.005934238433838, + "logps/rejected": -2.3727059364318848, + "loss": 2.8826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.059343338012695, + "rewards/margins": 3.6677181720733643, + "rewards/rejected": -23.727060317993164, + "step": 15155 + }, + { + "epoch": 0.5109710472210051, + "grad_norm": 41.4750862121582, + "learning_rate": 5.679110173036619e-07, + "logits/chosen": -1.2968660593032837, + "logits/rejected": -1.4929004907608032, + "logps/chosen": -2.0121240615844727, + "logps/rejected": -2.3801047801971436, + "loss": 2.112, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.121240615844727, + "rewards/margins": 3.6798081398010254, + "rewards/rejected": -23.801050186157227, + "step": 15160 + }, + { + "epoch": 0.5111395732919882, + "grad_norm": 24.9608211517334, + "learning_rate": 5.67619596658165e-07, + "logits/chosen": -1.766427993774414, + "logits/rejected": -2.0109450817108154, + "logps/chosen": -2.5077290534973145, + "logps/rejected": -2.861133337020874, + "loss": 3.4502, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.07729148864746, + "rewards/margins": 3.5340423583984375, + "rewards/rejected": -28.6113338470459, + "step": 15165 + }, + { + "epoch": 0.5113080993629715, + "grad_norm": 99.52765655517578, + "learning_rate": 5.673281526121901e-07, + "logits/chosen": -1.3471081256866455, + "logits/rejected": -1.2552907466888428, + "logps/chosen": -2.918884754180908, + "logps/rejected": -2.788374662399292, + "loss": 4.8588, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.1888484954834, + "rewards/margins": -1.3051000833511353, + "rewards/rejected": -27.883747100830078, + "step": 15170 + }, + { + "epoch": 0.5114766254339547, + "grad_norm": 19.62303352355957, + "learning_rate": 5.670366852665941e-07, + "logits/chosen": -1.0323598384857178, + "logits/rejected": -1.5572612285614014, + "logps/chosen": -2.121976852416992, + "logps/rejected": -2.498755693435669, + "loss": 2.5533, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.219768524169922, + "rewards/margins": 3.7677853107452393, + "rewards/rejected": -24.9875545501709, + "step": 15175 + }, + { + "epoch": 0.5116451515049378, + "grad_norm": 14.84547233581543, + "learning_rate": 5.667451947222424e-07, + "logits/chosen": -1.4115025997161865, + "logits/rejected": -1.6721994876861572, + "logps/chosen": -2.5753791332244873, + "logps/rejected": -2.889843463897705, + "loss": 3.2453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.7537899017334, + "rewards/margins": 3.144641876220703, + "rewards/rejected": -28.8984317779541, + "step": 15180 + }, + { + "epoch": 0.511813677575921, + "grad_norm": 98.96126556396484, + "learning_rate": 5.664536810800086e-07, + "logits/chosen": -0.8689087629318237, + "logits/rejected": -1.1629440784454346, + "logps/chosen": -3.3273167610168457, + "logps/rejected": -3.6868317127227783, + "loss": 3.3101, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.273170471191406, + "rewards/margins": 3.5951449871063232, + "rewards/rejected": -36.86831283569336, + "step": 15185 + }, + { + "epoch": 0.5119822036469042, + "grad_norm": 13.844437599182129, + "learning_rate": 5.661621444407738e-07, + "logits/chosen": -1.2056928873062134, + "logits/rejected": -1.2074638605117798, + "logps/chosen": -2.4977755546569824, + "logps/rejected": -2.657663106918335, + "loss": 2.1132, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.97775650024414, + "rewards/margins": 1.598876953125, + "rewards/rejected": -26.576629638671875, + "step": 15190 + }, + { + "epoch": 0.5121507297178873, + "grad_norm": 21.894990921020508, + "learning_rate": 5.658705849054276e-07, + "logits/chosen": -0.9881241917610168, + "logits/rejected": -1.1200683116912842, + "logps/chosen": -1.9984909296035767, + "logps/rejected": -2.072193145751953, + "loss": 3.4589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.984909057617188, + "rewards/margins": 0.7370238304138184, + "rewards/rejected": -20.72193145751953, + "step": 15195 + }, + { + "epoch": 0.5123192557888705, + "grad_norm": 128.0411834716797, + "learning_rate": 5.655790025748672e-07, + "logits/chosen": -1.3919637203216553, + "logits/rejected": -1.5302560329437256, + "logps/chosen": -2.3977060317993164, + "logps/rejected": -2.2407610416412354, + "loss": 4.9446, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.977062225341797, + "rewards/margins": -1.5694499015808105, + "rewards/rejected": -22.407611846923828, + "step": 15200 + }, + { + "epoch": 0.5123192557888705, + "eval_logits/chosen": -1.7760441303253174, + "eval_logits/rejected": -1.9057296514511108, + "eval_logps/chosen": -2.0414392948150635, + "eval_logps/rejected": -2.1566874980926514, + "eval_loss": 2.990462064743042, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -20.414392471313477, + "eval_rewards/margins": 1.1524845361709595, + "eval_rewards/rejected": -21.566875457763672, + "eval_runtime": 12.9128, + "eval_samples_per_second": 7.744, + "eval_steps_per_second": 1.936, + "step": 15200 + }, + { + "epoch": 0.5124877818598538, + "grad_norm": 104.97480010986328, + "learning_rate": 5.652873975499977e-07, + "logits/chosen": -1.511674165725708, + "logits/rejected": -1.7704378366470337, + "logps/chosen": -2.450023889541626, + "logps/rejected": -2.663987159729004, + "loss": 2.8018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.5002384185791, + "rewards/margins": 2.1396327018737793, + "rewards/rejected": -26.63987159729004, + "step": 15205 + }, + { + "epoch": 0.5126563079308369, + "grad_norm": 3.2481727600097656, + "learning_rate": 5.649957699317319e-07, + "logits/chosen": -1.3668832778930664, + "logits/rejected": -1.6022402048110962, + "logps/chosen": -2.3622775077819824, + "logps/rejected": -3.1148085594177246, + "loss": 1.7853, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.62277603149414, + "rewards/margins": 7.525309085845947, + "rewards/rejected": -31.148086547851562, + "step": 15210 + }, + { + "epoch": 0.5128248340018201, + "grad_norm": 41.21067428588867, + "learning_rate": 5.647041198209912e-07, + "logits/chosen": -1.402848243713379, + "logits/rejected": -1.4903753995895386, + "logps/chosen": -2.3762662410736084, + "logps/rejected": -2.614976167678833, + "loss": 2.2179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.762662887573242, + "rewards/margins": 2.387101173400879, + "rewards/rejected": -26.149761199951172, + "step": 15215 + }, + { + "epoch": 0.5129933600728033, + "grad_norm": 18.946138381958008, + "learning_rate": 5.644124473187038e-07, + "logits/chosen": -1.4866359233856201, + "logits/rejected": -1.8214614391326904, + "logps/chosen": -2.311133861541748, + "logps/rejected": -2.6970841884613037, + "loss": 2.1406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.111337661743164, + "rewards/margins": 3.8595046997070312, + "rewards/rejected": -26.970844268798828, + "step": 15220 + }, + { + "epoch": 0.5131618861437864, + "grad_norm": 20.410324096679688, + "learning_rate": 5.641207525258059e-07, + "logits/chosen": -1.355177879333496, + "logits/rejected": -1.4015527963638306, + "logps/chosen": -1.7830779552459717, + "logps/rejected": -1.906141996383667, + "loss": 2.4046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.830781936645508, + "rewards/margins": 1.2306405305862427, + "rewards/rejected": -19.061420440673828, + "step": 15225 + }, + { + "epoch": 0.5133304122147696, + "grad_norm": 21.630565643310547, + "learning_rate": 5.63829035543242e-07, + "logits/chosen": -1.3447643518447876, + "logits/rejected": -1.690081000328064, + "logps/chosen": -2.441915988922119, + "logps/rejected": -2.865563154220581, + "loss": 2.5523, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.41915512084961, + "rewards/margins": 4.236475944519043, + "rewards/rejected": -28.6556339263916, + "step": 15230 + }, + { + "epoch": 0.5134989382857528, + "grad_norm": 33.107730865478516, + "learning_rate": 5.635372964719635e-07, + "logits/chosen": -1.1950870752334595, + "logits/rejected": -1.6577335596084595, + "logps/chosen": -1.9552379846572876, + "logps/rejected": -2.3201534748077393, + "loss": 2.7634, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.552379608154297, + "rewards/margins": 3.6491541862487793, + "rewards/rejected": -23.201534271240234, + "step": 15235 + }, + { + "epoch": 0.5136674643567359, + "grad_norm": 17.149858474731445, + "learning_rate": 5.632455354129302e-07, + "logits/chosen": -1.1954476833343506, + "logits/rejected": -1.427310585975647, + "logps/chosen": -2.1515591144561768, + "logps/rejected": -2.411625385284424, + "loss": 2.4254, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.51559066772461, + "rewards/margins": 2.6006622314453125, + "rewards/rejected": -24.116252899169922, + "step": 15240 + }, + { + "epoch": 0.5138359904277192, + "grad_norm": 62.84499740600586, + "learning_rate": 5.629537524671086e-07, + "logits/chosen": -1.7799403667449951, + "logits/rejected": -1.6227846145629883, + "logps/chosen": -1.9937528371810913, + "logps/rejected": -1.9934908151626587, + "loss": 3.6365, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.937528610229492, + "rewards/margins": -0.0026217461563646793, + "rewards/rejected": -19.934907913208008, + "step": 15245 + }, + { + "epoch": 0.5140045164987024, + "grad_norm": 27.73377799987793, + "learning_rate": 5.626619477354738e-07, + "logits/chosen": -1.7326381206512451, + "logits/rejected": -1.859344482421875, + "logps/chosen": -2.5812244415283203, + "logps/rejected": -2.9161319732666016, + "loss": 2.1002, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.812240600585938, + "rewards/margins": 3.349076509475708, + "rewards/rejected": -29.16132164001465, + "step": 15250 + }, + { + "epoch": 0.5141730425696855, + "grad_norm": 23.284900665283203, + "learning_rate": 5.623701213190075e-07, + "logits/chosen": -1.6898269653320312, + "logits/rejected": -1.8181917667388916, + "logps/chosen": -2.469947576522827, + "logps/rejected": -2.9490766525268555, + "loss": 1.9946, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.699474334716797, + "rewards/margins": 4.7912917137146, + "rewards/rejected": -29.490764617919922, + "step": 15255 + }, + { + "epoch": 0.5143415686406687, + "grad_norm": 20.285890579223633, + "learning_rate": 5.620782733186995e-07, + "logits/chosen": -1.0866138935089111, + "logits/rejected": -1.1041343212127686, + "logps/chosen": -2.3508377075195312, + "logps/rejected": -2.452939510345459, + "loss": 2.8995, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.50837516784668, + "rewards/margins": 1.021020531654358, + "rewards/rejected": -24.529394149780273, + "step": 15260 + }, + { + "epoch": 0.5145100947116519, + "grad_norm": 31.823381423950195, + "learning_rate": 5.617864038355469e-07, + "logits/chosen": -1.625353455543518, + "logits/rejected": -1.5252519845962524, + "logps/chosen": -2.1418838500976562, + "logps/rejected": -2.5077455043792725, + "loss": 2.7457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.418838500976562, + "rewards/margins": 3.658615827560425, + "rewards/rejected": -25.07745361328125, + "step": 15265 + }, + { + "epoch": 0.514678620782635, + "grad_norm": 34.04703903198242, + "learning_rate": 5.614945129705543e-07, + "logits/chosen": -1.2527238130569458, + "logits/rejected": -1.4769929647445679, + "logps/chosen": -2.013667106628418, + "logps/rejected": -2.987143039703369, + "loss": 2.7115, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.136669158935547, + "rewards/margins": 9.734761238098145, + "rewards/rejected": -29.871429443359375, + "step": 15270 + }, + { + "epoch": 0.5148471468536182, + "grad_norm": 28.078725814819336, + "learning_rate": 5.612026008247336e-07, + "logits/chosen": -2.0508275032043457, + "logits/rejected": -1.9603971242904663, + "logps/chosen": -2.383984327316284, + "logps/rejected": -2.6936373710632324, + "loss": 2.3759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.839847564697266, + "rewards/margins": 3.096529006958008, + "rewards/rejected": -26.93637466430664, + "step": 15275 + }, + { + "epoch": 0.5150156729246015, + "grad_norm": 28.85657501220703, + "learning_rate": 5.609106674991038e-07, + "logits/chosen": -1.2822644710540771, + "logits/rejected": -1.3018858432769775, + "logps/chosen": -2.2119603157043457, + "logps/rejected": -2.025120973587036, + "loss": 5.0357, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.11960220336914, + "rewards/margins": -1.8683927059173584, + "rewards/rejected": -20.251211166381836, + "step": 15280 + }, + { + "epoch": 0.5151841989955847, + "grad_norm": 75.3193588256836, + "learning_rate": 5.606187130946921e-07, + "logits/chosen": -1.9307209253311157, + "logits/rejected": -1.889897108078003, + "logps/chosen": -2.0299391746520996, + "logps/rejected": -2.8823294639587402, + "loss": 2.1457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.29939079284668, + "rewards/margins": 8.523903846740723, + "rewards/rejected": -28.823293685913086, + "step": 15285 + }, + { + "epoch": 0.5153527250665678, + "grad_norm": 21.13232421875, + "learning_rate": 5.603267377125319e-07, + "logits/chosen": -1.5366874933242798, + "logits/rejected": -1.7862281799316406, + "logps/chosen": -2.5976200103759766, + "logps/rejected": -3.0689826011657715, + "loss": 1.3576, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.976200103759766, + "rewards/margins": 4.713629245758057, + "rewards/rejected": -30.689828872680664, + "step": 15290 + }, + { + "epoch": 0.515521251137551, + "grad_norm": 23.98374366760254, + "learning_rate": 5.600347414536645e-07, + "logits/chosen": -1.440609335899353, + "logits/rejected": -1.6278479099273682, + "logps/chosen": -2.1875216960906982, + "logps/rejected": -2.355437755584717, + "loss": 2.2062, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.87521743774414, + "rewards/margins": 1.6791574954986572, + "rewards/rejected": -23.55437469482422, + "step": 15295 + }, + { + "epoch": 0.5156897772085342, + "grad_norm": 27.256919860839844, + "learning_rate": 5.597427244191385e-07, + "logits/chosen": -1.8316819667816162, + "logits/rejected": -1.9437427520751953, + "logps/chosen": -1.7571513652801514, + "logps/rejected": -2.0879530906677246, + "loss": 2.5299, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.571514129638672, + "rewards/margins": 3.308016300201416, + "rewards/rejected": -20.879528045654297, + "step": 15300 + }, + { + "epoch": 0.5158583032795173, + "grad_norm": 22.483091354370117, + "learning_rate": 5.594506867100092e-07, + "logits/chosen": -1.382283329963684, + "logits/rejected": -1.304034948348999, + "logps/chosen": -2.455326557159424, + "logps/rejected": -2.544750213623047, + "loss": 3.8275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.553264617919922, + "rewards/margins": 0.894239068031311, + "rewards/rejected": -25.4475040435791, + "step": 15305 + }, + { + "epoch": 0.5160268293505005, + "grad_norm": 29.94123077392578, + "learning_rate": 5.591586284273396e-07, + "logits/chosen": -1.4842712879180908, + "logits/rejected": -1.387634038925171, + "logps/chosen": -1.6527531147003174, + "logps/rejected": -1.783780813217163, + "loss": 1.9562, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.527530670166016, + "rewards/margins": 1.3102777004241943, + "rewards/rejected": -17.837810516357422, + "step": 15310 + }, + { + "epoch": 0.5161953554214836, + "grad_norm": 32.287227630615234, + "learning_rate": 5.588665496721994e-07, + "logits/chosen": -1.6405065059661865, + "logits/rejected": -2.1743381023406982, + "logps/chosen": -2.823437213897705, + "logps/rejected": -3.476367950439453, + "loss": 1.34, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -28.234371185302734, + "rewards/margins": 6.529305934906006, + "rewards/rejected": -34.76367950439453, + "step": 15315 + }, + { + "epoch": 0.5163638814924669, + "grad_norm": 29.671825408935547, + "learning_rate": 5.585744505456656e-07, + "logits/chosen": -1.4803146123886108, + "logits/rejected": -1.6037018299102783, + "logps/chosen": -1.7614666223526, + "logps/rejected": -2.1801557540893555, + "loss": 1.7482, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.614665985107422, + "rewards/margins": 4.186891555786133, + "rewards/rejected": -21.801555633544922, + "step": 15320 + }, + { + "epoch": 0.5165324075634501, + "grad_norm": 32.12974548339844, + "learning_rate": 5.582823311488222e-07, + "logits/chosen": -1.0687305927276611, + "logits/rejected": -1.4367082118988037, + "logps/chosen": -2.857874631881714, + "logps/rejected": -2.8543312549591064, + "loss": 3.6711, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.578746795654297, + "rewards/margins": -0.03543538972735405, + "rewards/rejected": -28.543310165405273, + "step": 15325 + }, + { + "epoch": 0.5167009336344333, + "grad_norm": 56.81086730957031, + "learning_rate": 5.579901915827601e-07, + "logits/chosen": -1.816873550415039, + "logits/rejected": -1.7994168996810913, + "logps/chosen": -2.356872320175171, + "logps/rejected": -2.617316722869873, + "loss": 2.9243, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.5687255859375, + "rewards/margins": 2.604442596435547, + "rewards/rejected": -26.173168182373047, + "step": 15330 + }, + { + "epoch": 0.5168694597054164, + "grad_norm": 27.684396743774414, + "learning_rate": 5.576980319485777e-07, + "logits/chosen": -1.2684608697891235, + "logits/rejected": -1.4406415224075317, + "logps/chosen": -2.101574420928955, + "logps/rejected": -2.1958396434783936, + "loss": 2.8061, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.015743255615234, + "rewards/margins": 0.9426544308662415, + "rewards/rejected": -21.958398818969727, + "step": 15335 + }, + { + "epoch": 0.5170379857763996, + "grad_norm": 53.1370735168457, + "learning_rate": 5.574058523473794e-07, + "logits/chosen": -1.0984376668930054, + "logits/rejected": -1.0726025104522705, + "logps/chosen": -2.186927318572998, + "logps/rejected": -2.5625603199005127, + "loss": 2.5126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.869272232055664, + "rewards/margins": 3.7563300132751465, + "rewards/rejected": -25.6256046295166, + "step": 15340 + }, + { + "epoch": 0.5172065118473828, + "grad_norm": 22.909120559692383, + "learning_rate": 5.571136528802775e-07, + "logits/chosen": -1.4302793741226196, + "logits/rejected": -1.3703594207763672, + "logps/chosen": -1.9710960388183594, + "logps/rejected": -1.9895975589752197, + "loss": 3.0761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.710962295532227, + "rewards/margins": 0.18501415848731995, + "rewards/rejected": -19.89597511291504, + "step": 15345 + }, + { + "epoch": 0.5173750379183659, + "grad_norm": 28.171419143676758, + "learning_rate": 5.568214336483904e-07, + "logits/chosen": -1.289721131324768, + "logits/rejected": -1.4089850187301636, + "logps/chosen": -1.9875361919403076, + "logps/rejected": -2.204806089401245, + "loss": 2.5122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.875362396240234, + "rewards/margins": 2.172700881958008, + "rewards/rejected": -22.048063278198242, + "step": 15350 + }, + { + "epoch": 0.5175435639893492, + "grad_norm": 21.062044143676758, + "learning_rate": 5.56529194752844e-07, + "logits/chosen": -0.7967745065689087, + "logits/rejected": -1.043370246887207, + "logps/chosen": -2.4613327980041504, + "logps/rejected": -2.6952717304229736, + "loss": 2.6294, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.613330841064453, + "rewards/margins": 2.3393893241882324, + "rewards/rejected": -26.95271873474121, + "step": 15355 + }, + { + "epoch": 0.5177120900603324, + "grad_norm": 88.55207824707031, + "learning_rate": 5.562369362947703e-07, + "logits/chosen": -1.3059293031692505, + "logits/rejected": -1.3453352451324463, + "logps/chosen": -2.1375696659088135, + "logps/rejected": -2.015935182571411, + "loss": 4.2888, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.37569808959961, + "rewards/margins": -1.216347336769104, + "rewards/rejected": -20.15934944152832, + "step": 15360 + }, + { + "epoch": 0.5178806161313155, + "grad_norm": 86.71758270263672, + "learning_rate": 5.559446583753086e-07, + "logits/chosen": -1.2455203533172607, + "logits/rejected": -1.2034311294555664, + "logps/chosen": -2.1374192237854004, + "logps/rejected": -2.324183464050293, + "loss": 3.5776, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.374195098876953, + "rewards/margins": 1.8676401376724243, + "rewards/rejected": -23.241836547851562, + "step": 15365 + }, + { + "epoch": 0.5180491422022987, + "grad_norm": 34.24867248535156, + "learning_rate": 5.556523610956047e-07, + "logits/chosen": -1.469974160194397, + "logits/rejected": -1.6809148788452148, + "logps/chosen": -2.598238468170166, + "logps/rejected": -2.6210434436798096, + "loss": 3.7393, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.982385635375977, + "rewards/margins": 0.22805070877075195, + "rewards/rejected": -26.210433959960938, + "step": 15370 + }, + { + "epoch": 0.5182176682732819, + "grad_norm": 36.28508758544922, + "learning_rate": 5.553600445568113e-07, + "logits/chosen": -1.4834458827972412, + "logits/rejected": -1.4371143579483032, + "logps/chosen": -2.5968239307403564, + "logps/rejected": -2.711268901824951, + "loss": 4.7324, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.968236923217773, + "rewards/margins": 1.1444545984268188, + "rewards/rejected": -27.11269187927246, + "step": 15375 + }, + { + "epoch": 0.518386194344265, + "grad_norm": 17.586444854736328, + "learning_rate": 5.550677088600876e-07, + "logits/chosen": -1.2126684188842773, + "logits/rejected": -1.5736459493637085, + "logps/chosen": -2.134211301803589, + "logps/rejected": -2.6339869499206543, + "loss": 2.5797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.342113494873047, + "rewards/margins": 4.997755527496338, + "rewards/rejected": -26.339868545532227, + "step": 15380 + }, + { + "epoch": 0.5185547204152482, + "grad_norm": 22.281524658203125, + "learning_rate": 5.547753541065993e-07, + "logits/chosen": -1.3549929857254028, + "logits/rejected": -1.396969199180603, + "logps/chosen": -1.6693958044052124, + "logps/rejected": -1.815840482711792, + "loss": 2.511, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.693960189819336, + "rewards/margins": 1.4644463062286377, + "rewards/rejected": -18.158405303955078, + "step": 15385 + }, + { + "epoch": 0.5187232464862315, + "grad_norm": 34.48127746582031, + "learning_rate": 5.544829803975193e-07, + "logits/chosen": -1.3881969451904297, + "logits/rejected": -1.4901460409164429, + "logps/chosen": -2.1605522632598877, + "logps/rejected": -2.43407940864563, + "loss": 2.5932, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.60552406311035, + "rewards/margins": 2.735269784927368, + "rewards/rejected": -24.34079360961914, + "step": 15390 + }, + { + "epoch": 0.5188917725572146, + "grad_norm": 63.25047302246094, + "learning_rate": 5.541905878340261e-07, + "logits/chosen": -0.8809518814086914, + "logits/rejected": -1.2939692735671997, + "logps/chosen": -2.41929292678833, + "logps/rejected": -2.7904906272888184, + "loss": 1.4925, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.192928314208984, + "rewards/margins": 3.7119758129119873, + "rewards/rejected": -27.904903411865234, + "step": 15395 + }, + { + "epoch": 0.5190602986281978, + "grad_norm": 86.68781280517578, + "learning_rate": 5.538981765173055e-07, + "logits/chosen": -1.6341667175292969, + "logits/rejected": -1.7704432010650635, + "logps/chosen": -2.3183207511901855, + "logps/rejected": -2.383631944656372, + "loss": 3.7675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.183204650878906, + "rewards/margins": 0.6531141400337219, + "rewards/rejected": -23.836318969726562, + "step": 15400 + }, + { + "epoch": 0.519228824699181, + "grad_norm": 29.288299560546875, + "learning_rate": 5.536057465485495e-07, + "logits/chosen": -1.5594924688339233, + "logits/rejected": -1.3881484270095825, + "logps/chosen": -1.7050707340240479, + "logps/rejected": -1.7124258279800415, + "loss": 3.1131, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.050708770751953, + "rewards/margins": 0.0735509842634201, + "rewards/rejected": -17.124258041381836, + "step": 15405 + }, + { + "epoch": 0.5193973507701641, + "grad_norm": 59.807960510253906, + "learning_rate": 5.533132980289567e-07, + "logits/chosen": -1.6147937774658203, + "logits/rejected": -1.9224376678466797, + "logps/chosen": -2.599799633026123, + "logps/rejected": -2.6466755867004395, + "loss": 4.5049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.997997283935547, + "rewards/margins": 0.46876010298728943, + "rewards/rejected": -26.46675682067871, + "step": 15410 + }, + { + "epoch": 0.5195658768411473, + "grad_norm": 25.46161460876465, + "learning_rate": 5.530208310597318e-07, + "logits/chosen": -1.5502841472625732, + "logits/rejected": -1.5365631580352783, + "logps/chosen": -3.0772483348846436, + "logps/rejected": -3.437613010406494, + "loss": 3.7961, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.77248191833496, + "rewards/margins": 3.6036434173583984, + "rewards/rejected": -34.37612533569336, + "step": 15415 + }, + { + "epoch": 0.5197344029121305, + "grad_norm": 105.07298278808594, + "learning_rate": 5.527283457420862e-07, + "logits/chosen": -1.4746211767196655, + "logits/rejected": -1.7897018194198608, + "logps/chosen": -2.3653385639190674, + "logps/rejected": -2.210273027420044, + "loss": 5.108, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.653385162353516, + "rewards/margins": -1.5506563186645508, + "rewards/rejected": -22.10272789001465, + "step": 15420 + }, + { + "epoch": 0.5199029289831136, + "grad_norm": 31.118650436401367, + "learning_rate": 5.524358421772377e-07, + "logits/chosen": -0.7948622703552246, + "logits/rejected": -0.8744922876358032, + "logps/chosen": -2.0413529872894287, + "logps/rejected": -2.088682174682617, + "loss": 2.8332, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.413528442382812, + "rewards/margins": 0.47329291701316833, + "rewards/rejected": -20.886821746826172, + "step": 15425 + }, + { + "epoch": 0.5200714550540969, + "grad_norm": 17.658946990966797, + "learning_rate": 5.521433204664101e-07, + "logits/chosen": -1.8911575078964233, + "logits/rejected": -2.300468683242798, + "logps/chosen": -1.8099273443222046, + "logps/rejected": -2.1652398109436035, + "loss": 2.6423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.099271774291992, + "rewards/margins": 3.553126573562622, + "rewards/rejected": -21.65239906311035, + "step": 15430 + }, + { + "epoch": 0.5202399811250801, + "grad_norm": 32.41124725341797, + "learning_rate": 5.518507807108335e-07, + "logits/chosen": -1.2449058294296265, + "logits/rejected": -1.6564595699310303, + "logps/chosen": -2.1771650314331055, + "logps/rejected": -3.255115509033203, + "loss": 1.5736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.771650314331055, + "rewards/margins": 10.779507637023926, + "rewards/rejected": -32.55115509033203, + "step": 15435 + }, + { + "epoch": 0.5204085071960632, + "grad_norm": 9.971014022827148, + "learning_rate": 5.515582230117448e-07, + "logits/chosen": -1.3065838813781738, + "logits/rejected": -1.6391983032226562, + "logps/chosen": -1.7685725688934326, + "logps/rejected": -2.0779318809509277, + "loss": 1.3878, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.685726165771484, + "rewards/margins": 3.09359073638916, + "rewards/rejected": -20.779315948486328, + "step": 15440 + }, + { + "epoch": 0.5205770332670464, + "grad_norm": 16.967708587646484, + "learning_rate": 5.512656474703861e-07, + "logits/chosen": -1.1333194971084595, + "logits/rejected": -1.6499792337417603, + "logps/chosen": -2.0808238983154297, + "logps/rejected": -2.2162888050079346, + "loss": 2.6948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.808238983154297, + "rewards/margins": 1.3546478748321533, + "rewards/rejected": -22.16288948059082, + "step": 15445 + }, + { + "epoch": 0.5207455593380296, + "grad_norm": 22.480432510375977, + "learning_rate": 5.509730541880068e-07, + "logits/chosen": -1.1527360677719116, + "logits/rejected": -1.5316417217254639, + "logps/chosen": -2.3182454109191895, + "logps/rejected": -2.688753366470337, + "loss": 2.913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.18245506286621, + "rewards/margins": 3.7050795555114746, + "rewards/rejected": -26.88753318786621, + "step": 15450 + }, + { + "epoch": 0.5209140854090127, + "grad_norm": 41.222991943359375, + "learning_rate": 5.506804432658615e-07, + "logits/chosen": -1.7212040424346924, + "logits/rejected": -1.7893693447113037, + "logps/chosen": -1.9451452493667603, + "logps/rejected": -2.0703797340393066, + "loss": 3.169, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.451452255249023, + "rewards/margins": 1.252342939376831, + "rewards/rejected": -20.70379638671875, + "step": 15455 + }, + { + "epoch": 0.5210826114799959, + "grad_norm": 20.651769638061523, + "learning_rate": 5.503878148052118e-07, + "logits/chosen": -1.4760338068008423, + "logits/rejected": -1.548568844795227, + "logps/chosen": -2.059722423553467, + "logps/rejected": -2.5742545127868652, + "loss": 3.1436, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.597225189208984, + "rewards/margins": 5.145321846008301, + "rewards/rejected": -25.7425479888916, + "step": 15460 + }, + { + "epoch": 0.5212511375509792, + "grad_norm": 40.60196304321289, + "learning_rate": 5.500951689073244e-07, + "logits/chosen": -0.966766357421875, + "logits/rejected": -1.1678035259246826, + "logps/chosen": -2.260707139968872, + "logps/rejected": -2.3044838905334473, + "loss": 3.4371, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.607070922851562, + "rewards/margins": 0.4377668499946594, + "rewards/rejected": -23.044836044311523, + "step": 15465 + }, + { + "epoch": 0.5214196636219623, + "grad_norm": 36.945743560791016, + "learning_rate": 5.498025056734727e-07, + "logits/chosen": -0.9160255193710327, + "logits/rejected": -1.3329023122787476, + "logps/chosen": -2.5447611808776855, + "logps/rejected": -2.7440483570098877, + "loss": 5.0179, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.447612762451172, + "rewards/margins": 1.9928712844848633, + "rewards/rejected": -27.44048500061035, + "step": 15470 + }, + { + "epoch": 0.5215881896929455, + "grad_norm": 20.789392471313477, + "learning_rate": 5.49509825204936e-07, + "logits/chosen": -1.9368362426757812, + "logits/rejected": -1.951311469078064, + "logps/chosen": -2.766106128692627, + "logps/rejected": -2.6983156204223633, + "loss": 4.2261, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.661060333251953, + "rewards/margins": -0.6779062151908875, + "rewards/rejected": -26.983154296875, + "step": 15475 + }, + { + "epoch": 0.5217567157639287, + "grad_norm": 33.815025329589844, + "learning_rate": 5.492171276029994e-07, + "logits/chosen": -1.4312325716018677, + "logits/rejected": -1.7131084203720093, + "logps/chosen": -2.202226400375366, + "logps/rejected": -2.3178181648254395, + "loss": 3.5403, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.022266387939453, + "rewards/margins": 1.1559193134307861, + "rewards/rejected": -23.178184509277344, + "step": 15480 + }, + { + "epoch": 0.5219252418349118, + "grad_norm": 1.9928309917449951, + "learning_rate": 5.48924412968954e-07, + "logits/chosen": -0.6165008544921875, + "logits/rejected": -0.9194254875183105, + "logps/chosen": -2.514402389526367, + "logps/rejected": -2.7848479747772217, + "loss": 1.7571, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.144020080566406, + "rewards/margins": 2.7044589519500732, + "rewards/rejected": -27.848480224609375, + "step": 15485 + }, + { + "epoch": 0.522093767905895, + "grad_norm": 28.686857223510742, + "learning_rate": 5.486316814040968e-07, + "logits/chosen": -1.7597252130508423, + "logits/rejected": -1.6932029724121094, + "logps/chosen": -2.4415581226348877, + "logps/rejected": -2.6324195861816406, + "loss": 3.7649, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.41558074951172, + "rewards/margins": 1.9086145162582397, + "rewards/rejected": -26.324193954467773, + "step": 15490 + }, + { + "epoch": 0.5222622939768782, + "grad_norm": 27.051673889160156, + "learning_rate": 5.483389330097308e-07, + "logits/chosen": -1.7283437252044678, + "logits/rejected": -1.5780450105667114, + "logps/chosen": -2.3943264484405518, + "logps/rejected": -2.3182897567749023, + "loss": 4.1081, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.94326400756836, + "rewards/margins": -0.7603681683540344, + "rewards/rejected": -23.18289566040039, + "step": 15495 + }, + { + "epoch": 0.5224308200478615, + "grad_norm": 37.90834426879883, + "learning_rate": 5.480461678871645e-07, + "logits/chosen": -1.6101102828979492, + "logits/rejected": -1.6730715036392212, + "logps/chosen": -1.935234785079956, + "logps/rejected": -1.872582197189331, + "loss": 3.8334, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.35234832763672, + "rewards/margins": -0.6265257596969604, + "rewards/rejected": -18.7258243560791, + "step": 15500 + }, + { + "epoch": 0.5225993461188446, + "grad_norm": 21.659454345703125, + "learning_rate": 5.477533861377123e-07, + "logits/chosen": -1.2597486972808838, + "logits/rejected": -1.7310842275619507, + "logps/chosen": -1.9114364385604858, + "logps/rejected": -2.3817667961120605, + "loss": 2.5768, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.11436653137207, + "rewards/margins": 4.70330286026001, + "rewards/rejected": -23.81766700744629, + "step": 15505 + }, + { + "epoch": 0.5227678721898278, + "grad_norm": 16.150802612304688, + "learning_rate": 5.474605878626948e-07, + "logits/chosen": -1.0654808282852173, + "logits/rejected": -1.4487148523330688, + "logps/chosen": -2.6950135231018066, + "logps/rejected": -2.666337490081787, + "loss": 4.7582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.95013427734375, + "rewards/margins": -0.286760151386261, + "rewards/rejected": -26.663372039794922, + "step": 15510 + }, + { + "epoch": 0.522936398260811, + "grad_norm": 78.9401626586914, + "learning_rate": 5.471677731634375e-07, + "logits/chosen": -1.832585096359253, + "logits/rejected": -1.858319640159607, + "logps/chosen": -2.2961020469665527, + "logps/rejected": -2.3054680824279785, + "loss": 4.1239, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.961023330688477, + "rewards/margins": 0.09365816414356232, + "rewards/rejected": -23.0546817779541, + "step": 15515 + }, + { + "epoch": 0.5231049243317941, + "grad_norm": 31.371814727783203, + "learning_rate": 5.468749421412723e-07, + "logits/chosen": -1.6428378820419312, + "logits/rejected": -1.6746675968170166, + "logps/chosen": -2.245944023132324, + "logps/rejected": -2.602429151535034, + "loss": 1.88, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.45943832397461, + "rewards/margins": 3.5648505687713623, + "rewards/rejected": -26.024288177490234, + "step": 15520 + }, + { + "epoch": 0.5232734504027773, + "grad_norm": 16.432018280029297, + "learning_rate": 5.465820948975366e-07, + "logits/chosen": -1.270250916481018, + "logits/rejected": -1.2934939861297607, + "logps/chosen": -1.722895622253418, + "logps/rejected": -1.7790238857269287, + "loss": 3.5555, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.22895622253418, + "rewards/margins": 0.5612838864326477, + "rewards/rejected": -17.790239334106445, + "step": 15525 + }, + { + "epoch": 0.5234419764737605, + "grad_norm": 20.018625259399414, + "learning_rate": 5.462892315335729e-07, + "logits/chosen": -1.6446031332015991, + "logits/rejected": -1.5727581977844238, + "logps/chosen": -1.9519649744033813, + "logps/rejected": -2.2087297439575195, + "loss": 2.8335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.519649505615234, + "rewards/margins": 2.5676486492156982, + "rewards/rejected": -22.087299346923828, + "step": 15530 + }, + { + "epoch": 0.5236105025447436, + "grad_norm": 44.28361892700195, + "learning_rate": 5.4599635215073e-07, + "logits/chosen": -1.4462125301361084, + "logits/rejected": -1.6265175342559814, + "logps/chosen": -2.038994789123535, + "logps/rejected": -2.193636178970337, + "loss": 3.1313, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.38994789123535, + "rewards/margins": 1.5464133024215698, + "rewards/rejected": -21.93636131286621, + "step": 15535 + }, + { + "epoch": 0.5237790286157269, + "grad_norm": 110.08293151855469, + "learning_rate": 5.457034568503616e-07, + "logits/chosen": -1.8243696689605713, + "logits/rejected": -1.8024543523788452, + "logps/chosen": -2.3185818195343018, + "logps/rejected": -2.424232006072998, + "loss": 2.6836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.18581771850586, + "rewards/margins": 1.0565001964569092, + "rewards/rejected": -24.242319107055664, + "step": 15540 + }, + { + "epoch": 0.5239475546867101, + "grad_norm": 80.31021881103516, + "learning_rate": 5.454105457338278e-07, + "logits/chosen": -1.5435346364974976, + "logits/rejected": -1.5518152713775635, + "logps/chosen": -2.4903368949890137, + "logps/rejected": -2.444121837615967, + "loss": 3.6497, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.903369903564453, + "rewards/margins": -0.46214962005615234, + "rewards/rejected": -24.441219329833984, + "step": 15545 + }, + { + "epoch": 0.5241160807576932, + "grad_norm": 40.0004768371582, + "learning_rate": 5.45117618902493e-07, + "logits/chosen": -1.5282537937164307, + "logits/rejected": -1.8590351343154907, + "logps/chosen": -2.001547336578369, + "logps/rejected": -2.2384941577911377, + "loss": 2.0766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.015472412109375, + "rewards/margins": 2.369469404220581, + "rewards/rejected": -22.384939193725586, + "step": 15550 + }, + { + "epoch": 0.5242846068286764, + "grad_norm": 20.842981338500977, + "learning_rate": 5.448246764577278e-07, + "logits/chosen": -0.5346226096153259, + "logits/rejected": -0.5988118648529053, + "logps/chosen": -2.4696402549743652, + "logps/rejected": -2.6132705211639404, + "loss": 2.9447, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.69640350341797, + "rewards/margins": 1.4363019466400146, + "rewards/rejected": -26.132705688476562, + "step": 15555 + }, + { + "epoch": 0.5244531328996596, + "grad_norm": 27.3537654876709, + "learning_rate": 5.445317185009082e-07, + "logits/chosen": -1.1227928400039673, + "logits/rejected": -1.205906867980957, + "logps/chosen": -2.1927571296691895, + "logps/rejected": -2.4103589057922363, + "loss": 1.8171, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.92757225036621, + "rewards/margins": 2.1760153770446777, + "rewards/rejected": -24.103588104248047, + "step": 15560 + }, + { + "epoch": 0.5246216589706427, + "grad_norm": 35.99321746826172, + "learning_rate": 5.442387451334152e-07, + "logits/chosen": -1.2954622507095337, + "logits/rejected": -1.3853565454483032, + "logps/chosen": -1.78121018409729, + "logps/rejected": -1.9041932821273804, + "loss": 2.5959, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.812103271484375, + "rewards/margins": 1.229830026626587, + "rewards/rejected": -19.04193115234375, + "step": 15565 + }, + { + "epoch": 0.5247901850416259, + "grad_norm": 44.51189041137695, + "learning_rate": 5.439457564566356e-07, + "logits/chosen": -1.7479088306427002, + "logits/rejected": -1.7973203659057617, + "logps/chosen": -1.5744832754135132, + "logps/rejected": -1.9441041946411133, + "loss": 2.0475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.744832038879395, + "rewards/margins": 3.6962077617645264, + "rewards/rejected": -19.441041946411133, + "step": 15570 + }, + { + "epoch": 0.5249587111126092, + "grad_norm": 22.456087112426758, + "learning_rate": 5.43652752571961e-07, + "logits/chosen": -1.5364540815353394, + "logits/rejected": -1.588979721069336, + "logps/chosen": -2.6818394660949707, + "logps/rejected": -2.505458354949951, + "loss": 5.4875, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -26.818395614624023, + "rewards/margins": -1.7638130187988281, + "rewards/rejected": -25.054582595825195, + "step": 15575 + }, + { + "epoch": 0.5251272371835923, + "grad_norm": 28.281593322753906, + "learning_rate": 5.433597335807887e-07, + "logits/chosen": -1.2712260484695435, + "logits/rejected": -1.3536813259124756, + "logps/chosen": -2.145684242248535, + "logps/rejected": -2.089423418045044, + "loss": 4.0857, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.456844329833984, + "rewards/margins": -0.5626105070114136, + "rewards/rejected": -20.89423370361328, + "step": 15580 + }, + { + "epoch": 0.5252957632545755, + "grad_norm": 19.869348526000977, + "learning_rate": 5.430666995845207e-07, + "logits/chosen": -1.6280781030654907, + "logits/rejected": -1.7816858291625977, + "logps/chosen": -2.778658866882324, + "logps/rejected": -3.0360519886016846, + "loss": 2.51, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.78658676147461, + "rewards/margins": 2.5739312171936035, + "rewards/rejected": -30.360515594482422, + "step": 15585 + }, + { + "epoch": 0.5254642893255587, + "grad_norm": 61.13578796386719, + "learning_rate": 5.42773650684565e-07, + "logits/chosen": -1.3919565677642822, + "logits/rejected": -1.010761022567749, + "logps/chosen": -2.5698320865631104, + "logps/rejected": -2.5252742767333984, + "loss": 3.7041, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.698322296142578, + "rewards/margins": -0.4455797076225281, + "rewards/rejected": -25.25274085998535, + "step": 15590 + }, + { + "epoch": 0.5256328153965418, + "grad_norm": 256.7560729980469, + "learning_rate": 5.424805869823338e-07, + "logits/chosen": -1.6359916925430298, + "logits/rejected": -1.6572707891464233, + "logps/chosen": -3.07808256149292, + "logps/rejected": -2.8269076347351074, + "loss": 5.5903, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -30.78082275390625, + "rewards/margins": -2.5117459297180176, + "rewards/rejected": -28.26907730102539, + "step": 15595 + }, + { + "epoch": 0.525801341467525, + "grad_norm": 39.58763885498047, + "learning_rate": 5.421875085792451e-07, + "logits/chosen": -1.4183658361434937, + "logits/rejected": -1.4681622982025146, + "logps/chosen": -2.238878011703491, + "logps/rejected": -2.2670669555664062, + "loss": 3.2834, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.388778686523438, + "rewards/margins": 0.28189095854759216, + "rewards/rejected": -22.670669555664062, + "step": 15600 + }, + { + "epoch": 0.525801341467525, + "eval_logits/chosen": -1.7632553577423096, + "eval_logits/rejected": -1.8928484916687012, + "eval_logps/chosen": -2.0442748069763184, + "eval_logps/rejected": -2.1599292755126953, + "eval_loss": 2.9858274459838867, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -20.442750930786133, + "eval_rewards/margins": 1.156540870666504, + "eval_rewards/rejected": -21.599288940429688, + "eval_runtime": 12.8962, + "eval_samples_per_second": 7.754, + "eval_steps_per_second": 1.939, + "step": 15600 + }, + { + "epoch": 0.5259698675385082, + "grad_norm": 22.91073226928711, + "learning_rate": 5.41894415576722e-07, + "logits/chosen": -1.1644032001495361, + "logits/rejected": -1.226810336112976, + "logps/chosen": -2.3184869289398193, + "logps/rejected": -2.561067819595337, + "loss": 2.5009, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.18486976623535, + "rewards/margins": 2.4258084297180176, + "rewards/rejected": -25.610681533813477, + "step": 15605 + }, + { + "epoch": 0.5261383936094914, + "grad_norm": 30.144290924072266, + "learning_rate": 5.416013080761921e-07, + "logits/chosen": -1.4322903156280518, + "logits/rejected": -1.5939862728118896, + "logps/chosen": -2.02023983001709, + "logps/rejected": -2.257467269897461, + "loss": 1.6984, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.202396392822266, + "rewards/margins": 2.372274398803711, + "rewards/rejected": -22.57467269897461, + "step": 15610 + }, + { + "epoch": 0.5263069196804746, + "grad_norm": 12.66912841796875, + "learning_rate": 5.413081861790884e-07, + "logits/chosen": -1.5032761096954346, + "logits/rejected": -1.7086365222930908, + "logps/chosen": -1.7659047842025757, + "logps/rejected": -2.06687593460083, + "loss": 2.1078, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.659048080444336, + "rewards/margins": 3.009711742401123, + "rewards/rejected": -20.668760299682617, + "step": 15615 + }, + { + "epoch": 0.5264754457514578, + "grad_norm": 34.912384033203125, + "learning_rate": 5.410150499868491e-07, + "logits/chosen": -1.7015644311904907, + "logits/rejected": -1.7704875469207764, + "logps/chosen": -1.6861746311187744, + "logps/rejected": -2.1708366870880127, + "loss": 1.7276, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.861745834350586, + "rewards/margins": 4.846621036529541, + "rewards/rejected": -21.70836639404297, + "step": 15620 + }, + { + "epoch": 0.5266439718224409, + "grad_norm": 23.303464889526367, + "learning_rate": 5.407218996009168e-07, + "logits/chosen": -1.414489507675171, + "logits/rejected": -1.2904560565948486, + "logps/chosen": -1.8772773742675781, + "logps/rejected": -1.9689223766326904, + "loss": 2.6413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.77277183532715, + "rewards/margins": 0.9164519309997559, + "rewards/rejected": -19.689224243164062, + "step": 15625 + }, + { + "epoch": 0.5268124978934241, + "grad_norm": 7.382728099822998, + "learning_rate": 5.404287351227397e-07, + "logits/chosen": -1.51156747341156, + "logits/rejected": -1.5182366371154785, + "logps/chosen": -2.156038999557495, + "logps/rejected": -2.454796552658081, + "loss": 2.1912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.56039047241211, + "rewards/margins": 2.987576723098755, + "rewards/rejected": -24.5479679107666, + "step": 15630 + }, + { + "epoch": 0.5269810239644073, + "grad_norm": 24.27437973022461, + "learning_rate": 5.401355566537698e-07, + "logits/chosen": -1.5087625980377197, + "logits/rejected": -1.5422756671905518, + "logps/chosen": -2.191742420196533, + "logps/rejected": -2.490962028503418, + "loss": 2.7534, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.91742515563965, + "rewards/margins": 2.9921936988830566, + "rewards/rejected": -24.909618377685547, + "step": 15635 + }, + { + "epoch": 0.5271495500353904, + "grad_norm": 34.50971984863281, + "learning_rate": 5.398423642954654e-07, + "logits/chosen": -1.3371788263320923, + "logits/rejected": -1.3717623949050903, + "logps/chosen": -2.7144930362701416, + "logps/rejected": -2.642326831817627, + "loss": 3.8951, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.144927978515625, + "rewards/margins": -0.7216583490371704, + "rewards/rejected": -26.423269271850586, + "step": 15640 + }, + { + "epoch": 0.5273180761063736, + "grad_norm": 57.722877502441406, + "learning_rate": 5.395491581492883e-07, + "logits/chosen": -1.2004446983337402, + "logits/rejected": -1.3373278379440308, + "logps/chosen": -3.4159648418426514, + "logps/rejected": -3.5445168018341064, + "loss": 6.5148, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.15964889526367, + "rewards/margins": 1.2855199575424194, + "rewards/rejected": -35.445167541503906, + "step": 15645 + }, + { + "epoch": 0.5274866021773569, + "grad_norm": 33.64910125732422, + "learning_rate": 5.392559383167057e-07, + "logits/chosen": -1.420398473739624, + "logits/rejected": -1.4803659915924072, + "logps/chosen": -2.0454297065734863, + "logps/rejected": -1.9843571186065674, + "loss": 3.7885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.454294204711914, + "rewards/margins": -0.6107238531112671, + "rewards/rejected": -19.843570709228516, + "step": 15650 + }, + { + "epoch": 0.52765512824834, + "grad_norm": 143.58140563964844, + "learning_rate": 5.389627048991894e-07, + "logits/chosen": -1.1687041521072388, + "logits/rejected": -1.2767616510391235, + "logps/chosen": -2.470137357711792, + "logps/rejected": -2.4478909969329834, + "loss": 3.6825, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.701374053955078, + "rewards/margins": -0.2224619835615158, + "rewards/rejected": -24.478910446166992, + "step": 15655 + }, + { + "epoch": 0.5278236543193232, + "grad_norm": 36.96298599243164, + "learning_rate": 5.386694579982161e-07, + "logits/chosen": -1.4571011066436768, + "logits/rejected": -1.7262732982635498, + "logps/chosen": -2.092268466949463, + "logps/rejected": -2.3306639194488525, + "loss": 1.82, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.92268180847168, + "rewards/margins": 2.38395619392395, + "rewards/rejected": -23.306636810302734, + "step": 15660 + }, + { + "epoch": 0.5279921803903064, + "grad_norm": 21.692073822021484, + "learning_rate": 5.38376197715267e-07, + "logits/chosen": -0.784595251083374, + "logits/rejected": -0.8547506332397461, + "logps/chosen": -2.1192760467529297, + "logps/rejected": -2.105349540710449, + "loss": 3.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.192760467529297, + "rewards/margins": -0.13926735520362854, + "rewards/rejected": -21.05349349975586, + "step": 15665 + }, + { + "epoch": 0.5281607064612895, + "grad_norm": 13.716679573059082, + "learning_rate": 5.380829241518277e-07, + "logits/chosen": -1.217454195022583, + "logits/rejected": -1.7114721536636353, + "logps/chosen": -2.3606152534484863, + "logps/rejected": -2.695775270462036, + "loss": 3.1203, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.606151580810547, + "rewards/margins": 3.351599931716919, + "rewards/rejected": -26.957752227783203, + "step": 15670 + }, + { + "epoch": 0.5283292325322727, + "grad_norm": 28.68997573852539, + "learning_rate": 5.377896374093889e-07, + "logits/chosen": -1.3477064371109009, + "logits/rejected": -1.5586186647415161, + "logps/chosen": -1.914340615272522, + "logps/rejected": -1.8554699420928955, + "loss": 3.8192, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.14340591430664, + "rewards/margins": -0.588705837726593, + "rewards/rejected": -18.554698944091797, + "step": 15675 + }, + { + "epoch": 0.5284977586032559, + "grad_norm": 33.395748138427734, + "learning_rate": 5.374963375894452e-07, + "logits/chosen": -1.627058982849121, + "logits/rejected": -1.6720809936523438, + "logps/chosen": -1.5370395183563232, + "logps/rejected": -1.6897306442260742, + "loss": 2.9374, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.370396614074707, + "rewards/margins": 1.5269111394882202, + "rewards/rejected": -16.897306442260742, + "step": 15680 + }, + { + "epoch": 0.5286662846742392, + "grad_norm": 39.92876434326172, + "learning_rate": 5.372030247934965e-07, + "logits/chosen": -1.1321537494659424, + "logits/rejected": -1.3527767658233643, + "logps/chosen": -1.9309250116348267, + "logps/rejected": -2.1466927528381348, + "loss": 3.1527, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.309249877929688, + "rewards/margins": 2.157675266265869, + "rewards/rejected": -21.4669246673584, + "step": 15685 + }, + { + "epoch": 0.5288348107452223, + "grad_norm": 39.63317108154297, + "learning_rate": 5.369096991230467e-07, + "logits/chosen": -1.7300984859466553, + "logits/rejected": -1.8779960870742798, + "logps/chosen": -1.7245010137557983, + "logps/rejected": -2.174797773361206, + "loss": 2.0401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.245010375976562, + "rewards/margins": 4.5029683113098145, + "rewards/rejected": -21.74797821044922, + "step": 15690 + }, + { + "epoch": 0.5290033368162055, + "grad_norm": 30.05023193359375, + "learning_rate": 5.366163606796042e-07, + "logits/chosen": -1.5409890413284302, + "logits/rejected": -1.5917844772338867, + "logps/chosen": -1.865624189376831, + "logps/rejected": -2.017108201980591, + "loss": 2.5183, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.65624237060547, + "rewards/margins": 1.5148383378982544, + "rewards/rejected": -20.171083450317383, + "step": 15695 + }, + { + "epoch": 0.5291718628871886, + "grad_norm": 19.90330696105957, + "learning_rate": 5.363230095646818e-07, + "logits/chosen": -1.2542389631271362, + "logits/rejected": -1.4719120264053345, + "logps/chosen": -1.8331972360610962, + "logps/rejected": -1.7900031805038452, + "loss": 3.6632, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.331972122192383, + "rewards/margins": -0.4319402575492859, + "rewards/rejected": -17.90003204345703, + "step": 15700 + }, + { + "epoch": 0.5293403889581718, + "grad_norm": 18.70873260498047, + "learning_rate": 5.360296458797969e-07, + "logits/chosen": -1.2316617965698242, + "logits/rejected": -1.6386626958847046, + "logps/chosen": -2.234480381011963, + "logps/rejected": -2.251962184906006, + "loss": 3.1477, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.344802856445312, + "rewards/margins": 0.17481890320777893, + "rewards/rejected": -22.519622802734375, + "step": 15705 + }, + { + "epoch": 0.529508915029155, + "grad_norm": 3.4370267391204834, + "learning_rate": 5.357362697264711e-07, + "logits/chosen": -1.5080573558807373, + "logits/rejected": -1.6581541299819946, + "logps/chosen": -2.3186049461364746, + "logps/rejected": -2.8751769065856934, + "loss": 1.6822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.186050415039062, + "rewards/margins": 5.565718650817871, + "rewards/rejected": -28.75177001953125, + "step": 15710 + }, + { + "epoch": 0.5296774411001381, + "grad_norm": 30.35115623474121, + "learning_rate": 5.354428812062303e-07, + "logits/chosen": -1.646667718887329, + "logits/rejected": -1.6787481307983398, + "logps/chosen": -2.1127872467041016, + "logps/rejected": -2.3904125690460205, + "loss": 3.2484, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.12787437438965, + "rewards/margins": 2.7762503623962402, + "rewards/rejected": -23.904123306274414, + "step": 15715 + }, + { + "epoch": 0.5298459671711214, + "grad_norm": 35.458404541015625, + "learning_rate": 5.351494804206047e-07, + "logits/chosen": -1.3847260475158691, + "logits/rejected": -1.9206058979034424, + "logps/chosen": -2.1701271533966064, + "logps/rejected": -2.5087239742279053, + "loss": 2.0344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.701271057128906, + "rewards/margins": 3.3859705924987793, + "rewards/rejected": -25.08724021911621, + "step": 15720 + }, + { + "epoch": 0.5300144932421046, + "grad_norm": 28.80376434326172, + "learning_rate": 5.348560674711289e-07, + "logits/chosen": -1.468523621559143, + "logits/rejected": -1.8824526071548462, + "logps/chosen": -2.12115740776062, + "logps/rejected": -2.4652764797210693, + "loss": 4.2581, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.21157455444336, + "rewards/margins": 3.4411914348602295, + "rewards/rejected": -24.65276527404785, + "step": 15725 + }, + { + "epoch": 0.5301830193130878, + "grad_norm": 16.96817970275879, + "learning_rate": 5.345626424593412e-07, + "logits/chosen": -1.7553907632827759, + "logits/rejected": -1.8569806814193726, + "logps/chosen": -2.455338478088379, + "logps/rejected": -2.970616102218628, + "loss": 2.0558, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.55338478088379, + "rewards/margins": 5.152777671813965, + "rewards/rejected": -29.706165313720703, + "step": 15730 + }, + { + "epoch": 0.5303515453840709, + "grad_norm": 33.88883590698242, + "learning_rate": 5.342692054867848e-07, + "logits/chosen": -0.7830812931060791, + "logits/rejected": -1.0861982107162476, + "logps/chosen": -2.214578628540039, + "logps/rejected": -2.686673641204834, + "loss": 1.3673, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.145784378051758, + "rewards/margins": 4.720952033996582, + "rewards/rejected": -26.86673927307129, + "step": 15735 + }, + { + "epoch": 0.5305200714550541, + "grad_norm": 11.75301456451416, + "learning_rate": 5.339757566550065e-07, + "logits/chosen": -1.480642557144165, + "logits/rejected": -1.5226514339447021, + "logps/chosen": -2.491964817047119, + "logps/rejected": -2.958143949508667, + "loss": 1.4401, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.91964340209961, + "rewards/margins": 4.661794185638428, + "rewards/rejected": -29.581439971923828, + "step": 15740 + }, + { + "epoch": 0.5306885975260373, + "grad_norm": 18.430078506469727, + "learning_rate": 5.336822960655574e-07, + "logits/chosen": -1.3387706279754639, + "logits/rejected": -1.4303174018859863, + "logps/chosen": -1.6745388507843018, + "logps/rejected": -1.630464792251587, + "loss": 3.5253, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.74538803100586, + "rewards/margins": -0.44073954224586487, + "rewards/rejected": -16.30464744567871, + "step": 15745 + }, + { + "epoch": 0.5308571235970204, + "grad_norm": 59.17100143432617, + "learning_rate": 5.333888238199926e-07, + "logits/chosen": -1.1929603815078735, + "logits/rejected": -1.2906194925308228, + "logps/chosen": -2.2294490337371826, + "logps/rejected": -2.385697841644287, + "loss": 2.3508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.294490814208984, + "rewards/margins": 1.5624854564666748, + "rewards/rejected": -23.856977462768555, + "step": 15750 + }, + { + "epoch": 0.5310256496680036, + "grad_norm": 44.3913688659668, + "learning_rate": 5.330953400198715e-07, + "logits/chosen": -1.5607414245605469, + "logits/rejected": -1.7760826349258423, + "logps/chosen": -2.3757803440093994, + "logps/rejected": -2.758803606033325, + "loss": 2.1524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.757801055908203, + "rewards/margins": 3.8302321434020996, + "rewards/rejected": -27.588037490844727, + "step": 15755 + }, + { + "epoch": 0.5311941757389869, + "grad_norm": 19.41259765625, + "learning_rate": 5.32801844766757e-07, + "logits/chosen": -1.5063214302062988, + "logits/rejected": -1.431069016456604, + "logps/chosen": -3.2000415325164795, + "logps/rejected": -3.125478506088257, + "loss": 5.374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.00041580200195, + "rewards/margins": -0.7456296682357788, + "rewards/rejected": -31.254785537719727, + "step": 15760 + }, + { + "epoch": 0.53136270180997, + "grad_norm": 17.94995880126953, + "learning_rate": 5.325083381622164e-07, + "logits/chosen": -1.1592814922332764, + "logits/rejected": -1.3412177562713623, + "logps/chosen": -2.687490463256836, + "logps/rejected": -2.77734375, + "loss": 2.949, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.874902725219727, + "rewards/margins": 0.8985313177108765, + "rewards/rejected": -27.7734375, + "step": 15765 + }, + { + "epoch": 0.5315312278809532, + "grad_norm": 12.215071678161621, + "learning_rate": 5.322148203078206e-07, + "logits/chosen": -1.4378396272659302, + "logits/rejected": -1.5946756601333618, + "logps/chosen": -2.456948757171631, + "logps/rejected": -2.9714131355285645, + "loss": 0.9776, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.56949234008789, + "rewards/margins": 5.1446404457092285, + "rewards/rejected": -29.714130401611328, + "step": 15770 + }, + { + "epoch": 0.5316997539519364, + "grad_norm": 29.457229614257812, + "learning_rate": 5.319212913051449e-07, + "logits/chosen": -1.3002904653549194, + "logits/rejected": -1.7504593133926392, + "logps/chosen": -1.988246202468872, + "logps/rejected": -2.1686580181121826, + "loss": 4.2004, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.882461547851562, + "rewards/margins": 1.8041210174560547, + "rewards/rejected": -21.686582565307617, + "step": 15775 + }, + { + "epoch": 0.5318682800229195, + "grad_norm": 94.38124084472656, + "learning_rate": 5.316277512557678e-07, + "logits/chosen": -1.3595154285430908, + "logits/rejected": -1.8648008108139038, + "logps/chosen": -2.644291400909424, + "logps/rejected": -2.9432849884033203, + "loss": 4.225, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.442913055419922, + "rewards/margins": 2.9899346828460693, + "rewards/rejected": -29.432849884033203, + "step": 15780 + }, + { + "epoch": 0.5320368060939027, + "grad_norm": 31.942821502685547, + "learning_rate": 5.31334200261272e-07, + "logits/chosen": -1.6898002624511719, + "logits/rejected": -1.6755949258804321, + "logps/chosen": -2.614825963973999, + "logps/rejected": -2.7868762016296387, + "loss": 3.4297, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.148258209228516, + "rewards/margins": 1.720507025718689, + "rewards/rejected": -27.868764877319336, + "step": 15785 + }, + { + "epoch": 0.5322053321648859, + "grad_norm": 23.538860321044922, + "learning_rate": 5.310406384232443e-07, + "logits/chosen": -0.8642290234565735, + "logits/rejected": -1.1689766645431519, + "logps/chosen": -2.2893424034118652, + "logps/rejected": -2.6651718616485596, + "loss": 2.8563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.8934268951416, + "rewards/margins": 3.758291244506836, + "rewards/rejected": -26.651714324951172, + "step": 15790 + }, + { + "epoch": 0.5323738582358691, + "grad_norm": 5.7882914543151855, + "learning_rate": 5.307470658432745e-07, + "logits/chosen": -1.9477641582489014, + "logits/rejected": -2.2056195735931396, + "logps/chosen": -2.2544777393341064, + "logps/rejected": -2.708986282348633, + "loss": 1.7698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.544776916503906, + "rewards/margins": 4.545085430145264, + "rewards/rejected": -27.089862823486328, + "step": 15795 + }, + { + "epoch": 0.5325423843068523, + "grad_norm": 14.628530502319336, + "learning_rate": 5.304534826229565e-07, + "logits/chosen": -1.5568211078643799, + "logits/rejected": -2.0258517265319824, + "logps/chosen": -2.239051342010498, + "logps/rejected": -2.902362108230591, + "loss": 1.7635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.39051628112793, + "rewards/margins": 6.633103847503662, + "rewards/rejected": -29.02362060546875, + "step": 15800 + }, + { + "epoch": 0.5327109103778355, + "grad_norm": 17.869384765625, + "learning_rate": 5.30159888863888e-07, + "logits/chosen": -1.4855738878250122, + "logits/rejected": -1.5085865259170532, + "logps/chosen": -2.267606735229492, + "logps/rejected": -2.5644633769989014, + "loss": 2.3582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.676069259643555, + "rewards/margins": 2.9685654640197754, + "rewards/rejected": -25.644634246826172, + "step": 15805 + }, + { + "epoch": 0.5328794364488186, + "grad_norm": 64.83020782470703, + "learning_rate": 5.298662846676702e-07, + "logits/chosen": -1.9247585535049438, + "logits/rejected": -1.9699634313583374, + "logps/chosen": -2.598067283630371, + "logps/rejected": -2.7303786277770996, + "loss": 3.14, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.98067283630371, + "rewards/margins": 1.3231112957000732, + "rewards/rejected": -27.303783416748047, + "step": 15810 + }, + { + "epoch": 0.5330479625198018, + "grad_norm": 152.54348754882812, + "learning_rate": 5.295726701359081e-07, + "logits/chosen": -1.5798695087432861, + "logits/rejected": -1.5234687328338623, + "logps/chosen": -2.802265167236328, + "logps/rejected": -2.816190719604492, + "loss": 4.435, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.02264976501465, + "rewards/margins": 0.13925638794898987, + "rewards/rejected": -28.16190528869629, + "step": 15815 + }, + { + "epoch": 0.533216488590785, + "grad_norm": 40.41664505004883, + "learning_rate": 5.292790453702098e-07, + "logits/chosen": -0.893968403339386, + "logits/rejected": -0.8400181531906128, + "logps/chosen": -1.9188792705535889, + "logps/rejected": -1.7751582860946655, + "loss": 4.5872, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.188793182373047, + "rewards/margins": -1.4372103214263916, + "rewards/rejected": -17.751583099365234, + "step": 15820 + }, + { + "epoch": 0.5333850146617681, + "grad_norm": 5.9038262367248535, + "learning_rate": 5.289854104721876e-07, + "logits/chosen": -1.1986534595489502, + "logits/rejected": -1.4004700183868408, + "logps/chosen": -2.0523171424865723, + "logps/rejected": -2.20119309425354, + "loss": 2.5557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.523168563842773, + "rewards/margins": 1.4887616634368896, + "rewards/rejected": -22.01192855834961, + "step": 15825 + }, + { + "epoch": 0.5335535407327514, + "grad_norm": 28.159732818603516, + "learning_rate": 5.286917655434568e-07, + "logits/chosen": -1.5185017585754395, + "logits/rejected": -1.7343857288360596, + "logps/chosen": -1.7940750122070312, + "logps/rejected": -1.7864980697631836, + "loss": 3.3753, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.940750122070312, + "rewards/margins": -0.07577009499073029, + "rewards/rejected": -17.864978790283203, + "step": 15830 + }, + { + "epoch": 0.5337220668037346, + "grad_norm": 27.44189453125, + "learning_rate": 5.283981106856362e-07, + "logits/chosen": -1.4348413944244385, + "logits/rejected": -1.4643663167953491, + "logps/chosen": -2.1176836490631104, + "logps/rejected": -2.0485522747039795, + "loss": 3.8682, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.176837921142578, + "rewards/margins": -0.6913127899169922, + "rewards/rejected": -20.485523223876953, + "step": 15835 + }, + { + "epoch": 0.5338905928747177, + "grad_norm": 16.955625534057617, + "learning_rate": 5.281044460003485e-07, + "logits/chosen": -1.3504717350006104, + "logits/rejected": -1.4131934642791748, + "logps/chosen": -2.345491886138916, + "logps/rejected": -2.415341854095459, + "loss": 2.9348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.45492172241211, + "rewards/margins": 0.6984950304031372, + "rewards/rejected": -24.15341567993164, + "step": 15840 + }, + { + "epoch": 0.5340591189457009, + "grad_norm": 104.76861572265625, + "learning_rate": 5.278107715892192e-07, + "logits/chosen": -1.08576500415802, + "logits/rejected": -1.1210057735443115, + "logps/chosen": -2.4601125717163086, + "logps/rejected": -2.3961403369903564, + "loss": 3.7628, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.601125717163086, + "rewards/margins": -0.6397234797477722, + "rewards/rejected": -23.961400985717773, + "step": 15845 + }, + { + "epoch": 0.5342276450166841, + "grad_norm": 27.159868240356445, + "learning_rate": 5.275170875538776e-07, + "logits/chosen": -1.3380801677703857, + "logits/rejected": -1.3865848779678345, + "logps/chosen": -1.9246442317962646, + "logps/rejected": -1.8412472009658813, + "loss": 3.8892, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.246442794799805, + "rewards/margins": -0.8339722752571106, + "rewards/rejected": -18.412471771240234, + "step": 15850 + }, + { + "epoch": 0.5343961710876672, + "grad_norm": 102.9177474975586, + "learning_rate": 5.272233939959559e-07, + "logits/chosen": -1.1191086769104004, + "logits/rejected": -0.9336107969284058, + "logps/chosen": -2.1518714427948, + "logps/rejected": -2.009411334991455, + "loss": 4.5251, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.518714904785156, + "rewards/margins": -1.42460298538208, + "rewards/rejected": -20.0941104888916, + "step": 15855 + }, + { + "epoch": 0.5345646971586504, + "grad_norm": 26.768428802490234, + "learning_rate": 5.269296910170905e-07, + "logits/chosen": -1.1772959232330322, + "logits/rejected": -1.4456255435943604, + "logps/chosen": -1.839946985244751, + "logps/rejected": -2.2853474617004395, + "loss": 2.4164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.39946937561035, + "rewards/margins": 4.454005241394043, + "rewards/rejected": -22.85347557067871, + "step": 15860 + }, + { + "epoch": 0.5347332232296336, + "grad_norm": 31.624685287475586, + "learning_rate": 5.266359787189199e-07, + "logits/chosen": -1.5025027990341187, + "logits/rejected": -1.947291612625122, + "logps/chosen": -1.8523876667022705, + "logps/rejected": -2.393691301345825, + "loss": 1.875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.523876190185547, + "rewards/margins": 5.413036823272705, + "rewards/rejected": -23.936914443969727, + "step": 15865 + }, + { + "epoch": 0.5349017493006168, + "grad_norm": 20.274646759033203, + "learning_rate": 5.263422572030863e-07, + "logits/chosen": -1.776607871055603, + "logits/rejected": -1.7238489389419556, + "logps/chosen": -1.981871247291565, + "logps/rejected": -2.054335832595825, + "loss": 2.6384, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.818714141845703, + "rewards/margins": 0.7246443033218384, + "rewards/rejected": -20.54335594177246, + "step": 15870 + }, + { + "epoch": 0.5350702753716, + "grad_norm": 29.006074905395508, + "learning_rate": 5.260485265712355e-07, + "logits/chosen": -1.447188138961792, + "logits/rejected": -1.4549908638000488, + "logps/chosen": -1.9105160236358643, + "logps/rejected": -1.9352041482925415, + "loss": 3.2192, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.105159759521484, + "rewards/margins": 0.24688215553760529, + "rewards/rejected": -19.352041244506836, + "step": 15875 + }, + { + "epoch": 0.5352388014425832, + "grad_norm": 31.608394622802734, + "learning_rate": 5.257547869250159e-07, + "logits/chosen": -1.1805084943771362, + "logits/rejected": -1.463069200515747, + "logps/chosen": -1.7122770547866821, + "logps/rejected": -2.39255690574646, + "loss": 2.7306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.122770309448242, + "rewards/margins": 6.802798271179199, + "rewards/rejected": -23.925569534301758, + "step": 15880 + }, + { + "epoch": 0.5354073275135663, + "grad_norm": 23.542816162109375, + "learning_rate": 5.254610383660793e-07, + "logits/chosen": -1.764362096786499, + "logits/rejected": -1.6827386617660522, + "logps/chosen": -2.032930850982666, + "logps/rejected": -2.121166944503784, + "loss": 2.8885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.32931137084961, + "rewards/margins": 0.882359504699707, + "rewards/rejected": -21.211669921875, + "step": 15885 + }, + { + "epoch": 0.5355758535845495, + "grad_norm": 13.081600189208984, + "learning_rate": 5.251672809960802e-07, + "logits/chosen": -1.3148635625839233, + "logits/rejected": -1.5017060041427612, + "logps/chosen": -1.7135963439941406, + "logps/rejected": -1.729269027709961, + "loss": 3.5379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.135963439941406, + "rewards/margins": 0.15672659873962402, + "rewards/rejected": -17.29269027709961, + "step": 15890 + }, + { + "epoch": 0.5357443796555327, + "grad_norm": 19.698957443237305, + "learning_rate": 5.24873514916677e-07, + "logits/chosen": -1.3450462818145752, + "logits/rejected": -1.1864540576934814, + "logps/chosen": -2.393937826156616, + "logps/rejected": -2.188688278198242, + "loss": 5.1997, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.93937873840332, + "rewards/margins": -2.0524964332580566, + "rewards/rejected": -21.886882781982422, + "step": 15895 + }, + { + "epoch": 0.5359129057265158, + "grad_norm": 29.669261932373047, + "learning_rate": 5.245797402295301e-07, + "logits/chosen": -1.623167634010315, + "logits/rejected": -1.6045089960098267, + "logps/chosen": -2.1398420333862305, + "logps/rejected": -2.199852466583252, + "loss": 2.7214, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.398418426513672, + "rewards/margins": 0.6001054644584656, + "rewards/rejected": -21.998523712158203, + "step": 15900 + }, + { + "epoch": 0.5360814317974991, + "grad_norm": 32.335594177246094, + "learning_rate": 5.242859570363035e-07, + "logits/chosen": -1.3742711544036865, + "logits/rejected": -1.4153274297714233, + "logps/chosen": -1.8817806243896484, + "logps/rejected": -1.8834056854248047, + "loss": 3.8013, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.817806243896484, + "rewards/margins": 0.01625032350420952, + "rewards/rejected": -18.834056854248047, + "step": 15905 + }, + { + "epoch": 0.5362499578684823, + "grad_norm": 24.32137107849121, + "learning_rate": 5.239921654386641e-07, + "logits/chosen": -1.215399980545044, + "logits/rejected": -1.2015448808670044, + "logps/chosen": -1.8794883489608765, + "logps/rejected": -2.0633597373962402, + "loss": 1.7739, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.794885635375977, + "rewards/margins": 1.8387138843536377, + "rewards/rejected": -20.633596420288086, + "step": 15910 + }, + { + "epoch": 0.5364184839394655, + "grad_norm": 28.42302131652832, + "learning_rate": 5.236983655382813e-07, + "logits/chosen": -1.375603199005127, + "logits/rejected": -1.2848405838012695, + "logps/chosen": -1.872772455215454, + "logps/rejected": -1.9896090030670166, + "loss": 2.7718, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.727725982666016, + "rewards/margins": 1.1683663129806519, + "rewards/rejected": -19.89609146118164, + "step": 15915 + }, + { + "epoch": 0.5365870100104486, + "grad_norm": 33.60725784301758, + "learning_rate": 5.23404557436828e-07, + "logits/chosen": -0.9588086009025574, + "logits/rejected": -0.9080830812454224, + "logps/chosen": -2.8476717472076416, + "logps/rejected": -2.907454013824463, + "loss": 3.9853, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.476715087890625, + "rewards/margins": 0.5978223085403442, + "rewards/rejected": -29.074539184570312, + "step": 15920 + }, + { + "epoch": 0.5367555360814318, + "grad_norm": 26.43718910217285, + "learning_rate": 5.231107412359794e-07, + "logits/chosen": -1.1706427335739136, + "logits/rejected": -1.4406774044036865, + "logps/chosen": -2.108630895614624, + "logps/rejected": -2.3447341918945312, + "loss": 2.9799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.0863094329834, + "rewards/margins": 2.361030340194702, + "rewards/rejected": -23.44734001159668, + "step": 15925 + }, + { + "epoch": 0.536924062152415, + "grad_norm": 23.173118591308594, + "learning_rate": 5.228169170374139e-07, + "logits/chosen": -1.5244656801223755, + "logits/rejected": -1.5963716506958008, + "logps/chosen": -1.9139703512191772, + "logps/rejected": -2.1006815433502197, + "loss": 2.2664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.13970375061035, + "rewards/margins": 1.867110252380371, + "rewards/rejected": -21.006813049316406, + "step": 15930 + }, + { + "epoch": 0.5370925882233981, + "grad_norm": 26.308143615722656, + "learning_rate": 5.225230849428124e-07, + "logits/chosen": -1.360687494277954, + "logits/rejected": -1.4157650470733643, + "logps/chosen": -2.1507084369659424, + "logps/rejected": -2.197394371032715, + "loss": 2.7048, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.5070858001709, + "rewards/margins": 0.46685847640037537, + "rewards/rejected": -21.97394371032715, + "step": 15935 + }, + { + "epoch": 0.5372611142943814, + "grad_norm": 26.92331314086914, + "learning_rate": 5.222292450538584e-07, + "logits/chosen": -0.9067287445068359, + "logits/rejected": -0.9419302940368652, + "logps/chosen": -3.1705856323242188, + "logps/rejected": -3.2947776317596436, + "loss": 2.4813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.705860137939453, + "rewards/margins": 1.2419227361679077, + "rewards/rejected": -32.947776794433594, + "step": 15940 + }, + { + "epoch": 0.5374296403653646, + "grad_norm": 21.930761337280273, + "learning_rate": 5.219353974722387e-07, + "logits/chosen": -1.3587646484375, + "logits/rejected": -1.5184298753738403, + "logps/chosen": -2.0198464393615723, + "logps/rejected": -2.459423780441284, + "loss": 2.6066, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.198467254638672, + "rewards/margins": 4.395775318145752, + "rewards/rejected": -24.59423828125, + "step": 15945 + }, + { + "epoch": 0.5375981664363477, + "grad_norm": 44.06086349487305, + "learning_rate": 5.21641542299642e-07, + "logits/chosen": -0.7717021107673645, + "logits/rejected": -1.0169395208358765, + "logps/chosen": -2.177698850631714, + "logps/rejected": -2.4020535945892334, + "loss": 2.0949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.77699089050293, + "rewards/margins": 2.2435462474823, + "rewards/rejected": -24.02053451538086, + "step": 15950 + }, + { + "epoch": 0.5377666925073309, + "grad_norm": 40.27283477783203, + "learning_rate": 5.213476796377603e-07, + "logits/chosen": -1.5478652715682983, + "logits/rejected": -1.6528785228729248, + "logps/chosen": -1.8562246561050415, + "logps/rejected": -1.9601234197616577, + "loss": 2.7799, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.562244415283203, + "rewards/margins": 1.0389883518218994, + "rewards/rejected": -19.601234436035156, + "step": 15955 + }, + { + "epoch": 0.5379352185783141, + "grad_norm": 27.30537986755371, + "learning_rate": 5.210538095882875e-07, + "logits/chosen": -1.1312801837921143, + "logits/rejected": -1.1560137271881104, + "logps/chosen": -2.146221160888672, + "logps/rejected": -2.3140676021575928, + "loss": 2.0642, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.46221160888672, + "rewards/margins": 1.6784664392471313, + "rewards/rejected": -23.14067840576172, + "step": 15960 + }, + { + "epoch": 0.5381037446492972, + "grad_norm": 18.12936782836914, + "learning_rate": 5.207599322529209e-07, + "logits/chosen": -1.1602661609649658, + "logits/rejected": -1.3525993824005127, + "logps/chosen": -1.647727370262146, + "logps/rejected": -2.0149059295654297, + "loss": 2.121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.47727394104004, + "rewards/margins": 3.6717867851257324, + "rewards/rejected": -20.14906120300293, + "step": 15965 + }, + { + "epoch": 0.5382722707202804, + "grad_norm": 31.879697799682617, + "learning_rate": 5.204660477333595e-07, + "logits/chosen": -1.3323277235031128, + "logits/rejected": -1.6333240270614624, + "logps/chosen": -2.490701913833618, + "logps/rejected": -3.067025661468506, + "loss": 2.6407, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.907018661499023, + "rewards/margins": 5.763237953186035, + "rewards/rejected": -30.67025375366211, + "step": 15970 + }, + { + "epoch": 0.5384407967912636, + "grad_norm": 22.474822998046875, + "learning_rate": 5.201721561313054e-07, + "logits/chosen": -1.035434365272522, + "logits/rejected": -1.3812066316604614, + "logps/chosen": -1.76253342628479, + "logps/rejected": -2.0237176418304443, + "loss": 1.2305, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.62533187866211, + "rewards/margins": 2.6118431091308594, + "rewards/rejected": -20.2371768951416, + "step": 15975 + }, + { + "epoch": 0.5386093228622468, + "grad_norm": 22.040910720825195, + "learning_rate": 5.198782575484629e-07, + "logits/chosen": -1.5590795278549194, + "logits/rejected": -1.5408340692520142, + "logps/chosen": -2.3352012634277344, + "logps/rejected": -2.256068229675293, + "loss": 3.988, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.352014541625977, + "rewards/margins": -0.7913322448730469, + "rewards/rejected": -22.56068229675293, + "step": 15980 + }, + { + "epoch": 0.53877784893323, + "grad_norm": 56.712703704833984, + "learning_rate": 5.195843520865385e-07, + "logits/chosen": -1.1708321571350098, + "logits/rejected": -1.1317713260650635, + "logps/chosen": -2.1852142810821533, + "logps/rejected": -2.4626965522766113, + "loss": 2.6013, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.852140426635742, + "rewards/margins": 2.774822950363159, + "rewards/rejected": -24.626964569091797, + "step": 15985 + }, + { + "epoch": 0.5389463750042132, + "grad_norm": 40.07321548461914, + "learning_rate": 5.192904398472414e-07, + "logits/chosen": -1.5711636543273926, + "logits/rejected": -1.618537187576294, + "logps/chosen": -2.194077253341675, + "logps/rejected": -2.444293975830078, + "loss": 2.8609, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.940771102905273, + "rewards/margins": 2.502168655395508, + "rewards/rejected": -24.44293785095215, + "step": 15990 + }, + { + "epoch": 0.5391149010751963, + "grad_norm": 24.483991622924805, + "learning_rate": 5.189965209322832e-07, + "logits/chosen": -1.692317008972168, + "logits/rejected": -1.6503814458847046, + "logps/chosen": -2.3598055839538574, + "logps/rejected": -2.362114667892456, + "loss": 4.052, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.59805679321289, + "rewards/margins": 0.023090552538633347, + "rewards/rejected": -23.621145248413086, + "step": 15995 + }, + { + "epoch": 0.5392834271461795, + "grad_norm": 26.952857971191406, + "learning_rate": 5.187025954433775e-07, + "logits/chosen": -1.8345916271209717, + "logits/rejected": -2.290605068206787, + "logps/chosen": -2.6928634643554688, + "logps/rejected": -3.4724318981170654, + "loss": 1.8705, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.928638458251953, + "rewards/margins": 7.7956862449646, + "rewards/rejected": -34.72432327270508, + "step": 16000 + }, + { + "epoch": 0.5392834271461795, + "eval_logits/chosen": -1.8009322881698608, + "eval_logits/rejected": -1.9339642524719238, + "eval_logps/chosen": -2.0592150688171387, + "eval_logps/rejected": -2.1777427196502686, + "eval_loss": 2.988820791244507, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -20.592151641845703, + "eval_rewards/margins": 1.1852753162384033, + "eval_rewards/rejected": -21.777429580688477, + "eval_runtime": 12.9249, + "eval_samples_per_second": 7.737, + "eval_steps_per_second": 1.934, + "step": 16000 + }, + { + "epoch": 0.5394519532171627, + "grad_norm": 35.81848907470703, + "learning_rate": 5.184086634822403e-07, + "logits/chosen": -1.6082760095596313, + "logits/rejected": -1.6484458446502686, + "logps/chosen": -2.346975803375244, + "logps/rejected": -2.5124051570892334, + "loss": 2.7089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.46976089477539, + "rewards/margins": 1.654293417930603, + "rewards/rejected": -25.124052047729492, + "step": 16005 + }, + { + "epoch": 0.5396204792881458, + "grad_norm": 16.334346771240234, + "learning_rate": 5.1811472515059e-07, + "logits/chosen": -1.5007803440093994, + "logits/rejected": -1.9453624486923218, + "logps/chosen": -1.8338782787322998, + "logps/rejected": -2.1191000938415527, + "loss": 2.9343, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.338781356811523, + "rewards/margins": 2.8522186279296875, + "rewards/rejected": -21.191001892089844, + "step": 16010 + }, + { + "epoch": 0.5397890053591291, + "grad_norm": 17.845060348510742, + "learning_rate": 5.17820780550147e-07, + "logits/chosen": -1.0943386554718018, + "logits/rejected": -1.19479238986969, + "logps/chosen": -1.8260581493377686, + "logps/rejected": -2.105739116668701, + "loss": 1.4134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.260583877563477, + "rewards/margins": 2.796811580657959, + "rewards/rejected": -21.05739402770996, + "step": 16015 + }, + { + "epoch": 0.5399575314301123, + "grad_norm": 29.95235824584961, + "learning_rate": 5.175268297826339e-07, + "logits/chosen": -1.506792664527893, + "logits/rejected": -1.7213274240493774, + "logps/chosen": -1.9028027057647705, + "logps/rejected": -1.939144492149353, + "loss": 3.0789, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.028026580810547, + "rewards/margins": 0.36341866850852966, + "rewards/rejected": -19.39144515991211, + "step": 16020 + }, + { + "epoch": 0.5401260575010954, + "grad_norm": 35.79323959350586, + "learning_rate": 5.172328729497757e-07, + "logits/chosen": -1.24240243434906, + "logits/rejected": -1.4514870643615723, + "logps/chosen": -2.0713295936584473, + "logps/rejected": -2.2839739322662354, + "loss": 1.4821, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.71329689025879, + "rewards/margins": 2.1264426708221436, + "rewards/rejected": -22.839738845825195, + "step": 16025 + }, + { + "epoch": 0.5402945835720786, + "grad_norm": 21.9273738861084, + "learning_rate": 5.169389101532992e-07, + "logits/chosen": -1.3595364093780518, + "logits/rejected": -1.4825410842895508, + "logps/chosen": -2.9908063411712646, + "logps/rejected": -3.0221633911132812, + "loss": 4.5762, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.908061981201172, + "rewards/margins": 0.31356924772262573, + "rewards/rejected": -30.221630096435547, + "step": 16030 + }, + { + "epoch": 0.5404631096430618, + "grad_norm": 29.29009246826172, + "learning_rate": 5.16644941494933e-07, + "logits/chosen": -1.5605688095092773, + "logits/rejected": -1.5835740566253662, + "logps/chosen": -2.1153295040130615, + "logps/rejected": -2.410200595855713, + "loss": 2.5973, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.15329360961914, + "rewards/margins": 2.948711395263672, + "rewards/rejected": -24.102006912231445, + "step": 16035 + }, + { + "epoch": 0.5406316357140449, + "grad_norm": 34.95164489746094, + "learning_rate": 5.163509670764085e-07, + "logits/chosen": -1.321013331413269, + "logits/rejected": -1.2671291828155518, + "logps/chosen": -1.869821310043335, + "logps/rejected": -1.9189687967300415, + "loss": 2.6713, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.69821548461914, + "rewards/margins": 0.49147263169288635, + "rewards/rejected": -19.189685821533203, + "step": 16040 + }, + { + "epoch": 0.5408001617850281, + "grad_norm": 25.569393157958984, + "learning_rate": 5.160569869994583e-07, + "logits/chosen": -1.5885627269744873, + "logits/rejected": -1.7263247966766357, + "logps/chosen": -1.9965381622314453, + "logps/rejected": -2.2323851585388184, + "loss": 2.582, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.96537971496582, + "rewards/margins": 2.3584752082824707, + "rewards/rejected": -22.323854446411133, + "step": 16045 + }, + { + "epoch": 0.5409686878560114, + "grad_norm": 27.766555786132812, + "learning_rate": 5.157630013658177e-07, + "logits/chosen": -1.088120937347412, + "logits/rejected": -1.072862148284912, + "logps/chosen": -1.9776265621185303, + "logps/rejected": -2.1201555728912354, + "loss": 2.5222, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.776268005371094, + "rewards/margins": 1.4252907037734985, + "rewards/rejected": -21.201557159423828, + "step": 16050 + }, + { + "epoch": 0.5411372139269945, + "grad_norm": 26.957998275756836, + "learning_rate": 5.154690102772233e-07, + "logits/chosen": -1.8490188121795654, + "logits/rejected": -2.186875820159912, + "logps/chosen": -1.4853366613388062, + "logps/rejected": -1.7421538829803467, + "loss": 2.0116, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.853365898132324, + "rewards/margins": 2.568171262741089, + "rewards/rejected": -17.421539306640625, + "step": 16055 + }, + { + "epoch": 0.5413057399979777, + "grad_norm": 15.618478775024414, + "learning_rate": 5.151750138354139e-07, + "logits/chosen": -1.0816246271133423, + "logits/rejected": -1.3231147527694702, + "logps/chosen": -2.101982831954956, + "logps/rejected": -2.9028682708740234, + "loss": 1.5279, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.01982879638672, + "rewards/margins": 8.0088529586792, + "rewards/rejected": -29.0286808013916, + "step": 16060 + }, + { + "epoch": 0.5414742660689609, + "grad_norm": 11.756542205810547, + "learning_rate": 5.148810121421301e-07, + "logits/chosen": -1.383972406387329, + "logits/rejected": -1.4302829504013062, + "logps/chosen": -2.4469587802886963, + "logps/rejected": -3.174461603164673, + "loss": 2.2436, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.469587326049805, + "rewards/margins": 7.275025844573975, + "rewards/rejected": -31.744617462158203, + "step": 16065 + }, + { + "epoch": 0.541642792139944, + "grad_norm": 46.22239303588867, + "learning_rate": 5.145870052991142e-07, + "logits/chosen": -1.664219617843628, + "logits/rejected": -2.0187935829162598, + "logps/chosen": -1.9643936157226562, + "logps/rejected": -2.1999523639678955, + "loss": 2.2296, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.64393424987793, + "rewards/margins": 2.3555893898010254, + "rewards/rejected": -21.999523162841797, + "step": 16070 + }, + { + "epoch": 0.5418113182109272, + "grad_norm": 20.696165084838867, + "learning_rate": 5.142929934081107e-07, + "logits/chosen": -1.3868186473846436, + "logits/rejected": -1.6188242435455322, + "logps/chosen": -1.7030565738677979, + "logps/rejected": -1.6263227462768555, + "loss": 3.9108, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.03056526184082, + "rewards/margins": -0.7673369646072388, + "rewards/rejected": -16.263227462768555, + "step": 16075 + }, + { + "epoch": 0.5419798442819104, + "grad_norm": 41.721031188964844, + "learning_rate": 5.139989765708651e-07, + "logits/chosen": -1.031810998916626, + "logits/rejected": -1.10875403881073, + "logps/chosen": -2.078503131866455, + "logps/rejected": -2.17492938041687, + "loss": 2.7677, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.785030364990234, + "rewards/margins": 0.9642614126205444, + "rewards/rejected": -21.749292373657227, + "step": 16080 + }, + { + "epoch": 0.5421483703528935, + "grad_norm": 66.38001251220703, + "learning_rate": 5.137049548891253e-07, + "logits/chosen": -0.5578786134719849, + "logits/rejected": -0.5402613878250122, + "logps/chosen": -1.9524940252304077, + "logps/rejected": -2.130063533782959, + "loss": 2.1677, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.524944305419922, + "rewards/margins": 1.7756946086883545, + "rewards/rejected": -21.300636291503906, + "step": 16085 + }, + { + "epoch": 0.5423168964238768, + "grad_norm": 129.77603149414062, + "learning_rate": 5.134109284646405e-07, + "logits/chosen": -1.6398814916610718, + "logits/rejected": -1.465562343597412, + "logps/chosen": -2.069758415222168, + "logps/rejected": -2.019371271133423, + "loss": 4.3334, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.697582244873047, + "rewards/margins": -0.5038704872131348, + "rewards/rejected": -20.193714141845703, + "step": 16090 + }, + { + "epoch": 0.54248542249486, + "grad_norm": 29.77696418762207, + "learning_rate": 5.131168973991618e-07, + "logits/chosen": -1.091217279434204, + "logits/rejected": -1.1410750150680542, + "logps/chosen": -2.4660487174987793, + "logps/rejected": -2.612558364868164, + "loss": 2.7581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.66048812866211, + "rewards/margins": 1.4650931358337402, + "rewards/rejected": -26.125579833984375, + "step": 16095 + }, + { + "epoch": 0.5426539485658431, + "grad_norm": 10.7893705368042, + "learning_rate": 5.128228617944418e-07, + "logits/chosen": -1.3055378198623657, + "logits/rejected": -1.4175994396209717, + "logps/chosen": -2.1625494956970215, + "logps/rejected": -2.2795069217681885, + "loss": 3.7809, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.62549591064453, + "rewards/margins": 1.1695719957351685, + "rewards/rejected": -22.795068740844727, + "step": 16100 + }, + { + "epoch": 0.5428224746368263, + "grad_norm": 27.38039207458496, + "learning_rate": 5.125288217522344e-07, + "logits/chosen": -1.5777299404144287, + "logits/rejected": -1.7784534692764282, + "logps/chosen": -2.2568578720092773, + "logps/rejected": -2.5661861896514893, + "loss": 3.0235, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.568580627441406, + "rewards/margins": 3.09328031539917, + "rewards/rejected": -25.6618595123291, + "step": 16105 + }, + { + "epoch": 0.5429910007078095, + "grad_norm": 32.53067398071289, + "learning_rate": 5.122347773742956e-07, + "logits/chosen": -1.8470481634140015, + "logits/rejected": -1.6745433807373047, + "logps/chosen": -2.235819101333618, + "logps/rejected": -2.2479605674743652, + "loss": 3.0407, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.358190536499023, + "rewards/margins": 0.1214146614074707, + "rewards/rejected": -22.479604721069336, + "step": 16110 + }, + { + "epoch": 0.5431595267787926, + "grad_norm": 21.476905822753906, + "learning_rate": 5.11940728762382e-07, + "logits/chosen": -1.3058350086212158, + "logits/rejected": -1.5694831609725952, + "logps/chosen": -2.243704080581665, + "logps/rejected": -2.6150970458984375, + "loss": 1.2742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.437042236328125, + "rewards/margins": 3.7139289379119873, + "rewards/rejected": -26.150970458984375, + "step": 16115 + }, + { + "epoch": 0.5433280528497758, + "grad_norm": 0.27187997102737427, + "learning_rate": 5.116466760182529e-07, + "logits/chosen": -1.6408789157867432, + "logits/rejected": -1.9273831844329834, + "logps/chosen": -1.9865741729736328, + "logps/rejected": -2.6092066764831543, + "loss": 1.2092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.865741729736328, + "rewards/margins": 6.22632360458374, + "rewards/rejected": -26.092065811157227, + "step": 16120 + }, + { + "epoch": 0.5434965789207591, + "grad_norm": 36.5335807800293, + "learning_rate": 5.11352619243668e-07, + "logits/chosen": -1.7007386684417725, + "logits/rejected": -2.2129995822906494, + "logps/chosen": -2.2495627403259277, + "logps/rejected": -2.761093854904175, + "loss": 2.9749, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.495628356933594, + "rewards/margins": 5.115310192108154, + "rewards/rejected": -27.610937118530273, + "step": 16125 + }, + { + "epoch": 0.5436651049917423, + "grad_norm": 35.36635208129883, + "learning_rate": 5.11058558540389e-07, + "logits/chosen": -0.7347787618637085, + "logits/rejected": -1.0197536945343018, + "logps/chosen": -1.9708576202392578, + "logps/rejected": -2.1755924224853516, + "loss": 2.4735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.708574295043945, + "rewards/margins": 2.047349452972412, + "rewards/rejected": -21.755924224853516, + "step": 16130 + }, + { + "epoch": 0.5438336310627254, + "grad_norm": 24.370132446289062, + "learning_rate": 5.107644940101784e-07, + "logits/chosen": -0.6877826452255249, + "logits/rejected": -0.806088924407959, + "logps/chosen": -2.206653118133545, + "logps/rejected": -2.56797456741333, + "loss": 1.1178, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.066530227661133, + "rewards/margins": 3.6132149696350098, + "rewards/rejected": -25.67974281311035, + "step": 16135 + }, + { + "epoch": 0.5440021571337086, + "grad_norm": 16.870189666748047, + "learning_rate": 5.104704257548005e-07, + "logits/chosen": -1.4101765155792236, + "logits/rejected": -1.325073480606079, + "logps/chosen": -2.379502534866333, + "logps/rejected": -2.5285820960998535, + "loss": 2.683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.795024871826172, + "rewards/margins": 1.4907970428466797, + "rewards/rejected": -25.28582191467285, + "step": 16140 + }, + { + "epoch": 0.5441706832046918, + "grad_norm": 46.246456146240234, + "learning_rate": 5.101763538760209e-07, + "logits/chosen": -1.5839884281158447, + "logits/rejected": -1.957201600074768, + "logps/chosen": -1.852923035621643, + "logps/rejected": -2.072567939758301, + "loss": 2.7152, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.52923011779785, + "rewards/margins": 2.196450710296631, + "rewards/rejected": -20.72568130493164, + "step": 16145 + }, + { + "epoch": 0.5443392092756749, + "grad_norm": 97.65252685546875, + "learning_rate": 5.098822784756061e-07, + "logits/chosen": -1.8456027507781982, + "logits/rejected": -1.7750129699707031, + "logps/chosen": -2.749521255493164, + "logps/rejected": -2.648500919342041, + "loss": 5.1515, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.495208740234375, + "rewards/margins": -1.0102026462554932, + "rewards/rejected": -26.48500633239746, + "step": 16150 + }, + { + "epoch": 0.5445077353466581, + "grad_norm": 80.65924072265625, + "learning_rate": 5.095881996553242e-07, + "logits/chosen": -1.3277708292007446, + "logits/rejected": -1.4319045543670654, + "logps/chosen": -1.9848464727401733, + "logps/rejected": -1.9942963123321533, + "loss": 3.5708, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.848468780517578, + "rewards/margins": 0.09449663013219833, + "rewards/rejected": -19.942962646484375, + "step": 16155 + }, + { + "epoch": 0.5446762614176414, + "grad_norm": 54.68977355957031, + "learning_rate": 5.09294117516944e-07, + "logits/chosen": -1.5335499048233032, + "logits/rejected": -1.610508680343628, + "logps/chosen": -2.8308680057525635, + "logps/rejected": -3.0489935874938965, + "loss": 2.3802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.30868148803711, + "rewards/margins": 2.181258201599121, + "rewards/rejected": -30.489938735961914, + "step": 16160 + }, + { + "epoch": 0.5448447874886245, + "grad_norm": 27.68987464904785, + "learning_rate": 5.090000321622358e-07, + "logits/chosen": -1.3854801654815674, + "logits/rejected": -1.203540563583374, + "logps/chosen": -1.650176763534546, + "logps/rejected": -2.056550979614258, + "loss": 2.6758, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.501766204833984, + "rewards/margins": 4.063741207122803, + "rewards/rejected": -20.565509796142578, + "step": 16165 + }, + { + "epoch": 0.5450133135596077, + "grad_norm": 54.86668014526367, + "learning_rate": 5.087059436929714e-07, + "logits/chosen": -0.4492467939853668, + "logits/rejected": -0.6218141317367554, + "logps/chosen": -2.3843488693237305, + "logps/rejected": -2.4283106327056885, + "loss": 3.0914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.843486785888672, + "rewards/margins": 0.4396181106567383, + "rewards/rejected": -24.28310775756836, + "step": 16170 + }, + { + "epoch": 0.5451818396305909, + "grad_norm": 21.276992797851562, + "learning_rate": 5.084118522109225e-07, + "logits/chosen": -1.6168181896209717, + "logits/rejected": -1.83038330078125, + "logps/chosen": -1.897952675819397, + "logps/rejected": -2.1104540824890137, + "loss": 2.5915, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.97952651977539, + "rewards/margins": 2.1250123977661133, + "rewards/rejected": -21.104537963867188, + "step": 16175 + }, + { + "epoch": 0.545350365701574, + "grad_norm": 31.154457092285156, + "learning_rate": 5.081177578178632e-07, + "logits/chosen": -2.0428037643432617, + "logits/rejected": -2.1841020584106445, + "logps/chosen": -2.541215181350708, + "logps/rejected": -3.1152877807617188, + "loss": 2.5159, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.412151336669922, + "rewards/margins": 5.740725994110107, + "rewards/rejected": -31.152877807617188, + "step": 16180 + }, + { + "epoch": 0.5455188917725572, + "grad_norm": 25.28038787841797, + "learning_rate": 5.078236606155677e-07, + "logits/chosen": -1.2056846618652344, + "logits/rejected": -1.2339636087417603, + "logps/chosen": -2.2586770057678223, + "logps/rejected": -2.537292957305908, + "loss": 2.1905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.586769104003906, + "rewards/margins": 2.786158800125122, + "rewards/rejected": -25.372928619384766, + "step": 16185 + }, + { + "epoch": 0.5456874178435404, + "grad_norm": 74.27156066894531, + "learning_rate": 5.075295607058116e-07, + "logits/chosen": -1.1293174028396606, + "logits/rejected": -1.1770614385604858, + "logps/chosen": -2.194941759109497, + "logps/rejected": -2.2006897926330566, + "loss": 3.2653, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.949419021606445, + "rewards/margins": 0.05747966840863228, + "rewards/rejected": -22.00689697265625, + "step": 16190 + }, + { + "epoch": 0.5458559439145235, + "grad_norm": 63.31100082397461, + "learning_rate": 5.072354581903709e-07, + "logits/chosen": -1.2766873836517334, + "logits/rejected": -1.6367841958999634, + "logps/chosen": -1.8417125940322876, + "logps/rejected": -1.7709461450576782, + "loss": 3.8441, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.417125701904297, + "rewards/margins": -0.7076643705368042, + "rewards/rejected": -17.709461212158203, + "step": 16195 + }, + { + "epoch": 0.5460244699855068, + "grad_norm": 0.0952446460723877, + "learning_rate": 5.069413531710235e-07, + "logits/chosen": -1.78704833984375, + "logits/rejected": -1.8815076351165771, + "logps/chosen": -2.5174808502197266, + "logps/rejected": -3.0017216205596924, + "loss": 1.5106, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.174808502197266, + "rewards/margins": 4.842409610748291, + "rewards/rejected": -30.0172176361084, + "step": 16200 + }, + { + "epoch": 0.54619299605649, + "grad_norm": 4.080849647521973, + "learning_rate": 5.066472457495471e-07, + "logits/chosen": -1.1902334690093994, + "logits/rejected": -1.4749139547348022, + "logps/chosen": -1.9703378677368164, + "logps/rejected": -2.3434581756591797, + "loss": 1.7, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.70337677001953, + "rewards/margins": 3.731203556060791, + "rewards/rejected": -23.434581756591797, + "step": 16205 + }, + { + "epoch": 0.5463615221274731, + "grad_norm": 19.52630615234375, + "learning_rate": 5.063531360277209e-07, + "logits/chosen": -1.5813535451889038, + "logits/rejected": -1.4736263751983643, + "logps/chosen": -1.56548011302948, + "logps/rejected": -1.4851534366607666, + "loss": 4.1395, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -15.654800415039062, + "rewards/margins": -0.8032673597335815, + "rewards/rejected": -14.851534843444824, + "step": 16210 + }, + { + "epoch": 0.5465300481984563, + "grad_norm": 30.350008010864258, + "learning_rate": 5.060590241073245e-07, + "logits/chosen": -1.7225860357284546, + "logits/rejected": -1.749176263809204, + "logps/chosen": -2.0334715843200684, + "logps/rejected": -2.5191073417663574, + "loss": 1.8731, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.334712982177734, + "rewards/margins": 4.856356620788574, + "rewards/rejected": -25.191070556640625, + "step": 16215 + }, + { + "epoch": 0.5466985742694395, + "grad_norm": 53.04943084716797, + "learning_rate": 5.057649100901386e-07, + "logits/chosen": -1.5750441551208496, + "logits/rejected": -1.8706716299057007, + "logps/chosen": -1.9647775888442993, + "logps/rejected": -2.0691940784454346, + "loss": 2.207, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.647777557373047, + "rewards/margins": 1.0441657304763794, + "rewards/rejected": -20.69194221496582, + "step": 16220 + }, + { + "epoch": 0.5468671003404226, + "grad_norm": 16.87798500061035, + "learning_rate": 5.054707940779446e-07, + "logits/chosen": -1.5316976308822632, + "logits/rejected": -1.7342464923858643, + "logps/chosen": -2.130229949951172, + "logps/rejected": -2.5026209354400635, + "loss": 2.1972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.302297592163086, + "rewards/margins": 3.7239105701446533, + "rewards/rejected": -25.02621078491211, + "step": 16225 + }, + { + "epoch": 0.5470356264114058, + "grad_norm": 63.59160232543945, + "learning_rate": 5.051766761725241e-07, + "logits/chosen": -1.3253222703933716, + "logits/rejected": -1.2189723253250122, + "logps/chosen": -2.0285019874572754, + "logps/rejected": -1.908988356590271, + "loss": 4.2631, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.28502082824707, + "rewards/margins": -1.1951375007629395, + "rewards/rejected": -19.08988380432129, + "step": 16230 + }, + { + "epoch": 0.5472041524823891, + "grad_norm": 92.26155090332031, + "learning_rate": 5.048825564756601e-07, + "logits/chosen": -1.8079001903533936, + "logits/rejected": -1.8996975421905518, + "logps/chosen": -2.1402010917663574, + "logps/rejected": -2.247673749923706, + "loss": 2.2508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.40201187133789, + "rewards/margins": 1.0747264623641968, + "rewards/rejected": -22.47673797607422, + "step": 16235 + }, + { + "epoch": 0.5473726785533722, + "grad_norm": 69.60755157470703, + "learning_rate": 5.045884350891356e-07, + "logits/chosen": -1.0289746522903442, + "logits/rejected": -1.0647555589675903, + "logps/chosen": -2.0716910362243652, + "logps/rejected": -2.075446605682373, + "loss": 3.2364, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.716907501220703, + "rewards/margins": 0.03755836561322212, + "rewards/rejected": -20.754467010498047, + "step": 16240 + }, + { + "epoch": 0.5475412046243554, + "grad_norm": 31.6566219329834, + "learning_rate": 5.042943121147345e-07, + "logits/chosen": -1.6412330865859985, + "logits/rejected": -2.096470355987549, + "logps/chosen": -2.3620076179504395, + "logps/rejected": -3.1981656551361084, + "loss": 1.9026, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.620075225830078, + "rewards/margins": 8.361583709716797, + "rewards/rejected": -31.981658935546875, + "step": 16245 + }, + { + "epoch": 0.5477097306953386, + "grad_norm": 19.45050811767578, + "learning_rate": 5.040001876542413e-07, + "logits/chosen": -1.898911714553833, + "logits/rejected": -1.8914964199066162, + "logps/chosen": -1.802631139755249, + "logps/rejected": -1.9166710376739502, + "loss": 2.6816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.02631187438965, + "rewards/margins": 1.1403964757919312, + "rewards/rejected": -19.166709899902344, + "step": 16250 + }, + { + "epoch": 0.5478782567663217, + "grad_norm": 26.124082565307617, + "learning_rate": 5.037060618094406e-07, + "logits/chosen": -1.0378376245498657, + "logits/rejected": -1.0364210605621338, + "logps/chosen": -2.5162196159362793, + "logps/rejected": -2.6829471588134766, + "loss": 3.888, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.16219711303711, + "rewards/margins": 1.6672769784927368, + "rewards/rejected": -26.8294734954834, + "step": 16255 + }, + { + "epoch": 0.5480467828373049, + "grad_norm": 24.959684371948242, + "learning_rate": 5.034119346821179e-07, + "logits/chosen": -1.591496229171753, + "logits/rejected": -1.5482546091079712, + "logps/chosen": -2.5285449028015137, + "logps/rejected": -2.6545941829681396, + "loss": 3.1635, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.285449981689453, + "rewards/margins": 1.2604939937591553, + "rewards/rejected": -26.545940399169922, + "step": 16260 + }, + { + "epoch": 0.5482153089082881, + "grad_norm": 25.58453369140625, + "learning_rate": 5.031178063740591e-07, + "logits/chosen": -1.7127612829208374, + "logits/rejected": -1.9984369277954102, + "logps/chosen": -2.3069615364074707, + "logps/rejected": -2.550511121749878, + "loss": 2.1247, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.06961441040039, + "rewards/margins": 2.4354963302612305, + "rewards/rejected": -25.505109786987305, + "step": 16265 + }, + { + "epoch": 0.5483838349792713, + "grad_norm": 13.465657234191895, + "learning_rate": 5.028236769870503e-07, + "logits/chosen": -1.6997343301773071, + "logits/rejected": -1.6074409484863281, + "logps/chosen": -2.0852694511413574, + "logps/rejected": -2.3273866176605225, + "loss": 1.6164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.85269546508789, + "rewards/margins": 2.4211738109588623, + "rewards/rejected": -23.273868560791016, + "step": 16270 + }, + { + "epoch": 0.5485523610502545, + "grad_norm": 6.806887149810791, + "learning_rate": 5.025295466228782e-07, + "logits/chosen": -2.103903293609619, + "logits/rejected": -2.28928804397583, + "logps/chosen": -2.131436824798584, + "logps/rejected": -2.859297275543213, + "loss": 1.2612, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.314367294311523, + "rewards/margins": 7.278607368469238, + "rewards/rejected": -28.592975616455078, + "step": 16275 + }, + { + "epoch": 0.5487208871212377, + "grad_norm": 28.80763053894043, + "learning_rate": 5.022354153833296e-07, + "logits/chosen": -1.6582714319229126, + "logits/rejected": -1.998355507850647, + "logps/chosen": -2.045281410217285, + "logps/rejected": -2.201249599456787, + "loss": 2.4043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.452816009521484, + "rewards/margins": 1.5596843957901, + "rewards/rejected": -22.012500762939453, + "step": 16280 + }, + { + "epoch": 0.5488894131922208, + "grad_norm": 23.410083770751953, + "learning_rate": 5.019412833701917e-07, + "logits/chosen": -1.7809860706329346, + "logits/rejected": -1.9740571975708008, + "logps/chosen": -1.9353249073028564, + "logps/rejected": -1.9824997186660767, + "loss": 2.9308, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.353246688842773, + "rewards/margins": 0.4717481732368469, + "rewards/rejected": -19.824996948242188, + "step": 16285 + }, + { + "epoch": 0.549057939263204, + "grad_norm": 34.78386688232422, + "learning_rate": 5.016471506852522e-07, + "logits/chosen": -1.3661186695098877, + "logits/rejected": -1.5314580202102661, + "logps/chosen": -1.7690000534057617, + "logps/rejected": -1.9127906560897827, + "loss": 2.3856, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.690000534057617, + "rewards/margins": 1.4379034042358398, + "rewards/rejected": -19.127904891967773, + "step": 16290 + }, + { + "epoch": 0.5492264653341872, + "grad_norm": 24.4421329498291, + "learning_rate": 5.013530174302989e-07, + "logits/chosen": -1.5516514778137207, + "logits/rejected": -1.7300994396209717, + "logps/chosen": -2.3746907711029053, + "logps/rejected": -2.7230477333068848, + "loss": 2.4286, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.746906280517578, + "rewards/margins": 3.4835681915283203, + "rewards/rejected": -27.2304744720459, + "step": 16295 + }, + { + "epoch": 0.5493949914051703, + "grad_norm": 13.170940399169922, + "learning_rate": 5.010588837071196e-07, + "logits/chosen": -1.0385621786117554, + "logits/rejected": -1.3224549293518066, + "logps/chosen": -2.315553665161133, + "logps/rejected": -2.626375913619995, + "loss": 2.0412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.155536651611328, + "rewards/margins": 3.108222723007202, + "rewards/rejected": -26.26375961303711, + "step": 16300 + }, + { + "epoch": 0.5495635174761535, + "grad_norm": 75.63530731201172, + "learning_rate": 5.007647496175021e-07, + "logits/chosen": -1.0984992980957031, + "logits/rejected": -1.217091679573059, + "logps/chosen": -3.2464778423309326, + "logps/rejected": -3.294538974761963, + "loss": 4.7708, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -32.464778900146484, + "rewards/margins": 0.4806079864501953, + "rewards/rejected": -32.94538879394531, + "step": 16305 + }, + { + "epoch": 0.5497320435471368, + "grad_norm": 39.796756744384766, + "learning_rate": 5.004706152632351e-07, + "logits/chosen": -1.3322203159332275, + "logits/rejected": -1.4762696027755737, + "logps/chosen": -2.046513319015503, + "logps/rejected": -2.0476295948028564, + "loss": 3.142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.465133666992188, + "rewards/margins": 0.011162233538925648, + "rewards/rejected": -20.476295471191406, + "step": 16310 + }, + { + "epoch": 0.54990056961812, + "grad_norm": 38.28547668457031, + "learning_rate": 5.001764807461065e-07, + "logits/chosen": -1.0382802486419678, + "logits/rejected": -1.4499540328979492, + "logps/chosen": -2.783287763595581, + "logps/rejected": -2.572631359100342, + "loss": 5.7225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.832876205444336, + "rewards/margins": -2.1065640449523926, + "rewards/rejected": -25.7263126373291, + "step": 16315 + }, + { + "epoch": 0.5500690956891031, + "grad_norm": 24.27626609802246, + "learning_rate": 4.998823461679051e-07, + "logits/chosen": -1.0992909669876099, + "logits/rejected": -1.2056782245635986, + "logps/chosen": -2.2541472911834717, + "logps/rejected": -2.2591731548309326, + "loss": 3.6655, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.54146957397461, + "rewards/margins": 0.05026273801922798, + "rewards/rejected": -22.591732025146484, + "step": 16320 + }, + { + "epoch": 0.5502376217600863, + "grad_norm": 1.48041832447052, + "learning_rate": 4.995882116304189e-07, + "logits/chosen": -1.4299277067184448, + "logits/rejected": -1.3262441158294678, + "logps/chosen": -2.3866114616394043, + "logps/rejected": -2.650369644165039, + "loss": 2.7072, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.86611557006836, + "rewards/margins": 2.6375796794891357, + "rewards/rejected": -26.50369644165039, + "step": 16325 + }, + { + "epoch": 0.5504061478310694, + "grad_norm": 31.15922737121582, + "learning_rate": 4.992940772354364e-07, + "logits/chosen": -1.0533661842346191, + "logits/rejected": -1.3468220233917236, + "logps/chosen": -2.4013590812683105, + "logps/rejected": -2.3216609954833984, + "loss": 5.1866, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.013591766357422, + "rewards/margins": -0.7969821691513062, + "rewards/rejected": -23.216609954833984, + "step": 16330 + }, + { + "epoch": 0.5505746739020526, + "grad_norm": 31.58074188232422, + "learning_rate": 4.989999430847463e-07, + "logits/chosen": -2.013664722442627, + "logits/rejected": -1.8949086666107178, + "logps/chosen": -1.9614967107772827, + "logps/rejected": -2.0524280071258545, + "loss": 2.6342, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.614965438842773, + "rewards/margins": 0.9093145132064819, + "rewards/rejected": -20.524280548095703, + "step": 16335 + }, + { + "epoch": 0.5507431999730358, + "grad_norm": 31.68235206604004, + "learning_rate": 4.987058092801361e-07, + "logits/chosen": -1.184633493423462, + "logits/rejected": -1.1958637237548828, + "logps/chosen": -1.8362147808074951, + "logps/rejected": -1.8909202814102173, + "loss": 2.7622, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.36214828491211, + "rewards/margins": 0.547053337097168, + "rewards/rejected": -18.909204483032227, + "step": 16340 + }, + { + "epoch": 0.5509117260440191, + "grad_norm": 26.33146858215332, + "learning_rate": 4.984116759233944e-07, + "logits/chosen": -1.4128179550170898, + "logits/rejected": -1.6827160120010376, + "logps/chosen": -2.2236125469207764, + "logps/rejected": -3.1885323524475098, + "loss": 2.1096, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.236125946044922, + "rewards/margins": 9.649200439453125, + "rewards/rejected": -31.885326385498047, + "step": 16345 + }, + { + "epoch": 0.5510802521150022, + "grad_norm": 37.639930725097656, + "learning_rate": 4.981175431163092e-07, + "logits/chosen": -1.5246819257736206, + "logits/rejected": -1.458280324935913, + "logps/chosen": -2.300814151763916, + "logps/rejected": -2.4303762912750244, + "loss": 2.9189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.00814437866211, + "rewards/margins": 1.2956197261810303, + "rewards/rejected": -24.30376434326172, + "step": 16350 + }, + { + "epoch": 0.5512487781859854, + "grad_norm": 33.55547332763672, + "learning_rate": 4.978234109606681e-07, + "logits/chosen": -1.6795823574066162, + "logits/rejected": -1.840654730796814, + "logps/chosen": -2.044466733932495, + "logps/rejected": -2.403834819793701, + "loss": 1.9782, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.44466781616211, + "rewards/margins": 3.5936789512634277, + "rewards/rejected": -24.038349151611328, + "step": 16355 + }, + { + "epoch": 0.5514173042569686, + "grad_norm": 24.78546714782715, + "learning_rate": 4.975292795582588e-07, + "logits/chosen": -1.0123234987258911, + "logits/rejected": -1.229949712753296, + "logps/chosen": -2.1764540672302246, + "logps/rejected": -2.229409694671631, + "loss": 3.0214, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.764545440673828, + "rewards/margins": 0.529554009437561, + "rewards/rejected": -22.294097900390625, + "step": 16360 + }, + { + "epoch": 0.5515858303279517, + "grad_norm": 18.02266502380371, + "learning_rate": 4.972351490108683e-07, + "logits/chosen": -1.6163885593414307, + "logits/rejected": -1.8083269596099854, + "logps/chosen": -2.210312604904175, + "logps/rejected": -2.2474353313446045, + "loss": 4.8088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.103126525878906, + "rewards/margins": 0.37122592329978943, + "rewards/rejected": -22.474353790283203, + "step": 16365 + }, + { + "epoch": 0.5517543563989349, + "grad_norm": 15.657951354980469, + "learning_rate": 4.96941019420284e-07, + "logits/chosen": -1.1342637538909912, + "logits/rejected": -1.3603713512420654, + "logps/chosen": -2.810260772705078, + "logps/rejected": -2.916231155395508, + "loss": 3.2622, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.102609634399414, + "rewards/margins": 1.0597028732299805, + "rewards/rejected": -29.162311553955078, + "step": 16370 + }, + { + "epoch": 0.551922882469918, + "grad_norm": 37.477928161621094, + "learning_rate": 4.966468908882921e-07, + "logits/chosen": -1.0693776607513428, + "logits/rejected": -1.183394193649292, + "logps/chosen": -2.5060973167419434, + "logps/rejected": -2.6473300457000732, + "loss": 2.2649, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.06097412109375, + "rewards/margins": 1.4123274087905884, + "rewards/rejected": -26.47330093383789, + "step": 16375 + }, + { + "epoch": 0.5520914085409013, + "grad_norm": 34.93418884277344, + "learning_rate": 4.963527635166793e-07, + "logits/chosen": -1.20332670211792, + "logits/rejected": -1.1986877918243408, + "logps/chosen": -2.035940408706665, + "logps/rejected": -2.0706629753112793, + "loss": 2.7629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.359403610229492, + "rewards/margins": 0.34722432494163513, + "rewards/rejected": -20.706628799438477, + "step": 16380 + }, + { + "epoch": 0.5522599346118845, + "grad_norm": 23.54374885559082, + "learning_rate": 4.960586374072316e-07, + "logits/chosen": -1.2519137859344482, + "logits/rejected": -1.3859598636627197, + "logps/chosen": -2.3384459018707275, + "logps/rejected": -3.0998260974884033, + "loss": 1.8127, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.38446044921875, + "rewards/margins": 7.6138014793396, + "rewards/rejected": -30.998260498046875, + "step": 16385 + }, + { + "epoch": 0.5524284606828677, + "grad_norm": 24.325088500976562, + "learning_rate": 4.957645126617339e-07, + "logits/chosen": -1.4304975271224976, + "logits/rejected": -1.6704849004745483, + "logps/chosen": -2.214526653289795, + "logps/rejected": -2.578474521636963, + "loss": 1.8956, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.145265579223633, + "rewards/margins": 3.6394755840301514, + "rewards/rejected": -25.784744262695312, + "step": 16390 + }, + { + "epoch": 0.5525969867538508, + "grad_norm": 4.897208213806152, + "learning_rate": 4.954703893819715e-07, + "logits/chosen": -1.407099962234497, + "logits/rejected": -1.4926787614822388, + "logps/chosen": -2.6405577659606934, + "logps/rejected": -3.410717487335205, + "loss": 2.6697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.405574798583984, + "rewards/margins": 7.701594352722168, + "rewards/rejected": -34.10717010498047, + "step": 16395 + }, + { + "epoch": 0.552765512824834, + "grad_norm": 57.99406051635742, + "learning_rate": 4.951762676697292e-07, + "logits/chosen": -1.7110874652862549, + "logits/rejected": -1.8076190948486328, + "logps/chosen": -2.230250835418701, + "logps/rejected": -2.1426777839660645, + "loss": 4.0587, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.302507400512695, + "rewards/margins": -0.8757309913635254, + "rewards/rejected": -21.426776885986328, + "step": 16400 + }, + { + "epoch": 0.552765512824834, + "eval_logits/chosen": -1.862671971321106, + "eval_logits/rejected": -2.0018630027770996, + "eval_logps/chosen": -2.0881171226501465, + "eval_logps/rejected": -2.213587760925293, + "eval_loss": 2.9925146102905273, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -20.88117218017578, + "eval_rewards/margins": 1.2547067403793335, + "eval_rewards/rejected": -22.135875701904297, + "eval_runtime": 12.916, + "eval_samples_per_second": 7.742, + "eval_steps_per_second": 1.936, + "step": 16400 + }, + { + "epoch": 0.5529340388958172, + "grad_norm": 30.2686824798584, + "learning_rate": 4.948821476267902e-07, + "logits/chosen": -1.4786027669906616, + "logits/rejected": -1.5657284259796143, + "logps/chosen": -2.48724627494812, + "logps/rejected": -3.2754569053649902, + "loss": 1.9817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.87246322631836, + "rewards/margins": 7.882106781005859, + "rewards/rejected": -32.75457000732422, + "step": 16405 + }, + { + "epoch": 0.5531025649668003, + "grad_norm": 26.35026741027832, + "learning_rate": 4.945880293549384e-07, + "logits/chosen": -1.3459535837173462, + "logits/rejected": -1.4547795057296753, + "logps/chosen": -1.9763944149017334, + "logps/rejected": -1.8858134746551514, + "loss": 4.0438, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.763944625854492, + "rewards/margins": -0.9058086276054382, + "rewards/rejected": -18.858135223388672, + "step": 16410 + }, + { + "epoch": 0.5532710910377835, + "grad_norm": 10.583443641662598, + "learning_rate": 4.942939129559564e-07, + "logits/chosen": -1.0191407203674316, + "logits/rejected": -1.2290555238723755, + "logps/chosen": -2.1333813667297363, + "logps/rejected": -2.4551522731781006, + "loss": 1.9698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.333812713623047, + "rewards/margins": 3.217708110809326, + "rewards/rejected": -24.5515193939209, + "step": 16415 + }, + { + "epoch": 0.5534396171087668, + "grad_norm": 69.04009246826172, + "learning_rate": 4.939997985316265e-07, + "logits/chosen": -1.6069825887680054, + "logits/rejected": -1.5675886869430542, + "logps/chosen": -1.7600486278533936, + "logps/rejected": -2.053833484649658, + "loss": 1.7592, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.600486755371094, + "rewards/margins": 2.9378466606140137, + "rewards/rejected": -20.538333892822266, + "step": 16420 + }, + { + "epoch": 0.5536081431797499, + "grad_norm": 30.52703285217285, + "learning_rate": 4.937056861837298e-07, + "logits/chosen": -1.2858994007110596, + "logits/rejected": -1.4342375993728638, + "logps/chosen": -2.0015480518341064, + "logps/rejected": -2.1410858631134033, + "loss": 2.2274, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.015480041503906, + "rewards/margins": 1.3953787088394165, + "rewards/rejected": -21.410858154296875, + "step": 16425 + }, + { + "epoch": 0.5537766692507331, + "grad_norm": 23.32746696472168, + "learning_rate": 4.934115760140472e-07, + "logits/chosen": -1.1391804218292236, + "logits/rejected": -1.6687994003295898, + "logps/chosen": -2.4570107460021973, + "logps/rejected": -3.014371395111084, + "loss": 1.5547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.57010841369629, + "rewards/margins": 5.573606014251709, + "rewards/rejected": -30.143712997436523, + "step": 16430 + }, + { + "epoch": 0.5539451953217163, + "grad_norm": 29.025510787963867, + "learning_rate": 4.931174681243586e-07, + "logits/chosen": -1.0708847045898438, + "logits/rejected": -1.4539520740509033, + "logps/chosen": -1.974205732345581, + "logps/rejected": -2.124587059020996, + "loss": 2.2244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.74205780029297, + "rewards/margins": 1.5038119554519653, + "rewards/rejected": -21.24587059020996, + "step": 16435 + }, + { + "epoch": 0.5541137213926994, + "grad_norm": 33.02252197265625, + "learning_rate": 4.928233626164428e-07, + "logits/chosen": -1.5927103757858276, + "logits/rejected": -2.0426902770996094, + "logps/chosen": -2.281912326812744, + "logps/rejected": -2.7135491371154785, + "loss": 1.8576, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.819122314453125, + "rewards/margins": 4.316366672515869, + "rewards/rejected": -27.135488510131836, + "step": 16440 + }, + { + "epoch": 0.5542822474636826, + "grad_norm": 26.73567008972168, + "learning_rate": 4.925292595920787e-07, + "logits/chosen": -1.7700579166412354, + "logits/rejected": -1.8333972692489624, + "logps/chosen": -1.8393447399139404, + "logps/rejected": -2.0610852241516113, + "loss": 2.1441, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.393447875976562, + "rewards/margins": 2.2174034118652344, + "rewards/rejected": -20.610851287841797, + "step": 16445 + }, + { + "epoch": 0.5544507735346658, + "grad_norm": 26.375341415405273, + "learning_rate": 4.922351591530434e-07, + "logits/chosen": -1.185505747795105, + "logits/rejected": -1.398880958557129, + "logps/chosen": -2.5037283897399902, + "logps/rejected": -2.6086947917938232, + "loss": 3.5968, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.03728485107422, + "rewards/margins": 1.0496633052825928, + "rewards/rejected": -26.08694839477539, + "step": 16450 + }, + { + "epoch": 0.554619299605649, + "grad_norm": 130.64976501464844, + "learning_rate": 4.919410614011138e-07, + "logits/chosen": -1.4663383960723877, + "logits/rejected": -2.1934359073638916, + "logps/chosen": -2.6172523498535156, + "logps/rejected": -2.5515918731689453, + "loss": 5.0914, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.17252540588379, + "rewards/margins": -0.6566047668457031, + "rewards/rejected": -25.515918731689453, + "step": 16455 + }, + { + "epoch": 0.5547878256766322, + "grad_norm": 20.96159553527832, + "learning_rate": 4.916469664380652e-07, + "logits/chosen": -1.6890236139297485, + "logits/rejected": -1.9183467626571655, + "logps/chosen": -2.627854824066162, + "logps/rejected": -2.8655261993408203, + "loss": 4.5326, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.278553009033203, + "rewards/margins": 2.3767104148864746, + "rewards/rejected": -28.655261993408203, + "step": 16460 + }, + { + "epoch": 0.5549563517476154, + "grad_norm": 0.08136291056871414, + "learning_rate": 4.913528743656724e-07, + "logits/chosen": -1.2650330066680908, + "logits/rejected": -1.2096878290176392, + "logps/chosen": -2.059722661972046, + "logps/rejected": -2.423049211502075, + "loss": 2.4642, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.597225189208984, + "rewards/margins": 3.6332650184631348, + "rewards/rejected": -24.230493545532227, + "step": 16465 + }, + { + "epoch": 0.5551248778185985, + "grad_norm": 10.423648834228516, + "learning_rate": 4.910587852857093e-07, + "logits/chosen": -1.5063730478286743, + "logits/rejected": -1.798951506614685, + "logps/chosen": -2.671128034591675, + "logps/rejected": -3.014394521713257, + "loss": 2.0443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.711278915405273, + "rewards/margins": 3.432664155960083, + "rewards/rejected": -30.143945693969727, + "step": 16470 + }, + { + "epoch": 0.5552934038895817, + "grad_norm": 23.890661239624023, + "learning_rate": 4.907646992999481e-07, + "logits/chosen": -1.442610740661621, + "logits/rejected": -1.9095418453216553, + "logps/chosen": -2.0867397785186768, + "logps/rejected": -2.4975295066833496, + "loss": 1.7156, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.86739730834961, + "rewards/margins": 4.1078996658325195, + "rewards/rejected": -24.975296020507812, + "step": 16475 + }, + { + "epoch": 0.5554619299605649, + "grad_norm": 45.274497985839844, + "learning_rate": 4.904706165101607e-07, + "logits/chosen": -1.3336973190307617, + "logits/rejected": -1.534952163696289, + "logps/chosen": -3.592500686645508, + "logps/rejected": -3.7593085765838623, + "loss": 2.1646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.92500686645508, + "rewards/margins": 1.6680819988250732, + "rewards/rejected": -37.59308624267578, + "step": 16480 + }, + { + "epoch": 0.555630456031548, + "grad_norm": 29.72709083557129, + "learning_rate": 4.901765370181174e-07, + "logits/chosen": -1.1310837268829346, + "logits/rejected": -1.2525126934051514, + "logps/chosen": -1.883829116821289, + "logps/rejected": -1.9360030889511108, + "loss": 2.8968, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.83829116821289, + "rewards/margins": 0.5217410326004028, + "rewards/rejected": -19.360031127929688, + "step": 16485 + }, + { + "epoch": 0.5557989821025313, + "grad_norm": 35.20262908935547, + "learning_rate": 4.898824609255879e-07, + "logits/chosen": -1.30862295627594, + "logits/rejected": -1.5366251468658447, + "logps/chosen": -2.2552974224090576, + "logps/rejected": -2.888899087905884, + "loss": 2.4783, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.552974700927734, + "rewards/margins": 6.336014747619629, + "rewards/rejected": -28.888988494873047, + "step": 16490 + }, + { + "epoch": 0.5559675081735145, + "grad_norm": 63.60124588012695, + "learning_rate": 4.895883883343398e-07, + "logits/chosen": -1.5948011875152588, + "logits/rejected": -1.6446367502212524, + "logps/chosen": -2.000032663345337, + "logps/rejected": -2.2590491771698, + "loss": 1.9738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.00032615661621, + "rewards/margins": 2.5901684761047363, + "rewards/rejected": -22.59049415588379, + "step": 16495 + }, + { + "epoch": 0.5561360342444976, + "grad_norm": 68.49860382080078, + "learning_rate": 4.892943193461403e-07, + "logits/chosen": -1.2705062627792358, + "logits/rejected": -1.4505457878112793, + "logps/chosen": -1.9772846698760986, + "logps/rejected": -1.9667972326278687, + "loss": 3.8444, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.77284812927246, + "rewards/margins": -0.10487423092126846, + "rewards/rejected": -19.667972564697266, + "step": 16500 + }, + { + "epoch": 0.5563045603154808, + "grad_norm": 29.31913948059082, + "learning_rate": 4.890002540627552e-07, + "logits/chosen": -1.715620756149292, + "logits/rejected": -1.643994688987732, + "logps/chosen": -2.7153449058532715, + "logps/rejected": -2.9860455989837646, + "loss": 2.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.1534481048584, + "rewards/margins": 2.7070071697235107, + "rewards/rejected": -29.860454559326172, + "step": 16505 + }, + { + "epoch": 0.556473086386464, + "grad_norm": 14.820429801940918, + "learning_rate": 4.887061925859487e-07, + "logits/chosen": -1.5017743110656738, + "logits/rejected": -1.7025244235992432, + "logps/chosen": -2.3334708213806152, + "logps/rejected": -2.7292990684509277, + "loss": 2.9727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.334707260131836, + "rewards/margins": 3.9582855701446533, + "rewards/rejected": -27.29298973083496, + "step": 16510 + }, + { + "epoch": 0.5566416124574471, + "grad_norm": 22.817903518676758, + "learning_rate": 4.88412135017484e-07, + "logits/chosen": -0.9818054437637329, + "logits/rejected": -1.0674546957015991, + "logps/chosen": -2.76274037361145, + "logps/rejected": -2.9394752979278564, + "loss": 2.6797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.627399444580078, + "rewards/margins": 1.767350435256958, + "rewards/rejected": -29.394750595092773, + "step": 16515 + }, + { + "epoch": 0.5568101385284303, + "grad_norm": 23.144840240478516, + "learning_rate": 4.881180814591226e-07, + "logits/chosen": -1.8649994134902954, + "logits/rejected": -1.4928921461105347, + "logps/chosen": -2.244431972503662, + "logps/rejected": -2.0774097442626953, + "loss": 4.7936, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.444316864013672, + "rewards/margins": -1.6702207326889038, + "rewards/rejected": -20.77409553527832, + "step": 16520 + }, + { + "epoch": 0.5569786645994135, + "grad_norm": 25.49403190612793, + "learning_rate": 4.878240320126256e-07, + "logits/chosen": -2.3358254432678223, + "logits/rejected": -2.589106798171997, + "logps/chosen": -2.41384220123291, + "logps/rejected": -2.698692798614502, + "loss": 2.5465, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.1384220123291, + "rewards/margins": 2.8485052585601807, + "rewards/rejected": -26.986928939819336, + "step": 16525 + }, + { + "epoch": 0.5571471906703968, + "grad_norm": 30.329010009765625, + "learning_rate": 4.87529986779751e-07, + "logits/chosen": -1.149228572845459, + "logits/rejected": -1.606133222579956, + "logps/chosen": -1.8847917318344116, + "logps/rejected": -2.1649017333984375, + "loss": 2.6444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.847917556762695, + "rewards/margins": 2.8010976314544678, + "rewards/rejected": -21.64901351928711, + "step": 16530 + }, + { + "epoch": 0.5573157167413799, + "grad_norm": 97.66452026367188, + "learning_rate": 4.872359458622568e-07, + "logits/chosen": -1.7025953531265259, + "logits/rejected": -1.810329794883728, + "logps/chosen": -1.7309554815292358, + "logps/rejected": -1.6912472248077393, + "loss": 3.6162, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.309553146362305, + "rewards/margins": -0.39708080887794495, + "rewards/rejected": -16.912473678588867, + "step": 16535 + }, + { + "epoch": 0.5574842428123631, + "grad_norm": 17.021034240722656, + "learning_rate": 4.869419093618991e-07, + "logits/chosen": -1.5559533834457397, + "logits/rejected": -1.6489553451538086, + "logps/chosen": -2.4044666290283203, + "logps/rejected": -2.8175644874572754, + "loss": 2.9486, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.04466438293457, + "rewards/margins": 4.130979061126709, + "rewards/rejected": -28.175643920898438, + "step": 16540 + }, + { + "epoch": 0.5576527688833463, + "grad_norm": 73.31028747558594, + "learning_rate": 4.866478773804317e-07, + "logits/chosen": -1.2781057357788086, + "logits/rejected": -1.4325788021087646, + "logps/chosen": -1.9948387145996094, + "logps/rejected": -2.0437731742858887, + "loss": 3.4351, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.948389053344727, + "rewards/margins": 0.4893454611301422, + "rewards/rejected": -20.437732696533203, + "step": 16545 + }, + { + "epoch": 0.5578212949543294, + "grad_norm": 15.803415298461914, + "learning_rate": 4.863538500196081e-07, + "logits/chosen": -1.2874476909637451, + "logits/rejected": -1.8122501373291016, + "logps/chosen": -2.1901726722717285, + "logps/rejected": -2.637709140777588, + "loss": 1.5549, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.901723861694336, + "rewards/margins": 4.475368022918701, + "rewards/rejected": -26.377094268798828, + "step": 16550 + }, + { + "epoch": 0.5579898210253126, + "grad_norm": 39.55556106567383, + "learning_rate": 4.860598273811792e-07, + "logits/chosen": -1.4781594276428223, + "logits/rejected": -1.6253509521484375, + "logps/chosen": -2.057183027267456, + "logps/rejected": -2.060908555984497, + "loss": 3.7672, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.571828842163086, + "rewards/margins": 0.03725280612707138, + "rewards/rejected": -20.609081268310547, + "step": 16555 + }, + { + "epoch": 0.5581583470962957, + "grad_norm": 68.54031372070312, + "learning_rate": 4.857658095668951e-07, + "logits/chosen": -1.6205686330795288, + "logits/rejected": -1.6138147115707397, + "logps/chosen": -1.9181791543960571, + "logps/rejected": -2.0273666381835938, + "loss": 2.6595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.181793212890625, + "rewards/margins": 1.091873288154602, + "rewards/rejected": -20.273664474487305, + "step": 16560 + }, + { + "epoch": 0.558326873167279, + "grad_norm": 29.88010597229004, + "learning_rate": 4.854717966785033e-07, + "logits/chosen": -1.522566556930542, + "logits/rejected": -1.5306899547576904, + "logps/chosen": -1.7374460697174072, + "logps/rejected": -1.7058292627334595, + "loss": 3.7956, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.374460220336914, + "rewards/margins": -0.3161682188510895, + "rewards/rejected": -17.058292388916016, + "step": 16565 + }, + { + "epoch": 0.5584953992382622, + "grad_norm": 14.674449920654297, + "learning_rate": 4.851777888177503e-07, + "logits/chosen": -1.492299199104309, + "logits/rejected": -1.5254344940185547, + "logps/chosen": -2.6740927696228027, + "logps/rejected": -2.8673977851867676, + "loss": 3.7997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.740930557250977, + "rewards/margins": 1.933049201965332, + "rewards/rejected": -28.67397689819336, + "step": 16570 + }, + { + "epoch": 0.5586639253092454, + "grad_norm": 20.321304321289062, + "learning_rate": 4.848837860863807e-07, + "logits/chosen": -1.5205323696136475, + "logits/rejected": -1.4475538730621338, + "logps/chosen": -1.978070855140686, + "logps/rejected": -2.5035884380340576, + "loss": 0.9607, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.780710220336914, + "rewards/margins": 5.255176544189453, + "rewards/rejected": -25.035886764526367, + "step": 16575 + }, + { + "epoch": 0.5588324513802285, + "grad_norm": 35.91277313232422, + "learning_rate": 4.845897885861371e-07, + "logits/chosen": -1.4589111804962158, + "logits/rejected": -1.4083130359649658, + "logps/chosen": -2.222066640853882, + "logps/rejected": -2.1525063514709473, + "loss": 3.8385, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.220666885375977, + "rewards/margins": -0.6956036686897278, + "rewards/rejected": -21.52506446838379, + "step": 16580 + }, + { + "epoch": 0.5590009774512117, + "grad_norm": 24.30882453918457, + "learning_rate": 4.842957964187604e-07, + "logits/chosen": -1.6066770553588867, + "logits/rejected": -1.6216022968292236, + "logps/chosen": -1.8694368600845337, + "logps/rejected": -1.9515972137451172, + "loss": 2.5597, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.69437026977539, + "rewards/margins": 0.8216029405593872, + "rewards/rejected": -19.51597023010254, + "step": 16585 + }, + { + "epoch": 0.5591695035221949, + "grad_norm": 128.77745056152344, + "learning_rate": 4.8400180968599e-07, + "logits/chosen": -1.5237901210784912, + "logits/rejected": -1.6502147912979126, + "logps/chosen": -2.442279815673828, + "logps/rejected": -2.3668129444122314, + "loss": 4.9498, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.42279624938965, + "rewards/margins": -0.754668116569519, + "rewards/rejected": -23.66813087463379, + "step": 16590 + }, + { + "epoch": 0.559338029593178, + "grad_norm": 31.42833137512207, + "learning_rate": 4.837078284895631e-07, + "logits/chosen": -2.0330276489257812, + "logits/rejected": -1.8534698486328125, + "logps/chosen": -2.165086269378662, + "logps/rejected": -2.5630440711975098, + "loss": 3.0482, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.650861740112305, + "rewards/margins": 3.9795784950256348, + "rewards/rejected": -25.630441665649414, + "step": 16595 + }, + { + "epoch": 0.5595065556641613, + "grad_norm": 24.275657653808594, + "learning_rate": 4.834138529312146e-07, + "logits/chosen": -1.7258002758026123, + "logits/rejected": -1.767745018005371, + "logps/chosen": -1.896641492843628, + "logps/rejected": -2.3668007850646973, + "loss": 1.7978, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.966419219970703, + "rewards/margins": 4.701590538024902, + "rewards/rejected": -23.66800880432129, + "step": 16600 + }, + { + "epoch": 0.5596750817351445, + "grad_norm": 21.67288589477539, + "learning_rate": 4.831198831126784e-07, + "logits/chosen": -1.5068045854568481, + "logits/rejected": -1.4906129837036133, + "logps/chosen": -1.9527183771133423, + "logps/rejected": -2.0906100273132324, + "loss": 2.4302, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.527183532714844, + "rewards/margins": 1.3789176940917969, + "rewards/rejected": -20.90610122680664, + "step": 16605 + }, + { + "epoch": 0.5598436078061276, + "grad_norm": 29.13378143310547, + "learning_rate": 4.828259191356855e-07, + "logits/chosen": -1.6987136602401733, + "logits/rejected": -1.7911326885223389, + "logps/chosen": -1.9743280410766602, + "logps/rejected": -2.221207857131958, + "loss": 1.8545, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.7432804107666, + "rewards/margins": 2.4687979221343994, + "rewards/rejected": -22.212078094482422, + "step": 16610 + }, + { + "epoch": 0.5600121338771108, + "grad_norm": 30.432849884033203, + "learning_rate": 4.825319611019653e-07, + "logits/chosen": -1.3041839599609375, + "logits/rejected": -1.5292479991912842, + "logps/chosen": -1.8356603384017944, + "logps/rejected": -2.110485315322876, + "loss": 2.4426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.356603622436523, + "rewards/margins": 2.748248815536499, + "rewards/rejected": -21.1048526763916, + "step": 16615 + }, + { + "epoch": 0.560180659948094, + "grad_norm": 28.228656768798828, + "learning_rate": 4.822380091132452e-07, + "logits/chosen": -1.496335744857788, + "logits/rejected": -1.5300164222717285, + "logps/chosen": -2.2147862911224365, + "logps/rejected": -2.358285665512085, + "loss": 3.8083, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.147863388061523, + "rewards/margins": 1.4349899291992188, + "rewards/rejected": -23.582855224609375, + "step": 16620 + }, + { + "epoch": 0.5603491860190771, + "grad_norm": 7.746950626373291, + "learning_rate": 4.819440632712502e-07, + "logits/chosen": -1.1421386003494263, + "logits/rejected": -1.6735588312149048, + "logps/chosen": -1.7586784362792969, + "logps/rejected": -2.379070997238159, + "loss": 1.7404, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.5867862701416, + "rewards/margins": 6.203924179077148, + "rewards/rejected": -23.790708541870117, + "step": 16625 + }, + { + "epoch": 0.5605177120900603, + "grad_norm": 55.8576774597168, + "learning_rate": 4.816501236777038e-07, + "logits/chosen": -1.6667125225067139, + "logits/rejected": -1.4844706058502197, + "logps/chosen": -1.8552688360214233, + "logps/rejected": -1.9931942224502563, + "loss": 2.5257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.552688598632812, + "rewards/margins": 1.3792531490325928, + "rewards/rejected": -19.931941986083984, + "step": 16630 + }, + { + "epoch": 0.5606862381610435, + "grad_norm": 20.889873504638672, + "learning_rate": 4.813561904343265e-07, + "logits/chosen": -1.006071925163269, + "logits/rejected": -1.044333815574646, + "logps/chosen": -2.241271734237671, + "logps/rejected": -2.404127836227417, + "loss": 1.9726, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.412717819213867, + "rewards/margins": 1.6285613775253296, + "rewards/rejected": -24.041278839111328, + "step": 16635 + }, + { + "epoch": 0.5608547642320267, + "grad_norm": 55.024349212646484, + "learning_rate": 4.810622636428371e-07, + "logits/chosen": -1.270347237586975, + "logits/rejected": -1.3348054885864258, + "logps/chosen": -1.9361298084259033, + "logps/rejected": -1.9386670589447021, + "loss": 3.1037, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.361297607421875, + "rewards/margins": 0.025374317541718483, + "rewards/rejected": -19.38667106628418, + "step": 16640 + }, + { + "epoch": 0.5610232903030099, + "grad_norm": 29.22247886657715, + "learning_rate": 4.807683434049522e-07, + "logits/chosen": -1.417856216430664, + "logits/rejected": -1.9524939060211182, + "logps/chosen": -2.4791512489318848, + "logps/rejected": -2.824134349822998, + "loss": 1.92, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.79151153564453, + "rewards/margins": 3.449831485748291, + "rewards/rejected": -28.241342544555664, + "step": 16645 + }, + { + "epoch": 0.5611918163739931, + "grad_norm": 30.65985679626465, + "learning_rate": 4.804744298223859e-07, + "logits/chosen": -1.4590588808059692, + "logits/rejected": -1.7810996770858765, + "logps/chosen": -2.255669116973877, + "logps/rejected": -2.6106760501861572, + "loss": 1.4134, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.55669403076172, + "rewards/margins": 3.550067186355591, + "rewards/rejected": -26.106760025024414, + "step": 16650 + }, + { + "epoch": 0.5613603424449762, + "grad_norm": 28.592660903930664, + "learning_rate": 4.8018052299685e-07, + "logits/chosen": -1.8148752450942993, + "logits/rejected": -2.0093588829040527, + "logps/chosen": -2.1500916481018066, + "logps/rejected": -2.5256147384643555, + "loss": 1.4492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.50091552734375, + "rewards/margins": 3.755230665206909, + "rewards/rejected": -25.256145477294922, + "step": 16655 + }, + { + "epoch": 0.5615288685159594, + "grad_norm": 83.37126159667969, + "learning_rate": 4.798866230300547e-07, + "logits/chosen": -0.9690818786621094, + "logits/rejected": -1.2768938541412354, + "logps/chosen": -2.553854465484619, + "logps/rejected": -3.4719252586364746, + "loss": 2.0541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.538543701171875, + "rewards/margins": 9.180707931518555, + "rewards/rejected": -34.71925354003906, + "step": 16660 + }, + { + "epoch": 0.5616973945869426, + "grad_norm": 31.286157608032227, + "learning_rate": 4.795927300237065e-07, + "logits/chosen": -1.402068853378296, + "logits/rejected": -1.401330590248108, + "logps/chosen": -1.912672758102417, + "logps/rejected": -1.8983705043792725, + "loss": 3.6742, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.126728057861328, + "rewards/margins": -0.1430220603942871, + "rewards/rejected": -18.98370361328125, + "step": 16665 + }, + { + "epoch": 0.5618659206579257, + "grad_norm": 32.06412887573242, + "learning_rate": 4.792988440795103e-07, + "logits/chosen": -1.5014759302139282, + "logits/rejected": -1.4859968423843384, + "logps/chosen": -2.089202880859375, + "logps/rejected": -2.309732675552368, + "loss": 2.6381, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.89202880859375, + "rewards/margins": 2.2052981853485107, + "rewards/rejected": -23.097328186035156, + "step": 16670 + }, + { + "epoch": 0.562034446728909, + "grad_norm": 0.22517681121826172, + "learning_rate": 4.790049652991685e-07, + "logits/chosen": -1.5998106002807617, + "logits/rejected": -1.8751304149627686, + "logps/chosen": -1.7838659286499023, + "logps/rejected": -2.068983793258667, + "loss": 2.4504, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.83865737915039, + "rewards/margins": 2.8511791229248047, + "rewards/rejected": -20.689838409423828, + "step": 16675 + }, + { + "epoch": 0.5622029727998922, + "grad_norm": 39.847381591796875, + "learning_rate": 4.787110937843814e-07, + "logits/chosen": -1.1157238483428955, + "logits/rejected": -1.418677568435669, + "logps/chosen": -2.137622356414795, + "logps/rejected": -2.336545944213867, + "loss": 2.6717, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.376224517822266, + "rewards/margins": 1.989235281944275, + "rewards/rejected": -23.365459442138672, + "step": 16680 + }, + { + "epoch": 0.5623714988708753, + "grad_norm": 8.367581367492676, + "learning_rate": 4.784172296368457e-07, + "logits/chosen": -1.6157697439193726, + "logits/rejected": -2.3373653888702393, + "logps/chosen": -2.44496488571167, + "logps/rejected": -3.342592239379883, + "loss": 0.4014, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.449649810791016, + "rewards/margins": 8.976272583007812, + "rewards/rejected": -33.425926208496094, + "step": 16685 + }, + { + "epoch": 0.5625400249418585, + "grad_norm": 64.52335357666016, + "learning_rate": 4.781233729582565e-07, + "logits/chosen": -0.9468668699264526, + "logits/rejected": -0.905168890953064, + "logps/chosen": -2.1241188049316406, + "logps/rejected": -2.2420175075531006, + "loss": 2.7584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.241186141967773, + "rewards/margins": 1.1789891719818115, + "rewards/rejected": -22.420177459716797, + "step": 16690 + }, + { + "epoch": 0.5627085510128417, + "grad_norm": 25.68901824951172, + "learning_rate": 4.778295238503061e-07, + "logits/chosen": -1.390925645828247, + "logits/rejected": -1.3971506357192993, + "logps/chosen": -2.2836949825286865, + "logps/rejected": -2.681293249130249, + "loss": 2.3995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.836950302124023, + "rewards/margins": 3.9759840965270996, + "rewards/rejected": -26.81293296813965, + "step": 16695 + }, + { + "epoch": 0.5628770770838248, + "grad_norm": 31.65011978149414, + "learning_rate": 4.775356824146842e-07, + "logits/chosen": -1.9038221836090088, + "logits/rejected": -1.802114725112915, + "logps/chosen": -2.901379346847534, + "logps/rejected": -3.1506600379943848, + "loss": 2.4802, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.0137939453125, + "rewards/margins": 2.4928066730499268, + "rewards/rejected": -31.5065975189209, + "step": 16700 + }, + { + "epoch": 0.563045603154808, + "grad_norm": 23.219512939453125, + "learning_rate": 4.772418487530773e-07, + "logits/chosen": -1.4731714725494385, + "logits/rejected": -1.4118788242340088, + "logps/chosen": -2.0316500663757324, + "logps/rejected": -2.0555057525634766, + "loss": 3.2372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.31650161743164, + "rewards/margins": 0.23855523765087128, + "rewards/rejected": -20.555057525634766, + "step": 16705 + }, + { + "epoch": 0.5632141292257913, + "grad_norm": 27.701637268066406, + "learning_rate": 4.769480229671699e-07, + "logits/chosen": -1.1318514347076416, + "logits/rejected": -1.1759233474731445, + "logps/chosen": -2.0909242630004883, + "logps/rejected": -2.103691816329956, + "loss": 3.2503, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.909244537353516, + "rewards/margins": 0.12767677009105682, + "rewards/rejected": -21.03692054748535, + "step": 16710 + }, + { + "epoch": 0.5633826552967744, + "grad_norm": 32.668514251708984, + "learning_rate": 4.7665420515864374e-07, + "logits/chosen": -1.5871978998184204, + "logits/rejected": -1.787335991859436, + "logps/chosen": -1.980926275253296, + "logps/rejected": -2.1331100463867188, + "loss": 2.471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.809263229370117, + "rewards/margins": 1.5218383073806763, + "rewards/rejected": -21.33110237121582, + "step": 16715 + }, + { + "epoch": 0.5635511813677576, + "grad_norm": 30.054622650146484, + "learning_rate": 4.7636039542917716e-07, + "logits/chosen": -1.2195504903793335, + "logits/rejected": -1.4400713443756104, + "logps/chosen": -2.8065850734710693, + "logps/rejected": -2.85850191116333, + "loss": 4.9515, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.06585121154785, + "rewards/margins": 0.5191686749458313, + "rewards/rejected": -28.585018157958984, + "step": 16720 + }, + { + "epoch": 0.5637197074387408, + "grad_norm": 34.892494201660156, + "learning_rate": 4.760665938804466e-07, + "logits/chosen": -1.404201626777649, + "logits/rejected": -1.6849861145019531, + "logps/chosen": -2.6109631061553955, + "logps/rejected": -2.7899582386016846, + "loss": 2.5493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.109630584716797, + "rewards/margins": 1.7899490594863892, + "rewards/rejected": -27.899578094482422, + "step": 16725 + }, + { + "epoch": 0.563888233509724, + "grad_norm": 32.79019546508789, + "learning_rate": 4.7577280061412474e-07, + "logits/chosen": -1.413694977760315, + "logits/rejected": -1.3809101581573486, + "logps/chosen": -2.7751529216766357, + "logps/rejected": -2.6814167499542236, + "loss": 4.4692, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.751529693603516, + "rewards/margins": -0.9373645782470703, + "rewards/rejected": -26.814163208007812, + "step": 16730 + }, + { + "epoch": 0.5640567595807071, + "grad_norm": 6.341630935668945, + "learning_rate": 4.754790157318822e-07, + "logits/chosen": -1.2289912700653076, + "logits/rejected": -1.5710041522979736, + "logps/chosen": -2.0926711559295654, + "logps/rejected": -2.535799503326416, + "loss": 1.8432, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.926708221435547, + "rewards/margins": 4.431286334991455, + "rewards/rejected": -25.35799789428711, + "step": 16735 + }, + { + "epoch": 0.5642252856516903, + "grad_norm": 49.58246612548828, + "learning_rate": 4.7518523933538613e-07, + "logits/chosen": -1.8374273777008057, + "logits/rejected": -1.8193261623382568, + "logps/chosen": -3.3235645294189453, + "logps/rejected": -3.626161575317383, + "loss": 3.2184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.23564529418945, + "rewards/margins": 3.025968313217163, + "rewards/rejected": -36.26161193847656, + "step": 16740 + }, + { + "epoch": 0.5643938117226734, + "grad_norm": 25.741788864135742, + "learning_rate": 4.7489147152630104e-07, + "logits/chosen": -1.4145604372024536, + "logits/rejected": -1.5160937309265137, + "logps/chosen": -2.352865219116211, + "logps/rejected": -2.9049177169799805, + "loss": 2.2004, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.528654098510742, + "rewards/margins": 5.520522117614746, + "rewards/rejected": -29.049175262451172, + "step": 16745 + }, + { + "epoch": 0.5645623377936567, + "grad_norm": 26.192583084106445, + "learning_rate": 4.745977124062887e-07, + "logits/chosen": -1.2434992790222168, + "logits/rejected": -1.0916662216186523, + "logps/chosen": -2.044055938720703, + "logps/rejected": -2.2048280239105225, + "loss": 3.14, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.44055938720703, + "rewards/margins": 1.6077194213867188, + "rewards/rejected": -22.048280715942383, + "step": 16750 + }, + { + "epoch": 0.5647308638646399, + "grad_norm": 140.10960388183594, + "learning_rate": 4.74303962077007e-07, + "logits/chosen": -1.3205041885375977, + "logits/rejected": -1.650608777999878, + "logps/chosen": -2.4417591094970703, + "logps/rejected": -2.5953195095062256, + "loss": 2.543, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.417591094970703, + "rewards/margins": 1.5356042385101318, + "rewards/rejected": -25.953195571899414, + "step": 16755 + }, + { + "epoch": 0.564899389935623, + "grad_norm": 42.08061218261719, + "learning_rate": 4.740102206401117e-07, + "logits/chosen": -2.0617008209228516, + "logits/rejected": -2.03901743888855, + "logps/chosen": -2.1589865684509277, + "logps/rejected": -2.2602858543395996, + "loss": 3.0386, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.58986473083496, + "rewards/margins": 1.0129940509796143, + "rewards/rejected": -22.602859497070312, + "step": 16760 + }, + { + "epoch": 0.5650679160066062, + "grad_norm": 37.95011520385742, + "learning_rate": 4.737164881972551e-07, + "logits/chosen": -0.8276047706604004, + "logits/rejected": -0.9013395309448242, + "logps/chosen": -1.9446722269058228, + "logps/rejected": -2.0602924823760986, + "loss": 2.992, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.446720123291016, + "rewards/margins": 1.156203269958496, + "rewards/rejected": -20.602924346923828, + "step": 16765 + }, + { + "epoch": 0.5652364420775894, + "grad_norm": 43.475223541259766, + "learning_rate": 4.7342276485008654e-07, + "logits/chosen": -1.6008399724960327, + "logits/rejected": -1.54447340965271, + "logps/chosen": -2.948169708251953, + "logps/rejected": -3.309300661087036, + "loss": 2.6869, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.481698989868164, + "rewards/margins": 3.6113078594207764, + "rewards/rejected": -33.0930061340332, + "step": 16770 + }, + { + "epoch": 0.5654049681485726, + "grad_norm": 78.703125, + "learning_rate": 4.7312905070025177e-07, + "logits/chosen": -0.9836323857307434, + "logits/rejected": -0.9874518513679504, + "logps/chosen": -2.5067899227142334, + "logps/rejected": -3.001052141189575, + "loss": 3.4794, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.067901611328125, + "rewards/margins": 4.942620277404785, + "rewards/rejected": -30.010522842407227, + "step": 16775 + }, + { + "epoch": 0.5655734942195557, + "grad_norm": 36.86214065551758, + "learning_rate": 4.728353458493939e-07, + "logits/chosen": -1.0935938358306885, + "logits/rejected": -1.7105945348739624, + "logps/chosen": -2.6541755199432373, + "logps/rejected": -3.2404963970184326, + "loss": 2.6482, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.5417537689209, + "rewards/margins": 5.863210201263428, + "rewards/rejected": -32.404964447021484, + "step": 16780 + }, + { + "epoch": 0.565742020290539, + "grad_norm": 0.07047080993652344, + "learning_rate": 4.7254165039915265e-07, + "logits/chosen": -1.5007909536361694, + "logits/rejected": -1.7393224239349365, + "logps/chosen": -2.6301045417785645, + "logps/rejected": -3.297396183013916, + "loss": 1.2243, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.301044464111328, + "rewards/margins": 6.672916412353516, + "rewards/rejected": -32.973960876464844, + "step": 16785 + }, + { + "epoch": 0.5659105463615222, + "grad_norm": 43.58036422729492, + "learning_rate": 4.7224796445116446e-07, + "logits/chosen": -1.2965277433395386, + "logits/rejected": -1.6817373037338257, + "logps/chosen": -2.2248032093048096, + "logps/rejected": -2.647348642349243, + "loss": 2.0617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.248035430908203, + "rewards/margins": 4.225451946258545, + "rewards/rejected": -26.473485946655273, + "step": 16790 + }, + { + "epoch": 0.5660790724325053, + "grad_norm": 33.99382019042969, + "learning_rate": 4.7195428810706224e-07, + "logits/chosen": -1.4129993915557861, + "logits/rejected": -1.655768632888794, + "logps/chosen": -2.2138402462005615, + "logps/rejected": -2.304614782333374, + "loss": 3.2053, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.138404846191406, + "rewards/margins": 0.9077442288398743, + "rewards/rejected": -23.046146392822266, + "step": 16795 + }, + { + "epoch": 0.5662475985034885, + "grad_norm": 25.1485595703125, + "learning_rate": 4.7166062146847593e-07, + "logits/chosen": -1.9000380039215088, + "logits/rejected": -1.7676162719726562, + "logps/chosen": -2.2173843383789062, + "logps/rejected": -2.3960306644439697, + "loss": 3.0706, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.173843383789062, + "rewards/margins": 1.7864621877670288, + "rewards/rejected": -23.96030616760254, + "step": 16800 + }, + { + "epoch": 0.5662475985034885, + "eval_logits/chosen": -1.9103525876998901, + "eval_logits/rejected": -2.0533409118652344, + "eval_logps/chosen": -2.1100523471832275, + "eval_logps/rejected": -2.24176025390625, + "eval_loss": 2.994614362716675, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -21.100522994995117, + "eval_rewards/margins": 1.3170799016952515, + "eval_rewards/rejected": -22.417604446411133, + "eval_runtime": 12.9176, + "eval_samples_per_second": 7.741, + "eval_steps_per_second": 1.935, + "step": 16800 + }, + { + "epoch": 0.5664161245744717, + "grad_norm": 11.869410514831543, + "learning_rate": 4.713669646370321e-07, + "logits/chosen": -0.9209138751029968, + "logits/rejected": -1.0028040409088135, + "logps/chosen": -1.5366700887680054, + "logps/rejected": -1.610845923423767, + "loss": 3.0163, + "rewards/accuracies": 0.5, + "rewards/chosen": -15.366701126098633, + "rewards/margins": 0.7417588233947754, + "rewards/rejected": -16.10845947265625, + "step": 16805 + }, + { + "epoch": 0.5665846506454548, + "grad_norm": 41.69317626953125, + "learning_rate": 4.7107331771435366e-07, + "logits/chosen": -1.6482641696929932, + "logits/rejected": -1.9954522848129272, + "logps/chosen": -2.4083595275878906, + "logps/rejected": -2.967284679412842, + "loss": 3.5173, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.08359718322754, + "rewards/margins": 5.5892534255981445, + "rewards/rejected": -29.6728515625, + "step": 16810 + }, + { + "epoch": 0.566753176716438, + "grad_norm": 23.685644149780273, + "learning_rate": 4.7077968080206025e-07, + "logits/chosen": -1.785162329673767, + "logits/rejected": -1.7933508157730103, + "logps/chosen": -2.0810201168060303, + "logps/rejected": -2.196655511856079, + "loss": 2.9084, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.81020164489746, + "rewards/margins": 1.1563531160354614, + "rewards/rejected": -21.966556549072266, + "step": 16815 + }, + { + "epoch": 0.5669217027874213, + "grad_norm": 170.0150909423828, + "learning_rate": 4.7048605400176835e-07, + "logits/chosen": -2.2140371799468994, + "logits/rejected": -1.9532238245010376, + "logps/chosen": -2.8031115531921387, + "logps/rejected": -3.2334632873535156, + "loss": 3.6139, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.031116485595703, + "rewards/margins": 4.303518295288086, + "rewards/rejected": -32.334632873535156, + "step": 16820 + }, + { + "epoch": 0.5670902288584044, + "grad_norm": 23.4686279296875, + "learning_rate": 4.701924374150901e-07, + "logits/chosen": -1.7128368616104126, + "logits/rejected": -1.8601795434951782, + "logps/chosen": -2.399580478668213, + "logps/rejected": -2.2932586669921875, + "loss": 5.407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.995803833007812, + "rewards/margins": -1.063218116760254, + "rewards/rejected": -22.932588577270508, + "step": 16825 + }, + { + "epoch": 0.5672587549293876, + "grad_norm": 215.03001403808594, + "learning_rate": 4.6989883114363486e-07, + "logits/chosen": -1.6193939447402954, + "logits/rejected": -2.1604743003845215, + "logps/chosen": -2.8643462657928467, + "logps/rejected": -3.4826483726501465, + "loss": 3.6061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.643463134765625, + "rewards/margins": 6.183023452758789, + "rewards/rejected": -34.82648468017578, + "step": 16830 + }, + { + "epoch": 0.5674272810003708, + "grad_norm": 42.07413864135742, + "learning_rate": 4.6960523528900823e-07, + "logits/chosen": -1.6678909063339233, + "logits/rejected": -1.6212352514266968, + "logps/chosen": -1.9012730121612549, + "logps/rejected": -2.26922869682312, + "loss": 2.7834, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.01272964477539, + "rewards/margins": 3.679558277130127, + "rewards/rejected": -22.692289352416992, + "step": 16835 + }, + { + "epoch": 0.5675958070713539, + "grad_norm": 13.32239818572998, + "learning_rate": 4.693116499528124e-07, + "logits/chosen": -1.7592246532440186, + "logits/rejected": -2.169157028198242, + "logps/chosen": -2.433260917663574, + "logps/rejected": -3.041790246963501, + "loss": 1.4877, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.33260726928711, + "rewards/margins": 6.085291862487793, + "rewards/rejected": -30.41790199279785, + "step": 16840 + }, + { + "epoch": 0.5677643331423371, + "grad_norm": 31.183002471923828, + "learning_rate": 4.690180752366453e-07, + "logits/chosen": -1.2726290225982666, + "logits/rejected": -1.692091703414917, + "logps/chosen": -2.0409367084503174, + "logps/rejected": -2.078146457672119, + "loss": 2.8352, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.40936851501465, + "rewards/margins": 0.3720945417881012, + "rewards/rejected": -20.781463623046875, + "step": 16845 + }, + { + "epoch": 0.5679328592133203, + "grad_norm": 25.570552825927734, + "learning_rate": 4.687245112421016e-07, + "logits/chosen": -1.5729811191558838, + "logits/rejected": -1.711248755455017, + "logps/chosen": -2.6292476654052734, + "logps/rejected": -2.7394912242889404, + "loss": 2.8007, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.292476654052734, + "rewards/margins": 1.1024351119995117, + "rewards/rejected": -27.394912719726562, + "step": 16850 + }, + { + "epoch": 0.5681013852843034, + "grad_norm": 32.55588150024414, + "learning_rate": 4.684309580707727e-07, + "logits/chosen": -1.56328547000885, + "logits/rejected": -1.5660221576690674, + "logps/chosen": -3.4068398475646973, + "logps/rejected": -4.1345391273498535, + "loss": 2.0818, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.068397521972656, + "rewards/margins": 7.276989936828613, + "rewards/rejected": -41.34539031982422, + "step": 16855 + }, + { + "epoch": 0.5682699113552867, + "grad_norm": 2.4072084426879883, + "learning_rate": 4.681374158242451e-07, + "logits/chosen": -1.4371076822280884, + "logits/rejected": -1.7409675121307373, + "logps/chosen": -2.604935884475708, + "logps/rejected": -3.037245273590088, + "loss": 2.1005, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.049358367919922, + "rewards/margins": 4.32309103012085, + "rewards/rejected": -30.372451782226562, + "step": 16860 + }, + { + "epoch": 0.5684384374262699, + "grad_norm": 30.55173110961914, + "learning_rate": 4.6784388460410257e-07, + "logits/chosen": -1.604943037033081, + "logits/rejected": -1.6222620010375977, + "logps/chosen": -2.298088788986206, + "logps/rejected": -1.992645263671875, + "loss": 6.1099, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.98088836669922, + "rewards/margins": -3.0544333457946777, + "rewards/rejected": -19.926454544067383, + "step": 16865 + }, + { + "epoch": 0.568606963497253, + "grad_norm": 21.035503387451172, + "learning_rate": 4.675503645119247e-07, + "logits/chosen": -1.6293731927871704, + "logits/rejected": -1.6573808193206787, + "logps/chosen": -2.3817954063415527, + "logps/rejected": -2.569385528564453, + "loss": 2.4276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.817955017089844, + "rewards/margins": 1.8758999109268188, + "rewards/rejected": -25.6938533782959, + "step": 16870 + }, + { + "epoch": 0.5687754895682362, + "grad_norm": 42.433319091796875, + "learning_rate": 4.672568556492873e-07, + "logits/chosen": -0.7304778695106506, + "logits/rejected": -0.9662677645683289, + "logps/chosen": -2.424504041671753, + "logps/rejected": -3.062227725982666, + "loss": 1.7874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.24504280090332, + "rewards/margins": 6.377236843109131, + "rewards/rejected": -30.622278213500977, + "step": 16875 + }, + { + "epoch": 0.5689440156392194, + "grad_norm": 30.08279037475586, + "learning_rate": 4.669633581177621e-07, + "logits/chosen": -1.5956250429153442, + "logits/rejected": -1.4057915210723877, + "logps/chosen": -1.5880109071731567, + "logps/rejected": -1.5237188339233398, + "loss": 3.6982, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -15.880106925964355, + "rewards/margins": -0.6429195404052734, + "rewards/rejected": -15.237188339233398, + "step": 16880 + }, + { + "epoch": 0.5691125417102025, + "grad_norm": 32.80216979980469, + "learning_rate": 4.66669872018917e-07, + "logits/chosen": -1.4292566776275635, + "logits/rejected": -1.4082731008529663, + "logps/chosen": -2.8172738552093506, + "logps/rejected": -3.159550189971924, + "loss": 1.5456, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.172739028930664, + "rewards/margins": 3.4227638244628906, + "rewards/rejected": -31.595500946044922, + "step": 16885 + }, + { + "epoch": 0.5692810677811857, + "grad_norm": 31.284208297729492, + "learning_rate": 4.6637639745431626e-07, + "logits/chosen": -0.8667623400688171, + "logits/rejected": -0.9213277697563171, + "logps/chosen": -1.9619200229644775, + "logps/rejected": -2.0677216053009033, + "loss": 2.7827, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.619197845458984, + "rewards/margins": 1.0580158233642578, + "rewards/rejected": -20.677213668823242, + "step": 16890 + }, + { + "epoch": 0.569449593852169, + "grad_norm": 23.00290870666504, + "learning_rate": 4.6608293452551947e-07, + "logits/chosen": -1.3267605304718018, + "logits/rejected": -1.2977806329727173, + "logps/chosen": -2.193504810333252, + "logps/rejected": -2.4723658561706543, + "loss": 2.3626, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.935049057006836, + "rewards/margins": 2.78861403465271, + "rewards/rejected": -24.723661422729492, + "step": 16895 + }, + { + "epoch": 0.5696181199231521, + "grad_norm": 66.30839538574219, + "learning_rate": 4.657894833340827e-07, + "logits/chosen": -1.3002276420593262, + "logits/rejected": -1.927890419960022, + "logps/chosen": -2.195603609085083, + "logps/rejected": -2.5019192695617676, + "loss": 2.504, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.95603370666504, + "rewards/margins": 3.0631580352783203, + "rewards/rejected": -25.01919174194336, + "step": 16900 + }, + { + "epoch": 0.5697866459941353, + "grad_norm": 30.63758087158203, + "learning_rate": 4.654960439815581e-07, + "logits/chosen": -1.4829597473144531, + "logits/rejected": -1.4372055530548096, + "logps/chosen": -1.5658553838729858, + "logps/rejected": -1.7043907642364502, + "loss": 2.202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.658554077148438, + "rewards/margins": 1.3853533267974854, + "rewards/rejected": -17.043907165527344, + "step": 16905 + }, + { + "epoch": 0.5699551720651185, + "grad_norm": 25.67203712463379, + "learning_rate": 4.6520261656949315e-07, + "logits/chosen": -1.7035623788833618, + "logits/rejected": -1.4969885349273682, + "logps/chosen": -2.9192066192626953, + "logps/rejected": -3.232933759689331, + "loss": 5.5975, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.192066192626953, + "rewards/margins": 3.1372737884521484, + "rewards/rejected": -32.32933807373047, + "step": 16910 + }, + { + "epoch": 0.5701236981361016, + "grad_norm": 33.323753356933594, + "learning_rate": 4.649092011994316e-07, + "logits/chosen": -1.6091026067733765, + "logits/rejected": -1.7056491374969482, + "logps/chosen": -2.0236144065856934, + "logps/rejected": -2.0775866508483887, + "loss": 3.0293, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.23614501953125, + "rewards/margins": 0.5397213697433472, + "rewards/rejected": -20.77586555480957, + "step": 16915 + }, + { + "epoch": 0.5702922242070848, + "grad_norm": 145.63626098632812, + "learning_rate": 4.64615797972913e-07, + "logits/chosen": -1.4785640239715576, + "logits/rejected": -1.8875732421875, + "logps/chosen": -2.968642234802246, + "logps/rejected": -3.1860897541046143, + "loss": 4.7265, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.686426162719727, + "rewards/margins": 2.174473285675049, + "rewards/rejected": -31.860897064208984, + "step": 16920 + }, + { + "epoch": 0.570460750278068, + "grad_norm": 48.70518112182617, + "learning_rate": 4.6432240699147283e-07, + "logits/chosen": -1.6808608770370483, + "logits/rejected": -1.7317768335342407, + "logps/chosen": -3.0904078483581543, + "logps/rejected": -3.425537586212158, + "loss": 2.7118, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.904077529907227, + "rewards/margins": 3.3512985706329346, + "rewards/rejected": -34.25537872314453, + "step": 16925 + }, + { + "epoch": 0.5706292763490513, + "grad_norm": 23.636653900146484, + "learning_rate": 4.6402902835664177e-07, + "logits/chosen": -1.3742854595184326, + "logits/rejected": -1.3601338863372803, + "logps/chosen": -2.2360167503356934, + "logps/rejected": -2.3575634956359863, + "loss": 3.1559, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.36016845703125, + "rewards/margins": 1.215468168258667, + "rewards/rejected": -23.57563591003418, + "step": 16930 + }, + { + "epoch": 0.5707978024200344, + "grad_norm": 4.566060543060303, + "learning_rate": 4.637356621699468e-07, + "logits/chosen": -1.5993291139602661, + "logits/rejected": -1.9258677959442139, + "logps/chosen": -2.4710073471069336, + "logps/rejected": -3.1647419929504395, + "loss": 2.4363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.710071563720703, + "rewards/margins": 6.937350273132324, + "rewards/rejected": -31.647424697875977, + "step": 16935 + }, + { + "epoch": 0.5709663284910176, + "grad_norm": 29.32889175415039, + "learning_rate": 4.634423085329105e-07, + "logits/chosen": -1.786794900894165, + "logits/rejected": -1.8577169179916382, + "logps/chosen": -2.4155313968658447, + "logps/rejected": -2.431154489517212, + "loss": 3.2679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.155315399169922, + "rewards/margins": 0.15623120963573456, + "rewards/rejected": -24.31154441833496, + "step": 16940 + }, + { + "epoch": 0.5711348545620007, + "grad_norm": 25.315120697021484, + "learning_rate": 4.6314896754705075e-07, + "logits/chosen": -1.314682960510254, + "logits/rejected": -1.5629719495773315, + "logps/chosen": -1.8124099969863892, + "logps/rejected": -1.908725380897522, + "loss": 2.6222, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.124099731445312, + "rewards/margins": 0.9631543159484863, + "rewards/rejected": -19.08725357055664, + "step": 16945 + }, + { + "epoch": 0.5713033806329839, + "grad_norm": 13.031963348388672, + "learning_rate": 4.628556393138815e-07, + "logits/chosen": -1.0483187437057495, + "logits/rejected": -1.5672765970230103, + "logps/chosen": -2.2765183448791504, + "logps/rejected": -2.5370960235595703, + "loss": 2.0186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.765186309814453, + "rewards/margins": 2.6057753562927246, + "rewards/rejected": -25.370960235595703, + "step": 16950 + }, + { + "epoch": 0.5714719067039671, + "grad_norm": 20.017946243286133, + "learning_rate": 4.625623239349121e-07, + "logits/chosen": -1.6982698440551758, + "logits/rejected": -1.9008452892303467, + "logps/chosen": -2.2406089305877686, + "logps/rejected": -2.3991425037384033, + "loss": 2.4108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.406091690063477, + "rewards/margins": 1.585334062576294, + "rewards/rejected": -23.991424560546875, + "step": 16955 + }, + { + "epoch": 0.5716404327749502, + "grad_norm": 62.65876388549805, + "learning_rate": 4.622690215116475e-07, + "logits/chosen": -1.2825062274932861, + "logits/rejected": -1.6194877624511719, + "logps/chosen": -1.9833399057388306, + "logps/rejected": -2.9209718704223633, + "loss": 1.831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.833398818969727, + "rewards/margins": 9.37631893157959, + "rewards/rejected": -29.209716796875, + "step": 16960 + }, + { + "epoch": 0.5718089588459334, + "grad_norm": 2.7115726470947266, + "learning_rate": 4.619757321455879e-07, + "logits/chosen": -1.9648278951644897, + "logits/rejected": -1.9568313360214233, + "logps/chosen": -2.6452038288116455, + "logps/rejected": -3.012327194213867, + "loss": 2.7066, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.452037811279297, + "rewards/margins": 3.671234130859375, + "rewards/rejected": -30.123271942138672, + "step": 16965 + }, + { + "epoch": 0.5719774849169167, + "grad_norm": 0.24442708492279053, + "learning_rate": 4.6168245593822923e-07, + "logits/chosen": -1.2114921808242798, + "logits/rejected": -1.5997109413146973, + "logps/chosen": -1.999610185623169, + "logps/rejected": -2.6512880325317383, + "loss": 1.4446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.996103286743164, + "rewards/margins": 6.516776084899902, + "rewards/rejected": -26.51287841796875, + "step": 16970 + }, + { + "epoch": 0.5721460109878999, + "grad_norm": 9.06457233428955, + "learning_rate": 4.613891929910632e-07, + "logits/chosen": -1.558680772781372, + "logits/rejected": -1.8719091415405273, + "logps/chosen": -2.5277926921844482, + "logps/rejected": -3.1365044116973877, + "loss": 2.7746, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.277929306030273, + "rewards/margins": 6.0871148109436035, + "rewards/rejected": -31.365041732788086, + "step": 16975 + }, + { + "epoch": 0.572314537058883, + "grad_norm": 7.560523509979248, + "learning_rate": 4.61095943405576e-07, + "logits/chosen": -1.4988155364990234, + "logits/rejected": -1.532820463180542, + "logps/chosen": -1.8676990270614624, + "logps/rejected": -2.1474297046661377, + "loss": 2.0479, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.676990509033203, + "rewards/margins": 2.7973062992095947, + "rewards/rejected": -21.47429656982422, + "step": 16980 + }, + { + "epoch": 0.5724830631298662, + "grad_norm": 58.599769592285156, + "learning_rate": 4.6080270728325006e-07, + "logits/chosen": -1.50767183303833, + "logits/rejected": -1.8170439004898071, + "logps/chosen": -2.332676887512207, + "logps/rejected": -2.4412198066711426, + "loss": 2.7629, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.326770782470703, + "rewards/margins": 1.0854309797286987, + "rewards/rejected": -24.412199020385742, + "step": 16985 + }, + { + "epoch": 0.5726515892008494, + "grad_norm": 13.395484924316406, + "learning_rate": 4.605094847255628e-07, + "logits/chosen": -1.3667609691619873, + "logits/rejected": -1.3412193059921265, + "logps/chosen": -1.8395763635635376, + "logps/rejected": -2.1235289573669434, + "loss": 2.2519, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.395763397216797, + "rewards/margins": 2.839524745941162, + "rewards/rejected": -21.235288619995117, + "step": 16990 + }, + { + "epoch": 0.5728201152718325, + "grad_norm": 10.290194511413574, + "learning_rate": 4.602162758339873e-07, + "logits/chosen": -1.4792242050170898, + "logits/rejected": -2.0029921531677246, + "logps/chosen": -2.0362067222595215, + "logps/rejected": -2.3818199634552, + "loss": 1.0937, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.36206817626953, + "rewards/margins": 3.4561309814453125, + "rewards/rejected": -23.818199157714844, + "step": 16995 + }, + { + "epoch": 0.5729886413428157, + "grad_norm": 33.30671310424805, + "learning_rate": 4.5992308070999096e-07, + "logits/chosen": -1.5765219926834106, + "logits/rejected": -2.062494993209839, + "logps/chosen": -1.8226219415664673, + "logps/rejected": -2.2019715309143066, + "loss": 2.1542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.226221084594727, + "rewards/margins": 3.7934963703155518, + "rewards/rejected": -22.019716262817383, + "step": 17000 + }, + { + "epoch": 0.573157167413799, + "grad_norm": 23.92595672607422, + "learning_rate": 4.596298994550375e-07, + "logits/chosen": -1.568203330039978, + "logits/rejected": -1.6613171100616455, + "logps/chosen": -2.349910259246826, + "logps/rejected": -2.626659870147705, + "loss": 2.1305, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.499103546142578, + "rewards/margins": 2.7674946784973145, + "rewards/rejected": -26.2665958404541, + "step": 17005 + }, + { + "epoch": 0.5733256934847821, + "grad_norm": 82.73783111572266, + "learning_rate": 4.5933673217058543e-07, + "logits/chosen": -1.2131386995315552, + "logits/rejected": -1.4366130828857422, + "logps/chosen": -2.282435655593872, + "logps/rejected": -2.7135918140411377, + "loss": 3.3078, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.824357986450195, + "rewards/margins": 4.311557769775391, + "rewards/rejected": -27.135913848876953, + "step": 17010 + }, + { + "epoch": 0.5734942195557653, + "grad_norm": 35.42367172241211, + "learning_rate": 4.5904357895808815e-07, + "logits/chosen": -1.7238378524780273, + "logits/rejected": -1.8969109058380127, + "logps/chosen": -1.975142478942871, + "logps/rejected": -2.1562931537628174, + "loss": 2.2837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.751422882080078, + "rewards/margins": 1.8115079402923584, + "rewards/rejected": -21.56293296813965, + "step": 17015 + }, + { + "epoch": 0.5736627456267485, + "grad_norm": 26.47319221496582, + "learning_rate": 4.587504399189946e-07, + "logits/chosen": -1.3628904819488525, + "logits/rejected": -1.342140793800354, + "logps/chosen": -1.874319314956665, + "logps/rejected": -2.0152149200439453, + "loss": 1.8363, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.74319076538086, + "rewards/margins": 1.4089562892913818, + "rewards/rejected": -20.152149200439453, + "step": 17020 + }, + { + "epoch": 0.5738312716977316, + "grad_norm": 15.671334266662598, + "learning_rate": 4.5845731515474873e-07, + "logits/chosen": -1.347197413444519, + "logits/rejected": -1.512352466583252, + "logps/chosen": -1.7438786029815674, + "logps/rejected": -2.090334415435791, + "loss": 1.4458, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.438785552978516, + "rewards/margins": 3.464556932449341, + "rewards/rejected": -20.903343200683594, + "step": 17025 + }, + { + "epoch": 0.5739997977687148, + "grad_norm": 33.49119186401367, + "learning_rate": 4.5816420476678936e-07, + "logits/chosen": -1.4836363792419434, + "logits/rejected": -1.5364410877227783, + "logps/chosen": -2.810619831085205, + "logps/rejected": -2.9169609546661377, + "loss": 3.2122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.106197357177734, + "rewards/margins": 1.0634124279022217, + "rewards/rejected": -29.16961097717285, + "step": 17030 + }, + { + "epoch": 0.574168323839698, + "grad_norm": 27.717208862304688, + "learning_rate": 4.578711088565504e-07, + "logits/chosen": -1.3634545803070068, + "logits/rejected": -1.412527322769165, + "logps/chosen": -2.6148791313171387, + "logps/rejected": -3.1877834796905518, + "loss": 1.6711, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.148792266845703, + "rewards/margins": 5.7290449142456055, + "rewards/rejected": -31.87783432006836, + "step": 17035 + }, + { + "epoch": 0.5743368499106812, + "grad_norm": 0.033766914159059525, + "learning_rate": 4.5757802752546074e-07, + "logits/chosen": -1.450412631034851, + "logits/rejected": -1.698773980140686, + "logps/chosen": -2.0109925270080566, + "logps/rejected": -2.916412115097046, + "loss": 1.2433, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.109926223754883, + "rewards/margins": 9.054194450378418, + "rewards/rejected": -29.164119720458984, + "step": 17040 + }, + { + "epoch": 0.5745053759816644, + "grad_norm": 8.274916648864746, + "learning_rate": 4.572849608749447e-07, + "logits/chosen": -1.4016960859298706, + "logits/rejected": -1.663762092590332, + "logps/chosen": -2.1348116397857666, + "logps/rejected": -2.4447596073150635, + "loss": 1.6618, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.34811782836914, + "rewards/margins": 3.099478006362915, + "rewards/rejected": -24.447593688964844, + "step": 17045 + }, + { + "epoch": 0.5746739020526476, + "grad_norm": 23.06145668029785, + "learning_rate": 4.5699190900642057e-07, + "logits/chosen": -1.5405018329620361, + "logits/rejected": -1.792536973953247, + "logps/chosen": -2.386561632156372, + "logps/rejected": -2.670846462249756, + "loss": 2.5033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.865615844726562, + "rewards/margins": 2.8428475856781006, + "rewards/rejected": -26.70846176147461, + "step": 17050 + }, + { + "epoch": 0.5748424281236307, + "grad_norm": 12.915722846984863, + "learning_rate": 4.566988720213023e-07, + "logits/chosen": -1.1587172746658325, + "logits/rejected": -1.3730021715164185, + "logps/chosen": -1.8425954580307007, + "logps/rejected": -2.190491199493408, + "loss": 3.0152, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.425954818725586, + "rewards/margins": 3.478956699371338, + "rewards/rejected": -21.904911041259766, + "step": 17055 + }, + { + "epoch": 0.5750109541946139, + "grad_norm": 20.196578979492188, + "learning_rate": 4.5640585002099835e-07, + "logits/chosen": -1.5783436298370361, + "logits/rejected": -1.5012297630310059, + "logps/chosen": -2.2923500537872314, + "logps/rejected": -2.4385783672332764, + "loss": 3.1821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.923500061035156, + "rewards/margins": 1.4622838497161865, + "rewards/rejected": -24.385784149169922, + "step": 17060 + }, + { + "epoch": 0.5751794802655971, + "grad_norm": 16.635955810546875, + "learning_rate": 4.5611284310691246e-07, + "logits/chosen": -1.9602220058441162, + "logits/rejected": -2.0683677196502686, + "logps/chosen": -2.7638659477233887, + "logps/rejected": -3.1422085762023926, + "loss": 1.454, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.638662338256836, + "rewards/margins": 3.7834243774414062, + "rewards/rejected": -31.422088623046875, + "step": 17065 + }, + { + "epoch": 0.5753480063365802, + "grad_norm": 20.59967613220215, + "learning_rate": 4.558198513804422e-07, + "logits/chosen": -1.8842146396636963, + "logits/rejected": -1.9829429388046265, + "logps/chosen": -2.4420864582061768, + "logps/rejected": -2.790311336517334, + "loss": 1.5725, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.420862197875977, + "rewards/margins": 3.482252836227417, + "rewards/rejected": -27.90311622619629, + "step": 17070 + }, + { + "epoch": 0.5755165324075634, + "grad_norm": 20.192537307739258, + "learning_rate": 4.555268749429808e-07, + "logits/chosen": -1.5222409963607788, + "logits/rejected": -1.757651925086975, + "logps/chosen": -2.3623366355895996, + "logps/rejected": -2.4851298332214355, + "loss": 4.5759, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.623369216918945, + "rewards/margins": 1.2279298305511475, + "rewards/rejected": -24.851299285888672, + "step": 17075 + }, + { + "epoch": 0.5756850584785467, + "grad_norm": 32.318233489990234, + "learning_rate": 4.5523391389591595e-07, + "logits/chosen": -1.657088041305542, + "logits/rejected": -1.7773571014404297, + "logps/chosen": -2.460700273513794, + "logps/rejected": -2.5693631172180176, + "loss": 2.61, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.60700035095215, + "rewards/margins": 1.0866304636001587, + "rewards/rejected": -25.69363021850586, + "step": 17080 + }, + { + "epoch": 0.5758535845495298, + "grad_norm": 26.118494033813477, + "learning_rate": 4.5494096834062963e-07, + "logits/chosen": -0.7557857632637024, + "logits/rejected": -0.8230286836624146, + "logps/chosen": -2.1196682453155518, + "logps/rejected": -2.166259288787842, + "loss": 3.5991, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.19668197631836, + "rewards/margins": 0.46591252088546753, + "rewards/rejected": -21.662593841552734, + "step": 17085 + }, + { + "epoch": 0.576022110620513, + "grad_norm": 0.6773899793624878, + "learning_rate": 4.5464803837849904e-07, + "logits/chosen": -1.4761030673980713, + "logits/rejected": -1.5611333847045898, + "logps/chosen": -2.6379261016845703, + "logps/rejected": -2.967628240585327, + "loss": 3.1823, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.379261016845703, + "rewards/margins": 3.2970237731933594, + "rewards/rejected": -29.676280975341797, + "step": 17090 + }, + { + "epoch": 0.5761906366914962, + "grad_norm": 82.37804412841797, + "learning_rate": 4.5435512411089545e-07, + "logits/chosen": -1.3212661743164062, + "logits/rejected": -2.0065817832946777, + "logps/chosen": -1.9370911121368408, + "logps/rejected": -2.3542609214782715, + "loss": 1.4699, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.37091064453125, + "rewards/margins": 4.17169713973999, + "rewards/rejected": -23.542606353759766, + "step": 17095 + }, + { + "epoch": 0.5763591627624793, + "grad_norm": 34.555946350097656, + "learning_rate": 4.5406222563918515e-07, + "logits/chosen": -1.3918941020965576, + "logits/rejected": -1.291394591331482, + "logps/chosen": -2.2668352127075195, + "logps/rejected": -2.3042871952056885, + "loss": 3.7059, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.668350219726562, + "rewards/margins": 0.37451982498168945, + "rewards/rejected": -23.042869567871094, + "step": 17100 + }, + { + "epoch": 0.5765276888334625, + "grad_norm": 35.844886779785156, + "learning_rate": 4.537693430647286e-07, + "logits/chosen": -1.9387611150741577, + "logits/rejected": -2.1475517749786377, + "logps/chosen": -2.0147666931152344, + "logps/rejected": -2.07336163520813, + "loss": 2.7322, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.147666931152344, + "rewards/margins": 0.585949718952179, + "rewards/rejected": -20.73361587524414, + "step": 17105 + }, + { + "epoch": 0.5766962149044457, + "grad_norm": 132.61204528808594, + "learning_rate": 4.53476476488881e-07, + "logits/chosen": -1.348154067993164, + "logits/rejected": -1.716099739074707, + "logps/chosen": -2.2341794967651367, + "logps/rejected": -2.8116281032562256, + "loss": 1.878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.341794967651367, + "rewards/margins": 5.774487495422363, + "rewards/rejected": -28.116281509399414, + "step": 17110 + }, + { + "epoch": 0.576864740975429, + "grad_norm": 37.44850158691406, + "learning_rate": 4.5318362601299217e-07, + "logits/chosen": -1.4237921237945557, + "logits/rejected": -1.8257821798324585, + "logps/chosen": -1.934027910232544, + "logps/rejected": -2.2331252098083496, + "loss": 1.3484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.340280532836914, + "rewards/margins": 2.990973711013794, + "rewards/rejected": -22.331253051757812, + "step": 17115 + }, + { + "epoch": 0.5770332670464121, + "grad_norm": 42.07789611816406, + "learning_rate": 4.528907917384056e-07, + "logits/chosen": -1.6937841176986694, + "logits/rejected": -1.7856550216674805, + "logps/chosen": -2.3766939640045166, + "logps/rejected": -2.406951427459717, + "loss": 3.1782, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.766937255859375, + "rewards/margins": 0.3025781512260437, + "rewards/rejected": -24.069515228271484, + "step": 17120 + }, + { + "epoch": 0.5772017931173953, + "grad_norm": 20.366586685180664, + "learning_rate": 4.5259797376646007e-07, + "logits/chosen": -1.8341726064682007, + "logits/rejected": -2.335084915161133, + "logps/chosen": -2.359943151473999, + "logps/rejected": -3.0176966190338135, + "loss": 1.9511, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.599430084228516, + "rewards/margins": 6.577535152435303, + "rewards/rejected": -30.176965713500977, + "step": 17125 + }, + { + "epoch": 0.5773703191883784, + "grad_norm": 71.93550109863281, + "learning_rate": 4.5230517219848816e-07, + "logits/chosen": -1.4756571054458618, + "logits/rejected": -1.6808812618255615, + "logps/chosen": -2.351409912109375, + "logps/rejected": -2.6855571269989014, + "loss": 1.1408, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.514097213745117, + "rewards/margins": 3.341470241546631, + "rewards/rejected": -26.85556983947754, + "step": 17130 + }, + { + "epoch": 0.5775388452593616, + "grad_norm": 21.230552673339844, + "learning_rate": 4.5201238713581735e-07, + "logits/chosen": -1.8363711833953857, + "logits/rejected": -1.81638503074646, + "logps/chosen": -2.2280547618865967, + "logps/rejected": -2.2871644496917725, + "loss": 2.8284, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.280548095703125, + "rewards/margins": 0.5910956263542175, + "rewards/rejected": -22.87164306640625, + "step": 17135 + }, + { + "epoch": 0.5777073713303448, + "grad_norm": 31.255447387695312, + "learning_rate": 4.5171961867976847e-07, + "logits/chosen": -1.5670968294143677, + "logits/rejected": -2.0457816123962402, + "logps/chosen": -2.262490749359131, + "logps/rejected": -2.6034095287323, + "loss": 2.4048, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.624908447265625, + "rewards/margins": 3.409191131591797, + "rewards/rejected": -26.034099578857422, + "step": 17140 + }, + { + "epoch": 0.5778758974013279, + "grad_norm": 43.904598236083984, + "learning_rate": 4.5142686693165744e-07, + "logits/chosen": -1.659224510192871, + "logits/rejected": -1.6053335666656494, + "logps/chosen": -2.29905366897583, + "logps/rejected": -2.3424086570739746, + "loss": 3.2108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.990535736083984, + "rewards/margins": 0.43354931473731995, + "rewards/rejected": -23.424083709716797, + "step": 17145 + }, + { + "epoch": 0.5780444234723112, + "grad_norm": 21.321529388427734, + "learning_rate": 4.511341319927941e-07, + "logits/chosen": -1.8803768157958984, + "logits/rejected": -1.943765640258789, + "logps/chosen": -2.151759386062622, + "logps/rejected": -2.35597562789917, + "loss": 3.3356, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.517593383789062, + "rewards/margins": 2.0421626567840576, + "rewards/rejected": -23.559757232666016, + "step": 17150 + }, + { + "epoch": 0.5782129495432944, + "grad_norm": 55.82296371459961, + "learning_rate": 4.5084141396448245e-07, + "logits/chosen": -1.3013948202133179, + "logits/rejected": -1.8239634037017822, + "logps/chosen": -1.8853652477264404, + "logps/rejected": -2.176085948944092, + "loss": 2.8686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.853652954101562, + "rewards/margins": 2.907205581665039, + "rewards/rejected": -21.7608585357666, + "step": 17155 + }, + { + "epoch": 0.5783814756142776, + "grad_norm": 21.96685218811035, + "learning_rate": 4.5054871294802056e-07, + "logits/chosen": -1.6689783334732056, + "logits/rejected": -1.7678687572479248, + "logps/chosen": -2.287881374359131, + "logps/rejected": -2.6410670280456543, + "loss": 2.5601, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.87881088256836, + "rewards/margins": 3.531856060028076, + "rewards/rejected": -26.410669326782227, + "step": 17160 + }, + { + "epoch": 0.5785500016852607, + "grad_norm": 87.77336120605469, + "learning_rate": 4.5025602904470084e-07, + "logits/chosen": -1.5644152164459229, + "logits/rejected": -1.750683069229126, + "logps/chosen": -2.5638442039489746, + "logps/rejected": -2.700592041015625, + "loss": 3.1935, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.638439178466797, + "rewards/margins": 1.367479681968689, + "rewards/rejected": -27.00592041015625, + "step": 17165 + }, + { + "epoch": 0.5787185277562439, + "grad_norm": 22.31219482421875, + "learning_rate": 4.499633623558097e-07, + "logits/chosen": -2.0407156944274902, + "logits/rejected": -1.9517043828964233, + "logps/chosen": -2.0538718700408936, + "logps/rejected": -2.209723949432373, + "loss": 3.3678, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.538721084594727, + "rewards/margins": 1.5585215091705322, + "rewards/rejected": -22.097240447998047, + "step": 17170 + }, + { + "epoch": 0.578887053827227, + "grad_norm": 24.899978637695312, + "learning_rate": 4.496707129826274e-07, + "logits/chosen": -1.3175891637802124, + "logits/rejected": -1.6883060932159424, + "logps/chosen": -2.261355400085449, + "logps/rejected": -2.381133556365967, + "loss": 2.5608, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.61355209350586, + "rewards/margins": 1.1977792978286743, + "rewards/rejected": -23.81133460998535, + "step": 17175 + }, + { + "epoch": 0.5790555798982102, + "grad_norm": 18.139432907104492, + "learning_rate": 4.493780810264284e-07, + "logits/chosen": -1.407244324684143, + "logits/rejected": -1.4826358556747437, + "logps/chosen": -2.506412982940674, + "logps/rejected": -2.7521300315856934, + "loss": 1.9552, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.064128875732422, + "rewards/margins": 2.4571704864501953, + "rewards/rejected": -27.521297454833984, + "step": 17180 + }, + { + "epoch": 0.5792241059691934, + "grad_norm": 27.345720291137695, + "learning_rate": 4.490854665884814e-07, + "logits/chosen": -1.4379069805145264, + "logits/rejected": -1.525801420211792, + "logps/chosen": -2.407961368560791, + "logps/rejected": -2.475123167037964, + "loss": 3.0396, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.07961654663086, + "rewards/margins": 0.6716176271438599, + "rewards/rejected": -24.751232147216797, + "step": 17185 + }, + { + "epoch": 0.5793926320401767, + "grad_norm": 22.395584106445312, + "learning_rate": 4.487928697700482e-07, + "logits/chosen": -1.871355652809143, + "logits/rejected": -1.9694700241088867, + "logps/chosen": -2.5732643604278564, + "logps/rejected": -3.7709145545959473, + "loss": 1.3826, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.732641220092773, + "rewards/margins": 11.976503372192383, + "rewards/rejected": -37.709144592285156, + "step": 17190 + }, + { + "epoch": 0.5795611581111598, + "grad_norm": 99.64684295654297, + "learning_rate": 4.4850029067238536e-07, + "logits/chosen": -1.2232414484024048, + "logits/rejected": -1.756906270980835, + "logps/chosen": -2.6735825538635254, + "logps/rejected": -3.592395782470703, + "loss": 1.9555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.735824584960938, + "rewards/margins": 9.18813419342041, + "rewards/rejected": -35.92395782470703, + "step": 17195 + }, + { + "epoch": 0.579729684182143, + "grad_norm": 20.795007705688477, + "learning_rate": 4.4820772939674286e-07, + "logits/chosen": -1.6001487970352173, + "logits/rejected": -1.7138382196426392, + "logps/chosen": -2.831948757171631, + "logps/rejected": -3.1137797832489014, + "loss": 3.152, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.31948471069336, + "rewards/margins": 2.8183116912841797, + "rewards/rejected": -31.137798309326172, + "step": 17200 + }, + { + "epoch": 0.579729684182143, + "eval_logits/chosen": -1.9627069234848022, + "eval_logits/rejected": -2.10943603515625, + "eval_logps/chosen": -2.1293704509735107, + "eval_logps/rejected": -2.267233371734619, + "eval_loss": 2.9915850162506104, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.2937068939209, + "eval_rewards/margins": 1.3786267042160034, + "eval_rewards/rejected": -22.672330856323242, + "eval_runtime": 12.9271, + "eval_samples_per_second": 7.736, + "eval_steps_per_second": 1.934, + "step": 17200 + }, + { + "epoch": 0.5798982102531262, + "grad_norm": 20.8897647857666, + "learning_rate": 4.479151860443649e-07, + "logits/chosen": -2.012810468673706, + "logits/rejected": -2.060838222503662, + "logps/chosen": -2.4459307193756104, + "logps/rejected": -2.3485682010650635, + "loss": 4.6796, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.459306716918945, + "rewards/margins": -0.9736261367797852, + "rewards/rejected": -23.485681533813477, + "step": 17205 + }, + { + "epoch": 0.5800667363241093, + "grad_norm": 43.36587905883789, + "learning_rate": 4.476226607164888e-07, + "logits/chosen": -2.007704973220825, + "logits/rejected": -1.7056808471679688, + "logps/chosen": -2.324647903442383, + "logps/rejected": -2.256413459777832, + "loss": 4.0325, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.246479034423828, + "rewards/margins": -0.6823431849479675, + "rewards/rejected": -22.564136505126953, + "step": 17210 + }, + { + "epoch": 0.5802352623950925, + "grad_norm": 18.71867561340332, + "learning_rate": 4.473301535143462e-07, + "logits/chosen": -1.7911930084228516, + "logits/rejected": -1.8705055713653564, + "logps/chosen": -1.8644154071807861, + "logps/rejected": -1.8933528661727905, + "loss": 3.1285, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.644153594970703, + "rewards/margins": 0.2893770635128021, + "rewards/rejected": -18.933528900146484, + "step": 17215 + }, + { + "epoch": 0.5804037884660757, + "grad_norm": 30.68587875366211, + "learning_rate": 4.4703766453916263e-07, + "logits/chosen": -1.858590841293335, + "logits/rejected": -2.0414671897888184, + "logps/chosen": -2.1605629920959473, + "logps/rejected": -2.3674232959747314, + "loss": 3.1348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.605627059936523, + "rewards/margins": 2.068603515625, + "rewards/rejected": -23.674230575561523, + "step": 17220 + }, + { + "epoch": 0.5805723145370589, + "grad_norm": 31.98077964782715, + "learning_rate": 4.467451938921565e-07, + "logits/chosen": -1.209812879562378, + "logits/rejected": -1.4913972616195679, + "logps/chosen": -1.8544597625732422, + "logps/rejected": -2.0227410793304443, + "loss": 3.0677, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.544597625732422, + "rewards/margins": 1.682814359664917, + "rewards/rejected": -20.227413177490234, + "step": 17225 + }, + { + "epoch": 0.5807408406080421, + "grad_norm": 56.92514419555664, + "learning_rate": 4.4645274167454053e-07, + "logits/chosen": -1.7451435327529907, + "logits/rejected": -1.7276217937469482, + "logps/chosen": -2.22159743309021, + "logps/rejected": -2.313671112060547, + "loss": 3.6201, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.215974807739258, + "rewards/margins": 0.9207379221916199, + "rewards/rejected": -23.136709213256836, + "step": 17230 + }, + { + "epoch": 0.5809093666790253, + "grad_norm": 30.251081466674805, + "learning_rate": 4.4616030798752106e-07, + "logits/chosen": -1.287561297416687, + "logits/rejected": -1.501903772354126, + "logps/chosen": -2.222918748855591, + "logps/rejected": -2.593648910522461, + "loss": 1.8023, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.22918701171875, + "rewards/margins": 3.7073047161102295, + "rewards/rejected": -25.93648910522461, + "step": 17235 + }, + { + "epoch": 0.5810778927500084, + "grad_norm": 28.843341827392578, + "learning_rate": 4.458678929322979e-07, + "logits/chosen": -1.5157232284545898, + "logits/rejected": -1.330273985862732, + "logps/chosen": -3.0622832775115967, + "logps/rejected": -3.581799268722534, + "loss": 2.1884, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.622833251953125, + "rewards/margins": 5.195162296295166, + "rewards/rejected": -35.8179931640625, + "step": 17240 + }, + { + "epoch": 0.5812464188209916, + "grad_norm": 14.910189628601074, + "learning_rate": 4.4557549661006417e-07, + "logits/chosen": -2.0796093940734863, + "logits/rejected": -2.4145195484161377, + "logps/chosen": -2.002706527709961, + "logps/rejected": -2.060037612915039, + "loss": 3.3767, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.027063369750977, + "rewards/margins": 0.5733121037483215, + "rewards/rejected": -20.60037612915039, + "step": 17245 + }, + { + "epoch": 0.5814149448919748, + "grad_norm": 5.407770156860352, + "learning_rate": 4.4528311912200685e-07, + "logits/chosen": -1.4320753812789917, + "logits/rejected": -1.643031120300293, + "logps/chosen": -2.594674587249756, + "logps/rejected": -3.0666213035583496, + "loss": 1.6656, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.946746826171875, + "rewards/margins": 4.719465255737305, + "rewards/rejected": -30.666210174560547, + "step": 17250 + }, + { + "epoch": 0.5815834709629579, + "grad_norm": 294.2926940917969, + "learning_rate": 4.449907605693064e-07, + "logits/chosen": -1.483459711074829, + "logits/rejected": -1.3290231227874756, + "logps/chosen": -2.4088211059570312, + "logps/rejected": -2.4589452743530273, + "loss": 3.8198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.088211059570312, + "rewards/margins": 0.501244843006134, + "rewards/rejected": -24.589452743530273, + "step": 17255 + }, + { + "epoch": 0.5817519970339412, + "grad_norm": 20.795665740966797, + "learning_rate": 4.446984210531363e-07, + "logits/chosen": -1.3401178121566772, + "logits/rejected": -1.5118831396102905, + "logps/chosen": -1.9169412851333618, + "logps/rejected": -1.932276964187622, + "loss": 3.2381, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.169414520263672, + "rewards/margins": 0.1533549278974533, + "rewards/rejected": -19.322769165039062, + "step": 17260 + }, + { + "epoch": 0.5819205231049244, + "grad_norm": 7.758377552032471, + "learning_rate": 4.444061006746638e-07, + "logits/chosen": -1.9944626092910767, + "logits/rejected": -2.3274025917053223, + "logps/chosen": -2.7576446533203125, + "logps/rejected": -3.061986207962036, + "loss": 2.1118, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.57644271850586, + "rewards/margins": 3.043414354324341, + "rewards/rejected": -30.619861602783203, + "step": 17265 + }, + { + "epoch": 0.5820890491759075, + "grad_norm": 18.494991302490234, + "learning_rate": 4.441137995350496e-07, + "logits/chosen": -1.7012603282928467, + "logits/rejected": -1.7783534526824951, + "logps/chosen": -2.5444867610931396, + "logps/rejected": -2.4805233478546143, + "loss": 5.1853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.444866180419922, + "rewards/margins": -0.6396337747573853, + "rewards/rejected": -24.805233001708984, + "step": 17270 + }, + { + "epoch": 0.5822575752468907, + "grad_norm": 51.310508728027344, + "learning_rate": 4.438215177354477e-07, + "logits/chosen": -1.655430555343628, + "logits/rejected": -1.7902733087539673, + "logps/chosen": -1.914145827293396, + "logps/rejected": -1.9054285287857056, + "loss": 3.3195, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.141460418701172, + "rewards/margins": -0.08717469871044159, + "rewards/rejected": -19.054285049438477, + "step": 17275 + }, + { + "epoch": 0.5824261013178739, + "grad_norm": 22.34128189086914, + "learning_rate": 4.43529255377005e-07, + "logits/chosen": -1.6213443279266357, + "logits/rejected": -1.6654713153839111, + "logps/chosen": -2.4206910133361816, + "logps/rejected": -2.515307903289795, + "loss": 3.7372, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.206905364990234, + "rewards/margins": 0.9461703300476074, + "rewards/rejected": -25.153076171875, + "step": 17280 + }, + { + "epoch": 0.582594627388857, + "grad_norm": 27.15727424621582, + "learning_rate": 4.432370125608622e-07, + "logits/chosen": -1.858704924583435, + "logits/rejected": -2.0804834365844727, + "logps/chosen": -1.887465238571167, + "logps/rejected": -2.244636058807373, + "loss": 1.7039, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.874652862548828, + "rewards/margins": 3.5717086791992188, + "rewards/rejected": -22.446361541748047, + "step": 17285 + }, + { + "epoch": 0.5827631534598402, + "grad_norm": 50.979434967041016, + "learning_rate": 4.429447893881531e-07, + "logits/chosen": -1.3986759185791016, + "logits/rejected": -1.5457799434661865, + "logps/chosen": -1.9006513357162476, + "logps/rejected": -1.94772207736969, + "loss": 3.0563, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.006513595581055, + "rewards/margins": 0.4707070291042328, + "rewards/rejected": -19.47722053527832, + "step": 17290 + }, + { + "epoch": 0.5829316795308234, + "grad_norm": 43.54460525512695, + "learning_rate": 4.4265258596000434e-07, + "logits/chosen": -1.9640693664550781, + "logits/rejected": -2.306492567062378, + "logps/chosen": -2.4300239086151123, + "logps/rejected": -3.905445098876953, + "loss": 1.4328, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.30023956298828, + "rewards/margins": 14.75421142578125, + "rewards/rejected": -39.05445098876953, + "step": 17295 + }, + { + "epoch": 0.5831002056018066, + "grad_norm": 17.482149124145508, + "learning_rate": 4.423604023775361e-07, + "logits/chosen": -1.5966932773590088, + "logits/rejected": -1.7289857864379883, + "logps/chosen": -3.195779323577881, + "logps/rejected": -3.491218090057373, + "loss": 2.7372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.957794189453125, + "rewards/margins": 2.9543869495391846, + "rewards/rejected": -34.91218185424805, + "step": 17300 + }, + { + "epoch": 0.5832687316727898, + "grad_norm": 47.70451354980469, + "learning_rate": 4.4206823874186184e-07, + "logits/chosen": -1.1667314767837524, + "logits/rejected": -1.2670345306396484, + "logps/chosen": -3.233905076980591, + "logps/rejected": -3.1098103523254395, + "loss": 5.1382, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.339054107666016, + "rewards/margins": -1.2409486770629883, + "rewards/rejected": -31.098102569580078, + "step": 17305 + }, + { + "epoch": 0.583437257743773, + "grad_norm": 42.176513671875, + "learning_rate": 4.4177609515408773e-07, + "logits/chosen": -2.0141780376434326, + "logits/rejected": -2.237900495529175, + "logps/chosen": -2.178071975708008, + "logps/rejected": -2.326620101928711, + "loss": 4.262, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.780719757080078, + "rewards/margins": 1.4854825735092163, + "rewards/rejected": -23.26620101928711, + "step": 17310 + }, + { + "epoch": 0.5836057838147561, + "grad_norm": 37.14058303833008, + "learning_rate": 4.414839717153131e-07, + "logits/chosen": -1.9146333932876587, + "logits/rejected": -2.0396671295166016, + "logps/chosen": -2.8835787773132324, + "logps/rejected": -2.882293701171875, + "loss": 4.3341, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.83578872680664, + "rewards/margins": -0.012850761413574219, + "rewards/rejected": -28.82293701171875, + "step": 17315 + }, + { + "epoch": 0.5837743098857393, + "grad_norm": 35.45816421508789, + "learning_rate": 4.411918685266304e-07, + "logits/chosen": -1.8805458545684814, + "logits/rejected": -1.6515352725982666, + "logps/chosen": -2.736062526702881, + "logps/rejected": -2.856140613555908, + "loss": 3.9732, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.360626220703125, + "rewards/margins": 1.200781226158142, + "rewards/rejected": -28.5614070892334, + "step": 17320 + }, + { + "epoch": 0.5839428359567225, + "grad_norm": 38.65712356567383, + "learning_rate": 4.408997856891253e-07, + "logits/chosen": -1.8081859350204468, + "logits/rejected": -2.07198166847229, + "logps/chosen": -2.44268536567688, + "logps/rejected": -3.3284401893615723, + "loss": 2.2041, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.42685317993164, + "rewards/margins": 8.85754680633545, + "rewards/rejected": -33.284400939941406, + "step": 17325 + }, + { + "epoch": 0.5841113620277056, + "grad_norm": 15.86058235168457, + "learning_rate": 4.4060772330387564e-07, + "logits/chosen": -1.563691258430481, + "logits/rejected": -1.9738092422485352, + "logps/chosen": -2.079322099685669, + "logps/rejected": -2.628232717514038, + "loss": 1.3509, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.793222427368164, + "rewards/margins": 5.489107131958008, + "rewards/rejected": -26.28232765197754, + "step": 17330 + }, + { + "epoch": 0.5842798880986889, + "grad_norm": 16.25583839416504, + "learning_rate": 4.40315681471953e-07, + "logits/chosen": -1.6212646961212158, + "logits/rejected": -1.8121631145477295, + "logps/chosen": -2.2856125831604004, + "logps/rejected": -2.2472023963928223, + "loss": 3.7451, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.85612678527832, + "rewards/margins": -0.38410359621047974, + "rewards/rejected": -22.47202491760254, + "step": 17335 + }, + { + "epoch": 0.5844484141696721, + "grad_norm": 29.76826286315918, + "learning_rate": 4.4002366029442154e-07, + "logits/chosen": -1.1239495277404785, + "logits/rejected": -1.1791913509368896, + "logps/chosen": -3.060885190963745, + "logps/rejected": -3.046093702316284, + "loss": 4.3675, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.608850479125977, + "rewards/margins": -0.14791660010814667, + "rewards/rejected": -30.4609375, + "step": 17340 + }, + { + "epoch": 0.5846169402406552, + "grad_norm": 20.629085540771484, + "learning_rate": 4.397316598723385e-07, + "logits/chosen": -1.5575644969940186, + "logits/rejected": -2.092625141143799, + "logps/chosen": -2.773283004760742, + "logps/rejected": -3.0996828079223633, + "loss": 2.2424, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.732830047607422, + "rewards/margins": 3.2639968395233154, + "rewards/rejected": -30.996826171875, + "step": 17345 + }, + { + "epoch": 0.5847854663116384, + "grad_norm": 18.53481101989746, + "learning_rate": 4.394396803067533e-07, + "logits/chosen": -1.792412519454956, + "logits/rejected": -1.9561207294464111, + "logps/chosen": -2.1911630630493164, + "logps/rejected": -2.5339317321777344, + "loss": 3.261, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.911630630493164, + "rewards/margins": 3.4276881217956543, + "rewards/rejected": -25.339317321777344, + "step": 17350 + }, + { + "epoch": 0.5849539923826216, + "grad_norm": 9.772783279418945, + "learning_rate": 4.391477216987088e-07, + "logits/chosen": -0.737585723400116, + "logits/rejected": -1.3849313259124756, + "logps/chosen": -2.2479326725006104, + "logps/rejected": -3.033356189727783, + "loss": 2.179, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.479326248168945, + "rewards/margins": 7.8542375564575195, + "rewards/rejected": -30.33356285095215, + "step": 17355 + }, + { + "epoch": 0.5851225184536047, + "grad_norm": 35.8283805847168, + "learning_rate": 4.3885578414924054e-07, + "logits/chosen": -1.527430534362793, + "logits/rejected": -1.5946701765060425, + "logps/chosen": -2.0074801445007324, + "logps/rejected": -1.9396559000015259, + "loss": 3.876, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.07480239868164, + "rewards/margins": -0.6782447695732117, + "rewards/rejected": -19.396560668945312, + "step": 17360 + }, + { + "epoch": 0.5852910445245879, + "grad_norm": 44.97746658325195, + "learning_rate": 4.385638677593761e-07, + "logits/chosen": -1.3549509048461914, + "logits/rejected": -1.7077839374542236, + "logps/chosen": -2.1543612480163574, + "logps/rejected": -2.287923574447632, + "loss": 2.8444, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.543611526489258, + "rewards/margins": 1.3356250524520874, + "rewards/rejected": -22.87923812866211, + "step": 17365 + }, + { + "epoch": 0.5854595705955712, + "grad_norm": 40.57670974731445, + "learning_rate": 4.382719726301366e-07, + "logits/chosen": -0.7811486124992371, + "logits/rejected": -1.128101110458374, + "logps/chosen": -2.0559234619140625, + "logps/rejected": -2.356846332550049, + "loss": 1.8674, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.559234619140625, + "rewards/margins": 3.0092263221740723, + "rewards/rejected": -23.568462371826172, + "step": 17370 + }, + { + "epoch": 0.5856280966665544, + "grad_norm": 23.4588565826416, + "learning_rate": 4.379800988625354e-07, + "logits/chosen": -1.4437754154205322, + "logits/rejected": -1.4596506357192993, + "logps/chosen": -1.832118272781372, + "logps/rejected": -1.9595096111297607, + "loss": 2.1277, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.321184158325195, + "rewards/margins": 1.273911952972412, + "rewards/rejected": -19.595096588134766, + "step": 17375 + }, + { + "epoch": 0.5857966227375375, + "grad_norm": 68.99215698242188, + "learning_rate": 4.3768824655757854e-07, + "logits/chosen": -1.3112378120422363, + "logits/rejected": -1.148902416229248, + "logps/chosen": -2.7348835468292236, + "logps/rejected": -2.5241928100585938, + "loss": 5.5747, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.348834991455078, + "rewards/margins": -2.1069068908691406, + "rewards/rejected": -25.241928100585938, + "step": 17380 + }, + { + "epoch": 0.5859651488085207, + "grad_norm": 2.611574649810791, + "learning_rate": 4.3739641581626453e-07, + "logits/chosen": -1.5041404962539673, + "logits/rejected": -1.6578476428985596, + "logps/chosen": -2.2694530487060547, + "logps/rejected": -2.3508238792419434, + "loss": 3.3058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.694530487060547, + "rewards/margins": 0.8137068748474121, + "rewards/rejected": -23.508237838745117, + "step": 17385 + }, + { + "epoch": 0.5861336748795039, + "grad_norm": 28.102005004882812, + "learning_rate": 4.371046067395846e-07, + "logits/chosen": -1.4805330038070679, + "logits/rejected": -1.919136643409729, + "logps/chosen": -1.7919280529022217, + "logps/rejected": -1.934385061264038, + "loss": 2.1956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.919281005859375, + "rewards/margins": 1.4245693683624268, + "rewards/rejected": -19.34385108947754, + "step": 17390 + }, + { + "epoch": 0.586302200950487, + "grad_norm": 78.70321655273438, + "learning_rate": 4.368128194285223e-07, + "logits/chosen": -1.7666633129119873, + "logits/rejected": -1.8986902236938477, + "logps/chosen": -2.514099597930908, + "logps/rejected": -2.6970717906951904, + "loss": 2.7538, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.140995025634766, + "rewards/margins": 1.829724669456482, + "rewards/rejected": -26.970718383789062, + "step": 17395 + }, + { + "epoch": 0.5864707270214702, + "grad_norm": 33.610618591308594, + "learning_rate": 4.365210539840536e-07, + "logits/chosen": -1.8147218227386475, + "logits/rejected": -1.7239938974380493, + "logps/chosen": -1.9119583368301392, + "logps/rejected": -1.9474995136260986, + "loss": 3.2428, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.119583129882812, + "rewards/margins": 0.3554133474826813, + "rewards/rejected": -19.47499656677246, + "step": 17400 + }, + { + "epoch": 0.5866392530924533, + "grad_norm": 78.48345947265625, + "learning_rate": 4.3622931050714713e-07, + "logits/chosen": -1.8521192073822021, + "logits/rejected": -1.9285533428192139, + "logps/chosen": -2.5423624515533447, + "logps/rejected": -2.8399410247802734, + "loss": 2.6434, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.42362403869629, + "rewards/margins": 2.9757864475250244, + "rewards/rejected": -28.399410247802734, + "step": 17405 + }, + { + "epoch": 0.5868077791634366, + "grad_norm": 25.898334503173828, + "learning_rate": 4.359375890987641e-07, + "logits/chosen": -1.4780672788619995, + "logits/rejected": -1.896118402481079, + "logps/chosen": -2.5103867053985596, + "logps/rejected": -3.384552001953125, + "loss": 1.8391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.103864669799805, + "rewards/margins": 8.741653442382812, + "rewards/rejected": -33.84552001953125, + "step": 17410 + }, + { + "epoch": 0.5869763052344198, + "grad_norm": 102.30123901367188, + "learning_rate": 4.356458898598572e-07, + "logits/chosen": -1.2664697170257568, + "logits/rejected": -1.3566185235977173, + "logps/chosen": -2.1262001991271973, + "logps/rejected": -2.1818442344665527, + "loss": 3.5012, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.26200294494629, + "rewards/margins": 0.5564396977424622, + "rewards/rejected": -21.81844139099121, + "step": 17415 + }, + { + "epoch": 0.587144831305403, + "grad_norm": 192.08316040039062, + "learning_rate": 4.353542128913725e-07, + "logits/chosen": -1.8345438241958618, + "logits/rejected": -1.7843061685562134, + "logps/chosen": -3.290046215057373, + "logps/rejected": -2.989392042160034, + "loss": 7.0941, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -32.90045928955078, + "rewards/margins": -3.0065383911132812, + "rewards/rejected": -29.8939208984375, + "step": 17420 + }, + { + "epoch": 0.5873133573763861, + "grad_norm": 25.507287979125977, + "learning_rate": 4.350625582942477e-07, + "logits/chosen": -1.751320481300354, + "logits/rejected": -1.5903288125991821, + "logps/chosen": -2.034891366958618, + "logps/rejected": -2.0643208026885986, + "loss": 3.1321, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.34891128540039, + "rewards/margins": 0.2942947745323181, + "rewards/rejected": -20.643207550048828, + "step": 17425 + }, + { + "epoch": 0.5874818834473693, + "grad_norm": 51.03654479980469, + "learning_rate": 4.347709261694133e-07, + "logits/chosen": -0.9774719476699829, + "logits/rejected": -1.4587715864181519, + "logps/chosen": -2.1797828674316406, + "logps/rejected": -2.532813549041748, + "loss": 2.059, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.797828674316406, + "rewards/margins": 3.530308485031128, + "rewards/rejected": -25.328136444091797, + "step": 17430 + }, + { + "epoch": 0.5876504095183525, + "grad_norm": 13.80114459991455, + "learning_rate": 4.3447931661779117e-07, + "logits/chosen": -2.0016977787017822, + "logits/rejected": -2.1547093391418457, + "logps/chosen": -2.693709135055542, + "logps/rejected": -2.9496166706085205, + "loss": 3.2403, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.937091827392578, + "rewards/margins": 2.559072732925415, + "rewards/rejected": -29.496164321899414, + "step": 17435 + }, + { + "epoch": 0.5878189355893356, + "grad_norm": 26.799381256103516, + "learning_rate": 4.341877297402962e-07, + "logits/chosen": -1.6604926586151123, + "logits/rejected": -2.074387550354004, + "logps/chosen": -1.9511470794677734, + "logps/rejected": -2.075636386871338, + "loss": 2.371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.5114688873291, + "rewards/margins": 1.2448922395706177, + "rewards/rejected": -20.756362915039062, + "step": 17440 + }, + { + "epoch": 0.5879874616603189, + "grad_norm": 20.89389419555664, + "learning_rate": 4.3389616563783513e-07, + "logits/chosen": -1.4348245859146118, + "logits/rejected": -1.3038320541381836, + "logps/chosen": -2.3064918518066406, + "logps/rejected": -2.6267943382263184, + "loss": 3.0722, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.064918518066406, + "rewards/margins": 3.203024387359619, + "rewards/rejected": -26.2679443359375, + "step": 17445 + }, + { + "epoch": 0.5881559877313021, + "grad_norm": 63.85792541503906, + "learning_rate": 4.336046244113066e-07, + "logits/chosen": -1.6739234924316406, + "logits/rejected": -1.5325616598129272, + "logps/chosen": -1.8348830938339233, + "logps/rejected": -1.812511682510376, + "loss": 3.3974, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.348831176757812, + "rewards/margins": -0.2237166464328766, + "rewards/rejected": -18.1251163482666, + "step": 17450 + }, + { + "epoch": 0.5883245138022852, + "grad_norm": 24.068666458129883, + "learning_rate": 4.3331310616160187e-07, + "logits/chosen": -1.8047908544540405, + "logits/rejected": -1.758195161819458, + "logps/chosen": -1.8868213891983032, + "logps/rejected": -1.9505535364151, + "loss": 2.784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.868213653564453, + "rewards/margins": 0.6373217701911926, + "rewards/rejected": -19.505535125732422, + "step": 17455 + }, + { + "epoch": 0.5884930398732684, + "grad_norm": 0.08875492960214615, + "learning_rate": 4.3302161098960364e-07, + "logits/chosen": -1.6345468759536743, + "logits/rejected": -1.958866834640503, + "logps/chosen": -2.1410791873931885, + "logps/rejected": -3.067662000656128, + "loss": 2.0759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.41079330444336, + "rewards/margins": 9.265830039978027, + "rewards/rejected": -30.676620483398438, + "step": 17460 + }, + { + "epoch": 0.5886615659442516, + "grad_norm": 3.284158083260991e-05, + "learning_rate": 4.3273013899618704e-07, + "logits/chosen": -1.3332509994506836, + "logits/rejected": -1.8917903900146484, + "logps/chosen": -2.107168436050415, + "logps/rejected": -2.826239824295044, + "loss": 1.5995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.07168197631836, + "rewards/margins": 7.1907148361206055, + "rewards/rejected": -28.262399673461914, + "step": 17465 + }, + { + "epoch": 0.5888300920152347, + "grad_norm": 44.276790618896484, + "learning_rate": 4.32438690282219e-07, + "logits/chosen": -1.041285753250122, + "logits/rejected": -1.167011022567749, + "logps/chosen": -2.1292452812194824, + "logps/rejected": -2.432764768600464, + "loss": 2.8508, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.292451858520508, + "rewards/margins": 3.0351970195770264, + "rewards/rejected": -24.327648162841797, + "step": 17470 + }, + { + "epoch": 0.5889986180862179, + "grad_norm": 0.5383897423744202, + "learning_rate": 4.3214726494855836e-07, + "logits/chosen": -1.7013810873031616, + "logits/rejected": -1.6997826099395752, + "logps/chosen": -3.0948708057403564, + "logps/rejected": -3.3002877235412598, + "loss": 4.2099, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.948705673217773, + "rewards/margins": 2.0541675090789795, + "rewards/rejected": -33.00287628173828, + "step": 17475 + }, + { + "epoch": 0.5891671441572012, + "grad_norm": 19.071176528930664, + "learning_rate": 4.3185586309605627e-07, + "logits/chosen": -1.5275566577911377, + "logits/rejected": -1.7071269750595093, + "logps/chosen": -2.2619528770446777, + "logps/rejected": -2.693704128265381, + "loss": 2.9347, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.619525909423828, + "rewards/margins": 4.317513465881348, + "rewards/rejected": -26.937042236328125, + "step": 17480 + }, + { + "epoch": 0.5893356702281843, + "grad_norm": 36.96125411987305, + "learning_rate": 4.3156448482555494e-07, + "logits/chosen": -1.653538465499878, + "logits/rejected": -1.7368927001953125, + "logps/chosen": -2.780684232711792, + "logps/rejected": -2.837693214416504, + "loss": 2.7991, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.80684471130371, + "rewards/margins": 0.5700892210006714, + "rewards/rejected": -28.376934051513672, + "step": 17485 + }, + { + "epoch": 0.5895041962991675, + "grad_norm": 66.32109069824219, + "learning_rate": 4.312731302378892e-07, + "logits/chosen": -1.5482505559921265, + "logits/rejected": -1.755061149597168, + "logps/chosen": -2.361372470855713, + "logps/rejected": -2.75365948677063, + "loss": 1.4334, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.613723754882812, + "rewards/margins": 3.922870635986328, + "rewards/rejected": -27.536596298217773, + "step": 17490 + }, + { + "epoch": 0.5896727223701507, + "grad_norm": 29.832361221313477, + "learning_rate": 4.3098179943388535e-07, + "logits/chosen": -1.2624939680099487, + "logits/rejected": -1.3678514957427979, + "logps/chosen": -2.529294729232788, + "logps/rejected": -2.463853120803833, + "loss": 5.1863, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.29294776916504, + "rewards/margins": -0.6544168591499329, + "rewards/rejected": -24.638530731201172, + "step": 17495 + }, + { + "epoch": 0.5898412484411338, + "grad_norm": 32.630340576171875, + "learning_rate": 4.3069049251436175e-07, + "logits/chosen": -1.7602989673614502, + "logits/rejected": -1.858923316001892, + "logps/chosen": -2.544949769973755, + "logps/rejected": -2.550809860229492, + "loss": 5.0199, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.44949722290039, + "rewards/margins": 0.05859985202550888, + "rewards/rejected": -25.508098602294922, + "step": 17500 + }, + { + "epoch": 0.590009774512117, + "grad_norm": 22.349571228027344, + "learning_rate": 4.3039920958012776e-07, + "logits/chosen": -1.661318063735962, + "logits/rejected": -1.627806305885315, + "logps/chosen": -2.583085298538208, + "logps/rejected": -2.57246470451355, + "loss": 3.8321, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.830852508544922, + "rewards/margins": -0.10620470345020294, + "rewards/rejected": -25.724645614624023, + "step": 17505 + }, + { + "epoch": 0.5901783005831002, + "grad_norm": 68.3171615600586, + "learning_rate": 4.3010795073198513e-07, + "logits/chosen": -1.728459119796753, + "logits/rejected": -1.9351695775985718, + "logps/chosen": -2.076179265975952, + "logps/rejected": -2.5930678844451904, + "loss": 2.3447, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.761795043945312, + "rewards/margins": 5.168886661529541, + "rewards/rejected": -25.930679321289062, + "step": 17510 + }, + { + "epoch": 0.5903468266540833, + "grad_norm": 27.479597091674805, + "learning_rate": 4.2981671607072727e-07, + "logits/chosen": -1.640201210975647, + "logits/rejected": -1.7427374124526978, + "logps/chosen": -2.0141711235046387, + "logps/rejected": -2.28794527053833, + "loss": 1.9525, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.141712188720703, + "rewards/margins": 2.7377383708953857, + "rewards/rejected": -22.87944984436035, + "step": 17515 + }, + { + "epoch": 0.5905153527250666, + "grad_norm": 37.086692810058594, + "learning_rate": 4.295255056971389e-07, + "logits/chosen": -1.4790493249893188, + "logits/rejected": -1.347656488418579, + "logps/chosen": -2.7744202613830566, + "logps/rejected": -2.695765972137451, + "loss": 5.654, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.74420166015625, + "rewards/margins": -0.7865422368049622, + "rewards/rejected": -26.957660675048828, + "step": 17520 + }, + { + "epoch": 0.5906838787960498, + "grad_norm": 16.76278305053711, + "learning_rate": 4.2923431971199624e-07, + "logits/chosen": -1.0991032123565674, + "logits/rejected": -1.166424036026001, + "logps/chosen": -2.3206911087036133, + "logps/rejected": -2.3980796337127686, + "loss": 2.8055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.2069091796875, + "rewards/margins": 0.7738858461380005, + "rewards/rejected": -23.980796813964844, + "step": 17525 + }, + { + "epoch": 0.5908524048670329, + "grad_norm": 17.811054229736328, + "learning_rate": 4.289431582160675e-07, + "logits/chosen": -1.578426718711853, + "logits/rejected": -2.2804975509643555, + "logps/chosen": -2.642085313796997, + "logps/rejected": -3.4870471954345703, + "loss": 2.2679, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.420848846435547, + "rewards/margins": 8.449621200561523, + "rewards/rejected": -34.87046813964844, + "step": 17530 + }, + { + "epoch": 0.5910209309380161, + "grad_norm": 24.69849967956543, + "learning_rate": 4.286520213101123e-07, + "logits/chosen": -1.7663204669952393, + "logits/rejected": -1.9045088291168213, + "logps/chosen": -2.673767566680908, + "logps/rejected": -2.9358999729156494, + "loss": 2.1756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.7376766204834, + "rewards/margins": 2.621324300765991, + "rewards/rejected": -29.358999252319336, + "step": 17535 + }, + { + "epoch": 0.5911894570089993, + "grad_norm": 53.869384765625, + "learning_rate": 4.283609090948814e-07, + "logits/chosen": -1.6090246438980103, + "logits/rejected": -1.5867881774902344, + "logps/chosen": -2.0779833793640137, + "logps/rejected": -1.9043381214141846, + "loss": 4.9391, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.779834747314453, + "rewards/margins": -1.7364532947540283, + "rewards/rejected": -19.043380737304688, + "step": 17540 + }, + { + "epoch": 0.5913579830799824, + "grad_norm": 14.778437614440918, + "learning_rate": 4.280698216711174e-07, + "logits/chosen": -1.898503303527832, + "logits/rejected": -2.0935044288635254, + "logps/chosen": -2.9049651622772217, + "logps/rejected": -3.0754752159118652, + "loss": 2.9957, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.049652099609375, + "rewards/margins": 1.7050997018814087, + "rewards/rejected": -30.754751205444336, + "step": 17545 + }, + { + "epoch": 0.5915265091509656, + "grad_norm": 60.79536056518555, + "learning_rate": 4.2777875913955443e-07, + "logits/chosen": -1.1955629587173462, + "logits/rejected": -1.308807134628296, + "logps/chosen": -2.1304657459259033, + "logps/rejected": -2.615813732147217, + "loss": 2.2048, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.304656982421875, + "rewards/margins": 4.853476047515869, + "rewards/rejected": -26.158132553100586, + "step": 17550 + }, + { + "epoch": 0.5916950352219489, + "grad_norm": 35.70866775512695, + "learning_rate": 4.2748772160091727e-07, + "logits/chosen": -2.0288426876068115, + "logits/rejected": -2.134122610092163, + "logps/chosen": -2.0519161224365234, + "logps/rejected": -2.507661819458008, + "loss": 3.2737, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.519161224365234, + "rewards/margins": 4.557457447052002, + "rewards/rejected": -25.076618194580078, + "step": 17555 + }, + { + "epoch": 0.591863561292932, + "grad_norm": 26.37740707397461, + "learning_rate": 4.271967091559228e-07, + "logits/chosen": -1.6266578435897827, + "logits/rejected": -1.8625901937484741, + "logps/chosen": -1.8834251165390015, + "logps/rejected": -1.9434534311294556, + "loss": 3.822, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.834253311157227, + "rewards/margins": 0.6002805829048157, + "rewards/rejected": -19.434532165527344, + "step": 17560 + }, + { + "epoch": 0.5920320873639152, + "grad_norm": 20.472694396972656, + "learning_rate": 4.2690572190527895e-07, + "logits/chosen": -1.5904171466827393, + "logits/rejected": -2.1096081733703613, + "logps/chosen": -2.228877305984497, + "logps/rejected": -2.7297446727752686, + "loss": 1.0736, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.28877067565918, + "rewards/margins": 5.008677005767822, + "rewards/rejected": -27.297449111938477, + "step": 17565 + }, + { + "epoch": 0.5922006134348984, + "grad_norm": 34.900054931640625, + "learning_rate": 4.266147599496852e-07, + "logits/chosen": -1.158190131187439, + "logits/rejected": -1.2331039905548096, + "logps/chosen": -2.0225605964660645, + "logps/rejected": -2.1574673652648926, + "loss": 2.6277, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.22560691833496, + "rewards/margins": 1.349064588546753, + "rewards/rejected": -21.57467269897461, + "step": 17570 + }, + { + "epoch": 0.5923691395058815, + "grad_norm": 54.93463897705078, + "learning_rate": 4.2632382338983153e-07, + "logits/chosen": -1.7310314178466797, + "logits/rejected": -1.6517353057861328, + "logps/chosen": -2.441493511199951, + "logps/rejected": -2.7141823768615723, + "loss": 3.6634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.414932250976562, + "rewards/margins": 2.7268924713134766, + "rewards/rejected": -27.14182472229004, + "step": 17575 + }, + { + "epoch": 0.5925376655768647, + "grad_norm": 43.11684799194336, + "learning_rate": 4.2603291232639984e-07, + "logits/chosen": -1.8522008657455444, + "logits/rejected": -1.6992623805999756, + "logps/chosen": -2.5552072525024414, + "logps/rejected": -2.281377077102661, + "loss": 5.8346, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.552072525024414, + "rewards/margins": -2.7383031845092773, + "rewards/rejected": -22.813770294189453, + "step": 17580 + }, + { + "epoch": 0.5927061916478479, + "grad_norm": 16.700607299804688, + "learning_rate": 4.257420268600632e-07, + "logits/chosen": -1.6553192138671875, + "logits/rejected": -1.7701327800750732, + "logps/chosen": -3.3446273803710938, + "logps/rejected": -4.0541839599609375, + "loss": 1.9073, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.44627380371094, + "rewards/margins": 7.095564365386963, + "rewards/rejected": -40.541839599609375, + "step": 17585 + }, + { + "epoch": 0.5928747177188312, + "grad_norm": 25.058446884155273, + "learning_rate": 4.2545116709148526e-07, + "logits/chosen": -1.351576328277588, + "logits/rejected": -1.4393280744552612, + "logps/chosen": -1.845435380935669, + "logps/rejected": -1.8508222103118896, + "loss": 3.1528, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.4543514251709, + "rewards/margins": 0.05386962741613388, + "rewards/rejected": -18.508222579956055, + "step": 17590 + }, + { + "epoch": 0.5930432437898143, + "grad_norm": 40.690330505371094, + "learning_rate": 4.251603331213213e-07, + "logits/chosen": -0.8327827453613281, + "logits/rejected": -1.0143380165100098, + "logps/chosen": -2.506579875946045, + "logps/rejected": -2.4614579677581787, + "loss": 3.6507, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.065799713134766, + "rewards/margins": -0.451219379901886, + "rewards/rejected": -24.614582061767578, + "step": 17595 + }, + { + "epoch": 0.5932117698607975, + "grad_norm": 37.29527282714844, + "learning_rate": 4.248695250502174e-07, + "logits/chosen": -1.3765560388565063, + "logits/rejected": -1.6383224725723267, + "logps/chosen": -2.117464780807495, + "logps/rejected": -2.629913330078125, + "loss": 1.8856, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.17464828491211, + "rewards/margins": 5.124485969543457, + "rewards/rejected": -26.29913330078125, + "step": 17600 + }, + { + "epoch": 0.5932117698607975, + "eval_logits/chosen": -1.9636667966842651, + "eval_logits/rejected": -2.110840082168579, + "eval_logps/chosen": -2.127274513244629, + "eval_logps/rejected": -2.264631509780884, + "eval_loss": 2.9846508502960205, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.272747039794922, + "eval_rewards/margins": 1.3735666275024414, + "eval_rewards/rejected": -22.646312713623047, + "eval_runtime": 12.9237, + "eval_samples_per_second": 7.738, + "eval_steps_per_second": 1.934, + "step": 17600 + }, + { + "epoch": 0.5933802959317807, + "grad_norm": 19.281713485717773, + "learning_rate": 4.2457874297881105e-07, + "logits/chosen": -1.4907379150390625, + "logits/rejected": -1.6906015872955322, + "logps/chosen": -2.0307023525238037, + "logps/rejected": -2.305128574371338, + "loss": 2.3951, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.307024002075195, + "rewards/margins": 2.744263172149658, + "rewards/rejected": -23.051288604736328, + "step": 17605 + }, + { + "epoch": 0.5935488220027638, + "grad_norm": 32.606544494628906, + "learning_rate": 4.242879870077301e-07, + "logits/chosen": -1.3404357433319092, + "logits/rejected": -1.6816742420196533, + "logps/chosen": -2.534003734588623, + "logps/rejected": -2.823920726776123, + "loss": 2.3159, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.340038299560547, + "rewards/margins": 2.899167060852051, + "rewards/rejected": -28.23920249938965, + "step": 17610 + }, + { + "epoch": 0.593717348073747, + "grad_norm": 21.681140899658203, + "learning_rate": 4.2399725723759395e-07, + "logits/chosen": -1.5766531229019165, + "logits/rejected": -2.337092399597168, + "logps/chosen": -2.5060973167419434, + "logps/rejected": -2.8640053272247314, + "loss": 1.2001, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.060977935791016, + "rewards/margins": 3.5790741443634033, + "rewards/rejected": -28.64004898071289, + "step": 17615 + }, + { + "epoch": 0.5938858741447302, + "grad_norm": 30.393402099609375, + "learning_rate": 4.2370655376901286e-07, + "logits/chosen": -1.6135174036026, + "logits/rejected": -1.6738744974136353, + "logps/chosen": -2.3300719261169434, + "logps/rejected": -2.5740814208984375, + "loss": 1.3831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.30072021484375, + "rewards/margins": 2.4400906562805176, + "rewards/rejected": -25.740814208984375, + "step": 17620 + }, + { + "epoch": 0.5940544002157133, + "grad_norm": 48.35916519165039, + "learning_rate": 4.234158767025875e-07, + "logits/chosen": -1.544013261795044, + "logits/rejected": -1.8595256805419922, + "logps/chosen": -2.1459286212921143, + "logps/rejected": -2.489976406097412, + "loss": 1.4423, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.459285736083984, + "rewards/margins": 3.4404807090759277, + "rewards/rejected": -24.89976692199707, + "step": 17625 + }, + { + "epoch": 0.5942229262866966, + "grad_norm": 18.79473876953125, + "learning_rate": 4.231252261389099e-07, + "logits/chosen": -1.719613790512085, + "logits/rejected": -1.609312653541565, + "logps/chosen": -2.18891978263855, + "logps/rejected": -2.1651229858398438, + "loss": 3.4594, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.88920021057129, + "rewards/margins": -0.23796892166137695, + "rewards/rejected": -21.651229858398438, + "step": 17630 + }, + { + "epoch": 0.5943914523576798, + "grad_norm": 35.13705062866211, + "learning_rate": 4.2283460217856275e-07, + "logits/chosen": -1.4112998247146606, + "logits/rejected": -1.5887477397918701, + "logps/chosen": -2.176684856414795, + "logps/rejected": -2.39996337890625, + "loss": 2.3311, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.766849517822266, + "rewards/margins": 2.232788562774658, + "rewards/rejected": -23.999637603759766, + "step": 17635 + }, + { + "epoch": 0.5945599784286629, + "grad_norm": 41.7555046081543, + "learning_rate": 4.225440049221198e-07, + "logits/chosen": -1.7434478998184204, + "logits/rejected": -1.6766493320465088, + "logps/chosen": -1.836517333984375, + "logps/rejected": -1.7924884557724, + "loss": 3.6239, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.365171432495117, + "rewards/margins": -0.4402883052825928, + "rewards/rejected": -17.924884796142578, + "step": 17640 + }, + { + "epoch": 0.5947285044996461, + "grad_norm": 21.95073890686035, + "learning_rate": 4.222534344701449e-07, + "logits/chosen": -1.4925669431686401, + "logits/rejected": -1.6031299829483032, + "logps/chosen": -2.6258418560028076, + "logps/rejected": -2.924691677093506, + "loss": 2.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.258419036865234, + "rewards/margins": 2.9885010719299316, + "rewards/rejected": -29.24692153930664, + "step": 17645 + }, + { + "epoch": 0.5948970305706293, + "grad_norm": 30.1873779296875, + "learning_rate": 4.2196289092319333e-07, + "logits/chosen": -1.2937986850738525, + "logits/rejected": -1.3382008075714111, + "logps/chosen": -2.9433345794677734, + "logps/rejected": -2.8709471225738525, + "loss": 5.1872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.4333438873291, + "rewards/margins": -0.7238739728927612, + "rewards/rejected": -28.709468841552734, + "step": 17650 + }, + { + "epoch": 0.5950655566416124, + "grad_norm": 0.43724510073661804, + "learning_rate": 4.216723743818108e-07, + "logits/chosen": -1.4383492469787598, + "logits/rejected": -1.4420338869094849, + "logps/chosen": -1.880933403968811, + "logps/rejected": -2.153778553009033, + "loss": 2.3401, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.8093318939209, + "rewards/margins": 2.728450059890747, + "rewards/rejected": -21.537784576416016, + "step": 17655 + }, + { + "epoch": 0.5952340827125956, + "grad_norm": 60.6141357421875, + "learning_rate": 4.2138188494653336e-07, + "logits/chosen": -1.254163146018982, + "logits/rejected": -1.2855738401412964, + "logps/chosen": -1.8590952157974243, + "logps/rejected": -2.130317211151123, + "loss": 2.0457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.590951919555664, + "rewards/margins": 2.712218999862671, + "rewards/rejected": -21.303173065185547, + "step": 17660 + }, + { + "epoch": 0.5954026087835789, + "grad_norm": 222.0001678466797, + "learning_rate": 4.2109142271788805e-07, + "logits/chosen": -1.451233983039856, + "logits/rejected": -1.878170371055603, + "logps/chosen": -2.879201889038086, + "logps/rejected": -3.0601067543029785, + "loss": 5.7789, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.79201889038086, + "rewards/margins": 1.8090499639511108, + "rewards/rejected": -30.6010684967041, + "step": 17665 + }, + { + "epoch": 0.595571134854562, + "grad_norm": 18.758010864257812, + "learning_rate": 4.208009877963925e-07, + "logits/chosen": -1.7818104028701782, + "logits/rejected": -2.403594493865967, + "logps/chosen": -2.9406161308288574, + "logps/rejected": -3.6528496742248535, + "loss": 1.6939, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.406158447265625, + "rewards/margins": 7.1223344802856445, + "rewards/rejected": -36.52849578857422, + "step": 17670 + }, + { + "epoch": 0.5957396609255452, + "grad_norm": 15.573593139648438, + "learning_rate": 4.205105802825548e-07, + "logits/chosen": -1.4992997646331787, + "logits/rejected": -1.5927377939224243, + "logps/chosen": -2.3704192638397217, + "logps/rejected": -2.571882724761963, + "loss": 2.4671, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.704195022583008, + "rewards/margins": 2.0146334171295166, + "rewards/rejected": -25.718826293945312, + "step": 17675 + }, + { + "epoch": 0.5959081869965284, + "grad_norm": 59.856201171875, + "learning_rate": 4.2022020027687344e-07, + "logits/chosen": -1.6479030847549438, + "logits/rejected": -1.8206367492675781, + "logps/chosen": -2.8668527603149414, + "logps/rejected": -3.6330673694610596, + "loss": 1.8714, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.66852378845215, + "rewards/margins": 7.662148952484131, + "rewards/rejected": -36.33067321777344, + "step": 17680 + }, + { + "epoch": 0.5960767130675115, + "grad_norm": 26.6012020111084, + "learning_rate": 4.199298478798376e-07, + "logits/chosen": -1.8493913412094116, + "logits/rejected": -1.903660774230957, + "logps/chosen": -1.858231782913208, + "logps/rejected": -2.213120937347412, + "loss": 1.4821, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.582317352294922, + "rewards/margins": 3.5488898754119873, + "rewards/rejected": -22.131208419799805, + "step": 17685 + }, + { + "epoch": 0.5962452391384947, + "grad_norm": 101.23904418945312, + "learning_rate": 4.1963952319192695e-07, + "logits/chosen": -1.0026901960372925, + "logits/rejected": -1.2731549739837646, + "logps/chosen": -2.68685245513916, + "logps/rejected": -2.9386978149414062, + "loss": 2.6075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.868526458740234, + "rewards/margins": 2.518451452255249, + "rewards/rejected": -29.386978149414062, + "step": 17690 + }, + { + "epoch": 0.5964137652094779, + "grad_norm": 27.395618438720703, + "learning_rate": 4.1934922631361104e-07, + "logits/chosen": -1.4647185802459717, + "logits/rejected": -1.7177009582519531, + "logps/chosen": -2.9183998107910156, + "logps/rejected": -3.2998645305633545, + "loss": 1.8271, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.183996200561523, + "rewards/margins": 3.8146488666534424, + "rewards/rejected": -32.99864959716797, + "step": 17695 + }, + { + "epoch": 0.5965822912804611, + "grad_norm": 13.962396621704102, + "learning_rate": 4.190589573453504e-07, + "logits/chosen": -1.97426438331604, + "logits/rejected": -1.9399926662445068, + "logps/chosen": -2.150571823120117, + "logps/rejected": -2.5680508613586426, + "loss": 2.7481, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.505718231201172, + "rewards/margins": 4.174788951873779, + "rewards/rejected": -25.680505752563477, + "step": 17700 + }, + { + "epoch": 0.5967508173514443, + "grad_norm": 188.11080932617188, + "learning_rate": 4.1876871638759564e-07, + "logits/chosen": -2.3743977546691895, + "logits/rejected": -2.6676254272460938, + "logps/chosen": -3.02490234375, + "logps/rejected": -3.4125359058380127, + "loss": 4.419, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.2490234375, + "rewards/margins": 3.8763325214385986, + "rewards/rejected": -34.12535858154297, + "step": 17705 + }, + { + "epoch": 0.5969193434224275, + "grad_norm": 22.14044761657715, + "learning_rate": 4.18478503540788e-07, + "logits/chosen": -1.5527372360229492, + "logits/rejected": -1.6013343334197998, + "logps/chosen": -2.241884708404541, + "logps/rejected": -2.361104965209961, + "loss": 2.9046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.418848037719727, + "rewards/margins": 1.1922025680541992, + "rewards/rejected": -23.61104965209961, + "step": 17710 + }, + { + "epoch": 0.5970878694934106, + "grad_norm": 19.193300247192383, + "learning_rate": 4.181883189053582e-07, + "logits/chosen": -1.4520524740219116, + "logits/rejected": -1.5924266576766968, + "logps/chosen": -1.887930154800415, + "logps/rejected": -2.1257078647613525, + "loss": 2.7585, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.879301071166992, + "rewards/margins": 2.377777338027954, + "rewards/rejected": -21.257076263427734, + "step": 17715 + }, + { + "epoch": 0.5972563955643938, + "grad_norm": 23.460098266601562, + "learning_rate": 4.1789816258172805e-07, + "logits/chosen": -1.358764886856079, + "logits/rejected": -2.004973888397217, + "logps/chosen": -2.920631170272827, + "logps/rejected": -3.5469462871551514, + "loss": 3.206, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.206314086914062, + "rewards/margins": 6.263152122497559, + "rewards/rejected": -35.46946334838867, + "step": 17720 + }, + { + "epoch": 0.597424921635377, + "grad_norm": 37.642578125, + "learning_rate": 4.176080346703094e-07, + "logits/chosen": -1.4246867895126343, + "logits/rejected": -1.5094234943389893, + "logps/chosen": -1.9959986209869385, + "logps/rejected": -2.001661539077759, + "loss": 3.374, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.95998764038086, + "rewards/margins": 0.056629084050655365, + "rewards/rejected": -20.016613006591797, + "step": 17725 + }, + { + "epoch": 0.5975934477063601, + "grad_norm": 21.875577926635742, + "learning_rate": 4.173179352715035e-07, + "logits/chosen": -1.5243735313415527, + "logits/rejected": -1.5015779733657837, + "logps/chosen": -2.428837299346924, + "logps/rejected": -2.3328146934509277, + "loss": 4.3343, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.288372039794922, + "rewards/margins": -0.9602264165878296, + "rewards/rejected": -23.32814598083496, + "step": 17730 + }, + { + "epoch": 0.5977619737773433, + "grad_norm": 6.548096179962158, + "learning_rate": 4.170278644857027e-07, + "logits/chosen": -1.662825584411621, + "logits/rejected": -1.599381446838379, + "logps/chosen": -2.056088924407959, + "logps/rejected": -2.751908540725708, + "loss": 1.445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.560888290405273, + "rewards/margins": 6.958195686340332, + "rewards/rejected": -27.519084930419922, + "step": 17735 + }, + { + "epoch": 0.5979304998483266, + "grad_norm": 63.508277893066406, + "learning_rate": 4.16737822413289e-07, + "logits/chosen": -2.2823574542999268, + "logits/rejected": -2.3403286933898926, + "logps/chosen": -2.6337478160858154, + "logps/rejected": -2.5546653270721436, + "loss": 5.8875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.337478637695312, + "rewards/margins": -0.7908235788345337, + "rewards/rejected": -25.546653747558594, + "step": 17740 + }, + { + "epoch": 0.5980990259193097, + "grad_norm": 18.882797241210938, + "learning_rate": 4.1644780915463475e-07, + "logits/chosen": -1.9262142181396484, + "logits/rejected": -2.1272881031036377, + "logps/chosen": -1.8121017217636108, + "logps/rejected": -2.1528286933898926, + "loss": 1.4948, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.121017456054688, + "rewards/margins": 3.4072678089141846, + "rewards/rejected": -21.52828598022461, + "step": 17745 + }, + { + "epoch": 0.5982675519902929, + "grad_norm": 18.638851165771484, + "learning_rate": 4.1615782481010176e-07, + "logits/chosen": -1.487438678741455, + "logits/rejected": -1.7235790491104126, + "logps/chosen": -1.9567781686782837, + "logps/rejected": -2.219252347946167, + "loss": 1.372, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.56778335571289, + "rewards/margins": 2.6247432231903076, + "rewards/rejected": -22.19252586364746, + "step": 17750 + }, + { + "epoch": 0.5984360780612761, + "grad_norm": 37.2097053527832, + "learning_rate": 4.158678694800425e-07, + "logits/chosen": -1.640181303024292, + "logits/rejected": -1.6364223957061768, + "logps/chosen": -2.0055267810821533, + "logps/rejected": -2.185953378677368, + "loss": 3.3978, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.055267333984375, + "rewards/margins": 1.8042659759521484, + "rewards/rejected": -21.859533309936523, + "step": 17755 + }, + { + "epoch": 0.5986046041322592, + "grad_norm": 22.888492584228516, + "learning_rate": 4.155779432647989e-07, + "logits/chosen": -1.4140545129776, + "logits/rejected": -1.5783103704452515, + "logps/chosen": -2.144395589828491, + "logps/rejected": -2.281733989715576, + "loss": 2.4318, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.443958282470703, + "rewards/margins": 1.3733841180801392, + "rewards/rejected": -22.817340850830078, + "step": 17760 + }, + { + "epoch": 0.5987731302032424, + "grad_norm": 24.782909393310547, + "learning_rate": 4.1528804626470295e-07, + "logits/chosen": -1.4038830995559692, + "logits/rejected": -1.4523396492004395, + "logps/chosen": -2.3506646156311035, + "logps/rejected": -2.3315212726593018, + "loss": 3.7075, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.506643295288086, + "rewards/margins": -0.1914331465959549, + "rewards/rejected": -23.31521224975586, + "step": 17765 + }, + { + "epoch": 0.5989416562742256, + "grad_norm": 31.55765151977539, + "learning_rate": 4.149981785800767e-07, + "logits/chosen": -1.700679063796997, + "logits/rejected": -1.971419095993042, + "logps/chosen": -2.1803455352783203, + "logps/rejected": -2.843294858932495, + "loss": 1.6472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.803455352783203, + "rewards/margins": 6.629496097564697, + "rewards/rejected": -28.43294906616211, + "step": 17770 + }, + { + "epoch": 0.5991101823452089, + "grad_norm": 29.38243293762207, + "learning_rate": 4.147083403112319e-07, + "logits/chosen": -1.5240113735198975, + "logits/rejected": -1.3688710927963257, + "logps/chosen": -1.7080087661743164, + "logps/rejected": -1.7847347259521484, + "loss": 2.4782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.080087661743164, + "rewards/margins": 0.7672606706619263, + "rewards/rejected": -17.847349166870117, + "step": 17775 + }, + { + "epoch": 0.599278708416192, + "grad_norm": 67.73983001708984, + "learning_rate": 4.144185315584703e-07, + "logits/chosen": -1.9726877212524414, + "logits/rejected": -1.9675414562225342, + "logps/chosen": -2.4346508979797363, + "logps/rejected": -2.096621036529541, + "loss": 6.6933, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.34650993347168, + "rewards/margins": -3.380300521850586, + "rewards/rejected": -20.966209411621094, + "step": 17780 + }, + { + "epoch": 0.5994472344871752, + "grad_norm": 29.38736915588379, + "learning_rate": 4.14128752422083e-07, + "logits/chosen": -1.3103973865509033, + "logits/rejected": -1.4525163173675537, + "logps/chosen": -1.706693410873413, + "logps/rejected": -1.8178211450576782, + "loss": 2.1655, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.06693458557129, + "rewards/margins": 1.1112741231918335, + "rewards/rejected": -18.17820930480957, + "step": 17785 + }, + { + "epoch": 0.5996157605581584, + "grad_norm": 31.563465118408203, + "learning_rate": 4.1383900300235125e-07, + "logits/chosen": -2.0533287525177, + "logits/rejected": -2.32878041267395, + "logps/chosen": -2.641245126724243, + "logps/rejected": -2.7328808307647705, + "loss": 3.7078, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.412450790405273, + "rewards/margins": 0.9163557887077332, + "rewards/rejected": -27.328805923461914, + "step": 17790 + }, + { + "epoch": 0.5997842866291415, + "grad_norm": 49.81260681152344, + "learning_rate": 4.135492833995462e-07, + "logits/chosen": -1.5277329683303833, + "logits/rejected": -1.401781678199768, + "logps/chosen": -2.6558830738067627, + "logps/rejected": -3.2873096466064453, + "loss": 3.8545, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.5588321685791, + "rewards/margins": 6.31426477432251, + "rewards/rejected": -32.87309646606445, + "step": 17795 + }, + { + "epoch": 0.5999528127001247, + "grad_norm": 36.82050323486328, + "learning_rate": 4.1325959371392796e-07, + "logits/chosen": -0.9483410120010376, + "logits/rejected": -1.0351123809814453, + "logps/chosen": -2.4580655097961426, + "logps/rejected": -2.5040602684020996, + "loss": 3.2038, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.580652236938477, + "rewards/margins": 0.4599494934082031, + "rewards/rejected": -25.040603637695312, + "step": 17800 + }, + { + "epoch": 0.6001213387711078, + "grad_norm": 17.785057067871094, + "learning_rate": 4.1296993404574687e-07, + "logits/chosen": -1.2796690464019775, + "logits/rejected": -2.216463327407837, + "logps/chosen": -2.1483981609344482, + "logps/rejected": -2.617771863937378, + "loss": 1.394, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.48398208618164, + "rewards/margins": 4.693737506866455, + "rewards/rejected": -26.177722930908203, + "step": 17805 + }, + { + "epoch": 0.6002898648420911, + "grad_norm": 22.225467681884766, + "learning_rate": 4.1268030449524275e-07, + "logits/chosen": -2.2345798015594482, + "logits/rejected": -2.384531021118164, + "logps/chosen": -2.7949657440185547, + "logps/rejected": -2.8740978240966797, + "loss": 3.4624, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.949655532836914, + "rewards/margins": 0.7913219332695007, + "rewards/rejected": -28.740978240966797, + "step": 17810 + }, + { + "epoch": 0.6004583909130743, + "grad_norm": 18.74883270263672, + "learning_rate": 4.1239070516264506e-07, + "logits/chosen": -1.6532881259918213, + "logits/rejected": -1.6195957660675049, + "logps/chosen": -2.23626446723938, + "logps/rejected": -2.477332353591919, + "loss": 3.5523, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.36264419555664, + "rewards/margins": 2.4106783866882324, + "rewards/rejected": -24.7733211517334, + "step": 17815 + }, + { + "epoch": 0.6006269169840575, + "grad_norm": 24.84805679321289, + "learning_rate": 4.1210113614817273e-07, + "logits/chosen": -1.1539795398712158, + "logits/rejected": -1.2512967586517334, + "logps/chosen": -2.1468093395233154, + "logps/rejected": -2.287897825241089, + "loss": 2.038, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.468093872070312, + "rewards/margins": 1.4108854532241821, + "rewards/rejected": -22.878978729248047, + "step": 17820 + }, + { + "epoch": 0.6007954430550406, + "grad_norm": 45.94430923461914, + "learning_rate": 4.11811597552034e-07, + "logits/chosen": -1.8820756673812866, + "logits/rejected": -1.816383719444275, + "logps/chosen": -2.616534948348999, + "logps/rejected": -2.635910987854004, + "loss": 3.6579, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.16534996032715, + "rewards/margins": 0.1937606781721115, + "rewards/rejected": -26.35910987854004, + "step": 17825 + }, + { + "epoch": 0.6009639691260238, + "grad_norm": 35.467071533203125, + "learning_rate": 4.115220894744269e-07, + "logits/chosen": -1.4625093936920166, + "logits/rejected": -1.6223608255386353, + "logps/chosen": -1.9035320281982422, + "logps/rejected": -2.360581159591675, + "loss": 2.4854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.035320281982422, + "rewards/margins": 4.570491790771484, + "rewards/rejected": -23.605812072753906, + "step": 17830 + }, + { + "epoch": 0.601132495197007, + "grad_norm": 17.500070571899414, + "learning_rate": 4.1123261201553867e-07, + "logits/chosen": -1.5405447483062744, + "logits/rejected": -1.847246527671814, + "logps/chosen": -2.1715755462646484, + "logps/rejected": -2.882485866546631, + "loss": 3.3008, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.71575355529785, + "rewards/margins": 7.109103202819824, + "rewards/rejected": -28.824859619140625, + "step": 17835 + }, + { + "epoch": 0.6013010212679901, + "grad_norm": 141.09652709960938, + "learning_rate": 4.109431652755461e-07, + "logits/chosen": -1.4695584774017334, + "logits/rejected": -1.6095157861709595, + "logps/chosen": -2.1822612285614014, + "logps/rejected": -2.1554157733917236, + "loss": 3.5666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.822612762451172, + "rewards/margins": -0.2684548497200012, + "rewards/rejected": -21.554157257080078, + "step": 17840 + }, + { + "epoch": 0.6014695473389733, + "grad_norm": 21.075143814086914, + "learning_rate": 4.106537493546154e-07, + "logits/chosen": -1.46273672580719, + "logits/rejected": -1.464318871498108, + "logps/chosen": -2.298161268234253, + "logps/rejected": -2.3293509483337402, + "loss": 3.4996, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.981613159179688, + "rewards/margins": 0.3118970990180969, + "rewards/rejected": -23.293508529663086, + "step": 17845 + }, + { + "epoch": 0.6016380734099566, + "grad_norm": 25.357160568237305, + "learning_rate": 4.1036436435290207e-07, + "logits/chosen": -1.5115927457809448, + "logits/rejected": -1.9485218524932861, + "logps/chosen": -1.806051254272461, + "logps/rejected": -2.0344676971435547, + "loss": 2.5676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.06051254272461, + "rewards/margins": 2.2841649055480957, + "rewards/rejected": -20.344676971435547, + "step": 17850 + }, + { + "epoch": 0.6018065994809397, + "grad_norm": 25.672758102416992, + "learning_rate": 4.100750103705506e-07, + "logits/chosen": -1.677353858947754, + "logits/rejected": -2.0314478874206543, + "logps/chosen": -2.9779555797576904, + "logps/rejected": -3.4202136993408203, + "loss": 1.8307, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.779556274414062, + "rewards/margins": 4.422582626342773, + "rewards/rejected": -34.2021369934082, + "step": 17855 + }, + { + "epoch": 0.6019751255519229, + "grad_norm": 16.537107467651367, + "learning_rate": 4.09785687507695e-07, + "logits/chosen": -1.6343332529067993, + "logits/rejected": -1.8614110946655273, + "logps/chosen": -2.0546581745147705, + "logps/rejected": -2.017251491546631, + "loss": 3.8123, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.546581268310547, + "rewards/margins": -0.37406882643699646, + "rewards/rejected": -20.17251205444336, + "step": 17860 + }, + { + "epoch": 0.6021436516229061, + "grad_norm": 44.8232421875, + "learning_rate": 4.0949639586445907e-07, + "logits/chosen": -1.3961999416351318, + "logits/rejected": -1.5271848440170288, + "logps/chosen": -2.1533312797546387, + "logps/rejected": -2.27251935005188, + "loss": 2.4688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.533313751220703, + "rewards/margins": 1.1918811798095703, + "rewards/rejected": -22.725194931030273, + "step": 17865 + }, + { + "epoch": 0.6023121776938892, + "grad_norm": 21.053346633911133, + "learning_rate": 4.092071355409545e-07, + "logits/chosen": -1.7526776790618896, + "logits/rejected": -1.7933683395385742, + "logps/chosen": -1.8899450302124023, + "logps/rejected": -1.890005111694336, + "loss": 3.3379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.899450302124023, + "rewards/margins": 0.0006008148193359375, + "rewards/rejected": -18.90005111694336, + "step": 17870 + }, + { + "epoch": 0.6024807037648724, + "grad_norm": 44.42392349243164, + "learning_rate": 4.089179066372832e-07, + "logits/chosen": -1.336414098739624, + "logits/rejected": -1.5446573495864868, + "logps/chosen": -2.743844985961914, + "logps/rejected": -3.301013231277466, + "loss": 2.7564, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.438451766967773, + "rewards/margins": 5.571681022644043, + "rewards/rejected": -33.0101318359375, + "step": 17875 + }, + { + "epoch": 0.6026492298358556, + "grad_norm": 35.24571990966797, + "learning_rate": 4.0862870925353597e-07, + "logits/chosen": -1.813269019126892, + "logits/rejected": -1.7184432744979858, + "logps/chosen": -2.3771960735321045, + "logps/rejected": -2.766353130340576, + "loss": 2.1354, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.771961212158203, + "rewards/margins": 3.8915724754333496, + "rewards/rejected": -27.66353416442871, + "step": 17880 + }, + { + "epoch": 0.6028177559068388, + "grad_norm": 29.940200805664062, + "learning_rate": 4.083395434897928e-07, + "logits/chosen": -2.2942934036254883, + "logits/rejected": -2.215756893157959, + "logps/chosen": -2.2887065410614014, + "logps/rejected": -2.3103342056274414, + "loss": 3.4214, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.887065887451172, + "rewards/margins": 0.21627846360206604, + "rewards/rejected": -23.103343963623047, + "step": 17885 + }, + { + "epoch": 0.602986281977822, + "grad_norm": 15.23022174835205, + "learning_rate": 4.0805040944612215e-07, + "logits/chosen": -1.6742618083953857, + "logits/rejected": -2.1392934322357178, + "logps/chosen": -2.9912867546081543, + "logps/rejected": -3.2775497436523438, + "loss": 2.0122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.91286849975586, + "rewards/margins": 2.8626296520233154, + "rewards/rejected": -32.7755012512207, + "step": 17890 + }, + { + "epoch": 0.6031548080488052, + "grad_norm": 29.538070678710938, + "learning_rate": 4.0776130722258207e-07, + "logits/chosen": -1.221923589706421, + "logits/rejected": -1.2887871265411377, + "logps/chosen": -2.2940049171447754, + "logps/rejected": -2.515733003616333, + "loss": 2.6864, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.940052032470703, + "rewards/margins": 2.2172799110412598, + "rewards/rejected": -25.157331466674805, + "step": 17895 + }, + { + "epoch": 0.6033233341197883, + "grad_norm": 0.11861720681190491, + "learning_rate": 4.0747223691921963e-07, + "logits/chosen": -1.7873830795288086, + "logits/rejected": -1.839321494102478, + "logps/chosen": -2.739753007888794, + "logps/rejected": -3.109933376312256, + "loss": 2.1349, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.39752769470215, + "rewards/margins": 3.7018027305603027, + "rewards/rejected": -31.099334716796875, + "step": 17900 + }, + { + "epoch": 0.6034918601907715, + "grad_norm": 23.25163459777832, + "learning_rate": 4.071831986360704e-07, + "logits/chosen": -1.5324798822402954, + "logits/rejected": -1.7978417873382568, + "logps/chosen": -3.7456448078155518, + "logps/rejected": -3.5534369945526123, + "loss": 5.4472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -37.456443786621094, + "rewards/margins": -1.9220736026763916, + "rewards/rejected": -35.53437042236328, + "step": 17905 + }, + { + "epoch": 0.6036603862617547, + "grad_norm": 23.313236236572266, + "learning_rate": 4.0689419247315935e-07, + "logits/chosen": -1.3383638858795166, + "logits/rejected": -1.4649317264556885, + "logps/chosen": -2.1240153312683105, + "logps/rejected": -2.2492001056671143, + "loss": 2.3375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.240154266357422, + "rewards/margins": 1.251846194267273, + "rewards/rejected": -22.492000579833984, + "step": 17910 + }, + { + "epoch": 0.6038289123327378, + "grad_norm": 16.772438049316406, + "learning_rate": 4.066052185305e-07, + "logits/chosen": -1.185973882675171, + "logits/rejected": -1.493067979812622, + "logps/chosen": -1.9985774755477905, + "logps/rejected": -2.1876299381256104, + "loss": 1.9702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.985774993896484, + "rewards/margins": 1.8905220031738281, + "rewards/rejected": -21.876296997070312, + "step": 17915 + }, + { + "epoch": 0.6039974384037211, + "grad_norm": 30.339557647705078, + "learning_rate": 4.063162769080952e-07, + "logits/chosen": -1.7155368328094482, + "logits/rejected": -1.5918371677398682, + "logps/chosen": -2.586151599884033, + "logps/rejected": -2.4468390941619873, + "loss": 5.2645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.86151695251465, + "rewards/margins": -1.3931264877319336, + "rewards/rejected": -24.4683895111084, + "step": 17920 + }, + { + "epoch": 0.6041659644747043, + "grad_norm": 71.9335708618164, + "learning_rate": 4.060273677059357e-07, + "logits/chosen": -2.01371431350708, + "logits/rejected": -2.0844101905822754, + "logps/chosen": -2.2792601585388184, + "logps/rejected": -2.360816478729248, + "loss": 2.8228, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.792600631713867, + "rewards/margins": 0.8155611157417297, + "rewards/rejected": -23.60816192626953, + "step": 17925 + }, + { + "epoch": 0.6043344905456874, + "grad_norm": 9.240427017211914, + "learning_rate": 4.0573849102400185e-07, + "logits/chosen": -1.902093529701233, + "logits/rejected": -2.0091750621795654, + "logps/chosen": -2.6528334617614746, + "logps/rejected": -3.194786787033081, + "loss": 1.0115, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.528331756591797, + "rewards/margins": 5.419533729553223, + "rewards/rejected": -31.947866439819336, + "step": 17930 + }, + { + "epoch": 0.6045030166166706, + "grad_norm": 26.025230407714844, + "learning_rate": 4.054496469622628e-07, + "logits/chosen": -1.269789695739746, + "logits/rejected": -1.9125232696533203, + "logps/chosen": -2.085925579071045, + "logps/rejected": -2.6767005920410156, + "loss": 1.9414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.859254837036133, + "rewards/margins": 5.907750129699707, + "rewards/rejected": -26.767004013061523, + "step": 17935 + }, + { + "epoch": 0.6046715426876538, + "grad_norm": 52.2057991027832, + "learning_rate": 4.051608356206755e-07, + "logits/chosen": -1.7119247913360596, + "logits/rejected": -1.4851834774017334, + "logps/chosen": -1.791717767715454, + "logps/rejected": -1.9203579425811768, + "loss": 2.4681, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.917177200317383, + "rewards/margins": 1.2864009141921997, + "rewards/rejected": -19.20357894897461, + "step": 17940 + }, + { + "epoch": 0.6048400687586369, + "grad_norm": 17.035730361938477, + "learning_rate": 4.048720570991865e-07, + "logits/chosen": -1.4668445587158203, + "logits/rejected": -1.8648614883422852, + "logps/chosen": -2.6843013763427734, + "logps/rejected": -3.5758934020996094, + "loss": 1.7242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.843013763427734, + "rewards/margins": 8.915924072265625, + "rewards/rejected": -35.75893783569336, + "step": 17945 + }, + { + "epoch": 0.6050085948296201, + "grad_norm": 17.1804141998291, + "learning_rate": 4.045833114977309e-07, + "logits/chosen": -1.7299177646636963, + "logits/rejected": -2.482475996017456, + "logps/chosen": -2.2938883304595947, + "logps/rejected": -3.3217194080352783, + "loss": 1.6015, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.93888282775879, + "rewards/margins": 10.27830982208252, + "rewards/rejected": -33.21718978881836, + "step": 17950 + }, + { + "epoch": 0.6051771209006033, + "grad_norm": 3.4808218479156494, + "learning_rate": 4.0429459891623165e-07, + "logits/chosen": -1.123740315437317, + "logits/rejected": -1.7537816762924194, + "logps/chosen": -1.9857628345489502, + "logps/rejected": -2.486551523208618, + "loss": 1.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.85762596130371, + "rewards/margins": 5.0078911781311035, + "rewards/rejected": -24.86551856994629, + "step": 17955 + }, + { + "epoch": 0.6053456469715865, + "grad_norm": 51.88289260864258, + "learning_rate": 4.040059194546011e-07, + "logits/chosen": -1.659250259399414, + "logits/rejected": -1.814026117324829, + "logps/chosen": -2.484945774078369, + "logps/rejected": -2.844405174255371, + "loss": 2.0936, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.849456787109375, + "rewards/margins": 3.594592571258545, + "rewards/rejected": -28.44405174255371, + "step": 17960 + }, + { + "epoch": 0.6055141730425697, + "grad_norm": 9.73103141784668, + "learning_rate": 4.0371727321273987e-07, + "logits/chosen": -1.4572718143463135, + "logits/rejected": -1.7130321264266968, + "logps/chosen": -2.1037278175354004, + "logps/rejected": -2.5024209022521973, + "loss": 1.9432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.037277221679688, + "rewards/margins": 3.986931324005127, + "rewards/rejected": -25.02420997619629, + "step": 17965 + }, + { + "epoch": 0.6056826991135529, + "grad_norm": 20.67357063293457, + "learning_rate": 4.0342866029053703e-07, + "logits/chosen": -1.54425847530365, + "logits/rejected": -1.5728416442871094, + "logps/chosen": -1.8009445667266846, + "logps/rejected": -1.9770351648330688, + "loss": 2.644, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.009445190429688, + "rewards/margins": 1.7609062194824219, + "rewards/rejected": -19.77035140991211, + "step": 17970 + }, + { + "epoch": 0.605851225184536, + "grad_norm": 32.49703598022461, + "learning_rate": 4.0314008078787e-07, + "logits/chosen": -1.3170311450958252, + "logits/rejected": -1.2635383605957031, + "logps/chosen": -2.206963062286377, + "logps/rejected": -2.1144444942474365, + "loss": 4.0325, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.069629669189453, + "rewards/margins": -0.9251849055290222, + "rewards/rejected": -21.144445419311523, + "step": 17975 + }, + { + "epoch": 0.6060197512555192, + "grad_norm": 39.733985900878906, + "learning_rate": 4.028515348046049e-07, + "logits/chosen": -2.097215175628662, + "logits/rejected": -2.3491263389587402, + "logps/chosen": -2.108121395111084, + "logps/rejected": -2.3394417762756348, + "loss": 2.8359, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.081212997436523, + "rewards/margins": 2.3132030963897705, + "rewards/rejected": -23.3944149017334, + "step": 17980 + }, + { + "epoch": 0.6061882773265024, + "grad_norm": 12.164617538452148, + "learning_rate": 4.0256302244059623e-07, + "logits/chosen": -1.217043161392212, + "logits/rejected": -1.5332605838775635, + "logps/chosen": -2.25597882270813, + "logps/rejected": -2.373389720916748, + "loss": 2.6482, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.55978775024414, + "rewards/margins": 1.1741094589233398, + "rewards/rejected": -23.733896255493164, + "step": 17985 + }, + { + "epoch": 0.6063568033974855, + "grad_norm": 31.064668655395508, + "learning_rate": 4.0227454379568653e-07, + "logits/chosen": -1.4396828413009644, + "logits/rejected": -1.7883985042572021, + "logps/chosen": -2.1677372455596924, + "logps/rejected": -2.3215463161468506, + "loss": 2.3949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.677370071411133, + "rewards/margins": 1.538094162940979, + "rewards/rejected": -23.215463638305664, + "step": 17990 + }, + { + "epoch": 0.6065253294684688, + "grad_norm": 33.58404541015625, + "learning_rate": 4.01986098969707e-07, + "logits/chosen": -1.8123286962509155, + "logits/rejected": -1.6961183547973633, + "logps/chosen": -2.0836260318756104, + "logps/rejected": -2.062624454498291, + "loss": 3.3407, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.836257934570312, + "rewards/margins": -0.21001510322093964, + "rewards/rejected": -20.626243591308594, + "step": 17995 + }, + { + "epoch": 0.606693855539452, + "grad_norm": 0.05182600021362305, + "learning_rate": 4.0169768806247697e-07, + "logits/chosen": -1.3305574655532837, + "logits/rejected": -1.6693122386932373, + "logps/chosen": -2.4600489139556885, + "logps/rejected": -3.0529866218566895, + "loss": 1.1291, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.600488662719727, + "rewards/margins": 5.929378509521484, + "rewards/rejected": -30.52986717224121, + "step": 18000 + }, + { + "epoch": 0.606693855539452, + "eval_logits/chosen": -2.0212080478668213, + "eval_logits/rejected": -2.1736414432525635, + "eval_logps/chosen": -2.1531260013580322, + "eval_logps/rejected": -2.2950658798217773, + "eval_loss": 2.9981372356414795, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.531261444091797, + "eval_rewards/margins": 1.4193978309631348, + "eval_rewards/rejected": -22.950658798217773, + "eval_runtime": 12.8857, + "eval_samples_per_second": 7.761, + "eval_steps_per_second": 1.94, + "step": 18000 + }, + { + "epoch": 0.6068623816104352, + "grad_norm": 89.49707794189453, + "learning_rate": 4.0140931117380437e-07, + "logits/chosen": -1.5910829305648804, + "logits/rejected": -1.7629798650741577, + "logps/chosen": -2.1482040882110596, + "logps/rejected": -2.142286777496338, + "loss": 3.6854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.482038497924805, + "rewards/margins": -0.059171102941036224, + "rewards/rejected": -21.422866821289062, + "step": 18005 + }, + { + "epoch": 0.6070309076814183, + "grad_norm": 2.001401298912242e-05, + "learning_rate": 4.011209684034846e-07, + "logits/chosen": -1.7829933166503906, + "logits/rejected": -2.354343891143799, + "logps/chosen": -2.8509857654571533, + "logps/rejected": -3.5344605445861816, + "loss": 2.6997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.50986099243164, + "rewards/margins": 6.834742546081543, + "rewards/rejected": -35.34459686279297, + "step": 18010 + }, + { + "epoch": 0.6071994337524015, + "grad_norm": 29.717529296875, + "learning_rate": 4.008326598513021e-07, + "logits/chosen": -1.4794981479644775, + "logits/rejected": -1.6470489501953125, + "logps/chosen": -2.9904773235321045, + "logps/rejected": -3.15329909324646, + "loss": 2.9079, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.904775619506836, + "rewards/margins": 1.6282180547714233, + "rewards/rejected": -31.53299331665039, + "step": 18015 + }, + { + "epoch": 0.6073679598233847, + "grad_norm": 51.709434509277344, + "learning_rate": 4.005443856170291e-07, + "logits/chosen": -1.5839917659759521, + "logits/rejected": -1.4346948862075806, + "logps/chosen": -2.345694065093994, + "logps/rejected": -2.851513385772705, + "loss": 2.9015, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.456941604614258, + "rewards/margins": 5.058190822601318, + "rewards/rejected": -28.515132904052734, + "step": 18020 + }, + { + "epoch": 0.6075364858943678, + "grad_norm": 19.4397029876709, + "learning_rate": 4.0025614580042565e-07, + "logits/chosen": -1.346427321434021, + "logits/rejected": -1.7577238082885742, + "logps/chosen": -2.1209535598754883, + "logps/rejected": -2.2037816047668457, + "loss": 2.8587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.20953369140625, + "rewards/margins": 0.8282777667045593, + "rewards/rejected": -22.03781509399414, + "step": 18025 + }, + { + "epoch": 0.6077050119653511, + "grad_norm": 23.9757137298584, + "learning_rate": 3.999679405012404e-07, + "logits/chosen": -2.53595232963562, + "logits/rejected": -2.367096185684204, + "logps/chosen": -3.45710825920105, + "logps/rejected": -3.697390079498291, + "loss": 3.8363, + "rewards/accuracies": 0.5, + "rewards/chosen": -34.571083068847656, + "rewards/margins": 2.402817964553833, + "rewards/rejected": -36.973899841308594, + "step": 18030 + }, + { + "epoch": 0.6078735380363343, + "grad_norm": 33.260257720947266, + "learning_rate": 3.9967976981920987e-07, + "logits/chosen": -1.5834105014801025, + "logits/rejected": -1.6174606084823608, + "logps/chosen": -2.2329487800598145, + "logps/rejected": -2.2986979484558105, + "loss": 2.947, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.32948875427246, + "rewards/margins": 0.6574923396110535, + "rewards/rejected": -22.986980438232422, + "step": 18035 + }, + { + "epoch": 0.6080420641073174, + "grad_norm": 28.550710678100586, + "learning_rate": 3.993916338540586e-07, + "logits/chosen": -1.7848262786865234, + "logits/rejected": -1.7436761856079102, + "logps/chosen": -2.402068853378296, + "logps/rejected": -2.468722343444824, + "loss": 3.0888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.020687103271484, + "rewards/margins": 0.6665343046188354, + "rewards/rejected": -24.68722152709961, + "step": 18040 + }, + { + "epoch": 0.6082105901783006, + "grad_norm": 18.684982299804688, + "learning_rate": 3.9910353270549895e-07, + "logits/chosen": -1.147879958152771, + "logits/rejected": -1.5647351741790771, + "logps/chosen": -2.320190906524658, + "logps/rejected": -2.9074273109436035, + "loss": 2.8898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.2019100189209, + "rewards/margins": 5.872366428375244, + "rewards/rejected": -29.07427406311035, + "step": 18045 + }, + { + "epoch": 0.6083791162492838, + "grad_norm": 29.315391540527344, + "learning_rate": 3.988154664732315e-07, + "logits/chosen": -1.4805309772491455, + "logits/rejected": -1.7728363275527954, + "logps/chosen": -2.3441691398620605, + "logps/rejected": -3.008974552154541, + "loss": 2.8736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.441692352294922, + "rewards/margins": 6.648050785064697, + "rewards/rejected": -30.089740753173828, + "step": 18050 + }, + { + "epoch": 0.6085476423202669, + "grad_norm": 30.374649047851562, + "learning_rate": 3.9852743525694477e-07, + "logits/chosen": -1.4703962802886963, + "logits/rejected": -1.6875982284545898, + "logps/chosen": -2.152924060821533, + "logps/rejected": -2.2890422344207764, + "loss": 2.2531, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.529239654541016, + "rewards/margins": 1.3611841201782227, + "rewards/rejected": -22.890422821044922, + "step": 18055 + }, + { + "epoch": 0.6087161683912501, + "grad_norm": 48.82684326171875, + "learning_rate": 3.9823943915631466e-07, + "logits/chosen": -1.6669085025787354, + "logits/rejected": -2.1665635108947754, + "logps/chosen": -2.514723777770996, + "logps/rejected": -2.925302028656006, + "loss": 3.5687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.14723777770996, + "rewards/margins": 4.1057844161987305, + "rewards/rejected": -29.253021240234375, + "step": 18060 + }, + { + "epoch": 0.6088846944622333, + "grad_norm": 194.8758087158203, + "learning_rate": 3.979514782710054e-07, + "logits/chosen": -1.4292688369750977, + "logits/rejected": -1.3867441415786743, + "logps/chosen": -2.669638156890869, + "logps/rejected": -2.704360008239746, + "loss": 3.4805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.69638442993164, + "rewards/margins": 0.347219854593277, + "rewards/rejected": -27.043603897094727, + "step": 18065 + }, + { + "epoch": 0.6090532205332165, + "grad_norm": 23.801340103149414, + "learning_rate": 3.97663552700669e-07, + "logits/chosen": -1.9906822443008423, + "logits/rejected": -1.7741702795028687, + "logps/chosen": -1.8173282146453857, + "logps/rejected": -1.7739219665527344, + "loss": 3.493, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.173282623291016, + "rewards/margins": -0.43406257033348083, + "rewards/rejected": -17.739219665527344, + "step": 18070 + }, + { + "epoch": 0.6092217466041997, + "grad_norm": 17.24144172668457, + "learning_rate": 3.9737566254494533e-07, + "logits/chosen": -1.5699522495269775, + "logits/rejected": -1.4470058679580688, + "logps/chosen": -2.3631958961486816, + "logps/rejected": -2.5016205310821533, + "loss": 3.8795, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.631961822509766, + "rewards/margins": 1.3842445611953735, + "rewards/rejected": -25.016204833984375, + "step": 18075 + }, + { + "epoch": 0.6093902726751829, + "grad_norm": 26.506839752197266, + "learning_rate": 3.9708780790346133e-07, + "logits/chosen": -1.7675163745880127, + "logits/rejected": -1.9242709875106812, + "logps/chosen": -2.3664536476135254, + "logps/rejected": -2.7819604873657227, + "loss": 2.7749, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.66453742980957, + "rewards/margins": 4.155068397521973, + "rewards/rejected": -27.819604873657227, + "step": 18080 + }, + { + "epoch": 0.609558798746166, + "grad_norm": 229.21978759765625, + "learning_rate": 3.967999888758325e-07, + "logits/chosen": -1.8047516345977783, + "logits/rejected": -2.22436261177063, + "logps/chosen": -3.2236645221710205, + "logps/rejected": -3.230384349822998, + "loss": 7.2379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.23664474487305, + "rewards/margins": 0.06719855964183807, + "rewards/rejected": -32.30384063720703, + "step": 18085 + }, + { + "epoch": 0.6097273248171492, + "grad_norm": 26.054292678833008, + "learning_rate": 3.9651220556166183e-07, + "logits/chosen": -1.5580518245697021, + "logits/rejected": -1.891649603843689, + "logps/chosen": -2.72765851020813, + "logps/rejected": -2.717656135559082, + "loss": 3.3043, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.276586532592773, + "rewards/margins": -0.10002384334802628, + "rewards/rejected": -27.176563262939453, + "step": 18090 + }, + { + "epoch": 0.6098958508881324, + "grad_norm": 21.03340721130371, + "learning_rate": 3.9622445806053925e-07, + "logits/chosen": -1.4648394584655762, + "logits/rejected": -1.4504142999649048, + "logps/chosen": -2.3387691974639893, + "logps/rejected": -2.473045825958252, + "loss": 2.2089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.387691497802734, + "rewards/margins": 1.3427667617797852, + "rewards/rejected": -24.730457305908203, + "step": 18095 + }, + { + "epoch": 0.6100643769591155, + "grad_norm": 6.288976669311523, + "learning_rate": 3.959367464720433e-07, + "logits/chosen": -1.4934344291687012, + "logits/rejected": -1.6204888820648193, + "logps/chosen": -2.0959277153015137, + "logps/rejected": -2.4498965740203857, + "loss": 2.2194, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.959278106689453, + "rewards/margins": 3.5396888256073, + "rewards/rejected": -24.498966217041016, + "step": 18100 + }, + { + "epoch": 0.6102329030300988, + "grad_norm": 36.37209701538086, + "learning_rate": 3.9564907089573934e-07, + "logits/chosen": -1.1374117136001587, + "logits/rejected": -1.1836864948272705, + "logps/chosen": -2.0730018615722656, + "logps/rejected": -2.164442777633667, + "loss": 2.4365, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.730016708374023, + "rewards/margins": 0.9144119024276733, + "rewards/rejected": -21.64443016052246, + "step": 18105 + }, + { + "epoch": 0.610401429101082, + "grad_norm": 45.56204605102539, + "learning_rate": 3.953614314311808e-07, + "logits/chosen": -1.6337623596191406, + "logits/rejected": -1.5129172801971436, + "logps/chosen": -2.0202136039733887, + "logps/rejected": -2.1366028785705566, + "loss": 3.4906, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.202136993408203, + "rewards/margins": 1.1638901233673096, + "rewards/rejected": -21.36602783203125, + "step": 18110 + }, + { + "epoch": 0.6105699551720651, + "grad_norm": 14.165609359741211, + "learning_rate": 3.950738281779082e-07, + "logits/chosen": -1.9523528814315796, + "logits/rejected": -2.077697992324829, + "logps/chosen": -2.342078447341919, + "logps/rejected": -2.615379810333252, + "loss": 2.851, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.4207820892334, + "rewards/margins": 2.7330145835876465, + "rewards/rejected": -26.153797149658203, + "step": 18115 + }, + { + "epoch": 0.6107384812430483, + "grad_norm": 40.21343994140625, + "learning_rate": 3.9478626123544985e-07, + "logits/chosen": -1.9517990350723267, + "logits/rejected": -2.384939193725586, + "logps/chosen": -2.8035922050476074, + "logps/rejected": -2.9411544799804688, + "loss": 3.0387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.035924911499023, + "rewards/margins": 1.375620722770691, + "rewards/rejected": -29.411544799804688, + "step": 18120 + }, + { + "epoch": 0.6109070073140315, + "grad_norm": 30.41344451904297, + "learning_rate": 3.944987307033212e-07, + "logits/chosen": -1.464687705039978, + "logits/rejected": -1.6759631633758545, + "logps/chosen": -1.9217870235443115, + "logps/rejected": -2.374483585357666, + "loss": 1.3678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.217870712280273, + "rewards/margins": 4.5269646644592285, + "rewards/rejected": -23.744834899902344, + "step": 18125 + }, + { + "epoch": 0.6110755333850146, + "grad_norm": 43.67235565185547, + "learning_rate": 3.9421123668102515e-07, + "logits/chosen": -1.7854055166244507, + "logits/rejected": -2.2990849018096924, + "logps/chosen": -2.1993279457092285, + "logps/rejected": -2.435128688812256, + "loss": 1.8303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.99327850341797, + "rewards/margins": 2.3580093383789062, + "rewards/rejected": -24.351287841796875, + "step": 18130 + }, + { + "epoch": 0.6112440594559978, + "grad_norm": 46.27460479736328, + "learning_rate": 3.939237792680522e-07, + "logits/chosen": -1.3446651697158813, + "logits/rejected": -1.3674166202545166, + "logps/chosen": -2.7552475929260254, + "logps/rejected": -3.0105865001678467, + "loss": 1.5578, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.552471160888672, + "rewards/margins": 2.5533928871154785, + "rewards/rejected": -30.105865478515625, + "step": 18135 + }, + { + "epoch": 0.6114125855269811, + "grad_norm": 20.77875518798828, + "learning_rate": 3.9363635856388e-07, + "logits/chosen": -1.725376844406128, + "logits/rejected": -1.628379464149475, + "logps/chosen": -2.1728854179382324, + "logps/rejected": -2.4002127647399902, + "loss": 2.1471, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.72885513305664, + "rewards/margins": 2.2732746601104736, + "rewards/rejected": -24.002126693725586, + "step": 18140 + }, + { + "epoch": 0.6115811115979642, + "grad_norm": 31.333084106445312, + "learning_rate": 3.933489746679737e-07, + "logits/chosen": -1.8086239099502563, + "logits/rejected": -1.7974382638931274, + "logps/chosen": -1.94889235496521, + "logps/rejected": -2.0811383724212646, + "loss": 2.4491, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.48892593383789, + "rewards/margins": 1.3224602937698364, + "rewards/rejected": -20.811384201049805, + "step": 18145 + }, + { + "epoch": 0.6117496376689474, + "grad_norm": 132.87933349609375, + "learning_rate": 3.9306162767978526e-07, + "logits/chosen": -1.6830793619155884, + "logits/rejected": -2.086782932281494, + "logps/chosen": -2.5300583839416504, + "logps/rejected": -2.8638129234313965, + "loss": 1.7163, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.300582885742188, + "rewards/margins": 3.337545871734619, + "rewards/rejected": -28.63812828063965, + "step": 18150 + }, + { + "epoch": 0.6119181637399306, + "grad_norm": 17.956872940063477, + "learning_rate": 3.9277431769875425e-07, + "logits/chosen": -1.276592493057251, + "logits/rejected": -1.4645140171051025, + "logps/chosen": -3.1796116828918457, + "logps/rejected": -3.4596123695373535, + "loss": 2.8719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.796117782592773, + "rewards/margins": 2.8000075817108154, + "rewards/rejected": -34.596126556396484, + "step": 18155 + }, + { + "epoch": 0.6120866898109137, + "grad_norm": 2.6117608547210693, + "learning_rate": 3.924870448243075e-07, + "logits/chosen": -1.7809474468231201, + "logits/rejected": -2.395023822784424, + "logps/chosen": -2.2598633766174316, + "logps/rejected": -2.346357822418213, + "loss": 2.9899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.5986328125, + "rewards/margins": 0.8649458885192871, + "rewards/rejected": -23.463581085205078, + "step": 18160 + }, + { + "epoch": 0.6122552158818969, + "grad_norm": 36.84818649291992, + "learning_rate": 3.921998091558586e-07, + "logits/chosen": -1.473480224609375, + "logits/rejected": -1.471651315689087, + "logps/chosen": -2.3036561012268066, + "logps/rejected": -2.150559663772583, + "loss": 4.7204, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.03656005859375, + "rewards/margins": -1.53096604347229, + "rewards/rejected": -21.505596160888672, + "step": 18165 + }, + { + "epoch": 0.6124237419528801, + "grad_norm": 39.324222564697266, + "learning_rate": 3.919126107928085e-07, + "logits/chosen": -1.783125877380371, + "logits/rejected": -1.8939613103866577, + "logps/chosen": -2.0952346324920654, + "logps/rejected": -2.0697779655456543, + "loss": 3.5832, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.952346801757812, + "rewards/margins": -0.2545679211616516, + "rewards/rejected": -20.697778701782227, + "step": 18170 + }, + { + "epoch": 0.6125922680238632, + "grad_norm": 34.55978775024414, + "learning_rate": 3.916254498345454e-07, + "logits/chosen": -2.1542255878448486, + "logits/rejected": -2.322671890258789, + "logps/chosen": -2.3772964477539062, + "logps/rejected": -3.2974255084991455, + "loss": 2.7457, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.772964477539062, + "rewards/margins": 9.201289176940918, + "rewards/rejected": -32.97425079345703, + "step": 18175 + }, + { + "epoch": 0.6127607940948465, + "grad_norm": 35.717376708984375, + "learning_rate": 3.913383263804444e-07, + "logits/chosen": -1.193623423576355, + "logits/rejected": -1.332782506942749, + "logps/chosen": -2.154371738433838, + "logps/rejected": -2.138970375061035, + "loss": 3.3613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.543718338012695, + "rewards/margins": -0.15401363372802734, + "rewards/rejected": -21.389705657958984, + "step": 18180 + }, + { + "epoch": 0.6129293201658297, + "grad_norm": 31.9857120513916, + "learning_rate": 3.910512405298675e-07, + "logits/chosen": -1.9725778102874756, + "logits/rejected": -1.8313255310058594, + "logps/chosen": -1.9979254007339478, + "logps/rejected": -2.13980770111084, + "loss": 2.3521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.9792537689209, + "rewards/margins": 1.418821930885315, + "rewards/rejected": -21.3980770111084, + "step": 18185 + }, + { + "epoch": 0.6130978462368128, + "grad_norm": 110.69065856933594, + "learning_rate": 3.907641923821638e-07, + "logits/chosen": -1.2381213903427124, + "logits/rejected": -1.6605393886566162, + "logps/chosen": -2.7820382118225098, + "logps/rejected": -2.967272996902466, + "loss": 2.7824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.820383071899414, + "rewards/margins": 1.8523496389389038, + "rewards/rejected": -29.672733306884766, + "step": 18190 + }, + { + "epoch": 0.613266372307796, + "grad_norm": 41.08340072631836, + "learning_rate": 3.9047718203666947e-07, + "logits/chosen": -1.500327706336975, + "logits/rejected": -1.98642098903656, + "logps/chosen": -2.553170919418335, + "logps/rejected": -3.072920560836792, + "loss": 2.2151, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.53171157836914, + "rewards/margins": 5.1974968910217285, + "rewards/rejected": -30.729206085205078, + "step": 18195 + }, + { + "epoch": 0.6134348983787792, + "grad_norm": 37.24290084838867, + "learning_rate": 3.9019020959270733e-07, + "logits/chosen": -1.6461604833602905, + "logits/rejected": -1.7050399780273438, + "logps/chosen": -2.5508205890655518, + "logps/rejected": -2.390434741973877, + "loss": 5.6425, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.50820541381836, + "rewards/margins": -1.6038585901260376, + "rewards/rejected": -23.904346466064453, + "step": 18200 + }, + { + "epoch": 0.6136034244497623, + "grad_norm": 40.20940017700195, + "learning_rate": 3.899032751495873e-07, + "logits/chosen": -1.0868529081344604, + "logits/rejected": -1.1445951461791992, + "logps/chosen": -2.8452606201171875, + "logps/rejected": -2.932924270629883, + "loss": 3.5403, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.452606201171875, + "rewards/margins": 0.8766378164291382, + "rewards/rejected": -29.32924461364746, + "step": 18205 + }, + { + "epoch": 0.6137719505207455, + "grad_norm": 61.71173858642578, + "learning_rate": 3.896163788066061e-07, + "logits/chosen": -1.2802660465240479, + "logits/rejected": -1.3684922456741333, + "logps/chosen": -3.304487943649292, + "logps/rejected": -3.5455081462860107, + "loss": 4.7107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.04487991333008, + "rewards/margins": 2.410205364227295, + "rewards/rejected": -35.45508575439453, + "step": 18210 + }, + { + "epoch": 0.6139404765917288, + "grad_norm": 73.96290588378906, + "learning_rate": 3.8932952066304745e-07, + "logits/chosen": -1.6128242015838623, + "logits/rejected": -1.8162364959716797, + "logps/chosen": -3.207860231399536, + "logps/rejected": -3.3573410511016846, + "loss": 5.1996, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.0786018371582, + "rewards/margins": 1.4948112964630127, + "rewards/rejected": -33.57341384887695, + "step": 18215 + }, + { + "epoch": 0.614109002662712, + "grad_norm": 26.064023971557617, + "learning_rate": 3.8904270081818125e-07, + "logits/chosen": -2.0467605590820312, + "logits/rejected": -2.187319278717041, + "logps/chosen": -2.2105517387390137, + "logps/rejected": -2.0833380222320557, + "loss": 4.5352, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.105518341064453, + "rewards/margins": -1.2721388339996338, + "rewards/rejected": -20.833377838134766, + "step": 18220 + }, + { + "epoch": 0.6142775287336951, + "grad_norm": 12.313178062438965, + "learning_rate": 3.8875591937126477e-07, + "logits/chosen": -1.4248065948486328, + "logits/rejected": -2.334559202194214, + "logps/chosen": -1.9030323028564453, + "logps/rejected": -2.7566123008728027, + "loss": 2.0344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.030323028564453, + "rewards/margins": 8.535801887512207, + "rewards/rejected": -27.566125869750977, + "step": 18225 + }, + { + "epoch": 0.6144460548046783, + "grad_norm": 48.34172439575195, + "learning_rate": 3.88469176421542e-07, + "logits/chosen": -1.7388198375701904, + "logits/rejected": -1.784240961074829, + "logps/chosen": -2.641664743423462, + "logps/rejected": -2.8418049812316895, + "loss": 3.8033, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.416645050048828, + "rewards/margins": 2.0014047622680664, + "rewards/rejected": -28.41805076599121, + "step": 18230 + }, + { + "epoch": 0.6146145808756615, + "grad_norm": 34.234649658203125, + "learning_rate": 3.8818247206824284e-07, + "logits/chosen": -1.5583927631378174, + "logits/rejected": -1.8008098602294922, + "logps/chosen": -1.9230226278305054, + "logps/rejected": -2.3101422786712646, + "loss": 2.172, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.230226516723633, + "rewards/margins": 3.8711953163146973, + "rewards/rejected": -23.101421356201172, + "step": 18235 + }, + { + "epoch": 0.6147831069466446, + "grad_norm": 16.408966064453125, + "learning_rate": 3.878958064105847e-07, + "logits/chosen": -1.8375132083892822, + "logits/rejected": -2.240048885345459, + "logps/chosen": -1.6549434661865234, + "logps/rejected": -2.166231393814087, + "loss": 1.856, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.549434661865234, + "rewards/margins": 5.112878799438477, + "rewards/rejected": -21.66231346130371, + "step": 18240 + }, + { + "epoch": 0.6149516330176278, + "grad_norm": 17.53336524963379, + "learning_rate": 3.8760917954777123e-07, + "logits/chosen": -1.7445443868637085, + "logits/rejected": -1.7051893472671509, + "logps/chosen": -2.308375835418701, + "logps/rejected": -2.4736084938049316, + "loss": 3.2146, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.083759307861328, + "rewards/margins": 1.6523252725601196, + "rewards/rejected": -24.736083984375, + "step": 18245 + }, + { + "epoch": 0.6151201590886111, + "grad_norm": 87.04015350341797, + "learning_rate": 3.8732259157899295e-07, + "logits/chosen": -1.6780481338500977, + "logits/rejected": -2.038491725921631, + "logps/chosen": -3.206683397293091, + "logps/rejected": -3.6300296783447266, + "loss": 1.6632, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -32.06683349609375, + "rewards/margins": 4.233463287353516, + "rewards/rejected": -36.30029296875, + "step": 18250 + }, + { + "epoch": 0.6152886851595942, + "grad_norm": 25.186378479003906, + "learning_rate": 3.8703604260342616e-07, + "logits/chosen": -1.5214375257492065, + "logits/rejected": -1.491634488105774, + "logps/chosen": -2.2915260791778564, + "logps/rejected": -2.593151092529297, + "loss": 1.7989, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.915258407592773, + "rewards/margins": 3.016251564025879, + "rewards/rejected": -25.9315128326416, + "step": 18255 + }, + { + "epoch": 0.6154572112305774, + "grad_norm": 27.948070526123047, + "learning_rate": 3.8674953272023443e-07, + "logits/chosen": -1.2256909608840942, + "logits/rejected": -1.4115798473358154, + "logps/chosen": -2.4634010791778564, + "logps/rejected": -2.0862600803375244, + "loss": 6.8228, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -24.634008407592773, + "rewards/margins": -3.771408796310425, + "rewards/rejected": -20.862600326538086, + "step": 18260 + }, + { + "epoch": 0.6156257373015606, + "grad_norm": 0.05216868966817856, + "learning_rate": 3.864630620285676e-07, + "logits/chosen": -1.603329062461853, + "logits/rejected": -2.0473170280456543, + "logps/chosen": -2.211454391479492, + "logps/rejected": -2.6565604209899902, + "loss": 1.4562, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.114543914794922, + "rewards/margins": 4.4510602951049805, + "rewards/rejected": -26.565603256225586, + "step": 18265 + }, + { + "epoch": 0.6157942633725437, + "grad_norm": 150.82110595703125, + "learning_rate": 3.8617663062756177e-07, + "logits/chosen": -1.921468734741211, + "logits/rejected": -2.095893144607544, + "logps/chosen": -2.2001423835754395, + "logps/rejected": -2.6294565200805664, + "loss": 2.827, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.00142478942871, + "rewards/margins": 4.293142318725586, + "rewards/rejected": -26.294567108154297, + "step": 18270 + }, + { + "epoch": 0.6159627894435269, + "grad_norm": 0.3024599552154541, + "learning_rate": 3.8589023861633965e-07, + "logits/chosen": -2.103994369506836, + "logits/rejected": -2.313030242919922, + "logps/chosen": -2.3458099365234375, + "logps/rejected": -2.509472608566284, + "loss": 3.5971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.458097457885742, + "rewards/margins": 1.6366304159164429, + "rewards/rejected": -25.094730377197266, + "step": 18275 + }, + { + "epoch": 0.6161313155145101, + "grad_norm": 81.68231201171875, + "learning_rate": 3.8560388609401015e-07, + "logits/chosen": -1.8704826831817627, + "logits/rejected": -1.8536460399627686, + "logps/chosen": -2.5050511360168457, + "logps/rejected": -2.513740062713623, + "loss": 3.299, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.05051040649414, + "rewards/margins": 0.08688764274120331, + "rewards/rejected": -25.13739585876465, + "step": 18280 + }, + { + "epoch": 0.6162998415854932, + "grad_norm": 28.812223434448242, + "learning_rate": 3.8531757315966883e-07, + "logits/chosen": -1.7152595520019531, + "logits/rejected": -1.6138585805892944, + "logps/chosen": -2.5530338287353516, + "logps/rejected": -2.981071949005127, + "loss": 2.2196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.530338287353516, + "rewards/margins": 4.280381679534912, + "rewards/rejected": -29.810720443725586, + "step": 18285 + }, + { + "epoch": 0.6164683676564765, + "grad_norm": 58.12255859375, + "learning_rate": 3.8503129991239695e-07, + "logits/chosen": -1.7070395946502686, + "logits/rejected": -1.852764368057251, + "logps/chosen": -2.6107370853424072, + "logps/rejected": -2.647188186645508, + "loss": 3.906, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.107370376586914, + "rewards/margins": 0.36450958251953125, + "rewards/rejected": -26.471881866455078, + "step": 18290 + }, + { + "epoch": 0.6166368937274597, + "grad_norm": 68.67720031738281, + "learning_rate": 3.8474506645126257e-07, + "logits/chosen": -1.6032400131225586, + "logits/rejected": -1.540244221687317, + "logps/chosen": -2.1491074562072754, + "logps/rejected": -2.198559284210205, + "loss": 2.7402, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.491073608398438, + "rewards/margins": 0.4945201873779297, + "rewards/rejected": -21.985593795776367, + "step": 18295 + }, + { + "epoch": 0.6168054197984428, + "grad_norm": 102.33248138427734, + "learning_rate": 3.8445887287532006e-07, + "logits/chosen": -1.7181625366210938, + "logits/rejected": -1.9780042171478271, + "logps/chosen": -2.8356316089630127, + "logps/rejected": -3.348367691040039, + "loss": 1.4323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.3563175201416, + "rewards/margins": 5.127358913421631, + "rewards/rejected": -33.483680725097656, + "step": 18300 + }, + { + "epoch": 0.616973945869426, + "grad_norm": 36.54032516479492, + "learning_rate": 3.8417271928360934e-07, + "logits/chosen": -1.6353356838226318, + "logits/rejected": -1.8744697570800781, + "logps/chosen": -1.9833621978759766, + "logps/rejected": -2.1706643104553223, + "loss": 3.0216, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.833620071411133, + "rewards/margins": 1.873023271560669, + "rewards/rejected": -21.70664405822754, + "step": 18305 + }, + { + "epoch": 0.6171424719404092, + "grad_norm": 25.216777801513672, + "learning_rate": 3.83886605775157e-07, + "logits/chosen": -1.900887131690979, + "logits/rejected": -1.8457438945770264, + "logps/chosen": -3.0988681316375732, + "logps/rejected": -3.0910229682922363, + "loss": 4.2173, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.988683700561523, + "rewards/margins": -0.07845230400562286, + "rewards/rejected": -30.910228729248047, + "step": 18310 + }, + { + "epoch": 0.6173109980113923, + "grad_norm": 48.769683837890625, + "learning_rate": 3.8360053244897573e-07, + "logits/chosen": -1.7366855144500732, + "logits/rejected": -1.7216198444366455, + "logps/chosen": -2.0870800018310547, + "logps/rejected": -2.0824666023254395, + "loss": 3.4526, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.870800018310547, + "rewards/margins": -0.04613447189331055, + "rewards/rejected": -20.824665069580078, + "step": 18315 + }, + { + "epoch": 0.6174795240823755, + "grad_norm": 31.436220169067383, + "learning_rate": 3.8331449940406444e-07, + "logits/chosen": -2.0017778873443604, + "logits/rejected": -2.344151496887207, + "logps/chosen": -3.020869493484497, + "logps/rejected": -3.665548801422119, + "loss": 1.5649, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.208694458007812, + "rewards/margins": 6.4467949867248535, + "rewards/rejected": -36.655487060546875, + "step": 18320 + }, + { + "epoch": 0.6176480501533588, + "grad_norm": 11.105485916137695, + "learning_rate": 3.8302850673940745e-07, + "logits/chosen": -1.6430978775024414, + "logits/rejected": -1.8722255229949951, + "logps/chosen": -2.1614890098571777, + "logps/rejected": -2.7056336402893066, + "loss": 2.1378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.614891052246094, + "rewards/margins": 5.441445827484131, + "rewards/rejected": -27.05633544921875, + "step": 18325 + }, + { + "epoch": 0.6178165762243419, + "grad_norm": 35.78452682495117, + "learning_rate": 3.8274255455397585e-07, + "logits/chosen": -1.2529391050338745, + "logits/rejected": -1.4416625499725342, + "logps/chosen": -2.2270255088806152, + "logps/rejected": -2.205737352371216, + "loss": 3.5502, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.2702579498291, + "rewards/margins": -0.21288509666919708, + "rewards/rejected": -22.057371139526367, + "step": 18330 + }, + { + "epoch": 0.6179851022953251, + "grad_norm": 64.93318939208984, + "learning_rate": 3.8245664294672644e-07, + "logits/chosen": -1.5049943923950195, + "logits/rejected": -1.4447296857833862, + "logps/chosen": -1.9939762353897095, + "logps/rejected": -2.002965211868286, + "loss": 3.1059, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.93976593017578, + "rewards/margins": 0.08988761901855469, + "rewards/rejected": -20.029653549194336, + "step": 18335 + }, + { + "epoch": 0.6181536283663083, + "grad_norm": 29.312524795532227, + "learning_rate": 3.821707720166018e-07, + "logits/chosen": -1.8286025524139404, + "logits/rejected": -1.8722509145736694, + "logps/chosen": -1.8341219425201416, + "logps/rejected": -1.8297138214111328, + "loss": 3.2439, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.341217041015625, + "rewards/margins": -0.044080257415771484, + "rewards/rejected": -18.297138214111328, + "step": 18340 + }, + { + "epoch": 0.6183221544372914, + "grad_norm": 127.18154907226562, + "learning_rate": 3.818849418625306e-07, + "logits/chosen": -1.9834444522857666, + "logits/rejected": -1.7992169857025146, + "logps/chosen": -2.6786978244781494, + "logps/rejected": -2.7950711250305176, + "loss": 4.7649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.786977767944336, + "rewards/margins": 1.1637320518493652, + "rewards/rejected": -27.950708389282227, + "step": 18345 + }, + { + "epoch": 0.6184906805082746, + "grad_norm": 34.55775833129883, + "learning_rate": 3.815991525834276e-07, + "logits/chosen": -2.2684998512268066, + "logits/rejected": -2.30085825920105, + "logps/chosen": -3.213099241256714, + "logps/rejected": -3.595430850982666, + "loss": 4.0705, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.13098907470703, + "rewards/margins": 3.8233160972595215, + "rewards/rejected": -35.954307556152344, + "step": 18350 + }, + { + "epoch": 0.6186592065792578, + "grad_norm": 21.957063674926758, + "learning_rate": 3.8131340427819307e-07, + "logits/chosen": -1.6553875207901, + "logits/rejected": -1.6048953533172607, + "logps/chosen": -1.9786068201065063, + "logps/rejected": -2.27418851852417, + "loss": 2.6846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.786067962646484, + "rewards/margins": 2.955817222595215, + "rewards/rejected": -22.741886138916016, + "step": 18355 + }, + { + "epoch": 0.618827732650241, + "grad_norm": 43.23855972290039, + "learning_rate": 3.810276970457132e-07, + "logits/chosen": -2.354893445968628, + "logits/rejected": -2.361724853515625, + "logps/chosen": -2.808964729309082, + "logps/rejected": -3.1177735328674316, + "loss": 3.0821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.089649200439453, + "rewards/margins": 3.088087558746338, + "rewards/rejected": -31.177734375, + "step": 18360 + }, + { + "epoch": 0.6189962587212242, + "grad_norm": 30.515972137451172, + "learning_rate": 3.8074203098486004e-07, + "logits/chosen": -1.6836265325546265, + "logits/rejected": -2.1789212226867676, + "logps/chosen": -2.1196069717407227, + "logps/rejected": -2.5858001708984375, + "loss": 2.8524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.19607162475586, + "rewards/margins": 4.66193151473999, + "rewards/rejected": -25.858001708984375, + "step": 18365 + }, + { + "epoch": 0.6191647847922074, + "grad_norm": 30.81611442565918, + "learning_rate": 3.804564061944916e-07, + "logits/chosen": -1.2702562808990479, + "logits/rejected": -1.6156784296035767, + "logps/chosen": -2.31827974319458, + "logps/rejected": -2.7410683631896973, + "loss": 1.8617, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.182796478271484, + "rewards/margins": 4.227887153625488, + "rewards/rejected": -27.41068458557129, + "step": 18370 + }, + { + "epoch": 0.6193333108631905, + "grad_norm": 25.21677589416504, + "learning_rate": 3.801708227734509e-07, + "logits/chosen": -1.7795536518096924, + "logits/rejected": -2.1119651794433594, + "logps/chosen": -1.9674450159072876, + "logps/rejected": -2.2505908012390137, + "loss": 3.0825, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.674449920654297, + "rewards/margins": 2.8314576148986816, + "rewards/rejected": -22.505908966064453, + "step": 18375 + }, + { + "epoch": 0.6195018369341737, + "grad_norm": 162.6773681640625, + "learning_rate": 3.798852808205674e-07, + "logits/chosen": -2.0111989974975586, + "logits/rejected": -2.380748748779297, + "logps/chosen": -3.5007591247558594, + "logps/rejected": -3.966818332672119, + "loss": 2.9209, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.007591247558594, + "rewards/margins": 4.660592079162598, + "rewards/rejected": -39.668182373046875, + "step": 18380 + }, + { + "epoch": 0.6196703630051569, + "grad_norm": 29.002161026000977, + "learning_rate": 3.7959978043465584e-07, + "logits/chosen": -1.1822447776794434, + "logits/rejected": -1.4777024984359741, + "logps/chosen": -2.1516406536102295, + "logps/rejected": -2.1278204917907715, + "loss": 4.0319, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.516408920288086, + "rewards/margins": -0.2382035255432129, + "rewards/rejected": -21.2782039642334, + "step": 18385 + }, + { + "epoch": 0.61983888907614, + "grad_norm": 33.008941650390625, + "learning_rate": 3.7931432171451695e-07, + "logits/chosen": -1.0619652271270752, + "logits/rejected": -1.018744707107544, + "logps/chosen": -3.1147468090057373, + "logps/rejected": -3.2658798694610596, + "loss": 2.2896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.1474666595459, + "rewards/margins": 1.5113319158554077, + "rewards/rejected": -32.6588020324707, + "step": 18390 + }, + { + "epoch": 0.6200074151471232, + "grad_norm": 29.102638244628906, + "learning_rate": 3.7902890475893625e-07, + "logits/chosen": -1.483496904373169, + "logits/rejected": -1.4409980773925781, + "logps/chosen": -2.2974536418914795, + "logps/rejected": -2.2440028190612793, + "loss": 4.9437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.974538803100586, + "rewards/margins": -0.5345101356506348, + "rewards/rejected": -22.44002914428711, + "step": 18395 + }, + { + "epoch": 0.6201759412181065, + "grad_norm": 26.34992218017578, + "learning_rate": 3.787435296666855e-07, + "logits/chosen": -1.703963279724121, + "logits/rejected": -1.8446147441864014, + "logps/chosen": -2.14034366607666, + "logps/rejected": -2.2772395610809326, + "loss": 2.9894, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.403438568115234, + "rewards/margins": 1.3689591884613037, + "rewards/rejected": -22.772396087646484, + "step": 18400 + }, + { + "epoch": 0.6201759412181065, + "eval_logits/chosen": -2.0543246269226074, + "eval_logits/rejected": -2.208879232406616, + "eval_logps/chosen": -2.161914110183716, + "eval_logps/rejected": -2.3027634620666504, + "eval_loss": 3.003286838531494, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.619140625, + "eval_rewards/margins": 1.4084923267364502, + "eval_rewards/rejected": -23.027631759643555, + "eval_runtime": 12.9004, + "eval_samples_per_second": 7.752, + "eval_steps_per_second": 1.938, + "step": 18400 + }, + { + "epoch": 0.6203444672890897, + "grad_norm": 158.8337860107422, + "learning_rate": 3.78458196536522e-07, + "logits/chosen": -1.6829944849014282, + "logits/rejected": -1.850947618484497, + "logps/chosen": -2.789701461791992, + "logps/rejected": -2.4470295906066895, + "loss": 6.636, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.89701271057129, + "rewards/margins": -3.426718235015869, + "rewards/rejected": -24.470294952392578, + "step": 18405 + }, + { + "epoch": 0.6205129933600728, + "grad_norm": 36.712547302246094, + "learning_rate": 3.7817290546718796e-07, + "logits/chosen": -1.125700831413269, + "logits/rejected": -1.4983158111572266, + "logps/chosen": -2.071444034576416, + "logps/rejected": -2.1972525119781494, + "loss": 3.7233, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.714439392089844, + "rewards/margins": 1.2580852508544922, + "rewards/rejected": -21.972524642944336, + "step": 18410 + }, + { + "epoch": 0.620681519431056, + "grad_norm": 16.614980697631836, + "learning_rate": 3.7788765655741165e-07, + "logits/chosen": -1.7382490634918213, + "logits/rejected": -2.124817132949829, + "logps/chosen": -2.513784408569336, + "logps/rejected": -2.7320122718811035, + "loss": 2.1141, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.13784408569336, + "rewards/margins": 2.182276964187622, + "rewards/rejected": -27.32012367248535, + "step": 18415 + }, + { + "epoch": 0.6208500455020391, + "grad_norm": 21.424095153808594, + "learning_rate": 3.7760244990590627e-07, + "logits/chosen": -1.7637748718261719, + "logits/rejected": -1.7967971563339233, + "logps/chosen": -2.9038190841674805, + "logps/rejected": -3.031123399734497, + "loss": 3.3259, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.038188934326172, + "rewards/margins": 1.2730454206466675, + "rewards/rejected": -30.311237335205078, + "step": 18420 + }, + { + "epoch": 0.6210185715730223, + "grad_norm": 24.72469711303711, + "learning_rate": 3.773172856113709e-07, + "logits/chosen": -1.4290255308151245, + "logits/rejected": -1.7143604755401611, + "logps/chosen": -1.9013131856918335, + "logps/rejected": -2.246443510055542, + "loss": 2.1578, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.013132095336914, + "rewards/margins": 3.4513046741485596, + "rewards/rejected": -22.464435577392578, + "step": 18425 + }, + { + "epoch": 0.6211870976440055, + "grad_norm": 33.128170013427734, + "learning_rate": 3.770321637724893e-07, + "logits/chosen": -1.646192193031311, + "logits/rejected": -1.898533582687378, + "logps/chosen": -3.1607718467712402, + "logps/rejected": -3.6098225116729736, + "loss": 3.0863, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.607717514038086, + "rewards/margins": 4.490505218505859, + "rewards/rejected": -36.09822463989258, + "step": 18430 + }, + { + "epoch": 0.6213556237149888, + "grad_norm": 36.44154357910156, + "learning_rate": 3.7674708448793105e-07, + "logits/chosen": -1.2727489471435547, + "logits/rejected": -1.7410869598388672, + "logps/chosen": -2.239375114440918, + "logps/rejected": -3.270235776901245, + "loss": 1.8125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.393749237060547, + "rewards/margins": 10.308609008789062, + "rewards/rejected": -32.70235824584961, + "step": 18435 + }, + { + "epoch": 0.6215241497859719, + "grad_norm": 28.19940948486328, + "learning_rate": 3.764620478563511e-07, + "logits/chosen": -1.2106577157974243, + "logits/rejected": -1.5160796642303467, + "logps/chosen": -2.4029972553253174, + "logps/rejected": -2.482666492462158, + "loss": 2.6059, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.02997398376465, + "rewards/margins": 0.7966904640197754, + "rewards/rejected": -24.826663970947266, + "step": 18440 + }, + { + "epoch": 0.6216926758569551, + "grad_norm": 24.05192756652832, + "learning_rate": 3.76177053976389e-07, + "logits/chosen": -1.7198930978775024, + "logits/rejected": -2.0885720252990723, + "logps/chosen": -1.9368069171905518, + "logps/rejected": -2.056891918182373, + "loss": 3.4735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.36806869506836, + "rewards/margins": 1.200850009918213, + "rewards/rejected": -20.568918228149414, + "step": 18445 + }, + { + "epoch": 0.6218612019279383, + "grad_norm": 17.315340042114258, + "learning_rate": 3.758921029466701e-07, + "logits/chosen": -1.7316211462020874, + "logits/rejected": -1.7989072799682617, + "logps/chosen": -2.3561413288116455, + "logps/rejected": -2.818152904510498, + "loss": 2.7612, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.561410903930664, + "rewards/margins": 4.620118141174316, + "rewards/rejected": -28.181528091430664, + "step": 18450 + }, + { + "epoch": 0.6220297279989214, + "grad_norm": 35.148826599121094, + "learning_rate": 3.7560719486580494e-07, + "logits/chosen": -2.0707952976226807, + "logits/rejected": -2.0945842266082764, + "logps/chosen": -2.033674955368042, + "logps/rejected": -1.9517732858657837, + "loss": 4.0518, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.33675193786621, + "rewards/margins": -0.8190194368362427, + "rewards/rejected": -19.517730712890625, + "step": 18455 + }, + { + "epoch": 0.6221982540699046, + "grad_norm": 41.66608810424805, + "learning_rate": 3.7532232983238847e-07, + "logits/chosen": -1.5666892528533936, + "logits/rejected": -1.5280810594558716, + "logps/chosen": -1.975049614906311, + "logps/rejected": -1.9352716207504272, + "loss": 3.5712, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.7504940032959, + "rewards/margins": -0.39777764678001404, + "rewards/rejected": -19.352718353271484, + "step": 18460 + }, + { + "epoch": 0.6223667801408878, + "grad_norm": 65.85220336914062, + "learning_rate": 3.750375079450016e-07, + "logits/chosen": -1.7846683263778687, + "logits/rejected": -1.8842815160751343, + "logps/chosen": -2.4808125495910645, + "logps/rejected": -2.9712767601013184, + "loss": 2.332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.80812644958496, + "rewards/margins": 4.90463924407959, + "rewards/rejected": -29.712764739990234, + "step": 18465 + }, + { + "epoch": 0.622535306211871, + "grad_norm": 44.47334289550781, + "learning_rate": 3.747527293022099e-07, + "logits/chosen": -1.5464017391204834, + "logits/rejected": -1.6421788930892944, + "logps/chosen": -2.0164692401885986, + "logps/rejected": -1.9157434701919556, + "loss": 4.4096, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.164691925048828, + "rewards/margins": -1.0072581768035889, + "rewards/rejected": -19.157434463500977, + "step": 18470 + }, + { + "epoch": 0.6227038322828542, + "grad_norm": 54.05680847167969, + "learning_rate": 3.7446799400256415e-07, + "logits/chosen": -1.3993407487869263, + "logits/rejected": -1.7749563455581665, + "logps/chosen": -1.9316022396087646, + "logps/rejected": -2.392576217651367, + "loss": 2.0747, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.316022872924805, + "rewards/margins": 4.609738349914551, + "rewards/rejected": -23.92576026916504, + "step": 18475 + }, + { + "epoch": 0.6228723583538374, + "grad_norm": 101.51625061035156, + "learning_rate": 3.741833021445999e-07, + "logits/chosen": -1.7461254596710205, + "logits/rejected": -1.7653440237045288, + "logps/chosen": -2.2352042198181152, + "logps/rejected": -2.2143850326538086, + "loss": 3.3942, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.35204315185547, + "rewards/margins": -0.20819291472434998, + "rewards/rejected": -22.143848419189453, + "step": 18480 + }, + { + "epoch": 0.6230408844248205, + "grad_norm": 170.65745544433594, + "learning_rate": 3.7389865382683774e-07, + "logits/chosen": -1.717813491821289, + "logits/rejected": -1.915365219116211, + "logps/chosen": -2.594407081604004, + "logps/rejected": -2.9728779792785645, + "loss": 4.5426, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.94407081604004, + "rewards/margins": 3.784705400466919, + "rewards/rejected": -29.728778839111328, + "step": 18485 + }, + { + "epoch": 0.6232094104958037, + "grad_norm": 30.940324783325195, + "learning_rate": 3.7361404914778326e-07, + "logits/chosen": -1.867049217224121, + "logits/rejected": -2.008021354675293, + "logps/chosen": -2.037647008895874, + "logps/rejected": -2.712981700897217, + "loss": 2.4101, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.3764705657959, + "rewards/margins": 6.753344535827637, + "rewards/rejected": -27.12981605529785, + "step": 18490 + }, + { + "epoch": 0.6233779365667869, + "grad_norm": 42.33964538574219, + "learning_rate": 3.73329488205927e-07, + "logits/chosen": -1.4565184116363525, + "logits/rejected": -1.9266363382339478, + "logps/chosen": -2.5136559009552, + "logps/rejected": -3.28765606880188, + "loss": 1.3486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.136554718017578, + "rewards/margins": 7.7400031089782715, + "rewards/rejected": -32.87656021118164, + "step": 18495 + }, + { + "epoch": 0.62354646263777, + "grad_norm": 22.314546585083008, + "learning_rate": 3.730449710997442e-07, + "logits/chosen": -1.321411371231079, + "logits/rejected": -1.730507254600525, + "logps/chosen": -1.797588586807251, + "logps/rejected": -1.8665136098861694, + "loss": 2.5434, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.97588539123535, + "rewards/margins": 0.6892514228820801, + "rewards/rejected": -18.665136337280273, + "step": 18500 + }, + { + "epoch": 0.6237149887087532, + "grad_norm": 21.432235717773438, + "learning_rate": 3.727604979276951e-07, + "logits/chosen": -1.1045910120010376, + "logits/rejected": -1.2482346296310425, + "logps/chosen": -2.0690102577209473, + "logps/rejected": -2.294459581375122, + "loss": 2.9685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.69010353088379, + "rewards/margins": 2.2544922828674316, + "rewards/rejected": -22.944595336914062, + "step": 18505 + }, + { + "epoch": 0.6238835147797365, + "grad_norm": 30.58997344970703, + "learning_rate": 3.724760687882248e-07, + "logits/chosen": -1.981286644935608, + "logits/rejected": -1.9685707092285156, + "logps/chosen": -1.7274444103240967, + "logps/rejected": -1.8757808208465576, + "loss": 2.8326, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.274444580078125, + "rewards/margins": 1.483363151550293, + "rewards/rejected": -18.757808685302734, + "step": 18510 + }, + { + "epoch": 0.6240520408507196, + "grad_norm": 28.33938217163086, + "learning_rate": 3.7219168377976267e-07, + "logits/chosen": -1.7231643199920654, + "logits/rejected": -1.5711719989776611, + "logps/chosen": -2.3871026039123535, + "logps/rejected": -2.6765151023864746, + "loss": 3.2495, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.87102508544922, + "rewards/margins": 2.89412522315979, + "rewards/rejected": -26.765151977539062, + "step": 18515 + }, + { + "epoch": 0.6242205669217028, + "grad_norm": 27.47281837463379, + "learning_rate": 3.7190734300072336e-07, + "logits/chosen": -2.195241689682007, + "logits/rejected": -2.453075885772705, + "logps/chosen": -1.7438371181488037, + "logps/rejected": -1.9724502563476562, + "loss": 2.3724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.438373565673828, + "rewards/margins": 2.2861297130584717, + "rewards/rejected": -19.724502563476562, + "step": 18520 + }, + { + "epoch": 0.624389092992686, + "grad_norm": 28.859298706054688, + "learning_rate": 3.7162304654950614e-07, + "logits/chosen": -1.7981083393096924, + "logits/rejected": -1.8701518774032593, + "logps/chosen": -2.183924436569214, + "logps/rejected": -2.228469133377075, + "loss": 3.7653, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.839244842529297, + "rewards/margins": 0.4454454481601715, + "rewards/rejected": -22.28468894958496, + "step": 18525 + }, + { + "epoch": 0.6245576190636691, + "grad_norm": 28.86895751953125, + "learning_rate": 3.7133879452449446e-07, + "logits/chosen": -1.7908798456192017, + "logits/rejected": -1.7884708642959595, + "logps/chosen": -2.4270756244659424, + "logps/rejected": -2.6708824634552, + "loss": 2.6429, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.2707576751709, + "rewards/margins": 2.438068151473999, + "rewards/rejected": -26.708826065063477, + "step": 18530 + }, + { + "epoch": 0.6247261451346523, + "grad_norm": 50.5154914855957, + "learning_rate": 3.71054587024057e-07, + "logits/chosen": -1.2883013486862183, + "logits/rejected": -1.6345545053482056, + "logps/chosen": -1.8493115901947021, + "logps/rejected": -2.293156385421753, + "loss": 2.6221, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.493114471435547, + "rewards/margins": 4.438447952270508, + "rewards/rejected": -22.931560516357422, + "step": 18535 + }, + { + "epoch": 0.6248946712056355, + "grad_norm": 30.61741065979004, + "learning_rate": 3.707704241465467e-07, + "logits/chosen": -0.9950865507125854, + "logits/rejected": -1.1939284801483154, + "logps/chosen": -2.134911298751831, + "logps/rejected": -3.1475205421447754, + "loss": 1.5967, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.349111557006836, + "rewards/margins": 10.126092910766602, + "rewards/rejected": -31.475208282470703, + "step": 18540 + }, + { + "epoch": 0.6250631972766187, + "grad_norm": 84.05421447753906, + "learning_rate": 3.7048630599030134e-07, + "logits/chosen": -1.2695919275283813, + "logits/rejected": -1.594789743423462, + "logps/chosen": -1.8781297206878662, + "logps/rejected": -2.194685697555542, + "loss": 1.3803, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.78129768371582, + "rewards/margins": 3.1655619144439697, + "rewards/rejected": -21.946857452392578, + "step": 18545 + }, + { + "epoch": 0.6252317233476019, + "grad_norm": 52.571868896484375, + "learning_rate": 3.7020223265364264e-07, + "logits/chosen": -1.192299485206604, + "logits/rejected": -1.9590389728546143, + "logps/chosen": -2.2939975261688232, + "logps/rejected": -2.977759838104248, + "loss": 2.1195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.939977645874023, + "rewards/margins": 6.837620735168457, + "rewards/rejected": -29.777597427368164, + "step": 18550 + }, + { + "epoch": 0.6254002494185851, + "grad_norm": 33.046817779541016, + "learning_rate": 3.699182042348774e-07, + "logits/chosen": -1.9198243618011475, + "logits/rejected": -1.962834358215332, + "logps/chosen": -3.0863873958587646, + "logps/rejected": -3.3279757499694824, + "loss": 5.3368, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.863876342773438, + "rewards/margins": 2.415881633758545, + "rewards/rejected": -33.27975845336914, + "step": 18555 + }, + { + "epoch": 0.6255687754895682, + "grad_norm": 45.52570724487305, + "learning_rate": 3.6963422083229676e-07, + "logits/chosen": -1.1146682500839233, + "logits/rejected": -1.5024160146713257, + "logps/chosen": -2.387057304382324, + "logps/rejected": -2.8693313598632812, + "loss": 3.674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.870573043823242, + "rewards/margins": 4.822741508483887, + "rewards/rejected": -28.693313598632812, + "step": 18560 + }, + { + "epoch": 0.6257373015605514, + "grad_norm": 34.899070739746094, + "learning_rate": 3.6935028254417597e-07, + "logits/chosen": -1.2353322505950928, + "logits/rejected": -1.1657068729400635, + "logps/chosen": -1.8352171182632446, + "logps/rejected": -1.9835834503173828, + "loss": 2.1929, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.352170944213867, + "rewards/margins": 1.4836634397506714, + "rewards/rejected": -19.835834503173828, + "step": 18565 + }, + { + "epoch": 0.6259058276315346, + "grad_norm": 16.575700759887695, + "learning_rate": 3.69066389468775e-07, + "logits/chosen": -1.628157377243042, + "logits/rejected": -1.5780006647109985, + "logps/chosen": -1.9046649932861328, + "logps/rejected": -2.117918014526367, + "loss": 2.2805, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.046649932861328, + "rewards/margins": 2.1325302124023438, + "rewards/rejected": -21.179180145263672, + "step": 18570 + }, + { + "epoch": 0.6260743537025177, + "grad_norm": 32.12807846069336, + "learning_rate": 3.687825417043381e-07, + "logits/chosen": -1.7738593816757202, + "logits/rejected": -1.9184929132461548, + "logps/chosen": -2.171799659729004, + "logps/rejected": -2.3933589458465576, + "loss": 3.5577, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.71799659729004, + "rewards/margins": 2.215592622756958, + "rewards/rejected": -23.933589935302734, + "step": 18575 + }, + { + "epoch": 0.626242879773501, + "grad_norm": 34.624046325683594, + "learning_rate": 3.684987393490939e-07, + "logits/chosen": -0.7456585764884949, + "logits/rejected": -0.8155146837234497, + "logps/chosen": -2.190500259399414, + "logps/rejected": -2.361079692840576, + "loss": 1.9946, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.90500259399414, + "rewards/margins": 1.705796480178833, + "rewards/rejected": -23.61079978942871, + "step": 18580 + }, + { + "epoch": 0.6264114058444842, + "grad_norm": 23.193527221679688, + "learning_rate": 3.6821498250125494e-07, + "logits/chosen": -1.531273603439331, + "logits/rejected": -1.4955469369888306, + "logps/chosen": -2.295144557952881, + "logps/rejected": -2.4351449012756348, + "loss": 3.1515, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.951446533203125, + "rewards/margins": 1.4000024795532227, + "rewards/rejected": -24.3514461517334, + "step": 18585 + }, + { + "epoch": 0.6265799319154673, + "grad_norm": 35.371726989746094, + "learning_rate": 3.679312712590183e-07, + "logits/chosen": -1.5566551685333252, + "logits/rejected": -1.6705825328826904, + "logps/chosen": -2.2006218433380127, + "logps/rejected": -2.512608766555786, + "loss": 2.6644, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.00621795654297, + "rewards/margins": 3.1198689937591553, + "rewards/rejected": -25.126087188720703, + "step": 18590 + }, + { + "epoch": 0.6267484579864505, + "grad_norm": 46.78335189819336, + "learning_rate": 3.6764760572056567e-07, + "logits/chosen": -1.7316887378692627, + "logits/rejected": -1.6605297327041626, + "logps/chosen": -1.9013340473175049, + "logps/rejected": -2.0264945030212402, + "loss": 2.8801, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.01333999633789, + "rewards/margins": 1.2516052722930908, + "rewards/rejected": -20.26494598388672, + "step": 18595 + }, + { + "epoch": 0.6269169840574337, + "grad_norm": 80.91983032226562, + "learning_rate": 3.6736398598406205e-07, + "logits/chosen": -1.4064184427261353, + "logits/rejected": -1.4780693054199219, + "logps/chosen": -2.071498155593872, + "logps/rejected": -2.684077739715576, + "loss": 2.1366, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.71497917175293, + "rewards/margins": 6.125799179077148, + "rewards/rejected": -26.840778350830078, + "step": 18600 + }, + { + "epoch": 0.6270855101284168, + "grad_norm": 22.150487899780273, + "learning_rate": 3.670804121476571e-07, + "logits/chosen": -1.170275330543518, + "logits/rejected": -1.510514259338379, + "logps/chosen": -2.4041285514831543, + "logps/rejected": -3.1954643726348877, + "loss": 2.4695, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.04128646850586, + "rewards/margins": 7.913358211517334, + "rewards/rejected": -31.95464515686035, + "step": 18605 + }, + { + "epoch": 0.6272540361994, + "grad_norm": 16.839929580688477, + "learning_rate": 3.6679688430948477e-07, + "logits/chosen": -1.8504005670547485, + "logits/rejected": -2.0550291538238525, + "logps/chosen": -2.6161468029022217, + "logps/rejected": -4.031783103942871, + "loss": 2.3714, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.161468505859375, + "rewards/margins": 14.156366348266602, + "rewards/rejected": -40.317832946777344, + "step": 18610 + }, + { + "epoch": 0.6274225622703832, + "grad_norm": 25.382099151611328, + "learning_rate": 3.66513402567663e-07, + "logits/chosen": -1.4021594524383545, + "logits/rejected": -1.5731501579284668, + "logps/chosen": -2.061854600906372, + "logps/rejected": -2.361215114593506, + "loss": 1.9625, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.61854362487793, + "rewards/margins": 2.9936070442199707, + "rewards/rejected": -23.612152099609375, + "step": 18615 + }, + { + "epoch": 0.6275910883413665, + "grad_norm": 26.835275650024414, + "learning_rate": 3.6622996702029317e-07, + "logits/chosen": -1.4052951335906982, + "logits/rejected": -1.4855515956878662, + "logps/chosen": -1.8088502883911133, + "logps/rejected": -2.0100693702697754, + "loss": 1.6008, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.088504791259766, + "rewards/margins": 2.0121896266937256, + "rewards/rejected": -20.100692749023438, + "step": 18620 + }, + { + "epoch": 0.6277596144123496, + "grad_norm": 23.951950073242188, + "learning_rate": 3.659465777654615e-07, + "logits/chosen": -1.716301679611206, + "logits/rejected": -1.5523309707641602, + "logps/chosen": -1.6933162212371826, + "logps/rejected": -1.782979965209961, + "loss": 2.6362, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.933162689208984, + "rewards/margins": 0.8966361880302429, + "rewards/rejected": -17.82979965209961, + "step": 18625 + }, + { + "epoch": 0.6279281404833328, + "grad_norm": 28.52524757385254, + "learning_rate": 3.6566323490123785e-07, + "logits/chosen": -1.995849370956421, + "logits/rejected": -2.2262625694274902, + "logps/chosen": -2.3244833946228027, + "logps/rejected": -2.6942267417907715, + "loss": 3.0486, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.24483299255371, + "rewards/margins": 3.697434663772583, + "rewards/rejected": -26.942270278930664, + "step": 18630 + }, + { + "epoch": 0.628096666554316, + "grad_norm": 25.172317504882812, + "learning_rate": 3.6537993852567584e-07, + "logits/chosen": -1.965527892112732, + "logits/rejected": -2.0461528301239014, + "logps/chosen": -2.907482147216797, + "logps/rejected": -3.6528172492980957, + "loss": 2.0917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.074819564819336, + "rewards/margins": 7.453348636627197, + "rewards/rejected": -36.528175354003906, + "step": 18635 + }, + { + "epoch": 0.6282651926252991, + "grad_norm": 25.730562210083008, + "learning_rate": 3.6509668873681327e-07, + "logits/chosen": -1.723894715309143, + "logits/rejected": -1.7027992010116577, + "logps/chosen": -2.7941901683807373, + "logps/rejected": -2.780871868133545, + "loss": 3.8698, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.941904067993164, + "rewards/margins": -0.1331832855939865, + "rewards/rejected": -27.8087215423584, + "step": 18640 + }, + { + "epoch": 0.6284337186962823, + "grad_norm": 26.492856979370117, + "learning_rate": 3.6481348563267176e-07, + "logits/chosen": -1.3790489435195923, + "logits/rejected": -1.5762332677841187, + "logps/chosen": -1.9941978454589844, + "logps/rejected": -2.02683687210083, + "loss": 2.8477, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.941980361938477, + "rewards/margins": 0.3263886570930481, + "rewards/rejected": -20.268369674682617, + "step": 18645 + }, + { + "epoch": 0.6286022447672655, + "grad_norm": 21.514509201049805, + "learning_rate": 3.6453032931125695e-07, + "logits/chosen": -2.6226284503936768, + "logits/rejected": -2.9686291217803955, + "logps/chosen": -3.428541898727417, + "logps/rejected": -4.568081855773926, + "loss": 2.2685, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.28541946411133, + "rewards/margins": 11.395398139953613, + "rewards/rejected": -45.680816650390625, + "step": 18650 + }, + { + "epoch": 0.6287707708382487, + "grad_norm": 40.166297912597656, + "learning_rate": 3.642472198705576e-07, + "logits/chosen": -1.676222562789917, + "logits/rejected": -1.9952523708343506, + "logps/chosen": -2.4934425354003906, + "logps/rejected": -2.7475247383117676, + "loss": 2.4238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.93442726135254, + "rewards/margins": 2.540821075439453, + "rewards/rejected": -27.47524642944336, + "step": 18655 + }, + { + "epoch": 0.6289392969092319, + "grad_norm": 43.843589782714844, + "learning_rate": 3.6396415740854715e-07, + "logits/chosen": -1.8575794696807861, + "logits/rejected": -2.393746852874756, + "logps/chosen": -2.755718946456909, + "logps/rejected": -3.4682717323303223, + "loss": 2.7489, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.55718994140625, + "rewards/margins": 7.1255292892456055, + "rewards/rejected": -34.682716369628906, + "step": 18660 + }, + { + "epoch": 0.6291078229802151, + "grad_norm": 34.07551193237305, + "learning_rate": 3.6368114202318234e-07, + "logits/chosen": -2.294307231903076, + "logits/rejected": -2.680905818939209, + "logps/chosen": -2.486971139907837, + "logps/rejected": -2.966942310333252, + "loss": 1.8959, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.869714736938477, + "rewards/margins": 4.799710750579834, + "rewards/rejected": -29.669422149658203, + "step": 18665 + }, + { + "epoch": 0.6292763490511982, + "grad_norm": 22.968944549560547, + "learning_rate": 3.6339817381240336e-07, + "logits/chosen": -1.6254408359527588, + "logits/rejected": -1.8740613460540771, + "logps/chosen": -2.6052937507629395, + "logps/rejected": -3.0102014541625977, + "loss": 1.7571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.05293846130371, + "rewards/margins": 4.049075126647949, + "rewards/rejected": -30.102014541625977, + "step": 18670 + }, + { + "epoch": 0.6294448751221814, + "grad_norm": 39.4991455078125, + "learning_rate": 3.631152528741345e-07, + "logits/chosen": -1.8259556293487549, + "logits/rejected": -1.862667441368103, + "logps/chosen": -1.8470585346221924, + "logps/rejected": -1.8689968585968018, + "loss": 3.0458, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.470584869384766, + "rewards/margins": 0.2193818986415863, + "rewards/rejected": -18.68996810913086, + "step": 18675 + }, + { + "epoch": 0.6296134011931646, + "grad_norm": 22.74688148498535, + "learning_rate": 3.6283237930628354e-07, + "logits/chosen": -1.3108330965042114, + "logits/rejected": -1.2930123805999756, + "logps/chosen": -2.7357821464538574, + "logps/rejected": -2.361436367034912, + "loss": 8.0913, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.357824325561523, + "rewards/margins": -3.7434592247009277, + "rewards/rejected": -23.614362716674805, + "step": 18680 + }, + { + "epoch": 0.6297819272641477, + "grad_norm": 33.82335662841797, + "learning_rate": 3.6254955320674215e-07, + "logits/chosen": -1.7322124242782593, + "logits/rejected": -2.1520094871520996, + "logps/chosen": -2.2102978229522705, + "logps/rejected": -3.8706984519958496, + "loss": 1.6132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.102977752685547, + "rewards/margins": 16.604013442993164, + "rewards/rejected": -38.706993103027344, + "step": 18685 + }, + { + "epoch": 0.629950453335131, + "grad_norm": 39.25462341308594, + "learning_rate": 3.6226677467338486e-07, + "logits/chosen": -1.723928451538086, + "logits/rejected": -1.8014103174209595, + "logps/chosen": -2.4551825523376465, + "logps/rejected": -2.642056941986084, + "loss": 3.9859, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.551822662353516, + "rewards/margins": 1.868748664855957, + "rewards/rejected": -26.420568466186523, + "step": 18690 + }, + { + "epoch": 0.6301189794061142, + "grad_norm": 40.322025299072266, + "learning_rate": 3.6198404380407034e-07, + "logits/chosen": -1.3129554986953735, + "logits/rejected": -1.4795254468917847, + "logps/chosen": -2.9755096435546875, + "logps/rejected": -2.795775890350342, + "loss": 6.003, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.755096435546875, + "rewards/margins": -1.7973363399505615, + "rewards/rejected": -27.9577579498291, + "step": 18695 + }, + { + "epoch": 0.6302875054770973, + "grad_norm": 14.616473197937012, + "learning_rate": 3.617013606966408e-07, + "logits/chosen": -1.6453460454940796, + "logits/rejected": -2.410950183868408, + "logps/chosen": -1.7156639099121094, + "logps/rejected": -2.1873421669006348, + "loss": 2.3051, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.156639099121094, + "rewards/margins": 4.716780185699463, + "rewards/rejected": -21.8734188079834, + "step": 18700 + }, + { + "epoch": 0.6304560315480805, + "grad_norm": 52.16621780395508, + "learning_rate": 3.614187254489215e-07, + "logits/chosen": -1.852903962135315, + "logits/rejected": -1.9818837642669678, + "logps/chosen": -2.1786704063415527, + "logps/rejected": -2.2185847759246826, + "loss": 3.7794, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.786705017089844, + "rewards/margins": 0.39914292097091675, + "rewards/rejected": -22.185848236083984, + "step": 18705 + }, + { + "epoch": 0.6306245576190637, + "grad_norm": 40.281700134277344, + "learning_rate": 3.6113613815872136e-07, + "logits/chosen": -1.6567414999008179, + "logits/rejected": -1.8920471668243408, + "logps/chosen": -2.4621388912200928, + "logps/rejected": -2.8170342445373535, + "loss": 3.8563, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.621389389038086, + "rewards/margins": 3.5489540100097656, + "rewards/rejected": -28.17034339904785, + "step": 18710 + }, + { + "epoch": 0.6307930836900468, + "grad_norm": 18.528535842895508, + "learning_rate": 3.6085359892383293e-07, + "logits/chosen": -1.451129674911499, + "logits/rejected": -1.6147100925445557, + "logps/chosen": -2.878284454345703, + "logps/rejected": -3.1676900386810303, + "loss": 3.541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.782846450805664, + "rewards/margins": 2.8940553665161133, + "rewards/rejected": -31.676898956298828, + "step": 18715 + }, + { + "epoch": 0.63096160976103, + "grad_norm": 15.683385848999023, + "learning_rate": 3.6057110784203174e-07, + "logits/chosen": -1.6452653408050537, + "logits/rejected": -1.8250606060028076, + "logps/chosen": -1.8918319940567017, + "logps/rejected": -2.104295492172241, + "loss": 1.6575, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.918319702148438, + "rewards/margins": 2.124636173248291, + "rewards/rejected": -21.04295539855957, + "step": 18720 + }, + { + "epoch": 0.6311301358320132, + "grad_norm": 559.3909301757812, + "learning_rate": 3.602886650110768e-07, + "logits/chosen": -1.2559688091278076, + "logits/rejected": -1.6382989883422852, + "logps/chosen": -2.667348861694336, + "logps/rejected": -2.693481922149658, + "loss": 4.4701, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.67348861694336, + "rewards/margins": 0.26132678985595703, + "rewards/rejected": -26.934814453125, + "step": 18725 + }, + { + "epoch": 0.6312986619029964, + "grad_norm": 26.204259872436523, + "learning_rate": 3.600062705287105e-07, + "logits/chosen": -1.9202476739883423, + "logits/rejected": -2.0799126625061035, + "logps/chosen": -2.1364963054656982, + "logps/rejected": -2.337533473968506, + "loss": 2.3524, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.364961624145508, + "rewards/margins": 2.01037335395813, + "rewards/rejected": -23.375335693359375, + "step": 18730 + }, + { + "epoch": 0.6314671879739796, + "grad_norm": 71.08409118652344, + "learning_rate": 3.5972392449265854e-07, + "logits/chosen": -1.4504081010818481, + "logits/rejected": -1.4608997106552124, + "logps/chosen": -2.9600231647491455, + "logps/rejected": -2.9094111919403076, + "loss": 3.7138, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.600229263305664, + "rewards/margins": -0.5061157941818237, + "rewards/rejected": -29.0941104888916, + "step": 18735 + }, + { + "epoch": 0.6316357140449628, + "grad_norm": 38.98657989501953, + "learning_rate": 3.594416270006295e-07, + "logits/chosen": -1.581471562385559, + "logits/rejected": -1.865757942199707, + "logps/chosen": -2.142603635787964, + "logps/rejected": -2.3634285926818848, + "loss": 2.3702, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.426036834716797, + "rewards/margins": 2.2082512378692627, + "rewards/rejected": -23.634288787841797, + "step": 18740 + }, + { + "epoch": 0.6318042401159459, + "grad_norm": 39.91908264160156, + "learning_rate": 3.591593781503156e-07, + "logits/chosen": -1.7952802181243896, + "logits/rejected": -1.8274444341659546, + "logps/chosen": -2.224351167678833, + "logps/rejected": -2.292336940765381, + "loss": 2.7999, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.243511199951172, + "rewards/margins": 0.6798585057258606, + "rewards/rejected": -22.923370361328125, + "step": 18745 + }, + { + "epoch": 0.6319727661869291, + "grad_norm": 66.18889617919922, + "learning_rate": 3.58877178039392e-07, + "logits/chosen": -1.6533877849578857, + "logits/rejected": -1.7162796258926392, + "logps/chosen": -2.54699969291687, + "logps/rejected": -2.590531587600708, + "loss": 3.4206, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.46999740600586, + "rewards/margins": 0.43531855940818787, + "rewards/rejected": -25.905315399169922, + "step": 18750 + }, + { + "epoch": 0.6321412922579123, + "grad_norm": 30.72490119934082, + "learning_rate": 3.5859502676551736e-07, + "logits/chosen": -1.9574337005615234, + "logits/rejected": -1.5460635423660278, + "logps/chosen": -2.3032872676849365, + "logps/rejected": -2.066685438156128, + "loss": 6.0732, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.032873153686523, + "rewards/margins": -2.366018772125244, + "rewards/rejected": -20.666854858398438, + "step": 18755 + }, + { + "epoch": 0.6323098183288954, + "grad_norm": 27.569772720336914, + "learning_rate": 3.583129244263325e-07, + "logits/chosen": -1.2762352228164673, + "logits/rejected": -1.673752784729004, + "logps/chosen": -2.3343570232391357, + "logps/rejected": -3.042248010635376, + "loss": 1.5226, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.343570709228516, + "rewards/margins": 7.0789079666137695, + "rewards/rejected": -30.422481536865234, + "step": 18760 + }, + { + "epoch": 0.6324783443998787, + "grad_norm": 31.950868606567383, + "learning_rate": 3.5803087111946226e-07, + "logits/chosen": -0.9125463366508484, + "logits/rejected": -1.0970439910888672, + "logps/chosen": -2.523500919342041, + "logps/rejected": -2.504364490509033, + "loss": 3.4955, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.23501205444336, + "rewards/margins": -0.19136667251586914, + "rewards/rejected": -25.043643951416016, + "step": 18765 + }, + { + "epoch": 0.6326468704708619, + "grad_norm": 27.117694854736328, + "learning_rate": 3.5774886694251426e-07, + "logits/chosen": -1.55544912815094, + "logits/rejected": -2.047374963760376, + "logps/chosen": -2.2005863189697266, + "logps/rejected": -2.595324754714966, + "loss": 2.0615, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.005863189697266, + "rewards/margins": 3.947382688522339, + "rewards/rejected": -25.9532470703125, + "step": 18770 + }, + { + "epoch": 0.632815396541845, + "grad_norm": 32.97996520996094, + "learning_rate": 3.574669119930789e-07, + "logits/chosen": -1.7787446975708008, + "logits/rejected": -2.044855833053589, + "logps/chosen": -2.233386278152466, + "logps/rejected": -2.2153115272521973, + "loss": 3.3101, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.3338623046875, + "rewards/margins": -0.18074798583984375, + "rewards/rejected": -22.153112411499023, + "step": 18775 + }, + { + "epoch": 0.6329839226128282, + "grad_norm": 32.860652923583984, + "learning_rate": 3.5718500636872983e-07, + "logits/chosen": -1.2347030639648438, + "logits/rejected": -1.4687144756317139, + "logps/chosen": -2.5832676887512207, + "logps/rejected": -2.9072065353393555, + "loss": 2.0371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.83267593383789, + "rewards/margins": 3.2393898963928223, + "rewards/rejected": -29.072063446044922, + "step": 18780 + }, + { + "epoch": 0.6331524486838114, + "grad_norm": 92.78843688964844, + "learning_rate": 3.569031501670232e-07, + "logits/chosen": -1.433812141418457, + "logits/rejected": -1.804659128189087, + "logps/chosen": -2.26090669631958, + "logps/rejected": -2.6193065643310547, + "loss": 2.3625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.609066009521484, + "rewards/margins": 3.5839996337890625, + "rewards/rejected": -26.193065643310547, + "step": 18785 + }, + { + "epoch": 0.6333209747547945, + "grad_norm": 72.81118774414062, + "learning_rate": 3.5662134348549867e-07, + "logits/chosen": -1.4099839925765991, + "logits/rejected": -1.9020986557006836, + "logps/chosen": -2.07269549369812, + "logps/rejected": -2.3570284843444824, + "loss": 1.9019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.72695541381836, + "rewards/margins": 2.8433303833007812, + "rewards/rejected": -23.57028579711914, + "step": 18790 + }, + { + "epoch": 0.6334895008257777, + "grad_norm": 74.02709197998047, + "learning_rate": 3.563395864216781e-07, + "logits/chosen": -1.6922391653060913, + "logits/rejected": -1.9072993993759155, + "logps/chosen": -2.716442823410034, + "logps/rejected": -3.1500327587127686, + "loss": 2.693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.1644287109375, + "rewards/margins": 4.335899829864502, + "rewards/rejected": -31.50032615661621, + "step": 18795 + }, + { + "epoch": 0.633658026896761, + "grad_norm": 108.89360809326172, + "learning_rate": 3.560578790730667e-07, + "logits/chosen": -1.768334150314331, + "logits/rejected": -1.6725940704345703, + "logps/chosen": -2.8877758979797363, + "logps/rejected": -2.924286365509033, + "loss": 3.497, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.877758026123047, + "rewards/margins": 0.3651053309440613, + "rewards/rejected": -29.24286460876465, + "step": 18800 + }, + { + "epoch": 0.633658026896761, + "eval_logits/chosen": -2.071402072906494, + "eval_logits/rejected": -2.2284653186798096, + "eval_logps/chosen": -2.1819846630096436, + "eval_logps/rejected": -2.3242621421813965, + "eval_loss": 3.0252206325531006, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.819847106933594, + "eval_rewards/margins": 1.42277193069458, + "eval_rewards/rejected": -23.24262046813965, + "eval_runtime": 12.8999, + "eval_samples_per_second": 7.752, + "eval_steps_per_second": 1.938, + "step": 18800 + }, + { + "epoch": 0.6338265529677442, + "grad_norm": 41.892486572265625, + "learning_rate": 3.557762215371525e-07, + "logits/chosen": -1.066054105758667, + "logits/rejected": -1.4403715133666992, + "logps/chosen": -1.9089603424072266, + "logps/rejected": -2.2967567443847656, + "loss": 1.7666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.089603424072266, + "rewards/margins": 3.877964496612549, + "rewards/rejected": -22.96756935119629, + "step": 18805 + }, + { + "epoch": 0.6339950790387273, + "grad_norm": 29.515609741210938, + "learning_rate": 3.5549461391140557e-07, + "logits/chosen": -1.927973747253418, + "logits/rejected": -2.1799073219299316, + "logps/chosen": -2.299041748046875, + "logps/rejected": -2.4362006187438965, + "loss": 3.3194, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.99041748046875, + "rewards/margins": 1.3715906143188477, + "rewards/rejected": -24.36200714111328, + "step": 18810 + }, + { + "epoch": 0.6341636051097105, + "grad_norm": 29.148569107055664, + "learning_rate": 3.5521305629327953e-07, + "logits/chosen": -1.8402526378631592, + "logits/rejected": -1.7619283199310303, + "logps/chosen": -1.8162330389022827, + "logps/rejected": -2.185051679611206, + "loss": 1.8324, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.16233253479004, + "rewards/margins": 3.688185214996338, + "rewards/rejected": -21.850515365600586, + "step": 18815 + }, + { + "epoch": 0.6343321311806936, + "grad_norm": 54.641075134277344, + "learning_rate": 3.549315487802103e-07, + "logits/chosen": -0.9678794741630554, + "logits/rejected": -0.9939224123954773, + "logps/chosen": -2.24064302444458, + "logps/rejected": -2.242785930633545, + "loss": 3.2498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.406429290771484, + "rewards/margins": 0.021430587396025658, + "rewards/rejected": -22.427860260009766, + "step": 18820 + }, + { + "epoch": 0.6345006572516768, + "grad_norm": 15.241156578063965, + "learning_rate": 3.546500914696168e-07, + "logits/chosen": -1.4445557594299316, + "logits/rejected": -1.633917212486267, + "logps/chosen": -2.0695130825042725, + "logps/rejected": -2.4762346744537354, + "loss": 1.6071, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.695133209228516, + "rewards/margins": 4.067215442657471, + "rewards/rejected": -24.762348175048828, + "step": 18825 + }, + { + "epoch": 0.63466918332266, + "grad_norm": 31.20334815979004, + "learning_rate": 3.543686844588999e-07, + "logits/chosen": -1.7547988891601562, + "logits/rejected": -2.0018696784973145, + "logps/chosen": -2.5072829723358154, + "logps/rejected": -3.3072426319122314, + "loss": 2.0993, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.072830200195312, + "rewards/margins": 7.999594211578369, + "rewards/rejected": -33.072425842285156, + "step": 18830 + }, + { + "epoch": 0.6348377093936431, + "grad_norm": 38.99212646484375, + "learning_rate": 3.540873278454436e-07, + "logits/chosen": -1.6647943258285522, + "logits/rejected": -2.0520567893981934, + "logps/chosen": -2.5589611530303955, + "logps/rejected": -2.987727403640747, + "loss": 3.0522, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.589611053466797, + "rewards/margins": 4.287664413452148, + "rewards/rejected": -29.877277374267578, + "step": 18835 + }, + { + "epoch": 0.6350062354646264, + "grad_norm": 17.255659103393555, + "learning_rate": 3.5380602172661454e-07, + "logits/chosen": -1.7852544784545898, + "logits/rejected": -1.8239473104476929, + "logps/chosen": -2.2291624546051025, + "logps/rejected": -2.2721869945526123, + "loss": 3.1657, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.291624069213867, + "rewards/margins": 0.43024587631225586, + "rewards/rejected": -22.721872329711914, + "step": 18840 + }, + { + "epoch": 0.6351747615356096, + "grad_norm": 43.60550308227539, + "learning_rate": 3.535247661997616e-07, + "logits/chosen": -1.6584608554840088, + "logits/rejected": -1.7699321508407593, + "logps/chosen": -2.3036136627197266, + "logps/rejected": -2.459486484527588, + "loss": 4.5349, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.036136627197266, + "rewards/margins": 1.5587265491485596, + "rewards/rejected": -24.594863891601562, + "step": 18845 + }, + { + "epoch": 0.6353432876065928, + "grad_norm": 22.359338760375977, + "learning_rate": 3.53243561362216e-07, + "logits/chosen": -1.2959634065628052, + "logits/rejected": -1.7488740682601929, + "logps/chosen": -1.9807817935943604, + "logps/rejected": -2.1481728553771973, + "loss": 2.394, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.807817459106445, + "rewards/margins": 1.673910140991211, + "rewards/rejected": -21.481727600097656, + "step": 18850 + }, + { + "epoch": 0.6355118136775759, + "grad_norm": 18.181415557861328, + "learning_rate": 3.529624073112918e-07, + "logits/chosen": -1.4957122802734375, + "logits/rejected": -1.6004817485809326, + "logps/chosen": -2.4971108436584473, + "logps/rejected": -2.80725359916687, + "loss": 1.7375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.971107482910156, + "rewards/margins": 3.101428270339966, + "rewards/rejected": -28.07253646850586, + "step": 18855 + }, + { + "epoch": 0.6356803397485591, + "grad_norm": 22.70155143737793, + "learning_rate": 3.526813041442855e-07, + "logits/chosen": -1.983559012413025, + "logits/rejected": -2.0745460987091064, + "logps/chosen": -1.7819048166275024, + "logps/rejected": -2.053633213043213, + "loss": 2.3117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.819047927856445, + "rewards/margins": 2.717284679412842, + "rewards/rejected": -20.536331176757812, + "step": 18860 + }, + { + "epoch": 0.6358488658195423, + "grad_norm": 47.017906188964844, + "learning_rate": 3.524002519584757e-07, + "logits/chosen": -1.9625564813613892, + "logits/rejected": -2.141477108001709, + "logps/chosen": -1.758323073387146, + "logps/rejected": -1.9290144443511963, + "loss": 2.6992, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.58323097229004, + "rewards/margins": 1.706911325454712, + "rewards/rejected": -19.290143966674805, + "step": 18865 + }, + { + "epoch": 0.6360173918905254, + "grad_norm": 29.950515747070312, + "learning_rate": 3.5211925085112347e-07, + "logits/chosen": -1.0737838745117188, + "logits/rejected": -1.1902484893798828, + "logps/chosen": -2.2974486351013184, + "logps/rejected": -2.5132625102996826, + "loss": 3.3409, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.974483489990234, + "rewards/margins": 2.15814208984375, + "rewards/rejected": -25.132625579833984, + "step": 18870 + }, + { + "epoch": 0.6361859179615087, + "grad_norm": 20.321805953979492, + "learning_rate": 3.518383009194724e-07, + "logits/chosen": -1.7747859954833984, + "logits/rejected": -1.9187867641448975, + "logps/chosen": -2.972046375274658, + "logps/rejected": -2.7977707386016846, + "loss": 6.132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.7204647064209, + "rewards/margins": -1.7427597045898438, + "rewards/rejected": -27.977706909179688, + "step": 18875 + }, + { + "epoch": 0.6363544440324919, + "grad_norm": 22.178987503051758, + "learning_rate": 3.5155740226074793e-07, + "logits/chosen": -2.397124767303467, + "logits/rejected": -2.566413402557373, + "logps/chosen": -2.435213565826416, + "logps/rejected": -2.9507038593292236, + "loss": 2.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.35213851928711, + "rewards/margins": 5.154902458190918, + "rewards/rejected": -29.50704002380371, + "step": 18880 + }, + { + "epoch": 0.636522970103475, + "grad_norm": 14.269989013671875, + "learning_rate": 3.512765549721581e-07, + "logits/chosen": -1.5212559700012207, + "logits/rejected": -1.7017109394073486, + "logps/chosen": -1.7232303619384766, + "logps/rejected": -1.922149658203125, + "loss": 2.4591, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.232303619384766, + "rewards/margins": 1.989192247390747, + "rewards/rejected": -19.221494674682617, + "step": 18885 + }, + { + "epoch": 0.6366914961744582, + "grad_norm": 23.027902603149414, + "learning_rate": 3.5099575915089307e-07, + "logits/chosen": -1.8852980136871338, + "logits/rejected": -2.2309741973876953, + "logps/chosen": -2.5858306884765625, + "logps/rejected": -2.9226760864257812, + "loss": 2.2128, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.858306884765625, + "rewards/margins": 3.3684539794921875, + "rewards/rejected": -29.226760864257812, + "step": 18890 + }, + { + "epoch": 0.6368600222454414, + "grad_norm": 57.29917907714844, + "learning_rate": 3.507150148941255e-07, + "logits/chosen": -1.6230862140655518, + "logits/rejected": -2.040215015411377, + "logps/chosen": -1.9775832891464233, + "logps/rejected": -2.1819052696228027, + "loss": 2.2303, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.775833129882812, + "rewards/margins": 2.0432209968566895, + "rewards/rejected": -21.819053649902344, + "step": 18895 + }, + { + "epoch": 0.6370285483164245, + "grad_norm": 62.11034393310547, + "learning_rate": 3.5043432229900946e-07, + "logits/chosen": -1.6436758041381836, + "logits/rejected": -2.031874179840088, + "logps/chosen": -1.860642671585083, + "logps/rejected": -1.9398397207260132, + "loss": 2.9126, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.606426239013672, + "rewards/margins": 0.7919692993164062, + "rewards/rejected": -19.398395538330078, + "step": 18900 + }, + { + "epoch": 0.6371970743874077, + "grad_norm": 253.6606903076172, + "learning_rate": 3.5015368146268186e-07, + "logits/chosen": -1.4768943786621094, + "logits/rejected": -1.5061800479888916, + "logps/chosen": -2.92598295211792, + "logps/rejected": -3.0826478004455566, + "loss": 3.2916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.25983238220215, + "rewards/margins": 1.566645860671997, + "rewards/rejected": -30.82647705078125, + "step": 18905 + }, + { + "epoch": 0.637365600458391, + "grad_norm": 38.190616607666016, + "learning_rate": 3.498730924822616e-07, + "logits/chosen": -1.4217101335525513, + "logits/rejected": -1.5524682998657227, + "logps/chosen": -2.2179980278015137, + "logps/rejected": -2.49385404586792, + "loss": 2.591, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.179981231689453, + "rewards/margins": 2.7585608959198, + "rewards/rejected": -24.938539505004883, + "step": 18910 + }, + { + "epoch": 0.6375341265293741, + "grad_norm": 14.970929145812988, + "learning_rate": 3.4959255545484907e-07, + "logits/chosen": -1.656989336013794, + "logits/rejected": -1.7032983303070068, + "logps/chosen": -1.9883617162704468, + "logps/rejected": -2.0293631553649902, + "loss": 3.119, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.883617401123047, + "rewards/margins": 0.41001588106155396, + "rewards/rejected": -20.29363441467285, + "step": 18915 + }, + { + "epoch": 0.6377026526003573, + "grad_norm": 106.11602020263672, + "learning_rate": 3.4931207047752725e-07, + "logits/chosen": -1.71733820438385, + "logits/rejected": -1.701205849647522, + "logps/chosen": -2.496513843536377, + "logps/rejected": -2.5461325645446777, + "loss": 2.7739, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.965137481689453, + "rewards/margins": 0.4961865544319153, + "rewards/rejected": -25.461322784423828, + "step": 18920 + }, + { + "epoch": 0.6378711786713405, + "grad_norm": 25.053218841552734, + "learning_rate": 3.49031637647361e-07, + "logits/chosen": -1.6383533477783203, + "logits/rejected": -2.0752038955688477, + "logps/chosen": -2.378575563430786, + "logps/rejected": -2.8636727333068848, + "loss": 2.3228, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.785757064819336, + "rewards/margins": 4.850972652435303, + "rewards/rejected": -28.636730194091797, + "step": 18925 + }, + { + "epoch": 0.6380397047423236, + "grad_norm": 26.07686996459961, + "learning_rate": 3.487512570613971e-07, + "logits/chosen": -1.9094693660736084, + "logits/rejected": -2.106689453125, + "logps/chosen": -2.4322314262390137, + "logps/rejected": -3.324227809906006, + "loss": 1.8016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.322317123413086, + "rewards/margins": 8.919961929321289, + "rewards/rejected": -33.242279052734375, + "step": 18930 + }, + { + "epoch": 0.6382082308133068, + "grad_norm": 19.347030639648438, + "learning_rate": 3.484709288166641e-07, + "logits/chosen": -1.3109431266784668, + "logits/rejected": -1.3125630617141724, + "logps/chosen": -2.019881010055542, + "logps/rejected": -2.2754335403442383, + "loss": 2.3485, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.198810577392578, + "rewards/margins": 2.555525302886963, + "rewards/rejected": -22.754335403442383, + "step": 18935 + }, + { + "epoch": 0.63837675688429, + "grad_norm": 24.105995178222656, + "learning_rate": 3.481906530101726e-07, + "logits/chosen": -1.6839663982391357, + "logits/rejected": -1.6865516901016235, + "logps/chosen": -3.2025482654571533, + "logps/rejected": -3.766237735748291, + "loss": 2.4375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -32.02547836303711, + "rewards/margins": 5.636894226074219, + "rewards/rejected": -37.66237258911133, + "step": 18940 + }, + { + "epoch": 0.6385452829552731, + "grad_norm": 30.606367111206055, + "learning_rate": 3.4791042973891524e-07, + "logits/chosen": -2.0236551761627197, + "logits/rejected": -2.570330858230591, + "logps/chosen": -3.1450514793395996, + "logps/rejected": -4.160052299499512, + "loss": 1.957, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.450511932373047, + "rewards/margins": 10.15001106262207, + "rewards/rejected": -41.60052490234375, + "step": 18945 + }, + { + "epoch": 0.6387138090262564, + "grad_norm": 49.75942611694336, + "learning_rate": 3.476302590998659e-07, + "logits/chosen": -2.274714469909668, + "logits/rejected": -2.318877935409546, + "logps/chosen": -2.3748977184295654, + "logps/rejected": -2.564833164215088, + "loss": 3.0908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.74897575378418, + "rewards/margins": 1.899355173110962, + "rewards/rejected": -25.648334503173828, + "step": 18950 + }, + { + "epoch": 0.6388823350972396, + "grad_norm": 31.032297134399414, + "learning_rate": 3.4735014118998073e-07, + "logits/chosen": -1.6689374446868896, + "logits/rejected": -1.5681211948394775, + "logps/chosen": -2.400958299636841, + "logps/rejected": -2.3238704204559326, + "loss": 4.0307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.009584426879883, + "rewards/margins": -0.7708789110183716, + "rewards/rejected": -23.238704681396484, + "step": 18955 + }, + { + "epoch": 0.6390508611682227, + "grad_norm": 91.0964584350586, + "learning_rate": 3.4707007610619777e-07, + "logits/chosen": -1.6875574588775635, + "logits/rejected": -1.9102585315704346, + "logps/chosen": -2.6997759342193604, + "logps/rejected": -3.2260589599609375, + "loss": 2.3887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.997756958007812, + "rewards/margins": 5.2628350257873535, + "rewards/rejected": -32.26059341430664, + "step": 18960 + }, + { + "epoch": 0.6392193872392059, + "grad_norm": 118.5235824584961, + "learning_rate": 3.4679006394543606e-07, + "logits/chosen": -1.7751781940460205, + "logits/rejected": -1.6583400964736938, + "logps/chosen": -2.2704877853393555, + "logps/rejected": -2.3859403133392334, + "loss": 4.3287, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.704877853393555, + "rewards/margins": 1.154525637626648, + "rewards/rejected": -23.859405517578125, + "step": 18965 + }, + { + "epoch": 0.6393879133101891, + "grad_norm": 38.85346221923828, + "learning_rate": 3.4651010480459697e-07, + "logits/chosen": -1.6581932306289673, + "logits/rejected": -1.7150039672851562, + "logps/chosen": -2.4210574626922607, + "logps/rejected": -2.385411500930786, + "loss": 3.6377, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.210575103759766, + "rewards/margins": -0.3564607501029968, + "rewards/rejected": -23.854114532470703, + "step": 18970 + }, + { + "epoch": 0.6395564393811722, + "grad_norm": 38.01725769042969, + "learning_rate": 3.462301987805634e-07, + "logits/chosen": -1.2921791076660156, + "logits/rejected": -1.5788782835006714, + "logps/chosen": -2.8870227336883545, + "logps/rejected": -3.223814010620117, + "loss": 3.081, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.870227813720703, + "rewards/margins": 3.3679096698760986, + "rewards/rejected": -32.238136291503906, + "step": 18975 + }, + { + "epoch": 0.6397249654521554, + "grad_norm": 83.44391632080078, + "learning_rate": 3.459503459701998e-07, + "logits/chosen": -1.817146897315979, + "logits/rejected": -2.30041766166687, + "logps/chosen": -2.1918838024139404, + "logps/rejected": -2.373908281326294, + "loss": 1.9129, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.918842315673828, + "rewards/margins": 1.8202422857284546, + "rewards/rejected": -23.73908233642578, + "step": 18980 + }, + { + "epoch": 0.6398934915231387, + "grad_norm": 35.81073760986328, + "learning_rate": 3.456705464703521e-07, + "logits/chosen": -1.5047098398208618, + "logits/rejected": -1.621691107749939, + "logps/chosen": -1.9369847774505615, + "logps/rejected": -2.070429563522339, + "loss": 2.3392, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.369848251342773, + "rewards/margins": 1.3344460725784302, + "rewards/rejected": -20.704296112060547, + "step": 18985 + }, + { + "epoch": 0.6400620175941218, + "grad_norm": 69.3550796508789, + "learning_rate": 3.4539080037784783e-07, + "logits/chosen": -1.557337999343872, + "logits/rejected": -1.6153488159179688, + "logps/chosen": -2.2617194652557373, + "logps/rejected": -2.4085395336151123, + "loss": 3.6134, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.617197036743164, + "rewards/margins": 1.468200922012329, + "rewards/rejected": -24.085397720336914, + "step": 18990 + }, + { + "epoch": 0.640230543665105, + "grad_norm": 36.558536529541016, + "learning_rate": 3.451111077894963e-07, + "logits/chosen": -1.361604928970337, + "logits/rejected": -1.8662503957748413, + "logps/chosen": -2.5303022861480713, + "logps/rejected": -2.9746086597442627, + "loss": 2.3299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.303020477294922, + "rewards/margins": 4.443061828613281, + "rewards/rejected": -29.746084213256836, + "step": 18995 + }, + { + "epoch": 0.6403990697360882, + "grad_norm": 42.665504455566406, + "learning_rate": 3.448314688020879e-07, + "logits/chosen": -1.1075172424316406, + "logits/rejected": -0.9852533340454102, + "logps/chosen": -2.483982801437378, + "logps/rejected": -2.329929828643799, + "loss": 4.841, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.839826583862305, + "rewards/margins": -1.540529489517212, + "rewards/rejected": -23.299297332763672, + "step": 19000 + }, + { + "epoch": 0.6405675958070713, + "grad_norm": 19.328079223632812, + "learning_rate": 3.445518835123948e-07, + "logits/chosen": -1.4079978466033936, + "logits/rejected": -1.3210872411727905, + "logps/chosen": -2.058558225631714, + "logps/rejected": -1.71990168094635, + "loss": 6.4632, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.585582733154297, + "rewards/margins": -3.386566638946533, + "rewards/rejected": -17.199016571044922, + "step": 19005 + }, + { + "epoch": 0.6407361218780545, + "grad_norm": 41.33991622924805, + "learning_rate": 3.442723520171703e-07, + "logits/chosen": -2.0147464275360107, + "logits/rejected": -1.8899329900741577, + "logps/chosen": -2.5618722438812256, + "logps/rejected": -2.6130149364471436, + "loss": 3.2193, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.618722915649414, + "rewards/margins": 0.5114272236824036, + "rewards/rejected": -26.130151748657227, + "step": 19010 + }, + { + "epoch": 0.6409046479490377, + "grad_norm": 17.656309127807617, + "learning_rate": 3.439928744131497e-07, + "logits/chosen": -1.745766043663025, + "logits/rejected": -1.926134467124939, + "logps/chosen": -2.9451346397399902, + "logps/rejected": -3.5164706707000732, + "loss": 2.238, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.45134925842285, + "rewards/margins": 5.713357925415039, + "rewards/rejected": -35.164703369140625, + "step": 19015 + }, + { + "epoch": 0.6410731740200208, + "grad_norm": 90.32007598876953, + "learning_rate": 3.437134507970485e-07, + "logits/chosen": -1.8085434436798096, + "logits/rejected": -1.9374297857284546, + "logps/chosen": -2.5853800773620605, + "logps/rejected": -2.4945881366729736, + "loss": 4.7094, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.853801727294922, + "rewards/margins": -0.9079216122627258, + "rewards/rejected": -24.94588279724121, + "step": 19020 + }, + { + "epoch": 0.6412417000910041, + "grad_norm": 20.479576110839844, + "learning_rate": 3.4343408126556455e-07, + "logits/chosen": -1.6717798709869385, + "logits/rejected": -1.769441843032837, + "logps/chosen": -1.9906914234161377, + "logps/rejected": -2.4700753688812256, + "loss": 1.6959, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.90691375732422, + "rewards/margins": 4.793841361999512, + "rewards/rejected": -24.700756072998047, + "step": 19025 + }, + { + "epoch": 0.6414102261619873, + "grad_norm": 35.265438079833984, + "learning_rate": 3.4315476591537683e-07, + "logits/chosen": -1.5250223875045776, + "logits/rejected": -1.7313295602798462, + "logps/chosen": -2.3894386291503906, + "logps/rejected": -2.6715071201324463, + "loss": 3.4896, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.894384384155273, + "rewards/margins": 2.8206865787506104, + "rewards/rejected": -26.715068817138672, + "step": 19030 + }, + { + "epoch": 0.6415787522329705, + "grad_norm": 69.1385269165039, + "learning_rate": 3.4287550484314497e-07, + "logits/chosen": -1.355473518371582, + "logits/rejected": -1.6162292957305908, + "logps/chosen": -2.0230188369750977, + "logps/rejected": -2.365042209625244, + "loss": 2.5705, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.23019027709961, + "rewards/margins": 3.420231580734253, + "rewards/rejected": -23.650421142578125, + "step": 19035 + }, + { + "epoch": 0.6417472783039536, + "grad_norm": 47.425376892089844, + "learning_rate": 3.425962981455105e-07, + "logits/chosen": -1.7746198177337646, + "logits/rejected": -1.81686532497406, + "logps/chosen": -2.368058681488037, + "logps/rejected": -2.510368585586548, + "loss": 2.9706, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.680587768554688, + "rewards/margins": 1.423097848892212, + "rewards/rejected": -25.10368537902832, + "step": 19040 + }, + { + "epoch": 0.6419158043749368, + "grad_norm": 52.12995529174805, + "learning_rate": 3.4231714591909573e-07, + "logits/chosen": -1.6075427532196045, + "logits/rejected": -1.6615245342254639, + "logps/chosen": -1.8489784002304077, + "logps/rejected": -1.945669412612915, + "loss": 2.9096, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.489784240722656, + "rewards/margins": 0.9669092893600464, + "rewards/rejected": -19.456695556640625, + "step": 19045 + }, + { + "epoch": 0.64208433044592, + "grad_norm": 27.588619232177734, + "learning_rate": 3.420380482605045e-07, + "logits/chosen": -1.9126943349838257, + "logits/rejected": -1.8093080520629883, + "logps/chosen": -2.4399096965789795, + "logps/rejected": -3.0001025199890137, + "loss": 2.9082, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.399097442626953, + "rewards/margins": 5.6019287109375, + "rewards/rejected": -30.001026153564453, + "step": 19050 + }, + { + "epoch": 0.6422528565169031, + "grad_norm": 90.55980682373047, + "learning_rate": 3.417590052663211e-07, + "logits/chosen": -2.1148598194122314, + "logits/rejected": -2.1183550357818604, + "logps/chosen": -2.115551710128784, + "logps/rejected": -2.486673593521118, + "loss": 2.0638, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.155513763427734, + "rewards/margins": 3.7112205028533936, + "rewards/rejected": -24.866735458374023, + "step": 19055 + }, + { + "epoch": 0.6424213825878864, + "grad_norm": 18.421188354492188, + "learning_rate": 3.414800170331116e-07, + "logits/chosen": -1.5110455751419067, + "logits/rejected": -1.6215041875839233, + "logps/chosen": -1.7555958032608032, + "logps/rejected": -1.8505535125732422, + "loss": 2.9856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.555957794189453, + "rewards/margins": 0.9495766758918762, + "rewards/rejected": -18.505535125732422, + "step": 19060 + }, + { + "epoch": 0.6425899086588696, + "grad_norm": 24.81529998779297, + "learning_rate": 3.4120108365742274e-07, + "logits/chosen": -1.4183194637298584, + "logits/rejected": -1.4200835227966309, + "logps/chosen": -2.245863437652588, + "logps/rejected": -2.2999191284179688, + "loss": 2.7156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.458633422851562, + "rewards/margins": 0.5405560731887817, + "rewards/rejected": -22.999189376831055, + "step": 19065 + }, + { + "epoch": 0.6427584347298527, + "grad_norm": 18.90172576904297, + "learning_rate": 3.4092220523578244e-07, + "logits/chosen": -1.430742859840393, + "logits/rejected": -1.766953468322754, + "logps/chosen": -2.45387601852417, + "logps/rejected": -2.677732467651367, + "loss": 2.8874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.538759231567383, + "rewards/margins": 2.23856520652771, + "rewards/rejected": -26.777324676513672, + "step": 19070 + }, + { + "epoch": 0.6429269608008359, + "grad_norm": 93.47918701171875, + "learning_rate": 3.406433818646993e-07, + "logits/chosen": -1.84613835811615, + "logits/rejected": -2.163917064666748, + "logps/chosen": -2.7454466819763184, + "logps/rejected": -3.1951904296875, + "loss": 2.0116, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.4544677734375, + "rewards/margins": 4.497437477111816, + "rewards/rejected": -31.951904296875, + "step": 19075 + }, + { + "epoch": 0.643095486871819, + "grad_norm": 256.0754699707031, + "learning_rate": 3.403646136406636e-07, + "logits/chosen": -1.865033745765686, + "logits/rejected": -1.7697813510894775, + "logps/chosen": -2.9825804233551025, + "logps/rejected": -3.022819995880127, + "loss": 4.0846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.8258056640625, + "rewards/margins": 0.4023931622505188, + "rewards/rejected": -30.228199005126953, + "step": 19080 + }, + { + "epoch": 0.6432640129428022, + "grad_norm": 52.34347915649414, + "learning_rate": 3.4008590066014564e-07, + "logits/chosen": -1.9278236627578735, + "logits/rejected": -2.2672438621520996, + "logps/chosen": -3.0003390312194824, + "logps/rejected": -3.2712783813476562, + "loss": 3.44, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.003393173217773, + "rewards/margins": 2.7093968391418457, + "rewards/rejected": -32.712791442871094, + "step": 19085 + }, + { + "epoch": 0.6434325390137854, + "grad_norm": 18.355587005615234, + "learning_rate": 3.3980724301959704e-07, + "logits/chosen": -1.8786852359771729, + "logits/rejected": -2.1181275844573975, + "logps/chosen": -1.9924042224884033, + "logps/rejected": -2.353553295135498, + "loss": 2.2139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.924043655395508, + "rewards/margins": 3.611489772796631, + "rewards/rejected": -23.535533905029297, + "step": 19090 + }, + { + "epoch": 0.6436010650847687, + "grad_norm": 28.176570892333984, + "learning_rate": 3.3952864081545017e-07, + "logits/chosen": -1.975188970565796, + "logits/rejected": -2.2873916625976562, + "logps/chosen": -1.818352460861206, + "logps/rejected": -2.311967134475708, + "loss": 2.5429, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.18352699279785, + "rewards/margins": 4.936144828796387, + "rewards/rejected": -23.119670867919922, + "step": 19095 + }, + { + "epoch": 0.6437695911557518, + "grad_norm": 18.773603439331055, + "learning_rate": 3.392500941441188e-07, + "logits/chosen": -1.4203368425369263, + "logits/rejected": -2.007047653198242, + "logps/chosen": -2.115145206451416, + "logps/rejected": -2.8709805011749268, + "loss": 1.8211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.15144920349121, + "rewards/margins": 7.558354377746582, + "rewards/rejected": -28.709802627563477, + "step": 19100 + }, + { + "epoch": 0.643938117226735, + "grad_norm": 31.940587997436523, + "learning_rate": 3.389716031019962e-07, + "logits/chosen": -0.8345028758049011, + "logits/rejected": -0.8378894925117493, + "logps/chosen": -2.056838274002075, + "logps/rejected": -2.223741054534912, + "loss": 3.4076, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.568382263183594, + "rewards/margins": 1.6690285205841064, + "rewards/rejected": -22.237411499023438, + "step": 19105 + }, + { + "epoch": 0.6441066432977182, + "grad_norm": 10.822291374206543, + "learning_rate": 3.3869316778545754e-07, + "logits/chosen": -2.2176527976989746, + "logits/rejected": -2.4825215339660645, + "logps/chosen": -2.1370999813079834, + "logps/rejected": -2.312455654144287, + "loss": 2.7543, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.37099838256836, + "rewards/margins": 1.7535556554794312, + "rewards/rejected": -23.124553680419922, + "step": 19110 + }, + { + "epoch": 0.6442751693687013, + "grad_norm": 29.39801788330078, + "learning_rate": 3.384147882908582e-07, + "logits/chosen": -1.725995659828186, + "logits/rejected": -2.026106834411621, + "logps/chosen": -1.9508934020996094, + "logps/rejected": -2.574272871017456, + "loss": 1.511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.508935928344727, + "rewards/margins": 6.233796119689941, + "rewards/rejected": -25.74273109436035, + "step": 19115 + }, + { + "epoch": 0.6444436954396845, + "grad_norm": 27.90772247314453, + "learning_rate": 3.3813646471453473e-07, + "logits/chosen": -1.8614917993545532, + "logits/rejected": -1.9006048440933228, + "logps/chosen": -2.628399610519409, + "logps/rejected": -2.999600410461426, + "loss": 2.4409, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.28399658203125, + "rewards/margins": 3.712005615234375, + "rewards/rejected": -29.996002197265625, + "step": 19120 + }, + { + "epoch": 0.6446122215106677, + "grad_norm": 21.94499397277832, + "learning_rate": 3.3785819715280343e-07, + "logits/chosen": -1.3559751510620117, + "logits/rejected": -1.900770902633667, + "logps/chosen": -2.3102097511291504, + "logps/rejected": -2.738304853439331, + "loss": 1.551, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.10209846496582, + "rewards/margins": 4.2809529304504395, + "rewards/rejected": -27.3830509185791, + "step": 19125 + }, + { + "epoch": 0.6447807475816508, + "grad_norm": 60.39374923706055, + "learning_rate": 3.37579985701962e-07, + "logits/chosen": -2.079019069671631, + "logits/rejected": -2.3765835762023926, + "logps/chosen": -1.9837615489959717, + "logps/rejected": -2.609318494796753, + "loss": 1.6134, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.837615966796875, + "rewards/margins": 6.255568504333496, + "rewards/rejected": -26.093181610107422, + "step": 19130 + }, + { + "epoch": 0.6449492736526341, + "grad_norm": 36.971927642822266, + "learning_rate": 3.373018304582884e-07, + "logits/chosen": -1.6064996719360352, + "logits/rejected": -1.6553875207901, + "logps/chosen": -2.9100847244262695, + "logps/rejected": -2.7735984325408936, + "loss": 4.6983, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.100849151611328, + "rewards/margins": -1.3648655414581299, + "rewards/rejected": -27.73598289489746, + "step": 19135 + }, + { + "epoch": 0.6451177997236173, + "grad_norm": 76.00817108154297, + "learning_rate": 3.3702373151804124e-07, + "logits/chosen": -1.5281689167022705, + "logits/rejected": -1.7328161001205444, + "logps/chosen": -2.793159246444702, + "logps/rejected": -3.744621753692627, + "loss": 2.5595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.931591033935547, + "rewards/margins": 9.514626502990723, + "rewards/rejected": -37.44622039794922, + "step": 19140 + }, + { + "epoch": 0.6452863257946004, + "grad_norm": 36.74580764770508, + "learning_rate": 3.367456889774597e-07, + "logits/chosen": -2.091965675354004, + "logits/rejected": -2.1845428943634033, + "logps/chosen": -2.186652660369873, + "logps/rejected": -2.5208215713500977, + "loss": 2.1896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.86652946472168, + "rewards/margins": 3.3416855335235596, + "rewards/rejected": -25.208215713500977, + "step": 19145 + }, + { + "epoch": 0.6454548518655836, + "grad_norm": 78.41200256347656, + "learning_rate": 3.3646770293276303e-07, + "logits/chosen": -1.791298270225525, + "logits/rejected": -1.397806167602539, + "logps/chosen": -2.0304665565490723, + "logps/rejected": -2.0542068481445312, + "loss": 3.3975, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.30466651916504, + "rewards/margins": 0.2374025285243988, + "rewards/rejected": -20.542068481445312, + "step": 19150 + }, + { + "epoch": 0.6456233779365668, + "grad_norm": 38.56045150756836, + "learning_rate": 3.3618977348015166e-07, + "logits/chosen": -1.378115177154541, + "logits/rejected": -1.7747703790664673, + "logps/chosen": -1.6445846557617188, + "logps/rejected": -1.9529813528060913, + "loss": 1.9329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -16.44584846496582, + "rewards/margins": 3.0839669704437256, + "rewards/rejected": -19.529815673828125, + "step": 19155 + }, + { + "epoch": 0.6457919040075499, + "grad_norm": 39.86232376098633, + "learning_rate": 3.3591190071580574e-07, + "logits/chosen": -1.5744518041610718, + "logits/rejected": -1.954493761062622, + "logps/chosen": -1.6901108026504517, + "logps/rejected": -2.289529323577881, + "loss": 1.4474, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.90110969543457, + "rewards/margins": 5.9941816329956055, + "rewards/rejected": -22.89529037475586, + "step": 19160 + }, + { + "epoch": 0.6459604300785331, + "grad_norm": 20.42445945739746, + "learning_rate": 3.356340847358861e-07, + "logits/chosen": -1.4461759328842163, + "logits/rejected": -1.6852458715438843, + "logps/chosen": -3.33441424369812, + "logps/rejected": -3.3620505332946777, + "loss": 3.4576, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -33.34414291381836, + "rewards/margins": 0.27636146545410156, + "rewards/rejected": -33.620506286621094, + "step": 19165 + }, + { + "epoch": 0.6461289561495164, + "grad_norm": 25.505706787109375, + "learning_rate": 3.353563256365342e-07, + "logits/chosen": -1.3414726257324219, + "logits/rejected": -1.9695316553115845, + "logps/chosen": -1.883315086364746, + "logps/rejected": -2.6686549186706543, + "loss": 1.8392, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.833148956298828, + "rewards/margins": 7.853400230407715, + "rewards/rejected": -26.686548233032227, + "step": 19170 + }, + { + "epoch": 0.6462974822204995, + "grad_norm": 55.38985061645508, + "learning_rate": 3.350786235138711e-07, + "logits/chosen": -1.4357484579086304, + "logits/rejected": -2.1208090782165527, + "logps/chosen": -1.828082799911499, + "logps/rejected": -2.372054100036621, + "loss": 2.4524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.280826568603516, + "rewards/margins": 5.4397125244140625, + "rewards/rejected": -23.720539093017578, + "step": 19175 + }, + { + "epoch": 0.6464660082914827, + "grad_norm": 37.25720977783203, + "learning_rate": 3.348009784639988e-07, + "logits/chosen": -1.5288727283477783, + "logits/rejected": -2.281703233718872, + "logps/chosen": -2.1740665435791016, + "logps/rejected": -3.175799608230591, + "loss": 2.1291, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.740665435791016, + "rewards/margins": 10.017329216003418, + "rewards/rejected": -31.757991790771484, + "step": 19180 + }, + { + "epoch": 0.6466345343624659, + "grad_norm": 32.539100646972656, + "learning_rate": 3.3452339058299914e-07, + "logits/chosen": -1.4856078624725342, + "logits/rejected": -1.3350521326065063, + "logps/chosen": -2.816305160522461, + "logps/rejected": -2.017688751220703, + "loss": 11.0178, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -28.16305160522461, + "rewards/margins": -7.986166477203369, + "rewards/rejected": -20.176883697509766, + "step": 19185 + }, + { + "epoch": 0.646803060433449, + "grad_norm": 61.53752517700195, + "learning_rate": 3.3424585996693483e-07, + "logits/chosen": -1.2890323400497437, + "logits/rejected": -1.2937710285186768, + "logps/chosen": -2.464566707611084, + "logps/rejected": -2.404175281524658, + "loss": 6.1059, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.64566993713379, + "rewards/margins": -0.6039150357246399, + "rewards/rejected": -24.0417537689209, + "step": 19190 + }, + { + "epoch": 0.6469715865044322, + "grad_norm": 17.450136184692383, + "learning_rate": 3.339683867118477e-07, + "logits/chosen": -1.4439754486083984, + "logits/rejected": -1.957381010055542, + "logps/chosen": -3.2206637859344482, + "logps/rejected": -3.8896000385284424, + "loss": 1.3379, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.206642150878906, + "rewards/margins": 6.689364433288574, + "rewards/rejected": -38.89600372314453, + "step": 19195 + }, + { + "epoch": 0.6471401125754154, + "grad_norm": 25.236295700073242, + "learning_rate": 3.3369097091376045e-07, + "logits/chosen": -1.224825143814087, + "logits/rejected": -1.328102946281433, + "logps/chosen": -2.1852471828460693, + "logps/rejected": -2.2270686626434326, + "loss": 3.18, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.852474212646484, + "rewards/margins": 0.41821593046188354, + "rewards/rejected": -22.270687103271484, + "step": 19200 + }, + { + "epoch": 0.6471401125754154, + "eval_logits/chosen": -2.086235761642456, + "eval_logits/rejected": -2.2461965084075928, + "eval_logps/chosen": -2.1888742446899414, + "eval_logps/rejected": -2.3300468921661377, + "eval_loss": 3.0306525230407715, + "eval_rewards/accuracies": 0.6200000047683716, + "eval_rewards/chosen": -21.88874053955078, + "eval_rewards/margins": 1.4117244482040405, + "eval_rewards/rejected": -23.30046844482422, + "eval_runtime": 12.8872, + "eval_samples_per_second": 7.76, + "eval_steps_per_second": 1.94, + "step": 19200 + }, + { + "epoch": 0.6473086386463986, + "grad_norm": 30.1646785736084, + "learning_rate": 3.3341361266867607e-07, + "logits/chosen": -1.3984628915786743, + "logits/rejected": -1.7773818969726562, + "logps/chosen": -1.8794208765029907, + "logps/rejected": -2.1332011222839355, + "loss": 2.7122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.79421043395996, + "rewards/margins": 2.5378024578094482, + "rewards/rejected": -21.33201026916504, + "step": 19205 + }, + { + "epoch": 0.6474771647173818, + "grad_norm": 9.875208854675293, + "learning_rate": 3.33136312072577e-07, + "logits/chosen": -1.5803048610687256, + "logits/rejected": -1.8212120532989502, + "logps/chosen": -1.9327504634857178, + "logps/rejected": -1.9376983642578125, + "loss": 3.4788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.327505111694336, + "rewards/margins": 0.04947824403643608, + "rewards/rejected": -19.376985549926758, + "step": 19210 + }, + { + "epoch": 0.647645690788365, + "grad_norm": 22.80118751525879, + "learning_rate": 3.3285906922142615e-07, + "logits/chosen": -1.0638067722320557, + "logits/rejected": -1.1562116146087646, + "logps/chosen": -2.210082530975342, + "logps/rejected": -2.3568356037139893, + "loss": 1.9867, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.100826263427734, + "rewards/margins": 1.4675300121307373, + "rewards/rejected": -23.568355560302734, + "step": 19215 + }, + { + "epoch": 0.6478142168593481, + "grad_norm": 48.995601654052734, + "learning_rate": 3.325818842111663e-07, + "logits/chosen": -1.4287619590759277, + "logits/rejected": -1.3261592388153076, + "logps/chosen": -2.314736843109131, + "logps/rejected": -2.5384268760681152, + "loss": 3.3182, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.147369384765625, + "rewards/margins": 2.2368996143341064, + "rewards/rejected": -25.384267807006836, + "step": 19220 + }, + { + "epoch": 0.6479827429303313, + "grad_norm": 5.7302073400933295e-05, + "learning_rate": 3.3230475713772044e-07, + "logits/chosen": -1.295851230621338, + "logits/rejected": -1.7130746841430664, + "logps/chosen": -2.718839406967163, + "logps/rejected": -3.571959972381592, + "loss": 2.3557, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.188396453857422, + "rewards/margins": 8.531206130981445, + "rewards/rejected": -35.719600677490234, + "step": 19225 + }, + { + "epoch": 0.6481512690013145, + "grad_norm": 45.10464096069336, + "learning_rate": 3.3202768809699106e-07, + "logits/chosen": -1.7688840627670288, + "logits/rejected": -1.9455190896987915, + "logps/chosen": -2.6712350845336914, + "logps/rejected": -2.82537841796875, + "loss": 2.169, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.712352752685547, + "rewards/margins": 1.541433334350586, + "rewards/rejected": -28.2537841796875, + "step": 19230 + }, + { + "epoch": 0.6483197950722976, + "grad_norm": 10.823365211486816, + "learning_rate": 3.3175067718486103e-07, + "logits/chosen": -1.8669370412826538, + "logits/rejected": -2.175741672515869, + "logps/chosen": -2.35686993598938, + "logps/rejected": -3.105584144592285, + "loss": 1.8598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.56869888305664, + "rewards/margins": 7.487143516540527, + "rewards/rejected": -31.05584144592285, + "step": 19235 + }, + { + "epoch": 0.6484883211432808, + "grad_norm": 68.01353454589844, + "learning_rate": 3.3147372449719304e-07, + "logits/chosen": -1.0006816387176514, + "logits/rejected": -1.3833884000778198, + "logps/chosen": -2.709277629852295, + "logps/rejected": -3.23040509223938, + "loss": 1.3677, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.0927734375, + "rewards/margins": 5.211273670196533, + "rewards/rejected": -32.304046630859375, + "step": 19240 + }, + { + "epoch": 0.6486568472142641, + "grad_norm": 25.578697204589844, + "learning_rate": 3.311968301298291e-07, + "logits/chosen": -1.8713645935058594, + "logits/rejected": -1.9751352071762085, + "logps/chosen": -2.5791544914245605, + "logps/rejected": -2.552963972091675, + "loss": 4.1632, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.79154396057129, + "rewards/margins": -0.2619051933288574, + "rewards/rejected": -25.529638290405273, + "step": 19245 + }, + { + "epoch": 0.6488253732852473, + "grad_norm": 18.423656463623047, + "learning_rate": 3.3091999417859174e-07, + "logits/chosen": -1.3926771879196167, + "logits/rejected": -1.9350593090057373, + "logps/chosen": -2.724782943725586, + "logps/rejected": -3.3932957649230957, + "loss": 1.9378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.24782943725586, + "rewards/margins": 6.685128211975098, + "rewards/rejected": -33.932960510253906, + "step": 19250 + }, + { + "epoch": 0.6489938993562304, + "grad_norm": 27.966829299926758, + "learning_rate": 3.306432167392829e-07, + "logits/chosen": -1.7937549352645874, + "logits/rejected": -1.6492153406143188, + "logps/chosen": -2.1946823596954346, + "logps/rejected": -2.183840751647949, + "loss": 3.5634, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.946823120117188, + "rewards/margins": -0.10841550678014755, + "rewards/rejected": -21.83840560913086, + "step": 19255 + }, + { + "epoch": 0.6491624254272136, + "grad_norm": 32.890052795410156, + "learning_rate": 3.3036649790768454e-07, + "logits/chosen": -1.5309436321258545, + "logits/rejected": -1.5625251531600952, + "logps/chosen": -1.864524483680725, + "logps/rejected": -1.8880809545516968, + "loss": 2.9337, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.645244598388672, + "rewards/margins": 0.2355668991804123, + "rewards/rejected": -18.880809783935547, + "step": 19260 + }, + { + "epoch": 0.6493309514981968, + "grad_norm": 15.722186088562012, + "learning_rate": 3.300898377795578e-07, + "logits/chosen": -1.6200459003448486, + "logits/rejected": -1.7546627521514893, + "logps/chosen": -2.278107166290283, + "logps/rejected": -2.427480459213257, + "loss": 3.1703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.78107261657715, + "rewards/margins": 1.4937334060668945, + "rewards/rejected": -24.27480697631836, + "step": 19265 + }, + { + "epoch": 0.6494994775691799, + "grad_norm": 37.55821228027344, + "learning_rate": 3.2981323645064397e-07, + "logits/chosen": -1.5431878566741943, + "logits/rejected": -1.5832126140594482, + "logps/chosen": -1.9823036193847656, + "logps/rejected": -2.360731840133667, + "loss": 1.3685, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.82303810119629, + "rewards/margins": 3.784282684326172, + "rewards/rejected": -23.607318878173828, + "step": 19270 + }, + { + "epoch": 0.6496680036401631, + "grad_norm": 33.9102783203125, + "learning_rate": 3.2953669401666405e-07, + "logits/chosen": -1.7654139995574951, + "logits/rejected": -2.2431235313415527, + "logps/chosen": -3.234036922454834, + "logps/rejected": -3.5689334869384766, + "loss": 4.0651, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.340370178222656, + "rewards/margins": 3.348963499069214, + "rewards/rejected": -35.689334869384766, + "step": 19275 + }, + { + "epoch": 0.6498365297111464, + "grad_norm": 16.531702041625977, + "learning_rate": 3.292602105733182e-07, + "logits/chosen": -1.793821096420288, + "logits/rejected": -2.153486728668213, + "logps/chosen": -2.093123435974121, + "logps/rejected": -2.2675602436065674, + "loss": 3.106, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.931236267089844, + "rewards/margins": 1.7443650960922241, + "rewards/rejected": -22.675600051879883, + "step": 19280 + }, + { + "epoch": 0.6500050557821295, + "grad_norm": 29.99871063232422, + "learning_rate": 3.2898378621628663e-07, + "logits/chosen": -1.416312575340271, + "logits/rejected": -1.304660677909851, + "logps/chosen": -2.879304885864258, + "logps/rejected": -2.3571860790252686, + "loss": 8.2708, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.79305076599121, + "rewards/margins": -5.221189975738525, + "rewards/rejected": -23.57185935974121, + "step": 19285 + }, + { + "epoch": 0.6501735818531127, + "grad_norm": 76.10118865966797, + "learning_rate": 3.2870742104122885e-07, + "logits/chosen": -1.12397038936615, + "logits/rejected": -1.5121071338653564, + "logps/chosen": -2.4723703861236572, + "logps/rejected": -3.379451274871826, + "loss": 1.7656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.723703384399414, + "rewards/margins": 9.070809364318848, + "rewards/rejected": -33.79451370239258, + "step": 19290 + }, + { + "epoch": 0.6503421079240959, + "grad_norm": 0.30136585235595703, + "learning_rate": 3.2843111514378406e-07, + "logits/chosen": -1.2250374555587769, + "logits/rejected": -1.424181342124939, + "logps/chosen": -3.2688241004943848, + "logps/rejected": -3.828373670578003, + "loss": 3.0649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.68824005126953, + "rewards/margins": 5.595494270324707, + "rewards/rejected": -38.28373336791992, + "step": 19295 + }, + { + "epoch": 0.650510633995079, + "grad_norm": 21.783082962036133, + "learning_rate": 3.2815486861957073e-07, + "logits/chosen": -1.941187858581543, + "logits/rejected": -1.7948150634765625, + "logps/chosen": -2.5010275840759277, + "logps/rejected": -2.561431407928467, + "loss": 3.7398, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.010276794433594, + "rewards/margins": 0.6040407419204712, + "rewards/rejected": -25.614315032958984, + "step": 19300 + }, + { + "epoch": 0.6506791600660622, + "grad_norm": 204.24371337890625, + "learning_rate": 3.2787868156418697e-07, + "logits/chosen": -1.5785753726959229, + "logits/rejected": -1.9905147552490234, + "logps/chosen": -2.33290696144104, + "logps/rejected": -3.1187796592712402, + "loss": 2.1347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.329069137573242, + "rewards/margins": 7.858725070953369, + "rewards/rejected": -31.18779945373535, + "step": 19305 + }, + { + "epoch": 0.6508476861370454, + "grad_norm": 61.92455291748047, + "learning_rate": 3.276025540732104e-07, + "logits/chosen": -1.7628555297851562, + "logits/rejected": -1.9517265558242798, + "logps/chosen": -2.293363332748413, + "logps/rejected": -2.615460157394409, + "loss": 2.8875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.933635711669922, + "rewards/margins": 3.2209677696228027, + "rewards/rejected": -26.15460205078125, + "step": 19310 + }, + { + "epoch": 0.6510162122080286, + "grad_norm": 28.794397354125977, + "learning_rate": 3.273264862421974e-07, + "logits/chosen": -1.9501034021377563, + "logits/rejected": -2.310251235961914, + "logps/chosen": -2.6617469787597656, + "logps/rejected": -2.8715603351593018, + "loss": 3.1637, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.617467880249023, + "rewards/margins": 2.0981335639953613, + "rewards/rejected": -28.71560287475586, + "step": 19315 + }, + { + "epoch": 0.6511847382790118, + "grad_norm": 55.15589904785156, + "learning_rate": 3.270504781666845e-07, + "logits/chosen": -1.3807077407836914, + "logits/rejected": -1.6009747982025146, + "logps/chosen": -2.3612663745880127, + "logps/rejected": -2.615450382232666, + "loss": 3.8383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.6126651763916, + "rewards/margins": 2.5418403148651123, + "rewards/rejected": -26.154504776000977, + "step": 19320 + }, + { + "epoch": 0.651353264349995, + "grad_norm": 28.586013793945312, + "learning_rate": 3.267745299421871e-07, + "logits/chosen": -1.59463369846344, + "logits/rejected": -1.751935601234436, + "logps/chosen": -2.453174114227295, + "logps/rejected": -2.8395793437957764, + "loss": 3.0157, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.531740188598633, + "rewards/margins": 3.864053726196289, + "rewards/rejected": -28.395793914794922, + "step": 19325 + }, + { + "epoch": 0.6515217904209781, + "grad_norm": 17.22910499572754, + "learning_rate": 3.2649864166420037e-07, + "logits/chosen": -1.5936510562896729, + "logits/rejected": -1.992539405822754, + "logps/chosen": -2.317352771759033, + "logps/rejected": -3.149284839630127, + "loss": 1.7068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.17352867126465, + "rewards/margins": 8.319319725036621, + "rewards/rejected": -31.492849349975586, + "step": 19330 + }, + { + "epoch": 0.6516903164919613, + "grad_norm": 46.21883010864258, + "learning_rate": 3.262228134281978e-07, + "logits/chosen": -1.5569813251495361, + "logits/rejected": -1.4711658954620361, + "logps/chosen": -3.3634109497070312, + "logps/rejected": -3.3014659881591797, + "loss": 3.8962, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -33.63411331176758, + "rewards/margins": -0.6194513440132141, + "rewards/rejected": -33.0146598815918, + "step": 19335 + }, + { + "epoch": 0.6518588425629445, + "grad_norm": 29.301965713500977, + "learning_rate": 3.2594704532963304e-07, + "logits/chosen": -0.8339643478393555, + "logits/rejected": -1.0341277122497559, + "logps/chosen": -2.964996576309204, + "logps/rejected": -3.104041576385498, + "loss": 3.136, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.649967193603516, + "rewards/margins": 1.3904485702514648, + "rewards/rejected": -31.040414810180664, + "step": 19340 + }, + { + "epoch": 0.6520273686339276, + "grad_norm": 64.72969818115234, + "learning_rate": 3.256713374639386e-07, + "logits/chosen": -2.102191209793091, + "logits/rejected": -2.0357930660247803, + "logps/chosen": -2.5758633613586426, + "logps/rejected": -2.6760551929473877, + "loss": 2.8529, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.758636474609375, + "rewards/margins": 1.001916766166687, + "rewards/rejected": -26.76055335998535, + "step": 19345 + }, + { + "epoch": 0.6521958947049108, + "grad_norm": 35.41569900512695, + "learning_rate": 3.253956899265258e-07, + "logits/chosen": -1.9102615118026733, + "logits/rejected": -2.1288914680480957, + "logps/chosen": -2.2416248321533203, + "logps/rejected": -2.2110986709594727, + "loss": 3.7389, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.416248321533203, + "rewards/margins": -0.30526217818260193, + "rewards/rejected": -22.110986709594727, + "step": 19350 + }, + { + "epoch": 0.6523644207758941, + "grad_norm": 26.262041091918945, + "learning_rate": 3.251201028127856e-07, + "logits/chosen": -1.384140968322754, + "logits/rejected": -1.5131553411483765, + "logps/chosen": -1.9074770212173462, + "logps/rejected": -1.9531618356704712, + "loss": 3.0518, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.074771881103516, + "rewards/margins": 0.4568476676940918, + "rewards/rejected": -19.531620025634766, + "step": 19355 + }, + { + "epoch": 0.6525329468468772, + "grad_norm": 18.94156837463379, + "learning_rate": 3.248445762180878e-07, + "logits/chosen": -2.1064388751983643, + "logits/rejected": -2.721240520477295, + "logps/chosen": -2.8058600425720215, + "logps/rejected": -3.3915016651153564, + "loss": 4.1979, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.0585994720459, + "rewards/margins": 5.856412887573242, + "rewards/rejected": -33.915016174316406, + "step": 19360 + }, + { + "epoch": 0.6527014729178604, + "grad_norm": 25.645156860351562, + "learning_rate": 3.245691102377814e-07, + "logits/chosen": -1.8133264780044556, + "logits/rejected": -2.2402541637420654, + "logps/chosen": -2.4837303161621094, + "logps/rejected": -3.0722298622131348, + "loss": 1.9654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.83730125427246, + "rewards/margins": 5.884993076324463, + "rewards/rejected": -30.7222957611084, + "step": 19365 + }, + { + "epoch": 0.6528699989888436, + "grad_norm": 41.649349212646484, + "learning_rate": 3.2429370496719425e-07, + "logits/chosen": -1.8655116558074951, + "logits/rejected": -1.7398531436920166, + "logps/chosen": -2.2461180686950684, + "logps/rejected": -2.325387477874756, + "loss": 3.5284, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.461181640625, + "rewards/margins": 0.7926937341690063, + "rewards/rejected": -23.253875732421875, + "step": 19370 + }, + { + "epoch": 0.6530385250598267, + "grad_norm": 38.51300048828125, + "learning_rate": 3.2401836050163323e-07, + "logits/chosen": -1.1705830097198486, + "logits/rejected": -2.015150308609009, + "logps/chosen": -2.3863024711608887, + "logps/rejected": -3.1402745246887207, + "loss": 2.464, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.863025665283203, + "rewards/margins": 7.5397186279296875, + "rewards/rejected": -31.402746200561523, + "step": 19375 + }, + { + "epoch": 0.6532070511308099, + "grad_norm": 61.92262649536133, + "learning_rate": 3.2374307693638444e-07, + "logits/chosen": -1.3954817056655884, + "logits/rejected": -1.8168102502822876, + "logps/chosen": -2.647247791290283, + "logps/rejected": -3.5583183765411377, + "loss": 3.0354, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.472476959228516, + "rewards/margins": 9.11070442199707, + "rewards/rejected": -35.58317947387695, + "step": 19380 + }, + { + "epoch": 0.6533755772017931, + "grad_norm": 0.01797177828848362, + "learning_rate": 3.234678543667122e-07, + "logits/chosen": -1.431006669998169, + "logits/rejected": -1.6097593307495117, + "logps/chosen": -3.0501410961151123, + "logps/rejected": -3.4992783069610596, + "loss": 2.6282, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.501415252685547, + "rewards/margins": 4.491368293762207, + "rewards/rejected": -34.99277877807617, + "step": 19385 + }, + { + "epoch": 0.6535441032727763, + "grad_norm": 55.26557159423828, + "learning_rate": 3.2319269288786057e-07, + "logits/chosen": -1.0813277959823608, + "logits/rejected": -1.3444488048553467, + "logps/chosen": -2.77297306060791, + "logps/rejected": -3.128937244415283, + "loss": 3.0345, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.7297306060791, + "rewards/margins": 3.559643268585205, + "rewards/rejected": -31.28937339782715, + "step": 19390 + }, + { + "epoch": 0.6537126293437595, + "grad_norm": 54.47288513183594, + "learning_rate": 3.229175925950519e-07, + "logits/chosen": -1.9032952785491943, + "logits/rejected": -2.183346748352051, + "logps/chosen": -2.3724331855773926, + "logps/rejected": -2.8111534118652344, + "loss": 3.0137, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.72433090209961, + "rewards/margins": 4.387204647064209, + "rewards/rejected": -28.111536026000977, + "step": 19395 + }, + { + "epoch": 0.6538811554147427, + "grad_norm": 190.4893798828125, + "learning_rate": 3.226425535834879e-07, + "logits/chosen": -1.549120306968689, + "logits/rejected": -1.7237541675567627, + "logps/chosen": -3.474799633026123, + "logps/rejected": -3.365884780883789, + "loss": 4.8594, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -34.74799728393555, + "rewards/margins": -1.0891517400741577, + "rewards/rejected": -33.65884780883789, + "step": 19400 + }, + { + "epoch": 0.6540496814857258, + "grad_norm": 35.21147918701172, + "learning_rate": 3.2236757594834834e-07, + "logits/chosen": -1.985065221786499, + "logits/rejected": -1.8145793676376343, + "logps/chosen": -1.9986995458602905, + "logps/rejected": -1.9813121557235718, + "loss": 3.5778, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.98699378967285, + "rewards/margins": -0.17387238144874573, + "rewards/rejected": -19.813121795654297, + "step": 19405 + }, + { + "epoch": 0.654218207556709, + "grad_norm": 38.15839767456055, + "learning_rate": 3.220926597847923e-07, + "logits/chosen": -1.4903342723846436, + "logits/rejected": -1.9833576679229736, + "logps/chosen": -2.1800193786621094, + "logps/rejected": -2.4973537921905518, + "loss": 3.3582, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.800195693969727, + "rewards/margins": 3.1733405590057373, + "rewards/rejected": -24.97353744506836, + "step": 19410 + }, + { + "epoch": 0.6543867336276922, + "grad_norm": 48.588050842285156, + "learning_rate": 3.2181780518795765e-07, + "logits/chosen": -1.302261471748352, + "logits/rejected": -1.409800410270691, + "logps/chosen": -2.139906406402588, + "logps/rejected": -2.2833118438720703, + "loss": 1.8871, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.399066925048828, + "rewards/margins": 1.4340507984161377, + "rewards/rejected": -22.833118438720703, + "step": 19415 + }, + { + "epoch": 0.6545552596986753, + "grad_norm": 0.09502626210451126, + "learning_rate": 3.2154301225296033e-07, + "logits/chosen": -1.509668231010437, + "logits/rejected": -1.8733934164047241, + "logps/chosen": -2.271191120147705, + "logps/rejected": -3.536611557006836, + "loss": 0.8511, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.7119083404541, + "rewards/margins": 12.654206275939941, + "rewards/rejected": -35.36611557006836, + "step": 19420 + }, + { + "epoch": 0.6547237857696586, + "grad_norm": 26.83744239807129, + "learning_rate": 3.212682810748955e-07, + "logits/chosen": -1.461922287940979, + "logits/rejected": -1.5134670734405518, + "logps/chosen": -3.206855297088623, + "logps/rejected": -3.866116762161255, + "loss": 2.3329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.06855392456055, + "rewards/margins": 6.592613220214844, + "rewards/rejected": -38.661170959472656, + "step": 19425 + }, + { + "epoch": 0.6548923118406418, + "grad_norm": 39.84358596801758, + "learning_rate": 3.2099361174883683e-07, + "logits/chosen": -1.2554028034210205, + "logits/rejected": -1.2571032047271729, + "logps/chosen": -2.093191623687744, + "logps/rejected": -2.3733489513397217, + "loss": 2.7174, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.931913375854492, + "rewards/margins": 2.8015756607055664, + "rewards/rejected": -23.733489990234375, + "step": 19430 + }, + { + "epoch": 0.655060837911625, + "grad_norm": 69.73009490966797, + "learning_rate": 3.207190043698367e-07, + "logits/chosen": -2.204716920852661, + "logits/rejected": -2.272737741470337, + "logps/chosen": -2.727163791656494, + "logps/rejected": -2.85206937789917, + "loss": 4.5027, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.271636962890625, + "rewards/margins": 1.2490571737289429, + "rewards/rejected": -28.52069664001465, + "step": 19435 + }, + { + "epoch": 0.6552293639826081, + "grad_norm": 40.216697692871094, + "learning_rate": 3.204444590329256e-07, + "logits/chosen": -1.6425358057022095, + "logits/rejected": -1.55009126663208, + "logps/chosen": -2.5580005645751953, + "logps/rejected": -2.6416759490966797, + "loss": 4.1616, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.580005645751953, + "rewards/margins": 0.8367554545402527, + "rewards/rejected": -26.416757583618164, + "step": 19440 + }, + { + "epoch": 0.6553978900535913, + "grad_norm": 17.301225662231445, + "learning_rate": 3.2016997583311323e-07, + "logits/chosen": -1.6697677373886108, + "logits/rejected": -1.9882276058197021, + "logps/chosen": -2.2435336112976074, + "logps/rejected": -2.221325159072876, + "loss": 3.8423, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.43533706665039, + "rewards/margins": -0.2220836579799652, + "rewards/rejected": -22.213253021240234, + "step": 19445 + }, + { + "epoch": 0.6555664161245744, + "grad_norm": 42.60205078125, + "learning_rate": 3.1989555486538716e-07, + "logits/chosen": -1.0797595977783203, + "logits/rejected": -1.429962158203125, + "logps/chosen": -2.0917086601257324, + "logps/rejected": -2.2917962074279785, + "loss": 2.1877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.91708755493164, + "rewards/margins": 2.0008738040924072, + "rewards/rejected": -22.9179630279541, + "step": 19450 + }, + { + "epoch": 0.6557349421955576, + "grad_norm": 50.20454406738281, + "learning_rate": 3.196211962247136e-07, + "logits/chosen": -1.5836347341537476, + "logits/rejected": -1.5361778736114502, + "logps/chosen": -2.3230605125427246, + "logps/rejected": -2.3607373237609863, + "loss": 2.9018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.230606079101562, + "rewards/margins": 0.37676936388015747, + "rewards/rejected": -23.607372283935547, + "step": 19455 + }, + { + "epoch": 0.6559034682665408, + "grad_norm": 31.224794387817383, + "learning_rate": 3.193469000060374e-07, + "logits/chosen": -1.5918365716934204, + "logits/rejected": -1.2786345481872559, + "logps/chosen": -2.7298786640167236, + "logps/rejected": -2.472374439239502, + "loss": 6.6671, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.298786163330078, + "rewards/margins": -2.5750393867492676, + "rewards/rejected": -24.723745346069336, + "step": 19460 + }, + { + "epoch": 0.6560719943375241, + "grad_norm": 24.223526000976562, + "learning_rate": 3.1907266630428165e-07, + "logits/chosen": -1.9028345346450806, + "logits/rejected": -2.3104963302612305, + "logps/chosen": -3.585310697555542, + "logps/rejected": -3.973658323287964, + "loss": 3.0605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -35.85310745239258, + "rewards/margins": 3.8834731578826904, + "rewards/rejected": -39.73657989501953, + "step": 19465 + }, + { + "epoch": 0.6562405204085072, + "grad_norm": 32.01412582397461, + "learning_rate": 3.187984952143481e-07, + "logits/chosen": -1.718444585800171, + "logits/rejected": -1.6569147109985352, + "logps/chosen": -3.2525010108947754, + "logps/rejected": -3.6777184009552, + "loss": 4.2341, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.52500915527344, + "rewards/margins": 4.252173900604248, + "rewards/rejected": -36.777183532714844, + "step": 19470 + }, + { + "epoch": 0.6564090464794904, + "grad_norm": 212.0823974609375, + "learning_rate": 3.1852438683111603e-07, + "logits/chosen": -1.5602266788482666, + "logits/rejected": -1.596228837966919, + "logps/chosen": -2.254568576812744, + "logps/rejected": -2.299839496612549, + "loss": 2.8802, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.54568099975586, + "rewards/margins": 0.4527137875556946, + "rewards/rejected": -22.998395919799805, + "step": 19475 + }, + { + "epoch": 0.6565775725504736, + "grad_norm": 0.14562419056892395, + "learning_rate": 3.1825034124944384e-07, + "logits/chosen": -2.2185866832733154, + "logits/rejected": -2.6798908710479736, + "logps/chosen": -3.334772825241089, + "logps/rejected": -3.952873706817627, + "loss": 4.9938, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.34772872924805, + "rewards/margins": 6.18100643157959, + "rewards/rejected": -39.52873611450195, + "step": 19480 + }, + { + "epoch": 0.6567460986214567, + "grad_norm": 17.60808563232422, + "learning_rate": 3.179763585641681e-07, + "logits/chosen": -1.7934448719024658, + "logits/rejected": -1.743231177330017, + "logps/chosen": -2.447374105453491, + "logps/rejected": -2.9404006004333496, + "loss": 1.6274, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.473739624023438, + "rewards/margins": 4.930263519287109, + "rewards/rejected": -29.404003143310547, + "step": 19485 + }, + { + "epoch": 0.6569146246924399, + "grad_norm": 134.12086486816406, + "learning_rate": 3.17702438870103e-07, + "logits/chosen": -1.1001700162887573, + "logits/rejected": -1.2772667407989502, + "logps/chosen": -2.1697604656219482, + "logps/rejected": -2.358163595199585, + "loss": 2.444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.697607040405273, + "rewards/margins": 1.884027123451233, + "rewards/rejected": -23.581636428833008, + "step": 19490 + }, + { + "epoch": 0.657083150763423, + "grad_norm": 26.62551498413086, + "learning_rate": 3.174285822620416e-07, + "logits/chosen": -1.5494797229766846, + "logits/rejected": -1.5621850490570068, + "logps/chosen": -2.2668309211730957, + "logps/rejected": -2.612889528274536, + "loss": 2.6577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.66830825805664, + "rewards/margins": 3.4605870246887207, + "rewards/rejected": -26.128894805908203, + "step": 19495 + }, + { + "epoch": 0.6572516768344063, + "grad_norm": 39.074554443359375, + "learning_rate": 3.1715478883475495e-07, + "logits/chosen": -2.109074592590332, + "logits/rejected": -2.007145404815674, + "logps/chosen": -2.1038336753845215, + "logps/rejected": -2.416748523712158, + "loss": 2.8232, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.0383358001709, + "rewards/margins": 3.129146099090576, + "rewards/rejected": -24.167484283447266, + "step": 19500 + }, + { + "epoch": 0.6574202029053895, + "grad_norm": 28.4183349609375, + "learning_rate": 3.1688105868299193e-07, + "logits/chosen": -1.3516571521759033, + "logits/rejected": -1.6358953714370728, + "logps/chosen": -2.264101505279541, + "logps/rejected": -2.4743664264678955, + "loss": 2.117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.64101219177246, + "rewards/margins": 2.102649688720703, + "rewards/rejected": -24.743663787841797, + "step": 19505 + }, + { + "epoch": 0.6575887289763727, + "grad_norm": 20.12200164794922, + "learning_rate": 3.1660739190148e-07, + "logits/chosen": -1.7156364917755127, + "logits/rejected": -1.9024250507354736, + "logps/chosen": -2.3485307693481445, + "logps/rejected": -3.1334264278411865, + "loss": 2.453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.485305786132812, + "rewards/margins": 7.848960876464844, + "rewards/rejected": -31.334264755249023, + "step": 19510 + }, + { + "epoch": 0.6577572550473558, + "grad_norm": 27.374279022216797, + "learning_rate": 3.163337885849243e-07, + "logits/chosen": -1.6456096172332764, + "logits/rejected": -1.7837450504302979, + "logps/chosen": -1.983033537864685, + "logps/rejected": -1.9265260696411133, + "loss": 3.7782, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.830333709716797, + "rewards/margins": -0.5650733709335327, + "rewards/rejected": -19.265262603759766, + "step": 19515 + }, + { + "epoch": 0.657925781118339, + "grad_norm": 109.21072387695312, + "learning_rate": 3.160602488280083e-07, + "logits/chosen": -1.409911036491394, + "logits/rejected": -1.4950873851776123, + "logps/chosen": -3.305628538131714, + "logps/rejected": -3.413794755935669, + "loss": 3.2585, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.05628204345703, + "rewards/margins": 1.0816668272018433, + "rewards/rejected": -34.1379508972168, + "step": 19520 + }, + { + "epoch": 0.6580943071893222, + "grad_norm": 25.108606338500977, + "learning_rate": 3.1578677272539313e-07, + "logits/chosen": -1.7747972011566162, + "logits/rejected": -2.1871495246887207, + "logps/chosen": -1.8980529308319092, + "logps/rejected": -2.155996799468994, + "loss": 1.6926, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.98052978515625, + "rewards/margins": 2.579437255859375, + "rewards/rejected": -21.559967041015625, + "step": 19525 + }, + { + "epoch": 0.6582628332603053, + "grad_norm": 55.106502532958984, + "learning_rate": 3.155133603717182e-07, + "logits/chosen": -1.4665242433547974, + "logits/rejected": -1.449638843536377, + "logps/chosen": -2.216754913330078, + "logps/rejected": -2.4463374614715576, + "loss": 2.5417, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.16754913330078, + "rewards/margins": 2.2958261966705322, + "rewards/rejected": -24.463375091552734, + "step": 19530 + }, + { + "epoch": 0.6584313593312886, + "grad_norm": 31.020858764648438, + "learning_rate": 3.15240011861601e-07, + "logits/chosen": -1.8325130939483643, + "logits/rejected": -2.208038330078125, + "logps/chosen": -1.9647200107574463, + "logps/rejected": -2.779484987258911, + "loss": 1.5215, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.647197723388672, + "rewards/margins": 8.147647857666016, + "rewards/rejected": -27.794849395751953, + "step": 19535 + }, + { + "epoch": 0.6585998854022718, + "grad_norm": 52.70795822143555, + "learning_rate": 3.1496672728963625e-07, + "logits/chosen": -1.2615281343460083, + "logits/rejected": -1.745319128036499, + "logps/chosen": -2.6752312183380127, + "logps/rejected": -2.9537129402160645, + "loss": 2.1299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.7523136138916, + "rewards/margins": 2.7848167419433594, + "rewards/rejected": -29.537128448486328, + "step": 19540 + }, + { + "epoch": 0.6587684114732549, + "grad_norm": 32.79944610595703, + "learning_rate": 3.1469350675039706e-07, + "logits/chosen": -1.7413183450698853, + "logits/rejected": -1.7953037023544312, + "logps/chosen": -2.31355881690979, + "logps/rejected": -2.3361732959747314, + "loss": 3.5028, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.13558578491211, + "rewards/margins": 0.22614488005638123, + "rewards/rejected": -23.361730575561523, + "step": 19545 + }, + { + "epoch": 0.6589369375442381, + "grad_norm": 6.505997657775879, + "learning_rate": 3.144203503384345e-07, + "logits/chosen": -1.4171102046966553, + "logits/rejected": -1.7000093460083008, + "logps/chosen": -2.309157133102417, + "logps/rejected": -2.585911512374878, + "loss": 3.0135, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.091571807861328, + "rewards/margins": 2.7675464153289795, + "rewards/rejected": -25.859119415283203, + "step": 19550 + }, + { + "epoch": 0.6591054636152213, + "grad_norm": 18.754776000976562, + "learning_rate": 3.1414725814827735e-07, + "logits/chosen": -1.275989294052124, + "logits/rejected": -1.4211461544036865, + "logps/chosen": -2.091052293777466, + "logps/rejected": -2.1982831954956055, + "loss": 2.5494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.9105224609375, + "rewards/margins": 1.072310209274292, + "rewards/rejected": -21.982831954956055, + "step": 19555 + }, + { + "epoch": 0.6592739896862044, + "grad_norm": 46.30010223388672, + "learning_rate": 3.138742302744316e-07, + "logits/chosen": -1.2075470685958862, + "logits/rejected": -1.5399049520492554, + "logps/chosen": -2.470909595489502, + "logps/rejected": -3.1514031887054443, + "loss": 1.9756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.70909881591797, + "rewards/margins": 6.804935455322266, + "rewards/rejected": -31.514034271240234, + "step": 19560 + }, + { + "epoch": 0.6594425157571876, + "grad_norm": 18.922914505004883, + "learning_rate": 3.1360126681138164e-07, + "logits/chosen": -2.1684887409210205, + "logits/rejected": -2.0825414657592773, + "logps/chosen": -1.9475510120391846, + "logps/rejected": -2.1734211444854736, + "loss": 1.7755, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.475509643554688, + "rewards/margins": 2.258702039718628, + "rewards/rejected": -21.734210968017578, + "step": 19565 + }, + { + "epoch": 0.6596110418281708, + "grad_norm": 44.96382141113281, + "learning_rate": 3.1332836785358964e-07, + "logits/chosen": -1.8090126514434814, + "logits/rejected": -1.8098065853118896, + "logps/chosen": -2.098357677459717, + "logps/rejected": -2.2060370445251465, + "loss": 2.5793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.98357582092285, + "rewards/margins": 1.0767956972122192, + "rewards/rejected": -22.06036949157715, + "step": 19570 + }, + { + "epoch": 0.659779567899154, + "grad_norm": 24.288698196411133, + "learning_rate": 3.130555334954949e-07, + "logits/chosen": -1.448203206062317, + "logits/rejected": -1.4640809297561646, + "logps/chosen": -2.248979091644287, + "logps/rejected": -2.4473307132720947, + "loss": 2.1316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.489791870117188, + "rewards/margins": 1.9835140705108643, + "rewards/rejected": -24.47330665588379, + "step": 19575 + }, + { + "epoch": 0.6599480939701372, + "grad_norm": 29.102195739746094, + "learning_rate": 3.127827638315146e-07, + "logits/chosen": -1.6922508478164673, + "logits/rejected": -1.743398666381836, + "logps/chosen": -2.742082118988037, + "logps/rejected": -3.214224338531494, + "loss": 2.7073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.420822143554688, + "rewards/margins": 4.721421241760254, + "rewards/rejected": -32.142242431640625, + "step": 19580 + }, + { + "epoch": 0.6601166200411204, + "grad_norm": 69.89996337890625, + "learning_rate": 3.1251005895604363e-07, + "logits/chosen": -1.2156705856323242, + "logits/rejected": -1.1699566841125488, + "logps/chosen": -2.3415446281433105, + "logps/rejected": -2.812303066253662, + "loss": 3.7849, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.41544532775879, + "rewards/margins": 4.707589149475098, + "rewards/rejected": -28.123035430908203, + "step": 19585 + }, + { + "epoch": 0.6602851461121035, + "grad_norm": 28.79608726501465, + "learning_rate": 3.122374189634546e-07, + "logits/chosen": -1.7804441452026367, + "logits/rejected": -2.0019617080688477, + "logps/chosen": -1.7656739950180054, + "logps/rejected": -1.958898901939392, + "loss": 2.2752, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.656740188598633, + "rewards/margins": 1.9322493076324463, + "rewards/rejected": -19.5889892578125, + "step": 19590 + }, + { + "epoch": 0.6604536721830867, + "grad_norm": 208.58645629882812, + "learning_rate": 3.119648439480972e-07, + "logits/chosen": -1.7226556539535522, + "logits/rejected": -1.7559149265289307, + "logps/chosen": -2.9009034633636475, + "logps/rejected": -3.0689263343811035, + "loss": 2.6959, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.009033203125, + "rewards/margins": 1.6802289485931396, + "rewards/rejected": -30.689266204833984, + "step": 19595 + }, + { + "epoch": 0.6606221982540699, + "grad_norm": 20.864017486572266, + "learning_rate": 3.1169233400429907e-07, + "logits/chosen": -1.8954029083251953, + "logits/rejected": -2.0006442070007324, + "logps/chosen": -2.0070273876190186, + "logps/rejected": -2.6326088905334473, + "loss": 1.9522, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.070276260375977, + "rewards/margins": 6.255814552307129, + "rewards/rejected": -26.32608985900879, + "step": 19600 + }, + { + "epoch": 0.6606221982540699, + "eval_logits/chosen": -2.0875275135040283, + "eval_logits/rejected": -2.2476277351379395, + "eval_logps/chosen": -2.1917946338653564, + "eval_logps/rejected": -2.3321449756622314, + "eval_loss": 3.0390613079071045, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -21.917945861816406, + "eval_rewards/margins": 1.4035052061080933, + "eval_rewards/rejected": -23.321449279785156, + "eval_runtime": 12.9389, + "eval_samples_per_second": 7.729, + "eval_steps_per_second": 1.932, + "step": 19600 + }, + { + "epoch": 0.660790724325053, + "grad_norm": 31.91147804260254, + "learning_rate": 3.1141988922636525e-07, + "logits/chosen": -1.6353679895401, + "logits/rejected": -1.8300693035125732, + "logps/chosen": -2.4113149642944336, + "logps/rejected": -3.3126251697540283, + "loss": 2.4313, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.113149642944336, + "rewards/margins": 9.013103485107422, + "rewards/rejected": -33.126251220703125, + "step": 19605 + }, + { + "epoch": 0.6609592503960363, + "grad_norm": 30.281909942626953, + "learning_rate": 3.1114750970857784e-07, + "logits/chosen": -1.512751817703247, + "logits/rejected": -2.048257827758789, + "logps/chosen": -2.5183193683624268, + "logps/rejected": -4.193761825561523, + "loss": 0.8449, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.18319320678711, + "rewards/margins": 16.754425048828125, + "rewards/rejected": -41.93761444091797, + "step": 19610 + }, + { + "epoch": 0.6611277764670195, + "grad_norm": 23.415830612182617, + "learning_rate": 3.108751955451968e-07, + "logits/chosen": -1.5418158769607544, + "logits/rejected": -1.8583043813705444, + "logps/chosen": -2.7758288383483887, + "logps/rejected": -3.548363208770752, + "loss": 1.8529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.758289337158203, + "rewards/margins": 7.725339412689209, + "rewards/rejected": -35.4836311340332, + "step": 19615 + }, + { + "epoch": 0.6612963025380026, + "grad_norm": 21.576784133911133, + "learning_rate": 3.106029468304594e-07, + "logits/chosen": -1.6991370916366577, + "logits/rejected": -1.7369697093963623, + "logps/chosen": -2.20546293258667, + "logps/rejected": -2.2247977256774902, + "loss": 3.1269, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.054630279541016, + "rewards/margins": 0.1933467835187912, + "rewards/rejected": -22.247976303100586, + "step": 19620 + }, + { + "epoch": 0.6614648286089858, + "grad_norm": 208.43551635742188, + "learning_rate": 3.1033076365858036e-07, + "logits/chosen": -1.1789934635162354, + "logits/rejected": -2.0805132389068604, + "logps/chosen": -2.531059741973877, + "logps/rejected": -3.4619078636169434, + "loss": 2.0194, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.310596466064453, + "rewards/margins": 9.308481216430664, + "rewards/rejected": -34.61907958984375, + "step": 19625 + }, + { + "epoch": 0.661633354679969, + "grad_norm": 1.221866488456726, + "learning_rate": 3.100586461237511e-07, + "logits/chosen": -1.6778274774551392, + "logits/rejected": -1.9052091836929321, + "logps/chosen": -2.725083827972412, + "logps/rejected": -3.2458865642547607, + "loss": 1.9867, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.250839233398438, + "rewards/margins": 5.208024024963379, + "rewards/rejected": -32.4588623046875, + "step": 19630 + }, + { + "epoch": 0.6618018807509521, + "grad_norm": 22.51280975341797, + "learning_rate": 3.0978659432014103e-07, + "logits/chosen": -1.9016478061676025, + "logits/rejected": -2.3998100757598877, + "logps/chosen": -3.3028228282928467, + "logps/rejected": -3.3499350547790527, + "loss": 6.1235, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.02823257446289, + "rewards/margins": 0.47111815214157104, + "rewards/rejected": -33.49934768676758, + "step": 19635 + }, + { + "epoch": 0.6619704068219353, + "grad_norm": 62.46411895751953, + "learning_rate": 3.095146083418968e-07, + "logits/chosen": -1.3130820989608765, + "logits/rejected": -1.4224445819854736, + "logps/chosen": -2.2050185203552246, + "logps/rejected": -2.242039918899536, + "loss": 4.3249, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.050186157226562, + "rewards/margins": 0.3702128529548645, + "rewards/rejected": -22.420398712158203, + "step": 19640 + }, + { + "epoch": 0.6621389328929186, + "grad_norm": 55.873966217041016, + "learning_rate": 3.092426882831416e-07, + "logits/chosen": -1.7375984191894531, + "logits/rejected": -1.5786654949188232, + "logps/chosen": -2.8116352558135986, + "logps/rejected": -2.9780755043029785, + "loss": 3.8662, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.116352081298828, + "rewards/margins": 1.664402961730957, + "rewards/rejected": -29.780752182006836, + "step": 19645 + }, + { + "epoch": 0.6623074589639018, + "grad_norm": 60.345523834228516, + "learning_rate": 3.089708342379764e-07, + "logits/chosen": -1.3604744672775269, + "logits/rejected": -1.313848614692688, + "logps/chosen": -2.6019206047058105, + "logps/rejected": -2.614013910293579, + "loss": 3.0137, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.01920509338379, + "rewards/margins": 0.120935820043087, + "rewards/rejected": -26.140140533447266, + "step": 19650 + }, + { + "epoch": 0.6624759850348849, + "grad_norm": 47.997467041015625, + "learning_rate": 3.086990463004792e-07, + "logits/chosen": -1.4324225187301636, + "logits/rejected": -1.6653724908828735, + "logps/chosen": -1.972364068031311, + "logps/rejected": -2.192746639251709, + "loss": 2.0297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.723642349243164, + "rewards/margins": 2.2038259506225586, + "rewards/rejected": -21.927465438842773, + "step": 19655 + }, + { + "epoch": 0.6626445111058681, + "grad_norm": 49.364105224609375, + "learning_rate": 3.0842732456470527e-07, + "logits/chosen": -1.995145559310913, + "logits/rejected": -2.4126739501953125, + "logps/chosen": -2.6052517890930176, + "logps/rejected": -3.049837827682495, + "loss": 1.9231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.052515029907227, + "rewards/margins": 4.445866107940674, + "rewards/rejected": -30.49837875366211, + "step": 19660 + }, + { + "epoch": 0.6628130371768512, + "grad_norm": 68.35008239746094, + "learning_rate": 3.0815566912468657e-07, + "logits/chosen": -2.1833343505859375, + "logits/rejected": -2.2756967544555664, + "logps/chosen": -2.228520631790161, + "logps/rejected": -2.616534471511841, + "loss": 2.9025, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.285205841064453, + "rewards/margins": 3.8801398277282715, + "rewards/rejected": -26.16534423828125, + "step": 19665 + }, + { + "epoch": 0.6629815632478344, + "grad_norm": 23.333072662353516, + "learning_rate": 3.0788408007443234e-07, + "logits/chosen": -1.176478624343872, + "logits/rejected": -1.5110712051391602, + "logps/chosen": -1.7166290283203125, + "logps/rejected": -1.8860689401626587, + "loss": 1.9023, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.166288375854492, + "rewards/margins": 1.6943992376327515, + "rewards/rejected": -18.86069107055664, + "step": 19670 + }, + { + "epoch": 0.6631500893188176, + "grad_norm": 18.771587371826172, + "learning_rate": 3.0761255750792923e-07, + "logits/chosen": -1.8307679891586304, + "logits/rejected": -2.152930736541748, + "logps/chosen": -2.309884786605835, + "logps/rejected": -2.8270676136016846, + "loss": 2.0373, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.098846435546875, + "rewards/margins": 5.1718268394470215, + "rewards/rejected": -28.270675659179688, + "step": 19675 + }, + { + "epoch": 0.6633186153898007, + "grad_norm": 33.46798324584961, + "learning_rate": 3.0734110151913995e-07, + "logits/chosen": -1.5328623056411743, + "logits/rejected": -1.9079945087432861, + "logps/chosen": -2.2217116355895996, + "logps/rejected": -2.8022282123565674, + "loss": 1.8753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.21711540222168, + "rewards/margins": 5.8051652908325195, + "rewards/rejected": -28.02227783203125, + "step": 19680 + }, + { + "epoch": 0.663487141460784, + "grad_norm": 31.7010498046875, + "learning_rate": 3.0706971220200494e-07, + "logits/chosen": -1.3515177965164185, + "logits/rejected": -1.5968728065490723, + "logps/chosen": -2.9291114807128906, + "logps/rejected": -3.167548656463623, + "loss": 2.6971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.291112899780273, + "rewards/margins": 2.384373664855957, + "rewards/rejected": -31.675487518310547, + "step": 19685 + }, + { + "epoch": 0.6636556675317672, + "grad_norm": 19.04203987121582, + "learning_rate": 3.0679838965044147e-07, + "logits/chosen": -1.6338005065917969, + "logits/rejected": -1.7416985034942627, + "logps/chosen": -2.837009906768799, + "logps/rejected": -3.1580300331115723, + "loss": 4.8603, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.370098114013672, + "rewards/margins": 3.2102017402648926, + "rewards/rejected": -31.58030128479004, + "step": 19690 + }, + { + "epoch": 0.6638241936027504, + "grad_norm": 17.882585525512695, + "learning_rate": 3.065271339583436e-07, + "logits/chosen": -1.746091604232788, + "logits/rejected": -2.024017810821533, + "logps/chosen": -1.873356580734253, + "logps/rejected": -2.518157720565796, + "loss": 1.3999, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.733566284179688, + "rewards/margins": 6.4480085372924805, + "rewards/rejected": -25.181575775146484, + "step": 19695 + }, + { + "epoch": 0.6639927196737335, + "grad_norm": 48.590450286865234, + "learning_rate": 3.06255945219582e-07, + "logits/chosen": -1.691332221031189, + "logits/rejected": -2.2088327407836914, + "logps/chosen": -2.7080318927764893, + "logps/rejected": -2.8886380195617676, + "loss": 4.762, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.080318450927734, + "rewards/margins": 1.8060623407363892, + "rewards/rejected": -28.88637924194336, + "step": 19700 + }, + { + "epoch": 0.6641612457447167, + "grad_norm": 215.69692993164062, + "learning_rate": 3.0598482352800457e-07, + "logits/chosen": -1.6511586904525757, + "logits/rejected": -1.527772307395935, + "logps/chosen": -4.019328594207764, + "logps/rejected": -4.086058616638184, + "loss": 3.9622, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -40.19328689575195, + "rewards/margins": 0.6673009991645813, + "rewards/rejected": -40.86058807373047, + "step": 19705 + }, + { + "epoch": 0.6643297718156999, + "grad_norm": 116.02570343017578, + "learning_rate": 3.0571376897743606e-07, + "logits/chosen": -1.7304351329803467, + "logits/rejected": -1.770742416381836, + "logps/chosen": -2.940293312072754, + "logps/rejected": -2.576066493988037, + "loss": 7.1427, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.40293312072754, + "rewards/margins": -3.6422653198242188, + "rewards/rejected": -25.760665893554688, + "step": 19710 + }, + { + "epoch": 0.664498297886683, + "grad_norm": 19.49298858642578, + "learning_rate": 3.0544278166167725e-07, + "logits/chosen": -1.6837724447250366, + "logits/rejected": -1.6870133876800537, + "logps/chosen": -2.900510787963867, + "logps/rejected": -3.3730030059814453, + "loss": 1.0817, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.005102157592773, + "rewards/margins": 4.724923133850098, + "rewards/rejected": -33.73003005981445, + "step": 19715 + }, + { + "epoch": 0.6646668239576663, + "grad_norm": 21.581035614013672, + "learning_rate": 3.0517186167450647e-07, + "logits/chosen": -1.40189528465271, + "logits/rejected": -1.6113313436508179, + "logps/chosen": -2.0576136112213135, + "logps/rejected": -2.196462869644165, + "loss": 2.172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.576135635375977, + "rewards/margins": 1.388492465019226, + "rewards/rejected": -21.964628219604492, + "step": 19720 + }, + { + "epoch": 0.6648353500286495, + "grad_norm": 11.661018371582031, + "learning_rate": 3.049010091096784e-07, + "logits/chosen": -2.140958309173584, + "logits/rejected": -2.1676430702209473, + "logps/chosen": -1.8523032665252686, + "logps/rejected": -2.0676932334899902, + "loss": 3.0915, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.523033142089844, + "rewards/margins": 2.153900384902954, + "rewards/rejected": -20.67693519592285, + "step": 19725 + }, + { + "epoch": 0.6650038760996326, + "grad_norm": 50.89949035644531, + "learning_rate": 3.046302240609247e-07, + "logits/chosen": -1.6583229303359985, + "logits/rejected": -1.9790871143341064, + "logps/chosen": -2.465359687805176, + "logps/rejected": -2.7347371578216553, + "loss": 2.9139, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.653594970703125, + "rewards/margins": 2.6937763690948486, + "rewards/rejected": -27.347375869750977, + "step": 19730 + }, + { + "epoch": 0.6651724021706158, + "grad_norm": 51.48941421508789, + "learning_rate": 3.04359506621953e-07, + "logits/chosen": -1.8139331340789795, + "logits/rejected": -1.7735220193862915, + "logps/chosen": -2.390505313873291, + "logps/rejected": -2.332714319229126, + "loss": 3.8516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.905052185058594, + "rewards/margins": -0.5779077410697937, + "rewards/rejected": -23.3271427154541, + "step": 19735 + }, + { + "epoch": 0.665340928241599, + "grad_norm": 37.086307525634766, + "learning_rate": 3.040888568864482e-07, + "logits/chosen": -1.5597200393676758, + "logits/rejected": -1.7371556758880615, + "logps/chosen": -1.9877293109893799, + "logps/rejected": -2.0962672233581543, + "loss": 2.5319, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.877294540405273, + "rewards/margins": 1.085381269454956, + "rewards/rejected": -20.962675094604492, + "step": 19740 + }, + { + "epoch": 0.6655094543125821, + "grad_norm": 13.57636833190918, + "learning_rate": 3.038182749480716e-07, + "logits/chosen": -1.6128686666488647, + "logits/rejected": -1.851527214050293, + "logps/chosen": -2.4245975017547607, + "logps/rejected": -3.140558958053589, + "loss": 1.9717, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.2459774017334, + "rewards/margins": 7.15961217880249, + "rewards/rejected": -31.405590057373047, + "step": 19745 + }, + { + "epoch": 0.6656779803835653, + "grad_norm": 50.33108139038086, + "learning_rate": 3.035477609004606e-07, + "logits/chosen": -2.2638416290283203, + "logits/rejected": -2.2421774864196777, + "logps/chosen": -2.322725772857666, + "logps/rejected": -2.16920804977417, + "loss": 4.5887, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.22725486755371, + "rewards/margins": -1.5351752042770386, + "rewards/rejected": -21.692081451416016, + "step": 19750 + }, + { + "epoch": 0.6658465064545486, + "grad_norm": 47.44742202758789, + "learning_rate": 3.0327731483722965e-07, + "logits/chosen": -1.5761334896087646, + "logits/rejected": -1.4156379699707031, + "logps/chosen": -2.0456478595733643, + "logps/rejected": -2.1322078704833984, + "loss": 2.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.456480026245117, + "rewards/margins": 0.8655961751937866, + "rewards/rejected": -21.32207679748535, + "step": 19755 + }, + { + "epoch": 0.6660150325255317, + "grad_norm": 135.6229705810547, + "learning_rate": 3.030069368519694e-07, + "logits/chosen": -1.7876793146133423, + "logits/rejected": -2.074373245239258, + "logps/chosen": -2.859711170196533, + "logps/rejected": -3.498471736907959, + "loss": 1.6566, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.59711265563965, + "rewards/margins": 6.3876051902771, + "rewards/rejected": -34.984718322753906, + "step": 19760 + }, + { + "epoch": 0.6661835585965149, + "grad_norm": 0.10053889453411102, + "learning_rate": 3.0273662703824737e-07, + "logits/chosen": -1.8345619440078735, + "logits/rejected": -1.8314402103424072, + "logps/chosen": -2.1931605339050293, + "logps/rejected": -2.520170211791992, + "loss": 2.2617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.93160629272461, + "rewards/margins": 3.27009654045105, + "rewards/rejected": -25.201702117919922, + "step": 19765 + }, + { + "epoch": 0.6663520846674981, + "grad_norm": 43.061500549316406, + "learning_rate": 3.024663854896067e-07, + "logits/chosen": -1.4732874631881714, + "logits/rejected": -1.4747284650802612, + "logps/chosen": -2.1525635719299316, + "logps/rejected": -2.309382915496826, + "loss": 3.044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.525634765625, + "rewards/margins": 1.5681923627853394, + "rewards/rejected": -23.093828201293945, + "step": 19770 + }, + { + "epoch": 0.6665206107384812, + "grad_norm": 5.061534404754639, + "learning_rate": 3.0219621229956735e-07, + "logits/chosen": -1.9069154262542725, + "logits/rejected": -2.5560789108276367, + "logps/chosen": -1.9357631206512451, + "logps/rejected": -2.6078197956085205, + "loss": 1.6453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.357629776000977, + "rewards/margins": 6.7205657958984375, + "rewards/rejected": -26.078197479248047, + "step": 19775 + }, + { + "epoch": 0.6666891368094644, + "grad_norm": 12.537591934204102, + "learning_rate": 3.0192610756162606e-07, + "logits/chosen": -1.9024620056152344, + "logits/rejected": -2.025599241256714, + "logps/chosen": -1.7322824001312256, + "logps/rejected": -1.9820201396942139, + "loss": 1.6466, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.322824478149414, + "rewards/margins": 2.4973764419555664, + "rewards/rejected": -19.820201873779297, + "step": 19780 + }, + { + "epoch": 0.6668576628804476, + "grad_norm": 23.594646453857422, + "learning_rate": 3.0165607136925496e-07, + "logits/chosen": -1.9728679656982422, + "logits/rejected": -1.887183427810669, + "logps/chosen": -2.2772622108459473, + "logps/rejected": -2.125269889831543, + "loss": 4.8542, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.772621154785156, + "rewards/margins": -1.519923448562622, + "rewards/rejected": -21.252696990966797, + "step": 19785 + }, + { + "epoch": 0.6670261889514307, + "grad_norm": 31.816057205200195, + "learning_rate": 3.013861038159031e-07, + "logits/chosen": -1.525428056716919, + "logits/rejected": -1.2926287651062012, + "logps/chosen": -2.7483131885528564, + "logps/rejected": -3.068668842315674, + "loss": 2.3267, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.483129501342773, + "rewards/margins": 3.203556537628174, + "rewards/rejected": -30.686687469482422, + "step": 19790 + }, + { + "epoch": 0.667194715022414, + "grad_norm": 19.48959732055664, + "learning_rate": 3.0111620499499555e-07, + "logits/chosen": -1.4664475917816162, + "logits/rejected": -1.9527008533477783, + "logps/chosen": -2.288588523864746, + "logps/rejected": -2.7013888359069824, + "loss": 3.1565, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.88588523864746, + "rewards/margins": 4.128004550933838, + "rewards/rejected": -27.01388931274414, + "step": 19795 + }, + { + "epoch": 0.6673632410933972, + "grad_norm": 43.20466995239258, + "learning_rate": 3.008463749999339e-07, + "logits/chosen": -1.6210861206054688, + "logits/rejected": -1.7428817749023438, + "logps/chosen": -2.5191102027893066, + "logps/rejected": -2.910252571105957, + "loss": 2.0976, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.19110107421875, + "rewards/margins": 3.911424160003662, + "rewards/rejected": -29.102527618408203, + "step": 19800 + }, + { + "epoch": 0.6675317671643803, + "grad_norm": 72.56021118164062, + "learning_rate": 3.005766139240955e-07, + "logits/chosen": -2.2898874282836914, + "logits/rejected": -2.5569610595703125, + "logps/chosen": -2.3946192264556885, + "logps/rejected": -2.511489152908325, + "loss": 3.3885, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.946191787719727, + "rewards/margins": 1.1687005758285522, + "rewards/rejected": -25.11488914489746, + "step": 19805 + }, + { + "epoch": 0.6677002932353635, + "grad_norm": 32.11191940307617, + "learning_rate": 3.0030692186083405e-07, + "logits/chosen": -2.098188877105713, + "logits/rejected": -2.284949541091919, + "logps/chosen": -2.6981968879699707, + "logps/rejected": -2.643249988555908, + "loss": 3.9236, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.981969833374023, + "rewards/margins": -0.5494720339775085, + "rewards/rejected": -26.432498931884766, + "step": 19810 + }, + { + "epoch": 0.6678688193063467, + "grad_norm": 13.116031646728516, + "learning_rate": 3.000372989034794e-07, + "logits/chosen": -1.3008522987365723, + "logits/rejected": -1.7032623291015625, + "logps/chosen": -2.2843902111053467, + "logps/rejected": -2.548161745071411, + "loss": 2.0715, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.843904495239258, + "rewards/margins": 2.637714147567749, + "rewards/rejected": -25.481616973876953, + "step": 19815 + }, + { + "epoch": 0.6680373453773298, + "grad_norm": 35.49582290649414, + "learning_rate": 2.997677451453373e-07, + "logits/chosen": -1.7446882724761963, + "logits/rejected": -1.876037836074829, + "logps/chosen": -2.699276924133301, + "logps/rejected": -2.7415661811828613, + "loss": 2.7519, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.99277114868164, + "rewards/margins": 0.4228929579257965, + "rewards/rejected": -27.415660858154297, + "step": 19820 + }, + { + "epoch": 0.668205871448313, + "grad_norm": 12.453043937683105, + "learning_rate": 2.9949826067968977e-07, + "logits/chosen": -1.5163322687149048, + "logits/rejected": -1.9970420598983765, + "logps/chosen": -2.086151599884033, + "logps/rejected": -2.855980157852173, + "loss": 1.0368, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.861515045166016, + "rewards/margins": 7.698286533355713, + "rewards/rejected": -28.559803009033203, + "step": 19825 + }, + { + "epoch": 0.6683743975192963, + "grad_norm": 51.53904342651367, + "learning_rate": 2.992288455997947e-07, + "logits/chosen": -1.2252388000488281, + "logits/rejected": -1.4193706512451172, + "logps/chosen": -2.433497667312622, + "logps/rejected": -2.7764475345611572, + "loss": 2.9424, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.334978103637695, + "rewards/margins": 3.4294967651367188, + "rewards/rejected": -27.764474868774414, + "step": 19830 + }, + { + "epoch": 0.6685429235902794, + "grad_norm": 62.04422378540039, + "learning_rate": 2.989594999988864e-07, + "logits/chosen": -2.034576177597046, + "logits/rejected": -2.177302837371826, + "logps/chosen": -2.8381359577178955, + "logps/rejected": -3.122462034225464, + "loss": 2.5747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.381359100341797, + "rewards/margins": 2.8432610034942627, + "rewards/rejected": -31.224618911743164, + "step": 19835 + }, + { + "epoch": 0.6687114496612626, + "grad_norm": 32.734657287597656, + "learning_rate": 2.9869022397017417e-07, + "logits/chosen": -2.064215898513794, + "logits/rejected": -2.021044969558716, + "logps/chosen": -2.555880546569824, + "logps/rejected": -2.721607208251953, + "loss": 2.5477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.55880355834961, + "rewards/margins": 1.657266616821289, + "rewards/rejected": -27.2160701751709, + "step": 19840 + }, + { + "epoch": 0.6688799757322458, + "grad_norm": 33.11255645751953, + "learning_rate": 2.9842101760684413e-07, + "logits/chosen": -1.5624778270721436, + "logits/rejected": -1.7367737293243408, + "logps/chosen": -2.5347161293029785, + "logps/rejected": -2.9798059463500977, + "loss": 1.5732, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.3471622467041, + "rewards/margins": 4.45089864730835, + "rewards/rejected": -29.79806137084961, + "step": 19845 + }, + { + "epoch": 0.669048501803229, + "grad_norm": 23.20467758178711, + "learning_rate": 2.9815188100205824e-07, + "logits/chosen": -1.9864904880523682, + "logits/rejected": -2.2545325756073, + "logps/chosen": -2.7851338386535645, + "logps/rejected": -3.3120033740997314, + "loss": 1.7605, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.851337432861328, + "rewards/margins": 5.2686967849731445, + "rewards/rejected": -33.120033264160156, + "step": 19850 + }, + { + "epoch": 0.6692170278742121, + "grad_norm": 27.634586334228516, + "learning_rate": 2.978828142489537e-07, + "logits/chosen": -1.9006726741790771, + "logits/rejected": -1.9433460235595703, + "logps/chosen": -2.817883014678955, + "logps/rejected": -3.0188820362091064, + "loss": 2.6239, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.178829193115234, + "rewards/margins": 2.0099892616271973, + "rewards/rejected": -30.18882179260254, + "step": 19855 + }, + { + "epoch": 0.6693855539451953, + "grad_norm": 17.127777099609375, + "learning_rate": 2.9761381744064396e-07, + "logits/chosen": -2.049617290496826, + "logits/rejected": -2.2871267795562744, + "logps/chosen": -1.9521675109863281, + "logps/rejected": -2.246138095855713, + "loss": 1.4065, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.52167320251465, + "rewards/margins": 2.939706325531006, + "rewards/rejected": -22.461380004882812, + "step": 19860 + }, + { + "epoch": 0.6695540800161786, + "grad_norm": 56.237342834472656, + "learning_rate": 2.9734489067021836e-07, + "logits/chosen": -1.6401517391204834, + "logits/rejected": -1.9211409091949463, + "logps/chosen": -2.022190570831299, + "logps/rejected": -2.0948269367218018, + "loss": 2.5922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.221904754638672, + "rewards/margins": 0.7263639569282532, + "rewards/rejected": -20.94826889038086, + "step": 19865 + }, + { + "epoch": 0.6697226060871617, + "grad_norm": 43.48698425292969, + "learning_rate": 2.9707603403074187e-07, + "logits/chosen": -2.499911308288574, + "logits/rejected": -2.7677135467529297, + "logps/chosen": -3.4701619148254395, + "logps/rejected": -3.94292950630188, + "loss": 2.2561, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.70161819458008, + "rewards/margins": 4.72767448425293, + "rewards/rejected": -39.429290771484375, + "step": 19870 + }, + { + "epoch": 0.6698911321581449, + "grad_norm": 0.6691756844520569, + "learning_rate": 2.9680724761525513e-07, + "logits/chosen": -1.852129578590393, + "logits/rejected": -1.8271507024765015, + "logps/chosen": -3.7556660175323486, + "logps/rejected": -4.138113498687744, + "loss": 3.4457, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -37.55665969848633, + "rewards/margins": 3.8244755268096924, + "rewards/rejected": -41.381134033203125, + "step": 19875 + }, + { + "epoch": 0.670059658229128, + "grad_norm": 58.062313079833984, + "learning_rate": 2.9653853151677443e-07, + "logits/chosen": -1.591839075088501, + "logits/rejected": -2.509289264678955, + "logps/chosen": -2.5534749031066895, + "logps/rejected": -4.027466773986816, + "loss": 2.536, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.534751892089844, + "rewards/margins": 14.73991870880127, + "rewards/rejected": -40.2746696472168, + "step": 19880 + }, + { + "epoch": 0.6702281843001112, + "grad_norm": 23.800106048583984, + "learning_rate": 2.9626988582829197e-07, + "logits/chosen": -1.6431554555892944, + "logits/rejected": -2.029618740081787, + "logps/chosen": -2.032714605331421, + "logps/rejected": -2.3359105587005615, + "loss": 2.3751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.327144622802734, + "rewards/margins": 3.0319602489471436, + "rewards/rejected": -23.359106063842773, + "step": 19885 + }, + { + "epoch": 0.6703967103710944, + "grad_norm": 18.587617874145508, + "learning_rate": 2.9600131064277534e-07, + "logits/chosen": -1.9746696949005127, + "logits/rejected": -2.062826633453369, + "logps/chosen": -2.09183406829834, + "logps/rejected": -2.246274471282959, + "loss": 2.9238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.9183406829834, + "rewards/margins": 1.5444018840789795, + "rewards/rejected": -22.46274185180664, + "step": 19890 + }, + { + "epoch": 0.6705652364420776, + "grad_norm": 18.86264419555664, + "learning_rate": 2.957328060531678e-07, + "logits/chosen": -2.121891498565674, + "logits/rejected": -1.968225121498108, + "logps/chosen": -2.1642074584960938, + "logps/rejected": -2.0462965965270996, + "loss": 4.5091, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.642074584960938, + "rewards/margins": -1.179107666015625, + "rewards/rejected": -20.462966918945312, + "step": 19895 + }, + { + "epoch": 0.6707337625130607, + "grad_norm": 46.66054153442383, + "learning_rate": 2.9546437215238827e-07, + "logits/chosen": -1.9920495748519897, + "logits/rejected": -2.144864320755005, + "logps/chosen": -1.730360984802246, + "logps/rejected": -1.6745617389678955, + "loss": 3.6781, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.30360984802246, + "rewards/margins": -0.5579929351806641, + "rewards/rejected": -16.745616912841797, + "step": 19900 + }, + { + "epoch": 0.670902288584044, + "grad_norm": 33.12007141113281, + "learning_rate": 2.951960090333314e-07, + "logits/chosen": -1.970428228378296, + "logits/rejected": -1.892177939414978, + "logps/chosen": -2.675799608230591, + "logps/rejected": -2.7696533203125, + "loss": 3.9091, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.75799560546875, + "rewards/margins": 0.9385347366333008, + "rewards/rejected": -27.696533203125, + "step": 19905 + }, + { + "epoch": 0.6710708146550272, + "grad_norm": 25.728837966918945, + "learning_rate": 2.9492771678886675e-07, + "logits/chosen": -1.4254019260406494, + "logits/rejected": -1.8275810480117798, + "logps/chosen": -2.298034429550171, + "logps/rejected": -3.1815507411956787, + "loss": 2.7914, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.980342864990234, + "rewards/margins": 8.835161209106445, + "rewards/rejected": -31.815505981445312, + "step": 19910 + }, + { + "epoch": 0.6712393407260103, + "grad_norm": 27.136716842651367, + "learning_rate": 2.9465949551183966e-07, + "logits/chosen": -2.2184853553771973, + "logits/rejected": -2.2532405853271484, + "logps/chosen": -3.0618937015533447, + "logps/rejected": -3.311782121658325, + "loss": 3.4152, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.61893653869629, + "rewards/margins": 2.4988853931427, + "rewards/rejected": -33.117820739746094, + "step": 19915 + }, + { + "epoch": 0.6714078667969935, + "grad_norm": 25.29293441772461, + "learning_rate": 2.9439134529507127e-07, + "logits/chosen": -1.8177769184112549, + "logits/rejected": -2.0616250038146973, + "logps/chosen": -2.5643460750579834, + "logps/rejected": -2.580888509750366, + "loss": 3.0727, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.643463134765625, + "rewards/margins": 0.16542330384254456, + "rewards/rejected": -25.808883666992188, + "step": 19920 + }, + { + "epoch": 0.6715763928679767, + "grad_norm": 24.884119033813477, + "learning_rate": 2.9412326623135755e-07, + "logits/chosen": -1.701251745223999, + "logits/rejected": -2.263763904571533, + "logps/chosen": -2.0247585773468018, + "logps/rejected": -3.072613000869751, + "loss": 1.6354, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.24758529663086, + "rewards/margins": 10.478544235229492, + "rewards/rejected": -30.726131439208984, + "step": 19925 + }, + { + "epoch": 0.6717449189389598, + "grad_norm": 27.353504180908203, + "learning_rate": 2.9385525841347004e-07, + "logits/chosen": -1.5897279977798462, + "logits/rejected": -1.5412429571151733, + "logps/chosen": -1.9933135509490967, + "logps/rejected": -2.19027042388916, + "loss": 1.8097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.933135986328125, + "rewards/margins": 1.9695701599121094, + "rewards/rejected": -21.9027042388916, + "step": 19930 + }, + { + "epoch": 0.671913445009943, + "grad_norm": 36.17189025878906, + "learning_rate": 2.935873219341558e-07, + "logits/chosen": -1.9658607244491577, + "logits/rejected": -2.1116909980773926, + "logps/chosen": -2.512491464614868, + "logps/rejected": -2.5998706817626953, + "loss": 2.6017, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.124914169311523, + "rewards/margins": 0.873791515827179, + "rewards/rejected": -25.998706817626953, + "step": 19935 + }, + { + "epoch": 0.6720819710809263, + "grad_norm": 99.07889556884766, + "learning_rate": 2.9331945688613736e-07, + "logits/chosen": -1.8254836797714233, + "logits/rejected": -2.083385944366455, + "logps/chosen": -2.940991163253784, + "logps/rejected": -3.8571383953094482, + "loss": 1.2032, + "rewards/accuracies": 1.0, + "rewards/chosen": -29.409912109375, + "rewards/margins": 9.161473274230957, + "rewards/rejected": -38.571388244628906, + "step": 19940 + }, + { + "epoch": 0.6722504971519094, + "grad_norm": 22.57912254333496, + "learning_rate": 2.9305166336211187e-07, + "logits/chosen": -1.6052013635635376, + "logits/rejected": -1.779809594154358, + "logps/chosen": -1.822079062461853, + "logps/rejected": -2.1713473796844482, + "loss": 1.7866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.22079086303711, + "rewards/margins": 3.4926846027374268, + "rewards/rejected": -21.71347427368164, + "step": 19945 + }, + { + "epoch": 0.6724190232228926, + "grad_norm": 41.973777770996094, + "learning_rate": 2.9278394145475214e-07, + "logits/chosen": -1.430271863937378, + "logits/rejected": -1.4732592105865479, + "logps/chosen": -2.8167452812194824, + "logps/rejected": -3.216135025024414, + "loss": 1.7358, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.167449951171875, + "rewards/margins": 3.993901014328003, + "rewards/rejected": -32.161354064941406, + "step": 19950 + }, + { + "epoch": 0.6725875492938758, + "grad_norm": 87.2608413696289, + "learning_rate": 2.925162912567064e-07, + "logits/chosen": -2.058218240737915, + "logits/rejected": -2.2669763565063477, + "logps/chosen": -2.5060524940490723, + "logps/rejected": -2.9572110176086426, + "loss": 3.5245, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.06052589416504, + "rewards/margins": 4.511587142944336, + "rewards/rejected": -29.572113037109375, + "step": 19955 + }, + { + "epoch": 0.6727560753648589, + "grad_norm": 22.192533493041992, + "learning_rate": 2.922487128605977e-07, + "logits/chosen": -1.6927998065948486, + "logits/rejected": -1.853003740310669, + "logps/chosen": -2.4295566082000732, + "logps/rejected": -2.8034374713897705, + "loss": 3.1124, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.29556655883789, + "rewards/margins": 3.7388081550598145, + "rewards/rejected": -28.034374237060547, + "step": 19960 + }, + { + "epoch": 0.6729246014358421, + "grad_norm": 25.506179809570312, + "learning_rate": 2.9198120635902437e-07, + "logits/chosen": -2.323963165283203, + "logits/rejected": -2.281276226043701, + "logps/chosen": -1.8685518503189087, + "logps/rejected": -2.068497896194458, + "loss": 2.338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.685518264770508, + "rewards/margins": 1.9994609355926514, + "rewards/rejected": -20.684978485107422, + "step": 19965 + }, + { + "epoch": 0.6730931275068253, + "grad_norm": 40.8975830078125, + "learning_rate": 2.917137718445598e-07, + "logits/chosen": -1.6007277965545654, + "logits/rejected": -1.6875572204589844, + "logps/chosen": -2.6180386543273926, + "logps/rejected": -2.8947513103485107, + "loss": 3.3363, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.180383682250977, + "rewards/margins": 2.7671265602111816, + "rewards/rejected": -28.947513580322266, + "step": 19970 + }, + { + "epoch": 0.6732616535778085, + "grad_norm": 36.157615661621094, + "learning_rate": 2.9144640940975296e-07, + "logits/chosen": -1.505743384361267, + "logits/rejected": -1.3464056253433228, + "logps/chosen": -2.9090559482574463, + "logps/rejected": -2.8138246536254883, + "loss": 4.7922, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -29.090557098388672, + "rewards/margins": -0.9523128271102905, + "rewards/rejected": -28.138248443603516, + "step": 19975 + }, + { + "epoch": 0.6734301796487917, + "grad_norm": 44.5861930847168, + "learning_rate": 2.911791191471269e-07, + "logits/chosen": -1.250292181968689, + "logits/rejected": -1.2156884670257568, + "logps/chosen": -2.3468449115753174, + "logps/rejected": -2.5400891304016113, + "loss": 2.9251, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.46845054626465, + "rewards/margins": 1.932440161705017, + "rewards/rejected": -25.400888442993164, + "step": 19980 + }, + { + "epoch": 0.6735987057197749, + "grad_norm": 41.93446350097656, + "learning_rate": 2.909119011491805e-07, + "logits/chosen": -1.940731406211853, + "logits/rejected": -2.3662357330322266, + "logps/chosen": -1.8077341318130493, + "logps/rejected": -2.249481678009033, + "loss": 2.6443, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.07733917236328, + "rewards/margins": 4.417477607727051, + "rewards/rejected": -22.494815826416016, + "step": 19985 + }, + { + "epoch": 0.673767231790758, + "grad_norm": 24.274431228637695, + "learning_rate": 2.9064475550838764e-07, + "logits/chosen": -1.6842902898788452, + "logits/rejected": -2.0487284660339355, + "logps/chosen": -2.380117416381836, + "logps/rejected": -2.9363510608673096, + "loss": 2.1863, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.80117416381836, + "rewards/margins": 5.562333583831787, + "rewards/rejected": -29.363506317138672, + "step": 19990 + }, + { + "epoch": 0.6739357578617412, + "grad_norm": 19.593360900878906, + "learning_rate": 2.9037768231719636e-07, + "logits/chosen": -0.8565117716789246, + "logits/rejected": -1.156738042831421, + "logps/chosen": -2.1061644554138184, + "logps/rejected": -2.46700382232666, + "loss": 1.9911, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.0616455078125, + "rewards/margins": 3.608396053314209, + "rewards/rejected": -24.670042037963867, + "step": 19995 + }, + { + "epoch": 0.6741042839327244, + "grad_norm": 12.377226829528809, + "learning_rate": 2.9011068166803046e-07, + "logits/chosen": -1.6227328777313232, + "logits/rejected": -1.8104091882705688, + "logps/chosen": -2.387129306793213, + "logps/rejected": -2.4981789588928223, + "loss": 2.4878, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.871295928955078, + "rewards/margins": 1.1104968786239624, + "rewards/rejected": -24.981792449951172, + "step": 20000 + }, + { + "epoch": 0.6741042839327244, + "eval_logits/chosen": -2.1333370208740234, + "eval_logits/rejected": -2.2969448566436768, + "eval_logps/chosen": -2.2102105617523193, + "eval_logps/rejected": -2.3554303646087646, + "eval_loss": 3.0430943965911865, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.10210418701172, + "eval_rewards/margins": 1.4521974325180054, + "eval_rewards/rejected": -23.554304122924805, + "eval_runtime": 12.9424, + "eval_samples_per_second": 7.727, + "eval_steps_per_second": 1.932, + "step": 20000 + }, + { + "epoch": 0.6742728100037075, + "grad_norm": 4.8359270095825195, + "learning_rate": 2.898437536532885e-07, + "logits/chosen": -1.0973485708236694, + "logits/rejected": -1.7246170043945312, + "logps/chosen": -3.404980182647705, + "logps/rejected": -4.6295485496521, + "loss": 1.127, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -34.04979705810547, + "rewards/margins": 12.245687484741211, + "rewards/rejected": -46.29548645019531, + "step": 20005 + }, + { + "epoch": 0.6744413360746907, + "grad_norm": 67.98348236083984, + "learning_rate": 2.8957689836534336e-07, + "logits/chosen": -1.6053552627563477, + "logits/rejected": -1.9920330047607422, + "logps/chosen": -2.480193614959717, + "logps/rejected": -2.6489195823669434, + "loss": 3.5401, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.801939010620117, + "rewards/margins": 1.687256097793579, + "rewards/rejected": -26.489192962646484, + "step": 20010 + }, + { + "epoch": 0.674609862145674, + "grad_norm": 42.41163635253906, + "learning_rate": 2.893101158965434e-07, + "logits/chosen": -1.9345791339874268, + "logits/rejected": -2.219799757003784, + "logps/chosen": -2.2220282554626465, + "logps/rejected": -2.655217170715332, + "loss": 2.3733, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.22028160095215, + "rewards/margins": 4.331890106201172, + "rewards/rejected": -26.552169799804688, + "step": 20015 + }, + { + "epoch": 0.6747783882166571, + "grad_norm": 35.31697463989258, + "learning_rate": 2.890434063392114e-07, + "logits/chosen": -1.5198293924331665, + "logits/rejected": -1.8901045322418213, + "logps/chosen": -2.567343235015869, + "logps/rejected": -3.333820343017578, + "loss": 2.5047, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.67343521118164, + "rewards/margins": 7.66477108001709, + "rewards/rejected": -33.33820724487305, + "step": 20020 + }, + { + "epoch": 0.6749469142876403, + "grad_norm": 29.586748123168945, + "learning_rate": 2.887767697856454e-07, + "logits/chosen": -1.6142957210540771, + "logits/rejected": -2.133453369140625, + "logps/chosen": -2.0409905910491943, + "logps/rejected": -2.410146713256836, + "loss": 2.2076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.4099063873291, + "rewards/margins": 3.6915602684020996, + "rewards/rejected": -24.10146713256836, + "step": 20025 + }, + { + "epoch": 0.6751154403586235, + "grad_norm": 104.16493225097656, + "learning_rate": 2.885102063281173e-07, + "logits/chosen": -1.5088083744049072, + "logits/rejected": -1.4387980699539185, + "logps/chosen": -2.9790401458740234, + "logps/rejected": -3.0391666889190674, + "loss": 4.008, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.790401458740234, + "rewards/margins": 0.6012633442878723, + "rewards/rejected": -30.39166831970215, + "step": 20030 + }, + { + "epoch": 0.6752839664296066, + "grad_norm": 37.88565444946289, + "learning_rate": 2.882437160588744e-07, + "logits/chosen": -1.602617859840393, + "logits/rejected": -1.81451416015625, + "logps/chosen": -2.366647243499756, + "logps/rejected": -2.5041327476501465, + "loss": 2.6427, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.666473388671875, + "rewards/margins": 1.374853253364563, + "rewards/rejected": -25.04132652282715, + "step": 20035 + }, + { + "epoch": 0.6754524925005898, + "grad_norm": 43.18614196777344, + "learning_rate": 2.879772990701387e-07, + "logits/chosen": -1.196619987487793, + "logits/rejected": -1.1416454315185547, + "logps/chosen": -2.1735050678253174, + "logps/rejected": -2.4336700439453125, + "loss": 2.4065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.73505401611328, + "rewards/margins": 2.6016478538513184, + "rewards/rejected": -24.336700439453125, + "step": 20040 + }, + { + "epoch": 0.675621018571573, + "grad_norm": 32.108089447021484, + "learning_rate": 2.8771095545410627e-07, + "logits/chosen": -1.4238228797912598, + "logits/rejected": -1.4724472761154175, + "logps/chosen": -2.4174728393554688, + "logps/rejected": -2.5063812732696533, + "loss": 3.1065, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.174728393554688, + "rewards/margins": 0.8890829086303711, + "rewards/rejected": -25.063812255859375, + "step": 20045 + }, + { + "epoch": 0.6757895446425563, + "grad_norm": 30.244998931884766, + "learning_rate": 2.874446853029483e-07, + "logits/chosen": -1.2692375183105469, + "logits/rejected": -1.465065598487854, + "logps/chosen": -2.7822213172912598, + "logps/rejected": -2.912348508834839, + "loss": 3.6016, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.822214126586914, + "rewards/margins": 1.3012707233428955, + "rewards/rejected": -29.123483657836914, + "step": 20050 + }, + { + "epoch": 0.6759580707135394, + "grad_norm": 24.151615142822266, + "learning_rate": 2.8717848870881033e-07, + "logits/chosen": -1.6752557754516602, + "logits/rejected": -2.0266435146331787, + "logps/chosen": -2.9445536136627197, + "logps/rejected": -3.129530429840088, + "loss": 5.0165, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.445537567138672, + "rewards/margins": 1.8497695922851562, + "rewards/rejected": -31.295307159423828, + "step": 20055 + }, + { + "epoch": 0.6761265967845226, + "grad_norm": 28.681072235107422, + "learning_rate": 2.869123657638126e-07, + "logits/chosen": -2.0757529735565186, + "logits/rejected": -2.134483575820923, + "logps/chosen": -1.8745758533477783, + "logps/rejected": -2.227299928665161, + "loss": 1.2008, + "rewards/accuracies": 1.0, + "rewards/chosen": -18.745756149291992, + "rewards/margins": 3.5272421836853027, + "rewards/rejected": -22.272998809814453, + "step": 20060 + }, + { + "epoch": 0.6762951228555057, + "grad_norm": 31.584806442260742, + "learning_rate": 2.8664631656004984e-07, + "logits/chosen": -1.453299880027771, + "logits/rejected": -1.7309995889663696, + "logps/chosen": -2.1703834533691406, + "logps/rejected": -2.3654937744140625, + "loss": 2.009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.703832626342773, + "rewards/margins": 1.9511051177978516, + "rewards/rejected": -23.65494155883789, + "step": 20065 + }, + { + "epoch": 0.6764636489264889, + "grad_norm": 25.078779220581055, + "learning_rate": 2.863803411895911e-07, + "logits/chosen": -1.1493384838104248, + "logits/rejected": -1.3260166645050049, + "logps/chosen": -2.2854998111724854, + "logps/rejected": -2.580888509750366, + "loss": 2.1081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.854999542236328, + "rewards/margins": 2.9538865089416504, + "rewards/rejected": -25.808887481689453, + "step": 20070 + }, + { + "epoch": 0.6766321749974721, + "grad_norm": 38.19968795776367, + "learning_rate": 2.8611443974448015e-07, + "logits/chosen": -1.5478785037994385, + "logits/rejected": -1.4771803617477417, + "logps/chosen": -2.46025013923645, + "logps/rejected": -2.3180558681488037, + "loss": 4.5903, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.602500915527344, + "rewards/margins": -1.421942114830017, + "rewards/rejected": -23.180557250976562, + "step": 20075 + }, + { + "epoch": 0.6768007010684552, + "grad_norm": 35.97868347167969, + "learning_rate": 2.858486123167346e-07, + "logits/chosen": -2.461002826690674, + "logits/rejected": -2.8161003589630127, + "logps/chosen": -2.063539743423462, + "logps/rejected": -2.8001794815063477, + "loss": 1.391, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.635398864746094, + "rewards/margins": 7.366394996643066, + "rewards/rejected": -28.001794815063477, + "step": 20080 + }, + { + "epoch": 0.6769692271394385, + "grad_norm": 78.83557891845703, + "learning_rate": 2.855828589983472e-07, + "logits/chosen": -1.955715537071228, + "logits/rejected": -1.9469232559204102, + "logps/chosen": -1.9905261993408203, + "logps/rejected": -1.7431453466415405, + "loss": 5.602, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.905261993408203, + "rewards/margins": -2.4738082885742188, + "rewards/rejected": -17.431453704833984, + "step": 20085 + }, + { + "epoch": 0.6771377532104217, + "grad_norm": 51.79839324951172, + "learning_rate": 2.8531717988128463e-07, + "logits/chosen": -1.5449891090393066, + "logits/rejected": -1.8164851665496826, + "logps/chosen": -2.6464550495147705, + "logps/rejected": -3.3289992809295654, + "loss": 3.4826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.464550018310547, + "rewards/margins": 6.825445652008057, + "rewards/rejected": -33.28999710083008, + "step": 20090 + }, + { + "epoch": 0.6773062792814049, + "grad_norm": 30.013301849365234, + "learning_rate": 2.8505157505748804e-07, + "logits/chosen": -1.1776177883148193, + "logits/rejected": -1.3167506456375122, + "logps/chosen": -2.100921154022217, + "logps/rejected": -2.1228299140930176, + "loss": 3.0622, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.00921058654785, + "rewards/margins": 0.21908855438232422, + "rewards/rejected": -21.22829818725586, + "step": 20095 + }, + { + "epoch": 0.677474805352388, + "grad_norm": 19.57428741455078, + "learning_rate": 2.8478604461887255e-07, + "logits/chosen": -1.5491914749145508, + "logits/rejected": -1.5144532918930054, + "logps/chosen": -2.411500930786133, + "logps/rejected": -2.600080966949463, + "loss": 3.1888, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.115009307861328, + "rewards/margins": 1.8858016729354858, + "rewards/rejected": -26.000812530517578, + "step": 20100 + }, + { + "epoch": 0.6776433314233712, + "grad_norm": 40.35907745361328, + "learning_rate": 2.845205886573279e-07, + "logits/chosen": -1.5372756719589233, + "logits/rejected": -1.847243309020996, + "logps/chosen": -2.269357919692993, + "logps/rejected": -2.4268269538879395, + "loss": 2.3753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.693578720092773, + "rewards/margins": 1.5746897459030151, + "rewards/rejected": -24.268268585205078, + "step": 20105 + }, + { + "epoch": 0.6778118574943544, + "grad_norm": 47.303367614746094, + "learning_rate": 2.842552072647182e-07, + "logits/chosen": -1.74314284324646, + "logits/rejected": -1.6884737014770508, + "logps/chosen": -2.1921029090881348, + "logps/rejected": -3.185455560684204, + "loss": 2.4485, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.921030044555664, + "rewards/margins": 9.933526039123535, + "rewards/rejected": -31.854557037353516, + "step": 20110 + }, + { + "epoch": 0.6779803835653375, + "grad_norm": 123.9562759399414, + "learning_rate": 2.83989900532881e-07, + "logits/chosen": -1.790907859802246, + "logits/rejected": -2.0922181606292725, + "logps/chosen": -2.6318843364715576, + "logps/rejected": -2.8555569648742676, + "loss": 2.0623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.3188419342041, + "rewards/margins": 2.2367305755615234, + "rewards/rejected": -28.555572509765625, + "step": 20115 + }, + { + "epoch": 0.6781489096363207, + "grad_norm": 21.239953994750977, + "learning_rate": 2.8372466855362883e-07, + "logits/chosen": -2.271080493927002, + "logits/rejected": -3.0720982551574707, + "logps/chosen": -3.933926820755005, + "logps/rejected": -5.070860385894775, + "loss": 6.406, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -39.33926773071289, + "rewards/margins": 11.36933422088623, + "rewards/rejected": -50.70859909057617, + "step": 20120 + }, + { + "epoch": 0.678317435707304, + "grad_norm": 23.059619903564453, + "learning_rate": 2.834595114187479e-07, + "logits/chosen": -1.4889625310897827, + "logits/rejected": -1.7724645137786865, + "logps/chosen": -1.981604814529419, + "logps/rejected": -2.2343153953552246, + "loss": 2.1921, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.81604766845703, + "rewards/margins": 2.5271058082580566, + "rewards/rejected": -22.343151092529297, + "step": 20125 + }, + { + "epoch": 0.6784859617782871, + "grad_norm": 24.74485969543457, + "learning_rate": 2.83194429219999e-07, + "logits/chosen": -1.1593676805496216, + "logits/rejected": -1.7265560626983643, + "logps/chosen": -2.5378623008728027, + "logps/rejected": -3.2800393104553223, + "loss": 1.6983, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.378623962402344, + "rewards/margins": 7.421769618988037, + "rewards/rejected": -32.800392150878906, + "step": 20130 + }, + { + "epoch": 0.6786544878492703, + "grad_norm": 98.45122528076172, + "learning_rate": 2.829294220491161e-07, + "logits/chosen": -1.537777304649353, + "logits/rejected": -1.8030803203582764, + "logps/chosen": -2.5062530040740967, + "logps/rejected": -3.077735185623169, + "loss": 3.7697, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.062528610229492, + "rewards/margins": 5.714821815490723, + "rewards/rejected": -30.7773494720459, + "step": 20135 + }, + { + "epoch": 0.6788230139202535, + "grad_norm": 19.68750762939453, + "learning_rate": 2.82664489997808e-07, + "logits/chosen": -1.295419692993164, + "logits/rejected": -1.708397626876831, + "logps/chosen": -2.520289659500122, + "logps/rejected": -2.7188165187835693, + "loss": 3.2872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.20289421081543, + "rewards/margins": 1.985269546508789, + "rewards/rejected": -27.18816566467285, + "step": 20140 + }, + { + "epoch": 0.6789915399912366, + "grad_norm": 16.885944366455078, + "learning_rate": 2.823996331577574e-07, + "logits/chosen": -1.9829976558685303, + "logits/rejected": -2.066995143890381, + "logps/chosen": -2.4660446643829346, + "logps/rejected": -3.0983173847198486, + "loss": 1.8963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.660446166992188, + "rewards/margins": 6.322728633880615, + "rewards/rejected": -30.983173370361328, + "step": 20145 + }, + { + "epoch": 0.6791600660622198, + "grad_norm": 30.617427825927734, + "learning_rate": 2.821348516206204e-07, + "logits/chosen": -1.7437076568603516, + "logits/rejected": -2.1927478313446045, + "logps/chosen": -1.8544002771377563, + "logps/rejected": -2.3719706535339355, + "loss": 1.7637, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.544002532958984, + "rewards/margins": 5.1757025718688965, + "rewards/rejected": -23.719707489013672, + "step": 20150 + }, + { + "epoch": 0.679328592133203, + "grad_norm": 23.765220642089844, + "learning_rate": 2.8187014547802783e-07, + "logits/chosen": -1.847679853439331, + "logits/rejected": -1.8367445468902588, + "logps/chosen": -2.689016819000244, + "logps/rejected": -3.5438976287841797, + "loss": 1.2487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.890167236328125, + "rewards/margins": 8.548810005187988, + "rewards/rejected": -35.4389762878418, + "step": 20155 + }, + { + "epoch": 0.6794971182041862, + "grad_norm": 33.91009521484375, + "learning_rate": 2.816055148215839e-07, + "logits/chosen": -1.806243658065796, + "logits/rejected": -1.8416646718978882, + "logps/chosen": -1.8404827117919922, + "logps/rejected": -1.952391266822815, + "loss": 2.6134, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.404827117919922, + "rewards/margins": 1.1190874576568604, + "rewards/rejected": -19.523914337158203, + "step": 20160 + }, + { + "epoch": 0.6796656442751694, + "grad_norm": 32.12192153930664, + "learning_rate": 2.813409597428671e-07, + "logits/chosen": -1.9678351879119873, + "logits/rejected": -2.0218756198883057, + "logps/chosen": -3.2655386924743652, + "logps/rejected": -3.7912089824676514, + "loss": 3.1875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.65538787841797, + "rewards/margins": 5.256703853607178, + "rewards/rejected": -37.91209030151367, + "step": 20165 + }, + { + "epoch": 0.6798341703461526, + "grad_norm": 43.30851745605469, + "learning_rate": 2.8107648033342914e-07, + "logits/chosen": -1.8117700815200806, + "logits/rejected": -1.8282169103622437, + "logps/chosen": -2.4342598915100098, + "logps/rejected": -2.890298366546631, + "loss": 2.7606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.342599868774414, + "rewards/margins": 4.560386657714844, + "rewards/rejected": -28.90298843383789, + "step": 20170 + }, + { + "epoch": 0.6800026964171357, + "grad_norm": 34.93639373779297, + "learning_rate": 2.80812076684796e-07, + "logits/chosen": -1.8555113077163696, + "logits/rejected": -1.8551677465438843, + "logps/chosen": -2.5153555870056152, + "logps/rejected": -2.742401599884033, + "loss": 3.1483, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.153554916381836, + "rewards/margins": 2.2704625129699707, + "rewards/rejected": -27.42401695251465, + "step": 20175 + }, + { + "epoch": 0.6801712224881189, + "grad_norm": 44.3549919128418, + "learning_rate": 2.805477488884677e-07, + "logits/chosen": -1.478435754776001, + "logits/rejected": -1.7246806621551514, + "logps/chosen": -1.7173773050308228, + "logps/rejected": -1.8425076007843018, + "loss": 3.089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.173770904541016, + "rewards/margins": 1.2513021230697632, + "rewards/rejected": -18.42507553100586, + "step": 20180 + }, + { + "epoch": 0.6803397485591021, + "grad_norm": 122.29318237304688, + "learning_rate": 2.8028349703591727e-07, + "logits/chosen": -1.7741448879241943, + "logits/rejected": -1.771937608718872, + "logps/chosen": -2.4081344604492188, + "logps/rejected": -2.4229705333709717, + "loss": 4.366, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.081344604492188, + "rewards/margins": 0.14835968613624573, + "rewards/rejected": -24.22970199584961, + "step": 20185 + }, + { + "epoch": 0.6805082746300852, + "grad_norm": 31.629911422729492, + "learning_rate": 2.8001932121859195e-07, + "logits/chosen": -1.6347239017486572, + "logits/rejected": -1.6003844738006592, + "logps/chosen": -1.9684116840362549, + "logps/rejected": -1.9572652578353882, + "loss": 3.2296, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.68411636352539, + "rewards/margins": -0.11146555095911026, + "rewards/rejected": -19.572650909423828, + "step": 20190 + }, + { + "epoch": 0.6806768007010685, + "grad_norm": 36.26969528198242, + "learning_rate": 2.7975522152791274e-07, + "logits/chosen": -1.643143892288208, + "logits/rejected": -1.7509486675262451, + "logps/chosen": -2.7773661613464355, + "logps/rejected": -3.1645007133483887, + "loss": 2.8131, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.773662567138672, + "rewards/margins": 3.871345043182373, + "rewards/rejected": -31.645008087158203, + "step": 20195 + }, + { + "epoch": 0.6808453267720517, + "grad_norm": 26.0379638671875, + "learning_rate": 2.7949119805527406e-07, + "logits/chosen": -1.2348109483718872, + "logits/rejected": -1.275914192199707, + "logps/chosen": -2.392324686050415, + "logps/rejected": -2.7133028507232666, + "loss": 0.9852, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.92324447631836, + "rewards/margins": 3.2097830772399902, + "rewards/rejected": -27.13302993774414, + "step": 20200 + }, + { + "epoch": 0.6810138528430348, + "grad_norm": 36.44670104980469, + "learning_rate": 2.7922725089204425e-07, + "logits/chosen": -1.4761666059494019, + "logits/rejected": -1.729008674621582, + "logps/chosen": -1.9980462789535522, + "logps/rejected": -2.1607513427734375, + "loss": 2.8235, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.9804630279541, + "rewards/margins": 1.6270501613616943, + "rewards/rejected": -21.607511520385742, + "step": 20205 + }, + { + "epoch": 0.681182378914018, + "grad_norm": 14.305713653564453, + "learning_rate": 2.789633801295645e-07, + "logits/chosen": -1.8914934396743774, + "logits/rejected": -2.159989833831787, + "logps/chosen": -1.9110405445098877, + "logps/rejected": -2.0172300338745117, + "loss": 2.6372, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.110403060913086, + "rewards/margins": 1.0618977546691895, + "rewards/rejected": -20.17230224609375, + "step": 20210 + }, + { + "epoch": 0.6813509049850012, + "grad_norm": 39.7442741394043, + "learning_rate": 2.786995858591505e-07, + "logits/chosen": -1.5674628019332886, + "logits/rejected": -1.659325361251831, + "logps/chosen": -2.3448400497436523, + "logps/rejected": -2.6547160148620605, + "loss": 3.0196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.448400497436523, + "rewards/margins": 3.0987625122070312, + "rewards/rejected": -26.547161102294922, + "step": 20215 + }, + { + "epoch": 0.6815194310559843, + "grad_norm": 35.93793487548828, + "learning_rate": 2.784358681720909e-07, + "logits/chosen": -1.6853892803192139, + "logits/rejected": -1.776785135269165, + "logps/chosen": -1.8474948406219482, + "logps/rejected": -1.982184648513794, + "loss": 2.7998, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.47494888305664, + "rewards/margins": 1.3468974828720093, + "rewards/rejected": -19.821847915649414, + "step": 20220 + }, + { + "epoch": 0.6816879571269675, + "grad_norm": 24.072486877441406, + "learning_rate": 2.7817222715964807e-07, + "logits/chosen": -1.866061806678772, + "logits/rejected": -2.0546302795410156, + "logps/chosen": -2.9646809101104736, + "logps/rejected": -3.516381025314331, + "loss": 2.0249, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.646808624267578, + "rewards/margins": 5.517003536224365, + "rewards/rejected": -35.16381072998047, + "step": 20225 + }, + { + "epoch": 0.6818564831979507, + "grad_norm": 41.478538513183594, + "learning_rate": 2.779086629130577e-07, + "logits/chosen": -1.5552170276641846, + "logits/rejected": -2.01354718208313, + "logps/chosen": -2.577141284942627, + "logps/rejected": -3.2818055152893066, + "loss": 1.9669, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.771411895751953, + "rewards/margins": 7.046643257141113, + "rewards/rejected": -32.81805419921875, + "step": 20230 + }, + { + "epoch": 0.682025009268934, + "grad_norm": 34.2642707824707, + "learning_rate": 2.776451755235293e-07, + "logits/chosen": -1.435533881187439, + "logits/rejected": -1.877467155456543, + "logps/chosen": -2.060547351837158, + "logps/rejected": -2.422755002975464, + "loss": 2.2323, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.605472564697266, + "rewards/margins": 3.622079372406006, + "rewards/rejected": -24.227550506591797, + "step": 20235 + }, + { + "epoch": 0.6821935353399171, + "grad_norm": 45.35870361328125, + "learning_rate": 2.77381765082245e-07, + "logits/chosen": -1.6485698223114014, + "logits/rejected": -1.6269375085830688, + "logps/chosen": -3.022157669067383, + "logps/rejected": -3.5015807151794434, + "loss": 3.5989, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.221576690673828, + "rewards/margins": 4.7942352294921875, + "rewards/rejected": -35.01581573486328, + "step": 20240 + }, + { + "epoch": 0.6823620614109003, + "grad_norm": 42.239959716796875, + "learning_rate": 2.77118431680361e-07, + "logits/chosen": -1.5377813577651978, + "logits/rejected": -1.3784878253936768, + "logps/chosen": -2.4813880920410156, + "logps/rejected": -3.0015201568603516, + "loss": 4.149, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.813879013061523, + "rewards/margins": 5.201320648193359, + "rewards/rejected": -30.015201568603516, + "step": 20245 + }, + { + "epoch": 0.6825305874818834, + "grad_norm": 27.377073287963867, + "learning_rate": 2.768551754090068e-07, + "logits/chosen": -1.577036738395691, + "logits/rejected": -1.7084392309188843, + "logps/chosen": -2.2118425369262695, + "logps/rejected": -2.247002363204956, + "loss": 3.2371, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.118425369262695, + "rewards/margins": 0.35159969329833984, + "rewards/rejected": -22.47002601623535, + "step": 20250 + }, + { + "epoch": 0.6826991135528666, + "grad_norm": 19.689481735229492, + "learning_rate": 2.7659199635928465e-07, + "logits/chosen": -1.7298429012298584, + "logits/rejected": -2.325814962387085, + "logps/chosen": -1.8912522792816162, + "logps/rejected": -2.167006015777588, + "loss": 2.8236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.91252326965332, + "rewards/margins": 2.7575371265411377, + "rewards/rejected": -21.670061111450195, + "step": 20255 + }, + { + "epoch": 0.6828676396238498, + "grad_norm": 33.25885009765625, + "learning_rate": 2.763288946222707e-07, + "logits/chosen": -1.6983985900878906, + "logits/rejected": -1.697054147720337, + "logps/chosen": -2.0792253017425537, + "logps/rejected": -2.391404867172241, + "loss": 2.6382, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.792253494262695, + "rewards/margins": 3.1217968463897705, + "rewards/rejected": -23.914051055908203, + "step": 20260 + }, + { + "epoch": 0.6830361656948329, + "grad_norm": 27.963102340698242, + "learning_rate": 2.7606587028901395e-07, + "logits/chosen": -1.5231996774673462, + "logits/rejected": -1.3576040267944336, + "logps/chosen": -2.1193814277648926, + "logps/rejected": -2.295095443725586, + "loss": 2.6454, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.19381332397461, + "rewards/margins": 1.7571430206298828, + "rewards/rejected": -22.950956344604492, + "step": 20265 + }, + { + "epoch": 0.6832046917658162, + "grad_norm": 28.47199821472168, + "learning_rate": 2.75802923450537e-07, + "logits/chosen": -1.8896493911743164, + "logits/rejected": -1.9708207845687866, + "logps/chosen": -2.278593063354492, + "logps/rejected": -2.3892312049865723, + "loss": 2.5397, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.785930633544922, + "rewards/margins": 1.1063793897628784, + "rewards/rejected": -23.89231300354004, + "step": 20270 + }, + { + "epoch": 0.6833732178367994, + "grad_norm": 28.51179313659668, + "learning_rate": 2.7554005419783516e-07, + "logits/chosen": -1.6566784381866455, + "logits/rejected": -1.8939708471298218, + "logps/chosen": -2.288196325302124, + "logps/rejected": -2.2760863304138184, + "loss": 3.7188, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.881961822509766, + "rewards/margins": -0.12109851837158203, + "rewards/rejected": -22.7608642578125, + "step": 20275 + }, + { + "epoch": 0.6835417439077826, + "grad_norm": 29.09737205505371, + "learning_rate": 2.752772626218771e-07, + "logits/chosen": -1.6456453800201416, + "logits/rejected": -1.872393012046814, + "logps/chosen": -2.492414951324463, + "logps/rejected": -3.059534788131714, + "loss": 2.0609, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.924148559570312, + "rewards/margins": 5.671200752258301, + "rewards/rejected": -30.595348358154297, + "step": 20280 + }, + { + "epoch": 0.6837102699787657, + "grad_norm": 23.130584716796875, + "learning_rate": 2.7501454881360496e-07, + "logits/chosen": -2.1061911582946777, + "logits/rejected": -2.1469058990478516, + "logps/chosen": -2.26118540763855, + "logps/rejected": -2.4657797813415527, + "loss": 2.4224, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.611852645874023, + "rewards/margins": 2.0459442138671875, + "rewards/rejected": -24.65779685974121, + "step": 20285 + }, + { + "epoch": 0.6838787960497489, + "grad_norm": 4.321815013885498, + "learning_rate": 2.7475191286393316e-07, + "logits/chosen": -1.7793070077896118, + "logits/rejected": -1.7671607732772827, + "logps/chosen": -1.7020763158798218, + "logps/rejected": -1.816414475440979, + "loss": 3.0518, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.020761489868164, + "rewards/margins": 1.1433823108673096, + "rewards/rejected": -18.16414451599121, + "step": 20290 + }, + { + "epoch": 0.684047322120732, + "grad_norm": 19.862804412841797, + "learning_rate": 2.7448935486374994e-07, + "logits/chosen": -2.074662685394287, + "logits/rejected": -2.0828239917755127, + "logps/chosen": -2.2630491256713867, + "logps/rejected": -2.365959644317627, + "loss": 3.3963, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.630489349365234, + "rewards/margins": 1.0291064977645874, + "rewards/rejected": -23.659595489501953, + "step": 20295 + }, + { + "epoch": 0.6842158481917152, + "grad_norm": 67.76535034179688, + "learning_rate": 2.7422687490391627e-07, + "logits/chosen": -1.5843805074691772, + "logits/rejected": -1.6821810007095337, + "logps/chosen": -2.2501184940338135, + "logps/rejected": -2.322180986404419, + "loss": 2.7301, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.50118637084961, + "rewards/margins": 0.7206247448921204, + "rewards/rejected": -23.221811294555664, + "step": 20300 + }, + { + "epoch": 0.6843843742626985, + "grad_norm": 20.505523681640625, + "learning_rate": 2.739644730752662e-07, + "logits/chosen": -1.7529728412628174, + "logits/rejected": -1.9020655155181885, + "logps/chosen": -2.148254871368408, + "logps/rejected": -2.5298733711242676, + "loss": 1.4661, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.4825496673584, + "rewards/margins": 3.816183567047119, + "rewards/rejected": -25.29873275756836, + "step": 20305 + }, + { + "epoch": 0.6845529003336817, + "grad_norm": 101.92227172851562, + "learning_rate": 2.737021494686064e-07, + "logits/chosen": -2.2013256549835205, + "logits/rejected": -2.3193464279174805, + "logps/chosen": -2.3510169982910156, + "logps/rejected": -2.4058213233947754, + "loss": 2.9042, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.51017189025879, + "rewards/margins": 0.5480403900146484, + "rewards/rejected": -24.058212280273438, + "step": 20310 + }, + { + "epoch": 0.6847214264046648, + "grad_norm": 22.332473754882812, + "learning_rate": 2.734399041747169e-07, + "logits/chosen": -1.2684409618377686, + "logits/rejected": -1.8357795476913452, + "logps/chosen": -2.1953673362731934, + "logps/rejected": -2.5396382808685303, + "loss": 1.8932, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.953670501708984, + "rewards/margins": 3.4427096843719482, + "rewards/rejected": -25.396381378173828, + "step": 20315 + }, + { + "epoch": 0.684889952475648, + "grad_norm": 9.619364738464355, + "learning_rate": 2.7317773728435067e-07, + "logits/chosen": -1.5909945964813232, + "logits/rejected": -1.9193031787872314, + "logps/chosen": -2.6117565631866455, + "logps/rejected": -3.1586432456970215, + "loss": 2.584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.117565155029297, + "rewards/margins": 5.468869686126709, + "rewards/rejected": -31.5864315032959, + "step": 20320 + }, + { + "epoch": 0.6850584785466312, + "grad_norm": 39.636714935302734, + "learning_rate": 2.7291564888823287e-07, + "logits/chosen": -1.0083258152008057, + "logits/rejected": -1.1197991371154785, + "logps/chosen": -2.370436906814575, + "logps/rejected": -2.7647347450256348, + "loss": 2.3407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.704368591308594, + "rewards/margins": 3.942978620529175, + "rewards/rejected": -27.6473445892334, + "step": 20325 + }, + { + "epoch": 0.6852270046176143, + "grad_norm": 47.81154251098633, + "learning_rate": 2.726536390770623e-07, + "logits/chosen": -1.4197077751159668, + "logits/rejected": -2.2119290828704834, + "logps/chosen": -2.0732133388519287, + "logps/rejected": -2.5548577308654785, + "loss": 3.0506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.732135772705078, + "rewards/margins": 4.816441059112549, + "rewards/rejected": -25.54857635498047, + "step": 20330 + }, + { + "epoch": 0.6853955306885975, + "grad_norm": 29.99540901184082, + "learning_rate": 2.7239170794151007e-07, + "logits/chosen": -2.2027525901794434, + "logits/rejected": -2.388345718383789, + "logps/chosen": -3.244304656982422, + "logps/rejected": -3.8606619834899902, + "loss": 0.961, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -32.44304656982422, + "rewards/margins": 6.163573265075684, + "rewards/rejected": -38.60662078857422, + "step": 20335 + }, + { + "epoch": 0.6855640567595807, + "grad_norm": 17.19338607788086, + "learning_rate": 2.7212985557222056e-07, + "logits/chosen": -2.053317070007324, + "logits/rejected": -2.0944182872772217, + "logps/chosen": -2.624318838119507, + "logps/rejected": -2.6115059852600098, + "loss": 4.0212, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.243188858032227, + "rewards/margins": -0.12812776863574982, + "rewards/rejected": -26.115060806274414, + "step": 20340 + }, + { + "epoch": 0.6857325828305639, + "grad_norm": 24.04823112487793, + "learning_rate": 2.718680820598101e-07, + "logits/chosen": -1.5281686782836914, + "logits/rejected": -2.1130008697509766, + "logps/chosen": -1.9165928363800049, + "logps/rejected": -2.1830132007598877, + "loss": 2.018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.16592788696289, + "rewards/margins": 2.6642043590545654, + "rewards/rejected": -21.83013343811035, + "step": 20345 + }, + { + "epoch": 0.6859011089015471, + "grad_norm": 1.5715937479399145e-05, + "learning_rate": 2.716063874948684e-07, + "logits/chosen": -1.7485952377319336, + "logits/rejected": -1.8180173635482788, + "logps/chosen": -2.7975385189056396, + "logps/rejected": -3.3084304332733154, + "loss": 2.3908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.975383758544922, + "rewards/margins": 5.108920097351074, + "rewards/rejected": -33.08430480957031, + "step": 20350 + }, + { + "epoch": 0.6860696349725303, + "grad_norm": 25.99323272705078, + "learning_rate": 2.7134477196795764e-07, + "logits/chosen": -2.0262441635131836, + "logits/rejected": -2.1174750328063965, + "logps/chosen": -1.8926416635513306, + "logps/rejected": -2.128100633621216, + "loss": 3.0479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.926416397094727, + "rewards/margins": 2.35459041595459, + "rewards/rejected": -21.281007766723633, + "step": 20355 + }, + { + "epoch": 0.6862381610435134, + "grad_norm": 23.152528762817383, + "learning_rate": 2.7108323556961266e-07, + "logits/chosen": -1.719618558883667, + "logits/rejected": -2.6587231159210205, + "logps/chosen": -3.0434048175811768, + "logps/rejected": -4.10079288482666, + "loss": 1.2253, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -30.43404769897461, + "rewards/margins": 10.573877334594727, + "rewards/rejected": -41.00792694091797, + "step": 20360 + }, + { + "epoch": 0.6864066871144966, + "grad_norm": 15.43291187286377, + "learning_rate": 2.7082177839034087e-07, + "logits/chosen": -2.2665624618530273, + "logits/rejected": -2.249471426010132, + "logps/chosen": -2.7999186515808105, + "logps/rejected": -2.9842801094055176, + "loss": 3.6626, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.999187469482422, + "rewards/margins": 1.8436133861541748, + "rewards/rejected": -29.84280014038086, + "step": 20365 + }, + { + "epoch": 0.6865752131854798, + "grad_norm": 51.50522232055664, + "learning_rate": 2.705604005206223e-07, + "logits/chosen": -1.8549646139144897, + "logits/rejected": -1.838091254234314, + "logps/chosen": -2.249647617340088, + "logps/rejected": -2.5575718879699707, + "loss": 3.3206, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.496477127075195, + "rewards/margins": 3.079240560531616, + "rewards/rejected": -25.57571792602539, + "step": 20370 + }, + { + "epoch": 0.6867437392564629, + "grad_norm": 33.62204360961914, + "learning_rate": 2.7029910205090975e-07, + "logits/chosen": -1.884319543838501, + "logits/rejected": -1.66828191280365, + "logps/chosen": -2.4199607372283936, + "logps/rejected": -2.259284019470215, + "loss": 5.1467, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.19960594177246, + "rewards/margins": -1.6067683696746826, + "rewards/rejected": -22.59284019470215, + "step": 20375 + }, + { + "epoch": 0.6869122653274462, + "grad_norm": 187.69085693359375, + "learning_rate": 2.70037883071628e-07, + "logits/chosen": -2.0082826614379883, + "logits/rejected": -1.971462607383728, + "logps/chosen": -3.232483386993408, + "logps/rejected": -3.4104182720184326, + "loss": 2.9896, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -32.324832916259766, + "rewards/margins": 1.7793477773666382, + "rewards/rejected": -34.10417938232422, + "step": 20380 + }, + { + "epoch": 0.6870807913984294, + "grad_norm": 37.74311065673828, + "learning_rate": 2.697767436731747e-07, + "logits/chosen": -1.6654325723648071, + "logits/rejected": -2.0131616592407227, + "logps/chosen": -2.0143442153930664, + "logps/rejected": -2.516138792037964, + "loss": 1.6317, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.143442153930664, + "rewards/margins": 5.017943859100342, + "rewards/rejected": -25.161386489868164, + "step": 20385 + }, + { + "epoch": 0.6872493174694125, + "grad_norm": 5.722283363342285, + "learning_rate": 2.6951568394592024e-07, + "logits/chosen": -0.9658223390579224, + "logits/rejected": -1.3423798084259033, + "logps/chosen": -2.0055198669433594, + "logps/rejected": -3.0546345710754395, + "loss": 1.2205, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.055200576782227, + "rewards/margins": 10.491147994995117, + "rewards/rejected": -30.54634666442871, + "step": 20390 + }, + { + "epoch": 0.6874178435403957, + "grad_norm": 28.412220001220703, + "learning_rate": 2.6925470398020656e-07, + "logits/chosen": -2.0573315620422363, + "logits/rejected": -2.2049522399902344, + "logps/chosen": -1.9439353942871094, + "logps/rejected": -2.2200284004211426, + "loss": 1.806, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.439355850219727, + "rewards/margins": 2.760927438735962, + "rewards/rejected": -22.20028305053711, + "step": 20395 + }, + { + "epoch": 0.6875863696113789, + "grad_norm": 29.253477096557617, + "learning_rate": 2.689938038663489e-07, + "logits/chosen": -1.6225097179412842, + "logits/rejected": -2.028738498687744, + "logps/chosen": -2.0318007469177246, + "logps/rejected": -2.413212299346924, + "loss": 2.3506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.318008422851562, + "rewards/margins": 3.814117908477783, + "rewards/rejected": -24.132125854492188, + "step": 20400 + }, + { + "epoch": 0.6875863696113789, + "eval_logits/chosen": -2.160346746444702, + "eval_logits/rejected": -2.325753688812256, + "eval_logps/chosen": -2.223787546157837, + "eval_logps/rejected": -2.3722009658813477, + "eval_loss": 3.045292854309082, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.237878799438477, + "eval_rewards/margins": 1.484131932258606, + "eval_rewards/rejected": -23.722009658813477, + "eval_runtime": 12.8876, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 1.94, + "step": 20400 + }, + { + "epoch": 0.687754895682362, + "grad_norm": 41.0540657043457, + "learning_rate": 2.6873298369463443e-07, + "logits/chosen": -1.7240146398544312, + "logits/rejected": -1.768457055091858, + "logps/chosen": -2.686223030090332, + "logps/rejected": -2.7694153785705566, + "loss": 4.6757, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.862232208251953, + "rewards/margins": 0.8319219350814819, + "rewards/rejected": -27.69415283203125, + "step": 20405 + }, + { + "epoch": 0.6879234217533452, + "grad_norm": 40.388038635253906, + "learning_rate": 2.6847224355532296e-07, + "logits/chosen": -1.528822660446167, + "logits/rejected": -1.8464807271957397, + "logps/chosen": -2.5221710205078125, + "logps/rejected": -2.816987991333008, + "loss": 2.6089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.221710205078125, + "rewards/margins": 2.948167324066162, + "rewards/rejected": -28.169879913330078, + "step": 20410 + }, + { + "epoch": 0.6880919478243285, + "grad_norm": 32.86744689941406, + "learning_rate": 2.6821158353864595e-07, + "logits/chosen": -2.0796568393707275, + "logits/rejected": -2.2831714153289795, + "logps/chosen": -2.6853480339050293, + "logps/rejected": -2.882089138031006, + "loss": 3.618, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.853485107421875, + "rewards/margins": 1.9674084186553955, + "rewards/rejected": -28.820892333984375, + "step": 20415 + }, + { + "epoch": 0.6882604738953116, + "grad_norm": 95.84664916992188, + "learning_rate": 2.679510037348077e-07, + "logits/chosen": -1.9441862106323242, + "logits/rejected": -2.119619607925415, + "logps/chosen": -3.1259143352508545, + "logps/rejected": -3.42171049118042, + "loss": 3.0782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.259143829345703, + "rewards/margins": 2.9579625129699707, + "rewards/rejected": -34.217105865478516, + "step": 20420 + }, + { + "epoch": 0.6884289999662948, + "grad_norm": 24.20979118347168, + "learning_rate": 2.67690504233985e-07, + "logits/chosen": -2.0083043575286865, + "logits/rejected": -2.086310863494873, + "logps/chosen": -2.2964296340942383, + "logps/rejected": -2.89279842376709, + "loss": 2.1356, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.964298248291016, + "rewards/margins": 5.963687896728516, + "rewards/rejected": -28.9279842376709, + "step": 20425 + }, + { + "epoch": 0.688597526037278, + "grad_norm": 31.725812911987305, + "learning_rate": 2.674300851263259e-07, + "logits/chosen": -1.6422641277313232, + "logits/rejected": -1.8533084392547607, + "logps/chosen": -1.8975311517715454, + "logps/rejected": -2.256784200668335, + "loss": 1.8337, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.975311279296875, + "rewards/margins": 3.592531204223633, + "rewards/rejected": -22.567840576171875, + "step": 20430 + }, + { + "epoch": 0.6887660521082611, + "grad_norm": 59.62810134887695, + "learning_rate": 2.671697465019515e-07, + "logits/chosen": -1.5278303623199463, + "logits/rejected": -1.8468002080917358, + "logps/chosen": -1.755629539489746, + "logps/rejected": -1.7148040533065796, + "loss": 3.7058, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.55629539489746, + "rewards/margins": -0.40825533866882324, + "rewards/rejected": -17.148038864135742, + "step": 20435 + }, + { + "epoch": 0.6889345781792443, + "grad_norm": 377.57159423828125, + "learning_rate": 2.669094884509546e-07, + "logits/chosen": -1.3748226165771484, + "logits/rejected": -1.3104236125946045, + "logps/chosen": -3.0436301231384277, + "logps/rejected": -3.194868803024292, + "loss": 3.356, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.436298370361328, + "rewards/margins": 1.5123873949050903, + "rewards/rejected": -31.948688507080078, + "step": 20440 + }, + { + "epoch": 0.6891031042502275, + "grad_norm": 46.6607551574707, + "learning_rate": 2.6664931106340064e-07, + "logits/chosen": -2.249420166015625, + "logits/rejected": -2.2896761894226074, + "logps/chosen": -2.5717592239379883, + "logps/rejected": -2.9195244312286377, + "loss": 1.9671, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.71759033203125, + "rewards/margins": 3.477654218673706, + "rewards/rejected": -29.19524574279785, + "step": 20445 + }, + { + "epoch": 0.6892716303212106, + "grad_norm": 46.07664108276367, + "learning_rate": 2.6638921442932627e-07, + "logits/chosen": -1.4022904634475708, + "logits/rejected": -1.3712940216064453, + "logps/chosen": -2.298978328704834, + "logps/rejected": -2.069422721862793, + "loss": 5.3755, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.98978614807129, + "rewards/margins": -2.2955572605133057, + "rewards/rejected": -20.694225311279297, + "step": 20450 + }, + { + "epoch": 0.6894401563921939, + "grad_norm": 39.8826904296875, + "learning_rate": 2.6612919863874084e-07, + "logits/chosen": -1.74521005153656, + "logits/rejected": -1.838160514831543, + "logps/chosen": -2.082080364227295, + "logps/rejected": -2.506239652633667, + "loss": 2.9543, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.820804595947266, + "rewards/margins": 4.241593837738037, + "rewards/rejected": -25.062397003173828, + "step": 20455 + }, + { + "epoch": 0.6896086824631771, + "grad_norm": 12.789778709411621, + "learning_rate": 2.658692637816258e-07, + "logits/chosen": -1.5193207263946533, + "logits/rejected": -1.6041972637176514, + "logps/chosen": -2.3925070762634277, + "logps/rejected": -2.643028736114502, + "loss": 1.9822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.92506980895996, + "rewards/margins": 2.5052151679992676, + "rewards/rejected": -26.430286407470703, + "step": 20460 + }, + { + "epoch": 0.6897772085341602, + "grad_norm": 31.83477210998535, + "learning_rate": 2.6560940994793403e-07, + "logits/chosen": -1.491260290145874, + "logits/rejected": -1.6489356756210327, + "logps/chosen": -2.430779218673706, + "logps/rejected": -2.6424241065979004, + "loss": 3.0858, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.30779457092285, + "rewards/margins": 2.1164519786834717, + "rewards/rejected": -26.424243927001953, + "step": 20465 + }, + { + "epoch": 0.6899457346051434, + "grad_norm": 16.46843719482422, + "learning_rate": 2.6534963722759085e-07, + "logits/chosen": -2.0733752250671387, + "logits/rejected": -1.9191596508026123, + "logps/chosen": -1.8992841243743896, + "logps/rejected": -1.9869730472564697, + "loss": 2.9444, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.992841720581055, + "rewards/margins": 0.8768887519836426, + "rewards/rejected": -19.869731903076172, + "step": 20470 + }, + { + "epoch": 0.6901142606761266, + "grad_norm": 93.31346130371094, + "learning_rate": 2.6508994571049337e-07, + "logits/chosen": -1.970423936843872, + "logits/rejected": -2.2703733444213867, + "logps/chosen": -3.0131518840789795, + "logps/rejected": -3.596086025238037, + "loss": 1.6824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.131519317626953, + "rewards/margins": 5.829344272613525, + "rewards/rejected": -35.96086120605469, + "step": 20475 + }, + { + "epoch": 0.6902827867471097, + "grad_norm": 59.351375579833984, + "learning_rate": 2.648303354865108e-07, + "logits/chosen": -1.5795114040374756, + "logits/rejected": -1.784462571144104, + "logps/chosen": -2.711531162261963, + "logps/rejected": -2.824735164642334, + "loss": 3.4056, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.115314483642578, + "rewards/margins": 1.1320381164550781, + "rewards/rejected": -28.247350692749023, + "step": 20480 + }, + { + "epoch": 0.6904513128180929, + "grad_norm": 36.14025115966797, + "learning_rate": 2.645708066454836e-07, + "logits/chosen": -2.1569674015045166, + "logits/rejected": -2.4725911617279053, + "logps/chosen": -2.0673184394836426, + "logps/rejected": -2.4919426441192627, + "loss": 2.1936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.67318344116211, + "rewards/margins": 4.246241569519043, + "rewards/rejected": -24.91942596435547, + "step": 20485 + }, + { + "epoch": 0.6906198388890762, + "grad_norm": 36.176048278808594, + "learning_rate": 2.643113592772247e-07, + "logits/chosen": -1.268363356590271, + "logits/rejected": -1.4549598693847656, + "logps/chosen": -2.6716246604919434, + "logps/rejected": -3.0097477436065674, + "loss": 1.3822, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.71624755859375, + "rewards/margins": 3.38122820854187, + "rewards/rejected": -30.09747314453125, + "step": 20490 + }, + { + "epoch": 0.6907883649600594, + "grad_norm": 26.0761661529541, + "learning_rate": 2.6405199347151853e-07, + "logits/chosen": -1.4917685985565186, + "logits/rejected": -1.7317289113998413, + "logps/chosen": -2.0684714317321777, + "logps/rejected": -2.4049010276794434, + "loss": 2.7544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.68471336364746, + "rewards/margins": 3.3642945289611816, + "rewards/rejected": -24.049007415771484, + "step": 20495 + }, + { + "epoch": 0.6909568910310425, + "grad_norm": 40.55641555786133, + "learning_rate": 2.637927093181215e-07, + "logits/chosen": -1.4711533784866333, + "logits/rejected": -1.232787847518921, + "logps/chosen": -2.4316444396972656, + "logps/rejected": -2.3839781284332275, + "loss": 3.7814, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.316442489624023, + "rewards/margins": -0.4766610264778137, + "rewards/rejected": -23.83978271484375, + "step": 20500 + }, + { + "epoch": 0.6911254171020257, + "grad_norm": 34.28879928588867, + "learning_rate": 2.635335069067617e-07, + "logits/chosen": -1.5314624309539795, + "logits/rejected": -1.6001373529434204, + "logps/chosen": -2.8761401176452637, + "logps/rejected": -2.869184970855713, + "loss": 3.1917, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.761402130126953, + "rewards/margins": -0.06955299526453018, + "rewards/rejected": -28.691852569580078, + "step": 20505 + }, + { + "epoch": 0.6912939431730089, + "grad_norm": 21.706228256225586, + "learning_rate": 2.632743863271386e-07, + "logits/chosen": -1.8885313272476196, + "logits/rejected": -2.1504628658294678, + "logps/chosen": -2.0663483142852783, + "logps/rejected": -2.186183452606201, + "loss": 2.5927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.663482666015625, + "rewards/margins": 1.1983544826507568, + "rewards/rejected": -21.861835479736328, + "step": 20510 + }, + { + "epoch": 0.691462469243992, + "grad_norm": 41.952720642089844, + "learning_rate": 2.6301534766892383e-07, + "logits/chosen": -1.4128437042236328, + "logits/rejected": -1.5041228532791138, + "logps/chosen": -1.9707103967666626, + "logps/rejected": -2.113816738128662, + "loss": 2.0716, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.707103729248047, + "rewards/margins": 1.4310643672943115, + "rewards/rejected": -21.138168334960938, + "step": 20515 + }, + { + "epoch": 0.6916309953149752, + "grad_norm": 18.8629207611084, + "learning_rate": 2.627563910217603e-07, + "logits/chosen": -1.3868954181671143, + "logits/rejected": -1.3035885095596313, + "logps/chosen": -3.475510358810425, + "logps/rejected": -3.5753989219665527, + "loss": 5.7918, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -34.755104064941406, + "rewards/margins": 0.9988861083984375, + "rewards/rejected": -35.753990173339844, + "step": 20520 + }, + { + "epoch": 0.6917995213859585, + "grad_norm": 20.878881454467773, + "learning_rate": 2.6249751647526284e-07, + "logits/chosen": -1.4501540660858154, + "logits/rejected": -1.6170969009399414, + "logps/chosen": -1.8804584741592407, + "logps/rejected": -2.7165768146514893, + "loss": 2.1882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.804584503173828, + "rewards/margins": 8.361184120178223, + "rewards/rejected": -27.165767669677734, + "step": 20525 + }, + { + "epoch": 0.6919680474569416, + "grad_norm": 41.715492248535156, + "learning_rate": 2.62238724119018e-07, + "logits/chosen": -1.8132165670394897, + "logits/rejected": -1.8326460123062134, + "logps/chosen": -2.0126876831054688, + "logps/rejected": -2.183061122894287, + "loss": 2.6684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.126874923706055, + "rewards/margins": 1.7037353515625, + "rewards/rejected": -21.830608367919922, + "step": 20530 + }, + { + "epoch": 0.6921365735279248, + "grad_norm": 78.58806610107422, + "learning_rate": 2.6198001404258306e-07, + "logits/chosen": -1.914900779724121, + "logits/rejected": -2.038290500640869, + "logps/chosen": -2.3590197563171387, + "logps/rejected": -2.56164288520813, + "loss": 2.936, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.590198516845703, + "rewards/margins": 2.0262298583984375, + "rewards/rejected": -25.616430282592773, + "step": 20535 + }, + { + "epoch": 0.692305099598908, + "grad_norm": 21.73616600036621, + "learning_rate": 2.617213863354876e-07, + "logits/chosen": -1.6423991918563843, + "logits/rejected": -1.8211101293563843, + "logps/chosen": -2.6549861431121826, + "logps/rejected": -2.900494337081909, + "loss": 2.0109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.549861907958984, + "rewards/margins": 2.4550812244415283, + "rewards/rejected": -29.00494384765625, + "step": 20540 + }, + { + "epoch": 0.6924736256698911, + "grad_norm": 28.13576316833496, + "learning_rate": 2.614628410872328e-07, + "logits/chosen": -1.8414901494979858, + "logits/rejected": -1.8130661249160767, + "logps/chosen": -3.396226406097412, + "logps/rejected": -3.634342908859253, + "loss": 2.4793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.96226501464844, + "rewards/margins": 2.3811628818511963, + "rewards/rejected": -36.34342956542969, + "step": 20545 + }, + { + "epoch": 0.6926421517408743, + "grad_norm": 57.43549346923828, + "learning_rate": 2.612043783872905e-07, + "logits/chosen": -1.3797636032104492, + "logits/rejected": -1.239461064338684, + "logps/chosen": -2.233649730682373, + "logps/rejected": -2.3911919593811035, + "loss": 3.2192, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.336496353149414, + "rewards/margins": 1.5754238367080688, + "rewards/rejected": -23.91192054748535, + "step": 20550 + }, + { + "epoch": 0.6928106778118575, + "grad_norm": 47.604774475097656, + "learning_rate": 2.609459983251046e-07, + "logits/chosen": -1.5278592109680176, + "logits/rejected": -1.573185682296753, + "logps/chosen": -2.482006072998047, + "logps/rejected": -2.4904625415802, + "loss": 3.5412, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.820064544677734, + "rewards/margins": 0.08456306159496307, + "rewards/rejected": -24.904626846313477, + "step": 20555 + }, + { + "epoch": 0.6929792038828406, + "grad_norm": 16.714946746826172, + "learning_rate": 2.606877009900904e-07, + "logits/chosen": -1.9386390447616577, + "logits/rejected": -1.9331505298614502, + "logps/chosen": -1.9429633617401123, + "logps/rejected": -2.433903694152832, + "loss": 2.6649, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.42963218688965, + "rewards/margins": 4.909402370452881, + "rewards/rejected": -24.339035034179688, + "step": 20560 + }, + { + "epoch": 0.6931477299538239, + "grad_norm": 5.052648544311523, + "learning_rate": 2.6042948647163456e-07, + "logits/chosen": -2.081897020339966, + "logits/rejected": -2.2000374794006348, + "logps/chosen": -2.3183398246765137, + "logps/rejected": -2.636362075805664, + "loss": 1.475, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.183399200439453, + "rewards/margins": 3.180218458175659, + "rewards/rejected": -26.36362075805664, + "step": 20565 + }, + { + "epoch": 0.6933162560248071, + "grad_norm": 20.045211791992188, + "learning_rate": 2.6017135485909445e-07, + "logits/chosen": -1.1111745834350586, + "logits/rejected": -1.1631591320037842, + "logps/chosen": -1.8602969646453857, + "logps/rejected": -1.9264259338378906, + "loss": 2.8664, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.602970123291016, + "rewards/margins": 0.6612892150878906, + "rewards/rejected": -19.264257431030273, + "step": 20570 + }, + { + "epoch": 0.6934847820957902, + "grad_norm": 31.42527198791504, + "learning_rate": 2.5991330624179967e-07, + "logits/chosen": -1.3147919178009033, + "logits/rejected": -2.23165225982666, + "logps/chosen": -1.960165023803711, + "logps/rejected": -2.15248966217041, + "loss": 2.2321, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.60165023803711, + "rewards/margins": 1.9232457876205444, + "rewards/rejected": -21.5248966217041, + "step": 20575 + }, + { + "epoch": 0.6936533081667734, + "grad_norm": 21.13239288330078, + "learning_rate": 2.596553407090507e-07, + "logits/chosen": -1.5352892875671387, + "logits/rejected": -1.9442498683929443, + "logps/chosen": -2.198362350463867, + "logps/rejected": -2.5642826557159424, + "loss": 2.603, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.983623504638672, + "rewards/margins": 3.659203290939331, + "rewards/rejected": -25.6428279876709, + "step": 20580 + }, + { + "epoch": 0.6938218342377566, + "grad_norm": 0.00042211037361994386, + "learning_rate": 2.5939745835011895e-07, + "logits/chosen": -1.496949553489685, + "logits/rejected": -1.9834476709365845, + "logps/chosen": -2.410764455795288, + "logps/rejected": -2.8452978134155273, + "loss": 2.2064, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.10764503479004, + "rewards/margins": 4.345332622528076, + "rewards/rejected": -28.452978134155273, + "step": 20585 + }, + { + "epoch": 0.6939903603087397, + "grad_norm": 22.815431594848633, + "learning_rate": 2.5913965925424754e-07, + "logits/chosen": -1.6021522283554077, + "logits/rejected": -1.7010421752929688, + "logps/chosen": -1.9284210205078125, + "logps/rejected": -2.514775514602661, + "loss": 1.6196, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.28420639038086, + "rewards/margins": 5.863546848297119, + "rewards/rejected": -25.147754669189453, + "step": 20590 + }, + { + "epoch": 0.6941588863797229, + "grad_norm": 128.66635131835938, + "learning_rate": 2.588819435106504e-07, + "logits/chosen": -2.0419507026672363, + "logits/rejected": -1.9064838886260986, + "logps/chosen": -2.3883216381073, + "logps/rejected": -2.404067277908325, + "loss": 2.9514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.883216857910156, + "rewards/margins": 0.15745744109153748, + "rewards/rejected": -24.040674209594727, + "step": 20595 + }, + { + "epoch": 0.6943274124507062, + "grad_norm": 30.456762313842773, + "learning_rate": 2.5862431120851324e-07, + "logits/chosen": -1.8142229318618774, + "logits/rejected": -2.2456400394439697, + "logps/chosen": -2.5279510021209717, + "logps/rejected": -3.1119022369384766, + "loss": 2.8751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.279508590698242, + "rewards/margins": 5.839513778686523, + "rewards/rejected": -31.1190242767334, + "step": 20600 + }, + { + "epoch": 0.6944959385216893, + "grad_norm": 26.131343841552734, + "learning_rate": 2.58366762436992e-07, + "logits/chosen": -1.7309370040893555, + "logits/rejected": -2.1586380004882812, + "logps/chosen": -2.619094133377075, + "logps/rejected": -2.662580966949463, + "loss": 4.1593, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.190942764282227, + "rewards/margins": 0.434866338968277, + "rewards/rejected": -26.625812530517578, + "step": 20605 + }, + { + "epoch": 0.6946644645926725, + "grad_norm": 43.826229095458984, + "learning_rate": 2.5810929728521417e-07, + "logits/chosen": -1.9372804164886475, + "logits/rejected": -1.9223651885986328, + "logps/chosen": -2.482267379760742, + "logps/rejected": -2.6594791412353516, + "loss": 2.3857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.822673797607422, + "rewards/margins": 1.7721197605133057, + "rewards/rejected": -26.594791412353516, + "step": 20610 + }, + { + "epoch": 0.6948329906636557, + "grad_norm": 15.244111061096191, + "learning_rate": 2.578519158422787e-07, + "logits/chosen": -1.0707063674926758, + "logits/rejected": -1.3074023723602295, + "logps/chosen": -2.4689464569091797, + "logps/rejected": -2.585275173187256, + "loss": 2.8056, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.68946647644043, + "rewards/margins": 1.1632845401763916, + "rewards/rejected": -25.852752685546875, + "step": 20615 + }, + { + "epoch": 0.6950015167346388, + "grad_norm": 35.30781555175781, + "learning_rate": 2.575946181972547e-07, + "logits/chosen": -1.7129011154174805, + "logits/rejected": -1.8900816440582275, + "logps/chosen": -2.7587363719940186, + "logps/rejected": -3.468085527420044, + "loss": 3.7825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.58736228942871, + "rewards/margins": 7.093493461608887, + "rewards/rejected": -34.68085479736328, + "step": 20620 + }, + { + "epoch": 0.695170042805622, + "grad_norm": 27.71534538269043, + "learning_rate": 2.57337404439183e-07, + "logits/chosen": -1.5547401905059814, + "logits/rejected": -1.6852400302886963, + "logps/chosen": -2.4740939140319824, + "logps/rejected": -2.8958001136779785, + "loss": 2.7703, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.74094009399414, + "rewards/margins": 4.217062473297119, + "rewards/rejected": -28.9580020904541, + "step": 20625 + }, + { + "epoch": 0.6953385688766052, + "grad_norm": 17.399873733520508, + "learning_rate": 2.5708027465707507e-07, + "logits/chosen": -1.6606941223144531, + "logits/rejected": -1.7552680969238281, + "logps/chosen": -2.045785427093506, + "logps/rejected": -2.209364175796509, + "loss": 3.1454, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.45785140991211, + "rewards/margins": 1.6357879638671875, + "rewards/rejected": -22.093639373779297, + "step": 20630 + }, + { + "epoch": 0.6955070949475884, + "grad_norm": 5.373404026031494, + "learning_rate": 2.568232289399136e-07, + "logits/chosen": -2.0698084831237793, + "logits/rejected": -2.2751216888427734, + "logps/chosen": -2.794049024581909, + "logps/rejected": -3.2170653343200684, + "loss": 1.6037, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.940486907958984, + "rewards/margins": 4.230165004730225, + "rewards/rejected": -32.170654296875, + "step": 20635 + }, + { + "epoch": 0.6956756210185716, + "grad_norm": 26.428239822387695, + "learning_rate": 2.5656626737665166e-07, + "logits/chosen": -1.5411195755004883, + "logits/rejected": -1.6710189580917358, + "logps/chosen": -2.2027552127838135, + "logps/rejected": -2.4791810512542725, + "loss": 2.2223, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.02755355834961, + "rewards/margins": 2.7642579078674316, + "rewards/rejected": -24.791810989379883, + "step": 20640 + }, + { + "epoch": 0.6958441470895548, + "grad_norm": 25.9244384765625, + "learning_rate": 2.5630939005621367e-07, + "logits/chosen": -1.068873643875122, + "logits/rejected": -1.6588131189346313, + "logps/chosen": -2.1675708293914795, + "logps/rejected": -2.701908588409424, + "loss": 1.2802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.675708770751953, + "rewards/margins": 5.343373775482178, + "rewards/rejected": -27.01908302307129, + "step": 20645 + }, + { + "epoch": 0.6960126731605379, + "grad_norm": 16.891956329345703, + "learning_rate": 2.560525970674947e-07, + "logits/chosen": -1.6296188831329346, + "logits/rejected": -1.6298997402191162, + "logps/chosen": -2.7089664936065674, + "logps/rejected": -3.0198957920074463, + "loss": 2.3573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.089664459228516, + "rewards/margins": 3.1092934608459473, + "rewards/rejected": -30.198955535888672, + "step": 20650 + }, + { + "epoch": 0.6961811992315211, + "grad_norm": 21.705472946166992, + "learning_rate": 2.557958884993607e-07, + "logits/chosen": -2.1672890186309814, + "logits/rejected": -2.3492963314056396, + "logps/chosen": -1.8646361827850342, + "logps/rejected": -2.2616653442382812, + "loss": 2.0145, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.6463623046875, + "rewards/margins": 3.970294237136841, + "rewards/rejected": -22.616657257080078, + "step": 20655 + }, + { + "epoch": 0.6963497253025043, + "grad_norm": 36.769920349121094, + "learning_rate": 2.5553926444064856e-07, + "logits/chosen": -1.727367639541626, + "logits/rejected": -2.071427822113037, + "logps/chosen": -2.185485363006592, + "logps/rejected": -2.6844260692596436, + "loss": 1.6868, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.8548526763916, + "rewards/margins": 4.989406108856201, + "rewards/rejected": -26.84425926208496, + "step": 20660 + }, + { + "epoch": 0.6965182513734874, + "grad_norm": 18.47760009765625, + "learning_rate": 2.552827249801656e-07, + "logits/chosen": -1.3826062679290771, + "logits/rejected": -1.8453384637832642, + "logps/chosen": -2.333775281906128, + "logps/rejected": -2.763153553009033, + "loss": 1.6567, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.337753295898438, + "rewards/margins": 4.293783664703369, + "rewards/rejected": -27.63153648376465, + "step": 20665 + }, + { + "epoch": 0.6966867774444706, + "grad_norm": 68.66205596923828, + "learning_rate": 2.550262702066902e-07, + "logits/chosen": -1.6487001180648804, + "logits/rejected": -2.1160476207733154, + "logps/chosen": -1.9262282848358154, + "logps/rejected": -2.10188889503479, + "loss": 2.5036, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.262283325195312, + "rewards/margins": 1.756604552268982, + "rewards/rejected": -21.01888656616211, + "step": 20670 + }, + { + "epoch": 0.6968553035154539, + "grad_norm": 18.352575302124023, + "learning_rate": 2.547699002089709e-07, + "logits/chosen": -1.6079021692276, + "logits/rejected": -1.6530320644378662, + "logps/chosen": -3.115535259246826, + "logps/rejected": -3.553027391433716, + "loss": 2.8882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.155353546142578, + "rewards/margins": 4.374924659729004, + "rewards/rejected": -35.5302734375, + "step": 20675 + }, + { + "epoch": 0.697023829586437, + "grad_norm": 4.454761505126953, + "learning_rate": 2.545136150757275e-07, + "logits/chosen": -1.9702221155166626, + "logits/rejected": -2.310147285461426, + "logps/chosen": -3.109532117843628, + "logps/rejected": -3.1416773796081543, + "loss": 3.6061, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.095317840576172, + "rewards/margins": 0.3214544355869293, + "rewards/rejected": -31.41677474975586, + "step": 20680 + }, + { + "epoch": 0.6971923556574202, + "grad_norm": 24.627824783325195, + "learning_rate": 2.5425741489565035e-07, + "logits/chosen": -1.629888892173767, + "logits/rejected": -1.887681007385254, + "logps/chosen": -2.459501028060913, + "logps/rejected": -3.4645283222198486, + "loss": 1.9909, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.595012664794922, + "rewards/margins": 10.050271987915039, + "rewards/rejected": -34.64528274536133, + "step": 20685 + }, + { + "epoch": 0.6973608817284034, + "grad_norm": 21.283910751342773, + "learning_rate": 2.5400129975739973e-07, + "logits/chosen": -1.552143931388855, + "logits/rejected": -1.7271095514297485, + "logps/chosen": -1.927848219871521, + "logps/rejected": -2.278165340423584, + "loss": 1.8268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.27848243713379, + "rewards/margins": 3.503171443939209, + "rewards/rejected": -22.78165626525879, + "step": 20690 + }, + { + "epoch": 0.6975294077993865, + "grad_norm": 42.463069915771484, + "learning_rate": 2.537452697496074e-07, + "logits/chosen": -1.7690792083740234, + "logits/rejected": -2.0024960041046143, + "logps/chosen": -3.1089041233062744, + "logps/rejected": -3.203892946243286, + "loss": 6.6679, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -31.089040756225586, + "rewards/margins": 0.9498880505561829, + "rewards/rejected": -32.03893280029297, + "step": 20695 + }, + { + "epoch": 0.6976979338703697, + "grad_norm": 219.65562438964844, + "learning_rate": 2.5348932496087514e-07, + "logits/chosen": -1.3836790323257446, + "logits/rejected": -1.856666922569275, + "logps/chosen": -2.4839062690734863, + "logps/rejected": -3.0888781547546387, + "loss": 2.7525, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.839061737060547, + "rewards/margins": 6.04971981048584, + "rewards/rejected": -30.888782501220703, + "step": 20700 + }, + { + "epoch": 0.6978664599413529, + "grad_norm": 164.99838256835938, + "learning_rate": 2.532334654797756e-07, + "logits/chosen": -1.8446996212005615, + "logits/rejected": -2.135688304901123, + "logps/chosen": -2.1551125049591064, + "logps/rejected": -2.0222866535186768, + "loss": 4.7932, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.55112075805664, + "rewards/margins": -1.3282577991485596, + "rewards/rejected": -20.222864151000977, + "step": 20705 + }, + { + "epoch": 0.6980349860123362, + "grad_norm": 44.29646682739258, + "learning_rate": 2.5297769139485126e-07, + "logits/chosen": -1.5946813821792603, + "logits/rejected": -1.9342540502548218, + "logps/chosen": -2.563175916671753, + "logps/rejected": -2.7002692222595215, + "loss": 4.7077, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.631759643554688, + "rewards/margins": 1.3709338903427124, + "rewards/rejected": -27.002695083618164, + "step": 20710 + }, + { + "epoch": 0.6982035120833193, + "grad_norm": 27.76547622680664, + "learning_rate": 2.5272200279461554e-07, + "logits/chosen": -1.6258023977279663, + "logits/rejected": -1.7282302379608154, + "logps/chosen": -2.21496319770813, + "logps/rejected": -2.229520797729492, + "loss": 4.102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.149633407592773, + "rewards/margins": 0.14557456970214844, + "rewards/rejected": -22.29520606994629, + "step": 20715 + }, + { + "epoch": 0.6983720381543025, + "grad_norm": 20.176912307739258, + "learning_rate": 2.5246639976755256e-07, + "logits/chosen": -1.7401567697525024, + "logits/rejected": -2.296189546585083, + "logps/chosen": -2.269164562225342, + "logps/rejected": -2.47044038772583, + "loss": 3.4469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.69164276123047, + "rewards/margins": 2.012756824493408, + "rewards/rejected": -24.704402923583984, + "step": 20720 + }, + { + "epoch": 0.6985405642252857, + "grad_norm": 26.711063385009766, + "learning_rate": 2.5221088240211595e-07, + "logits/chosen": -1.606109619140625, + "logits/rejected": -2.1354870796203613, + "logps/chosen": -2.067237377166748, + "logps/rejected": -2.362346649169922, + "loss": 2.7425, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.672372817993164, + "rewards/margins": 2.9510934352874756, + "rewards/rejected": -23.62346839904785, + "step": 20725 + }, + { + "epoch": 0.6987090902962688, + "grad_norm": 33.88330841064453, + "learning_rate": 2.5195545078673055e-07, + "logits/chosen": -1.2273902893066406, + "logits/rejected": -1.6899696588516235, + "logps/chosen": -2.1341545581817627, + "logps/rejected": -2.3928472995758057, + "loss": 2.0875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.341543197631836, + "rewards/margins": 2.586925983428955, + "rewards/rejected": -23.928470611572266, + "step": 20730 + }, + { + "epoch": 0.698877616367252, + "grad_norm": 28.957759857177734, + "learning_rate": 2.517001050097909e-07, + "logits/chosen": -1.882333755493164, + "logits/rejected": -1.8907684087753296, + "logps/chosen": -1.8769035339355469, + "logps/rejected": -2.2192816734313965, + "loss": 3.232, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.769033432006836, + "rewards/margins": 3.423780918121338, + "rewards/rejected": -22.19281578063965, + "step": 20735 + }, + { + "epoch": 0.6990461424382352, + "grad_norm": 38.6989631652832, + "learning_rate": 2.5144484515966257e-07, + "logits/chosen": -1.7968485355377197, + "logits/rejected": -2.1946663856506348, + "logps/chosen": -1.6905304193496704, + "logps/rejected": -1.97846257686615, + "loss": 2.7855, + "rewards/accuracies": 0.5, + "rewards/chosen": -16.905303955078125, + "rewards/margins": 2.8793225288391113, + "rewards/rejected": -19.784626007080078, + "step": 20740 + }, + { + "epoch": 0.6992146685092184, + "grad_norm": 36.020259857177734, + "learning_rate": 2.511896713246804e-07, + "logits/chosen": -2.1380550861358643, + "logits/rejected": -2.318734645843506, + "logps/chosen": -2.142519950866699, + "logps/rejected": -2.305680513381958, + "loss": 2.8405, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.425201416015625, + "rewards/margins": 1.6316025257110596, + "rewards/rejected": -23.05680274963379, + "step": 20745 + }, + { + "epoch": 0.6993831945802016, + "grad_norm": 31.211973190307617, + "learning_rate": 2.509345835931503e-07, + "logits/chosen": -1.5926382541656494, + "logits/rejected": -1.6798031330108643, + "logps/chosen": -2.4966330528259277, + "logps/rejected": -2.52616810798645, + "loss": 3.0846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.966331481933594, + "rewards/margins": 0.2953473925590515, + "rewards/rejected": -25.26167869567871, + "step": 20750 + }, + { + "epoch": 0.6995517206511848, + "grad_norm": 17.617544174194336, + "learning_rate": 2.506795820533483e-07, + "logits/chosen": -1.350720763206482, + "logits/rejected": -1.8617594242095947, + "logps/chosen": -2.2058634757995605, + "logps/rejected": -2.441958427429199, + "loss": 4.0364, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.058635711669922, + "rewards/margins": 2.360948085784912, + "rewards/rejected": -24.41958236694336, + "step": 20755 + }, + { + "epoch": 0.6997202467221679, + "grad_norm": 215.24154663085938, + "learning_rate": 2.504246667935198e-07, + "logits/chosen": -2.053504228591919, + "logits/rejected": -2.227134943008423, + "logps/chosen": -2.590571880340576, + "logps/rejected": -2.6792871952056885, + "loss": 6.1276, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.905715942382812, + "rewards/margins": 0.8871553540229797, + "rewards/rejected": -26.79287338256836, + "step": 20760 + }, + { + "epoch": 0.6998887727931511, + "grad_norm": 46.3771858215332, + "learning_rate": 2.501698379018813e-07, + "logits/chosen": -1.6019909381866455, + "logits/rejected": -1.7526795864105225, + "logps/chosen": -1.9972448348999023, + "logps/rejected": -2.048427104949951, + "loss": 3.7901, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.972448348999023, + "rewards/margins": 0.5118247270584106, + "rewards/rejected": -20.48427391052246, + "step": 20765 + }, + { + "epoch": 0.7000572988641343, + "grad_norm": 35.8875617980957, + "learning_rate": 2.4991509546661896e-07, + "logits/chosen": -1.7468448877334595, + "logits/rejected": -2.6992127895355225, + "logps/chosen": -2.6763198375701904, + "logps/rejected": -3.5259883403778076, + "loss": 1.6343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.763198852539062, + "rewards/margins": 8.496684074401855, + "rewards/rejected": -35.25988006591797, + "step": 20770 + }, + { + "epoch": 0.7002258249351174, + "grad_norm": 20.1556339263916, + "learning_rate": 2.496604395758893e-07, + "logits/chosen": -1.9715760946273804, + "logits/rejected": -2.0841028690338135, + "logps/chosen": -2.025458574295044, + "logps/rejected": -2.2173969745635986, + "loss": 2.2974, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.254587173461914, + "rewards/margins": 1.9193828105926514, + "rewards/rejected": -22.173969268798828, + "step": 20775 + }, + { + "epoch": 0.7003943510061006, + "grad_norm": 35.320377349853516, + "learning_rate": 2.494058703178184e-07, + "logits/chosen": -1.4174038171768188, + "logits/rejected": -1.5131930112838745, + "logps/chosen": -2.280522584915161, + "logps/rejected": -2.666675090789795, + "loss": 1.6929, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.805225372314453, + "rewards/margins": 3.8615264892578125, + "rewards/rejected": -26.666751861572266, + "step": 20780 + }, + { + "epoch": 0.7005628770770839, + "grad_norm": 25.47304344177246, + "learning_rate": 2.491513877805027e-07, + "logits/chosen": -1.593122959136963, + "logits/rejected": -1.8874908685684204, + "logps/chosen": -2.6540579795837402, + "logps/rejected": -3.080254077911377, + "loss": 3.1685, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.54058265686035, + "rewards/margins": 4.261958122253418, + "rewards/rejected": -30.802539825439453, + "step": 20785 + }, + { + "epoch": 0.700731403148067, + "grad_norm": 30.636350631713867, + "learning_rate": 2.4889699205200873e-07, + "logits/chosen": -1.7904140949249268, + "logits/rejected": -1.8597707748413086, + "logps/chosen": -2.0977234840393066, + "logps/rejected": -2.4295527935028076, + "loss": 2.3094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.977237701416016, + "rewards/margins": 3.3182921409606934, + "rewards/rejected": -24.295530319213867, + "step": 20790 + }, + { + "epoch": 0.7008999292190502, + "grad_norm": 33.05244827270508, + "learning_rate": 2.486426832203727e-07, + "logits/chosen": -1.911708116531372, + "logits/rejected": -1.7518056631088257, + "logps/chosen": -2.130955696105957, + "logps/rejected": -2.179682970046997, + "loss": 4.1876, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.309558868408203, + "rewards/margins": 0.4872714877128601, + "rewards/rejected": -21.796829223632812, + "step": 20795 + }, + { + "epoch": 0.7010684552900334, + "grad_norm": 136.613037109375, + "learning_rate": 2.483884613736009e-07, + "logits/chosen": -1.943140983581543, + "logits/rejected": -1.82207453250885, + "logps/chosen": -2.9635822772979736, + "logps/rejected": -2.933582305908203, + "loss": 3.9719, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -29.63582420349121, + "rewards/margins": -0.29999876022338867, + "rewards/rejected": -29.3358211517334, + "step": 20800 + }, + { + "epoch": 0.7010684552900334, + "eval_logits/chosen": -2.159958600997925, + "eval_logits/rejected": -2.326280117034912, + "eval_logps/chosen": -2.2271804809570312, + "eval_logps/rejected": -2.3731677532196045, + "eval_loss": 3.059114694595337, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.271800994873047, + "eval_rewards/margins": 1.4598737955093384, + "eval_rewards/rejected": -23.731678009033203, + "eval_runtime": 12.8911, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 20800 + }, + { + "epoch": 0.7012369813610165, + "grad_norm": 9.893948554992676, + "learning_rate": 2.481343265996697e-07, + "logits/chosen": -2.0953831672668457, + "logits/rejected": -2.207719087600708, + "logps/chosen": -2.3842544555664062, + "logps/rejected": -2.877701997756958, + "loss": 1.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.84254264831543, + "rewards/margins": 4.934475898742676, + "rewards/rejected": -28.777019500732422, + "step": 20805 + }, + { + "epoch": 0.7014055074319997, + "grad_norm": 32.668060302734375, + "learning_rate": 2.478802789865248e-07, + "logits/chosen": -1.3642785549163818, + "logits/rejected": -1.369171380996704, + "logps/chosen": -2.1678147315979004, + "logps/rejected": -2.1231637001037598, + "loss": 3.666, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.67814826965332, + "rewards/margins": -0.44651031494140625, + "rewards/rejected": -21.231637954711914, + "step": 20810 + }, + { + "epoch": 0.7015740335029829, + "grad_norm": 20.985637664794922, + "learning_rate": 2.476263186220822e-07, + "logits/chosen": -1.3993022441864014, + "logits/rejected": -1.4622784852981567, + "logps/chosen": -2.4692177772521973, + "logps/rejected": -2.36029314994812, + "loss": 4.523, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.69217872619629, + "rewards/margins": -1.0892469882965088, + "rewards/rejected": -23.60293197631836, + "step": 20815 + }, + { + "epoch": 0.7017425595739661, + "grad_norm": 19.307912826538086, + "learning_rate": 2.4737244559422765e-07, + "logits/chosen": -1.744577407836914, + "logits/rejected": -1.6804134845733643, + "logps/chosen": -2.9628689289093018, + "logps/rejected": -3.169175386428833, + "loss": 3.7887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.628686904907227, + "rewards/margins": 2.0630671977996826, + "rewards/rejected": -31.691753387451172, + "step": 20820 + }, + { + "epoch": 0.7019110856449493, + "grad_norm": 99.77359008789062, + "learning_rate": 2.471186599908167e-07, + "logits/chosen": -1.737992525100708, + "logits/rejected": -1.4738280773162842, + "logps/chosen": -3.280036211013794, + "logps/rejected": -3.5343995094299316, + "loss": 3.4004, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.80036544799805, + "rewards/margins": 2.543631076812744, + "rewards/rejected": -35.343994140625, + "step": 20825 + }, + { + "epoch": 0.7020796117159325, + "grad_norm": 34.829166412353516, + "learning_rate": 2.468649618996742e-07, + "logits/chosen": -1.4518276453018188, + "logits/rejected": -1.854933500289917, + "logps/chosen": -2.5755703449249268, + "logps/rejected": -2.7659616470336914, + "loss": 2.5933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.755701065063477, + "rewards/margins": 1.903914451599121, + "rewards/rejected": -27.659616470336914, + "step": 20830 + }, + { + "epoch": 0.7022481377869156, + "grad_norm": 41.89667510986328, + "learning_rate": 2.466113514085953e-07, + "logits/chosen": -1.7497894763946533, + "logits/rejected": -1.406106948852539, + "logps/chosen": -2.2218799591064453, + "logps/rejected": -2.411547899246216, + "loss": 2.422, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.218799591064453, + "rewards/margins": 1.8966801166534424, + "rewards/rejected": -24.115480422973633, + "step": 20835 + }, + { + "epoch": 0.7024166638578988, + "grad_norm": 29.444116592407227, + "learning_rate": 2.4635782860534454e-07, + "logits/chosen": -1.7633018493652344, + "logits/rejected": -1.7266566753387451, + "logps/chosen": -1.7943710088729858, + "logps/rejected": -1.8583873510360718, + "loss": 2.841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.94371223449707, + "rewards/margins": 0.6401627659797668, + "rewards/rejected": -18.583873748779297, + "step": 20840 + }, + { + "epoch": 0.702585189928882, + "grad_norm": 20.048221588134766, + "learning_rate": 2.4610439357765637e-07, + "logits/chosen": -2.148761034011841, + "logits/rejected": -2.7610249519348145, + "logps/chosen": -2.7624688148498535, + "logps/rejected": -3.8066444396972656, + "loss": 1.4623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.62468910217285, + "rewards/margins": 10.441758155822754, + "rewards/rejected": -38.066444396972656, + "step": 20845 + }, + { + "epoch": 0.7027537159998651, + "grad_norm": 19.170835494995117, + "learning_rate": 2.458510464132343e-07, + "logits/chosen": -1.7249538898468018, + "logits/rejected": -1.8920398950576782, + "logps/chosen": -1.8914012908935547, + "logps/rejected": -2.133737087249756, + "loss": 2.353, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.914012908935547, + "rewards/margins": 2.4233574867248535, + "rewards/rejected": -21.337369918823242, + "step": 20850 + }, + { + "epoch": 0.7029222420708484, + "grad_norm": 12.12578296661377, + "learning_rate": 2.4559778719975207e-07, + "logits/chosen": -0.9357792139053345, + "logits/rejected": -1.007215142250061, + "logps/chosen": -2.2251079082489014, + "logps/rejected": -2.2693939208984375, + "loss": 3.4564, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.251079559326172, + "rewards/margins": 0.4428592622280121, + "rewards/rejected": -22.693939208984375, + "step": 20855 + }, + { + "epoch": 0.7030907681418316, + "grad_norm": 16.79740333557129, + "learning_rate": 2.453446160248528e-07, + "logits/chosen": -1.680640459060669, + "logits/rejected": -1.6724519729614258, + "logps/chosen": -2.378614664077759, + "logps/rejected": -2.7835144996643066, + "loss": 2.2559, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.78614616394043, + "rewards/margins": 4.048998832702637, + "rewards/rejected": -27.835147857666016, + "step": 20860 + }, + { + "epoch": 0.7032592942128147, + "grad_norm": 19.344402313232422, + "learning_rate": 2.4509153297614865e-07, + "logits/chosen": -0.960638701915741, + "logits/rejected": -1.0211657285690308, + "logps/chosen": -2.17290997505188, + "logps/rejected": -2.6399240493774414, + "loss": 2.242, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.729097366333008, + "rewards/margins": 4.67014217376709, + "rewards/rejected": -26.399240493774414, + "step": 20865 + }, + { + "epoch": 0.7034278202837979, + "grad_norm": 33.54787826538086, + "learning_rate": 2.44838538141222e-07, + "logits/chosen": -1.5037105083465576, + "logits/rejected": -1.8484117984771729, + "logps/chosen": -2.09523344039917, + "logps/rejected": -2.2684638500213623, + "loss": 2.36, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.952335357666016, + "rewards/margins": 1.7323029041290283, + "rewards/rejected": -22.684640884399414, + "step": 20870 + }, + { + "epoch": 0.7035963463547811, + "grad_norm": 24.71225929260254, + "learning_rate": 2.4458563160762435e-07, + "logits/chosen": -1.7637755870819092, + "logits/rejected": -1.9183088541030884, + "logps/chosen": -2.2071845531463623, + "logps/rejected": -2.446314811706543, + "loss": 1.6382, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.07184600830078, + "rewards/margins": 2.391300678253174, + "rewards/rejected": -24.463146209716797, + "step": 20875 + }, + { + "epoch": 0.7037648724257642, + "grad_norm": 35.04969787597656, + "learning_rate": 2.4433281346287683e-07, + "logits/chosen": -1.8257334232330322, + "logits/rejected": -1.829982042312622, + "logps/chosen": -1.7968488931655884, + "logps/rejected": -1.9402484893798828, + "loss": 2.2478, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -17.968490600585938, + "rewards/margins": 1.433994174003601, + "rewards/rejected": -19.402484893798828, + "step": 20880 + }, + { + "epoch": 0.7039333984967474, + "grad_norm": 15.392056465148926, + "learning_rate": 2.4408008379446956e-07, + "logits/chosen": -1.6506710052490234, + "logits/rejected": -1.8374980688095093, + "logps/chosen": -2.2430996894836426, + "logps/rejected": -2.7005362510681152, + "loss": 2.1661, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.43099594116211, + "rewards/margins": 4.574366092681885, + "rewards/rejected": -27.005361557006836, + "step": 20885 + }, + { + "epoch": 0.7041019245677306, + "grad_norm": 218.97528076171875, + "learning_rate": 2.4382744268986235e-07, + "logits/chosen": -1.8860218524932861, + "logits/rejected": -2.2579519748687744, + "logps/chosen": -2.500788927078247, + "logps/rejected": -2.6374239921569824, + "loss": 3.5651, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.007888793945312, + "rewards/margins": 1.3663525581359863, + "rewards/rejected": -26.37424087524414, + "step": 20890 + }, + { + "epoch": 0.7042704506387139, + "grad_norm": 124.85858154296875, + "learning_rate": 2.435748902364847e-07, + "logits/chosen": -1.345632791519165, + "logits/rejected": -1.4876656532287598, + "logps/chosen": -2.6240341663360596, + "logps/rejected": -2.737029552459717, + "loss": 4.7415, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.240345001220703, + "rewards/margins": 1.1299524307250977, + "rewards/rejected": -27.37029457092285, + "step": 20895 + }, + { + "epoch": 0.704438976709697, + "grad_norm": 20.623889923095703, + "learning_rate": 2.4332242652173455e-07, + "logits/chosen": -2.132624387741089, + "logits/rejected": -2.2081000804901123, + "logps/chosen": -2.1483864784240723, + "logps/rejected": -2.2890305519104004, + "loss": 3.4368, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.483861923217773, + "rewards/margins": 1.4064419269561768, + "rewards/rejected": -22.89030647277832, + "step": 20900 + }, + { + "epoch": 0.7046075027806802, + "grad_norm": 18.654865264892578, + "learning_rate": 2.430700516329799e-07, + "logits/chosen": -1.672620415687561, + "logits/rejected": -1.7438329458236694, + "logps/chosen": -2.1926429271698, + "logps/rejected": -2.2783713340759277, + "loss": 2.5433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.926427841186523, + "rewards/margins": 0.8572842478752136, + "rewards/rejected": -22.783714294433594, + "step": 20905 + }, + { + "epoch": 0.7047760288516633, + "grad_norm": 30.57003402709961, + "learning_rate": 2.4281776565755776e-07, + "logits/chosen": -1.8434076309204102, + "logits/rejected": -1.8538297414779663, + "logps/chosen": -3.4328250885009766, + "logps/rejected": -3.8476052284240723, + "loss": 4.098, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.328250885009766, + "rewards/margins": 4.147800922393799, + "rewards/rejected": -38.476051330566406, + "step": 20910 + }, + { + "epoch": 0.7049445549226465, + "grad_norm": 32.916893005371094, + "learning_rate": 2.425655686827745e-07, + "logits/chosen": -2.005236864089966, + "logits/rejected": -2.2738490104675293, + "logps/chosen": -2.3984122276306152, + "logps/rejected": -2.629697561264038, + "loss": 2.0332, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.984121322631836, + "rewards/margins": 2.3128538131713867, + "rewards/rejected": -26.29697608947754, + "step": 20915 + }, + { + "epoch": 0.7051130809936297, + "grad_norm": 22.032899856567383, + "learning_rate": 2.4231346079590525e-07, + "logits/chosen": -1.897334337234497, + "logits/rejected": -2.2675790786743164, + "logps/chosen": -2.951364517211914, + "logps/rejected": -2.873973846435547, + "loss": 4.657, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.513647079467773, + "rewards/margins": -0.7739073038101196, + "rewards/rejected": -28.7397403717041, + "step": 20920 + }, + { + "epoch": 0.7052816070646128, + "grad_norm": 175.79537963867188, + "learning_rate": 2.4206144208419484e-07, + "logits/chosen": -1.4970096349716187, + "logits/rejected": -1.7550102472305298, + "logps/chosen": -2.5946249961853027, + "logps/rejected": -2.578613758087158, + "loss": 3.6519, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.946252822875977, + "rewards/margins": -0.16011133790016174, + "rewards/rejected": -25.7861385345459, + "step": 20925 + }, + { + "epoch": 0.7054501331355961, + "grad_norm": 37.29673767089844, + "learning_rate": 2.418095126348568e-07, + "logits/chosen": -1.363229751586914, + "logits/rejected": -1.3798617124557495, + "logps/chosen": -2.4268722534179688, + "logps/rejected": -2.4810168743133545, + "loss": 3.277, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.26872444152832, + "rewards/margins": 0.5414448976516724, + "rewards/rejected": -24.810169219970703, + "step": 20930 + }, + { + "epoch": 0.7056186592065793, + "grad_norm": 32.56948471069336, + "learning_rate": 2.415576725350745e-07, + "logits/chosen": -1.3033921718597412, + "logits/rejected": -1.7104156017303467, + "logps/chosen": -1.9265705347061157, + "logps/rejected": -1.993730902671814, + "loss": 2.7227, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.265705108642578, + "rewards/margins": 0.6716042757034302, + "rewards/rejected": -19.93731117248535, + "step": 20935 + }, + { + "epoch": 0.7057871852775625, + "grad_norm": 88.04150390625, + "learning_rate": 2.413059218719992e-07, + "logits/chosen": -1.3537064790725708, + "logits/rejected": -1.228635549545288, + "logps/chosen": -2.790931463241577, + "logps/rejected": -2.846214771270752, + "loss": 4.0388, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.909313201904297, + "rewards/margins": 0.5528322458267212, + "rewards/rejected": -28.462146759033203, + "step": 20940 + }, + { + "epoch": 0.7059557113485456, + "grad_norm": 106.84761047363281, + "learning_rate": 2.4105426073275227e-07, + "logits/chosen": -1.6752468347549438, + "logits/rejected": -1.9212989807128906, + "logps/chosen": -2.588087558746338, + "logps/rejected": -2.9983863830566406, + "loss": 2.8237, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.880874633789062, + "rewards/margins": 4.1029887199401855, + "rewards/rejected": -29.983861923217773, + "step": 20945 + }, + { + "epoch": 0.7061242374195288, + "grad_norm": 23.526409149169922, + "learning_rate": 2.408026892044236e-07, + "logits/chosen": -1.0860170125961304, + "logits/rejected": -1.2228302955627441, + "logps/chosen": -2.1994411945343018, + "logps/rejected": -2.527712345123291, + "loss": 1.9611, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.994409561157227, + "rewards/margins": 3.2827117443084717, + "rewards/rejected": -25.277124404907227, + "step": 20950 + }, + { + "epoch": 0.706292763490512, + "grad_norm": 119.02529907226562, + "learning_rate": 2.405512073740722e-07, + "logits/chosen": -1.8486545085906982, + "logits/rejected": -2.03426194190979, + "logps/chosen": -3.235337018966675, + "logps/rejected": -3.7842392921447754, + "loss": 3.0592, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.353370666503906, + "rewards/margins": 5.489018440246582, + "rewards/rejected": -37.84238815307617, + "step": 20955 + }, + { + "epoch": 0.7064612895614951, + "grad_norm": 157.5975799560547, + "learning_rate": 2.402998153287261e-07, + "logits/chosen": -1.907758355140686, + "logits/rejected": -1.7703840732574463, + "logps/chosen": -3.10050630569458, + "logps/rejected": -2.9990952014923096, + "loss": 4.1782, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -31.00506019592285, + "rewards/margins": -1.0141105651855469, + "rewards/rejected": -29.990951538085938, + "step": 20960 + }, + { + "epoch": 0.7066298156324784, + "grad_norm": 33.8292121887207, + "learning_rate": 2.400485131553823e-07, + "logits/chosen": -1.400076150894165, + "logits/rejected": -1.7552769184112549, + "logps/chosen": -2.0182392597198486, + "logps/rejected": -2.322455644607544, + "loss": 2.0623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.182388305664062, + "rewards/margins": 3.0421645641326904, + "rewards/rejected": -23.22455596923828, + "step": 20965 + }, + { + "epoch": 0.7067983417034616, + "grad_norm": 28.48842430114746, + "learning_rate": 2.397973009410063e-07, + "logits/chosen": -1.8866052627563477, + "logits/rejected": -2.1081717014312744, + "logps/chosen": -2.4975154399871826, + "logps/rejected": -2.766672134399414, + "loss": 2.5916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.97515869140625, + "rewards/margins": 2.6915647983551025, + "rewards/rejected": -27.666723251342773, + "step": 20970 + }, + { + "epoch": 0.7069668677744447, + "grad_norm": 31.44121551513672, + "learning_rate": 2.395461787725328e-07, + "logits/chosen": -1.7562042474746704, + "logits/rejected": -1.8775659799575806, + "logps/chosen": -2.86181378364563, + "logps/rejected": -3.6873226165771484, + "loss": 1.5245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.618139266967773, + "rewards/margins": 8.255086898803711, + "rewards/rejected": -36.87322235107422, + "step": 20975 + }, + { + "epoch": 0.7071353938454279, + "grad_norm": 24.774744033813477, + "learning_rate": 2.392951467368654e-07, + "logits/chosen": -1.6393998861312866, + "logits/rejected": -1.6322141885757446, + "logps/chosen": -2.874117612838745, + "logps/rejected": -3.365924835205078, + "loss": 1.5556, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -28.741174697875977, + "rewards/margins": 4.918069362640381, + "rewards/rejected": -33.659244537353516, + "step": 20980 + }, + { + "epoch": 0.7073039199164111, + "grad_norm": 21.506885528564453, + "learning_rate": 2.390442049208765e-07, + "logits/chosen": -1.6719391345977783, + "logits/rejected": -2.0707993507385254, + "logps/chosen": -1.975489854812622, + "logps/rejected": -2.183727741241455, + "loss": 1.8297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.754898071289062, + "rewards/margins": 2.0823795795440674, + "rewards/rejected": -21.837276458740234, + "step": 20985 + }, + { + "epoch": 0.7074724459873942, + "grad_norm": 8.369352340698242, + "learning_rate": 2.3879335341140684e-07, + "logits/chosen": -1.5787404775619507, + "logits/rejected": -1.8783071041107178, + "logps/chosen": -1.9943172931671143, + "logps/rejected": -2.20145583152771, + "loss": 2.0157, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.943172454833984, + "rewards/margins": 2.071387767791748, + "rewards/rejected": -22.01456069946289, + "step": 20990 + }, + { + "epoch": 0.7076409720583774, + "grad_norm": 34.375, + "learning_rate": 2.3854259229526647e-07, + "logits/chosen": -1.6097021102905273, + "logits/rejected": -2.271876096725464, + "logps/chosen": -2.341718912124634, + "logps/rejected": -2.7711498737335205, + "loss": 1.8549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.417186737060547, + "rewards/margins": 4.294312477111816, + "rewards/rejected": -27.711498260498047, + "step": 20995 + }, + { + "epoch": 0.7078094981293606, + "grad_norm": 54.840301513671875, + "learning_rate": 2.3829192165923407e-07, + "logits/chosen": -2.0436813831329346, + "logits/rejected": -2.13887095451355, + "logps/chosen": -2.341813802719116, + "logps/rejected": -2.5769951343536377, + "loss": 2.6942, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.418140411376953, + "rewards/margins": 2.351813793182373, + "rewards/rejected": -25.76995277404785, + "step": 21000 + }, + { + "epoch": 0.7079780242003438, + "grad_norm": 41.41640853881836, + "learning_rate": 2.3804134159005652e-07, + "logits/chosen": -1.459006667137146, + "logits/rejected": -1.6995465755462646, + "logps/chosen": -2.172149181365967, + "logps/rejected": -2.6103742122650146, + "loss": 3.07, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.721492767333984, + "rewards/margins": 4.382250785827637, + "rewards/rejected": -26.103744506835938, + "step": 21005 + }, + { + "epoch": 0.708146550271327, + "grad_norm": 48.500545501708984, + "learning_rate": 2.3779085217444983e-07, + "logits/chosen": -1.4249950647354126, + "logits/rejected": -1.457912802696228, + "logps/chosen": -2.4428913593292236, + "logps/rejected": -2.856995105743408, + "loss": 1.5174, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.428913116455078, + "rewards/margins": 4.141038417816162, + "rewards/rejected": -28.5699520111084, + "step": 21010 + }, + { + "epoch": 0.7083150763423102, + "grad_norm": 55.933231353759766, + "learning_rate": 2.3754045349909862e-07, + "logits/chosen": -2.118337869644165, + "logits/rejected": -2.549415111541748, + "logps/chosen": -2.5966598987579346, + "logps/rejected": -2.9401957988739014, + "loss": 2.3674, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.966602325439453, + "rewards/margins": 3.435359239578247, + "rewards/rejected": -29.40195655822754, + "step": 21015 + }, + { + "epoch": 0.7084836024132933, + "grad_norm": 33.618892669677734, + "learning_rate": 2.3729014565065614e-07, + "logits/chosen": -1.8896814584732056, + "logits/rejected": -2.2039222717285156, + "logps/chosen": -2.2860429286956787, + "logps/rejected": -2.5437560081481934, + "loss": 1.7982, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.860427856445312, + "rewards/margins": 2.577130079269409, + "rewards/rejected": -25.43756103515625, + "step": 21020 + }, + { + "epoch": 0.7086521284842765, + "grad_norm": 43.32668685913086, + "learning_rate": 2.3703992871574367e-07, + "logits/chosen": -1.524289846420288, + "logits/rejected": -1.53403902053833, + "logps/chosen": -2.3910186290740967, + "logps/rejected": -2.4618887901306152, + "loss": 3.302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.910188674926758, + "rewards/margins": 0.7087002992630005, + "rewards/rejected": -24.618886947631836, + "step": 21025 + }, + { + "epoch": 0.7088206545552597, + "grad_norm": 21.206262588500977, + "learning_rate": 2.3678980278095174e-07, + "logits/chosen": -2.2566733360290527, + "logits/rejected": -2.3251144886016846, + "logps/chosen": -2.616288423538208, + "logps/rejected": -2.5294570922851562, + "loss": 4.4404, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.16288185119629, + "rewards/margins": -0.8683112263679504, + "rewards/rejected": -25.294570922851562, + "step": 21030 + }, + { + "epoch": 0.7089891806262428, + "grad_norm": 30.227651596069336, + "learning_rate": 2.3653976793283913e-07, + "logits/chosen": -1.8887183666229248, + "logits/rejected": -2.139434337615967, + "logps/chosen": -2.6790931224823, + "logps/rejected": -3.093452215194702, + "loss": 2.1032, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.790935516357422, + "rewards/margins": 4.143587112426758, + "rewards/rejected": -30.934520721435547, + "step": 21035 + }, + { + "epoch": 0.7091577066972261, + "grad_norm": 29.77997589111328, + "learning_rate": 2.3628982425793276e-07, + "logits/chosen": -2.377270460128784, + "logits/rejected": -2.5480237007141113, + "logps/chosen": -3.316850185394287, + "logps/rejected": -3.6830153465270996, + "loss": 2.4897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.16849899291992, + "rewards/margins": 3.6616523265838623, + "rewards/rejected": -36.83015441894531, + "step": 21040 + }, + { + "epoch": 0.7093262327682093, + "grad_norm": 26.390913009643555, + "learning_rate": 2.3603997184272845e-07, + "logits/chosen": -1.281688928604126, + "logits/rejected": -1.5339549779891968, + "logps/chosen": -2.191223621368408, + "logps/rejected": -2.870591878890991, + "loss": 2.2178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.9122371673584, + "rewards/margins": 6.793682098388672, + "rewards/rejected": -28.705921173095703, + "step": 21045 + }, + { + "epoch": 0.7094947588391924, + "grad_norm": 0.2677067816257477, + "learning_rate": 2.3579021077369045e-07, + "logits/chosen": -1.5607569217681885, + "logits/rejected": -1.7881824970245361, + "logps/chosen": -3.0007331371307373, + "logps/rejected": -3.3100173473358154, + "loss": 3.1781, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.0073299407959, + "rewards/margins": 3.092841625213623, + "rewards/rejected": -33.10017776489258, + "step": 21050 + }, + { + "epoch": 0.7096632849101756, + "grad_norm": 26.878314971923828, + "learning_rate": 2.3554054113725087e-07, + "logits/chosen": -1.8124243021011353, + "logits/rejected": -1.9376004934310913, + "logps/chosen": -2.7536587715148926, + "logps/rejected": -3.0808892250061035, + "loss": 2.0869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.536584854125977, + "rewards/margins": 3.272303819656372, + "rewards/rejected": -30.808889389038086, + "step": 21055 + }, + { + "epoch": 0.7098318109811588, + "grad_norm": 27.73183250427246, + "learning_rate": 2.3529096301981066e-07, + "logits/chosen": -1.2320560216903687, + "logits/rejected": -1.4639769792556763, + "logps/chosen": -2.032846689224243, + "logps/rejected": -2.275391101837158, + "loss": 2.2245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.328466415405273, + "rewards/margins": 2.425446033477783, + "rewards/rejected": -22.7539119720459, + "step": 21060 + }, + { + "epoch": 0.7100003370521419, + "grad_norm": 36.29167556762695, + "learning_rate": 2.3504147650773908e-07, + "logits/chosen": -1.829874038696289, + "logits/rejected": -2.2702250480651855, + "logps/chosen": -1.9189624786376953, + "logps/rejected": -2.343379020690918, + "loss": 2.3178, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.189624786376953, + "rewards/margins": 4.24416446685791, + "rewards/rejected": -23.433788299560547, + "step": 21065 + }, + { + "epoch": 0.7101688631231251, + "grad_norm": 16.855567932128906, + "learning_rate": 2.3479208168737375e-07, + "logits/chosen": -1.4624030590057373, + "logits/rejected": -1.6850998401641846, + "logps/chosen": -2.534311294555664, + "logps/rejected": -3.1778666973114014, + "loss": 1.4996, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.34311294555664, + "rewards/margins": 6.435555458068848, + "rewards/rejected": -31.778667449951172, + "step": 21070 + }, + { + "epoch": 0.7103373891941084, + "grad_norm": 33.9194221496582, + "learning_rate": 2.3454277864501993e-07, + "logits/chosen": -1.355196237564087, + "logits/rejected": -1.965765357017517, + "logps/chosen": -2.3128390312194824, + "logps/rejected": -2.840618133544922, + "loss": 2.5798, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.12839126586914, + "rewards/margins": 5.2777886390686035, + "rewards/rejected": -28.40618324279785, + "step": 21075 + }, + { + "epoch": 0.7105059152650915, + "grad_norm": 116.49337005615234, + "learning_rate": 2.3429356746695183e-07, + "logits/chosen": -2.102574348449707, + "logits/rejected": -2.062224864959717, + "logps/chosen": -3.443988800048828, + "logps/rejected": -3.4509689807891846, + "loss": 7.7899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.43988800048828, + "rewards/margins": 0.06980228424072266, + "rewards/rejected": -34.50969314575195, + "step": 21080 + }, + { + "epoch": 0.7106744413360747, + "grad_norm": 51.51720428466797, + "learning_rate": 2.340444482394116e-07, + "logits/chosen": -2.1616411209106445, + "logits/rejected": -1.7606723308563232, + "logps/chosen": -2.331195831298828, + "logps/rejected": -2.2554385662078857, + "loss": 4.5934, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -23.311954498291016, + "rewards/margins": -0.7575688362121582, + "rewards/rejected": -22.55438804626465, + "step": 21085 + }, + { + "epoch": 0.7108429674070579, + "grad_norm": 23.593618392944336, + "learning_rate": 2.3379542104860966e-07, + "logits/chosen": -1.722654104232788, + "logits/rejected": -2.115140438079834, + "logps/chosen": -2.236232042312622, + "logps/rejected": -2.5139546394348145, + "loss": 1.7579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.36231803894043, + "rewards/margins": 2.7772271633148193, + "rewards/rejected": -25.139545440673828, + "step": 21090 + }, + { + "epoch": 0.711011493478041, + "grad_norm": 31.937267303466797, + "learning_rate": 2.335464859807244e-07, + "logits/chosen": -1.7309108972549438, + "logits/rejected": -1.8715778589248657, + "logps/chosen": -1.8176692724227905, + "logps/rejected": -2.0656280517578125, + "loss": 1.9252, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.176692962646484, + "rewards/margins": 2.479588747024536, + "rewards/rejected": -20.656280517578125, + "step": 21095 + }, + { + "epoch": 0.7111800195490242, + "grad_norm": 1.1448978185653687, + "learning_rate": 2.3329764312190252e-07, + "logits/chosen": -1.9613860845565796, + "logits/rejected": -2.300971269607544, + "logps/chosen": -2.1797537803649902, + "logps/rejected": -3.2657864093780518, + "loss": 0.8235, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.797536849975586, + "rewards/margins": 10.860325813293457, + "rewards/rejected": -32.65786361694336, + "step": 21100 + }, + { + "epoch": 0.7113485456200074, + "grad_norm": 23.70174217224121, + "learning_rate": 2.3304889255825894e-07, + "logits/chosen": -1.4914168119430542, + "logits/rejected": -1.7354528903961182, + "logps/chosen": -2.097081184387207, + "logps/rejected": -2.174804210662842, + "loss": 3.2589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.970809936523438, + "rewards/margins": 0.7772310376167297, + "rewards/rejected": -21.7480411529541, + "step": 21105 + }, + { + "epoch": 0.7115170716909905, + "grad_norm": 29.179771423339844, + "learning_rate": 2.3280023437587592e-07, + "logits/chosen": -1.5822515487670898, + "logits/rejected": -1.7653348445892334, + "logps/chosen": -2.2154393196105957, + "logps/rejected": -2.228299617767334, + "loss": 2.9847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.15439224243164, + "rewards/margins": 0.12860460579395294, + "rewards/rejected": -22.28299903869629, + "step": 21110 + }, + { + "epoch": 0.7116855977619738, + "grad_norm": 26.02365493774414, + "learning_rate": 2.3255166866080456e-07, + "logits/chosen": -1.494226098060608, + "logits/rejected": -1.6853415966033936, + "logps/chosen": -2.2476089000701904, + "logps/rejected": -2.3578267097473145, + "loss": 2.6978, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.47608757019043, + "rewards/margins": 1.1021760702133179, + "rewards/rejected": -23.578266143798828, + "step": 21115 + }, + { + "epoch": 0.711854123832957, + "grad_norm": 103.08861541748047, + "learning_rate": 2.3230319549906385e-07, + "logits/chosen": -1.0157320499420166, + "logits/rejected": -1.02422034740448, + "logps/chosen": -2.9622600078582764, + "logps/rejected": -3.1863622665405273, + "loss": 3.1049, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -29.622600555419922, + "rewards/margins": 2.2410223484039307, + "rewards/rejected": -31.86362075805664, + "step": 21120 + }, + { + "epoch": 0.7120226499039402, + "grad_norm": 46.99019241333008, + "learning_rate": 2.320548149766401e-07, + "logits/chosen": -2.065828800201416, + "logits/rejected": -1.975515604019165, + "logps/chosen": -2.1992688179016113, + "logps/rejected": -2.2332983016967773, + "loss": 3.2294, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.992687225341797, + "rewards/margins": 0.3402930200099945, + "rewards/rejected": -22.33298110961914, + "step": 21125 + }, + { + "epoch": 0.7121911759749233, + "grad_norm": 46.34275436401367, + "learning_rate": 2.3180652717948828e-07, + "logits/chosen": -1.742457389831543, + "logits/rejected": -1.711629867553711, + "logps/chosen": -2.0267436504364014, + "logps/rejected": -2.6312241554260254, + "loss": 2.1364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.267440795898438, + "rewards/margins": 6.044801235198975, + "rewards/rejected": -26.312240600585938, + "step": 21130 + }, + { + "epoch": 0.7123597020459065, + "grad_norm": 23.298921585083008, + "learning_rate": 2.3155833219353104e-07, + "logits/chosen": -1.6443456411361694, + "logits/rejected": -1.826258897781372, + "logps/chosen": -2.348782777786255, + "logps/rejected": -2.5271294116973877, + "loss": 2.8658, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.487829208374023, + "rewards/margins": 1.7834653854370117, + "rewards/rejected": -25.27129554748535, + "step": 21135 + }, + { + "epoch": 0.7125282281168897, + "grad_norm": 23.32587242126465, + "learning_rate": 2.31310230104659e-07, + "logits/chosen": -1.3168076276779175, + "logits/rejected": -1.4803967475891113, + "logps/chosen": -2.104248523712158, + "logps/rejected": -2.55351185798645, + "loss": 2.4836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.042484283447266, + "rewards/margins": 4.4926347732543945, + "rewards/rejected": -25.535120010375977, + "step": 21140 + }, + { + "epoch": 0.7126967541878728, + "grad_norm": 20.411678314208984, + "learning_rate": 2.3106222099873023e-07, + "logits/chosen": -1.807246208190918, + "logits/rejected": -1.8495361804962158, + "logps/chosen": -2.8721916675567627, + "logps/rejected": -2.8132948875427246, + "loss": 4.907, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -28.721914291381836, + "rewards/margins": -0.5889650583267212, + "rewards/rejected": -28.132949829101562, + "step": 21145 + }, + { + "epoch": 0.7128652802588561, + "grad_norm": 39.6473503112793, + "learning_rate": 2.30814304961571e-07, + "logits/chosen": -1.4973148107528687, + "logits/rejected": -1.563876748085022, + "logps/chosen": -2.365460157394409, + "logps/rejected": -2.8280510902404785, + "loss": 2.2636, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.65460205078125, + "rewards/margins": 4.625908851623535, + "rewards/rejected": -28.2805118560791, + "step": 21150 + }, + { + "epoch": 0.7130338063298393, + "grad_norm": 50.854774475097656, + "learning_rate": 2.3056648207897555e-07, + "logits/chosen": -1.129978060722351, + "logits/rejected": -1.2141058444976807, + "logps/chosen": -2.1422510147094727, + "logps/rejected": -2.164353370666504, + "loss": 3.4514, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.422510147094727, + "rewards/margins": 0.2210227996110916, + "rewards/rejected": -21.64353370666504, + "step": 21155 + }, + { + "epoch": 0.7132023324008224, + "grad_norm": 32.68754959106445, + "learning_rate": 2.3031875243670519e-07, + "logits/chosen": -2.121105432510376, + "logits/rejected": -2.453059673309326, + "logps/chosen": -2.97033953666687, + "logps/rejected": -3.187030792236328, + "loss": 2.5426, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.703393936157227, + "rewards/margins": 2.1669116020202637, + "rewards/rejected": -31.87030601501465, + "step": 21160 + }, + { + "epoch": 0.7133708584718056, + "grad_norm": 21.080669403076172, + "learning_rate": 2.3007111612048958e-07, + "logits/chosen": -2.093998908996582, + "logits/rejected": -2.063572883605957, + "logps/chosen": -3.4782378673553467, + "logps/rejected": -3.571840286254883, + "loss": 2.6477, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.78237533569336, + "rewards/margins": 0.9360250234603882, + "rewards/rejected": -35.71840286254883, + "step": 21165 + }, + { + "epoch": 0.7135393845427888, + "grad_norm": 17.68836784362793, + "learning_rate": 2.298235732160259e-07, + "logits/chosen": -1.7278655767440796, + "logits/rejected": -1.7685997486114502, + "logps/chosen": -2.0162787437438965, + "logps/rejected": -2.3139195442199707, + "loss": 2.8728, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.162790298461914, + "rewards/margins": 2.9764046669006348, + "rewards/rejected": -23.13919448852539, + "step": 21170 + }, + { + "epoch": 0.7137079106137719, + "grad_norm": 24.257272720336914, + "learning_rate": 2.2957612380897924e-07, + "logits/chosen": -1.8252532482147217, + "logits/rejected": -1.8329311609268188, + "logps/chosen": -2.2917399406433105, + "logps/rejected": -2.804558277130127, + "loss": 2.0278, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.917400360107422, + "rewards/margins": 5.128180503845215, + "rewards/rejected": -28.045581817626953, + "step": 21175 + }, + { + "epoch": 0.7138764366847551, + "grad_norm": 108.06800079345703, + "learning_rate": 2.2932876798498164e-07, + "logits/chosen": -1.6782376766204834, + "logits/rejected": -2.0210893154144287, + "logps/chosen": -2.508038282394409, + "logps/rejected": -2.7307159900665283, + "loss": 2.2803, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.080385208129883, + "rewards/margins": 2.226774215698242, + "rewards/rejected": -27.307159423828125, + "step": 21180 + }, + { + "epoch": 0.7140449627557384, + "grad_norm": 37.349796295166016, + "learning_rate": 2.2908150582963343e-07, + "logits/chosen": -1.6880900859832764, + "logits/rejected": -1.6230026483535767, + "logps/chosen": -2.2963674068450928, + "logps/rejected": -2.5860037803649902, + "loss": 2.5056, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.963674545288086, + "rewards/margins": 2.896362781524658, + "rewards/rejected": -25.860036849975586, + "step": 21185 + }, + { + "epoch": 0.7142134888267215, + "grad_norm": 54.41622543334961, + "learning_rate": 2.2883433742850245e-07, + "logits/chosen": -1.4802095890045166, + "logits/rejected": -1.592818021774292, + "logps/chosen": -2.937121629714966, + "logps/rejected": -3.137934923171997, + "loss": 2.6537, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.371212005615234, + "rewards/margins": 2.008134365081787, + "rewards/rejected": -31.379344940185547, + "step": 21190 + }, + { + "epoch": 0.7143820148977047, + "grad_norm": 160.5109405517578, + "learning_rate": 2.285872628671236e-07, + "logits/chosen": -1.8429698944091797, + "logits/rejected": -2.2430481910705566, + "logps/chosen": -3.6799235343933105, + "logps/rejected": -4.076421737670898, + "loss": 4.0979, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -36.79923629760742, + "rewards/margins": 3.964979648590088, + "rewards/rejected": -40.764217376708984, + "step": 21195 + }, + { + "epoch": 0.7145505409686879, + "grad_norm": 48.12770462036133, + "learning_rate": 2.2834028223099982e-07, + "logits/chosen": -1.7956221103668213, + "logits/rejected": -2.4896199703216553, + "logps/chosen": -2.2983362674713135, + "logps/rejected": -2.975998640060425, + "loss": 1.4942, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.98336410522461, + "rewards/margins": 6.776625633239746, + "rewards/rejected": -29.759984970092773, + "step": 21200 + }, + { + "epoch": 0.7145505409686879, + "eval_logits/chosen": -2.167952537536621, + "eval_logits/rejected": -2.3352444171905518, + "eval_logps/chosen": -2.232255697250366, + "eval_logps/rejected": -2.380444288253784, + "eval_loss": 3.0574114322662354, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.322555541992188, + "eval_rewards/margins": 1.4818838834762573, + "eval_rewards/rejected": -23.8044376373291, + "eval_runtime": 12.8946, + "eval_samples_per_second": 7.755, + "eval_steps_per_second": 1.939, + "step": 21200 + }, + { + "epoch": 0.714719067039671, + "grad_norm": 82.61286926269531, + "learning_rate": 2.2809339560560143e-07, + "logits/chosen": -1.8070186376571655, + "logits/rejected": -1.924077033996582, + "logps/chosen": -2.382108449935913, + "logps/rejected": -2.96075439453125, + "loss": 2.6643, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.82108497619629, + "rewards/margins": 5.7864580154418945, + "rewards/rejected": -29.6075439453125, + "step": 21205 + }, + { + "epoch": 0.7148875931106542, + "grad_norm": 60.51424026489258, + "learning_rate": 2.2784660307636632e-07, + "logits/chosen": -2.4727723598480225, + "logits/rejected": -2.318603038787842, + "logps/chosen": -2.613403558731079, + "logps/rejected": -2.723193645477295, + "loss": 4.2397, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.134037017822266, + "rewards/margins": 1.097900390625, + "rewards/rejected": -27.23193359375, + "step": 21210 + }, + { + "epoch": 0.7150561191816374, + "grad_norm": 28.987451553344727, + "learning_rate": 2.2759990472869926e-07, + "logits/chosen": -1.657486915588379, + "logits/rejected": -1.9913294315338135, + "logps/chosen": -2.1268715858459473, + "logps/rejected": -2.752849817276001, + "loss": 2.7899, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.26871681213379, + "rewards/margins": 6.259782314300537, + "rewards/rejected": -27.52849769592285, + "step": 21215 + }, + { + "epoch": 0.7152246452526205, + "grad_norm": 32.00741195678711, + "learning_rate": 2.273533006479731e-07, + "logits/chosen": -1.8473243713378906, + "logits/rejected": -1.6231390237808228, + "logps/chosen": -2.018846035003662, + "logps/rejected": -2.16618013381958, + "loss": 2.0689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.188459396362305, + "rewards/margins": 1.4733377695083618, + "rewards/rejected": -21.66179847717285, + "step": 21220 + }, + { + "epoch": 0.7153931713236038, + "grad_norm": 45.651912689208984, + "learning_rate": 2.2710679091952767e-07, + "logits/chosen": -2.4465718269348145, + "logits/rejected": -2.3830413818359375, + "logps/chosen": -2.0005125999450684, + "logps/rejected": -2.189295768737793, + "loss": 2.6768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.005125045776367, + "rewards/margins": 1.8878326416015625, + "rewards/rejected": -21.892959594726562, + "step": 21225 + }, + { + "epoch": 0.715561697394587, + "grad_norm": 35.45180130004883, + "learning_rate": 2.2686037562867033e-07, + "logits/chosen": -1.246711254119873, + "logits/rejected": -1.3242307901382446, + "logps/chosen": -2.182126522064209, + "logps/rejected": -2.203878164291382, + "loss": 3.1722, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.821266174316406, + "rewards/margins": 0.21751537919044495, + "rewards/rejected": -22.038782119750977, + "step": 21230 + }, + { + "epoch": 0.7157302234655701, + "grad_norm": 17.120080947875977, + "learning_rate": 2.2661405486067593e-07, + "logits/chosen": -2.0206122398376465, + "logits/rejected": -2.1763134002685547, + "logps/chosen": -2.324014902114868, + "logps/rejected": -2.6340198516845703, + "loss": 2.9857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.240148544311523, + "rewards/margins": 3.100048065185547, + "rewards/rejected": -26.340198516845703, + "step": 21235 + }, + { + "epoch": 0.7158987495365533, + "grad_norm": 23.65846061706543, + "learning_rate": 2.2636782870078598e-07, + "logits/chosen": -1.6090011596679688, + "logits/rejected": -1.6499748229980469, + "logps/chosen": -2.645697832107544, + "logps/rejected": -2.6968419551849365, + "loss": 3.213, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.456979751586914, + "rewards/margins": 0.5114401578903198, + "rewards/rejected": -26.968420028686523, + "step": 21240 + }, + { + "epoch": 0.7160672756075365, + "grad_norm": 5.814166069030762, + "learning_rate": 2.2612169723420983e-07, + "logits/chosen": -1.8302971124649048, + "logits/rejected": -2.1456241607666016, + "logps/chosen": -2.0068020820617676, + "logps/rejected": -2.1064953804016113, + "loss": 3.3416, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.06801986694336, + "rewards/margins": 0.9969308972358704, + "rewards/rejected": -21.064952850341797, + "step": 21245 + }, + { + "epoch": 0.7162358016785196, + "grad_norm": 189.8372344970703, + "learning_rate": 2.258756605461239e-07, + "logits/chosen": -1.873355507850647, + "logits/rejected": -1.8405964374542236, + "logps/chosen": -2.281764268875122, + "logps/rejected": -2.165700912475586, + "loss": 4.2068, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.817642211914062, + "rewards/margins": -1.1606316566467285, + "rewards/rejected": -21.657011032104492, + "step": 21250 + }, + { + "epoch": 0.7164043277495028, + "grad_norm": 34.90528106689453, + "learning_rate": 2.2562971872167175e-07, + "logits/chosen": -1.7773349285125732, + "logits/rejected": -1.6610603332519531, + "logps/chosen": -2.300401449203491, + "logps/rejected": -2.500701904296875, + "loss": 3.2863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.004016876220703, + "rewards/margins": 2.0030055046081543, + "rewards/rejected": -25.007020950317383, + "step": 21255 + }, + { + "epoch": 0.7165728538204861, + "grad_norm": 30.30763816833496, + "learning_rate": 2.2538387184596443e-07, + "logits/chosen": -1.5406862497329712, + "logits/rejected": -1.7500499486923218, + "logps/chosen": -2.7660608291625977, + "logps/rejected": -2.8649933338165283, + "loss": 3.7678, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.660608291625977, + "rewards/margins": 0.9893225431442261, + "rewards/rejected": -28.64992904663086, + "step": 21260 + }, + { + "epoch": 0.7167413798914692, + "grad_norm": 63.39667510986328, + "learning_rate": 2.251381200040794e-07, + "logits/chosen": -1.757519006729126, + "logits/rejected": -1.9305756092071533, + "logps/chosen": -3.3314311504364014, + "logps/rejected": -3.6929244995117188, + "loss": 4.2631, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.31431198120117, + "rewards/margins": 3.6149322986602783, + "rewards/rejected": -36.92924499511719, + "step": 21265 + }, + { + "epoch": 0.7169099059624524, + "grad_norm": 29.48974609375, + "learning_rate": 2.2489246328106193e-07, + "logits/chosen": -1.7308318614959717, + "logits/rejected": -1.8175618648529053, + "logps/chosen": -2.5321240425109863, + "logps/rejected": -2.9798617362976074, + "loss": 1.3057, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.32124137878418, + "rewards/margins": 4.477375507354736, + "rewards/rejected": -29.79861831665039, + "step": 21270 + }, + { + "epoch": 0.7170784320334356, + "grad_norm": 28.888696670532227, + "learning_rate": 2.2464690176192413e-07, + "logits/chosen": -2.0127971172332764, + "logits/rejected": -2.030998468399048, + "logps/chosen": -1.9513905048370361, + "logps/rejected": -2.0580296516418457, + "loss": 2.8038, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.513904571533203, + "rewards/margins": 1.066389799118042, + "rewards/rejected": -20.58029556274414, + "step": 21275 + }, + { + "epoch": 0.7172469581044187, + "grad_norm": 62.83280563354492, + "learning_rate": 2.244014355316453e-07, + "logits/chosen": -1.9647674560546875, + "logits/rejected": -2.2305192947387695, + "logps/chosen": -2.1211609840393066, + "logps/rejected": -2.109130859375, + "loss": 3.6599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.21160888671875, + "rewards/margins": -0.12029991298913956, + "rewards/rejected": -21.09130859375, + "step": 21280 + }, + { + "epoch": 0.7174154841754019, + "grad_norm": 28.013324737548828, + "learning_rate": 2.2415606467517134e-07, + "logits/chosen": -1.4304853677749634, + "logits/rejected": -1.919377326965332, + "logps/chosen": -2.0542383193969727, + "logps/rejected": -2.3345751762390137, + "loss": 2.6828, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.54238510131836, + "rewards/margins": 2.803367853164673, + "rewards/rejected": -23.34575080871582, + "step": 21285 + }, + { + "epoch": 0.7175840102463851, + "grad_norm": 35.321327209472656, + "learning_rate": 2.2391078927741552e-07, + "logits/chosen": -1.984724760055542, + "logits/rejected": -1.718353509902954, + "logps/chosen": -2.211531162261963, + "logps/rejected": -2.1570630073547363, + "loss": 4.121, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -22.115312576293945, + "rewards/margins": -0.5446780920028687, + "rewards/rejected": -21.57063102722168, + "step": 21290 + }, + { + "epoch": 0.7177525363173684, + "grad_norm": 13.352673530578613, + "learning_rate": 2.2366560942325828e-07, + "logits/chosen": -1.9127241373062134, + "logits/rejected": -1.9945110082626343, + "logps/chosen": -2.2655529975891113, + "logps/rejected": -2.767404079437256, + "loss": 2.0763, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.655529022216797, + "rewards/margins": 5.01850700378418, + "rewards/rejected": -27.674036026000977, + "step": 21295 + }, + { + "epoch": 0.7179210623883515, + "grad_norm": 5.086029052734375, + "learning_rate": 2.234205251975463e-07, + "logits/chosen": -1.4625164270401, + "logits/rejected": -1.3165785074234009, + "logps/chosen": -1.9075345993041992, + "logps/rejected": -1.817773461341858, + "loss": 4.7245, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.07534408569336, + "rewards/margins": -0.8976105451583862, + "rewards/rejected": -18.177734375, + "step": 21300 + }, + { + "epoch": 0.7180895884593347, + "grad_norm": 91.38097381591797, + "learning_rate": 2.231755366850937e-07, + "logits/chosen": -1.311603307723999, + "logits/rejected": -1.4673454761505127, + "logps/chosen": -3.1911635398864746, + "logps/rejected": -3.2235331535339355, + "loss": 4.8665, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -31.911640167236328, + "rewards/margins": 0.3236920237541199, + "rewards/rejected": -32.235328674316406, + "step": 21305 + }, + { + "epoch": 0.7182581145303178, + "grad_norm": 8.877885818481445, + "learning_rate": 2.2293064397068144e-07, + "logits/chosen": -1.6854312419891357, + "logits/rejected": -1.9205318689346313, + "logps/chosen": -2.0600550174713135, + "logps/rejected": -2.436044692993164, + "loss": 1.8768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.600549697875977, + "rewards/margins": 3.7598977088928223, + "rewards/rejected": -24.36044692993164, + "step": 21310 + }, + { + "epoch": 0.718426640601301, + "grad_norm": 161.917236328125, + "learning_rate": 2.226858471390574e-07, + "logits/chosen": -1.7705590724945068, + "logits/rejected": -1.7386350631713867, + "logps/chosen": -2.8510050773620605, + "logps/rejected": -2.8456594944000244, + "loss": 3.4966, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.510046005249023, + "rewards/margins": -0.05345315858721733, + "rewards/rejected": -28.456594467163086, + "step": 21315 + }, + { + "epoch": 0.7185951666722842, + "grad_norm": 34.89402770996094, + "learning_rate": 2.2244114627493578e-07, + "logits/chosen": -1.9679571390151978, + "logits/rejected": -2.3387842178344727, + "logps/chosen": -2.206179618835449, + "logps/rejected": -2.8861498832702637, + "loss": 3.2962, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.061796188354492, + "rewards/margins": 6.799704551696777, + "rewards/rejected": -28.861501693725586, + "step": 21320 + }, + { + "epoch": 0.7187636927432673, + "grad_norm": 23.90579605102539, + "learning_rate": 2.2219654146299794e-07, + "logits/chosen": -1.3234543800354004, + "logits/rejected": -1.437170386314392, + "logps/chosen": -2.106889247894287, + "logps/rejected": -2.464322090148926, + "loss": 1.503, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.068889617919922, + "rewards/margins": 3.574329376220703, + "rewards/rejected": -24.643220901489258, + "step": 21325 + }, + { + "epoch": 0.7189322188142505, + "grad_norm": 0.4208122193813324, + "learning_rate": 2.2195203278789232e-07, + "logits/chosen": -1.780714750289917, + "logits/rejected": -1.8529062271118164, + "logps/chosen": -1.8633159399032593, + "logps/rejected": -2.1635186672210693, + "loss": 1.4399, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.633159637451172, + "rewards/margins": 3.0020289421081543, + "rewards/rejected": -21.635189056396484, + "step": 21330 + }, + { + "epoch": 0.7191007448852338, + "grad_norm": 24.755207061767578, + "learning_rate": 2.2170762033423334e-07, + "logits/chosen": -2.186357021331787, + "logits/rejected": -2.4847683906555176, + "logps/chosen": -2.5217902660369873, + "logps/rejected": -3.0550482273101807, + "loss": 1.1454, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.2179012298584, + "rewards/margins": 5.332579135894775, + "rewards/rejected": -30.55048179626465, + "step": 21335 + }, + { + "epoch": 0.719269270956217, + "grad_norm": 37.603004455566406, + "learning_rate": 2.2146330418660265e-07, + "logits/chosen": -1.308406114578247, + "logits/rejected": -1.3398100137710571, + "logps/chosen": -2.137680768966675, + "logps/rejected": -2.3950021266937256, + "loss": 2.558, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.376808166503906, + "rewards/margins": 2.5732150077819824, + "rewards/rejected": -23.950023651123047, + "step": 21340 + }, + { + "epoch": 0.7194377970272001, + "grad_norm": 58.8840446472168, + "learning_rate": 2.2121908442954852e-07, + "logits/chosen": -1.9633537530899048, + "logits/rejected": -2.081953525543213, + "logps/chosen": -2.1084682941436768, + "logps/rejected": -2.199111223220825, + "loss": 2.9718, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.08468246459961, + "rewards/margins": 0.9064277410507202, + "rewards/rejected": -21.99110984802246, + "step": 21345 + }, + { + "epoch": 0.7196063230981833, + "grad_norm": 142.93063354492188, + "learning_rate": 2.2097496114758585e-07, + "logits/chosen": -1.987000823020935, + "logits/rejected": -2.1771368980407715, + "logps/chosen": -2.0718207359313965, + "logps/rejected": -1.9550584554672241, + "loss": 4.651, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.71820640563965, + "rewards/margins": -1.1676223278045654, + "rewards/rejected": -19.55058479309082, + "step": 21350 + }, + { + "epoch": 0.7197748491691665, + "grad_norm": 26.261070251464844, + "learning_rate": 2.2073093442519587e-07, + "logits/chosen": -1.6336174011230469, + "logits/rejected": -2.0398337841033936, + "logps/chosen": -2.14435076713562, + "logps/rejected": -2.5782923698425293, + "loss": 2.0267, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.44350814819336, + "rewards/margins": 4.339417934417725, + "rewards/rejected": -25.782922744750977, + "step": 21355 + }, + { + "epoch": 0.7199433752401496, + "grad_norm": 300.9296875, + "learning_rate": 2.2048700434682666e-07, + "logits/chosen": -1.2496579885482788, + "logits/rejected": -1.2768909931182861, + "logps/chosen": -2.8326752185821533, + "logps/rejected": -2.3903040885925293, + "loss": 8.9275, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -28.326751708984375, + "rewards/margins": -4.42371129989624, + "rewards/rejected": -23.903039932250977, + "step": 21360 + }, + { + "epoch": 0.7201119013111328, + "grad_norm": 39.62416076660156, + "learning_rate": 2.202431709968931e-07, + "logits/chosen": -1.3857898712158203, + "logits/rejected": -1.5687038898468018, + "logps/chosen": -2.131641387939453, + "logps/rejected": -2.549037456512451, + "loss": 1.6817, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.316415786743164, + "rewards/margins": 4.173960208892822, + "rewards/rejected": -25.490375518798828, + "step": 21365 + }, + { + "epoch": 0.7202804273821161, + "grad_norm": 74.57008361816406, + "learning_rate": 2.1999943445977586e-07, + "logits/chosen": -1.8909308910369873, + "logits/rejected": -1.9793068170547485, + "logps/chosen": -2.3767809867858887, + "logps/rejected": -2.5203075408935547, + "loss": 2.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.767810821533203, + "rewards/margins": 1.4352657794952393, + "rewards/rejected": -25.203075408935547, + "step": 21370 + }, + { + "epoch": 0.7204489534530992, + "grad_norm": 23.843839645385742, + "learning_rate": 2.1975579481982283e-07, + "logits/chosen": -1.8189566135406494, + "logits/rejected": -1.9706977605819702, + "logps/chosen": -2.760148525238037, + "logps/rejected": -3.534060001373291, + "loss": 2.303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.601486206054688, + "rewards/margins": 7.739114284515381, + "rewards/rejected": -35.340599060058594, + "step": 21375 + }, + { + "epoch": 0.7206174795240824, + "grad_norm": 26.39014434814453, + "learning_rate": 2.1951225216134795e-07, + "logits/chosen": -1.7611877918243408, + "logits/rejected": -2.1785178184509277, + "logps/chosen": -2.0668892860412598, + "logps/rejected": -2.2843990325927734, + "loss": 2.5125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.668895721435547, + "rewards/margins": 2.1750943660736084, + "rewards/rejected": -22.843990325927734, + "step": 21380 + }, + { + "epoch": 0.7207860055950656, + "grad_norm": 46.218658447265625, + "learning_rate": 2.192688065686319e-07, + "logits/chosen": -1.968400239944458, + "logits/rejected": -2.208972930908203, + "logps/chosen": -2.1618003845214844, + "logps/rejected": -2.215686321258545, + "loss": 3.6735, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.618003845214844, + "rewards/margins": 0.538860023021698, + "rewards/rejected": -22.156864166259766, + "step": 21385 + }, + { + "epoch": 0.7209545316660487, + "grad_norm": 31.82733154296875, + "learning_rate": 2.1902545812592144e-07, + "logits/chosen": -1.3051466941833496, + "logits/rejected": -1.4728295803070068, + "logps/chosen": -2.800342559814453, + "logps/rejected": -2.817983388900757, + "loss": 3.601, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.0034236907959, + "rewards/margins": 0.1764104813337326, + "rewards/rejected": -28.179834365844727, + "step": 21390 + }, + { + "epoch": 0.7211230577370319, + "grad_norm": 9.380635261535645, + "learning_rate": 2.1878220691743005e-07, + "logits/chosen": -1.5431849956512451, + "logits/rejected": -1.637035608291626, + "logps/chosen": -2.2975473403930664, + "logps/rejected": -2.5784294605255127, + "loss": 2.5353, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.975473403930664, + "rewards/margins": 2.8088207244873047, + "rewards/rejected": -25.784292221069336, + "step": 21395 + }, + { + "epoch": 0.7212915838080151, + "grad_norm": 25.51203155517578, + "learning_rate": 2.1853905302733744e-07, + "logits/chosen": -1.9188916683197021, + "logits/rejected": -1.8357101678848267, + "logps/chosen": -3.159135341644287, + "logps/rejected": -4.11862850189209, + "loss": 1.667, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.591350555419922, + "rewards/margins": 9.59493637084961, + "rewards/rejected": -41.18628692626953, + "step": 21400 + }, + { + "epoch": 0.7214601098789983, + "grad_norm": 20.456008911132812, + "learning_rate": 2.1829599653978932e-07, + "logits/chosen": -1.7088804244995117, + "logits/rejected": -2.2845683097839355, + "logps/chosen": -1.9112327098846436, + "logps/rejected": -2.8263041973114014, + "loss": 2.1699, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.112329483032227, + "rewards/margins": 9.150714874267578, + "rewards/rejected": -28.263042449951172, + "step": 21405 + }, + { + "epoch": 0.7216286359499815, + "grad_norm": 28.70714569091797, + "learning_rate": 2.1805303753889803e-07, + "logits/chosen": -1.6716718673706055, + "logits/rejected": -1.8000621795654297, + "logps/chosen": -2.972921371459961, + "logps/rejected": -2.9745922088623047, + "loss": 4.9837, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.72921371459961, + "rewards/margins": 0.016707420349121094, + "rewards/rejected": -29.745920181274414, + "step": 21410 + }, + { + "epoch": 0.7217971620209647, + "grad_norm": 21.922080993652344, + "learning_rate": 2.1781017610874224e-07, + "logits/chosen": -1.9619964361190796, + "logits/rejected": -2.128202199935913, + "logps/chosen": -2.0648131370544434, + "logps/rejected": -2.676927089691162, + "loss": 2.4329, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.64813232421875, + "rewards/margins": 6.121140003204346, + "rewards/rejected": -26.769271850585938, + "step": 21415 + }, + { + "epoch": 0.7219656880919478, + "grad_norm": 49.752410888671875, + "learning_rate": 2.1756741233336683e-07, + "logits/chosen": -1.1550843715667725, + "logits/rejected": -1.7084548473358154, + "logps/chosen": -2.2608838081359863, + "logps/rejected": -2.5417706966400146, + "loss": 2.1431, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.60883903503418, + "rewards/margins": 2.808867931365967, + "rewards/rejected": -25.417705535888672, + "step": 21420 + }, + { + "epoch": 0.722134214162931, + "grad_norm": 20.315689086914062, + "learning_rate": 2.1732474629678243e-07, + "logits/chosen": -2.0127696990966797, + "logits/rejected": -2.288529872894287, + "logps/chosen": -2.3630669116973877, + "logps/rejected": -2.7239952087402344, + "loss": 2.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.63067054748535, + "rewards/margins": 3.609281539916992, + "rewards/rejected": -27.239953994750977, + "step": 21425 + }, + { + "epoch": 0.7223027402339142, + "grad_norm": 24.40152359008789, + "learning_rate": 2.1708217808296642e-07, + "logits/chosen": -1.7259283065795898, + "logits/rejected": -2.223496675491333, + "logps/chosen": -2.1215977668762207, + "logps/rejected": -2.2175374031066895, + "loss": 3.1128, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.21597671508789, + "rewards/margins": 0.9593954086303711, + "rewards/rejected": -22.175373077392578, + "step": 21430 + }, + { + "epoch": 0.7224712663048973, + "grad_norm": 24.670318603515625, + "learning_rate": 2.168397077758622e-07, + "logits/chosen": -1.7781130075454712, + "logits/rejected": -2.312152147293091, + "logps/chosen": -2.622331142425537, + "logps/rejected": -3.033611536026001, + "loss": 2.44, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.223312377929688, + "rewards/margins": 4.112800598144531, + "rewards/rejected": -30.33611488342285, + "step": 21435 + }, + { + "epoch": 0.7226397923758805, + "grad_norm": 21.8465518951416, + "learning_rate": 2.1659733545937886e-07, + "logits/chosen": -2.0275444984436035, + "logits/rejected": -2.103527545928955, + "logps/chosen": -3.047454357147217, + "logps/rejected": -3.24442982673645, + "loss": 3.1846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.474544525146484, + "rewards/margins": 1.9697548151016235, + "rewards/rejected": -32.444297790527344, + "step": 21440 + }, + { + "epoch": 0.7228083184468638, + "grad_norm": 29.372770309448242, + "learning_rate": 2.163550612173921e-07, + "logits/chosen": -1.7228193283081055, + "logits/rejected": -1.6601688861846924, + "logps/chosen": -2.5070953369140625, + "logps/rejected": -2.589710235595703, + "loss": 3.0728, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.070951461791992, + "rewards/margins": 0.8261513710021973, + "rewards/rejected": -25.897104263305664, + "step": 21445 + }, + { + "epoch": 0.7229768445178469, + "grad_norm": 31.65980339050293, + "learning_rate": 2.161128851337435e-07, + "logits/chosen": -1.6353622674942017, + "logits/rejected": -1.20289945602417, + "logps/chosen": -2.7376632690429688, + "logps/rejected": -2.623929500579834, + "loss": 4.7818, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.376636505126953, + "rewards/margins": -1.137336015701294, + "rewards/rejected": -26.23929786682129, + "step": 21450 + }, + { + "epoch": 0.7231453705888301, + "grad_norm": 16.73038673400879, + "learning_rate": 2.1587080729224082e-07, + "logits/chosen": -1.5904583930969238, + "logits/rejected": -1.8517332077026367, + "logps/chosen": -2.191336154937744, + "logps/rejected": -2.4386138916015625, + "loss": 1.7566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.913360595703125, + "rewards/margins": 2.472780704498291, + "rewards/rejected": -24.386140823364258, + "step": 21455 + }, + { + "epoch": 0.7233138966598133, + "grad_norm": 47.32572555541992, + "learning_rate": 2.156288277766573e-07, + "logits/chosen": -1.8892157077789307, + "logits/rejected": -1.5970206260681152, + "logps/chosen": -2.255237579345703, + "logps/rejected": -2.3379788398742676, + "loss": 3.1796, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.5523738861084, + "rewards/margins": 0.827414870262146, + "rewards/rejected": -23.379789352416992, + "step": 21460 + }, + { + "epoch": 0.7234824227307964, + "grad_norm": 26.47463035583496, + "learning_rate": 2.153869466707327e-07, + "logits/chosen": -1.2520034313201904, + "logits/rejected": -1.5026814937591553, + "logps/chosen": -2.371990203857422, + "logps/rejected": -2.86955189704895, + "loss": 1.6533, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.71990394592285, + "rewards/margins": 4.975613594055176, + "rewards/rejected": -28.69551658630371, + "step": 21465 + }, + { + "epoch": 0.7236509488017796, + "grad_norm": 36.53496170043945, + "learning_rate": 2.151451640581728e-07, + "logits/chosen": -1.8263139724731445, + "logits/rejected": -2.1204657554626465, + "logps/chosen": -1.880746603012085, + "logps/rejected": -2.126354932785034, + "loss": 2.4378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.80746841430664, + "rewards/margins": 2.4560837745666504, + "rewards/rejected": -21.2635498046875, + "step": 21470 + }, + { + "epoch": 0.7238194748727628, + "grad_norm": 25.182451248168945, + "learning_rate": 2.1490348002264852e-07, + "logits/chosen": -1.9280401468276978, + "logits/rejected": -1.8660892248153687, + "logps/chosen": -2.6166787147521973, + "logps/rejected": -2.53410005569458, + "loss": 5.44, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -26.166784286499023, + "rewards/margins": -0.8257870674133301, + "rewards/rejected": -25.34099769592285, + "step": 21475 + }, + { + "epoch": 0.723988000943746, + "grad_norm": 33.53519821166992, + "learning_rate": 2.146618946477975e-07, + "logits/chosen": -2.0185818672180176, + "logits/rejected": -1.8434902429580688, + "logps/chosen": -2.1258037090301514, + "logps/rejected": -2.264908790588379, + "loss": 2.8339, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.25803565979004, + "rewards/margins": 1.3910521268844604, + "rewards/rejected": -22.649089813232422, + "step": 21480 + }, + { + "epoch": 0.7241565270147292, + "grad_norm": 22.76723289489746, + "learning_rate": 2.144204080172229e-07, + "logits/chosen": -1.6953926086425781, + "logits/rejected": -1.9088554382324219, + "logps/chosen": -2.2550201416015625, + "logps/rejected": -2.693312883377075, + "loss": 1.5315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.550201416015625, + "rewards/margins": 4.3829264640808105, + "rewards/rejected": -26.933130264282227, + "step": 21485 + }, + { + "epoch": 0.7243250530857124, + "grad_norm": 26.930259704589844, + "learning_rate": 2.141790202144938e-07, + "logits/chosen": -1.878997564315796, + "logits/rejected": -1.7778708934783936, + "logps/chosen": -2.3526453971862793, + "logps/rejected": -2.3866665363311768, + "loss": 3.0321, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.526456832885742, + "rewards/margins": 0.34020644426345825, + "rewards/rejected": -23.86666488647461, + "step": 21490 + }, + { + "epoch": 0.7244935791566955, + "grad_norm": 9.314671516418457, + "learning_rate": 2.1393773132314479e-07, + "logits/chosen": -1.6867443323135376, + "logits/rejected": -2.1116082668304443, + "logps/chosen": -2.4089341163635254, + "logps/rejected": -3.092846393585205, + "loss": 1.388, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.08934211730957, + "rewards/margins": 6.839125156402588, + "rewards/rejected": -30.928466796875, + "step": 21495 + }, + { + "epoch": 0.7246621052276787, + "grad_norm": 23.345691680908203, + "learning_rate": 2.1369654142667653e-07, + "logits/chosen": -1.89193856716156, + "logits/rejected": -2.0643019676208496, + "logps/chosen": -2.404980182647705, + "logps/rejected": -2.6729843616485596, + "loss": 1.9652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.049800872802734, + "rewards/margins": 2.680041551589966, + "rewards/rejected": -26.729843139648438, + "step": 21500 + }, + { + "epoch": 0.7248306312986619, + "grad_norm": 7.167284965515137, + "learning_rate": 2.1345545060855558e-07, + "logits/chosen": -1.5573482513427734, + "logits/rejected": -1.5786257982254028, + "logps/chosen": -2.6754653453826904, + "logps/rejected": -2.7059824466705322, + "loss": 3.1412, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.754650115966797, + "rewards/margins": 0.30517178773880005, + "rewards/rejected": -27.059825897216797, + "step": 21505 + }, + { + "epoch": 0.724999157369645, + "grad_norm": 20.261817932128906, + "learning_rate": 2.1321445895221357e-07, + "logits/chosen": -2.548706531524658, + "logits/rejected": -2.389806032180786, + "logps/chosen": -3.0154504776000977, + "logps/rejected": -3.1051926612854004, + "loss": 3.0242, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.154504776000977, + "rewards/margins": 0.8974201083183289, + "rewards/rejected": -31.051921844482422, + "step": 21510 + }, + { + "epoch": 0.7251676834406283, + "grad_norm": 43.0755615234375, + "learning_rate": 2.129735665410484e-07, + "logits/chosen": -1.6869655847549438, + "logits/rejected": -2.2092459201812744, + "logps/chosen": -2.0269999504089355, + "logps/rejected": -2.411229133605957, + "loss": 2.5056, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.270000457763672, + "rewards/margins": 3.8422927856445312, + "rewards/rejected": -24.112289428710938, + "step": 21515 + }, + { + "epoch": 0.7253362095116115, + "grad_norm": 36.908592224121094, + "learning_rate": 2.127327734584235e-07, + "logits/chosen": -1.4574025869369507, + "logits/rejected": -1.7088209390640259, + "logps/chosen": -2.0809457302093506, + "logps/rejected": -2.0220389366149902, + "loss": 3.6727, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.809457778930664, + "rewards/margins": -0.5890698432922363, + "rewards/rejected": -20.220388412475586, + "step": 21520 + }, + { + "epoch": 0.7255047355825947, + "grad_norm": 31.039684295654297, + "learning_rate": 2.124920797876678e-07, + "logits/chosen": -1.992561936378479, + "logits/rejected": -2.0856804847717285, + "logps/chosen": -2.183537244796753, + "logps/rejected": -2.502293348312378, + "loss": 1.6649, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.835372924804688, + "rewards/margins": 3.1875603199005127, + "rewards/rejected": -25.022933959960938, + "step": 21525 + }, + { + "epoch": 0.7256732616535778, + "grad_norm": 18.880573272705078, + "learning_rate": 2.1225148561207596e-07, + "logits/chosen": -1.826438307762146, + "logits/rejected": -2.0268962383270264, + "logps/chosen": -2.406874179840088, + "logps/rejected": -2.645177125930786, + "loss": 2.9077, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.068742752075195, + "rewards/margins": 2.383025646209717, + "rewards/rejected": -26.451770782470703, + "step": 21530 + }, + { + "epoch": 0.725841787724561, + "grad_norm": 110.51516723632812, + "learning_rate": 2.1201099101490828e-07, + "logits/chosen": -1.7749906778335571, + "logits/rejected": -1.9072506427764893, + "logps/chosen": -2.6400880813598633, + "logps/rejected": -2.54280424118042, + "loss": 4.5673, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.400882720947266, + "rewards/margins": -0.9728401303291321, + "rewards/rejected": -25.428041458129883, + "step": 21535 + }, + { + "epoch": 0.7260103137955441, + "grad_norm": 79.18938446044922, + "learning_rate": 2.1177059607939014e-07, + "logits/chosen": -2.305410861968994, + "logits/rejected": -2.0641355514526367, + "logps/chosen": -3.205970287322998, + "logps/rejected": -3.1420233249664307, + "loss": 6.4197, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -32.0597038269043, + "rewards/margins": -0.6394695043563843, + "rewards/rejected": -31.42023277282715, + "step": 21540 + }, + { + "epoch": 0.7261788398665273, + "grad_norm": 4.201801300048828, + "learning_rate": 2.1153030088871286e-07, + "logits/chosen": -1.6951990127563477, + "logits/rejected": -2.0906145572662354, + "logps/chosen": -2.429492473602295, + "logps/rejected": -2.592778444290161, + "loss": 2.5728, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.294925689697266, + "rewards/margins": 1.6328589916229248, + "rewards/rejected": -25.927783966064453, + "step": 21545 + }, + { + "epoch": 0.7263473659375105, + "grad_norm": 38.4725341796875, + "learning_rate": 2.112901055260332e-07, + "logits/chosen": -1.0017874240875244, + "logits/rejected": -1.5967439413070679, + "logps/chosen": -1.7733027935028076, + "logps/rejected": -2.150214195251465, + "loss": 1.7988, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.733028411865234, + "rewards/margins": 3.7691142559051514, + "rewards/rejected": -21.50214195251465, + "step": 21550 + }, + { + "epoch": 0.7265158920084938, + "grad_norm": 39.86610412597656, + "learning_rate": 2.1105001007447348e-07, + "logits/chosen": -1.7047908306121826, + "logits/rejected": -1.934041976928711, + "logps/chosen": -3.5561611652374268, + "logps/rejected": -3.844970703125, + "loss": 6.4161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.56161117553711, + "rewards/margins": 2.8880958557128906, + "rewards/rejected": -38.44970703125, + "step": 21555 + }, + { + "epoch": 0.7266844180794769, + "grad_norm": 34.7709846496582, + "learning_rate": 2.1081001461712096e-07, + "logits/chosen": -2.2614529132843018, + "logits/rejected": -2.164703369140625, + "logps/chosen": -2.1901984214782715, + "logps/rejected": -2.157827854156494, + "loss": 3.7716, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.901987075805664, + "rewards/margins": -0.3237057626247406, + "rewards/rejected": -21.57828140258789, + "step": 21560 + }, + { + "epoch": 0.7268529441504601, + "grad_norm": 48.2080078125, + "learning_rate": 2.1057011923702872e-07, + "logits/chosen": -2.027026653289795, + "logits/rejected": -2.1803174018859863, + "logps/chosen": -2.5710997581481934, + "logps/rejected": -2.8667140007019043, + "loss": 2.6841, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.71099853515625, + "rewards/margins": 2.9561429023742676, + "rewards/rejected": -28.667144775390625, + "step": 21565 + }, + { + "epoch": 0.7270214702214433, + "grad_norm": 26.905202865600586, + "learning_rate": 2.103303240172151e-07, + "logits/chosen": -1.483336329460144, + "logits/rejected": -1.5595595836639404, + "logps/chosen": -2.6003048419952393, + "logps/rejected": -2.9392549991607666, + "loss": 1.9289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.003047943115234, + "rewards/margins": 3.3895020484924316, + "rewards/rejected": -29.39255142211914, + "step": 21570 + }, + { + "epoch": 0.7271899962924264, + "grad_norm": 77.60855865478516, + "learning_rate": 2.1009062904066404e-07, + "logits/chosen": -1.9670803546905518, + "logits/rejected": -2.1249303817749023, + "logps/chosen": -2.6444172859191895, + "logps/rejected": -2.739316463470459, + "loss": 3.3144, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.444171905517578, + "rewards/margins": 0.948990523815155, + "rewards/rejected": -27.393163681030273, + "step": 21575 + }, + { + "epoch": 0.7273585223634096, + "grad_norm": 32.01676559448242, + "learning_rate": 2.098510343903241e-07, + "logits/chosen": -1.2383935451507568, + "logits/rejected": -1.593308687210083, + "logps/chosen": -2.18575382232666, + "logps/rejected": -2.4231770038604736, + "loss": 3.2295, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.8575382232666, + "rewards/margins": 2.3742308616638184, + "rewards/rejected": -24.231767654418945, + "step": 21580 + }, + { + "epoch": 0.7275270484343928, + "grad_norm": 13.398002624511719, + "learning_rate": 2.096115401491097e-07, + "logits/chosen": -2.0275418758392334, + "logits/rejected": -2.2678370475769043, + "logps/chosen": -1.8518550395965576, + "logps/rejected": -2.0165534019470215, + "loss": 2.3763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.518550872802734, + "rewards/margins": 1.6469831466674805, + "rewards/rejected": -20.1655330657959, + "step": 21585 + }, + { + "epoch": 0.727695574505376, + "grad_norm": 19.24317169189453, + "learning_rate": 2.0937214639990064e-07, + "logits/chosen": -2.054893732070923, + "logits/rejected": -2.430537700653076, + "logps/chosen": -2.9003663063049316, + "logps/rejected": -2.5996203422546387, + "loss": 7.1915, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.003662109375, + "rewards/margins": -3.0074591636657715, + "rewards/rejected": -25.996204376220703, + "step": 21590 + }, + { + "epoch": 0.7278641005763592, + "grad_norm": 18.928489685058594, + "learning_rate": 2.0913285322554126e-07, + "logits/chosen": -1.5994584560394287, + "logits/rejected": -1.9036979675292969, + "logps/chosen": -2.419440746307373, + "logps/rejected": -2.740779161453247, + "loss": 3.6663, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.194406509399414, + "rewards/margins": 3.213385820388794, + "rewards/rejected": -27.407794952392578, + "step": 21595 + }, + { + "epoch": 0.7280326266473424, + "grad_norm": 24.36540985107422, + "learning_rate": 2.0889366070884161e-07, + "logits/chosen": -1.7841564416885376, + "logits/rejected": -2.3233582973480225, + "logps/chosen": -3.2880959510803223, + "logps/rejected": -4.030459403991699, + "loss": 0.8797, + "rewards/accuracies": 1.0, + "rewards/chosen": -32.880958557128906, + "rewards/margins": 7.4236345291137695, + "rewards/rejected": -40.304588317871094, + "step": 21600 + }, + { + "epoch": 0.7280326266473424, + "eval_logits/chosen": -2.172097682952881, + "eval_logits/rejected": -2.3394265174865723, + "eval_logps/chosen": -2.234185218811035, + "eval_logps/rejected": -2.3823485374450684, + "eval_loss": 3.061626672744751, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.34185218811035, + "eval_rewards/margins": 1.4816354513168335, + "eval_rewards/rejected": -23.823486328125, + "eval_runtime": 12.8908, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 21600 + }, + { + "epoch": 0.7282011527183255, + "grad_norm": 31.25184440612793, + "learning_rate": 2.0865456893257688e-07, + "logits/chosen": -1.7714402675628662, + "logits/rejected": -2.0110652446746826, + "logps/chosen": -2.775097608566284, + "logps/rejected": -3.070955753326416, + "loss": 3.4729, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.750980377197266, + "rewards/margins": 2.9585795402526855, + "rewards/rejected": -30.709558486938477, + "step": 21605 + }, + { + "epoch": 0.7283696787893087, + "grad_norm": 27.05602264404297, + "learning_rate": 2.084155779794875e-07, + "logits/chosen": -2.4862136840820312, + "logits/rejected": -2.4018337726593018, + "logps/chosen": -3.081584930419922, + "logps/rejected": -3.008526086807251, + "loss": 6.1938, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.81585121154785, + "rewards/margins": -0.7305895090103149, + "rewards/rejected": -30.08526039123535, + "step": 21610 + }, + { + "epoch": 0.7285382048602919, + "grad_norm": 19.466569900512695, + "learning_rate": 2.0817668793227845e-07, + "logits/chosen": -1.7015174627304077, + "logits/rejected": -2.1962692737579346, + "logps/chosen": -2.1845622062683105, + "logps/rejected": -3.028851270675659, + "loss": 2.3892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.845623016357422, + "rewards/margins": 8.442889213562012, + "rewards/rejected": -30.28851318359375, + "step": 21615 + }, + { + "epoch": 0.728706730931275, + "grad_norm": 27.886024475097656, + "learning_rate": 2.0793789887362022e-07, + "logits/chosen": -1.9878759384155273, + "logits/rejected": -2.0809290409088135, + "logps/chosen": -2.6173412799835205, + "logps/rejected": -2.7520089149475098, + "loss": 2.9027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.173410415649414, + "rewards/margins": 1.3466761112213135, + "rewards/rejected": -27.520090103149414, + "step": 21620 + }, + { + "epoch": 0.7288752570022583, + "grad_norm": 59.48930740356445, + "learning_rate": 2.0769921088614867e-07, + "logits/chosen": -2.083529472351074, + "logits/rejected": -1.9938551187515259, + "logps/chosen": -2.6014790534973145, + "logps/rejected": -2.4967360496520996, + "loss": 4.5187, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -26.014789581298828, + "rewards/margins": -1.0474289655685425, + "rewards/rejected": -24.967361450195312, + "step": 21625 + }, + { + "epoch": 0.7290437830732415, + "grad_norm": 35.612510681152344, + "learning_rate": 2.0746062405246384e-07, + "logits/chosen": -1.6287791728973389, + "logits/rejected": -1.6877319812774658, + "logps/chosen": -2.3290133476257324, + "logps/rejected": -2.4741005897521973, + "loss": 2.4639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.29013442993164, + "rewards/margins": 1.4508706331253052, + "rewards/rejected": -24.741003036499023, + "step": 21630 + }, + { + "epoch": 0.7292123091442246, + "grad_norm": 14.086270332336426, + "learning_rate": 2.0722213845513147e-07, + "logits/chosen": -2.198855400085449, + "logits/rejected": -2.485460042953491, + "logps/chosen": -2.5541272163391113, + "logps/rejected": -2.784548282623291, + "loss": 3.081, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.54127311706543, + "rewards/margins": 2.3042120933532715, + "rewards/rejected": -27.84548568725586, + "step": 21635 + }, + { + "epoch": 0.7293808352152078, + "grad_norm": 24.064496994018555, + "learning_rate": 2.0698375417668194e-07, + "logits/chosen": -1.8448317050933838, + "logits/rejected": -2.0081303119659424, + "logps/chosen": -1.7897602319717407, + "logps/rejected": -2.2101948261260986, + "loss": 0.8858, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.897600173950195, + "rewards/margins": 4.204346179962158, + "rewards/rejected": -22.101947784423828, + "step": 21640 + }, + { + "epoch": 0.729549361286191, + "grad_norm": 52.95700454711914, + "learning_rate": 2.0674547129961096e-07, + "logits/chosen": -1.9925181865692139, + "logits/rejected": -2.0233118534088135, + "logps/chosen": -2.1348519325256348, + "logps/rejected": -2.157832622528076, + "loss": 3.098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.34851837158203, + "rewards/margins": 0.22980785369873047, + "rewards/rejected": -21.57832908630371, + "step": 21645 + }, + { + "epoch": 0.7297178873571741, + "grad_norm": 41.6870002746582, + "learning_rate": 2.0650728990637833e-07, + "logits/chosen": -1.3519022464752197, + "logits/rejected": -1.4920880794525146, + "logps/chosen": -2.5849311351776123, + "logps/rejected": -2.673745632171631, + "loss": 2.6725, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.84930992126465, + "rewards/margins": 0.8881477117538452, + "rewards/rejected": -26.737457275390625, + "step": 21650 + }, + { + "epoch": 0.7298864134281573, + "grad_norm": 79.82129669189453, + "learning_rate": 2.0626921007940955e-07, + "logits/chosen": -1.5634056329727173, + "logits/rejected": -2.13328218460083, + "logps/chosen": -2.2908542156219482, + "logps/rejected": -2.903014659881592, + "loss": 1.9705, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.90854263305664, + "rewards/margins": 6.121607303619385, + "rewards/rejected": -29.030147552490234, + "step": 21655 + }, + { + "epoch": 0.7300549394991405, + "grad_norm": 124.51602935791016, + "learning_rate": 2.0603123190109468e-07, + "logits/chosen": -1.8668529987335205, + "logits/rejected": -2.3392491340637207, + "logps/chosen": -2.717681407928467, + "logps/rejected": -2.839423418045044, + "loss": 3.7129, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.17681312561035, + "rewards/margins": 1.2174217700958252, + "rewards/rejected": -28.394235610961914, + "step": 21660 + }, + { + "epoch": 0.7302234655701237, + "grad_norm": 25.56239891052246, + "learning_rate": 2.0579335545378862e-07, + "logits/chosen": -1.793898582458496, + "logits/rejected": -1.988956093788147, + "logps/chosen": -2.0297439098358154, + "logps/rejected": -2.320495843887329, + "loss": 2.1599, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.297439575195312, + "rewards/margins": 2.907517910003662, + "rewards/rejected": -23.204957962036133, + "step": 21665 + }, + { + "epoch": 0.7303919916411069, + "grad_norm": 220.97686767578125, + "learning_rate": 2.0555558081981085e-07, + "logits/chosen": -1.5371949672698975, + "logits/rejected": -1.4318342208862305, + "logps/chosen": -3.315932035446167, + "logps/rejected": -3.3558952808380127, + "loss": 5.6326, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -33.15932083129883, + "rewards/margins": 0.3996322751045227, + "rewards/rejected": -33.55895233154297, + "step": 21670 + }, + { + "epoch": 0.7305605177120901, + "grad_norm": 18.90399742126465, + "learning_rate": 2.053179080814459e-07, + "logits/chosen": -1.383420705795288, + "logits/rejected": -1.8868669271469116, + "logps/chosen": -2.0650277137756348, + "logps/rejected": -2.419672727584839, + "loss": 1.9787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.6502742767334, + "rewards/margins": 3.5464529991149902, + "rewards/rejected": -24.196727752685547, + "step": 21675 + }, + { + "epoch": 0.7307290437830732, + "grad_norm": 25.166046142578125, + "learning_rate": 2.0508033732094294e-07, + "logits/chosen": -1.5126234292984009, + "logits/rejected": -2.2251594066619873, + "logps/chosen": -2.121800661087036, + "logps/rejected": -2.5393898487091064, + "loss": 1.339, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.218008041381836, + "rewards/margins": 4.17588996887207, + "rewards/rejected": -25.393896102905273, + "step": 21680 + }, + { + "epoch": 0.7308975698540564, + "grad_norm": 1.5466541051864624, + "learning_rate": 2.0484286862051585e-07, + "logits/chosen": -1.4658101797103882, + "logits/rejected": -1.957542061805725, + "logps/chosen": -2.613654851913452, + "logps/rejected": -2.6913158893585205, + "loss": 4.5056, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.136547088623047, + "rewards/margins": 0.7766073942184448, + "rewards/rejected": -26.913158416748047, + "step": 21685 + }, + { + "epoch": 0.7310660959250396, + "grad_norm": 38.16489791870117, + "learning_rate": 2.0460550206234323e-07, + "logits/chosen": -2.2058348655700684, + "logits/rejected": -2.6453592777252197, + "logps/chosen": -2.519449234008789, + "logps/rejected": -3.174541473388672, + "loss": 1.4793, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.19449234008789, + "rewards/margins": 6.550921440124512, + "rewards/rejected": -31.745412826538086, + "step": 21690 + }, + { + "epoch": 0.7312346219960227, + "grad_norm": 81.57676696777344, + "learning_rate": 2.0436823772856843e-07, + "logits/chosen": -2.1794464588165283, + "logits/rejected": -2.2463772296905518, + "logps/chosen": -3.040895938873291, + "logps/rejected": -2.8675930500030518, + "loss": 5.1556, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -30.408960342407227, + "rewards/margins": -1.7330306768417358, + "rewards/rejected": -28.675933837890625, + "step": 21695 + }, + { + "epoch": 0.731403148067006, + "grad_norm": 39.454044342041016, + "learning_rate": 2.0413107570129894e-07, + "logits/chosen": -1.1450724601745605, + "logits/rejected": -1.4699748754501343, + "logps/chosen": -2.2500405311584473, + "logps/rejected": -2.3740577697753906, + "loss": 2.1046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.500404357910156, + "rewards/margins": 1.2401739358901978, + "rewards/rejected": -23.740581512451172, + "step": 21700 + }, + { + "epoch": 0.7315716741379892, + "grad_norm": 29.22942352294922, + "learning_rate": 2.0389401606260743e-07, + "logits/chosen": -2.020216941833496, + "logits/rejected": -2.078310489654541, + "logps/chosen": -2.3463661670684814, + "logps/rejected": -2.4976742267608643, + "loss": 2.2146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.463661193847656, + "rewards/margins": 1.513080358505249, + "rewards/rejected": -24.976741790771484, + "step": 21705 + }, + { + "epoch": 0.7317402002089723, + "grad_norm": 39.88352966308594, + "learning_rate": 2.0365705889453083e-07, + "logits/chosen": -1.6458019018173218, + "logits/rejected": -1.7097450494766235, + "logps/chosen": -2.1416940689086914, + "logps/rejected": -2.4000911712646484, + "loss": 3.2204, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.416940689086914, + "rewards/margins": 2.5839715003967285, + "rewards/rejected": -24.000911712646484, + "step": 21710 + }, + { + "epoch": 0.7319087262799555, + "grad_norm": 36.849876403808594, + "learning_rate": 2.0342020427907086e-07, + "logits/chosen": -1.592294692993164, + "logits/rejected": -1.305525541305542, + "logps/chosen": -2.492601156234741, + "logps/rejected": -2.5987658500671387, + "loss": 2.9679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.926013946533203, + "rewards/margins": 1.061646580696106, + "rewards/rejected": -25.987659454345703, + "step": 21715 + }, + { + "epoch": 0.7320772523509387, + "grad_norm": 59.083961486816406, + "learning_rate": 2.0318345229819324e-07, + "logits/chosen": -1.4433258771896362, + "logits/rejected": -2.345215320587158, + "logps/chosen": -2.0695242881774902, + "logps/rejected": -3.579845428466797, + "loss": 2.4304, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.695240020751953, + "rewards/margins": 15.103212356567383, + "rewards/rejected": -35.79845428466797, + "step": 21720 + }, + { + "epoch": 0.7322457784219218, + "grad_norm": 30.461536407470703, + "learning_rate": 2.0294680303382867e-07, + "logits/chosen": -1.554012656211853, + "logits/rejected": -1.4713451862335205, + "logps/chosen": -2.2490663528442383, + "logps/rejected": -2.099107027053833, + "loss": 4.9726, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.490663528442383, + "rewards/margins": -1.499592661857605, + "rewards/rejected": -20.991069793701172, + "step": 21725 + }, + { + "epoch": 0.732414304492905, + "grad_norm": 0.02380460500717163, + "learning_rate": 2.0271025656787232e-07, + "logits/chosen": -2.025754690170288, + "logits/rejected": -2.0077271461486816, + "logps/chosen": -3.112980842590332, + "logps/rejected": -2.8330283164978027, + "loss": 7.9069, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -31.129806518554688, + "rewards/margins": -2.7995221614837646, + "rewards/rejected": -28.330286026000977, + "step": 21730 + }, + { + "epoch": 0.7325828305638883, + "grad_norm": 56.081871032714844, + "learning_rate": 2.0247381298218324e-07, + "logits/chosen": -1.9024174213409424, + "logits/rejected": -2.314527988433838, + "logps/chosen": -2.104179859161377, + "logps/rejected": -2.716348648071289, + "loss": 1.859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.041799545288086, + "rewards/margins": 6.121685028076172, + "rewards/rejected": -27.163482666015625, + "step": 21735 + }, + { + "epoch": 0.7327513566348715, + "grad_norm": 26.001535415649414, + "learning_rate": 2.022374723585854e-07, + "logits/chosen": -1.456640601158142, + "logits/rejected": -1.551238775253296, + "logps/chosen": -2.132565975189209, + "logps/rejected": -2.308089256286621, + "loss": 2.2998, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.325660705566406, + "rewards/margins": 1.755231261253357, + "rewards/rejected": -23.080890655517578, + "step": 21740 + }, + { + "epoch": 0.7329198827058546, + "grad_norm": 0.0018961316673085093, + "learning_rate": 2.0200123477886706e-07, + "logits/chosen": -1.740966558456421, + "logits/rejected": -2.4154534339904785, + "logps/chosen": -2.9531397819519043, + "logps/rejected": -3.728076934814453, + "loss": 1.3412, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.53139877319336, + "rewards/margins": 7.7493696212768555, + "rewards/rejected": -37.28076934814453, + "step": 21745 + }, + { + "epoch": 0.7330884087768378, + "grad_norm": 71.07219696044922, + "learning_rate": 2.0176510032478083e-07, + "logits/chosen": -0.8832302093505859, + "logits/rejected": -0.9097579717636108, + "logps/chosen": -2.8612914085388184, + "logps/rejected": -2.9164257049560547, + "loss": 3.4554, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.6129150390625, + "rewards/margins": 0.5513428449630737, + "rewards/rejected": -29.164257049560547, + "step": 21750 + }, + { + "epoch": 0.733256934847821, + "grad_norm": 0.0001923279487527907, + "learning_rate": 2.0152906907804317e-07, + "logits/chosen": -2.055886745452881, + "logits/rejected": -2.815279006958008, + "logps/chosen": -2.7214102745056152, + "logps/rejected": -3.637563705444336, + "loss": 2.0419, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.2141056060791, + "rewards/margins": 9.161534309387207, + "rewards/rejected": -36.375640869140625, + "step": 21755 + }, + { + "epoch": 0.7334254609188041, + "grad_norm": 26.346817016601562, + "learning_rate": 2.0129314112033552e-07, + "logits/chosen": -1.566145658493042, + "logits/rejected": -1.60275137424469, + "logps/chosen": -2.2926278114318848, + "logps/rejected": -2.2890093326568604, + "loss": 3.6163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.926279067993164, + "rewards/margins": -0.03618669509887695, + "rewards/rejected": -22.890094757080078, + "step": 21760 + }, + { + "epoch": 0.7335939869897873, + "grad_norm": 35.32753372192383, + "learning_rate": 2.010573165333032e-07, + "logits/chosen": -1.9177045822143555, + "logits/rejected": -2.006000280380249, + "logps/chosen": -2.9610650539398193, + "logps/rejected": -3.1530046463012695, + "loss": 2.6623, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.61065101623535, + "rewards/margins": 1.9193947315216064, + "rewards/rejected": -31.530048370361328, + "step": 21765 + }, + { + "epoch": 0.7337625130607704, + "grad_norm": 35.251258850097656, + "learning_rate": 2.008215953985557e-07, + "logits/chosen": -2.1620099544525146, + "logits/rejected": -2.229039430618286, + "logps/chosen": -1.9183037281036377, + "logps/rejected": -1.9755512475967407, + "loss": 2.9403, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.18303871154785, + "rewards/margins": 0.5724747776985168, + "rewards/rejected": -19.755512237548828, + "step": 21770 + }, + { + "epoch": 0.7339310391317537, + "grad_norm": 38.914920806884766, + "learning_rate": 2.0058597779766677e-07, + "logits/chosen": -1.7548776865005493, + "logits/rejected": -1.7073177099227905, + "logps/chosen": -1.9944050312042236, + "logps/rejected": -2.2155983448028564, + "loss": 1.486, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -19.944049835205078, + "rewards/margins": 2.211933135986328, + "rewards/rejected": -22.155981063842773, + "step": 21775 + }, + { + "epoch": 0.7340995652027369, + "grad_norm": 63.15382766723633, + "learning_rate": 2.0035046381217458e-07, + "logits/chosen": -1.6686474084854126, + "logits/rejected": -2.051553249359131, + "logps/chosen": -2.022413730621338, + "logps/rejected": -2.373833417892456, + "loss": 2.5306, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.224136352539062, + "rewards/margins": 3.5141963958740234, + "rewards/rejected": -23.73833465576172, + "step": 21780 + }, + { + "epoch": 0.7342680912737201, + "grad_norm": 27.028345108032227, + "learning_rate": 2.0011505352358126e-07, + "logits/chosen": -2.099684715270996, + "logits/rejected": -2.4309449195861816, + "logps/chosen": -2.7647461891174316, + "logps/rejected": -2.9898247718811035, + "loss": 1.6346, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.6474609375, + "rewards/margins": 2.2507858276367188, + "rewards/rejected": -29.89824867248535, + "step": 21785 + }, + { + "epoch": 0.7344366173447032, + "grad_norm": 16.62060546875, + "learning_rate": 1.9987974701335276e-07, + "logits/chosen": -1.8711074590682983, + "logits/rejected": -1.8878005743026733, + "logps/chosen": -1.918784499168396, + "logps/rejected": -2.1599090099334717, + "loss": 1.6143, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.18784523010254, + "rewards/margins": 2.4112448692321777, + "rewards/rejected": -21.599090576171875, + "step": 21790 + }, + { + "epoch": 0.7346051434156864, + "grad_norm": 23.89042091369629, + "learning_rate": 1.9964454436291955e-07, + "logits/chosen": -1.636922836303711, + "logits/rejected": -1.909189224243164, + "logps/chosen": -2.8264358043670654, + "logps/rejected": -3.0672054290771484, + "loss": 2.023, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.264358520507812, + "rewards/margins": 2.40769624710083, + "rewards/rejected": -30.672054290771484, + "step": 21795 + }, + { + "epoch": 0.7347736694866696, + "grad_norm": 40.953765869140625, + "learning_rate": 1.9940944565367617e-07, + "logits/chosen": -1.708216667175293, + "logits/rejected": -1.8522497415542603, + "logps/chosen": -2.2710158824920654, + "logps/rejected": -2.5011610984802246, + "loss": 2.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.710155487060547, + "rewards/margins": 2.301453113555908, + "rewards/rejected": -25.011611938476562, + "step": 21800 + }, + { + "epoch": 0.7349421955576527, + "grad_norm": 37.34510803222656, + "learning_rate": 1.9917445096698065e-07, + "logits/chosen": -1.5141806602478027, + "logits/rejected": -2.053837299346924, + "logps/chosen": -2.2500662803649902, + "logps/rejected": -2.249181032180786, + "loss": 3.3769, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.500661849975586, + "rewards/margins": -0.008852005004882812, + "rewards/rejected": -22.491809844970703, + "step": 21805 + }, + { + "epoch": 0.735110721628636, + "grad_norm": 21.374662399291992, + "learning_rate": 1.9893956038415565e-07, + "logits/chosen": -2.0776031017303467, + "logits/rejected": -2.0971264839172363, + "logps/chosen": -2.3597798347473145, + "logps/rejected": -2.544868230819702, + "loss": 2.7494, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.597797393798828, + "rewards/margins": 1.850882887840271, + "rewards/rejected": -25.448680877685547, + "step": 21810 + }, + { + "epoch": 0.7352792476996192, + "grad_norm": 32.04645538330078, + "learning_rate": 1.987047739864875e-07, + "logits/chosen": -1.3755344152450562, + "logits/rejected": -1.5694763660430908, + "logps/chosen": -2.604581832885742, + "logps/rejected": -2.738844633102417, + "loss": 2.8888, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.045818328857422, + "rewards/margins": 1.3426287174224854, + "rewards/rejected": -27.388446807861328, + "step": 21815 + }, + { + "epoch": 0.7354477737706023, + "grad_norm": 39.647884368896484, + "learning_rate": 1.9847009185522644e-07, + "logits/chosen": -1.5242843627929688, + "logits/rejected": -1.7097200155258179, + "logps/chosen": -2.7526488304138184, + "logps/rejected": -2.7933335304260254, + "loss": 3.2408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.5264892578125, + "rewards/margins": 0.4068460464477539, + "rewards/rejected": -27.933338165283203, + "step": 21820 + }, + { + "epoch": 0.7356162998415855, + "grad_norm": 23.728511810302734, + "learning_rate": 1.982355140715869e-07, + "logits/chosen": -1.4998576641082764, + "logits/rejected": -2.016965389251709, + "logps/chosen": -2.0560142993927, + "logps/rejected": -2.4636600017547607, + "loss": 1.8951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.560142517089844, + "rewards/margins": 4.0764570236206055, + "rewards/rejected": -24.636600494384766, + "step": 21825 + }, + { + "epoch": 0.7357848259125687, + "grad_norm": 149.23117065429688, + "learning_rate": 1.9800104071674677e-07, + "logits/chosen": -1.884606957435608, + "logits/rejected": -1.9795339107513428, + "logps/chosen": -2.4407334327697754, + "logps/rejected": -2.8185982704162598, + "loss": 3.4615, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.407337188720703, + "rewards/margins": 3.7786457538604736, + "rewards/rejected": -28.18597984313965, + "step": 21830 + }, + { + "epoch": 0.7359533519835518, + "grad_norm": 30.48876953125, + "learning_rate": 1.9776667187184842e-07, + "logits/chosen": -2.699343204498291, + "logits/rejected": -2.3093960285186768, + "logps/chosen": -2.7900753021240234, + "logps/rejected": -3.133699893951416, + "loss": 1.6805, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.9007511138916, + "rewards/margins": 3.4362499713897705, + "rewards/rejected": -31.336999893188477, + "step": 21835 + }, + { + "epoch": 0.736121878054535, + "grad_norm": 26.56561279296875, + "learning_rate": 1.9753240761799722e-07, + "logits/chosen": -1.8246999979019165, + "logits/rejected": -1.5348434448242188, + "logps/chosen": -1.9646905660629272, + "logps/rejected": -1.8137729167938232, + "loss": 4.6944, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.64690589904785, + "rewards/margins": -1.5091748237609863, + "rewards/rejected": -18.13772964477539, + "step": 21840 + }, + { + "epoch": 0.7362904041255183, + "grad_norm": 31.89987564086914, + "learning_rate": 1.9729824803626299e-07, + "logits/chosen": -1.7160171270370483, + "logits/rejected": -2.2106146812438965, + "logps/chosen": -2.4249229431152344, + "logps/rejected": -4.063868999481201, + "loss": 1.6264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.24922752380371, + "rewards/margins": 16.38946533203125, + "rewards/rejected": -40.638694763183594, + "step": 21845 + }, + { + "epoch": 0.7364589301965014, + "grad_norm": 38.55632019042969, + "learning_rate": 1.9706419320767915e-07, + "logits/chosen": -1.6241347789764404, + "logits/rejected": -1.7468106746673584, + "logps/chosen": -2.586089849472046, + "logps/rejected": -2.7664897441864014, + "loss": 2.2135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.860897064208984, + "rewards/margins": 1.8039993047714233, + "rewards/rejected": -27.66489601135254, + "step": 21850 + }, + { + "epoch": 0.7366274562674846, + "grad_norm": 16.978574752807617, + "learning_rate": 1.9683024321324304e-07, + "logits/chosen": -2.144387722015381, + "logits/rejected": -2.554112195968628, + "logps/chosen": -1.8638019561767578, + "logps/rejected": -2.06876802444458, + "loss": 2.8064, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.638019561767578, + "rewards/margins": 2.0496623516082764, + "rewards/rejected": -20.687681198120117, + "step": 21855 + }, + { + "epoch": 0.7367959823384678, + "grad_norm": 27.162925720214844, + "learning_rate": 1.9659639813391515e-07, + "logits/chosen": -1.3586866855621338, + "logits/rejected": -1.717599868774414, + "logps/chosen": -2.207620620727539, + "logps/rejected": -2.515143394470215, + "loss": 2.1462, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.07620620727539, + "rewards/margins": 3.075228214263916, + "rewards/rejected": -25.151432037353516, + "step": 21860 + }, + { + "epoch": 0.7369645084094509, + "grad_norm": 26.866268157958984, + "learning_rate": 1.9636265805062025e-07, + "logits/chosen": -1.8105628490447998, + "logits/rejected": -2.0254015922546387, + "logps/chosen": -1.65911865234375, + "logps/rejected": -1.9263187646865845, + "loss": 1.389, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.5911865234375, + "rewards/margins": 2.6720001697540283, + "rewards/rejected": -19.263187408447266, + "step": 21865 + }, + { + "epoch": 0.7371330344804341, + "grad_norm": 12.93826675415039, + "learning_rate": 1.9612902304424672e-07, + "logits/chosen": -1.498923897743225, + "logits/rejected": -1.587410569190979, + "logps/chosen": -2.786804676055908, + "logps/rejected": -3.3624961376190186, + "loss": 1.4787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.868045806884766, + "rewards/margins": 5.75691556930542, + "rewards/rejected": -33.624961853027344, + "step": 21870 + }, + { + "epoch": 0.7373015605514173, + "grad_norm": 19.367074966430664, + "learning_rate": 1.9589549319564607e-07, + "logits/chosen": -2.255361557006836, + "logits/rejected": -2.319303035736084, + "logps/chosen": -1.961960792541504, + "logps/rejected": -2.3379883766174316, + "loss": 1.6691, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.619606018066406, + "rewards/margins": 3.7602760791778564, + "rewards/rejected": -23.3798828125, + "step": 21875 + }, + { + "epoch": 0.7374700866224004, + "grad_norm": 24.227609634399414, + "learning_rate": 1.9566206858563406e-07, + "logits/chosen": -2.2398581504821777, + "logits/rejected": -2.4608020782470703, + "logps/chosen": -2.5069398880004883, + "logps/rejected": -2.7963309288024902, + "loss": 2.5762, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.06939697265625, + "rewards/margins": 2.893913745880127, + "rewards/rejected": -27.96331214904785, + "step": 21880 + }, + { + "epoch": 0.7376386126933837, + "grad_norm": 34.777587890625, + "learning_rate": 1.9542874929498964e-07, + "logits/chosen": -2.167205572128296, + "logits/rejected": -2.2443249225616455, + "logps/chosen": -3.534986972808838, + "logps/rejected": -4.648120880126953, + "loss": 4.5919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.3498649597168, + "rewards/margins": 11.131341934204102, + "rewards/rejected": -46.48120880126953, + "step": 21885 + }, + { + "epoch": 0.7378071387643669, + "grad_norm": 24.37884521484375, + "learning_rate": 1.9519553540445562e-07, + "logits/chosen": -1.5812952518463135, + "logits/rejected": -1.5319125652313232, + "logps/chosen": -2.2089767456054688, + "logps/rejected": -2.3872642517089844, + "loss": 2.5415, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.089771270751953, + "rewards/margins": 1.7828737497329712, + "rewards/rejected": -23.872644424438477, + "step": 21890 + }, + { + "epoch": 0.73797566483535, + "grad_norm": 62.177066802978516, + "learning_rate": 1.9496242699473782e-07, + "logits/chosen": -1.7420861721038818, + "logits/rejected": -1.941145658493042, + "logps/chosen": -2.2418718338012695, + "logps/rejected": -2.2937049865722656, + "loss": 3.5042, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.418716430664062, + "rewards/margins": 0.5183313488960266, + "rewards/rejected": -22.937047958374023, + "step": 21895 + }, + { + "epoch": 0.7381441909063332, + "grad_norm": 49.71686935424805, + "learning_rate": 1.9472942414650607e-07, + "logits/chosen": -1.7992541790008545, + "logits/rejected": -1.9160549640655518, + "logps/chosen": -1.7772258520126343, + "logps/rejected": -1.9301140308380127, + "loss": 2.9361, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.772258758544922, + "rewards/margins": 1.5288810729980469, + "rewards/rejected": -19.30113983154297, + "step": 21900 + }, + { + "epoch": 0.7383127169773164, + "grad_norm": 23.071325302124023, + "learning_rate": 1.9449652694039353e-07, + "logits/chosen": -1.5916250944137573, + "logits/rejected": -1.8258146047592163, + "logps/chosen": -2.6305465698242188, + "logps/rejected": -3.2942707538604736, + "loss": 2.7448, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.305465698242188, + "rewards/margins": 6.637242317199707, + "rewards/rejected": -32.942710876464844, + "step": 21905 + }, + { + "epoch": 0.7384812430482995, + "grad_norm": 31.649948120117188, + "learning_rate": 1.9426373545699658e-07, + "logits/chosen": -1.8089519739151, + "logits/rejected": -1.8158622980117798, + "logps/chosen": -2.6677355766296387, + "logps/rejected": -3.247864246368408, + "loss": 2.4772, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.677358627319336, + "rewards/margins": 5.801283836364746, + "rewards/rejected": -32.478641510009766, + "step": 21910 + }, + { + "epoch": 0.7386497691192827, + "grad_norm": 3.3252029418945312, + "learning_rate": 1.9403104977687524e-07, + "logits/chosen": -1.416884422302246, + "logits/rejected": -1.889899492263794, + "logps/chosen": -2.3453822135925293, + "logps/rejected": -2.5371739864349365, + "loss": 2.646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.453821182250977, + "rewards/margins": 1.9179189205169678, + "rewards/rejected": -25.37173843383789, + "step": 21915 + }, + { + "epoch": 0.738818295190266, + "grad_norm": 80.28643798828125, + "learning_rate": 1.9379846998055282e-07, + "logits/chosen": -1.7270739078521729, + "logits/rejected": -1.7358312606811523, + "logps/chosen": -3.119675874710083, + "logps/rejected": -3.312995195388794, + "loss": 4.8238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.196758270263672, + "rewards/margins": 1.933192491531372, + "rewards/rejected": -33.12995147705078, + "step": 21920 + }, + { + "epoch": 0.7389868212612491, + "grad_norm": 26.531578063964844, + "learning_rate": 1.935659961485163e-07, + "logits/chosen": -1.6422741413116455, + "logits/rejected": -1.9076168537139893, + "logps/chosen": -2.1916511058807373, + "logps/rejected": -2.3707642555236816, + "loss": 2.3595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.91651153564453, + "rewards/margins": 1.7911300659179688, + "rewards/rejected": -23.7076416015625, + "step": 21925 + }, + { + "epoch": 0.7391553473322323, + "grad_norm": 40.92249298095703, + "learning_rate": 1.933336283612153e-07, + "logits/chosen": -1.8644893169403076, + "logits/rejected": -2.33244252204895, + "logps/chosen": -1.8013432025909424, + "logps/rejected": -2.192413806915283, + "loss": 1.865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.013431549072266, + "rewards/margins": 3.9107062816619873, + "rewards/rejected": -21.924137115478516, + "step": 21930 + }, + { + "epoch": 0.7393238734032155, + "grad_norm": 25.538143157958984, + "learning_rate": 1.9310136669906342e-07, + "logits/chosen": -1.5520654916763306, + "logits/rejected": -1.9955809116363525, + "logps/chosen": -1.8076364994049072, + "logps/rejected": -2.3288650512695312, + "loss": 1.831, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.076366424560547, + "rewards/margins": 5.21228551864624, + "rewards/rejected": -23.288650512695312, + "step": 21935 + }, + { + "epoch": 0.7394923994741986, + "grad_norm": 30.85343360900879, + "learning_rate": 1.9286921124243727e-07, + "logits/chosen": -1.5067452192306519, + "logits/rejected": -1.6436704397201538, + "logps/chosen": -2.2649459838867188, + "logps/rejected": -2.452059507369995, + "loss": 2.6169, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.649459838867188, + "rewards/margins": 1.8711345195770264, + "rewards/rejected": -24.520593643188477, + "step": 21940 + }, + { + "epoch": 0.7396609255451818, + "grad_norm": 42.99625778198242, + "learning_rate": 1.9263716207167652e-07, + "logits/chosen": -1.8859636783599854, + "logits/rejected": -2.007584810256958, + "logps/chosen": -2.6758522987365723, + "logps/rejected": -2.7993979454040527, + "loss": 2.8658, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.75852394104004, + "rewards/margins": 1.2354570627212524, + "rewards/rejected": -27.993982315063477, + "step": 21945 + }, + { + "epoch": 0.739829451616165, + "grad_norm": 9.061617851257324, + "learning_rate": 1.9240521926708437e-07, + "logits/chosen": -1.9813066720962524, + "logits/rejected": -2.2144722938537598, + "logps/chosen": -3.057587146759033, + "logps/rejected": -3.302992343902588, + "loss": 4.0341, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.57587242126465, + "rewards/margins": 2.454049587249756, + "rewards/rejected": -33.02992248535156, + "step": 21950 + }, + { + "epoch": 0.7399979776871483, + "grad_norm": 53.799285888671875, + "learning_rate": 1.9217338290892704e-07, + "logits/chosen": -1.5386488437652588, + "logits/rejected": -2.009295701980591, + "logps/chosen": -2.03143310546875, + "logps/rejected": -2.4899253845214844, + "loss": 2.1955, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.3143310546875, + "rewards/margins": 4.584921836853027, + "rewards/rejected": -24.89925193786621, + "step": 21955 + }, + { + "epoch": 0.7401665037581314, + "grad_norm": 24.71092414855957, + "learning_rate": 1.9194165307743403e-07, + "logits/chosen": -1.5193939208984375, + "logits/rejected": -1.3868157863616943, + "logps/chosen": -2.746831178665161, + "logps/rejected": -2.5405361652374268, + "loss": 5.1587, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.468311309814453, + "rewards/margins": -2.062947988510132, + "rewards/rejected": -25.40536117553711, + "step": 21960 + }, + { + "epoch": 0.7403350298291146, + "grad_norm": 22.135698318481445, + "learning_rate": 1.917100298527981e-07, + "logits/chosen": -1.7267589569091797, + "logits/rejected": -1.7475883960723877, + "logps/chosen": -2.539499282836914, + "logps/rejected": -2.4604432582855225, + "loss": 4.0905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.39499282836914, + "rewards/margins": -0.7905600666999817, + "rewards/rejected": -24.60443115234375, + "step": 21965 + }, + { + "epoch": 0.7405035559000978, + "grad_norm": 241.43174743652344, + "learning_rate": 1.9147851331517445e-07, + "logits/chosen": -1.717034935951233, + "logits/rejected": -1.8810991048812866, + "logps/chosen": -2.7885756492614746, + "logps/rejected": -2.564950704574585, + "loss": 7.6565, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.885757446289062, + "rewards/margins": -2.236248731613159, + "rewards/rejected": -25.649505615234375, + "step": 21970 + }, + { + "epoch": 0.7406720819710809, + "grad_norm": 0.6432590484619141, + "learning_rate": 1.912471035446821e-07, + "logits/chosen": -1.8602104187011719, + "logits/rejected": -2.3250601291656494, + "logps/chosen": -2.2839043140411377, + "logps/rejected": -2.538177967071533, + "loss": 1.9896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.83904457092285, + "rewards/margins": 2.5427379608154297, + "rewards/rejected": -25.38178062438965, + "step": 21975 + }, + { + "epoch": 0.7408406080420641, + "grad_norm": 59.0754280090332, + "learning_rate": 1.910158006214029e-07, + "logits/chosen": -2.180945873260498, + "logits/rejected": -2.2051949501037598, + "logps/chosen": -2.14461612701416, + "logps/rejected": -2.3139724731445312, + "loss": 3.2225, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.4461612701416, + "rewards/margins": 1.6935627460479736, + "rewards/rejected": -23.139724731445312, + "step": 21980 + }, + { + "epoch": 0.7410091341130473, + "grad_norm": 47.94268798828125, + "learning_rate": 1.907846046253815e-07, + "logits/chosen": -1.857304573059082, + "logits/rejected": -1.8857357501983643, + "logps/chosen": -2.83512544631958, + "logps/rejected": -2.9262855052948, + "loss": 3.4687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.35125160217285, + "rewards/margins": 0.9116016626358032, + "rewards/rejected": -29.262853622436523, + "step": 21985 + }, + { + "epoch": 0.7411776601840304, + "grad_norm": 22.46523666381836, + "learning_rate": 1.9055351563662593e-07, + "logits/chosen": -1.1714767217636108, + "logits/rejected": -1.649778962135315, + "logps/chosen": -2.0318779945373535, + "logps/rejected": -2.8038489818573, + "loss": 1.9946, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.31878089904785, + "rewards/margins": 7.7197113037109375, + "rewards/rejected": -28.038488388061523, + "step": 21990 + }, + { + "epoch": 0.7413461862550137, + "grad_norm": 76.65923309326172, + "learning_rate": 1.9032253373510697e-07, + "logits/chosen": -1.9239925146102905, + "logits/rejected": -1.8971790075302124, + "logps/chosen": -2.953947067260742, + "logps/rejected": -3.118098735809326, + "loss": 3.1773, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.539474487304688, + "rewards/margins": 1.6415166854858398, + "rewards/rejected": -31.18099021911621, + "step": 21995 + }, + { + "epoch": 0.7415147123259969, + "grad_norm": 24.52005958557129, + "learning_rate": 1.9009165900075819e-07, + "logits/chosen": -1.0461251735687256, + "logits/rejected": -1.2524454593658447, + "logps/chosen": -2.185828447341919, + "logps/rejected": -2.4685850143432617, + "loss": 2.8176, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.858285903930664, + "rewards/margins": 2.8275644779205322, + "rewards/rejected": -24.685850143432617, + "step": 22000 + }, + { + "epoch": 0.7415147123259969, + "eval_logits/chosen": -2.2072925567626953, + "eval_logits/rejected": -2.376696825027466, + "eval_logps/chosen": -2.2478811740875244, + "eval_logps/rejected": -2.396430015563965, + "eval_loss": 3.075082302093506, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.47881317138672, + "eval_rewards/margins": 1.4854872226715088, + "eval_rewards/rejected": -23.96430015563965, + "eval_runtime": 12.91, + "eval_samples_per_second": 7.746, + "eval_steps_per_second": 1.936, + "step": 22000 + }, + { + "epoch": 0.74168323839698, + "grad_norm": 27.378284454345703, + "learning_rate": 1.8986089151347628e-07, + "logits/chosen": -1.6426585912704468, + "logits/rejected": -1.6798603534698486, + "logps/chosen": -2.4855704307556152, + "logps/rejected": -2.8420779705047607, + "loss": 2.8563, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.855703353881836, + "rewards/margins": 3.5650744438171387, + "rewards/rejected": -28.4207820892334, + "step": 22005 + }, + { + "epoch": 0.7418517644679632, + "grad_norm": 24.746652603149414, + "learning_rate": 1.8963023135312105e-07, + "logits/chosen": -1.9978439807891846, + "logits/rejected": -2.34228515625, + "logps/chosen": -2.251620292663574, + "logps/rejected": -2.3410422801971436, + "loss": 3.9954, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.51620101928711, + "rewards/margins": 0.8942203521728516, + "rewards/rejected": -23.41042137145996, + "step": 22010 + }, + { + "epoch": 0.7420202905389464, + "grad_norm": 64.21375274658203, + "learning_rate": 1.8939967859951445e-07, + "logits/chosen": -1.7721850872039795, + "logits/rejected": -2.458425998687744, + "logps/chosen": -2.4934096336364746, + "logps/rejected": -3.225297451019287, + "loss": 2.2091, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.93409538269043, + "rewards/margins": 7.318881034851074, + "rewards/rejected": -32.25297927856445, + "step": 22015 + }, + { + "epoch": 0.7421888166099295, + "grad_norm": 26.012672424316406, + "learning_rate": 1.8916923333244195e-07, + "logits/chosen": -1.8884680271148682, + "logits/rejected": -1.7687313556671143, + "logps/chosen": -2.0241780281066895, + "logps/rejected": -2.5447230339050293, + "loss": 2.282, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.24178123474121, + "rewards/margins": 5.20544958114624, + "rewards/rejected": -25.44723129272461, + "step": 22020 + }, + { + "epoch": 0.7423573426809127, + "grad_norm": 29.735652923583984, + "learning_rate": 1.8893889563165154e-07, + "logits/chosen": -1.8298534154891968, + "logits/rejected": -1.7624599933624268, + "logps/chosen": -1.849662184715271, + "logps/rejected": -1.876684546470642, + "loss": 2.8919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.49662208557129, + "rewards/margins": 0.2702246606349945, + "rewards/rejected": -18.766845703125, + "step": 22025 + }, + { + "epoch": 0.742525868751896, + "grad_norm": 35.02006530761719, + "learning_rate": 1.8870866557685421e-07, + "logits/chosen": -2.155496597290039, + "logits/rejected": -2.304814577102661, + "logps/chosen": -2.7024080753326416, + "logps/rejected": -2.8451664447784424, + "loss": 3.3615, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.024084091186523, + "rewards/margins": 1.4275829792022705, + "rewards/rejected": -28.4516658782959, + "step": 22030 + }, + { + "epoch": 0.7426943948228791, + "grad_norm": 18.64474105834961, + "learning_rate": 1.8847854324772316e-07, + "logits/chosen": -1.7854385375976562, + "logits/rejected": -2.331878662109375, + "logps/chosen": -2.454838991165161, + "logps/rejected": -3.2257533073425293, + "loss": 1.3929, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.548389434814453, + "rewards/margins": 7.709145545959473, + "rewards/rejected": -32.25753402709961, + "step": 22035 + }, + { + "epoch": 0.7428629208938623, + "grad_norm": 124.89165496826172, + "learning_rate": 1.8824852872389486e-07, + "logits/chosen": -1.9832309484481812, + "logits/rejected": -2.0631701946258545, + "logps/chosen": -2.7330057621002197, + "logps/rejected": -2.7496206760406494, + "loss": 3.7363, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.330059051513672, + "rewards/margins": 0.16614732146263123, + "rewards/rejected": -27.4962100982666, + "step": 22040 + }, + { + "epoch": 0.7430314469648455, + "grad_norm": 32.47340774536133, + "learning_rate": 1.8801862208496838e-07, + "logits/chosen": -2.0714099407196045, + "logits/rejected": -2.1332778930664062, + "logps/chosen": -1.9636199474334717, + "logps/rejected": -2.1338706016540527, + "loss": 2.1992, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.636199951171875, + "rewards/margins": 1.702506422996521, + "rewards/rejected": -21.33870506286621, + "step": 22045 + }, + { + "epoch": 0.7431999730358286, + "grad_norm": 318.710693359375, + "learning_rate": 1.8778882341050505e-07, + "logits/chosen": -1.1691004037857056, + "logits/rejected": -1.5123542547225952, + "logps/chosen": -3.776531219482422, + "logps/rejected": -4.898382663726807, + "loss": 2.9377, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -37.76531219482422, + "rewards/margins": 11.218512535095215, + "rewards/rejected": -48.98382568359375, + "step": 22050 + }, + { + "epoch": 0.7433684991068118, + "grad_norm": 30.12228012084961, + "learning_rate": 1.8755913278002933e-07, + "logits/chosen": -2.0730338096618652, + "logits/rejected": -2.237550973892212, + "logps/chosen": -2.0106124877929688, + "logps/rejected": -2.141998767852783, + "loss": 2.3136, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.10612678527832, + "rewards/margins": 1.3138636350631714, + "rewards/rejected": -21.41998863220215, + "step": 22055 + }, + { + "epoch": 0.743537025177795, + "grad_norm": 53.70478057861328, + "learning_rate": 1.8732955027302805e-07, + "logits/chosen": -1.5072424411773682, + "logits/rejected": -2.0281248092651367, + "logps/chosen": -3.2066917419433594, + "logps/rejected": -4.702208518981934, + "loss": 1.5268, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.066917419433594, + "rewards/margins": 14.955164909362793, + "rewards/rejected": -47.02208709716797, + "step": 22060 + }, + { + "epoch": 0.7437055512487782, + "grad_norm": 34.362606048583984, + "learning_rate": 1.8710007596895088e-07, + "logits/chosen": -2.142402410507202, + "logits/rejected": -2.3605005741119385, + "logps/chosen": -2.8594348430633545, + "logps/rejected": -3.0598931312561035, + "loss": 2.1342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.594350814819336, + "rewards/margins": 2.0045814514160156, + "rewards/rejected": -30.59893226623535, + "step": 22065 + }, + { + "epoch": 0.7438740773197614, + "grad_norm": 22.987091064453125, + "learning_rate": 1.868707099472095e-07, + "logits/chosen": -1.7191402912139893, + "logits/rejected": -1.91571044921875, + "logps/chosen": -2.633007287979126, + "logps/rejected": -3.2162888050079346, + "loss": 1.0829, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.3300724029541, + "rewards/margins": 5.832815647125244, + "rewards/rejected": -32.16288757324219, + "step": 22070 + }, + { + "epoch": 0.7440426033907446, + "grad_norm": 43.26948165893555, + "learning_rate": 1.866414522871786e-07, + "logits/chosen": -1.5649702548980713, + "logits/rejected": -2.07462739944458, + "logps/chosen": -2.266348123550415, + "logps/rejected": -2.657762050628662, + "loss": 2.2072, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.663482666015625, + "rewards/margins": 3.91413950920105, + "rewards/rejected": -26.577621459960938, + "step": 22075 + }, + { + "epoch": 0.7442111294617277, + "grad_norm": 70.24766540527344, + "learning_rate": 1.864123030681954e-07, + "logits/chosen": -2.052701234817505, + "logits/rejected": -2.133179187774658, + "logps/chosen": -3.3242931365966797, + "logps/rejected": -3.694363832473755, + "loss": 2.0295, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.24292755126953, + "rewards/margins": 3.7007064819335938, + "rewards/rejected": -36.943634033203125, + "step": 22080 + }, + { + "epoch": 0.7443796555327109, + "grad_norm": 29.723024368286133, + "learning_rate": 1.8618326236955906e-07, + "logits/chosen": -1.730105996131897, + "logits/rejected": -2.710209369659424, + "logps/chosen": -2.478010416030884, + "logps/rejected": -3.3492538928985596, + "loss": 2.1972, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.780101776123047, + "rewards/margins": 8.712434768676758, + "rewards/rejected": -33.49253463745117, + "step": 22085 + }, + { + "epoch": 0.7445481816036941, + "grad_norm": 17.219541549682617, + "learning_rate": 1.8595433027053177e-07, + "logits/chosen": -1.7738357782363892, + "logits/rejected": -2.1546788215637207, + "logps/chosen": -2.251901626586914, + "logps/rejected": -2.3508944511413574, + "loss": 2.7481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.51901626586914, + "rewards/margins": 0.989930272102356, + "rewards/rejected": -23.50894546508789, + "step": 22090 + }, + { + "epoch": 0.7447167076746772, + "grad_norm": 23.76531219482422, + "learning_rate": 1.85725506850338e-07, + "logits/chosen": -2.312504768371582, + "logits/rejected": -1.9970581531524658, + "logps/chosen": -2.0622830390930176, + "logps/rejected": -2.181011199951172, + "loss": 3.7324, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.62282943725586, + "rewards/margins": 1.187281847000122, + "rewards/rejected": -21.81011390686035, + "step": 22095 + }, + { + "epoch": 0.7448852337456604, + "grad_norm": 11.535994529724121, + "learning_rate": 1.854967921881642e-07, + "logits/chosen": -1.7529932260513306, + "logits/rejected": -2.226323366165161, + "logps/chosen": -2.4964802265167236, + "logps/rejected": -2.9389634132385254, + "loss": 1.7485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.964801788330078, + "rewards/margins": 4.424830436706543, + "rewards/rejected": -29.389633178710938, + "step": 22100 + }, + { + "epoch": 0.7450537598166437, + "grad_norm": 184.34542846679688, + "learning_rate": 1.852681863631597e-07, + "logits/chosen": -1.7626209259033203, + "logits/rejected": -1.7059158086776733, + "logps/chosen": -2.2885444164276123, + "logps/rejected": -2.4290459156036377, + "loss": 2.3917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.88544464111328, + "rewards/margins": 1.4050140380859375, + "rewards/rejected": -24.29045867919922, + "step": 22105 + }, + { + "epoch": 0.7452222858876268, + "grad_norm": 154.77064514160156, + "learning_rate": 1.8503968945443599e-07, + "logits/chosen": -1.4389687776565552, + "logits/rejected": -1.8824350833892822, + "logps/chosen": -2.680974245071411, + "logps/rejected": -2.93117094039917, + "loss": 1.974, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.809743881225586, + "rewards/margins": 2.5019686222076416, + "rewards/rejected": -29.31171226501465, + "step": 22110 + }, + { + "epoch": 0.74539081195861, + "grad_norm": 39.07148361206055, + "learning_rate": 1.8481130154106684e-07, + "logits/chosen": -1.7964918613433838, + "logits/rejected": -1.9969040155410767, + "logps/chosen": -2.613196611404419, + "logps/rejected": -2.705244302749634, + "loss": 2.9414, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.1319637298584, + "rewards/margins": 0.9204772710800171, + "rewards/rejected": -27.052440643310547, + "step": 22115 + }, + { + "epoch": 0.7455593380295932, + "grad_norm": 76.31917572021484, + "learning_rate": 1.8458302270208825e-07, + "logits/chosen": -1.2408196926116943, + "logits/rejected": -1.2059440612792969, + "logps/chosen": -2.816004514694214, + "logps/rejected": -2.9252734184265137, + "loss": 2.7429, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.160045623779297, + "rewards/margins": 1.092685341835022, + "rewards/rejected": -29.252731323242188, + "step": 22120 + }, + { + "epoch": 0.7457278641005763, + "grad_norm": 77.58131408691406, + "learning_rate": 1.8435485301649857e-07, + "logits/chosen": -2.2920916080474854, + "logits/rejected": -2.2795474529266357, + "logps/chosen": -2.514387845993042, + "logps/rejected": -2.436331033706665, + "loss": 4.8348, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.14388084411621, + "rewards/margins": -0.7805711030960083, + "rewards/rejected": -24.363309860229492, + "step": 22125 + }, + { + "epoch": 0.7458963901715595, + "grad_norm": 2.1885435581207275, + "learning_rate": 1.8412679256325852e-07, + "logits/chosen": -1.1843568086624146, + "logits/rejected": -1.6690727472305298, + "logps/chosen": -2.1616756916046143, + "logps/rejected": -2.833143711090088, + "loss": 1.5591, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.616756439208984, + "rewards/margins": 6.714682579040527, + "rewards/rejected": -28.331436157226562, + "step": 22130 + }, + { + "epoch": 0.7460649162425427, + "grad_norm": 33.07001876831055, + "learning_rate": 1.8389884142129047e-07, + "logits/chosen": -2.085460662841797, + "logits/rejected": -2.267630100250244, + "logps/chosen": -2.2407565116882324, + "logps/rejected": -2.623178005218506, + "loss": 1.4193, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.40756607055664, + "rewards/margins": 3.824214458465576, + "rewards/rejected": -26.231781005859375, + "step": 22135 + }, + { + "epoch": 0.746233442313526, + "grad_norm": 90.47783660888672, + "learning_rate": 1.8367099966947952e-07, + "logits/chosen": -1.9926780462265015, + "logits/rejected": -1.9720783233642578, + "logps/chosen": -2.5652921199798584, + "logps/rejected": -3.247729778289795, + "loss": 1.9665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.65291976928711, + "rewards/margins": 6.8243818283081055, + "rewards/rejected": -32.47730255126953, + "step": 22140 + }, + { + "epoch": 0.7464019683845091, + "grad_norm": 9.481569290161133, + "learning_rate": 1.834432673866727e-07, + "logits/chosen": -1.9665133953094482, + "logits/rejected": -2.1472795009613037, + "logps/chosen": -2.3283274173736572, + "logps/rejected": -2.7122232913970947, + "loss": 2.1223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.283275604248047, + "rewards/margins": 3.838958740234375, + "rewards/rejected": -27.12223243713379, + "step": 22145 + }, + { + "epoch": 0.7465704944554923, + "grad_norm": 30.502643585205078, + "learning_rate": 1.8321564465167943e-07, + "logits/chosen": -1.9200775623321533, + "logits/rejected": -1.9199256896972656, + "logps/chosen": -2.313190460205078, + "logps/rejected": -2.3956024646759033, + "loss": 3.3351, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.13190460205078, + "rewards/margins": 0.8241198658943176, + "rewards/rejected": -23.956024169921875, + "step": 22150 + }, + { + "epoch": 0.7467390205264754, + "grad_norm": 0.6498861908912659, + "learning_rate": 1.8298813154327052e-07, + "logits/chosen": -1.862210988998413, + "logits/rejected": -2.1515557765960693, + "logps/chosen": -2.6309103965759277, + "logps/rejected": -2.9605705738067627, + "loss": 2.0137, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.30910301208496, + "rewards/margins": 3.296602725982666, + "rewards/rejected": -29.6057071685791, + "step": 22155 + }, + { + "epoch": 0.7469075465974586, + "grad_norm": 24.51485252380371, + "learning_rate": 1.827607281401795e-07, + "logits/chosen": -1.7067981958389282, + "logits/rejected": -1.831194519996643, + "logps/chosen": -1.6155074834823608, + "logps/rejected": -1.7414348125457764, + "loss": 2.4547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.155075073242188, + "rewards/margins": 1.2592729330062866, + "rewards/rejected": -17.414348602294922, + "step": 22160 + }, + { + "epoch": 0.7470760726684418, + "grad_norm": 21.368152618408203, + "learning_rate": 1.8253343452110197e-07, + "logits/chosen": -1.6644436120986938, + "logits/rejected": -1.6071170568466187, + "logps/chosen": -1.9112002849578857, + "logps/rejected": -2.0044965744018555, + "loss": 2.83, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.112003326416016, + "rewards/margins": 0.9329609870910645, + "rewards/rejected": -20.044963836669922, + "step": 22165 + }, + { + "epoch": 0.747244598739425, + "grad_norm": 32.65996170043945, + "learning_rate": 1.8230625076469486e-07, + "logits/chosen": -2.3414549827575684, + "logits/rejected": -2.433055877685547, + "logps/chosen": -2.396184206008911, + "logps/rejected": -2.6614813804626465, + "loss": 2.0128, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.961841583251953, + "rewards/margins": 2.652974843978882, + "rewards/rejected": -26.61481285095215, + "step": 22170 + }, + { + "epoch": 0.7474131248104082, + "grad_norm": 28.22422981262207, + "learning_rate": 1.8207917694957775e-07, + "logits/chosen": -1.752981424331665, + "logits/rejected": -2.565385103225708, + "logps/chosen": -2.0491116046905518, + "logps/rejected": -2.334890842437744, + "loss": 2.5923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.49111557006836, + "rewards/margins": 2.857790946960449, + "rewards/rejected": -23.348905563354492, + "step": 22175 + }, + { + "epoch": 0.7475816508813914, + "grad_norm": 32.882240295410156, + "learning_rate": 1.818522131543319e-07, + "logits/chosen": -1.775252103805542, + "logits/rejected": -1.9257984161376953, + "logps/chosen": -3.0990793704986572, + "logps/rejected": -3.4123873710632324, + "loss": 3.3253, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.990793228149414, + "rewards/margins": 3.133082628250122, + "rewards/rejected": -34.12387466430664, + "step": 22180 + }, + { + "epoch": 0.7477501769523746, + "grad_norm": 30.46446418762207, + "learning_rate": 1.8162535945750072e-07, + "logits/chosen": -1.8090412616729736, + "logits/rejected": -2.1050121784210205, + "logps/chosen": -2.088987112045288, + "logps/rejected": -2.3451297283172607, + "loss": 2.756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.889873504638672, + "rewards/margins": 2.561424970626831, + "rewards/rejected": -23.451297760009766, + "step": 22185 + }, + { + "epoch": 0.7479187030233577, + "grad_norm": 21.73251724243164, + "learning_rate": 1.8139861593758903e-07, + "logits/chosen": -1.5950464010238647, + "logits/rejected": -1.5290101766586304, + "logps/chosen": -1.873583197593689, + "logps/rejected": -1.7633142471313477, + "loss": 4.2461, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -18.7358341217041, + "rewards/margins": -1.1026910543441772, + "rewards/rejected": -17.633142471313477, + "step": 22190 + }, + { + "epoch": 0.7480872290943409, + "grad_norm": 44.56904220581055, + "learning_rate": 1.8117198267306394e-07, + "logits/chosen": -1.9961559772491455, + "logits/rejected": -2.7102062702178955, + "logps/chosen": -2.2197203636169434, + "logps/rejected": -2.4373273849487305, + "loss": 2.4196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.197200775146484, + "rewards/margins": 2.176071882247925, + "rewards/rejected": -24.373271942138672, + "step": 22195 + }, + { + "epoch": 0.748255755165324, + "grad_norm": 22.732515335083008, + "learning_rate": 1.8094545974235453e-07, + "logits/chosen": -1.7332115173339844, + "logits/rejected": -1.7054576873779297, + "logps/chosen": -2.0260987281799316, + "logps/rejected": -2.120617389678955, + "loss": 2.4321, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.260986328125, + "rewards/margins": 0.9451854825019836, + "rewards/rejected": -21.206172943115234, + "step": 22200 + }, + { + "epoch": 0.7484242812363072, + "grad_norm": 134.5745391845703, + "learning_rate": 1.8071904722385107e-07, + "logits/chosen": -1.819849967956543, + "logits/rejected": -1.9714431762695312, + "logps/chosen": -3.182410717010498, + "logps/rejected": -3.592799663543701, + "loss": 2.2189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.824108123779297, + "rewards/margins": 4.103890419006348, + "rewards/rejected": -35.928001403808594, + "step": 22205 + }, + { + "epoch": 0.7485928073072904, + "grad_norm": 19.461380004882812, + "learning_rate": 1.8049274519590618e-07, + "logits/chosen": -1.819338083267212, + "logits/rejected": -2.1516706943511963, + "logps/chosen": -2.2335047721862793, + "logps/rejected": -2.5605053901672363, + "loss": 3.4628, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.335046768188477, + "rewards/margins": 3.2700066566467285, + "rewards/rejected": -25.605051040649414, + "step": 22210 + }, + { + "epoch": 0.7487613333782737, + "grad_norm": 43.49894332885742, + "learning_rate": 1.8026655373683407e-07, + "logits/chosen": -1.8901605606079102, + "logits/rejected": -2.1546928882598877, + "logps/chosen": -2.9359912872314453, + "logps/rejected": -3.4877407550811768, + "loss": 2.6148, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.359912872314453, + "rewards/margins": 5.517492771148682, + "rewards/rejected": -34.877403259277344, + "step": 22215 + }, + { + "epoch": 0.7489298594492568, + "grad_norm": 39.32984924316406, + "learning_rate": 1.8004047292491094e-07, + "logits/chosen": -1.7076501846313477, + "logits/rejected": -1.9654871225357056, + "logps/chosen": -2.4198029041290283, + "logps/rejected": -2.534273862838745, + "loss": 2.2676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.198028564453125, + "rewards/margins": 1.1447094678878784, + "rewards/rejected": -25.34273910522461, + "step": 22220 + }, + { + "epoch": 0.74909838552024, + "grad_norm": 20.094335556030273, + "learning_rate": 1.79814502838374e-07, + "logits/chosen": -1.2183277606964111, + "logits/rejected": -1.6691389083862305, + "logps/chosen": -2.220008373260498, + "logps/rejected": -2.553109884262085, + "loss": 2.2165, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.200082778930664, + "rewards/margins": 3.3310158252716064, + "rewards/rejected": -25.531099319458008, + "step": 22225 + }, + { + "epoch": 0.7492669115912232, + "grad_norm": 13.931550025939941, + "learning_rate": 1.795886435554229e-07, + "logits/chosen": -1.722768783569336, + "logits/rejected": -1.9010473489761353, + "logps/chosen": -2.112097978591919, + "logps/rejected": -2.6837105751037598, + "loss": 1.1627, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.120981216430664, + "rewards/margins": 5.71612548828125, + "rewards/rejected": -26.837106704711914, + "step": 22230 + }, + { + "epoch": 0.7494354376622063, + "grad_norm": 9.187200546264648, + "learning_rate": 1.793628951542187e-07, + "logits/chosen": -1.9483964443206787, + "logits/rejected": -1.9009296894073486, + "logps/chosen": -2.5094029903411865, + "logps/rejected": -2.2897486686706543, + "loss": 5.5848, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.09402847290039, + "rewards/margins": -2.1965441703796387, + "rewards/rejected": -22.89748764038086, + "step": 22235 + }, + { + "epoch": 0.7496039637331895, + "grad_norm": 52.516719818115234, + "learning_rate": 1.7913725771288368e-07, + "logits/chosen": -1.4216723442077637, + "logits/rejected": -1.5206291675567627, + "logps/chosen": -1.9325135946273804, + "logps/rejected": -2.0983996391296387, + "loss": 2.3383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.325138092041016, + "rewards/margins": 1.6588608026504517, + "rewards/rejected": -20.983993530273438, + "step": 22240 + }, + { + "epoch": 0.7497724898041727, + "grad_norm": 28.921022415161133, + "learning_rate": 1.7891173130950233e-07, + "logits/chosen": -1.7567373514175415, + "logits/rejected": -2.1790812015533447, + "logps/chosen": -2.419783115386963, + "logps/rejected": -2.6635146141052246, + "loss": 1.936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.197826385498047, + "rewards/margins": 2.437316417694092, + "rewards/rejected": -26.635147094726562, + "step": 22245 + }, + { + "epoch": 0.7499410158751559, + "grad_norm": 28.339021682739258, + "learning_rate": 1.7868631602212037e-07, + "logits/chosen": -2.138418674468994, + "logits/rejected": -2.2020459175109863, + "logps/chosen": -3.1503100395202637, + "logps/rejected": -3.557222366333008, + "loss": 3.0416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.503103256225586, + "rewards/margins": 4.069121360778809, + "rewards/rejected": -35.57222366333008, + "step": 22250 + }, + { + "epoch": 0.7501095419461391, + "grad_norm": 31.886789321899414, + "learning_rate": 1.784610119287452e-07, + "logits/chosen": -1.6445305347442627, + "logits/rejected": -1.7116506099700928, + "logps/chosen": -2.769115447998047, + "logps/rejected": -2.8107247352600098, + "loss": 3.139, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.6911563873291, + "rewards/margins": 0.41608962416648865, + "rewards/rejected": -28.10724449157715, + "step": 22255 + }, + { + "epoch": 0.7502780680171223, + "grad_norm": 46.03837966918945, + "learning_rate": 1.7823581910734564e-07, + "logits/chosen": -2.1681454181671143, + "logits/rejected": -2.4093782901763916, + "logps/chosen": -2.065337657928467, + "logps/rejected": -2.368126392364502, + "loss": 2.1653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.653377532958984, + "rewards/margins": 3.027885913848877, + "rewards/rejected": -23.681264877319336, + "step": 22260 + }, + { + "epoch": 0.7504465940881054, + "grad_norm": 31.069231033325195, + "learning_rate": 1.7801073763585227e-07, + "logits/chosen": -1.8537037372589111, + "logits/rejected": -2.0984387397766113, + "logps/chosen": -1.594420313835144, + "logps/rejected": -2.136216640472412, + "loss": 1.6542, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -15.94420337677002, + "rewards/margins": 5.417963981628418, + "rewards/rejected": -21.362167358398438, + "step": 22265 + }, + { + "epoch": 0.7506151201590886, + "grad_norm": 32.4260368347168, + "learning_rate": 1.7778576759215663e-07, + "logits/chosen": -1.5697544813156128, + "logits/rejected": -1.641465187072754, + "logps/chosen": -2.854722738265991, + "logps/rejected": -2.992177724838257, + "loss": 3.0165, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.547225952148438, + "rewards/margins": 1.3745505809783936, + "rewards/rejected": -29.921777725219727, + "step": 22270 + }, + { + "epoch": 0.7507836462300718, + "grad_norm": 24.50908851623535, + "learning_rate": 1.7756090905411204e-07, + "logits/chosen": -1.6597118377685547, + "logits/rejected": -2.5056252479553223, + "logps/chosen": -2.06673002243042, + "logps/rejected": -2.915621757507324, + "loss": 1.316, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.667299270629883, + "rewards/margins": 8.488917350769043, + "rewards/rejected": -29.15621566772461, + "step": 22275 + }, + { + "epoch": 0.7509521723010549, + "grad_norm": 16.406259536743164, + "learning_rate": 1.7733616209953317e-07, + "logits/chosen": -2.0685009956359863, + "logits/rejected": -2.1869888305664062, + "logps/chosen": -2.3974456787109375, + "logps/rejected": -2.5520505905151367, + "loss": 2.4664, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.974454879760742, + "rewards/margins": 1.5460479259490967, + "rewards/rejected": -25.520503997802734, + "step": 22280 + }, + { + "epoch": 0.7511206983720382, + "grad_norm": 22.887910842895508, + "learning_rate": 1.7711152680619622e-07, + "logits/chosen": -1.8368467092514038, + "logits/rejected": -1.9409831762313843, + "logps/chosen": -2.277331829071045, + "logps/rejected": -2.750244617462158, + "loss": 1.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.773319244384766, + "rewards/margins": 4.729128837585449, + "rewards/rejected": -27.5024471282959, + "step": 22285 + }, + { + "epoch": 0.7512892244430214, + "grad_norm": 60.598419189453125, + "learning_rate": 1.768870032518387e-07, + "logits/chosen": -2.0507781505584717, + "logits/rejected": -2.0986313819885254, + "logps/chosen": -2.0975115299224854, + "logps/rejected": -2.2042901515960693, + "loss": 2.1705, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.975116729736328, + "rewards/margins": 1.067787766456604, + "rewards/rejected": -22.04290199279785, + "step": 22290 + }, + { + "epoch": 0.7514577505140045, + "grad_norm": 28.564823150634766, + "learning_rate": 1.7666259151415908e-07, + "logits/chosen": -1.376070261001587, + "logits/rejected": -1.6946004629135132, + "logps/chosen": -2.3124992847442627, + "logps/rejected": -2.3266541957855225, + "loss": 4.3577, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.124990463256836, + "rewards/margins": 0.14155101776123047, + "rewards/rejected": -23.26654052734375, + "step": 22295 + }, + { + "epoch": 0.7516262765849877, + "grad_norm": 144.43304443359375, + "learning_rate": 1.7643829167081746e-07, + "logits/chosen": -2.3734333515167236, + "logits/rejected": -2.6592297554016113, + "logps/chosen": -2.635132312774658, + "logps/rejected": -2.699852228164673, + "loss": 3.4685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.351318359375, + "rewards/margins": 0.647201657295227, + "rewards/rejected": -26.998523712158203, + "step": 22300 + }, + { + "epoch": 0.7517948026559709, + "grad_norm": 30.19963264465332, + "learning_rate": 1.7621410379943551e-07, + "logits/chosen": -1.405311942100525, + "logits/rejected": -1.310832142829895, + "logps/chosen": -2.3792216777801514, + "logps/rejected": -2.3998732566833496, + "loss": 3.3591, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.792217254638672, + "rewards/margins": 0.20651578903198242, + "rewards/rejected": -23.998733520507812, + "step": 22305 + }, + { + "epoch": 0.751963328726954, + "grad_norm": 73.10344696044922, + "learning_rate": 1.7599002797759542e-07, + "logits/chosen": -1.719347596168518, + "logits/rejected": -2.036027431488037, + "logps/chosen": -2.421614170074463, + "logps/rejected": -3.365711212158203, + "loss": 2.121, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.216140747070312, + "rewards/margins": 9.440972328186035, + "rewards/rejected": -33.65711212158203, + "step": 22310 + }, + { + "epoch": 0.7521318547979372, + "grad_norm": 25.012081146240234, + "learning_rate": 1.7576606428284114e-07, + "logits/chosen": -1.8271840810775757, + "logits/rejected": -2.110874652862549, + "logps/chosen": -2.605572462081909, + "logps/rejected": -3.6537890434265137, + "loss": 1.924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.05572509765625, + "rewards/margins": 10.48216438293457, + "rewards/rejected": -36.53789138793945, + "step": 22315 + }, + { + "epoch": 0.7523003808689204, + "grad_norm": 3.9147238731384277, + "learning_rate": 1.7554221279267768e-07, + "logits/chosen": -1.5303199291229248, + "logits/rejected": -1.9406137466430664, + "logps/chosen": -2.8770370483398438, + "logps/rejected": -3.2161426544189453, + "loss": 3.0375, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -28.770370483398438, + "rewards/margins": 3.391056776046753, + "rewards/rejected": -32.16143035888672, + "step": 22320 + }, + { + "epoch": 0.7524689069399036, + "grad_norm": 32.75675964355469, + "learning_rate": 1.7531847358457148e-07, + "logits/chosen": -1.7144191265106201, + "logits/rejected": -2.0489983558654785, + "logps/chosen": -2.0209603309631348, + "logps/rejected": -2.873371124267578, + "loss": 2.5432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.20960235595703, + "rewards/margins": 8.52410888671875, + "rewards/rejected": -28.73370933532715, + "step": 22325 + }, + { + "epoch": 0.7526374330108868, + "grad_norm": 141.016845703125, + "learning_rate": 1.7509484673594938e-07, + "logits/chosen": -1.8166608810424805, + "logits/rejected": -1.8591245412826538, + "logps/chosen": -2.8313632011413574, + "logps/rejected": -2.9321489334106445, + "loss": 3.0837, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.313629150390625, + "rewards/margins": 1.0078595876693726, + "rewards/rejected": -29.321491241455078, + "step": 22330 + }, + { + "epoch": 0.75280595908187, + "grad_norm": 20.358863830566406, + "learning_rate": 1.748713323242001e-07, + "logits/chosen": -1.726928472518921, + "logits/rejected": -1.9667476415634155, + "logps/chosen": -3.285855531692505, + "logps/rejected": -4.028448581695557, + "loss": 3.8553, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.858558654785156, + "rewards/margins": 7.425924777984619, + "rewards/rejected": -40.28447723388672, + "step": 22335 + }, + { + "epoch": 0.7529744851528531, + "grad_norm": 130.31146240234375, + "learning_rate": 1.7464793042667337e-07, + "logits/chosen": -1.8716411590576172, + "logits/rejected": -2.0586330890655518, + "logps/chosen": -2.2499797344207764, + "logps/rejected": -2.5665526390075684, + "loss": 4.8833, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.499799728393555, + "rewards/margins": 3.165727376937866, + "rewards/rejected": -25.66552734375, + "step": 22340 + }, + { + "epoch": 0.7531430112238363, + "grad_norm": 44.93946838378906, + "learning_rate": 1.7442464112067935e-07, + "logits/chosen": -1.6285629272460938, + "logits/rejected": -1.6942113637924194, + "logps/chosen": -2.2660062313079834, + "logps/rejected": -2.411550998687744, + "loss": 3.1253, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.66006088256836, + "rewards/margins": 1.4554458856582642, + "rewards/rejected": -24.115509033203125, + "step": 22345 + }, + { + "epoch": 0.7533115372948195, + "grad_norm": 180.3627471923828, + "learning_rate": 1.7420146448348982e-07, + "logits/chosen": -1.4327377080917358, + "logits/rejected": -1.430065393447876, + "logps/chosen": -3.0352001190185547, + "logps/rejected": -2.8664352893829346, + "loss": 4.7893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.352001190185547, + "rewards/margins": -1.6876468658447266, + "rewards/rejected": -28.664356231689453, + "step": 22350 + }, + { + "epoch": 0.7534800633658026, + "grad_norm": 30.93425750732422, + "learning_rate": 1.7397840059233754e-07, + "logits/chosen": -1.6150524616241455, + "logits/rejected": -2.075601100921631, + "logps/chosen": -2.180380344390869, + "logps/rejected": -2.4855568408966064, + "loss": 1.4605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.803804397583008, + "rewards/margins": 3.051762580871582, + "rewards/rejected": -24.855566024780273, + "step": 22355 + }, + { + "epoch": 0.7536485894367859, + "grad_norm": 120.5937728881836, + "learning_rate": 1.7375544952441628e-07, + "logits/chosen": -1.124226450920105, + "logits/rejected": -1.2850375175476074, + "logps/chosen": -3.2305736541748047, + "logps/rejected": -3.4257309436798096, + "loss": 4.1787, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -32.30573654174805, + "rewards/margins": 1.9515708684921265, + "rewards/rejected": -34.25730895996094, + "step": 22360 + }, + { + "epoch": 0.7538171155077691, + "grad_norm": 112.42485046386719, + "learning_rate": 1.735326113568802e-07, + "logits/chosen": -1.9617102146148682, + "logits/rejected": -2.3836417198181152, + "logps/chosen": -2.8195998668670654, + "logps/rejected": -3.2562496662139893, + "loss": 4.2705, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.196002960205078, + "rewards/margins": 4.366497039794922, + "rewards/rejected": -32.562496185302734, + "step": 22365 + }, + { + "epoch": 0.7539856415787523, + "grad_norm": 34.154544830322266, + "learning_rate": 1.7330988616684505e-07, + "logits/chosen": -2.2110228538513184, + "logits/rejected": -2.7860636711120605, + "logps/chosen": -2.109682321548462, + "logps/rejected": -2.5072388648986816, + "loss": 1.4138, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.09682273864746, + "rewards/margins": 3.9755663871765137, + "rewards/rejected": -25.072391510009766, + "step": 22370 + }, + { + "epoch": 0.7541541676497354, + "grad_norm": 55.39371871948242, + "learning_rate": 1.7308727403138734e-07, + "logits/chosen": -0.8848699331283569, + "logits/rejected": -0.8680378198623657, + "logps/chosen": -3.3436615467071533, + "logps/rejected": -3.356152057647705, + "loss": 3.4747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.436614990234375, + "rewards/margins": 0.12490396201610565, + "rewards/rejected": -33.561519622802734, + "step": 22375 + }, + { + "epoch": 0.7543226937207186, + "grad_norm": 33.278385162353516, + "learning_rate": 1.7286477502754415e-07, + "logits/chosen": -1.8989862203598022, + "logits/rejected": -2.0420544147491455, + "logps/chosen": -1.8241840600967407, + "logps/rejected": -1.867881178855896, + "loss": 2.7088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.24184226989746, + "rewards/margins": 0.43697088956832886, + "rewards/rejected": -18.67881202697754, + "step": 22380 + }, + { + "epoch": 0.7544912197917018, + "grad_norm": 50.16250228881836, + "learning_rate": 1.7264238923231366e-07, + "logits/chosen": -1.6946766376495361, + "logits/rejected": -2.0134615898132324, + "logps/chosen": -2.77992582321167, + "logps/rejected": -2.884464979171753, + "loss": 3.8556, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.79926109313965, + "rewards/margins": 1.0453927516937256, + "rewards/rejected": -28.844654083251953, + "step": 22385 + }, + { + "epoch": 0.7546597458626849, + "grad_norm": 61.37696075439453, + "learning_rate": 1.724201167226549e-07, + "logits/chosen": -1.5504992008209229, + "logits/rejected": -1.4514929056167603, + "logps/chosen": -3.9573092460632324, + "logps/rejected": -3.8085074424743652, + "loss": 6.4388, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -39.573097229003906, + "rewards/margins": -1.488017201423645, + "rewards/rejected": -38.085079193115234, + "step": 22390 + }, + { + "epoch": 0.7548282719336682, + "grad_norm": 22.884544372558594, + "learning_rate": 1.7219795757548778e-07, + "logits/chosen": -1.9424610137939453, + "logits/rejected": -2.200577974319458, + "logps/chosen": -2.2687830924987793, + "logps/rejected": -3.1191654205322266, + "loss": 2.0236, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.68783187866211, + "rewards/margins": 8.50382137298584, + "rewards/rejected": -31.191654205322266, + "step": 22395 + }, + { + "epoch": 0.7549967980046514, + "grad_norm": 48.091827392578125, + "learning_rate": 1.7197591186769245e-07, + "logits/chosen": -1.8664665222167969, + "logits/rejected": -1.8764442205429077, + "logps/chosen": -2.2680325508117676, + "logps/rejected": -2.3142781257629395, + "loss": 3.3744, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.68032455444336, + "rewards/margins": 0.4624575674533844, + "rewards/rejected": -23.142780303955078, + "step": 22400 + }, + { + "epoch": 0.7549967980046514, + "eval_logits/chosen": -2.2423062324523926, + "eval_logits/rejected": -2.414647102355957, + "eval_logps/chosen": -2.2602834701538086, + "eval_logps/rejected": -2.4113729000091553, + "eval_loss": 3.0775012969970703, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.602834701538086, + "eval_rewards/margins": 1.5108985900878906, + "eval_rewards/rejected": -24.113731384277344, + "eval_runtime": 12.8925, + "eval_samples_per_second": 7.756, + "eval_steps_per_second": 1.939, + "step": 22400 + }, + { + "epoch": 0.7551653240756345, + "grad_norm": 15.63908576965332, + "learning_rate": 1.7175397967611043e-07, + "logits/chosen": -1.7012121677398682, + "logits/rejected": -2.0834169387817383, + "logps/chosen": -2.3098204135894775, + "logps/rejected": -2.574169158935547, + "loss": 3.1278, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.09820556640625, + "rewards/margins": 2.643484592437744, + "rewards/rejected": -25.741689682006836, + "step": 22405 + }, + { + "epoch": 0.7553338501466177, + "grad_norm": 186.98109436035156, + "learning_rate": 1.7153216107754365e-07, + "logits/chosen": -1.6505035161972046, + "logits/rejected": -1.967118263244629, + "logps/chosen": -2.6132192611694336, + "logps/rejected": -3.1035962104797363, + "loss": 3.2179, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.132190704345703, + "rewards/margins": 4.903773307800293, + "rewards/rejected": -31.035964965820312, + "step": 22410 + }, + { + "epoch": 0.7555023762176009, + "grad_norm": 60.38026809692383, + "learning_rate": 1.7131045614875484e-07, + "logits/chosen": -1.731563925743103, + "logits/rejected": -1.6696627140045166, + "logps/chosen": -2.5402698516845703, + "logps/rejected": -2.457115888595581, + "loss": 4.8036, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.402698516845703, + "rewards/margins": -0.8315426111221313, + "rewards/rejected": -24.571157455444336, + "step": 22415 + }, + { + "epoch": 0.755670902288584, + "grad_norm": 122.55125427246094, + "learning_rate": 1.710888649664673e-07, + "logits/chosen": -1.7970136404037476, + "logits/rejected": -1.8179333209991455, + "logps/chosen": -2.197624683380127, + "logps/rejected": -2.4766898155212402, + "loss": 2.565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.976247787475586, + "rewards/margins": 2.7906508445739746, + "rewards/rejected": -24.766897201538086, + "step": 22420 + }, + { + "epoch": 0.7558394283595672, + "grad_norm": 206.24832153320312, + "learning_rate": 1.7086738760736497e-07, + "logits/chosen": -2.3038604259490967, + "logits/rejected": -2.688873052597046, + "logps/chosen": -2.979055881500244, + "logps/rejected": -3.692131757736206, + "loss": 2.4169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.79056167602539, + "rewards/margins": 7.130758762359619, + "rewards/rejected": -36.92131805419922, + "step": 22425 + }, + { + "epoch": 0.7560079544305504, + "grad_norm": 21.21005630493164, + "learning_rate": 1.7064602414809266e-07, + "logits/chosen": -1.6251780986785889, + "logits/rejected": -2.547797203063965, + "logps/chosen": -2.238882064819336, + "logps/rejected": -2.8598296642303467, + "loss": 2.0533, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.388818740844727, + "rewards/margins": 6.209475517272949, + "rewards/rejected": -28.598297119140625, + "step": 22430 + }, + { + "epoch": 0.7561764805015336, + "grad_norm": 23.16986656188965, + "learning_rate": 1.7042477466525522e-07, + "logits/chosen": -1.8212858438491821, + "logits/rejected": -1.88141667842865, + "logps/chosen": -2.4218106269836426, + "logps/rejected": -2.696655750274658, + "loss": 3.2363, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.21810531616211, + "rewards/margins": 2.7484498023986816, + "rewards/rejected": -26.9665584564209, + "step": 22435 + }, + { + "epoch": 0.7563450065725168, + "grad_norm": 31.574495315551758, + "learning_rate": 1.7020363923541853e-07, + "logits/chosen": -1.8986423015594482, + "logits/rejected": -1.8368682861328125, + "logps/chosen": -3.057534694671631, + "logps/rejected": -3.26399302482605, + "loss": 3.2009, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.57534408569336, + "rewards/margins": 2.064589500427246, + "rewards/rejected": -32.63993453979492, + "step": 22440 + }, + { + "epoch": 0.7565135326435, + "grad_norm": 83.39399719238281, + "learning_rate": 1.6998261793510898e-07, + "logits/chosen": -2.201836109161377, + "logits/rejected": -2.4175848960876465, + "logps/chosen": -2.4501421451568604, + "logps/rejected": -3.2026398181915283, + "loss": 3.1793, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.501422882080078, + "rewards/margins": 7.524975776672363, + "rewards/rejected": -32.026397705078125, + "step": 22445 + }, + { + "epoch": 0.7566820587144831, + "grad_norm": 39.38967514038086, + "learning_rate": 1.6976171084081304e-07, + "logits/chosen": -1.823545217514038, + "logits/rejected": -2.4631857872009277, + "logps/chosen": -2.3719120025634766, + "logps/rejected": -3.7122325897216797, + "loss": 1.0425, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.719120025634766, + "rewards/margins": 13.403205871582031, + "rewards/rejected": -37.1223258972168, + "step": 22450 + }, + { + "epoch": 0.7568505847854663, + "grad_norm": 152.68515014648438, + "learning_rate": 1.6954091802897807e-07, + "logits/chosen": -1.5671319961547852, + "logits/rejected": -1.5362260341644287, + "logps/chosen": -2.3477840423583984, + "logps/rejected": -2.3875017166137695, + "loss": 3.1308, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.47783851623535, + "rewards/margins": 0.3971790373325348, + "rewards/rejected": -23.875019073486328, + "step": 22455 + }, + { + "epoch": 0.7570191108564495, + "grad_norm": 33.73069381713867, + "learning_rate": 1.6932023957601187e-07, + "logits/chosen": -1.9830052852630615, + "logits/rejected": -1.470280647277832, + "logps/chosen": -3.3719642162323, + "logps/rejected": -3.146393299102783, + "loss": 5.4149, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.719642639160156, + "rewards/margins": -2.2557122707366943, + "rewards/rejected": -31.463932037353516, + "step": 22460 + }, + { + "epoch": 0.7571876369274326, + "grad_norm": 26.94874382019043, + "learning_rate": 1.6909967555828263e-07, + "logits/chosen": -1.3639678955078125, + "logits/rejected": -1.6912784576416016, + "logps/chosen": -2.296612024307251, + "logps/rejected": -2.5016751289367676, + "loss": 2.0897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.96611976623535, + "rewards/margins": 2.050632953643799, + "rewards/rejected": -25.01675033569336, + "step": 22465 + }, + { + "epoch": 0.7573561629984159, + "grad_norm": 44.422882080078125, + "learning_rate": 1.6887922605211858e-07, + "logits/chosen": -2.0387985706329346, + "logits/rejected": -2.1799306869506836, + "logps/chosen": -2.5838475227355957, + "logps/rejected": -2.6631133556365967, + "loss": 3.9902, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.83847427368164, + "rewards/margins": 0.792662501335144, + "rewards/rejected": -26.631134033203125, + "step": 22470 + }, + { + "epoch": 0.7575246890693991, + "grad_norm": 43.87267303466797, + "learning_rate": 1.686588911338087e-07, + "logits/chosen": -1.1579951047897339, + "logits/rejected": -1.377206563949585, + "logps/chosen": -2.371851682662964, + "logps/rejected": -2.8188374042510986, + "loss": 2.9854, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.718515396118164, + "rewards/margins": 4.469855785369873, + "rewards/rejected": -28.188369750976562, + "step": 22475 + }, + { + "epoch": 0.7576932151403822, + "grad_norm": 35.04606628417969, + "learning_rate": 1.6843867087960251e-07, + "logits/chosen": -1.3556041717529297, + "logits/rejected": -1.4663169384002686, + "logps/chosen": -2.151585102081299, + "logps/rejected": -2.2910666465759277, + "loss": 3.1251, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.515850067138672, + "rewards/margins": 1.3948184251785278, + "rewards/rejected": -22.910667419433594, + "step": 22480 + }, + { + "epoch": 0.7578617412113654, + "grad_norm": 39.85191345214844, + "learning_rate": 1.682185653657091e-07, + "logits/chosen": -2.1124298572540283, + "logits/rejected": -2.0933234691619873, + "logps/chosen": -2.2313594818115234, + "logps/rejected": -2.3498880863189697, + "loss": 2.6802, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.3135929107666, + "rewards/margins": 1.1852903366088867, + "rewards/rejected": -23.498882293701172, + "step": 22485 + }, + { + "epoch": 0.7580302672823486, + "grad_norm": 54.735355377197266, + "learning_rate": 1.6799857466829858e-07, + "logits/chosen": -1.6174736022949219, + "logits/rejected": -1.3525209426879883, + "logps/chosen": -2.1450111865997314, + "logps/rejected": -2.0265889167785645, + "loss": 4.3679, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.450109481811523, + "rewards/margins": -1.1842209100723267, + "rewards/rejected": -20.26589012145996, + "step": 22490 + }, + { + "epoch": 0.7581987933533317, + "grad_norm": 24.917158126831055, + "learning_rate": 1.6777869886350104e-07, + "logits/chosen": -2.9526991844177246, + "logits/rejected": -3.172231674194336, + "logps/chosen": -2.476539134979248, + "logps/rejected": -2.646613836288452, + "loss": 3.8738, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.765390396118164, + "rewards/margins": 1.7007482051849365, + "rewards/rejected": -26.466136932373047, + "step": 22495 + }, + { + "epoch": 0.7583673194243149, + "grad_norm": 23.80600357055664, + "learning_rate": 1.67558938027407e-07, + "logits/chosen": -2.386019229888916, + "logits/rejected": -2.5371181964874268, + "logps/chosen": -2.0828425884246826, + "logps/rejected": -2.561619758605957, + "loss": 1.7093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.828426361083984, + "rewards/margins": 4.7877702713012695, + "rewards/rejected": -25.616199493408203, + "step": 22500 + }, + { + "epoch": 0.7585358454952982, + "grad_norm": 86.9273910522461, + "learning_rate": 1.673392922360667e-07, + "logits/chosen": -1.7680097818374634, + "logits/rejected": -2.501622438430786, + "logps/chosen": -2.6157374382019043, + "logps/rejected": -3.2141737937927246, + "loss": 2.935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.157373428344727, + "rewards/margins": 5.984361171722412, + "rewards/rejected": -32.14173889160156, + "step": 22505 + }, + { + "epoch": 0.7587043715662813, + "grad_norm": 37.30070114135742, + "learning_rate": 1.67119761565491e-07, + "logits/chosen": -1.8771638870239258, + "logits/rejected": -1.8926589488983154, + "logps/chosen": -3.0613274574279785, + "logps/rejected": -3.1823253631591797, + "loss": 3.0842, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.6132755279541, + "rewards/margins": 1.2099756002426147, + "rewards/rejected": -31.823253631591797, + "step": 22510 + }, + { + "epoch": 0.7588728976372645, + "grad_norm": 1.0162453651428223, + "learning_rate": 1.669003460916511e-07, + "logits/chosen": -1.4025744199752808, + "logits/rejected": -1.448706030845642, + "logps/chosen": -2.603241443634033, + "logps/rejected": -2.6898093223571777, + "loss": 3.7204, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.03241539001465, + "rewards/margins": 0.8656784296035767, + "rewards/rejected": -26.89809226989746, + "step": 22515 + }, + { + "epoch": 0.7590414237082477, + "grad_norm": 23.106170654296875, + "learning_rate": 1.666810458904776e-07, + "logits/chosen": -2.604548692703247, + "logits/rejected": -2.551825523376465, + "logps/chosen": -2.1498825550079346, + "logps/rejected": -2.1720776557922363, + "loss": 3.3861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.49882698059082, + "rewards/margins": 0.22195252776145935, + "rewards/rejected": -21.720775604248047, + "step": 22520 + }, + { + "epoch": 0.7592099497792308, + "grad_norm": 21.098031997680664, + "learning_rate": 1.6646186103786187e-07, + "logits/chosen": -1.3105382919311523, + "logits/rejected": -2.053884983062744, + "logps/chosen": -2.0351314544677734, + "logps/rejected": -3.2010607719421387, + "loss": 1.3937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.3513126373291, + "rewards/margins": 11.659296035766602, + "rewards/rejected": -32.01061248779297, + "step": 22525 + }, + { + "epoch": 0.759378475850214, + "grad_norm": 33.70224380493164, + "learning_rate": 1.6624279160965522e-07, + "logits/chosen": -1.2020736932754517, + "logits/rejected": -1.5865360498428345, + "logps/chosen": -2.410283327102661, + "logps/rejected": -3.008296489715576, + "loss": 2.2057, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.102832794189453, + "rewards/margins": 5.980130195617676, + "rewards/rejected": -30.082965850830078, + "step": 22530 + }, + { + "epoch": 0.7595470019211972, + "grad_norm": 42.09781265258789, + "learning_rate": 1.6602383768166895e-07, + "logits/chosen": -0.8306490778923035, + "logits/rejected": -1.1787729263305664, + "logps/chosen": -2.244062662124634, + "logps/rejected": -2.4677631855010986, + "loss": 2.303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.440624237060547, + "rewards/margins": 2.2370076179504395, + "rewards/rejected": -24.67763328552246, + "step": 22535 + }, + { + "epoch": 0.7597155279921803, + "grad_norm": 21.09007453918457, + "learning_rate": 1.6580499932967424e-07, + "logits/chosen": -2.110147476196289, + "logits/rejected": -2.182302474975586, + "logps/chosen": -3.0315299034118652, + "logps/rejected": -3.077430248260498, + "loss": 2.7172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.3153018951416, + "rewards/margins": 0.45900383591651917, + "rewards/rejected": -30.774303436279297, + "step": 22540 + }, + { + "epoch": 0.7598840540631636, + "grad_norm": 53.60516357421875, + "learning_rate": 1.6558627662940245e-07, + "logits/chosen": -1.1278090476989746, + "logits/rejected": -1.2264466285705566, + "logps/chosen": -1.8437979221343994, + "logps/rejected": -1.921600341796875, + "loss": 2.8029, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.43798065185547, + "rewards/margins": 0.778022289276123, + "rewards/rejected": -19.216001510620117, + "step": 22545 + }, + { + "epoch": 0.7600525801341468, + "grad_norm": 45.398101806640625, + "learning_rate": 1.6536766965654497e-07, + "logits/chosen": -1.9748245477676392, + "logits/rejected": -1.8975473642349243, + "logps/chosen": -2.218376636505127, + "logps/rejected": -2.4128360748291016, + "loss": 4.6966, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.183765411376953, + "rewards/margins": 1.9445937871932983, + "rewards/rejected": -24.128360748291016, + "step": 22550 + }, + { + "epoch": 0.76022110620513, + "grad_norm": 66.98893737792969, + "learning_rate": 1.6514917848675302e-07, + "logits/chosen": -1.7163118124008179, + "logits/rejected": -2.033578395843506, + "logps/chosen": -2.355811595916748, + "logps/rejected": -2.6961662769317627, + "loss": 2.6563, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.558116912841797, + "rewards/margins": 3.4035465717315674, + "rewards/rejected": -26.9616641998291, + "step": 22555 + }, + { + "epoch": 0.7603896322761131, + "grad_norm": 41.96858596801758, + "learning_rate": 1.6493080319563786e-07, + "logits/chosen": -1.6755859851837158, + "logits/rejected": -2.1102638244628906, + "logps/chosen": -2.6356406211853027, + "logps/rejected": -3.725564479827881, + "loss": 1.5622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.356409072875977, + "rewards/margins": 10.899238586425781, + "rewards/rejected": -37.25564956665039, + "step": 22560 + }, + { + "epoch": 0.7605581583470963, + "grad_norm": 24.43367576599121, + "learning_rate": 1.6471254385877058e-07, + "logits/chosen": -1.9659569263458252, + "logits/rejected": -2.5237998962402344, + "logps/chosen": -2.4973161220550537, + "logps/rejected": -3.3989574909210205, + "loss": 1.214, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.973163604736328, + "rewards/margins": 9.016408920288086, + "rewards/rejected": -33.98957061767578, + "step": 22565 + }, + { + "epoch": 0.7607266844180794, + "grad_norm": 32.95793151855469, + "learning_rate": 1.6449440055168197e-07, + "logits/chosen": -1.3517484664916992, + "logits/rejected": -1.5391706228256226, + "logps/chosen": -2.0335590839385986, + "logps/rejected": -2.13909649848938, + "loss": 3.0144, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.335590362548828, + "rewards/margins": 1.0553737878799438, + "rewards/rejected": -21.39096450805664, + "step": 22570 + }, + { + "epoch": 0.7608952104890626, + "grad_norm": 34.37199020385742, + "learning_rate": 1.6427637334986295e-07, + "logits/chosen": -2.1165642738342285, + "logits/rejected": -2.1103622913360596, + "logps/chosen": -2.2701714038848877, + "logps/rejected": -2.4335379600524902, + "loss": 4.2387, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.701711654663086, + "rewards/margins": 1.6336677074432373, + "rewards/rejected": -24.335378646850586, + "step": 22575 + }, + { + "epoch": 0.7610637365600459, + "grad_norm": 28.55931282043457, + "learning_rate": 1.640584623287641e-07, + "logits/chosen": -1.371361494064331, + "logits/rejected": -1.538761019706726, + "logps/chosen": -1.875836968421936, + "logps/rejected": -1.9495811462402344, + "loss": 2.4925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.758371353149414, + "rewards/margins": 0.7374424338340759, + "rewards/rejected": -19.495811462402344, + "step": 22580 + }, + { + "epoch": 0.761232262631029, + "grad_norm": 25.89690399169922, + "learning_rate": 1.6384066756379606e-07, + "logits/chosen": -1.989154577255249, + "logits/rejected": -2.138568878173828, + "logps/chosen": -2.7180099487304688, + "logps/rejected": -2.8691840171813965, + "loss": 2.7374, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.180099487304688, + "rewards/margins": 1.511740803718567, + "rewards/rejected": -28.69183921813965, + "step": 22585 + }, + { + "epoch": 0.7614007887020122, + "grad_norm": 69.94366455078125, + "learning_rate": 1.6362298913032861e-07, + "logits/chosen": -1.814008355140686, + "logits/rejected": -1.816664695739746, + "logps/chosen": -2.587378978729248, + "logps/rejected": -2.3037238121032715, + "loss": 5.8606, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -25.873790740966797, + "rewards/margins": -2.836550712585449, + "rewards/rejected": -23.03723907470703, + "step": 22590 + }, + { + "epoch": 0.7615693147729954, + "grad_norm": 28.40800666809082, + "learning_rate": 1.6340542710369193e-07, + "logits/chosen": -1.4182841777801514, + "logits/rejected": -1.7591663599014282, + "logps/chosen": -1.932254433631897, + "logps/rejected": -2.272496461868286, + "loss": 3.8638, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -19.32254409790039, + "rewards/margins": 3.402418851852417, + "rewards/rejected": -22.724964141845703, + "step": 22595 + }, + { + "epoch": 0.7617378408439786, + "grad_norm": 14.029182434082031, + "learning_rate": 1.631879815591758e-07, + "logits/chosen": -1.8987289667129517, + "logits/rejected": -2.287255048751831, + "logps/chosen": -2.929877996444702, + "logps/rejected": -4.1070146560668945, + "loss": 2.4117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.298778533935547, + "rewards/margins": 11.7713623046875, + "rewards/rejected": -41.07014083862305, + "step": 22600 + }, + { + "epoch": 0.7619063669149617, + "grad_norm": 8.976080894470215, + "learning_rate": 1.6297065257202924e-07, + "logits/chosen": -1.4358826875686646, + "logits/rejected": -1.8369331359863281, + "logps/chosen": -2.0056796073913574, + "logps/rejected": -2.3201420307159424, + "loss": 1.7562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.05679702758789, + "rewards/margins": 3.1446213722229004, + "rewards/rejected": -23.201419830322266, + "step": 22605 + }, + { + "epoch": 0.7620748929859449, + "grad_norm": 97.5491714477539, + "learning_rate": 1.6275344021746135e-07, + "logits/chosen": -1.5037356615066528, + "logits/rejected": -1.7426496744155884, + "logps/chosen": -2.2114577293395996, + "logps/rejected": -2.473027467727661, + "loss": 2.3805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.11457633972168, + "rewards/margins": 2.6156959533691406, + "rewards/rejected": -24.730274200439453, + "step": 22610 + }, + { + "epoch": 0.7622434190569282, + "grad_norm": 42.989097595214844, + "learning_rate": 1.6253634457064085e-07, + "logits/chosen": -1.2196903228759766, + "logits/rejected": -1.2983381748199463, + "logps/chosen": -2.3565046787261963, + "logps/rejected": -2.637000560760498, + "loss": 2.1788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.565048217773438, + "rewards/margins": 2.804961681365967, + "rewards/rejected": -26.370006561279297, + "step": 22615 + }, + { + "epoch": 0.7624119451279113, + "grad_norm": 14.925318717956543, + "learning_rate": 1.6231936570669614e-07, + "logits/chosen": -1.4819129705429077, + "logits/rejected": -1.4523422718048096, + "logps/chosen": -1.8426742553710938, + "logps/rejected": -2.0914957523345947, + "loss": 2.7286, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.426740646362305, + "rewards/margins": 2.4882164001464844, + "rewards/rejected": -20.91495704650879, + "step": 22620 + }, + { + "epoch": 0.7625804711988945, + "grad_norm": 14.706117630004883, + "learning_rate": 1.6210250370071465e-07, + "logits/chosen": -2.005725860595703, + "logits/rejected": -1.8718922138214111, + "logps/chosen": -2.3563499450683594, + "logps/rejected": -2.945220708847046, + "loss": 0.9005, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.563499450683594, + "rewards/margins": 5.888709545135498, + "rewards/rejected": -29.45220947265625, + "step": 22625 + }, + { + "epoch": 0.7627489972698777, + "grad_norm": 81.91179656982422, + "learning_rate": 1.6188575862774405e-07, + "logits/chosen": -1.7499510049819946, + "logits/rejected": -1.71432626247406, + "logps/chosen": -2.5580358505249023, + "logps/rejected": -2.9207465648651123, + "loss": 2.9396, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.580360412597656, + "rewards/margins": 3.6271071434020996, + "rewards/rejected": -29.207468032836914, + "step": 22630 + }, + { + "epoch": 0.7629175233408608, + "grad_norm": 3.5403726617033726e-09, + "learning_rate": 1.6166913056279136e-07, + "logits/chosen": -1.9457800388336182, + "logits/rejected": -2.086674928665161, + "logps/chosen": -3.528853178024292, + "logps/rejected": -4.209619045257568, + "loss": 2.6988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.288536071777344, + "rewards/margins": 6.807660102844238, + "rewards/rejected": -42.09619140625, + "step": 22635 + }, + { + "epoch": 0.763086049411844, + "grad_norm": 86.31592559814453, + "learning_rate": 1.6145261958082273e-07, + "logits/chosen": -1.683431625366211, + "logits/rejected": -2.1868739128112793, + "logps/chosen": -2.386417865753174, + "logps/rejected": -2.968527317047119, + "loss": 2.3259, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.864177703857422, + "rewards/margins": 5.8210954666137695, + "rewards/rejected": -29.685272216796875, + "step": 22640 + }, + { + "epoch": 0.7632545754828272, + "grad_norm": 28.384031295776367, + "learning_rate": 1.6123622575676422e-07, + "logits/chosen": -1.5124573707580566, + "logits/rejected": -1.656079649925232, + "logps/chosen": -2.3514010906219482, + "logps/rejected": -2.981489896774292, + "loss": 2.9862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.514013290405273, + "rewards/margins": 6.300887584686279, + "rewards/rejected": -29.81490135192871, + "step": 22645 + }, + { + "epoch": 0.7634231015538103, + "grad_norm": 44.00370407104492, + "learning_rate": 1.610199491655012e-07, + "logits/chosen": -1.9806187152862549, + "logits/rejected": -2.0687432289123535, + "logps/chosen": -2.1002681255340576, + "logps/rejected": -2.3335559368133545, + "loss": 2.0197, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.002683639526367, + "rewards/margins": 2.3328773975372314, + "rewards/rejected": -23.335561752319336, + "step": 22650 + }, + { + "epoch": 0.7635916276247936, + "grad_norm": 1.1379077434539795, + "learning_rate": 1.608037898818787e-07, + "logits/chosen": -1.4696893692016602, + "logits/rejected": -1.8817825317382812, + "logps/chosen": -2.316413402557373, + "logps/rejected": -2.648519992828369, + "loss": 2.2046, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.16413688659668, + "rewards/margins": 3.3210597038269043, + "rewards/rejected": -26.48519515991211, + "step": 22655 + }, + { + "epoch": 0.7637601536957768, + "grad_norm": 26.41794776916504, + "learning_rate": 1.605877479807005e-07, + "logits/chosen": -1.6426475048065186, + "logits/rejected": -2.1441574096679688, + "logps/chosen": -1.9908136129379272, + "logps/rejected": -2.6799676418304443, + "loss": 1.9165, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.90813636779785, + "rewards/margins": 6.891541957855225, + "rewards/rejected": -26.799678802490234, + "step": 22660 + }, + { + "epoch": 0.7639286797667599, + "grad_norm": 28.51700782775879, + "learning_rate": 1.6037182353673044e-07, + "logits/chosen": -1.8227428197860718, + "logits/rejected": -1.5971324443817139, + "logps/chosen": -2.399543285369873, + "logps/rejected": -2.171342372894287, + "loss": 5.7566, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.995433807373047, + "rewards/margins": -2.2820091247558594, + "rewards/rejected": -21.71342658996582, + "step": 22665 + }, + { + "epoch": 0.7640972058377431, + "grad_norm": 36.324859619140625, + "learning_rate": 1.6015601662469164e-07, + "logits/chosen": -1.6870263814926147, + "logits/rejected": -1.814182996749878, + "logps/chosen": -2.6212759017944336, + "logps/rejected": -3.208042860031128, + "loss": 2.8824, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.212759017944336, + "rewards/margins": 5.867671966552734, + "rewards/rejected": -32.08042907714844, + "step": 22670 + }, + { + "epoch": 0.7642657319087263, + "grad_norm": 27.71980094909668, + "learning_rate": 1.59940327319266e-07, + "logits/chosen": -1.6754287481307983, + "logits/rejected": -1.8766758441925049, + "logps/chosen": -2.4628703594207764, + "logps/rejected": -2.6281509399414062, + "loss": 4.069, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.62870216369629, + "rewards/margins": 1.6528069972991943, + "rewards/rejected": -26.281509399414062, + "step": 22675 + }, + { + "epoch": 0.7644342579797094, + "grad_norm": 16.1252498626709, + "learning_rate": 1.597247556950952e-07, + "logits/chosen": -2.084392547607422, + "logits/rejected": -2.031343460083008, + "logps/chosen": -2.1783878803253174, + "logps/rejected": -2.549133062362671, + "loss": 2.1847, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.78388023376465, + "rewards/margins": 3.7074522972106934, + "rewards/rejected": -25.491331100463867, + "step": 22680 + }, + { + "epoch": 0.7646027840506926, + "grad_norm": 29.193500518798828, + "learning_rate": 1.595093018267802e-07, + "logits/chosen": -1.4438226222991943, + "logits/rejected": -1.5522502660751343, + "logps/chosen": -2.001771926879883, + "logps/rejected": -2.1155412197113037, + "loss": 3.0072, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.017719268798828, + "rewards/margins": 1.1376922130584717, + "rewards/rejected": -21.155412673950195, + "step": 22685 + }, + { + "epoch": 0.7647713101216759, + "grad_norm": 22.629920959472656, + "learning_rate": 1.59293965788881e-07, + "logits/chosen": -1.7114652395248413, + "logits/rejected": -1.8377281427383423, + "logps/chosen": -1.7160522937774658, + "logps/rejected": -1.9835751056671143, + "loss": 1.5033, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -17.160524368286133, + "rewards/margins": 2.6752288341522217, + "rewards/rejected": -19.835750579833984, + "step": 22690 + }, + { + "epoch": 0.764939836192659, + "grad_norm": 31.570363998413086, + "learning_rate": 1.5907874765591717e-07, + "logits/chosen": -1.819737434387207, + "logits/rejected": -2.656897783279419, + "logps/chosen": -2.095639705657959, + "logps/rejected": -2.8702781200408936, + "loss": 1.6942, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.956396102905273, + "rewards/margins": 7.7463860511779785, + "rewards/rejected": -28.702783584594727, + "step": 22695 + }, + { + "epoch": 0.7651083622636422, + "grad_norm": 29.860578536987305, + "learning_rate": 1.588636475023668e-07, + "logits/chosen": -1.793116569519043, + "logits/rejected": -1.9782568216323853, + "logps/chosen": -3.1871731281280518, + "logps/rejected": -3.6866703033447266, + "loss": 1.7431, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.87173080444336, + "rewards/margins": 4.9949727058410645, + "rewards/rejected": -36.86670684814453, + "step": 22700 + }, + { + "epoch": 0.7652768883346254, + "grad_norm": 29.30422592163086, + "learning_rate": 1.586486654026678e-07, + "logits/chosen": -1.2873570919036865, + "logits/rejected": -2.1461853981018066, + "logps/chosen": -2.487175464630127, + "logps/rejected": -3.387939929962158, + "loss": 2.3507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.871755599975586, + "rewards/margins": 9.007641792297363, + "rewards/rejected": -33.87939453125, + "step": 22705 + }, + { + "epoch": 0.7654454144056085, + "grad_norm": 14.670337677001953, + "learning_rate": 1.5843380143121703e-07, + "logits/chosen": -1.8407291173934937, + "logits/rejected": -1.8218971490859985, + "logps/chosen": -2.770521640777588, + "logps/rejected": -2.9160516262054443, + "loss": 3.4715, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.705215454101562, + "rewards/margins": 1.455300211906433, + "rewards/rejected": -29.1605167388916, + "step": 22710 + }, + { + "epoch": 0.7656139404765917, + "grad_norm": 36.592506408691406, + "learning_rate": 1.5821905566237038e-07, + "logits/chosen": -2.127350330352783, + "logits/rejected": -2.3790395259857178, + "logps/chosen": -2.19892954826355, + "logps/rejected": -2.3953166007995605, + "loss": 2.4215, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.989294052124023, + "rewards/margins": 1.9638723134994507, + "rewards/rejected": -23.953166961669922, + "step": 22715 + }, + { + "epoch": 0.7657824665475749, + "grad_norm": 19.594825744628906, + "learning_rate": 1.5800442817044297e-07, + "logits/chosen": -2.041064739227295, + "logits/rejected": -2.1058108806610107, + "logps/chosen": -2.9583261013031006, + "logps/rejected": -3.4392478466033936, + "loss": 1.9416, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.583261489868164, + "rewards/margins": 4.809213161468506, + "rewards/rejected": -34.39247512817383, + "step": 22720 + }, + { + "epoch": 0.7659509926185581, + "grad_norm": 0.45482581853866577, + "learning_rate": 1.57789919029709e-07, + "logits/chosen": -1.9175220727920532, + "logits/rejected": -2.370159387588501, + "logps/chosen": -2.3254458904266357, + "logps/rejected": -3.4092516899108887, + "loss": 1.4107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.25446128845215, + "rewards/margins": 10.838057518005371, + "rewards/rejected": -34.09252166748047, + "step": 22725 + }, + { + "epoch": 0.7661195186895413, + "grad_norm": 22.42098617553711, + "learning_rate": 1.5757552831440141e-07, + "logits/chosen": -2.0766656398773193, + "logits/rejected": -2.1389594078063965, + "logps/chosen": -2.143092393875122, + "logps/rejected": -2.2686245441436768, + "loss": 2.7787, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.430923461914062, + "rewards/margins": 1.2553188800811768, + "rewards/rejected": -22.68623924255371, + "step": 22730 + }, + { + "epoch": 0.7662880447605245, + "grad_norm": 48.564796447753906, + "learning_rate": 1.5736125609871243e-07, + "logits/chosen": -2.0202689170837402, + "logits/rejected": -2.1596198081970215, + "logps/chosen": -2.347080945968628, + "logps/rejected": -2.568044662475586, + "loss": 2.9386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.470806121826172, + "rewards/margins": 2.2096376419067383, + "rewards/rejected": -25.680444717407227, + "step": 22735 + }, + { + "epoch": 0.7664565708315076, + "grad_norm": 25.688514709472656, + "learning_rate": 1.5714710245679346e-07, + "logits/chosen": -1.8988473415374756, + "logits/rejected": -2.291731119155884, + "logps/chosen": -1.5387709140777588, + "logps/rejected": -1.7353506088256836, + "loss": 1.9236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.38770866394043, + "rewards/margins": 1.9657974243164062, + "rewards/rejected": -17.353506088256836, + "step": 22740 + }, + { + "epoch": 0.7666250969024908, + "grad_norm": 42.179935455322266, + "learning_rate": 1.5693306746275432e-07, + "logits/chosen": -1.7268253564834595, + "logits/rejected": -1.8562453985214233, + "logps/chosen": -2.071321487426758, + "logps/rejected": -2.1327714920043945, + "loss": 3.8071, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.713214874267578, + "rewards/margins": 0.6145005226135254, + "rewards/rejected": -21.327716827392578, + "step": 22745 + }, + { + "epoch": 0.766793622973474, + "grad_norm": 25.364154815673828, + "learning_rate": 1.5671915119066426e-07, + "logits/chosen": -1.301816701889038, + "logits/rejected": -1.5432207584381104, + "logps/chosen": -2.3390283584594727, + "logps/rejected": -2.549079418182373, + "loss": 2.4451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.390283584594727, + "rewards/margins": 2.1005122661590576, + "rewards/rejected": -25.490795135498047, + "step": 22750 + }, + { + "epoch": 0.7669621490444571, + "grad_norm": 33.825782775878906, + "learning_rate": 1.565053537145512e-07, + "logits/chosen": -1.6760823726654053, + "logits/rejected": -1.7630828619003296, + "logps/chosen": -2.232607364654541, + "logps/rejected": -2.3341293334960938, + "loss": 3.0354, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.32607078552246, + "rewards/margins": 1.015222191810608, + "rewards/rejected": -23.341297149658203, + "step": 22755 + }, + { + "epoch": 0.7671306751154403, + "grad_norm": 15.665143966674805, + "learning_rate": 1.5629167510840224e-07, + "logits/chosen": -1.711958646774292, + "logits/rejected": -1.994261384010315, + "logps/chosen": -2.732503890991211, + "logps/rejected": -3.117694139480591, + "loss": 1.5793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.32503890991211, + "rewards/margins": 3.8519043922424316, + "rewards/rejected": -31.17694091796875, + "step": 22760 + }, + { + "epoch": 0.7672992011864236, + "grad_norm": 31.382097244262695, + "learning_rate": 1.560781154461628e-07, + "logits/chosen": -1.7627441883087158, + "logits/rejected": -1.7876489162445068, + "logps/chosen": -2.144718885421753, + "logps/rejected": -2.2064361572265625, + "loss": 2.6592, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.447189331054688, + "rewards/margins": 0.6171743273735046, + "rewards/rejected": -22.064361572265625, + "step": 22765 + }, + { + "epoch": 0.7674677272574068, + "grad_norm": 28.25269889831543, + "learning_rate": 1.5586467480173766e-07, + "logits/chosen": -2.0783777236938477, + "logits/rejected": -2.0805037021636963, + "logps/chosen": -2.2239603996276855, + "logps/rejected": -2.4322445392608643, + "loss": 2.3538, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.239604949951172, + "rewards/margins": 2.082839012145996, + "rewards/rejected": -24.322444915771484, + "step": 22770 + }, + { + "epoch": 0.7676362533283899, + "grad_norm": 26.404125213623047, + "learning_rate": 1.5565135324899026e-07, + "logits/chosen": -2.1612420082092285, + "logits/rejected": -2.238856077194214, + "logps/chosen": -2.175217628479004, + "logps/rejected": -2.4848060607910156, + "loss": 2.6324, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.752174377441406, + "rewards/margins": 3.0958826541900635, + "rewards/rejected": -24.848058700561523, + "step": 22775 + }, + { + "epoch": 0.7678047793993731, + "grad_norm": 40.29751968383789, + "learning_rate": 1.554381508617426e-07, + "logits/chosen": -2.263197660446167, + "logits/rejected": -2.3222174644470215, + "logps/chosen": -2.2314186096191406, + "logps/rejected": -2.469470262527466, + "loss": 1.9906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.31418800354004, + "rewards/margins": 2.3805153369903564, + "rewards/rejected": -24.6947021484375, + "step": 22780 + }, + { + "epoch": 0.7679733054703562, + "grad_norm": 22.63186264038086, + "learning_rate": 1.5522506771377576e-07, + "logits/chosen": -1.6402429342269897, + "logits/rejected": -1.5860573053359985, + "logps/chosen": -1.707369089126587, + "logps/rejected": -1.9794480800628662, + "loss": 1.7679, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.07369041442871, + "rewards/margins": 2.720792055130005, + "rewards/rejected": -19.794483184814453, + "step": 22785 + }, + { + "epoch": 0.7681418315413394, + "grad_norm": 52.35350036621094, + "learning_rate": 1.5501210387882933e-07, + "logits/chosen": -1.4197752475738525, + "logits/rejected": -1.8860286474227905, + "logps/chosen": -2.8877930641174316, + "logps/rejected": -3.428576707839966, + "loss": 1.8367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.8779296875, + "rewards/margins": 5.4078369140625, + "rewards/rejected": -34.2857666015625, + "step": 22790 + }, + { + "epoch": 0.7683103576123226, + "grad_norm": 26.972814559936523, + "learning_rate": 1.5479925943060195e-07, + "logits/chosen": -1.6417248249053955, + "logits/rejected": -1.6887985467910767, + "logps/chosen": -2.1866393089294434, + "logps/rejected": -2.9747371673583984, + "loss": 2.9489, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.86639404296875, + "rewards/margins": 7.880978584289551, + "rewards/rejected": -29.747371673583984, + "step": 22795 + }, + { + "epoch": 0.7684788836833059, + "grad_norm": 21.10647964477539, + "learning_rate": 1.5458653444275038e-07, + "logits/chosen": -1.5778142213821411, + "logits/rejected": -1.9015181064605713, + "logps/chosen": -2.6732568740844727, + "logps/rejected": -3.028304100036621, + "loss": 1.9708, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.732568740844727, + "rewards/margins": 3.550471544265747, + "rewards/rejected": -30.283039093017578, + "step": 22800 + }, + { + "epoch": 0.7684788836833059, + "eval_logits/chosen": -2.248206853866577, + "eval_logits/rejected": -2.421609878540039, + "eval_logps/chosen": -2.2624881267547607, + "eval_logps/rejected": -2.4147942066192627, + "eval_loss": 3.0768299102783203, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.624879837036133, + "eval_rewards/margins": 1.5230610370635986, + "eval_rewards/rejected": -24.1479434967041, + "eval_runtime": 12.9007, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 22800 + }, + { + "epoch": 0.768647409754289, + "grad_norm": 24.942798614501953, + "learning_rate": 1.5437392898889046e-07, + "logits/chosen": -1.7756656408309937, + "logits/rejected": -1.8584327697753906, + "logps/chosen": -1.8894437551498413, + "logps/rejected": -1.8411388397216797, + "loss": 3.5599, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.894437789916992, + "rewards/margins": -0.4830484390258789, + "rewards/rejected": -18.411388397216797, + "step": 22805 + }, + { + "epoch": 0.7688159358252722, + "grad_norm": 39.55795669555664, + "learning_rate": 1.5416144314259677e-07, + "logits/chosen": -1.845928430557251, + "logits/rejected": -2.0967960357666016, + "logps/chosen": -2.7359180450439453, + "logps/rejected": -2.971412420272827, + "loss": 1.9504, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.359180450439453, + "rewards/margins": 2.354943037033081, + "rewards/rejected": -29.714122772216797, + "step": 22810 + }, + { + "epoch": 0.7689844618962554, + "grad_norm": 131.97705078125, + "learning_rate": 1.5394907697740194e-07, + "logits/chosen": -1.8618195056915283, + "logits/rejected": -2.1782288551330566, + "logps/chosen": -2.682870388031006, + "logps/rejected": -2.9631354808807373, + "loss": 1.5972, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.82870864868164, + "rewards/margins": 2.8026511669158936, + "rewards/rejected": -29.631357192993164, + "step": 22815 + }, + { + "epoch": 0.7691529879672385, + "grad_norm": 0.32372578978538513, + "learning_rate": 1.537368305667977e-07, + "logits/chosen": -2.0023844242095947, + "logits/rejected": -2.6936569213867188, + "logps/chosen": -2.029672145843506, + "logps/rejected": -2.9066014289855957, + "loss": 2.5648, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.296720504760742, + "rewards/margins": 8.769292831420898, + "rewards/rejected": -29.066015243530273, + "step": 22820 + }, + { + "epoch": 0.7693215140382217, + "grad_norm": 58.08989334106445, + "learning_rate": 1.5352470398423423e-07, + "logits/chosen": -2.2171883583068848, + "logits/rejected": -2.2867395877838135, + "logps/chosen": -3.2307028770446777, + "logps/rejected": -3.6406402587890625, + "loss": 2.1023, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -32.307029724121094, + "rewards/margins": 4.09937047958374, + "rewards/rejected": -36.406402587890625, + "step": 22825 + }, + { + "epoch": 0.7694900401092049, + "grad_norm": 36.715049743652344, + "learning_rate": 1.5331269730312025e-07, + "logits/chosen": -1.6262295246124268, + "logits/rejected": -2.023651361465454, + "logps/chosen": -2.8353304862976074, + "logps/rejected": -3.7760651111602783, + "loss": 1.2619, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.35330581665039, + "rewards/margins": 9.407342910766602, + "rewards/rejected": -37.76064682006836, + "step": 22830 + }, + { + "epoch": 0.769658566180188, + "grad_norm": 34.941463470458984, + "learning_rate": 1.531008105968226e-07, + "logits/chosen": -1.6637403964996338, + "logits/rejected": -2.284741163253784, + "logps/chosen": -1.8302547931671143, + "logps/rejected": -2.269709825515747, + "loss": 2.3833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.302549362182617, + "rewards/margins": 4.394549369812012, + "rewards/rejected": -22.697097778320312, + "step": 22835 + }, + { + "epoch": 0.7698270922511713, + "grad_norm": 26.151050567626953, + "learning_rate": 1.528890439386672e-07, + "logits/chosen": -1.7768512964248657, + "logits/rejected": -1.8191230297088623, + "logps/chosen": -2.1474437713623047, + "logps/rejected": -2.3349575996398926, + "loss": 2.3431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.474435806274414, + "rewards/margins": 1.8751392364501953, + "rewards/rejected": -23.34957504272461, + "step": 22840 + }, + { + "epoch": 0.7699956183221545, + "grad_norm": 35.694366455078125, + "learning_rate": 1.5267739740193801e-07, + "logits/chosen": -2.0323410034179688, + "logits/rejected": -2.495180606842041, + "logps/chosen": -2.026014804840088, + "logps/rejected": -2.490914821624756, + "loss": 2.4506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.260150909423828, + "rewards/margins": 4.648998260498047, + "rewards/rejected": -24.909147262573242, + "step": 22845 + }, + { + "epoch": 0.7701641443931376, + "grad_norm": 26.962989807128906, + "learning_rate": 1.5246587105987762e-07, + "logits/chosen": -1.271196961402893, + "logits/rejected": -1.243116021156311, + "logps/chosen": -2.028836965560913, + "logps/rejected": -2.181765079498291, + "loss": 2.8807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.28837013244629, + "rewards/margins": 1.5292824506759644, + "rewards/rejected": -21.817651748657227, + "step": 22850 + }, + { + "epoch": 0.7703326704641208, + "grad_norm": 35.68492889404297, + "learning_rate": 1.5225446498568694e-07, + "logits/chosen": -1.7652562856674194, + "logits/rejected": -2.1971378326416016, + "logps/chosen": -2.2682933807373047, + "logps/rejected": -2.5781054496765137, + "loss": 2.8534, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.682933807373047, + "rewards/margins": 3.0981197357177734, + "rewards/rejected": -25.781055450439453, + "step": 22855 + }, + { + "epoch": 0.770501196535104, + "grad_norm": 50.4202880859375, + "learning_rate": 1.5204317925252553e-07, + "logits/chosen": -1.9249420166015625, + "logits/rejected": -2.014530658721924, + "logps/chosen": -2.3662045001983643, + "logps/rejected": -2.5241458415985107, + "loss": 3.5158, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.662044525146484, + "rewards/margins": 1.5794137716293335, + "rewards/rejected": -25.241458892822266, + "step": 22860 + }, + { + "epoch": 0.7706697226060871, + "grad_norm": 4.7089128494262695, + "learning_rate": 1.5183201393351064e-07, + "logits/chosen": -1.807294487953186, + "logits/rejected": -1.8672775030136108, + "logps/chosen": -2.128382921218872, + "logps/rejected": -2.1345269680023193, + "loss": 3.4027, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.283828735351562, + "rewards/margins": 0.061441611498594284, + "rewards/rejected": -21.34527015686035, + "step": 22865 + }, + { + "epoch": 0.7708382486770703, + "grad_norm": 17.503753662109375, + "learning_rate": 1.516209691017184e-07, + "logits/chosen": -1.9709889888763428, + "logits/rejected": -2.1105175018310547, + "logps/chosen": -2.447917938232422, + "logps/rejected": -2.793539047241211, + "loss": 1.2948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.47917938232422, + "rewards/margins": 3.456209182739258, + "rewards/rejected": -27.935388565063477, + "step": 22870 + }, + { + "epoch": 0.7710067747480536, + "grad_norm": 42.22770690917969, + "learning_rate": 1.5141004483018322e-07, + "logits/chosen": -2.3059794902801514, + "logits/rejected": -2.0448668003082275, + "logps/chosen": -3.028981924057007, + "logps/rejected": -3.5030417442321777, + "loss": 4.0377, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.289819717407227, + "rewards/margins": 4.740601539611816, + "rewards/rejected": -35.030418395996094, + "step": 22875 + }, + { + "epoch": 0.7711753008190367, + "grad_norm": 29.942577362060547, + "learning_rate": 1.511992411918978e-07, + "logits/chosen": -2.524641513824463, + "logits/rejected": -2.246544361114502, + "logps/chosen": -2.187622547149658, + "logps/rejected": -2.3799984455108643, + "loss": 2.3435, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.876224517822266, + "rewards/margins": 1.9237569570541382, + "rewards/rejected": -23.79998207092285, + "step": 22880 + }, + { + "epoch": 0.7713438268900199, + "grad_norm": 0.0972161665558815, + "learning_rate": 1.509885582598126e-07, + "logits/chosen": -1.412776231765747, + "logits/rejected": -2.6804697513580322, + "logps/chosen": -2.9052116870880127, + "logps/rejected": -3.8140385150909424, + "loss": 1.1475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.0521183013916, + "rewards/margins": 9.088268280029297, + "rewards/rejected": -38.140384674072266, + "step": 22885 + }, + { + "epoch": 0.7715123529610031, + "grad_norm": 28.691585540771484, + "learning_rate": 1.5077799610683694e-07, + "logits/chosen": -2.3171160221099854, + "logits/rejected": -2.5718681812286377, + "logps/chosen": -2.6705522537231445, + "logps/rejected": -2.6094350814819336, + "loss": 3.9495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.705524444580078, + "rewards/margins": -0.6111720204353333, + "rewards/rejected": -26.094350814819336, + "step": 22890 + }, + { + "epoch": 0.7716808790319862, + "grad_norm": 18.407123565673828, + "learning_rate": 1.50567554805838e-07, + "logits/chosen": -1.6417083740234375, + "logits/rejected": -1.7448304891586304, + "logps/chosen": -1.9469597339630127, + "logps/rejected": -1.9432523250579834, + "loss": 3.5781, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.46959686279297, + "rewards/margins": -0.037073612213134766, + "rewards/rejected": -19.432523727416992, + "step": 22895 + }, + { + "epoch": 0.7718494051029694, + "grad_norm": 207.2261962890625, + "learning_rate": 1.5035723442964137e-07, + "logits/chosen": -1.3821115493774414, + "logits/rejected": -1.5796051025390625, + "logps/chosen": -2.5829050540924072, + "logps/rejected": -2.776493787765503, + "loss": 2.1161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.829050064086914, + "rewards/margins": 1.9358898401260376, + "rewards/rejected": -27.764938354492188, + "step": 22900 + }, + { + "epoch": 0.7720179311739526, + "grad_norm": 18.419729232788086, + "learning_rate": 1.5014703505103042e-07, + "logits/chosen": -1.5121935606002808, + "logits/rejected": -2.028642177581787, + "logps/chosen": -2.0915684700012207, + "logps/rejected": -2.209536075592041, + "loss": 2.8085, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.91568374633789, + "rewards/margins": 1.1796770095825195, + "rewards/rejected": -22.095359802246094, + "step": 22905 + }, + { + "epoch": 0.7721864572449358, + "grad_norm": 17.541202545166016, + "learning_rate": 1.4993695674274697e-07, + "logits/chosen": -1.6946613788604736, + "logits/rejected": -1.8450828790664673, + "logps/chosen": -2.4303412437438965, + "logps/rejected": -2.487248182296753, + "loss": 2.9624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.30341148376465, + "rewards/margins": 0.5690677762031555, + "rewards/rejected": -24.872478485107422, + "step": 22910 + }, + { + "epoch": 0.772354983315919, + "grad_norm": 30.39205551147461, + "learning_rate": 1.4972699957749102e-07, + "logits/chosen": -1.9528687000274658, + "logits/rejected": -1.8657668828964233, + "logps/chosen": -2.3502016067504883, + "logps/rejected": -2.5009114742279053, + "loss": 2.6588, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.502017974853516, + "rewards/margins": 1.5070993900299072, + "rewards/rejected": -25.00911521911621, + "step": 22915 + }, + { + "epoch": 0.7725235093869022, + "grad_norm": 35.62753677368164, + "learning_rate": 1.4951716362792017e-07, + "logits/chosen": -2.1138663291931152, + "logits/rejected": -2.126997470855713, + "logps/chosen": -1.9578602313995361, + "logps/rejected": -1.9695158004760742, + "loss": 3.4066, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.578601837158203, + "rewards/margins": 0.1165565475821495, + "rewards/rejected": -19.695158004760742, + "step": 22920 + }, + { + "epoch": 0.7726920354578853, + "grad_norm": 26.032629013061523, + "learning_rate": 1.4930744896665048e-07, + "logits/chosen": -1.8728666305541992, + "logits/rejected": -1.8308374881744385, + "logps/chosen": -2.678856372833252, + "logps/rejected": -2.6533203125, + "loss": 5.4955, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.788562774658203, + "rewards/margins": -0.2553566098213196, + "rewards/rejected": -26.533206939697266, + "step": 22925 + }, + { + "epoch": 0.7728605615288685, + "grad_norm": 21.67523193359375, + "learning_rate": 1.4909785566625598e-07, + "logits/chosen": -1.8303050994873047, + "logits/rejected": -2.086219549179077, + "logps/chosen": -1.8535133600234985, + "logps/rejected": -2.158536195755005, + "loss": 1.6427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.535133361816406, + "rewards/margins": 3.050227403640747, + "rewards/rejected": -21.58536148071289, + "step": 22930 + }, + { + "epoch": 0.7730290875998517, + "grad_norm": 44.84750747680664, + "learning_rate": 1.4888838379926883e-07, + "logits/chosen": -1.8299903869628906, + "logits/rejected": -2.076554775238037, + "logps/chosen": -2.6862049102783203, + "logps/rejected": -3.086336612701416, + "loss": 2.0808, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.862049102783203, + "rewards/margins": 4.001317501068115, + "rewards/rejected": -30.863367080688477, + "step": 22935 + }, + { + "epoch": 0.7731976136708348, + "grad_norm": 18.299293518066406, + "learning_rate": 1.486790334381786e-07, + "logits/chosen": -2.179386615753174, + "logits/rejected": -2.3787121772766113, + "logps/chosen": -2.733924150466919, + "logps/rejected": -2.6912307739257812, + "loss": 5.3055, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.3392391204834, + "rewards/margins": -0.426931768655777, + "rewards/rejected": -26.912307739257812, + "step": 22940 + }, + { + "epoch": 0.773366139741818, + "grad_norm": 30.414318084716797, + "learning_rate": 1.4846980465543347e-07, + "logits/chosen": -1.5662306547164917, + "logits/rejected": -1.731276512145996, + "logps/chosen": -2.159959077835083, + "logps/rejected": -3.0159831047058105, + "loss": 2.9906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.599590301513672, + "rewards/margins": 8.560237884521484, + "rewards/rejected": -30.159826278686523, + "step": 22945 + }, + { + "epoch": 0.7735346658128013, + "grad_norm": 22.8912353515625, + "learning_rate": 1.4826069752343928e-07, + "logits/chosen": -1.644881248474121, + "logits/rejected": -1.354028344154358, + "logps/chosen": -2.835266590118408, + "logps/rejected": -2.5100321769714355, + "loss": 7.9276, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.352664947509766, + "rewards/margins": -3.2523417472839355, + "rewards/rejected": -25.10032081604004, + "step": 22950 + }, + { + "epoch": 0.7737031918837844, + "grad_norm": 10.242867469787598, + "learning_rate": 1.480517121145596e-07, + "logits/chosen": -1.512997031211853, + "logits/rejected": -1.731942892074585, + "logps/chosen": -2.153104782104492, + "logps/rejected": -2.394892454147339, + "loss": 2.8295, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.531047821044922, + "rewards/margins": 2.417874574661255, + "rewards/rejected": -23.948925018310547, + "step": 22955 + }, + { + "epoch": 0.7738717179547676, + "grad_norm": 26.37079429626465, + "learning_rate": 1.4784284850111611e-07, + "logits/chosen": -1.8254725933074951, + "logits/rejected": -2.0033957958221436, + "logps/chosen": -2.168997287750244, + "logps/rejected": -2.5505526065826416, + "loss": 1.7174, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.689970016479492, + "rewards/margins": 3.81555438041687, + "rewards/rejected": -25.50552749633789, + "step": 22960 + }, + { + "epoch": 0.7740402440257508, + "grad_norm": 57.707820892333984, + "learning_rate": 1.4763410675538835e-07, + "logits/chosen": -2.0322554111480713, + "logits/rejected": -2.0387206077575684, + "logps/chosen": -2.6514925956726074, + "logps/rejected": -2.516125440597534, + "loss": 4.4089, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.514923095703125, + "rewards/margins": -1.3536683320999146, + "rewards/rejected": -25.1612548828125, + "step": 22965 + }, + { + "epoch": 0.774208770096734, + "grad_norm": 18.135936737060547, + "learning_rate": 1.4742548694961377e-07, + "logits/chosen": -2.2560200691223145, + "logits/rejected": -2.3864707946777344, + "logps/chosen": -2.3666298389434814, + "logps/rejected": -2.5894882678985596, + "loss": 2.7261, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.666301727294922, + "rewards/margins": 2.2285819053649902, + "rewards/rejected": -25.894882202148438, + "step": 22970 + }, + { + "epoch": 0.7743772961677171, + "grad_norm": 20.712677001953125, + "learning_rate": 1.4721698915598702e-07, + "logits/chosen": -1.5596562623977661, + "logits/rejected": -1.7954515218734741, + "logps/chosen": -2.4734067916870117, + "logps/rejected": -3.3166375160217285, + "loss": 2.9244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.734067916870117, + "rewards/margins": 8.432307243347168, + "rewards/rejected": -33.16637420654297, + "step": 22975 + }, + { + "epoch": 0.7745458222387003, + "grad_norm": 10.812604904174805, + "learning_rate": 1.4700861344666132e-07, + "logits/chosen": -1.5280256271362305, + "logits/rejected": -1.799912691116333, + "logps/chosen": -1.7556688785552979, + "logps/rejected": -1.943488359451294, + "loss": 2.2683, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.556686401367188, + "rewards/margins": 1.8781960010528564, + "rewards/rejected": -19.434885025024414, + "step": 22980 + }, + { + "epoch": 0.7747143483096836, + "grad_norm": 30.655550003051758, + "learning_rate": 1.4680035989374718e-07, + "logits/chosen": -1.5087058544158936, + "logits/rejected": -2.066072463989258, + "logps/chosen": -1.9316574335098267, + "logps/rejected": -2.208230972290039, + "loss": 2.4866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.316572189331055, + "rewards/margins": 2.7657370567321777, + "rewards/rejected": -22.08230972290039, + "step": 22985 + }, + { + "epoch": 0.7748828743806667, + "grad_norm": 45.217041015625, + "learning_rate": 1.4659222856931308e-07, + "logits/chosen": -1.862138032913208, + "logits/rejected": -2.3082938194274902, + "logps/chosen": -2.321685314178467, + "logps/rejected": -2.6933255195617676, + "loss": 1.8816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.216854095458984, + "rewards/margins": 3.7164008617401123, + "rewards/rejected": -26.933252334594727, + "step": 22990 + }, + { + "epoch": 0.7750514004516499, + "grad_norm": 61.95263671875, + "learning_rate": 1.4638421954538482e-07, + "logits/chosen": -1.5636898279190063, + "logits/rejected": -1.2484227418899536, + "logps/chosen": -2.4466958045959473, + "logps/rejected": -2.9692986011505127, + "loss": 3.9933, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.46695899963379, + "rewards/margins": 5.226029396057129, + "rewards/rejected": -29.6929874420166, + "step": 22995 + }, + { + "epoch": 0.775219926522633, + "grad_norm": 24.363027572631836, + "learning_rate": 1.4617633289394633e-07, + "logits/chosen": -2.067631483078003, + "logits/rejected": -2.001176357269287, + "logps/chosen": -2.2669143676757812, + "logps/rejected": -2.5735621452331543, + "loss": 2.9926, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.669143676757812, + "rewards/margins": 3.066478967666626, + "rewards/rejected": -25.735620498657227, + "step": 23000 + }, + { + "epoch": 0.7753884525936162, + "grad_norm": 16.292213439941406, + "learning_rate": 1.4596856868693885e-07, + "logits/chosen": -1.9688247442245483, + "logits/rejected": -2.3371658325195312, + "logps/chosen": -2.187854290008545, + "logps/rejected": -2.4243526458740234, + "loss": 1.6859, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.878543853759766, + "rewards/margins": 2.364981174468994, + "rewards/rejected": -24.2435245513916, + "step": 23005 + }, + { + "epoch": 0.7755569786645994, + "grad_norm": 7.324104309082031, + "learning_rate": 1.4576092699626152e-07, + "logits/chosen": -1.372618317604065, + "logits/rejected": -1.5508311986923218, + "logps/chosen": -2.4179587364196777, + "logps/rejected": -2.629817008972168, + "loss": 1.7698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.17958641052246, + "rewards/margins": 2.118582248687744, + "rewards/rejected": -26.298168182373047, + "step": 23010 + }, + { + "epoch": 0.7757255047355825, + "grad_norm": 62.67533493041992, + "learning_rate": 1.4555340789377085e-07, + "logits/chosen": -1.9175342321395874, + "logits/rejected": -2.3815016746520996, + "logps/chosen": -2.818748950958252, + "logps/rejected": -3.300917863845825, + "loss": 3.7856, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -28.187490463256836, + "rewards/margins": 4.821689128875732, + "rewards/rejected": -33.009178161621094, + "step": 23015 + }, + { + "epoch": 0.7758940308065658, + "grad_norm": 61.15652847290039, + "learning_rate": 1.4534601145128128e-07, + "logits/chosen": -1.769521713256836, + "logits/rejected": -1.6290092468261719, + "logps/chosen": -2.7396092414855957, + "logps/rejected": -3.124126434326172, + "loss": 2.4957, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.39609146118164, + "rewards/margins": 3.8451716899871826, + "rewards/rejected": -31.241262435913086, + "step": 23020 + }, + { + "epoch": 0.776062556877549, + "grad_norm": 45.90425491333008, + "learning_rate": 1.4513873774056412e-07, + "logits/chosen": -1.338941216468811, + "logits/rejected": -1.2785598039627075, + "logps/chosen": -2.2803165912628174, + "logps/rejected": -2.18070912361145, + "loss": 4.1639, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.803163528442383, + "rewards/margins": -0.9960732460021973, + "rewards/rejected": -21.807090759277344, + "step": 23025 + }, + { + "epoch": 0.7762310829485322, + "grad_norm": 23.82891082763672, + "learning_rate": 1.449315868333489e-07, + "logits/chosen": -1.6797630786895752, + "logits/rejected": -1.6392351388931274, + "logps/chosen": -2.1757025718688965, + "logps/rejected": -2.223583221435547, + "loss": 2.8337, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.75702476501465, + "rewards/margins": 0.4788082242012024, + "rewards/rejected": -22.2358341217041, + "step": 23030 + }, + { + "epoch": 0.7763996090195153, + "grad_norm": 135.72125244140625, + "learning_rate": 1.4472455880132234e-07, + "logits/chosen": -1.7913395166397095, + "logits/rejected": -1.7838003635406494, + "logps/chosen": -2.557555675506592, + "logps/rejected": -2.5798721313476562, + "loss": 3.3435, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.575559616088867, + "rewards/margins": 0.22316360473632812, + "rewards/rejected": -25.798721313476562, + "step": 23035 + }, + { + "epoch": 0.7765681350904985, + "grad_norm": 31.134138107299805, + "learning_rate": 1.4451765371612878e-07, + "logits/chosen": -1.6820363998413086, + "logits/rejected": -1.8307892084121704, + "logps/chosen": -2.7770934104919434, + "logps/rejected": -3.1867949962615967, + "loss": 2.6863, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.77093505859375, + "rewards/margins": 4.0970139503479, + "rewards/rejected": -31.867950439453125, + "step": 23040 + }, + { + "epoch": 0.7767366611614817, + "grad_norm": 45.60488510131836, + "learning_rate": 1.4431087164936972e-07, + "logits/chosen": -1.9355823993682861, + "logits/rejected": -2.1896002292633057, + "logps/chosen": -3.5485992431640625, + "logps/rejected": -3.7806289196014404, + "loss": 3.903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.48598861694336, + "rewards/margins": 2.320297956466675, + "rewards/rejected": -37.80629348754883, + "step": 23045 + }, + { + "epoch": 0.7769051872324648, + "grad_norm": 7.562737941741943, + "learning_rate": 1.441042126726044e-07, + "logits/chosen": -2.149552822113037, + "logits/rejected": -2.076618194580078, + "logps/chosen": -2.2214457988739014, + "logps/rejected": -2.2276124954223633, + "loss": 4.1425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.214458465576172, + "rewards/margins": 0.06166648864746094, + "rewards/rejected": -22.276126861572266, + "step": 23050 + }, + { + "epoch": 0.777073713303448, + "grad_norm": 22.309995651245117, + "learning_rate": 1.438976768573495e-07, + "logits/chosen": -1.158691644668579, + "logits/rejected": -1.3676048517227173, + "logps/chosen": -2.538264513015747, + "logps/rejected": -3.150280475616455, + "loss": 1.7112, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.382644653320312, + "rewards/margins": 6.120162487030029, + "rewards/rejected": -31.5028076171875, + "step": 23055 + }, + { + "epoch": 0.7772422393744313, + "grad_norm": 24.967859268188477, + "learning_rate": 1.4369126427507855e-07, + "logits/chosen": -1.9301315546035767, + "logits/rejected": -2.185620069503784, + "logps/chosen": -2.295980930328369, + "logps/rejected": -2.754180431365967, + "loss": 2.413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.95981216430664, + "rewards/margins": 4.581993579864502, + "rewards/rejected": -27.541805267333984, + "step": 23060 + }, + { + "epoch": 0.7774107654454144, + "grad_norm": 30.21352195739746, + "learning_rate": 1.4348497499722306e-07, + "logits/chosen": -2.2400124073028564, + "logits/rejected": -2.523691177368164, + "logps/chosen": -2.7316737174987793, + "logps/rejected": -3.1302685737609863, + "loss": 1.3373, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.316736221313477, + "rewards/margins": 3.985947370529175, + "rewards/rejected": -31.302684783935547, + "step": 23065 + }, + { + "epoch": 0.7775792915163976, + "grad_norm": 24.961088180541992, + "learning_rate": 1.4327880909517166e-07, + "logits/chosen": -1.9912430047988892, + "logits/rejected": -2.3256280422210693, + "logps/chosen": -2.2469804286956787, + "logps/rejected": -2.587294578552246, + "loss": 1.5903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.469806671142578, + "rewards/margins": 3.4031403064727783, + "rewards/rejected": -25.872943878173828, + "step": 23070 + }, + { + "epoch": 0.7777478175873808, + "grad_norm": 31.824460983276367, + "learning_rate": 1.4307276664027024e-07, + "logits/chosen": -1.4603766202926636, + "logits/rejected": -1.5443477630615234, + "logps/chosen": -1.936466932296753, + "logps/rejected": -2.063175916671753, + "loss": 3.2179, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.364669799804688, + "rewards/margins": 1.2670886516571045, + "rewards/rejected": -20.631759643554688, + "step": 23075 + }, + { + "epoch": 0.7779163436583639, + "grad_norm": 33.54826736450195, + "learning_rate": 1.4286684770382178e-07, + "logits/chosen": -1.6126028299331665, + "logits/rejected": -1.8728916645050049, + "logps/chosen": -2.1415352821350098, + "logps/rejected": -2.4509973526000977, + "loss": 2.4713, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.415353775024414, + "rewards/margins": 3.0946192741394043, + "rewards/rejected": -24.509973526000977, + "step": 23080 + }, + { + "epoch": 0.7780848697293471, + "grad_norm": 63.88078689575195, + "learning_rate": 1.4266105235708687e-07, + "logits/chosen": -1.996490716934204, + "logits/rejected": -2.11602783203125, + "logps/chosen": -2.670262575149536, + "logps/rejected": -2.7985289096832275, + "loss": 2.7266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.702627182006836, + "rewards/margins": 1.2826632261276245, + "rewards/rejected": -27.985286712646484, + "step": 23085 + }, + { + "epoch": 0.7782533958003303, + "grad_norm": 224.8692169189453, + "learning_rate": 1.4245538067128331e-07, + "logits/chosen": -1.309300184249878, + "logits/rejected": -1.3923568725585938, + "logps/chosen": -2.338399887084961, + "logps/rejected": -2.3223764896392822, + "loss": 3.6622, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.384000778198242, + "rewards/margins": -0.16023759543895721, + "rewards/rejected": -23.223764419555664, + "step": 23090 + }, + { + "epoch": 0.7784219218713135, + "grad_norm": 117.828125, + "learning_rate": 1.422498327175856e-07, + "logits/chosen": -1.7504974603652954, + "logits/rejected": -2.420707941055298, + "logps/chosen": -3.289490222930908, + "logps/rejected": -4.021206378936768, + "loss": 2.4788, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.894901275634766, + "rewards/margins": 7.317163944244385, + "rewards/rejected": -40.212066650390625, + "step": 23095 + }, + { + "epoch": 0.7785904479422967, + "grad_norm": 154.18460083007812, + "learning_rate": 1.42044408567126e-07, + "logits/chosen": -1.9211801290512085, + "logits/rejected": -2.178114414215088, + "logps/chosen": -2.7812247276306152, + "logps/rejected": -2.6831390857696533, + "loss": 4.6546, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.812246322631836, + "rewards/margins": -0.9808561205863953, + "rewards/rejected": -26.831390380859375, + "step": 23100 + }, + { + "epoch": 0.7787589740132799, + "grad_norm": 7.306661427719519e-05, + "learning_rate": 1.4183910829099393e-07, + "logits/chosen": -1.4283428192138672, + "logits/rejected": -1.7082526683807373, + "logps/chosen": -2.776681900024414, + "logps/rejected": -3.5103487968444824, + "loss": 2.2582, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.766815185546875, + "rewards/margins": 7.336672782897949, + "rewards/rejected": -35.10348892211914, + "step": 23105 + }, + { + "epoch": 0.778927500084263, + "grad_norm": 30.130817413330078, + "learning_rate": 1.4163393196023532e-07, + "logits/chosen": -2.1264195442199707, + "logits/rejected": -2.2917914390563965, + "logps/chosen": -2.5027756690979004, + "logps/rejected": -2.6923680305480957, + "loss": 2.0476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.02775764465332, + "rewards/margins": 1.8959224224090576, + "rewards/rejected": -26.92367935180664, + "step": 23110 + }, + { + "epoch": 0.7790960261552462, + "grad_norm": 36.973567962646484, + "learning_rate": 1.4142887964585375e-07, + "logits/chosen": -1.9158130884170532, + "logits/rejected": -2.056734800338745, + "logps/chosen": -2.228407382965088, + "logps/rejected": -2.534636974334717, + "loss": 2.6497, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.284072875976562, + "rewards/margins": 3.0622963905334473, + "rewards/rejected": -25.34636878967285, + "step": 23115 + }, + { + "epoch": 0.7792645522262294, + "grad_norm": 31.22389793395996, + "learning_rate": 1.4122395141880983e-07, + "logits/chosen": -1.7744057178497314, + "logits/rejected": -2.0564589500427246, + "logps/chosen": -2.008413076400757, + "logps/rejected": -2.5604119300842285, + "loss": 1.9569, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.084131240844727, + "rewards/margins": 5.519989490509033, + "rewards/rejected": -25.6041202545166, + "step": 23120 + }, + { + "epoch": 0.7794330782972125, + "grad_norm": 36.156959533691406, + "learning_rate": 1.4101914735002128e-07, + "logits/chosen": -1.4270397424697876, + "logits/rejected": -1.7191730737686157, + "logps/chosen": -2.226762294769287, + "logps/rejected": -2.4052212238311768, + "loss": 2.4218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.267623901367188, + "rewards/margins": 1.784589409828186, + "rewards/rejected": -24.052213668823242, + "step": 23125 + }, + { + "epoch": 0.7796016043681958, + "grad_norm": 12.364837646484375, + "learning_rate": 1.4081446751036242e-07, + "logits/chosen": -1.8880256414413452, + "logits/rejected": -2.7479958534240723, + "logps/chosen": -2.484072208404541, + "logps/rejected": -3.452897548675537, + "loss": 0.7836, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.84072494506836, + "rewards/margins": 9.688249588012695, + "rewards/rejected": -34.52897644042969, + "step": 23130 + }, + { + "epoch": 0.779770130439179, + "grad_norm": 11.456212043762207, + "learning_rate": 1.4060991197066496e-07, + "logits/chosen": -1.887709379196167, + "logits/rejected": -1.9433212280273438, + "logps/chosen": -2.3668179512023926, + "logps/rejected": -2.5772693157196045, + "loss": 2.0588, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.66817855834961, + "rewards/margins": 2.1045122146606445, + "rewards/rejected": -25.772689819335938, + "step": 23135 + }, + { + "epoch": 0.7799386565101621, + "grad_norm": 0.0005731512210331857, + "learning_rate": 1.4040548080171754e-07, + "logits/chosen": -1.120009183883667, + "logits/rejected": -1.2262059450149536, + "logps/chosen": -2.447812557220459, + "logps/rejected": -2.5543696880340576, + "loss": 5.2049, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.47812271118164, + "rewards/margins": 1.0655739307403564, + "rewards/rejected": -25.543697357177734, + "step": 23140 + }, + { + "epoch": 0.7801071825811453, + "grad_norm": 46.96331787109375, + "learning_rate": 1.402011740742658e-07, + "logits/chosen": -1.763390302658081, + "logits/rejected": -2.1351780891418457, + "logps/chosen": -1.7994730472564697, + "logps/rejected": -1.8151382207870483, + "loss": 3.6935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.99472999572754, + "rewards/margins": 0.15665188431739807, + "rewards/rejected": -18.15138053894043, + "step": 23145 + }, + { + "epoch": 0.7802757086521285, + "grad_norm": 37.356719970703125, + "learning_rate": 1.3999699185901222e-07, + "logits/chosen": -1.8704512119293213, + "logits/rejected": -2.1423542499542236, + "logps/chosen": -2.3729965686798096, + "logps/rejected": -2.660909652709961, + "loss": 2.2502, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.729965209960938, + "rewards/margins": 2.8791308403015137, + "rewards/rejected": -26.609094619750977, + "step": 23150 + }, + { + "epoch": 0.7804442347231116, + "grad_norm": 70.5757064819336, + "learning_rate": 1.397929342266162e-07, + "logits/chosen": -1.9407947063446045, + "logits/rejected": -1.8652915954589844, + "logps/chosen": -2.179978132247925, + "logps/rejected": -2.2899811267852783, + "loss": 2.6966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.799779891967773, + "rewards/margins": 1.100031852722168, + "rewards/rejected": -22.899810791015625, + "step": 23155 + }, + { + "epoch": 0.7806127607940948, + "grad_norm": 19.091533660888672, + "learning_rate": 1.395890012476942e-07, + "logits/chosen": -1.7762082815170288, + "logits/rejected": -1.6420847177505493, + "logps/chosen": -1.8515875339508057, + "logps/rejected": -2.181715488433838, + "loss": 2.9497, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.515872955322266, + "rewards/margins": 3.301278591156006, + "rewards/rejected": -21.81715202331543, + "step": 23160 + }, + { + "epoch": 0.780781286865078, + "grad_norm": 20.589876174926758, + "learning_rate": 1.3938519299281903e-07, + "logits/chosen": -2.0344557762145996, + "logits/rejected": -2.105841636657715, + "logps/chosen": -1.8849172592163086, + "logps/rejected": -2.2034912109375, + "loss": 2.7466, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.849172592163086, + "rewards/margins": 3.1857378482818604, + "rewards/rejected": -22.034912109375, + "step": 23165 + }, + { + "epoch": 0.7809498129360612, + "grad_norm": 43.91060256958008, + "learning_rate": 1.3918150953252096e-07, + "logits/chosen": -1.9605381488800049, + "logits/rejected": -2.1528801918029785, + "logps/chosen": -2.89855694770813, + "logps/rejected": -3.4155421257019043, + "loss": 2.7643, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.98556900024414, + "rewards/margins": 5.1698527336120605, + "rewards/rejected": -34.155418395996094, + "step": 23170 + }, + { + "epoch": 0.7811183390070444, + "grad_norm": 38.220924377441406, + "learning_rate": 1.3897795093728692e-07, + "logits/chosen": -2.337184429168701, + "logits/rejected": -2.533107280731201, + "logps/chosen": -2.7642264366149902, + "logps/rejected": -3.6560683250427246, + "loss": 1.7109, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.64226722717285, + "rewards/margins": 8.918416023254395, + "rewards/rejected": -36.5606803894043, + "step": 23175 + }, + { + "epoch": 0.7812868650780276, + "grad_norm": 51.32099151611328, + "learning_rate": 1.3877451727756017e-07, + "logits/chosen": -1.914634108543396, + "logits/rejected": -1.7938172817230225, + "logps/chosen": -2.7175326347351074, + "logps/rejected": -2.4325027465820312, + "loss": 6.0719, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.17532730102539, + "rewards/margins": -2.850299596786499, + "rewards/rejected": -24.325027465820312, + "step": 23180 + }, + { + "epoch": 0.7814553911490107, + "grad_norm": 17.17637825012207, + "learning_rate": 1.3857120862374134e-07, + "logits/chosen": -1.8925994634628296, + "logits/rejected": -2.0988128185272217, + "logps/chosen": -2.4949610233306885, + "logps/rejected": -2.6762216091156006, + "loss": 3.1127, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.949609756469727, + "rewards/margins": 1.8126052618026733, + "rewards/rejected": -26.762216567993164, + "step": 23185 + }, + { + "epoch": 0.7816239172199939, + "grad_norm": 14.754379272460938, + "learning_rate": 1.3836802504618743e-07, + "logits/chosen": -1.419999361038208, + "logits/rejected": -1.9414488077163696, + "logps/chosen": -2.772274971008301, + "logps/rejected": -3.133202075958252, + "loss": 3.968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.722747802734375, + "rewards/margins": 3.6092727184295654, + "rewards/rejected": -31.3320255279541, + "step": 23190 + }, + { + "epoch": 0.7817924432909771, + "grad_norm": 83.89252471923828, + "learning_rate": 1.3816496661521247e-07, + "logits/chosen": -1.7252734899520874, + "logits/rejected": -1.954825758934021, + "logps/chosen": -2.2423574924468994, + "logps/rejected": -2.609626531600952, + "loss": 2.8739, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.423574447631836, + "rewards/margins": 3.672689914703369, + "rewards/rejected": -26.096263885498047, + "step": 23195 + }, + { + "epoch": 0.7819609693619602, + "grad_norm": 50.774505615234375, + "learning_rate": 1.3796203340108669e-07, + "logits/chosen": -2.3243584632873535, + "logits/rejected": -2.7399606704711914, + "logps/chosen": -2.7891287803649902, + "logps/rejected": -3.1757395267486572, + "loss": 2.1589, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.891284942626953, + "rewards/margins": 3.8661091327667236, + "rewards/rejected": -31.75739097595215, + "step": 23200 + }, + { + "epoch": 0.7819609693619602, + "eval_logits/chosen": -2.2590906620025635, + "eval_logits/rejected": -2.4323461055755615, + "eval_logps/chosen": -2.2656962871551514, + "eval_logps/rejected": -2.419363498687744, + "eval_loss": 3.069692373275757, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.656963348388672, + "eval_rewards/margins": 1.5366699695587158, + "eval_rewards/rejected": -24.193632125854492, + "eval_runtime": 12.9266, + "eval_samples_per_second": 7.736, + "eval_steps_per_second": 1.934, + "step": 23200 + }, + { + "epoch": 0.7821294954329435, + "grad_norm": 9.897990226745605, + "learning_rate": 1.3775922547403747e-07, + "logits/chosen": -1.3797047138214111, + "logits/rejected": -1.4906001091003418, + "logps/chosen": -2.4802145957946777, + "logps/rejected": -2.776656150817871, + "loss": 1.7795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.802146911621094, + "rewards/margins": 2.9644157886505127, + "rewards/rejected": -27.76656150817871, + "step": 23205 + }, + { + "epoch": 0.7822980215039267, + "grad_norm": 224.88113403320312, + "learning_rate": 1.3755654290424867e-07, + "logits/chosen": -2.0499682426452637, + "logits/rejected": -2.2290892601013184, + "logps/chosen": -2.3797082901000977, + "logps/rejected": -2.57685923576355, + "loss": 3.0556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.79707908630371, + "rewards/margins": 1.9715118408203125, + "rewards/rejected": -25.768590927124023, + "step": 23210 + }, + { + "epoch": 0.7824665475749099, + "grad_norm": 5.163259983062744, + "learning_rate": 1.3735398576186058e-07, + "logits/chosen": -1.5662257671356201, + "logits/rejected": -1.7009817361831665, + "logps/chosen": -2.46049427986145, + "logps/rejected": -2.8982386589050293, + "loss": 1.4512, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.604944229125977, + "rewards/margins": 4.377443790435791, + "rewards/rejected": -28.982385635375977, + "step": 23215 + }, + { + "epoch": 0.782635073645893, + "grad_norm": 25.366844177246094, + "learning_rate": 1.3715155411697028e-07, + "logits/chosen": -1.9533096551895142, + "logits/rejected": -2.640411615371704, + "logps/chosen": -2.7735981941223145, + "logps/rejected": -2.4231772422790527, + "loss": 7.7772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.735980987548828, + "rewards/margins": -3.5042080879211426, + "rewards/rejected": -24.231773376464844, + "step": 23220 + }, + { + "epoch": 0.7828035997168762, + "grad_norm": 273.75946044921875, + "learning_rate": 1.3694924803963147e-07, + "logits/chosen": -1.8088428974151611, + "logits/rejected": -1.6246687173843384, + "logps/chosen": -2.374206304550171, + "logps/rejected": -2.3597331047058105, + "loss": 3.3866, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.7420654296875, + "rewards/margins": -0.14473219215869904, + "rewards/rejected": -23.597332000732422, + "step": 23225 + }, + { + "epoch": 0.7829721257878594, + "grad_norm": 1.5523768663406372, + "learning_rate": 1.3674706759985444e-07, + "logits/chosen": -2.1185832023620605, + "logits/rejected": -2.421220541000366, + "logps/chosen": -2.440904140472412, + "logps/rejected": -3.2022106647491455, + "loss": 0.6537, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.409038543701172, + "rewards/margins": 7.613066673278809, + "rewards/rejected": -32.0221061706543, + "step": 23230 + }, + { + "epoch": 0.7831406518588425, + "grad_norm": 116.09601593017578, + "learning_rate": 1.3654501286760555e-07, + "logits/chosen": -1.8746535778045654, + "logits/rejected": -2.2961676120758057, + "logps/chosen": -2.961397647857666, + "logps/rejected": -3.421003818511963, + "loss": 2.4693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.613977432250977, + "rewards/margins": 4.596056938171387, + "rewards/rejected": -34.21003341674805, + "step": 23235 + }, + { + "epoch": 0.7833091779298258, + "grad_norm": 26.170934677124023, + "learning_rate": 1.3634308391280818e-07, + "logits/chosen": -2.3364009857177734, + "logits/rejected": -2.408446788787842, + "logps/chosen": -2.0243003368377686, + "logps/rejected": -2.1905949115753174, + "loss": 2.5335, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.243005752563477, + "rewards/margins": 1.6629432439804077, + "rewards/rejected": -21.905948638916016, + "step": 23240 + }, + { + "epoch": 0.783477704000809, + "grad_norm": 35.10651397705078, + "learning_rate": 1.361412808053421e-07, + "logits/chosen": -1.689139723777771, + "logits/rejected": -1.9725558757781982, + "logps/chosen": -2.2845559120178223, + "logps/rejected": -2.4833407402038574, + "loss": 2.0314, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.845556259155273, + "rewards/margins": 1.9878495931625366, + "rewards/rejected": -24.83340835571289, + "step": 23245 + }, + { + "epoch": 0.7836462300717921, + "grad_norm": 22.14246940612793, + "learning_rate": 1.359396036150431e-07, + "logits/chosen": -1.912557601928711, + "logits/rejected": -1.9069467782974243, + "logps/chosen": -2.4497103691101074, + "logps/rejected": -2.5474560260772705, + "loss": 2.9122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.49710464477539, + "rewards/margins": 0.9774559140205383, + "rewards/rejected": -25.474559783935547, + "step": 23250 + }, + { + "epoch": 0.7838147561427753, + "grad_norm": 32.53044891357422, + "learning_rate": 1.3573805241170388e-07, + "logits/chosen": -1.7146705389022827, + "logits/rejected": -1.7869758605957031, + "logps/chosen": -2.8418736457824707, + "logps/rejected": -2.825669765472412, + "loss": 4.9545, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.41873550415039, + "rewards/margins": -0.16203632950782776, + "rewards/rejected": -28.256698608398438, + "step": 23255 + }, + { + "epoch": 0.7839832822137585, + "grad_norm": 0.5539440512657166, + "learning_rate": 1.3553662726507343e-07, + "logits/chosen": -1.527477502822876, + "logits/rejected": -1.8698577880859375, + "logps/chosen": -2.849269390106201, + "logps/rejected": -3.2912509441375732, + "loss": 2.1832, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.492691040039062, + "rewards/margins": 4.419821262359619, + "rewards/rejected": -32.912513732910156, + "step": 23260 + }, + { + "epoch": 0.7841518082847416, + "grad_norm": 18.821990966796875, + "learning_rate": 1.353353282448571e-07, + "logits/chosen": -1.9978233575820923, + "logits/rejected": -2.158778429031372, + "logps/chosen": -1.6932750940322876, + "logps/rejected": -1.9092578887939453, + "loss": 1.9132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -16.932750701904297, + "rewards/margins": 2.1598281860351562, + "rewards/rejected": -19.092578887939453, + "step": 23265 + }, + { + "epoch": 0.7843203343557248, + "grad_norm": 0.9460822939872742, + "learning_rate": 1.3513415542071627e-07, + "logits/chosen": -1.9015109539031982, + "logits/rejected": -2.185220241546631, + "logps/chosen": -1.9466800689697266, + "logps/rejected": -2.194200038909912, + "loss": 2.1957, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.466800689697266, + "rewards/margins": 2.4751992225646973, + "rewards/rejected": -21.941997528076172, + "step": 23270 + }, + { + "epoch": 0.784488860426708, + "grad_norm": 93.4100112915039, + "learning_rate": 1.3493310886226917e-07, + "logits/chosen": -1.8904443979263306, + "logits/rejected": -1.738896131515503, + "logps/chosen": -2.364542007446289, + "logps/rejected": -2.4334211349487305, + "loss": 4.0342, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.64542007446289, + "rewards/margins": 0.6887893676757812, + "rewards/rejected": -24.334209442138672, + "step": 23275 + }, + { + "epoch": 0.7846573864976912, + "grad_norm": 28.355125427246094, + "learning_rate": 1.3473218863909002e-07, + "logits/chosen": -1.9899078607559204, + "logits/rejected": -2.364184856414795, + "logps/chosen": -2.944575786590576, + "logps/rejected": -3.3720479011535645, + "loss": 5.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.445758819580078, + "rewards/margins": 4.274716377258301, + "rewards/rejected": -33.7204704284668, + "step": 23280 + }, + { + "epoch": 0.7848259125686744, + "grad_norm": 30.1297664642334, + "learning_rate": 1.3453139482070936e-07, + "logits/chosen": -1.5648924112319946, + "logits/rejected": -1.971692681312561, + "logps/chosen": -2.311891555786133, + "logps/rejected": -3.1094870567321777, + "loss": 2.2711, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.118911743164062, + "rewards/margins": 7.975958347320557, + "rewards/rejected": -31.094873428344727, + "step": 23285 + }, + { + "epoch": 0.7849944386396576, + "grad_norm": 47.018577575683594, + "learning_rate": 1.3433072747661427e-07, + "logits/chosen": -1.5098412036895752, + "logits/rejected": -1.6824413537979126, + "logps/chosen": -2.133009195327759, + "logps/rejected": -2.2505862712860107, + "loss": 2.5595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.330089569091797, + "rewards/margins": 1.1757726669311523, + "rewards/rejected": -22.505863189697266, + "step": 23290 + }, + { + "epoch": 0.7851629647106407, + "grad_norm": 19.697479248046875, + "learning_rate": 1.3413018667624742e-07, + "logits/chosen": -1.9054797887802124, + "logits/rejected": -2.207296371459961, + "logps/chosen": -2.088989734649658, + "logps/rejected": -2.263735055923462, + "loss": 3.8373, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.889896392822266, + "rewards/margins": 1.747450590133667, + "rewards/rejected": -22.637348175048828, + "step": 23295 + }, + { + "epoch": 0.7853314907816239, + "grad_norm": 107.42939758300781, + "learning_rate": 1.3392977248900827e-07, + "logits/chosen": -1.213679552078247, + "logits/rejected": -2.19240665435791, + "logps/chosen": -2.506530284881592, + "logps/rejected": -3.082782030105591, + "loss": 2.0125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.0653018951416, + "rewards/margins": 5.762515068054199, + "rewards/rejected": -30.82781982421875, + "step": 23300 + }, + { + "epoch": 0.7855000168526071, + "grad_norm": 43.644493103027344, + "learning_rate": 1.3372948498425229e-07, + "logits/chosen": -1.627753496170044, + "logits/rejected": -1.5755800008773804, + "logps/chosen": -2.293330669403076, + "logps/rejected": -2.4177563190460205, + "loss": 2.4611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.933305740356445, + "rewards/margins": 1.2442573308944702, + "rewards/rejected": -24.177562713623047, + "step": 23305 + }, + { + "epoch": 0.7856685429235902, + "grad_norm": 26.87129783630371, + "learning_rate": 1.335293242312911e-07, + "logits/chosen": -2.177293300628662, + "logits/rejected": -2.4072842597961426, + "logps/chosen": -1.9114128351211548, + "logps/rejected": -1.9970004558563232, + "loss": 2.9647, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.114126205444336, + "rewards/margins": 0.8558780550956726, + "rewards/rejected": -19.97000503540039, + "step": 23310 + }, + { + "epoch": 0.7858370689945735, + "grad_norm": 158.449462890625, + "learning_rate": 1.3332929029939249e-07, + "logits/chosen": -1.6649404764175415, + "logits/rejected": -1.7888128757476807, + "logps/chosen": -2.672362804412842, + "logps/rejected": -2.7433390617370605, + "loss": 4.5421, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.7236270904541, + "rewards/margins": 0.7097650766372681, + "rewards/rejected": -27.433391571044922, + "step": 23315 + }, + { + "epoch": 0.7860055950655567, + "grad_norm": 40.22739791870117, + "learning_rate": 1.3312938325778017e-07, + "logits/chosen": -1.8461437225341797, + "logits/rejected": -1.9166162014007568, + "logps/chosen": -2.19929575920105, + "logps/rejected": -2.339583158493042, + "loss": 2.528, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.99295425415039, + "rewards/margins": 1.4028747081756592, + "rewards/rejected": -23.395832061767578, + "step": 23320 + }, + { + "epoch": 0.7861741211365398, + "grad_norm": 16.53434944152832, + "learning_rate": 1.3292960317563416e-07, + "logits/chosen": -1.9835697412490845, + "logits/rejected": -2.3433592319488525, + "logps/chosen": -2.381385326385498, + "logps/rejected": -2.6794729232788086, + "loss": 1.9686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.813854217529297, + "rewards/margins": 2.9808764457702637, + "rewards/rejected": -26.794729232788086, + "step": 23325 + }, + { + "epoch": 0.786342647207523, + "grad_norm": 171.1400604248047, + "learning_rate": 1.3272995012209054e-07, + "logits/chosen": -2.139479398727417, + "logits/rejected": -2.3308639526367188, + "logps/chosen": -2.979889392852783, + "logps/rejected": -3.3997435569763184, + "loss": 4.3946, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.79889488220215, + "rewards/margins": 4.19854211807251, + "rewards/rejected": -33.9974365234375, + "step": 23330 + }, + { + "epoch": 0.7865111732785062, + "grad_norm": 19.148937225341797, + "learning_rate": 1.3253042416624145e-07, + "logits/chosen": -2.0896546840667725, + "logits/rejected": -2.177683115005493, + "logps/chosen": -2.602734327316284, + "logps/rejected": -2.8803181648254395, + "loss": 2.4611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.02734375, + "rewards/margins": 2.775836229324341, + "rewards/rejected": -28.803180694580078, + "step": 23335 + }, + { + "epoch": 0.7866796993494893, + "grad_norm": 45.68458557128906, + "learning_rate": 1.3233102537713465e-07, + "logits/chosen": -1.3132305145263672, + "logits/rejected": -1.513753890991211, + "logps/chosen": -2.453195095062256, + "logps/rejected": -3.172518014907837, + "loss": 1.2306, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.531949996948242, + "rewards/margins": 7.193228244781494, + "rewards/rejected": -31.725177764892578, + "step": 23340 + }, + { + "epoch": 0.7868482254204725, + "grad_norm": 54.43955993652344, + "learning_rate": 1.321317538237744e-07, + "logits/chosen": -1.7532365322113037, + "logits/rejected": -2.0115723609924316, + "logps/chosen": -2.954211950302124, + "logps/rejected": -3.273526430130005, + "loss": 3.8185, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.5421199798584, + "rewards/margins": 3.1931443214416504, + "rewards/rejected": -32.735267639160156, + "step": 23345 + }, + { + "epoch": 0.7870167514914558, + "grad_norm": 158.7488555908203, + "learning_rate": 1.3193260957512087e-07, + "logits/chosen": -2.010417938232422, + "logits/rejected": -2.2069597244262695, + "logps/chosen": -3.5379459857940674, + "logps/rejected": -3.6150882244110107, + "loss": 3.2882, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.37946319580078, + "rewards/margins": 0.771422266960144, + "rewards/rejected": -36.150882720947266, + "step": 23350 + }, + { + "epoch": 0.787185277562439, + "grad_norm": 33.453224182128906, + "learning_rate": 1.317335927000897e-07, + "logits/chosen": -1.8199208974838257, + "logits/rejected": -2.153441905975342, + "logps/chosen": -2.2132675647735596, + "logps/rejected": -2.4548451900482178, + "loss": 3.2995, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.132671356201172, + "rewards/margins": 2.4157779216766357, + "rewards/rejected": -24.548452377319336, + "step": 23355 + }, + { + "epoch": 0.7873538036334221, + "grad_norm": 28.82305145263672, + "learning_rate": 1.3153470326755307e-07, + "logits/chosen": -1.5860540866851807, + "logits/rejected": -1.6239850521087646, + "logps/chosen": -2.708892822265625, + "logps/rejected": -3.2508621215820312, + "loss": 2.395, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.08892822265625, + "rewards/margins": 5.419692516326904, + "rewards/rejected": -32.50861740112305, + "step": 23360 + }, + { + "epoch": 0.7875223297044053, + "grad_norm": 31.444128036499023, + "learning_rate": 1.3133594134633862e-07, + "logits/chosen": -1.9128261804580688, + "logits/rejected": -2.18424391746521, + "logps/chosen": -2.671001672744751, + "logps/rejected": -3.0663018226623535, + "loss": 2.4916, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.71001625061035, + "rewards/margins": 3.953002452850342, + "rewards/rejected": -30.663015365600586, + "step": 23365 + }, + { + "epoch": 0.7876908557753884, + "grad_norm": 18.52704620361328, + "learning_rate": 1.3113730700523024e-07, + "logits/chosen": -2.3228516578674316, + "logits/rejected": -2.5755486488342285, + "logps/chosen": -2.3647756576538086, + "logps/rejected": -2.4866623878479004, + "loss": 2.6455, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.647756576538086, + "rewards/margins": 1.2188684940338135, + "rewards/rejected": -24.86662483215332, + "step": 23370 + }, + { + "epoch": 0.7878593818463716, + "grad_norm": 233.28515625, + "learning_rate": 1.3093880031296718e-07, + "logits/chosen": -1.7100486755371094, + "logits/rejected": -2.266857624053955, + "logps/chosen": -3.6958794593811035, + "logps/rejected": -4.913882255554199, + "loss": 2.0399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -36.95878982543945, + "rewards/margins": 12.180027961730957, + "rewards/rejected": -49.138816833496094, + "step": 23375 + }, + { + "epoch": 0.7880279079173548, + "grad_norm": 29.36772918701172, + "learning_rate": 1.3074042133824486e-07, + "logits/chosen": -1.528952956199646, + "logits/rejected": -1.6609337329864502, + "logps/chosen": -2.4949135780334473, + "logps/rejected": -2.8517425060272217, + "loss": 2.036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.94913673400879, + "rewards/margins": 3.568286418914795, + "rewards/rejected": -28.51742172241211, + "step": 23380 + }, + { + "epoch": 0.7881964339883379, + "grad_norm": 20.27956771850586, + "learning_rate": 1.3054217014971465e-07, + "logits/chosen": -1.9842973947525024, + "logits/rejected": -2.2391180992126465, + "logps/chosen": -2.5889785289764404, + "logps/rejected": -3.751030445098877, + "loss": 1.8571, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.889789581298828, + "rewards/margins": 11.620512008666992, + "rewards/rejected": -37.51030349731445, + "step": 23385 + }, + { + "epoch": 0.7883649600593212, + "grad_norm": 206.58560180664062, + "learning_rate": 1.3034404681598316e-07, + "logits/chosen": -1.965024709701538, + "logits/rejected": -1.7408416271209717, + "logps/chosen": -2.933290481567383, + "logps/rejected": -2.8030002117156982, + "loss": 5.3126, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.332904815673828, + "rewards/margins": -1.3029001951217651, + "rewards/rejected": -28.030004501342773, + "step": 23390 + }, + { + "epoch": 0.7885334861303044, + "grad_norm": 47.58219528198242, + "learning_rate": 1.3014605140561314e-07, + "logits/chosen": -1.651240587234497, + "logits/rejected": -1.6010128259658813, + "logps/chosen": -2.0883679389953613, + "logps/rejected": -2.7429397106170654, + "loss": 2.0386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.883678436279297, + "rewards/margins": 6.545716762542725, + "rewards/rejected": -27.429393768310547, + "step": 23395 + }, + { + "epoch": 0.7887020122012876, + "grad_norm": 55.15037536621094, + "learning_rate": 1.2994818398712309e-07, + "logits/chosen": -1.6969373226165771, + "logits/rejected": -1.8868554830551147, + "logps/chosen": -1.8029073476791382, + "logps/rejected": -1.7463340759277344, + "loss": 3.8652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.02907371520996, + "rewards/margins": -0.5657330751419067, + "rewards/rejected": -17.463340759277344, + "step": 23400 + }, + { + "epoch": 0.7888705382722707, + "grad_norm": 45.554691314697266, + "learning_rate": 1.2975044462898727e-07, + "logits/chosen": -1.935333251953125, + "logits/rejected": -2.3411459922790527, + "logps/chosen": -2.15376615524292, + "logps/rejected": -3.0946743488311768, + "loss": 1.3598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.537662506103516, + "rewards/margins": 9.409080505371094, + "rewards/rejected": -30.946741104125977, + "step": 23405 + }, + { + "epoch": 0.7890390643432539, + "grad_norm": 40.636802673339844, + "learning_rate": 1.295528333996352e-07, + "logits/chosen": -1.7083832025527954, + "logits/rejected": -2.2345995903015137, + "logps/chosen": -2.28090763092041, + "logps/rejected": -2.776479482650757, + "loss": 1.9808, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.809078216552734, + "rewards/margins": 4.955718040466309, + "rewards/rejected": -27.764795303344727, + "step": 23410 + }, + { + "epoch": 0.789207590414237, + "grad_norm": 264.3153991699219, + "learning_rate": 1.2935535036745238e-07, + "logits/chosen": -1.5900719165802002, + "logits/rejected": -1.3120791912078857, + "logps/chosen": -3.130431652069092, + "logps/rejected": -3.3375420570373535, + "loss": 3.7296, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.3043212890625, + "rewards/margins": 2.0710978507995605, + "rewards/rejected": -33.37541961669922, + "step": 23415 + }, + { + "epoch": 0.7893761164852202, + "grad_norm": 65.93145751953125, + "learning_rate": 1.2915799560078017e-07, + "logits/chosen": -1.4226365089416504, + "logits/rejected": -1.7168121337890625, + "logps/chosen": -2.6043052673339844, + "logps/rejected": -2.3740992546081543, + "loss": 6.4785, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.043054580688477, + "rewards/margins": -2.302061080932617, + "rewards/rejected": -23.74099349975586, + "step": 23420 + }, + { + "epoch": 0.7895446425562035, + "grad_norm": 38.1806755065918, + "learning_rate": 1.2896076916791493e-07, + "logits/chosen": -1.9888120889663696, + "logits/rejected": -2.302474021911621, + "logps/chosen": -2.144757032394409, + "logps/rejected": -2.4852633476257324, + "loss": 2.3075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.44757080078125, + "rewards/margins": 3.4050631523132324, + "rewards/rejected": -24.85263442993164, + "step": 23425 + }, + { + "epoch": 0.7897131686271867, + "grad_norm": 16.340255737304688, + "learning_rate": 1.2876367113710912e-07, + "logits/chosen": -1.9023936986923218, + "logits/rejected": -1.8551537990570068, + "logps/chosen": -3.38207745552063, + "logps/rejected": -3.6513431072235107, + "loss": 2.8637, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.820777893066406, + "rewards/margins": 2.692657709121704, + "rewards/rejected": -36.513431549072266, + "step": 23430 + }, + { + "epoch": 0.7898816946981698, + "grad_norm": 13.2061128616333, + "learning_rate": 1.2856670157657063e-07, + "logits/chosen": -1.9358323812484741, + "logits/rejected": -1.9602285623550415, + "logps/chosen": -3.3243343830108643, + "logps/rejected": -4.002040863037109, + "loss": 2.5865, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.24333953857422, + "rewards/margins": 6.777066707611084, + "rewards/rejected": -40.020408630371094, + "step": 23435 + }, + { + "epoch": 0.790050220769153, + "grad_norm": 63.23234939575195, + "learning_rate": 1.2836986055446282e-07, + "logits/chosen": -1.6777303218841553, + "logits/rejected": -1.462426781654358, + "logps/chosen": -2.3692269325256348, + "logps/rejected": -2.870598316192627, + "loss": 2.7527, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.6922664642334, + "rewards/margins": 5.013715744018555, + "rewards/rejected": -28.705982208251953, + "step": 23440 + }, + { + "epoch": 0.7902187468401362, + "grad_norm": 36.042152404785156, + "learning_rate": 1.2817314813890462e-07, + "logits/chosen": -1.772571325302124, + "logits/rejected": -1.8282972574234009, + "logps/chosen": -2.993535280227661, + "logps/rejected": -2.908937931060791, + "loss": 6.4062, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.935354232788086, + "rewards/margins": -0.8459756970405579, + "rewards/rejected": -29.08937644958496, + "step": 23445 + }, + { + "epoch": 0.7903872729111193, + "grad_norm": 5.146857606774802e-09, + "learning_rate": 1.2797656439797045e-07, + "logits/chosen": -1.7889522314071655, + "logits/rejected": -2.487628698348999, + "logps/chosen": -3.045865058898926, + "logps/rejected": -4.883096218109131, + "loss": 1.714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.458648681640625, + "rewards/margins": 18.372310638427734, + "rewards/rejected": -48.830955505371094, + "step": 23450 + }, + { + "epoch": 0.7905557989821025, + "grad_norm": 18.409088134765625, + "learning_rate": 1.2778010939969036e-07, + "logits/chosen": -1.5572829246520996, + "logits/rejected": -1.7321348190307617, + "logps/chosen": -1.9669567346572876, + "logps/rejected": -2.5498642921447754, + "loss": 1.7725, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.669567108154297, + "rewards/margins": 5.829077243804932, + "rewards/rejected": -25.498645782470703, + "step": 23455 + }, + { + "epoch": 0.7907243250530858, + "grad_norm": 25.91119956970215, + "learning_rate": 1.2758378321204937e-07, + "logits/chosen": -1.4228665828704834, + "logits/rejected": -1.7013216018676758, + "logps/chosen": -2.8095977306365967, + "logps/rejected": -2.6884658336639404, + "loss": 4.7333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.095977783203125, + "rewards/margins": -1.2113181352615356, + "rewards/rejected": -26.884658813476562, + "step": 23460 + }, + { + "epoch": 0.7908928511240689, + "grad_norm": 36.09959030151367, + "learning_rate": 1.2738758590298837e-07, + "logits/chosen": -1.487137794494629, + "logits/rejected": -1.973081350326538, + "logps/chosen": -2.0125210285186768, + "logps/rejected": -2.5702998638153076, + "loss": 2.3875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.12520980834961, + "rewards/margins": 5.577790260314941, + "rewards/rejected": -25.702999114990234, + "step": 23465 + }, + { + "epoch": 0.7910613771950521, + "grad_norm": 18.3976993560791, + "learning_rate": 1.271915175404036e-07, + "logits/chosen": -1.9622561931610107, + "logits/rejected": -2.2915892601013184, + "logps/chosen": -2.484745740890503, + "logps/rejected": -3.077031373977661, + "loss": 1.5175, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.847457885742188, + "rewards/margins": 5.922853469848633, + "rewards/rejected": -30.770313262939453, + "step": 23470 + }, + { + "epoch": 0.7912299032660353, + "grad_norm": 56.609535217285156, + "learning_rate": 1.2699557819214668e-07, + "logits/chosen": -1.1058142185211182, + "logits/rejected": -1.6427888870239258, + "logps/chosen": -2.4761815071105957, + "logps/rejected": -3.5051283836364746, + "loss": 3.4386, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.76181411743164, + "rewards/margins": 10.289472579956055, + "rewards/rejected": -35.05128479003906, + "step": 23475 + }, + { + "epoch": 0.7913984293370184, + "grad_norm": 288.9089050292969, + "learning_rate": 1.267997679260242e-07, + "logits/chosen": -1.9424368143081665, + "logits/rejected": -1.7826957702636719, + "logps/chosen": -3.042111396789551, + "logps/rejected": -2.601473331451416, + "loss": 10.5931, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.42111587524414, + "rewards/margins": -4.406381607055664, + "rewards/rejected": -26.01473045349121, + "step": 23480 + }, + { + "epoch": 0.7915669554080016, + "grad_norm": 29.078601837158203, + "learning_rate": 1.2660408680979855e-07, + "logits/chosen": -1.6248836517333984, + "logits/rejected": -1.9362905025482178, + "logps/chosen": -2.0807981491088867, + "logps/rejected": -2.4495043754577637, + "loss": 1.7965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.807979583740234, + "rewards/margins": 3.6870627403259277, + "rewards/rejected": -24.495044708251953, + "step": 23485 + }, + { + "epoch": 0.7917354814789848, + "grad_norm": 17.588529586791992, + "learning_rate": 1.2640853491118736e-07, + "logits/chosen": -2.079242706298828, + "logits/rejected": -2.186800479888916, + "logps/chosen": -3.2707207202911377, + "logps/rejected": -3.467264175415039, + "loss": 3.3609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.70720672607422, + "rewards/margins": 1.965433120727539, + "rewards/rejected": -34.67264175415039, + "step": 23490 + }, + { + "epoch": 0.7919040075499679, + "grad_norm": 55.01066207885742, + "learning_rate": 1.262131122978632e-07, + "logits/chosen": -1.6255840063095093, + "logits/rejected": -1.6399444341659546, + "logps/chosen": -2.202665328979492, + "logps/rejected": -2.0976974964141846, + "loss": 4.2069, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.026653289794922, + "rewards/margins": -1.0496762990951538, + "rewards/rejected": -20.976974487304688, + "step": 23495 + }, + { + "epoch": 0.7920725336209512, + "grad_norm": 43.822635650634766, + "learning_rate": 1.2601781903745428e-07, + "logits/chosen": -2.414681911468506, + "logits/rejected": -2.323493480682373, + "logps/chosen": -2.716387987136841, + "logps/rejected": -3.0716395378112793, + "loss": 1.8628, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.16387939453125, + "rewards/margins": 3.5525169372558594, + "rewards/rejected": -30.71639633178711, + "step": 23500 + }, + { + "epoch": 0.7922410596919344, + "grad_norm": 30.146278381347656, + "learning_rate": 1.2582265519754383e-07, + "logits/chosen": -1.63968026638031, + "logits/rejected": -1.736728310585022, + "logps/chosen": -2.398235559463501, + "logps/rejected": -2.7017436027526855, + "loss": 2.6609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.98235511779785, + "rewards/margins": 3.0350775718688965, + "rewards/rejected": -27.017431259155273, + "step": 23505 + }, + { + "epoch": 0.7924095857629175, + "grad_norm": 15.89415168762207, + "learning_rate": 1.256276208456706e-07, + "logits/chosen": -1.4977285861968994, + "logits/rejected": -1.5650742053985596, + "logps/chosen": -2.1227152347564697, + "logps/rejected": -2.2276690006256104, + "loss": 2.452, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.22715187072754, + "rewards/margins": 1.0495368242263794, + "rewards/rejected": -22.276689529418945, + "step": 23510 + }, + { + "epoch": 0.7925781118339007, + "grad_norm": 18.189443588256836, + "learning_rate": 1.2543271604932798e-07, + "logits/chosen": -1.3050501346588135, + "logits/rejected": -1.4186303615570068, + "logps/chosen": -2.170626163482666, + "logps/rejected": -2.408052921295166, + "loss": 3.0794, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.70625877380371, + "rewards/margins": 2.3742713928222656, + "rewards/rejected": -24.080530166625977, + "step": 23515 + }, + { + "epoch": 0.7927466379048839, + "grad_norm": 21.401525497436523, + "learning_rate": 1.2523794087596497e-07, + "logits/chosen": -1.9882118701934814, + "logits/rejected": -2.178072214126587, + "logps/chosen": -2.76887583732605, + "logps/rejected": -2.8130533695220947, + "loss": 4.3253, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.688756942749023, + "rewards/margins": 0.44177302718162537, + "rewards/rejected": -28.13053321838379, + "step": 23520 + }, + { + "epoch": 0.792915163975867, + "grad_norm": 20.015840530395508, + "learning_rate": 1.250432953929857e-07, + "logits/chosen": -0.9792146682739258, + "logits/rejected": -1.1626332998275757, + "logps/chosen": -3.1012685298919678, + "logps/rejected": -3.83912992477417, + "loss": 4.7123, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -31.012685775756836, + "rewards/margins": 7.3786163330078125, + "rewards/rejected": -38.39130401611328, + "step": 23525 + }, + { + "epoch": 0.7930836900468502, + "grad_norm": 17.142728805541992, + "learning_rate": 1.2484877966774903e-07, + "logits/chosen": -1.9387973546981812, + "logits/rejected": -2.458888292312622, + "logps/chosen": -2.6190226078033447, + "logps/rejected": -3.124077320098877, + "loss": 1.9793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.190227508544922, + "rewards/margins": 5.050547122955322, + "rewards/rejected": -31.240772247314453, + "step": 23530 + }, + { + "epoch": 0.7932522161178335, + "grad_norm": 67.72013092041016, + "learning_rate": 1.2465439376756937e-07, + "logits/chosen": -2.1118016242980957, + "logits/rejected": -2.0462646484375, + "logps/chosen": -2.5230441093444824, + "logps/rejected": -2.5223631858825684, + "loss": 3.2112, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.230443954467773, + "rewards/margins": -0.006808662321418524, + "rewards/rejected": -25.223636627197266, + "step": 23535 + }, + { + "epoch": 0.7934207421888166, + "grad_norm": 1.92659592628479, + "learning_rate": 1.2446013775971604e-07, + "logits/chosen": -1.891710638999939, + "logits/rejected": -2.7494044303894043, + "logps/chosen": -1.8182204961776733, + "logps/rejected": -2.3620352745056152, + "loss": 0.9304, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.182205200195312, + "rewards/margins": 5.438145637512207, + "rewards/rejected": -23.620351791381836, + "step": 23540 + }, + { + "epoch": 0.7935892682597998, + "grad_norm": 16.09982681274414, + "learning_rate": 1.2426601171141344e-07, + "logits/chosen": -1.489386796951294, + "logits/rejected": -1.820673942565918, + "logps/chosen": -1.9332993030548096, + "logps/rejected": -2.682340621948242, + "loss": 1.2905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.332992553710938, + "rewards/margins": 7.490413665771484, + "rewards/rejected": -26.823406219482422, + "step": 23545 + }, + { + "epoch": 0.793757794330783, + "grad_norm": 16.377511978149414, + "learning_rate": 1.240720156898407e-07, + "logits/chosen": -1.8569927215576172, + "logits/rejected": -1.9203824996948242, + "logps/chosen": -2.1887829303741455, + "logps/rejected": -2.242539882659912, + "loss": 3.3256, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.887828826904297, + "rewards/margins": 0.5375703573226929, + "rewards/rejected": -22.425399780273438, + "step": 23550 + }, + { + "epoch": 0.7939263204017661, + "grad_norm": 290.3534240722656, + "learning_rate": 1.238781497621324e-07, + "logits/chosen": -2.299469470977783, + "logits/rejected": -2.2020373344421387, + "logps/chosen": -2.653339385986328, + "logps/rejected": -2.5723299980163574, + "loss": 4.8782, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.533395767211914, + "rewards/margins": -0.8100942373275757, + "rewards/rejected": -25.72330093383789, + "step": 23555 + }, + { + "epoch": 0.7940948464727493, + "grad_norm": 3.090402603149414, + "learning_rate": 1.2368441399537804e-07, + "logits/chosen": -1.4215683937072754, + "logits/rejected": -1.6714990139007568, + "logps/chosen": -2.486147403717041, + "logps/rejected": -3.095050811767578, + "loss": 1.8828, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.86147689819336, + "rewards/margins": 6.089034080505371, + "rewards/rejected": -30.950510025024414, + "step": 23560 + }, + { + "epoch": 0.7942633725437325, + "grad_norm": 92.12581634521484, + "learning_rate": 1.234908084566215e-07, + "logits/chosen": -1.4645830392837524, + "logits/rejected": -1.3868095874786377, + "logps/chosen": -1.9499835968017578, + "logps/rejected": -1.8435981273651123, + "loss": 4.3021, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.499835968017578, + "rewards/margins": -1.0638525485992432, + "rewards/rejected": -18.435983657836914, + "step": 23565 + }, + { + "epoch": 0.7944318986147157, + "grad_norm": 26.912521362304688, + "learning_rate": 1.232973332128624e-07, + "logits/chosen": -1.7435210943222046, + "logits/rejected": -2.2686400413513184, + "logps/chosen": -2.8063995838165283, + "logps/rejected": -3.7166335582733154, + "loss": 1.2754, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.063995361328125, + "rewards/margins": 9.102341651916504, + "rewards/rejected": -37.16633987426758, + "step": 23570 + }, + { + "epoch": 0.7946004246856989, + "grad_norm": 23.525171279907227, + "learning_rate": 1.2310398833105473e-07, + "logits/chosen": -1.9364773035049438, + "logits/rejected": -2.058380603790283, + "logps/chosen": -2.36995792388916, + "logps/rejected": -2.8152060508728027, + "loss": 1.8172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.6995792388916, + "rewards/margins": 4.452480792999268, + "rewards/rejected": -28.15205955505371, + "step": 23575 + }, + { + "epoch": 0.7947689507566821, + "grad_norm": 23.474639892578125, + "learning_rate": 1.229107738781076e-07, + "logits/chosen": -1.6277767419815063, + "logits/rejected": -2.2872207164764404, + "logps/chosen": -3.2178142070770264, + "logps/rejected": -4.609158515930176, + "loss": 2.4449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -32.178138732910156, + "rewards/margins": 13.913442611694336, + "rewards/rejected": -46.091583251953125, + "step": 23580 + }, + { + "epoch": 0.7949374768276652, + "grad_norm": 70.98563385009766, + "learning_rate": 1.227176899208849e-07, + "logits/chosen": -1.8348493576049805, + "logits/rejected": -2.1374592781066895, + "logps/chosen": -2.2831220626831055, + "logps/rejected": -2.608473777770996, + "loss": 2.3469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.831220626831055, + "rewards/margins": 3.253516674041748, + "rewards/rejected": -26.08473777770996, + "step": 23585 + }, + { + "epoch": 0.7951060028986484, + "grad_norm": 30.179494857788086, + "learning_rate": 1.2252473652620555e-07, + "logits/chosen": -1.1523596048355103, + "logits/rejected": -1.3122644424438477, + "logps/chosen": -1.8992496728897095, + "logps/rejected": -1.9855247735977173, + "loss": 2.7997, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.992496490478516, + "rewards/margins": 0.8627524375915527, + "rewards/rejected": -19.855249404907227, + "step": 23590 + }, + { + "epoch": 0.7952745289696316, + "grad_norm": 34.42359924316406, + "learning_rate": 1.2233191376084278e-07, + "logits/chosen": -1.2946655750274658, + "logits/rejected": -1.474416732788086, + "logps/chosen": -2.49336838722229, + "logps/rejected": -2.689332962036133, + "loss": 2.6393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.933683395385742, + "rewards/margins": 1.9596456289291382, + "rewards/rejected": -26.893329620361328, + "step": 23595 + }, + { + "epoch": 0.7954430550406147, + "grad_norm": 32.83942794799805, + "learning_rate": 1.2213922169152512e-07, + "logits/chosen": -2.071537494659424, + "logits/rejected": -1.9408347606658936, + "logps/chosen": -2.3998780250549316, + "logps/rejected": -2.4216084480285645, + "loss": 3.0872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.998775482177734, + "rewards/margins": 0.21730585396289825, + "rewards/rejected": -24.216081619262695, + "step": 23600 + }, + { + "epoch": 0.7954430550406147, + "eval_logits/chosen": -2.268324375152588, + "eval_logits/rejected": -2.4430205821990967, + "eval_logps/chosen": -2.2717368602752686, + "eval_logps/rejected": -2.42488694190979, + "eval_loss": 3.081331491470337, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.717369079589844, + "eval_rewards/margins": 1.5315022468566895, + "eval_rewards/rejected": -24.248868942260742, + "eval_runtime": 12.9011, + "eval_samples_per_second": 7.751, + "eval_steps_per_second": 1.938, + "step": 23600 + }, + { + "epoch": 0.7956115811115979, + "grad_norm": 23.862712860107422, + "learning_rate": 1.2194666038493572e-07, + "logits/chosen": -2.4514338970184326, + "logits/rejected": -2.3811779022216797, + "logps/chosen": -2.263093948364258, + "logps/rejected": -2.6213536262512207, + "loss": 2.0757, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.63094139099121, + "rewards/margins": 3.582595109939575, + "rewards/rejected": -26.213537216186523, + "step": 23605 + }, + { + "epoch": 0.7957801071825812, + "grad_norm": 18.74156951904297, + "learning_rate": 1.217542299077125e-07, + "logits/chosen": -1.8089141845703125, + "logits/rejected": -2.0515694618225098, + "logps/chosen": -3.133936643600464, + "logps/rejected": -2.848822593688965, + "loss": 6.858, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -31.339366912841797, + "rewards/margins": -2.8511433601379395, + "rewards/rejected": -28.48822593688965, + "step": 23610 + }, + { + "epoch": 0.7959486332535644, + "grad_norm": 16.450408935546875, + "learning_rate": 1.2156193032644814e-07, + "logits/chosen": -1.2309521436691284, + "logits/rejected": -1.6049835681915283, + "logps/chosen": -1.9556461572647095, + "logps/rejected": -2.1087331771850586, + "loss": 2.3946, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.556461334228516, + "rewards/margins": 1.5308706760406494, + "rewards/rejected": -21.087329864501953, + "step": 23615 + }, + { + "epoch": 0.7961171593245475, + "grad_norm": 23.52358627319336, + "learning_rate": 1.2136976170768964e-07, + "logits/chosen": -2.2708792686462402, + "logits/rejected": -2.4395358562469482, + "logps/chosen": -2.665672540664673, + "logps/rejected": -2.482041358947754, + "loss": 5.9053, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.656723022460938, + "rewards/margins": -1.8363081216812134, + "rewards/rejected": -24.820415496826172, + "step": 23620 + }, + { + "epoch": 0.7962856853955307, + "grad_norm": 43.37750244140625, + "learning_rate": 1.2117772411793926e-07, + "logits/chosen": -1.9793859720230103, + "logits/rejected": -2.1826746463775635, + "logps/chosen": -2.1819968223571777, + "logps/rejected": -2.199878215789795, + "loss": 2.9864, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.819969177246094, + "rewards/margins": 0.17881163954734802, + "rewards/rejected": -21.998783111572266, + "step": 23625 + }, + { + "epoch": 0.7964542114665139, + "grad_norm": 59.14590835571289, + "learning_rate": 1.2098581762365362e-07, + "logits/chosen": -1.2757107019424438, + "logits/rejected": -1.9035346508026123, + "logps/chosen": -2.4866976737976074, + "logps/rejected": -3.309864044189453, + "loss": 2.3694, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.86697769165039, + "rewards/margins": 8.23166561126709, + "rewards/rejected": -33.09864044189453, + "step": 23630 + }, + { + "epoch": 0.796622737537497, + "grad_norm": 185.86553955078125, + "learning_rate": 1.2079404229124384e-07, + "logits/chosen": -2.433590888977051, + "logits/rejected": -2.8083548545837402, + "logps/chosen": -3.4761478900909424, + "logps/rejected": -3.9497249126434326, + "loss": 3.5581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.761478424072266, + "rewards/margins": 4.73577356338501, + "rewards/rejected": -39.497249603271484, + "step": 23635 + }, + { + "epoch": 0.7967912636084802, + "grad_norm": 29.933055877685547, + "learning_rate": 1.206023981870759e-07, + "logits/chosen": -1.9267528057098389, + "logits/rejected": -2.0702621936798096, + "logps/chosen": -2.0923218727111816, + "logps/rejected": -2.2572989463806152, + "loss": 2.6568, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.923219680786133, + "rewards/margins": 1.6497719287872314, + "rewards/rejected": -22.57299041748047, + "step": 23640 + }, + { + "epoch": 0.7969597896794635, + "grad_norm": 35.02862548828125, + "learning_rate": 1.204108853774704e-07, + "logits/chosen": -2.425234317779541, + "logits/rejected": -2.033950090408325, + "logps/chosen": -2.5497958660125732, + "logps/rejected": -2.62798810005188, + "loss": 2.8787, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.49795913696289, + "rewards/margins": 0.7819207906723022, + "rewards/rejected": -26.27988052368164, + "step": 23645 + }, + { + "epoch": 0.7971283157504466, + "grad_norm": 36.276649475097656, + "learning_rate": 1.2021950392870217e-07, + "logits/chosen": -1.880113959312439, + "logits/rejected": -2.2360730171203613, + "logps/chosen": -2.114194393157959, + "logps/rejected": -2.259273052215576, + "loss": 2.6131, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.14194107055664, + "rewards/margins": 1.4507843255996704, + "rewards/rejected": -22.592727661132812, + "step": 23650 + }, + { + "epoch": 0.7972968418214298, + "grad_norm": 0.8350377678871155, + "learning_rate": 1.2002825390700083e-07, + "logits/chosen": -1.2839758396148682, + "logits/rejected": -1.7719905376434326, + "logps/chosen": -2.047053575515747, + "logps/rejected": -2.2489142417907715, + "loss": 4.0195, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.470537185668945, + "rewards/margins": 2.0186100006103516, + "rewards/rejected": -22.489147186279297, + "step": 23655 + }, + { + "epoch": 0.797465367892413, + "grad_norm": 30.018207550048828, + "learning_rate": 1.1983713537855057e-07, + "logits/chosen": -1.1491228342056274, + "logits/rejected": -1.546242356300354, + "logps/chosen": -1.9170169830322266, + "logps/rejected": -1.9767497777938843, + "loss": 3.0863, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.170169830322266, + "rewards/margins": 0.5973286628723145, + "rewards/rejected": -19.767498016357422, + "step": 23660 + }, + { + "epoch": 0.7976338939633961, + "grad_norm": 19.978004455566406, + "learning_rate": 1.1964614840949e-07, + "logits/chosen": -1.8129030466079712, + "logits/rejected": -2.4642412662506104, + "logps/chosen": -2.960409164428711, + "logps/rejected": -3.7300522327423096, + "loss": 1.272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.604089736938477, + "rewards/margins": 7.696431636810303, + "rewards/rejected": -37.30052185058594, + "step": 23665 + }, + { + "epoch": 0.7978024200343793, + "grad_norm": 30.25568962097168, + "learning_rate": 1.19455293065912e-07, + "logits/chosen": -2.0144896507263184, + "logits/rejected": -2.042292833328247, + "logps/chosen": -2.054476499557495, + "logps/rejected": -2.0532383918762207, + "loss": 3.2688, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.54476547241211, + "rewards/margins": -0.012381648644804955, + "rewards/rejected": -20.532384872436523, + "step": 23670 + }, + { + "epoch": 0.7979709461053625, + "grad_norm": 14.91845417022705, + "learning_rate": 1.1926456941386427e-07, + "logits/chosen": -1.9736425876617432, + "logits/rejected": -2.074195146560669, + "logps/chosen": -2.37538743019104, + "logps/rejected": -3.2154929637908936, + "loss": 1.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.753873825073242, + "rewards/margins": 8.401058197021484, + "rewards/rejected": -32.154930114746094, + "step": 23675 + }, + { + "epoch": 0.7981394721763457, + "grad_norm": 25.458602905273438, + "learning_rate": 1.1907397751934878e-07, + "logits/chosen": -2.014819622039795, + "logits/rejected": -2.256798028945923, + "logps/chosen": -2.9026269912719727, + "logps/rejected": -3.2940673828125, + "loss": 1.7844, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.026269912719727, + "rewards/margins": 3.9144043922424316, + "rewards/rejected": -32.940677642822266, + "step": 23680 + }, + { + "epoch": 0.7983079982473289, + "grad_norm": 41.000709533691406, + "learning_rate": 1.1888351744832165e-07, + "logits/chosen": -1.6192891597747803, + "logits/rejected": -1.9515184164047241, + "logps/chosen": -2.7270865440368652, + "logps/rejected": -2.7937889099121094, + "loss": 3.3432, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.270864486694336, + "rewards/margins": 0.6670233011245728, + "rewards/rejected": -27.937885284423828, + "step": 23685 + }, + { + "epoch": 0.7984765243183121, + "grad_norm": 53.08238983154297, + "learning_rate": 1.1869318926669375e-07, + "logits/chosen": -1.8159668445587158, + "logits/rejected": -2.2407050132751465, + "logps/chosen": -2.7758102416992188, + "logps/rejected": -2.7419583797454834, + "loss": 4.1484, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.758102416992188, + "rewards/margins": -0.3385186195373535, + "rewards/rejected": -27.419586181640625, + "step": 23690 + }, + { + "epoch": 0.7986450503892952, + "grad_norm": 31.205656051635742, + "learning_rate": 1.1850299304033012e-07, + "logits/chosen": -1.4135322570800781, + "logits/rejected": -2.1289544105529785, + "logps/chosen": -2.77282452583313, + "logps/rejected": -3.640005588531494, + "loss": 3.2513, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.728246688842773, + "rewards/margins": 8.671809196472168, + "rewards/rejected": -36.400054931640625, + "step": 23695 + }, + { + "epoch": 0.7988135764602784, + "grad_norm": 11.864728927612305, + "learning_rate": 1.183129288350504e-07, + "logits/chosen": -1.782881736755371, + "logits/rejected": -1.5927501916885376, + "logps/chosen": -2.4381699562072754, + "logps/rejected": -2.653639793395996, + "loss": 3.9229, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.381698608398438, + "rewards/margins": 2.1546998023986816, + "rewards/rejected": -26.536401748657227, + "step": 23700 + }, + { + "epoch": 0.7989821025312616, + "grad_norm": 36.167850494384766, + "learning_rate": 1.1812299671662801e-07, + "logits/chosen": -1.3345615863800049, + "logits/rejected": -1.5490849018096924, + "logps/chosen": -2.146177053451538, + "logps/rejected": -3.1967315673828125, + "loss": 2.8808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.461772918701172, + "rewards/margins": 10.505544662475586, + "rewards/rejected": -31.967315673828125, + "step": 23705 + }, + { + "epoch": 0.7991506286022447, + "grad_norm": 25.301176071166992, + "learning_rate": 1.1793319675079105e-07, + "logits/chosen": -1.9392036199569702, + "logits/rejected": -2.0048937797546387, + "logps/chosen": -2.289959669113159, + "logps/rejected": -2.714442491531372, + "loss": 1.4998, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.899595260620117, + "rewards/margins": 4.244827747344971, + "rewards/rejected": -27.144420623779297, + "step": 23710 + }, + { + "epoch": 0.7993191546732279, + "grad_norm": 68.43116760253906, + "learning_rate": 1.1774352900322193e-07, + "logits/chosen": -2.4753763675689697, + "logits/rejected": -2.485348701477051, + "logps/chosen": -3.014070510864258, + "logps/rejected": -3.5698540210723877, + "loss": 2.7535, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.140705108642578, + "rewards/margins": 5.557833671569824, + "rewards/rejected": -35.69853973388672, + "step": 23715 + }, + { + "epoch": 0.7994876807442112, + "grad_norm": 18.36795425415039, + "learning_rate": 1.1755399353955719e-07, + "logits/chosen": -2.0299389362335205, + "logits/rejected": -2.1144871711730957, + "logps/chosen": -2.5242087841033936, + "logps/rejected": -2.5602338314056396, + "loss": 4.3361, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.24208641052246, + "rewards/margins": 0.36025285720825195, + "rewards/rejected": -25.602340698242188, + "step": 23720 + }, + { + "epoch": 0.7996562068151943, + "grad_norm": 22.82927131652832, + "learning_rate": 1.1736459042538744e-07, + "logits/chosen": -2.183875799179077, + "logits/rejected": -2.2232208251953125, + "logps/chosen": -2.3259165287017822, + "logps/rejected": -2.5631721019744873, + "loss": 2.9509, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.259164810180664, + "rewards/margins": 2.3725597858428955, + "rewards/rejected": -25.631725311279297, + "step": 23725 + }, + { + "epoch": 0.7998247328861775, + "grad_norm": 42.674198150634766, + "learning_rate": 1.1717531972625766e-07, + "logits/chosen": -1.6192405223846436, + "logits/rejected": -1.6916412115097046, + "logps/chosen": -1.889173150062561, + "logps/rejected": -1.9974628686904907, + "loss": 3.0027, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -18.891727447509766, + "rewards/margins": 1.082900047302246, + "rewards/rejected": -19.974628448486328, + "step": 23730 + }, + { + "epoch": 0.7999932589571607, + "grad_norm": 29.702539443969727, + "learning_rate": 1.1698618150766703e-07, + "logits/chosen": -1.8145778179168701, + "logits/rejected": -2.477713108062744, + "logps/chosen": -2.2364907264709473, + "logps/rejected": -2.771986484527588, + "loss": 1.8934, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.36490821838379, + "rewards/margins": 5.354956150054932, + "rewards/rejected": -27.719863891601562, + "step": 23735 + }, + { + "epoch": 0.8001617850281438, + "grad_norm": 117.44536590576172, + "learning_rate": 1.1679717583506887e-07, + "logits/chosen": -2.1266894340515137, + "logits/rejected": -2.262590169906616, + "logps/chosen": -2.91984224319458, + "logps/rejected": -2.8861470222473145, + "loss": 4.2483, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.198421478271484, + "rewards/margins": -0.33695316314697266, + "rewards/rejected": -28.861469268798828, + "step": 23740 + }, + { + "epoch": 0.800330311099127, + "grad_norm": 27.362186431884766, + "learning_rate": 1.1660830277387057e-07, + "logits/chosen": -1.4639588594436646, + "logits/rejected": -1.4785763025283813, + "logps/chosen": -2.717275619506836, + "logps/rejected": -2.8771772384643555, + "loss": 3.1879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.17275619506836, + "rewards/margins": 1.5990158319473267, + "rewards/rejected": -28.771770477294922, + "step": 23745 + }, + { + "epoch": 0.8004988371701102, + "grad_norm": 36.77997589111328, + "learning_rate": 1.1641956238943374e-07, + "logits/chosen": -2.3897011280059814, + "logits/rejected": -2.4779114723205566, + "logps/chosen": -2.4858672618865967, + "logps/rejected": -3.1868858337402344, + "loss": 2.9593, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.858673095703125, + "rewards/margins": 7.010186672210693, + "rewards/rejected": -31.868860244750977, + "step": 23750 + }, + { + "epoch": 0.8006673632410934, + "grad_norm": 48.805233001708984, + "learning_rate": 1.1623095474707384e-07, + "logits/chosen": -2.0982353687286377, + "logits/rejected": -2.279383897781372, + "logps/chosen": -2.692783832550049, + "logps/rejected": -2.985405206680298, + "loss": 2.3231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.927837371826172, + "rewards/margins": 2.926213502883911, + "rewards/rejected": -29.854053497314453, + "step": 23755 + }, + { + "epoch": 0.8008358893120766, + "grad_norm": 32.98495101928711, + "learning_rate": 1.160424799120605e-07, + "logits/chosen": -1.9733015298843384, + "logits/rejected": -2.725742816925049, + "logps/chosen": -2.0132980346679688, + "logps/rejected": -2.7367465496063232, + "loss": 0.4751, + "rewards/accuracies": 1.0, + "rewards/chosen": -20.132980346679688, + "rewards/margins": 7.234484672546387, + "rewards/rejected": -27.36746597290039, + "step": 23760 + }, + { + "epoch": 0.8010044153830598, + "grad_norm": 9.345693588256836, + "learning_rate": 1.1585413794961763e-07, + "logits/chosen": -2.085667848587036, + "logits/rejected": -2.239039659500122, + "logps/chosen": -2.438840866088867, + "logps/rejected": -3.195539712905884, + "loss": 1.1036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.388408660888672, + "rewards/margins": 7.566989898681641, + "rewards/rejected": -31.955394744873047, + "step": 23765 + }, + { + "epoch": 0.8011729414540429, + "grad_norm": 45.57161331176758, + "learning_rate": 1.1566592892492299e-07, + "logits/chosen": -1.311514139175415, + "logits/rejected": -1.605931043624878, + "logps/chosen": -2.6677563190460205, + "logps/rejected": -2.8092823028564453, + "loss": 2.9337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.677562713623047, + "rewards/margins": 1.415259599685669, + "rewards/rejected": -28.092823028564453, + "step": 23770 + }, + { + "epoch": 0.8013414675250261, + "grad_norm": 34.057655334472656, + "learning_rate": 1.1547785290310801e-07, + "logits/chosen": -0.9410927891731262, + "logits/rejected": -1.327196717262268, + "logps/chosen": -3.036648988723755, + "logps/rejected": -3.29571533203125, + "loss": 3.1449, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.366491317749023, + "rewards/margins": 2.5906639099121094, + "rewards/rejected": -32.9571533203125, + "step": 23775 + }, + { + "epoch": 0.8015099935960093, + "grad_norm": 61.107078552246094, + "learning_rate": 1.1528990994925864e-07, + "logits/chosen": -1.8493750095367432, + "logits/rejected": -2.0228495597839355, + "logps/chosen": -3.3258419036865234, + "logps/rejected": -3.1576480865478516, + "loss": 6.4411, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -33.2584228515625, + "rewards/margins": -1.6819394826889038, + "rewards/rejected": -31.576480865478516, + "step": 23780 + }, + { + "epoch": 0.8016785196669924, + "grad_norm": 46.79604721069336, + "learning_rate": 1.1510210012841454e-07, + "logits/chosen": -0.7938812375068665, + "logits/rejected": -0.8427656292915344, + "logps/chosen": -2.245758533477783, + "logps/rejected": -2.696263551712036, + "loss": 2.7041, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.457584381103516, + "rewards/margins": 4.505049705505371, + "rewards/rejected": -26.962635040283203, + "step": 23785 + }, + { + "epoch": 0.8018470457379757, + "grad_norm": 10.535088539123535, + "learning_rate": 1.1491442350556913e-07, + "logits/chosen": -1.5898624658584595, + "logits/rejected": -1.7450120449066162, + "logps/chosen": -2.6972622871398926, + "logps/rejected": -2.9257194995880127, + "loss": 1.6859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.972620010375977, + "rewards/margins": 2.2845757007598877, + "rewards/rejected": -29.257198333740234, + "step": 23790 + }, + { + "epoch": 0.8020155718089589, + "grad_norm": 22.262107849121094, + "learning_rate": 1.1472688014566994e-07, + "logits/chosen": -2.0982842445373535, + "logits/rejected": -2.317521095275879, + "logps/chosen": -3.2699222564697266, + "logps/rejected": -3.510211944580078, + "loss": 4.1141, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.69922637939453, + "rewards/margins": 2.4028992652893066, + "rewards/rejected": -35.10212326049805, + "step": 23795 + }, + { + "epoch": 0.802184097879942, + "grad_norm": 19.394956588745117, + "learning_rate": 1.1453947011361837e-07, + "logits/chosen": -1.8001686334609985, + "logits/rejected": -2.0557053089141846, + "logps/chosen": -2.1141293048858643, + "logps/rejected": -2.363154888153076, + "loss": 2.1705, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.141294479370117, + "rewards/margins": 2.490255832672119, + "rewards/rejected": -23.631549835205078, + "step": 23800 + }, + { + "epoch": 0.8023526239509252, + "grad_norm": 33.673126220703125, + "learning_rate": 1.1435219347426983e-07, + "logits/chosen": -1.997754693031311, + "logits/rejected": -2.667611598968506, + "logps/chosen": -2.3104987144470215, + "logps/rejected": -3.8297152519226074, + "loss": 1.729, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.10498809814453, + "rewards/margins": 15.192166328430176, + "rewards/rejected": -38.297157287597656, + "step": 23805 + }, + { + "epoch": 0.8025211500219084, + "grad_norm": 46.4930534362793, + "learning_rate": 1.1416505029243307e-07, + "logits/chosen": -1.2600901126861572, + "logits/rejected": -1.6382179260253906, + "logps/chosen": -2.0742053985595703, + "logps/rejected": -2.07768177986145, + "loss": 3.2559, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.742053985595703, + "rewards/margins": 0.03476228564977646, + "rewards/rejected": -20.776817321777344, + "step": 23810 + }, + { + "epoch": 0.8026896760928915, + "grad_norm": 20.000106811523438, + "learning_rate": 1.1397804063287109e-07, + "logits/chosen": -1.8872020244598389, + "logits/rejected": -2.278292179107666, + "logps/chosen": -2.424549102783203, + "logps/rejected": -2.9823575019836426, + "loss": 2.7308, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.2454891204834, + "rewards/margins": 5.578084468841553, + "rewards/rejected": -29.82357406616211, + "step": 23815 + }, + { + "epoch": 0.8028582021638747, + "grad_norm": 77.44605255126953, + "learning_rate": 1.1379116456030074e-07, + "logits/chosen": -1.7925255298614502, + "logits/rejected": -1.7629890441894531, + "logps/chosen": -2.2893612384796143, + "logps/rejected": -2.439579486846924, + "loss": 2.0319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.89361000061035, + "rewards/margins": 1.5021837949752808, + "rewards/rejected": -24.395793914794922, + "step": 23820 + }, + { + "epoch": 0.8030267282348579, + "grad_norm": 43.680755615234375, + "learning_rate": 1.1360442213939215e-07, + "logits/chosen": -1.9656673669815063, + "logits/rejected": -2.002274990081787, + "logps/chosen": -2.452765941619873, + "logps/rejected": -2.5063955783843994, + "loss": 4.3844, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.52766227722168, + "rewards/margins": 0.5362951159477234, + "rewards/rejected": -25.063955307006836, + "step": 23825 + }, + { + "epoch": 0.8031952543058412, + "grad_norm": 58.82378005981445, + "learning_rate": 1.1341781343476969e-07, + "logits/chosen": -2.0588455200195312, + "logits/rejected": -2.2516112327575684, + "logps/chosen": -2.668566942214966, + "logps/rejected": -2.5644493103027344, + "loss": 4.2421, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.685672760009766, + "rewards/margins": -1.0411770343780518, + "rewards/rejected": -25.64449691772461, + "step": 23830 + }, + { + "epoch": 0.8033637803768243, + "grad_norm": 20.719966888427734, + "learning_rate": 1.132313385110113e-07, + "logits/chosen": -1.1614582538604736, + "logits/rejected": -1.6384683847427368, + "logps/chosen": -2.3579354286193848, + "logps/rejected": -2.794785499572754, + "loss": 0.9973, + "rewards/accuracies": 1.0, + "rewards/chosen": -23.579357147216797, + "rewards/margins": 4.368496417999268, + "rewards/rejected": -27.94785499572754, + "step": 23835 + }, + { + "epoch": 0.8035323064478075, + "grad_norm": 256.3408203125, + "learning_rate": 1.1304499743264867e-07, + "logits/chosen": -2.2006125450134277, + "logits/rejected": -2.2175331115722656, + "logps/chosen": -3.5510592460632324, + "logps/rejected": -3.5691275596618652, + "loss": 4.0592, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.510597229003906, + "rewards/margins": 0.18068209290504456, + "rewards/rejected": -35.69127655029297, + "step": 23840 + }, + { + "epoch": 0.8037008325187907, + "grad_norm": 39.30622863769531, + "learning_rate": 1.1285879026416689e-07, + "logits/chosen": -1.4549888372421265, + "logits/rejected": -1.4136347770690918, + "logps/chosen": -2.2924153804779053, + "logps/rejected": -2.366925001144409, + "loss": 2.9058, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.924156188964844, + "rewards/margins": 0.7450965046882629, + "rewards/rejected": -23.66925048828125, + "step": 23845 + }, + { + "epoch": 0.8038693585897738, + "grad_norm": 30.57029151916504, + "learning_rate": 1.1267271707000509e-07, + "logits/chosen": -1.4190336465835571, + "logits/rejected": -1.3612552881240845, + "logps/chosen": -2.9762558937072754, + "logps/rejected": -3.1376760005950928, + "loss": 2.493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.762561798095703, + "rewards/margins": 1.6142009496688843, + "rewards/rejected": -31.376760482788086, + "step": 23850 + }, + { + "epoch": 0.804037884660757, + "grad_norm": 34.12413787841797, + "learning_rate": 1.124867779145559e-07, + "logits/chosen": -1.8991806507110596, + "logits/rejected": -1.9499584436416626, + "logps/chosen": -2.0986106395721436, + "logps/rejected": -2.1971726417541504, + "loss": 2.7685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.986108779907227, + "rewards/margins": 0.985619068145752, + "rewards/rejected": -21.97172737121582, + "step": 23855 + }, + { + "epoch": 0.8042064107317402, + "grad_norm": 44.969398498535156, + "learning_rate": 1.1230097286216539e-07, + "logits/chosen": -1.7429790496826172, + "logits/rejected": -1.8025766611099243, + "logps/chosen": -2.431539297103882, + "logps/rejected": -2.4856417179107666, + "loss": 3.4906, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.31539535522461, + "rewards/margins": 0.5410224199295044, + "rewards/rejected": -24.856416702270508, + "step": 23860 + }, + { + "epoch": 0.8043749368027234, + "grad_norm": 257.18902587890625, + "learning_rate": 1.1211530197713337e-07, + "logits/chosen": -1.8138539791107178, + "logits/rejected": -1.8681204319000244, + "logps/chosen": -2.7075095176696777, + "logps/rejected": -2.7071266174316406, + "loss": 5.5266, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.075098037719727, + "rewards/margins": -0.0038280487060546875, + "rewards/rejected": -27.071269989013672, + "step": 23865 + }, + { + "epoch": 0.8045434628737066, + "grad_norm": 23.051807403564453, + "learning_rate": 1.1192976532371334e-07, + "logits/chosen": -1.281162142753601, + "logits/rejected": -1.1859794855117798, + "logps/chosen": -2.280212879180908, + "logps/rejected": -2.2272915840148926, + "loss": 3.8509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.802125930786133, + "rewards/margins": -0.5292104482650757, + "rewards/rejected": -22.27291488647461, + "step": 23870 + }, + { + "epoch": 0.8047119889446898, + "grad_norm": 20.657026290893555, + "learning_rate": 1.1174436296611212e-07, + "logits/chosen": -1.8478273153305054, + "logits/rejected": -1.8438236713409424, + "logps/chosen": -3.1947686672210693, + "logps/rejected": -3.2079625129699707, + "loss": 4.3144, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.94768714904785, + "rewards/margins": 0.13193626701831818, + "rewards/rejected": -32.07962417602539, + "step": 23875 + }, + { + "epoch": 0.8048805150156729, + "grad_norm": 45.678001403808594, + "learning_rate": 1.1155909496849026e-07, + "logits/chosen": -1.617417335510254, + "logits/rejected": -1.949262022972107, + "logps/chosen": -2.0899531841278076, + "logps/rejected": -2.289635419845581, + "loss": 2.7244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.899532318115234, + "rewards/margins": 1.9968221187591553, + "rewards/rejected": -22.89635467529297, + "step": 23880 + }, + { + "epoch": 0.8050490410866561, + "grad_norm": 34.41527557373047, + "learning_rate": 1.1137396139496164e-07, + "logits/chosen": -1.926828145980835, + "logits/rejected": -1.8516514301300049, + "logps/chosen": -2.007347583770752, + "logps/rejected": -2.1980199813842773, + "loss": 2.5529, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.073474884033203, + "rewards/margins": 1.9067256450653076, + "rewards/rejected": -21.980199813842773, + "step": 23885 + }, + { + "epoch": 0.8052175671576393, + "grad_norm": 85.15401458740234, + "learning_rate": 1.111889623095939e-07, + "logits/chosen": -1.903693437576294, + "logits/rejected": -2.0786874294281006, + "logps/chosen": -2.0593502521514893, + "logps/rejected": -2.3961517810821533, + "loss": 2.7698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.593502044677734, + "rewards/margins": 3.3680152893066406, + "rewards/rejected": -23.961519241333008, + "step": 23890 + }, + { + "epoch": 0.8053860932286224, + "grad_norm": 24.764240264892578, + "learning_rate": 1.1100409777640762e-07, + "logits/chosen": -1.818833351135254, + "logits/rejected": -1.9280961751937866, + "logps/chosen": -2.0653393268585205, + "logps/rejected": -2.341346263885498, + "loss": 1.8859, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.653390884399414, + "rewards/margins": 2.760075330734253, + "rewards/rejected": -23.413467407226562, + "step": 23895 + }, + { + "epoch": 0.8055546192996057, + "grad_norm": 29.278295516967773, + "learning_rate": 1.1081936785937724e-07, + "logits/chosen": -1.7818940877914429, + "logits/rejected": -1.3903292417526245, + "logps/chosen": -2.088256597518921, + "logps/rejected": -2.1572296619415283, + "loss": 3.4176, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.882568359375, + "rewards/margins": 0.6897293329238892, + "rewards/rejected": -21.572296142578125, + "step": 23900 + }, + { + "epoch": 0.8057231453705889, + "grad_norm": 67.3364486694336, + "learning_rate": 1.106347726224306e-07, + "logits/chosen": -2.0864202976226807, + "logits/rejected": -1.8650718927383423, + "logps/chosen": -2.2361273765563965, + "logps/rejected": -2.6703240871429443, + "loss": 2.9271, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.36127281188965, + "rewards/margins": 4.341969966888428, + "rewards/rejected": -26.7032413482666, + "step": 23905 + }, + { + "epoch": 0.805891671441572, + "grad_norm": 31.025312423706055, + "learning_rate": 1.1045031212944884e-07, + "logits/chosen": -1.1579580307006836, + "logits/rejected": -1.2433195114135742, + "logps/chosen": -2.173211097717285, + "logps/rejected": -2.2033867835998535, + "loss": 3.1625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.73211097717285, + "rewards/margins": 0.30175837874412537, + "rewards/rejected": -22.033870697021484, + "step": 23910 + }, + { + "epoch": 0.8060601975125552, + "grad_norm": 30.541427612304688, + "learning_rate": 1.1026598644426632e-07, + "logits/chosen": -1.7815994024276733, + "logits/rejected": -1.8972110748291016, + "logps/chosen": -2.0432934761047363, + "logps/rejected": -2.1355528831481934, + "loss": 2.3722, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.432933807373047, + "rewards/margins": 0.9225980043411255, + "rewards/rejected": -21.355531692504883, + "step": 23915 + }, + { + "epoch": 0.8062287235835384, + "grad_norm": 19.195152282714844, + "learning_rate": 1.1008179563067093e-07, + "logits/chosen": -2.0765273571014404, + "logits/rejected": -2.0552916526794434, + "logps/chosen": -3.042235851287842, + "logps/rejected": -3.305041790008545, + "loss": 2.5792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.422359466552734, + "rewards/margins": 2.6280593872070312, + "rewards/rejected": -33.050418853759766, + "step": 23920 + }, + { + "epoch": 0.8063972496545215, + "grad_norm": 21.681163787841797, + "learning_rate": 1.0989773975240412e-07, + "logits/chosen": -1.1462697982788086, + "logits/rejected": -1.4292978048324585, + "logps/chosen": -2.134056568145752, + "logps/rejected": -2.217719793319702, + "loss": 2.7037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.340566635131836, + "rewards/margins": 0.8366325497627258, + "rewards/rejected": -22.17719841003418, + "step": 23925 + }, + { + "epoch": 0.8065657757255047, + "grad_norm": 10.539362907409668, + "learning_rate": 1.0971381887316e-07, + "logits/chosen": -1.6412235498428345, + "logits/rejected": -2.147385835647583, + "logps/chosen": -2.3741135597229004, + "logps/rejected": -2.6469130516052246, + "loss": 3.3071, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.741130828857422, + "rewards/margins": 2.7279977798461914, + "rewards/rejected": -26.469131469726562, + "step": 23930 + }, + { + "epoch": 0.8067343017964879, + "grad_norm": 35.09379577636719, + "learning_rate": 1.0953003305658648e-07, + "logits/chosen": -1.7946780920028687, + "logits/rejected": -1.9496517181396484, + "logps/chosen": -2.2903881072998047, + "logps/rejected": -2.7255663871765137, + "loss": 2.1984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.903881072998047, + "rewards/margins": 4.351784706115723, + "rewards/rejected": -27.255664825439453, + "step": 23935 + }, + { + "epoch": 0.8069028278674711, + "grad_norm": 73.50012969970703, + "learning_rate": 1.0934638236628463e-07, + "logits/chosen": -1.9948482513427734, + "logits/rejected": -2.410553455352783, + "logps/chosen": -2.9056334495544434, + "logps/rejected": -3.6007399559020996, + "loss": 4.061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.05633544921875, + "rewards/margins": 6.951064109802246, + "rewards/rejected": -36.00739669799805, + "step": 23940 + }, + { + "epoch": 0.8070713539384543, + "grad_norm": 11.287801742553711, + "learning_rate": 1.0916286686580884e-07, + "logits/chosen": -1.756542444229126, + "logits/rejected": -1.8862693309783936, + "logps/chosen": -2.3044886589050293, + "logps/rejected": -2.6915547847747803, + "loss": 3.0784, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.04488754272461, + "rewards/margins": 3.8706603050231934, + "rewards/rejected": -26.91554832458496, + "step": 23945 + }, + { + "epoch": 0.8072398800094375, + "grad_norm": 24.094289779663086, + "learning_rate": 1.0897948661866636e-07, + "logits/chosen": -1.5406244993209839, + "logits/rejected": -2.3224754333496094, + "logps/chosen": -1.9037678241729736, + "logps/rejected": -2.257594585418701, + "loss": 1.1585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.037675857543945, + "rewards/margins": 3.53826904296875, + "rewards/rejected": -22.575944900512695, + "step": 23950 + }, + { + "epoch": 0.8074084060804206, + "grad_norm": 22.659090042114258, + "learning_rate": 1.0879624168831792e-07, + "logits/chosen": -1.7213237285614014, + "logits/rejected": -2.021697521209717, + "logps/chosen": -2.254326105117798, + "logps/rejected": -2.6272685527801514, + "loss": 1.8801, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.543262481689453, + "rewards/margins": 3.7294254302978516, + "rewards/rejected": -26.272686004638672, + "step": 23955 + }, + { + "epoch": 0.8075769321514038, + "grad_norm": 33.168418884277344, + "learning_rate": 1.0861313213817758e-07, + "logits/chosen": -2.592442035675049, + "logits/rejected": -2.4539074897766113, + "logps/chosen": -2.8346259593963623, + "logps/rejected": -3.7187466621398926, + "loss": 1.586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.34625816345215, + "rewards/margins": 8.841202735900879, + "rewards/rejected": -37.187461853027344, + "step": 23960 + }, + { + "epoch": 0.807745458222387, + "grad_norm": 21.68806266784668, + "learning_rate": 1.0843015803161204e-07, + "logits/chosen": -1.7804466485977173, + "logits/rejected": -2.094736337661743, + "logps/chosen": -2.116252899169922, + "logps/rejected": -2.531083106994629, + "loss": 1.1304, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.16252899169922, + "rewards/margins": 4.148301124572754, + "rewards/rejected": -25.31083106994629, + "step": 23965 + }, + { + "epoch": 0.8079139842933701, + "grad_norm": 26.970895767211914, + "learning_rate": 1.0824731943194154e-07, + "logits/chosen": -2.08791184425354, + "logits/rejected": -2.282702922821045, + "logps/chosen": -1.9778106212615967, + "logps/rejected": -2.2054924964904785, + "loss": 2.0294, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.778106689453125, + "rewards/margins": 2.276818037033081, + "rewards/rejected": -22.0549259185791, + "step": 23970 + }, + { + "epoch": 0.8080825103643534, + "grad_norm": 25.497480392456055, + "learning_rate": 1.0806461640243941e-07, + "logits/chosen": -1.6113965511322021, + "logits/rejected": -1.5287015438079834, + "logps/chosen": -2.3509488105773926, + "logps/rejected": -2.640751361846924, + "loss": 1.7576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.509485244750977, + "rewards/margins": 2.8980259895324707, + "rewards/rejected": -26.40751075744629, + "step": 23975 + }, + { + "epoch": 0.8082510364353366, + "grad_norm": 57.712249755859375, + "learning_rate": 1.0788204900633196e-07, + "logits/chosen": -1.449377417564392, + "logits/rejected": -1.6721227169036865, + "logps/chosen": -2.294976234436035, + "logps/rejected": -2.4277760982513428, + "loss": 3.0375, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.94976234436035, + "rewards/margins": 1.3279987573623657, + "rewards/rejected": -24.277761459350586, + "step": 23980 + }, + { + "epoch": 0.8084195625063197, + "grad_norm": 17.932680130004883, + "learning_rate": 1.0769961730679844e-07, + "logits/chosen": -1.9043604135513306, + "logits/rejected": -2.0780739784240723, + "logps/chosen": -2.340055227279663, + "logps/rejected": -2.5520808696746826, + "loss": 3.3177, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.400554656982422, + "rewards/margins": 2.120255708694458, + "rewards/rejected": -25.520809173583984, + "step": 23985 + }, + { + "epoch": 0.8085880885773029, + "grad_norm": 29.642797470092773, + "learning_rate": 1.0751732136697134e-07, + "logits/chosen": -1.8419221639633179, + "logits/rejected": -2.1341745853424072, + "logps/chosen": -2.4016568660736084, + "logps/rejected": -2.761948347091675, + "loss": 2.234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.01656723022461, + "rewards/margins": 3.6029136180877686, + "rewards/rejected": -27.619482040405273, + "step": 23990 + }, + { + "epoch": 0.8087566146482861, + "grad_norm": 34.91474151611328, + "learning_rate": 1.0733516124993625e-07, + "logits/chosen": -2.087256908416748, + "logits/rejected": -2.281942129135132, + "logps/chosen": -2.0903306007385254, + "logps/rejected": -2.9100751876831055, + "loss": 1.656, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -20.903305053710938, + "rewards/margins": 8.1974458694458, + "rewards/rejected": -29.100749969482422, + "step": 23995 + }, + { + "epoch": 0.8089251407192692, + "grad_norm": 30.762310028076172, + "learning_rate": 1.0715313701873135e-07, + "logits/chosen": -1.854391098022461, + "logits/rejected": -2.0410971641540527, + "logps/chosen": -2.2059502601623535, + "logps/rejected": -2.372504711151123, + "loss": 3.9705, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.05950164794922, + "rewards/margins": 1.6655467748641968, + "rewards/rejected": -23.725046157836914, + "step": 24000 + }, + { + "epoch": 0.8089251407192692, + "eval_logits/chosen": -2.2840046882629395, + "eval_logits/rejected": -2.4597790241241455, + "eval_logps/chosen": -2.2764413356781006, + "eval_logps/rejected": -2.4307594299316406, + "eval_loss": 3.080594539642334, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.764413833618164, + "eval_rewards/margins": 1.5431796312332153, + "eval_rewards/rejected": -24.307592391967773, + "eval_runtime": 12.8927, + "eval_samples_per_second": 7.756, + "eval_steps_per_second": 1.939, + "step": 24000 + }, + { + "epoch": 0.8090936667902524, + "grad_norm": 76.99275970458984, + "learning_rate": 1.0697124873634816e-07, + "logits/chosen": -1.6733137369155884, + "logits/rejected": -1.85502028465271, + "logps/chosen": -2.536296844482422, + "logps/rejected": -2.6223788261413574, + "loss": 2.8727, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.36296844482422, + "rewards/margins": 0.8608211278915405, + "rewards/rejected": -26.22378921508789, + "step": 24005 + }, + { + "epoch": 0.8092621928612357, + "grad_norm": 25.34918212890625, + "learning_rate": 1.0678949646573104e-07, + "logits/chosen": -2.3985438346862793, + "logits/rejected": -2.3265886306762695, + "logps/chosen": -2.819058656692505, + "logps/rejected": -2.8983523845672607, + "loss": 4.5749, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -28.190587997436523, + "rewards/margins": 0.7929363250732422, + "rewards/rejected": -28.983524322509766, + "step": 24010 + }, + { + "epoch": 0.8094307189322189, + "grad_norm": 51.82828903198242, + "learning_rate": 1.0660788026977735e-07, + "logits/chosen": -2.010450839996338, + "logits/rejected": -1.9830372333526611, + "logps/chosen": -2.299954891204834, + "logps/rejected": -2.283881664276123, + "loss": 3.8194, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.99955177307129, + "rewards/margins": -0.16073504090309143, + "rewards/rejected": -22.838817596435547, + "step": 24015 + }, + { + "epoch": 0.809599245003202, + "grad_norm": 3.4491851329803467, + "learning_rate": 1.0642640021133742e-07, + "logits/chosen": -1.538646936416626, + "logits/rejected": -1.7420097589492798, + "logps/chosen": -3.4108078479766846, + "logps/rejected": -4.116189479827881, + "loss": 0.9612, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -34.10807418823242, + "rewards/margins": 7.053819179534912, + "rewards/rejected": -41.161895751953125, + "step": 24020 + }, + { + "epoch": 0.8097677710741852, + "grad_norm": 9.866703033447266, + "learning_rate": 1.0624505635321406e-07, + "logits/chosen": -2.0352749824523926, + "logits/rejected": -2.267897129058838, + "logps/chosen": -2.5204594135284424, + "logps/rejected": -3.058020830154419, + "loss": 1.6614, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.20458984375, + "rewards/margins": 5.375618934631348, + "rewards/rejected": -30.580211639404297, + "step": 24025 + }, + { + "epoch": 0.8099362971451683, + "grad_norm": 63.24565124511719, + "learning_rate": 1.0606384875816332e-07, + "logits/chosen": -2.1843369007110596, + "logits/rejected": -2.30873966217041, + "logps/chosen": -2.49751615524292, + "logps/rejected": -2.7526285648345947, + "loss": 3.4675, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.975162506103516, + "rewards/margins": 2.5511229038238525, + "rewards/rejected": -27.526287078857422, + "step": 24030 + }, + { + "epoch": 0.8101048232161515, + "grad_norm": 35.56220245361328, + "learning_rate": 1.0588277748889412e-07, + "logits/chosen": -1.576695442199707, + "logits/rejected": -1.7100387811660767, + "logps/chosen": -2.1149020195007324, + "logps/rejected": -2.810107707977295, + "loss": 2.6821, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.14902114868164, + "rewards/margins": 6.952054500579834, + "rewards/rejected": -28.10107421875, + "step": 24035 + }, + { + "epoch": 0.8102733492871347, + "grad_norm": 37.3370246887207, + "learning_rate": 1.0570184260806802e-07, + "logits/chosen": -1.415395975112915, + "logits/rejected": -1.6331355571746826, + "logps/chosen": -2.5750272274017334, + "logps/rejected": -2.3799424171447754, + "loss": 5.4584, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.750274658203125, + "rewards/margins": -1.9508476257324219, + "rewards/rejected": -23.799427032470703, + "step": 24040 + }, + { + "epoch": 0.8104418753581178, + "grad_norm": 47.88873291015625, + "learning_rate": 1.0552104417829944e-07, + "logits/chosen": -1.9850772619247437, + "logits/rejected": -2.4515490531921387, + "logps/chosen": -2.7267796993255615, + "logps/rejected": -3.2105274200439453, + "loss": 1.6153, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.26779556274414, + "rewards/margins": 4.8374786376953125, + "rewards/rejected": -32.10527801513672, + "step": 24045 + }, + { + "epoch": 0.8106104014291011, + "grad_norm": 29.122587203979492, + "learning_rate": 1.0534038226215574e-07, + "logits/chosen": -2.318523406982422, + "logits/rejected": -2.468736410140991, + "logps/chosen": -3.361487865447998, + "logps/rejected": -3.160301923751831, + "loss": 6.1576, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -33.6148796081543, + "rewards/margins": -2.0118582248687744, + "rewards/rejected": -31.603023529052734, + "step": 24050 + }, + { + "epoch": 0.8107789275000843, + "grad_norm": 80.43611907958984, + "learning_rate": 1.0515985692215667e-07, + "logits/chosen": -2.533133029937744, + "logits/rejected": -2.5773842334747314, + "logps/chosen": -3.062659740447998, + "logps/rejected": -3.4902923107147217, + "loss": 2.0103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.626602172851562, + "rewards/margins": 4.2763237953186035, + "rewards/rejected": -34.902923583984375, + "step": 24055 + }, + { + "epoch": 0.8109474535710675, + "grad_norm": 33.907955169677734, + "learning_rate": 1.0497946822077503e-07, + "logits/chosen": -2.0659396648406982, + "logits/rejected": -2.092923641204834, + "logps/chosen": -2.0031232833862305, + "logps/rejected": -2.340515375137329, + "loss": 3.295, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.031230926513672, + "rewards/margins": 3.37391996383667, + "rewards/rejected": -23.405153274536133, + "step": 24060 + }, + { + "epoch": 0.8111159796420506, + "grad_norm": 6.4482102394104, + "learning_rate": 1.0479921622043642e-07, + "logits/chosen": -2.037416934967041, + "logits/rejected": -2.331026792526245, + "logps/chosen": -2.0453298091888428, + "logps/rejected": -3.061678409576416, + "loss": 1.5471, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.453298568725586, + "rewards/margins": 10.163487434387207, + "rewards/rejected": -30.616785049438477, + "step": 24065 + }, + { + "epoch": 0.8112845057130338, + "grad_norm": 0.9323909878730774, + "learning_rate": 1.0461910098351862e-07, + "logits/chosen": -1.7518703937530518, + "logits/rejected": -2.306114435195923, + "logps/chosen": -2.4827165603637695, + "logps/rejected": -3.222071886062622, + "loss": 1.9647, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.827163696289062, + "rewards/margins": 7.3935546875, + "rewards/rejected": -32.22071838378906, + "step": 24070 + }, + { + "epoch": 0.811453031784017, + "grad_norm": 28.054025650024414, + "learning_rate": 1.044391225723526e-07, + "logits/chosen": -2.0947256088256836, + "logits/rejected": -2.361001491546631, + "logps/chosen": -3.1201844215393066, + "logps/rejected": -3.464200258255005, + "loss": 1.8679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.201847076416016, + "rewards/margins": 3.4401588439941406, + "rewards/rejected": -34.642005920410156, + "step": 24075 + }, + { + "epoch": 0.8116215578550001, + "grad_norm": 19.00840187072754, + "learning_rate": 1.0425928104922171e-07, + "logits/chosen": -1.4683290719985962, + "logits/rejected": -1.6970717906951904, + "logps/chosen": -2.109851360321045, + "logps/rejected": -2.150763750076294, + "loss": 3.2336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.098514556884766, + "rewards/margins": 0.40912121534347534, + "rewards/rejected": -21.50763511657715, + "step": 24080 + }, + { + "epoch": 0.8117900839259834, + "grad_norm": 36.81915283203125, + "learning_rate": 1.0407957647636229e-07, + "logits/chosen": -2.2459583282470703, + "logits/rejected": -2.238729953765869, + "logps/chosen": -2.171417713165283, + "logps/rejected": -2.372509002685547, + "loss": 2.351, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.71417808532715, + "rewards/margins": 2.010913372039795, + "rewards/rejected": -23.725093841552734, + "step": 24085 + }, + { + "epoch": 0.8119586099969666, + "grad_norm": 48.59904861450195, + "learning_rate": 1.0390000891596268e-07, + "logits/chosen": -1.8941738605499268, + "logits/rejected": -2.4483718872070312, + "logps/chosen": -1.977226972579956, + "logps/rejected": -2.3452813625335693, + "loss": 2.1481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.77227210998535, + "rewards/margins": 3.6805419921875, + "rewards/rejected": -23.45281410217285, + "step": 24090 + }, + { + "epoch": 0.8121271360679497, + "grad_norm": 25.916887283325195, + "learning_rate": 1.0372057843016424e-07, + "logits/chosen": -1.7557846307754517, + "logits/rejected": -2.196135997772217, + "logps/chosen": -2.585275650024414, + "logps/rejected": -3.183326005935669, + "loss": 2.5426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.852752685546875, + "rewards/margins": 5.980503082275391, + "rewards/rejected": -31.8332576751709, + "step": 24095 + }, + { + "epoch": 0.8122956621389329, + "grad_norm": 30.466371536254883, + "learning_rate": 1.0354128508106098e-07, + "logits/chosen": -2.0006163120269775, + "logits/rejected": -2.707191228866577, + "logps/chosen": -2.314324140548706, + "logps/rejected": -2.785937547683716, + "loss": 2.3359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.14324188232422, + "rewards/margins": 4.716134071350098, + "rewards/rejected": -27.859375, + "step": 24100 + }, + { + "epoch": 0.8124641882099161, + "grad_norm": 94.53244018554688, + "learning_rate": 1.0336212893069895e-07, + "logits/chosen": -2.5970489978790283, + "logits/rejected": -2.78641939163208, + "logps/chosen": -3.6945462226867676, + "logps/rejected": -3.814669132232666, + "loss": 6.3957, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -36.94546127319336, + "rewards/margins": 1.2012335062026978, + "rewards/rejected": -38.14669418334961, + "step": 24105 + }, + { + "epoch": 0.8126327142808992, + "grad_norm": 37.557125091552734, + "learning_rate": 1.0318311004107716e-07, + "logits/chosen": -1.805131196975708, + "logits/rejected": -1.9349620342254639, + "logps/chosen": -2.4366652965545654, + "logps/rejected": -2.357182264328003, + "loss": 4.5073, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.366655349731445, + "rewards/margins": -0.7948305010795593, + "rewards/rejected": -23.571823120117188, + "step": 24110 + }, + { + "epoch": 0.8128012403518824, + "grad_norm": 1.4744274616241455, + "learning_rate": 1.0300422847414708e-07, + "logits/chosen": -2.667433738708496, + "logits/rejected": -2.4561023712158203, + "logps/chosen": -3.353752851486206, + "logps/rejected": -3.409027576446533, + "loss": 4.8988, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.53752899169922, + "rewards/margins": 0.5527437329292297, + "rewards/rejected": -34.09027862548828, + "step": 24115 + }, + { + "epoch": 0.8129697664228657, + "grad_norm": 41.911800384521484, + "learning_rate": 1.0282548429181265e-07, + "logits/chosen": -1.0657398700714111, + "logits/rejected": -1.0967696905136108, + "logps/chosen": -1.9195921421051025, + "logps/rejected": -2.0219099521636963, + "loss": 3.1227, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.195920944213867, + "rewards/margins": 1.023177981376648, + "rewards/rejected": -20.219097137451172, + "step": 24120 + }, + { + "epoch": 0.8131382924938488, + "grad_norm": 22.111970901489258, + "learning_rate": 1.0264687755592987e-07, + "logits/chosen": -1.5700323581695557, + "logits/rejected": -1.79294753074646, + "logps/chosen": -2.4795994758605957, + "logps/rejected": -2.7823715209960938, + "loss": 1.6587, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.79599380493164, + "rewards/margins": 3.027724027633667, + "rewards/rejected": -27.823715209960938, + "step": 24125 + }, + { + "epoch": 0.813306818564832, + "grad_norm": 86.154052734375, + "learning_rate": 1.0246840832830772e-07, + "logits/chosen": -1.6293919086456299, + "logits/rejected": -1.7340667247772217, + "logps/chosen": -2.80242657661438, + "logps/rejected": -2.5946459770202637, + "loss": 5.1913, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.02426528930664, + "rewards/margins": -2.077803134918213, + "rewards/rejected": -25.946462631225586, + "step": 24130 + }, + { + "epoch": 0.8134753446358152, + "grad_norm": 34.58270263671875, + "learning_rate": 1.0229007667070743e-07, + "logits/chosen": -1.5737640857696533, + "logits/rejected": -1.3462755680084229, + "logps/chosen": -2.6154274940490723, + "logps/rejected": -2.8824868202209473, + "loss": 2.1968, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.15427589416504, + "rewards/margins": 2.6705925464630127, + "rewards/rejected": -28.82486915588379, + "step": 24135 + }, + { + "epoch": 0.8136438707067983, + "grad_norm": 191.56494140625, + "learning_rate": 1.0211188264484233e-07, + "logits/chosen": -1.8556255102157593, + "logits/rejected": -1.7768996953964233, + "logps/chosen": -3.0176990032196045, + "logps/rejected": -3.110203504562378, + "loss": 4.2338, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.176992416381836, + "rewards/margins": 0.9250432848930359, + "rewards/rejected": -31.102035522460938, + "step": 24140 + }, + { + "epoch": 0.8138123967777815, + "grad_norm": 52.166526794433594, + "learning_rate": 1.0193382631237851e-07, + "logits/chosen": -1.8037277460098267, + "logits/rejected": -1.9889767169952393, + "logps/chosen": -2.7616565227508545, + "logps/rejected": -2.6427969932556152, + "loss": 5.5955, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.616565704345703, + "rewards/margins": -1.1885948181152344, + "rewards/rejected": -26.427968978881836, + "step": 24145 + }, + { + "epoch": 0.8139809228487647, + "grad_norm": 26.984853744506836, + "learning_rate": 1.0175590773493431e-07, + "logits/chosen": -2.0901906490325928, + "logits/rejected": -1.9249852895736694, + "logps/chosen": -2.3587594032287598, + "logps/rejected": -2.214332342147827, + "loss": 5.0239, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.58759117126465, + "rewards/margins": -1.4442687034606934, + "rewards/rejected": -22.14332389831543, + "step": 24150 + }, + { + "epoch": 0.8141494489197478, + "grad_norm": 30.049564361572266, + "learning_rate": 1.0157812697408019e-07, + "logits/chosen": -1.4776041507720947, + "logits/rejected": -1.5003935098648071, + "logps/chosen": -2.169267416000366, + "logps/rejected": -2.48649001121521, + "loss": 2.6431, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.692672729492188, + "rewards/margins": 3.172226667404175, + "rewards/rejected": -24.864900588989258, + "step": 24155 + }, + { + "epoch": 0.8143179749907311, + "grad_norm": 43.715599060058594, + "learning_rate": 1.0140048409133906e-07, + "logits/chosen": -1.9859644174575806, + "logits/rejected": -2.3302559852600098, + "logps/chosen": -2.533536195755005, + "logps/rejected": -2.7264256477355957, + "loss": 3.4219, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.33536148071289, + "rewards/margins": 1.9288944005966187, + "rewards/rejected": -27.264257431030273, + "step": 24160 + }, + { + "epoch": 0.8144865010617143, + "grad_norm": 36.16670608520508, + "learning_rate": 1.0122297914818623e-07, + "logits/chosen": -2.3879990577697754, + "logits/rejected": -2.2910168170928955, + "logps/chosen": -3.131276845932007, + "logps/rejected": -3.8100860118865967, + "loss": 3.3137, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.312768936157227, + "rewards/margins": 6.788092136383057, + "rewards/rejected": -38.100860595703125, + "step": 24165 + }, + { + "epoch": 0.8146550271326974, + "grad_norm": 40.567203521728516, + "learning_rate": 1.0104561220604913e-07, + "logits/chosen": -2.137629270553589, + "logits/rejected": -2.922985553741455, + "logps/chosen": -2.1473476886749268, + "logps/rejected": -2.918635606765747, + "loss": 2.4425, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.47347640991211, + "rewards/margins": 7.712876796722412, + "rewards/rejected": -29.186355590820312, + "step": 24170 + }, + { + "epoch": 0.8148235532036806, + "grad_norm": 183.20619201660156, + "learning_rate": 1.0086838332630743e-07, + "logits/chosen": -2.144543409347534, + "logits/rejected": -2.3791539669036865, + "logps/chosen": -2.6417734622955322, + "logps/rejected": -2.595959186553955, + "loss": 3.6555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.417734146118164, + "rewards/margins": -0.45814236998558044, + "rewards/rejected": -25.959590911865234, + "step": 24175 + }, + { + "epoch": 0.8149920792746638, + "grad_norm": 54.81478500366211, + "learning_rate": 1.0069129257029313e-07, + "logits/chosen": -1.75040602684021, + "logits/rejected": -2.152054786682129, + "logps/chosen": -2.6521573066711426, + "logps/rejected": -2.500854969024658, + "loss": 4.9655, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.52157211303711, + "rewards/margins": -1.5130245685577393, + "rewards/rejected": -25.008548736572266, + "step": 24180 + }, + { + "epoch": 0.8151606053456469, + "grad_norm": 25.453004837036133, + "learning_rate": 1.0051433999929049e-07, + "logits/chosen": -1.3161516189575195, + "logits/rejected": -1.4525598287582397, + "logps/chosen": -2.7192330360412598, + "logps/rejected": -3.028667449951172, + "loss": 1.4393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.19232749938965, + "rewards/margins": 3.094343900680542, + "rewards/rejected": -30.28667640686035, + "step": 24185 + }, + { + "epoch": 0.8153291314166301, + "grad_norm": 17.820598602294922, + "learning_rate": 1.0033752567453551e-07, + "logits/chosen": -1.4301345348358154, + "logits/rejected": -1.5557069778442383, + "logps/chosen": -2.266484498977661, + "logps/rejected": -2.372802972793579, + "loss": 2.297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.664844512939453, + "rewards/margins": 1.0631874799728394, + "rewards/rejected": -23.728031158447266, + "step": 24190 + }, + { + "epoch": 0.8154976574876134, + "grad_norm": 114.78311157226562, + "learning_rate": 1.0016084965721682e-07, + "logits/chosen": -1.9529441595077515, + "logits/rejected": -1.9129083156585693, + "logps/chosen": -2.329055070877075, + "logps/rejected": -2.5822417736053467, + "loss": 3.8328, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.29054832458496, + "rewards/margins": 2.5318689346313477, + "rewards/rejected": -25.82242202758789, + "step": 24195 + }, + { + "epoch": 0.8156661835585965, + "grad_norm": 50.35905838012695, + "learning_rate": 9.998431200847506e-08, + "logits/chosen": -1.9478317499160767, + "logits/rejected": -1.6423200368881226, + "logps/chosen": -2.32122540473938, + "logps/rejected": -2.347656726837158, + "loss": 2.9022, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.212251663208008, + "rewards/margins": 0.26431483030319214, + "rewards/rejected": -23.476566314697266, + "step": 24200 + }, + { + "epoch": 0.8158347096295797, + "grad_norm": 142.4918975830078, + "learning_rate": 9.980791278940304e-08, + "logits/chosen": -2.4216580390930176, + "logits/rejected": -2.449982166290283, + "logps/chosen": -2.682478666305542, + "logps/rejected": -2.609151840209961, + "loss": 4.4322, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.824787139892578, + "rewards/margins": -0.7332667112350464, + "rewards/rejected": -26.09151840209961, + "step": 24205 + }, + { + "epoch": 0.8160032357005629, + "grad_norm": 29.56013298034668, + "learning_rate": 9.963165206104529e-08, + "logits/chosen": -1.9350833892822266, + "logits/rejected": -1.5915058851242065, + "logps/chosen": -2.3900949954986572, + "logps/rejected": -1.978994607925415, + "loss": 7.1794, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.90094566345215, + "rewards/margins": -4.111001491546631, + "rewards/rejected": -19.789945602416992, + "step": 24210 + }, + { + "epoch": 0.816171761771546, + "grad_norm": 27.79477882385254, + "learning_rate": 9.945552988439893e-08, + "logits/chosen": -1.9351444244384766, + "logits/rejected": -1.809313416481018, + "logps/chosen": -3.3555362224578857, + "logps/rejected": -3.6098411083221436, + "loss": 2.7335, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.555362701416016, + "rewards/margins": 2.543045997619629, + "rewards/rejected": -36.09840774536133, + "step": 24215 + }, + { + "epoch": 0.8163402878425292, + "grad_norm": 33.100006103515625, + "learning_rate": 9.927954632041297e-08, + "logits/chosen": -1.5907175540924072, + "logits/rejected": -1.7621221542358398, + "logps/chosen": -1.743699312210083, + "logps/rejected": -2.0336194038391113, + "loss": 1.9667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.43699073791504, + "rewards/margins": 2.899199962615967, + "rewards/rejected": -20.336193084716797, + "step": 24220 + }, + { + "epoch": 0.8165088139135124, + "grad_norm": 31.314096450805664, + "learning_rate": 9.910370142998814e-08, + "logits/chosen": -2.312352418899536, + "logits/rejected": -2.5222764015197754, + "logps/chosen": -2.5663654804229736, + "logps/rejected": -3.113084554672241, + "loss": 2.6303, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.663654327392578, + "rewards/margins": 5.467194557189941, + "rewards/rejected": -31.130847930908203, + "step": 24225 + }, + { + "epoch": 0.8166773399844957, + "grad_norm": 52.19409942626953, + "learning_rate": 9.892799527397755e-08, + "logits/chosen": -1.5479198694229126, + "logits/rejected": -1.8709392547607422, + "logps/chosen": -2.036996603012085, + "logps/rejected": -2.095541477203369, + "loss": 2.6826, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.36996841430664, + "rewards/margins": 0.5854486227035522, + "rewards/rejected": -20.95541763305664, + "step": 24230 + }, + { + "epoch": 0.8168458660554788, + "grad_norm": 19.438608169555664, + "learning_rate": 9.875242791318622e-08, + "logits/chosen": -1.9275665283203125, + "logits/rejected": -2.6109588146209717, + "logps/chosen": -2.3209636211395264, + "logps/rejected": -3.3178200721740723, + "loss": 2.6408, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.209636688232422, + "rewards/margins": 9.968561172485352, + "rewards/rejected": -33.178199768066406, + "step": 24235 + }, + { + "epoch": 0.817014392126462, + "grad_norm": 50.3494873046875, + "learning_rate": 9.857699940837116e-08, + "logits/chosen": -1.8657255172729492, + "logits/rejected": -2.113823413848877, + "logps/chosen": -1.711350679397583, + "logps/rejected": -1.7632169723510742, + "loss": 2.9808, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.113506317138672, + "rewards/margins": 0.5186625719070435, + "rewards/rejected": -17.63216781616211, + "step": 24240 + }, + { + "epoch": 0.8171829181974452, + "grad_norm": 27.25106430053711, + "learning_rate": 9.84017098202411e-08, + "logits/chosen": -1.7077363729476929, + "logits/rejected": -2.0198585987091064, + "logps/chosen": -2.3715415000915527, + "logps/rejected": -2.9775185585021973, + "loss": 2.3465, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.715417861938477, + "rewards/margins": 6.05977201461792, + "rewards/rejected": -29.775188446044922, + "step": 24245 + }, + { + "epoch": 0.8173514442684283, + "grad_norm": 57.114891052246094, + "learning_rate": 9.822655920945689e-08, + "logits/chosen": -1.7086464166641235, + "logits/rejected": -1.7238966226577759, + "logps/chosen": -1.6781396865844727, + "logps/rejected": -1.5659065246582031, + "loss": 4.1638, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -16.781396865844727, + "rewards/margins": -1.122330904006958, + "rewards/rejected": -15.659067153930664, + "step": 24250 + }, + { + "epoch": 0.8175199703394115, + "grad_norm": 23.778400421142578, + "learning_rate": 9.805154763663143e-08, + "logits/chosen": -2.0748565196990967, + "logits/rejected": -2.5501840114593506, + "logps/chosen": -3.339693069458008, + "logps/rejected": -4.343489646911621, + "loss": 1.7495, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -33.39692687988281, + "rewards/margins": 10.037964820861816, + "rewards/rejected": -43.43489456176758, + "step": 24255 + }, + { + "epoch": 0.8176884964103946, + "grad_norm": 21.111228942871094, + "learning_rate": 9.787667516232906e-08, + "logits/chosen": -1.0510246753692627, + "logits/rejected": -1.7527587413787842, + "logps/chosen": -2.5654733180999756, + "logps/rejected": -3.7719593048095703, + "loss": 1.7378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.654733657836914, + "rewards/margins": 12.064860343933105, + "rewards/rejected": -37.71959686279297, + "step": 24260 + }, + { + "epoch": 0.8178570224813778, + "grad_norm": 16.419200897216797, + "learning_rate": 9.770194184706637e-08, + "logits/chosen": -2.0978894233703613, + "logits/rejected": -2.425337553024292, + "logps/chosen": -2.3975110054016113, + "logps/rejected": -3.037055253982544, + "loss": 2.1612, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.975107192993164, + "rewards/margins": 6.395442008972168, + "rewards/rejected": -30.37055015563965, + "step": 24265 + }, + { + "epoch": 0.8180255485523611, + "grad_norm": 38.14176940917969, + "learning_rate": 9.752734775131171e-08, + "logits/chosen": -1.8208221197128296, + "logits/rejected": -1.9604995250701904, + "logps/chosen": -1.9669945240020752, + "logps/rejected": -1.8961633443832397, + "loss": 3.8409, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.669946670532227, + "rewards/margins": -0.7083131670951843, + "rewards/rejected": -18.961633682250977, + "step": 24270 + }, + { + "epoch": 0.8181940746233443, + "grad_norm": 47.77272033691406, + "learning_rate": 9.735289293548537e-08, + "logits/chosen": -2.3018887042999268, + "logits/rejected": -2.4231314659118652, + "logps/chosen": -2.951404094696045, + "logps/rejected": -2.9356772899627686, + "loss": 4.2997, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.514041900634766, + "rewards/margins": -0.15726737678050995, + "rewards/rejected": -29.356775283813477, + "step": 24275 + }, + { + "epoch": 0.8183626006943274, + "grad_norm": 1.0143815279006958, + "learning_rate": 9.717857745995894e-08, + "logits/chosen": -1.3401901721954346, + "logits/rejected": -1.683653473854065, + "logps/chosen": -2.7770957946777344, + "logps/rejected": -3.4555981159210205, + "loss": 1.8986, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.770959854125977, + "rewards/margins": 6.785020351409912, + "rewards/rejected": -34.55597686767578, + "step": 24280 + }, + { + "epoch": 0.8185311267653106, + "grad_norm": 82.10999298095703, + "learning_rate": 9.700440138505633e-08, + "logits/chosen": -1.3875019550323486, + "logits/rejected": -1.4035696983337402, + "logps/chosen": -2.593018054962158, + "logps/rejected": -2.754159927368164, + "loss": 3.7198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.930179595947266, + "rewards/margins": 1.6114212274551392, + "rewards/rejected": -27.541601181030273, + "step": 24285 + }, + { + "epoch": 0.8186996528362938, + "grad_norm": 32.000885009765625, + "learning_rate": 9.683036477105316e-08, + "logits/chosen": -2.18599796295166, + "logits/rejected": -2.2437338829040527, + "logps/chosen": -2.5854225158691406, + "logps/rejected": -2.7304766178131104, + "loss": 3.1102, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.854223251342773, + "rewards/margins": 1.4505420923233032, + "rewards/rejected": -27.304767608642578, + "step": 24290 + }, + { + "epoch": 0.8188681789072769, + "grad_norm": 28.451539993286133, + "learning_rate": 9.665646767817636e-08, + "logits/chosen": -2.0066299438476562, + "logits/rejected": -2.0902113914489746, + "logps/chosen": -2.9684078693389893, + "logps/rejected": -3.7975857257843018, + "loss": 2.2862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.6840763092041, + "rewards/margins": 8.291781425476074, + "rewards/rejected": -37.97585678100586, + "step": 24295 + }, + { + "epoch": 0.8190367049782601, + "grad_norm": 24.359365463256836, + "learning_rate": 9.648271016660503e-08, + "logits/chosen": -2.2018818855285645, + "logits/rejected": -2.1886439323425293, + "logps/chosen": -2.38197660446167, + "logps/rejected": -2.835458278656006, + "loss": 2.2134, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.819766998291016, + "rewards/margins": 4.53481388092041, + "rewards/rejected": -28.35457992553711, + "step": 24300 + }, + { + "epoch": 0.8192052310492434, + "grad_norm": 19.370195388793945, + "learning_rate": 9.630909229646972e-08, + "logits/chosen": -1.8896777629852295, + "logits/rejected": -2.023592472076416, + "logps/chosen": -1.7500760555267334, + "logps/rejected": -1.8328708410263062, + "loss": 2.6741, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.500761032104492, + "rewards/margins": 0.8279494047164917, + "rewards/rejected": -18.328710556030273, + "step": 24305 + }, + { + "epoch": 0.8193737571202265, + "grad_norm": 30.398090362548828, + "learning_rate": 9.613561412785277e-08, + "logits/chosen": -2.4260175228118896, + "logits/rejected": -2.715618133544922, + "logps/chosen": -2.8365581035614014, + "logps/rejected": -2.6541521549224854, + "loss": 5.9238, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.365581512451172, + "rewards/margins": -1.8240602016448975, + "rewards/rejected": -26.541522979736328, + "step": 24310 + }, + { + "epoch": 0.8195422831912097, + "grad_norm": 71.60824584960938, + "learning_rate": 9.596227572078819e-08, + "logits/chosen": -2.237004041671753, + "logits/rejected": -2.439582109451294, + "logps/chosen": -2.3587868213653564, + "logps/rejected": -2.8456122875213623, + "loss": 1.5354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.587865829467773, + "rewards/margins": 4.868255615234375, + "rewards/rejected": -28.45612144470215, + "step": 24315 + }, + { + "epoch": 0.8197108092621929, + "grad_norm": 24.230173110961914, + "learning_rate": 9.578907713526163e-08, + "logits/chosen": -2.4077305793762207, + "logits/rejected": -2.8008816242218018, + "logps/chosen": -1.9239914417266846, + "logps/rejected": -2.4241859912872314, + "loss": 2.336, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.239913940429688, + "rewards/margins": 5.0019450187683105, + "rewards/rejected": -24.241859436035156, + "step": 24320 + }, + { + "epoch": 0.819879335333176, + "grad_norm": 17.881893157958984, + "learning_rate": 9.561601843121003e-08, + "logits/chosen": -2.061577320098877, + "logits/rejected": -2.34552264213562, + "logps/chosen": -3.029741048812866, + "logps/rejected": -3.769167423248291, + "loss": 1.8675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.297412872314453, + "rewards/margins": 7.394262790679932, + "rewards/rejected": -37.691673278808594, + "step": 24325 + }, + { + "epoch": 0.8200478614041592, + "grad_norm": 45.83840560913086, + "learning_rate": 9.544309966852243e-08, + "logits/chosen": -1.9961233139038086, + "logits/rejected": -2.096349000930786, + "logps/chosen": -2.4318995475769043, + "logps/rejected": -2.354107618331909, + "loss": 4.9792, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.318994522094727, + "rewards/margins": -0.7779159545898438, + "rewards/rejected": -23.54107666015625, + "step": 24330 + }, + { + "epoch": 0.8202163874751424, + "grad_norm": 19.41057777404785, + "learning_rate": 9.527032090703913e-08, + "logits/chosen": -1.7249418497085571, + "logits/rejected": -1.6345096826553345, + "logps/chosen": -1.8474910259246826, + "logps/rejected": -2.0146989822387695, + "loss": 2.3606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.474910736083984, + "rewards/margins": 1.672079086303711, + "rewards/rejected": -20.146991729736328, + "step": 24335 + }, + { + "epoch": 0.8203849135461256, + "grad_norm": 202.65652465820312, + "learning_rate": 9.509768220655201e-08, + "logits/chosen": -2.2872161865234375, + "logits/rejected": -2.4015910625457764, + "logps/chosen": -3.318181276321411, + "logps/rejected": -3.330636501312256, + "loss": 6.8898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.18181228637695, + "rewards/margins": 0.12455234676599503, + "rewards/rejected": -33.306365966796875, + "step": 24340 + }, + { + "epoch": 0.8205534396171088, + "grad_norm": 44.25444412231445, + "learning_rate": 9.492518362680469e-08, + "logits/chosen": -2.0388379096984863, + "logits/rejected": -2.184521436691284, + "logps/chosen": -2.8452885150909424, + "logps/rejected": -3.5608794689178467, + "loss": 2.6052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.452880859375, + "rewards/margins": 7.155913352966309, + "rewards/rejected": -35.608795166015625, + "step": 24345 + }, + { + "epoch": 0.820721965688092, + "grad_norm": 21.56203269958496, + "learning_rate": 9.475282522749189e-08, + "logits/chosen": -1.3232519626617432, + "logits/rejected": -1.813665747642517, + "logps/chosen": -2.3560421466827393, + "logps/rejected": -2.6592459678649902, + "loss": 2.2359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.560420989990234, + "rewards/margins": 3.032042980194092, + "rewards/rejected": -26.59246253967285, + "step": 24350 + }, + { + "epoch": 0.8208904917590751, + "grad_norm": 31.2237606048584, + "learning_rate": 9.458060706826021e-08, + "logits/chosen": -1.8143165111541748, + "logits/rejected": -1.8684632778167725, + "logps/chosen": -2.4288649559020996, + "logps/rejected": -2.423241376876831, + "loss": 3.5141, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.288654327392578, + "rewards/margins": -0.05624275282025337, + "rewards/rejected": -24.232410430908203, + "step": 24355 + }, + { + "epoch": 0.8210590178300583, + "grad_norm": 14.442959785461426, + "learning_rate": 9.440852920870762e-08, + "logits/chosen": -2.293940305709839, + "logits/rejected": -2.269188404083252, + "logps/chosen": -2.117640256881714, + "logps/rejected": -2.502702236175537, + "loss": 2.5759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.176403045654297, + "rewards/margins": 3.8506178855895996, + "rewards/rejected": -25.027021408081055, + "step": 24360 + }, + { + "epoch": 0.8212275439010415, + "grad_norm": 21.63381576538086, + "learning_rate": 9.423659170838327e-08, + "logits/chosen": -1.926725149154663, + "logits/rejected": -2.1903584003448486, + "logps/chosen": -2.3233580589294434, + "logps/rejected": -2.667724132537842, + "loss": 2.1465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.233577728271484, + "rewards/margins": 3.443665027618408, + "rewards/rejected": -26.677242279052734, + "step": 24365 + }, + { + "epoch": 0.8213960699720246, + "grad_norm": 101.54517364501953, + "learning_rate": 9.406479462678812e-08, + "logits/chosen": -2.2762808799743652, + "logits/rejected": -2.214491605758667, + "logps/chosen": -2.777794599533081, + "logps/rejected": -2.526498317718506, + "loss": 6.0451, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.7779483795166, + "rewards/margins": -2.512964963912964, + "rewards/rejected": -25.26498031616211, + "step": 24370 + }, + { + "epoch": 0.8215645960430078, + "grad_norm": 11.986262321472168, + "learning_rate": 9.389313802337434e-08, + "logits/chosen": -2.0726375579833984, + "logits/rejected": -2.5973610877990723, + "logps/chosen": -2.276418685913086, + "logps/rejected": -2.66465425491333, + "loss": 1.4631, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.76418685913086, + "rewards/margins": 3.882354259490967, + "rewards/rejected": -26.64653968811035, + "step": 24375 + }, + { + "epoch": 0.8217331221139911, + "grad_norm": 26.458959579467773, + "learning_rate": 9.372162195754563e-08, + "logits/chosen": -1.9591572284698486, + "logits/rejected": -1.9244375228881836, + "logps/chosen": -2.5298287868499756, + "logps/rejected": -2.657670497894287, + "loss": 2.2302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.298288345336914, + "rewards/margins": 1.2784183025360107, + "rewards/rejected": -26.576705932617188, + "step": 24380 + }, + { + "epoch": 0.8219016481849742, + "grad_norm": 45.323184967041016, + "learning_rate": 9.355024648865673e-08, + "logits/chosen": -1.9862937927246094, + "logits/rejected": -2.0360422134399414, + "logps/chosen": -2.7361838817596436, + "logps/rejected": -3.230208158493042, + "loss": 1.8254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.361841201782227, + "rewards/margins": 4.940243721008301, + "rewards/rejected": -32.302085876464844, + "step": 24385 + }, + { + "epoch": 0.8220701742559574, + "grad_norm": 0.9159602522850037, + "learning_rate": 9.337901167601404e-08, + "logits/chosen": -1.2810310125350952, + "logits/rejected": -1.7395668029785156, + "logps/chosen": -2.1692142486572266, + "logps/rejected": -2.639930248260498, + "loss": 1.484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.692142486572266, + "rewards/margins": 4.707159996032715, + "rewards/rejected": -26.399301528930664, + "step": 24390 + }, + { + "epoch": 0.8222387003269406, + "grad_norm": 29.296329498291016, + "learning_rate": 9.320791757887525e-08, + "logits/chosen": -1.996145248413086, + "logits/rejected": -2.350928544998169, + "logps/chosen": -2.3054845333099365, + "logps/rejected": -2.567873001098633, + "loss": 4.1882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.054845809936523, + "rewards/margins": 2.623884677886963, + "rewards/rejected": -25.678730010986328, + "step": 24395 + }, + { + "epoch": 0.8224072263979237, + "grad_norm": 28.01740074157715, + "learning_rate": 9.303696425644914e-08, + "logits/chosen": -1.6943343877792358, + "logits/rejected": -2.1938259601593018, + "logps/chosen": -2.7483012676239014, + "logps/rejected": -3.2290358543395996, + "loss": 3.5691, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.48301124572754, + "rewards/margins": 4.807345867156982, + "rewards/rejected": -32.29035949707031, + "step": 24400 + }, + { + "epoch": 0.8224072263979237, + "eval_logits/chosen": -2.2856757640838623, + "eval_logits/rejected": -2.4621024131774902, + "eval_logps/chosen": -2.276271104812622, + "eval_logps/rejected": -2.429314613342285, + "eval_loss": 3.0806543827056885, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.762710571289062, + "eval_rewards/margins": 1.5304350852966309, + "eval_rewards/rejected": -24.29314422607422, + "eval_runtime": 12.9037, + "eval_samples_per_second": 7.75, + "eval_steps_per_second": 1.937, + "step": 24400 + }, + { + "epoch": 0.8225757524689069, + "grad_norm": 21.572216033935547, + "learning_rate": 9.286615176789603e-08, + "logits/chosen": -1.6033306121826172, + "logits/rejected": -2.2495296001434326, + "logps/chosen": -2.1205878257751465, + "logps/rejected": -3.219911575317383, + "loss": 1.5082, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.20587921142578, + "rewards/margins": 10.99323844909668, + "rewards/rejected": -32.199119567871094, + "step": 24405 + }, + { + "epoch": 0.8227442785398901, + "grad_norm": 29.8074893951416, + "learning_rate": 9.269548017232731e-08, + "logits/chosen": -1.3725274801254272, + "logits/rejected": -2.1665830612182617, + "logps/chosen": -2.617130994796753, + "logps/rejected": -3.4140186309814453, + "loss": 2.4409, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.171310424804688, + "rewards/margins": 7.968874931335449, + "rewards/rejected": -34.14018249511719, + "step": 24410 + }, + { + "epoch": 0.8229128046108733, + "grad_norm": 367.8448791503906, + "learning_rate": 9.252494952880585e-08, + "logits/chosen": -2.3928165435791016, + "logits/rejected": -2.5799431800842285, + "logps/chosen": -2.7322371006011963, + "logps/rejected": -2.877946376800537, + "loss": 3.0924, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.322372436523438, + "rewards/margins": 1.4570938348770142, + "rewards/rejected": -28.779464721679688, + "step": 24415 + }, + { + "epoch": 0.8230813306818565, + "grad_norm": 30.02773666381836, + "learning_rate": 9.235455989634539e-08, + "logits/chosen": -1.4501005411148071, + "logits/rejected": -1.9619977474212646, + "logps/chosen": -2.1792657375335693, + "logps/rejected": -2.3502280712127686, + "loss": 2.8787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.79265785217285, + "rewards/margins": 1.7096226215362549, + "rewards/rejected": -23.50227928161621, + "step": 24420 + }, + { + "epoch": 0.8232498567528397, + "grad_norm": 23.372652053833008, + "learning_rate": 9.218431133391119e-08, + "logits/chosen": -2.177403211593628, + "logits/rejected": -2.612704038619995, + "logps/chosen": -2.234631061553955, + "logps/rejected": -2.9745616912841797, + "loss": 1.5008, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.3463077545166, + "rewards/margins": 7.399305820465088, + "rewards/rejected": -29.745615005493164, + "step": 24425 + }, + { + "epoch": 0.8234183828238228, + "grad_norm": 33.17243194580078, + "learning_rate": 9.201420390041964e-08, + "logits/chosen": -2.2947487831115723, + "logits/rejected": -2.219944477081299, + "logps/chosen": -2.5415103435516357, + "logps/rejected": -2.7752723693847656, + "loss": 2.4336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.41510581970215, + "rewards/margins": 2.337618350982666, + "rewards/rejected": -27.752721786499023, + "step": 24430 + }, + { + "epoch": 0.823586908894806, + "grad_norm": 93.92717742919922, + "learning_rate": 9.184423765473798e-08, + "logits/chosen": -1.8181613683700562, + "logits/rejected": -2.419252395629883, + "logps/chosen": -3.1017751693725586, + "logps/rejected": -3.9386048316955566, + "loss": 1.7762, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.01775550842285, + "rewards/margins": 8.368292808532715, + "rewards/rejected": -39.386043548583984, + "step": 24435 + }, + { + "epoch": 0.8237554349657892, + "grad_norm": 44.434810638427734, + "learning_rate": 9.167441265568499e-08, + "logits/chosen": -1.3838471174240112, + "logits/rejected": -1.4956175088882446, + "logps/chosen": -2.231163740158081, + "logps/rejected": -2.0925040245056152, + "loss": 4.5103, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.311635971069336, + "rewards/margins": -1.3865975141525269, + "rewards/rejected": -20.925039291381836, + "step": 24440 + }, + { + "epoch": 0.8239239610367723, + "grad_norm": 48.002071380615234, + "learning_rate": 9.150472896203038e-08, + "logits/chosen": -2.1138761043548584, + "logits/rejected": -2.857396364212036, + "logps/chosen": -1.9746402502059937, + "logps/rejected": -2.4951186180114746, + "loss": 2.845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.746402740478516, + "rewards/margins": 5.204786777496338, + "rewards/rejected": -24.951187133789062, + "step": 24445 + }, + { + "epoch": 0.8240924871077556, + "grad_norm": 38.526649475097656, + "learning_rate": 9.133518663249512e-08, + "logits/chosen": -1.917676568031311, + "logits/rejected": -1.9917066097259521, + "logps/chosen": -2.883349895477295, + "logps/rejected": -3.125060558319092, + "loss": 2.1997, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.833499908447266, + "rewards/margins": 2.417109251022339, + "rewards/rejected": -31.250606536865234, + "step": 24450 + }, + { + "epoch": 0.8242610131787388, + "grad_norm": 30.155296325683594, + "learning_rate": 9.11657857257509e-08, + "logits/chosen": -2.3301873207092285, + "logits/rejected": -2.3411762714385986, + "logps/chosen": -2.7758543491363525, + "logps/rejected": -2.8965957164764404, + "loss": 2.9838, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.758544921875, + "rewards/margins": 1.207413911819458, + "rewards/rejected": -28.965957641601562, + "step": 24455 + }, + { + "epoch": 0.824429539249722, + "grad_norm": 56.81505584716797, + "learning_rate": 9.099652630042082e-08, + "logits/chosen": -1.6030937433242798, + "logits/rejected": -1.4450093507766724, + "logps/chosen": -2.240893840789795, + "logps/rejected": -2.38731050491333, + "loss": 2.5314, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.408939361572266, + "rewards/margins": 1.4641635417938232, + "rewards/rejected": -23.873104095458984, + "step": 24460 + }, + { + "epoch": 0.8245980653207051, + "grad_norm": 26.720672607421875, + "learning_rate": 9.082740841507891e-08, + "logits/chosen": -1.6110947132110596, + "logits/rejected": -1.9232873916625977, + "logps/chosen": -2.1672120094299316, + "logps/rejected": -2.193887233734131, + "loss": 3.3855, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.672119140625, + "rewards/margins": 0.266756147146225, + "rewards/rejected": -21.938875198364258, + "step": 24465 + }, + { + "epoch": 0.8247665913916883, + "grad_norm": 8.478957176208496, + "learning_rate": 9.065843212825014e-08, + "logits/chosen": -2.654512405395508, + "logits/rejected": -2.91544508934021, + "logps/chosen": -2.9694647789001465, + "logps/rejected": -3.8186492919921875, + "loss": 1.6066, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.69464683532715, + "rewards/margins": 8.491846084594727, + "rewards/rejected": -38.186492919921875, + "step": 24470 + }, + { + "epoch": 0.8249351174626715, + "grad_norm": 49.12944793701172, + "learning_rate": 9.048959749841067e-08, + "logits/chosen": -1.9129912853240967, + "logits/rejected": -2.236804485321045, + "logps/chosen": -2.4809210300445557, + "logps/rejected": -2.9880154132843018, + "loss": 3.2204, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.8092098236084, + "rewards/margins": 5.070943355560303, + "rewards/rejected": -29.880151748657227, + "step": 24475 + }, + { + "epoch": 0.8251036435336546, + "grad_norm": 43.3686637878418, + "learning_rate": 9.03209045839874e-08, + "logits/chosen": -1.8041359186172485, + "logits/rejected": -2.211690664291382, + "logps/chosen": -2.227994441986084, + "logps/rejected": -2.4465460777282715, + "loss": 3.7016, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.279943466186523, + "rewards/margins": 2.1855177879333496, + "rewards/rejected": -24.4654598236084, + "step": 24480 + }, + { + "epoch": 0.8252721696046378, + "grad_norm": 57.436214447021484, + "learning_rate": 9.015235344335848e-08, + "logits/chosen": -1.828051209449768, + "logits/rejected": -1.9708757400512695, + "logps/chosen": -2.138115406036377, + "logps/rejected": -2.4871132373809814, + "loss": 1.6225, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.381155014038086, + "rewards/margins": 3.489978313446045, + "rewards/rejected": -24.87113380432129, + "step": 24485 + }, + { + "epoch": 0.8254406956756211, + "grad_norm": 139.59628295898438, + "learning_rate": 8.998394413485249e-08, + "logits/chosen": -2.0176608562469482, + "logits/rejected": -2.130763530731201, + "logps/chosen": -2.8235087394714355, + "logps/rejected": -3.0206499099731445, + "loss": 3.2285, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.235088348388672, + "rewards/margins": 1.9714100360870361, + "rewards/rejected": -30.206497192382812, + "step": 24490 + }, + { + "epoch": 0.8256092217466042, + "grad_norm": 33.01143264770508, + "learning_rate": 8.981567671674956e-08, + "logits/chosen": -2.0660080909729004, + "logits/rejected": -2.4264297485351562, + "logps/chosen": -2.2755260467529297, + "logps/rejected": -3.569042205810547, + "loss": 1.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -22.755260467529297, + "rewards/margins": 12.935162544250488, + "rewards/rejected": -35.69042205810547, + "step": 24495 + }, + { + "epoch": 0.8257777478175874, + "grad_norm": 62.452022552490234, + "learning_rate": 8.964755124728035e-08, + "logits/chosen": -1.6145250797271729, + "logits/rejected": -1.9589201211929321, + "logps/chosen": -2.197807550430298, + "logps/rejected": -2.4428963661193848, + "loss": 2.2725, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.978076934814453, + "rewards/margins": 2.450887680053711, + "rewards/rejected": -24.428964614868164, + "step": 24500 + }, + { + "epoch": 0.8259462738885706, + "grad_norm": 34.67888259887695, + "learning_rate": 8.947956778462628e-08, + "logits/chosen": -1.963996171951294, + "logits/rejected": -1.929396629333496, + "logps/chosen": -2.4872591495513916, + "logps/rejected": -2.4773287773132324, + "loss": 3.2287, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.87259292602539, + "rewards/margins": -0.0993032455444336, + "rewards/rejected": -24.77328872680664, + "step": 24505 + }, + { + "epoch": 0.8261147999595537, + "grad_norm": 21.794479370117188, + "learning_rate": 8.931172638691998e-08, + "logits/chosen": -1.7423756122589111, + "logits/rejected": -2.079932689666748, + "logps/chosen": -2.141746997833252, + "logps/rejected": -2.7574732303619385, + "loss": 1.5836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.417470932006836, + "rewards/margins": 6.15726375579834, + "rewards/rejected": -27.57473373413086, + "step": 24510 + }, + { + "epoch": 0.8262833260305369, + "grad_norm": 33.65979766845703, + "learning_rate": 8.914402711224466e-08, + "logits/chosen": -1.8532603979110718, + "logits/rejected": -1.9771888256072998, + "logps/chosen": -2.039398670196533, + "logps/rejected": -2.1789710521698, + "loss": 2.1006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.39398956298828, + "rewards/margins": 1.3957213163375854, + "rewards/rejected": -21.789709091186523, + "step": 24515 + }, + { + "epoch": 0.8264518521015201, + "grad_norm": 19.37758445739746, + "learning_rate": 8.897647001863467e-08, + "logits/chosen": -2.056464433670044, + "logits/rejected": -2.1448254585266113, + "logps/chosen": -2.0192456245422363, + "logps/rejected": -2.1061623096466064, + "loss": 3.4028, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.192453384399414, + "rewards/margins": 0.8691700100898743, + "rewards/rejected": -21.06162452697754, + "step": 24520 + }, + { + "epoch": 0.8266203781725033, + "grad_norm": 65.1499252319336, + "learning_rate": 8.880905516407456e-08, + "logits/chosen": -1.6522516012191772, + "logits/rejected": -2.139988899230957, + "logps/chosen": -3.164013385772705, + "logps/rejected": -3.8580169677734375, + "loss": 2.2586, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.6401309967041, + "rewards/margins": 6.940039157867432, + "rewards/rejected": -38.58017349243164, + "step": 24525 + }, + { + "epoch": 0.8267889042434865, + "grad_norm": 61.83774948120117, + "learning_rate": 8.864178260650018e-08, + "logits/chosen": -1.669846534729004, + "logits/rejected": -2.1356027126312256, + "logps/chosen": -2.799243688583374, + "logps/rejected": -4.0782670974731445, + "loss": 1.836, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.9924373626709, + "rewards/margins": 12.79023551940918, + "rewards/rejected": -40.782676696777344, + "step": 24530 + }, + { + "epoch": 0.8269574303144697, + "grad_norm": 23.5116024017334, + "learning_rate": 8.847465240379809e-08, + "logits/chosen": -1.7187703847885132, + "logits/rejected": -2.0123953819274902, + "logps/chosen": -2.03355073928833, + "logps/rejected": -3.2723755836486816, + "loss": 1.617, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.335506439208984, + "rewards/margins": 12.388250350952148, + "rewards/rejected": -32.7237548828125, + "step": 24535 + }, + { + "epoch": 0.8271259563854528, + "grad_norm": 27.858238220214844, + "learning_rate": 8.830766461380523e-08, + "logits/chosen": -1.7541147470474243, + "logits/rejected": -1.9249897003173828, + "logps/chosen": -2.0622360706329346, + "logps/rejected": -2.1741714477539062, + "loss": 2.5385, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.622358322143555, + "rewards/margins": 1.1193554401397705, + "rewards/rejected": -21.741714477539062, + "step": 24540 + }, + { + "epoch": 0.827294482456436, + "grad_norm": 23.42979621887207, + "learning_rate": 8.814081929430967e-08, + "logits/chosen": -1.9817981719970703, + "logits/rejected": -1.6340019702911377, + "logps/chosen": -2.1842682361602783, + "logps/rejected": -2.083808422088623, + "loss": 4.1161, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -21.842681884765625, + "rewards/margins": -1.0045974254608154, + "rewards/rejected": -20.838083267211914, + "step": 24545 + }, + { + "epoch": 0.8274630085274192, + "grad_norm": 62.881187438964844, + "learning_rate": 8.797411650304986e-08, + "logits/chosen": -1.9652912616729736, + "logits/rejected": -1.9075905084609985, + "logps/chosen": -2.004683017730713, + "logps/rejected": -2.0662283897399902, + "loss": 2.7007, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.046829223632812, + "rewards/margins": 0.6154546737670898, + "rewards/rejected": -20.662282943725586, + "step": 24550 + }, + { + "epoch": 0.8276315345984023, + "grad_norm": 22.706445693969727, + "learning_rate": 8.780755629771536e-08, + "logits/chosen": -2.144796848297119, + "logits/rejected": -2.1325831413269043, + "logps/chosen": -2.8483757972717285, + "logps/rejected": -2.9826736450195312, + "loss": 3.5277, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.4837589263916, + "rewards/margins": 1.3429784774780273, + "rewards/rejected": -29.826736450195312, + "step": 24555 + }, + { + "epoch": 0.8278000606693856, + "grad_norm": 14.372481346130371, + "learning_rate": 8.764113873594575e-08, + "logits/chosen": -1.8322279453277588, + "logits/rejected": -1.653124451637268, + "logps/chosen": -2.967151403427124, + "logps/rejected": -3.2807929515838623, + "loss": 2.5602, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.6715145111084, + "rewards/margins": 3.1364188194274902, + "rewards/rejected": -32.80793380737305, + "step": 24560 + }, + { + "epoch": 0.8279685867403688, + "grad_norm": 151.4964141845703, + "learning_rate": 8.747486387533171e-08, + "logits/chosen": -2.8778085708618164, + "logits/rejected": -2.7823691368103027, + "logps/chosen": -2.7946856021881104, + "logps/rejected": -2.818932056427002, + "loss": 3.4742, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.946857452392578, + "rewards/margins": 0.2424612045288086, + "rewards/rejected": -28.189319610595703, + "step": 24565 + }, + { + "epoch": 0.8281371128113519, + "grad_norm": 25.029939651489258, + "learning_rate": 8.730873177341458e-08, + "logits/chosen": -1.8756672143936157, + "logits/rejected": -2.349959135055542, + "logps/chosen": -1.728493332862854, + "logps/rejected": -1.984562635421753, + "loss": 1.9933, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.28493309020996, + "rewards/margins": 2.5606913566589355, + "rewards/rejected": -19.845626831054688, + "step": 24570 + }, + { + "epoch": 0.8283056388823351, + "grad_norm": 71.39663696289062, + "learning_rate": 8.714274248768583e-08, + "logits/chosen": -1.9558569192886353, + "logits/rejected": -1.8518749475479126, + "logps/chosen": -2.5088768005371094, + "logps/rejected": -2.4664669036865234, + "loss": 3.622, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.088764190673828, + "rewards/margins": -0.42409858107566833, + "rewards/rejected": -24.6646671295166, + "step": 24575 + }, + { + "epoch": 0.8284741649533183, + "grad_norm": 64.44959259033203, + "learning_rate": 8.697689607558801e-08, + "logits/chosen": -2.6909384727478027, + "logits/rejected": -2.5570266246795654, + "logps/chosen": -2.24957537651062, + "logps/rejected": -2.215425968170166, + "loss": 3.8927, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.49575424194336, + "rewards/margins": -0.34149590134620667, + "rewards/rejected": -22.154260635375977, + "step": 24580 + }, + { + "epoch": 0.8286426910243014, + "grad_norm": 19.722009658813477, + "learning_rate": 8.681119259451403e-08, + "logits/chosen": -1.6167974472045898, + "logits/rejected": -1.870568037033081, + "logps/chosen": -2.371868848800659, + "logps/rejected": -2.4427285194396973, + "loss": 3.2064, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.718690872192383, + "rewards/margins": 0.7085939645767212, + "rewards/rejected": -24.42728614807129, + "step": 24585 + }, + { + "epoch": 0.8288112170952846, + "grad_norm": 19.004295349121094, + "learning_rate": 8.664563210180736e-08, + "logits/chosen": -1.7222764492034912, + "logits/rejected": -2.047950267791748, + "logps/chosen": -1.800819754600525, + "logps/rejected": -2.29779052734375, + "loss": 1.6882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.008197784423828, + "rewards/margins": 4.969709873199463, + "rewards/rejected": -22.9779052734375, + "step": 24590 + }, + { + "epoch": 0.8289797431662678, + "grad_norm": 63.078643798828125, + "learning_rate": 8.648021465476185e-08, + "logits/chosen": -2.3208119869232178, + "logits/rejected": -2.6440043449401855, + "logps/chosen": -2.479806423187256, + "logps/rejected": -2.5025460720062256, + "loss": 5.985, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.798065185546875, + "rewards/margins": 0.22739505767822266, + "rewards/rejected": -25.025461196899414, + "step": 24595 + }, + { + "epoch": 0.829148269237251, + "grad_norm": 22.138776779174805, + "learning_rate": 8.631494031062197e-08, + "logits/chosen": -1.9542160034179688, + "logits/rejected": -1.8714958429336548, + "logps/chosen": -2.1395716667175293, + "logps/rejected": -2.96736478805542, + "loss": 1.4994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.39571762084961, + "rewards/margins": 8.277932167053223, + "rewards/rejected": -29.673648834228516, + "step": 24600 + }, + { + "epoch": 0.8293167953082342, + "grad_norm": 23.740110397338867, + "learning_rate": 8.61498091265827e-08, + "logits/chosen": -1.223777174949646, + "logits/rejected": -1.5701664686203003, + "logps/chosen": -1.949045181274414, + "logps/rejected": -2.445789337158203, + "loss": 2.6138, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.49045181274414, + "rewards/margins": 4.967441558837891, + "rewards/rejected": -24.457895278930664, + "step": 24605 + }, + { + "epoch": 0.8294853213792174, + "grad_norm": 35.623653411865234, + "learning_rate": 8.59848211597895e-08, + "logits/chosen": -1.9874868392944336, + "logits/rejected": -2.273397445678711, + "logps/chosen": -1.8599836826324463, + "logps/rejected": -2.0248303413391113, + "loss": 2.7576, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.599836349487305, + "rewards/margins": 1.6484657526016235, + "rewards/rejected": -20.248302459716797, + "step": 24610 + }, + { + "epoch": 0.8296538474502005, + "grad_norm": 18.77968406677246, + "learning_rate": 8.581997646733812e-08, + "logits/chosen": -2.080842971801758, + "logits/rejected": -2.308432102203369, + "logps/chosen": -1.9153869152069092, + "logps/rejected": -2.152315139770508, + "loss": 3.1835, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.153867721557617, + "rewards/margins": 2.369281768798828, + "rewards/rejected": -21.523151397705078, + "step": 24615 + }, + { + "epoch": 0.8298223735211837, + "grad_norm": 44.094932556152344, + "learning_rate": 8.565527510627496e-08, + "logits/chosen": -1.8816394805908203, + "logits/rejected": -1.940839171409607, + "logps/chosen": -2.2022149562835693, + "logps/rejected": -2.478142499923706, + "loss": 2.3778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.022151947021484, + "rewards/margins": 2.759272813796997, + "rewards/rejected": -24.781423568725586, + "step": 24620 + }, + { + "epoch": 0.8299908995921669, + "grad_norm": 29.145606994628906, + "learning_rate": 8.549071713359646e-08, + "logits/chosen": -1.9039310216903687, + "logits/rejected": -1.9348599910736084, + "logps/chosen": -2.906386375427246, + "logps/rejected": -3.3684539794921875, + "loss": 2.2814, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.063861846923828, + "rewards/margins": 4.6206769943237305, + "rewards/rejected": -33.684539794921875, + "step": 24625 + }, + { + "epoch": 0.83015942566315, + "grad_norm": 27.65276336669922, + "learning_rate": 8.532630260624974e-08, + "logits/chosen": -1.5308537483215332, + "logits/rejected": -1.6142187118530273, + "logps/chosen": -2.1859543323516846, + "logps/rejected": -2.2249884605407715, + "loss": 2.8996, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.859542846679688, + "rewards/margins": 0.3903442323207855, + "rewards/rejected": -22.24988555908203, + "step": 24630 + }, + { + "epoch": 0.8303279517341333, + "grad_norm": 19.151803970336914, + "learning_rate": 8.516203158113216e-08, + "logits/chosen": -1.777120590209961, + "logits/rejected": -1.6125797033309937, + "logps/chosen": -2.1398093700408936, + "logps/rejected": -2.089583396911621, + "loss": 3.9823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.398096084594727, + "rewards/margins": -0.5022621750831604, + "rewards/rejected": -20.895832061767578, + "step": 24635 + }, + { + "epoch": 0.8304964778051165, + "grad_norm": 20.655881881713867, + "learning_rate": 8.499790411509161e-08, + "logits/chosen": -1.474410057067871, + "logits/rejected": -1.4502770900726318, + "logps/chosen": -2.045407295227051, + "logps/rejected": -2.0277533531188965, + "loss": 3.5089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.454071044921875, + "rewards/margins": -0.17653894424438477, + "rewards/rejected": -20.27753257751465, + "step": 24640 + }, + { + "epoch": 0.8306650038760997, + "grad_norm": 32.811119079589844, + "learning_rate": 8.483392026492592e-08, + "logits/chosen": -1.5620620250701904, + "logits/rejected": -1.6248960494995117, + "logps/chosen": -2.8196754455566406, + "logps/rejected": -2.5971808433532715, + "loss": 5.5318, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.19675636291504, + "rewards/margins": -2.224942684173584, + "rewards/rejected": -25.971811294555664, + "step": 24645 + }, + { + "epoch": 0.8308335299470828, + "grad_norm": 220.64158630371094, + "learning_rate": 8.467008008738352e-08, + "logits/chosen": -1.8569514751434326, + "logits/rejected": -1.8670618534088135, + "logps/chosen": -2.6269702911376953, + "logps/rejected": -2.8930702209472656, + "loss": 2.7798, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.269702911376953, + "rewards/margins": 2.6609978675842285, + "rewards/rejected": -28.93070411682129, + "step": 24650 + }, + { + "epoch": 0.831002056018066, + "grad_norm": 37.396507263183594, + "learning_rate": 8.450638363916324e-08, + "logits/chosen": -1.9712955951690674, + "logits/rejected": -2.224093437194824, + "logps/chosen": -2.5156314373016357, + "logps/rejected": -3.635897159576416, + "loss": 0.8962, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.15631675720215, + "rewards/margins": 11.202653884887695, + "rewards/rejected": -36.358970642089844, + "step": 24655 + }, + { + "epoch": 0.8311705820890491, + "grad_norm": 32.989925384521484, + "learning_rate": 8.434283097691359e-08, + "logits/chosen": -1.557471513748169, + "logits/rejected": -1.5122106075286865, + "logps/chosen": -3.2903800010681152, + "logps/rejected": -3.4535770416259766, + "loss": 2.5971, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.90380096435547, + "rewards/margins": 1.631969690322876, + "rewards/rejected": -34.53577423095703, + "step": 24660 + }, + { + "epoch": 0.8313391081600323, + "grad_norm": 34.660987854003906, + "learning_rate": 8.417942215723394e-08, + "logits/chosen": -1.938816785812378, + "logits/rejected": -2.1259303092956543, + "logps/chosen": -2.5374019145965576, + "logps/rejected": -2.644392490386963, + "loss": 3.3285, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.374019622802734, + "rewards/margins": 1.0699094533920288, + "rewards/rejected": -26.443927764892578, + "step": 24665 + }, + { + "epoch": 0.8315076342310156, + "grad_norm": 27.914525985717773, + "learning_rate": 8.401615723667354e-08, + "logits/chosen": -2.4170756340026855, + "logits/rejected": -2.3611080646514893, + "logps/chosen": -2.390336513519287, + "logps/rejected": -2.480630397796631, + "loss": 2.8004, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.903366088867188, + "rewards/margins": 0.9029388427734375, + "rewards/rejected": -24.806303024291992, + "step": 24670 + }, + { + "epoch": 0.8316761603019988, + "grad_norm": 16.455949783325195, + "learning_rate": 8.385303627173223e-08, + "logits/chosen": -1.707933783531189, + "logits/rejected": -1.9285682439804077, + "logps/chosen": -2.2291674613952637, + "logps/rejected": -2.5350022315979004, + "loss": 1.8161, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.291675567626953, + "rewards/margins": 3.0583457946777344, + "rewards/rejected": -25.350019454956055, + "step": 24675 + }, + { + "epoch": 0.8318446863729819, + "grad_norm": 18.660741806030273, + "learning_rate": 8.369005931885936e-08, + "logits/chosen": -1.756413221359253, + "logits/rejected": -2.0539305210113525, + "logps/chosen": -2.5960230827331543, + "logps/rejected": -3.1703128814697266, + "loss": 1.8669, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.96023178100586, + "rewards/margins": 5.742895603179932, + "rewards/rejected": -31.703128814697266, + "step": 24680 + }, + { + "epoch": 0.8320132124439651, + "grad_norm": 21.118820190429688, + "learning_rate": 8.352722643445498e-08, + "logits/chosen": -2.343622922897339, + "logits/rejected": -2.329498291015625, + "logps/chosen": -2.4431777000427246, + "logps/rejected": -2.580986499786377, + "loss": 2.7023, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.431777954101562, + "rewards/margins": 1.3780863285064697, + "rewards/rejected": -25.809864044189453, + "step": 24685 + }, + { + "epoch": 0.8321817385149483, + "grad_norm": 33.85835647583008, + "learning_rate": 8.336453767486929e-08, + "logits/chosen": -1.9275716543197632, + "logits/rejected": -2.071688413619995, + "logps/chosen": -2.496863842010498, + "logps/rejected": -3.0280239582061768, + "loss": 2.5997, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.968637466430664, + "rewards/margins": 5.311602592468262, + "rewards/rejected": -30.280237197875977, + "step": 24690 + }, + { + "epoch": 0.8323502645859314, + "grad_norm": 24.390417098999023, + "learning_rate": 8.320199309640224e-08, + "logits/chosen": -1.8352603912353516, + "logits/rejected": -2.3818843364715576, + "logps/chosen": -2.051905870437622, + "logps/rejected": -2.1428139209747314, + "loss": 3.232, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.519058227539062, + "rewards/margins": 0.9090802073478699, + "rewards/rejected": -21.428136825561523, + "step": 24695 + }, + { + "epoch": 0.8325187906569146, + "grad_norm": 33.29798126220703, + "learning_rate": 8.303959275530415e-08, + "logits/chosen": -2.1693029403686523, + "logits/rejected": -2.5497283935546875, + "logps/chosen": -2.0754661560058594, + "logps/rejected": -2.577219247817993, + "loss": 1.7509, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.75465965270996, + "rewards/margins": 5.017529010772705, + "rewards/rejected": -25.772192001342773, + "step": 24700 + }, + { + "epoch": 0.8326873167278978, + "grad_norm": 91.18734741210938, + "learning_rate": 8.287733670777547e-08, + "logits/chosen": -1.9818317890167236, + "logits/rejected": -1.9187809228897095, + "logps/chosen": -3.0059261322021484, + "logps/rejected": -3.015350580215454, + "loss": 4.522, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.05925941467285, + "rewards/margins": 0.09424237906932831, + "rewards/rejected": -30.15350341796875, + "step": 24705 + }, + { + "epoch": 0.832855842798881, + "grad_norm": 29.47276496887207, + "learning_rate": 8.27152250099667e-08, + "logits/chosen": -2.0462698936462402, + "logits/rejected": -2.1758196353912354, + "logps/chosen": -2.5688459873199463, + "logps/rejected": -3.169067859649658, + "loss": 1.4751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.688457489013672, + "rewards/margins": 6.002219200134277, + "rewards/rejected": -31.690677642822266, + "step": 24710 + }, + { + "epoch": 0.8330243688698642, + "grad_norm": 35.716827392578125, + "learning_rate": 8.255325771797799e-08, + "logits/chosen": -1.1083600521087646, + "logits/rejected": -1.4641042947769165, + "logps/chosen": -2.1212143898010254, + "logps/rejected": -2.767564296722412, + "loss": 1.8724, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.212142944335938, + "rewards/margins": 6.463498115539551, + "rewards/rejected": -27.675640106201172, + "step": 24715 + }, + { + "epoch": 0.8331928949408474, + "grad_norm": 24.23432159423828, + "learning_rate": 8.23914348878601e-08, + "logits/chosen": -1.3574097156524658, + "logits/rejected": -1.5571671724319458, + "logps/chosen": -1.7626625299453735, + "logps/rejected": -2.3101305961608887, + "loss": 3.3752, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.626623153686523, + "rewards/margins": 5.474681854248047, + "rewards/rejected": -23.101306915283203, + "step": 24720 + }, + { + "epoch": 0.8333614210118305, + "grad_norm": 100.50273895263672, + "learning_rate": 8.222975657561359e-08, + "logits/chosen": -1.8818786144256592, + "logits/rejected": -1.8507133722305298, + "logps/chosen": -3.0965118408203125, + "logps/rejected": -3.2869343757629395, + "loss": 2.5407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.96511459350586, + "rewards/margins": 1.9042247533798218, + "rewards/rejected": -32.86934280395508, + "step": 24725 + }, + { + "epoch": 0.8335299470828137, + "grad_norm": 51.20134353637695, + "learning_rate": 8.206822283718873e-08, + "logits/chosen": -2.1385796070098877, + "logits/rejected": -2.4248080253601074, + "logps/chosen": -1.8668806552886963, + "logps/rejected": -2.1117656230926514, + "loss": 3.0479, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.668806076049805, + "rewards/margins": 2.44884991645813, + "rewards/rejected": -21.117656707763672, + "step": 24730 + }, + { + "epoch": 0.8336984731537969, + "grad_norm": 20.14624786376953, + "learning_rate": 8.190683372848612e-08, + "logits/chosen": -1.7058826684951782, + "logits/rejected": -1.75360107421875, + "logps/chosen": -2.259692668914795, + "logps/rejected": -2.3245644569396973, + "loss": 3.055, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.596923828125, + "rewards/margins": 0.6487201452255249, + "rewards/rejected": -23.245647430419922, + "step": 24735 + }, + { + "epoch": 0.83386699922478, + "grad_norm": 24.15453338623047, + "learning_rate": 8.174558930535608e-08, + "logits/chosen": -2.644989490509033, + "logits/rejected": -2.87270450592041, + "logps/chosen": -2.3526439666748047, + "logps/rejected": -2.963080883026123, + "loss": 1.8598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.526439666748047, + "rewards/margins": 6.104373931884766, + "rewards/rejected": -29.630813598632812, + "step": 24740 + }, + { + "epoch": 0.8340355252957633, + "grad_norm": 24.785890579223633, + "learning_rate": 8.158448962359903e-08, + "logits/chosen": -1.4721378087997437, + "logits/rejected": -1.8219953775405884, + "logps/chosen": -1.9124889373779297, + "logps/rejected": -2.7076058387756348, + "loss": 1.8269, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.124889373779297, + "rewards/margins": 7.951168060302734, + "rewards/rejected": -27.0760555267334, + "step": 24745 + }, + { + "epoch": 0.8342040513667465, + "grad_norm": 17.511924743652344, + "learning_rate": 8.142353473896535e-08, + "logits/chosen": -1.6391900777816772, + "logits/rejected": -1.8954302072525024, + "logps/chosen": -2.1554198265075684, + "logps/rejected": -2.301879405975342, + "loss": 2.0994, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.55419921875, + "rewards/margins": 1.4645962715148926, + "rewards/rejected": -23.018795013427734, + "step": 24750 + }, + { + "epoch": 0.8343725774377296, + "grad_norm": 31.78805923461914, + "learning_rate": 8.126272470715489e-08, + "logits/chosen": -1.901834487915039, + "logits/rejected": -2.024052619934082, + "logps/chosen": -2.2320590019226074, + "logps/rejected": -2.8132669925689697, + "loss": 2.1427, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.320592880249023, + "rewards/margins": 5.812076568603516, + "rewards/rejected": -28.13266944885254, + "step": 24755 + }, + { + "epoch": 0.8345411035087128, + "grad_norm": 14.08447551727295, + "learning_rate": 8.110205958381783e-08, + "logits/chosen": -1.8176250457763672, + "logits/rejected": -1.7208465337753296, + "logps/chosen": -1.8047428131103516, + "logps/rejected": -2.0405871868133545, + "loss": 1.8179, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.047428131103516, + "rewards/margins": 2.3584465980529785, + "rewards/rejected": -20.405874252319336, + "step": 24760 + }, + { + "epoch": 0.834709629579696, + "grad_norm": 8.739221572875977, + "learning_rate": 8.094153942455406e-08, + "logits/chosen": -2.625211715698242, + "logits/rejected": -3.1657795906066895, + "logps/chosen": -4.064680576324463, + "logps/rejected": -4.758119106292725, + "loss": 3.9832, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -40.64680480957031, + "rewards/margins": 6.934388637542725, + "rewards/rejected": -47.58119583129883, + "step": 24765 + }, + { + "epoch": 0.8348781556506791, + "grad_norm": 32.949581146240234, + "learning_rate": 8.078116428491322e-08, + "logits/chosen": -2.286863327026367, + "logits/rejected": -2.008315324783325, + "logps/chosen": -2.388113260269165, + "logps/rejected": -2.4670004844665527, + "loss": 2.6394, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.881132125854492, + "rewards/margins": 0.7888728976249695, + "rewards/rejected": -24.670005798339844, + "step": 24770 + }, + { + "epoch": 0.8350466817216623, + "grad_norm": 3.3901429176330566, + "learning_rate": 8.062093422039484e-08, + "logits/chosen": -2.0986361503601074, + "logits/rejected": -2.428938388824463, + "logps/chosen": -2.475968360900879, + "logps/rejected": -2.965801954269409, + "loss": 2.1051, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.759685516357422, + "rewards/margins": 4.89833927154541, + "rewards/rejected": -29.65802001953125, + "step": 24775 + }, + { + "epoch": 0.8352152077926456, + "grad_norm": 88.18102264404297, + "learning_rate": 8.046084928644841e-08, + "logits/chosen": -1.3793189525604248, + "logits/rejected": -2.444030284881592, + "logps/chosen": -2.2284750938415527, + "logps/rejected": -3.5414485931396484, + "loss": 3.5037, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.28474998474121, + "rewards/margins": 13.129733085632324, + "rewards/rejected": -35.414485931396484, + "step": 24780 + }, + { + "epoch": 0.8353837338636287, + "grad_norm": 26.334148406982422, + "learning_rate": 8.030090953847274e-08, + "logits/chosen": -2.3015308380126953, + "logits/rejected": -2.426604747772217, + "logps/chosen": -1.8632885217666626, + "logps/rejected": -2.1273903846740723, + "loss": 2.4709, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.632884979248047, + "rewards/margins": 2.641018867492676, + "rewards/rejected": -21.27390480041504, + "step": 24785 + }, + { + "epoch": 0.8355522599346119, + "grad_norm": 17.940793991088867, + "learning_rate": 8.014111503181675e-08, + "logits/chosen": -2.3400533199310303, + "logits/rejected": -1.8803203105926514, + "logps/chosen": -2.7042131423950195, + "logps/rejected": -3.0254132747650146, + "loss": 4.6191, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.042133331298828, + "rewards/margins": 3.2120022773742676, + "rewards/rejected": -30.254131317138672, + "step": 24790 + }, + { + "epoch": 0.8357207860055951, + "grad_norm": 46.67781066894531, + "learning_rate": 7.998146582177923e-08, + "logits/chosen": -2.4982428550720215, + "logits/rejected": -2.790593385696411, + "logps/chosen": -3.1638150215148926, + "logps/rejected": -3.3133628368377686, + "loss": 3.7873, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.638153076171875, + "rewards/margins": 1.4954760074615479, + "rewards/rejected": -33.133628845214844, + "step": 24795 + }, + { + "epoch": 0.8358893120765782, + "grad_norm": 29.687307357788086, + "learning_rate": 7.982196196360819e-08, + "logits/chosen": -1.8978008031845093, + "logits/rejected": -2.2759690284729004, + "logps/chosen": -2.5504889488220215, + "logps/rejected": -3.3600997924804688, + "loss": 1.4467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.504892349243164, + "rewards/margins": 8.096107482910156, + "rewards/rejected": -33.60099792480469, + "step": 24800 + }, + { + "epoch": 0.8358893120765782, + "eval_logits/chosen": -2.2963404655456543, + "eval_logits/rejected": -2.4741809368133545, + "eval_logps/chosen": -2.281320810317993, + "eval_logps/rejected": -2.4352529048919678, + "eval_loss": 3.0853753089904785, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.81320571899414, + "eval_rewards/margins": 1.5393211841583252, + "eval_rewards/rejected": -24.352527618408203, + "eval_runtime": 12.9051, + "eval_samples_per_second": 7.749, + "eval_steps_per_second": 1.937, + "step": 24800 + }, + { + "epoch": 0.8360578381475614, + "grad_norm": 299.58172607421875, + "learning_rate": 7.966260351250176e-08, + "logits/chosen": -1.731415033340454, + "logits/rejected": -1.6957142353057861, + "logps/chosen": -2.898508310317993, + "logps/rejected": -3.6182937622070312, + "loss": 1.572, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.985082626342773, + "rewards/margins": 7.1978559494018555, + "rewards/rejected": -36.18293762207031, + "step": 24805 + }, + { + "epoch": 0.8362263642185446, + "grad_norm": 11.822115898132324, + "learning_rate": 7.950339052360761e-08, + "logits/chosen": -1.9383857250213623, + "logits/rejected": -2.0658161640167236, + "logps/chosen": -2.1835899353027344, + "logps/rejected": -2.295480966567993, + "loss": 2.6226, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.835901260375977, + "rewards/margins": 1.1189079284667969, + "rewards/rejected": -22.954809188842773, + "step": 24810 + }, + { + "epoch": 0.8363948902895277, + "grad_norm": 35.01670455932617, + "learning_rate": 7.934432305202321e-08, + "logits/chosen": -2.0714824199676514, + "logits/rejected": -2.514486789703369, + "logps/chosen": -2.0434935092926025, + "logps/rejected": -2.7274627685546875, + "loss": 1.6889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.434932708740234, + "rewards/margins": 6.839694023132324, + "rewards/rejected": -27.274627685546875, + "step": 24815 + }, + { + "epoch": 0.836563416360511, + "grad_norm": 28.495513916015625, + "learning_rate": 7.918540115279538e-08, + "logits/chosen": -2.2264180183410645, + "logits/rejected": -2.5804407596588135, + "logps/chosen": -2.661252737045288, + "logps/rejected": -4.125033378601074, + "loss": 1.6787, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.61252784729004, + "rewards/margins": 14.637804985046387, + "rewards/rejected": -41.250335693359375, + "step": 24820 + }, + { + "epoch": 0.8367319424314942, + "grad_norm": 41.43541717529297, + "learning_rate": 7.902662488092071e-08, + "logits/chosen": -1.8777663707733154, + "logits/rejected": -2.1416497230529785, + "logps/chosen": -2.1105735301971436, + "logps/rejected": -2.2156014442443848, + "loss": 2.7785, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.10573387145996, + "rewards/margins": 1.0502779483795166, + "rewards/rejected": -22.15601348876953, + "step": 24825 + }, + { + "epoch": 0.8369004685024773, + "grad_norm": 28.61469078063965, + "learning_rate": 7.88679942913456e-08, + "logits/chosen": -1.651914358139038, + "logits/rejected": -1.790924310684204, + "logps/chosen": -2.0216152667999268, + "logps/rejected": -2.168140411376953, + "loss": 2.639, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.21615219116211, + "rewards/margins": 1.4652526378631592, + "rewards/rejected": -21.68140411376953, + "step": 24830 + }, + { + "epoch": 0.8370689945734605, + "grad_norm": 20.57470703125, + "learning_rate": 7.870950943896559e-08, + "logits/chosen": -1.73696768283844, + "logits/rejected": -1.9076378345489502, + "logps/chosen": -2.084214687347412, + "logps/rejected": -2.452416181564331, + "loss": 2.1224, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.842144012451172, + "rewards/margins": 3.6820149421691895, + "rewards/rejected": -24.52416229248047, + "step": 24835 + }, + { + "epoch": 0.8372375206444437, + "grad_norm": 12.077346801757812, + "learning_rate": 7.855117037862624e-08, + "logits/chosen": -1.2803000211715698, + "logits/rejected": -1.6471437215805054, + "logps/chosen": -2.5319161415100098, + "logps/rejected": -2.940267562866211, + "loss": 2.2514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.31916046142578, + "rewards/margins": 4.083517551422119, + "rewards/rejected": -29.402679443359375, + "step": 24840 + }, + { + "epoch": 0.8374060467154268, + "grad_norm": 50.33457946777344, + "learning_rate": 7.839297716512233e-08, + "logits/chosen": -1.7693202495574951, + "logits/rejected": -1.9289439916610718, + "logps/chosen": -2.4417946338653564, + "logps/rejected": -3.2712059020996094, + "loss": 3.0719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.417943954467773, + "rewards/margins": 8.294113159179688, + "rewards/rejected": -32.712059020996094, + "step": 24845 + }, + { + "epoch": 0.83757457278641, + "grad_norm": 28.605205535888672, + "learning_rate": 7.823492985319857e-08, + "logits/chosen": -1.704897165298462, + "logits/rejected": -1.7164885997772217, + "logps/chosen": -2.399608612060547, + "logps/rejected": -2.45776104927063, + "loss": 3.4444, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.9960880279541, + "rewards/margins": 0.5815240740776062, + "rewards/rejected": -24.577611923217773, + "step": 24850 + }, + { + "epoch": 0.8377430988573933, + "grad_norm": 39.61781311035156, + "learning_rate": 7.807702849754854e-08, + "logits/chosen": -1.7676922082901, + "logits/rejected": -1.6432530879974365, + "logps/chosen": -2.191768169403076, + "logps/rejected": -2.3741490840911865, + "loss": 2.5188, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.917682647705078, + "rewards/margins": 1.823809027671814, + "rewards/rejected": -23.74148941040039, + "step": 24855 + }, + { + "epoch": 0.8379116249283765, + "grad_norm": 28.61402130126953, + "learning_rate": 7.791927315281582e-08, + "logits/chosen": -1.956154227256775, + "logits/rejected": -2.0019774436950684, + "logps/chosen": -2.687230110168457, + "logps/rejected": -3.2162253856658936, + "loss": 1.9956, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.872303009033203, + "rewards/margins": 5.289952278137207, + "rewards/rejected": -32.162254333496094, + "step": 24860 + }, + { + "epoch": 0.8380801509993596, + "grad_norm": 30.63452911376953, + "learning_rate": 7.77616638735935e-08, + "logits/chosen": -1.0301138162612915, + "logits/rejected": -1.3081434965133667, + "logps/chosen": -3.067924976348877, + "logps/rejected": -3.3489010334014893, + "loss": 3.011, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.679248809814453, + "rewards/margins": 2.8097636699676514, + "rewards/rejected": -33.489013671875, + "step": 24865 + }, + { + "epoch": 0.8382486770703428, + "grad_norm": 36.37446594238281, + "learning_rate": 7.76042007144237e-08, + "logits/chosen": -1.5581233501434326, + "logits/rejected": -1.9835189580917358, + "logps/chosen": -2.5085432529449463, + "logps/rejected": -2.1060569286346436, + "loss": 8.3278, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.085430145263672, + "rewards/margins": -4.024866104125977, + "rewards/rejected": -21.060565948486328, + "step": 24870 + }, + { + "epoch": 0.838417203141326, + "grad_norm": 34.3663215637207, + "learning_rate": 7.744688372979824e-08, + "logits/chosen": -1.7154502868652344, + "logits/rejected": -1.7041962146759033, + "logps/chosen": -2.716712474822998, + "logps/rejected": -2.6436030864715576, + "loss": 3.8873, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.167123794555664, + "rewards/margins": -0.7310911417007446, + "rewards/rejected": -26.436031341552734, + "step": 24875 + }, + { + "epoch": 0.8385857292123091, + "grad_norm": 235.8781280517578, + "learning_rate": 7.728971297415843e-08, + "logits/chosen": -1.8020703792572021, + "logits/rejected": -2.0559287071228027, + "logps/chosen": -3.1783413887023926, + "logps/rejected": -3.4103896617889404, + "loss": 6.2529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.783416748046875, + "rewards/margins": 2.3204798698425293, + "rewards/rejected": -34.1038932800293, + "step": 24880 + }, + { + "epoch": 0.8387542552832923, + "grad_norm": 16.511754989624023, + "learning_rate": 7.713268850189492e-08, + "logits/chosen": -1.7213910818099976, + "logits/rejected": -1.850507140159607, + "logps/chosen": -2.0041606426239014, + "logps/rejected": -2.1789450645446777, + "loss": 2.7103, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.041606903076172, + "rewards/margins": 1.7478454113006592, + "rewards/rejected": -21.789453506469727, + "step": 24885 + }, + { + "epoch": 0.8389227813542756, + "grad_norm": 29.656295776367188, + "learning_rate": 7.697581036734752e-08, + "logits/chosen": -1.7956311702728271, + "logits/rejected": -1.9855846166610718, + "logps/chosen": -3.3117146492004395, + "logps/rejected": -3.126675605773926, + "loss": 6.4772, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -33.11714553833008, + "rewards/margins": -1.8503891229629517, + "rewards/rejected": -31.26675796508789, + "step": 24890 + }, + { + "epoch": 0.8390913074252587, + "grad_norm": 23.54472541809082, + "learning_rate": 7.681907862480569e-08, + "logits/chosen": -1.9615615606307983, + "logits/rejected": -1.8660144805908203, + "logps/chosen": -2.3936643600463867, + "logps/rejected": -2.4466681480407715, + "loss": 2.8234, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.936643600463867, + "rewards/margins": 0.5300378799438477, + "rewards/rejected": -24.4666805267334, + "step": 24895 + }, + { + "epoch": 0.8392598334962419, + "grad_norm": 36.822994232177734, + "learning_rate": 7.666249332850805e-08, + "logits/chosen": -1.5024158954620361, + "logits/rejected": -1.6682920455932617, + "logps/chosen": -1.9932746887207031, + "logps/rejected": -2.1869759559631348, + "loss": 2.5083, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.93274688720703, + "rewards/margins": 1.937015175819397, + "rewards/rejected": -21.869760513305664, + "step": 24900 + }, + { + "epoch": 0.8394283595672251, + "grad_norm": 47.96998596191406, + "learning_rate": 7.650605453264263e-08, + "logits/chosen": -1.840775489807129, + "logits/rejected": -2.2100350856781006, + "logps/chosen": -2.1522419452667236, + "logps/rejected": -2.880509853363037, + "loss": 1.1539, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.522417068481445, + "rewards/margins": 7.282681941986084, + "rewards/rejected": -28.805099487304688, + "step": 24905 + }, + { + "epoch": 0.8395968856382082, + "grad_norm": 26.887475967407227, + "learning_rate": 7.634976229134677e-08, + "logits/chosen": -1.9716761112213135, + "logits/rejected": -2.277108907699585, + "logps/chosen": -2.2041330337524414, + "logps/rejected": -2.7298479080200195, + "loss": 2.846, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.041330337524414, + "rewards/margins": 5.257149696350098, + "rewards/rejected": -27.298480987548828, + "step": 24910 + }, + { + "epoch": 0.8397654117091914, + "grad_norm": 55.090423583984375, + "learning_rate": 7.619361665870699e-08, + "logits/chosen": -1.6557731628417969, + "logits/rejected": -1.8671079874038696, + "logps/chosen": -2.6471877098083496, + "logps/rejected": -2.532266855239868, + "loss": 5.3774, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.471878051757812, + "rewards/margins": -1.1492098569869995, + "rewards/rejected": -25.32266616821289, + "step": 24915 + }, + { + "epoch": 0.8399339377801746, + "grad_norm": 119.17420196533203, + "learning_rate": 7.603761768875933e-08, + "logits/chosen": -1.460766315460205, + "logits/rejected": -1.6520782709121704, + "logps/chosen": -3.4117214679718018, + "logps/rejected": -3.665818691253662, + "loss": 4.2282, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.11721420288086, + "rewards/margins": 2.54097580909729, + "rewards/rejected": -36.65818786621094, + "step": 24920 + }, + { + "epoch": 0.8401024638511577, + "grad_norm": 44.00700378417969, + "learning_rate": 7.588176543548863e-08, + "logits/chosen": -2.1163854598999023, + "logits/rejected": -2.0090603828430176, + "logps/chosen": -2.562870502471924, + "logps/rejected": -2.6641266345977783, + "loss": 3.5659, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.628704071044922, + "rewards/margins": 1.0125625133514404, + "rewards/rejected": -26.641265869140625, + "step": 24925 + }, + { + "epoch": 0.840270989922141, + "grad_norm": 61.31565475463867, + "learning_rate": 7.572605995282932e-08, + "logits/chosen": -1.1729356050491333, + "logits/rejected": -2.2421317100524902, + "logps/chosen": -2.4731881618499756, + "logps/rejected": -3.079061985015869, + "loss": 3.0966, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.731884002685547, + "rewards/margins": 6.058734893798828, + "rewards/rejected": -30.79061508178711, + "step": 24930 + }, + { + "epoch": 0.8404395159931242, + "grad_norm": 26.54155731201172, + "learning_rate": 7.557050129466503e-08, + "logits/chosen": -1.2745441198349, + "logits/rejected": -1.866265058517456, + "logps/chosen": -3.1167454719543457, + "logps/rejected": -3.6627628803253174, + "loss": 1.9667, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.16745376586914, + "rewards/margins": 5.46017599105835, + "rewards/rejected": -36.62763214111328, + "step": 24935 + }, + { + "epoch": 0.8406080420641073, + "grad_norm": 37.995361328125, + "learning_rate": 7.541508951482828e-08, + "logits/chosen": -1.3408405780792236, + "logits/rejected": -1.9099143743515015, + "logps/chosen": -2.429614543914795, + "logps/rejected": -2.8554434776306152, + "loss": 1.9876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.296146392822266, + "rewards/margins": 4.2582902908325195, + "rewards/rejected": -28.554433822631836, + "step": 24940 + }, + { + "epoch": 0.8407765681350905, + "grad_norm": 11.022102355957031, + "learning_rate": 7.525982466710107e-08, + "logits/chosen": -1.692125678062439, + "logits/rejected": -2.2618660926818848, + "logps/chosen": -2.951873302459717, + "logps/rejected": -3.3857064247131348, + "loss": 3.0067, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.518728256225586, + "rewards/margins": 4.33833122253418, + "rewards/rejected": -33.85706329345703, + "step": 24945 + }, + { + "epoch": 0.8409450942060737, + "grad_norm": 43.10789489746094, + "learning_rate": 7.510470680521442e-08, + "logits/chosen": -1.518248438835144, + "logits/rejected": -1.6292740106582642, + "logps/chosen": -2.6451168060302734, + "logps/rejected": -2.6429200172424316, + "loss": 4.3998, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.4511661529541, + "rewards/margins": -0.02196817472577095, + "rewards/rejected": -26.42919921875, + "step": 24950 + }, + { + "epoch": 0.8411136202770568, + "grad_norm": 23.555614471435547, + "learning_rate": 7.494973598284871e-08, + "logits/chosen": -1.820709228515625, + "logits/rejected": -2.0816328525543213, + "logps/chosen": -2.2437901496887207, + "logps/rejected": -3.068833112716675, + "loss": 1.043, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.43790054321289, + "rewards/margins": 8.2504301071167, + "rewards/rejected": -30.688329696655273, + "step": 24955 + }, + { + "epoch": 0.84128214634804, + "grad_norm": 4.327147483825684, + "learning_rate": 7.479491225363289e-08, + "logits/chosen": -1.9262707233428955, + "logits/rejected": -2.1278395652770996, + "logps/chosen": -2.530325412750244, + "logps/rejected": -3.2573580741882324, + "loss": 1.5586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.30324935913086, + "rewards/margins": 7.27032995223999, + "rewards/rejected": -32.57358169555664, + "step": 24960 + }, + { + "epoch": 0.8414506724190233, + "grad_norm": 21.99888038635254, + "learning_rate": 7.464023567114558e-08, + "logits/chosen": -1.6207927465438843, + "logits/rejected": -1.5682424306869507, + "logps/chosen": -2.9613633155822754, + "logps/rejected": -2.809797763824463, + "loss": 5.7672, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.613636016845703, + "rewards/margins": -1.515655755996704, + "rewards/rejected": -28.097976684570312, + "step": 24965 + }, + { + "epoch": 0.8416191984900064, + "grad_norm": 36.24042510986328, + "learning_rate": 7.448570628891426e-08, + "logits/chosen": -1.524966835975647, + "logits/rejected": -1.888681173324585, + "logps/chosen": -2.220306634902954, + "logps/rejected": -2.5548994541168213, + "loss": 2.9771, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.203067779541016, + "rewards/margins": 3.345930814743042, + "rewards/rejected": -25.548995971679688, + "step": 24970 + }, + { + "epoch": 0.8417877245609896, + "grad_norm": 47.27100372314453, + "learning_rate": 7.433132416041532e-08, + "logits/chosen": -1.385962724685669, + "logits/rejected": -1.4744789600372314, + "logps/chosen": -1.9234859943389893, + "logps/rejected": -2.1203255653381348, + "loss": 2.8177, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.234861373901367, + "rewards/margins": 1.9683917760849, + "rewards/rejected": -21.2032527923584, + "step": 24975 + }, + { + "epoch": 0.8419562506319728, + "grad_norm": 65.24883270263672, + "learning_rate": 7.41770893390744e-08, + "logits/chosen": -2.4800801277160645, + "logits/rejected": -2.3463408946990967, + "logps/chosen": -2.263475179672241, + "logps/rejected": -2.277034282684326, + "loss": 3.3677, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.634754180908203, + "rewards/margins": 0.13558892905712128, + "rewards/rejected": -22.770343780517578, + "step": 24980 + }, + { + "epoch": 0.8421247767029559, + "grad_norm": 22.446992874145508, + "learning_rate": 7.40230018782661e-08, + "logits/chosen": -2.0303831100463867, + "logits/rejected": -2.356431722640991, + "logps/chosen": -2.162581443786621, + "logps/rejected": -2.779609203338623, + "loss": 1.9192, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.62581443786621, + "rewards/margins": 6.1702752113342285, + "rewards/rejected": -27.796092987060547, + "step": 24985 + }, + { + "epoch": 0.8422933027739391, + "grad_norm": 403.3161926269531, + "learning_rate": 7.386906183131414e-08, + "logits/chosen": -1.5163092613220215, + "logits/rejected": -1.9733335971832275, + "logps/chosen": -2.665555238723755, + "logps/rejected": -2.582866907119751, + "loss": 4.4604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.65555191040039, + "rewards/margins": -0.8268814086914062, + "rewards/rejected": -25.82866859436035, + "step": 24990 + }, + { + "epoch": 0.8424618288449223, + "grad_norm": 64.99507141113281, + "learning_rate": 7.37152692514909e-08, + "logits/chosen": -1.9403884410858154, + "logits/rejected": -2.4308600425720215, + "logps/chosen": -2.6366209983825684, + "logps/rejected": -2.773503541946411, + "loss": 3.0987, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.366207122802734, + "rewards/margins": 1.3688265085220337, + "rewards/rejected": -27.735034942626953, + "step": 24995 + }, + { + "epoch": 0.8426303549159055, + "grad_norm": 43.74968338012695, + "learning_rate": 7.3561624192018e-08, + "logits/chosen": -2.2320027351379395, + "logits/rejected": -2.564361333847046, + "logps/chosen": -2.5689501762390137, + "logps/rejected": -2.8458104133605957, + "loss": 2.9299, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.689504623413086, + "rewards/margins": 2.7685999870300293, + "rewards/rejected": -28.458105087280273, + "step": 25000 + }, + { + "epoch": 0.8427988809868887, + "grad_norm": 49.172855377197266, + "learning_rate": 7.340812670606611e-08, + "logits/chosen": -1.9074571132659912, + "logits/rejected": -2.3194408416748047, + "logps/chosen": -2.8225278854370117, + "logps/rejected": -2.9414353370666504, + "loss": 6.2644, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.22528076171875, + "rewards/margins": 1.1890745162963867, + "rewards/rejected": -29.414352416992188, + "step": 25005 + }, + { + "epoch": 0.8429674070578719, + "grad_norm": 20.995792388916016, + "learning_rate": 7.32547768467544e-08, + "logits/chosen": -1.8266541957855225, + "logits/rejected": -2.029745578765869, + "logps/chosen": -1.8279956579208374, + "logps/rejected": -2.4323911666870117, + "loss": 2.3263, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.279956817626953, + "rewards/margins": 6.043955326080322, + "rewards/rejected": -24.323911666870117, + "step": 25010 + }, + { + "epoch": 0.843135933128855, + "grad_norm": 41.86899185180664, + "learning_rate": 7.310157466715133e-08, + "logits/chosen": -0.9306265115737915, + "logits/rejected": -2.0167274475097656, + "logps/chosen": -2.461030960083008, + "logps/rejected": -3.634398937225342, + "loss": 2.6665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.610309600830078, + "rewards/margins": 11.733680725097656, + "rewards/rejected": -36.343990325927734, + "step": 25015 + }, + { + "epoch": 0.8433044591998382, + "grad_norm": 44.50137710571289, + "learning_rate": 7.294852022027409e-08, + "logits/chosen": -1.8055055141448975, + "logits/rejected": -2.123600721359253, + "logps/chosen": -2.092360019683838, + "logps/rejected": -2.493429660797119, + "loss": 2.0098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.923603057861328, + "rewards/margins": 4.010695457458496, + "rewards/rejected": -24.934295654296875, + "step": 25020 + }, + { + "epoch": 0.8434729852708214, + "grad_norm": 30.74772071838379, + "learning_rate": 7.279561355908903e-08, + "logits/chosen": -1.1198111772537231, + "logits/rejected": -1.6509565114974976, + "logps/chosen": -2.195266008377075, + "logps/rejected": -2.4629082679748535, + "loss": 3.6763, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.95265769958496, + "rewards/margins": 2.6764230728149414, + "rewards/rejected": -24.62908172607422, + "step": 25025 + }, + { + "epoch": 0.8436415113418045, + "grad_norm": 39.40415954589844, + "learning_rate": 7.264285473651078e-08, + "logits/chosen": -1.5459685325622559, + "logits/rejected": -1.7445091009140015, + "logps/chosen": -2.3278846740722656, + "logps/rejected": -2.5380375385284424, + "loss": 1.627, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.27884864807129, + "rewards/margins": 2.101527690887451, + "rewards/rejected": -25.3803768157959, + "step": 25030 + }, + { + "epoch": 0.8438100374127877, + "grad_norm": 13.756035804748535, + "learning_rate": 7.249024380540331e-08, + "logits/chosen": -2.231560230255127, + "logits/rejected": -2.355930805206299, + "logps/chosen": -2.355832576751709, + "logps/rejected": -2.809471607208252, + "loss": 1.2205, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.55832862854004, + "rewards/margins": 4.5363874435424805, + "rewards/rejected": -28.094715118408203, + "step": 25035 + }, + { + "epoch": 0.843978563483771, + "grad_norm": 290.2860107421875, + "learning_rate": 7.233778081857928e-08, + "logits/chosen": -1.5744444131851196, + "logits/rejected": -2.3242926597595215, + "logps/chosen": -3.13602352142334, + "logps/rejected": -3.7153167724609375, + "loss": 4.2686, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -31.360239028930664, + "rewards/margins": 5.792929649353027, + "rewards/rejected": -37.153160095214844, + "step": 25040 + }, + { + "epoch": 0.8441470895547541, + "grad_norm": 29.042400360107422, + "learning_rate": 7.218546582880003e-08, + "logits/chosen": -1.6595700979232788, + "logits/rejected": -1.9214191436767578, + "logps/chosen": -2.1897659301757812, + "logps/rejected": -2.8830249309539795, + "loss": 1.8043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.897661209106445, + "rewards/margins": 6.932587623596191, + "rewards/rejected": -28.830249786376953, + "step": 25045 + }, + { + "epoch": 0.8443156156257373, + "grad_norm": 38.23175811767578, + "learning_rate": 7.203329888877602e-08, + "logits/chosen": -1.7221580743789673, + "logits/rejected": -1.9687376022338867, + "logps/chosen": -3.106229782104492, + "logps/rejected": -3.474976062774658, + "loss": 3.0086, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.062297821044922, + "rewards/margins": 3.687462568283081, + "rewards/rejected": -34.74976348876953, + "step": 25050 + }, + { + "epoch": 0.8444841416967205, + "grad_norm": 19.823972702026367, + "learning_rate": 7.188128005116589e-08, + "logits/chosen": -1.8666727542877197, + "logits/rejected": -2.0208840370178223, + "logps/chosen": -1.9966617822647095, + "logps/rejected": -2.352268934249878, + "loss": 2.7333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.966617584228516, + "rewards/margins": 3.556072950363159, + "rewards/rejected": -23.522689819335938, + "step": 25055 + }, + { + "epoch": 0.8446526677677036, + "grad_norm": 34.18622589111328, + "learning_rate": 7.172940936857751e-08, + "logits/chosen": -1.8579282760620117, + "logits/rejected": -1.8631632328033447, + "logps/chosen": -2.143566846847534, + "logps/rejected": -2.2731566429138184, + "loss": 2.4161, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.4356689453125, + "rewards/margins": 1.295898199081421, + "rewards/rejected": -22.7315673828125, + "step": 25060 + }, + { + "epoch": 0.8448211938386868, + "grad_norm": 37.837615966796875, + "learning_rate": 7.157768689356741e-08, + "logits/chosen": -2.0021746158599854, + "logits/rejected": -2.181589365005493, + "logps/chosen": -2.607229709625244, + "logps/rejected": -3.0638108253479004, + "loss": 2.0169, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.072296142578125, + "rewards/margins": 4.565813064575195, + "rewards/rejected": -30.638107299804688, + "step": 25065 + }, + { + "epoch": 0.84498971990967, + "grad_norm": 43.49299240112305, + "learning_rate": 7.142611267864068e-08, + "logits/chosen": -2.0716075897216797, + "logits/rejected": -2.165670871734619, + "logps/chosen": -2.8804638385772705, + "logps/rejected": -3.0383782386779785, + "loss": 3.4523, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.804637908935547, + "rewards/margins": 1.5791432857513428, + "rewards/rejected": -30.3837833404541, + "step": 25070 + }, + { + "epoch": 0.8451582459806533, + "grad_norm": 33.29661560058594, + "learning_rate": 7.127468677625137e-08, + "logits/chosen": -1.3675150871276855, + "logits/rejected": -1.3157413005828857, + "logps/chosen": -2.02780818939209, + "logps/rejected": -2.133814573287964, + "loss": 3.5942, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.2780818939209, + "rewards/margins": 1.060063362121582, + "rewards/rejected": -21.338146209716797, + "step": 25075 + }, + { + "epoch": 0.8453267720516364, + "grad_norm": 7.868017673492432, + "learning_rate": 7.112340923880172e-08, + "logits/chosen": -1.3394761085510254, + "logits/rejected": -1.7868926525115967, + "logps/chosen": -1.9955241680145264, + "logps/rejected": -2.234886646270752, + "loss": 2.258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.955242156982422, + "rewards/margins": 2.393623113632202, + "rewards/rejected": -22.348867416381836, + "step": 25080 + }, + { + "epoch": 0.8454952981226196, + "grad_norm": 40.71536636352539, + "learning_rate": 7.097228011864304e-08, + "logits/chosen": -1.532164216041565, + "logits/rejected": -2.4663033485412598, + "logps/chosen": -2.945025682449341, + "logps/rejected": -3.815264940261841, + "loss": 1.6375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.45025634765625, + "rewards/margins": 8.702392578125, + "rewards/rejected": -38.15264892578125, + "step": 25085 + }, + { + "epoch": 0.8456638241936028, + "grad_norm": 107.01415252685547, + "learning_rate": 7.082129946807525e-08, + "logits/chosen": -1.9736740589141846, + "logits/rejected": -1.782965898513794, + "logps/chosen": -2.528587818145752, + "logps/rejected": -2.898869276046753, + "loss": 2.9653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.285877227783203, + "rewards/margins": 3.702814817428589, + "rewards/rejected": -28.988689422607422, + "step": 25090 + }, + { + "epoch": 0.8458323502645859, + "grad_norm": 39.172874450683594, + "learning_rate": 7.067046733934685e-08, + "logits/chosen": -1.4259833097457886, + "logits/rejected": -1.6555812358856201, + "logps/chosen": -2.3975255489349365, + "logps/rejected": -3.425919771194458, + "loss": 2.7265, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.975255966186523, + "rewards/margins": 10.283941268920898, + "rewards/rejected": -34.25919723510742, + "step": 25095 + }, + { + "epoch": 0.8460008763355691, + "grad_norm": 36.24737548828125, + "learning_rate": 7.051978378465461e-08, + "logits/chosen": -1.5988880395889282, + "logits/rejected": -1.908831000328064, + "logps/chosen": -2.4758269786834717, + "logps/rejected": -2.7519752979278564, + "loss": 3.4556, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.758270263671875, + "rewards/margins": 2.7614827156066895, + "rewards/rejected": -27.519750595092773, + "step": 25100 + }, + { + "epoch": 0.8461694024065523, + "grad_norm": 113.6759262084961, + "learning_rate": 7.036924885614443e-08, + "logits/chosen": -1.8109318017959595, + "logits/rejected": -1.8489185571670532, + "logps/chosen": -2.460669994354248, + "logps/rejected": -2.535710096359253, + "loss": 2.8731, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.606698989868164, + "rewards/margins": 0.7504032254219055, + "rewards/rejected": -25.357101440429688, + "step": 25105 + }, + { + "epoch": 0.8463379284775355, + "grad_norm": 48.524330139160156, + "learning_rate": 7.021886260591053e-08, + "logits/chosen": -1.6381628513336182, + "logits/rejected": -1.8552592992782593, + "logps/chosen": -1.9970314502716064, + "logps/rejected": -2.153109073638916, + "loss": 2.0627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.97031593322754, + "rewards/margins": 1.5607770681381226, + "rewards/rejected": -21.531091690063477, + "step": 25110 + }, + { + "epoch": 0.8465064545485187, + "grad_norm": 44.247013092041016, + "learning_rate": 7.006862508599554e-08, + "logits/chosen": -1.9095268249511719, + "logits/rejected": -2.1885132789611816, + "logps/chosen": -2.5970306396484375, + "logps/rejected": -3.2741634845733643, + "loss": 3.3046, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.970306396484375, + "rewards/margins": 6.771327018737793, + "rewards/rejected": -32.741634368896484, + "step": 25115 + }, + { + "epoch": 0.8466749806195019, + "grad_norm": 0.39335116744041443, + "learning_rate": 6.991853634839068e-08, + "logits/chosen": -2.1617605686187744, + "logits/rejected": -2.1641488075256348, + "logps/chosen": -2.821693181991577, + "logps/rejected": -3.0508570671081543, + "loss": 2.5592, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.216930389404297, + "rewards/margins": 2.291637897491455, + "rewards/rejected": -30.508569717407227, + "step": 25120 + }, + { + "epoch": 0.846843506690485, + "grad_norm": 67.44718170166016, + "learning_rate": 6.976859644503591e-08, + "logits/chosen": -1.5310341119766235, + "logits/rejected": -1.79361891746521, + "logps/chosen": -2.7277884483337402, + "logps/rejected": -2.9582366943359375, + "loss": 4.3421, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.277883529663086, + "rewards/margins": 2.304482936859131, + "rewards/rejected": -29.582366943359375, + "step": 25125 + }, + { + "epoch": 0.8470120327614682, + "grad_norm": 33.54582977294922, + "learning_rate": 6.961880542781962e-08, + "logits/chosen": -1.2129520177841187, + "logits/rejected": -1.4452576637268066, + "logps/chosen": -2.5996367931365967, + "logps/rejected": -3.155336380004883, + "loss": 0.7775, + "rewards/accuracies": 1.0, + "rewards/chosen": -25.996368408203125, + "rewards/margins": 5.5569963455200195, + "rewards/rejected": -31.553363800048828, + "step": 25130 + }, + { + "epoch": 0.8471805588324514, + "grad_norm": 11.366177558898926, + "learning_rate": 6.946916334857822e-08, + "logits/chosen": -1.721011757850647, + "logits/rejected": -2.0394160747528076, + "logps/chosen": -2.0530264377593994, + "logps/rejected": -2.2458395957946777, + "loss": 2.0855, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.530263900756836, + "rewards/margins": 1.9281337261199951, + "rewards/rejected": -22.45839500427246, + "step": 25135 + }, + { + "epoch": 0.8473490849034345, + "grad_norm": 44.85331344604492, + "learning_rate": 6.931967025909724e-08, + "logits/chosen": -1.4946274757385254, + "logits/rejected": -1.5228080749511719, + "logps/chosen": -2.5096378326416016, + "logps/rejected": -2.6178455352783203, + "loss": 3.6777, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.096378326416016, + "rewards/margins": 1.082077980041504, + "rewards/rejected": -26.178457260131836, + "step": 25140 + }, + { + "epoch": 0.8475176109744177, + "grad_norm": 30.542884826660156, + "learning_rate": 6.917032621111029e-08, + "logits/chosen": -2.2814676761627197, + "logits/rejected": -2.2030251026153564, + "logps/chosen": -2.5095839500427246, + "logps/rejected": -2.711360216140747, + "loss": 2.8255, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.095840454101562, + "rewards/margins": 2.017761468887329, + "rewards/rejected": -27.113601684570312, + "step": 25145 + }, + { + "epoch": 0.847686137045401, + "grad_norm": 104.68621063232422, + "learning_rate": 6.902113125629938e-08, + "logits/chosen": -1.8296295404434204, + "logits/rejected": -2.0239923000335693, + "logps/chosen": -2.2184555530548096, + "logps/rejected": -2.414580821990967, + "loss": 2.8458, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.184555053710938, + "rewards/margins": 1.9612529277801514, + "rewards/rejected": -24.145809173583984, + "step": 25150 + }, + { + "epoch": 0.8478546631163841, + "grad_norm": 63.11360168457031, + "learning_rate": 6.887208544629503e-08, + "logits/chosen": -1.5033972263336182, + "logits/rejected": -1.4420006275177002, + "logps/chosen": -2.1528759002685547, + "logps/rejected": -2.4601898193359375, + "loss": 2.2622, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.528759002685547, + "rewards/margins": 3.0731396675109863, + "rewards/rejected": -24.601898193359375, + "step": 25155 + }, + { + "epoch": 0.8480231891873673, + "grad_norm": 151.5568084716797, + "learning_rate": 6.872318883267614e-08, + "logits/chosen": -2.212207317352295, + "logits/rejected": -1.8575718402862549, + "logps/chosen": -2.4121429920196533, + "logps/rejected": -2.3576273918151855, + "loss": 4.9464, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.121427536010742, + "rewards/margins": -0.5451576113700867, + "rewards/rejected": -23.576269149780273, + "step": 25160 + }, + { + "epoch": 0.8481917152583505, + "grad_norm": 32.88406753540039, + "learning_rate": 6.857444146697006e-08, + "logits/chosen": -1.6049182415008545, + "logits/rejected": -1.7114994525909424, + "logps/chosen": -2.603238344192505, + "logps/rejected": -2.758378028869629, + "loss": 2.7873, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.03238296508789, + "rewards/margins": 1.5513970851898193, + "rewards/rejected": -27.58378028869629, + "step": 25165 + }, + { + "epoch": 0.8483602413293336, + "grad_norm": 44.524539947509766, + "learning_rate": 6.842584340065222e-08, + "logits/chosen": -1.3670152425765991, + "logits/rejected": -1.4763129949569702, + "logps/chosen": -1.884411096572876, + "logps/rejected": -2.149660110473633, + "loss": 2.0465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.8441104888916, + "rewards/margins": 2.652489185333252, + "rewards/rejected": -21.496601104736328, + "step": 25170 + }, + { + "epoch": 0.8485287674003168, + "grad_norm": 60.07644271850586, + "learning_rate": 6.827739468514659e-08, + "logits/chosen": -2.2666361331939697, + "logits/rejected": -2.264575719833374, + "logps/chosen": -2.6530709266662598, + "logps/rejected": -2.750534772872925, + "loss": 3.4756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.53070640563965, + "rewards/margins": 0.9746391177177429, + "rewards/rejected": -27.505346298217773, + "step": 25175 + }, + { + "epoch": 0.8486972934713, + "grad_norm": 34.915287017822266, + "learning_rate": 6.812909537182565e-08, + "logits/chosen": -1.5165358781814575, + "logits/rejected": -1.5831248760223389, + "logps/chosen": -2.374108076095581, + "logps/rejected": -2.758392810821533, + "loss": 2.4535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.741079330444336, + "rewards/margins": 3.8428473472595215, + "rewards/rejected": -27.58392906188965, + "step": 25180 + }, + { + "epoch": 0.8488658195422832, + "grad_norm": 42.44291305541992, + "learning_rate": 6.798094551200961e-08, + "logits/chosen": -1.5595664978027344, + "logits/rejected": -1.9098215103149414, + "logps/chosen": -3.4170849323272705, + "logps/rejected": -3.8340396881103516, + "loss": 2.9079, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.17084503173828, + "rewards/margins": 4.1695475578308105, + "rewards/rejected": -38.340396881103516, + "step": 25185 + }, + { + "epoch": 0.8490343456132664, + "grad_norm": 15.514145851135254, + "learning_rate": 6.783294515696747e-08, + "logits/chosen": -2.5971148014068604, + "logits/rejected": -2.3608932495117188, + "logps/chosen": -2.288546323776245, + "logps/rejected": -2.5332279205322266, + "loss": 3.8562, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.88546371459961, + "rewards/margins": 2.4468159675598145, + "rewards/rejected": -25.332279205322266, + "step": 25190 + }, + { + "epoch": 0.8492028716842496, + "grad_norm": 41.55581283569336, + "learning_rate": 6.768509435791631e-08, + "logits/chosen": -2.2747750282287598, + "logits/rejected": -2.2851319313049316, + "logps/chosen": -2.667318820953369, + "logps/rejected": -2.459958076477051, + "loss": 6.2535, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.67318344116211, + "rewards/margins": -2.073606491088867, + "rewards/rejected": -24.599578857421875, + "step": 25195 + }, + { + "epoch": 0.8493713977552327, + "grad_norm": 75.3543701171875, + "learning_rate": 6.753739316602148e-08, + "logits/chosen": -2.2415099143981934, + "logits/rejected": -2.157219171524048, + "logps/chosen": -2.4603915214538574, + "logps/rejected": -2.5724644660949707, + "loss": 2.7241, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.603918075561523, + "rewards/margins": 1.1207275390625, + "rewards/rejected": -25.724645614624023, + "step": 25200 + }, + { + "epoch": 0.8493713977552327, + "eval_logits/chosen": -2.2988386154174805, + "eval_logits/rejected": -2.477008104324341, + "eval_logps/chosen": -2.283003330230713, + "eval_logps/rejected": -2.4374542236328125, + "eval_loss": 3.086169958114624, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.830032348632812, + "eval_rewards/margins": 1.5445094108581543, + "eval_rewards/rejected": -24.374540328979492, + "eval_runtime": 12.887, + "eval_samples_per_second": 7.76, + "eval_steps_per_second": 1.94, + "step": 25200 + }, + { + "epoch": 0.8495399238262159, + "grad_norm": 26.182111740112305, + "learning_rate": 6.738984163239647e-08, + "logits/chosen": -1.3157546520233154, + "logits/rejected": -1.4888827800750732, + "logps/chosen": -2.333944320678711, + "logps/rejected": -3.176734447479248, + "loss": 1.0272, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.33944320678711, + "rewards/margins": 8.427900314331055, + "rewards/rejected": -31.767343521118164, + "step": 25205 + }, + { + "epoch": 0.8497084498971991, + "grad_norm": 21.64483070373535, + "learning_rate": 6.724243980810319e-08, + "logits/chosen": -2.0497689247131348, + "logits/rejected": -2.2630741596221924, + "logps/chosen": -2.9822685718536377, + "logps/rejected": -3.110841989517212, + "loss": 2.0844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.82268714904785, + "rewards/margins": 1.285732388496399, + "rewards/rejected": -31.10841941833496, + "step": 25210 + }, + { + "epoch": 0.8498769759681822, + "grad_norm": 9.70484447479248, + "learning_rate": 6.709518774415157e-08, + "logits/chosen": -1.9983733892440796, + "logits/rejected": -2.8238205909729004, + "logps/chosen": -2.225658416748047, + "logps/rejected": -2.7286124229431152, + "loss": 1.9975, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.25658416748047, + "rewards/margins": 5.029541969299316, + "rewards/rejected": -27.2861270904541, + "step": 25215 + }, + { + "epoch": 0.8500455020391655, + "grad_norm": 39.07335662841797, + "learning_rate": 6.69480854914996e-08, + "logits/chosen": -1.7552353143692017, + "logits/rejected": -2.2177629470825195, + "logps/chosen": -2.202458620071411, + "logps/rejected": -2.6856517791748047, + "loss": 1.6069, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.024587631225586, + "rewards/margins": 4.831931114196777, + "rewards/rejected": -26.856517791748047, + "step": 25220 + }, + { + "epoch": 0.8502140281101487, + "grad_norm": 50.24772262573242, + "learning_rate": 6.680113310105373e-08, + "logits/chosen": -1.8044487237930298, + "logits/rejected": -1.6741470098495483, + "logps/chosen": -3.385709762573242, + "logps/rejected": -3.506852626800537, + "loss": 2.8577, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.85709762573242, + "rewards/margins": 1.2114317417144775, + "rewards/rejected": -35.06852722167969, + "step": 25225 + }, + { + "epoch": 0.8503825541811318, + "grad_norm": 231.3321075439453, + "learning_rate": 6.665433062366838e-08, + "logits/chosen": -1.8543148040771484, + "logits/rejected": -1.9665143489837646, + "logps/chosen": -2.7595534324645996, + "logps/rejected": -3.008307456970215, + "loss": 2.334, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.595539093017578, + "rewards/margins": 2.4875378608703613, + "rewards/rejected": -30.08307456970215, + "step": 25230 + }, + { + "epoch": 0.850551080252115, + "grad_norm": 34.387332916259766, + "learning_rate": 6.650767811014602e-08, + "logits/chosen": -1.3960977792739868, + "logits/rejected": -1.4877779483795166, + "logps/chosen": -2.183588743209839, + "logps/rejected": -2.609048366546631, + "loss": 2.8054, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.835886001586914, + "rewards/margins": 4.254596710205078, + "rewards/rejected": -26.09048080444336, + "step": 25235 + }, + { + "epoch": 0.8507196063230982, + "grad_norm": 23.472320556640625, + "learning_rate": 6.636117561123733e-08, + "logits/chosen": -1.3947211503982544, + "logits/rejected": -1.6472208499908447, + "logps/chosen": -2.369253396987915, + "logps/rejected": -2.7023870944976807, + "loss": 2.2897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.69253158569336, + "rewards/margins": 3.3313395977020264, + "rewards/rejected": -27.02387046813965, + "step": 25240 + }, + { + "epoch": 0.8508881323940813, + "grad_norm": 26.026569366455078, + "learning_rate": 6.621482317764104e-08, + "logits/chosen": -1.4750478267669678, + "logits/rejected": -1.8304048776626587, + "logps/chosen": -2.4229671955108643, + "logps/rejected": -2.976175308227539, + "loss": 1.5136, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.229671478271484, + "rewards/margins": 5.53208065032959, + "rewards/rejected": -29.76175308227539, + "step": 25245 + }, + { + "epoch": 0.8510566584650645, + "grad_norm": 33.33852005004883, + "learning_rate": 6.606862086000414e-08, + "logits/chosen": -1.5895836353302002, + "logits/rejected": -2.616696357727051, + "logps/chosen": -2.1377921104431152, + "logps/rejected": -2.691991090774536, + "loss": 2.8422, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.37792205810547, + "rewards/margins": 5.541991233825684, + "rewards/rejected": -26.919910430908203, + "step": 25250 + }, + { + "epoch": 0.8512251845360477, + "grad_norm": 32.822242736816406, + "learning_rate": 6.592256870892122e-08, + "logits/chosen": -1.5423781871795654, + "logits/rejected": -2.161618709564209, + "logps/chosen": -2.338876247406006, + "logps/rejected": -3.336784839630127, + "loss": 1.0311, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.388761520385742, + "rewards/margins": 9.979085922241211, + "rewards/rejected": -33.36784744262695, + "step": 25255 + }, + { + "epoch": 0.851393710607031, + "grad_norm": 45.84088134765625, + "learning_rate": 6.577666677493532e-08, + "logits/chosen": -2.2524895668029785, + "logits/rejected": -1.8200067281723022, + "logps/chosen": -3.145887851715088, + "logps/rejected": -3.471153736114502, + "loss": 2.6785, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.458881378173828, + "rewards/margins": 3.252655029296875, + "rewards/rejected": -34.7115364074707, + "step": 25260 + }, + { + "epoch": 0.8515622366780141, + "grad_norm": 20.831850051879883, + "learning_rate": 6.563091510853741e-08, + "logits/chosen": -1.4504529237747192, + "logits/rejected": -1.597791075706482, + "logps/chosen": -1.9485721588134766, + "logps/rejected": -2.133368492126465, + "loss": 2.2884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.485719680786133, + "rewards/margins": 1.8479652404785156, + "rewards/rejected": -21.333683013916016, + "step": 25265 + }, + { + "epoch": 0.8517307627489973, + "grad_norm": 26.64033317565918, + "learning_rate": 6.548531376016619e-08, + "logits/chosen": -2.3206093311309814, + "logits/rejected": -2.648336887359619, + "logps/chosen": -2.4551775455474854, + "logps/rejected": -2.920405864715576, + "loss": 2.0748, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.551776885986328, + "rewards/margins": 4.652281284332275, + "rewards/rejected": -29.204059600830078, + "step": 25270 + }, + { + "epoch": 0.8518992888199804, + "grad_norm": 521.4677734375, + "learning_rate": 6.533986278020875e-08, + "logits/chosen": -1.5181517601013184, + "logits/rejected": -2.314953327178955, + "logps/chosen": -3.4335227012634277, + "logps/rejected": -3.6828982830047607, + "loss": 6.0014, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.335227966308594, + "rewards/margins": 2.493752956390381, + "rewards/rejected": -36.828983306884766, + "step": 25275 + }, + { + "epoch": 0.8520678148909636, + "grad_norm": 32.94503402709961, + "learning_rate": 6.519456221899982e-08, + "logits/chosen": -2.042977809906006, + "logits/rejected": -2.5099101066589355, + "logps/chosen": -2.3646905422210693, + "logps/rejected": -2.729539394378662, + "loss": 1.9035, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.64690589904785, + "rewards/margins": 3.6484885215759277, + "rewards/rejected": -27.295394897460938, + "step": 25280 + }, + { + "epoch": 0.8522363409619468, + "grad_norm": 18.743024826049805, + "learning_rate": 6.50494121268224e-08, + "logits/chosen": -1.6273609399795532, + "logits/rejected": -1.8308128118515015, + "logps/chosen": -1.9369827508926392, + "logps/rejected": -2.1672778129577637, + "loss": 2.0784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.369827270507812, + "rewards/margins": 2.3029518127441406, + "rewards/rejected": -21.672779083251953, + "step": 25285 + }, + { + "epoch": 0.85240486703293, + "grad_norm": 1.6357653141021729, + "learning_rate": 6.4904412553907e-08, + "logits/chosen": -1.824774146080017, + "logits/rejected": -2.023179769515991, + "logps/chosen": -2.229252338409424, + "logps/rejected": -2.7864389419555664, + "loss": 2.2768, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.292522430419922, + "rewards/margins": 5.571867942810059, + "rewards/rejected": -27.864391326904297, + "step": 25290 + }, + { + "epoch": 0.8525733931039132, + "grad_norm": 31.789331436157227, + "learning_rate": 6.475956355043227e-08, + "logits/chosen": -2.1489577293395996, + "logits/rejected": -2.3417067527770996, + "logps/chosen": -2.325360059738159, + "logps/rejected": -2.52240252494812, + "loss": 2.8407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.25360107421875, + "rewards/margins": 1.9704246520996094, + "rewards/rejected": -25.22402572631836, + "step": 25295 + }, + { + "epoch": 0.8527419191748964, + "grad_norm": 33.04253005981445, + "learning_rate": 6.461486516652492e-08, + "logits/chosen": -1.5875952243804932, + "logits/rejected": -1.7361282110214233, + "logps/chosen": -2.2500598430633545, + "logps/rejected": -2.3189072608947754, + "loss": 3.0055, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.500600814819336, + "rewards/margins": 0.6884740591049194, + "rewards/rejected": -23.189075469970703, + "step": 25300 + }, + { + "epoch": 0.8529104452458796, + "grad_norm": 28.588184356689453, + "learning_rate": 6.447031745225917e-08, + "logits/chosen": -1.8044865131378174, + "logits/rejected": -2.0297751426696777, + "logps/chosen": -3.2455849647521973, + "logps/rejected": -3.45509672164917, + "loss": 2.4975, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.45585250854492, + "rewards/margins": 2.095115900039673, + "rewards/rejected": -34.550968170166016, + "step": 25305 + }, + { + "epoch": 0.8530789713168627, + "grad_norm": 48.96995544433594, + "learning_rate": 6.432592045765733e-08, + "logits/chosen": -1.811680555343628, + "logits/rejected": -2.0719192028045654, + "logps/chosen": -2.452927350997925, + "logps/rejected": -2.5236716270446777, + "loss": 2.9467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.529273986816406, + "rewards/margins": 0.707441508769989, + "rewards/rejected": -25.236713409423828, + "step": 25310 + }, + { + "epoch": 0.8532474973878459, + "grad_norm": 43.726924896240234, + "learning_rate": 6.41816742326896e-08, + "logits/chosen": -1.991275429725647, + "logits/rejected": -1.9130961894989014, + "logps/chosen": -2.3691442012786865, + "logps/rejected": -2.2164864540100098, + "loss": 5.1097, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.69144058227539, + "rewards/margins": -1.526573657989502, + "rewards/rejected": -22.164867401123047, + "step": 25315 + }, + { + "epoch": 0.853416023458829, + "grad_norm": 23.73866081237793, + "learning_rate": 6.403757882727389e-08, + "logits/chosen": -1.9794378280639648, + "logits/rejected": -2.502854108810425, + "logps/chosen": -3.8947620391845703, + "logps/rejected": -4.647494792938232, + "loss": 1.7916, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -38.9476203918457, + "rewards/margins": 7.527327060699463, + "rewards/rejected": -46.474945068359375, + "step": 25320 + }, + { + "epoch": 0.8535845495298122, + "grad_norm": 25.906620025634766, + "learning_rate": 6.389363429127586e-08, + "logits/chosen": -1.6756868362426758, + "logits/rejected": -1.8262481689453125, + "logps/chosen": -2.3997318744659424, + "logps/rejected": -2.5467724800109863, + "loss": 2.8854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.997316360473633, + "rewards/margins": 1.4704080820083618, + "rewards/rejected": -25.467723846435547, + "step": 25325 + }, + { + "epoch": 0.8537530756007955, + "grad_norm": 96.08812713623047, + "learning_rate": 6.374984067450912e-08, + "logits/chosen": -3.0671684741973877, + "logits/rejected": -3.0086820125579834, + "logps/chosen": -3.6582512855529785, + "logps/rejected": -3.5938503742218018, + "loss": 4.6296, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -36.58251190185547, + "rewards/margins": -0.6440094113349915, + "rewards/rejected": -35.938499450683594, + "step": 25330 + }, + { + "epoch": 0.8539216016717787, + "grad_norm": 44.67426681518555, + "learning_rate": 6.36061980267349e-08, + "logits/chosen": -1.7330238819122314, + "logits/rejected": -1.7280261516571045, + "logps/chosen": -2.71097993850708, + "logps/rejected": -2.6598942279815674, + "loss": 4.144, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -27.109798431396484, + "rewards/margins": -0.5108593106269836, + "rewards/rejected": -26.598941802978516, + "step": 25335 + }, + { + "epoch": 0.8540901277427618, + "grad_norm": 459.4154357910156, + "learning_rate": 6.346270639766232e-08, + "logits/chosen": -2.0010597705841064, + "logits/rejected": -2.0018951892852783, + "logps/chosen": -2.6174869537353516, + "logps/rejected": -2.702815055847168, + "loss": 2.7935, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.174869537353516, + "rewards/margins": 0.8532818555831909, + "rewards/rejected": -27.028152465820312, + "step": 25340 + }, + { + "epoch": 0.854258653813745, + "grad_norm": 51.30471420288086, + "learning_rate": 6.331936583694819e-08, + "logits/chosen": -2.0278751850128174, + "logits/rejected": -1.9472980499267578, + "logps/chosen": -2.5634241104125977, + "logps/rejected": -2.626685857772827, + "loss": 2.7637, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.634241104125977, + "rewards/margins": 0.6326183080673218, + "rewards/rejected": -26.266857147216797, + "step": 25345 + }, + { + "epoch": 0.8544271798847282, + "grad_norm": 19.242332458496094, + "learning_rate": 6.317617639419714e-08, + "logits/chosen": -1.845207929611206, + "logits/rejected": -2.0394089221954346, + "logps/chosen": -3.0137135982513428, + "logps/rejected": -3.1849112510681152, + "loss": 3.7604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.137136459350586, + "rewards/margins": 1.7119754552841187, + "rewards/rejected": -31.849109649658203, + "step": 25350 + }, + { + "epoch": 0.8545957059557113, + "grad_norm": 16.459596633911133, + "learning_rate": 6.303313811896111e-08, + "logits/chosen": -1.489900827407837, + "logits/rejected": -1.5980645418167114, + "logps/chosen": -2.0218257904052734, + "logps/rejected": -2.2330009937286377, + "loss": 2.9988, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.218257904052734, + "rewards/margins": 2.1117513179779053, + "rewards/rejected": -22.33000946044922, + "step": 25355 + }, + { + "epoch": 0.8547642320266945, + "grad_norm": 24.089542388916016, + "learning_rate": 6.289025106074019e-08, + "logits/chosen": -1.951390027999878, + "logits/rejected": -2.3046865463256836, + "logps/chosen": -2.4620213508605957, + "logps/rejected": -2.7450006008148193, + "loss": 2.0841, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.62021255493164, + "rewards/margins": 2.8297929763793945, + "rewards/rejected": -27.45000648498535, + "step": 25360 + }, + { + "epoch": 0.8549327580976777, + "grad_norm": 55.347660064697266, + "learning_rate": 6.274751526898197e-08, + "logits/chosen": -1.7793890237808228, + "logits/rejected": -1.7488059997558594, + "logps/chosen": -2.639188051223755, + "logps/rejected": -3.5281052589416504, + "loss": 1.8899, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.39188003540039, + "rewards/margins": 8.889172554016113, + "rewards/rejected": -35.28104782104492, + "step": 25365 + }, + { + "epoch": 0.8551012841686609, + "grad_norm": 18.582015991210938, + "learning_rate": 6.260493079308176e-08, + "logits/chosen": -2.4654316902160645, + "logits/rejected": -2.598381519317627, + "logps/chosen": -2.64225697517395, + "logps/rejected": -2.852651834487915, + "loss": 3.1262, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.42256736755371, + "rewards/margins": 2.1039481163024902, + "rewards/rejected": -28.52651596069336, + "step": 25370 + }, + { + "epoch": 0.8552698102396441, + "grad_norm": 36.65127944946289, + "learning_rate": 6.24624976823822e-08, + "logits/chosen": -2.479947328567505, + "logits/rejected": -2.5645601749420166, + "logps/chosen": -2.4463698863983154, + "logps/rejected": -2.914379596710205, + "loss": 2.9974, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.463699340820312, + "rewards/margins": 4.680096626281738, + "rewards/rejected": -29.143795013427734, + "step": 25375 + }, + { + "epoch": 0.8554383363106273, + "grad_norm": 4.597733974456787, + "learning_rate": 6.232021598617388e-08, + "logits/chosen": -1.8337205648422241, + "logits/rejected": -2.154306411743164, + "logps/chosen": -2.193875789642334, + "logps/rejected": -3.0177786350250244, + "loss": 1.1959, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.93876075744629, + "rewards/margins": 8.23902416229248, + "rewards/rejected": -30.177783966064453, + "step": 25380 + }, + { + "epoch": 0.8556068623816104, + "grad_norm": 2.200549602508545, + "learning_rate": 6.217808575369493e-08, + "logits/chosen": -1.9514930248260498, + "logits/rejected": -1.7945950031280518, + "logps/chosen": -1.9390647411346436, + "logps/rejected": -2.213057041168213, + "loss": 2.0697, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.390647888183594, + "rewards/margins": 2.7399203777313232, + "rewards/rejected": -22.13056755065918, + "step": 25385 + }, + { + "epoch": 0.8557753884525936, + "grad_norm": 30.521926879882812, + "learning_rate": 6.203610703413114e-08, + "logits/chosen": -1.8817222118377686, + "logits/rejected": -2.326267719268799, + "logps/chosen": -2.600776195526123, + "logps/rejected": -3.109726667404175, + "loss": 1.8063, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.007761001586914, + "rewards/margins": 5.089505672454834, + "rewards/rejected": -31.097265243530273, + "step": 25390 + }, + { + "epoch": 0.8559439145235768, + "grad_norm": 35.248165130615234, + "learning_rate": 6.18942798766155e-08, + "logits/chosen": -1.4958255290985107, + "logits/rejected": -1.527790904045105, + "logps/chosen": -2.2628679275512695, + "logps/rejected": -2.3205666542053223, + "loss": 2.8091, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.628681182861328, + "rewards/margins": 0.5769863128662109, + "rewards/rejected": -23.205665588378906, + "step": 25395 + }, + { + "epoch": 0.8561124405945599, + "grad_norm": 22.03529167175293, + "learning_rate": 6.175260433022889e-08, + "logits/chosen": -2.300389528274536, + "logits/rejected": -2.2869935035705566, + "logps/chosen": -2.078281879425049, + "logps/rejected": -2.5390524864196777, + "loss": 2.0494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.782817840576172, + "rewards/margins": 4.607706546783447, + "rewards/rejected": -25.39052391052246, + "step": 25400 + }, + { + "epoch": 0.8562809666655432, + "grad_norm": 34.80204391479492, + "learning_rate": 6.161108044399976e-08, + "logits/chosen": -2.219104528427124, + "logits/rejected": -2.532747507095337, + "logps/chosen": -2.918858766555786, + "logps/rejected": -3.3544135093688965, + "loss": 5.1036, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.188587188720703, + "rewards/margins": 4.355550289154053, + "rewards/rejected": -33.54413604736328, + "step": 25405 + }, + { + "epoch": 0.8564494927365264, + "grad_norm": 34.77595520019531, + "learning_rate": 6.146970826690378e-08, + "logits/chosen": -2.205582857131958, + "logits/rejected": -2.330531597137451, + "logps/chosen": -2.4941062927246094, + "logps/rejected": -2.755558967590332, + "loss": 2.306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.941062927246094, + "rewards/margins": 2.61452579498291, + "rewards/rejected": -27.555587768554688, + "step": 25410 + }, + { + "epoch": 0.8566180188075095, + "grad_norm": 56.285179138183594, + "learning_rate": 6.132848784786437e-08, + "logits/chosen": -1.6364961862564087, + "logits/rejected": -1.7433507442474365, + "logps/chosen": -2.412095308303833, + "logps/rejected": -2.6662497520446777, + "loss": 2.2664, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.120952606201172, + "rewards/margins": 2.5415430068969727, + "rewards/rejected": -26.662494659423828, + "step": 25415 + }, + { + "epoch": 0.8567865448784927, + "grad_norm": 47.27323532104492, + "learning_rate": 6.118741923575233e-08, + "logits/chosen": -1.5887492895126343, + "logits/rejected": -1.5800529718399048, + "logps/chosen": -2.2246580123901367, + "logps/rejected": -2.341395616531372, + "loss": 2.3448, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.246578216552734, + "rewards/margins": 1.167377233505249, + "rewards/rejected": -23.413955688476562, + "step": 25420 + }, + { + "epoch": 0.8569550709494759, + "grad_norm": 21.56765365600586, + "learning_rate": 6.104650247938609e-08, + "logits/chosen": -1.5196692943572998, + "logits/rejected": -1.9559457302093506, + "logps/chosen": -2.3416597843170166, + "logps/rejected": -2.5563902854919434, + "loss": 2.3498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.41659927368164, + "rewards/margins": 2.147305727005005, + "rewards/rejected": -25.56390380859375, + "step": 25425 + }, + { + "epoch": 0.857123597020459, + "grad_norm": 150.3126678466797, + "learning_rate": 6.090573762753115e-08, + "logits/chosen": -1.950821876525879, + "logits/rejected": -2.2176384925842285, + "logps/chosen": -2.4310436248779297, + "logps/rejected": -2.4594826698303223, + "loss": 3.4541, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.310436248779297, + "rewards/margins": 0.28439101576805115, + "rewards/rejected": -24.59482765197754, + "step": 25430 + }, + { + "epoch": 0.8572921230914422, + "grad_norm": 148.96141052246094, + "learning_rate": 6.076512472890077e-08, + "logits/chosen": -1.6106939315795898, + "logits/rejected": -1.8222036361694336, + "logps/chosen": -1.9590381383895874, + "logps/rejected": -2.043668270111084, + "loss": 3.5791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.590383529663086, + "rewards/margins": 0.8463004231452942, + "rewards/rejected": -20.436681747436523, + "step": 25435 + }, + { + "epoch": 0.8574606491624255, + "grad_norm": 44.487586975097656, + "learning_rate": 6.06246638321557e-08, + "logits/chosen": -1.8768688440322876, + "logits/rejected": -2.0317156314849854, + "logps/chosen": -2.3081698417663574, + "logps/rejected": -2.3036158084869385, + "loss": 3.8499, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -23.081695556640625, + "rewards/margins": -0.045539092272520065, + "rewards/rejected": -23.036157608032227, + "step": 25440 + }, + { + "epoch": 0.8576291752334086, + "grad_norm": 24.27766227722168, + "learning_rate": 6.048435498590366e-08, + "logits/chosen": -0.9871917963027954, + "logits/rejected": -1.2592726945877075, + "logps/chosen": -3.0321362018585205, + "logps/rejected": -3.322997570037842, + "loss": 2.0057, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.321359634399414, + "rewards/margins": 2.9086170196533203, + "rewards/rejected": -33.22998046875, + "step": 25445 + }, + { + "epoch": 0.8577977013043918, + "grad_norm": 85.73827362060547, + "learning_rate": 6.034419823870012e-08, + "logits/chosen": -2.156827926635742, + "logits/rejected": -2.120016574859619, + "logps/chosen": -2.8094635009765625, + "logps/rejected": -2.7601757049560547, + "loss": 4.3052, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.094635009765625, + "rewards/margins": -0.4928779602050781, + "rewards/rejected": -27.601755142211914, + "step": 25450 + }, + { + "epoch": 0.857966227375375, + "grad_norm": 30.530527114868164, + "learning_rate": 6.020419363904783e-08, + "logits/chosen": -1.977306604385376, + "logits/rejected": -2.1351287364959717, + "logps/chosen": -3.0446646213531494, + "logps/rejected": -3.5633749961853027, + "loss": 1.9825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.4466495513916, + "rewards/margins": 5.187103271484375, + "rewards/rejected": -35.633750915527344, + "step": 25455 + }, + { + "epoch": 0.8581347534463581, + "grad_norm": 171.41098022460938, + "learning_rate": 6.0064341235397e-08, + "logits/chosen": -1.567341923713684, + "logits/rejected": -1.6634018421173096, + "logps/chosen": -3.08695650100708, + "logps/rejected": -2.7849831581115723, + "loss": 6.2163, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.86956787109375, + "rewards/margins": -3.019737720489502, + "rewards/rejected": -27.849828720092773, + "step": 25460 + }, + { + "epoch": 0.8583032795173413, + "grad_norm": 61.39971160888672, + "learning_rate": 5.992464107614475e-08, + "logits/chosen": -1.4290657043457031, + "logits/rejected": -1.4986751079559326, + "logps/chosen": -2.2665841579437256, + "logps/rejected": -2.458251953125, + "loss": 2.5043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.66584014892578, + "rewards/margins": 1.91667902469635, + "rewards/rejected": -24.58251953125, + "step": 25465 + }, + { + "epoch": 0.8584718055883245, + "grad_norm": 27.6011962890625, + "learning_rate": 5.978509320963593e-08, + "logits/chosen": -1.7557885646820068, + "logits/rejected": -1.6499592065811157, + "logps/chosen": -2.4323935508728027, + "logps/rejected": -2.376591444015503, + "loss": 4.2695, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.32393455505371, + "rewards/margins": -0.558021068572998, + "rewards/rejected": -23.765914916992188, + "step": 25470 + }, + { + "epoch": 0.8586403316593076, + "grad_norm": 42.387786865234375, + "learning_rate": 5.964569768416261e-08, + "logits/chosen": -1.96505868434906, + "logits/rejected": -2.3774993419647217, + "logps/chosen": -2.227853298187256, + "logps/rejected": -2.4281139373779297, + "loss": 2.3639, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.278533935546875, + "rewards/margins": 2.0026066303253174, + "rewards/rejected": -24.28114128112793, + "step": 25475 + }, + { + "epoch": 0.8588088577302909, + "grad_norm": 51.79988479614258, + "learning_rate": 5.950645454796416e-08, + "logits/chosen": -2.068077802658081, + "logits/rejected": -2.061128854751587, + "logps/chosen": -2.6374306678771973, + "logps/rejected": -2.715590238571167, + "loss": 4.849, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.37430763244629, + "rewards/margins": 0.7815954089164734, + "rewards/rejected": -27.155902862548828, + "step": 25480 + }, + { + "epoch": 0.8589773838012741, + "grad_norm": 0.0009354325011372566, + "learning_rate": 5.936736384922691e-08, + "logits/chosen": -1.0480647087097168, + "logits/rejected": -1.550246238708496, + "logps/chosen": -1.7990306615829468, + "logps/rejected": -2.71217679977417, + "loss": 0.5304, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.990306854248047, + "rewards/margins": 9.13145923614502, + "rewards/rejected": -27.12176513671875, + "step": 25485 + }, + { + "epoch": 0.8591459098722573, + "grad_norm": 14.77606201171875, + "learning_rate": 5.9228425636084824e-08, + "logits/chosen": -1.341812252998352, + "logits/rejected": -2.1341910362243652, + "logps/chosen": -2.212036609649658, + "logps/rejected": -2.9296481609344482, + "loss": 3.4865, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.1203670501709, + "rewards/margins": 7.1761155128479, + "rewards/rejected": -29.29648208618164, + "step": 25490 + }, + { + "epoch": 0.8593144359432404, + "grad_norm": 68.93023681640625, + "learning_rate": 5.908963995661892e-08, + "logits/chosen": -1.236950397491455, + "logits/rejected": -1.241420030593872, + "logps/chosen": -2.6127872467041016, + "logps/rejected": -2.8742825984954834, + "loss": 2.2873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.12787437438965, + "rewards/margins": 2.614952325820923, + "rewards/rejected": -28.742828369140625, + "step": 25495 + }, + { + "epoch": 0.8594829620142236, + "grad_norm": 10.785711288452148, + "learning_rate": 5.895100685885745e-08, + "logits/chosen": -1.7537428140640259, + "logits/rejected": -1.8643490076065063, + "logps/chosen": -2.1532981395721436, + "logps/rejected": -2.436422824859619, + "loss": 1.1377, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.53297996520996, + "rewards/margins": 2.8312461376190186, + "rewards/rejected": -24.364227294921875, + "step": 25500 + }, + { + "epoch": 0.8596514880852067, + "grad_norm": 42.79972457885742, + "learning_rate": 5.881252639077583e-08, + "logits/chosen": -1.8643391132354736, + "logits/rejected": -1.891916036605835, + "logps/chosen": -2.975830554962158, + "logps/rejected": -3.31604266166687, + "loss": 3.4354, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.7583065032959, + "rewards/margins": 3.402117967605591, + "rewards/rejected": -33.160423278808594, + "step": 25505 + }, + { + "epoch": 0.8598200141561899, + "grad_norm": 42.81157302856445, + "learning_rate": 5.867419860029688e-08, + "logits/chosen": -1.5780177116394043, + "logits/rejected": -2.0716984272003174, + "logps/chosen": -2.1099133491516113, + "logps/rejected": -3.069995164871216, + "loss": 2.5412, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.099132537841797, + "rewards/margins": 9.600818634033203, + "rewards/rejected": -30.699951171875, + "step": 25510 + }, + { + "epoch": 0.8599885402271732, + "grad_norm": 28.209131240844727, + "learning_rate": 5.8536023535290134e-08, + "logits/chosen": -1.9131946563720703, + "logits/rejected": -2.2496771812438965, + "logps/chosen": -1.9939839839935303, + "logps/rejected": -2.5202078819274902, + "loss": 1.3354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.93984031677246, + "rewards/margins": 5.262241840362549, + "rewards/rejected": -25.202083587646484, + "step": 25515 + }, + { + "epoch": 0.8601570662981564, + "grad_norm": 57.89436340332031, + "learning_rate": 5.839800124357264e-08, + "logits/chosen": -1.967872977256775, + "logits/rejected": -1.999243140220642, + "logps/chosen": -2.5514752864837646, + "logps/rejected": -2.5840890407562256, + "loss": 3.0422, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.514755249023438, + "rewards/margins": 0.3261362910270691, + "rewards/rejected": -25.840890884399414, + "step": 25520 + }, + { + "epoch": 0.8603255923691395, + "grad_norm": 25.394502639770508, + "learning_rate": 5.8260131772908504e-08, + "logits/chosen": -1.37850022315979, + "logits/rejected": -1.5792301893234253, + "logps/chosen": -2.2832894325256348, + "logps/rejected": -2.744271755218506, + "loss": 2.6536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.8328914642334, + "rewards/margins": 4.609823703765869, + "rewards/rejected": -27.442718505859375, + "step": 25525 + }, + { + "epoch": 0.8604941184401227, + "grad_norm": 22.741680145263672, + "learning_rate": 5.812241517100902e-08, + "logits/chosen": -2.1774706840515137, + "logits/rejected": -2.6091322898864746, + "logps/chosen": -2.7606756687164307, + "logps/rejected": -3.212062358856201, + "loss": 2.5673, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.60675621032715, + "rewards/margins": 4.513869762420654, + "rewards/rejected": -32.120628356933594, + "step": 25530 + }, + { + "epoch": 0.8606626445111059, + "grad_norm": 26.45964241027832, + "learning_rate": 5.7984851485532284e-08, + "logits/chosen": -1.8604751825332642, + "logits/rejected": -1.7118768692016602, + "logps/chosen": -3.140791654586792, + "logps/rejected": -2.977461338043213, + "loss": 5.6263, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -31.407917022705078, + "rewards/margins": -1.6333030462265015, + "rewards/rejected": -29.774616241455078, + "step": 25535 + }, + { + "epoch": 0.860831170582089, + "grad_norm": 32.870521545410156, + "learning_rate": 5.784744076408371e-08, + "logits/chosen": -1.555826187133789, + "logits/rejected": -1.7253106832504272, + "logps/chosen": -2.182738780975342, + "logps/rejected": -2.4404165744781494, + "loss": 2.0564, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.8273868560791, + "rewards/margins": 2.576777935028076, + "rewards/rejected": -24.404163360595703, + "step": 25540 + }, + { + "epoch": 0.8609996966530722, + "grad_norm": 6.715198516845703, + "learning_rate": 5.771018305421588e-08, + "logits/chosen": -1.9538259506225586, + "logits/rejected": -2.2901089191436768, + "logps/chosen": -2.500892400741577, + "logps/rejected": -2.8917453289031982, + "loss": 3.0437, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.008922576904297, + "rewards/margins": 3.9085326194763184, + "rewards/rejected": -28.917455673217773, + "step": 25545 + }, + { + "epoch": 0.8611682227240555, + "grad_norm": 36.01880645751953, + "learning_rate": 5.757307840342807e-08, + "logits/chosen": -1.9131215810775757, + "logits/rejected": -1.9809229373931885, + "logps/chosen": -2.1885132789611816, + "logps/rejected": -2.4454269409179688, + "loss": 2.4507, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.8851318359375, + "rewards/margins": 2.569138288497925, + "rewards/rejected": -24.45427131652832, + "step": 25550 + }, + { + "epoch": 0.8613367487950386, + "grad_norm": 156.0620880126953, + "learning_rate": 5.743612685916688e-08, + "logits/chosen": -1.8777625560760498, + "logits/rejected": -1.917538046836853, + "logps/chosen": -3.0116991996765137, + "logps/rejected": -2.9883906841278076, + "loss": 4.478, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.116992950439453, + "rewards/margins": -0.23308487236499786, + "rewards/rejected": -29.8839054107666, + "step": 25555 + }, + { + "epoch": 0.8615052748660218, + "grad_norm": 517.7734985351562, + "learning_rate": 5.72993284688259e-08, + "logits/chosen": -1.6766964197158813, + "logits/rejected": -1.5441253185272217, + "logps/chosen": -2.456958055496216, + "logps/rejected": -2.3334765434265137, + "loss": 4.4397, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.569580078125, + "rewards/margins": -1.2348133325576782, + "rewards/rejected": -23.334766387939453, + "step": 25560 + }, + { + "epoch": 0.861673800937005, + "grad_norm": 36.23284912109375, + "learning_rate": 5.7162683279745715e-08, + "logits/chosen": -2.2240848541259766, + "logits/rejected": -2.341034412384033, + "logps/chosen": -2.754894971847534, + "logps/rejected": -2.6347315311431885, + "loss": 4.8563, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -27.5489501953125, + "rewards/margins": -1.201634407043457, + "rewards/rejected": -26.347314834594727, + "step": 25565 + }, + { + "epoch": 0.8618423270079881, + "grad_norm": 17.289134979248047, + "learning_rate": 5.7026191339213655e-08, + "logits/chosen": -2.197456121444702, + "logits/rejected": -2.4068684577941895, + "logps/chosen": -2.623168468475342, + "logps/rejected": -3.1079468727111816, + "loss": 1.5349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.231685638427734, + "rewards/margins": 4.847784519195557, + "rewards/rejected": -31.079471588134766, + "step": 25570 + }, + { + "epoch": 0.8620108530789713, + "grad_norm": 39.08678436279297, + "learning_rate": 5.688985269446428e-08, + "logits/chosen": -1.9539051055908203, + "logits/rejected": -1.9796451330184937, + "logps/chosen": -2.2235019207000732, + "logps/rejected": -2.5791525840759277, + "loss": 2.3173, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.23501968383789, + "rewards/margins": 3.556506395339966, + "rewards/rejected": -25.79152488708496, + "step": 25575 + }, + { + "epoch": 0.8621793791499545, + "grad_norm": 27.619142532348633, + "learning_rate": 5.675366739267917e-08, + "logits/chosen": -1.809984803199768, + "logits/rejected": -2.1333956718444824, + "logps/chosen": -2.193274974822998, + "logps/rejected": -2.231731414794922, + "loss": 2.9135, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.932748794555664, + "rewards/margins": 0.38456735014915466, + "rewards/rejected": -22.31731605529785, + "step": 25580 + }, + { + "epoch": 0.8623479052209376, + "grad_norm": 33.4080696105957, + "learning_rate": 5.661763548098647e-08, + "logits/chosen": -1.8288103342056274, + "logits/rejected": -1.947704553604126, + "logps/chosen": -2.4664788246154785, + "logps/rejected": -2.844200611114502, + "loss": 2.104, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.6647891998291, + "rewards/margins": 3.7772185802459717, + "rewards/rejected": -28.442005157470703, + "step": 25585 + }, + { + "epoch": 0.8625164312919209, + "grad_norm": 16.371057510375977, + "learning_rate": 5.648175700646152e-08, + "logits/chosen": -1.4505841732025146, + "logits/rejected": -1.5557242631912231, + "logps/chosen": -2.552976131439209, + "logps/rejected": -3.2438766956329346, + "loss": 1.9105, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.52976417541504, + "rewards/margins": 6.909002780914307, + "rewards/rejected": -32.43876647949219, + "step": 25590 + }, + { + "epoch": 0.8626849573629041, + "grad_norm": 26.774486541748047, + "learning_rate": 5.6346032016126585e-08, + "logits/chosen": -1.6347877979278564, + "logits/rejected": -1.58437979221344, + "logps/chosen": -2.2510108947753906, + "logps/rejected": -2.346017360687256, + "loss": 3.0834, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.510108947753906, + "rewards/margins": 0.9500652551651001, + "rewards/rejected": -23.460172653198242, + "step": 25595 + }, + { + "epoch": 0.8628534834338872, + "grad_norm": 32.13697052001953, + "learning_rate": 5.621046055695078e-08, + "logits/chosen": -1.5043491125106812, + "logits/rejected": -2.238797426223755, + "logps/chosen": -2.6151254177093506, + "logps/rejected": -3.404712677001953, + "loss": 2.7441, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.1512508392334, + "rewards/margins": 7.895873069763184, + "rewards/rejected": -34.047122955322266, + "step": 25600 + }, + { + "epoch": 0.8628534834338872, + "eval_logits/chosen": -2.3047690391540527, + "eval_logits/rejected": -2.4822678565979004, + "eval_logps/chosen": -2.284496545791626, + "eval_logps/rejected": -2.4387638568878174, + "eval_loss": 3.0866053104400635, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.84496307373047, + "eval_rewards/margins": 1.5426740646362305, + "eval_rewards/rejected": -24.38763999938965, + "eval_runtime": 12.8886, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 1.94, + "step": 25600 + }, + { + "epoch": 0.8630220095048704, + "grad_norm": 36.613040924072266, + "learning_rate": 5.6075042675849896e-08, + "logits/chosen": -1.3739955425262451, + "logits/rejected": -1.321131944656372, + "logps/chosen": -2.1500613689422607, + "logps/rejected": -2.17360520362854, + "loss": 4.0157, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.500614166259766, + "rewards/margins": 0.23543719947338104, + "rewards/rejected": -21.736053466796875, + "step": 25605 + }, + { + "epoch": 0.8631905355758536, + "grad_norm": 2.961085796356201, + "learning_rate": 5.593977841968678e-08, + "logits/chosen": -1.9631197452545166, + "logits/rejected": -2.171518325805664, + "logps/chosen": -2.591064691543579, + "logps/rejected": -2.8690671920776367, + "loss": 1.9633, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.91064453125, + "rewards/margins": 2.7800254821777344, + "rewards/rejected": -28.690670013427734, + "step": 25610 + }, + { + "epoch": 0.8633590616468367, + "grad_norm": 35.69943618774414, + "learning_rate": 5.580466783527116e-08, + "logits/chosen": -1.8889795541763306, + "logits/rejected": -2.0595734119415283, + "logps/chosen": -1.7020518779754639, + "logps/rejected": -1.756344199180603, + "loss": 2.9206, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.020517349243164, + "rewards/margins": 0.5429241061210632, + "rewards/rejected": -17.563440322875977, + "step": 25615 + }, + { + "epoch": 0.8635275877178199, + "grad_norm": 0.00012647907715290785, + "learning_rate": 5.566971096935935e-08, + "logits/chosen": -1.9607864618301392, + "logits/rejected": -2.334564685821533, + "logps/chosen": -2.574162244796753, + "logps/rejected": -3.8408493995666504, + "loss": 1.7019, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.741619110107422, + "rewards/margins": 12.666872024536133, + "rewards/rejected": -38.40849304199219, + "step": 25620 + }, + { + "epoch": 0.8636961137888032, + "grad_norm": 7.7881598472595215, + "learning_rate": 5.5534907868654615e-08, + "logits/chosen": -2.3653645515441895, + "logits/rejected": -2.4128830432891846, + "logps/chosen": -2.2764759063720703, + "logps/rejected": -2.3745293617248535, + "loss": 4.8364, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.764759063720703, + "rewards/margins": 0.9805337190628052, + "rewards/rejected": -23.745290756225586, + "step": 25625 + }, + { + "epoch": 0.8638646398597863, + "grad_norm": 29.671714782714844, + "learning_rate": 5.540025857980707e-08, + "logits/chosen": -1.6906957626342773, + "logits/rejected": -2.0325279235839844, + "logps/chosen": -1.7866567373275757, + "logps/rejected": -1.9048383235931396, + "loss": 2.205, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.866567611694336, + "rewards/margins": 1.1818159818649292, + "rewards/rejected": -19.048383712768555, + "step": 25630 + }, + { + "epoch": 0.8640331659307695, + "grad_norm": 44.803955078125, + "learning_rate": 5.52657631494135e-08, + "logits/chosen": -1.8340305089950562, + "logits/rejected": -1.7409149408340454, + "logps/chosen": -2.6429009437561035, + "logps/rejected": -3.0195226669311523, + "loss": 3.688, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.429006576538086, + "rewards/margins": 3.7662174701690674, + "rewards/rejected": -30.195226669311523, + "step": 25635 + }, + { + "epoch": 0.8642016920017527, + "grad_norm": 59.5605583190918, + "learning_rate": 5.513142162401746e-08, + "logits/chosen": -1.4266693592071533, + "logits/rejected": -1.5339360237121582, + "logps/chosen": -1.7503912448883057, + "logps/rejected": -1.7414312362670898, + "loss": 3.3267, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.50391387939453, + "rewards/margins": -0.08959989249706268, + "rewards/rejected": -17.4143123626709, + "step": 25640 + }, + { + "epoch": 0.8643702180727358, + "grad_norm": 112.27110290527344, + "learning_rate": 5.4997234050109365e-08, + "logits/chosen": -2.070991277694702, + "logits/rejected": -2.8527050018310547, + "logps/chosen": -2.9822659492492676, + "logps/rejected": -3.033463478088379, + "loss": 3.4673, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.822656631469727, + "rewards/margins": 0.5119756460189819, + "rewards/rejected": -30.334630966186523, + "step": 25645 + }, + { + "epoch": 0.864538744143719, + "grad_norm": 45.651275634765625, + "learning_rate": 5.486320047412607e-08, + "logits/chosen": -2.4427013397216797, + "logits/rejected": -2.478302001953125, + "logps/chosen": -3.080040216445923, + "logps/rejected": -3.496181011199951, + "loss": 2.1427, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.800403594970703, + "rewards/margins": 4.161408424377441, + "rewards/rejected": -34.96180725097656, + "step": 25650 + }, + { + "epoch": 0.8647072702147022, + "grad_norm": 128.43927001953125, + "learning_rate": 5.4729320942451417e-08, + "logits/chosen": -2.290976047515869, + "logits/rejected": -2.463477611541748, + "logps/chosen": -2.309473752975464, + "logps/rejected": -2.7989444732666016, + "loss": 1.4247, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.094736099243164, + "rewards/margins": 4.894708156585693, + "rewards/rejected": -27.98944664001465, + "step": 25655 + }, + { + "epoch": 0.8648757962856854, + "grad_norm": 33.28281021118164, + "learning_rate": 5.459559550141579e-08, + "logits/chosen": -1.9118398427963257, + "logits/rejected": -2.3658390045166016, + "logps/chosen": -2.0546092987060547, + "logps/rejected": -2.8094446659088135, + "loss": 1.5514, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.546092987060547, + "rewards/margins": 7.548353672027588, + "rewards/rejected": -28.094446182250977, + "step": 25660 + }, + { + "epoch": 0.8650443223566686, + "grad_norm": 112.78716278076172, + "learning_rate": 5.446202419729634e-08, + "logits/chosen": -2.431443691253662, + "logits/rejected": -2.530750274658203, + "logps/chosen": -3.1777215003967285, + "logps/rejected": -3.2929470539093018, + "loss": 6.256, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.7772159576416, + "rewards/margins": 1.1522538661956787, + "rewards/rejected": -32.929466247558594, + "step": 25665 + }, + { + "epoch": 0.8652128484276518, + "grad_norm": 18.13178062438965, + "learning_rate": 5.432860707631692e-08, + "logits/chosen": -1.9325920343399048, + "logits/rejected": -2.1956613063812256, + "logps/chosen": -2.1996188163757324, + "logps/rejected": -2.414581775665283, + "loss": 2.0302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.996187210083008, + "rewards/margins": 2.149627685546875, + "rewards/rejected": -24.145816802978516, + "step": 25670 + }, + { + "epoch": 0.865381374498635, + "grad_norm": 18.623947143554688, + "learning_rate": 5.419534418464772e-08, + "logits/chosen": -1.4391006231307983, + "logits/rejected": -1.5431092977523804, + "logps/chosen": -2.181546688079834, + "logps/rejected": -2.4147443771362305, + "loss": 1.8495, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.815465927124023, + "rewards/margins": 2.331977367401123, + "rewards/rejected": -24.147441864013672, + "step": 25675 + }, + { + "epoch": 0.8655499005696181, + "grad_norm": 38.271759033203125, + "learning_rate": 5.406223556840594e-08, + "logits/chosen": -1.714686393737793, + "logits/rejected": -2.1915745735168457, + "logps/chosen": -1.994512915611267, + "logps/rejected": -2.8179125785827637, + "loss": 2.5671, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.94512939453125, + "rewards/margins": 8.233994483947754, + "rewards/rejected": -28.179126739501953, + "step": 25680 + }, + { + "epoch": 0.8657184266406013, + "grad_norm": 22.912878036499023, + "learning_rate": 5.3929281273655255e-08, + "logits/chosen": -2.155128002166748, + "logits/rejected": -2.2736685276031494, + "logps/chosen": -2.6940789222717285, + "logps/rejected": -2.630833864212036, + "loss": 3.7778, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.940786361694336, + "rewards/margins": -0.6324483156204224, + "rewards/rejected": -26.308338165283203, + "step": 25685 + }, + { + "epoch": 0.8658869527115844, + "grad_norm": 18.871042251586914, + "learning_rate": 5.379648134640574e-08, + "logits/chosen": -2.1314008235931396, + "logits/rejected": -2.387678861618042, + "logps/chosen": -2.2513153553009033, + "logps/rejected": -2.300379753112793, + "loss": 3.4506, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.513153076171875, + "rewards/margins": 0.49064674973487854, + "rewards/rejected": -23.003799438476562, + "step": 25690 + }, + { + "epoch": 0.8660554787825676, + "grad_norm": 13.355888366699219, + "learning_rate": 5.36638358326143e-08, + "logits/chosen": -1.9050413370132446, + "logits/rejected": -2.75042724609375, + "logps/chosen": -2.610729932785034, + "logps/rejected": -3.4782874584198, + "loss": 1.1352, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.1072998046875, + "rewards/margins": 8.67557430267334, + "rewards/rejected": -34.782875061035156, + "step": 25695 + }, + { + "epoch": 0.8662240048535509, + "grad_norm": 28.12961196899414, + "learning_rate": 5.353134477818444e-08, + "logits/chosen": -1.6968810558319092, + "logits/rejected": -1.7211993932724, + "logps/chosen": -1.8958885669708252, + "logps/rejected": -2.062389612197876, + "loss": 2.7024, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.958887100219727, + "rewards/margins": 1.665008544921875, + "rewards/rejected": -20.6238956451416, + "step": 25700 + }, + { + "epoch": 0.866392530924534, + "grad_norm": 41.80595779418945, + "learning_rate": 5.3399008228965926e-08, + "logits/chosen": -1.6834443807601929, + "logits/rejected": -1.737204909324646, + "logps/chosen": -2.680849313735962, + "logps/rejected": -2.803086519241333, + "loss": 2.2861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.80849266052246, + "rewards/margins": 1.2223708629608154, + "rewards/rejected": -28.03086280822754, + "step": 25705 + }, + { + "epoch": 0.8665610569955172, + "grad_norm": 32.27495574951172, + "learning_rate": 5.3266826230755234e-08, + "logits/chosen": -1.3415998220443726, + "logits/rejected": -1.842403769493103, + "logps/chosen": -2.7923684120178223, + "logps/rejected": -3.519589900970459, + "loss": 0.7945, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.923686981201172, + "rewards/margins": 7.272216796875, + "rewards/rejected": -35.195899963378906, + "step": 25710 + }, + { + "epoch": 0.8667295830665004, + "grad_norm": 23.474517822265625, + "learning_rate": 5.313479882929545e-08, + "logits/chosen": -1.6315898895263672, + "logits/rejected": -1.831046462059021, + "logps/chosen": -2.6967477798461914, + "logps/rejected": -2.7854743003845215, + "loss": 3.9346, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.967477798461914, + "rewards/margins": 0.8872681856155396, + "rewards/rejected": -27.854745864868164, + "step": 25715 + }, + { + "epoch": 0.8668981091374836, + "grad_norm": 63.77497482299805, + "learning_rate": 5.3002926070276065e-08, + "logits/chosen": -1.9106180667877197, + "logits/rejected": -2.332305908203125, + "logps/chosen": -2.7835779190063477, + "logps/rejected": -2.867992877960205, + "loss": 2.92, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.835779190063477, + "rewards/margins": 0.8441485166549683, + "rewards/rejected": -28.679927825927734, + "step": 25720 + }, + { + "epoch": 0.8670666352084667, + "grad_norm": 43.49818420410156, + "learning_rate": 5.2871207999332866e-08, + "logits/chosen": -1.9081672430038452, + "logits/rejected": -2.2695870399475098, + "logps/chosen": -3.0630500316619873, + "logps/rejected": -3.210258960723877, + "loss": 2.3696, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.630502700805664, + "rewards/margins": 1.472088098526001, + "rewards/rejected": -32.10258865356445, + "step": 25725 + }, + { + "epoch": 0.8672351612794499, + "grad_norm": 42.61958312988281, + "learning_rate": 5.273964466204844e-08, + "logits/chosen": -1.745123267173767, + "logits/rejected": -2.2044224739074707, + "logps/chosen": -2.550452709197998, + "logps/rejected": -2.8602192401885986, + "loss": 2.4402, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.504528045654297, + "rewards/margins": 3.0976648330688477, + "rewards/rejected": -28.60219383239746, + "step": 25730 + }, + { + "epoch": 0.8674036873504332, + "grad_norm": 20.689411163330078, + "learning_rate": 5.260823610395177e-08, + "logits/chosen": -1.9419893026351929, + "logits/rejected": -2.1809206008911133, + "logps/chosen": -1.7887070178985596, + "logps/rejected": -1.9395878314971924, + "loss": 2.787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.887067794799805, + "rewards/margins": 1.5088112354278564, + "rewards/rejected": -19.395877838134766, + "step": 25735 + }, + { + "epoch": 0.8675722134214163, + "grad_norm": 37.88859939575195, + "learning_rate": 5.2476982370517895e-08, + "logits/chosen": -2.154304265975952, + "logits/rejected": -2.248838424682617, + "logps/chosen": -2.048656940460205, + "logps/rejected": -2.063976764678955, + "loss": 3.0845, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.4865665435791, + "rewards/margins": 0.15320205688476562, + "rewards/rejected": -20.639766693115234, + "step": 25740 + }, + { + "epoch": 0.8677407394923995, + "grad_norm": 32.98319625854492, + "learning_rate": 5.234588350716879e-08, + "logits/chosen": -1.9644311666488647, + "logits/rejected": -2.0829384326934814, + "logps/chosen": -2.6873435974121094, + "logps/rejected": -3.071718692779541, + "loss": 2.2025, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.873437881469727, + "rewards/margins": 3.843745470046997, + "rewards/rejected": -30.717182159423828, + "step": 25745 + }, + { + "epoch": 0.8679092655633827, + "grad_norm": 60.62992858886719, + "learning_rate": 5.2214939559272474e-08, + "logits/chosen": -1.1950910091400146, + "logits/rejected": -1.9384139776229858, + "logps/chosen": -2.483628034591675, + "logps/rejected": -3.1380577087402344, + "loss": 2.8334, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.83627700805664, + "rewards/margins": 6.544297218322754, + "rewards/rejected": -31.38057518005371, + "step": 25750 + }, + { + "epoch": 0.8680777916343658, + "grad_norm": 1.7990132570266724, + "learning_rate": 5.208415057214366e-08, + "logits/chosen": -1.8497015237808228, + "logits/rejected": -2.1536459922790527, + "logps/chosen": -2.151120901107788, + "logps/rejected": -2.640761137008667, + "loss": 1.3812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.51120948791504, + "rewards/margins": 4.896402359008789, + "rewards/rejected": -26.40761375427246, + "step": 25755 + }, + { + "epoch": 0.868246317705349, + "grad_norm": 83.80754852294922, + "learning_rate": 5.195351659104308e-08, + "logits/chosen": -2.031888723373413, + "logits/rejected": -1.8762810230255127, + "logps/chosen": -2.5077152252197266, + "logps/rejected": -2.4214892387390137, + "loss": 5.0355, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.077150344848633, + "rewards/margins": -0.8622571229934692, + "rewards/rejected": -24.214893341064453, + "step": 25760 + }, + { + "epoch": 0.8684148437763322, + "grad_norm": 23.788127899169922, + "learning_rate": 5.182303766117807e-08, + "logits/chosen": -1.7114553451538086, + "logits/rejected": -1.8850023746490479, + "logps/chosen": -2.594632625579834, + "logps/rejected": -2.736550807952881, + "loss": 3.2035, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.946325302124023, + "rewards/margins": 1.4191802740097046, + "rewards/rejected": -27.36550521850586, + "step": 25765 + }, + { + "epoch": 0.8685833698473154, + "grad_norm": 31.160070419311523, + "learning_rate": 5.169271382770224e-08, + "logits/chosen": -2.122729778289795, + "logits/rejected": -2.1520445346832275, + "logps/chosen": -2.044503688812256, + "logps/rejected": -2.158705949783325, + "loss": 2.3285, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.445035934448242, + "rewards/margins": 1.1420232057571411, + "rewards/rejected": -21.587059020996094, + "step": 25770 + }, + { + "epoch": 0.8687518959182986, + "grad_norm": 37.71006774902344, + "learning_rate": 5.1562545135715676e-08, + "logits/chosen": -2.032743453979492, + "logits/rejected": -1.8914331197738647, + "logps/chosen": -2.209226608276367, + "logps/rejected": -2.465100049972534, + "loss": 3.1635, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.092266082763672, + "rewards/margins": 2.5587353706359863, + "rewards/rejected": -24.6510009765625, + "step": 25775 + }, + { + "epoch": 0.8689204219892818, + "grad_norm": 26.651168823242188, + "learning_rate": 5.14325316302644e-08, + "logits/chosen": -1.7310502529144287, + "logits/rejected": -1.8083226680755615, + "logps/chosen": -2.462341785430908, + "logps/rejected": -2.4454903602600098, + "loss": 3.7791, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.623416900634766, + "rewards/margins": -0.16851606965065002, + "rewards/rejected": -24.45490264892578, + "step": 25780 + }, + { + "epoch": 0.8690889480602649, + "grad_norm": 101.77996063232422, + "learning_rate": 5.130267335634103e-08, + "logits/chosen": -1.7319138050079346, + "logits/rejected": -1.8407793045043945, + "logps/chosen": -2.481172561645508, + "logps/rejected": -2.444258213043213, + "loss": 3.4822, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.811725616455078, + "rewards/margins": -0.3691454827785492, + "rewards/rejected": -24.442581176757812, + "step": 25785 + }, + { + "epoch": 0.8692574741312481, + "grad_norm": 96.85649108886719, + "learning_rate": 5.117297035888451e-08, + "logits/chosen": -1.9268567562103271, + "logits/rejected": -2.3404312133789062, + "logps/chosen": -2.1492671966552734, + "logps/rejected": -2.1940503120422363, + "loss": 3.1017, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.492671966552734, + "rewards/margins": 0.44782838225364685, + "rewards/rejected": -21.940500259399414, + "step": 25790 + }, + { + "epoch": 0.8694260002022313, + "grad_norm": 53.152503967285156, + "learning_rate": 5.1043422682779837e-08, + "logits/chosen": -1.959204912185669, + "logits/rejected": -2.3568460941314697, + "logps/chosen": -2.5286505222320557, + "logps/rejected": -2.8187003135681152, + "loss": 2.6502, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.2865047454834, + "rewards/margins": 2.9004974365234375, + "rewards/rejected": -28.187002182006836, + "step": 25795 + }, + { + "epoch": 0.8695945262732144, + "grad_norm": 36.7203254699707, + "learning_rate": 5.091403037285841e-08, + "logits/chosen": -2.4292044639587402, + "logits/rejected": -2.6584384441375732, + "logps/chosen": -2.8159584999084473, + "logps/rejected": -3.5781993865966797, + "loss": 1.4842, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.15958595275879, + "rewards/margins": 7.622408390045166, + "rewards/rejected": -35.7819938659668, + "step": 25800 + }, + { + "epoch": 0.8697630523441976, + "grad_norm": 12.048297882080078, + "learning_rate": 5.078479347389786e-08, + "logits/chosen": -1.6142889261245728, + "logits/rejected": -2.0917158126831055, + "logps/chosen": -1.6926759481430054, + "logps/rejected": -2.018369674682617, + "loss": 1.713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -16.926759719848633, + "rewards/margins": 3.2569377422332764, + "rewards/rejected": -20.183696746826172, + "step": 25805 + }, + { + "epoch": 0.8699315784151809, + "grad_norm": 17.070327758789062, + "learning_rate": 5.065571203062186e-08, + "logits/chosen": -2.337498903274536, + "logits/rejected": -2.5879263877868652, + "logps/chosen": -2.1566128730773926, + "logps/rejected": -2.3633060455322266, + "loss": 2.4247, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.56612777709961, + "rewards/margins": 2.066929578781128, + "rewards/rejected": -23.633060455322266, + "step": 25810 + }, + { + "epoch": 0.870100104486164, + "grad_norm": 39.20555114746094, + "learning_rate": 5.0526786087700446e-08, + "logits/chosen": -2.312941551208496, + "logits/rejected": -2.5641887187957764, + "logps/chosen": -2.590254783630371, + "logps/rejected": -2.675225019454956, + "loss": 3.7626, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.902551651000977, + "rewards/margins": 0.8497023582458496, + "rewards/rejected": -26.75225257873535, + "step": 25815 + }, + { + "epoch": 0.8702686305571472, + "grad_norm": 9.575358853908256e-05, + "learning_rate": 5.039801568974983e-08, + "logits/chosen": -2.032660961151123, + "logits/rejected": -2.1150026321411133, + "logps/chosen": -3.270076036453247, + "logps/rejected": -4.2751970291137695, + "loss": 1.2892, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -32.70075607299805, + "rewards/margins": 10.05120849609375, + "rewards/rejected": -42.75196838378906, + "step": 25820 + }, + { + "epoch": 0.8704371566281304, + "grad_norm": 21.379125595092773, + "learning_rate": 5.0269400881332415e-08, + "logits/chosen": -1.2569396495819092, + "logits/rejected": -2.036778450012207, + "logps/chosen": -1.7933368682861328, + "logps/rejected": -2.9262733459472656, + "loss": 1.9382, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.933368682861328, + "rewards/margins": 11.329366683959961, + "rewards/rejected": -29.26273536682129, + "step": 25825 + }, + { + "epoch": 0.8706056826991135, + "grad_norm": 15.96414852142334, + "learning_rate": 5.014094170695665e-08, + "logits/chosen": -1.8142383098602295, + "logits/rejected": -2.0156233310699463, + "logps/chosen": -2.004096508026123, + "logps/rejected": -2.1798839569091797, + "loss": 2.7522, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.040966033935547, + "rewards/margins": 1.7578716278076172, + "rewards/rejected": -21.798837661743164, + "step": 25830 + }, + { + "epoch": 0.8707742087700967, + "grad_norm": 20.892578125, + "learning_rate": 5.0012638211077205e-08, + "logits/chosen": -2.0305237770080566, + "logits/rejected": -2.6673262119293213, + "logps/chosen": -2.4386634826660156, + "logps/rejected": -2.5656256675720215, + "loss": 3.8098, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.386632919311523, + "rewards/margins": 1.269622802734375, + "rewards/rejected": -25.6562557220459, + "step": 25835 + }, + { + "epoch": 0.8709427348410799, + "grad_norm": 32.19673156738281, + "learning_rate": 4.988449043809495e-08, + "logits/chosen": -1.9713561534881592, + "logits/rejected": -1.8127353191375732, + "logps/chosen": -2.835386276245117, + "logps/rejected": -2.4795279502868652, + "loss": 7.098, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.353862762451172, + "rewards/margins": -3.558582305908203, + "rewards/rejected": -24.79528045654297, + "step": 25840 + }, + { + "epoch": 0.8711112609120631, + "grad_norm": 14.6963472366333, + "learning_rate": 4.975649843235663e-08, + "logits/chosen": -1.4432746171951294, + "logits/rejected": -1.6536836624145508, + "logps/chosen": -2.234740972518921, + "logps/rejected": -2.4873673915863037, + "loss": 2.6795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.347408294677734, + "rewards/margins": 2.5262651443481445, + "rewards/rejected": -24.873674392700195, + "step": 25845 + }, + { + "epoch": 0.8712797869830463, + "grad_norm": 27.49606704711914, + "learning_rate": 4.9628662238155375e-08, + "logits/chosen": -2.0613508224487305, + "logits/rejected": -2.6506898403167725, + "logps/chosen": -2.446141242980957, + "logps/rejected": -2.9206721782684326, + "loss": 2.1984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.46141242980957, + "rewards/margins": 4.745309352874756, + "rewards/rejected": -29.206722259521484, + "step": 25850 + }, + { + "epoch": 0.8714483130540295, + "grad_norm": 82.02774810791016, + "learning_rate": 4.950098189973012e-08, + "logits/chosen": -2.126680374145508, + "logits/rejected": -2.285808801651001, + "logps/chosen": -2.632131576538086, + "logps/rejected": -2.502464771270752, + "loss": 4.6964, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.32131576538086, + "rewards/margins": -1.296666145324707, + "rewards/rejected": -25.024648666381836, + "step": 25855 + }, + { + "epoch": 0.8716168391250126, + "grad_norm": 19.395328521728516, + "learning_rate": 4.9373457461266196e-08, + "logits/chosen": -2.0173676013946533, + "logits/rejected": -2.2325971126556396, + "logps/chosen": -2.738555669784546, + "logps/rejected": -3.175352096557617, + "loss": 2.064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.385555267333984, + "rewards/margins": 4.367962837219238, + "rewards/rejected": -31.753520965576172, + "step": 25860 + }, + { + "epoch": 0.8717853651959958, + "grad_norm": 79.3759994506836, + "learning_rate": 4.9246088966894586e-08, + "logits/chosen": -1.773151159286499, + "logits/rejected": -1.7250587940216064, + "logps/chosen": -2.4536876678466797, + "logps/rejected": -2.5901026725769043, + "loss": 3.0366, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.536874771118164, + "rewards/margins": 1.364152193069458, + "rewards/rejected": -25.90102767944336, + "step": 25865 + }, + { + "epoch": 0.871953891266979, + "grad_norm": 258.291259765625, + "learning_rate": 4.911887646069257e-08, + "logits/chosen": -1.7847461700439453, + "logits/rejected": -1.9289964437484741, + "logps/chosen": -2.952078104019165, + "logps/rejected": -2.842191219329834, + "loss": 4.685, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.520776748657227, + "rewards/margins": -1.0988658666610718, + "rewards/rejected": -28.42191505432129, + "step": 25870 + }, + { + "epoch": 0.8721224173379621, + "grad_norm": 53.33089065551758, + "learning_rate": 4.8991819986683506e-08, + "logits/chosen": -2.109039068222046, + "logits/rejected": -2.342543125152588, + "logps/chosen": -2.3845467567443848, + "logps/rejected": -2.819420337677002, + "loss": 2.336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.845470428466797, + "rewards/margins": 4.348735809326172, + "rewards/rejected": -28.194204330444336, + "step": 25875 + }, + { + "epoch": 0.8722909434089454, + "grad_norm": 34.33763122558594, + "learning_rate": 4.8864919588836425e-08, + "logits/chosen": -1.5160837173461914, + "logits/rejected": -1.320216417312622, + "logps/chosen": -2.544429302215576, + "logps/rejected": -2.622082471847534, + "loss": 3.5551, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.444292068481445, + "rewards/margins": 0.7765324711799622, + "rewards/rejected": -26.2208251953125, + "step": 25880 + }, + { + "epoch": 0.8724594694799286, + "grad_norm": 72.6720962524414, + "learning_rate": 4.8738175311066665e-08, + "logits/chosen": -1.8243458271026611, + "logits/rejected": -1.7153728008270264, + "logps/chosen": -2.4128189086914062, + "logps/rejected": -2.5200035572052, + "loss": 2.5664, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.128189086914062, + "rewards/margins": 1.071846604347229, + "rewards/rejected": -25.200035095214844, + "step": 25885 + }, + { + "epoch": 0.8726279955509118, + "grad_norm": 27.856830596923828, + "learning_rate": 4.861158719723546e-08, + "logits/chosen": -1.793646216392517, + "logits/rejected": -1.7063287496566772, + "logps/chosen": -2.146275043487549, + "logps/rejected": -2.327519416809082, + "loss": 2.6674, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.462749481201172, + "rewards/margins": 1.8124430179595947, + "rewards/rejected": -23.275196075439453, + "step": 25890 + }, + { + "epoch": 0.8727965216218949, + "grad_norm": 56.65886688232422, + "learning_rate": 4.848515529114999e-08, + "logits/chosen": -1.4978837966918945, + "logits/rejected": -1.857505440711975, + "logps/chosen": -2.9919285774230957, + "logps/rejected": -3.0575897693634033, + "loss": 4.2514, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.919286727905273, + "rewards/margins": 0.6566106081008911, + "rewards/rejected": -30.575897216796875, + "step": 25895 + }, + { + "epoch": 0.8729650476928781, + "grad_norm": 80.9780502319336, + "learning_rate": 4.835887963656321e-08, + "logits/chosen": -1.6397136449813843, + "logits/rejected": -1.7722076177597046, + "logps/chosen": -2.625100612640381, + "logps/rejected": -3.0324058532714844, + "loss": 3.1167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.251007080078125, + "rewards/margins": 4.073052883148193, + "rewards/rejected": -30.32405662536621, + "step": 25900 + }, + { + "epoch": 0.8731335737638612, + "grad_norm": 42.08506774902344, + "learning_rate": 4.823276027717427e-08, + "logits/chosen": -1.019819974899292, + "logits/rejected": -1.865755319595337, + "logps/chosen": -2.091823101043701, + "logps/rejected": -2.5190513134002686, + "loss": 1.7671, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.918230056762695, + "rewards/margins": 4.272280693054199, + "rewards/rejected": -25.19051170349121, + "step": 25905 + }, + { + "epoch": 0.8733020998348444, + "grad_norm": 30.96110725402832, + "learning_rate": 4.810679725662814e-08, + "logits/chosen": -1.2372939586639404, + "logits/rejected": -1.3814033269882202, + "logps/chosen": -2.071434497833252, + "logps/rejected": -2.412691354751587, + "loss": 1.7943, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.714344024658203, + "rewards/margins": 3.4125685691833496, + "rewards/rejected": -24.12691307067871, + "step": 25910 + }, + { + "epoch": 0.8734706259058276, + "grad_norm": 24.661306381225586, + "learning_rate": 4.798099061851546e-08, + "logits/chosen": -1.249774694442749, + "logits/rejected": -1.1749866008758545, + "logps/chosen": -2.4756853580474854, + "logps/rejected": -2.424419403076172, + "loss": 3.7921, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.756851196289062, + "rewards/margins": -0.5126598477363586, + "rewards/rejected": -24.24419403076172, + "step": 25915 + }, + { + "epoch": 0.8736391519768109, + "grad_norm": 22.849403381347656, + "learning_rate": 4.785534040637318e-08, + "logits/chosen": -2.1637206077575684, + "logits/rejected": -2.4329028129577637, + "logps/chosen": -2.336874008178711, + "logps/rejected": -2.866360902786255, + "loss": 1.73, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.36874008178711, + "rewards/margins": 5.294869422912598, + "rewards/rejected": -28.663610458374023, + "step": 25920 + }, + { + "epoch": 0.873807678047794, + "grad_norm": 16.065292358398438, + "learning_rate": 4.7729846663683734e-08, + "logits/chosen": -1.3076118230819702, + "logits/rejected": -1.5740753412246704, + "logps/chosen": -2.1626815795898438, + "logps/rejected": -2.5529274940490723, + "loss": 2.3872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.626815795898438, + "rewards/margins": 3.902461290359497, + "rewards/rejected": -25.52927589416504, + "step": 25925 + }, + { + "epoch": 0.8739762041187772, + "grad_norm": 57.940948486328125, + "learning_rate": 4.7604509433875674e-08, + "logits/chosen": -1.8605324029922485, + "logits/rejected": -1.996279001235962, + "logps/chosen": -2.231952428817749, + "logps/rejected": -2.156428098678589, + "loss": 4.6811, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.31952667236328, + "rewards/margins": -0.7552453875541687, + "rewards/rejected": -21.564281463623047, + "step": 25930 + }, + { + "epoch": 0.8741447301897604, + "grad_norm": 132.3805389404297, + "learning_rate": 4.747932876032318e-08, + "logits/chosen": -2.017028331756592, + "logits/rejected": -1.755444884300232, + "logps/chosen": -3.1288609504699707, + "logps/rejected": -3.00249981880188, + "loss": 4.5781, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -31.288610458374023, + "rewards/margins": -1.2636101245880127, + "rewards/rejected": -30.02499771118164, + "step": 25935 + }, + { + "epoch": 0.8743132562607435, + "grad_norm": 77.25647735595703, + "learning_rate": 4.7354304686346436e-08, + "logits/chosen": -1.965309739112854, + "logits/rejected": -2.3678011894226074, + "logps/chosen": -2.950040102005005, + "logps/rejected": -4.675734996795654, + "loss": 1.7058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.50040054321289, + "rewards/margins": 17.25695037841797, + "rewards/rejected": -46.75735092163086, + "step": 25940 + }, + { + "epoch": 0.8744817823317267, + "grad_norm": 21.622243881225586, + "learning_rate": 4.7229437255211394e-08, + "logits/chosen": -1.2503682374954224, + "logits/rejected": -1.5136396884918213, + "logps/chosen": -2.392681837081909, + "logps/rejected": -2.7116854190826416, + "loss": 2.1726, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.926820755004883, + "rewards/margins": 3.1900343894958496, + "rewards/rejected": -27.11685562133789, + "step": 25945 + }, + { + "epoch": 0.8746503084027099, + "grad_norm": 18.216819763183594, + "learning_rate": 4.710472651012953e-08, + "logits/chosen": -1.824148416519165, + "logits/rejected": -2.0601725578308105, + "logps/chosen": -2.657719135284424, + "logps/rejected": -2.9636952877044678, + "loss": 3.0157, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.577190399169922, + "rewards/margins": 3.059760570526123, + "rewards/rejected": -29.636951446533203, + "step": 25950 + }, + { + "epoch": 0.8748188344736931, + "grad_norm": 18.032672882080078, + "learning_rate": 4.6980172494258505e-08, + "logits/chosen": -2.3354194164276123, + "logits/rejected": -2.417893886566162, + "logps/chosen": -2.268496036529541, + "logps/rejected": -2.806499481201172, + "loss": 2.4009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.68495750427246, + "rewards/margins": 5.380034923553467, + "rewards/rejected": -28.064992904663086, + "step": 25955 + }, + { + "epoch": 0.8749873605446763, + "grad_norm": 31.231897354125977, + "learning_rate": 4.68557752507015e-08, + "logits/chosen": -1.844909906387329, + "logits/rejected": -1.9287885427474976, + "logps/chosen": -1.8651247024536133, + "logps/rejected": -2.1045570373535156, + "loss": 1.6024, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.651248931884766, + "rewards/margins": 2.3943233489990234, + "rewards/rejected": -21.045570373535156, + "step": 25960 + }, + { + "epoch": 0.8751558866156595, + "grad_norm": 92.8125, + "learning_rate": 4.673153482250763e-08, + "logits/chosen": -2.0548250675201416, + "logits/rejected": -1.8608801364898682, + "logps/chosen": -2.5349326133728027, + "logps/rejected": -2.7556471824645996, + "loss": 2.8412, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.349323272705078, + "rewards/margins": 2.2071495056152344, + "rewards/rejected": -27.556472778320312, + "step": 25965 + }, + { + "epoch": 0.8753244126866426, + "grad_norm": 57.874595642089844, + "learning_rate": 4.66074512526714e-08, + "logits/chosen": -1.6241235733032227, + "logits/rejected": -1.7461202144622803, + "logps/chosen": -2.530500888824463, + "logps/rejected": -2.4094245433807373, + "loss": 4.4387, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.305007934570312, + "rewards/margins": -1.2107617855072021, + "rewards/rejected": -24.0942440032959, + "step": 25970 + }, + { + "epoch": 0.8754929387576258, + "grad_norm": 64.47271728515625, + "learning_rate": 4.648352458413329e-08, + "logits/chosen": -1.1945641040802002, + "logits/rejected": -1.3513495922088623, + "logps/chosen": -2.8699965476989746, + "logps/rejected": -3.5107929706573486, + "loss": 1.2196, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.699966430664062, + "rewards/margins": 6.40796422958374, + "rewards/rejected": -35.10792922973633, + "step": 25975 + }, + { + "epoch": 0.875661464828609, + "grad_norm": 102.32059478759766, + "learning_rate": 4.635975485977961e-08, + "logits/chosen": -1.6279737949371338, + "logits/rejected": -1.6798328161239624, + "logps/chosen": -3.12237286567688, + "logps/rejected": -3.4952163696289062, + "loss": 3.6478, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.22372817993164, + "rewards/margins": 3.7284350395202637, + "rewards/rejected": -34.95216369628906, + "step": 25980 + }, + { + "epoch": 0.8758299908995921, + "grad_norm": 20.18497085571289, + "learning_rate": 4.623614212244198e-08, + "logits/chosen": -1.6860414743423462, + "logits/rejected": -1.9204658269882202, + "logps/chosen": -2.681396961212158, + "logps/rejected": -3.1187262535095215, + "loss": 1.9732, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.813968658447266, + "rewards/margins": 4.373295783996582, + "rewards/rejected": -31.1872615814209, + "step": 25985 + }, + { + "epoch": 0.8759985169705754, + "grad_norm": 30.457250595092773, + "learning_rate": 4.611268641489796e-08, + "logits/chosen": -2.0881166458129883, + "logits/rejected": -2.1515793800354004, + "logps/chosen": -2.6387476921081543, + "logps/rejected": -2.6230766773223877, + "loss": 4.4297, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.387475967407227, + "rewards/margins": -0.15670490264892578, + "rewards/rejected": -26.230770111083984, + "step": 25990 + }, + { + "epoch": 0.8761670430415586, + "grad_norm": 98.93780517578125, + "learning_rate": 4.5989387779870716e-08, + "logits/chosen": -2.187082529067993, + "logits/rejected": -2.343308925628662, + "logps/chosen": -2.834789752960205, + "logps/rejected": -2.9538867473602295, + "loss": 2.9953, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.347896575927734, + "rewards/margins": 1.190970778465271, + "rewards/rejected": -29.538867950439453, + "step": 25995 + }, + { + "epoch": 0.8763355691125417, + "grad_norm": 38.53907775878906, + "learning_rate": 4.586624626002916e-08, + "logits/chosen": -1.936500906944275, + "logits/rejected": -2.458440065383911, + "logps/chosen": -2.3346505165100098, + "logps/rejected": -2.676438093185425, + "loss": 1.4801, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.34650421142578, + "rewards/margins": 3.417875289916992, + "rewards/rejected": -26.764379501342773, + "step": 26000 + }, + { + "epoch": 0.8763355691125417, + "eval_logits/chosen": -2.305676221847534, + "eval_logits/rejected": -2.4827284812927246, + "eval_logps/chosen": -2.2852182388305664, + "eval_logps/rejected": -2.440098762512207, + "eval_loss": 3.083927631378174, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.852182388305664, + "eval_rewards/margins": 1.548802137374878, + "eval_rewards/rejected": -24.400985717773438, + "eval_runtime": 12.8886, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 1.94, + "step": 26000 + }, + { + "epoch": 0.8765040951835249, + "grad_norm": 18.142152786254883, + "learning_rate": 4.574326189798755e-08, + "logits/chosen": -1.9278751611709595, + "logits/rejected": -2.1549553871154785, + "logps/chosen": -2.518207550048828, + "logps/rejected": -3.117119550704956, + "loss": 2.1305, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.18207359313965, + "rewards/margins": 5.989120960235596, + "rewards/rejected": -31.17119789123535, + "step": 26005 + }, + { + "epoch": 0.8766726212545081, + "grad_norm": 42.60398483276367, + "learning_rate": 4.562043473630595e-08, + "logits/chosen": -2.0454294681549072, + "logits/rejected": -2.2847232818603516, + "logps/chosen": -2.7390663623809814, + "logps/rejected": -2.778416156768799, + "loss": 2.8106, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.390661239624023, + "rewards/margins": 0.39350032806396484, + "rewards/rejected": -27.784160614013672, + "step": 26010 + }, + { + "epoch": 0.8768411473254912, + "grad_norm": 33.4304084777832, + "learning_rate": 4.549776481749018e-08, + "logits/chosen": -1.3675581216812134, + "logits/rejected": -1.3049651384353638, + "logps/chosen": -2.0452096462249756, + "logps/rejected": -2.0717859268188477, + "loss": 3.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.452096939086914, + "rewards/margins": 0.2657632827758789, + "rewards/rejected": -20.71786117553711, + "step": 26015 + }, + { + "epoch": 0.8770096733964744, + "grad_norm": 48.97243881225586, + "learning_rate": 4.537525218399124e-08, + "logits/chosen": -1.8618223667144775, + "logits/rejected": -1.8452171087265015, + "logps/chosen": -2.2914299964904785, + "logps/rejected": -2.3020145893096924, + "loss": 4.5788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.91429901123047, + "rewards/margins": 0.10584697872400284, + "rewards/rejected": -23.020145416259766, + "step": 26020 + }, + { + "epoch": 0.8771781994674576, + "grad_norm": 43.23052978515625, + "learning_rate": 4.525289687820599e-08, + "logits/chosen": -2.1048130989074707, + "logits/rejected": -2.744718074798584, + "logps/chosen": -3.903913974761963, + "logps/rejected": -4.380173206329346, + "loss": 2.1283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -39.03913497924805, + "rewards/margins": 4.76259708404541, + "rewards/rejected": -43.801734924316406, + "step": 26025 + }, + { + "epoch": 0.8773467255384408, + "grad_norm": 200.03939819335938, + "learning_rate": 4.5130698942476843e-08, + "logits/chosen": -1.6560029983520508, + "logits/rejected": -1.8514435291290283, + "logps/chosen": -2.5157668590545654, + "logps/rejected": -2.5783932209014893, + "loss": 3.1845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.157665252685547, + "rewards/margins": 0.626263439655304, + "rewards/rejected": -25.783931732177734, + "step": 26030 + }, + { + "epoch": 0.877515251609424, + "grad_norm": 27.59550666809082, + "learning_rate": 4.5008658419091686e-08, + "logits/chosen": -1.7573442459106445, + "logits/rejected": -1.9589412212371826, + "logps/chosen": -2.2838704586029053, + "logps/rejected": -2.5899243354797363, + "loss": 1.5338, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.83870506286621, + "rewards/margins": 3.0605380535125732, + "rewards/rejected": -25.899242401123047, + "step": 26035 + }, + { + "epoch": 0.8776837776804072, + "grad_norm": 32.58989334106445, + "learning_rate": 4.48867753502839e-08, + "logits/chosen": -2.00469708442688, + "logits/rejected": -2.1252007484436035, + "logps/chosen": -2.224625587463379, + "logps/rejected": -2.202439785003662, + "loss": 3.6983, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.246257781982422, + "rewards/margins": -0.22186097502708435, + "rewards/rejected": -22.024394989013672, + "step": 26040 + }, + { + "epoch": 0.8778523037513903, + "grad_norm": 32.600189208984375, + "learning_rate": 4.476504977823237e-08, + "logits/chosen": -1.4882522821426392, + "logits/rejected": -1.4651497602462769, + "logps/chosen": -2.2987568378448486, + "logps/rejected": -2.344611406326294, + "loss": 3.488, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.987567901611328, + "rewards/margins": 0.4585467278957367, + "rewards/rejected": -23.44611358642578, + "step": 26045 + }, + { + "epoch": 0.8780208298223735, + "grad_norm": 22.328445434570312, + "learning_rate": 4.4643481745061664e-08, + "logits/chosen": -0.9219368696212769, + "logits/rejected": -1.0782215595245361, + "logps/chosen": -2.841899871826172, + "logps/rejected": -3.263303279876709, + "loss": 0.9888, + "rewards/accuracies": 1.0, + "rewards/chosen": -28.41900062561035, + "rewards/margins": 4.214035987854004, + "rewards/rejected": -32.633033752441406, + "step": 26050 + }, + { + "epoch": 0.8781893558933567, + "grad_norm": 81.40164184570312, + "learning_rate": 4.4522071292841524e-08, + "logits/chosen": -1.9810707569122314, + "logits/rejected": -2.1055080890655518, + "logps/chosen": -2.5610156059265137, + "logps/rejected": -3.1644368171691895, + "loss": 2.5395, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.610157012939453, + "rewards/margins": 6.034211158752441, + "rewards/rejected": -31.64436912536621, + "step": 26055 + }, + { + "epoch": 0.8783578819643398, + "grad_norm": 28.70875358581543, + "learning_rate": 4.440081846358734e-08, + "logits/chosen": -1.7828556299209595, + "logits/rejected": -1.9503099918365479, + "logps/chosen": -3.098968744277954, + "logps/rejected": -3.4416584968566895, + "loss": 3.4186, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.989688873291016, + "rewards/margins": 3.4268956184387207, + "rewards/rejected": -34.41658401489258, + "step": 26060 + }, + { + "epoch": 0.8785264080353231, + "grad_norm": 39.028114318847656, + "learning_rate": 4.4279723299260053e-08, + "logits/chosen": -2.1715588569641113, + "logits/rejected": -1.9619592428207397, + "logps/chosen": -2.756772518157959, + "logps/rejected": -2.7361457347869873, + "loss": 4.1748, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.567724227905273, + "rewards/margins": -0.20626945793628693, + "rewards/rejected": -27.3614559173584, + "step": 26065 + }, + { + "epoch": 0.8786949341063063, + "grad_norm": 25.182750701904297, + "learning_rate": 4.415878584176586e-08, + "logits/chosen": -1.9110186100006104, + "logits/rejected": -2.0006096363067627, + "logps/chosen": -1.853926420211792, + "logps/rejected": -2.0113346576690674, + "loss": 2.167, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.539264678955078, + "rewards/margins": 1.5740830898284912, + "rewards/rejected": -20.11334800720215, + "step": 26070 + }, + { + "epoch": 0.8788634601772894, + "grad_norm": 27.858596801757812, + "learning_rate": 4.4038006132956554e-08, + "logits/chosen": -2.290611505508423, + "logits/rejected": -2.5834479331970215, + "logps/chosen": -2.9701969623565674, + "logps/rejected": -3.535025119781494, + "loss": 3.1378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.70197105407715, + "rewards/margins": 5.64827823638916, + "rewards/rejected": -35.35024642944336, + "step": 26075 + }, + { + "epoch": 0.8790319862482726, + "grad_norm": 34.910274505615234, + "learning_rate": 4.3917384214629035e-08, + "logits/chosen": -2.2534584999084473, + "logits/rejected": -2.124990940093994, + "logps/chosen": -2.2603816986083984, + "logps/rejected": -2.226010322570801, + "loss": 3.7679, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.603816986083984, + "rewards/margins": -0.3437148928642273, + "rewards/rejected": -22.260103225708008, + "step": 26080 + }, + { + "epoch": 0.8792005123192558, + "grad_norm": 18.28013038635254, + "learning_rate": 4.3796920128525927e-08, + "logits/chosen": -1.7659509181976318, + "logits/rejected": -2.554335117340088, + "logps/chosen": -2.34553861618042, + "logps/rejected": -3.6819541454315186, + "loss": 3.374, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.455387115478516, + "rewards/margins": 13.364153861999512, + "rewards/rejected": -36.819541931152344, + "step": 26085 + }, + { + "epoch": 0.8793690383902389, + "grad_norm": 95.5963363647461, + "learning_rate": 4.367661391633504e-08, + "logits/chosen": -2.4856085777282715, + "logits/rejected": -2.6605522632598877, + "logps/chosen": -3.307602643966675, + "logps/rejected": -3.840097427368164, + "loss": 3.1067, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.076026916503906, + "rewards/margins": 5.324949741363525, + "rewards/rejected": -38.400978088378906, + "step": 26090 + }, + { + "epoch": 0.8795375644612221, + "grad_norm": 39.2291259765625, + "learning_rate": 4.355646561968968e-08, + "logits/chosen": -2.148571491241455, + "logits/rejected": -2.437236785888672, + "logps/chosen": -2.1277217864990234, + "logps/rejected": -2.223153591156006, + "loss": 2.7293, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.2772159576416, + "rewards/margins": 0.9543191194534302, + "rewards/rejected": -22.231534957885742, + "step": 26095 + }, + { + "epoch": 0.8797060905322054, + "grad_norm": 31.748767852783203, + "learning_rate": 4.343647528016842e-08, + "logits/chosen": -2.2617831230163574, + "logits/rejected": -2.2275707721710205, + "logps/chosen": -2.0656771659851074, + "logps/rejected": -2.070960521697998, + "loss": 4.1375, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.656770706176758, + "rewards/margins": 0.05283470079302788, + "rewards/rejected": -20.709606170654297, + "step": 26100 + }, + { + "epoch": 0.8798746166031886, + "grad_norm": 29.608827590942383, + "learning_rate": 4.331664293929521e-08, + "logits/chosen": -2.194572925567627, + "logits/rejected": -2.268022060394287, + "logps/chosen": -2.4366941452026367, + "logps/rejected": -2.6624813079833984, + "loss": 2.9006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.366941452026367, + "rewards/margins": 2.2578701972961426, + "rewards/rejected": -26.62481117248535, + "step": 26105 + }, + { + "epoch": 0.8800431426741717, + "grad_norm": 34.71430587768555, + "learning_rate": 4.3196968638539224e-08, + "logits/chosen": -1.5167404413223267, + "logits/rejected": -1.8624671697616577, + "logps/chosen": -2.115821361541748, + "logps/rejected": -2.019786834716797, + "loss": 4.231, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.158214569091797, + "rewards/margins": -0.9603476524353027, + "rewards/rejected": -20.197866439819336, + "step": 26110 + }, + { + "epoch": 0.8802116687451549, + "grad_norm": 30.95269775390625, + "learning_rate": 4.30774524193151e-08, + "logits/chosen": -1.7086502313613892, + "logits/rejected": -1.7376470565795898, + "logps/chosen": -2.0986180305480957, + "logps/rejected": -2.125507354736328, + "loss": 2.8685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.98617935180664, + "rewards/margins": 0.2688938081264496, + "rewards/rejected": -21.25507164001465, + "step": 26115 + }, + { + "epoch": 0.880380194816138, + "grad_norm": 30.516836166381836, + "learning_rate": 4.29580943229827e-08, + "logits/chosen": -1.6464307308197021, + "logits/rejected": -2.455376148223877, + "logps/chosen": -1.8904457092285156, + "logps/rejected": -2.61106538772583, + "loss": 2.0733, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.904457092285156, + "rewards/margins": 7.206197261810303, + "rewards/rejected": -26.110652923583984, + "step": 26120 + }, + { + "epoch": 0.8805487208871212, + "grad_norm": 10.775001525878906, + "learning_rate": 4.283889439084709e-08, + "logits/chosen": -1.5866488218307495, + "logits/rejected": -1.9692538976669312, + "logps/chosen": -2.8774304389953613, + "logps/rejected": -3.5010387897491455, + "loss": 2.0928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.774303436279297, + "rewards/margins": 6.236086368560791, + "rewards/rejected": -35.01038360595703, + "step": 26125 + }, + { + "epoch": 0.8807172469581044, + "grad_norm": 38.87605667114258, + "learning_rate": 4.2719852664158673e-08, + "logits/chosen": -1.7289457321166992, + "logits/rejected": -2.259916305541992, + "logps/chosen": -2.7738230228424072, + "logps/rejected": -3.8895785808563232, + "loss": 2.9428, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.738229751586914, + "rewards/margins": 11.15755844116211, + "rewards/rejected": -38.895790100097656, + "step": 26130 + }, + { + "epoch": 0.8808857730290875, + "grad_norm": 4.833995342254639, + "learning_rate": 4.26009691841131e-08, + "logits/chosen": -2.0140976905822754, + "logits/rejected": -2.562851667404175, + "logps/chosen": -2.6443378925323486, + "logps/rejected": -3.1197803020477295, + "loss": 2.7417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.443378448486328, + "rewards/margins": 4.754426002502441, + "rewards/rejected": -31.197805404663086, + "step": 26135 + }, + { + "epoch": 0.8810542991000708, + "grad_norm": 20.582605361938477, + "learning_rate": 4.2482243991851405e-08, + "logits/chosen": -1.6481196880340576, + "logits/rejected": -2.0094666481018066, + "logps/chosen": -2.9820003509521484, + "logps/rejected": -3.30322003364563, + "loss": 2.1949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.82000160217285, + "rewards/margins": 3.212195873260498, + "rewards/rejected": -33.03219985961914, + "step": 26140 + }, + { + "epoch": 0.881222825171054, + "grad_norm": 17.267221450805664, + "learning_rate": 4.236367712845951e-08, + "logits/chosen": -1.5387892723083496, + "logits/rejected": -1.8134196996688843, + "logps/chosen": -2.9204771518707275, + "logps/rejected": -3.2025482654571533, + "loss": 2.188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.20477294921875, + "rewards/margins": 2.820711612701416, + "rewards/rejected": -32.025482177734375, + "step": 26145 + }, + { + "epoch": 0.8813913512420372, + "grad_norm": 127.3127212524414, + "learning_rate": 4.22452686349688e-08, + "logits/chosen": -1.8970210552215576, + "logits/rejected": -1.8642895221710205, + "logps/chosen": -3.043975353240967, + "logps/rejected": -3.17755389213562, + "loss": 2.802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.43975257873535, + "rewards/margins": 1.3357876539230347, + "rewards/rejected": -31.77553939819336, + "step": 26150 + }, + { + "epoch": 0.8815598773130203, + "grad_norm": 28.518186569213867, + "learning_rate": 4.21270185523559e-08, + "logits/chosen": -1.7622315883636475, + "logits/rejected": -2.2663474082946777, + "logps/chosen": -2.4464478492736816, + "logps/rejected": -2.898188829421997, + "loss": 3.1741, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.464481353759766, + "rewards/margins": 4.5174055099487305, + "rewards/rejected": -28.981884002685547, + "step": 26155 + }, + { + "epoch": 0.8817284033840035, + "grad_norm": 130.8482208251953, + "learning_rate": 4.2008926921542285e-08, + "logits/chosen": -1.9923702478408813, + "logits/rejected": -2.0565195083618164, + "logps/chosen": -2.6995887756347656, + "logps/rejected": -2.7402122020721436, + "loss": 5.303, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.995885848999023, + "rewards/margins": 0.4062366485595703, + "rewards/rejected": -27.402124404907227, + "step": 26160 + }, + { + "epoch": 0.8818969294549867, + "grad_norm": 79.5093994140625, + "learning_rate": 4.189099378339495e-08, + "logits/chosen": -2.1510725021362305, + "logits/rejected": -2.7977688312530518, + "logps/chosen": -2.7006373405456543, + "logps/rejected": -3.728480577468872, + "loss": 2.0349, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.006372451782227, + "rewards/margins": 10.278432846069336, + "rewards/rejected": -37.2848014831543, + "step": 26165 + }, + { + "epoch": 0.8820654555259698, + "grad_norm": 37.60076141357422, + "learning_rate": 4.177321917872589e-08, + "logits/chosen": -1.7785587310791016, + "logits/rejected": -2.1146063804626465, + "logps/chosen": -2.0391170978546143, + "logps/rejected": -2.5872554779052734, + "loss": 1.6566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.391170501708984, + "rewards/margins": 5.481385707855225, + "rewards/rejected": -25.872554779052734, + "step": 26170 + }, + { + "epoch": 0.8822339815969531, + "grad_norm": 33.79237365722656, + "learning_rate": 4.165560314829236e-08, + "logits/chosen": -1.832585334777832, + "logits/rejected": -1.8615341186523438, + "logps/chosen": -2.010195732116699, + "logps/rejected": -2.08314847946167, + "loss": 2.6019, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.10195541381836, + "rewards/margins": 0.7295287847518921, + "rewards/rejected": -20.831485748291016, + "step": 26175 + }, + { + "epoch": 0.8824025076679363, + "grad_norm": 131.44534301757812, + "learning_rate": 4.153814573279646e-08, + "logits/chosen": -1.9326139688491821, + "logits/rejected": -1.7735908031463623, + "logps/chosen": -3.275007724761963, + "logps/rejected": -3.273677110671997, + "loss": 4.6015, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -32.75007629394531, + "rewards/margins": -0.01330490130931139, + "rewards/rejected": -32.73677444458008, + "step": 26180 + }, + { + "epoch": 0.8825710337389194, + "grad_norm": 26.220003128051758, + "learning_rate": 4.1420846972885613e-08, + "logits/chosen": -1.9239784479141235, + "logits/rejected": -1.7237510681152344, + "logps/chosen": -2.5050339698791504, + "logps/rejected": -2.7856767177581787, + "loss": 4.8332, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.050338745117188, + "rewards/margins": 2.8064308166503906, + "rewards/rejected": -27.856769561767578, + "step": 26185 + }, + { + "epoch": 0.8827395598099026, + "grad_norm": 335.8841552734375, + "learning_rate": 4.1303706909152414e-08, + "logits/chosen": -2.3145358562469482, + "logits/rejected": -2.4512641429901123, + "logps/chosen": -2.4827306270599365, + "logps/rejected": -2.8225104808807373, + "loss": 1.8292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.827306747436523, + "rewards/margins": 3.397799015045166, + "rewards/rejected": -28.225107192993164, + "step": 26190 + }, + { + "epoch": 0.8829080858808858, + "grad_norm": 46.0804328918457, + "learning_rate": 4.1186725582134264e-08, + "logits/chosen": -1.4715369939804077, + "logits/rejected": -1.7713149785995483, + "logps/chosen": -2.4654946327209473, + "logps/rejected": -2.7149832248687744, + "loss": 3.7104, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.65494728088379, + "rewards/margins": 2.494884490966797, + "rewards/rejected": -27.149831771850586, + "step": 26195 + }, + { + "epoch": 0.8830766119518689, + "grad_norm": 37.92697525024414, + "learning_rate": 4.106990303231389e-08, + "logits/chosen": -2.0186607837677, + "logits/rejected": -1.8988367319107056, + "logps/chosen": -2.6057591438293457, + "logps/rejected": -2.699638843536377, + "loss": 3.3298, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.057592391967773, + "rewards/margins": 0.9387954473495483, + "rewards/rejected": -26.996387481689453, + "step": 26200 + }, + { + "epoch": 0.8832451380228521, + "grad_norm": 40.06204605102539, + "learning_rate": 4.0953239300119016e-08, + "logits/chosen": -1.4002172946929932, + "logits/rejected": -1.7599208354949951, + "logps/chosen": -2.041703462600708, + "logps/rejected": -2.2836692333221436, + "loss": 2.1846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.417034149169922, + "rewards/margins": 2.41965913772583, + "rewards/rejected": -22.836694717407227, + "step": 26205 + }, + { + "epoch": 0.8834136640938354, + "grad_norm": 42.123268127441406, + "learning_rate": 4.083673442592217e-08, + "logits/chosen": -1.9567673206329346, + "logits/rejected": -2.1912286281585693, + "logps/chosen": -2.125568151473999, + "logps/rejected": -2.2718589305877686, + "loss": 2.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.25568199157715, + "rewards/margins": 1.4629076719284058, + "rewards/rejected": -22.718591690063477, + "step": 26210 + }, + { + "epoch": 0.8835821901648185, + "grad_norm": 127.27120208740234, + "learning_rate": 4.072038845004128e-08, + "logits/chosen": -1.8005714416503906, + "logits/rejected": -2.0479798316955566, + "logps/chosen": -2.3783552646636963, + "logps/rejected": -2.9461581707000732, + "loss": 1.5435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.783552169799805, + "rewards/margins": 5.678030490875244, + "rewards/rejected": -29.46158218383789, + "step": 26215 + }, + { + "epoch": 0.8837507162358017, + "grad_norm": 36.57089614868164, + "learning_rate": 4.060420141273907e-08, + "logits/chosen": -1.8329858779907227, + "logits/rejected": -1.9664011001586914, + "logps/chosen": -1.9984794855117798, + "logps/rejected": -2.5146331787109375, + "loss": 1.5813, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.984792709350586, + "rewards/margins": 5.161539554595947, + "rewards/rejected": -25.14633560180664, + "step": 26220 + }, + { + "epoch": 0.8839192423067849, + "grad_norm": 19.129188537597656, + "learning_rate": 4.048817335422327e-08, + "logits/chosen": -1.6715996265411377, + "logits/rejected": -1.9743512868881226, + "logps/chosen": -2.968926429748535, + "logps/rejected": -3.169696092605591, + "loss": 4.191, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.68926429748535, + "rewards/margins": 2.0076935291290283, + "rewards/rejected": -31.69696044921875, + "step": 26225 + }, + { + "epoch": 0.884087768377768, + "grad_norm": 1.8036651611328125, + "learning_rate": 4.037230431464661e-08, + "logits/chosen": -1.4815946817398071, + "logits/rejected": -1.510654091835022, + "logps/chosen": -1.9119043350219727, + "logps/rejected": -2.0924339294433594, + "loss": 2.2048, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.119043350219727, + "rewards/margins": 1.805295705795288, + "rewards/rejected": -20.92433738708496, + "step": 26230 + }, + { + "epoch": 0.8842562944487512, + "grad_norm": 51.39049530029297, + "learning_rate": 4.025659433410683e-08, + "logits/chosen": -1.695776343345642, + "logits/rejected": -2.1014962196350098, + "logps/chosen": -2.6338090896606445, + "logps/rejected": -3.248582124710083, + "loss": 1.1106, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.338092803955078, + "rewards/margins": 6.1477274894714355, + "rewards/rejected": -32.485816955566406, + "step": 26235 + }, + { + "epoch": 0.8844248205197344, + "grad_norm": 13.321321487426758, + "learning_rate": 4.014104345264663e-08, + "logits/chosen": -1.6323163509368896, + "logits/rejected": -1.794264554977417, + "logps/chosen": -3.0402426719665527, + "logps/rejected": -2.900519847869873, + "loss": 5.725, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.402429580688477, + "rewards/margins": -1.3972291946411133, + "rewards/rejected": -29.005199432373047, + "step": 26240 + }, + { + "epoch": 0.8845933465907175, + "grad_norm": 35.8294677734375, + "learning_rate": 4.002565171025352e-08, + "logits/chosen": -2.30120849609375, + "logits/rejected": -2.334444761276245, + "logps/chosen": -2.891282320022583, + "logps/rejected": -4.049437522888184, + "loss": 1.6171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.91282081604004, + "rewards/margins": 11.581551551818848, + "rewards/rejected": -40.4943733215332, + "step": 26245 + }, + { + "epoch": 0.8847618726617008, + "grad_norm": 15.660163879394531, + "learning_rate": 3.991041914686011e-08, + "logits/chosen": -1.8731937408447266, + "logits/rejected": -1.8050944805145264, + "logps/chosen": -2.4904088973999023, + "logps/rejected": -3.021503210067749, + "loss": 2.0161, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -24.904088973999023, + "rewards/margins": 5.310944557189941, + "rewards/rejected": -30.21503257751465, + "step": 26250 + }, + { + "epoch": 0.884930398732684, + "grad_norm": 21.406620025634766, + "learning_rate": 3.979534580234378e-08, + "logits/chosen": -1.6166706085205078, + "logits/rejected": -2.2093665599823, + "logps/chosen": -2.2637438774108887, + "logps/rejected": -2.64788556098938, + "loss": 1.6625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.637439727783203, + "rewards/margins": 3.841416835784912, + "rewards/rejected": -26.478857040405273, + "step": 26255 + }, + { + "epoch": 0.8850989248036671, + "grad_norm": 164.92747497558594, + "learning_rate": 3.968043171652707e-08, + "logits/chosen": -2.1059999465942383, + "logits/rejected": -2.2262609004974365, + "logps/chosen": -3.2196247577667236, + "logps/rejected": -3.4764976501464844, + "loss": 3.888, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.196250915527344, + "rewards/margins": 2.568727970123291, + "rewards/rejected": -34.764976501464844, + "step": 26260 + }, + { + "epoch": 0.8852674508746503, + "grad_norm": 54.82228088378906, + "learning_rate": 3.956567692917695e-08, + "logits/chosen": -2.0362296104431152, + "logits/rejected": -2.172341823577881, + "logps/chosen": -2.940056324005127, + "logps/rejected": -3.6462929248809814, + "loss": 2.2166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.400562286376953, + "rewards/margins": 7.062368869781494, + "rewards/rejected": -36.462928771972656, + "step": 26265 + }, + { + "epoch": 0.8854359769456335, + "grad_norm": 212.3646697998047, + "learning_rate": 3.9451081480005647e-08, + "logits/chosen": -1.7555129528045654, + "logits/rejected": -1.7756602764129639, + "logps/chosen": -2.326939105987549, + "logps/rejected": -2.392819404602051, + "loss": 3.7741, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.269390106201172, + "rewards/margins": 0.6588034629821777, + "rewards/rejected": -23.92819595336914, + "step": 26270 + }, + { + "epoch": 0.8856045030166166, + "grad_norm": 42.445858001708984, + "learning_rate": 3.933664540867027e-08, + "logits/chosen": -1.9917614459991455, + "logits/rejected": -2.2271552085876465, + "logps/chosen": -2.3137385845184326, + "logps/rejected": -2.756070613861084, + "loss": 2.4639, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.137386322021484, + "rewards/margins": 4.4233198165893555, + "rewards/rejected": -27.56070899963379, + "step": 26275 + }, + { + "epoch": 0.8857730290875998, + "grad_norm": 43.41066360473633, + "learning_rate": 3.922236875477236e-08, + "logits/chosen": -2.333465099334717, + "logits/rejected": -2.0408828258514404, + "logps/chosen": -2.147674083709717, + "logps/rejected": -1.9634158611297607, + "loss": 5.2754, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.47673797607422, + "rewards/margins": -1.8425800800323486, + "rewards/rejected": -19.634159088134766, + "step": 26280 + }, + { + "epoch": 0.8859415551585831, + "grad_norm": 32.42685317993164, + "learning_rate": 3.910825155785874e-08, + "logits/chosen": -1.8978359699249268, + "logits/rejected": -1.8385225534439087, + "logps/chosen": -2.6863465309143066, + "logps/rejected": -2.741611957550049, + "loss": 4.7944, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.86346435546875, + "rewards/margins": 0.5526536703109741, + "rewards/rejected": -27.416118621826172, + "step": 26285 + }, + { + "epoch": 0.8861100812295662, + "grad_norm": 3.309293031692505, + "learning_rate": 3.899429385742087e-08, + "logits/chosen": -1.8833850622177124, + "logits/rejected": -1.8865734338760376, + "logps/chosen": -2.6383042335510254, + "logps/rejected": -2.7358269691467285, + "loss": 2.9002, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.383041381835938, + "rewards/margins": 0.9752256274223328, + "rewards/rejected": -27.3582706451416, + "step": 26290 + }, + { + "epoch": 0.8862786073005494, + "grad_norm": 34.39920425415039, + "learning_rate": 3.888049569289503e-08, + "logits/chosen": -1.429771900177002, + "logits/rejected": -1.9705921411514282, + "logps/chosen": -2.29795241355896, + "logps/rejected": -2.6995646953582764, + "loss": 1.6437, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.979524612426758, + "rewards/margins": 4.016123294830322, + "rewards/rejected": -26.995647430419922, + "step": 26295 + }, + { + "epoch": 0.8864471333715326, + "grad_norm": 58.80097579956055, + "learning_rate": 3.876685710366223e-08, + "logits/chosen": -2.0685536861419678, + "logits/rejected": -2.290975332260132, + "logps/chosen": -2.396440029144287, + "logps/rejected": -2.4646482467651367, + "loss": 2.9537, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.964397430419922, + "rewards/margins": 0.6820847392082214, + "rewards/rejected": -24.646480560302734, + "step": 26300 + }, + { + "epoch": 0.8866156594425157, + "grad_norm": 41.23129653930664, + "learning_rate": 3.8653378129048285e-08, + "logits/chosen": -1.9296531677246094, + "logits/rejected": -1.8945322036743164, + "logps/chosen": -2.165827989578247, + "logps/rejected": -2.2192344665527344, + "loss": 2.6034, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.658279418945312, + "rewards/margins": 0.5340660214424133, + "rewards/rejected": -22.192346572875977, + "step": 26305 + }, + { + "epoch": 0.8867841855134989, + "grad_norm": 29.026077270507812, + "learning_rate": 3.854005880832395e-08, + "logits/chosen": -1.916271448135376, + "logits/rejected": -1.7464656829833984, + "logps/chosen": -1.7863132953643799, + "logps/rejected": -2.0211892127990723, + "loss": 1.7257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.86313247680664, + "rewards/margins": 2.3487584590911865, + "rewards/rejected": -20.211891174316406, + "step": 26310 + }, + { + "epoch": 0.8869527115844821, + "grad_norm": 125.84819793701172, + "learning_rate": 3.8426899180704356e-08, + "logits/chosen": -1.8223766088485718, + "logits/rejected": -2.0062925815582275, + "logps/chosen": -3.252180576324463, + "logps/rejected": -3.5337576866149902, + "loss": 2.8062, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -32.52180480957031, + "rewards/margins": 2.8157730102539062, + "rewards/rejected": -35.33757781982422, + "step": 26315 + }, + { + "epoch": 0.8871212376554654, + "grad_norm": 42.38909149169922, + "learning_rate": 3.831389928534967e-08, + "logits/chosen": -1.7225860357284546, + "logits/rejected": -2.0962350368499756, + "logps/chosen": -2.7228047847747803, + "logps/rejected": -3.8885185718536377, + "loss": 2.579, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.22804832458496, + "rewards/margins": 11.657136917114258, + "rewards/rejected": -38.88518524169922, + "step": 26320 + }, + { + "epoch": 0.8872897637264485, + "grad_norm": 43.3592643737793, + "learning_rate": 3.820105916136479e-08, + "logits/chosen": -1.665102243423462, + "logits/rejected": -2.1733100414276123, + "logps/chosen": -2.2348780632019043, + "logps/rejected": -2.450758934020996, + "loss": 2.2147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.348779678344727, + "rewards/margins": 2.158808469772339, + "rewards/rejected": -24.507587432861328, + "step": 26325 + }, + { + "epoch": 0.8874582897974317, + "grad_norm": 34.18486404418945, + "learning_rate": 3.808837884779925e-08, + "logits/chosen": -1.429547667503357, + "logits/rejected": -1.861356496810913, + "logps/chosen": -2.4993577003479004, + "logps/rejected": -2.9565072059631348, + "loss": 1.7211, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.993579864501953, + "rewards/margins": 4.57149600982666, + "rewards/rejected": -29.565073013305664, + "step": 26330 + }, + { + "epoch": 0.8876268158684149, + "grad_norm": 52.628761291503906, + "learning_rate": 3.7975858383647086e-08, + "logits/chosen": -1.957058310508728, + "logits/rejected": -1.9958652257919312, + "logps/chosen": -2.476940155029297, + "logps/rejected": -3.013596296310425, + "loss": 3.4843, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.76940155029297, + "rewards/margins": 5.366562843322754, + "rewards/rejected": -30.13596534729004, + "step": 26335 + }, + { + "epoch": 0.887795341939398, + "grad_norm": 43.5367546081543, + "learning_rate": 3.786349780784731e-08, + "logits/chosen": -1.6538121700286865, + "logits/rejected": -1.627824068069458, + "logps/chosen": -2.146531820297241, + "logps/rejected": -2.0789332389831543, + "loss": 3.8805, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.465316772460938, + "rewards/margins": -0.675983726978302, + "rewards/rejected": -20.78933334350586, + "step": 26340 + }, + { + "epoch": 0.8879638680103812, + "grad_norm": 8.637046278181515e-08, + "learning_rate": 3.77512971592836e-08, + "logits/chosen": -2.0448696613311768, + "logits/rejected": -2.352041244506836, + "logps/chosen": -3.145742416381836, + "logps/rejected": -4.253350734710693, + "loss": 1.4902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.457427978515625, + "rewards/margins": 11.076081275939941, + "rewards/rejected": -42.533504486083984, + "step": 26345 + }, + { + "epoch": 0.8881323940813644, + "grad_norm": 0.0413767471909523, + "learning_rate": 3.763925647678401e-08, + "logits/chosen": -1.8336588144302368, + "logits/rejected": -1.9427735805511475, + "logps/chosen": -2.932631015777588, + "logps/rejected": -3.345829486846924, + "loss": 2.6405, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.326309204101562, + "rewards/margins": 4.131980895996094, + "rewards/rejected": -33.458290100097656, + "step": 26350 + }, + { + "epoch": 0.8883009201523475, + "grad_norm": 18.382659912109375, + "learning_rate": 3.752737579912146e-08, + "logits/chosen": -1.733275055885315, + "logits/rejected": -1.9531488418579102, + "logps/chosen": -1.8927421569824219, + "logps/rejected": -2.2471954822540283, + "loss": 2.1427, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.92742347717285, + "rewards/margins": 3.5445313453674316, + "rewards/rejected": -22.471952438354492, + "step": 26355 + }, + { + "epoch": 0.8884694462233308, + "grad_norm": 17.96552276611328, + "learning_rate": 3.7415655165013435e-08, + "logits/chosen": -2.0436959266662598, + "logits/rejected": -2.390925884246826, + "logps/chosen": -1.8439857959747314, + "logps/rejected": -2.1458404064178467, + "loss": 1.234, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.43985939025879, + "rewards/margins": 3.018548011779785, + "rewards/rejected": -21.458406448364258, + "step": 26360 + }, + { + "epoch": 0.888637972294314, + "grad_norm": 5.227560043334961, + "learning_rate": 3.7304094613122064e-08, + "logits/chosen": -1.5187642574310303, + "logits/rejected": -1.4792652130126953, + "logps/chosen": -2.2077884674072266, + "logps/rejected": -2.3878226280212402, + "loss": 3.1346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.0778865814209, + "rewards/margins": 1.800341248512268, + "rewards/rejected": -23.87822723388672, + "step": 26365 + }, + { + "epoch": 0.8888064983652971, + "grad_norm": 42.939727783203125, + "learning_rate": 3.7192694182054065e-08, + "logits/chosen": -2.4460108280181885, + "logits/rejected": -2.9244384765625, + "logps/chosen": -2.6990339756011963, + "logps/rejected": -3.735548496246338, + "loss": 1.3055, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.990337371826172, + "rewards/margins": 10.365147590637207, + "rewards/rejected": -37.35548400878906, + "step": 26370 + }, + { + "epoch": 0.8889750244362803, + "grad_norm": 16.590620040893555, + "learning_rate": 3.708145391036077e-08, + "logits/chosen": -1.39180588722229, + "logits/rejected": -1.8719170093536377, + "logps/chosen": -2.1872692108154297, + "logps/rejected": -2.8928062915802, + "loss": 1.5447, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.872692108154297, + "rewards/margins": 7.055368900299072, + "rewards/rejected": -28.92806053161621, + "step": 26375 + }, + { + "epoch": 0.8891435505072635, + "grad_norm": 82.56770324707031, + "learning_rate": 3.697037383653795e-08, + "logits/chosen": -2.03816556930542, + "logits/rejected": -2.2176055908203125, + "logps/chosen": -3.07786226272583, + "logps/rejected": -3.490302324295044, + "loss": 2.1655, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.77862548828125, + "rewards/margins": 4.124399662017822, + "rewards/rejected": -34.90302276611328, + "step": 26380 + }, + { + "epoch": 0.8893120765782466, + "grad_norm": 20.41136360168457, + "learning_rate": 3.685945399902612e-08, + "logits/chosen": -2.1533596515655518, + "logits/rejected": -2.9651713371276855, + "logps/chosen": -2.3581838607788086, + "logps/rejected": -2.778751850128174, + "loss": 1.6457, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.581836700439453, + "rewards/margins": 4.205681324005127, + "rewards/rejected": -27.787517547607422, + "step": 26385 + }, + { + "epoch": 0.8894806026492298, + "grad_norm": 31.72279930114746, + "learning_rate": 3.674869443621026e-08, + "logits/chosen": -2.089503049850464, + "logits/rejected": -2.2000720500946045, + "logps/chosen": -2.883258581161499, + "logps/rejected": -3.233872175216675, + "loss": 2.5709, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.832584381103516, + "rewards/margins": 3.5061347484588623, + "rewards/rejected": -32.338722229003906, + "step": 26390 + }, + { + "epoch": 0.8896491287202131, + "grad_norm": 72.49952697753906, + "learning_rate": 3.6638095186419915e-08, + "logits/chosen": -1.2590601444244385, + "logits/rejected": -2.207016706466675, + "logps/chosen": -2.256216049194336, + "logps/rejected": -3.127159833908081, + "loss": 2.2759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.56216049194336, + "rewards/margins": 8.709436416625977, + "rewards/rejected": -31.271595001220703, + "step": 26395 + }, + { + "epoch": 0.8898176547911962, + "grad_norm": 47.8337287902832, + "learning_rate": 3.652765628792914e-08, + "logits/chosen": -2.090017080307007, + "logits/rejected": -2.1647074222564697, + "logps/chosen": -2.8364415168762207, + "logps/rejected": -3.1691741943359375, + "loss": 2.5965, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.364416122436523, + "rewards/margins": 3.3273262977600098, + "rewards/rejected": -31.691741943359375, + "step": 26400 + }, + { + "epoch": 0.8898176547911962, + "eval_logits/chosen": -2.3095388412475586, + "eval_logits/rejected": -2.4877052307128906, + "eval_logps/chosen": -2.2862913608551025, + "eval_logps/rejected": -2.441688299179077, + "eval_loss": 3.084059953689575, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.8629150390625, + "eval_rewards/margins": 1.553971767425537, + "eval_rewards/rejected": -24.416885375976562, + "eval_runtime": 12.9458, + "eval_samples_per_second": 7.724, + "eval_steps_per_second": 1.931, + "step": 26400 + }, + { + "epoch": 0.8899861808621794, + "grad_norm": 15.262998580932617, + "learning_rate": 3.641737777895631e-08, + "logits/chosen": -1.386071801185608, + "logits/rejected": -1.60391104221344, + "logps/chosen": -1.9214773178100586, + "logps/rejected": -2.374145269393921, + "loss": 0.8926, + "rewards/accuracies": 1.0, + "rewards/chosen": -19.214773178100586, + "rewards/margins": 4.526679992675781, + "rewards/rejected": -23.741451263427734, + "step": 26405 + }, + { + "epoch": 0.8901547069331626, + "grad_norm": 27.034141540527344, + "learning_rate": 3.6307259697664684e-08, + "logits/chosen": -1.3661694526672363, + "logits/rejected": -1.655531883239746, + "logps/chosen": -1.7544902563095093, + "logps/rejected": -1.987342119216919, + "loss": 1.7387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.544902801513672, + "rewards/margins": 2.328519105911255, + "rewards/rejected": -19.873422622680664, + "step": 26410 + }, + { + "epoch": 0.8903232330041457, + "grad_norm": 36.613914489746094, + "learning_rate": 3.619730208216176e-08, + "logits/chosen": -2.304776668548584, + "logits/rejected": -2.4895987510681152, + "logps/chosen": -1.8933130502700806, + "logps/rejected": -2.0928142070770264, + "loss": 2.1845, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -18.933130264282227, + "rewards/margins": 1.995012640953064, + "rewards/rejected": -20.928142547607422, + "step": 26415 + }, + { + "epoch": 0.8904917590751289, + "grad_norm": 30.089303970336914, + "learning_rate": 3.6087504970499394e-08, + "logits/chosen": -1.9915310144424438, + "logits/rejected": -2.441012144088745, + "logps/chosen": -2.0041680335998535, + "logps/rejected": -2.1425201892852783, + "loss": 3.4509, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.041683197021484, + "rewards/margins": 1.383520483970642, + "rewards/rejected": -21.425201416015625, + "step": 26420 + }, + { + "epoch": 0.8906602851461121, + "grad_norm": 75.44097137451172, + "learning_rate": 3.597786840067418e-08, + "logits/chosen": -1.8589273691177368, + "logits/rejected": -1.8614015579223633, + "logps/chosen": -2.8449208736419678, + "logps/rejected": -3.298020124435425, + "loss": 2.9951, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.449207305908203, + "rewards/margins": 4.530993461608887, + "rewards/rejected": -32.980201721191406, + "step": 26425 + }, + { + "epoch": 0.8908288112170953, + "grad_norm": 40.12236022949219, + "learning_rate": 3.586839241062695e-08, + "logits/chosen": -1.7321977615356445, + "logits/rejected": -1.890179991722107, + "logps/chosen": -3.3235104084014893, + "logps/rejected": -3.320343494415283, + "loss": 4.4035, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.23509979248047, + "rewards/margins": -0.03166971355676651, + "rewards/rejected": -33.20343780517578, + "step": 26430 + }, + { + "epoch": 0.8909973372880785, + "grad_norm": 11.368515968322754, + "learning_rate": 3.5759077038243105e-08, + "logits/chosen": -1.9310667514801025, + "logits/rejected": -2.2706170082092285, + "logps/chosen": -2.7760231494903564, + "logps/rejected": -3.5364856719970703, + "loss": 1.2416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.760229110717773, + "rewards/margins": 7.604626655578613, + "rewards/rejected": -35.36486053466797, + "step": 26435 + }, + { + "epoch": 0.8911658633590617, + "grad_norm": 33.3480339050293, + "learning_rate": 3.5649922321352276e-08, + "logits/chosen": -1.975754976272583, + "logits/rejected": -2.155705451965332, + "logps/chosen": -2.017496109008789, + "logps/rejected": -2.319976329803467, + "loss": 2.4917, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.17496109008789, + "rewards/margins": 3.0248000621795654, + "rewards/rejected": -23.19976234436035, + "step": 26440 + }, + { + "epoch": 0.8913343894300448, + "grad_norm": 60.67802810668945, + "learning_rate": 3.5540928297728644e-08, + "logits/chosen": -1.498110294342041, + "logits/rejected": -1.9327552318572998, + "logps/chosen": -2.3765645027160645, + "logps/rejected": -2.515491008758545, + "loss": 2.4061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.765644073486328, + "rewards/margins": 1.3892669677734375, + "rewards/rejected": -25.154911041259766, + "step": 26445 + }, + { + "epoch": 0.891502915501028, + "grad_norm": 5.0874528884887695, + "learning_rate": 3.543209500509087e-08, + "logits/chosen": -2.114600658416748, + "logits/rejected": -2.1396377086639404, + "logps/chosen": -2.263089895248413, + "logps/rejected": -2.864563465118408, + "loss": 2.5865, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.63089942932129, + "rewards/margins": 6.014737129211426, + "rewards/rejected": -28.6456356048584, + "step": 26450 + }, + { + "epoch": 0.8916714415720112, + "grad_norm": 218.70321655273438, + "learning_rate": 3.5323422481101704e-08, + "logits/chosen": -1.9041109085083008, + "logits/rejected": -1.7146583795547485, + "logps/chosen": -2.972308874130249, + "logps/rejected": -2.422968626022339, + "loss": 8.8578, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.72308921813965, + "rewards/margins": -5.49340295791626, + "rewards/rejected": -24.229686737060547, + "step": 26455 + }, + { + "epoch": 0.8918399676429943, + "grad_norm": 23.796815872192383, + "learning_rate": 3.5214910763368465e-08, + "logits/chosen": -1.6301389932632446, + "logits/rejected": -1.4314095973968506, + "logps/chosen": -2.698103427886963, + "logps/rejected": -3.5130105018615723, + "loss": 3.2043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.981029510498047, + "rewards/margins": 8.149070739746094, + "rewards/rejected": -35.130104064941406, + "step": 26460 + }, + { + "epoch": 0.8920084937139775, + "grad_norm": 29.702604293823242, + "learning_rate": 3.5106559889442834e-08, + "logits/chosen": -2.535851240158081, + "logits/rejected": -2.5843117237091064, + "logps/chosen": -3.023690938949585, + "logps/rejected": -3.4369590282440186, + "loss": 2.2928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.236907958984375, + "rewards/margins": 4.132681369781494, + "rewards/rejected": -34.369590759277344, + "step": 26465 + }, + { + "epoch": 0.8921770197849608, + "grad_norm": 34.1428337097168, + "learning_rate": 3.499836989682081e-08, + "logits/chosen": -1.4994641542434692, + "logits/rejected": -1.624497413635254, + "logps/chosen": -1.7730028629302979, + "logps/rejected": -1.9339821338653564, + "loss": 2.1891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.730030059814453, + "rewards/margins": 1.6097911596298218, + "rewards/rejected": -19.339818954467773, + "step": 26470 + }, + { + "epoch": 0.892345545855944, + "grad_norm": 43.83182907104492, + "learning_rate": 3.489034082294257e-08, + "logits/chosen": -2.2628543376922607, + "logits/rejected": -2.2911453247070312, + "logps/chosen": -2.642298698425293, + "logps/rejected": -2.8167061805725098, + "loss": 2.734, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.422988891601562, + "rewards/margins": 1.7440717220306396, + "rewards/rejected": -28.16705894470215, + "step": 26475 + }, + { + "epoch": 0.8925140719269271, + "grad_norm": 24.363571166992188, + "learning_rate": 3.47824727051928e-08, + "logits/chosen": -2.286649703979492, + "logits/rejected": -2.66743803024292, + "logps/chosen": -3.0622658729553223, + "logps/rejected": -3.6419472694396973, + "loss": 2.8387, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.62265968322754, + "rewards/margins": 5.796811103820801, + "rewards/rejected": -36.419471740722656, + "step": 26480 + }, + { + "epoch": 0.8926825979979103, + "grad_norm": 34.195926666259766, + "learning_rate": 3.4674765580900435e-08, + "logits/chosen": -1.6512067317962646, + "logits/rejected": -1.8246276378631592, + "logps/chosen": -2.3566603660583496, + "logps/rejected": -2.6705374717712402, + "loss": 1.879, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.566604614257812, + "rewards/margins": 3.1387696266174316, + "rewards/rejected": -26.705373764038086, + "step": 26485 + }, + { + "epoch": 0.8928511240688934, + "grad_norm": 34.41433334350586, + "learning_rate": 3.456721948733854e-08, + "logits/chosen": -1.578489065170288, + "logits/rejected": -2.0936756134033203, + "logps/chosen": -2.417600154876709, + "logps/rejected": -2.9172120094299316, + "loss": 2.9168, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.17600440979004, + "rewards/margins": 4.996116638183594, + "rewards/rejected": -29.172119140625, + "step": 26490 + }, + { + "epoch": 0.8930196501398766, + "grad_norm": 36.56930923461914, + "learning_rate": 3.445983446172468e-08, + "logits/chosen": -1.7800805568695068, + "logits/rejected": -1.7682125568389893, + "logps/chosen": -2.447359800338745, + "logps/rejected": -2.3159613609313965, + "loss": 4.43, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.47359848022461, + "rewards/margins": -1.3139829635620117, + "rewards/rejected": -23.159616470336914, + "step": 26495 + }, + { + "epoch": 0.8931881762108598, + "grad_norm": 48.0949592590332, + "learning_rate": 3.4352610541220574e-08, + "logits/chosen": -2.0843327045440674, + "logits/rejected": -2.2306456565856934, + "logps/chosen": -2.0241684913635254, + "logps/rejected": -2.457520008087158, + "loss": 2.3178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.241683959960938, + "rewards/margins": 4.333517074584961, + "rewards/rejected": -24.5752010345459, + "step": 26500 + }, + { + "epoch": 0.893356702281843, + "grad_norm": 69.8865966796875, + "learning_rate": 3.42455477629322e-08, + "logits/chosen": -1.8679606914520264, + "logits/rejected": -2.1544744968414307, + "logps/chosen": -2.019822120666504, + "logps/rejected": -1.9206161499023438, + "loss": 4.1463, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.198219299316406, + "rewards/margins": -0.9920600056648254, + "rewards/rejected": -19.206159591674805, + "step": 26505 + }, + { + "epoch": 0.8935252283528262, + "grad_norm": 22.330610275268555, + "learning_rate": 3.4138646163909715e-08, + "logits/chosen": -1.898535132408142, + "logits/rejected": -2.080216646194458, + "logps/chosen": -2.839162826538086, + "logps/rejected": -3.4196839332580566, + "loss": 2.588, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.391626358032227, + "rewards/margins": 5.805212020874023, + "rewards/rejected": -34.19683837890625, + "step": 26510 + }, + { + "epoch": 0.8936937544238094, + "grad_norm": 46.52693557739258, + "learning_rate": 3.403190578114762e-08, + "logits/chosen": -1.8540111780166626, + "logits/rejected": -1.9055122137069702, + "logps/chosen": -2.3714959621429443, + "logps/rejected": -2.411144256591797, + "loss": 4.4685, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.7149600982666, + "rewards/margins": 0.39648064970970154, + "rewards/rejected": -24.111440658569336, + "step": 26515 + }, + { + "epoch": 0.8938622804947925, + "grad_norm": 27.26725959777832, + "learning_rate": 3.392532665158449e-08, + "logits/chosen": -2.230833053588867, + "logits/rejected": -2.185213327407837, + "logps/chosen": -2.181652545928955, + "logps/rejected": -2.0785341262817383, + "loss": 4.192, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.816524505615234, + "rewards/margins": -1.031184434890747, + "rewards/rejected": -20.785343170166016, + "step": 26520 + }, + { + "epoch": 0.8940308065657757, + "grad_norm": 29.42605972290039, + "learning_rate": 3.38189088121032e-08, + "logits/chosen": -2.070993661880493, + "logits/rejected": -2.3387389183044434, + "logps/chosen": -2.1574041843414307, + "logps/rejected": -2.6259658336639404, + "loss": 1.657, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.57404136657715, + "rewards/margins": 4.685616970062256, + "rewards/rejected": -26.259658813476562, + "step": 26525 + }, + { + "epoch": 0.8941993326367589, + "grad_norm": 2.8149056434631348, + "learning_rate": 3.371265229953074e-08, + "logits/chosen": -1.9254567623138428, + "logits/rejected": -2.2278952598571777, + "logps/chosen": -2.96343731880188, + "logps/rejected": -3.121615171432495, + "loss": 5.3805, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.634368896484375, + "rewards/margins": 1.5817797183990479, + "rewards/rejected": -31.21615219116211, + "step": 26530 + }, + { + "epoch": 0.894367858707742, + "grad_norm": 27.930755615234375, + "learning_rate": 3.360655715063837e-08, + "logits/chosen": -1.7379354238510132, + "logits/rejected": -1.865264892578125, + "logps/chosen": -2.278048515319824, + "logps/rejected": -2.7165865898132324, + "loss": 1.135, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.780485153198242, + "rewards/margins": 4.385384559631348, + "rewards/rejected": -27.165868759155273, + "step": 26535 + }, + { + "epoch": 0.8945363847787253, + "grad_norm": 29.486385345458984, + "learning_rate": 3.350062340214149e-08, + "logits/chosen": -1.9781516790390015, + "logits/rejected": -2.348292350769043, + "logps/chosen": -2.722933053970337, + "logps/rejected": -3.038951873779297, + "loss": 1.3001, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.229333877563477, + "rewards/margins": 3.160186529159546, + "rewards/rejected": -30.389516830444336, + "step": 26540 + }, + { + "epoch": 0.8947049108497085, + "grad_norm": 24.48255157470703, + "learning_rate": 3.339485109069939e-08, + "logits/chosen": -1.8980258703231812, + "logits/rejected": -2.232142925262451, + "logps/chosen": -2.2230277061462402, + "logps/rejected": -2.9788966178894043, + "loss": 1.5709, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.230276107788086, + "rewards/margins": 7.558690547943115, + "rewards/rejected": -29.788965225219727, + "step": 26545 + }, + { + "epoch": 0.8948734369206917, + "grad_norm": 30.568679809570312, + "learning_rate": 3.328924025291585e-08, + "logits/chosen": -1.8301811218261719, + "logits/rejected": -1.786790132522583, + "logps/chosen": -2.1006762981414795, + "logps/rejected": -2.4340219497680664, + "loss": 1.2593, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.00676155090332, + "rewards/margins": 3.333458662033081, + "rewards/rejected": -24.340219497680664, + "step": 26550 + }, + { + "epoch": 0.8950419629916748, + "grad_norm": 32.12637710571289, + "learning_rate": 3.318379092533868e-08, + "logits/chosen": -1.7069917917251587, + "logits/rejected": -1.9681928157806396, + "logps/chosen": -2.761687755584717, + "logps/rejected": -2.6828818321228027, + "loss": 3.8563, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.61687660217285, + "rewards/margins": -0.7880581021308899, + "rewards/rejected": -26.828815460205078, + "step": 26555 + }, + { + "epoch": 0.895210489062658, + "grad_norm": 28.803434371948242, + "learning_rate": 3.3078503144459535e-08, + "logits/chosen": -1.1517115831375122, + "logits/rejected": -1.3329699039459229, + "logps/chosen": -2.1754512786865234, + "logps/rejected": -3.084625720977783, + "loss": 2.7049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.754512786865234, + "rewards/margins": 9.091741561889648, + "rewards/rejected": -30.846256256103516, + "step": 26560 + }, + { + "epoch": 0.8953790151336412, + "grad_norm": 47.28593444824219, + "learning_rate": 3.297337694671448e-08, + "logits/chosen": -2.179915428161621, + "logits/rejected": -2.6756978034973145, + "logps/chosen": -2.4682414531707764, + "logps/rejected": -3.088728666305542, + "loss": 1.6676, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.682416915893555, + "rewards/margins": 6.20487117767334, + "rewards/rejected": -30.88728904724121, + "step": 26565 + }, + { + "epoch": 0.8955475412046243, + "grad_norm": 39.167423248291016, + "learning_rate": 3.286841236848353e-08, + "logits/chosen": -1.4234455823898315, + "logits/rejected": -1.5056092739105225, + "logps/chosen": -2.820444345474243, + "logps/rejected": -3.1854355335235596, + "loss": 2.5673, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.204442977905273, + "rewards/margins": 3.649911403656006, + "rewards/rejected": -31.854354858398438, + "step": 26570 + }, + { + "epoch": 0.8957160672756075, + "grad_norm": 41.25774002075195, + "learning_rate": 3.2763609446090966e-08, + "logits/chosen": -2.3403804302215576, + "logits/rejected": -2.43021821975708, + "logps/chosen": -3.2246127128601074, + "logps/rejected": -3.2669715881347656, + "loss": 3.8261, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.246131896972656, + "rewards/margins": 0.42358970642089844, + "rewards/rejected": -32.66971969604492, + "step": 26575 + }, + { + "epoch": 0.8958845933465908, + "grad_norm": 34.58900451660156, + "learning_rate": 3.265896821580466e-08, + "logits/chosen": -1.5373561382293701, + "logits/rejected": -1.5862390995025635, + "logps/chosen": -2.270263195037842, + "logps/rejected": -2.307436227798462, + "loss": 3.3203, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.702632904052734, + "rewards/margins": 0.37172871828079224, + "rewards/rejected": -23.074359893798828, + "step": 26580 + }, + { + "epoch": 0.8960531194175739, + "grad_norm": 29.757532119750977, + "learning_rate": 3.255448871383692e-08, + "logits/chosen": -1.7115838527679443, + "logits/rejected": -1.9038822650909424, + "logps/chosen": -2.302544116973877, + "logps/rejected": -2.024075746536255, + "loss": 6.2287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.025440216064453, + "rewards/margins": -2.7846837043762207, + "rewards/rejected": -20.24075698852539, + "step": 26585 + }, + { + "epoch": 0.8962216454885571, + "grad_norm": 5.816464424133301, + "learning_rate": 3.245017097634417e-08, + "logits/chosen": -2.073551893234253, + "logits/rejected": -2.096770763397217, + "logps/chosen": -1.9916664361953735, + "logps/rejected": -2.167724847793579, + "loss": 2.6002, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.91666603088379, + "rewards/margins": 1.7605831623077393, + "rewards/rejected": -21.677249908447266, + "step": 26590 + }, + { + "epoch": 0.8963901715595403, + "grad_norm": 29.604887008666992, + "learning_rate": 3.234601503942641e-08, + "logits/chosen": -1.8629038333892822, + "logits/rejected": -1.8070766925811768, + "logps/chosen": -2.160930871963501, + "logps/rejected": -2.353682041168213, + "loss": 3.0509, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.60930824279785, + "rewards/margins": 1.9275100231170654, + "rewards/rejected": -23.536815643310547, + "step": 26595 + }, + { + "epoch": 0.8965586976305234, + "grad_norm": 15.401908874511719, + "learning_rate": 3.224202093912798e-08, + "logits/chosen": -1.1934399604797363, + "logits/rejected": -1.2277182340621948, + "logps/chosen": -2.0062804222106934, + "logps/rejected": -2.2444908618927, + "loss": 1.7024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.06280517578125, + "rewards/margins": 2.3821024894714355, + "rewards/rejected": -22.444908142089844, + "step": 26600 + }, + { + "epoch": 0.8967272237015066, + "grad_norm": 28.409311294555664, + "learning_rate": 3.21381887114372e-08, + "logits/chosen": -1.805456519126892, + "logits/rejected": -2.1640634536743164, + "logps/chosen": -2.77734375, + "logps/rejected": -2.9346604347229004, + "loss": 2.6559, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.7734375, + "rewards/margins": 1.5731639862060547, + "rewards/rejected": -29.346603393554688, + "step": 26605 + }, + { + "epoch": 0.8968957497724898, + "grad_norm": 97.36381530761719, + "learning_rate": 3.203451839228638e-08, + "logits/chosen": -1.751865029335022, + "logits/rejected": -2.0187325477600098, + "logps/chosen": -1.852243185043335, + "logps/rejected": -1.9989274740219116, + "loss": 4.0031, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.522432327270508, + "rewards/margins": 1.4668428897857666, + "rewards/rejected": -19.989276885986328, + "step": 26610 + }, + { + "epoch": 0.897064275843473, + "grad_norm": 34.04191970825195, + "learning_rate": 3.1931010017551555e-08, + "logits/chosen": -1.436517596244812, + "logits/rejected": -2.0932650566101074, + "logps/chosen": -2.7484524250030518, + "logps/rejected": -3.309058427810669, + "loss": 3.0777, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.484527587890625, + "rewards/margins": 5.606059551239014, + "rewards/rejected": -33.09058380126953, + "step": 26615 + }, + { + "epoch": 0.8972328019144562, + "grad_norm": 42.45766830444336, + "learning_rate": 3.1827663623052945e-08, + "logits/chosen": -1.7623097896575928, + "logits/rejected": -1.7391726970672607, + "logps/chosen": -1.8129985332489014, + "logps/rejected": -1.9569908380508423, + "loss": 2.2725, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.129985809326172, + "rewards/margins": 1.4399234056472778, + "rewards/rejected": -19.569908142089844, + "step": 26620 + }, + { + "epoch": 0.8974013279854394, + "grad_norm": 54.091957092285156, + "learning_rate": 3.1724479244554794e-08, + "logits/chosen": -1.7639528512954712, + "logits/rejected": -2.0980749130249023, + "logps/chosen": -2.110844135284424, + "logps/rejected": -2.2267391681671143, + "loss": 2.7418, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.108442306518555, + "rewards/margins": 1.158947467803955, + "rewards/rejected": -22.26738929748535, + "step": 26625 + }, + { + "epoch": 0.8975698540564225, + "grad_norm": 28.083711624145508, + "learning_rate": 3.1621456917765025e-08, + "logits/chosen": -1.6799514293670654, + "logits/rejected": -2.156489133834839, + "logps/chosen": -2.3385581970214844, + "logps/rejected": -2.5456862449645996, + "loss": 2.7011, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.38558006286621, + "rewards/margins": 2.071280002593994, + "rewards/rejected": -25.45686149597168, + "step": 26630 + }, + { + "epoch": 0.8977383801274057, + "grad_norm": 8.380878448486328, + "learning_rate": 3.151859667833562e-08, + "logits/chosen": -2.053631544113159, + "logits/rejected": -1.9986978769302368, + "logps/chosen": -1.9755045175552368, + "logps/rejected": -2.13932728767395, + "loss": 2.5733, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.75504493713379, + "rewards/margins": 1.638228178024292, + "rewards/rejected": -21.393274307250977, + "step": 26635 + }, + { + "epoch": 0.8979069061983889, + "grad_norm": 123.43840789794922, + "learning_rate": 3.141589856186244e-08, + "logits/chosen": -1.7218620777130127, + "logits/rejected": -2.00181245803833, + "logps/chosen": -3.3388209342956543, + "logps/rejected": -3.430068254470825, + "loss": 4.0124, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.388206481933594, + "rewards/margins": 0.9124706387519836, + "rewards/rejected": -34.300682067871094, + "step": 26640 + }, + { + "epoch": 0.898075432269372, + "grad_norm": 54.00300979614258, + "learning_rate": 3.13133626038854e-08, + "logits/chosen": -2.025984525680542, + "logits/rejected": -1.9393415451049805, + "logps/chosen": -2.0208346843719482, + "logps/rejected": -2.30167818069458, + "loss": 1.65, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.20834732055664, + "rewards/margins": 2.8084373474121094, + "rewards/rejected": -23.01678466796875, + "step": 26645 + }, + { + "epoch": 0.8982439583403552, + "grad_norm": 100.82463073730469, + "learning_rate": 3.121098883988793e-08, + "logits/chosen": -1.9338929653167725, + "logits/rejected": -1.9727376699447632, + "logps/chosen": -3.4275753498077393, + "logps/rejected": -3.6173622608184814, + "loss": 5.8341, + "rewards/accuracies": 0.5, + "rewards/chosen": -34.275753021240234, + "rewards/margins": 1.8978685140609741, + "rewards/rejected": -36.173622131347656, + "step": 26650 + }, + { + "epoch": 0.8984124844113385, + "grad_norm": 32.94302749633789, + "learning_rate": 3.110877730529771e-08, + "logits/chosen": -2.066908121109009, + "logits/rejected": -2.4684619903564453, + "logps/chosen": -2.7884440422058105, + "logps/rejected": -3.1394457817077637, + "loss": 2.0568, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.884441375732422, + "rewards/margins": 3.510014057159424, + "rewards/rejected": -31.394454956054688, + "step": 26655 + }, + { + "epoch": 0.8985810104823216, + "grad_norm": 31.880945205688477, + "learning_rate": 3.1006728035486095e-08, + "logits/chosen": -2.197225570678711, + "logits/rejected": -2.540008544921875, + "logps/chosen": -2.043921709060669, + "logps/rejected": -2.7333791255950928, + "loss": 1.9214, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.43921661376953, + "rewards/margins": 6.8945770263671875, + "rewards/rejected": -27.33379554748535, + "step": 26660 + }, + { + "epoch": 0.8987495365533048, + "grad_norm": 33.723976135253906, + "learning_rate": 3.0904841065768293e-08, + "logits/chosen": -1.9747921228408813, + "logits/rejected": -2.0234017372131348, + "logps/chosen": -2.2100253105163574, + "logps/rejected": -2.1898884773254395, + "loss": 3.3422, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.10025405883789, + "rewards/margins": -0.20136889815330505, + "rewards/rejected": -21.898883819580078, + "step": 26665 + }, + { + "epoch": 0.898918062624288, + "grad_norm": 7.845767498016357, + "learning_rate": 3.0803116431403375e-08, + "logits/chosen": -1.9601243734359741, + "logits/rejected": -2.368412733078003, + "logps/chosen": -1.7174384593963623, + "logps/rejected": -1.9996525049209595, + "loss": 2.791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.17438507080078, + "rewards/margins": 2.8221397399902344, + "rewards/rejected": -19.996524810791016, + "step": 26670 + }, + { + "epoch": 0.8990865886952711, + "grad_norm": 33.07391357421875, + "learning_rate": 3.0701554167594345e-08, + "logits/chosen": -2.0543577671051025, + "logits/rejected": -1.900191068649292, + "logps/chosen": -3.7917778491973877, + "logps/rejected": -3.9905028343200684, + "loss": 2.6151, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -37.91777801513672, + "rewards/margins": 1.9872547388076782, + "rewards/rejected": -39.905033111572266, + "step": 26675 + }, + { + "epoch": 0.8992551147662543, + "grad_norm": 159.2845916748047, + "learning_rate": 3.06001543094877e-08, + "logits/chosen": -1.7436946630477905, + "logits/rejected": -2.141364574432373, + "logps/chosen": -2.9924213886260986, + "logps/rejected": -3.945605516433716, + "loss": 2.1637, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.924213409423828, + "rewards/margins": 9.531842231750488, + "rewards/rejected": -39.4560546875, + "step": 26680 + }, + { + "epoch": 0.8994236408372375, + "grad_norm": 31.464847564697266, + "learning_rate": 3.049891689217404e-08, + "logits/chosen": -1.6657555103302002, + "logits/rejected": -1.8241376876831055, + "logps/chosen": -2.2361302375793457, + "logps/rejected": -2.5087828636169434, + "loss": 1.8575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.361303329467773, + "rewards/margins": 2.7265238761901855, + "rewards/rejected": -25.08782958984375, + "step": 26685 + }, + { + "epoch": 0.8995921669082207, + "grad_norm": 28.74648666381836, + "learning_rate": 3.039784195068762e-08, + "logits/chosen": -2.540984630584717, + "logits/rejected": -2.3905649185180664, + "logps/chosen": -4.0224199295043945, + "logps/rejected": -4.216039657592773, + "loss": 4.7423, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -40.22419738769531, + "rewards/margins": 1.9361984729766846, + "rewards/rejected": -42.160396575927734, + "step": 26690 + }, + { + "epoch": 0.8997606929792039, + "grad_norm": 26.239681243896484, + "learning_rate": 3.029692952000662e-08, + "logits/chosen": -1.6646521091461182, + "logits/rejected": -2.0250906944274902, + "logps/chosen": -2.4633376598358154, + "logps/rejected": -3.2082221508026123, + "loss": 2.9795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.633377075195312, + "rewards/margins": 7.448843955993652, + "rewards/rejected": -32.08222198486328, + "step": 26695 + }, + { + "epoch": 0.8999292190501871, + "grad_norm": 29.84377098083496, + "learning_rate": 3.0196179635052664e-08, + "logits/chosen": -2.0663959980010986, + "logits/rejected": -1.9202702045440674, + "logps/chosen": -2.9367499351501465, + "logps/rejected": -3.0594124794006348, + "loss": 3.022, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.36749839782715, + "rewards/margins": 1.226623773574829, + "rewards/rejected": -30.5941219329834, + "step": 26700 + }, + { + "epoch": 0.9000977451211702, + "grad_norm": 34.29310989379883, + "learning_rate": 3.009559233069142e-08, + "logits/chosen": -1.9751489162445068, + "logits/rejected": -1.7080457210540771, + "logps/chosen": -2.1225452423095703, + "logps/rejected": -1.9370813369750977, + "loss": 5.146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.225452423095703, + "rewards/margins": -1.8546397686004639, + "rewards/rejected": -19.370811462402344, + "step": 26705 + }, + { + "epoch": 0.9002662711921534, + "grad_norm": 79.37320709228516, + "learning_rate": 2.9995167641732154e-08, + "logits/chosen": -1.799004316329956, + "logits/rejected": -2.029813289642334, + "logps/chosen": -2.426522731781006, + "logps/rejected": -2.999939203262329, + "loss": 2.2679, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.265226364135742, + "rewards/margins": 5.73416805267334, + "rewards/rejected": -29.9993953704834, + "step": 26710 + }, + { + "epoch": 0.9004347972631366, + "grad_norm": 27.92984962463379, + "learning_rate": 2.989490560292801e-08, + "logits/chosen": -1.9473021030426025, + "logits/rejected": -2.1453399658203125, + "logps/chosen": -2.4092183113098145, + "logps/rejected": -2.7499892711639404, + "loss": 2.1248, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.092182159423828, + "rewards/margins": 3.407710552215576, + "rewards/rejected": -27.499893188476562, + "step": 26715 + }, + { + "epoch": 0.9006033233341197, + "grad_norm": 26.3646240234375, + "learning_rate": 2.9794806248975512e-08, + "logits/chosen": -1.6984527111053467, + "logits/rejected": -2.1335010528564453, + "logps/chosen": -2.008679151535034, + "logps/rejected": -2.133533000946045, + "loss": 2.3707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.0867919921875, + "rewards/margins": 1.2485383749008179, + "rewards/rejected": -21.335330963134766, + "step": 26720 + }, + { + "epoch": 0.900771849405103, + "grad_norm": 10.612837791442871, + "learning_rate": 2.9694869614515283e-08, + "logits/chosen": -1.4333100318908691, + "logits/rejected": -1.9090903997421265, + "logps/chosen": -2.660550594329834, + "logps/rejected": -3.347529649734497, + "loss": 1.4041, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.60550308227539, + "rewards/margins": 6.8697919845581055, + "rewards/rejected": -33.47529983520508, + "step": 26725 + }, + { + "epoch": 0.9009403754760862, + "grad_norm": 54.768768310546875, + "learning_rate": 2.9595095734131438e-08, + "logits/chosen": -1.4375172853469849, + "logits/rejected": -1.9926517009735107, + "logps/chosen": -2.2313895225524902, + "logps/rejected": -2.748983383178711, + "loss": 2.7583, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.31389617919922, + "rewards/margins": 5.175940990447998, + "rewards/rejected": -27.489837646484375, + "step": 26730 + }, + { + "epoch": 0.9011089015470694, + "grad_norm": 34.80324172973633, + "learning_rate": 2.9495484642351686e-08, + "logits/chosen": -2.2112724781036377, + "logits/rejected": -2.3973257541656494, + "logps/chosen": -1.9748632907867432, + "logps/rejected": -2.477585554122925, + "loss": 1.9833, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.748634338378906, + "rewards/margins": 5.027222156524658, + "rewards/rejected": -24.775854110717773, + "step": 26735 + }, + { + "epoch": 0.9012774276180525, + "grad_norm": 0.01907912641763687, + "learning_rate": 2.939603637364757e-08, + "logits/chosen": -1.949859619140625, + "logits/rejected": -2.05171537399292, + "logps/chosen": -2.8990979194641113, + "logps/rejected": -3.3347411155700684, + "loss": 2.8307, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.990982055664062, + "rewards/margins": 4.3564276695251465, + "rewards/rejected": -33.347412109375, + "step": 26740 + }, + { + "epoch": 0.9014459536890357, + "grad_norm": 228.8786163330078, + "learning_rate": 2.929675096243428e-08, + "logits/chosen": -1.9393707513809204, + "logits/rejected": -2.239442825317383, + "logps/chosen": -2.230768918991089, + "logps/rejected": -2.1282057762145996, + "loss": 4.1624, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.307689666748047, + "rewards/margins": -1.0256315469741821, + "rewards/rejected": -21.282058715820312, + "step": 26745 + }, + { + "epoch": 0.9016144797600188, + "grad_norm": 58.850704193115234, + "learning_rate": 2.9197628443070443e-08, + "logits/chosen": -1.6945288181304932, + "logits/rejected": -1.7264130115509033, + "logps/chosen": -2.3006865978240967, + "logps/rejected": -2.233219623565674, + "loss": 3.7347, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.006868362426758, + "rewards/margins": -0.6746741533279419, + "rewards/rejected": -22.332195281982422, + "step": 26750 + }, + { + "epoch": 0.901783005831002, + "grad_norm": 3.1833982467651367, + "learning_rate": 2.9098668849858508e-08, + "logits/chosen": -1.402016043663025, + "logits/rejected": -1.3957245349884033, + "logps/chosen": -2.561000347137451, + "logps/rejected": -3.177316188812256, + "loss": 1.8452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.61000633239746, + "rewards/margins": 6.1631574630737305, + "rewards/rejected": -31.77316665649414, + "step": 26755 + }, + { + "epoch": 0.9019515319019852, + "grad_norm": 25.571414947509766, + "learning_rate": 2.899987221704453e-08, + "logits/chosen": -1.9028421640396118, + "logits/rejected": -2.308382749557495, + "logps/chosen": -2.1655309200286865, + "logps/rejected": -2.9270224571228027, + "loss": 2.0631, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.655309677124023, + "rewards/margins": 7.614912986755371, + "rewards/rejected": -29.270221710205078, + "step": 26760 + }, + { + "epoch": 0.9021200579729685, + "grad_norm": 16.592662811279297, + "learning_rate": 2.8901238578818153e-08, + "logits/chosen": -1.417004942893982, + "logits/rejected": -1.7401018142700195, + "logps/chosen": -2.598989725112915, + "logps/rejected": -3.228271961212158, + "loss": 2.0323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.98989486694336, + "rewards/margins": 6.292824745178223, + "rewards/rejected": -32.28272247314453, + "step": 26765 + }, + { + "epoch": 0.9022885840439516, + "grad_norm": 19.315271377563477, + "learning_rate": 2.8802767969312524e-08, + "logits/chosen": -2.464322328567505, + "logits/rejected": -2.841975688934326, + "logps/chosen": -2.9235432147979736, + "logps/rejected": -3.1510472297668457, + "loss": 2.342, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.235431671142578, + "rewards/margins": 2.27504301071167, + "rewards/rejected": -31.510473251342773, + "step": 26770 + }, + { + "epoch": 0.9024571101149348, + "grad_norm": 56.24665832519531, + "learning_rate": 2.870446042260444e-08, + "logits/chosen": -1.5425212383270264, + "logits/rejected": -1.7652885913848877, + "logps/chosen": -2.8841872215270996, + "logps/rejected": -3.6044490337371826, + "loss": 1.5722, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.841873168945312, + "rewards/margins": 7.2026166915893555, + "rewards/rejected": -36.044490814208984, + "step": 26775 + }, + { + "epoch": 0.902625636185918, + "grad_norm": 21.616291046142578, + "learning_rate": 2.86063159727144e-08, + "logits/chosen": -2.3555169105529785, + "logits/rejected": -2.294743061065674, + "logps/chosen": -3.1372265815734863, + "logps/rejected": -3.617166042327881, + "loss": 2.1903, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.372264862060547, + "rewards/margins": 4.799395561218262, + "rewards/rejected": -36.17166519165039, + "step": 26780 + }, + { + "epoch": 0.9027941622569011, + "grad_norm": 17.52286148071289, + "learning_rate": 2.8508334653606135e-08, + "logits/chosen": -2.1238174438476562, + "logits/rejected": -2.157766580581665, + "logps/chosen": -2.153778553009033, + "logps/rejected": -2.369436502456665, + "loss": 3.4831, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.537784576416016, + "rewards/margins": 2.1565799713134766, + "rewards/rejected": -23.694364547729492, + "step": 26785 + }, + { + "epoch": 0.9029626883278843, + "grad_norm": 69.37480926513672, + "learning_rate": 2.8410516499187244e-08, + "logits/chosen": -1.8046211004257202, + "logits/rejected": -2.1276800632476807, + "logps/chosen": -2.6248655319213867, + "logps/rejected": -3.009124279022217, + "loss": 2.1481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.2486572265625, + "rewards/margins": 3.8425865173339844, + "rewards/rejected": -30.09124183654785, + "step": 26790 + }, + { + "epoch": 0.9031312143988675, + "grad_norm": 31.732303619384766, + "learning_rate": 2.8312861543308696e-08, + "logits/chosen": -2.1751065254211426, + "logits/rejected": -2.1534345149993896, + "logps/chosen": -2.7119526863098145, + "logps/rejected": -2.8299505710601807, + "loss": 4.864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.11952781677246, + "rewards/margins": 1.1799787282943726, + "rewards/rejected": -28.29950523376465, + "step": 26795 + }, + { + "epoch": 0.9032997404698507, + "grad_norm": 25.450721740722656, + "learning_rate": 2.821536981976502e-08, + "logits/chosen": -1.3617388010025024, + "logits/rejected": -1.6299562454223633, + "logps/chosen": -2.2438454627990723, + "logps/rejected": -2.4280383586883545, + "loss": 3.6415, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.43845558166504, + "rewards/margins": 1.8419297933578491, + "rewards/rejected": -24.280384063720703, + "step": 26800 + }, + { + "epoch": 0.9032997404698507, + "eval_logits/chosen": -2.3114259243011475, + "eval_logits/rejected": -2.489382743835449, + "eval_logps/chosen": -2.2882986068725586, + "eval_logps/rejected": -2.4433987140655518, + "eval_loss": 3.089292287826538, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.882986068725586, + "eval_rewards/margins": 1.5510010719299316, + "eval_rewards/rejected": -24.433984756469727, + "eval_runtime": 12.9054, + "eval_samples_per_second": 7.749, + "eval_steps_per_second": 1.937, + "step": 26800 + }, + { + "epoch": 0.9034682665408339, + "grad_norm": 26.823387145996094, + "learning_rate": 2.81180413622944e-08, + "logits/chosen": -1.6327470541000366, + "logits/rejected": -1.6029590368270874, + "logps/chosen": -3.0612990856170654, + "logps/rejected": -3.3704915046691895, + "loss": 4.5638, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.612987518310547, + "rewards/margins": 3.091923713684082, + "rewards/rejected": -33.70491409301758, + "step": 26805 + }, + { + "epoch": 0.9036367926118171, + "grad_norm": 42.258033752441406, + "learning_rate": 2.8020876204578104e-08, + "logits/chosen": -2.2854976654052734, + "logits/rejected": -2.360747814178467, + "logps/chosen": -3.3987221717834473, + "logps/rejected": -3.3430073261260986, + "loss": 5.2192, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.987220764160156, + "rewards/margins": -0.5571478605270386, + "rewards/rejected": -33.430076599121094, + "step": 26810 + }, + { + "epoch": 0.9038053186828002, + "grad_norm": 3.3188095092773438, + "learning_rate": 2.7923874380241407e-08, + "logits/chosen": -1.265821933746338, + "logits/rejected": -1.7384541034698486, + "logps/chosen": -2.1968064308166504, + "logps/rejected": -2.7010929584503174, + "loss": 1.1806, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.968059539794922, + "rewards/margins": 5.042864799499512, + "rewards/rejected": -27.01092529296875, + "step": 26815 + }, + { + "epoch": 0.9039738447537834, + "grad_norm": 42.46717834472656, + "learning_rate": 2.7827035922852682e-08, + "logits/chosen": -1.985002875328064, + "logits/rejected": -2.063169240951538, + "logps/chosen": -2.1471776962280273, + "logps/rejected": -2.5167198181152344, + "loss": 1.2743, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.471778869628906, + "rewards/margins": 3.6954193115234375, + "rewards/rejected": -25.16719627380371, + "step": 26820 + }, + { + "epoch": 0.9041423708247666, + "grad_norm": 3.5700883865356445, + "learning_rate": 2.7730360865923952e-08, + "logits/chosen": -1.6788097620010376, + "logits/rejected": -2.0639233589172363, + "logps/chosen": -1.9001522064208984, + "logps/rejected": -2.203819751739502, + "loss": 1.5051, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.001523971557617, + "rewards/margins": 3.0366764068603516, + "rewards/rejected": -22.0382022857666, + "step": 26825 + }, + { + "epoch": 0.9043108968957497, + "grad_norm": 29.52191734313965, + "learning_rate": 2.7633849242910622e-08, + "logits/chosen": -1.4773155450820923, + "logits/rejected": -1.8487679958343506, + "logps/chosen": -2.4244544506073, + "logps/rejected": -2.540086030960083, + "loss": 2.6833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.244544982910156, + "rewards/margins": 1.1563156843185425, + "rewards/rejected": -25.400859832763672, + "step": 26830 + }, + { + "epoch": 0.904479422966733, + "grad_norm": 40.38792419433594, + "learning_rate": 2.75375010872117e-08, + "logits/chosen": -1.6742206811904907, + "logits/rejected": -1.5991586446762085, + "logps/chosen": -2.3412322998046875, + "logps/rejected": -2.2697975635528564, + "loss": 4.0595, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.412322998046875, + "rewards/margins": -0.7143500447273254, + "rewards/rejected": -22.697973251342773, + "step": 26835 + }, + { + "epoch": 0.9046479490377162, + "grad_norm": 25.706867218017578, + "learning_rate": 2.744131643216929e-08, + "logits/chosen": -1.7045414447784424, + "logits/rejected": -2.089465856552124, + "logps/chosen": -1.8297027349472046, + "logps/rejected": -2.1544618606567383, + "loss": 1.8551, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.297027587890625, + "rewards/margins": 3.247591495513916, + "rewards/rejected": -21.544620513916016, + "step": 26840 + }, + { + "epoch": 0.9048164751086993, + "grad_norm": 19.23711395263672, + "learning_rate": 2.734529531106916e-08, + "logits/chosen": -1.7439048290252686, + "logits/rejected": -1.5896174907684326, + "logps/chosen": -3.032853364944458, + "logps/rejected": -3.0909409523010254, + "loss": 5.9402, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.328533172607422, + "rewards/margins": 0.5808780789375305, + "rewards/rejected": -30.909412384033203, + "step": 26845 + }, + { + "epoch": 0.9049850011796825, + "grad_norm": 46.411834716796875, + "learning_rate": 2.7249437757140615e-08, + "logits/chosen": -1.701541543006897, + "logits/rejected": -1.8513438701629639, + "logps/chosen": -2.831533908843994, + "logps/rejected": -3.0360515117645264, + "loss": 2.4463, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.31533432006836, + "rewards/margins": 2.045178174972534, + "rewards/rejected": -30.360515594482422, + "step": 26850 + }, + { + "epoch": 0.9051535272506657, + "grad_norm": 26.993629455566406, + "learning_rate": 2.7153743803555894e-08, + "logits/chosen": -1.7826759815216064, + "logits/rejected": -2.196767807006836, + "logps/chosen": -1.8802204132080078, + "logps/rejected": -2.171701192855835, + "loss": 2.3952, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.80220603942871, + "rewards/margins": 2.9148058891296387, + "rewards/rejected": -21.717010498046875, + "step": 26855 + }, + { + "epoch": 0.9053220533216488, + "grad_norm": 72.22422790527344, + "learning_rate": 2.705821348343107e-08, + "logits/chosen": -1.5827332735061646, + "logits/rejected": -2.21030855178833, + "logps/chosen": -2.4274351596832275, + "logps/rejected": -3.115079641342163, + "loss": 0.7714, + "rewards/accuracies": 1.0, + "rewards/chosen": -24.274351119995117, + "rewards/margins": 6.876446723937988, + "rewards/rejected": -31.15079689025879, + "step": 26860 + }, + { + "epoch": 0.905490579392632, + "grad_norm": 3.0414493083953857, + "learning_rate": 2.6962846829825415e-08, + "logits/chosen": -2.1721713542938232, + "logits/rejected": -2.610666275024414, + "logps/chosen": -3.422571897506714, + "logps/rejected": -4.277789115905762, + "loss": 1.0687, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -34.2257194519043, + "rewards/margins": 8.55217170715332, + "rewards/rejected": -42.77789306640625, + "step": 26865 + }, + { + "epoch": 0.9056591054636152, + "grad_norm": 97.23381805419922, + "learning_rate": 2.6867643875741585e-08, + "logits/chosen": -1.9782911539077759, + "logits/rejected": -2.014655351638794, + "logps/chosen": -2.3666396141052246, + "logps/rejected": -2.3032031059265137, + "loss": 4.2141, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.666393280029297, + "rewards/margins": -0.6343621015548706, + "rewards/rejected": -23.032032012939453, + "step": 26870 + }, + { + "epoch": 0.9058276315345984, + "grad_norm": 44.421600341796875, + "learning_rate": 2.677260465412551e-08, + "logits/chosen": -1.8927501440048218, + "logits/rejected": -2.187912940979004, + "logps/chosen": -2.850724458694458, + "logps/rejected": -3.9353561401367188, + "loss": 3.3546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.50724220275879, + "rewards/margins": 10.846318244934082, + "rewards/rejected": -39.35356521606445, + "step": 26875 + }, + { + "epoch": 0.9059961576055816, + "grad_norm": 50.800689697265625, + "learning_rate": 2.667772919786648e-08, + "logits/chosen": -2.2175285816192627, + "logits/rejected": -2.1087779998779297, + "logps/chosen": -2.667564630508423, + "logps/rejected": -3.046851396560669, + "loss": 2.6457, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.675647735595703, + "rewards/margins": 3.792868137359619, + "rewards/rejected": -30.4685115814209, + "step": 26880 + }, + { + "epoch": 0.9061646836765648, + "grad_norm": 34.15726089477539, + "learning_rate": 2.6583017539797358e-08, + "logits/chosen": -1.3783862590789795, + "logits/rejected": -1.9121935367584229, + "logps/chosen": -2.2632646560668945, + "logps/rejected": -2.575612783432007, + "loss": 1.7138, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.632644653320312, + "rewards/margins": 3.1234829425811768, + "rewards/rejected": -25.756128311157227, + "step": 26885 + }, + { + "epoch": 0.9063332097475479, + "grad_norm": 20.80045509338379, + "learning_rate": 2.6488469712693862e-08, + "logits/chosen": -1.5416743755340576, + "logits/rejected": -1.7947609424591064, + "logps/chosen": -2.3976223468780518, + "logps/rejected": -2.6126463413238525, + "loss": 2.0554, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.97622299194336, + "rewards/margins": 2.150240898132324, + "rewards/rejected": -26.12646484375, + "step": 26890 + }, + { + "epoch": 0.9065017358185311, + "grad_norm": 0.6815192103385925, + "learning_rate": 2.639408574927543e-08, + "logits/chosen": -1.4656648635864258, + "logits/rejected": -1.8038467168807983, + "logps/chosen": -2.7530505657196045, + "logps/rejected": -2.9685773849487305, + "loss": 2.3417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.530506134033203, + "rewards/margins": 2.155269145965576, + "rewards/rejected": -29.685771942138672, + "step": 26895 + }, + { + "epoch": 0.9066702618895143, + "grad_norm": 140.0187530517578, + "learning_rate": 2.62998656822046e-08, + "logits/chosen": -1.39266836643219, + "logits/rejected": -2.597870111465454, + "logps/chosen": -2.699371814727783, + "logps/rejected": -4.062556743621826, + "loss": 1.7151, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.993717193603516, + "rewards/margins": 13.631851196289062, + "rewards/rejected": -40.62556838989258, + "step": 26900 + }, + { + "epoch": 0.9068387879604974, + "grad_norm": 40.37078094482422, + "learning_rate": 2.620580954408724e-08, + "logits/chosen": -2.158634901046753, + "logits/rejected": -2.2582380771636963, + "logps/chosen": -2.825108051300049, + "logps/rejected": -3.085275888442993, + "loss": 2.8587, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.251079559326172, + "rewards/margins": 2.6016762256622314, + "rewards/rejected": -30.85275650024414, + "step": 26905 + }, + { + "epoch": 0.9070073140314807, + "grad_norm": 0.6319014430046082, + "learning_rate": 2.6111917367472425e-08, + "logits/chosen": -1.7111196517944336, + "logits/rejected": -1.805869698524475, + "logps/chosen": -3.093212604522705, + "logps/rejected": -3.5046744346618652, + "loss": 1.9236, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.9321231842041, + "rewards/margins": 4.114621639251709, + "rewards/rejected": -35.04674530029297, + "step": 26910 + }, + { + "epoch": 0.9071758401024639, + "grad_norm": 28.448257446289062, + "learning_rate": 2.6018189184852545e-08, + "logits/chosen": -1.9640458822250366, + "logits/rejected": -2.1212968826293945, + "logps/chosen": -1.9360281229019165, + "logps/rejected": -1.9871841669082642, + "loss": 2.8041, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.360280990600586, + "rewards/margins": 0.5115610361099243, + "rewards/rejected": -19.871841430664062, + "step": 26915 + }, + { + "epoch": 0.907344366173447, + "grad_norm": 50.9765625, + "learning_rate": 2.592462502866333e-08, + "logits/chosen": -1.5658257007598877, + "logits/rejected": -1.8782110214233398, + "logps/chosen": -2.493435859680176, + "logps/rejected": -2.6581640243530273, + "loss": 2.7056, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.93436050415039, + "rewards/margins": 1.6472809314727783, + "rewards/rejected": -26.581640243530273, + "step": 26920 + }, + { + "epoch": 0.9075128922444302, + "grad_norm": 24.455228805541992, + "learning_rate": 2.583122493128348e-08, + "logits/chosen": -1.6781708002090454, + "logits/rejected": -1.9250268936157227, + "logps/chosen": -2.064728021621704, + "logps/rejected": -2.7096962928771973, + "loss": 2.4928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.64727783203125, + "rewards/margins": 6.449685573577881, + "rewards/rejected": -27.096965789794922, + "step": 26925 + }, + { + "epoch": 0.9076814183154134, + "grad_norm": 74.76783752441406, + "learning_rate": 2.5737988925035204e-08, + "logits/chosen": -1.885584831237793, + "logits/rejected": -2.2635011672973633, + "logps/chosen": -2.8694005012512207, + "logps/rejected": -3.353682279586792, + "loss": 2.965, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.694005966186523, + "rewards/margins": 4.842817306518555, + "rewards/rejected": -33.53682327270508, + "step": 26930 + }, + { + "epoch": 0.9078499443863965, + "grad_norm": 40.00210189819336, + "learning_rate": 2.5644917042183745e-08, + "logits/chosen": -1.3769192695617676, + "logits/rejected": -1.2565984725952148, + "logps/chosen": -2.0572023391723633, + "logps/rejected": -2.2845962047576904, + "loss": 1.8896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.572025299072266, + "rewards/margins": 2.2739388942718506, + "rewards/rejected": -22.845962524414062, + "step": 26935 + }, + { + "epoch": 0.9080184704573797, + "grad_norm": 28.726268768310547, + "learning_rate": 2.5552009314937728e-08, + "logits/chosen": -2.2686069011688232, + "logits/rejected": -2.4023404121398926, + "logps/chosen": -2.7572731971740723, + "logps/rejected": -2.9134395122528076, + "loss": 2.7201, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.57273292541504, + "rewards/margins": 1.5616614818572998, + "rewards/rejected": -29.1343936920166, + "step": 26940 + }, + { + "epoch": 0.908186996528363, + "grad_norm": 25.02350425720215, + "learning_rate": 2.545926577544877e-08, + "logits/chosen": -1.8146655559539795, + "logits/rejected": -1.8045673370361328, + "logps/chosen": -2.1324949264526367, + "logps/rejected": -2.3172719478607178, + "loss": 2.4044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.324947357177734, + "rewards/margins": 1.8477720022201538, + "rewards/rejected": -23.172719955444336, + "step": 26945 + }, + { + "epoch": 0.9083555225993462, + "grad_norm": 31.74529266357422, + "learning_rate": 2.5366686455811693e-08, + "logits/chosen": -2.118863344192505, + "logits/rejected": -2.5201308727264404, + "logps/chosen": -2.5623345375061035, + "logps/rejected": -3.3792407512664795, + "loss": 2.1214, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.623342514038086, + "rewards/margins": 8.16906452178955, + "rewards/rejected": -33.79241180419922, + "step": 26950 + }, + { + "epoch": 0.9085240486703293, + "grad_norm": 17.955358505249023, + "learning_rate": 2.527427138806465e-08, + "logits/chosen": -1.5059000253677368, + "logits/rejected": -1.6303247213363647, + "logps/chosen": -2.6259942054748535, + "logps/rejected": -3.205542802810669, + "loss": 1.4593, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.25994300842285, + "rewards/margins": 5.795483589172363, + "rewards/rejected": -32.05542755126953, + "step": 26955 + }, + { + "epoch": 0.9086925747413125, + "grad_norm": 30.412948608398438, + "learning_rate": 2.5182020604188892e-08, + "logits/chosen": -1.84066903591156, + "logits/rejected": -2.3871397972106934, + "logps/chosen": -3.038327693939209, + "logps/rejected": -3.112415313720703, + "loss": 4.9481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.38327980041504, + "rewards/margins": 0.7408765554428101, + "rewards/rejected": -31.124155044555664, + "step": 26960 + }, + { + "epoch": 0.9088611008122957, + "grad_norm": 25.51723289489746, + "learning_rate": 2.508993413610866e-08, + "logits/chosen": -1.8257606029510498, + "logits/rejected": -2.3894286155700684, + "logps/chosen": -2.486851453781128, + "logps/rejected": -3.019463062286377, + "loss": 2.6325, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.868515014648438, + "rewards/margins": 5.326113224029541, + "rewards/rejected": -30.194629669189453, + "step": 26965 + }, + { + "epoch": 0.9090296268832788, + "grad_norm": 35.03866195678711, + "learning_rate": 2.4998012015691517e-08, + "logits/chosen": -1.8151493072509766, + "logits/rejected": -2.12388277053833, + "logps/chosen": -2.1048531532287598, + "logps/rejected": -2.4690277576446533, + "loss": 3.1153, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.04853057861328, + "rewards/margins": 3.6417458057403564, + "rewards/rejected": -24.690277099609375, + "step": 26970 + }, + { + "epoch": 0.909198152954262, + "grad_norm": 57.44851303100586, + "learning_rate": 2.4906254274748182e-08, + "logits/chosen": -1.9289391040802002, + "logits/rejected": -2.2314047813415527, + "logps/chosen": -2.174452781677246, + "logps/rejected": -2.5939812660217285, + "loss": 2.597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.744531631469727, + "rewards/margins": 4.195285797119141, + "rewards/rejected": -25.939815521240234, + "step": 26975 + }, + { + "epoch": 0.9093666790252452, + "grad_norm": 39.91963195800781, + "learning_rate": 2.4814660945032206e-08, + "logits/chosen": -1.9026374816894531, + "logits/rejected": -2.213430166244507, + "logps/chosen": -2.6050117015838623, + "logps/rejected": -2.9902827739715576, + "loss": 1.2653, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.050119400024414, + "rewards/margins": 3.8527092933654785, + "rewards/rejected": -29.902828216552734, + "step": 26980 + }, + { + "epoch": 0.9095352050962284, + "grad_norm": 40.34387969970703, + "learning_rate": 2.4723232058240507e-08, + "logits/chosen": -2.1123440265655518, + "logits/rejected": -1.9164499044418335, + "logps/chosen": -3.002830982208252, + "logps/rejected": -3.1687307357788086, + "loss": 2.5419, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -30.028308868408203, + "rewards/margins": 1.6589953899383545, + "rewards/rejected": -31.687305450439453, + "step": 26985 + }, + { + "epoch": 0.9097037311672116, + "grad_norm": 28.5035400390625, + "learning_rate": 2.4631967646013107e-08, + "logits/chosen": -1.6512867212295532, + "logits/rejected": -1.7113101482391357, + "logps/chosen": -2.0377159118652344, + "logps/rejected": -2.164827585220337, + "loss": 2.3158, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.377161026000977, + "rewards/margins": 1.271114706993103, + "rewards/rejected": -21.64827537536621, + "step": 26990 + }, + { + "epoch": 0.9098722572381948, + "grad_norm": 47.506649017333984, + "learning_rate": 2.4540867739932912e-08, + "logits/chosen": -1.38832688331604, + "logits/rejected": -1.7318493127822876, + "logps/chosen": -2.3842415809631348, + "logps/rejected": -2.51163911819458, + "loss": 4.2326, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.84241485595703, + "rewards/margins": 1.2739765644073486, + "rewards/rejected": -25.116390228271484, + "step": 26995 + }, + { + "epoch": 0.9100407833091779, + "grad_norm": 39.53714370727539, + "learning_rate": 2.444993237152604e-08, + "logits/chosen": -2.342810869216919, + "logits/rejected": -2.1015408039093018, + "logps/chosen": -2.428750991821289, + "logps/rejected": -2.727332592010498, + "loss": 1.7182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.28750991821289, + "rewards/margins": 2.9858198165893555, + "rewards/rejected": -27.273326873779297, + "step": 27000 + }, + { + "epoch": 0.9102093093801611, + "grad_norm": 21.055631637573242, + "learning_rate": 2.4359161572261645e-08, + "logits/chosen": -1.832567572593689, + "logits/rejected": -1.7975997924804688, + "logps/chosen": -2.410696506500244, + "logps/rejected": -2.769796848297119, + "loss": 2.5338, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.10696792602539, + "rewards/margins": 3.59100341796875, + "rewards/rejected": -27.69797134399414, + "step": 27005 + }, + { + "epoch": 0.9103778354511443, + "grad_norm": 52.577049255371094, + "learning_rate": 2.426855537355199e-08, + "logits/chosen": -0.961966872215271, + "logits/rejected": -1.447507381439209, + "logps/chosen": -2.2045254707336426, + "logps/rejected": -2.5315704345703125, + "loss": 1.8555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.04525375366211, + "rewards/margins": 3.2704505920410156, + "rewards/rejected": -25.315704345703125, + "step": 27010 + }, + { + "epoch": 0.9105463615221274, + "grad_norm": 42.69902801513672, + "learning_rate": 2.4178113806752222e-08, + "logits/chosen": -2.3947811126708984, + "logits/rejected": -2.06548810005188, + "logps/chosen": -2.6121039390563965, + "logps/rejected": -2.8605058193206787, + "loss": 4.3609, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.121036529541016, + "rewards/margins": 2.4840221405029297, + "rewards/rejected": -28.605060577392578, + "step": 27015 + }, + { + "epoch": 0.9107148875931107, + "grad_norm": 21.63785171508789, + "learning_rate": 2.4087836903160574e-08, + "logits/chosen": -1.3393892049789429, + "logits/rejected": -1.6054449081420898, + "logps/chosen": -2.1144537925720215, + "logps/rejected": -2.4885573387145996, + "loss": 1.193, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.14453887939453, + "rewards/margins": 3.741034746170044, + "rewards/rejected": -24.885574340820312, + "step": 27020 + }, + { + "epoch": 0.9108834136640939, + "grad_norm": 34.35652160644531, + "learning_rate": 2.399772469401845e-08, + "logits/chosen": -1.684240698814392, + "logits/rejected": -1.5888200998306274, + "logps/chosen": -2.075859785079956, + "logps/rejected": -1.9586131572723389, + "loss": 5.9731, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.75859832763672, + "rewards/margins": -1.1724650859832764, + "rewards/rejected": -19.58613395690918, + "step": 27025 + }, + { + "epoch": 0.911051939735077, + "grad_norm": 61.34150695800781, + "learning_rate": 2.390777721051007e-08, + "logits/chosen": -1.950811743736267, + "logits/rejected": -2.438870668411255, + "logps/chosen": -2.0603041648864746, + "logps/rejected": -2.328369617462158, + "loss": 3.326, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.60304069519043, + "rewards/margins": 2.6806557178497314, + "rewards/rejected": -23.2836971282959, + "step": 27030 + }, + { + "epoch": 0.9112204658060602, + "grad_norm": 43.496246337890625, + "learning_rate": 2.3817994483762648e-08, + "logits/chosen": -1.5817230939865112, + "logits/rejected": -2.0574898719787598, + "logps/chosen": -2.211698055267334, + "logps/rejected": -2.8117573261260986, + "loss": 2.0304, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.116981506347656, + "rewards/margins": 6.000591278076172, + "rewards/rejected": -28.117572784423828, + "step": 27035 + }, + { + "epoch": 0.9113889918770434, + "grad_norm": 8.805611610412598, + "learning_rate": 2.372837654484655e-08, + "logits/chosen": -1.6177377700805664, + "logits/rejected": -1.8455016613006592, + "logps/chosen": -2.7482638359069824, + "logps/rejected": -2.7214274406433105, + "loss": 4.0024, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.48263931274414, + "rewards/margins": -0.2683609127998352, + "rewards/rejected": -27.214275360107422, + "step": 27040 + }, + { + "epoch": 0.9115575179480265, + "grad_norm": 47.26149368286133, + "learning_rate": 2.3638923424775025e-08, + "logits/chosen": -2.373136520385742, + "logits/rejected": -2.4441769123077393, + "logps/chosen": -3.1748688220977783, + "logps/rejected": -3.6415977478027344, + "loss": 1.6607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.748687744140625, + "rewards/margins": 4.667288303375244, + "rewards/rejected": -36.415977478027344, + "step": 27045 + }, + { + "epoch": 0.9117260440190097, + "grad_norm": 141.83863830566406, + "learning_rate": 2.3549635154504145e-08, + "logits/chosen": -1.7244266271591187, + "logits/rejected": -2.0603253841400146, + "logps/chosen": -3.489849805831909, + "logps/rejected": -3.5083630084991455, + "loss": 6.7995, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.898494720458984, + "rewards/margins": 0.1851322203874588, + "rewards/rejected": -35.08362579345703, + "step": 27050 + }, + { + "epoch": 0.911894570089993, + "grad_norm": 22.03189468383789, + "learning_rate": 2.3460511764933187e-08, + "logits/chosen": -2.0633015632629395, + "logits/rejected": -2.145843029022217, + "logps/chosen": -1.955177664756775, + "logps/rejected": -2.733556032180786, + "loss": 2.2386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.55177879333496, + "rewards/margins": 7.7837815284729, + "rewards/rejected": -27.335559844970703, + "step": 27055 + }, + { + "epoch": 0.9120630961609761, + "grad_norm": 32.86399459838867, + "learning_rate": 2.337155328690421e-08, + "logits/chosen": -1.5522150993347168, + "logits/rejected": -1.4998700618743896, + "logps/chosen": -2.131031036376953, + "logps/rejected": -2.2082178592681885, + "loss": 2.7185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.31031036376953, + "rewards/margins": 0.7718679308891296, + "rewards/rejected": -22.082176208496094, + "step": 27060 + }, + { + "epoch": 0.9122316222319593, + "grad_norm": 39.76874542236328, + "learning_rate": 2.3282759751202197e-08, + "logits/chosen": -2.256934642791748, + "logits/rejected": -2.1311392784118652, + "logps/chosen": -2.158473253250122, + "logps/rejected": -2.0872859954833984, + "loss": 4.0022, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -21.584732055664062, + "rewards/margins": -0.7118738889694214, + "rewards/rejected": -20.87285804748535, + "step": 27065 + }, + { + "epoch": 0.9124001483029425, + "grad_norm": 34.26323318481445, + "learning_rate": 2.319413118855512e-08, + "logits/chosen": -1.6916424036026, + "logits/rejected": -2.1182878017425537, + "logps/chosen": -2.520982027053833, + "logps/rejected": -2.8135578632354736, + "loss": 2.329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.209819793701172, + "rewards/margins": 2.9257562160491943, + "rewards/rejected": -28.135578155517578, + "step": 27070 + }, + { + "epoch": 0.9125686743739256, + "grad_norm": 61.415306091308594, + "learning_rate": 2.310566762963384e-08, + "logits/chosen": -2.140953779220581, + "logits/rejected": -2.252530813217163, + "logps/chosen": -2.918928623199463, + "logps/rejected": -3.3146369457244873, + "loss": 5.3396, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.189289093017578, + "rewards/margins": 3.957087755203247, + "rewards/rejected": -33.1463737487793, + "step": 27075 + }, + { + "epoch": 0.9127372004449088, + "grad_norm": 30.03462791442871, + "learning_rate": 2.3017369105052142e-08, + "logits/chosen": -1.6456882953643799, + "logits/rejected": -1.8969169855117798, + "logps/chosen": -2.3969192504882812, + "logps/rejected": -3.0997815132141113, + "loss": 2.3552, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.969192504882812, + "rewards/margins": 7.028619289398193, + "rewards/rejected": -30.997814178466797, + "step": 27080 + }, + { + "epoch": 0.912905726515892, + "grad_norm": 20.82946014404297, + "learning_rate": 2.292923564536664e-08, + "logits/chosen": -1.9467077255249023, + "logits/rejected": -2.043200731277466, + "logps/chosen": -2.5897421836853027, + "logps/rejected": -3.2874908447265625, + "loss": 3.6496, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.897424697875977, + "rewards/margins": 6.9774885177612305, + "rewards/rejected": -32.874908447265625, + "step": 27085 + }, + { + "epoch": 0.9130742525868751, + "grad_norm": 83.79644775390625, + "learning_rate": 2.284126728107677e-08, + "logits/chosen": -1.6562871932983398, + "logits/rejected": -1.8409792184829712, + "logps/chosen": -2.63004207611084, + "logps/rejected": -2.590404748916626, + "loss": 4.0595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.3004207611084, + "rewards/margins": -0.39637669920921326, + "rewards/rejected": -25.9040470123291, + "step": 27090 + }, + { + "epoch": 0.9132427786578584, + "grad_norm": 23.544960021972656, + "learning_rate": 2.2753464042625015e-08, + "logits/chosen": -1.9590803384780884, + "logits/rejected": -2.0734570026397705, + "logps/chosen": -2.8419137001037598, + "logps/rejected": -3.1839821338653564, + "loss": 2.4574, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.41913414001465, + "rewards/margins": 3.4206855297088623, + "rewards/rejected": -31.839818954467773, + "step": 27095 + }, + { + "epoch": 0.9134113047288416, + "grad_norm": 45.71376037597656, + "learning_rate": 2.2665825960396624e-08, + "logits/chosen": -1.905147910118103, + "logits/rejected": -2.3859221935272217, + "logps/chosen": -1.8513004779815674, + "logps/rejected": -2.118018627166748, + "loss": 1.5718, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.513004302978516, + "rewards/margins": 2.667184591293335, + "rewards/rejected": -21.180187225341797, + "step": 27100 + }, + { + "epoch": 0.9135798307998247, + "grad_norm": 52.81638717651367, + "learning_rate": 2.257835306471967e-08, + "logits/chosen": -2.118196487426758, + "logits/rejected": -2.160170793533325, + "logps/chosen": -2.148125648498535, + "logps/rejected": -2.3048596382141113, + "loss": 2.4082, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.481258392333984, + "rewards/margins": 1.5673408508300781, + "rewards/rejected": -23.04859733581543, + "step": 27105 + }, + { + "epoch": 0.9137483568708079, + "grad_norm": 55.22969055175781, + "learning_rate": 2.2491045385864993e-08, + "logits/chosen": -1.9875249862670898, + "logits/rejected": -2.3192827701568604, + "logps/chosen": -3.277228593826294, + "logps/rejected": -3.8430697917938232, + "loss": 2.3816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.77228927612305, + "rewards/margins": 5.658409595489502, + "rewards/rejected": -38.430702209472656, + "step": 27110 + }, + { + "epoch": 0.9139168829417911, + "grad_norm": 45.046607971191406, + "learning_rate": 2.2403902954046427e-08, + "logits/chosen": -1.7681725025177002, + "logits/rejected": -1.7429002523422241, + "logps/chosen": -2.472874641418457, + "logps/rejected": -2.4763524532318115, + "loss": 3.3829, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.728748321533203, + "rewards/margins": 0.0347774513065815, + "rewards/rejected": -24.76352310180664, + "step": 27115 + }, + { + "epoch": 0.9140854090127742, + "grad_norm": 22.940006256103516, + "learning_rate": 2.2316925799420517e-08, + "logits/chosen": -2.224156618118286, + "logits/rejected": -2.271947145462036, + "logps/chosen": -2.6618289947509766, + "logps/rejected": -2.998880386352539, + "loss": 2.5628, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.618289947509766, + "rewards/margins": 3.370516300201416, + "rewards/rejected": -29.988805770874023, + "step": 27120 + }, + { + "epoch": 0.9142539350837574, + "grad_norm": 19.00252914428711, + "learning_rate": 2.2230113952086626e-08, + "logits/chosen": -1.829633116722107, + "logits/rejected": -2.575827121734619, + "logps/chosen": -2.6892051696777344, + "logps/rejected": -3.115910291671753, + "loss": 4.5612, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.89204978942871, + "rewards/margins": 4.267054080963135, + "rewards/rejected": -31.159103393554688, + "step": 27125 + }, + { + "epoch": 0.9144224611547407, + "grad_norm": 25.019968032836914, + "learning_rate": 2.2143467442086948e-08, + "logits/chosen": -1.8367170095443726, + "logits/rejected": -1.8480660915374756, + "logps/chosen": -2.1454520225524902, + "logps/rejected": -2.04133939743042, + "loss": 4.9755, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.454519271850586, + "rewards/margins": -1.041122317314148, + "rewards/rejected": -20.41339683532715, + "step": 27130 + }, + { + "epoch": 0.9145909872257239, + "grad_norm": 72.38630676269531, + "learning_rate": 2.205698629940639e-08, + "logits/chosen": -1.979882836341858, + "logits/rejected": -2.1404590606689453, + "logps/chosen": -2.3046226501464844, + "logps/rejected": -2.704655408859253, + "loss": 1.9762, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.046226501464844, + "rewards/margins": 4.000330448150635, + "rewards/rejected": -27.046558380126953, + "step": 27135 + }, + { + "epoch": 0.914759513296707, + "grad_norm": 617.034423828125, + "learning_rate": 2.1970670553972613e-08, + "logits/chosen": -2.043015480041504, + "logits/rejected": -1.733640432357788, + "logps/chosen": -2.702033758163452, + "logps/rejected": -2.0956649780273438, + "loss": 9.1843, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.020336151123047, + "rewards/margins": -6.063687801361084, + "rewards/rejected": -20.956649780273438, + "step": 27140 + }, + { + "epoch": 0.9149280393676902, + "grad_norm": 37.748779296875, + "learning_rate": 2.188452023565618e-08, + "logits/chosen": -1.8703845739364624, + "logits/rejected": -1.954056739807129, + "logps/chosen": -2.447855234146118, + "logps/rejected": -3.0836079120635986, + "loss": 2.5521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.478551864624023, + "rewards/margins": 6.357526779174805, + "rewards/rejected": -30.836078643798828, + "step": 27145 + }, + { + "epoch": 0.9150965654386733, + "grad_norm": 35.00310134887695, + "learning_rate": 2.1798535374270345e-08, + "logits/chosen": -1.4112383127212524, + "logits/rejected": -1.43263840675354, + "logps/chosen": -2.427001476287842, + "logps/rejected": -2.7232165336608887, + "loss": 2.4778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.270015716552734, + "rewards/margins": 2.9621503353118896, + "rewards/rejected": -27.232168197631836, + "step": 27150 + }, + { + "epoch": 0.9152650915096565, + "grad_norm": 51.57535171508789, + "learning_rate": 2.1712715999570974e-08, + "logits/chosen": -2.230093479156494, + "logits/rejected": -2.553774833679199, + "logps/chosen": -3.5125930309295654, + "logps/rejected": -4.026968955993652, + "loss": 2.2864, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.12593078613281, + "rewards/margins": 5.1437578201293945, + "rewards/rejected": -40.26968765258789, + "step": 27155 + }, + { + "epoch": 0.9154336175806397, + "grad_norm": 79.23214721679688, + "learning_rate": 2.1627062141256815e-08, + "logits/chosen": -1.558318853378296, + "logits/rejected": -1.3582528829574585, + "logps/chosen": -2.803522825241089, + "logps/rejected": -2.6959805488586426, + "loss": 4.4538, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -28.035228729248047, + "rewards/margins": -1.0754238367080688, + "rewards/rejected": -26.95980453491211, + "step": 27160 + }, + { + "epoch": 0.915602143651623, + "grad_norm": 14.498846054077148, + "learning_rate": 2.154157382896943e-08, + "logits/chosen": -1.7168614864349365, + "logits/rejected": -2.117745876312256, + "logps/chosen": -3.605928897857666, + "logps/rejected": -4.526423454284668, + "loss": 2.3218, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -36.059288024902344, + "rewards/margins": 9.204951286315918, + "rewards/rejected": -45.26424026489258, + "step": 27165 + }, + { + "epoch": 0.9157706697226061, + "grad_norm": 1.3730069398880005, + "learning_rate": 2.145625109229271e-08, + "logits/chosen": -2.0063352584838867, + "logits/rejected": -2.3878989219665527, + "logps/chosen": -2.6619865894317627, + "logps/rejected": -3.246828079223633, + "loss": 2.4983, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.6198673248291, + "rewards/margins": 5.848411560058594, + "rewards/rejected": -32.46827697753906, + "step": 27170 + }, + { + "epoch": 0.9159391957935893, + "grad_norm": 33.612762451171875, + "learning_rate": 2.137109396075365e-08, + "logits/chosen": -1.2773253917694092, + "logits/rejected": -1.6957318782806396, + "logps/chosen": -2.4628424644470215, + "logps/rejected": -2.8134007453918457, + "loss": 1.7423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.6284236907959, + "rewards/margins": 3.5055809020996094, + "rewards/rejected": -28.13400650024414, + "step": 27175 + }, + { + "epoch": 0.9161077218645725, + "grad_norm": 9.232412338256836, + "learning_rate": 2.1286102463821675e-08, + "logits/chosen": -2.0160505771636963, + "logits/rejected": -2.3261797428131104, + "logps/chosen": -2.102545738220215, + "logps/rejected": -2.440828323364258, + "loss": 1.8755, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.025455474853516, + "rewards/margins": 3.3828253746032715, + "rewards/rejected": -24.408281326293945, + "step": 27180 + }, + { + "epoch": 0.9162762479355556, + "grad_norm": 15.199311256408691, + "learning_rate": 2.1201276630909203e-08, + "logits/chosen": -2.2370071411132812, + "logits/rejected": -2.714974880218506, + "logps/chosen": -1.9336721897125244, + "logps/rejected": -2.636110305786133, + "loss": 1.5309, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.336721420288086, + "rewards/margins": 7.024382591247559, + "rewards/rejected": -26.36110496520996, + "step": 27185 + }, + { + "epoch": 0.9164447740065388, + "grad_norm": 39.4886589050293, + "learning_rate": 2.1116616491370863e-08, + "logits/chosen": -2.0533199310302734, + "logits/rejected": -2.326814889907837, + "logps/chosen": -2.4584240913391113, + "logps/rejected": -2.459068775177002, + "loss": 3.996, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.584243774414062, + "rewards/margins": 0.006443119142204523, + "rewards/rejected": -24.590682983398438, + "step": 27190 + }, + { + "epoch": 0.916613300077522, + "grad_norm": 26.44153594970703, + "learning_rate": 2.1032122074504332e-08, + "logits/chosen": -1.8153555393218994, + "logits/rejected": -2.659576177597046, + "logps/chosen": -2.7765450477600098, + "logps/rejected": -4.102330207824707, + "loss": 2.3122, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.76544761657715, + "rewards/margins": 13.257855415344238, + "rewards/rejected": -41.0233039855957, + "step": 27195 + }, + { + "epoch": 0.9167818261485051, + "grad_norm": 50.18436813354492, + "learning_rate": 2.094779340954983e-08, + "logits/chosen": -2.3786637783050537, + "logits/rejected": -2.708362102508545, + "logps/chosen": -2.495008945465088, + "logps/rejected": -3.038865804672241, + "loss": 2.0584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.950092315673828, + "rewards/margins": 5.438567161560059, + "rewards/rejected": -30.388656616210938, + "step": 27200 + }, + { + "epoch": 0.9167818261485051, + "eval_logits/chosen": -2.3133950233459473, + "eval_logits/rejected": -2.4917452335357666, + "eval_logps/chosen": -2.2887940406799316, + "eval_logps/rejected": -2.442683219909668, + "eval_loss": 3.089421272277832, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.887939453125, + "eval_rewards/margins": 1.5388928651809692, + "eval_rewards/rejected": -24.426830291748047, + "eval_runtime": 12.895, + "eval_samples_per_second": 7.755, + "eval_steps_per_second": 1.939, + "step": 27200 + }, + { + "epoch": 0.9169503522194884, + "grad_norm": 1.289928913116455, + "learning_rate": 2.0863630525690066e-08, + "logits/chosen": -1.750335931777954, + "logits/rejected": -2.047384738922119, + "logps/chosen": -2.2318363189697266, + "logps/rejected": -2.868582248687744, + "loss": 2.0977, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.318363189697266, + "rewards/margins": 6.367456912994385, + "rewards/rejected": -28.685821533203125, + "step": 27205 + }, + { + "epoch": 0.9171188782904716, + "grad_norm": 8.582649230957031, + "learning_rate": 2.0779633452050526e-08, + "logits/chosen": -1.8326599597930908, + "logits/rejected": -2.413167953491211, + "logps/chosen": -2.6722311973571777, + "logps/rejected": -3.0735504627227783, + "loss": 1.4892, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.722314834594727, + "rewards/margins": 4.013188362121582, + "rewards/rejected": -30.73550033569336, + "step": 27210 + }, + { + "epoch": 0.9172874043614547, + "grad_norm": 20.33465576171875, + "learning_rate": 2.0695802217699344e-08, + "logits/chosen": -2.121351718902588, + "logits/rejected": -2.4303436279296875, + "logps/chosen": -2.191392421722412, + "logps/rejected": -2.4328086376190186, + "loss": 1.7268, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.913923263549805, + "rewards/margins": 2.414163112640381, + "rewards/rejected": -24.32808494567871, + "step": 27215 + }, + { + "epoch": 0.9174559304324379, + "grad_norm": 35.218204498291016, + "learning_rate": 2.0612136851647255e-08, + "logits/chosen": -1.718423843383789, + "logits/rejected": -2.4133172035217285, + "logps/chosen": -2.11830735206604, + "logps/rejected": -2.890991687774658, + "loss": 1.1518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.183073043823242, + "rewards/margins": 7.72684383392334, + "rewards/rejected": -28.909915924072266, + "step": 27220 + }, + { + "epoch": 0.9176244565034211, + "grad_norm": 75.08181762695312, + "learning_rate": 2.052863738284738e-08, + "logits/chosen": -2.1429359912872314, + "logits/rejected": -2.147839069366455, + "logps/chosen": -3.5561728477478027, + "logps/rejected": -3.924323558807373, + "loss": 2.0425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.561729431152344, + "rewards/margins": 3.681507110595703, + "rewards/rejected": -39.24324035644531, + "step": 27225 + }, + { + "epoch": 0.9177929825744042, + "grad_norm": 186.15933227539062, + "learning_rate": 2.0445303840195717e-08, + "logits/chosen": -2.096203327178955, + "logits/rejected": -2.370142936706543, + "logps/chosen": -3.1219077110290527, + "logps/rejected": -3.7952494621276855, + "loss": 1.9524, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.219079971313477, + "rewards/margins": 6.733415126800537, + "rewards/rejected": -37.952491760253906, + "step": 27230 + }, + { + "epoch": 0.9179615086453874, + "grad_norm": 32.30589294433594, + "learning_rate": 2.0362136252530748e-08, + "logits/chosen": -1.6504993438720703, + "logits/rejected": -1.96540105342865, + "logps/chosen": -2.1283984184265137, + "logps/rejected": -2.554733991622925, + "loss": 1.497, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.283985137939453, + "rewards/margins": 4.263354301452637, + "rewards/rejected": -25.547340393066406, + "step": 27235 + }, + { + "epoch": 0.9181300347163707, + "grad_norm": 36.14775466918945, + "learning_rate": 2.02791346486334e-08, + "logits/chosen": -1.7485713958740234, + "logits/rejected": -1.8560590744018555, + "logps/chosen": -2.0910072326660156, + "logps/rejected": -2.087937116622925, + "loss": 3.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.91007423400879, + "rewards/margins": -0.030699919909238815, + "rewards/rejected": -20.87937355041504, + "step": 27240 + }, + { + "epoch": 0.9182985607873538, + "grad_norm": 20.150341033935547, + "learning_rate": 2.019629905722725e-08, + "logits/chosen": -2.292171001434326, + "logits/rejected": -2.4348361492156982, + "logps/chosen": -2.8926589488983154, + "logps/rejected": -3.336768388748169, + "loss": 1.9384, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.926589965820312, + "rewards/margins": 4.441091060638428, + "rewards/rejected": -33.36768341064453, + "step": 27245 + }, + { + "epoch": 0.918467086858337, + "grad_norm": 41.496490478515625, + "learning_rate": 2.0113629506978536e-08, + "logits/chosen": -1.8662121295928955, + "logits/rejected": -2.5918049812316895, + "logps/chosen": -2.452751874923706, + "logps/rejected": -3.2109732627868652, + "loss": 1.6352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.52752113342285, + "rewards/margins": 7.582211494445801, + "rewards/rejected": -32.10973358154297, + "step": 27250 + }, + { + "epoch": 0.9186356129293202, + "grad_norm": 12.877457618713379, + "learning_rate": 2.0031126026495872e-08, + "logits/chosen": -1.6692726612091064, + "logits/rejected": -1.9603191614151, + "logps/chosen": -2.6382012367248535, + "logps/rejected": -3.2847511768341064, + "loss": 0.9585, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.382009506225586, + "rewards/margins": 6.4654998779296875, + "rewards/rejected": -32.847511291503906, + "step": 27255 + }, + { + "epoch": 0.9188041390003033, + "grad_norm": 27.91530418395996, + "learning_rate": 1.9948788644330473e-08, + "logits/chosen": -1.5431195497512817, + "logits/rejected": -1.6130651235580444, + "logps/chosen": -2.2890515327453613, + "logps/rejected": -2.5770554542541504, + "loss": 2.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.890514373779297, + "rewards/margins": 2.8800418376922607, + "rewards/rejected": -25.770557403564453, + "step": 27260 + }, + { + "epoch": 0.9189726650712865, + "grad_norm": 28.36544418334961, + "learning_rate": 1.9866617388976047e-08, + "logits/chosen": -1.4692294597625732, + "logits/rejected": -1.737244963645935, + "logps/chosen": -1.9486863613128662, + "logps/rejected": -2.105894088745117, + "loss": 2.7634, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.486865997314453, + "rewards/margins": 1.572076439857483, + "rewards/rejected": -21.058942794799805, + "step": 27265 + }, + { + "epoch": 0.9191411911422697, + "grad_norm": 27.256328582763672, + "learning_rate": 1.9784612288868907e-08, + "logits/chosen": -2.2276055812835693, + "logits/rejected": -2.251549243927002, + "logps/chosen": -1.954451322555542, + "logps/rejected": -2.037261486053467, + "loss": 2.6429, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.544513702392578, + "rewards/margins": 0.8281000852584839, + "rewards/rejected": -20.37261390686035, + "step": 27270 + }, + { + "epoch": 0.9193097172132529, + "grad_norm": 25.21340560913086, + "learning_rate": 1.970277337238768e-08, + "logits/chosen": -2.331439256668091, + "logits/rejected": -2.3402552604675293, + "logps/chosen": -2.5144410133361816, + "logps/rejected": -2.983175754547119, + "loss": 2.0745, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.1444091796875, + "rewards/margins": 4.687346935272217, + "rewards/rejected": -29.831756591796875, + "step": 27275 + }, + { + "epoch": 0.9194782432842361, + "grad_norm": 61.691226959228516, + "learning_rate": 1.962110066785361e-08, + "logits/chosen": -1.3510804176330566, + "logits/rejected": -1.6053918600082397, + "logps/chosen": -2.667389392852783, + "logps/rejected": -2.8345603942871094, + "loss": 2.735, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.673892974853516, + "rewards/margins": 1.6717100143432617, + "rewards/rejected": -28.345605850219727, + "step": 27280 + }, + { + "epoch": 0.9196467693552193, + "grad_norm": 22.086963653564453, + "learning_rate": 1.9539594203530464e-08, + "logits/chosen": -1.633506178855896, + "logits/rejected": -1.676327109336853, + "logps/chosen": -2.520235300064087, + "logps/rejected": -2.5373756885528564, + "loss": 3.0548, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.20235252380371, + "rewards/margins": 0.17140674591064453, + "rewards/rejected": -25.373760223388672, + "step": 27285 + }, + { + "epoch": 0.9198152954262024, + "grad_norm": 72.09431457519531, + "learning_rate": 1.945825400762435e-08, + "logits/chosen": -1.7755218744277954, + "logits/rejected": -1.8230489492416382, + "logps/chosen": -2.6445462703704834, + "logps/rejected": -2.7068397998809814, + "loss": 2.7089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.445465087890625, + "rewards/margins": 0.6229327321052551, + "rewards/rejected": -27.068395614624023, + "step": 27290 + }, + { + "epoch": 0.9199838214971856, + "grad_norm": 27.323516845703125, + "learning_rate": 1.937708010828393e-08, + "logits/chosen": -1.8622583150863647, + "logits/rejected": -2.1557297706604004, + "logps/chosen": -2.5883989334106445, + "logps/rejected": -2.788325071334839, + "loss": 2.2512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.883991241455078, + "rewards/margins": 1.999259352684021, + "rewards/rejected": -27.883249282836914, + "step": 27295 + }, + { + "epoch": 0.9201523475681688, + "grad_norm": 17.495512008666992, + "learning_rate": 1.9296072533600326e-08, + "logits/chosen": -1.857142448425293, + "logits/rejected": -2.222248077392578, + "logps/chosen": -2.382188081741333, + "logps/rejected": -2.569136381149292, + "loss": 2.7625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.82187843322754, + "rewards/margins": 1.86948561668396, + "rewards/rejected": -25.69136619567871, + "step": 27300 + }, + { + "epoch": 0.9203208736391519, + "grad_norm": 50.69965362548828, + "learning_rate": 1.921523131160707e-08, + "logits/chosen": -2.3246703147888184, + "logits/rejected": -2.7445061206817627, + "logps/chosen": -2.288409471511841, + "logps/rejected": -3.077025890350342, + "loss": 3.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.88409423828125, + "rewards/margins": 7.886164665222168, + "rewards/rejected": -30.7702579498291, + "step": 27305 + }, + { + "epoch": 0.9204893997101351, + "grad_norm": 28.7850284576416, + "learning_rate": 1.913455647028006e-08, + "logits/chosen": -2.21712327003479, + "logits/rejected": -2.5007641315460205, + "logps/chosen": -3.346345901489258, + "logps/rejected": -3.8853752613067627, + "loss": 3.0171, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.463462829589844, + "rewards/margins": 5.390293121337891, + "rewards/rejected": -38.85375213623047, + "step": 27310 + }, + { + "epoch": 0.9206579257811184, + "grad_norm": 19.60544776916504, + "learning_rate": 1.9054048037537683e-08, + "logits/chosen": -1.5601160526275635, + "logits/rejected": -2.036533832550049, + "logps/chosen": -2.6824698448181152, + "logps/rejected": -3.735724925994873, + "loss": 1.3956, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.8247013092041, + "rewards/margins": 10.532548904418945, + "rewards/rejected": -37.35724639892578, + "step": 27315 + }, + { + "epoch": 0.9208264518521015, + "grad_norm": 33.12553787231445, + "learning_rate": 1.8973706041240824e-08, + "logits/chosen": -1.1942306756973267, + "logits/rejected": -1.300018072128296, + "logps/chosen": -2.2763888835906982, + "logps/rejected": -2.3771214485168457, + "loss": 2.6452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.76388931274414, + "rewards/margins": 1.0073254108428955, + "rewards/rejected": -23.77121353149414, + "step": 27320 + }, + { + "epoch": 0.9209949779230847, + "grad_norm": 30.314062118530273, + "learning_rate": 1.889353050919257e-08, + "logits/chosen": -1.6723153591156006, + "logits/rejected": -1.867597222328186, + "logps/chosen": -1.978061318397522, + "logps/rejected": -2.1816296577453613, + "loss": 2.0179, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.780614852905273, + "rewards/margins": 2.0356831550598145, + "rewards/rejected": -21.816295623779297, + "step": 27325 + }, + { + "epoch": 0.9211635039940679, + "grad_norm": 33.596317291259766, + "learning_rate": 1.881352146913856e-08, + "logits/chosen": -1.8963005542755127, + "logits/rejected": -1.8901876211166382, + "logps/chosen": -2.6146950721740723, + "logps/rejected": -2.9768056869506836, + "loss": 3.3404, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.14695167541504, + "rewards/margins": 3.621105909347534, + "rewards/rejected": -29.768056869506836, + "step": 27330 + }, + { + "epoch": 0.921332030065051, + "grad_norm": 22.420825958251953, + "learning_rate": 1.8733678948766816e-08, + "logits/chosen": -2.086516857147217, + "logits/rejected": -1.8573553562164307, + "logps/chosen": -2.346881628036499, + "logps/rejected": -2.5964813232421875, + "loss": 1.9698, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.46881675720215, + "rewards/margins": 2.4959959983825684, + "rewards/rejected": -25.964813232421875, + "step": 27335 + }, + { + "epoch": 0.9215005561360342, + "grad_norm": 74.21624755859375, + "learning_rate": 1.8654002975707684e-08, + "logits/chosen": -1.608689546585083, + "logits/rejected": -1.6286035776138306, + "logps/chosen": -2.3946728706359863, + "logps/rejected": -2.4410223960876465, + "loss": 2.7656, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.946725845336914, + "rewards/margins": 0.46349868178367615, + "rewards/rejected": -24.410226821899414, + "step": 27340 + }, + { + "epoch": 0.9216690822070174, + "grad_norm": 50.7723274230957, + "learning_rate": 1.8574493577533768e-08, + "logits/chosen": -2.1765570640563965, + "logits/rejected": -1.717564344406128, + "logps/chosen": -2.544821262359619, + "logps/rejected": -2.759453535079956, + "loss": 2.9138, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.448213577270508, + "rewards/margins": 2.146322727203369, + "rewards/rejected": -27.59453773498535, + "step": 27345 + }, + { + "epoch": 0.9218376082780007, + "grad_norm": 60.63186264038086, + "learning_rate": 1.8495150781760283e-08, + "logits/chosen": -1.9756433963775635, + "logits/rejected": -2.058195114135742, + "logps/chosen": -1.8520549535751343, + "logps/rejected": -1.9105947017669678, + "loss": 2.9764, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.520549774169922, + "rewards/margins": 0.5853978991508484, + "rewards/rejected": -19.105945587158203, + "step": 27350 + }, + { + "epoch": 0.9220061343489838, + "grad_norm": 36.562808990478516, + "learning_rate": 1.8415974615844598e-08, + "logits/chosen": -2.3384909629821777, + "logits/rejected": -2.3781991004943848, + "logps/chosen": -2.5374786853790283, + "logps/rejected": -2.66489839553833, + "loss": 2.4687, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.374786376953125, + "rewards/margins": 1.2741988897323608, + "rewards/rejected": -26.64898681640625, + "step": 27355 + }, + { + "epoch": 0.922174660419967, + "grad_norm": 33.754371643066406, + "learning_rate": 1.8336965107186354e-08, + "logits/chosen": -1.54237961769104, + "logits/rejected": -1.7503583431243896, + "logps/chosen": -2.4724485874176025, + "logps/rejected": -2.766716480255127, + "loss": 2.126, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.724483489990234, + "rewards/margins": 2.9426774978637695, + "rewards/rejected": -27.667163848876953, + "step": 27360 + }, + { + "epoch": 0.9223431864909502, + "grad_norm": 28.843944549560547, + "learning_rate": 1.8258122283127787e-08, + "logits/chosen": -1.7064812183380127, + "logits/rejected": -1.896023154258728, + "logps/chosen": -2.667555332183838, + "logps/rejected": -2.815514087677002, + "loss": 2.5606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.675552368164062, + "rewards/margins": 1.4795873165130615, + "rewards/rejected": -28.155141830444336, + "step": 27365 + }, + { + "epoch": 0.9225117125619333, + "grad_norm": 79.54619598388672, + "learning_rate": 1.8179446170953182e-08, + "logits/chosen": -1.9791038036346436, + "logits/rejected": -1.836622953414917, + "logps/chosen": -2.6727569103240967, + "logps/rejected": -2.673759698867798, + "loss": 3.3764, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.727569580078125, + "rewards/margins": 0.010026884265244007, + "rewards/rejected": -26.737598419189453, + "step": 27370 + }, + { + "epoch": 0.9226802386329165, + "grad_norm": 0.006284057628363371, + "learning_rate": 1.810093679788932e-08, + "logits/chosen": -1.9792457818984985, + "logits/rejected": -2.220228433609009, + "logps/chosen": -2.9561426639556885, + "logps/rejected": -3.098778486251831, + "loss": 6.8838, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.561426162719727, + "rewards/margins": 1.4263594150543213, + "rewards/rejected": -30.9877872467041, + "step": 27375 + }, + { + "epoch": 0.9228487647038996, + "grad_norm": 38.517547607421875, + "learning_rate": 1.8022594191105133e-08, + "logits/chosen": -1.7412769794464111, + "logits/rejected": -2.1089096069335938, + "logps/chosen": -2.4893736839294434, + "logps/rejected": -3.0103344917297363, + "loss": 1.5992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.893733978271484, + "rewards/margins": 5.209610939025879, + "rewards/rejected": -30.103343963623047, + "step": 27380 + }, + { + "epoch": 0.9230172907748829, + "grad_norm": 15.737530708312988, + "learning_rate": 1.794441837771199e-08, + "logits/chosen": -2.132612705230713, + "logits/rejected": -1.8385169506072998, + "logps/chosen": -2.0549371242523193, + "logps/rejected": -2.159449338912964, + "loss": 3.5841, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.54937171936035, + "rewards/margins": 1.045123815536499, + "rewards/rejected": -21.594493865966797, + "step": 27385 + }, + { + "epoch": 0.9231858168458661, + "grad_norm": 42.83122634887695, + "learning_rate": 1.786640938476336e-08, + "logits/chosen": -1.5979821681976318, + "logits/rejected": -1.888920545578003, + "logps/chosen": -2.3727355003356934, + "logps/rejected": -2.5195016860961914, + "loss": 2.7367, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.727354049682617, + "rewards/margins": 1.4676618576049805, + "rewards/rejected": -25.195018768310547, + "step": 27390 + }, + { + "epoch": 0.9233543429168493, + "grad_norm": 31.570093154907227, + "learning_rate": 1.778856723925515e-08, + "logits/chosen": -1.9060827493667603, + "logits/rejected": -1.9400758743286133, + "logps/chosen": -2.1560046672821045, + "logps/rejected": -2.0583724975585938, + "loss": 5.3615, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.560047149658203, + "rewards/margins": -0.9763216972351074, + "rewards/rejected": -20.58372688293457, + "step": 27395 + }, + { + "epoch": 0.9235228689878324, + "grad_norm": 50.94015884399414, + "learning_rate": 1.771089196812542e-08, + "logits/chosen": -1.942318320274353, + "logits/rejected": -2.1093525886535645, + "logps/chosen": -2.1805660724639893, + "logps/rejected": -2.2652573585510254, + "loss": 2.756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.805662155151367, + "rewards/margins": 0.8469133377075195, + "rewards/rejected": -22.65257453918457, + "step": 27400 + }, + { + "epoch": 0.9236913950588156, + "grad_norm": 301.8170166015625, + "learning_rate": 1.763338359825467e-08, + "logits/chosen": -1.502886414527893, + "logits/rejected": -1.5221701860427856, + "logps/chosen": -2.850468873977661, + "logps/rejected": -2.600606679916382, + "loss": 6.5051, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -28.504688262939453, + "rewards/margins": -2.498622417449951, + "rewards/rejected": -26.006067276000977, + "step": 27405 + }, + { + "epoch": 0.9238599211297988, + "grad_norm": 33.5927848815918, + "learning_rate": 1.7556042156465278e-08, + "logits/chosen": -2.5625860691070557, + "logits/rejected": -3.1480767726898193, + "logps/chosen": -1.8674657344818115, + "logps/rejected": -2.3821492195129395, + "loss": 1.9451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.674657821655273, + "rewards/margins": 5.1468329429626465, + "rewards/rejected": -23.821491241455078, + "step": 27410 + }, + { + "epoch": 0.9240284472007819, + "grad_norm": 68.57820892333984, + "learning_rate": 1.747886766952217e-08, + "logits/chosen": -1.8768354654312134, + "logits/rejected": -2.275322437286377, + "logps/chosen": -2.3458003997802734, + "logps/rejected": -3.2828738689422607, + "loss": 2.3436, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.458003997802734, + "rewards/margins": 9.370733261108398, + "rewards/rejected": -32.828739166259766, + "step": 27415 + }, + { + "epoch": 0.9241969732717651, + "grad_norm": 8.633049964904785, + "learning_rate": 1.7401860164132364e-08, + "logits/chosen": -1.939244270324707, + "logits/rejected": -1.7692468166351318, + "logps/chosen": -3.509154796600342, + "logps/rejected": -2.917518138885498, + "loss": 11.9724, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -35.09154510498047, + "rewards/margins": -5.916362762451172, + "rewards/rejected": -29.175182342529297, + "step": 27420 + }, + { + "epoch": 0.9243654993427484, + "grad_norm": 0.3071482479572296, + "learning_rate": 1.7325019666945217e-08, + "logits/chosen": -2.091637134552002, + "logits/rejected": -2.435678005218506, + "logps/chosen": -2.8514938354492188, + "logps/rejected": -3.5164198875427246, + "loss": 2.0882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.514938354492188, + "rewards/margins": 6.649260520935059, + "rewards/rejected": -35.16419982910156, + "step": 27425 + }, + { + "epoch": 0.9245340254137315, + "grad_norm": 31.114391326904297, + "learning_rate": 1.7248346204552065e-08, + "logits/chosen": -2.094665050506592, + "logits/rejected": -2.374276638031006, + "logps/chosen": -2.6455271244049072, + "logps/rejected": -3.161177158355713, + "loss": 2.3138, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.455270767211914, + "rewards/margins": 5.156498908996582, + "rewards/rejected": -31.611770629882812, + "step": 27430 + }, + { + "epoch": 0.9247025514847147, + "grad_norm": 114.97472381591797, + "learning_rate": 1.717183980348663e-08, + "logits/chosen": -2.3799057006835938, + "logits/rejected": -2.057694911956787, + "logps/chosen": -2.764962673187256, + "logps/rejected": -2.583735227584839, + "loss": 6.0765, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.649627685546875, + "rewards/margins": -1.8122737407684326, + "rewards/rejected": -25.837352752685547, + "step": 27435 + }, + { + "epoch": 0.9248710775556979, + "grad_norm": 151.6468505859375, + "learning_rate": 1.709550049022479e-08, + "logits/chosen": -1.7874248027801514, + "logits/rejected": -1.8235124349594116, + "logps/chosen": -3.8881237506866455, + "logps/rejected": -5.001030445098877, + "loss": 2.4004, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -38.8812370300293, + "rewards/margins": 11.12906551361084, + "rewards/rejected": -50.01030731201172, + "step": 27440 + }, + { + "epoch": 0.925039603626681, + "grad_norm": 45.867191314697266, + "learning_rate": 1.7019328291184632e-08, + "logits/chosen": -1.7721633911132812, + "logits/rejected": -1.5765838623046875, + "logps/chosen": -2.182168483734131, + "logps/rejected": -2.279179096221924, + "loss": 2.5337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.821685791015625, + "rewards/margins": 0.9701029062271118, + "rewards/rejected": -22.79178810119629, + "step": 27445 + }, + { + "epoch": 0.9252081296976642, + "grad_norm": 28.50269889831543, + "learning_rate": 1.6943323232726182e-08, + "logits/chosen": -1.6506497859954834, + "logits/rejected": -1.994645357131958, + "logps/chosen": -2.5445361137390137, + "logps/rejected": -2.7690863609313965, + "loss": 2.6771, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.44536018371582, + "rewards/margins": 2.2455050945281982, + "rewards/rejected": -27.69086265563965, + "step": 27450 + }, + { + "epoch": 0.9253766557686474, + "grad_norm": 23.46358871459961, + "learning_rate": 1.68674853411519e-08, + "logits/chosen": -1.7911182641983032, + "logits/rejected": -2.173764705657959, + "logps/chosen": -3.1003613471984863, + "logps/rejected": -3.6647541522979736, + "loss": 1.7456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -31.003612518310547, + "rewards/margins": 5.643929481506348, + "rewards/rejected": -36.647544860839844, + "step": 27455 + }, + { + "epoch": 0.9255451818396306, + "grad_norm": 158.92230224609375, + "learning_rate": 1.6791814642706292e-08, + "logits/chosen": -2.188016653060913, + "logits/rejected": -2.542320489883423, + "logps/chosen": -2.734858989715576, + "logps/rejected": -2.99385929107666, + "loss": 2.6771, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.348590850830078, + "rewards/margins": 2.5900049209594727, + "rewards/rejected": -29.9385929107666, + "step": 27460 + }, + { + "epoch": 0.9257137079106138, + "grad_norm": 35.6912841796875, + "learning_rate": 1.6716311163575967e-08, + "logits/chosen": -1.6770681142807007, + "logits/rejected": -1.688140630722046, + "logps/chosen": -2.8789820671081543, + "logps/rejected": -2.7300705909729004, + "loss": 5.1186, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.789819717407227, + "rewards/margins": -1.48911452293396, + "rewards/rejected": -27.300708770751953, + "step": 27465 + }, + { + "epoch": 0.925882233981597, + "grad_norm": 137.83705139160156, + "learning_rate": 1.664097492988975e-08, + "logits/chosen": -2.104797840118408, + "logits/rejected": -2.3004043102264404, + "logps/chosen": -3.1804895401000977, + "logps/rejected": -3.556551456451416, + "loss": 2.7026, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.804895401000977, + "rewards/margins": 3.7606215476989746, + "rewards/rejected": -35.565513610839844, + "step": 27470 + }, + { + "epoch": 0.9260507600525801, + "grad_norm": 92.81694793701172, + "learning_rate": 1.6565805967718504e-08, + "logits/chosen": -2.285025119781494, + "logits/rejected": -2.4701247215270996, + "logps/chosen": -2.6418814659118652, + "logps/rejected": -3.3334007263183594, + "loss": 2.3526, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.4188175201416, + "rewards/margins": 6.915187835693359, + "rewards/rejected": -33.33400344848633, + "step": 27475 + }, + { + "epoch": 0.9262192861235633, + "grad_norm": 45.628963470458984, + "learning_rate": 1.649080430307537e-08, + "logits/chosen": -1.9886986017227173, + "logits/rejected": -2.3399429321289062, + "logps/chosen": -2.4697680473327637, + "logps/rejected": -2.8362088203430176, + "loss": 2.7542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.69767951965332, + "rewards/margins": 3.6644082069396973, + "rewards/rejected": -28.36208724975586, + "step": 27480 + }, + { + "epoch": 0.9263878121945465, + "grad_norm": 53.76228713989258, + "learning_rate": 1.6415969961915245e-08, + "logits/chosen": -2.2061126232147217, + "logits/rejected": -2.1689679622650146, + "logps/chosen": -3.3630974292755127, + "logps/rejected": -3.4911270141601562, + "loss": 3.0385, + "rewards/accuracies": 0.5, + "rewards/chosen": -33.6309700012207, + "rewards/margins": 1.2802989482879639, + "rewards/rejected": -34.91127014160156, + "step": 27485 + }, + { + "epoch": 0.9265563382655296, + "grad_norm": 84.9283447265625, + "learning_rate": 1.6341302970135472e-08, + "logits/chosen": -1.81033456325531, + "logits/rejected": -1.845776915550232, + "logps/chosen": -3.6270880699157715, + "logps/rejected": -4.127110481262207, + "loss": 2.1203, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -36.27088165283203, + "rewards/margins": 5.00022029876709, + "rewards/rejected": -41.2711067199707, + "step": 27490 + }, + { + "epoch": 0.9267248643365129, + "grad_norm": 34.078857421875, + "learning_rate": 1.6266803353575444e-08, + "logits/chosen": -1.784132957458496, + "logits/rejected": -1.6151421070098877, + "logps/chosen": -2.0967466831207275, + "logps/rejected": -2.0627224445343018, + "loss": 3.4931, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.967464447021484, + "rewards/margins": -0.3402392268180847, + "rewards/rejected": -20.627225875854492, + "step": 27495 + }, + { + "epoch": 0.9268933904074961, + "grad_norm": 36.228755950927734, + "learning_rate": 1.619247113801636e-08, + "logits/chosen": -1.7772010564804077, + "logits/rejected": -1.9724609851837158, + "logps/chosen": -2.2621092796325684, + "logps/rejected": -2.5008437633514404, + "loss": 2.521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.62109375, + "rewards/margins": 2.387342929840088, + "rewards/rejected": -25.008438110351562, + "step": 27500 + }, + { + "epoch": 0.9270619164784792, + "grad_norm": 130.93255615234375, + "learning_rate": 1.6118306349181766e-08, + "logits/chosen": -1.3324534893035889, + "logits/rejected": -1.5228675603866577, + "logps/chosen": -2.476607322692871, + "logps/rejected": -2.4428625106811523, + "loss": 3.8933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.766071319580078, + "rewards/margins": -0.3374488949775696, + "rewards/rejected": -24.428625106811523, + "step": 27505 + }, + { + "epoch": 0.9272304425494624, + "grad_norm": 26.975387573242188, + "learning_rate": 1.604430901273718e-08, + "logits/chosen": -1.7993533611297607, + "logits/rejected": -2.2579345703125, + "logps/chosen": -2.5446598529815674, + "logps/rejected": -3.0560150146484375, + "loss": 2.2713, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.446598052978516, + "rewards/margins": 5.113553047180176, + "rewards/rejected": -30.560150146484375, + "step": 27510 + }, + { + "epoch": 0.9273989686204456, + "grad_norm": 31.938961029052734, + "learning_rate": 1.5970479154290228e-08, + "logits/chosen": -1.9177414178848267, + "logits/rejected": -2.183879852294922, + "logps/chosen": -2.496314764022827, + "logps/rejected": -2.7273929119110107, + "loss": 1.8621, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.963146209716797, + "rewards/margins": 2.310781955718994, + "rewards/rejected": -27.273929595947266, + "step": 27515 + }, + { + "epoch": 0.9275674946914287, + "grad_norm": 17.566181182861328, + "learning_rate": 1.5896816799390313e-08, + "logits/chosen": -2.3122284412384033, + "logits/rejected": -2.619051456451416, + "logps/chosen": -2.220881700515747, + "logps/rejected": -2.4604485034942627, + "loss": 2.3981, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.208816528320312, + "rewards/margins": 2.3956680297851562, + "rewards/rejected": -24.604482650756836, + "step": 27520 + }, + { + "epoch": 0.9277360207624119, + "grad_norm": 25.077037811279297, + "learning_rate": 1.5823321973529256e-08, + "logits/chosen": -1.534099817276001, + "logits/rejected": -1.9863879680633545, + "logps/chosen": -2.358691692352295, + "logps/rejected": -2.79862117767334, + "loss": 2.2013, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.586915969848633, + "rewards/margins": 4.399293422698975, + "rewards/rejected": -27.986209869384766, + "step": 27525 + }, + { + "epoch": 0.9279045468333951, + "grad_norm": 126.60164642333984, + "learning_rate": 1.5749994702140666e-08, + "logits/chosen": -2.381648540496826, + "logits/rejected": -2.3383431434631348, + "logps/chosen": -2.6393046379089355, + "logps/rejected": -2.7791647911071777, + "loss": 3.071, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.39304542541504, + "rewards/margins": 1.3986046314239502, + "rewards/rejected": -27.791650772094727, + "step": 27530 + }, + { + "epoch": 0.9280730729043783, + "grad_norm": 25.092164993286133, + "learning_rate": 1.5676835010600242e-08, + "logits/chosen": -2.014479160308838, + "logits/rejected": -2.1772711277008057, + "logps/chosen": -2.1367969512939453, + "logps/rejected": -2.154160261154175, + "loss": 3.3433, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.367969512939453, + "rewards/margins": 0.1736338585615158, + "rewards/rejected": -21.54160499572754, + "step": 27535 + }, + { + "epoch": 0.9282415989753615, + "grad_norm": 44.35649108886719, + "learning_rate": 1.560384292422562e-08, + "logits/chosen": -1.437814712524414, + "logits/rejected": -1.5548362731933594, + "logps/chosen": -2.0173325538635254, + "logps/rejected": -2.0877983570098877, + "loss": 2.8418, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.173322677612305, + "rewards/margins": 0.7046583294868469, + "rewards/rejected": -20.877981185913086, + "step": 27540 + }, + { + "epoch": 0.9284101250463447, + "grad_norm": 53.693397521972656, + "learning_rate": 1.553101846827648e-08, + "logits/chosen": -1.4574278593063354, + "logits/rejected": -2.2143871784210205, + "logps/chosen": -2.498276472091675, + "logps/rejected": -3.071702480316162, + "loss": 3.0937, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.982765197753906, + "rewards/margins": 5.734262943267822, + "rewards/rejected": -30.717029571533203, + "step": 27545 + }, + { + "epoch": 0.9285786511173278, + "grad_norm": 20.134761810302734, + "learning_rate": 1.5458361667954612e-08, + "logits/chosen": -1.9929225444793701, + "logits/rejected": -2.411076307296753, + "logps/chosen": -2.3681139945983887, + "logps/rejected": -2.5899434089660645, + "loss": 2.8595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.681140899658203, + "rewards/margins": 2.218296527862549, + "rewards/rejected": -25.899438858032227, + "step": 27550 + }, + { + "epoch": 0.928747177188311, + "grad_norm": 19.580636978149414, + "learning_rate": 1.5385872548403513e-08, + "logits/chosen": -1.4577335119247437, + "logits/rejected": -1.5620262622833252, + "logps/chosen": -2.7865166664123535, + "logps/rejected": -2.948183536529541, + "loss": 2.5797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.86516761779785, + "rewards/margins": 1.616668701171875, + "rewards/rejected": -29.48183250427246, + "step": 27555 + }, + { + "epoch": 0.9289157032592942, + "grad_norm": 61.952980041503906, + "learning_rate": 1.531355113470889e-08, + "logits/chosen": -2.2046194076538086, + "logits/rejected": -2.318603038787842, + "logps/chosen": -2.6069815158843994, + "logps/rejected": -2.8184146881103516, + "loss": 3.1143, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.069812774658203, + "rewards/margins": 2.114332675933838, + "rewards/rejected": -28.184146881103516, + "step": 27560 + }, + { + "epoch": 0.9290842293302773, + "grad_norm": 11.280858039855957, + "learning_rate": 1.524139745189845e-08, + "logits/chosen": -1.6384538412094116, + "logits/rejected": -1.7444041967391968, + "logps/chosen": -2.4805355072021484, + "logps/rejected": -2.4746363162994385, + "loss": 3.8022, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.805355072021484, + "rewards/margins": -0.058992959558963776, + "rewards/rejected": -24.746362686157227, + "step": 27565 + }, + { + "epoch": 0.9292527554012606, + "grad_norm": 61.026634216308594, + "learning_rate": 1.5169411524941556e-08, + "logits/chosen": -2.0152387619018555, + "logits/rejected": -2.6300861835479736, + "logps/chosen": -1.8867985010147095, + "logps/rejected": -2.0875132083892822, + "loss": 3.0549, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.867984771728516, + "rewards/margins": 2.0071449279785156, + "rewards/rejected": -20.8751277923584, + "step": 27570 + }, + { + "epoch": 0.9294212814722438, + "grad_norm": 45.335079193115234, + "learning_rate": 1.5097593378749717e-08, + "logits/chosen": -1.8455007076263428, + "logits/rejected": -1.8004382848739624, + "logps/chosen": -2.500025987625122, + "logps/rejected": -2.3104910850524902, + "loss": 5.0566, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -25.000259399414062, + "rewards/margins": -1.895347237586975, + "rewards/rejected": -23.10491180419922, + "step": 27575 + }, + { + "epoch": 0.929589807543227, + "grad_norm": 10.588850975036621, + "learning_rate": 1.5025943038176447e-08, + "logits/chosen": -1.7851600646972656, + "logits/rejected": -2.0314362049102783, + "logps/chosen": -1.9325587749481201, + "logps/rejected": -2.1342849731445312, + "loss": 2.7042, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.32558822631836, + "rewards/margins": 2.017261028289795, + "rewards/rejected": -21.342849731445312, + "step": 27580 + }, + { + "epoch": 0.9297583336142101, + "grad_norm": 27.62746810913086, + "learning_rate": 1.4954460528017132e-08, + "logits/chosen": -1.3435360193252563, + "logits/rejected": -1.7085888385772705, + "logps/chosen": -2.0199694633483887, + "logps/rejected": -2.219791889190674, + "loss": 2.4459, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.19969367980957, + "rewards/margins": 1.9982258081436157, + "rewards/rejected": -22.197919845581055, + "step": 27585 + }, + { + "epoch": 0.9299268596851933, + "grad_norm": 147.57992553710938, + "learning_rate": 1.4883145873008984e-08, + "logits/chosen": -1.373422384262085, + "logits/rejected": -2.1292011737823486, + "logps/chosen": -3.485476016998291, + "logps/rejected": -4.046762466430664, + "loss": 3.336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.85475540161133, + "rewards/margins": 5.6128668785095215, + "rewards/rejected": -40.467628479003906, + "step": 27590 + }, + { + "epoch": 0.9300953857561765, + "grad_norm": 27.67316436767578, + "learning_rate": 1.4811999097831151e-08, + "logits/chosen": -1.812901258468628, + "logits/rejected": -1.9904277324676514, + "logps/chosen": -2.89308500289917, + "logps/rejected": -3.0367727279663086, + "loss": 4.4566, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.930850982666016, + "rewards/margins": 1.436877965927124, + "rewards/rejected": -30.367727279663086, + "step": 27595 + }, + { + "epoch": 0.9302639118271596, + "grad_norm": 54.67453384399414, + "learning_rate": 1.4741020227104883e-08, + "logits/chosen": -1.3734403848648071, + "logits/rejected": -1.4352593421936035, + "logps/chosen": -2.109225034713745, + "logps/rejected": -2.2077791690826416, + "loss": 2.5068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.09225082397461, + "rewards/margins": 0.9855405688285828, + "rewards/rejected": -22.07779312133789, + "step": 27600 + }, + { + "epoch": 0.9302639118271596, + "eval_logits/chosen": -2.3133976459503174, + "eval_logits/rejected": -2.492166519165039, + "eval_logps/chosen": -2.2893614768981934, + "eval_logps/rejected": -2.4440784454345703, + "eval_loss": 3.0896363258361816, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.893613815307617, + "eval_rewards/margins": 1.547170877456665, + "eval_rewards/rejected": -24.440786361694336, + "eval_runtime": 12.8916, + "eval_samples_per_second": 7.757, + "eval_steps_per_second": 1.939, + "step": 27600 + }, + { + "epoch": 0.9304324378981429, + "grad_norm": 12.913957595825195, + "learning_rate": 1.4670209285392975e-08, + "logits/chosen": -1.9860661029815674, + "logits/rejected": -2.458383321762085, + "logps/chosen": -2.9501705169677734, + "logps/rejected": -3.6025662422180176, + "loss": 1.6886, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -29.501705169677734, + "rewards/margins": 6.523956298828125, + "rewards/rejected": -36.025657653808594, + "step": 27605 + }, + { + "epoch": 0.9306009639691261, + "grad_norm": 20.887788772583008, + "learning_rate": 1.4599566297200438e-08, + "logits/chosen": -1.6999485492706299, + "logits/rejected": -2.1493802070617676, + "logps/chosen": -2.7761943340301514, + "logps/rejected": -3.2419955730438232, + "loss": 2.472, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.761943817138672, + "rewards/margins": 4.6580095291137695, + "rewards/rejected": -32.41995620727539, + "step": 27610 + }, + { + "epoch": 0.9307694900401092, + "grad_norm": 95.52549743652344, + "learning_rate": 1.4529091286973993e-08, + "logits/chosen": -1.990030288696289, + "logits/rejected": -1.9676322937011719, + "logps/chosen": -2.367176055908203, + "logps/rejected": -2.256159543991089, + "loss": 4.5058, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.671756744384766, + "rewards/margins": -1.110162377357483, + "rewards/rejected": -22.561595916748047, + "step": 27615 + }, + { + "epoch": 0.9309380161110924, + "grad_norm": 95.74559020996094, + "learning_rate": 1.4458784279102299e-08, + "logits/chosen": -1.9741586446762085, + "logits/rejected": -2.3909668922424316, + "logps/chosen": -1.9129314422607422, + "logps/rejected": -2.1065430641174316, + "loss": 2.8111, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.129314422607422, + "rewards/margins": 1.9361183643341064, + "rewards/rejected": -21.065433502197266, + "step": 27620 + }, + { + "epoch": 0.9311065421820756, + "grad_norm": 4.898240566253662, + "learning_rate": 1.4388645297915725e-08, + "logits/chosen": -1.7137861251831055, + "logits/rejected": -2.158115863800049, + "logps/chosen": -2.891925811767578, + "logps/rejected": -3.6723380088806152, + "loss": 1.5774, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.919260025024414, + "rewards/margins": 7.804121971130371, + "rewards/rejected": -36.72338104248047, + "step": 27625 + }, + { + "epoch": 0.9312750682530587, + "grad_norm": 32.084014892578125, + "learning_rate": 1.4318674367686745e-08, + "logits/chosen": -1.2843127250671387, + "logits/rejected": -1.3736934661865234, + "logps/chosen": -1.9384397268295288, + "logps/rejected": -2.291755437850952, + "loss": 2.4342, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.3843994140625, + "rewards/margins": 3.5331573486328125, + "rewards/rejected": -22.917556762695312, + "step": 27630 + }, + { + "epoch": 0.9314435943240419, + "grad_norm": 12.00391960144043, + "learning_rate": 1.424887151262949e-08, + "logits/chosen": -2.211188554763794, + "logits/rejected": -2.742053270339966, + "logps/chosen": -2.813009738922119, + "logps/rejected": -3.6539814472198486, + "loss": 3.0753, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.130096435546875, + "rewards/margins": 8.409716606140137, + "rewards/rejected": -36.53981399536133, + "step": 27635 + }, + { + "epoch": 0.9316121203950251, + "grad_norm": 28.257776260375977, + "learning_rate": 1.4179236756899971e-08, + "logits/chosen": -1.988883376121521, + "logits/rejected": -2.2060391902923584, + "logps/chosen": -2.281707286834717, + "logps/rejected": -2.3848066329956055, + "loss": 3.1235, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.81707191467285, + "rewards/margins": 1.0309938192367554, + "rewards/rejected": -23.848064422607422, + "step": 27640 + }, + { + "epoch": 0.9317806464660083, + "grad_norm": 39.309444427490234, + "learning_rate": 1.4109770124596022e-08, + "logits/chosen": -1.8199679851531982, + "logits/rejected": -1.969831109046936, + "logps/chosen": -2.4952075481414795, + "logps/rejected": -2.6376724243164062, + "loss": 2.4044, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.952075958251953, + "rewards/margins": 1.4246470928192139, + "rewards/rejected": -26.376724243164062, + "step": 27645 + }, + { + "epoch": 0.9319491725369915, + "grad_norm": 40.65642547607422, + "learning_rate": 1.4040471639757301e-08, + "logits/chosen": -1.3425325155258179, + "logits/rejected": -1.6632499694824219, + "logps/chosen": -2.3197176456451416, + "logps/rejected": -2.7622852325439453, + "loss": 2.0812, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.19717788696289, + "rewards/margins": 4.425675392150879, + "rewards/rejected": -27.622852325439453, + "step": 27650 + }, + { + "epoch": 0.9321176986079747, + "grad_norm": 47.1500244140625, + "learning_rate": 1.3971341326365349e-08, + "logits/chosen": -2.0653042793273926, + "logits/rejected": -1.8965908288955688, + "logps/chosen": -2.4217300415039062, + "logps/rejected": -2.5961432456970215, + "loss": 2.6185, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.217300415039062, + "rewards/margins": 1.7441316843032837, + "rewards/rejected": -25.9614315032959, + "step": 27655 + }, + { + "epoch": 0.9322862246789578, + "grad_norm": 149.48915100097656, + "learning_rate": 1.3902379208343362e-08, + "logits/chosen": -1.7947008609771729, + "logits/rejected": -1.8451206684112549, + "logps/chosen": -2.4357595443725586, + "logps/rejected": -2.420999050140381, + "loss": 3.5261, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -24.357593536376953, + "rewards/margins": -0.14760398864746094, + "rewards/rejected": -24.209991455078125, + "step": 27660 + }, + { + "epoch": 0.932454750749941, + "grad_norm": 27.083669662475586, + "learning_rate": 1.3833585309556472e-08, + "logits/chosen": -1.52590012550354, + "logits/rejected": -1.748573660850525, + "logps/chosen": -2.4408881664276123, + "logps/rejected": -2.4321718215942383, + "loss": 3.2916, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.408884048461914, + "rewards/margins": -0.08716096729040146, + "rewards/rejected": -24.32172203063965, + "step": 27665 + }, + { + "epoch": 0.9326232768209242, + "grad_norm": 64.89007568359375, + "learning_rate": 1.3764959653811525e-08, + "logits/chosen": -1.835463523864746, + "logits/rejected": -2.141990900039673, + "logps/chosen": -2.914647340774536, + "logps/rejected": -3.2299137115478516, + "loss": 2.2172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -29.146469116210938, + "rewards/margins": 3.15266489982605, + "rewards/rejected": -32.29913330078125, + "step": 27670 + }, + { + "epoch": 0.9327918028919073, + "grad_norm": 69.75782775878906, + "learning_rate": 1.3696502264857134e-08, + "logits/chosen": -1.6715993881225586, + "logits/rejected": -2.1183884143829346, + "logps/chosen": -2.2679061889648438, + "logps/rejected": -2.700915575027466, + "loss": 3.0478, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.679061889648438, + "rewards/margins": 4.3300933837890625, + "rewards/rejected": -27.0091552734375, + "step": 27675 + }, + { + "epoch": 0.9329603289628906, + "grad_norm": 17.441205978393555, + "learning_rate": 1.3628213166383684e-08, + "logits/chosen": -1.648181676864624, + "logits/rejected": -1.6260621547698975, + "logps/chosen": -1.8544126749038696, + "logps/rejected": -1.884833574295044, + "loss": 3.2328, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.544126510620117, + "rewards/margins": 0.3042069375514984, + "rewards/rejected": -18.84833335876465, + "step": 27680 + }, + { + "epoch": 0.9331288550338738, + "grad_norm": 54.063350677490234, + "learning_rate": 1.356009238202338e-08, + "logits/chosen": -1.9545844793319702, + "logits/rejected": -2.0981831550598145, + "logps/chosen": -3.1395256519317627, + "logps/rejected": -3.4566750526428223, + "loss": 3.5834, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.3952579498291, + "rewards/margins": 3.1714961528778076, + "rewards/rejected": -34.56675338745117, + "step": 27685 + }, + { + "epoch": 0.9332973811048569, + "grad_norm": 16.980487823486328, + "learning_rate": 1.3492139935350143e-08, + "logits/chosen": -1.3540923595428467, + "logits/rejected": -1.7177197933197021, + "logps/chosen": -2.7987303733825684, + "logps/rejected": -2.9920051097869873, + "loss": 5.1382, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.9873046875, + "rewards/margins": 1.9327491521835327, + "rewards/rejected": -29.920055389404297, + "step": 27690 + }, + { + "epoch": 0.9334659071758401, + "grad_norm": 2.9056694507598877, + "learning_rate": 1.3424355849879665e-08, + "logits/chosen": -2.472014904022217, + "logits/rejected": -3.0095813274383545, + "logps/chosen": -2.2502903938293457, + "logps/rejected": -2.774362802505493, + "loss": 1.2752, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.502906799316406, + "rewards/margins": 5.240719795227051, + "rewards/rejected": -27.743627548217773, + "step": 27695 + }, + { + "epoch": 0.9336344332468233, + "grad_norm": 30.565427780151367, + "learning_rate": 1.3356740149069234e-08, + "logits/chosen": -1.6824843883514404, + "logits/rejected": -1.5276223421096802, + "logps/chosen": -3.4816088676452637, + "logps/rejected": -3.265540361404419, + "loss": 7.1031, + "rewards/accuracies": 0.5, + "rewards/chosen": -34.81608963012695, + "rewards/margins": -2.160688877105713, + "rewards/rejected": -32.65540313720703, + "step": 27700 + }, + { + "epoch": 0.9338029593178064, + "grad_norm": 148.01853942871094, + "learning_rate": 1.328929285631819e-08, + "logits/chosen": -1.693103551864624, + "logits/rejected": -2.0444936752319336, + "logps/chosen": -2.936856508255005, + "logps/rejected": -3.2731621265411377, + "loss": 3.5818, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.368566513061523, + "rewards/margins": 3.3630542755126953, + "rewards/rejected": -32.73162078857422, + "step": 27705 + }, + { + "epoch": 0.9339714853887896, + "grad_norm": 20.24788475036621, + "learning_rate": 1.322201399496714e-08, + "logits/chosen": -2.2496120929718018, + "logits/rejected": -2.1728973388671875, + "logps/chosen": -3.4590187072753906, + "logps/rejected": -3.597189426422119, + "loss": 4.7234, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -34.590187072753906, + "rewards/margins": 1.3817100524902344, + "rewards/rejected": -35.971893310546875, + "step": 27710 + }, + { + "epoch": 0.9341400114597729, + "grad_norm": 61.02878952026367, + "learning_rate": 1.3154903588298794e-08, + "logits/chosen": -2.0132148265838623, + "logits/rejected": -2.283881664276123, + "logps/chosen": -2.496445417404175, + "logps/rejected": -2.786395311355591, + "loss": 2.5994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.964452743530273, + "rewards/margins": 2.8994970321655273, + "rewards/rejected": -27.86395263671875, + "step": 27715 + }, + { + "epoch": 0.934308537530756, + "grad_norm": 53.6756477355957, + "learning_rate": 1.3087961659537349e-08, + "logits/chosen": -1.861713171005249, + "logits/rejected": -1.7294833660125732, + "logps/chosen": -2.40018892288208, + "logps/rejected": -2.500532865524292, + "loss": 2.574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.001888275146484, + "rewards/margins": 1.0034393072128296, + "rewards/rejected": -25.005329132080078, + "step": 27720 + }, + { + "epoch": 0.9344770636017392, + "grad_norm": 159.35438537597656, + "learning_rate": 1.302118823184889e-08, + "logits/chosen": -1.8387367725372314, + "logits/rejected": -1.9457080364227295, + "logps/chosen": -2.5194616317749023, + "logps/rejected": -2.489358425140381, + "loss": 4.4858, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.19462013244629, + "rewards/margins": -0.3010326325893402, + "rewards/rejected": -24.893585205078125, + "step": 27725 + }, + { + "epoch": 0.9346455896727224, + "grad_norm": 0.06013474240899086, + "learning_rate": 1.2954583328340929e-08, + "logits/chosen": -1.674951195716858, + "logits/rejected": -2.087681293487549, + "logps/chosen": -2.5127158164978027, + "logps/rejected": -2.8834900856018066, + "loss": 2.2586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.127161026000977, + "rewards/margins": 3.7077407836914062, + "rewards/rejected": -28.83489990234375, + "step": 27730 + }, + { + "epoch": 0.9348141157437055, + "grad_norm": 64.38245391845703, + "learning_rate": 1.2888146972062863e-08, + "logits/chosen": -2.249094009399414, + "logits/rejected": -2.2135531902313232, + "logps/chosen": -1.886885643005371, + "logps/rejected": -2.272331476211548, + "loss": 2.1438, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -18.868854522705078, + "rewards/margins": 3.8544578552246094, + "rewards/rejected": -22.723316192626953, + "step": 27735 + }, + { + "epoch": 0.9349826418146887, + "grad_norm": 27.971521377563477, + "learning_rate": 1.2821879186005747e-08, + "logits/chosen": -1.8382654190063477, + "logits/rejected": -1.9038264751434326, + "logps/chosen": -1.9858547449111938, + "logps/rejected": -2.219144821166992, + "loss": 2.2832, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.858545303344727, + "rewards/margins": 2.3329017162323, + "rewards/rejected": -22.191448211669922, + "step": 27740 + }, + { + "epoch": 0.9351511678856719, + "grad_norm": 42.19049835205078, + "learning_rate": 1.2755779993102122e-08, + "logits/chosen": -2.663670301437378, + "logits/rejected": -2.474766492843628, + "logps/chosen": -2.775183916091919, + "logps/rejected": -2.7223057746887207, + "loss": 7.3642, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.751840591430664, + "rewards/margins": -0.528782069683075, + "rewards/rejected": -27.223058700561523, + "step": 27745 + }, + { + "epoch": 0.935319693956655, + "grad_norm": 12.594414710998535, + "learning_rate": 1.2689849416226362e-08, + "logits/chosen": -1.594200849533081, + "logits/rejected": -1.7864547967910767, + "logps/chosen": -2.1359341144561768, + "logps/rejected": -2.3643715381622314, + "loss": 1.7207, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.35934066772461, + "rewards/margins": 2.2843754291534424, + "rewards/rejected": -23.643714904785156, + "step": 27750 + }, + { + "epoch": 0.9354882200276383, + "grad_norm": 118.9895248413086, + "learning_rate": 1.2624087478194545e-08, + "logits/chosen": -2.010768413543701, + "logits/rejected": -1.775254249572754, + "logps/chosen": -2.5973310470581055, + "logps/rejected": -2.587301254272461, + "loss": 4.139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.973312377929688, + "rewards/margins": -0.10029850155115128, + "rewards/rejected": -25.873010635375977, + "step": 27755 + }, + { + "epoch": 0.9356567460986215, + "grad_norm": 62.92037582397461, + "learning_rate": 1.2558494201764136e-08, + "logits/chosen": -1.943966269493103, + "logits/rejected": -2.2178397178649902, + "logps/chosen": -2.327336072921753, + "logps/rejected": -2.6325855255126953, + "loss": 3.4514, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.273361206054688, + "rewards/margins": 3.0524938106536865, + "rewards/rejected": -26.325855255126953, + "step": 27760 + }, + { + "epoch": 0.9358252721696046, + "grad_norm": 44.706966400146484, + "learning_rate": 1.2493069609634477e-08, + "logits/chosen": -2.0918984413146973, + "logits/rejected": -1.994227409362793, + "logps/chosen": -2.0638251304626465, + "logps/rejected": -2.0130505561828613, + "loss": 3.8463, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.638248443603516, + "rewards/margins": -0.5077415704727173, + "rewards/rejected": -20.130508422851562, + "step": 27765 + }, + { + "epoch": 0.9359937982405878, + "grad_norm": 28.990034103393555, + "learning_rate": 1.24278137244464e-08, + "logits/chosen": -1.9322535991668701, + "logits/rejected": -2.4840035438537598, + "logps/chosen": -2.13610577583313, + "logps/rejected": -2.6279749870300293, + "loss": 2.0504, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.36105728149414, + "rewards/margins": 4.918692588806152, + "rewards/rejected": -26.279748916625977, + "step": 27770 + }, + { + "epoch": 0.936162324311571, + "grad_norm": 25.019968032836914, + "learning_rate": 1.2362726568782512e-08, + "logits/chosen": -1.871313452720642, + "logits/rejected": -1.842206358909607, + "logps/chosen": -2.322143077850342, + "logps/rejected": -3.0833826065063477, + "loss": 1.9137, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.221431732177734, + "rewards/margins": 7.6123948097229, + "rewards/rejected": -30.833826065063477, + "step": 27775 + }, + { + "epoch": 0.9363308503825541, + "grad_norm": 40.041114807128906, + "learning_rate": 1.2297808165166735e-08, + "logits/chosen": -2.6236162185668945, + "logits/rejected": -3.0642149448394775, + "logps/chosen": -2.3841915130615234, + "logps/rejected": -2.809760332107544, + "loss": 2.8452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.841915130615234, + "rewards/margins": 4.2556891441345215, + "rewards/rejected": -28.097604751586914, + "step": 27780 + }, + { + "epoch": 0.9364993764535373, + "grad_norm": 78.90538787841797, + "learning_rate": 1.2233058536064821e-08, + "logits/chosen": -1.3989120721817017, + "logits/rejected": -1.1721596717834473, + "logps/chosen": -2.9932949542999268, + "logps/rejected": -3.3058419227600098, + "loss": 3.0073, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.93294906616211, + "rewards/margins": 3.125467300415039, + "rewards/rejected": -33.05841827392578, + "step": 27785 + }, + { + "epoch": 0.9366679025245206, + "grad_norm": 12.254767417907715, + "learning_rate": 1.2168477703884184e-08, + "logits/chosen": -2.0970308780670166, + "logits/rejected": -2.6638078689575195, + "logps/chosen": -1.7973181009292603, + "logps/rejected": -2.045571804046631, + "loss": 1.8345, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -17.973180770874023, + "rewards/margins": 2.4825377464294434, + "rewards/rejected": -20.455718994140625, + "step": 27790 + }, + { + "epoch": 0.9368364285955038, + "grad_norm": 38.09868621826172, + "learning_rate": 1.2104065690973554e-08, + "logits/chosen": -2.2299957275390625, + "logits/rejected": -2.420968532562256, + "logps/chosen": -2.7009854316711426, + "logps/rejected": -2.7875897884368896, + "loss": 3.2235, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.00985336303711, + "rewards/margins": 0.866042971611023, + "rewards/rejected": -27.875896453857422, + "step": 27795 + }, + { + "epoch": 0.9370049546664869, + "grad_norm": 22.674205780029297, + "learning_rate": 1.2039822519623489e-08, + "logits/chosen": -1.987546682357788, + "logits/rejected": -2.2283618450164795, + "logps/chosen": -2.184810161590576, + "logps/rejected": -2.653113603591919, + "loss": 1.1945, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.848100662231445, + "rewards/margins": 4.683035373687744, + "rewards/rejected": -26.5311336517334, + "step": 27800 + }, + { + "epoch": 0.9371734807374701, + "grad_norm": 4.418429671204649e-05, + "learning_rate": 1.1975748212065928e-08, + "logits/chosen": -2.031785011291504, + "logits/rejected": -2.4191551208496094, + "logps/chosen": -2.511854410171509, + "logps/rejected": -3.39988374710083, + "loss": 1.1578, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.118541717529297, + "rewards/margins": 8.88029670715332, + "rewards/rejected": -33.998836517333984, + "step": 27805 + }, + { + "epoch": 0.9373420068084533, + "grad_norm": 191.81787109375, + "learning_rate": 1.1911842790474635e-08, + "logits/chosen": -2.0759177207946777, + "logits/rejected": -2.140557289123535, + "logps/chosen": -3.1038882732391357, + "logps/rejected": -3.3400981426239014, + "loss": 3.7893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.038883209228516, + "rewards/margins": 2.362098455429077, + "rewards/rejected": -33.400978088378906, + "step": 27810 + }, + { + "epoch": 0.9375105328794364, + "grad_norm": 28.452058792114258, + "learning_rate": 1.184810627696453e-08, + "logits/chosen": -2.5447916984558105, + "logits/rejected": -2.1735787391662598, + "logps/chosen": -2.8184876441955566, + "logps/rejected": -2.4024946689605713, + "loss": 7.4393, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.184871673583984, + "rewards/margins": -4.1599273681640625, + "rewards/rejected": -24.024944305419922, + "step": 27815 + }, + { + "epoch": 0.9376790589504196, + "grad_norm": 26.47957992553711, + "learning_rate": 1.1784538693592472e-08, + "logits/chosen": -1.9781200885772705, + "logits/rejected": -2.3153228759765625, + "logps/chosen": -2.557616710662842, + "logps/rejected": -3.2911009788513184, + "loss": 2.8156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.5761661529541, + "rewards/margins": 7.334845542907715, + "rewards/rejected": -32.911014556884766, + "step": 27820 + }, + { + "epoch": 0.9378475850214029, + "grad_norm": 15.519754409790039, + "learning_rate": 1.1721140062356638e-08, + "logits/chosen": -1.6726192235946655, + "logits/rejected": -1.926715612411499, + "logps/chosen": -2.3190712928771973, + "logps/rejected": -2.477382183074951, + "loss": 2.8953, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.190710067749023, + "rewards/margins": 1.5831115245819092, + "rewards/rejected": -24.773822784423828, + "step": 27825 + }, + { + "epoch": 0.938016111092386, + "grad_norm": 19.423538208007812, + "learning_rate": 1.1657910405196814e-08, + "logits/chosen": -1.5616414546966553, + "logits/rejected": -1.7062292098999023, + "logps/chosen": -2.029991865158081, + "logps/rejected": -2.273928165435791, + "loss": 1.7689, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.2999210357666, + "rewards/margins": 2.4393606185913086, + "rewards/rejected": -22.739282608032227, + "step": 27830 + }, + { + "epoch": 0.9381846371633692, + "grad_norm": 31.801822662353516, + "learning_rate": 1.1594849743994384e-08, + "logits/chosen": -2.074384927749634, + "logits/rejected": -1.9508994817733765, + "logps/chosen": -2.286508083343506, + "logps/rejected": -2.482470989227295, + "loss": 1.9202, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.865079879760742, + "rewards/margins": 1.9596277475357056, + "rewards/rejected": -24.824708938598633, + "step": 27835 + }, + { + "epoch": 0.9383531632343524, + "grad_norm": 66.98432922363281, + "learning_rate": 1.1531958100571948e-08, + "logits/chosen": -2.0815181732177734, + "logits/rejected": -2.2014503479003906, + "logps/chosen": -2.952211856842041, + "logps/rejected": -3.1800284385681152, + "loss": 3.0369, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.522119522094727, + "rewards/margins": 2.2781662940979004, + "rewards/rejected": -31.8002872467041, + "step": 27840 + }, + { + "epoch": 0.9385216893053355, + "grad_norm": 28.881742477416992, + "learning_rate": 1.146923549669393e-08, + "logits/chosen": -2.072078227996826, + "logits/rejected": -2.109978437423706, + "logps/chosen": -2.861506462097168, + "logps/rejected": -3.2766547203063965, + "loss": 3.2541, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.615062713623047, + "rewards/margins": 4.151480674743652, + "rewards/rejected": -32.76654815673828, + "step": 27845 + }, + { + "epoch": 0.9386902153763187, + "grad_norm": 106.37144470214844, + "learning_rate": 1.1406681954066244e-08, + "logits/chosen": -1.798151969909668, + "logits/rejected": -1.6892074346542358, + "logps/chosen": -2.5100529193878174, + "logps/rejected": -2.3787970542907715, + "loss": 4.5418, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.100528717041016, + "rewards/margins": -1.3125566244125366, + "rewards/rejected": -23.78797149658203, + "step": 27850 + }, + { + "epoch": 0.9388587414473019, + "grad_norm": 54.06602096557617, + "learning_rate": 1.1344297494336075e-08, + "logits/chosen": -1.5469924211502075, + "logits/rejected": -1.8411296606063843, + "logps/chosen": -1.773207426071167, + "logps/rejected": -1.9032232761383057, + "loss": 3.1252, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.732074737548828, + "rewards/margins": 1.3001577854156494, + "rewards/rejected": -19.0322322845459, + "step": 27855 + }, + { + "epoch": 0.939027267518285, + "grad_norm": 33.818641662597656, + "learning_rate": 1.1282082139092319e-08, + "logits/chosen": -1.8127334117889404, + "logits/rejected": -2.0438244342803955, + "logps/chosen": -2.7030246257781982, + "logps/rejected": -2.8986270427703857, + "loss": 2.6653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.03024673461914, + "rewards/margins": 1.9560247659683228, + "rewards/rejected": -28.98626708984375, + "step": 27860 + }, + { + "epoch": 0.9391957935892683, + "grad_norm": 287.0670471191406, + "learning_rate": 1.1220035909865145e-08, + "logits/chosen": -1.7046902179718018, + "logits/rejected": -1.8295114040374756, + "logps/chosen": -2.8792572021484375, + "logps/rejected": -3.215181350708008, + "loss": 4.1455, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.792572021484375, + "rewards/margins": 3.3592400550842285, + "rewards/rejected": -32.15180969238281, + "step": 27865 + }, + { + "epoch": 0.9393643196602515, + "grad_norm": 81.16524505615234, + "learning_rate": 1.115815882812643e-08, + "logits/chosen": -2.333130121231079, + "logits/rejected": -2.2076778411865234, + "logps/chosen": -3.074096918106079, + "logps/rejected": -2.455504894256592, + "loss": 9.8043, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.740970611572266, + "rewards/margins": -6.185919761657715, + "rewards/rejected": -24.5550479888916, + "step": 27870 + }, + { + "epoch": 0.9395328457312346, + "grad_norm": 279.53173828125, + "learning_rate": 1.1096450915289324e-08, + "logits/chosen": -2.2081775665283203, + "logits/rejected": -1.953830361366272, + "logps/chosen": -2.8438355922698975, + "logps/rejected": -2.6367883682250977, + "loss": 7.0671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.4383544921875, + "rewards/margins": -2.0704727172851562, + "rewards/rejected": -26.367883682250977, + "step": 27875 + }, + { + "epoch": 0.9397013718022178, + "grad_norm": 24.882822036743164, + "learning_rate": 1.103491219270858e-08, + "logits/chosen": -2.1974661350250244, + "logits/rejected": -2.2992677688598633, + "logps/chosen": -2.638702869415283, + "logps/rejected": -2.751085042953491, + "loss": 3.4128, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.38702964782715, + "rewards/margins": 1.1238213777542114, + "rewards/rejected": -27.510848999023438, + "step": 27880 + }, + { + "epoch": 0.939869897873201, + "grad_norm": 0.040229279547929764, + "learning_rate": 1.0973542681680215e-08, + "logits/chosen": -2.1547296047210693, + "logits/rejected": -2.1364502906799316, + "logps/chosen": -2.7284128665924072, + "logps/rejected": -2.731088161468506, + "loss": 5.0208, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.284130096435547, + "rewards/margins": 0.026755237951874733, + "rewards/rejected": -27.310882568359375, + "step": 27885 + }, + { + "epoch": 0.9400384239441841, + "grad_norm": 69.75837707519531, + "learning_rate": 1.0912342403441854e-08, + "logits/chosen": -2.2461647987365723, + "logits/rejected": -2.149193525314331, + "logps/chosen": -2.4559860229492188, + "logps/rejected": -2.5802507400512695, + "loss": 2.2705, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.559860229492188, + "rewards/margins": 1.2426480054855347, + "rewards/rejected": -25.802509307861328, + "step": 27890 + }, + { + "epoch": 0.9402069500151673, + "grad_norm": 21.39365005493164, + "learning_rate": 1.0851311379172556e-08, + "logits/chosen": -1.7585290670394897, + "logits/rejected": -1.6060632467269897, + "logps/chosen": -2.1874899864196777, + "logps/rejected": -2.264883041381836, + "loss": 3.467, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.874902725219727, + "rewards/margins": 0.7739295959472656, + "rewards/rejected": -22.64883041381836, + "step": 27895 + }, + { + "epoch": 0.9403754760861506, + "grad_norm": 101.61637115478516, + "learning_rate": 1.0790449629992648e-08, + "logits/chosen": -2.101813793182373, + "logits/rejected": -2.3753764629364014, + "logps/chosen": -3.024099826812744, + "logps/rejected": -3.160783290863037, + "loss": 2.7098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.240997314453125, + "rewards/margins": 1.366835117340088, + "rewards/rejected": -31.607837677001953, + "step": 27900 + }, + { + "epoch": 0.9405440021571337, + "grad_norm": 42.03780746459961, + "learning_rate": 1.0729757176964005e-08, + "logits/chosen": -2.129361867904663, + "logits/rejected": -2.212040424346924, + "logps/chosen": -2.0724599361419678, + "logps/rejected": -2.3914830684661865, + "loss": 1.864, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.724597930908203, + "rewards/margins": 3.1902308464050293, + "rewards/rejected": -23.91482925415039, + "step": 27905 + }, + { + "epoch": 0.9407125282281169, + "grad_norm": 33.489646911621094, + "learning_rate": 1.0669234041089991e-08, + "logits/chosen": -1.8179874420166016, + "logits/rejected": -2.0074830055236816, + "logps/chosen": -2.027855634689331, + "logps/rejected": -2.0084097385406494, + "loss": 3.5226, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.2785587310791, + "rewards/margins": -0.1944606751203537, + "rewards/rejected": -20.084096908569336, + "step": 27910 + }, + { + "epoch": 0.9408810542991001, + "grad_norm": 53.953433990478516, + "learning_rate": 1.0608880243315188e-08, + "logits/chosen": -2.8705196380615234, + "logits/rejected": -2.637935161590576, + "logps/chosen": -2.8548240661621094, + "logps/rejected": -2.816099166870117, + "loss": 4.5831, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.548242568969727, + "rewards/margins": -0.3872489929199219, + "rewards/rejected": -28.16098976135254, + "step": 27915 + }, + { + "epoch": 0.9410495803700832, + "grad_norm": 26.6319522857666, + "learning_rate": 1.054869580452572e-08, + "logits/chosen": -1.5271388292312622, + "logits/rejected": -1.7312593460083008, + "logps/chosen": -1.947431206703186, + "logps/rejected": -2.1562416553497314, + "loss": 2.0796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.47431182861328, + "rewards/margins": 2.0881032943725586, + "rewards/rejected": -21.562414169311523, + "step": 27920 + }, + { + "epoch": 0.9412181064410664, + "grad_norm": 1.690170407295227, + "learning_rate": 1.0488680745548983e-08, + "logits/chosen": -1.8683593273162842, + "logits/rejected": -2.092747211456299, + "logps/chosen": -2.7897651195526123, + "logps/rejected": -3.438420057296753, + "loss": 1.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -27.89764976501465, + "rewards/margins": 6.486549377441406, + "rewards/rejected": -34.38420104980469, + "step": 27925 + }, + { + "epoch": 0.9413866325120496, + "grad_norm": 24.103199005126953, + "learning_rate": 1.042883508715392e-08, + "logits/chosen": -2.063265323638916, + "logits/rejected": -2.2874794006347656, + "logps/chosen": -2.7128069400787354, + "logps/rejected": -2.7375307083129883, + "loss": 3.6814, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.128067016601562, + "rewards/margins": 0.24723930656909943, + "rewards/rejected": -27.375308990478516, + "step": 27930 + }, + { + "epoch": 0.9415551585830328, + "grad_norm": 30.12285614013672, + "learning_rate": 1.036915885005063e-08, + "logits/chosen": -2.5748400688171387, + "logits/rejected": -2.4589247703552246, + "logps/chosen": -2.301511526107788, + "logps/rejected": -2.3860554695129395, + "loss": 2.9559, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.01511573791504, + "rewards/margins": 0.8454399108886719, + "rewards/rejected": -23.860553741455078, + "step": 27935 + }, + { + "epoch": 0.941723684654016, + "grad_norm": 29.477624893188477, + "learning_rate": 1.0309652054890816e-08, + "logits/chosen": -1.42814302444458, + "logits/rejected": -1.6454927921295166, + "logps/chosen": -2.172950029373169, + "logps/rejected": -2.59196138381958, + "loss": 1.1831, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.7294979095459, + "rewards/margins": 4.190115451812744, + "rewards/rejected": -25.919612884521484, + "step": 27940 + }, + { + "epoch": 0.9418922107249992, + "grad_norm": 22.671653747558594, + "learning_rate": 1.025031472226734e-08, + "logits/chosen": -1.8475160598754883, + "logits/rejected": -2.1124308109283447, + "logps/chosen": -1.6322600841522217, + "logps/rejected": -2.4699718952178955, + "loss": 1.9411, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -16.322601318359375, + "rewards/margins": 8.377116203308105, + "rewards/rejected": -24.699716567993164, + "step": 27945 + }, + { + "epoch": 0.9420607367959823, + "grad_norm": 25.53143882751465, + "learning_rate": 1.0191146872714662e-08, + "logits/chosen": -1.754601240158081, + "logits/rejected": -2.093977451324463, + "logps/chosen": -1.7095845937728882, + "logps/rejected": -2.059696674346924, + "loss": 0.7661, + "rewards/accuracies": 1.0, + "rewards/chosen": -17.095844268798828, + "rewards/margins": 3.5011227130889893, + "rewards/rejected": -20.596969604492188, + "step": 27950 + }, + { + "epoch": 0.9422292628669655, + "grad_norm": 39.68607711791992, + "learning_rate": 1.0132148526708296e-08, + "logits/chosen": -1.9065732955932617, + "logits/rejected": -2.1846835613250732, + "logps/chosen": -1.9765098094940186, + "logps/rejected": -2.0324313640594482, + "loss": 2.7704, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.765098571777344, + "rewards/margins": 0.5592161417007446, + "rewards/rejected": -20.32431411743164, + "step": 27955 + }, + { + "epoch": 0.9423977889379487, + "grad_norm": 27.000288009643555, + "learning_rate": 1.0073319704665295e-08, + "logits/chosen": -2.5076403617858887, + "logits/rejected": -2.666171073913574, + "logps/chosen": -2.1777713298797607, + "logps/rejected": -2.539689302444458, + "loss": 2.1136, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.777713775634766, + "rewards/margins": 3.619178056716919, + "rewards/rejected": -25.396892547607422, + "step": 27960 + }, + { + "epoch": 0.9425663150089318, + "grad_norm": 21.133543014526367, + "learning_rate": 1.0014660426944044e-08, + "logits/chosen": -1.6421520709991455, + "logits/rejected": -2.5268847942352295, + "logps/chosen": -2.5491130352020264, + "logps/rejected": -2.8137052059173584, + "loss": 2.273, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.491130828857422, + "rewards/margins": 2.645921230316162, + "rewards/rejected": -28.13705062866211, + "step": 27965 + }, + { + "epoch": 0.942734841079915, + "grad_norm": 49.087364196777344, + "learning_rate": 9.956170713844136e-09, + "logits/chosen": -1.8143583536148071, + "logits/rejected": -2.109388828277588, + "logps/chosen": -2.2308526039123535, + "logps/rejected": -2.328622817993164, + "loss": 2.7613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.308523178100586, + "rewards/margins": 0.9777054786682129, + "rewards/rejected": -23.28622817993164, + "step": 27970 + }, + { + "epoch": 0.9429033671508983, + "grad_norm": 16.130882263183594, + "learning_rate": 9.897850585606605e-09, + "logits/chosen": -2.2523300647735596, + "logits/rejected": -2.478358507156372, + "logps/chosen": -2.370288133621216, + "logps/rejected": -2.562955379486084, + "loss": 1.9819, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.702880859375, + "rewards/margins": 1.926670789718628, + "rewards/rejected": -25.629552841186523, + "step": 27975 + }, + { + "epoch": 0.9430718932218815, + "grad_norm": 49.06074905395508, + "learning_rate": 9.839700062413692e-09, + "logits/chosen": -1.795732855796814, + "logits/rejected": -2.104189157485962, + "logps/chosen": -2.573464870452881, + "logps/rejected": -2.9868791103363037, + "loss": 3.0401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.734649658203125, + "rewards/margins": 4.134143829345703, + "rewards/rejected": -29.868793487548828, + "step": 27980 + }, + { + "epoch": 0.9432404192928646, + "grad_norm": 35.39234924316406, + "learning_rate": 9.78171916438908e-09, + "logits/chosen": -2.184047222137451, + "logits/rejected": -2.3190789222717285, + "logps/chosen": -3.3273043632507324, + "logps/rejected": -3.9008243083953857, + "loss": 2.3656, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.273040771484375, + "rewards/margins": 5.735200881958008, + "rewards/rejected": -39.008243560791016, + "step": 27985 + }, + { + "epoch": 0.9434089453638478, + "grad_norm": 36.98085403442383, + "learning_rate": 9.723907911597607e-09, + "logits/chosen": -1.1470811367034912, + "logits/rejected": -1.2830091714859009, + "logps/chosen": -2.3072681427001953, + "logps/rejected": -2.4950547218322754, + "loss": 2.3621, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.072681427001953, + "rewards/margins": 1.877862572669983, + "rewards/rejected": -24.950544357299805, + "step": 27990 + }, + { + "epoch": 0.943577471434831, + "grad_norm": 60.771636962890625, + "learning_rate": 9.666266324045547e-09, + "logits/chosen": -2.091439723968506, + "logits/rejected": -1.8897613286972046, + "logps/chosen": -2.765847682952881, + "logps/rejected": -2.8236958980560303, + "loss": 5.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.658477783203125, + "rewards/margins": 0.5784798860549927, + "rewards/rejected": -28.236957550048828, + "step": 27995 + }, + { + "epoch": 0.9437459975058141, + "grad_norm": 0.013381626456975937, + "learning_rate": 9.608794421680334e-09, + "logits/chosen": -2.3415331840515137, + "logits/rejected": -2.6215662956237793, + "logps/chosen": -3.6000258922576904, + "logps/rejected": -4.7026519775390625, + "loss": 0.677, + "rewards/accuracies": 1.0, + "rewards/chosen": -36.00026321411133, + "rewards/margins": 11.026254653930664, + "rewards/rejected": -47.02651596069336, + "step": 28000 + }, + { + "epoch": 0.9437459975058141, + "eval_logits/chosen": -2.313400983810425, + "eval_logits/rejected": -2.4919190406799316, + "eval_logps/chosen": -2.2887589931488037, + "eval_logps/rejected": -2.4447197914123535, + "eval_loss": 3.0835182666778564, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.887590408325195, + "eval_rewards/margins": 1.5596061944961548, + "eval_rewards/rejected": -24.447195053100586, + "eval_runtime": 12.8885, + "eval_samples_per_second": 7.759, + "eval_steps_per_second": 1.94, + "step": 28000 + }, + { + "epoch": 0.9439145235767973, + "grad_norm": 52.55608367919922, + "learning_rate": 9.551492224390666e-09, + "logits/chosen": -2.1889519691467285, + "logits/rejected": -2.4650866985321045, + "logps/chosen": -2.833047866821289, + "logps/rejected": -3.5559210777282715, + "loss": 2.5163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.33047866821289, + "rewards/margins": 7.228735446929932, + "rewards/rejected": -35.5592155456543, + "step": 28005 + }, + { + "epoch": 0.9440830496477806, + "grad_norm": 28.72416114807129, + "learning_rate": 9.494359752006686e-09, + "logits/chosen": -1.4533464908599854, + "logits/rejected": -2.3801748752593994, + "logps/chosen": -2.3763134479522705, + "logps/rejected": -3.6961147785186768, + "loss": 1.967, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.76313591003418, + "rewards/margins": 13.19801139831543, + "rewards/rejected": -36.961151123046875, + "step": 28010 + }, + { + "epoch": 0.9442515757187637, + "grad_norm": 8.59366512298584, + "learning_rate": 9.437397024299631e-09, + "logits/chosen": -1.1914489269256592, + "logits/rejected": -1.926476240158081, + "logps/chosen": -2.2145838737487793, + "logps/rejected": -3.3406097888946533, + "loss": 1.6371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.145837783813477, + "rewards/margins": 11.26025676727295, + "rewards/rejected": -33.40609359741211, + "step": 28015 + }, + { + "epoch": 0.9444201017897469, + "grad_norm": 30.539743423461914, + "learning_rate": 9.380604060982123e-09, + "logits/chosen": -1.732550024986267, + "logits/rejected": -2.0217487812042236, + "logps/chosen": -2.098297595977783, + "logps/rejected": -2.4610893726348877, + "loss": 1.9242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.982975006103516, + "rewards/margins": 3.6279189586639404, + "rewards/rejected": -24.61089515686035, + "step": 28020 + }, + { + "epoch": 0.9445886278607301, + "grad_norm": 27.034799575805664, + "learning_rate": 9.323980881707827e-09, + "logits/chosen": -1.8111553192138672, + "logits/rejected": -1.981406807899475, + "logps/chosen": -2.0265769958496094, + "logps/rejected": -2.132596015930176, + "loss": 2.2946, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.265771865844727, + "rewards/margins": 1.0601909160614014, + "rewards/rejected": -21.32596206665039, + "step": 28025 + }, + { + "epoch": 0.9447571539317132, + "grad_norm": 50.454833984375, + "learning_rate": 9.26752750607196e-09, + "logits/chosen": -2.05342173576355, + "logits/rejected": -1.770754098892212, + "logps/chosen": -3.303680419921875, + "logps/rejected": -2.9499616622924805, + "loss": 7.4913, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -33.03680419921875, + "rewards/margins": -3.537187099456787, + "rewards/rejected": -29.499618530273438, + "step": 28030 + }, + { + "epoch": 0.9449256800026964, + "grad_norm": 129.8310089111328, + "learning_rate": 9.211243953610726e-09, + "logits/chosen": -1.383266568183899, + "logits/rejected": -1.8043930530548096, + "logps/chosen": -2.948296308517456, + "logps/rejected": -2.9599292278289795, + "loss": 3.9439, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -29.482959747314453, + "rewards/margins": 0.11632823944091797, + "rewards/rejected": -29.599292755126953, + "step": 28035 + }, + { + "epoch": 0.9450942060736796, + "grad_norm": 44.37803268432617, + "learning_rate": 9.1551302438016e-09, + "logits/chosen": -2.1227376461029053, + "logits/rejected": -2.096128225326538, + "logps/chosen": -2.1822402477264404, + "logps/rejected": -2.4015820026397705, + "loss": 2.6138, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.822399139404297, + "rewards/margins": 2.193415880203247, + "rewards/rejected": -24.015817642211914, + "step": 28040 + }, + { + "epoch": 0.9452627321446628, + "grad_norm": 20.145410537719727, + "learning_rate": 9.09918639606344e-09, + "logits/chosen": -1.7266706228256226, + "logits/rejected": -1.9664170742034912, + "logps/chosen": -3.275007963180542, + "logps/rejected": -3.9894778728485107, + "loss": 1.1419, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -32.75008010864258, + "rewards/margins": 7.1446990966796875, + "rewards/rejected": -39.89478302001953, + "step": 28045 + }, + { + "epoch": 0.945431258215646, + "grad_norm": 27.345746994018555, + "learning_rate": 9.043412429756091e-09, + "logits/chosen": -1.701939344406128, + "logits/rejected": -1.802038550376892, + "logps/chosen": -2.7647578716278076, + "logps/rejected": -2.5529465675354004, + "loss": 5.8626, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.647579193115234, + "rewards/margins": -2.1181139945983887, + "rewards/rejected": -25.529464721679688, + "step": 28050 + }, + { + "epoch": 0.9455997842866292, + "grad_norm": 46.19438552856445, + "learning_rate": 8.987808364180837e-09, + "logits/chosen": -2.0644924640655518, + "logits/rejected": -2.2390027046203613, + "logps/chosen": -2.9007680416107178, + "logps/rejected": -2.8708555698394775, + "loss": 4.7119, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.007680892944336, + "rewards/margins": -0.29912489652633667, + "rewards/rejected": -28.70855712890625, + "step": 28055 + }, + { + "epoch": 0.9457683103576123, + "grad_norm": 59.57084655761719, + "learning_rate": 8.932374218579953e-09, + "logits/chosen": -1.6476083993911743, + "logits/rejected": -1.338379144668579, + "logps/chosen": -2.277503728866577, + "logps/rejected": -2.210233211517334, + "loss": 3.7347, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -22.775039672851562, + "rewards/margins": -0.6727081537246704, + "rewards/rejected": -22.102331161499023, + "step": 28060 + }, + { + "epoch": 0.9459368364285955, + "grad_norm": 162.84632873535156, + "learning_rate": 8.87711001213709e-09, + "logits/chosen": -2.315368413925171, + "logits/rejected": -2.393383741378784, + "logps/chosen": -2.475985050201416, + "logps/rejected": -2.5510942935943604, + "loss": 3.2253, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.75984764099121, + "rewards/margins": 0.7510935068130493, + "rewards/rejected": -25.510942459106445, + "step": 28065 + }, + { + "epoch": 0.9461053624995787, + "grad_norm": 27.281959533691406, + "learning_rate": 8.822015763977009e-09, + "logits/chosen": -2.1430652141571045, + "logits/rejected": -2.3477883338928223, + "logps/chosen": -2.2344632148742676, + "logps/rejected": -2.4346368312835693, + "loss": 1.9776, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.34463119506836, + "rewards/margins": 2.0017340183258057, + "rewards/rejected": -24.34636688232422, + "step": 28070 + }, + { + "epoch": 0.9462738885705618, + "grad_norm": 5.122872829437256, + "learning_rate": 8.767091493165568e-09, + "logits/chosen": -1.899531602859497, + "logits/rejected": -2.1331827640533447, + "logps/chosen": -2.2442357540130615, + "logps/rejected": -3.0054221153259277, + "loss": 1.1727, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -22.44235610961914, + "rewards/margins": 7.6118669509887695, + "rewards/rejected": -30.054224014282227, + "step": 28075 + }, + { + "epoch": 0.946442414641545, + "grad_norm": 0.05286615341901779, + "learning_rate": 8.712337218710009e-09, + "logits/chosen": -1.9935481548309326, + "logits/rejected": -2.8229637145996094, + "logps/chosen": -2.3799171447753906, + "logps/rejected": -3.0323686599731445, + "loss": 1.2227, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.79917335510254, + "rewards/margins": 6.524515628814697, + "rewards/rejected": -30.323688507080078, + "step": 28080 + }, + { + "epoch": 0.9466109407125283, + "grad_norm": 30.209877014160156, + "learning_rate": 8.657752959558562e-09, + "logits/chosen": -1.6426223516464233, + "logits/rejected": -1.5145375728607178, + "logps/chosen": -2.6865198612213135, + "logps/rejected": -2.789952516555786, + "loss": 3.3766, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -26.865198135375977, + "rewards/margins": 1.0343248844146729, + "rewards/rejected": -27.899524688720703, + "step": 28085 + }, + { + "epoch": 0.9467794667835114, + "grad_norm": 12.501883506774902, + "learning_rate": 8.60333873460073e-09, + "logits/chosen": -1.8514074087142944, + "logits/rejected": -2.0068726539611816, + "logps/chosen": -2.2644107341766357, + "logps/rejected": -2.4691319465637207, + "loss": 1.6423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.64410972595215, + "rewards/margins": 2.0472114086151123, + "rewards/rejected": -24.691320419311523, + "step": 28090 + }, + { + "epoch": 0.9469479928544946, + "grad_norm": 19.746841430664062, + "learning_rate": 8.549094562667059e-09, + "logits/chosen": -2.1093087196350098, + "logits/rejected": -2.5620765686035156, + "logps/chosen": -2.370896100997925, + "logps/rejected": -3.1036839485168457, + "loss": 2.4403, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.708959579467773, + "rewards/margins": 7.327878475189209, + "rewards/rejected": -31.036840438842773, + "step": 28095 + }, + { + "epoch": 0.9471165189254778, + "grad_norm": 51.745853424072266, + "learning_rate": 8.495020462529368e-09, + "logits/chosen": -1.7082334756851196, + "logits/rejected": -2.372655153274536, + "logps/chosen": -2.9153425693511963, + "logps/rejected": -4.1864423751831055, + "loss": 4.3572, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.153427124023438, + "rewards/margins": 12.710992813110352, + "rewards/rejected": -41.864418029785156, + "step": 28100 + }, + { + "epoch": 0.9472850449964609, + "grad_norm": 62.38421630859375, + "learning_rate": 8.441116452900632e-09, + "logits/chosen": -1.7928497791290283, + "logits/rejected": -1.9549148082733154, + "logps/chosen": -2.526888370513916, + "logps/rejected": -2.4071240425109863, + "loss": 4.8002, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -25.26888084411621, + "rewards/margins": -1.1976430416107178, + "rewards/rejected": -24.071239471435547, + "step": 28105 + }, + { + "epoch": 0.9474535710674441, + "grad_norm": 18.18597412109375, + "learning_rate": 8.387382552434763e-09, + "logits/chosen": -2.213923454284668, + "logits/rejected": -2.179241180419922, + "logps/chosen": -2.6775755882263184, + "logps/rejected": -3.191340208053589, + "loss": 4.1458, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.7757568359375, + "rewards/margins": 5.137645721435547, + "rewards/rejected": -31.913400650024414, + "step": 28110 + }, + { + "epoch": 0.9476220971384273, + "grad_norm": 54.165409088134766, + "learning_rate": 8.333818779727053e-09, + "logits/chosen": -1.8373768329620361, + "logits/rejected": -1.9318361282348633, + "logps/chosen": -2.7003121376037598, + "logps/rejected": -3.0800962448120117, + "loss": 2.3767, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.003122329711914, + "rewards/margins": 3.7978405952453613, + "rewards/rejected": -30.80096435546875, + "step": 28115 + }, + { + "epoch": 0.9477906232094105, + "grad_norm": 78.94615936279297, + "learning_rate": 8.280425153313786e-09, + "logits/chosen": -1.6564782857894897, + "logits/rejected": -1.58085298538208, + "logps/chosen": -2.667534828186035, + "logps/rejected": -2.594963550567627, + "loss": 4.0251, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -26.67534828186035, + "rewards/margins": -0.725711464881897, + "rewards/rejected": -25.949636459350586, + "step": 28120 + }, + { + "epoch": 0.9479591492803937, + "grad_norm": 162.03231811523438, + "learning_rate": 8.227201691672403e-09, + "logits/chosen": -2.3832004070281982, + "logits/rejected": -2.4587082862854004, + "logps/chosen": -3.8883845806121826, + "logps/rejected": -3.8318228721618652, + "loss": 4.067, + "rewards/accuracies": 0.5, + "rewards/chosen": -38.88385009765625, + "rewards/margins": -0.5656188726425171, + "rewards/rejected": -38.318233489990234, + "step": 28125 + }, + { + "epoch": 0.9481276753513769, + "grad_norm": 0.09015277773141861, + "learning_rate": 8.174148413221448e-09, + "logits/chosen": -1.579756736755371, + "logits/rejected": -2.15185809135437, + "logps/chosen": -2.458775758743286, + "logps/rejected": -2.8276195526123047, + "loss": 2.6917, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.587757110595703, + "rewards/margins": 3.688439130783081, + "rewards/rejected": -28.276195526123047, + "step": 28130 + }, + { + "epoch": 0.94829620142236, + "grad_norm": 47.610538482666016, + "learning_rate": 8.121265336320572e-09, + "logits/chosen": -1.3809497356414795, + "logits/rejected": -1.3594849109649658, + "logps/chosen": -2.757594108581543, + "logps/rejected": -3.185593605041504, + "loss": 2.5558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.575939178466797, + "rewards/margins": 4.279994964599609, + "rewards/rejected": -31.855932235717773, + "step": 28135 + }, + { + "epoch": 0.9484647274933432, + "grad_norm": 61.344520568847656, + "learning_rate": 8.068552479270519e-09, + "logits/chosen": -2.294250249862671, + "logits/rejected": -2.034433603286743, + "logps/chosen": -3.1079373359680176, + "logps/rejected": -3.056140899658203, + "loss": 4.0428, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.079376220703125, + "rewards/margins": -0.5179659128189087, + "rewards/rejected": -30.5614070892334, + "step": 28140 + }, + { + "epoch": 0.9486332535643264, + "grad_norm": 33.442928314208984, + "learning_rate": 8.016009860313089e-09, + "logits/chosen": -1.8951114416122437, + "logits/rejected": -2.076521635055542, + "logps/chosen": -2.8847079277038574, + "logps/rejected": -3.0558080673217773, + "loss": 2.3421, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.847076416015625, + "rewards/margins": 1.7110036611557007, + "rewards/rejected": -30.558080673217773, + "step": 28145 + }, + { + "epoch": 0.9488017796353095, + "grad_norm": 28.92125701904297, + "learning_rate": 7.963637497631237e-09, + "logits/chosen": -2.5039522647857666, + "logits/rejected": -2.384413242340088, + "logps/chosen": -2.762303352355957, + "logps/rejected": -3.159745454788208, + "loss": 1.1603, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.623035430908203, + "rewards/margins": 3.9744210243225098, + "rewards/rejected": -31.597454071044922, + "step": 28150 + }, + { + "epoch": 0.9489703057062928, + "grad_norm": 18.608285903930664, + "learning_rate": 7.91143540934902e-09, + "logits/chosen": -1.7014240026474, + "logits/rejected": -2.3139872550964355, + "logps/chosen": -3.1759629249572754, + "logps/rejected": -3.790762424468994, + "loss": 2.2991, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.759624481201172, + "rewards/margins": 6.147995948791504, + "rewards/rejected": -37.907623291015625, + "step": 28155 + }, + { + "epoch": 0.949138831777276, + "grad_norm": 58.06821060180664, + "learning_rate": 7.859403613531546e-09, + "logits/chosen": -1.2894471883773804, + "logits/rejected": -1.6731243133544922, + "logps/chosen": -2.6358516216278076, + "logps/rejected": -3.137530565261841, + "loss": 2.8082, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.358516693115234, + "rewards/margins": 5.01678991317749, + "rewards/rejected": -31.37530517578125, + "step": 28160 + }, + { + "epoch": 0.9493073578482591, + "grad_norm": 24.11583709716797, + "learning_rate": 7.807542128184852e-09, + "logits/chosen": -2.01277756690979, + "logits/rejected": -2.4013025760650635, + "logps/chosen": -1.914523720741272, + "logps/rejected": -2.132474184036255, + "loss": 2.4283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.14523696899414, + "rewards/margins": 2.179502487182617, + "rewards/rejected": -21.324739456176758, + "step": 28165 + }, + { + "epoch": 0.9494758839192423, + "grad_norm": 37.180076599121094, + "learning_rate": 7.75585097125625e-09, + "logits/chosen": -1.6620514392852783, + "logits/rejected": -2.2578060626983643, + "logps/chosen": -2.9266459941864014, + "logps/rejected": -3.3216185569763184, + "loss": 2.0754, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.266460418701172, + "rewards/margins": 3.949725389480591, + "rewards/rejected": -33.2161865234375, + "step": 28170 + }, + { + "epoch": 0.9496444099902255, + "grad_norm": 17.247196197509766, + "learning_rate": 7.704330160633987e-09, + "logits/chosen": -1.8131141662597656, + "logits/rejected": -2.2728631496429443, + "logps/chosen": -2.863938808441162, + "logps/rejected": -3.2482426166534424, + "loss": 2.2238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.639385223388672, + "rewards/margins": 3.8430371284484863, + "rewards/rejected": -32.482421875, + "step": 28175 + }, + { + "epoch": 0.9498129360612086, + "grad_norm": 24.438737869262695, + "learning_rate": 7.652979714147357e-09, + "logits/chosen": -2.0364794731140137, + "logits/rejected": -2.189481019973755, + "logps/chosen": -2.011589288711548, + "logps/rejected": -1.904343605041504, + "loss": 4.4351, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.11589241027832, + "rewards/margins": -1.0724555253982544, + "rewards/rejected": -19.04343605041504, + "step": 28180 + }, + { + "epoch": 0.9499814621321918, + "grad_norm": 21.459518432617188, + "learning_rate": 7.601799649566699e-09, + "logits/chosen": -1.9324400424957275, + "logits/rejected": -1.7644433975219727, + "logps/chosen": -2.0944879055023193, + "logps/rejected": -2.2868826389312744, + "loss": 2.7438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.94487953186035, + "rewards/margins": 1.9239448308944702, + "rewards/rejected": -22.868825912475586, + "step": 28185 + }, + { + "epoch": 0.950149988203175, + "grad_norm": 41.32154083251953, + "learning_rate": 7.550789984603512e-09, + "logits/chosen": -1.8872880935668945, + "logits/rejected": -2.033036470413208, + "logps/chosen": -2.2491378784179688, + "logps/rejected": -2.213416337966919, + "loss": 3.4831, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.49138069152832, + "rewards/margins": -0.3572167456150055, + "rewards/rejected": -22.1341609954834, + "step": 28190 + }, + { + "epoch": 0.9503185142741583, + "grad_norm": 60.450828552246094, + "learning_rate": 7.499950736910232e-09, + "logits/chosen": -1.3878852128982544, + "logits/rejected": -1.6997029781341553, + "logps/chosen": -1.9645227193832397, + "logps/rejected": -2.1677870750427246, + "loss": 2.4453, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.64522933959961, + "rewards/margins": 2.032642364501953, + "rewards/rejected": -21.677871704101562, + "step": 28195 + }, + { + "epoch": 0.9504870403451414, + "grad_norm": 24.949216842651367, + "learning_rate": 7.449281924080231e-09, + "logits/chosen": -2.2458033561706543, + "logits/rejected": -2.445981502532959, + "logps/chosen": -1.9202144145965576, + "logps/rejected": -2.4266836643218994, + "loss": 1.3608, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.202144622802734, + "rewards/margins": 5.064691066741943, + "rewards/rejected": -24.266836166381836, + "step": 28200 + }, + { + "epoch": 0.9506555664161246, + "grad_norm": 202.5205535888672, + "learning_rate": 7.398783563648037e-09, + "logits/chosen": -1.5952340364456177, + "logits/rejected": -1.6486787796020508, + "logps/chosen": -2.4221603870391846, + "logps/rejected": -2.4342808723449707, + "loss": 5.1397, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.22160530090332, + "rewards/margins": 0.12120513617992401, + "rewards/rejected": -24.34280776977539, + "step": 28205 + }, + { + "epoch": 0.9508240924871078, + "grad_norm": 33.384708404541016, + "learning_rate": 7.348455673089171e-09, + "logits/chosen": -1.8759981393814087, + "logits/rejected": -1.9951766729354858, + "logps/chosen": -1.9931285381317139, + "logps/rejected": -2.0459671020507812, + "loss": 3.0892, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -19.93128776550293, + "rewards/margins": 0.5283831357955933, + "rewards/rejected": -20.459671020507812, + "step": 28210 + }, + { + "epoch": 0.9509926185580909, + "grad_norm": 34.56550598144531, + "learning_rate": 7.298298269820091e-09, + "logits/chosen": -1.8990745544433594, + "logits/rejected": -1.8381602764129639, + "logps/chosen": -2.5596415996551514, + "logps/rejected": -2.999729633331299, + "loss": 3.3995, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.596416473388672, + "rewards/margins": 4.40087890625, + "rewards/rejected": -29.997295379638672, + "step": 28215 + }, + { + "epoch": 0.9511611446290741, + "grad_norm": 73.5636215209961, + "learning_rate": 7.248311371198246e-09, + "logits/chosen": -1.5171552896499634, + "logits/rejected": -2.3751957416534424, + "logps/chosen": -2.4360406398773193, + "logps/rejected": -4.005704402923584, + "loss": 1.3572, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.360408782958984, + "rewards/margins": 15.696635246276855, + "rewards/rejected": -40.057044982910156, + "step": 28220 + }, + { + "epoch": 0.9513296707000573, + "grad_norm": 72.0538101196289, + "learning_rate": 7.198494994522242e-09, + "logits/chosen": -2.095430374145508, + "logits/rejected": -1.976109266281128, + "logps/chosen": -3.0543994903564453, + "logps/rejected": -3.1049439907073975, + "loss": 3.3299, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.543996810913086, + "rewards/margins": 0.5054424405097961, + "rewards/rejected": -31.0494384765625, + "step": 28225 + }, + { + "epoch": 0.9514981967710405, + "grad_norm": 42.48782730102539, + "learning_rate": 7.1488491570315116e-09, + "logits/chosen": -2.2845242023468018, + "logits/rejected": -2.450240135192871, + "logps/chosen": -2.590700626373291, + "logps/rejected": -2.9526290893554688, + "loss": 2.2784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.907007217407227, + "rewards/margins": 3.619286298751831, + "rewards/rejected": -29.526294708251953, + "step": 28230 + }, + { + "epoch": 0.9516667228420237, + "grad_norm": 26.9639949798584, + "learning_rate": 7.099373875906534e-09, + "logits/chosen": -1.91399347782135, + "logits/rejected": -1.7625631093978882, + "logps/chosen": -2.7082173824310303, + "logps/rejected": -2.8804149627685547, + "loss": 2.7748, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.08217430114746, + "rewards/margins": 1.7219760417938232, + "rewards/rejected": -28.804149627685547, + "step": 28235 + }, + { + "epoch": 0.9518352489130069, + "grad_norm": 0.7547553777694702, + "learning_rate": 7.050069168268724e-09, + "logits/chosen": -1.9236023426055908, + "logits/rejected": -2.384753704071045, + "logps/chosen": -2.392350435256958, + "logps/rejected": -2.638664484024048, + "loss": 3.2697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.92350196838379, + "rewards/margins": 2.4631431102752686, + "rewards/rejected": -26.386646270751953, + "step": 28240 + }, + { + "epoch": 0.95200377498399, + "grad_norm": 44.27131271362305, + "learning_rate": 7.000935051180546e-09, + "logits/chosen": -1.9845330715179443, + "logits/rejected": -2.276456117630005, + "logps/chosen": -2.1075778007507324, + "logps/rejected": -2.3886475563049316, + "loss": 1.9211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.07577896118164, + "rewards/margins": 2.810696840286255, + "rewards/rejected": -23.886472702026367, + "step": 28245 + }, + { + "epoch": 0.9521723010549732, + "grad_norm": 195.17190551757812, + "learning_rate": 6.951971541645341e-09, + "logits/chosen": -2.1520159244537354, + "logits/rejected": -2.5742106437683105, + "logps/chosen": -1.8013156652450562, + "logps/rejected": -1.940410852432251, + "loss": 2.6314, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.01315689086914, + "rewards/margins": 1.3909523487091064, + "rewards/rejected": -19.404109954833984, + "step": 28250 + }, + { + "epoch": 0.9523408271259564, + "grad_norm": 36.765411376953125, + "learning_rate": 6.9031786566075e-09, + "logits/chosen": -1.9867013692855835, + "logits/rejected": -2.1289658546447754, + "logps/chosen": -1.9412078857421875, + "logps/rejected": -1.8753058910369873, + "loss": 4.0751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.412078857421875, + "rewards/margins": -0.6590215563774109, + "rewards/rejected": -18.7530574798584, + "step": 28255 + }, + { + "epoch": 0.9525093531969395, + "grad_norm": 26.724512100219727, + "learning_rate": 6.854556412952239e-09, + "logits/chosen": -2.8886868953704834, + "logits/rejected": -2.6446754932403564, + "logps/chosen": -2.0121846199035645, + "logps/rejected": -2.0627729892730713, + "loss": 4.1562, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.121845245361328, + "rewards/margins": 0.5058833360671997, + "rewards/rejected": -20.627731323242188, + "step": 28260 + }, + { + "epoch": 0.9526778792679228, + "grad_norm": 29.14841079711914, + "learning_rate": 6.806104827505932e-09, + "logits/chosen": -1.601732850074768, + "logits/rejected": -1.7263450622558594, + "logps/chosen": -2.5978128910064697, + "logps/rejected": -2.745781421661377, + "loss": 3.0066, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.97812843322754, + "rewards/margins": 1.4796851873397827, + "rewards/rejected": -27.457813262939453, + "step": 28265 + }, + { + "epoch": 0.952846405338906, + "grad_norm": 55.97766876220703, + "learning_rate": 6.75782391703561e-09, + "logits/chosen": -1.4899791479110718, + "logits/rejected": -1.2709168195724487, + "logps/chosen": -2.8961353302001953, + "logps/rejected": -2.7889018058776855, + "loss": 4.4169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.961355209350586, + "rewards/margins": -1.072336196899414, + "rewards/rejected": -27.88901710510254, + "step": 28270 + }, + { + "epoch": 0.9530149314098891, + "grad_norm": 30.594099044799805, + "learning_rate": 6.709713698249464e-09, + "logits/chosen": -2.045694589614868, + "logits/rejected": -1.9917447566986084, + "logps/chosen": -2.5798110961914062, + "logps/rejected": -2.8535544872283936, + "loss": 2.7851, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.798110961914062, + "rewards/margins": 2.7374346256256104, + "rewards/rejected": -28.535547256469727, + "step": 28275 + }, + { + "epoch": 0.9531834574808723, + "grad_norm": 44.43939208984375, + "learning_rate": 6.66177418779651e-09, + "logits/chosen": -1.5889904499053955, + "logits/rejected": -1.861187219619751, + "logps/chosen": -2.142317295074463, + "logps/rejected": -2.2729101181030273, + "loss": 2.6055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.423171997070312, + "rewards/margins": 1.305925965309143, + "rewards/rejected": -22.72909927368164, + "step": 28280 + }, + { + "epoch": 0.9533519835518555, + "grad_norm": 77.94812774658203, + "learning_rate": 6.614005402266809e-09, + "logits/chosen": -2.1064066886901855, + "logits/rejected": -2.0448784828186035, + "logps/chosen": -2.8452343940734863, + "logps/rejected": -2.7356557846069336, + "loss": 4.3889, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.452342987060547, + "rewards/margins": -1.0957868099212646, + "rewards/rejected": -27.356555938720703, + "step": 28285 + }, + { + "epoch": 0.9535205096228386, + "grad_norm": 34.475746154785156, + "learning_rate": 6.566407358191195e-09, + "logits/chosen": -1.9868141412734985, + "logits/rejected": -1.904761552810669, + "logps/chosen": -2.3303680419921875, + "logps/rejected": -2.568265199661255, + "loss": 2.4678, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.303680419921875, + "rewards/margins": 2.37896990776062, + "rewards/rejected": -25.68265151977539, + "step": 28290 + }, + { + "epoch": 0.9536890356938218, + "grad_norm": 32.694644927978516, + "learning_rate": 6.5189800720415465e-09, + "logits/chosen": -1.7498209476470947, + "logits/rejected": -1.8394416570663452, + "logps/chosen": -2.193845272064209, + "logps/rejected": -2.44431734085083, + "loss": 1.7126, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.938451766967773, + "rewards/margins": 2.5047202110290527, + "rewards/rejected": -24.443172454833984, + "step": 28295 + }, + { + "epoch": 0.953857561764805, + "grad_norm": 19.778499603271484, + "learning_rate": 6.471723560230458e-09, + "logits/chosen": -1.4242563247680664, + "logits/rejected": -1.4941645860671997, + "logps/chosen": -2.2942872047424316, + "logps/rejected": -2.5150489807128906, + "loss": 2.0971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.942873001098633, + "rewards/margins": 2.2076144218444824, + "rewards/rejected": -25.150487899780273, + "step": 28300 + }, + { + "epoch": 0.9540260878357882, + "grad_norm": 167.14210510253906, + "learning_rate": 6.424637839111624e-09, + "logits/chosen": -1.7573789358139038, + "logits/rejected": -2.0239624977111816, + "logps/chosen": -3.1321041584014893, + "logps/rejected": -3.244879961013794, + "loss": 2.6003, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.3210391998291, + "rewards/margins": 1.127755880355835, + "rewards/rejected": -32.44879913330078, + "step": 28305 + }, + { + "epoch": 0.9541946139067714, + "grad_norm": 37.638282775878906, + "learning_rate": 6.3777229249795114e-09, + "logits/chosen": -2.3085360527038574, + "logits/rejected": -2.0975089073181152, + "logps/chosen": -2.0547664165496826, + "logps/rejected": -2.1216301918029785, + "loss": 4.1529, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -20.54766273498535, + "rewards/margins": 0.6686397790908813, + "rewards/rejected": -21.216304779052734, + "step": 28310 + }, + { + "epoch": 0.9543631399777546, + "grad_norm": 52.93343734741211, + "learning_rate": 6.330978834069578e-09, + "logits/chosen": -1.2301470041275024, + "logits/rejected": -1.3939309120178223, + "logps/chosen": -2.5767438411712646, + "logps/rejected": -2.3654720783233643, + "loss": 5.5037, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.767436981201172, + "rewards/margins": -2.112718105316162, + "rewards/rejected": -23.654720306396484, + "step": 28315 + }, + { + "epoch": 0.9545316660487377, + "grad_norm": 58.48306655883789, + "learning_rate": 6.284405582558106e-09, + "logits/chosen": -1.9192569255828857, + "logits/rejected": -2.261803150177002, + "logps/chosen": -2.3037891387939453, + "logps/rejected": -2.5604360103607178, + "loss": 3.1239, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.037891387939453, + "rewards/margins": 2.5664706230163574, + "rewards/rejected": -25.604360580444336, + "step": 28320 + }, + { + "epoch": 0.9547001921197209, + "grad_norm": 49.80613327026367, + "learning_rate": 6.2380031865622015e-09, + "logits/chosen": -2.0661492347717285, + "logits/rejected": -1.951005220413208, + "logps/chosen": -2.4906983375549316, + "logps/rejected": -3.1569619178771973, + "loss": 4.9785, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.906984329223633, + "rewards/margins": 6.662638187408447, + "rewards/rejected": -31.56962013244629, + "step": 28325 + }, + { + "epoch": 0.9548687181907041, + "grad_norm": 24.512651443481445, + "learning_rate": 6.191771662140022e-09, + "logits/chosen": -1.8547757863998413, + "logits/rejected": -2.033181667327881, + "logps/chosen": -2.024604320526123, + "logps/rejected": -2.131096601486206, + "loss": 2.4034, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.246042251586914, + "rewards/margins": 1.0649257898330688, + "rewards/rejected": -21.31096839904785, + "step": 28330 + }, + { + "epoch": 0.9550372442616872, + "grad_norm": 23.979116439819336, + "learning_rate": 6.145711025290323e-09, + "logits/chosen": -2.1622776985168457, + "logits/rejected": -2.1714184284210205, + "logps/chosen": -2.2897789478302, + "logps/rejected": -2.0312511920928955, + "loss": 5.7362, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.897790908813477, + "rewards/margins": -2.585280179977417, + "rewards/rejected": -20.312509536743164, + "step": 28335 + }, + { + "epoch": 0.9552057703326705, + "grad_norm": 26.935739517211914, + "learning_rate": 6.099821291952967e-09, + "logits/chosen": -2.1228065490722656, + "logits/rejected": -1.8896598815917969, + "logps/chosen": -2.216864824295044, + "logps/rejected": -2.186297655105591, + "loss": 3.8784, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.16864585876465, + "rewards/margins": -0.30566978454589844, + "rewards/rejected": -21.86297607421875, + "step": 28340 + }, + { + "epoch": 0.9553742964036537, + "grad_norm": 42.45960235595703, + "learning_rate": 6.0541024780085824e-09, + "logits/chosen": -1.6388975381851196, + "logits/rejected": -1.9698559045791626, + "logps/chosen": -2.4808647632598877, + "logps/rejected": -2.721128225326538, + "loss": 2.4064, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.80864906311035, + "rewards/margins": 2.4026341438293457, + "rewards/rejected": -27.21128273010254, + "step": 28345 + }, + { + "epoch": 0.9555428224746368, + "grad_norm": 39.344261169433594, + "learning_rate": 6.008554599278681e-09, + "logits/chosen": -1.4452064037322998, + "logits/rejected": -1.2238776683807373, + "logps/chosen": -2.0691027641296387, + "logps/rejected": -2.0181994438171387, + "loss": 4.1986, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -20.691028594970703, + "rewards/margins": -0.5090330839157104, + "rewards/rejected": -20.181995391845703, + "step": 28350 + }, + { + "epoch": 0.95571134854562, + "grad_norm": 25.936553955078125, + "learning_rate": 5.9631776715254876e-09, + "logits/chosen": -1.562342643737793, + "logits/rejected": -1.6943756341934204, + "logps/chosen": -1.931775450706482, + "logps/rejected": -2.0938315391540527, + "loss": 2.2773, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.3177547454834, + "rewards/margins": 1.6205610036849976, + "rewards/rejected": -20.938316345214844, + "step": 28355 + }, + { + "epoch": 0.9558798746166032, + "grad_norm": 18.464405059814453, + "learning_rate": 5.917971710452274e-09, + "logits/chosen": -1.4668428897857666, + "logits/rejected": -1.9650462865829468, + "logps/chosen": -2.674292802810669, + "logps/rejected": -3.530560255050659, + "loss": 1.4807, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -26.742929458618164, + "rewards/margins": 8.562676429748535, + "rewards/rejected": -35.30560302734375, + "step": 28360 + }, + { + "epoch": 0.9560484006875863, + "grad_norm": 20.47051239013672, + "learning_rate": 5.872936731702971e-09, + "logits/chosen": -2.486109972000122, + "logits/rejected": -2.605259656906128, + "logps/chosen": -2.237062692642212, + "logps/rejected": -2.565539598464966, + "loss": 1.8784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.37062644958496, + "rewards/margins": 3.2847702503204346, + "rewards/rejected": -25.6553955078125, + "step": 28365 + }, + { + "epoch": 0.9562169267585695, + "grad_norm": 96.48336791992188, + "learning_rate": 5.828072750862445e-09, + "logits/chosen": -1.6551952362060547, + "logits/rejected": -1.9391210079193115, + "logps/chosen": -2.6091928482055664, + "logps/rejected": -2.7079176902770996, + "loss": 3.4794, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.091930389404297, + "rewards/margins": 0.9872471690177917, + "rewards/rejected": -27.079174041748047, + "step": 28370 + }, + { + "epoch": 0.9563854528295528, + "grad_norm": 13.303318977355957, + "learning_rate": 5.783379783456332e-09, + "logits/chosen": -1.881603479385376, + "logits/rejected": -2.7143008708953857, + "logps/chosen": -2.1637027263641357, + "logps/rejected": -3.5254616737365723, + "loss": 1.3403, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.637027740478516, + "rewards/margins": 13.617584228515625, + "rewards/rejected": -35.254615783691406, + "step": 28375 + }, + { + "epoch": 0.956553978900536, + "grad_norm": 0.08836426585912704, + "learning_rate": 5.738857844951095e-09, + "logits/chosen": -1.431921124458313, + "logits/rejected": -1.5725934505462646, + "logps/chosen": -2.3832743167877197, + "logps/rejected": -2.58109188079834, + "loss": 2.9704, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.832740783691406, + "rewards/margins": 1.9781758785247803, + "rewards/rejected": -25.8109188079834, + "step": 28380 + }, + { + "epoch": 0.9567225049715191, + "grad_norm": 21.63187026977539, + "learning_rate": 5.69450695075413e-09, + "logits/chosen": -1.9501020908355713, + "logits/rejected": -2.476367712020874, + "logps/chosen": -2.1850900650024414, + "logps/rejected": -2.6595330238342285, + "loss": 1.6813, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.850900650024414, + "rewards/margins": 4.7444257736206055, + "rewards/rejected": -26.595327377319336, + "step": 28385 + }, + { + "epoch": 0.9568910310425023, + "grad_norm": 26.18231964111328, + "learning_rate": 5.650327116213383e-09, + "logits/chosen": -1.7134357690811157, + "logits/rejected": -2.0885303020477295, + "logps/chosen": -2.298783540725708, + "logps/rejected": -2.608452558517456, + "loss": 3.0266, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.987834930419922, + "rewards/margins": 3.0966885089874268, + "rewards/rejected": -26.084524154663086, + "step": 28390 + }, + { + "epoch": 0.9570595571134854, + "grad_norm": 30.45747184753418, + "learning_rate": 5.6063183566177894e-09, + "logits/chosen": -2.2303929328918457, + "logits/rejected": -2.2387430667877197, + "logps/chosen": -2.4746217727661133, + "logps/rejected": -2.5235071182250977, + "loss": 2.9599, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.7462158203125, + "rewards/margins": 0.48885470628738403, + "rewards/rejected": -25.235071182250977, + "step": 28395 + }, + { + "epoch": 0.9572280831844686, + "grad_norm": 25.11550521850586, + "learning_rate": 5.562480687197169e-09, + "logits/chosen": -1.8280874490737915, + "logits/rejected": -2.0225062370300293, + "logps/chosen": -2.3772523403167725, + "logps/rejected": -2.5516517162323, + "loss": 2.5931, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.772525787353516, + "rewards/margins": 1.7439934015274048, + "rewards/rejected": -25.516517639160156, + "step": 28400 + }, + { + "epoch": 0.9572280831844686, + "eval_logits/chosen": -2.3117282390594482, + "eval_logits/rejected": -2.4906811714172363, + "eval_logps/chosen": -2.2893803119659424, + "eval_logps/rejected": -2.444187641143799, + "eval_loss": 3.087453603744507, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.8938045501709, + "eval_rewards/margins": 1.5480728149414062, + "eval_rewards/rejected": -24.441877365112305, + "eval_runtime": 12.8967, + "eval_samples_per_second": 7.754, + "eval_steps_per_second": 1.938, + "step": 28400 + }, + { + "epoch": 0.9573966092554518, + "grad_norm": 38.26020812988281, + "learning_rate": 5.518814123121884e-09, + "logits/chosen": -1.7564789056777954, + "logits/rejected": -2.1984901428222656, + "logps/chosen": -2.915069341659546, + "logps/rejected": -3.5751430988311768, + "loss": 2.0598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.15069580078125, + "rewards/margins": 6.60073709487915, + "rewards/rejected": -35.75143051147461, + "step": 28405 + }, + { + "epoch": 0.957565135326435, + "grad_norm": 22.221555709838867, + "learning_rate": 5.475318679503238e-09, + "logits/chosen": -1.768252968788147, + "logits/rejected": -2.038252353668213, + "logps/chosen": -2.356065034866333, + "logps/rejected": -2.307173252105713, + "loss": 3.9019, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.560649871826172, + "rewards/margins": -0.4889196455478668, + "rewards/rejected": -23.07172966003418, + "step": 28410 + }, + { + "epoch": 0.9577336613974182, + "grad_norm": 33.3132438659668, + "learning_rate": 5.4319943713933e-09, + "logits/chosen": -2.22514009475708, + "logits/rejected": -2.325124979019165, + "logps/chosen": -2.7837955951690674, + "logps/rejected": -3.3888633251190186, + "loss": 1.191, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -27.837955474853516, + "rewards/margins": 6.050678253173828, + "rewards/rejected": -33.888633728027344, + "step": 28415 + }, + { + "epoch": 0.9579021874684014, + "grad_norm": 0.19005419313907623, + "learning_rate": 5.388841213784911e-09, + "logits/chosen": -1.3929895162582397, + "logits/rejected": -1.7694988250732422, + "logps/chosen": -2.0488882064819336, + "logps/rejected": -2.4571712017059326, + "loss": 1.7978, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.48888397216797, + "rewards/margins": 4.082829475402832, + "rewards/rejected": -24.571712493896484, + "step": 28420 + }, + { + "epoch": 0.9580707135393846, + "grad_norm": 56.7274169921875, + "learning_rate": 5.345859221611626e-09, + "logits/chosen": -2.0072145462036133, + "logits/rejected": -1.8055555820465088, + "logps/chosen": -3.5600218772888184, + "logps/rejected": -3.5715057849884033, + "loss": 4.3241, + "rewards/accuracies": 0.5, + "rewards/chosen": -35.600215911865234, + "rewards/margins": 0.11484356224536896, + "rewards/rejected": -35.71506118774414, + "step": 28425 + }, + { + "epoch": 0.9582392396103677, + "grad_norm": 50.16498565673828, + "learning_rate": 5.30304840974799e-09, + "logits/chosen": -1.5627635717391968, + "logits/rejected": -1.7483123540878296, + "logps/chosen": -2.061707019805908, + "logps/rejected": -2.066906452178955, + "loss": 3.4939, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.6170711517334, + "rewards/margins": 0.05199117586016655, + "rewards/rejected": -20.669063568115234, + "step": 28430 + }, + { + "epoch": 0.9584077656813509, + "grad_norm": 153.0499267578125, + "learning_rate": 5.26040879300893e-09, + "logits/chosen": -1.2752379179000854, + "logits/rejected": -1.482933759689331, + "logps/chosen": -2.8088629245758057, + "logps/rejected": -3.7246298789978027, + "loss": 4.4737, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.0886287689209, + "rewards/margins": 9.157669067382812, + "rewards/rejected": -37.246299743652344, + "step": 28435 + }, + { + "epoch": 0.958576291752334, + "grad_norm": 13.519747734069824, + "learning_rate": 5.2179403861504215e-09, + "logits/chosen": -1.7635982036590576, + "logits/rejected": -1.942718267440796, + "logps/chosen": -2.7124428749084473, + "logps/rejected": -3.15181303024292, + "loss": 2.2837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.12442970275879, + "rewards/margins": 4.39370059967041, + "rewards/rejected": -31.518131256103516, + "step": 28440 + }, + { + "epoch": 0.9587448178233172, + "grad_norm": 17.492719650268555, + "learning_rate": 5.175643203869151e-09, + "logits/chosen": -1.8395544290542603, + "logits/rejected": -2.241781711578369, + "logps/chosen": -2.3206355571746826, + "logps/rejected": -2.714784622192383, + "loss": 2.0138, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.206356048583984, + "rewards/margins": 3.9414877891540527, + "rewards/rejected": -27.147846221923828, + "step": 28445 + }, + { + "epoch": 0.9589133438943005, + "grad_norm": 3.5783886909484863, + "learning_rate": 5.133517260802411e-09, + "logits/chosen": -2.2439727783203125, + "logits/rejected": -2.1783652305603027, + "logps/chosen": -3.2132935523986816, + "logps/rejected": -3.4586052894592285, + "loss": 3.8842, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.1329345703125, + "rewards/margins": 2.453115940093994, + "rewards/rejected": -34.5860481262207, + "step": 28450 + }, + { + "epoch": 0.9590818699652837, + "grad_norm": 31.823118209838867, + "learning_rate": 5.091562571528485e-09, + "logits/chosen": -2.086142063140869, + "logits/rejected": -2.2255921363830566, + "logps/chosen": -2.3477437496185303, + "logps/rejected": -2.6771368980407715, + "loss": 2.753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.47743797302246, + "rewards/margins": 3.293931484222412, + "rewards/rejected": -26.7713680267334, + "step": 28455 + }, + { + "epoch": 0.9592503960362668, + "grad_norm": 33.51295852661133, + "learning_rate": 5.049779150566036e-09, + "logits/chosen": -1.6997692584991455, + "logits/rejected": -2.0853042602539062, + "logps/chosen": -2.2463183403015137, + "logps/rejected": -3.2169253826141357, + "loss": 2.017, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.463180541992188, + "rewards/margins": 9.706072807312012, + "rewards/rejected": -32.169254302978516, + "step": 28460 + }, + { + "epoch": 0.95941892210725, + "grad_norm": 19.38419532775879, + "learning_rate": 5.008167012374831e-09, + "logits/chosen": -1.8482658863067627, + "logits/rejected": -2.0040698051452637, + "logps/chosen": -3.1203415393829346, + "logps/rejected": -3.2832634449005127, + "loss": 2.8735, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.203411102294922, + "rewards/margins": 1.6292169094085693, + "rewards/rejected": -32.8326301574707, + "step": 28465 + }, + { + "epoch": 0.9595874481782332, + "grad_norm": 28.08884620666504, + "learning_rate": 4.966726171355129e-09, + "logits/chosen": -1.5639275312423706, + "logits/rejected": -1.9470984935760498, + "logps/chosen": -2.2116024494171143, + "logps/rejected": -2.476243734359741, + "loss": 2.7705, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.116024017333984, + "rewards/margins": 2.646414279937744, + "rewards/rejected": -24.76243782043457, + "step": 28470 + }, + { + "epoch": 0.9597559742492163, + "grad_norm": 31.719369888305664, + "learning_rate": 4.925456641847903e-09, + "logits/chosen": -1.650435209274292, + "logits/rejected": -1.4346725940704346, + "logps/chosen": -2.046463966369629, + "logps/rejected": -2.1387128829956055, + "loss": 2.9857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.46463966369629, + "rewards/margins": 0.9224891662597656, + "rewards/rejected": -21.387126922607422, + "step": 28475 + }, + { + "epoch": 0.9599245003201995, + "grad_norm": 25.16029930114746, + "learning_rate": 4.884358438135006e-09, + "logits/chosen": -1.7380361557006836, + "logits/rejected": -2.3135714530944824, + "logps/chosen": -1.9699513912200928, + "logps/rejected": -2.640869140625, + "loss": 2.9684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.699514389038086, + "rewards/margins": 6.709176540374756, + "rewards/rejected": -26.40869140625, + "step": 28480 + }, + { + "epoch": 0.9600930263911828, + "grad_norm": 17.687870025634766, + "learning_rate": 4.843431574438839e-09, + "logits/chosen": -2.181513547897339, + "logits/rejected": -2.259943962097168, + "logps/chosen": -2.7528865337371826, + "logps/rejected": -3.143580198287964, + "loss": 4.391, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.52886390686035, + "rewards/margins": 3.906938076019287, + "rewards/rejected": -31.435800552368164, + "step": 28485 + }, + { + "epoch": 0.9602615524621659, + "grad_norm": 170.2515869140625, + "learning_rate": 4.802676064922684e-09, + "logits/chosen": -1.9429032802581787, + "logits/rejected": -1.7839523553848267, + "logps/chosen": -3.687286853790283, + "logps/rejected": -3.8839850425720215, + "loss": 4.0808, + "rewards/accuracies": 0.5, + "rewards/chosen": -36.872867584228516, + "rewards/margins": 1.9669809341430664, + "rewards/rejected": -38.83985137939453, + "step": 28490 + }, + { + "epoch": 0.9604300785331491, + "grad_norm": 29.234834671020508, + "learning_rate": 4.762091923690315e-09, + "logits/chosen": -1.4997103214263916, + "logits/rejected": -1.640759825706482, + "logps/chosen": -3.094761371612549, + "logps/rejected": -2.9256839752197266, + "loss": 5.9159, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -30.94761085510254, + "rewards/margins": -1.6907707452774048, + "rewards/rejected": -29.256839752197266, + "step": 28495 + }, + { + "epoch": 0.9605986046041323, + "grad_norm": 48.139503479003906, + "learning_rate": 4.721679164786329e-09, + "logits/chosen": -1.8838342428207397, + "logits/rejected": -1.9751946926116943, + "logps/chosen": -2.078104257583618, + "logps/rejected": -2.1125640869140625, + "loss": 2.8742, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.781042098999023, + "rewards/margins": 0.3445958197116852, + "rewards/rejected": -21.125638961791992, + "step": 28500 + }, + { + "epoch": 0.9607671306751154, + "grad_norm": 52.75870132446289, + "learning_rate": 4.681437802196042e-09, + "logits/chosen": -1.1743929386138916, + "logits/rejected": -1.440617322921753, + "logps/chosen": -2.2910919189453125, + "logps/rejected": -2.7413864135742188, + "loss": 1.6422, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.910917282104492, + "rewards/margins": 4.502947807312012, + "rewards/rejected": -27.413867950439453, + "step": 28505 + }, + { + "epoch": 0.9609356567460986, + "grad_norm": 81.56078338623047, + "learning_rate": 4.641367849845312e-09, + "logits/chosen": -2.3156750202178955, + "logits/rejected": -2.689465045928955, + "logps/chosen": -2.853290557861328, + "logps/rejected": -3.101311683654785, + "loss": 3.3697, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.53290367126465, + "rewards/margins": 2.480211019515991, + "rewards/rejected": -31.01311683654785, + "step": 28510 + }, + { + "epoch": 0.9611041828170818, + "grad_norm": 6.894532680511475, + "learning_rate": 4.601469321600826e-09, + "logits/chosen": -1.5325663089752197, + "logits/rejected": -1.9350004196166992, + "logps/chosen": -2.1749038696289062, + "logps/rejected": -3.0541293621063232, + "loss": 2.1314, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.749038696289062, + "rewards/margins": 8.792255401611328, + "rewards/rejected": -30.541296005249023, + "step": 28515 + }, + { + "epoch": 0.9612727088880649, + "grad_norm": 34.375911712646484, + "learning_rate": 4.561742231269872e-09, + "logits/chosen": -1.6561189889907837, + "logits/rejected": -1.647509217262268, + "logps/chosen": -1.992790937423706, + "logps/rejected": -2.0441665649414062, + "loss": 3.1003, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -19.92790985107422, + "rewards/margins": 0.5137545466423035, + "rewards/rejected": -20.441665649414062, + "step": 28520 + }, + { + "epoch": 0.9614412349590482, + "grad_norm": 56.41557693481445, + "learning_rate": 4.522186592600452e-09, + "logits/chosen": -2.0667316913604736, + "logits/rejected": -2.038256883621216, + "logps/chosen": -2.5181336402893066, + "logps/rejected": -2.4041855335235596, + "loss": 4.1961, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.18133544921875, + "rewards/margins": -1.139478325843811, + "rewards/rejected": -24.041854858398438, + "step": 28525 + }, + { + "epoch": 0.9616097610300314, + "grad_norm": 29.580322265625, + "learning_rate": 4.482802419281229e-09, + "logits/chosen": -1.4627254009246826, + "logits/rejected": -1.1754380464553833, + "logps/chosen": -1.9528896808624268, + "logps/rejected": -1.9384548664093018, + "loss": 3.9845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.52889633178711, + "rewards/margins": -0.14434775710105896, + "rewards/rejected": -19.38454818725586, + "step": 28530 + }, + { + "epoch": 0.9617782871010145, + "grad_norm": 56.96183776855469, + "learning_rate": 4.443589724941466e-09, + "logits/chosen": -1.8529266119003296, + "logits/rejected": -2.2691307067871094, + "logps/chosen": -2.944936513900757, + "logps/rejected": -3.49541974067688, + "loss": 1.8121, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.449365615844727, + "rewards/margins": 5.504830837249756, + "rewards/rejected": -34.95419692993164, + "step": 28535 + }, + { + "epoch": 0.9619468131719977, + "grad_norm": 22.926856994628906, + "learning_rate": 4.404548523151197e-09, + "logits/chosen": -2.0277888774871826, + "logits/rejected": -2.1954684257507324, + "logps/chosen": -2.326841354370117, + "logps/rejected": -2.5182197093963623, + "loss": 2.5164, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.268413543701172, + "rewards/margins": 1.9137840270996094, + "rewards/rejected": -25.18219566345215, + "step": 28540 + }, + { + "epoch": 0.9621153392429809, + "grad_norm": 224.8534393310547, + "learning_rate": 4.365678827420949e-09, + "logits/chosen": -1.8096294403076172, + "logits/rejected": -2.36387300491333, + "logps/chosen": -2.040325164794922, + "logps/rejected": -2.2415404319763184, + "loss": 3.5418, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.40325164794922, + "rewards/margins": 2.0121548175811768, + "rewards/rejected": -22.4154052734375, + "step": 28545 + }, + { + "epoch": 0.962283865313964, + "grad_norm": 60.661651611328125, + "learning_rate": 4.326980651202072e-09, + "logits/chosen": -2.061427354812622, + "logits/rejected": -2.287824869155884, + "logps/chosen": -2.8698368072509766, + "logps/rejected": -3.1925554275512695, + "loss": 1.8822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.6983642578125, + "rewards/margins": 3.227189540863037, + "rewards/rejected": -31.925556182861328, + "step": 28550 + }, + { + "epoch": 0.9624523913849472, + "grad_norm": 38.10822677612305, + "learning_rate": 4.28845400788641e-09, + "logits/chosen": -1.6126525402069092, + "logits/rejected": -1.8027045726776123, + "logps/chosen": -3.1499922275543213, + "logps/rejected": -3.1586296558380127, + "loss": 5.7095, + "rewards/accuracies": 0.5, + "rewards/chosen": -31.499919891357422, + "rewards/margins": 0.08637352287769318, + "rewards/rejected": -31.5862979888916, + "step": 28555 + }, + { + "epoch": 0.9626209174559305, + "grad_norm": 28.21145248413086, + "learning_rate": 4.250098910806632e-09, + "logits/chosen": -2.2559456825256348, + "logits/rejected": -2.324834108352661, + "logps/chosen": -2.5185751914978027, + "logps/rejected": -2.8438072204589844, + "loss": 3.1164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.18575096130371, + "rewards/margins": 3.2523179054260254, + "rewards/rejected": -28.43807029724121, + "step": 28560 + }, + { + "epoch": 0.9627894435269136, + "grad_norm": 24.09324073791504, + "learning_rate": 4.211915373235841e-09, + "logits/chosen": -1.9755699634552002, + "logits/rejected": -1.9022626876831055, + "logps/chosen": -2.221282720565796, + "logps/rejected": -2.3595638275146484, + "loss": 3.3347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.21282958984375, + "rewards/margins": 1.3828084468841553, + "rewards/rejected": -23.59563636779785, + "step": 28565 + }, + { + "epoch": 0.9629579695978968, + "grad_norm": 32.821205139160156, + "learning_rate": 4.173903408387802e-09, + "logits/chosen": -2.4099087715148926, + "logits/rejected": -2.6462645530700684, + "logps/chosen": -2.4751827716827393, + "logps/rejected": -2.603869915008545, + "loss": 2.7741, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.7518310546875, + "rewards/margins": 1.2868703603744507, + "rewards/rejected": -26.0387020111084, + "step": 28570 + }, + { + "epoch": 0.96312649566888, + "grad_norm": 38.95467758178711, + "learning_rate": 4.136063029417103e-09, + "logits/chosen": -1.8634698390960693, + "logits/rejected": -2.089498996734619, + "logps/chosen": -1.9672338962554932, + "logps/rejected": -2.2207417488098145, + "loss": 2.5031, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.672338485717773, + "rewards/margins": 2.5350780487060547, + "rewards/rejected": -22.207416534423828, + "step": 28575 + }, + { + "epoch": 0.9632950217398631, + "grad_norm": 33.86354446411133, + "learning_rate": 4.098394249418657e-09, + "logits/chosen": -1.2706377506256104, + "logits/rejected": -1.239839792251587, + "logps/chosen": -1.8439422845840454, + "logps/rejected": -1.8992751836776733, + "loss": 2.6658, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.439422607421875, + "rewards/margins": 0.5533312559127808, + "rewards/rejected": -18.992752075195312, + "step": 28580 + }, + { + "epoch": 0.9634635478108463, + "grad_norm": 31.960468292236328, + "learning_rate": 4.06089708142826e-09, + "logits/chosen": -1.8972434997558594, + "logits/rejected": -2.0411770343780518, + "logps/chosen": -2.1596405506134033, + "logps/rejected": -2.309891939163208, + "loss": 2.1328, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.596405029296875, + "rewards/margins": 1.5025140047073364, + "rewards/rejected": -23.098918914794922, + "step": 28585 + }, + { + "epoch": 0.9636320738818295, + "grad_norm": 15.974693298339844, + "learning_rate": 4.023571538422199e-09, + "logits/chosen": -1.7587133646011353, + "logits/rejected": -1.7669118642807007, + "logps/chosen": -2.5280303955078125, + "logps/rejected": -2.8322720527648926, + "loss": 1.4144, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.280302047729492, + "rewards/margins": 3.0424180030822754, + "rewards/rejected": -28.32271957397461, + "step": 28590 + }, + { + "epoch": 0.9638005999528128, + "grad_norm": 12.441394805908203, + "learning_rate": 3.986417633317307e-09, + "logits/chosen": -1.434709906578064, + "logits/rejected": -1.4066721200942993, + "logps/chosen": -2.8106179237365723, + "logps/rejected": -2.5945558547973633, + "loss": 5.7322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.106182098388672, + "rewards/margins": -2.1606221199035645, + "rewards/rejected": -25.945560455322266, + "step": 28595 + }, + { + "epoch": 0.9639691260237959, + "grad_norm": 55.572017669677734, + "learning_rate": 3.949435378971078e-09, + "logits/chosen": -2.209183692932129, + "logits/rejected": -2.1617417335510254, + "logps/chosen": -2.475675106048584, + "logps/rejected": -2.499035358428955, + "loss": 3.7538, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.75674819946289, + "rewards/margins": 0.2336008995771408, + "rewards/rejected": -24.9903507232666, + "step": 28600 + }, + { + "epoch": 0.9641376520947791, + "grad_norm": 26.91092872619629, + "learning_rate": 3.912624788181718e-09, + "logits/chosen": -1.8822380304336548, + "logits/rejected": -2.017254590988159, + "logps/chosen": -2.8590264320373535, + "logps/rejected": -3.2963809967041016, + "loss": 1.8247, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.59026527404785, + "rewards/margins": 4.373543739318848, + "rewards/rejected": -32.96380615234375, + "step": 28605 + }, + { + "epoch": 0.9643061781657623, + "grad_norm": 26.15085220336914, + "learning_rate": 3.875985873687815e-09, + "logits/chosen": -1.9373159408569336, + "logits/rejected": -2.321394443511963, + "logps/chosen": -2.242321491241455, + "logps/rejected": -3.103455066680908, + "loss": 2.3169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.423213958740234, + "rewards/margins": 8.611337661743164, + "rewards/rejected": -31.0345516204834, + "step": 28610 + }, + { + "epoch": 0.9644747042367454, + "grad_norm": 37.95960235595703, + "learning_rate": 3.839518648168727e-09, + "logits/chosen": -1.6489719152450562, + "logits/rejected": -1.6028999090194702, + "logps/chosen": -2.2515532970428467, + "logps/rejected": -2.245664119720459, + "loss": 3.6217, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.515533447265625, + "rewards/margins": -0.05889282375574112, + "rewards/rejected": -22.456640243530273, + "step": 28615 + }, + { + "epoch": 0.9646432303077286, + "grad_norm": 19.891639709472656, + "learning_rate": 3.803223124244248e-09, + "logits/chosen": -1.575360894203186, + "logits/rejected": -1.8237911462783813, + "logps/chosen": -2.147665500640869, + "logps/rejected": -2.6151740550994873, + "loss": 2.6801, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.476654052734375, + "rewards/margins": 4.675088405609131, + "rewards/rejected": -26.151742935180664, + "step": 28620 + }, + { + "epoch": 0.9648117563787117, + "grad_norm": 59.24190902709961, + "learning_rate": 3.767099314474887e-09, + "logits/chosen": -2.1034979820251465, + "logits/rejected": -2.959904670715332, + "logps/chosen": -3.063356637954712, + "logps/rejected": -5.073480129241943, + "loss": 3.1896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.633569717407227, + "rewards/margins": 20.10123062133789, + "rewards/rejected": -50.73479461669922, + "step": 28625 + }, + { + "epoch": 0.9649802824496949, + "grad_norm": 19.20398712158203, + "learning_rate": 3.731147231361698e-09, + "logits/chosen": -1.7048972845077515, + "logits/rejected": -1.8632335662841797, + "logps/chosen": -2.008641481399536, + "logps/rejected": -2.1975979804992676, + "loss": 1.9952, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.086414337158203, + "rewards/margins": 1.8895690441131592, + "rewards/rejected": -21.975982666015625, + "step": 28630 + }, + { + "epoch": 0.9651488085206782, + "grad_norm": 31.66425895690918, + "learning_rate": 3.695366887346174e-09, + "logits/chosen": -1.8439382314682007, + "logits/rejected": -2.2968287467956543, + "logps/chosen": -1.9383208751678467, + "logps/rejected": -2.144949436187744, + "loss": 3.0275, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.383209228515625, + "rewards/margins": 2.066281795501709, + "rewards/rejected": -21.44948959350586, + "step": 28635 + }, + { + "epoch": 0.9653173345916614, + "grad_norm": 39.83387756347656, + "learning_rate": 3.6597582948105774e-09, + "logits/chosen": -1.7409656047821045, + "logits/rejected": -1.9152864217758179, + "logps/chosen": -2.375166654586792, + "logps/rejected": -2.440739393234253, + "loss": 3.1421, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.75166893005371, + "rewards/margins": 0.6557281613349915, + "rewards/rejected": -24.407394409179688, + "step": 28640 + }, + { + "epoch": 0.9654858606626445, + "grad_norm": 90.26930236816406, + "learning_rate": 3.624321466077662e-09, + "logits/chosen": -1.8725887537002563, + "logits/rejected": -2.0449306964874268, + "logps/chosen": -3.7120277881622314, + "logps/rejected": -3.5196595191955566, + "loss": 6.2949, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -37.120277404785156, + "rewards/margins": -1.9236809015274048, + "rewards/rejected": -35.19659423828125, + "step": 28645 + }, + { + "epoch": 0.9656543867336277, + "grad_norm": 33.77092361450195, + "learning_rate": 3.589056413410563e-09, + "logits/chosen": -1.4964176416397095, + "logits/rejected": -1.619513750076294, + "logps/chosen": -1.9811350107192993, + "logps/rejected": -2.0898003578186035, + "loss": 2.4862, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.811349868774414, + "rewards/margins": 1.0866527557373047, + "rewards/rejected": -20.898000717163086, + "step": 28650 + }, + { + "epoch": 0.9658229128046109, + "grad_norm": 25.501209259033203, + "learning_rate": 3.553963149013295e-09, + "logits/chosen": -1.4640041589736938, + "logits/rejected": -1.5587005615234375, + "logps/chosen": -2.191490650177002, + "logps/rejected": -2.5473177433013916, + "loss": 1.2411, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.914907455444336, + "rewards/margins": 3.558271884918213, + "rewards/rejected": -25.47317886352539, + "step": 28655 + }, + { + "epoch": 0.965991438875594, + "grad_norm": 25.410991668701172, + "learning_rate": 3.5190416850301998e-09, + "logits/chosen": -1.6539732217788696, + "logits/rejected": -1.6497859954833984, + "logps/chosen": -1.9782453775405884, + "logps/rejected": -2.0479655265808105, + "loss": 2.602, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.782453536987305, + "rewards/margins": 0.697202205657959, + "rewards/rejected": -20.479656219482422, + "step": 28660 + }, + { + "epoch": 0.9661599649465772, + "grad_norm": 18.554931640625, + "learning_rate": 3.484292033546166e-09, + "logits/chosen": -1.9503523111343384, + "logits/rejected": -2.0218005180358887, + "logps/chosen": -2.2136752605438232, + "logps/rejected": -2.3350918292999268, + "loss": 2.4669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.13675308227539, + "rewards/margins": 1.2141668796539307, + "rewards/rejected": -23.35091781616211, + "step": 28665 + }, + { + "epoch": 0.9663284910175605, + "grad_norm": 29.587387084960938, + "learning_rate": 3.44971420658674e-09, + "logits/chosen": -2.254612684249878, + "logits/rejected": -2.438527822494507, + "logps/chosen": -1.7682807445526123, + "logps/rejected": -2.1252379417419434, + "loss": 2.1373, + "rewards/accuracies": 0.5, + "rewards/chosen": -17.68280601501465, + "rewards/margins": 3.569573163986206, + "rewards/rejected": -21.25238037109375, + "step": 28670 + }, + { + "epoch": 0.9664970170885436, + "grad_norm": 41.48356628417969, + "learning_rate": 3.415308216117907e-09, + "logits/chosen": -2.4520583152770996, + "logits/rejected": -2.6164937019348145, + "logps/chosen": -3.1903624534606934, + "logps/rejected": -3.697064161300659, + "loss": 1.6011, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.90362548828125, + "rewards/margins": 5.067018985748291, + "rewards/rejected": -36.970645904541016, + "step": 28675 + }, + { + "epoch": 0.9666655431595268, + "grad_norm": 29.71332359313965, + "learning_rate": 3.3810740740463086e-09, + "logits/chosen": -1.7559601068496704, + "logits/rejected": -1.8875644207000732, + "logps/chosen": -2.5920846462249756, + "logps/rejected": -2.9399256706237793, + "loss": 2.3224, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.920846939086914, + "rewards/margins": 3.4784131050109863, + "rewards/rejected": -29.39925765991211, + "step": 28680 + }, + { + "epoch": 0.96683406923051, + "grad_norm": 21.529844284057617, + "learning_rate": 3.3470117922189123e-09, + "logits/chosen": -1.950628638267517, + "logits/rejected": -2.0394814014434814, + "logps/chosen": -2.423513412475586, + "logps/rejected": -2.692171096801758, + "loss": 1.7308, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.23513412475586, + "rewards/margins": 2.6865768432617188, + "rewards/rejected": -26.921710968017578, + "step": 28685 + }, + { + "epoch": 0.9670025953014931, + "grad_norm": 113.19538879394531, + "learning_rate": 3.3131213824234007e-09, + "logits/chosen": -1.4651386737823486, + "logits/rejected": -1.5366795063018799, + "logps/chosen": -2.404541492462158, + "logps/rejected": -2.637510299682617, + "loss": 2.2841, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.0454158782959, + "rewards/margins": 2.3296852111816406, + "rewards/rejected": -26.37510108947754, + "step": 28690 + }, + { + "epoch": 0.9671711213724763, + "grad_norm": 50.14772033691406, + "learning_rate": 3.2794028563878917e-09, + "logits/chosen": -1.2284538745880127, + "logits/rejected": -1.4384486675262451, + "logps/chosen": -2.580862522125244, + "logps/rejected": -2.5931923389434814, + "loss": 3.6783, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.808624267578125, + "rewards/margins": 0.1232973113656044, + "rewards/rejected": -25.93192481994629, + "step": 28695 + }, + { + "epoch": 0.9673396474434595, + "grad_norm": 31.067855834960938, + "learning_rate": 3.245856225781052e-09, + "logits/chosen": -2.0800235271453857, + "logits/rejected": -2.189945936203003, + "logps/chosen": -2.929348945617676, + "logps/rejected": -3.384255886077881, + "loss": 2.2898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.293487548828125, + "rewards/margins": 4.549074649810791, + "rewards/rejected": -33.84256362915039, + "step": 28700 + }, + { + "epoch": 0.9675081735144427, + "grad_norm": 37.61033248901367, + "learning_rate": 3.2124815022120387e-09, + "logits/chosen": -2.1509222984313965, + "logits/rejected": -2.2263705730438232, + "logps/chosen": -2.11362361907959, + "logps/rejected": -2.359367609024048, + "loss": 2.7692, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.13623809814453, + "rewards/margins": 2.457437038421631, + "rewards/rejected": -23.593673706054688, + "step": 28705 + }, + { + "epoch": 0.9676766995854259, + "grad_norm": 31.55821418762207, + "learning_rate": 3.179278697230503e-09, + "logits/chosen": -2.073500156402588, + "logits/rejected": -2.0467491149902344, + "logps/chosen": -2.6881165504455566, + "logps/rejected": -2.769103527069092, + "loss": 3.0882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.88116455078125, + "rewards/margins": 0.8098711967468262, + "rewards/rejected": -27.691036224365234, + "step": 28710 + }, + { + "epoch": 0.9678452256564091, + "grad_norm": 29.740568161010742, + "learning_rate": 3.1462478223266975e-09, + "logits/chosen": -2.016167640686035, + "logits/rejected": -2.078772783279419, + "logps/chosen": -2.57477068901062, + "logps/rejected": -2.529498338699341, + "loss": 4.3984, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -25.747705459594727, + "rewards/margins": -0.45272406935691833, + "rewards/rejected": -25.29498291015625, + "step": 28715 + }, + { + "epoch": 0.9680137517273922, + "grad_norm": 98.16922760009766, + "learning_rate": 3.1133888889312565e-09, + "logits/chosen": -2.303128242492676, + "logits/rejected": -2.5696189403533936, + "logps/chosen": -2.2590153217315674, + "logps/rejected": -2.7877578735351562, + "loss": 2.3962, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.590152740478516, + "rewards/margins": 5.287428855895996, + "rewards/rejected": -27.877578735351562, + "step": 28720 + }, + { + "epoch": 0.9681822777983754, + "grad_norm": 15.801095008850098, + "learning_rate": 3.0807019084153618e-09, + "logits/chosen": -1.862961769104004, + "logits/rejected": -2.02888822555542, + "logps/chosen": -1.8569800853729248, + "logps/rejected": -2.0848309993743896, + "loss": 3.0561, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.569801330566406, + "rewards/margins": 2.2785086631774902, + "rewards/rejected": -20.848312377929688, + "step": 28725 + }, + { + "epoch": 0.9683508038693586, + "grad_norm": 54.87903594970703, + "learning_rate": 3.0481868920906874e-09, + "logits/chosen": -1.3197087049484253, + "logits/rejected": -1.2992570400238037, + "logps/chosen": -2.3507628440856934, + "logps/rejected": -2.3117618560791016, + "loss": 3.8642, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.507625579833984, + "rewards/margins": -0.3900091052055359, + "rewards/rejected": -23.117618560791016, + "step": 28730 + }, + { + "epoch": 0.9685193299403417, + "grad_norm": 109.8236083984375, + "learning_rate": 3.0158438512093986e-09, + "logits/chosen": -2.0474331378936768, + "logits/rejected": -2.081048011779785, + "logps/chosen": -3.522554397583008, + "logps/rejected": -3.243844509124756, + "loss": 6.6535, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -35.225547790527344, + "rewards/margins": -2.7871007919311523, + "rewards/rejected": -32.438446044921875, + "step": 28735 + }, + { + "epoch": 0.9686878560113249, + "grad_norm": 56.84428405761719, + "learning_rate": 2.9836727969642095e-09, + "logits/chosen": -2.045356273651123, + "logits/rejected": -2.2447898387908936, + "logps/chosen": -3.090925693511963, + "logps/rejected": -3.5039238929748535, + "loss": 2.3891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.909255981445312, + "rewards/margins": 4.12998104095459, + "rewards/rejected": -35.03923797607422, + "step": 28740 + }, + { + "epoch": 0.9688563820823082, + "grad_norm": 34.671470642089844, + "learning_rate": 2.9516737404881587e-09, + "logits/chosen": -1.768585443496704, + "logits/rejected": -2.076643228530884, + "logps/chosen": -2.617316484451294, + "logps/rejected": -3.1697921752929688, + "loss": 4.1929, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.17316246032715, + "rewards/margins": 5.5247578620910645, + "rewards/rejected": -31.697921752929688, + "step": 28745 + }, + { + "epoch": 0.9690249081532913, + "grad_norm": 25.47473907470703, + "learning_rate": 2.9198466928549435e-09, + "logits/chosen": -2.3101115226745605, + "logits/rejected": -2.5386404991149902, + "logps/chosen": -1.9861423969268799, + "logps/rejected": -2.017178773880005, + "loss": 3.0888, + "rewards/accuracies": 0.5, + "rewards/chosen": -19.861421585083008, + "rewards/margins": 0.3103656768798828, + "rewards/rejected": -20.171789169311523, + "step": 28750 + }, + { + "epoch": 0.9691934342242745, + "grad_norm": 30.15863037109375, + "learning_rate": 2.8881916650785875e-09, + "logits/chosen": -2.482518434524536, + "logits/rejected": -2.915527582168579, + "logps/chosen": -3.200958251953125, + "logps/rejected": -5.268540382385254, + "loss": 1.2083, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -32.00957489013672, + "rewards/margins": 20.67582130432129, + "rewards/rejected": -52.685401916503906, + "step": 28755 + }, + { + "epoch": 0.9693619602952577, + "grad_norm": 36.95034408569336, + "learning_rate": 2.8567086681136608e-09, + "logits/chosen": -1.4024853706359863, + "logits/rejected": -1.4760109186172485, + "logps/chosen": -2.4063594341278076, + "logps/rejected": -2.446986436843872, + "loss": 3.5985, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.063594818115234, + "rewards/margins": 0.40627098083496094, + "rewards/rejected": -24.469867706298828, + "step": 28760 + }, + { + "epoch": 0.9695304863662408, + "grad_norm": 5.878857612609863, + "learning_rate": 2.8253977128551708e-09, + "logits/chosen": -1.9031862020492554, + "logits/rejected": -1.948831558227539, + "logps/chosen": -2.9409420490264893, + "logps/rejected": -3.4585976600646973, + "loss": 2.0746, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.409423828125, + "rewards/margins": 5.176552772521973, + "rewards/rejected": -34.585975646972656, + "step": 28765 + }, + { + "epoch": 0.969699012437224, + "grad_norm": 45.4705924987793, + "learning_rate": 2.794258810138728e-09, + "logits/chosen": -1.9813125133514404, + "logits/rejected": -2.1614718437194824, + "logps/chosen": -2.020968198776245, + "logps/rejected": -2.2112536430358887, + "loss": 2.5303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -20.20968246459961, + "rewards/margins": 1.9028533697128296, + "rewards/rejected": -22.11253547668457, + "step": 28770 + }, + { + "epoch": 0.9698675385082072, + "grad_norm": 58.42421340942383, + "learning_rate": 2.7632919707401e-09, + "logits/chosen": -1.7921464443206787, + "logits/rejected": -1.8928537368774414, + "logps/chosen": -3.405698299407959, + "logps/rejected": -3.547631025314331, + "loss": 3.6025, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.056983947753906, + "rewards/margins": 1.4193235635757446, + "rewards/rejected": -35.47631072998047, + "step": 28775 + }, + { + "epoch": 0.9700360645791904, + "grad_norm": 31.0329647064209, + "learning_rate": 2.7324972053758275e-09, + "logits/chosen": -2.084688663482666, + "logits/rejected": -2.337444305419922, + "logps/chosen": -2.1967928409576416, + "logps/rejected": -2.5291614532470703, + "loss": 2.0453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.967926025390625, + "rewards/margins": 3.3236873149871826, + "rewards/rejected": -25.291614532470703, + "step": 28780 + }, + { + "epoch": 0.9702045906501736, + "grad_norm": 17.989961624145508, + "learning_rate": 2.7018745247027184e-09, + "logits/chosen": -1.533870816230774, + "logits/rejected": -1.8206316232681274, + "logps/chosen": -1.893083930015564, + "logps/rejected": -2.1794352531433105, + "loss": 1.4881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.93084144592285, + "rewards/margins": 2.863513946533203, + "rewards/rejected": -21.794353485107422, + "step": 28785 + }, + { + "epoch": 0.9703731167211568, + "grad_norm": 43.55194854736328, + "learning_rate": 2.671423939318018e-09, + "logits/chosen": -1.1976906061172485, + "logits/rejected": -1.5607186555862427, + "logps/chosen": -2.5054843425750732, + "logps/rejected": -2.798884391784668, + "loss": 2.1093, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.05484390258789, + "rewards/margins": 2.933999538421631, + "rewards/rejected": -27.988842010498047, + "step": 28790 + }, + { + "epoch": 0.97054164279214, + "grad_norm": 86.71769714355469, + "learning_rate": 2.641145459759575e-09, + "logits/chosen": -1.8374137878417969, + "logits/rejected": -2.4237678050994873, + "logps/chosen": -2.241248369216919, + "logps/rejected": -2.8495514392852783, + "loss": 2.2861, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.4124813079834, + "rewards/margins": 6.083029747009277, + "rewards/rejected": -28.495513916015625, + "step": 28795 + }, + { + "epoch": 0.9707101688631231, + "grad_norm": 11.737354278564453, + "learning_rate": 2.611039096505563e-09, + "logits/chosen": -1.5608351230621338, + "logits/rejected": -2.000445604324341, + "logps/chosen": -3.0214154720306396, + "logps/rejected": -3.182666540145874, + "loss": 4.4413, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.214153289794922, + "rewards/margins": 1.612510323524475, + "rewards/rejected": -31.8266658782959, + "step": 28800 + }, + { + "epoch": 0.9707101688631231, + "eval_logits/chosen": -2.3131000995635986, + "eval_logits/rejected": -2.4913644790649414, + "eval_logps/chosen": -2.2895238399505615, + "eval_logps/rejected": -2.44382905960083, + "eval_loss": 3.089254140853882, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.89523696899414, + "eval_rewards/margins": 1.5430537462234497, + "eval_rewards/rejected": -24.438291549682617, + "eval_runtime": 12.9148, + "eval_samples_per_second": 7.743, + "eval_steps_per_second": 1.936, + "step": 28800 + }, + { + "epoch": 0.9708786949341063, + "grad_norm": 4.901158332824707, + "learning_rate": 2.5811048599744813e-09, + "logits/chosen": -1.9958289861679077, + "logits/rejected": -2.1583971977233887, + "logps/chosen": -2.2538743019104004, + "logps/rejected": -2.5135021209716797, + "loss": 3.2677, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.53874397277832, + "rewards/margins": 2.5962772369384766, + "rewards/rejected": -25.135021209716797, + "step": 28805 + }, + { + "epoch": 0.9710472210050894, + "grad_norm": 38.69203567504883, + "learning_rate": 2.5513427605255433e-09, + "logits/chosen": -1.3330328464508057, + "logits/rejected": -2.0836803913116455, + "logps/chosen": -2.295102596282959, + "logps/rejected": -3.4632301330566406, + "loss": 1.5787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.95102882385254, + "rewards/margins": 11.681272506713867, + "rewards/rejected": -34.632301330566406, + "step": 28810 + }, + { + "epoch": 0.9712157470760727, + "grad_norm": 64.19732666015625, + "learning_rate": 2.5217528084581773e-09, + "logits/chosen": -2.233541965484619, + "logits/rejected": -2.275514602661133, + "logps/chosen": -2.8877837657928467, + "logps/rejected": -2.8634886741638184, + "loss": 3.5775, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.877838134765625, + "rewards/margins": -0.24294976890087128, + "rewards/rejected": -28.6348876953125, + "step": 28815 + }, + { + "epoch": 0.9713842731470559, + "grad_norm": 28.017070770263672, + "learning_rate": 2.4923350140123033e-09, + "logits/chosen": -1.9607566595077515, + "logits/rejected": -1.9388478994369507, + "logps/chosen": -2.181363344192505, + "logps/rejected": -2.456298828125, + "loss": 3.5988, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.81363296508789, + "rewards/margins": 2.7493553161621094, + "rewards/rejected": -24.562986373901367, + "step": 28820 + }, + { + "epoch": 0.971552799218039, + "grad_norm": 40.535179138183594, + "learning_rate": 2.4630893873682777e-09, + "logits/chosen": -1.5083204507827759, + "logits/rejected": -1.7215535640716553, + "logps/chosen": -3.0454647541046143, + "logps/rejected": -3.433953046798706, + "loss": 1.6007, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -30.45464515686035, + "rewards/margins": 3.8848869800567627, + "rewards/rejected": -34.33953094482422, + "step": 28825 + }, + { + "epoch": 0.9717213252890222, + "grad_norm": 45.41533660888672, + "learning_rate": 2.4340159386468383e-09, + "logits/chosen": -1.9942636489868164, + "logits/rejected": -2.528942108154297, + "logps/chosen": -2.4124343395233154, + "logps/rejected": -2.7206547260284424, + "loss": 1.6811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.124343872070312, + "rewards/margins": 3.0822019577026367, + "rewards/rejected": -27.206546783447266, + "step": 28830 + }, + { + "epoch": 0.9718898513600054, + "grad_norm": 22.677080154418945, + "learning_rate": 2.405114677909159e-09, + "logits/chosen": -1.1437983512878418, + "logits/rejected": -1.6425338983535767, + "logps/chosen": -2.4738216400146484, + "logps/rejected": -3.083200693130493, + "loss": 2.4923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.738216400146484, + "rewards/margins": 6.0937910079956055, + "rewards/rejected": -30.832006454467773, + "step": 28835 + }, + { + "epoch": 0.9720583774309886, + "grad_norm": 18.422693252563477, + "learning_rate": 2.3763856151567953e-09, + "logits/chosen": -1.6711835861206055, + "logits/rejected": -2.03035831451416, + "logps/chosen": -1.7528514862060547, + "logps/rejected": -2.049534797668457, + "loss": 2.1549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -17.528514862060547, + "rewards/margins": 2.966834306716919, + "rewards/rejected": -20.49534797668457, + "step": 28840 + }, + { + "epoch": 0.9722269035019717, + "grad_norm": 38.746925354003906, + "learning_rate": 2.347828760331849e-09, + "logits/chosen": -2.295783519744873, + "logits/rejected": -2.0634446144104004, + "logps/chosen": -2.789072036743164, + "logps/rejected": -2.251504898071289, + "loss": 8.5075, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.890722274780273, + "rewards/margins": -5.375671863555908, + "rewards/rejected": -22.51504898071289, + "step": 28845 + }, + { + "epoch": 0.9723954295729549, + "grad_norm": 29.222929000854492, + "learning_rate": 2.319444123316583e-09, + "logits/chosen": -1.7320674657821655, + "logits/rejected": -2.0177040100097656, + "logps/chosen": -1.969813585281372, + "logps/rejected": -2.4684014320373535, + "loss": 1.4651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.698135375976562, + "rewards/margins": 4.985878944396973, + "rewards/rejected": -24.684011459350586, + "step": 28850 + }, + { + "epoch": 0.9725639556439382, + "grad_norm": 23.3667049407959, + "learning_rate": 2.2912317139339164e-09, + "logits/chosen": -1.6429636478424072, + "logits/rejected": -1.873246192932129, + "logps/chosen": -3.437391757965088, + "logps/rejected": -3.674466609954834, + "loss": 2.725, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -34.3739128112793, + "rewards/margins": 2.3707499504089355, + "rewards/rejected": -36.744667053222656, + "step": 28855 + }, + { + "epoch": 0.9727324817149213, + "grad_norm": 167.22389221191406, + "learning_rate": 2.2631915419470406e-09, + "logits/chosen": -1.5777119398117065, + "logits/rejected": -1.7945228815078735, + "logps/chosen": -2.8456039428710938, + "logps/rejected": -2.8768460750579834, + "loss": 3.7524, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -28.456035614013672, + "rewards/margins": 0.31242045760154724, + "rewards/rejected": -28.76845932006836, + "step": 28860 + }, + { + "epoch": 0.9729010077859045, + "grad_norm": 23.845905303955078, + "learning_rate": 2.23532361705947e-09, + "logits/chosen": -1.9737046957015991, + "logits/rejected": -1.9449818134307861, + "logps/chosen": -2.9338698387145996, + "logps/rejected": -3.416442394256592, + "loss": 1.9032, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.338695526123047, + "rewards/margins": 4.82572603225708, + "rewards/rejected": -34.16442108154297, + "step": 28865 + }, + { + "epoch": 0.9730695338568877, + "grad_norm": 305.1143798828125, + "learning_rate": 2.207627948915269e-09, + "logits/chosen": -1.7003231048583984, + "logits/rejected": -1.7136930227279663, + "logps/chosen": -3.029107093811035, + "logps/rejected": -3.0569915771484375, + "loss": 3.7145, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -30.29107093811035, + "rewards/margins": 0.27884674072265625, + "rewards/rejected": -30.569915771484375, + "step": 28870 + }, + { + "epoch": 0.9732380599278708, + "grad_norm": 28.607589721679688, + "learning_rate": 2.1801045470987713e-09, + "logits/chosen": -1.466975450515747, + "logits/rejected": -1.8123632669448853, + "logps/chosen": -2.1416099071502686, + "logps/rejected": -2.387000322341919, + "loss": 2.759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.41609764099121, + "rewards/margins": 2.453904628753662, + "rewards/rejected": -23.870004653930664, + "step": 28875 + }, + { + "epoch": 0.973406585998854, + "grad_norm": 76.04690551757812, + "learning_rate": 2.1527534211348008e-09, + "logits/chosen": -1.8390899896621704, + "logits/rejected": -2.3541603088378906, + "logps/chosen": -2.2660720348358154, + "logps/rejected": -2.8816139698028564, + "loss": 2.0392, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.660724639892578, + "rewards/margins": 6.155418872833252, + "rewards/rejected": -28.81614112854004, + "step": 28880 + }, + { + "epoch": 0.9735751120698372, + "grad_norm": 23.925153732299805, + "learning_rate": 2.1255745804885096e-09, + "logits/chosen": -2.8247294425964355, + "logits/rejected": -2.893087148666382, + "logps/chosen": -3.386502742767334, + "logps/rejected": -3.850271224975586, + "loss": 1.8654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -33.865028381347656, + "rewards/margins": 4.637683868408203, + "rewards/rejected": -38.502708435058594, + "step": 28885 + }, + { + "epoch": 0.9737436381408204, + "grad_norm": 37.15006637573242, + "learning_rate": 2.098568034565318e-09, + "logits/chosen": -1.4200313091278076, + "logits/rejected": -1.75543212890625, + "logps/chosen": -1.8023878335952759, + "logps/rejected": -2.0486152172088623, + "loss": 2.5868, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.02387809753418, + "rewards/margins": 2.462275743484497, + "rewards/rejected": -20.48615264892578, + "step": 28890 + }, + { + "epoch": 0.9739121642118036, + "grad_norm": 7.579281806945801, + "learning_rate": 2.07173379271125e-09, + "logits/chosen": -0.8189151883125305, + "logits/rejected": -1.8859783411026, + "logps/chosen": -2.5505759716033936, + "logps/rejected": -3.3479816913604736, + "loss": 1.2665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.505762100219727, + "rewards/margins": 7.974055290222168, + "rewards/rejected": -33.47981643676758, + "step": 28895 + }, + { + "epoch": 0.9740806902827868, + "grad_norm": 40.954010009765625, + "learning_rate": 2.0450718642124887e-09, + "logits/chosen": -1.6798347234725952, + "logits/rejected": -1.7447267770767212, + "logps/chosen": -2.070929527282715, + "logps/rejected": -2.3259315490722656, + "loss": 2.3437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.709293365478516, + "rewards/margins": 2.5500221252441406, + "rewards/rejected": -23.25931739807129, + "step": 28900 + }, + { + "epoch": 0.9742492163537699, + "grad_norm": 29.78924560546875, + "learning_rate": 2.0185822582957648e-09, + "logits/chosen": -2.079418659210205, + "logits/rejected": -2.615996837615967, + "logps/chosen": -2.4528346061706543, + "logps/rejected": -3.0139706134796143, + "loss": 2.5714, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.528343200683594, + "rewards/margins": 5.611359119415283, + "rewards/rejected": -30.13970375061035, + "step": 28905 + }, + { + "epoch": 0.9744177424247531, + "grad_norm": 101.4603500366211, + "learning_rate": 1.9922649841279673e-09, + "logits/chosen": -2.2764244079589844, + "logits/rejected": -2.7363200187683105, + "logps/chosen": -3.0192959308624268, + "logps/rejected": -3.1520752906799316, + "loss": 5.4982, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.192956924438477, + "rewards/margins": 1.3277934789657593, + "rewards/rejected": -31.520751953125, + "step": 28910 + }, + { + "epoch": 0.9745862684957363, + "grad_norm": 57.92829513549805, + "learning_rate": 1.966120050816589e-09, + "logits/chosen": -1.9717071056365967, + "logits/rejected": -2.3834729194641113, + "logps/chosen": -2.6838412284851074, + "logps/rejected": -3.0372207164764404, + "loss": 2.1416, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.83841323852539, + "rewards/margins": 3.533792495727539, + "rewards/rejected": -30.372207641601562, + "step": 28915 + }, + { + "epoch": 0.9747547945667194, + "grad_norm": 0.024026213213801384, + "learning_rate": 1.940147467409281e-09, + "logits/chosen": -1.569778561592102, + "logits/rejected": -1.7317079305648804, + "logps/chosen": -3.2188212871551514, + "logps/rejected": -4.365044593811035, + "loss": 1.1098, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -32.18821334838867, + "rewards/margins": 11.462237358093262, + "rewards/rejected": -43.65044403076172, + "step": 28920 + }, + { + "epoch": 0.9749233206377027, + "grad_norm": 30.91973876953125, + "learning_rate": 1.9143472428941877e-09, + "logits/chosen": -1.6828441619873047, + "logits/rejected": -2.0210423469543457, + "logps/chosen": -2.875429391860962, + "logps/rejected": -3.3311076164245605, + "loss": 3.7751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.754297256469727, + "rewards/margins": 4.55678129196167, + "rewards/rejected": -33.31107711791992, + "step": 28925 + }, + { + "epoch": 0.9750918467086859, + "grad_norm": 39.773719787597656, + "learning_rate": 1.8887193861996664e-09, + "logits/chosen": -1.3712999820709229, + "logits/rejected": -1.5209099054336548, + "logps/chosen": -2.148160457611084, + "logps/rejected": -2.1714203357696533, + "loss": 3.2071, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -21.48160743713379, + "rewards/margins": 0.23259706795215607, + "rewards/rejected": -21.714202880859375, + "step": 28930 + }, + { + "epoch": 0.975260372779669, + "grad_norm": 38.75511169433594, + "learning_rate": 1.8632639061946233e-09, + "logits/chosen": -1.8781366348266602, + "logits/rejected": -1.9841234683990479, + "logps/chosen": -2.3052010536193848, + "logps/rejected": -2.2915608882904053, + "loss": 3.5765, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.052011489868164, + "rewards/margins": -0.13640041649341583, + "rewards/rejected": -22.91560935974121, + "step": 28935 + }, + { + "epoch": 0.9754288988506522, + "grad_norm": 27.88527488708496, + "learning_rate": 1.8379808116881224e-09, + "logits/chosen": -1.8556627035140991, + "logits/rejected": -2.137606620788574, + "logps/chosen": -2.3696718215942383, + "logps/rejected": -2.8112268447875977, + "loss": 2.1553, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.69671630859375, + "rewards/margins": 4.415551662445068, + "rewards/rejected": -28.112268447875977, + "step": 28940 + }, + { + "epoch": 0.9755974249216354, + "grad_norm": 48.13548278808594, + "learning_rate": 1.812870111429665e-09, + "logits/chosen": -2.6025211811065674, + "logits/rejected": -3.094240665435791, + "logps/chosen": -3.035796642303467, + "logps/rejected": -4.208906650543213, + "loss": 1.0567, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -30.357967376708984, + "rewards/margins": 11.73109245300293, + "rewards/rejected": -42.08905792236328, + "step": 28945 + }, + { + "epoch": 0.9757659509926185, + "grad_norm": 62.21755599975586, + "learning_rate": 1.7879318141090226e-09, + "logits/chosen": -2.0555005073547363, + "logits/rejected": -2.551521062850952, + "logps/chosen": -2.7381367683410645, + "logps/rejected": -2.8648579120635986, + "loss": 3.8195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.38136863708496, + "rewards/margins": 1.2672100067138672, + "rewards/rejected": -28.648578643798828, + "step": 28950 + }, + { + "epoch": 0.9759344770636017, + "grad_norm": 114.9220199584961, + "learning_rate": 1.7631659283564582e-09, + "logits/chosen": -1.478084683418274, + "logits/rejected": -1.5832620859146118, + "logps/chosen": -2.6809325218200684, + "logps/rejected": -2.6643710136413574, + "loss": 3.6116, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.809326171875, + "rewards/margins": -0.1656126081943512, + "rewards/rejected": -26.643712997436523, + "step": 28955 + }, + { + "epoch": 0.9761030031345849, + "grad_norm": 56.121726989746094, + "learning_rate": 1.7385724627423936e-09, + "logits/chosen": -1.7954374551773071, + "logits/rejected": -2.2343225479125977, + "logps/chosen": -2.2366280555725098, + "logps/rejected": -2.4855599403381348, + "loss": 2.2053, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.366281509399414, + "rewards/margins": 2.48931622505188, + "rewards/rejected": -24.8555965423584, + "step": 28960 + }, + { + "epoch": 0.9762715292055681, + "grad_norm": 20.632129669189453, + "learning_rate": 1.7141514257777435e-09, + "logits/chosen": -1.7457069158554077, + "logits/rejected": -1.9348461627960205, + "logps/chosen": -2.1216189861297607, + "logps/rejected": -2.219635486602783, + "loss": 2.7346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.216190338134766, + "rewards/margins": 0.9801637530326843, + "rewards/rejected": -22.19635581970215, + "step": 28965 + }, + { + "epoch": 0.9764400552765513, + "grad_norm": 20.116281509399414, + "learning_rate": 1.689902825913525e-09, + "logits/chosen": -1.2659189701080322, + "logits/rejected": -2.0253186225891113, + "logps/chosen": -2.4162893295288086, + "logps/rejected": -3.086562395095825, + "loss": 2.2569, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.162893295288086, + "rewards/margins": 6.702728271484375, + "rewards/rejected": -30.865625381469727, + "step": 28970 + }, + { + "epoch": 0.9766085813475345, + "grad_norm": 45.286468505859375, + "learning_rate": 1.6658266715413593e-09, + "logits/chosen": -1.519742727279663, + "logits/rejected": -1.9302390813827515, + "logps/chosen": -3.0658726692199707, + "logps/rejected": -3.5045742988586426, + "loss": 4.8525, + "rewards/accuracies": 0.5, + "rewards/chosen": -30.65872573852539, + "rewards/margins": 4.387016296386719, + "rewards/rejected": -35.04574203491211, + "step": 28975 + }, + { + "epoch": 0.9767771074185176, + "grad_norm": 46.37701416015625, + "learning_rate": 1.6419229709929704e-09, + "logits/chosen": -2.17651104927063, + "logits/rejected": -2.3157334327697754, + "logps/chosen": -2.608999729156494, + "logps/rejected": -2.539881467819214, + "loss": 4.01, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.089996337890625, + "rewards/margins": -0.6911813020706177, + "rewards/rejected": -25.398815155029297, + "step": 28980 + }, + { + "epoch": 0.9769456334895008, + "grad_norm": 22.79644012451172, + "learning_rate": 1.6181917325405192e-09, + "logits/chosen": -1.9644426107406616, + "logits/rejected": -2.2710633277893066, + "logps/chosen": -2.450073480606079, + "logps/rejected": -2.7313647270202637, + "loss": 2.3456, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.500734329223633, + "rewards/margins": 2.81291127204895, + "rewards/rejected": -27.313644409179688, + "step": 28985 + }, + { + "epoch": 0.977114159560484, + "grad_norm": 2.411163568496704, + "learning_rate": 1.5946329643964363e-09, + "logits/chosen": -1.7838443517684937, + "logits/rejected": -1.734575867652893, + "logps/chosen": -2.378779172897339, + "logps/rejected": -2.4997496604919434, + "loss": 2.8712, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.78779411315918, + "rewards/margins": 1.2097065448760986, + "rewards/rejected": -24.997501373291016, + "step": 28990 + }, + { + "epoch": 0.9772826856314671, + "grad_norm": 46.94289016723633, + "learning_rate": 1.5712466747135334e-09, + "logits/chosen": -1.7236160039901733, + "logits/rejected": -2.0851263999938965, + "logps/chosen": -2.2219066619873047, + "logps/rejected": -2.5512471199035645, + "loss": 2.763, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -22.219064712524414, + "rewards/margins": 3.293402910232544, + "rewards/rejected": -25.512470245361328, + "step": 28995 + }, + { + "epoch": 0.9774512117024504, + "grad_norm": 49.3720817565918, + "learning_rate": 1.5480328715848367e-09, + "logits/chosen": -1.7591028213500977, + "logits/rejected": -1.8149206638336182, + "logps/chosen": -2.1563258171081543, + "logps/rejected": -3.1621527671813965, + "loss": 1.3768, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.563257217407227, + "rewards/margins": 10.058270454406738, + "rewards/rejected": -31.62152671813965, + "step": 29000 + }, + { + "epoch": 0.9776197377734336, + "grad_norm": 0.0798095315694809, + "learning_rate": 1.5249915630437538e-09, + "logits/chosen": -1.4440138339996338, + "logits/rejected": -2.5432896614074707, + "logps/chosen": -2.615081787109375, + "logps/rejected": -4.234910011291504, + "loss": 1.5559, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.15081787109375, + "rewards/margins": 16.19828224182129, + "rewards/rejected": -42.34910202026367, + "step": 29005 + }, + { + "epoch": 0.9777882638444167, + "grad_norm": 36.522396087646484, + "learning_rate": 1.5021227570639062e-09, + "logits/chosen": -1.3796061277389526, + "logits/rejected": -1.505127191543579, + "logps/chosen": -2.3741424083709717, + "logps/rejected": -2.817383289337158, + "loss": 1.7147, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -23.741424560546875, + "rewards/margins": 4.432408332824707, + "rewards/rejected": -28.1738338470459, + "step": 29010 + }, + { + "epoch": 0.9779567899153999, + "grad_norm": 20.012441635131836, + "learning_rate": 1.4794264615594076e-09, + "logits/chosen": -1.6143648624420166, + "logits/rejected": -1.552825689315796, + "logps/chosen": -2.325551748275757, + "logps/rejected": -2.6102821826934814, + "loss": 2.0325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -23.25551986694336, + "rewards/margins": 2.8473024368286133, + "rewards/rejected": -26.10282325744629, + "step": 29015 + }, + { + "epoch": 0.9781253159863831, + "grad_norm": 29.94846534729004, + "learning_rate": 1.4569026843844201e-09, + "logits/chosen": -1.814994215965271, + "logits/rejected": -1.8052419424057007, + "logps/chosen": -1.8165454864501953, + "logps/rejected": -1.75994074344635, + "loss": 3.7696, + "rewards/accuracies": 0.5, + "rewards/chosen": -18.165454864501953, + "rewards/margins": -0.5660479664802551, + "rewards/rejected": -17.599407196044922, + "step": 29020 + }, + { + "epoch": 0.9782938420573662, + "grad_norm": 42.956748962402344, + "learning_rate": 1.4345514333336528e-09, + "logits/chosen": -2.1387548446655273, + "logits/rejected": -2.604097366333008, + "logps/chosen": -2.7845778465270996, + "logps/rejected": -3.264005661010742, + "loss": 3.4512, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -27.845779418945312, + "rewards/margins": 4.794276237487793, + "rewards/rejected": -32.640052795410156, + "step": 29025 + }, + { + "epoch": 0.9784623681283494, + "grad_norm": 24.933717727661133, + "learning_rate": 1.4123727161419186e-09, + "logits/chosen": -2.3789522647857666, + "logits/rejected": -2.5313992500305176, + "logps/chosen": -3.1476669311523438, + "logps/rejected": -4.168401718139648, + "loss": 1.8028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -31.476673126220703, + "rewards/margins": 10.20734691619873, + "rewards/rejected": -41.68401336669922, + "step": 29030 + }, + { + "epoch": 0.9786308941993327, + "grad_norm": 172.79562377929688, + "learning_rate": 1.3903665404844112e-09, + "logits/chosen": -1.3795078992843628, + "logits/rejected": -1.259374737739563, + "logps/chosen": -2.9650959968566895, + "logps/rejected": -3.208503007888794, + "loss": 2.6839, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.650955200195312, + "rewards/margins": 2.4340739250183105, + "rewards/rejected": -32.08502960205078, + "step": 29035 + }, + { + "epoch": 0.9787994202703159, + "grad_norm": 87.01050567626953, + "learning_rate": 1.3685329139765945e-09, + "logits/chosen": -1.9595003128051758, + "logits/rejected": -1.6780097484588623, + "logps/chosen": -2.65761137008667, + "logps/rejected": -2.8279170989990234, + "loss": 3.2983, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.576114654541016, + "rewards/margins": 1.703057885169983, + "rewards/rejected": -28.279170989990234, + "step": 29040 + }, + { + "epoch": 0.978967946341299, + "grad_norm": 44.545223236083984, + "learning_rate": 1.3468718441743132e-09, + "logits/chosen": -1.9111160039901733, + "logits/rejected": -2.1547746658325195, + "logps/chosen": -2.1006617546081543, + "logps/rejected": -2.54248046875, + "loss": 1.6358, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.006620407104492, + "rewards/margins": 4.418184757232666, + "rewards/rejected": -25.424802780151367, + "step": 29045 + }, + { + "epoch": 0.9791364724122822, + "grad_norm": 23.81278419494629, + "learning_rate": 1.3253833385734603e-09, + "logits/chosen": -1.5132275819778442, + "logits/rejected": -1.3265550136566162, + "logps/chosen": -3.042940616607666, + "logps/rejected": -3.4171204566955566, + "loss": 1.3111, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -30.429407119750977, + "rewards/margins": 3.7417988777160645, + "rewards/rejected": -34.17120361328125, + "step": 29050 + }, + { + "epoch": 0.9793049984832654, + "grad_norm": 27.37566375732422, + "learning_rate": 1.304067404610476e-09, + "logits/chosen": -1.7547746896743774, + "logits/rejected": -2.023449659347534, + "logps/chosen": -2.1776890754699707, + "logps/rejected": -2.5270471572875977, + "loss": 3.4206, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.776891708374023, + "rewards/margins": 3.4935803413391113, + "rewards/rejected": -25.270471572875977, + "step": 29055 + }, + { + "epoch": 0.9794735245542485, + "grad_norm": 35.15890121459961, + "learning_rate": 1.2829240496619042e-09, + "logits/chosen": -1.609505295753479, + "logits/rejected": -1.6805378198623657, + "logps/chosen": -2.2529454231262207, + "logps/rejected": -2.4146313667297363, + "loss": 2.7068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.52945327758789, + "rewards/margins": 1.6168590784072876, + "rewards/rejected": -24.146312713623047, + "step": 29060 + }, + { + "epoch": 0.9796420506252317, + "grad_norm": 48.223201751708984, + "learning_rate": 1.2619532810446699e-09, + "logits/chosen": -1.2492077350616455, + "logits/rejected": -1.2873347997665405, + "logps/chosen": -2.0145344734191895, + "logps/rejected": -2.223071813583374, + "loss": 2.6329, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.14534568786621, + "rewards/margins": 2.0853724479675293, + "rewards/rejected": -22.2307186126709, + "step": 29065 + }, + { + "epoch": 0.9798105766962149, + "grad_norm": 43.44717025756836, + "learning_rate": 1.241155106015912e-09, + "logits/chosen": -1.802374243736267, + "logits/rejected": -1.7998616695404053, + "logps/chosen": -2.245756149291992, + "logps/rejected": -2.369556427001953, + "loss": 3.1205, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.457561492919922, + "rewards/margins": 1.2380021810531616, + "rewards/rejected": -23.695566177368164, + "step": 29070 + }, + { + "epoch": 0.9799791027671981, + "grad_norm": 112.0615463256836, + "learning_rate": 1.2205295317730402e-09, + "logits/chosen": -1.9436088800430298, + "logits/rejected": -2.0945136547088623, + "logps/chosen": -2.564476490020752, + "logps/rejected": -3.054882049560547, + "loss": 3.0998, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.644763946533203, + "rewards/margins": 4.904057502746582, + "rewards/rejected": -30.5488224029541, + "step": 29075 + }, + { + "epoch": 0.9801476288381813, + "grad_norm": 42.39691162109375, + "learning_rate": 1.2000765654537892e-09, + "logits/chosen": -2.0819265842437744, + "logits/rejected": -1.9295778274536133, + "logps/chosen": -2.2024292945861816, + "logps/rejected": -2.439565896987915, + "loss": 2.701, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.0242919921875, + "rewards/margins": 2.371366500854492, + "rewards/rejected": -24.39565658569336, + "step": 29080 + }, + { + "epoch": 0.9803161549091645, + "grad_norm": 54.22341537475586, + "learning_rate": 1.1797962141360529e-09, + "logits/chosen": -1.668026328086853, + "logits/rejected": -1.8890268802642822, + "logps/chosen": -2.1706669330596924, + "logps/rejected": -2.2109477519989014, + "loss": 2.8692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.7066707611084, + "rewards/margins": 0.4028078019618988, + "rewards/rejected": -22.109477996826172, + "step": 29085 + }, + { + "epoch": 0.9804846809801476, + "grad_norm": 0.008076355792582035, + "learning_rate": 1.1596884848381616e-09, + "logits/chosen": -1.7775027751922607, + "logits/rejected": -1.9149770736694336, + "logps/chosen": -2.447822093963623, + "logps/rejected": -2.7496438026428223, + "loss": 2.0461, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.47822380065918, + "rewards/margins": 3.018213987350464, + "rewards/rejected": -27.496435165405273, + "step": 29090 + }, + { + "epoch": 0.9806532070511308, + "grad_norm": 66.91258239746094, + "learning_rate": 1.1397533845185492e-09, + "logits/chosen": -1.7012237310409546, + "logits/rejected": -1.7801170349121094, + "logps/chosen": -2.3985514640808105, + "logps/rejected": -2.5203640460968018, + "loss": 2.4471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.985515594482422, + "rewards/margins": 1.218124508857727, + "rewards/rejected": -25.203641891479492, + "step": 29095 + }, + { + "epoch": 0.980821733122114, + "grad_norm": 67.51605987548828, + "learning_rate": 1.1199909200760305e-09, + "logits/chosen": -2.3086938858032227, + "logits/rejected": -2.628457546234131, + "logps/chosen": -1.8733421564102173, + "logps/rejected": -2.0515007972717285, + "loss": 2.1875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.733423233032227, + "rewards/margins": 1.7815853357315063, + "rewards/rejected": -20.51500701904297, + "step": 29100 + }, + { + "epoch": 0.9809902591930971, + "grad_norm": 16.39108657836914, + "learning_rate": 1.1004010983495238e-09, + "logits/chosen": -1.6252338886260986, + "logits/rejected": -1.9250046014785767, + "logps/chosen": -2.4929986000061035, + "logps/rejected": -2.7562291622161865, + "loss": 1.9, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.92998695373535, + "rewards/margins": 2.6323070526123047, + "rewards/rejected": -27.562292098999023, + "step": 29105 + }, + { + "epoch": 0.9811587852640804, + "grad_norm": 37.90568161010742, + "learning_rate": 1.0809839261183285e-09, + "logits/chosen": -1.688126564025879, + "logits/rejected": -1.9933946132659912, + "logps/chosen": -2.792940616607666, + "logps/rejected": -3.1035900115966797, + "loss": 3.0784, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -27.92940330505371, + "rewards/margins": 3.106494903564453, + "rewards/rejected": -31.035900115966797, + "step": 29110 + }, + { + "epoch": 0.9813273113350636, + "grad_norm": 28.15143585205078, + "learning_rate": 1.0617394101020139e-09, + "logits/chosen": -2.4952352046966553, + "logits/rejected": -2.438091278076172, + "logps/chosen": -2.3946175575256348, + "logps/rejected": -2.601440668106079, + "loss": 2.2339, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.9461727142334, + "rewards/margins": 2.0682334899902344, + "rewards/rejected": -26.014408111572266, + "step": 29115 + }, + { + "epoch": 0.9814958374060467, + "grad_norm": 29.762697219848633, + "learning_rate": 1.0426675569602529e-09, + "logits/chosen": -1.716699242591858, + "logits/rejected": -2.109680652618408, + "logps/chosen": -2.18900728225708, + "logps/rejected": -2.372774600982666, + "loss": 2.4536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.890071868896484, + "rewards/margins": 1.8376758098602295, + "rewards/rejected": -23.727746963500977, + "step": 29120 + }, + { + "epoch": 0.9816643634770299, + "grad_norm": 117.76435852050781, + "learning_rate": 1.0237683732931545e-09, + "logits/chosen": -2.146617889404297, + "logits/rejected": -2.143404960632324, + "logps/chosen": -2.951927423477173, + "logps/rejected": -3.0596113204956055, + "loss": 3.7283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.519275665283203, + "rewards/margins": 1.0768405199050903, + "rewards/rejected": -30.596111297607422, + "step": 29125 + }, + { + "epoch": 0.9818328895480131, + "grad_norm": 72.63237762451172, + "learning_rate": 1.0050418656408766e-09, + "logits/chosen": -1.8240457773208618, + "logits/rejected": -2.367785930633545, + "logps/chosen": -3.1584181785583496, + "logps/rejected": -3.3102378845214844, + "loss": 3.8649, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -31.584178924560547, + "rewards/margins": 1.5181951522827148, + "rewards/rejected": -33.102378845214844, + "step": 29130 + }, + { + "epoch": 0.9820014156189962, + "grad_norm": 37.19519805908203, + "learning_rate": 9.86488040484068e-10, + "logits/chosen": -1.5405725240707397, + "logits/rejected": -1.7350629568099976, + "logps/chosen": -2.063495635986328, + "logps/rejected": -2.0626368522644043, + "loss": 3.511, + "rewards/accuracies": 0.5, + "rewards/chosen": -20.63495445251465, + "rewards/margins": -0.008587169460952282, + "rewards/rejected": -20.62636947631836, + "step": 29135 + }, + { + "epoch": 0.9821699416899794, + "grad_norm": 40.8488883972168, + "learning_rate": 9.68106904243371e-10, + "logits/chosen": -1.2401533126831055, + "logits/rejected": -1.1598286628723145, + "logps/chosen": -2.5394511222839355, + "logps/rejected": -2.641644239425659, + "loss": 3.0189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.39451026916504, + "rewards/margins": 1.0219334363937378, + "rewards/rejected": -26.41644287109375, + "step": 29140 + }, + { + "epoch": 0.9823384677609627, + "grad_norm": 27.47707176208496, + "learning_rate": 9.49898463279808e-10, + "logits/chosen": -1.5628474950790405, + "logits/rejected": -1.6683374643325806, + "logps/chosen": -2.1274776458740234, + "logps/rejected": -2.3868026733398438, + "loss": 1.7491, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -21.274776458740234, + "rewards/margins": 2.5932505130767822, + "rewards/rejected": -23.868024826049805, + "step": 29145 + }, + { + "epoch": 0.9825069938319458, + "grad_norm": 26.376081466674805, + "learning_rate": 9.318627238946164e-10, + "logits/chosen": -1.6443513631820679, + "logits/rejected": -1.7434518337249756, + "logps/chosen": -3.311771869659424, + "logps/rejected": -3.490668535232544, + "loss": 2.9818, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -33.11771774291992, + "rewards/margins": 1.788968801498413, + "rewards/rejected": -34.90668487548828, + "step": 29150 + }, + { + "epoch": 0.982675519902929, + "grad_norm": 0.5067969560623169, + "learning_rate": 9.139996923291927e-10, + "logits/chosen": -1.6530559062957764, + "logits/rejected": -2.6780498027801514, + "logps/chosen": -2.3142178058624268, + "logps/rejected": -3.273839235305786, + "loss": 2.5638, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.14217758178711, + "rewards/margins": 9.59621524810791, + "rewards/rejected": -32.7383918762207, + "step": 29155 + }, + { + "epoch": 0.9828440459739122, + "grad_norm": 1.6404162645339966, + "learning_rate": 8.963093747653139e-10, + "logits/chosen": -1.8647918701171875, + "logits/rejected": -2.108612537384033, + "logps/chosen": -2.119544267654419, + "logps/rejected": -2.551068067550659, + "loss": 1.0067, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.195444107055664, + "rewards/margins": 4.315237998962402, + "rewards/rejected": -25.510677337646484, + "step": 29160 + }, + { + "epoch": 0.9830125720448953, + "grad_norm": 24.310962677001953, + "learning_rate": 8.78791777324861e-10, + "logits/chosen": -2.083103895187378, + "logits/rejected": -2.228764772415161, + "logps/chosen": -3.090188503265381, + "logps/rejected": -3.515840530395508, + "loss": 1.6981, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -30.90188980102539, + "rewards/margins": 4.256514549255371, + "rewards/rejected": -35.15840530395508, + "step": 29165 + }, + { + "epoch": 0.9831810981158785, + "grad_norm": 43.69054412841797, + "learning_rate": 8.614469060699292e-10, + "logits/chosen": -2.039569854736328, + "logits/rejected": -2.212449550628662, + "logps/chosen": -2.5203347206115723, + "logps/rejected": -2.6209521293640137, + "loss": 2.8296, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.203344345092773, + "rewards/margins": 1.0061757564544678, + "rewards/rejected": -26.209522247314453, + "step": 29170 + }, + { + "epoch": 0.9833496241868617, + "grad_norm": 114.72516632080078, + "learning_rate": 8.442747670029948e-10, + "logits/chosen": -1.3245737552642822, + "logits/rejected": -1.4342682361602783, + "logps/chosen": -2.268794298171997, + "logps/rejected": -2.3118577003479004, + "loss": 4.0943, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.687946319580078, + "rewards/margins": 0.43063393235206604, + "rewards/rejected": -23.118576049804688, + "step": 29175 + }, + { + "epoch": 0.9835181502578448, + "grad_norm": 0.09160160273313522, + "learning_rate": 8.272753660665821e-10, + "logits/chosen": -1.489280104637146, + "logits/rejected": -2.2636759281158447, + "logps/chosen": -2.1328532695770264, + "logps/rejected": -3.0796000957489014, + "loss": 1.7, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.328535079956055, + "rewards/margins": 9.467466354370117, + "rewards/rejected": -30.796001434326172, + "step": 29180 + }, + { + "epoch": 0.9836866763288281, + "grad_norm": 32.43025207519531, + "learning_rate": 8.104487091435963e-10, + "logits/chosen": -2.016549825668335, + "logits/rejected": -2.1013665199279785, + "logps/chosen": -2.4078688621520996, + "logps/rejected": -2.9210119247436523, + "loss": 2.4298, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.078689575195312, + "rewards/margins": 5.131430149078369, + "rewards/rejected": -29.210119247436523, + "step": 29185 + }, + { + "epoch": 0.9838552023998113, + "grad_norm": 39.57857894897461, + "learning_rate": 7.937948020569906e-10, + "logits/chosen": -1.3796770572662354, + "logits/rejected": -1.4084769487380981, + "logps/chosen": -2.1895060539245605, + "logps/rejected": -2.352719306945801, + "loss": 2.0924, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.895061492919922, + "rewards/margins": 1.6321327686309814, + "rewards/rejected": -23.527191162109375, + "step": 29190 + }, + { + "epoch": 0.9840237284707944, + "grad_norm": 18.390674591064453, + "learning_rate": 7.773136505700995e-10, + "logits/chosen": -1.5435973405838013, + "logits/rejected": -1.9853636026382446, + "logps/chosen": -2.088843822479248, + "logps/rejected": -2.8570563793182373, + "loss": 1.6811, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.8884334564209, + "rewards/margins": 7.682130336761475, + "rewards/rejected": -28.570566177368164, + "step": 29195 + }, + { + "epoch": 0.9841922545417776, + "grad_norm": 13.768671989440918, + "learning_rate": 7.610052603863048e-10, + "logits/chosen": -1.6334747076034546, + "logits/rejected": -1.6783907413482666, + "logps/chosen": -1.8387682437896729, + "logps/rejected": -1.905200719833374, + "loss": 2.7584, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.387680053710938, + "rewards/margins": 0.664326012134552, + "rewards/rejected": -19.0520076751709, + "step": 29200 + }, + { + "epoch": 0.9841922545417776, + "eval_logits/chosen": -2.3112363815307617, + "eval_logits/rejected": -2.489377975463867, + "eval_logps/chosen": -2.289456605911255, + "eval_logps/rejected": -2.444096088409424, + "eval_loss": 3.0873899459838867, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.89456558227539, + "eval_rewards/margins": 1.5463964939117432, + "eval_rewards/rejected": -24.440961837768555, + "eval_runtime": 12.8936, + "eval_samples_per_second": 7.756, + "eval_steps_per_second": 1.939, + "step": 29200 + }, + { + "epoch": 0.9843607806127608, + "grad_norm": 59.42336654663086, + "learning_rate": 7.448696371494257e-10, + "logits/chosen": -1.8782780170440674, + "logits/rejected": -2.575622797012329, + "logps/chosen": -2.4771718978881836, + "logps/rejected": -3.015079975128174, + "loss": 1.4728, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -24.771717071533203, + "rewards/margins": 5.379080772399902, + "rewards/rejected": -30.150802612304688, + "step": 29205 + }, + { + "epoch": 0.9845293066837439, + "grad_norm": 8.903990745544434, + "learning_rate": 7.28906786443273e-10, + "logits/chosen": -2.6060993671417236, + "logits/rejected": -2.4315786361694336, + "logps/chosen": -3.1309814453125, + "logps/rejected": -3.394001007080078, + "loss": 2.6579, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -31.309810638427734, + "rewards/margins": 2.630197763442993, + "rewards/rejected": -33.94001007080078, + "step": 29210 + }, + { + "epoch": 0.9846978327547271, + "grad_norm": 35.52606964111328, + "learning_rate": 7.13116713791928e-10, + "logits/chosen": -1.7815643548965454, + "logits/rejected": -1.951751470565796, + "logps/chosen": -2.3580946922302246, + "logps/rejected": -2.5867106914520264, + "loss": 2.2759, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.580947875976562, + "rewards/margins": 2.2861576080322266, + "rewards/rejected": -25.867107391357422, + "step": 29215 + }, + { + "epoch": 0.9848663588257104, + "grad_norm": 33.73111343383789, + "learning_rate": 6.974994246598531e-10, + "logits/chosen": -2.0735392570495605, + "logits/rejected": -2.174109697341919, + "logps/chosen": -2.5979247093200684, + "logps/rejected": -2.739593267440796, + "loss": 3.0291, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.979248046875, + "rewards/margins": 1.4166853427886963, + "rewards/rejected": -27.395931243896484, + "step": 29220 + }, + { + "epoch": 0.9850348848966936, + "grad_norm": 32.62689208984375, + "learning_rate": 6.820549244514473e-10, + "logits/chosen": -2.2399845123291016, + "logits/rejected": -1.980328917503357, + "logps/chosen": -2.616978645324707, + "logps/rejected": -2.76224684715271, + "loss": 3.4192, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.169788360595703, + "rewards/margins": 1.4526822566986084, + "rewards/rejected": -27.62247085571289, + "step": 29225 + }, + { + "epoch": 0.9852034109676767, + "grad_norm": 34.097007751464844, + "learning_rate": 6.667832185114908e-10, + "logits/chosen": -1.56089186668396, + "logits/rejected": -1.7227294445037842, + "logps/chosen": -1.9335496425628662, + "logps/rejected": -2.214301824569702, + "loss": 2.5257, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.335498809814453, + "rewards/margins": 2.807521343231201, + "rewards/rejected": -22.14301872253418, + "step": 29230 + }, + { + "epoch": 0.9853719370386599, + "grad_norm": 31.763954162597656, + "learning_rate": 6.516843121249227e-10, + "logits/chosen": -1.6636956930160522, + "logits/rejected": -1.8226375579833984, + "logps/chosen": -1.736010193824768, + "logps/rejected": -1.8142486810684204, + "loss": 2.6185, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.3601016998291, + "rewards/margins": 0.7823851108551025, + "rewards/rejected": -18.142486572265625, + "step": 29235 + }, + { + "epoch": 0.985540463109643, + "grad_norm": 32.31121826171875, + "learning_rate": 6.367582105168968e-10, + "logits/chosen": -1.8201602697372437, + "logits/rejected": -2.7413697242736816, + "logps/chosen": -2.2536308765411377, + "logps/rejected": -3.829385280609131, + "loss": 1.3365, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.536306381225586, + "rewards/margins": 15.757547378540039, + "rewards/rejected": -38.293853759765625, + "step": 29240 + }, + { + "epoch": 0.9857089891806262, + "grad_norm": 27.129085540771484, + "learning_rate": 6.220049188527254e-10, + "logits/chosen": -1.7569191455841064, + "logits/rejected": -1.9579541683197021, + "logps/chosen": -2.5340628623962402, + "logps/rejected": -2.6569466590881348, + "loss": 3.4591, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.340627670288086, + "rewards/margins": 1.228839635848999, + "rewards/rejected": -26.5694637298584, + "step": 29245 + }, + { + "epoch": 0.9858775152516094, + "grad_norm": 7.850180149078369, + "learning_rate": 6.07424442237936e-10, + "logits/chosen": -2.442378044128418, + "logits/rejected": -2.476134777069092, + "logps/chosen": -2.5148308277130127, + "logps/rejected": -3.0705695152282715, + "loss": 2.3982, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.148305892944336, + "rewards/margins": 5.557389736175537, + "rewards/rejected": -30.705698013305664, + "step": 29250 + }, + { + "epoch": 0.9860460413225927, + "grad_norm": 56.99378967285156, + "learning_rate": 5.930167857182699e-10, + "logits/chosen": -1.0735199451446533, + "logits/rejected": -1.1086907386779785, + "logps/chosen": -2.445570468902588, + "logps/rejected": -2.5016674995422363, + "loss": 3.7414, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.455707550048828, + "rewards/margins": 0.560967743396759, + "rewards/rejected": -25.016674041748047, + "step": 29255 + }, + { + "epoch": 0.9862145673935758, + "grad_norm": 30.35076904296875, + "learning_rate": 5.787819542796279e-10, + "logits/chosen": -1.972487211227417, + "logits/rejected": -2.2727303504943848, + "logps/chosen": -2.4433388710021973, + "logps/rejected": -2.5713579654693604, + "loss": 4.8165, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -24.433391571044922, + "rewards/margins": 1.2801861763000488, + "rewards/rejected": -25.713577270507812, + "step": 29260 + }, + { + "epoch": 0.986383093464559, + "grad_norm": 22.402549743652344, + "learning_rate": 5.647199528481805e-10, + "logits/chosen": -2.089200496673584, + "logits/rejected": -2.1892621517181396, + "logps/chosen": -2.8049428462982178, + "logps/rejected": -3.1378207206726074, + "loss": 2.0068, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.049428939819336, + "rewards/margins": 3.328777313232422, + "rewards/rejected": -31.37820816040039, + "step": 29265 + }, + { + "epoch": 0.9865516195355422, + "grad_norm": 719.610107421875, + "learning_rate": 5.508307862901462e-10, + "logits/chosen": -2.50829815864563, + "logits/rejected": -2.8846817016601562, + "logps/chosen": -3.515585422515869, + "logps/rejected": -3.9776313304901123, + "loss": 3.2065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -35.15585708618164, + "rewards/margins": 4.620454788208008, + "rewards/rejected": -39.77631378173828, + "step": 29270 + }, + { + "epoch": 0.9867201456065253, + "grad_norm": 65.99314880371094, + "learning_rate": 5.371144594120691e-10, + "logits/chosen": -1.5672378540039062, + "logits/rejected": -1.792138695716858, + "logps/chosen": -3.856194257736206, + "logps/rejected": -3.839482069015503, + "loss": 5.3038, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -38.56194305419922, + "rewards/margins": -0.16712017357349396, + "rewards/rejected": -38.39482116699219, + "step": 29275 + }, + { + "epoch": 0.9868886716775085, + "grad_norm": 50.995548248291016, + "learning_rate": 5.235709769606522e-10, + "logits/chosen": -1.8027080297470093, + "logits/rejected": -2.1330089569091797, + "logps/chosen": -2.413428544998169, + "logps/rejected": -2.9030966758728027, + "loss": 2.2273, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -24.13428497314453, + "rewards/margins": 4.896681785583496, + "rewards/rejected": -29.030963897705078, + "step": 29280 + }, + { + "epoch": 0.9870571977484917, + "grad_norm": 51.32760238647461, + "learning_rate": 5.102003436227576e-10, + "logits/chosen": -2.102128267288208, + "logits/rejected": -2.396106004714966, + "logps/chosen": -2.6976327896118164, + "logps/rejected": -3.4498488903045654, + "loss": 1.5823, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.976327896118164, + "rewards/margins": 7.522161960601807, + "rewards/rejected": -34.49848937988281, + "step": 29285 + }, + { + "epoch": 0.9872257238194748, + "grad_norm": 41.277225494384766, + "learning_rate": 4.970025640253505e-10, + "logits/chosen": -1.7478545904159546, + "logits/rejected": -2.046114206314087, + "logps/chosen": -2.7258543968200684, + "logps/rejected": -2.9497528076171875, + "loss": 3.0146, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.258544921875, + "rewards/margins": 2.238983154296875, + "rewards/rejected": -29.497528076171875, + "step": 29290 + }, + { + "epoch": 0.9873942498904581, + "grad_norm": 11.74426555633545, + "learning_rate": 4.839776427357778e-10, + "logits/chosen": -1.8929609060287476, + "logits/rejected": -2.212101459503174, + "logps/chosen": -2.844020366668701, + "logps/rejected": -3.1261801719665527, + "loss": 2.3292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.440200805664062, + "rewards/margins": 2.8215994834899902, + "rewards/rejected": -31.261804580688477, + "step": 29295 + }, + { + "epoch": 0.9875627759614413, + "grad_norm": 21.692180633544922, + "learning_rate": 4.711255842613226e-10, + "logits/chosen": -1.4589345455169678, + "logits/rejected": -2.142324209213257, + "logps/chosen": -1.9915090799331665, + "logps/rejected": -2.128383159637451, + "loss": 2.2874, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -19.915088653564453, + "rewards/margins": 1.368740200996399, + "rewards/rejected": -21.283828735351562, + "step": 29300 + }, + { + "epoch": 0.9877313020324244, + "grad_norm": 21.955440521240234, + "learning_rate": 4.584463930497051e-10, + "logits/chosen": -1.9233735799789429, + "logits/rejected": -2.112452268600464, + "logps/chosen": -2.1158950328826904, + "logps/rejected": -2.686612606048584, + "loss": 2.1764, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.158950805664062, + "rewards/margins": 5.707176208496094, + "rewards/rejected": -26.866125106811523, + "step": 29305 + }, + { + "epoch": 0.9878998281034076, + "grad_norm": 20.328765869140625, + "learning_rate": 4.459400734886376e-10, + "logits/chosen": -1.9143705368041992, + "logits/rejected": -2.199479341506958, + "logps/chosen": -1.9987812042236328, + "logps/rejected": -2.205310344696045, + "loss": 1.8434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.987812042236328, + "rewards/margins": 2.0652928352355957, + "rewards/rejected": -22.053104400634766, + "step": 29310 + }, + { + "epoch": 0.9880683541743908, + "grad_norm": 25.061670303344727, + "learning_rate": 4.33606629906047e-10, + "logits/chosen": -1.4500936269760132, + "logits/rejected": -1.5468348264694214, + "logps/chosen": -2.3813271522521973, + "logps/rejected": -2.4723310470581055, + "loss": 2.542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -23.81327247619629, + "rewards/margins": 0.910036563873291, + "rewards/rejected": -24.723308563232422, + "step": 29315 + }, + { + "epoch": 0.9882368802453739, + "grad_norm": 110.3287582397461, + "learning_rate": 4.2144606657007475e-10, + "logits/chosen": -1.7592315673828125, + "logits/rejected": -1.8345882892608643, + "logps/chosen": -2.7503910064697266, + "logps/rejected": -2.7181968688964844, + "loss": 4.7156, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -27.5039119720459, + "rewards/margins": -0.3219425082206726, + "rewards/rejected": -27.18196678161621, + "step": 29320 + }, + { + "epoch": 0.9884054063163571, + "grad_norm": 69.42078399658203, + "learning_rate": 4.0945838768902116e-10, + "logits/chosen": -1.7482601404190063, + "logits/rejected": -1.8381834030151367, + "logps/chosen": -2.808565378189087, + "logps/rejected": -3.3611602783203125, + "loss": 1.7605, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -28.085657119750977, + "rewards/margins": 5.525949954986572, + "rewards/rejected": -33.611602783203125, + "step": 29325 + }, + { + "epoch": 0.9885739323873404, + "grad_norm": 56.57253646850586, + "learning_rate": 3.9764359741134566e-10, + "logits/chosen": -1.9182945489883423, + "logits/rejected": -2.220074415206909, + "logps/chosen": -2.5628058910369873, + "logps/rejected": -3.0394446849823, + "loss": 1.4884, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.628061294555664, + "rewards/margins": 4.766386985778809, + "rewards/rejected": -30.394445419311523, + "step": 29330 + }, + { + "epoch": 0.9887424584583235, + "grad_norm": 28.18037986755371, + "learning_rate": 3.8600169982566655e-10, + "logits/chosen": -1.6121898889541626, + "logits/rejected": -1.843125581741333, + "logps/chosen": -2.0145015716552734, + "logps/rejected": -2.2432212829589844, + "loss": 2.1496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.1450138092041, + "rewards/margins": 2.2871999740600586, + "rewards/rejected": -22.432212829589844, + "step": 29335 + }, + { + "epoch": 0.9889109845293067, + "grad_norm": 26.318395614624023, + "learning_rate": 3.7453269896081665e-10, + "logits/chosen": -2.023275852203369, + "logits/rejected": -2.5470919609069824, + "logps/chosen": -2.5232348442077637, + "logps/rejected": -3.547163486480713, + "loss": 0.7872, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -25.232349395751953, + "rewards/margins": 10.239288330078125, + "rewards/rejected": -35.47163391113281, + "step": 29340 + }, + { + "epoch": 0.9890795106002899, + "grad_norm": 68.82476043701172, + "learning_rate": 3.632365987856767e-10, + "logits/chosen": -2.064157009124756, + "logits/rejected": -1.9872633218765259, + "logps/chosen": -2.234658718109131, + "logps/rejected": -2.367302179336548, + "loss": 2.9236, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.34658432006836, + "rewards/margins": 1.3264367580413818, + "rewards/rejected": -23.673023223876953, + "step": 29345 + }, + { + "epoch": 0.989248036671273, + "grad_norm": 175.67710876464844, + "learning_rate": 3.5211340320950853e-10, + "logits/chosen": -2.1267590522766113, + "logits/rejected": -2.370877742767334, + "logps/chosen": -3.2157340049743652, + "logps/rejected": -3.601935625076294, + "loss": 1.9471, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.15734100341797, + "rewards/margins": 3.862014055252075, + "rewards/rejected": -36.01935577392578, + "step": 29350 + }, + { + "epoch": 0.9894165627422562, + "grad_norm": 13.162681579589844, + "learning_rate": 3.4116311608151095e-10, + "logits/chosen": -1.7573215961456299, + "logits/rejected": -1.9320533275604248, + "logps/chosen": -3.3882813453674316, + "logps/rejected": -3.6124320030212402, + "loss": 2.8952, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -33.8828125, + "rewards/margins": 2.2415099143981934, + "rewards/rejected": -36.12432098388672, + "step": 29355 + }, + { + "epoch": 0.9895850888132394, + "grad_norm": 49.257545471191406, + "learning_rate": 3.303857411912081e-10, + "logits/chosen": -1.8998920917510986, + "logits/rejected": -2.0867068767547607, + "logps/chosen": -1.901677131652832, + "logps/rejected": -2.135617971420288, + "loss": 2.2306, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.01677131652832, + "rewards/margins": 2.339409351348877, + "rewards/rejected": -21.356182098388672, + "step": 29360 + }, + { + "epoch": 0.9897536148842226, + "grad_norm": 73.01632690429688, + "learning_rate": 3.1978128226822775e-10, + "logits/chosen": -1.3546922206878662, + "logits/rejected": -1.2406705617904663, + "logps/chosen": -2.1406056880950928, + "logps/rejected": -2.3482110500335693, + "loss": 2.5606, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -21.406057357788086, + "rewards/margins": 2.0760552883148193, + "rewards/rejected": -23.482112884521484, + "step": 29365 + }, + { + "epoch": 0.9899221409552058, + "grad_norm": 51.609500885009766, + "learning_rate": 3.093497429823011e-10, + "logits/chosen": -2.147397518157959, + "logits/rejected": -2.0322537422180176, + "logps/chosen": -2.272533893585205, + "logps/rejected": -2.137698173522949, + "loss": 5.1356, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.725337982177734, + "rewards/margins": -1.3483564853668213, + "rewards/rejected": -21.37697982788086, + "step": 29370 + }, + { + "epoch": 0.990090667026189, + "grad_norm": 15.991497039794922, + "learning_rate": 2.990911269433738e-10, + "logits/chosen": -2.087343215942383, + "logits/rejected": -2.145076036453247, + "logps/chosen": -3.2033817768096924, + "logps/rejected": -3.4729628562927246, + "loss": 2.1612, + "rewards/accuracies": 0.5, + "rewards/chosen": -32.0338134765625, + "rewards/margins": 2.6958134174346924, + "rewards/rejected": -34.7296257019043, + "step": 29375 + }, + { + "epoch": 0.9902591930971721, + "grad_norm": 35.58505630493164, + "learning_rate": 2.89005437701606e-10, + "logits/chosen": -2.4574391841888428, + "logits/rejected": -2.5140743255615234, + "logps/chosen": -2.106220245361328, + "logps/rejected": -2.2607998847961426, + "loss": 1.9083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -21.06220245361328, + "rewards/margins": 1.5457961559295654, + "rewards/rejected": -22.60799789428711, + "step": 29380 + }, + { + "epoch": 0.9904277191681553, + "grad_norm": 127.40261840820312, + "learning_rate": 2.790926787472614e-10, + "logits/chosen": -2.1903467178344727, + "logits/rejected": -2.4454219341278076, + "logps/chosen": -3.0538299083709717, + "logps/rejected": -3.1230578422546387, + "loss": 2.8361, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -30.538299560546875, + "rewards/margins": 0.6922758221626282, + "rewards/rejected": -31.230575561523438, + "step": 29385 + }, + { + "epoch": 0.9905962452391385, + "grad_norm": 290.842041015625, + "learning_rate": 2.693528535106515e-10, + "logits/chosen": -2.323547840118408, + "logits/rejected": -2.200070858001709, + "logps/chosen": -2.8393075466156006, + "logps/rejected": -2.5815980434417725, + "loss": 6.0147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -28.393077850341797, + "rewards/margins": -2.5770952701568604, + "rewards/rejected": -25.815982818603516, + "step": 29390 + }, + { + "epoch": 0.9907647713101216, + "grad_norm": 0.32132697105407715, + "learning_rate": 2.59785965362469e-10, + "logits/chosen": -1.2735278606414795, + "logits/rejected": -1.618552803993225, + "logps/chosen": -2.191002368927002, + "logps/rejected": -2.9995617866516113, + "loss": 2.9313, + "rewards/accuracies": 0.5, + "rewards/chosen": -21.910022735595703, + "rewards/margins": 8.085596084594727, + "rewards/rejected": -29.995616912841797, + "step": 29395 + }, + { + "epoch": 0.9909332973811048, + "grad_norm": 15.46959114074707, + "learning_rate": 2.503920176133989e-10, + "logits/chosen": -2.1506848335266113, + "logits/rejected": -2.733182191848755, + "logps/chosen": -2.5749802589416504, + "logps/rejected": -3.1757876873016357, + "loss": 1.575, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -25.749805450439453, + "rewards/margins": 6.008072853088379, + "rewards/rejected": -31.75787353515625, + "step": 29400 + }, + { + "epoch": 0.9911018234520881, + "grad_norm": 41.7327880859375, + "learning_rate": 2.4117101351428527e-10, + "logits/chosen": -1.9427173137664795, + "logits/rejected": -2.579338550567627, + "logps/chosen": -2.8326809406280518, + "logps/rejected": -4.368477821350098, + "loss": 1.543, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.32680892944336, + "rewards/margins": 15.357976913452148, + "rewards/rejected": -43.684783935546875, + "step": 29405 + }, + { + "epoch": 0.9912703495230712, + "grad_norm": 112.20866394042969, + "learning_rate": 2.321229562561311e-10, + "logits/chosen": -1.885607123374939, + "logits/rejected": -2.2688915729522705, + "logps/chosen": -2.0797476768493652, + "logps/rejected": -2.6279759407043457, + "loss": 2.9359, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.7974796295166, + "rewards/margins": 5.482276916503906, + "rewards/rejected": -26.279754638671875, + "step": 29410 + }, + { + "epoch": 0.9914388755940544, + "grad_norm": 33.625770568847656, + "learning_rate": 2.2324784897020942e-10, + "logits/chosen": -2.2554001808166504, + "logits/rejected": -2.609884738922119, + "logps/chosen": -2.615712881088257, + "logps/rejected": -3.316114902496338, + "loss": 2.0842, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -26.157129287719727, + "rewards/margins": 7.0040178298950195, + "rewards/rejected": -33.16114807128906, + "step": 29415 + }, + { + "epoch": 0.9916074016650376, + "grad_norm": 21.583099365234375, + "learning_rate": 2.1454569472773022e-10, + "logits/chosen": -1.9407627582550049, + "logits/rejected": -2.017627000808716, + "logps/chosen": -2.4730026721954346, + "logps/rejected": -3.1498286724090576, + "loss": 2.4309, + "rewards/accuracies": 0.5, + "rewards/chosen": -24.730026245117188, + "rewards/margins": 6.7682600021362305, + "rewards/rejected": -31.4982852935791, + "step": 29420 + }, + { + "epoch": 0.9917759277360207, + "grad_norm": 14.413912773132324, + "learning_rate": 2.0601649654028441e-10, + "logits/chosen": -1.7489020824432373, + "logits/rejected": -2.184823513031006, + "logps/chosen": -2.741649627685547, + "logps/rejected": -3.141798734664917, + "loss": 2.2382, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -27.4164981842041, + "rewards/margins": 4.001489162445068, + "rewards/rejected": -31.417987823486328, + "step": 29425 + }, + { + "epoch": 0.9919444538070039, + "grad_norm": 20.478744506835938, + "learning_rate": 1.9766025735939995e-10, + "logits/chosen": -2.1394152641296387, + "logits/rejected": -2.261610507965088, + "logps/chosen": -1.9039617776870728, + "logps/rejected": -2.0586116313934326, + "loss": 2.9164, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.03961753845215, + "rewards/margins": 1.5464990139007568, + "rewards/rejected": -20.58611488342285, + "step": 29430 + }, + { + "epoch": 0.9921129798779871, + "grad_norm": 31.20186996459961, + "learning_rate": 1.8947698007687474e-10, + "logits/chosen": -1.5419042110443115, + "logits/rejected": -1.47139310836792, + "logps/chosen": -2.033275842666626, + "logps/rejected": -2.1540489196777344, + "loss": 2.9035, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.33275604248047, + "rewards/margins": 1.207731008529663, + "rewards/rejected": -21.54048728942871, + "step": 29435 + }, + { + "epoch": 0.9922815059489704, + "grad_norm": 79.59695434570312, + "learning_rate": 1.8146666752466566e-10, + "logits/chosen": -1.8107490539550781, + "logits/rejected": -1.939801812171936, + "logps/chosen": -2.415051221847534, + "logps/rejected": -2.6897196769714355, + "loss": 2.4452, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.150510787963867, + "rewards/margins": 2.7466835975646973, + "rewards/rejected": -26.897192001342773, + "step": 29440 + }, + { + "epoch": 0.9924500320199535, + "grad_norm": 7.891360291978344e-05, + "learning_rate": 1.7362932247472206e-10, + "logits/chosen": -2.1820032596588135, + "logits/rejected": -2.3591179847717285, + "logps/chosen": -2.8081321716308594, + "logps/rejected": -3.666566848754883, + "loss": 1.7237, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.081323623657227, + "rewards/margins": 8.584344863891602, + "rewards/rejected": -36.66566848754883, + "step": 29445 + }, + { + "epoch": 0.9926185580909367, + "grad_norm": 48.53298568725586, + "learning_rate": 1.6596494763931878e-10, + "logits/chosen": -1.581608772277832, + "logits/rejected": -1.462424635887146, + "logps/chosen": -2.1671245098114014, + "logps/rejected": -2.1732068061828613, + "loss": 3.3712, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -21.671245574951172, + "rewards/margins": 0.06082124635577202, + "rewards/rejected": -21.732067108154297, + "step": 29450 + }, + { + "epoch": 0.9927870841619199, + "grad_norm": 71.53341674804688, + "learning_rate": 1.5847354567077864e-10, + "logits/chosen": -1.604501485824585, + "logits/rejected": -2.0805764198303223, + "logps/chosen": -2.781949043273926, + "logps/rejected": -2.9773471355438232, + "loss": 3.1445, + "rewards/accuracies": 0.5, + "rewards/chosen": -27.81949234008789, + "rewards/margins": 1.9539794921875, + "rewards/rejected": -29.77347183227539, + "step": 29455 + }, + { + "epoch": 0.992955610232903, + "grad_norm": 38.53224563598633, + "learning_rate": 1.511551191615279e-10, + "logits/chosen": -2.089249849319458, + "logits/rejected": -2.2512056827545166, + "logps/chosen": -2.8309149742126465, + "logps/rejected": -2.986532688140869, + "loss": 2.5773, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -28.30914878845215, + "rewards/margins": 1.5561754703521729, + "rewards/rejected": -29.86532211303711, + "step": 29460 + }, + { + "epoch": 0.9931241363038862, + "grad_norm": 34.39546585083008, + "learning_rate": 1.4400967064426283e-10, + "logits/chosen": -1.3842085599899292, + "logits/rejected": -1.924599051475525, + "logps/chosen": -1.9771686792373657, + "logps/rejected": -2.347097396850586, + "loss": 1.884, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -19.771686553955078, + "rewards/margins": 3.6992874145507812, + "rewards/rejected": -23.47097396850586, + "step": 29465 + }, + { + "epoch": 0.9932926623748694, + "grad_norm": 88.5250244140625, + "learning_rate": 1.3703720259172768e-10, + "logits/chosen": -1.921378493309021, + "logits/rejected": -1.913988471031189, + "logps/chosen": -2.352660655975342, + "logps/rejected": -2.473912000656128, + "loss": 2.6431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.52660369873047, + "rewards/margins": 1.2125155925750732, + "rewards/rejected": -24.739120483398438, + "step": 29470 + }, + { + "epoch": 0.9934611884458526, + "grad_norm": 87.77112579345703, + "learning_rate": 1.3023771741682564e-10, + "logits/chosen": -1.5295897722244263, + "logits/rejected": -1.5386664867401123, + "logps/chosen": -2.8793563842773438, + "logps/rejected": -2.8281455039978027, + "loss": 4.8123, + "rewards/accuracies": 0.5, + "rewards/chosen": -28.793567657470703, + "rewards/margins": -0.5121095776557922, + "rewards/rejected": -28.28145408630371, + "step": 29475 + }, + { + "epoch": 0.9936297145168358, + "grad_norm": 35.174774169921875, + "learning_rate": 1.2361121747250792e-10, + "logits/chosen": -1.5465962886810303, + "logits/rejected": -1.622862458229065, + "logps/chosen": -2.6694183349609375, + "logps/rejected": -2.7877554893493652, + "loss": 2.9192, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.69417953491211, + "rewards/margins": 1.1833751201629639, + "rewards/rejected": -27.877553939819336, + "step": 29480 + }, + { + "epoch": 0.993798240587819, + "grad_norm": 38.90205383300781, + "learning_rate": 1.1715770505205114e-10, + "logits/chosen": -1.9135345220565796, + "logits/rejected": -2.611912250518799, + "logps/chosen": -2.255214214324951, + "logps/rejected": -2.5369668006896973, + "loss": 2.9454, + "rewards/accuracies": 0.5, + "rewards/chosen": -22.552143096923828, + "rewards/margins": 2.8175246715545654, + "rewards/rejected": -25.369667053222656, + "step": 29485 + }, + { + "epoch": 0.9939667666588021, + "grad_norm": 38.99119186401367, + "learning_rate": 1.1087718238866894e-10, + "logits/chosen": -2.2368454933166504, + "logits/rejected": -2.5548629760742188, + "logps/chosen": -2.6624879837036133, + "logps/rejected": -2.9628915786743164, + "loss": 2.3557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -26.6248779296875, + "rewards/margins": 3.0040369033813477, + "rewards/rejected": -29.628917694091797, + "step": 29490 + }, + { + "epoch": 0.9941352927297853, + "grad_norm": 28.901277542114258, + "learning_rate": 1.0476965165590046e-10, + "logits/chosen": -2.244162082672119, + "logits/rejected": -2.3326351642608643, + "logps/chosen": -2.6673943996429443, + "logps/rejected": -2.514676332473755, + "loss": 6.1204, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.6739444732666, + "rewards/margins": -1.527183175086975, + "rewards/rejected": -25.14676284790039, + "step": 29495 + }, + { + "epoch": 0.9943038188007685, + "grad_norm": 31.86191177368164, + "learning_rate": 9.883511496722175e-11, + "logits/chosen": -2.0590789318084717, + "logits/rejected": -2.631744623184204, + "logps/chosen": -2.57993745803833, + "logps/rejected": -3.5227248668670654, + "loss": 1.5666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -25.79937744140625, + "rewards/margins": 9.427871704101562, + "rewards/rejected": -35.22725296020508, + "step": 29500 + }, + { + "epoch": 0.9944723448717516, + "grad_norm": 28.257728576660156, + "learning_rate": 9.307357437637887e-11, + "logits/chosen": -1.8757565021514893, + "logits/rejected": -1.8568928241729736, + "logps/chosen": -2.9727933406829834, + "logps/rejected": -3.244004011154175, + "loss": 1.5322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -29.72793197631836, + "rewards/margins": 2.7121071815490723, + "rewards/rejected": -32.440040588378906, + "step": 29505 + }, + { + "epoch": 0.9946408709427348, + "grad_norm": 2.5099925994873047, + "learning_rate": 8.748503187727685e-11, + "logits/chosen": -2.2586395740509033, + "logits/rejected": -2.446408748626709, + "logps/chosen": -3.2831978797912598, + "logps/rejected": -3.4535508155822754, + "loss": 2.634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -32.83197784423828, + "rewards/margins": 1.7035328149795532, + "rewards/rejected": -34.53550720214844, + "step": 29510 + }, + { + "epoch": 0.9948093970137181, + "grad_norm": 145.2407684326172, + "learning_rate": 8.20694894038132e-11, + "logits/chosen": -2.0817415714263916, + "logits/rejected": -1.9775947332382202, + "logps/chosen": -2.9662461280822754, + "logps/rejected": -3.018073797225952, + "loss": 4.6757, + "rewards/accuracies": 0.5, + "rewards/chosen": -29.662464141845703, + "rewards/margins": 0.5182735323905945, + "rewards/rejected": -30.180736541748047, + "step": 29515 + }, + { + "epoch": 0.9949779230847012, + "grad_norm": 35.678321838378906, + "learning_rate": 7.682694883015539e-11, + "logits/chosen": -1.6773369312286377, + "logits/rejected": -2.1161258220672607, + "logps/chosen": -1.8673717975616455, + "logps/rejected": -2.8105673789978027, + "loss": 2.3088, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -18.673717498779297, + "rewards/margins": 9.431957244873047, + "rewards/rejected": -28.105676651000977, + "step": 29520 + }, + { + "epoch": 0.9951464491556844, + "grad_norm": 12.716200828552246, + "learning_rate": 7.175741197046337e-11, + "logits/chosen": -2.0986926555633545, + "logits/rejected": -2.707505702972412, + "logps/chosen": -2.657029390335083, + "logps/rejected": -3.285545825958252, + "loss": 2.1085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.57029151916504, + "rewards/margins": 6.285164833068848, + "rewards/rejected": -32.8554573059082, + "step": 29525 + }, + { + "epoch": 0.9953149752266676, + "grad_norm": 44.7720947265625, + "learning_rate": 6.686088057916706e-11, + "logits/chosen": -1.5420719385147095, + "logits/rejected": -1.6112966537475586, + "logps/chosen": -1.7459132671356201, + "logps/rejected": -1.9257084131240845, + "loss": 2.3536, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -17.45913314819336, + "rewards/margins": 1.7979503870010376, + "rewards/rejected": -19.257083892822266, + "step": 29530 + }, + { + "epoch": 0.9954835012976507, + "grad_norm": 55.11979293823242, + "learning_rate": 6.213735635068885e-11, + "logits/chosen": -1.4349676370620728, + "logits/rejected": -1.5553808212280273, + "logps/chosen": -2.4499545097351074, + "logps/rejected": -2.442242383956909, + "loss": 3.2346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -24.499547958374023, + "rewards/margins": -0.0771210640668869, + "rewards/rejected": -24.422426223754883, + "step": 29535 + }, + { + "epoch": 0.9956520273686339, + "grad_norm": 38.8819465637207, + "learning_rate": 5.7586840919776616e-11, + "logits/chosen": -1.542608618736267, + "logits/rejected": -1.7981573343276978, + "logps/chosen": -2.311901569366455, + "logps/rejected": -2.3454556465148926, + "loss": 3.6785, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -23.119014739990234, + "rewards/margins": 0.3355420231819153, + "rewards/rejected": -23.454559326171875, + "step": 29540 + }, + { + "epoch": 0.9958205534396171, + "grad_norm": 30.439922332763672, + "learning_rate": 5.320933586105969e-11, + "logits/chosen": -1.348572850227356, + "logits/rejected": -1.7277705669403076, + "logps/chosen": -2.084947109222412, + "logps/rejected": -2.9177541732788086, + "loss": 1.8665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -20.849472045898438, + "rewards/margins": 8.328069686889648, + "rewards/rejected": -29.177541732788086, + "step": 29545 + }, + { + "epoch": 0.9959890795106003, + "grad_norm": 35.98751449584961, + "learning_rate": 4.90048426894929e-11, + "logits/chosen": -1.7604057788848877, + "logits/rejected": -1.8795154094696045, + "logps/chosen": -1.927356481552124, + "logps/rejected": -2.0210258960723877, + "loss": 2.453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.2735652923584, + "rewards/margins": 0.9366942644119263, + "rewards/rejected": -20.21026039123535, + "step": 29550 + }, + { + "epoch": 0.9961576055815835, + "grad_norm": 23.52623176574707, + "learning_rate": 4.497336286007902e-11, + "logits/chosen": -2.0633692741394043, + "logits/rejected": -2.2805099487304688, + "logps/chosen": -2.636157989501953, + "logps/rejected": -2.6414108276367188, + "loss": 3.6446, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.361581802368164, + "rewards/margins": 0.05252895504236221, + "rewards/rejected": -26.414112091064453, + "step": 29555 + }, + { + "epoch": 0.9963261316525667, + "grad_norm": 0.07981903105974197, + "learning_rate": 4.111489776792432e-11, + "logits/chosen": -1.992034912109375, + "logits/rejected": -1.8614925146102905, + "logps/chosen": -2.9912405014038086, + "logps/rejected": -3.2475357055664062, + "loss": 2.7499, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -29.912405014038086, + "rewards/margins": 2.5629496574401855, + "rewards/rejected": -32.4753532409668, + "step": 29560 + }, + { + "epoch": 0.9964946577235498, + "grad_norm": 29.34018325805664, + "learning_rate": 3.742944874829401e-11, + "logits/chosen": -1.7337758541107178, + "logits/rejected": -1.7979612350463867, + "logps/chosen": -2.685328722000122, + "logps/rejected": -2.908069133758545, + "loss": 2.1848, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.853282928466797, + "rewards/margins": 2.2274093627929688, + "rewards/rejected": -29.0806941986084, + "step": 29565 + }, + { + "epoch": 0.996663183794533, + "grad_norm": 31.588529586791992, + "learning_rate": 3.391701707666783e-11, + "logits/chosen": -1.919046401977539, + "logits/rejected": -2.470398426055908, + "logps/chosen": -2.288696050643921, + "logps/rejected": -3.090512752532959, + "loss": 1.9905, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.886960983276367, + "rewards/margins": 8.018167495727539, + "rewards/rejected": -30.90513038635254, + "step": 29570 + }, + { + "epoch": 0.9968317098655162, + "grad_norm": 36.931217193603516, + "learning_rate": 3.0577603968406915e-11, + "logits/chosen": -1.0361428260803223, + "logits/rejected": -1.8651416301727295, + "logps/chosen": -2.3662960529327393, + "logps/rejected": -3.014408588409424, + "loss": 2.6455, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -23.662960052490234, + "rewards/margins": 6.481126308441162, + "rewards/rejected": -30.144084930419922, + "step": 29575 + }, + { + "epoch": 0.9970002359364993, + "grad_norm": 77.14857482910156, + "learning_rate": 2.741121057925344e-11, + "logits/chosen": -2.14117169380188, + "logits/rejected": -2.7788901329040527, + "logps/chosen": -3.382768154144287, + "logps/rejected": -4.087404727935791, + "loss": 1.3519, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -33.82768249511719, + "rewards/margins": 7.046361446380615, + "rewards/rejected": -40.874046325683594, + "step": 29580 + }, + { + "epoch": 0.9971687620074826, + "grad_norm": 13.677254676818848, + "learning_rate": 2.4417838004942014e-11, + "logits/chosen": -1.6798818111419678, + "logits/rejected": -1.7711633443832397, + "logps/chosen": -2.3940412998199463, + "logps/rejected": -2.2626118659973145, + "loss": 4.6407, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.940410614013672, + "rewards/margins": -1.3142937421798706, + "rewards/rejected": -22.626117706298828, + "step": 29585 + }, + { + "epoch": 0.9973372880784658, + "grad_norm": 25.177274703979492, + "learning_rate": 2.1597487281366234e-11, + "logits/chosen": -1.2978934049606323, + "logits/rejected": -1.5820646286010742, + "logps/chosen": -2.278775691986084, + "logps/rejected": -2.6102938652038574, + "loss": 1.3518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.78775405883789, + "rewards/margins": 3.315180540084839, + "rewards/rejected": -26.102935791015625, + "step": 29590 + }, + { + "epoch": 0.9975058141494489, + "grad_norm": 17.82845115661621, + "learning_rate": 1.8950159384578666e-11, + "logits/chosen": -1.8338546752929688, + "logits/rejected": -1.8769657611846924, + "logps/chosen": -3.4636001586914062, + "logps/rejected": -3.8861083984375, + "loss": 2.4416, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -34.6359977722168, + "rewards/margins": 4.225088596343994, + "rewards/rejected": -38.861087799072266, + "step": 29595 + }, + { + "epoch": 0.9976743402204321, + "grad_norm": 33.08721923828125, + "learning_rate": 1.6475855230624336e-11, + "logits/chosen": -2.3700110912323, + "logits/rejected": -2.2387986183166504, + "logps/chosen": -2.5337352752685547, + "logps/rejected": -2.4349582195281982, + "loss": 4.4406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -25.337352752685547, + "rewards/margins": -0.9877703785896301, + "rewards/rejected": -24.34958267211914, + "step": 29600 + }, + { + "epoch": 0.9976743402204321, + "eval_logits/chosen": -2.3131020069122314, + "eval_logits/rejected": -2.491314649581909, + "eval_logps/chosen": -2.289489507675171, + "eval_logps/rejected": -2.4444422721862793, + "eval_loss": 3.0876708030700684, + "eval_rewards/accuracies": 0.6299999952316284, + "eval_rewards/chosen": -22.8948917388916, + "eval_rewards/margins": 1.5495290756225586, + "eval_rewards/rejected": -24.44442367553711, + "eval_runtime": 12.8936, + "eval_samples_per_second": 7.756, + "eval_steps_per_second": 1.939, + "step": 29600 + }, + { + "epoch": 0.9978428662914153, + "grad_norm": 327.03582763671875, + "learning_rate": 1.4174575675818256e-11, + "logits/chosen": -1.5383819341659546, + "logits/rejected": -1.8430954217910767, + "logps/chosen": -2.5591301918029785, + "logps/rejected": -2.5529518127441406, + "loss": 3.5187, + "rewards/accuracies": 0.5, + "rewards/chosen": -25.591299057006836, + "rewards/margins": -0.061784934252500534, + "rewards/rejected": -25.529516220092773, + "step": 29605 + }, + { + "epoch": 0.9980113923623984, + "grad_norm": 165.06675720214844, + "learning_rate": 1.2046321516523405e-11, + "logits/chosen": -2.3325538635253906, + "logits/rejected": -2.551406145095825, + "logps/chosen": -3.6366772651672363, + "logps/rejected": -4.427830696105957, + "loss": 2.3673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -36.36676788330078, + "rewards/margins": 7.911537170410156, + "rewards/rejected": -44.2783088684082, + "step": 29610 + }, + { + "epoch": 0.9981799184333816, + "grad_norm": 43.59071350097656, + "learning_rate": 1.0091093489317249e-11, + "logits/chosen": -2.1163182258605957, + "logits/rejected": -2.2026889324188232, + "logps/chosen": -2.3985800743103027, + "logps/rejected": -2.481600522994995, + "loss": 2.8614, + "rewards/accuracies": 0.5, + "rewards/chosen": -23.985801696777344, + "rewards/margins": 0.830204963684082, + "rewards/rejected": -24.81600570678711, + "step": 29615 + }, + { + "epoch": 0.9983484445043648, + "grad_norm": 26.20160484313965, + "learning_rate": 8.308892270714184e-12, + "logits/chosen": -1.830585241317749, + "logits/rejected": -1.8308626413345337, + "logps/chosen": -2.605586290359497, + "logps/rejected": -3.0579631328582764, + "loss": 2.5246, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -26.055866241455078, + "rewards/margins": 4.52376651763916, + "rewards/rejected": -30.579631805419922, + "step": 29620 + }, + { + "epoch": 0.998516970575348, + "grad_norm": 40.348228454589844, + "learning_rate": 6.6997184775541285e-12, + "logits/chosen": -1.8791911602020264, + "logits/rejected": -2.223784923553467, + "logps/chosen": -2.243389368057251, + "logps/rejected": -2.3758978843688965, + "loss": 2.6277, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -22.43389320373535, + "rewards/margins": 1.3250858783721924, + "rewards/rejected": -23.75897979736328, + "step": 29625 + }, + { + "epoch": 0.9986854966463312, + "grad_norm": 37.30038070678711, + "learning_rate": 5.263572666613925e-12, + "logits/chosen": -1.5783147811889648, + "logits/rejected": -1.5388597249984741, + "logps/chosen": -1.9799810647964478, + "logps/rejected": -2.0920217037200928, + "loss": 2.8596, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -19.799808502197266, + "rewards/margins": 1.1204078197479248, + "rewards/rejected": -20.920215606689453, + "step": 29630 + }, + { + "epoch": 0.9988540227173144, + "grad_norm": 56.66443634033203, + "learning_rate": 4.0004553349959335e-12, + "logits/chosen": -2.0255823135375977, + "logits/rejected": -2.4068846702575684, + "logps/chosen": -2.280776262283325, + "logps/rejected": -3.0424695014953613, + "loss": 1.2858, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -22.807762145996094, + "rewards/margins": 7.616931915283203, + "rewards/rejected": -30.424694061279297, + "step": 29635 + }, + { + "epoch": 0.9990225487882975, + "grad_norm": 34.044044494628906, + "learning_rate": 2.910366919739449e-12, + "logits/chosen": -2.4027655124664307, + "logits/rejected": -2.703449010848999, + "logps/chosen": -2.6529898643493652, + "logps/rejected": -3.1543755531311035, + "loss": 3.742, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.529897689819336, + "rewards/margins": 5.013855457305908, + "rewards/rejected": -31.54375648498535, + "step": 29640 + }, + { + "epoch": 0.9991910748592807, + "grad_norm": 37.6712646484375, + "learning_rate": 1.9933077980982537e-12, + "logits/chosen": -1.9718729257583618, + "logits/rejected": -2.110247850418091, + "logps/chosen": -2.6708006858825684, + "logps/rejected": -2.7890660762786865, + "loss": 2.8295, + "rewards/accuracies": 0.5, + "rewards/chosen": -26.7080078125, + "rewards/margins": 1.1826552152633667, + "rewards/rejected": -27.890661239624023, + "step": 29645 + }, + { + "epoch": 0.9993596009302639, + "grad_norm": 75.10658264160156, + "learning_rate": 1.2492782874851115e-12, + "logits/chosen": -2.043384552001953, + "logits/rejected": -2.308835744857788, + "logps/chosen": -2.0701959133148193, + "logps/rejected": -2.082885265350342, + "loss": 3.1115, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.70195960998535, + "rewards/margins": 0.12689141929149628, + "rewards/rejected": -20.8288516998291, + "step": 29650 + }, + { + "epoch": 0.999528127001247, + "grad_norm": 14.825223922729492, + "learning_rate": 6.782786453052303e-13, + "logits/chosen": -1.656032919883728, + "logits/rejected": -2.6159210205078125, + "logps/chosen": -2.042296886444092, + "logps/rejected": -2.4043121337890625, + "loss": 1.6344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -20.422969818115234, + "rewards/margins": 3.6201515197753906, + "rewards/rejected": -24.043121337890625, + "step": 29655 + }, + { + "epoch": 0.9996966530722303, + "grad_norm": 24.87315559387207, + "learning_rate": 2.803090691783083e-13, + "logits/chosen": -1.6609647274017334, + "logits/rejected": -2.110682487487793, + "logps/chosen": -2.726547956466675, + "logps/rejected": -3.436779737472534, + "loss": 1.4057, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -27.26548194885254, + "rewards/margins": 7.102316856384277, + "rewards/rejected": -34.3677978515625, + "step": 29660 + }, + { + "epoch": 0.9998651791432135, + "grad_norm": 33.12513732910156, + "learning_rate": 5.5369696827511915e-14, + "logits/chosen": -2.2672488689422607, + "logits/rejected": -2.2087759971618652, + "logps/chosen": -2.245776414871216, + "logps/rejected": -2.293640613555908, + "loss": 3.5487, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -22.457765579223633, + "rewards/margins": 0.47864046692848206, + "rewards/rejected": -22.936405181884766, + "step": 29665 + } + ], + "logging_steps": 5, + "max_steps": 29669, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}