diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5866 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998027613412229, + "eval_steps": 50000, + "global_step": 1824, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00821827744904668, + "grad_norm": 48.55606780690502, + "learning_rate": 1.358695652173913e-08, + "logits/chosen": 26.16689682006836, + "logits/rejected": 25.511425018310547, + "logps/chosen": -189.36741638183594, + "logps/rejected": -78.73792266845703, + "loss": 1.79, + "rewards/accuracies": 0.2800000011920929, + "rewards/chosen": -0.006983796134591103, + "rewards/margins": 3.662884410005063e-05, + "rewards/rejected": -0.007020425051450729, + "sft_loss": 0.661233127117157, + "step": 5 + }, + { + "epoch": 0.01643655489809336, + "grad_norm": 50.84809399854242, + "learning_rate": 2.717391304347826e-08, + "logits/chosen": 25.634292602539062, + "logits/rejected": 25.165508270263672, + "logps/chosen": -175.30511474609375, + "logps/rejected": -79.45011901855469, + "loss": 1.7672, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -0.01493214163929224, + "rewards/margins": 0.032123688608407974, + "rewards/rejected": -0.04705582931637764, + "sft_loss": 0.6432023644447327, + "step": 10 + }, + { + "epoch": 0.02465483234714004, + "grad_norm": 41.52221293409133, + "learning_rate": 4.076086956521739e-08, + "logits/chosen": 25.897306442260742, + "logits/rejected": 25.234777450561523, + "logps/chosen": -204.5565643310547, + "logps/rejected": -85.37405395507812, + "loss": 1.6603, + "rewards/accuracies": 0.8799999952316284, + "rewards/chosen": -0.028912657871842384, + "rewards/margins": 0.18977542221546173, + "rewards/rejected": -0.21868810057640076, + "sft_loss": 0.7592554688453674, + "step": 15 + }, + { + "epoch": 0.03287310979618672, + "grad_norm": 30.024466917447533, + "learning_rate": 5.434782608695652e-08, + "logits/chosen": 26.472496032714844, + "logits/rejected": 26.013669967651367, + "logps/chosen": -178.9062042236328, + "logps/rejected": -87.18224334716797, + "loss": 1.5519, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -0.11060313880443573, + "rewards/margins": 0.3851660490036011, + "rewards/rejected": -0.495769202709198, + "sft_loss": 0.6785654425621033, + "step": 20 + }, + { + "epoch": 0.041091387245233396, + "grad_norm": 23.574332052101575, + "learning_rate": 6.793478260869565e-08, + "logits/chosen": 26.571308135986328, + "logits/rejected": 26.069765090942383, + "logps/chosen": -204.71995544433594, + "logps/rejected": -95.25181579589844, + "loss": 1.4535, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -0.2731512486934662, + "rewards/margins": 0.7024775743484497, + "rewards/rejected": -0.9756287336349487, + "sft_loss": 0.6605415344238281, + "step": 25 + }, + { + "epoch": 0.04930966469428008, + "grad_norm": 18.127113157576492, + "learning_rate": 8.152173913043478e-08, + "logits/chosen": 26.70085906982422, + "logits/rejected": 26.199695587158203, + "logps/chosen": -189.0041961669922, + "logps/rejected": -95.67135620117188, + "loss": 1.3598, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -0.4376958906650543, + "rewards/margins": 0.9910183548927307, + "rewards/rejected": -1.4287142753601074, + "sft_loss": 0.6798427700996399, + "step": 30 + }, + { + "epoch": 0.05752794214332676, + "grad_norm": 16.856249874916603, + "learning_rate": 9.510869565217392e-08, + "logits/chosen": 27.086894989013672, + "logits/rejected": 26.779054641723633, + "logps/chosen": -202.5185546875, + "logps/rejected": -98.5663070678711, + "loss": 1.2944, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -0.5852899551391602, + "rewards/margins": 1.2753018140792847, + "rewards/rejected": -1.8605915307998657, + "sft_loss": 0.6831802129745483, + "step": 35 + }, + { + "epoch": 0.06574621959237344, + "grad_norm": 15.222314216803584, + "learning_rate": 1.0869565217391303e-07, + "logits/chosen": 26.470937728881836, + "logits/rejected": 26.266651153564453, + "logps/chosen": -185.2868194580078, + "logps/rejected": -96.5091781616211, + "loss": 1.2027, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -0.6554566025733948, + "rewards/margins": 1.4152508974075317, + "rewards/rejected": -2.0707075595855713, + "sft_loss": 0.6970738768577576, + "step": 40 + }, + { + "epoch": 0.07396449704142012, + "grad_norm": 14.365159397400335, + "learning_rate": 1.2228260869565216e-07, + "logits/chosen": 25.881906509399414, + "logits/rejected": 25.525175094604492, + "logps/chosen": -202.46238708496094, + "logps/rejected": -108.43726348876953, + "loss": 1.1328, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.787525475025177, + "rewards/margins": 1.8143333196640015, + "rewards/rejected": -2.6018588542938232, + "sft_loss": 0.6782786846160889, + "step": 45 + }, + { + "epoch": 0.08218277449046679, + "grad_norm": 13.924602084521048, + "learning_rate": 1.358695652173913e-07, + "logits/chosen": 24.610755920410156, + "logits/rejected": 24.408979415893555, + "logps/chosen": -206.8500213623047, + "logps/rejected": -113.42557525634766, + "loss": 1.0599, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -0.9198395609855652, + "rewards/margins": 1.9545520544052124, + "rewards/rejected": -2.874391555786133, + "sft_loss": 0.7132790088653564, + "step": 50 + }, + { + "epoch": 0.09040105193951348, + "grad_norm": 11.972485852637668, + "learning_rate": 1.4945652173913042e-07, + "logits/chosen": 23.996862411499023, + "logits/rejected": 24.392988204956055, + "logps/chosen": -176.3905487060547, + "logps/rejected": -110.62020874023438, + "loss": 1.0223, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -0.8744373321533203, + "rewards/margins": 2.172375440597534, + "rewards/rejected": -3.0468130111694336, + "sft_loss": 0.7045189738273621, + "step": 55 + }, + { + "epoch": 0.09861932938856016, + "grad_norm": 13.242028156676367, + "learning_rate": 1.6304347826086955e-07, + "logits/chosen": 23.04694366455078, + "logits/rejected": 23.079355239868164, + "logps/chosen": -186.1154327392578, + "logps/rejected": -107.23130798339844, + "loss": 1.0046, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -1.0562888383865356, + "rewards/margins": 2.0806047916412354, + "rewards/rejected": -3.1368932723999023, + "sft_loss": 0.6290792226791382, + "step": 60 + }, + { + "epoch": 0.10683760683760683, + "grad_norm": 11.030176141313747, + "learning_rate": 1.766304347826087e-07, + "logits/chosen": 21.996606826782227, + "logits/rejected": 22.384113311767578, + "logps/chosen": -199.38589477539062, + "logps/rejected": -116.91275024414062, + "loss": 0.9338, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.1412394046783447, + "rewards/margins": 2.474609613418579, + "rewards/rejected": -3.615849018096924, + "sft_loss": 0.697711706161499, + "step": 65 + }, + { + "epoch": 0.11505588428665352, + "grad_norm": 11.76117705302215, + "learning_rate": 1.9021739130434784e-07, + "logits/chosen": 22.534835815429688, + "logits/rejected": 23.107168197631836, + "logps/chosen": -216.9481964111328, + "logps/rejected": -129.04183959960938, + "loss": 0.8671, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -1.3997070789337158, + "rewards/margins": 2.9236786365509033, + "rewards/rejected": -4.323385715484619, + "sft_loss": 0.728801965713501, + "step": 70 + }, + { + "epoch": 0.1232741617357002, + "grad_norm": 32.386219318167385, + "learning_rate": 2.0380434782608694e-07, + "logits/chosen": 20.90481948852539, + "logits/rejected": 21.215843200683594, + "logps/chosen": -247.61224365234375, + "logps/rejected": -138.62893676757812, + "loss": 0.8076, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -1.5252928733825684, + "rewards/margins": 3.3426883220672607, + "rewards/rejected": -4.86798095703125, + "sft_loss": 0.7596563696861267, + "step": 75 + }, + { + "epoch": 0.13149243918474687, + "grad_norm": 11.700521911598706, + "learning_rate": 2.1739130434782607e-07, + "logits/chosen": 20.761672973632812, + "logits/rejected": 20.871828079223633, + "logps/chosen": -236.5396728515625, + "logps/rejected": -138.31297302246094, + "loss": 0.842, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -1.9364999532699585, + "rewards/margins": 3.281285047531128, + "rewards/rejected": -5.217784881591797, + "sft_loss": 0.7300873398780823, + "step": 80 + }, + { + "epoch": 0.13971071663379356, + "grad_norm": 11.895414317868761, + "learning_rate": 2.309782608695652e-07, + "logits/chosen": 21.150850296020508, + "logits/rejected": 21.817951202392578, + "logps/chosen": -223.0463104248047, + "logps/rejected": -139.8596954345703, + "loss": 0.7489, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -2.007277250289917, + "rewards/margins": 3.5759541988372803, + "rewards/rejected": -5.5832319259643555, + "sft_loss": 0.7483465075492859, + "step": 85 + }, + { + "epoch": 0.14792899408284024, + "grad_norm": 11.018586570679572, + "learning_rate": 2.445652173913043e-07, + "logits/chosen": 22.40447998046875, + "logits/rejected": 22.448156356811523, + "logps/chosen": -201.39810180664062, + "logps/rejected": -126.50525665283203, + "loss": 0.8269, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -2.1027634143829346, + "rewards/margins": 3.118117332458496, + "rewards/rejected": -5.220880508422852, + "sft_loss": 0.7317149639129639, + "step": 90 + }, + { + "epoch": 0.15614727153188693, + "grad_norm": 9.026135528071627, + "learning_rate": 2.499981493451693e-07, + "logits/chosen": 20.40322494506836, + "logits/rejected": 20.44278907775879, + "logps/chosen": -203.20326232910156, + "logps/rejected": -124.00860595703125, + "loss": 0.8771, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -1.6680656671524048, + "rewards/margins": 3.2214581966400146, + "rewards/rejected": -4.889523983001709, + "sft_loss": 0.7273903489112854, + "step": 95 + }, + { + "epoch": 0.16436554898093358, + "grad_norm": 10.366938012622036, + "learning_rate": 2.499868399863186e-07, + "logits/chosen": 20.907590866088867, + "logits/rejected": 21.92055892944336, + "logps/chosen": -226.97225952148438, + "logps/rejected": -144.5021514892578, + "loss": 0.7676, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -2.1906163692474365, + "rewards/margins": 3.6612253189086914, + "rewards/rejected": -5.851841449737549, + "sft_loss": 0.7680675983428955, + "step": 100 + }, + { + "epoch": 0.17258382642998027, + "grad_norm": 9.779078878164054, + "learning_rate": 2.4996525033926786e-07, + "logits/chosen": 19.350120544433594, + "logits/rejected": 19.718740463256836, + "logps/chosen": -209.20166015625, + "logps/rejected": -136.57321166992188, + "loss": 0.7133, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -2.251823663711548, + "rewards/margins": 3.696510076522827, + "rewards/rejected": -5.948334217071533, + "sft_loss": 0.7179654836654663, + "step": 105 + }, + { + "epoch": 0.18080210387902695, + "grad_norm": 8.45489237540799, + "learning_rate": 2.499333821797864e-07, + "logits/chosen": 20.7148380279541, + "logits/rejected": 20.950342178344727, + "logps/chosen": -197.59976196289062, + "logps/rejected": -124.13175964355469, + "loss": 0.7642, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -2.359647750854492, + "rewards/margins": 3.3463170528411865, + "rewards/rejected": -5.705965042114258, + "sft_loss": 0.7615786790847778, + "step": 110 + }, + { + "epoch": 0.18902038132807364, + "grad_norm": 10.762078567025862, + "learning_rate": 2.4989123812906105e-07, + "logits/chosen": 19.379554748535156, + "logits/rejected": 20.651145935058594, + "logps/chosen": -219.8887176513672, + "logps/rejected": -148.8833770751953, + "loss": 0.7483, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.958165168762207, + "rewards/margins": 3.9372713565826416, + "rewards/rejected": -6.895437240600586, + "sft_loss": 0.7731737494468689, + "step": 115 + }, + { + "epoch": 0.19723865877712032, + "grad_norm": 10.354433872987686, + "learning_rate": 2.498388216534807e-07, + "logits/chosen": 19.773361206054688, + "logits/rejected": 21.142953872680664, + "logps/chosen": -238.31101989746094, + "logps/rejected": -152.0144500732422, + "loss": 0.7063, + "rewards/accuracies": 0.8799999952316284, + "rewards/chosen": -2.7792108058929443, + "rewards/margins": 4.163509845733643, + "rewards/rejected": -6.942720413208008, + "sft_loss": 0.7693167328834534, + "step": 120 + }, + { + "epoch": 0.205456936226167, + "grad_norm": 11.490346482929228, + "learning_rate": 2.49776137064351e-07, + "logits/chosen": 19.508024215698242, + "logits/rejected": 19.62151527404785, + "logps/chosen": -232.81178283691406, + "logps/rejected": -151.69398498535156, + "loss": 0.7148, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.6999313831329346, + "rewards/margins": 3.9598686695098877, + "rewards/rejected": -6.659799575805664, + "sft_loss": 0.8186704516410828, + "step": 125 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 13.390026452837366, + "learning_rate": 2.4970318951754e-07, + "logits/chosen": 19.62987518310547, + "logits/rejected": 20.120250701904297, + "logps/chosen": -247.29205322265625, + "logps/rejected": -159.60348510742188, + "loss": 0.6619, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -2.8834011554718018, + "rewards/margins": 4.369426727294922, + "rewards/rejected": -7.252828598022461, + "sft_loss": 0.7933542728424072, + "step": 130 + }, + { + "epoch": 0.22189349112426035, + "grad_norm": 20.479502968540558, + "learning_rate": 2.496199850130537e-07, + "logits/chosen": 18.90142059326172, + "logits/rejected": 19.151918411254883, + "logps/chosen": -231.70069885253906, + "logps/rejected": -141.99693298339844, + "loss": 0.7109, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -2.819154977798462, + "rewards/margins": 3.806306838989258, + "rewards/rejected": -6.625460624694824, + "sft_loss": 0.7920200228691101, + "step": 135 + }, + { + "epoch": 0.23011176857330704, + "grad_norm": 16.190350556337812, + "learning_rate": 2.4952653039454297e-07, + "logits/chosen": 18.546707153320312, + "logits/rejected": 18.616119384765625, + "logps/chosen": -251.7685089111328, + "logps/rejected": -160.7568817138672, + "loss": 0.703, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -3.2368268966674805, + "rewards/margins": 4.385184288024902, + "rewards/rejected": -7.622011184692383, + "sft_loss": 0.8116011023521423, + "step": 140 + }, + { + "epoch": 0.23833004602235372, + "grad_norm": 14.348906773180857, + "learning_rate": 2.494228333487403e-07, + "logits/chosen": 18.956235885620117, + "logits/rejected": 19.919641494750977, + "logps/chosen": -210.7549591064453, + "logps/rejected": -144.51132202148438, + "loss": 0.6182, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.001668691635132, + "rewards/margins": 3.866687536239624, + "rewards/rejected": -6.868356227874756, + "sft_loss": 0.7950787544250488, + "step": 145 + }, + { + "epoch": 0.2465483234714004, + "grad_norm": 11.009157695890236, + "learning_rate": 2.4930890240482784e-07, + "logits/chosen": 18.876365661621094, + "logits/rejected": 19.30438804626465, + "logps/chosen": -229.18504333496094, + "logps/rejected": -150.90707397460938, + "loss": 0.671, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3249759674072266, + "rewards/margins": 4.095080375671387, + "rewards/rejected": -7.4200568199157715, + "sft_loss": 0.7879451513290405, + "step": 150 + }, + { + "epoch": 0.25476660092044706, + "grad_norm": 11.264576367918604, + "learning_rate": 2.491847469337356e-07, + "logits/chosen": 18.14313316345215, + "logits/rejected": 18.77975082397461, + "logps/chosen": -219.8468780517578, + "logps/rejected": -150.99098205566406, + "loss": 0.6461, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -3.250223159790039, + "rewards/margins": 4.443104267120361, + "rewards/rejected": -7.6933274269104, + "sft_loss": 0.8351505994796753, + "step": 155 + }, + { + "epoch": 0.26298487836949375, + "grad_norm": 17.15390685304222, + "learning_rate": 2.4905037714737094e-07, + "logits/chosen": 19.779348373413086, + "logits/rejected": 19.593463897705078, + "logps/chosen": -259.2501220703125, + "logps/rejected": -162.26368713378906, + "loss": 0.7398, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -3.7065176963806152, + "rewards/margins": 4.470663070678711, + "rewards/rejected": -8.177180290222168, + "sft_loss": 0.8221470713615417, + "step": 160 + }, + { + "epoch": 0.27120315581854043, + "grad_norm": 10.266952014618042, + "learning_rate": 2.489058040977784e-07, + "logits/chosen": 19.731273651123047, + "logits/rejected": 19.947425842285156, + "logps/chosen": -222.83753967285156, + "logps/rejected": -142.3966522216797, + "loss": 0.6633, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9937241077423096, + "rewards/margins": 4.092346668243408, + "rewards/rejected": -7.086071968078613, + "sft_loss": 0.8631803393363953, + "step": 165 + }, + { + "epoch": 0.2794214332675871, + "grad_norm": 13.183734224346434, + "learning_rate": 2.487510396762309e-07, + "logits/chosen": 18.506755828857422, + "logits/rejected": 19.725309371948242, + "logps/chosen": -246.2398223876953, + "logps/rejected": -171.14974975585938, + "loss": 0.6512, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -3.316751480102539, + "rewards/margins": 4.549408912658691, + "rewards/rejected": -7.8661603927612305, + "sft_loss": 0.9392525553703308, + "step": 170 + }, + { + "epoch": 0.2876397107166338, + "grad_norm": 12.820383998338311, + "learning_rate": 2.485860966122514e-07, + "logits/chosen": 18.673315048217773, + "logits/rejected": 19.47124671936035, + "logps/chosen": -239.1477508544922, + "logps/rejected": -168.49923706054688, + "loss": 0.6218, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -3.3230719566345215, + "rewards/margins": 4.699094295501709, + "rewards/rejected": -8.022165298461914, + "sft_loss": 0.8536433577537537, + "step": 175 + }, + { + "epoch": 0.2958579881656805, + "grad_norm": 10.336252791103886, + "learning_rate": 2.484109884725661e-07, + "logits/chosen": 17.68476104736328, + "logits/rejected": 18.92132568359375, + "logps/chosen": -248.71087646484375, + "logps/rejected": -164.80517578125, + "loss": 0.6908, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.7376978397369385, + "rewards/margins": 4.469425678253174, + "rewards/rejected": -8.207123756408691, + "sft_loss": 0.7900984883308411, + "step": 180 + }, + { + "epoch": 0.30407626561472717, + "grad_norm": 9.07674205143479, + "learning_rate": 2.4822572965998844e-07, + "logits/chosen": 17.927953720092773, + "logits/rejected": 18.744905471801758, + "logps/chosen": -256.3652038574219, + "logps/rejected": -169.36451721191406, + "loss": 0.6008, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -3.603369951248169, + "rewards/margins": 4.865907192230225, + "rewards/rejected": -8.469277381896973, + "sft_loss": 0.8645619750022888, + "step": 185 + }, + { + "epoch": 0.31229454306377386, + "grad_norm": 11.293965527732967, + "learning_rate": 2.4803033541223455e-07, + "logits/chosen": 19.39400863647461, + "logits/rejected": 19.796106338500977, + "logps/chosen": -245.06739807128906, + "logps/rejected": -164.10296630859375, + "loss": 0.6281, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -3.5845892429351807, + "rewards/margins": 4.6414408683776855, + "rewards/rejected": -8.226030349731445, + "sft_loss": 0.8358697295188904, + "step": 190 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 11.390930360072153, + "learning_rate": 2.478248218006699e-07, + "logits/chosen": 17.902259826660156, + "logits/rejected": 18.019027709960938, + "logps/chosen": -265.0622253417969, + "logps/rejected": -175.5810546875, + "loss": 0.6158, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -3.9043285846710205, + "rewards/margins": 5.115177154541016, + "rewards/rejected": -9.019506454467773, + "sft_loss": 0.8782904148101807, + "step": 195 + }, + { + "epoch": 0.32873109796186717, + "grad_norm": 52.895489458940915, + "learning_rate": 2.476092057289873e-07, + "logits/chosen": 17.241554260253906, + "logits/rejected": 18.226573944091797, + "logps/chosen": -249.59454345703125, + "logps/rejected": -181.9971923828125, + "loss": 0.6044, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.305534362792969, + "rewards/margins": 5.0905351638793945, + "rewards/rejected": -9.396068572998047, + "sft_loss": 0.9349213242530823, + "step": 200 + }, + { + "epoch": 0.33694937541091385, + "grad_norm": 13.12464260474008, + "learning_rate": 2.473835049318167e-07, + "logits/chosen": 18.299766540527344, + "logits/rejected": 19.57137107849121, + "logps/chosen": -248.37832641601562, + "logps/rejected": -171.3523406982422, + "loss": 0.6532, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -4.206078052520752, + "rewards/margins": 4.699835300445557, + "rewards/rejected": -8.905913352966309, + "sft_loss": 0.9326413869857788, + "step": 205 + }, + { + "epoch": 0.34516765285996054, + "grad_norm": 8.71116895518069, + "learning_rate": 2.4714773797326657e-07, + "logits/chosen": 18.58841896057129, + "logits/rejected": 19.255895614624023, + "logps/chosen": -247.08616638183594, + "logps/rejected": -165.45547485351562, + "loss": 0.6183, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -3.8454854488372803, + "rewards/margins": 4.812742710113525, + "rewards/rejected": -8.658228874206543, + "sft_loss": 0.853776752948761, + "step": 210 + }, + { + "epoch": 0.3533859303090072, + "grad_norm": 17.852596870413777, + "learning_rate": 2.4690192424539663e-07, + "logits/chosen": 18.283300399780273, + "logits/rejected": 19.169416427612305, + "logps/chosen": -241.07122802734375, + "logps/rejected": -173.18699645996094, + "loss": 0.6071, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -4.187161445617676, + "rewards/margins": 5.0552144050598145, + "rewards/rejected": -9.242376327514648, + "sft_loss": 0.8952550292015076, + "step": 215 + }, + { + "epoch": 0.3616042077580539, + "grad_norm": 12.136136465528743, + "learning_rate": 2.466460839666233e-07, + "logits/chosen": 17.772991180419922, + "logits/rejected": 18.684547424316406, + "logps/chosen": -255.16156005859375, + "logps/rejected": -183.1548614501953, + "loss": 0.562, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.293615341186523, + "rewards/margins": 5.553874969482422, + "rewards/rejected": -9.847491264343262, + "sft_loss": 0.8942830562591553, + "step": 220 + }, + { + "epoch": 0.3698224852071006, + "grad_norm": 13.249996024918259, + "learning_rate": 2.463802381800563e-07, + "logits/chosen": 17.9425106048584, + "logits/rejected": 18.508359909057617, + "logps/chosen": -260.12322998046875, + "logps/rejected": -176.5136260986328, + "loss": 0.6343, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -3.9985711574554443, + "rewards/margins": 5.279909133911133, + "rewards/rejected": -9.278480529785156, + "sft_loss": 0.890729546546936, + "step": 225 + }, + { + "epoch": 0.3780407626561473, + "grad_norm": 13.483286780837357, + "learning_rate": 2.461044087517682e-07, + "logits/chosen": 19.322052001953125, + "logits/rejected": 19.914690017700195, + "logps/chosen": -267.1094970703125, + "logps/rejected": -181.53118896484375, + "loss": 0.59, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.28004264831543, + "rewards/margins": 5.2816243171691895, + "rewards/rejected": -9.561667442321777, + "sft_loss": 0.8358654975891113, + "step": 230 + }, + { + "epoch": 0.38625904010519396, + "grad_norm": 10.134479758320998, + "learning_rate": 2.458186183689957e-07, + "logits/chosen": 18.751750946044922, + "logits/rejected": 18.550024032592773, + "logps/chosen": -237.7452392578125, + "logps/rejected": -155.38726806640625, + "loss": 0.6427, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -3.9234371185302734, + "rewards/margins": 4.515294075012207, + "rewards/rejected": -8.438732147216797, + "sft_loss": 0.9805070757865906, + "step": 235 + }, + { + "epoch": 0.39447731755424065, + "grad_norm": 13.771161444519256, + "learning_rate": 2.4552289053827344e-07, + "logits/chosen": 18.025060653686523, + "logits/rejected": 18.463733673095703, + "logps/chosen": -252.61175537109375, + "logps/rejected": -171.77259826660156, + "loss": 0.5599, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -4.3357720375061035, + "rewards/margins": 5.04067325592041, + "rewards/rejected": -9.376445770263672, + "sft_loss": 0.7902787923812866, + "step": 240 + }, + { + "epoch": 0.40269559500328733, + "grad_norm": 9.886456705994728, + "learning_rate": 2.4521724958350093e-07, + "logits/chosen": 18.645158767700195, + "logits/rejected": 19.603240966796875, + "logps/chosen": -239.74526977539062, + "logps/rejected": -162.94131469726562, + "loss": 0.6344, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -4.351040840148926, + "rewards/margins": 4.734447002410889, + "rewards/rejected": -9.085487365722656, + "sft_loss": 0.8848291635513306, + "step": 245 + }, + { + "epoch": 0.410913872452334, + "grad_norm": 16.574947299413026, + "learning_rate": 2.449017206439417e-07, + "logits/chosen": 18.770355224609375, + "logits/rejected": 19.167869567871094, + "logps/chosen": -257.2867431640625, + "logps/rejected": -180.79721069335938, + "loss": 0.5475, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -4.755511283874512, + "rewards/margins": 5.377356052398682, + "rewards/rejected": -10.132868766784668, + "sft_loss": 0.9855692982673645, + "step": 250 + }, + { + "epoch": 0.41913214990138065, + "grad_norm": 15.729142249690554, + "learning_rate": 2.445763296721554e-07, + "logits/chosen": 18.016155242919922, + "logits/rejected": 18.655664443969727, + "logps/chosen": -243.2661590576172, + "logps/rejected": -178.59429931640625, + "loss": 0.6424, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.371219635009766, + "rewards/margins": 5.091875076293945, + "rewards/rejected": -10.463094711303711, + "sft_loss": 1.0052944421768188, + "step": 255 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 14.846371154809418, + "learning_rate": 2.4424110343186345e-07, + "logits/chosen": 18.64227867126465, + "logits/rejected": 19.062152862548828, + "logps/chosen": -241.11070251464844, + "logps/rejected": -167.0811767578125, + "loss": 0.6183, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -3.9312877655029297, + "rewards/margins": 4.8627119064331055, + "rewards/rejected": -8.793999671936035, + "sft_loss": 0.8778759837150574, + "step": 260 + }, + { + "epoch": 0.435568704799474, + "grad_norm": 16.788820590336183, + "learning_rate": 2.4389606949574767e-07, + "logits/chosen": 18.801990509033203, + "logits/rejected": 20.348352432250977, + "logps/chosen": -266.7105407714844, + "logps/rejected": -190.86622619628906, + "loss": 0.5961, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -4.232571601867676, + "rewards/margins": 5.312459945678711, + "rewards/rejected": -9.545029640197754, + "sft_loss": 0.8269821405410767, + "step": 265 + }, + { + "epoch": 0.4437869822485207, + "grad_norm": 9.660029588751273, + "learning_rate": 2.435412562431823e-07, + "logits/chosen": 18.019432067871094, + "logits/rejected": 18.232667922973633, + "logps/chosen": -254.80136108398438, + "logps/rejected": -172.0924835205078, + "loss": 0.547, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -4.478307723999023, + "rewards/margins": 5.105349540710449, + "rewards/rejected": -9.583656311035156, + "sft_loss": 0.8911004662513733, + "step": 270 + }, + { + "epoch": 0.4520052596975674, + "grad_norm": 8.447767610497143, + "learning_rate": 2.4317669285789964e-07, + "logits/chosen": 18.408342361450195, + "logits/rejected": 18.87084197998047, + "logps/chosen": -296.8369445800781, + "logps/rejected": -195.3644561767578, + "loss": 0.5759, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -4.8854217529296875, + "rewards/margins": 5.9024529457092285, + "rewards/rejected": -10.787875175476074, + "sft_loss": 0.8718220591545105, + "step": 275 + }, + { + "epoch": 0.46022353714661407, + "grad_norm": 14.077509009393875, + "learning_rate": 2.428024093255901e-07, + "logits/chosen": 17.676301956176758, + "logits/rejected": 19.232654571533203, + "logps/chosen": -261.8072509765625, + "logps/rejected": -193.81626892089844, + "loss": 0.6028, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -4.590798854827881, + "rewards/margins": 5.75556755065918, + "rewards/rejected": -10.346365928649902, + "sft_loss": 0.8692941069602966, + "step": 280 + }, + { + "epoch": 0.46844181459566075, + "grad_norm": 12.255103077032402, + "learning_rate": 2.424184364314352e-07, + "logits/chosen": 19.874698638916016, + "logits/rejected": 19.855077743530273, + "logps/chosen": -263.8525085449219, + "logps/rejected": -174.5958251953125, + "loss": 0.5687, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.0808610916137695, + "rewards/margins": 5.203913688659668, + "rewards/rejected": -9.284773826599121, + "sft_loss": 0.8956073522567749, + "step": 285 + }, + { + "epoch": 0.47666009204470744, + "grad_norm": 15.082062203409798, + "learning_rate": 2.420248057575761e-07, + "logits/chosen": 17.83322525024414, + "logits/rejected": 17.633359909057617, + "logps/chosen": -278.74298095703125, + "logps/rejected": -181.1900634765625, + "loss": 0.5783, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.548935890197754, + "rewards/margins": 5.899779796600342, + "rewards/rejected": -10.448714256286621, + "sft_loss": 0.8952395915985107, + "step": 290 + }, + { + "epoch": 0.4848783694937541, + "grad_norm": 11.834958728287821, + "learning_rate": 2.416215496805156e-07, + "logits/chosen": 18.121597290039062, + "logits/rejected": 19.50238037109375, + "logps/chosen": -252.4333038330078, + "logps/rejected": -197.94659423828125, + "loss": 0.5665, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.617161273956299, + "rewards/margins": 5.908203125, + "rewards/rejected": -11.52536392211914, + "sft_loss": 0.9183645844459534, + "step": 295 + }, + { + "epoch": 0.4930966469428008, + "grad_norm": 13.662146621659161, + "learning_rate": 2.412087013684552e-07, + "logits/chosen": 16.815900802612305, + "logits/rejected": 17.304187774658203, + "logps/chosen": -276.7563781738281, + "logps/rejected": -191.68553161621094, + "loss": 0.6409, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.5067291259765625, + "rewards/margins": 5.485719680786133, + "rewards/rejected": -10.992449760437012, + "sft_loss": 0.9233679175376892, + "step": 300 + }, + { + "epoch": 0.5013149243918474, + "grad_norm": 12.176993675847571, + "learning_rate": 2.407862947785669e-07, + "logits/chosen": 18.833539962768555, + "logits/rejected": 18.9912109375, + "logps/chosen": -301.635498046875, + "logps/rejected": -204.53671264648438, + "loss": 0.487, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.389955997467041, + "rewards/margins": 6.232929706573486, + "rewards/rejected": -11.622885704040527, + "sft_loss": 0.92539381980896, + "step": 305 + }, + { + "epoch": 0.5095332018408941, + "grad_norm": 8.075422505238562, + "learning_rate": 2.403543646542003e-07, + "logits/chosen": 18.5779972076416, + "logits/rejected": 19.133594512939453, + "logps/chosen": -267.43695068359375, + "logps/rejected": -186.43345642089844, + "loss": 0.6388, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.979398727416992, + "rewards/margins": 5.5010504722595215, + "rewards/rejected": -10.480450630187988, + "sft_loss": 0.9564525485038757, + "step": 310 + }, + { + "epoch": 0.5177514792899408, + "grad_norm": 8.97962168945258, + "learning_rate": 2.39912946522025e-07, + "logits/chosen": 19.53040313720703, + "logits/rejected": 20.46470069885254, + "logps/chosen": -244.89207458496094, + "logps/rejected": -172.9203643798828, + "loss": 0.5741, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -4.482312202453613, + "rewards/margins": 5.123040676116943, + "rewards/rejected": -9.605354309082031, + "sft_loss": 0.9498026371002197, + "step": 315 + }, + { + "epoch": 0.5259697567389875, + "grad_norm": 12.054702965132526, + "learning_rate": 2.3946207668910833e-07, + "logits/chosen": 18.005373001098633, + "logits/rejected": 18.470924377441406, + "logps/chosen": -231.72732543945312, + "logps/rejected": -168.2989044189453, + "loss": 0.5869, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -4.519069671630859, + "rewards/margins": 4.713679313659668, + "rewards/rejected": -9.232749938964844, + "sft_loss": 0.8408420085906982, + "step": 320 + }, + { + "epoch": 0.5341880341880342, + "grad_norm": 25.950655473924865, + "learning_rate": 2.390017922399292e-07, + "logits/chosen": 18.79814910888672, + "logits/rejected": 19.250444412231445, + "logps/chosen": -247.69647216796875, + "logps/rejected": -174.35218811035156, + "loss": 0.6145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.683900833129883, + "rewards/margins": 5.248979568481445, + "rewards/rejected": -9.932881355285645, + "sft_loss": 0.9410896301269531, + "step": 325 + }, + { + "epoch": 0.5424063116370809, + "grad_norm": 10.907505413471052, + "learning_rate": 2.385321310333276e-07, + "logits/chosen": 17.780803680419922, + "logits/rejected": 18.34245491027832, + "logps/chosen": -248.3139190673828, + "logps/rejected": -172.43350219726562, + "loss": 0.6284, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -5.367508411407471, + "rewards/margins": 4.742012977600098, + "rewards/rejected": -10.109521865844727, + "sft_loss": 0.9266583323478699, + "step": 330 + }, + { + "epoch": 0.5506245890861275, + "grad_norm": 29.199966853282145, + "learning_rate": 2.38053131699391e-07, + "logits/chosen": 18.024690628051758, + "logits/rejected": 18.614425659179688, + "logps/chosen": -290.8337707519531, + "logps/rejected": -203.31809997558594, + "loss": 0.5688, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.330504894256592, + "rewards/margins": 6.163724422454834, + "rewards/rejected": -11.49422836303711, + "sft_loss": 0.9595879316329956, + "step": 335 + }, + { + "epoch": 0.5588428665351742, + "grad_norm": 11.416242977585302, + "learning_rate": 2.3756483363627694e-07, + "logits/chosen": 17.60715103149414, + "logits/rejected": 18.161012649536133, + "logps/chosen": -250.91665649414062, + "logps/rejected": -184.4646453857422, + "loss": 0.5981, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.246757984161377, + "rewards/margins": 5.438488006591797, + "rewards/rejected": -10.685246467590332, + "sft_loss": 0.9181762933731079, + "step": 340 + }, + { + "epoch": 0.5670611439842209, + "grad_norm": 9.90321260332983, + "learning_rate": 2.3706727700697226e-07, + "logits/chosen": 17.566362380981445, + "logits/rejected": 18.253488540649414, + "logps/chosen": -284.3514404296875, + "logps/rejected": -193.24594116210938, + "loss": 0.5567, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -4.994836807250977, + "rewards/margins": 5.735879421234131, + "rewards/rejected": -10.73071575164795, + "sft_loss": 1.0169059038162231, + "step": 345 + }, + { + "epoch": 0.5752794214332676, + "grad_norm": 15.546918377467371, + "learning_rate": 2.3656050273598986e-07, + "logits/chosen": 17.2511043548584, + "logits/rejected": 18.237810134887695, + "logps/chosen": -258.5328369140625, + "logps/rejected": -191.0077362060547, + "loss": 0.5363, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -5.097340106964111, + "rewards/margins": 5.559810638427734, + "rewards/rejected": -10.657149314880371, + "sft_loss": 0.8693541884422302, + "step": 350 + }, + { + "epoch": 0.5834976988823143, + "grad_norm": 10.563639895115125, + "learning_rate": 2.3604455250600256e-07, + "logits/chosen": 18.051647186279297, + "logits/rejected": 18.685161590576172, + "logps/chosen": -273.46368408203125, + "logps/rejected": -202.36537170410156, + "loss": 0.516, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.360798358917236, + "rewards/margins": 5.977966785430908, + "rewards/rejected": -11.338766098022461, + "sft_loss": 0.9063312411308289, + "step": 355 + }, + { + "epoch": 0.591715976331361, + "grad_norm": 11.326441657016302, + "learning_rate": 2.3551946875441467e-07, + "logits/chosen": 19.21741485595703, + "logits/rejected": 19.171350479125977, + "logps/chosen": -265.16619873046875, + "logps/rejected": -185.63027954101562, + "loss": 0.586, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.824009895324707, + "rewards/margins": 5.92770528793335, + "rewards/rejected": -10.751714706420898, + "sft_loss": 0.967497706413269, + "step": 360 + }, + { + "epoch": 0.5999342537804077, + "grad_norm": 16.154882276044376, + "learning_rate": 2.3498529466987147e-07, + "logits/chosen": 18.083656311035156, + "logits/rejected": 19.166841506958008, + "logps/chosen": -275.3788146972656, + "logps/rejected": -196.90736389160156, + "loss": 0.6121, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.075117111206055, + "rewards/margins": 6.357577323913574, + "rewards/rejected": -11.432694435119629, + "sft_loss": 0.9689314961433411, + "step": 365 + }, + { + "epoch": 0.6081525312294543, + "grad_norm": 12.069410065037287, + "learning_rate": 2.3444207418870688e-07, + "logits/chosen": 17.682310104370117, + "logits/rejected": 18.865554809570312, + "logps/chosen": -277.48114013671875, + "logps/rejected": -195.44508361816406, + "loss": 0.5471, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -4.911283493041992, + "rewards/margins": 5.8411865234375, + "rewards/rejected": -10.75246810913086, + "sft_loss": 0.8908612728118896, + "step": 370 + }, + { + "epoch": 0.616370808678501, + "grad_norm": 17.941774722560346, + "learning_rate": 2.3388985199132962e-07, + "logits/chosen": 17.635793685913086, + "logits/rejected": 18.530078887939453, + "logps/chosen": -265.6659240722656, + "logps/rejected": -185.41099548339844, + "loss": 0.5578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.487802982330322, + "rewards/margins": 5.8236083984375, + "rewards/rejected": -10.311410903930664, + "sft_loss": 0.8852910399436951, + "step": 375 + }, + { + "epoch": 0.6245890861275477, + "grad_norm": 16.222798143855407, + "learning_rate": 2.3332867349854844e-07, + "logits/chosen": 18.22924041748047, + "logits/rejected": 19.445384979248047, + "logps/chosen": -267.8589172363281, + "logps/rejected": -200.61328125, + "loss": 0.6283, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.089979648590088, + "rewards/margins": 6.0606160163879395, + "rewards/rejected": -11.150596618652344, + "sft_loss": 0.85948646068573, + "step": 380 + }, + { + "epoch": 0.6328073635765944, + "grad_norm": 58.78518201844404, + "learning_rate": 2.3275858486783578e-07, + "logits/chosen": 17.743967056274414, + "logits/rejected": 19.073143005371094, + "logps/chosen": -229.31361389160156, + "logps/rejected": -178.3441162109375, + "loss": 0.5824, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -4.969345569610596, + "rewards/margins": 5.37393045425415, + "rewards/rejected": -10.343276023864746, + "sft_loss": 0.9465056657791138, + "step": 385 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 15.400545086822072, + "learning_rate": 2.321796329895317e-07, + "logits/chosen": 16.995241165161133, + "logits/rejected": 18.397994995117188, + "logps/chosen": -266.69647216796875, + "logps/rejected": -193.65902709960938, + "loss": 0.5813, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.233003616333008, + "rewards/margins": 6.00741720199585, + "rewards/rejected": -11.240421295166016, + "sft_loss": 0.9756826758384705, + "step": 390 + }, + { + "epoch": 0.6492439184746877, + "grad_norm": 11.604457345989609, + "learning_rate": 2.3159186548298688e-07, + "logits/chosen": 16.9737606048584, + "logits/rejected": 18.478750228881836, + "logps/chosen": -257.61419677734375, + "logps/rejected": -194.60252380371094, + "loss": 0.5278, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.19744873046875, + "rewards/margins": 6.024503707885742, + "rewards/rejected": -11.221953392028809, + "sft_loss": 0.972574770450592, + "step": 395 + }, + { + "epoch": 0.6574621959237343, + "grad_norm": 14.695134059357779, + "learning_rate": 2.3099533069264594e-07, + "logits/chosen": 17.685321807861328, + "logits/rejected": 18.130495071411133, + "logps/chosen": -257.6887512207031, + "logps/rejected": -180.2339324951172, + "loss": 0.5419, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -5.080874919891357, + "rewards/margins": 5.387575626373291, + "rewards/rejected": -10.468450546264648, + "sft_loss": 1.00028657913208, + "step": 400 + }, + { + "epoch": 0.665680473372781, + "grad_norm": 14.2588021174925, + "learning_rate": 2.3039007768407098e-07, + "logits/chosen": 17.992835998535156, + "logits/rejected": 18.434703826904297, + "logps/chosen": -278.3475341796875, + "logps/rejected": -196.46011352539062, + "loss": 0.581, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.298067092895508, + "rewards/margins": 6.079626560211182, + "rewards/rejected": -11.377694129943848, + "sft_loss": 0.9695589542388916, + "step": 405 + }, + { + "epoch": 0.6738987508218277, + "grad_norm": 14.653004208659825, + "learning_rate": 2.2977615623990603e-07, + "logits/chosen": 18.65854263305664, + "logits/rejected": 19.244489669799805, + "logps/chosen": -263.1656188964844, + "logps/rejected": -193.50169372558594, + "loss": 0.555, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.245527267456055, + "rewards/margins": 5.687096118927002, + "rewards/rejected": -10.932621955871582, + "sft_loss": 0.9538100957870483, + "step": 410 + }, + { + "epoch": 0.6821170282708744, + "grad_norm": 16.632773914957095, + "learning_rate": 2.2915361685578235e-07, + "logits/chosen": 18.390525817871094, + "logits/rejected": 19.31244468688965, + "logps/chosen": -259.29205322265625, + "logps/rejected": -189.3291015625, + "loss": 0.5501, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.290169715881348, + "rewards/margins": 5.542262077331543, + "rewards/rejected": -10.83243179321289, + "sft_loss": 0.9607923030853271, + "step": 415 + }, + { + "epoch": 0.6903353057199211, + "grad_norm": 14.010413486772263, + "learning_rate": 2.2852251073616503e-07, + "logits/chosen": 17.323869705200195, + "logits/rejected": 18.94650650024414, + "logps/chosen": -282.4395751953125, + "logps/rejected": -215.9941864013672, + "loss": 0.4948, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.772212505340576, + "rewards/margins": 6.878769397735596, + "rewards/rejected": -12.650981903076172, + "sft_loss": 0.993140697479248, + "step": 420 + }, + { + "epoch": 0.6985535831689678, + "grad_norm": 14.508340310090572, + "learning_rate": 2.2788288979014132e-07, + "logits/chosen": 18.25994300842285, + "logits/rejected": 19.41350555419922, + "logps/chosen": -279.428955078125, + "logps/rejected": -197.93687438964844, + "loss": 0.5473, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -5.4432454109191895, + "rewards/margins": 5.909384250640869, + "rewards/rejected": -11.352629661560059, + "sft_loss": 0.9294517040252686, + "step": 425 + }, + { + "epoch": 0.7067718606180144, + "grad_norm": 15.828121421000128, + "learning_rate": 2.2723480662715134e-07, + "logits/chosen": 17.447628021240234, + "logits/rejected": 18.819887161254883, + "logps/chosen": -253.06153869628906, + "logps/rejected": -190.72598266601562, + "loss": 0.5712, + "rewards/accuracies": 0.8899999856948853, + "rewards/chosen": -5.495950698852539, + "rewards/margins": 5.677833080291748, + "rewards/rejected": -11.173783302307129, + "sft_loss": 1.0165560245513916, + "step": 430 + }, + { + "epoch": 0.7149901380670611, + "grad_norm": 21.070659832772854, + "learning_rate": 2.2657831455266063e-07, + "logits/chosen": 19.03611946105957, + "logits/rejected": 19.757238388061523, + "logps/chosen": -281.93084716796875, + "logps/rejected": -194.18865966796875, + "loss": 0.6137, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -5.303485870361328, + "rewards/margins": 5.8611884117126465, + "rewards/rejected": -11.164673805236816, + "sft_loss": 1.0157676935195923, + "step": 435 + }, + { + "epoch": 0.7232084155161078, + "grad_norm": 10.044668338093802, + "learning_rate": 2.2591346756377588e-07, + "logits/chosen": 18.8349666595459, + "logits/rejected": 19.587926864624023, + "logps/chosen": -245.26052856445312, + "logps/rejected": -174.76011657714844, + "loss": 0.5325, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -4.77711820602417, + "rewards/margins": 5.245749473571777, + "rewards/rejected": -10.022867202758789, + "sft_loss": 0.9105268120765686, + "step": 440 + }, + { + "epoch": 0.7314266929651545, + "grad_norm": 13.114453854538773, + "learning_rate": 2.252403203448034e-07, + "logits/chosen": 19.10161781311035, + "logits/rejected": 20.04970932006836, + "logps/chosen": -325.4466552734375, + "logps/rejected": -227.55043029785156, + "loss": 0.5582, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.897343635559082, + "rewards/margins": 6.912624359130859, + "rewards/rejected": -12.809967994689941, + "sft_loss": 0.9535994529724121, + "step": 445 + }, + { + "epoch": 0.7396449704142012, + "grad_norm": 12.5969825666755, + "learning_rate": 2.2455892826275155e-07, + "logits/chosen": 18.5415096282959, + "logits/rejected": 19.55573844909668, + "logps/chosen": -302.2394714355469, + "logps/rejected": -217.98895263671875, + "loss": 0.5556, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.171204090118408, + "rewards/margins": 6.812131881713867, + "rewards/rejected": -12.9833345413208, + "sft_loss": 0.9671850800514221, + "step": 450 + }, + { + "epoch": 0.7478632478632479, + "grad_norm": 11.483896112432117, + "learning_rate": 2.2386934736277666e-07, + "logits/chosen": 18.071735382080078, + "logits/rejected": 19.025733947753906, + "logps/chosen": -237.59962463378906, + "logps/rejected": -185.32635498046875, + "loss": 0.577, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -5.7157416343688965, + "rewards/margins": 5.618371963500977, + "rewards/rejected": -11.334112167358398, + "sft_loss": 0.9591123461723328, + "step": 455 + }, + { + "epoch": 0.7560815253122946, + "grad_norm": 13.120210730356671, + "learning_rate": 2.2317163436357317e-07, + "logits/chosen": 16.842187881469727, + "logits/rejected": 18.437271118164062, + "logps/chosen": -282.98541259765625, + "logps/rejected": -213.07257080078125, + "loss": 0.5363, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.7529802322387695, + "rewards/margins": 6.836727142333984, + "rewards/rejected": -12.589707374572754, + "sft_loss": 0.9440767168998718, + "step": 460 + }, + { + "epoch": 0.7642998027613412, + "grad_norm": 12.516354265498741, + "learning_rate": 2.2246584665270855e-07, + "logits/chosen": 18.161880493164062, + "logits/rejected": 19.371177673339844, + "logps/chosen": -298.9051513671875, + "logps/rejected": -213.79953002929688, + "loss": 0.4837, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.515788555145264, + "rewards/margins": 6.570387363433838, + "rewards/rejected": -12.086176872253418, + "sft_loss": 0.9586593508720398, + "step": 465 + }, + { + "epoch": 0.7725180802103879, + "grad_norm": 11.048153129151439, + "learning_rate": 2.2175204228190308e-07, + "logits/chosen": 18.859655380249023, + "logits/rejected": 20.116731643676758, + "logps/chosen": -261.10186767578125, + "logps/rejected": -194.5068817138672, + "loss": 0.6008, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.460696220397949, + "rewards/margins": 5.922670841217041, + "rewards/rejected": -11.383367538452148, + "sft_loss": 0.9851782321929932, + "step": 470 + }, + { + "epoch": 0.7807363576594346, + "grad_norm": 34.036831132798504, + "learning_rate": 2.2103027996225512e-07, + "logits/chosen": 17.431440353393555, + "logits/rejected": 18.033245086669922, + "logps/chosen": -278.5311584472656, + "logps/rejected": -198.3171844482422, + "loss": 0.5997, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -5.360807418823242, + "rewards/margins": 6.381589412689209, + "rewards/rejected": -11.74239730834961, + "sft_loss": 1.0034022331237793, + "step": 475 + }, + { + "epoch": 0.7889546351084813, + "grad_norm": 14.859702493359293, + "learning_rate": 2.2030061905941193e-07, + "logits/chosen": 18.73612403869629, + "logits/rejected": 18.83433723449707, + "logps/chosen": -264.3339538574219, + "logps/rejected": -190.15017700195312, + "loss": 0.5072, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.439321517944336, + "rewards/margins": 5.989686489105225, + "rewards/rejected": -11.429006576538086, + "sft_loss": 0.9705156087875366, + "step": 480 + }, + { + "epoch": 0.797172912557528, + "grad_norm": 10.75919165569494, + "learning_rate": 2.1956311958868684e-07, + "logits/chosen": 19.243186950683594, + "logits/rejected": 19.267446517944336, + "logps/chosen": -267.3321228027344, + "logps/rejected": -196.00926208496094, + "loss": 0.4832, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.953473091125488, + "rewards/margins": 5.860842227935791, + "rewards/rejected": -11.814314842224121, + "sft_loss": 0.9466427564620972, + "step": 485 + }, + { + "epoch": 0.8053911900065747, + "grad_norm": 17.23206010012729, + "learning_rate": 2.1881784221012307e-07, + "logits/chosen": 17.544191360473633, + "logits/rejected": 18.491127014160156, + "logps/chosen": -250.6893768310547, + "logps/rejected": -189.68630981445312, + "loss": 0.5522, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -6.418759822845459, + "rewards/margins": 5.677851676940918, + "rewards/rejected": -12.096611022949219, + "sft_loss": 1.0340924263000488, + "step": 490 + }, + { + "epoch": 0.8136094674556213, + "grad_norm": 14.38672703795697, + "learning_rate": 2.1806484822350417e-07, + "logits/chosen": 17.07558250427246, + "logits/rejected": 17.701539993286133, + "logps/chosen": -301.8546142578125, + "logps/rejected": -211.86402893066406, + "loss": 0.511, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.485326766967773, + "rewards/margins": 6.297828197479248, + "rewards/rejected": -12.78315544128418, + "sft_loss": 1.0085182189941406, + "step": 495 + }, + { + "epoch": 0.821827744904668, + "grad_norm": 11.220505543423183, + "learning_rate": 2.1730419956331215e-07, + "logits/chosen": 17.45648956298828, + "logits/rejected": 18.378616333007812, + "logps/chosen": -281.8039245605469, + "logps/rejected": -211.0707550048828, + "loss": 0.4967, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.824225425720215, + "rewards/margins": 6.649372577667236, + "rewards/rejected": -12.47359848022461, + "sft_loss": 0.9624088406562805, + "step": 500 + }, + { + "epoch": 0.8300460223537146, + "grad_norm": 19.974838378014, + "learning_rate": 2.1653595879363335e-07, + "logits/chosen": 18.410470962524414, + "logits/rejected": 18.558494567871094, + "logps/chosen": -267.88653564453125, + "logps/rejected": -197.4770050048828, + "loss": 0.5762, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.227014541625977, + "rewards/margins": 6.174468517303467, + "rewards/rejected": -12.401481628417969, + "sft_loss": 0.9929137229919434, + "step": 505 + }, + { + "epoch": 0.8382642998027613, + "grad_norm": 20.289642932843638, + "learning_rate": 2.1576018910301238e-07, + "logits/chosen": 18.445819854736328, + "logits/rejected": 18.456052780151367, + "logps/chosen": -268.7127990722656, + "logps/rejected": -191.65673828125, + "loss": 0.5308, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.160595893859863, + "rewards/margins": 5.674745559692383, + "rewards/rejected": -11.835343360900879, + "sft_loss": 0.9606292843818665, + "step": 510 + }, + { + "epoch": 0.846482577251808, + "grad_norm": 12.060714182430129, + "learning_rate": 2.1497695429925497e-07, + "logits/chosen": 17.933076858520508, + "logits/rejected": 18.939220428466797, + "logps/chosen": -267.7327575683594, + "logps/rejected": -197.41754150390625, + "loss": 0.5127, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -5.445801258087158, + "rewards/margins": 6.1840291023254395, + "rewards/rejected": -11.629830360412598, + "sft_loss": 0.8621335029602051, + "step": 515 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 10.501846825508975, + "learning_rate": 2.1418631880417954e-07, + "logits/chosen": 17.952999114990234, + "logits/rejected": 19.42998504638672, + "logps/chosen": -270.5357360839844, + "logps/rejected": -212.4191436767578, + "loss": 0.5705, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.491232872009277, + "rewards/margins": 6.157339096069336, + "rewards/rejected": -12.648571968078613, + "sft_loss": 1.0165194272994995, + "step": 520 + }, + { + "epoch": 0.8629191321499013, + "grad_norm": 38.938347224135214, + "learning_rate": 2.1338834764831843e-07, + "logits/chosen": 18.03480339050293, + "logits/rejected": 18.895524978637695, + "logps/chosen": -288.3295593261719, + "logps/rejected": -212.9174041748047, + "loss": 0.5076, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.212762355804443, + "rewards/margins": 6.556905746459961, + "rewards/rejected": -12.769665718078613, + "sft_loss": 1.0657466650009155, + "step": 525 + }, + { + "epoch": 0.871137409598948, + "grad_norm": 23.662606552485556, + "learning_rate": 2.125831064655693e-07, + "logits/chosen": 18.570951461791992, + "logits/rejected": 19.01372528076172, + "logps/chosen": -299.0896911621094, + "logps/rejected": -218.2689666748047, + "loss": 0.4869, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.196591377258301, + "rewards/margins": 6.7210693359375, + "rewards/rejected": -12.9176607131958, + "sft_loss": 1.0185062885284424, + "step": 530 + }, + { + "epoch": 0.8793556870479947, + "grad_norm": 19.788570154737137, + "learning_rate": 2.1177066148779655e-07, + "logits/chosen": 18.860197067260742, + "logits/rejected": 19.767044067382812, + "logps/chosen": -318.2361755371094, + "logps/rejected": -226.54783630371094, + "loss": 0.5328, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.095911979675293, + "rewards/margins": 7.498478412628174, + "rewards/rejected": -13.594389915466309, + "sft_loss": 0.9245139360427856, + "step": 535 + }, + { + "epoch": 0.8875739644970414, + "grad_norm": 9.861201904757298, + "learning_rate": 2.1095107953938348e-07, + "logits/chosen": 18.201683044433594, + "logits/rejected": 18.54186248779297, + "logps/chosen": -252.76708984375, + "logps/rejected": -189.79519653320312, + "loss": 0.491, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.304187774658203, + "rewards/margins": 5.595078945159912, + "rewards/rejected": -11.899266242980957, + "sft_loss": 1.0021482706069946, + "step": 540 + }, + { + "epoch": 0.8957922419460881, + "grad_norm": 12.854026542061266, + "learning_rate": 2.1012442803173634e-07, + "logits/chosen": 16.392040252685547, + "logits/rejected": 18.43426513671875, + "logps/chosen": -268.9873962402344, + "logps/rejected": -213.36622619628906, + "loss": 0.452, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.529672622680664, + "rewards/margins": 6.670236110687256, + "rewards/rejected": -13.199908256530762, + "sft_loss": 1.0502568483352661, + "step": 545 + }, + { + "epoch": 0.9040105193951348, + "grad_norm": 14.317934082382363, + "learning_rate": 2.0929077495773927e-07, + "logits/chosen": 17.196094512939453, + "logits/rejected": 18.512819290161133, + "logps/chosen": -301.5859375, + "logps/rejected": -215.9300994873047, + "loss": 0.5177, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.289539813995361, + "rewards/margins": 7.147468090057373, + "rewards/rejected": -13.43700885772705, + "sft_loss": 1.052231788635254, + "step": 550 + }, + { + "epoch": 0.9122287968441815, + "grad_norm": 13.793660373919764, + "learning_rate": 2.0845018888616212e-07, + "logits/chosen": 17.761926651000977, + "logits/rejected": 18.349868774414062, + "logps/chosen": -275.8336486816406, + "logps/rejected": -202.1535186767578, + "loss": 0.4794, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -5.62368106842041, + "rewards/margins": 6.281108856201172, + "rewards/rejected": -11.904790878295898, + "sft_loss": 0.9447892904281616, + "step": 555 + }, + { + "epoch": 0.9204470742932281, + "grad_norm": 13.501353742225147, + "learning_rate": 2.0760273895602037e-07, + "logits/chosen": 17.632814407348633, + "logits/rejected": 17.65854263305664, + "logps/chosen": -254.25704956054688, + "logps/rejected": -177.63784790039062, + "loss": 0.5335, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.114619255065918, + "rewards/margins": 5.592235565185547, + "rewards/rejected": -10.706855773925781, + "sft_loss": 0.9995157718658447, + "step": 560 + }, + { + "epoch": 0.9286653517422748, + "grad_norm": 19.535542998103256, + "learning_rate": 2.0674849487088864e-07, + "logits/chosen": 18.379846572875977, + "logits/rejected": 19.475313186645508, + "logps/chosen": -249.86785888671875, + "logps/rejected": -187.93824768066406, + "loss": 0.5958, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.827848434448242, + "rewards/margins": 5.467617034912109, + "rewards/rejected": -11.295466423034668, + "sft_loss": 0.9322109222412109, + "step": 565 + }, + { + "epoch": 0.9368836291913215, + "grad_norm": 25.195757238729385, + "learning_rate": 2.0588752689316723e-07, + "logits/chosen": 18.46122169494629, + "logits/rejected": 18.586881637573242, + "logps/chosen": -286.5140075683594, + "logps/rejected": -202.23248291015625, + "loss": 0.5319, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.130897045135498, + "rewards/margins": 6.1991753578186035, + "rewards/rejected": -12.330072402954102, + "sft_loss": 0.924500048160553, + "step": 570 + }, + { + "epoch": 0.9451019066403682, + "grad_norm": 14.694663908634908, + "learning_rate": 2.0501990583830315e-07, + "logits/chosen": 17.5371036529541, + "logits/rejected": 18.469070434570312, + "logps/chosen": -274.0564270019531, + "logps/rejected": -211.01268005371094, + "loss": 0.4981, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.664008617401123, + "rewards/margins": 6.217647552490234, + "rewards/rejected": -12.8816556930542, + "sft_loss": 1.0239460468292236, + "step": 575 + }, + { + "epoch": 0.9533201840894149, + "grad_norm": 8.507356630817076, + "learning_rate": 2.0414570306896536e-07, + "logits/chosen": 17.411376953125, + "logits/rejected": 18.47208023071289, + "logps/chosen": -295.3019714355469, + "logps/rejected": -213.13792419433594, + "loss": 0.5512, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -6.6735124588012695, + "rewards/margins": 6.6261305809021, + "rewards/rejected": -13.299642562866211, + "sft_loss": 1.529820442199707, + "step": 580 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 25.681414018757476, + "learning_rate": 2.0326499048917527e-07, + "logits/chosen": 17.31963348388672, + "logits/rejected": 18.280134201049805, + "logps/chosen": -282.2524108886719, + "logps/rejected": -218.47996520996094, + "loss": 0.5755, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.157464981079102, + "rewards/margins": 6.622015953063965, + "rewards/rejected": -13.779480934143066, + "sft_loss": 0.9510271549224854, + "step": 585 + }, + { + "epoch": 0.9697567389875082, + "grad_norm": 13.878204470039535, + "learning_rate": 2.023778405383925e-07, + "logits/chosen": 18.141050338745117, + "logits/rejected": 18.204177856445312, + "logps/chosen": -273.6821594238281, + "logps/rejected": -200.89984130859375, + "loss": 0.4418, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -5.963834762573242, + "rewards/margins": 6.579600811004639, + "rewards/rejected": -12.543435096740723, + "sft_loss": 0.9940951466560364, + "step": 590 + }, + { + "epoch": 0.9779750164365549, + "grad_norm": 11.452199407752436, + "learning_rate": 2.0148432618555651e-07, + "logits/chosen": 18.627866744995117, + "logits/rejected": 18.42972755432129, + "logps/chosen": -258.9418029785156, + "logps/rejected": -185.6231231689453, + "loss": 0.5262, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -5.471505165100098, + "rewards/margins": 5.772936820983887, + "rewards/rejected": -11.244441032409668, + "sft_loss": 0.9383735060691833, + "step": 595 + }, + { + "epoch": 0.9861932938856016, + "grad_norm": 11.942794396918284, + "learning_rate": 2.005845209230851e-07, + "logits/chosen": 18.03531265258789, + "logits/rejected": 18.720346450805664, + "logps/chosen": -292.6284484863281, + "logps/rejected": -217.44017028808594, + "loss": 0.5167, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.853020191192627, + "rewards/margins": 6.340816497802734, + "rewards/rejected": -13.193839073181152, + "sft_loss": 1.0825438499450684, + "step": 600 + }, + { + "epoch": 0.9944115713346483, + "grad_norm": 11.995957867465538, + "learning_rate": 1.9967849876082937e-07, + "logits/chosen": 16.612958908081055, + "logits/rejected": 17.676807403564453, + "logps/chosen": -290.99993896484375, + "logps/rejected": -217.08941650390625, + "loss": 0.5367, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.961750030517578, + "rewards/margins": 6.5437798500061035, + "rewards/rejected": -13.505529403686523, + "sft_loss": 1.0639195442199707, + "step": 605 + }, + { + "epoch": 1.0026298487836949, + "grad_norm": 10.297644271924568, + "learning_rate": 1.9876633421998652e-07, + "logits/chosen": 17.37873649597168, + "logits/rejected": 18.0369815826416, + "logps/chosen": -277.8174133300781, + "logps/rejected": -203.3291473388672, + "loss": 0.4734, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.353253364562988, + "rewards/margins": 6.258001804351807, + "rewards/rejected": -12.611254692077637, + "sft_loss": 0.9542250037193298, + "step": 610 + }, + { + "epoch": 1.0108481262327416, + "grad_norm": 11.471429971847657, + "learning_rate": 1.9784810232697024e-07, + "logits/chosen": 17.6014461517334, + "logits/rejected": 18.502716064453125, + "logps/chosen": -295.8468017578125, + "logps/rejected": -225.82949829101562, + "loss": 0.4473, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.305618762969971, + "rewards/margins": 7.557163238525391, + "rewards/rejected": -13.862781524658203, + "sft_loss": 0.9756129384040833, + "step": 615 + }, + { + "epoch": 1.0190664036817882, + "grad_norm": 14.22423049629626, + "learning_rate": 1.969238786072398e-07, + "logits/chosen": 17.072832107543945, + "logits/rejected": 17.857742309570312, + "logps/chosen": -318.9200134277344, + "logps/rejected": -236.0108184814453, + "loss": 0.423, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.207548141479492, + "rewards/margins": 7.39101505279541, + "rewards/rejected": -14.598563194274902, + "sft_loss": 0.9570875763893127, + "step": 620 + }, + { + "epoch": 1.027284681130835, + "grad_norm": 14.863752308544749, + "learning_rate": 1.9599373907908803e-07, + "logits/chosen": 18.62479591369629, + "logits/rejected": 19.332067489624023, + "logps/chosen": -311.5079650878906, + "logps/rejected": -230.38861083984375, + "loss": 0.4746, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.520875453948975, + "rewards/margins": 7.087317943572998, + "rewards/rejected": -14.608192443847656, + "sft_loss": 1.0305228233337402, + "step": 625 + }, + { + "epoch": 1.0355029585798816, + "grad_norm": 11.389098298703924, + "learning_rate": 1.9505776024738873e-07, + "logits/chosen": 17.646556854248047, + "logits/rejected": 18.52758026123047, + "logps/chosen": -267.45611572265625, + "logps/rejected": -202.84034729003906, + "loss": 0.494, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.995048999786377, + "rewards/margins": 5.844033241271973, + "rewards/rejected": -12.839081764221191, + "sft_loss": 1.0837846994400024, + "step": 630 + }, + { + "epoch": 1.0437212360289283, + "grad_norm": 17.383619355827555, + "learning_rate": 1.9411601909730397e-07, + "logits/chosen": 16.90384292602539, + "logits/rejected": 17.69657325744629, + "logps/chosen": -276.2812805175781, + "logps/rejected": -210.5614471435547, + "loss": 0.5568, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.192663669586182, + "rewards/margins": 6.900697231292725, + "rewards/rejected": -13.093358993530273, + "sft_loss": 1.2382417917251587, + "step": 635 + }, + { + "epoch": 1.051939513477975, + "grad_norm": 15.094044445712935, + "learning_rate": 1.9316859308795215e-07, + "logits/chosen": 16.81202507019043, + "logits/rejected": 18.695880889892578, + "logps/chosen": -257.9354553222656, + "logps/rejected": -203.78866577148438, + "loss": 0.5268, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.167855262756348, + "rewards/margins": 6.644321441650391, + "rewards/rejected": -12.812177658081055, + "sft_loss": 1.173020839691162, + "step": 640 + }, + { + "epoch": 1.0601577909270217, + "grad_norm": 15.863163074258626, + "learning_rate": 1.9221556014603674e-07, + "logits/chosen": 16.538555145263672, + "logits/rejected": 18.44594955444336, + "logps/chosen": -299.3294982910156, + "logps/rejected": -236.79315185546875, + "loss": 0.4933, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.601771354675293, + "rewards/margins": 7.276884078979492, + "rewards/rejected": -14.878654479980469, + "sft_loss": 1.1147685050964355, + "step": 645 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 12.95009158653796, + "learning_rate": 1.9125699865943696e-07, + "logits/chosen": 17.819013595581055, + "logits/rejected": 18.056425094604492, + "logps/chosen": -280.44134521484375, + "logps/rejected": -211.0347900390625, + "loss": 0.4992, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -6.4677534103393555, + "rewards/margins": 6.797198295593262, + "rewards/rejected": -13.26495361328125, + "sft_loss": 1.0369815826416016, + "step": 650 + }, + { + "epoch": 1.076594345825115, + "grad_norm": 9.53030890727526, + "learning_rate": 1.9029298747076e-07, + "logits/chosen": 18.56303596496582, + "logits/rejected": 19.128713607788086, + "logps/chosen": -301.52069091796875, + "logps/rejected": -222.11752319335938, + "loss": 0.4653, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.511043071746826, + "rewards/margins": 7.3326520919799805, + "rewards/rejected": -13.843696594238281, + "sft_loss": 1.039981722831726, + "step": 655 + }, + { + "epoch": 1.0848126232741617, + "grad_norm": 12.49460951335956, + "learning_rate": 1.893236058708565e-07, + "logits/chosen": 17.331298828125, + "logits/rejected": 18.1816463470459, + "logps/chosen": -290.297607421875, + "logps/rejected": -212.6442413330078, + "loss": 0.4897, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.593270301818848, + "rewards/margins": 6.5445356369018555, + "rewards/rejected": -13.137805938720703, + "sft_loss": 1.0305876731872559, + "step": 660 + }, + { + "epoch": 1.0930309007232084, + "grad_norm": 10.084660494140396, + "learning_rate": 1.8834893359229839e-07, + "logits/chosen": 17.249683380126953, + "logits/rejected": 18.377492904663086, + "logps/chosen": -317.7668151855469, + "logps/rejected": -234.8712158203125, + "loss": 0.4925, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.917696952819824, + "rewards/margins": 7.316926956176758, + "rewards/rejected": -14.234623908996582, + "sft_loss": 1.0477817058563232, + "step": 665 + }, + { + "epoch": 1.101249178172255, + "grad_norm": 11.370135962731284, + "learning_rate": 1.8736905080282117e-07, + "logits/chosen": 17.393232345581055, + "logits/rejected": 18.21647071838379, + "logps/chosen": -291.6396789550781, + "logps/rejected": -215.71307373046875, + "loss": 0.5118, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.400353908538818, + "rewards/margins": 6.503895282745361, + "rewards/rejected": -12.904250144958496, + "sft_loss": 1.0789752006530762, + "step": 670 + }, + { + "epoch": 1.1094674556213018, + "grad_norm": 14.128398069389478, + "learning_rate": 1.8638403809872988e-07, + "logits/chosen": 18.000486373901367, + "logits/rejected": 19.02123260498047, + "logps/chosen": -238.9346923828125, + "logps/rejected": -187.83901977539062, + "loss": 0.4881, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -5.991827011108398, + "rewards/margins": 6.166553974151611, + "rewards/rejected": -12.158380508422852, + "sft_loss": 1.0633037090301514, + "step": 675 + }, + { + "epoch": 1.1176857330703485, + "grad_norm": 10.039232848979895, + "learning_rate": 1.8539397649826993e-07, + "logits/chosen": 17.416231155395508, + "logits/rejected": 18.53554344177246, + "logps/chosen": -271.6786193847656, + "logps/rejected": -208.55459594726562, + "loss": 0.4408, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.946457386016846, + "rewards/margins": 6.493756294250488, + "rewards/rejected": -13.440213203430176, + "sft_loss": 1.0465832948684692, + "step": 680 + }, + { + "epoch": 1.1259040105193951, + "grad_norm": 17.7290983481912, + "learning_rate": 1.8439894743496336e-07, + "logits/chosen": 17.006452560424805, + "logits/rejected": 17.804595947265625, + "logps/chosen": -289.0384826660156, + "logps/rejected": -228.98916625976562, + "loss": 0.464, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.2547478675842285, + "rewards/margins": 7.524634838104248, + "rewards/rejected": -14.779382705688477, + "sft_loss": 1.0623209476470947, + "step": 685 + }, + { + "epoch": 1.1341222879684418, + "grad_norm": 15.995020113178853, + "learning_rate": 1.8339903275091085e-07, + "logits/chosen": 17.363964080810547, + "logits/rejected": 18.096250534057617, + "logps/chosen": -313.4389343261719, + "logps/rejected": -239.9541015625, + "loss": 0.4292, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.249270439147949, + "rewards/margins": 7.737963676452637, + "rewards/rejected": -14.987234115600586, + "sft_loss": 1.1172467470169067, + "step": 690 + }, + { + "epoch": 1.1423405654174885, + "grad_norm": 10.290356468777885, + "learning_rate": 1.8239431469006e-07, + "logits/chosen": 16.6265811920166, + "logits/rejected": 18.333799362182617, + "logps/chosen": -268.6365966796875, + "logps/rejected": -221.0557098388672, + "loss": 0.4627, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -6.95206356048584, + "rewards/margins": 7.242475986480713, + "rewards/rejected": -14.194538116455078, + "sft_loss": 1.2080581188201904, + "step": 695 + }, + { + "epoch": 1.1505588428665352, + "grad_norm": 12.079608347733119, + "learning_rate": 1.8138487589144093e-07, + "logits/chosen": 16.631559371948242, + "logits/rejected": 16.87362289428711, + "logps/chosen": -273.40997314453125, + "logps/rejected": -210.4160614013672, + "loss": 0.5063, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.272107124328613, + "rewards/margins": 6.501527786254883, + "rewards/rejected": -13.77363395690918, + "sft_loss": 1.0478310585021973, + "step": 700 + }, + { + "epoch": 1.1587771203155819, + "grad_norm": 17.778097749378432, + "learning_rate": 1.8037079938236894e-07, + "logits/chosen": 17.234224319458008, + "logits/rejected": 18.432863235473633, + "logps/chosen": -281.38458251953125, + "logps/rejected": -223.9882049560547, + "loss": 0.4823, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.636561870574951, + "rewards/margins": 7.072784423828125, + "rewards/rejected": -14.709345817565918, + "sft_loss": 0.9729472398757935, + "step": 705 + }, + { + "epoch": 1.1669953977646286, + "grad_norm": 13.760102505142987, + "learning_rate": 1.793521685716154e-07, + "logits/chosen": 17.158409118652344, + "logits/rejected": 18.147829055786133, + "logps/chosen": -339.1050720214844, + "logps/rejected": -257.5541687011719, + "loss": 0.4268, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.170562744140625, + "rewards/margins": 8.136800765991211, + "rewards/rejected": -16.307363510131836, + "sft_loss": 1.087196946144104, + "step": 710 + }, + { + "epoch": 1.1752136752136753, + "grad_norm": 12.543576537196508, + "learning_rate": 1.7832906724254747e-07, + "logits/chosen": 16.710582733154297, + "logits/rejected": 17.746997833251953, + "logps/chosen": -279.0878601074219, + "logps/rejected": -217.86927795410156, + "loss": 0.4347, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.324019908905029, + "rewards/margins": 6.887091636657715, + "rewards/rejected": -14.211112022399902, + "sft_loss": 1.0954669713974, + "step": 715 + }, + { + "epoch": 1.183431952662722, + "grad_norm": 14.156394204679035, + "learning_rate": 1.7730157954623685e-07, + "logits/chosen": 17.9290828704834, + "logits/rejected": 17.706289291381836, + "logps/chosen": -284.99176025390625, + "logps/rejected": -210.2812957763672, + "loss": 0.5001, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.146309852600098, + "rewards/margins": 6.681734085083008, + "rewards/rejected": -13.828044891357422, + "sft_loss": 1.0680426359176636, + "step": 720 + }, + { + "epoch": 1.1916502301117686, + "grad_norm": 12.575179703681824, + "learning_rate": 1.7626978999453794e-07, + "logits/chosen": 17.4116268157959, + "logits/rejected": 17.362062454223633, + "logps/chosen": -319.6551818847656, + "logps/rejected": -242.6376495361328, + "loss": 0.3929, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.572165012359619, + "rewards/margins": 7.830206871032715, + "rewards/rejected": -15.402371406555176, + "sft_loss": 1.0497316122055054, + "step": 725 + }, + { + "epoch": 1.1998685075608153, + "grad_norm": 9.969097695004054, + "learning_rate": 1.7523378345313714e-07, + "logits/chosen": 17.700010299682617, + "logits/rejected": 18.3839168548584, + "logps/chosen": -291.83917236328125, + "logps/rejected": -215.37081909179688, + "loss": 0.5242, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.1273322105407715, + "rewards/margins": 6.290266036987305, + "rewards/rejected": -13.417597770690918, + "sft_loss": 1.382573127746582, + "step": 730 + }, + { + "epoch": 1.208086785009862, + "grad_norm": 17.17576749860381, + "learning_rate": 1.741936451345722e-07, + "logits/chosen": 18.578615188598633, + "logits/rejected": 19.108678817749023, + "logps/chosen": -271.18505859375, + "logps/rejected": -205.25746154785156, + "loss": 0.4562, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -6.272554397583008, + "rewards/margins": 6.781675815582275, + "rewards/rejected": -13.054230690002441, + "sft_loss": 1.151402473449707, + "step": 735 + }, + { + "epoch": 1.2163050624589087, + "grad_norm": 17.314304500732653, + "learning_rate": 1.731494605912235e-07, + "logits/chosen": 17.34149932861328, + "logits/rejected": 18.757190704345703, + "logps/chosen": -262.0509948730469, + "logps/rejected": -208.38226318359375, + "loss": 0.4598, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.2556867599487305, + "rewards/margins": 6.655214309692383, + "rewards/rejected": -12.910900115966797, + "sft_loss": 1.0516655445098877, + "step": 740 + }, + { + "epoch": 1.2245233399079554, + "grad_norm": 15.379389005940164, + "learning_rate": 1.721013157082774e-07, + "logits/chosen": 16.926176071166992, + "logits/rejected": 18.068889617919922, + "logps/chosen": -276.72833251953125, + "logps/rejected": -224.33856201171875, + "loss": 0.4921, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.5205397605896, + "rewards/margins": 6.801075458526611, + "rewards/rejected": -14.321615219116211, + "sft_loss": 1.0424396991729736, + "step": 745 + }, + { + "epoch": 1.232741617357002, + "grad_norm": 16.009052812361457, + "learning_rate": 1.7104929669666194e-07, + "logits/chosen": 16.49311065673828, + "logits/rejected": 17.206867218017578, + "logps/chosen": -299.70855712890625, + "logps/rejected": -234.7362060546875, + "loss": 0.5132, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.55043888092041, + "rewards/margins": 7.260469436645508, + "rewards/rejected": -14.810908317565918, + "sft_loss": 1.148091197013855, + "step": 750 + }, + { + "epoch": 1.2409598948060487, + "grad_norm": 12.479892072042215, + "learning_rate": 1.69993490085956e-07, + "logits/chosen": 16.645790100097656, + "logits/rejected": 18.348690032958984, + "logps/chosen": -289.54217529296875, + "logps/rejected": -232.9552001953125, + "loss": 0.4746, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.2228593826293945, + "rewards/margins": 7.266669273376465, + "rewards/rejected": -14.48952865600586, + "sft_loss": 1.0830727815628052, + "step": 755 + }, + { + "epoch": 1.2491781722550954, + "grad_norm": 13.701336630947893, + "learning_rate": 1.6893398271727222e-07, + "logits/chosen": 17.36661148071289, + "logits/rejected": 18.305465698242188, + "logps/chosen": -300.6762390136719, + "logps/rejected": -228.61175537109375, + "loss": 0.4574, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.493809223175049, + "rewards/margins": 7.260177135467529, + "rewards/rejected": -14.753986358642578, + "sft_loss": 1.016793966293335, + "step": 760 + }, + { + "epoch": 1.2573964497041419, + "grad_norm": 10.12301776047569, + "learning_rate": 1.6787086173611407e-07, + "logits/chosen": 17.593551635742188, + "logits/rejected": 18.34381675720215, + "logps/chosen": -280.0817565917969, + "logps/rejected": -211.71542358398438, + "loss": 0.4631, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.457971096038818, + "rewards/margins": 6.6875996589660645, + "rewards/rejected": -14.1455717086792, + "sft_loss": 1.0228469371795654, + "step": 765 + }, + { + "epoch": 1.2656147271531886, + "grad_norm": 7.684067785358655, + "learning_rate": 1.6680421458520813e-07, + "logits/chosen": 18.189321517944336, + "logits/rejected": 18.308818817138672, + "logps/chosen": -280.6365966796875, + "logps/rejected": -212.9956817626953, + "loss": 0.4905, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.9928879737854, + "rewards/margins": 6.62729024887085, + "rewards/rejected": -13.62017822265625, + "sft_loss": 1.4820358753204346, + "step": 770 + }, + { + "epoch": 1.2738330046022353, + "grad_norm": 12.91245370337745, + "learning_rate": 1.6573412899731187e-07, + "logits/chosen": 17.40738868713379, + "logits/rejected": 18.874313354492188, + "logps/chosen": -299.2168884277344, + "logps/rejected": -221.5058135986328, + "loss": 0.4091, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.799927234649658, + "rewards/margins": 6.812719821929932, + "rewards/rejected": -13.612646102905273, + "sft_loss": 1.1041682958602905, + "step": 775 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 10.002770129869452, + "learning_rate": 1.646606929879975e-07, + "logits/chosen": 18.40058135986328, + "logits/rejected": 19.07294273376465, + "logps/chosen": -323.3199157714844, + "logps/rejected": -239.97935485839844, + "loss": 0.4266, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.476480484008789, + "rewards/margins": 8.036779403686523, + "rewards/rejected": -15.513258934020996, + "sft_loss": 1.0359128713607788, + "step": 780 + }, + { + "epoch": 1.2902695595003286, + "grad_norm": 13.874094233494837, + "learning_rate": 1.6358399484841268e-07, + "logits/chosen": 16.465330123901367, + "logits/rejected": 17.001684188842773, + "logps/chosen": -302.719482421875, + "logps/rejected": -224.98745727539062, + "loss": 0.5129, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.293752670288086, + "rewards/margins": 7.167456150054932, + "rewards/rejected": -14.46120834350586, + "sft_loss": 1.1338067054748535, + "step": 785 + }, + { + "epoch": 1.2984878369493753, + "grad_norm": 16.794137790287348, + "learning_rate": 1.625041231380184e-07, + "logits/chosen": 16.809955596923828, + "logits/rejected": 18.395627975463867, + "logps/chosen": -310.674560546875, + "logps/rejected": -239.32200622558594, + "loss": 0.4581, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.000899791717529, + "rewards/margins": 7.625972747802734, + "rewards/rejected": -14.626873016357422, + "sft_loss": 0.9849548935890198, + "step": 790 + }, + { + "epoch": 1.306706114398422, + "grad_norm": 12.439364730991043, + "learning_rate": 1.6142116667730482e-07, + "logits/chosen": 19.75507164001465, + "logits/rejected": 20.32160758972168, + "logps/chosen": -293.4500732421875, + "logps/rejected": -214.4062042236328, + "loss": 0.4713, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.081357955932617, + "rewards/margins": 7.148606777191162, + "rewards/rejected": -13.229966163635254, + "sft_loss": 0.9287933111190796, + "step": 795 + }, + { + "epoch": 1.3149243918474687, + "grad_norm": 11.945683940407063, + "learning_rate": 1.6033521454048597e-07, + "logits/chosen": 18.249954223632812, + "logits/rejected": 19.019634246826172, + "logps/chosen": -271.8877258300781, + "logps/rejected": -217.09132385253906, + "loss": 0.4673, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -6.703191757202148, + "rewards/margins": 7.068259239196777, + "rewards/rejected": -13.77145004272461, + "sft_loss": 1.0365476608276367, + "step": 800 + }, + { + "epoch": 1.3231426692965154, + "grad_norm": 10.191092591520466, + "learning_rate": 1.5924635604817306e-07, + "logits/chosen": 17.222694396972656, + "logits/rejected": 18.468660354614258, + "logps/chosen": -288.8092041015625, + "logps/rejected": -236.29319763183594, + "loss": 0.4065, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.580431938171387, + "rewards/margins": 7.9504780769348145, + "rewards/rejected": -15.530909538269043, + "sft_loss": 1.162276268005371, + "step": 805 + }, + { + "epoch": 1.331360946745562, + "grad_norm": 9.751260919138856, + "learning_rate": 1.5815468076002771e-07, + "logits/chosen": 16.873342514038086, + "logits/rejected": 18.183860778808594, + "logps/chosen": -312.6845397949219, + "logps/rejected": -240.49859619140625, + "loss": 0.429, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.772741794586182, + "rewards/margins": 8.080373764038086, + "rewards/rejected": -15.853116035461426, + "sft_loss": 0.9787502288818359, + "step": 810 + }, + { + "epoch": 1.3395792241946087, + "grad_norm": 13.966159704549986, + "learning_rate": 1.5706027846739588e-07, + "logits/chosen": 17.78404426574707, + "logits/rejected": 18.716482162475586, + "logps/chosen": -265.793701171875, + "logps/rejected": -212.49057006835938, + "loss": 0.4521, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -6.772706985473633, + "rewards/margins": 6.92323112487793, + "rewards/rejected": -13.695940017700195, + "sft_loss": 1.0237793922424316, + "step": 815 + }, + { + "epoch": 1.3477975016436554, + "grad_norm": 32.697820524211366, + "learning_rate": 1.5596323918592227e-07, + "logits/chosen": 18.034412384033203, + "logits/rejected": 18.671672821044922, + "logps/chosen": -253.35609436035156, + "logps/rejected": -206.98895263671875, + "loss": 0.4833, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -7.246993541717529, + "rewards/margins": 6.500965595245361, + "rewards/rejected": -13.74795913696289, + "sft_loss": 1.0642235279083252, + "step": 820 + }, + { + "epoch": 1.356015779092702, + "grad_norm": 12.398186085004639, + "learning_rate": 1.5486365314814637e-07, + "logits/chosen": 17.62421226501465, + "logits/rejected": 18.33708953857422, + "logps/chosen": -292.3586120605469, + "logps/rejected": -230.61155700683594, + "loss": 0.4084, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.588433742523193, + "rewards/margins": 7.831187725067139, + "rewards/rejected": -15.4196195602417, + "sft_loss": 1.0407756567001343, + "step": 825 + }, + { + "epoch": 1.3642340565417488, + "grad_norm": 12.166605913363364, + "learning_rate": 1.5376161079608088e-07, + "logits/chosen": 17.150541305541992, + "logits/rejected": 18.62920379638672, + "logps/chosen": -296.70465087890625, + "logps/rejected": -242.9381866455078, + "loss": 0.46, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.590549945831299, + "rewards/margins": 8.183311462402344, + "rewards/rejected": -15.773859977722168, + "sft_loss": 1.191388487815857, + "step": 830 + }, + { + "epoch": 1.3724523339907955, + "grad_norm": 10.880603493347238, + "learning_rate": 1.5265720277377273e-07, + "logits/chosen": 17.14630889892578, + "logits/rejected": 19.08263397216797, + "logps/chosen": -288.0076904296875, + "logps/rejected": -237.15341186523438, + "loss": 0.4435, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.4387054443359375, + "rewards/margins": 7.707547664642334, + "rewards/rejected": -15.146254539489746, + "sft_loss": 1.0695911645889282, + "step": 835 + }, + { + "epoch": 1.3806706114398422, + "grad_norm": 50.18720477246092, + "learning_rate": 1.5155051991984745e-07, + "logits/chosen": 18.334110260009766, + "logits/rejected": 18.69322967529297, + "logps/chosen": -315.9974365234375, + "logps/rejected": -228.48602294921875, + "loss": 0.4849, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.224093437194824, + "rewards/margins": 7.033995151519775, + "rewards/rejected": -14.258088111877441, + "sft_loss": 0.9990159869194031, + "step": 840 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 8.437783211213006, + "learning_rate": 1.504416532600378e-07, + "logits/chosen": 17.403743743896484, + "logits/rejected": 18.235454559326172, + "logps/chosen": -242.6099853515625, + "logps/rejected": -199.91429138183594, + "loss": 0.4367, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.768044948577881, + "rewards/margins": 6.265518665313721, + "rewards/rejected": -13.033564567565918, + "sft_loss": 1.0013427734375, + "step": 845 + }, + { + "epoch": 1.3971071663379355, + "grad_norm": 14.969642809049821, + "learning_rate": 1.4933069399969653e-07, + "logits/chosen": 17.80324935913086, + "logits/rejected": 18.639148712158203, + "logps/chosen": -272.4168395996094, + "logps/rejected": -217.99310302734375, + "loss": 0.4617, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.3702874183654785, + "rewards/margins": 6.988955020904541, + "rewards/rejected": -14.359243392944336, + "sft_loss": 1.1217681169509888, + "step": 850 + }, + { + "epoch": 1.4053254437869822, + "grad_norm": 14.289009158482923, + "learning_rate": 1.4821773351629487e-07, + "logits/chosen": 18.467451095581055, + "logits/rejected": 19.347543716430664, + "logps/chosen": -302.4975280761719, + "logps/rejected": -243.9453125, + "loss": 0.4132, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.117691040039062, + "rewards/margins": 8.244772911071777, + "rewards/rejected": -16.362462997436523, + "sft_loss": 1.1255364418029785, + "step": 855 + }, + { + "epoch": 1.413543721236029, + "grad_norm": 10.706706272611981, + "learning_rate": 1.4710286335190664e-07, + "logits/chosen": 18.262802124023438, + "logits/rejected": 18.210296630859375, + "logps/chosen": -306.64691162109375, + "logps/rejected": -234.53460693359375, + "loss": 0.4363, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.4043498039245605, + "rewards/margins": 7.886282920837402, + "rewards/rejected": -15.290633201599121, + "sft_loss": 1.080936074256897, + "step": 860 + }, + { + "epoch": 1.4217619986850756, + "grad_norm": 13.539503399960063, + "learning_rate": 1.4598617520567863e-07, + "logits/chosen": 18.688413619995117, + "logits/rejected": 19.166378021240234, + "logps/chosen": -295.90008544921875, + "logps/rejected": -231.57505798339844, + "loss": 0.4445, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.257371425628662, + "rewards/margins": 7.788801193237305, + "rewards/rejected": -15.046174049377441, + "sft_loss": 1.04954195022583, + "step": 865 + }, + { + "epoch": 1.4299802761341223, + "grad_norm": 20.41588952283392, + "learning_rate": 1.448677609262885e-07, + "logits/chosen": 17.124914169311523, + "logits/rejected": 18.068174362182617, + "logps/chosen": -291.83245849609375, + "logps/rejected": -229.2489776611328, + "loss": 0.4916, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.751894474029541, + "rewards/margins": 7.248422145843506, + "rewards/rejected": -15.000316619873047, + "sft_loss": 1.1058861017227173, + "step": 870 + }, + { + "epoch": 1.438198553583169, + "grad_norm": 10.416378982514427, + "learning_rate": 1.4374771250438997e-07, + "logits/chosen": 17.683748245239258, + "logits/rejected": 18.105945587158203, + "logps/chosen": -338.9434814453125, + "logps/rejected": -252.90367126464844, + "loss": 0.353, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.734278678894043, + "rewards/margins": 8.11069107055664, + "rewards/rejected": -16.844970703125, + "sft_loss": 1.1128793954849243, + "step": 875 + }, + { + "epoch": 1.4464168310322156, + "grad_norm": 15.631489594193368, + "learning_rate": 1.4262612206504653e-07, + "logits/chosen": 19.22788429260254, + "logits/rejected": 18.560340881347656, + "logps/chosen": -288.2774658203125, + "logps/rejected": -221.1851806640625, + "loss": 0.4398, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.204787254333496, + "rewards/margins": 6.889291763305664, + "rewards/rejected": -15.094079971313477, + "sft_loss": 1.0347801446914673, + "step": 880 + }, + { + "epoch": 1.4546351084812623, + "grad_norm": 22.470025016143673, + "learning_rate": 1.4150308186015428e-07, + "logits/chosen": 18.78541374206543, + "logits/rejected": 19.072355270385742, + "logps/chosen": -266.7073669433594, + "logps/rejected": -214.3734130859375, + "loss": 0.4864, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.12351131439209, + "rewards/margins": 7.012777328491211, + "rewards/rejected": -14.1362886428833, + "sft_loss": 1.0819884538650513, + "step": 885 + }, + { + "epoch": 1.462853385930309, + "grad_norm": 11.047306179137715, + "learning_rate": 1.4037868426085368e-07, + "logits/chosen": 17.600828170776367, + "logits/rejected": 17.870738983154297, + "logps/chosen": -321.2472229003906, + "logps/rejected": -237.96395874023438, + "loss": 0.4823, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.274439811706543, + "rewards/margins": 8.21683120727539, + "rewards/rejected": -15.49127197265625, + "sft_loss": 1.1358665227890015, + "step": 890 + }, + { + "epoch": 1.4710716633793557, + "grad_norm": 9.894309836137355, + "learning_rate": 1.3925302174993233e-07, + "logits/chosen": 16.768348693847656, + "logits/rejected": 18.076475143432617, + "logps/chosen": -295.2914123535156, + "logps/rejected": -222.6123504638672, + "loss": 0.4288, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -6.861530780792236, + "rewards/margins": 7.223613262176514, + "rewards/rejected": -14.085144996643066, + "sft_loss": 0.9808722734451294, + "step": 895 + }, + { + "epoch": 1.4792899408284024, + "grad_norm": 15.122256978486702, + "learning_rate": 1.3812618691421803e-07, + "logits/chosen": 17.618257522583008, + "logits/rejected": 18.547971725463867, + "logps/chosen": -307.7926025390625, + "logps/rejected": -228.6370849609375, + "loss": 0.4755, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.855221748352051, + "rewards/margins": 7.493732929229736, + "rewards/rejected": -14.348955154418945, + "sft_loss": 0.975628137588501, + "step": 900 + }, + { + "epoch": 1.487508218277449, + "grad_norm": 14.990640701163656, + "learning_rate": 1.3699827243696336e-07, + "logits/chosen": 17.19367027282715, + "logits/rejected": 18.374305725097656, + "logps/chosen": -286.5935363769531, + "logps/rejected": -236.76593017578125, + "loss": 0.4732, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -7.718534469604492, + "rewards/margins": 7.860580921173096, + "rewards/rejected": -15.57911491394043, + "sft_loss": 1.1146594285964966, + "step": 905 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 10.50314444472379, + "learning_rate": 1.3586937109022251e-07, + "logits/chosen": 16.421382904052734, + "logits/rejected": 17.77210235595703, + "logps/chosen": -324.25927734375, + "logps/rejected": -260.9275207519531, + "loss": 0.4663, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.703363418579102, + "rewards/margins": 8.462730407714844, + "rewards/rejected": -17.166095733642578, + "sft_loss": 1.0979522466659546, + "step": 910 + }, + { + "epoch": 1.5039447731755424, + "grad_norm": 16.690789592498312, + "learning_rate": 1.347395757272207e-07, + "logits/chosen": 19.563251495361328, + "logits/rejected": 19.970426559448242, + "logps/chosen": -271.6186218261719, + "logps/rejected": -212.50277709960938, + "loss": 0.4515, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.6580634117126465, + "rewards/margins": 7.265621185302734, + "rewards/rejected": -13.923684120178223, + "sft_loss": 1.0007566213607788, + "step": 915 + }, + { + "epoch": 1.5121630506245891, + "grad_norm": 21.799881591539336, + "learning_rate": 1.3360897927471668e-07, + "logits/chosen": 18.252246856689453, + "logits/rejected": 18.873050689697266, + "logps/chosen": -278.3526611328125, + "logps/rejected": -221.5440216064453, + "loss": 0.4632, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.180948257446289, + "rewards/margins": 7.29295539855957, + "rewards/rejected": -14.473901748657227, + "sft_loss": 1.0442688465118408, + "step": 920 + }, + { + "epoch": 1.5203813280736358, + "grad_norm": 10.712033452260947, + "learning_rate": 1.3247767472535972e-07, + "logits/chosen": 18.07443618774414, + "logits/rejected": 19.142240524291992, + "logps/chosen": -294.86700439453125, + "logps/rejected": -238.5161895751953, + "loss": 0.4686, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.611084461212158, + "rewards/margins": 8.040576934814453, + "rewards/rejected": -15.651662826538086, + "sft_loss": 1.0576171875, + "step": 925 + }, + { + "epoch": 1.5285996055226825, + "grad_norm": 7.019511894014553, + "learning_rate": 1.3134575513004073e-07, + "logits/chosen": 18.114564895629883, + "logits/rejected": 18.515487670898438, + "logps/chosen": -303.06329345703125, + "logps/rejected": -237.0087432861328, + "loss": 0.3908, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.551575183868408, + "rewards/margins": 7.9892473220825195, + "rewards/rejected": -15.540822982788086, + "sft_loss": 1.048262119293213, + "step": 930 + }, + { + "epoch": 1.5368178829717292, + "grad_norm": 14.009760349607332, + "learning_rate": 1.3021331359023874e-07, + "logits/chosen": 17.101354598999023, + "logits/rejected": 18.246139526367188, + "logps/chosen": -310.4385070800781, + "logps/rejected": -244.6991424560547, + "loss": 0.4262, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.569284439086914, + "rewards/margins": 8.347086906433105, + "rewards/rejected": -15.916370391845703, + "sft_loss": 1.0606290102005005, + "step": 935 + }, + { + "epoch": 1.5450361604207759, + "grad_norm": 15.650861724973655, + "learning_rate": 1.2908044325036312e-07, + "logits/chosen": 17.97089195251465, + "logits/rejected": 18.223573684692383, + "logps/chosen": -296.1282958984375, + "logps/rejected": -233.69146728515625, + "loss": 0.4616, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.757159233093262, + "rewards/margins": 7.639113903045654, + "rewards/rejected": -15.396271705627441, + "sft_loss": 1.138619065284729, + "step": 940 + }, + { + "epoch": 1.5532544378698225, + "grad_norm": 17.515447400155715, + "learning_rate": 1.2794723729009255e-07, + "logits/chosen": 16.958641052246094, + "logits/rejected": 18.472318649291992, + "logps/chosen": -298.9012756347656, + "logps/rejected": -239.90469360351562, + "loss": 0.4502, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.437976837158203, + "rewards/margins": 8.138365745544434, + "rewards/rejected": -15.576342582702637, + "sft_loss": 1.0626742839813232, + "step": 945 + }, + { + "epoch": 1.5614727153188692, + "grad_norm": 45.641039135520685, + "learning_rate": 1.2681378891671082e-07, + "logits/chosen": 17.490928649902344, + "logits/rejected": 17.976585388183594, + "logps/chosen": -306.0874328613281, + "logps/rejected": -237.03607177734375, + "loss": 0.4737, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.880832672119141, + "rewards/margins": 7.584968090057373, + "rewards/rejected": -15.465802192687988, + "sft_loss": 1.0900439023971558, + "step": 950 + }, + { + "epoch": 1.569690992767916, + "grad_norm": 19.898061737121086, + "learning_rate": 1.2568019135744044e-07, + "logits/chosen": 16.957841873168945, + "logits/rejected": 17.985727310180664, + "logps/chosen": -291.70135498046875, + "logps/rejected": -229.38314819335938, + "loss": 0.4349, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.644362926483154, + "rewards/margins": 7.429901123046875, + "rewards/rejected": -15.074263572692871, + "sft_loss": 1.0944395065307617, + "step": 955 + }, + { + "epoch": 1.5779092702169626, + "grad_norm": 12.39680434949017, + "learning_rate": 1.2454653785177445e-07, + "logits/chosen": 17.493330001831055, + "logits/rejected": 18.42995834350586, + "logps/chosen": -278.9170837402344, + "logps/rejected": -230.72608947753906, + "loss": 0.4231, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.324814319610596, + "rewards/margins": 7.593767166137695, + "rewards/rejected": -14.918582916259766, + "sft_loss": 1.0732117891311646, + "step": 960 + }, + { + "epoch": 1.5861275476660093, + "grad_norm": 21.306042868258853, + "learning_rate": 1.2341292164380783e-07, + "logits/chosen": 18.833568572998047, + "logits/rejected": 18.869935989379883, + "logps/chosen": -286.1907653808594, + "logps/rejected": -224.49281311035156, + "loss": 0.4817, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.71510124206543, + "rewards/margins": 7.221285820007324, + "rewards/rejected": -14.936385154724121, + "sft_loss": 1.3040668964385986, + "step": 965 + }, + { + "epoch": 1.594345825115056, + "grad_norm": 14.69263028616145, + "learning_rate": 1.222794359745675e-07, + "logits/chosen": 16.27896499633789, + "logits/rejected": 18.376323699951172, + "logps/chosen": -300.5797424316406, + "logps/rejected": -242.6514129638672, + "loss": 0.4114, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.52255392074585, + "rewards/margins": 8.169685363769531, + "rewards/rejected": -15.692238807678223, + "sft_loss": 1.0308858156204224, + "step": 970 + }, + { + "epoch": 1.6025641025641026, + "grad_norm": 13.802476438483277, + "learning_rate": 1.2114617407434354e-07, + "logits/chosen": 18.055139541625977, + "logits/rejected": 19.250368118286133, + "logps/chosen": -309.2381286621094, + "logps/rejected": -245.81809997558594, + "loss": 0.4326, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.345053672790527, + "rewards/margins": 8.126486778259277, + "rewards/rejected": -15.471541404724121, + "sft_loss": 1.123140811920166, + "step": 975 + }, + { + "epoch": 1.6107823800131493, + "grad_norm": 10.423391619330996, + "learning_rate": 1.2001322915502091e-07, + "logits/chosen": 16.897199630737305, + "logits/rejected": 18.748310089111328, + "logps/chosen": -292.1817932128906, + "logps/rejected": -235.8812255859375, + "loss": 0.3942, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.173976898193359, + "rewards/margins": 7.872208118438721, + "rewards/rejected": -15.046185493469238, + "sft_loss": 1.1811002492904663, + "step": 980 + }, + { + "epoch": 1.619000657462196, + "grad_norm": 26.973905524007105, + "learning_rate": 1.1888069440241243e-07, + "logits/chosen": 18.107698440551758, + "logits/rejected": 19.736108779907227, + "logps/chosen": -317.0016174316406, + "logps/rejected": -252.54832458496094, + "loss": 0.4222, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.532571792602539, + "rewards/margins": 9.049071311950684, + "rewards/rejected": -16.581642150878906, + "sft_loss": 1.075319766998291, + "step": 985 + }, + { + "epoch": 1.6272189349112427, + "grad_norm": 16.255178289646476, + "learning_rate": 1.1774866296859448e-07, + "logits/chosen": 17.9573917388916, + "logits/rejected": 19.03142738342285, + "logps/chosen": -301.56561279296875, + "logps/rejected": -243.9299774169922, + "loss": 0.4749, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.284952163696289, + "rewards/margins": 8.675047874450684, + "rewards/rejected": -15.960000038146973, + "sft_loss": 1.1328290700912476, + "step": 990 + }, + { + "epoch": 1.6354372123602894, + "grad_norm": 10.065426351498546, + "learning_rate": 1.1661722796424478e-07, + "logits/chosen": 17.292905807495117, + "logits/rejected": 18.3796443939209, + "logps/chosen": -309.9263000488281, + "logps/rejected": -241.42181396484375, + "loss": 0.4268, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.671374320983887, + "rewards/margins": 8.211640357971191, + "rewards/rejected": -15.883017539978027, + "sft_loss": 1.0408843755722046, + "step": 995 + }, + { + "epoch": 1.643655489809336, + "grad_norm": 12.50718545323396, + "learning_rate": 1.1548648245098432e-07, + "logits/chosen": 17.582983016967773, + "logits/rejected": 18.472742080688477, + "logps/chosen": -319.5430908203125, + "logps/rejected": -253.3585968017578, + "loss": 0.4368, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.135196685791016, + "rewards/margins": 8.56678295135498, + "rewards/rejected": -16.701980590820312, + "sft_loss": 1.121424674987793, + "step": 1000 + }, + { + "epoch": 1.6518737672583828, + "grad_norm": 9.456497156444888, + "learning_rate": 1.1435651943372278e-07, + "logits/chosen": 16.574844360351562, + "logits/rejected": 17.709199905395508, + "logps/chosen": -286.1977844238281, + "logps/rejected": -229.33741760253906, + "loss": 0.4208, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.042440414428711, + "rewards/margins": 7.619970798492432, + "rewards/rejected": -15.662409782409668, + "sft_loss": 1.1242254972457886, + "step": 1005 + }, + { + "epoch": 1.6600920447074294, + "grad_norm": 12.581807587635986, + "learning_rate": 1.1322743185300865e-07, + "logits/chosen": 17.700603485107422, + "logits/rejected": 19.024187088012695, + "logps/chosen": -296.780029296875, + "logps/rejected": -233.88160705566406, + "loss": 0.4889, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.796105861663818, + "rewards/margins": 7.478055953979492, + "rewards/rejected": -15.274161338806152, + "sft_loss": 1.075081467628479, + "step": 1010 + }, + { + "epoch": 1.6683103221564761, + "grad_norm": 14.09597654178517, + "learning_rate": 1.1209931257738503e-07, + "logits/chosen": 17.260271072387695, + "logits/rejected": 18.022357940673828, + "logps/chosen": -306.3436584472656, + "logps/rejected": -227.7841339111328, + "loss": 0.4487, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -6.75, + "rewards/margins": 7.679973602294922, + "rewards/rejected": -14.429974555969238, + "sft_loss": 1.1023831367492676, + "step": 1015 + }, + { + "epoch": 1.6765285996055228, + "grad_norm": 10.14530298124155, + "learning_rate": 1.1097225439575096e-07, + "logits/chosen": 16.790157318115234, + "logits/rejected": 17.936586380004883, + "logps/chosen": -274.2288818359375, + "logps/rejected": -220.5703125, + "loss": 0.4648, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.9578022956848145, + "rewards/margins": 7.266170501708984, + "rewards/rejected": -14.22397232055664, + "sft_loss": 1.0298852920532227, + "step": 1020 + }, + { + "epoch": 1.6847468770545695, + "grad_norm": 14.64734935061402, + "learning_rate": 1.0984635000972946e-07, + "logits/chosen": 16.42229461669922, + "logits/rejected": 17.54804229736328, + "logps/chosen": -277.86077880859375, + "logps/rejected": -223.43917846679688, + "loss": 0.5101, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.510883808135986, + "rewards/margins": 7.296814441680908, + "rewards/rejected": -14.807699203491211, + "sft_loss": 1.089572548866272, + "step": 1025 + }, + { + "epoch": 1.6929651545036162, + "grad_norm": 14.998745686830942, + "learning_rate": 1.0872169202604284e-07, + "logits/chosen": 17.45005226135254, + "logits/rejected": 18.329872131347656, + "logps/chosen": -335.4214782714844, + "logps/rejected": -264.5696105957031, + "loss": 0.4259, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.625652313232422, + "rewards/margins": 8.4821138381958, + "rewards/rejected": -17.107765197753906, + "sft_loss": 1.1337147951126099, + "step": 1030 + }, + { + "epoch": 1.7011834319526629, + "grad_norm": 15.126502195785678, + "learning_rate": 1.0759837294889546e-07, + "logits/chosen": 15.89870834350586, + "logits/rejected": 17.66954803466797, + "logps/chosen": -324.4315185546875, + "logps/rejected": -251.8769073486328, + "loss": 0.4365, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.891256332397461, + "rewards/margins": 8.40850830078125, + "rewards/rejected": -16.299766540527344, + "sft_loss": 1.0551294088363647, + "step": 1035 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 11.887438634341896, + "learning_rate": 1.0647648517236547e-07, + "logits/chosen": 17.808908462524414, + "logits/rejected": 17.868276596069336, + "logps/chosen": -318.5857849121094, + "logps/rejected": -237.06268310546875, + "loss": 0.4077, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.441680431365967, + "rewards/margins": 7.951440811157227, + "rewards/rejected": -15.393121719360352, + "sft_loss": 1.0577045679092407, + "step": 1040 + }, + { + "epoch": 1.7176199868507562, + "grad_norm": 13.592964221155555, + "learning_rate": 1.0535612097280505e-07, + "logits/chosen": 17.357389450073242, + "logits/rejected": 18.236921310424805, + "logps/chosen": -309.05316162109375, + "logps/rejected": -234.39718627929688, + "loss": 0.4578, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.402204513549805, + "rewards/margins": 7.640995025634766, + "rewards/rejected": -15.043200492858887, + "sft_loss": 1.1290278434753418, + "step": 1045 + }, + { + "epoch": 1.725838264299803, + "grad_norm": 17.516227986033588, + "learning_rate": 1.042373725012508e-07, + "logits/chosen": 15.968868255615234, + "logits/rejected": 17.182361602783203, + "logps/chosen": -277.1082763671875, + "logps/rejected": -217.5791778564453, + "loss": 0.4706, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.2921223640441895, + "rewards/margins": 7.116176605224609, + "rewards/rejected": -14.40829849243164, + "sft_loss": 1.1019597053527832, + "step": 1050 + }, + { + "epoch": 1.7340565417488496, + "grad_norm": 14.545988790543376, + "learning_rate": 1.0312033177584409e-07, + "logits/chosen": 18.982242584228516, + "logits/rejected": 18.7514705657959, + "logps/chosen": -293.9178466796875, + "logps/rejected": -226.5133819580078, + "loss": 0.3922, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.807718276977539, + "rewards/margins": 7.406096935272217, + "rewards/rejected": -15.213815689086914, + "sft_loss": 1.0929393768310547, + "step": 1055 + }, + { + "epoch": 1.7422748191978963, + "grad_norm": 10.680737229216966, + "learning_rate": 1.0200509067426243e-07, + "logits/chosen": 16.079814910888672, + "logits/rejected": 17.51044273376465, + "logps/chosen": -302.1490173339844, + "logps/rejected": -233.8198699951172, + "loss": 0.444, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.101183891296387, + "rewards/margins": 7.351180553436279, + "rewards/rejected": -15.452364921569824, + "sft_loss": 1.2096168994903564, + "step": 1060 + }, + { + "epoch": 1.7504930966469427, + "grad_norm": 9.891781648367795, + "learning_rate": 1.0089174092616271e-07, + "logits/chosen": 17.791248321533203, + "logits/rejected": 18.2585506439209, + "logps/chosen": -280.9420166015625, + "logps/rejected": -224.9687957763672, + "loss": 0.4607, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.537823677062988, + "rewards/margins": 7.212753772735596, + "rewards/rejected": -14.750576972961426, + "sft_loss": 1.0387908220291138, + "step": 1065 + }, + { + "epoch": 1.7587113740959894, + "grad_norm": 18.289134457763506, + "learning_rate": 9.97803741056361e-08, + "logits/chosen": 16.976699829101562, + "logits/rejected": 17.30523109436035, + "logps/chosen": -275.5840148925781, + "logps/rejected": -215.13279724121094, + "loss": 0.3879, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.284540176391602, + "rewards/margins": 6.921156406402588, + "rewards/rejected": -14.205697059631348, + "sft_loss": 1.0973351001739502, + "step": 1070 + }, + { + "epoch": 1.7669296515450361, + "grad_norm": 17.72039206697929, + "learning_rate": 9.867108162367594e-08, + "logits/chosen": 16.939437866210938, + "logits/rejected": 18.218585968017578, + "logps/chosen": -294.5352478027344, + "logps/rejected": -230.98623657226562, + "loss": 0.3974, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.325733661651611, + "rewards/margins": 7.582549571990967, + "rewards/rejected": -14.908282279968262, + "sft_loss": 1.034481406211853, + "step": 1075 + }, + { + "epoch": 1.7751479289940828, + "grad_norm": 13.466593004835952, + "learning_rate": 9.756395472065947e-08, + "logits/chosen": 17.363365173339844, + "logits/rejected": 18.14643669128418, + "logps/chosen": -275.0605163574219, + "logps/rejected": -223.0447998046875, + "loss": 0.4368, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.5301194190979, + "rewards/margins": 7.630979537963867, + "rewards/rejected": -15.161099433898926, + "sft_loss": 1.191418170928955, + "step": 1080 + }, + { + "epoch": 1.7833662064431295, + "grad_norm": 17.525060893448625, + "learning_rate": 9.645908445884271e-08, + "logits/chosen": 17.93121910095215, + "logits/rejected": 19.609464645385742, + "logps/chosen": -313.574951171875, + "logps/rejected": -255.39015197753906, + "loss": 0.392, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.089523315429688, + "rewards/margins": 8.564504623413086, + "rewards/rejected": -16.654027938842773, + "sft_loss": 1.0859136581420898, + "step": 1085 + }, + { + "epoch": 1.7915844838921762, + "grad_norm": 14.818652238656334, + "learning_rate": 9.535656171487096e-08, + "logits/chosen": 17.432899475097656, + "logits/rejected": 18.06930160522461, + "logps/chosen": -306.2559814453125, + "logps/rejected": -247.05564880371094, + "loss": 0.4113, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.375761985778809, + "rewards/margins": 8.475983619689941, + "rewards/rejected": -16.85174560546875, + "sft_loss": 1.2146451473236084, + "step": 1090 + }, + { + "epoch": 1.7998027613412229, + "grad_norm": 64.12698029544616, + "learning_rate": 9.425647717230382e-08, + "logits/chosen": 17.3497257232666, + "logits/rejected": 18.322324752807617, + "logps/chosen": -314.32830810546875, + "logps/rejected": -253.83473205566406, + "loss": 0.4062, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.878050804138184, + "rewards/margins": 8.278247833251953, + "rewards/rejected": -17.15629768371582, + "sft_loss": 1.077860713005066, + "step": 1095 + }, + { + "epoch": 1.8080210387902695, + "grad_norm": 13.052337358867197, + "learning_rate": 9.315892131415642e-08, + "logits/chosen": 16.90951919555664, + "logits/rejected": 18.101472854614258, + "logps/chosen": -344.9137878417969, + "logps/rejected": -264.2882080078125, + "loss": 0.3948, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.652148246765137, + "rewards/margins": 9.170465469360352, + "rewards/rejected": -17.822612762451172, + "sft_loss": 1.2117801904678345, + "step": 1100 + }, + { + "epoch": 1.8162393162393162, + "grad_norm": 11.613352799050077, + "learning_rate": 9.206398441545729e-08, + "logits/chosen": 17.647083282470703, + "logits/rejected": 18.84397315979004, + "logps/chosen": -312.7010498046875, + "logps/rejected": -254.3484344482422, + "loss": 0.3759, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.489236831665039, + "rewards/margins": 8.119637489318848, + "rewards/rejected": -16.608875274658203, + "sft_loss": 1.01621675491333, + "step": 1105 + }, + { + "epoch": 1.824457593688363, + "grad_norm": 11.15254994077485, + "learning_rate": 9.097175653582299e-08, + "logits/chosen": 17.26348114013672, + "logits/rejected": 18.160728454589844, + "logps/chosen": -284.86114501953125, + "logps/rejected": -232.5272979736328, + "loss": 0.41, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.840343475341797, + "rewards/margins": 7.370659351348877, + "rewards/rejected": -15.211003303527832, + "sft_loss": 1.1511608362197876, + "step": 1110 + }, + { + "epoch": 1.8326758711374096, + "grad_norm": 14.803907963552794, + "learning_rate": 8.988232751205051e-08, + "logits/chosen": 17.386255264282227, + "logits/rejected": 17.55118751525879, + "logps/chosen": -271.7340087890625, + "logps/rejected": -208.06320190429688, + "loss": 0.4401, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.613986968994141, + "rewards/margins": 6.274531841278076, + "rewards/rejected": -13.888518333435059, + "sft_loss": 1.144532322883606, + "step": 1115 + }, + { + "epoch": 1.8408941485864563, + "grad_norm": 14.423568520659874, + "learning_rate": 8.879578695072846e-08, + "logits/chosen": 17.274259567260742, + "logits/rejected": 18.399911880493164, + "logps/chosen": -289.1215515136719, + "logps/rejected": -230.22369384765625, + "loss": 0.4135, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.828088760375977, + "rewards/margins": 7.673010349273682, + "rewards/rejected": -15.5010986328125, + "sft_loss": 1.1277306079864502, + "step": 1120 + }, + { + "epoch": 1.849112426035503, + "grad_norm": 11.37404702454821, + "learning_rate": 8.771222422086639e-08, + "logits/chosen": 16.860265731811523, + "logits/rejected": 17.736581802368164, + "logps/chosen": -297.23956298828125, + "logps/rejected": -233.06109619140625, + "loss": 0.3998, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.860833168029785, + "rewards/margins": 8.063416481018066, + "rewards/rejected": -15.924250602722168, + "sft_loss": 1.2870830297470093, + "step": 1125 + }, + { + "epoch": 1.8573307034845496, + "grad_norm": 12.257681191538563, + "learning_rate": 8.663172844654452e-08, + "logits/chosen": 17.366941452026367, + "logits/rejected": 17.93768882751465, + "logps/chosen": -300.5145263671875, + "logps/rejected": -230.68685913085938, + "loss": 0.4455, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.420682430267334, + "rewards/margins": 7.759568691253662, + "rewards/rejected": -15.180251121520996, + "sft_loss": 1.0831838846206665, + "step": 1130 + }, + { + "epoch": 1.8655489809335963, + "grad_norm": 11.344131200773928, + "learning_rate": 8.555438849958296e-08, + "logits/chosen": 17.97229766845703, + "logits/rejected": 18.921049118041992, + "logps/chosen": -319.6356201171875, + "logps/rejected": -246.49024963378906, + "loss": 0.3864, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.522003650665283, + "rewards/margins": 8.551565170288086, + "rewards/rejected": -16.07356834411621, + "sft_loss": 1.150990605354309, + "step": 1135 + }, + { + "epoch": 1.873767258382643, + "grad_norm": 20.985079338983198, + "learning_rate": 8.448029299223194e-08, + "logits/chosen": 17.783571243286133, + "logits/rejected": 18.174728393554688, + "logps/chosen": -312.2618713378906, + "logps/rejected": -233.99496459960938, + "loss": 0.4933, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -7.624851226806641, + "rewards/margins": 7.475332260131836, + "rewards/rejected": -15.100183486938477, + "sft_loss": 1.1498528718948364, + "step": 1140 + }, + { + "epoch": 1.8819855358316897, + "grad_norm": 14.844798746234286, + "learning_rate": 8.340953026988351e-08, + "logits/chosen": 17.779254913330078, + "logits/rejected": 19.071887969970703, + "logps/chosen": -311.01190185546875, + "logps/rejected": -248.10272216796875, + "loss": 0.4615, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.804770469665527, + "rewards/margins": 8.161953926086426, + "rewards/rejected": -15.966724395751953, + "sft_loss": 1.1634888648986816, + "step": 1145 + }, + { + "epoch": 1.8902038132807364, + "grad_norm": 11.515222849514643, + "learning_rate": 8.234218840380475e-08, + "logits/chosen": 16.18383026123047, + "logits/rejected": 17.827003479003906, + "logps/chosen": -301.19659423828125, + "logps/rejected": -245.50054931640625, + "loss": 0.4341, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.15174388885498, + "rewards/margins": 7.812210559844971, + "rewards/rejected": -15.963953971862793, + "sft_loss": 1.0311837196350098, + "step": 1150 + }, + { + "epoch": 1.898422090729783, + "grad_norm": 14.564597779855657, + "learning_rate": 8.127835518389417e-08, + "logits/chosen": 16.831256866455078, + "logits/rejected": 18.508529663085938, + "logps/chosen": -311.1943054199219, + "logps/rejected": -245.4080047607422, + "loss": 0.4095, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.765483856201172, + "rewards/margins": 8.15777587890625, + "rewards/rejected": -15.923259735107422, + "sft_loss": 1.114915132522583, + "step": 1155 + }, + { + "epoch": 1.9066403681788298, + "grad_norm": 26.10926811927184, + "learning_rate": 8.021811811146075e-08, + "logits/chosen": 16.842208862304688, + "logits/rejected": 17.959400177001953, + "logps/chosen": -291.0676574707031, + "logps/rejected": -237.74246215820312, + "loss": 0.4551, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.678957939147949, + "rewards/margins": 8.211709022521973, + "rewards/rejected": -15.890668869018555, + "sft_loss": 1.1757006645202637, + "step": 1160 + }, + { + "epoch": 1.9148586456278764, + "grad_norm": 12.813401775007092, + "learning_rate": 7.916156439202672e-08, + "logits/chosen": 17.37171173095703, + "logits/rejected": 18.593181610107422, + "logps/chosen": -289.34759521484375, + "logps/rejected": -234.8267059326172, + "loss": 0.4289, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.662449836730957, + "rewards/margins": 7.566576957702637, + "rewards/rejected": -15.229025840759277, + "sft_loss": 1.1354382038116455, + "step": 1165 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 6.596137423450017, + "learning_rate": 7.810878092815512e-08, + "logits/chosen": 17.296720504760742, + "logits/rejected": 17.11487579345703, + "logps/chosen": -307.8653869628906, + "logps/rejected": -237.65505981445312, + "loss": 0.3663, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.926757335662842, + "rewards/margins": 7.959318161010742, + "rewards/rejected": -15.886076927185059, + "sft_loss": 1.1921048164367676, + "step": 1170 + }, + { + "epoch": 1.9312952005259696, + "grad_norm": 14.579022955412034, + "learning_rate": 7.705985431230183e-08, + "logits/chosen": 15.675207138061523, + "logits/rejected": 16.91021156311035, + "logps/chosen": -322.23992919921875, + "logps/rejected": -266.904296875, + "loss": 0.391, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.967777252197266, + "rewards/margins": 8.5900297164917, + "rewards/rejected": -17.557802200317383, + "sft_loss": 1.228776454925537, + "step": 1175 + }, + { + "epoch": 1.9395134779750163, + "grad_norm": 12.828599154800472, + "learning_rate": 7.601487081969307e-08, + "logits/chosen": 18.340225219726562, + "logits/rejected": 19.142946243286133, + "logps/chosen": -350.186279296875, + "logps/rejected": -269.3705749511719, + "loss": 0.3851, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -8.475415229797363, + "rewards/margins": 9.2521390914917, + "rewards/rejected": -17.727554321289062, + "sft_loss": 1.1213669776916504, + "step": 1180 + }, + { + "epoch": 1.947731755424063, + "grad_norm": 13.15248193805534, + "learning_rate": 7.497391640122967e-08, + "logits/chosen": 18.557586669921875, + "logits/rejected": 19.259462356567383, + "logps/chosen": -311.15838623046875, + "logps/rejected": -252.96751403808594, + "loss": 0.4041, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.537500381469727, + "rewards/margins": 8.597896575927734, + "rewards/rejected": -17.13539695739746, + "sft_loss": 1.1180825233459473, + "step": 1185 + }, + { + "epoch": 1.9559500328731096, + "grad_norm": 20.887376048027924, + "learning_rate": 7.393707667641691e-08, + "logits/chosen": 16.45261573791504, + "logits/rejected": 17.498512268066406, + "logps/chosen": -310.4942626953125, + "logps/rejected": -250.18203735351562, + "loss": 0.4276, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.239749908447266, + "rewards/margins": 8.2033109664917, + "rewards/rejected": -16.44305992126465, + "sft_loss": 1.188431739807129, + "step": 1190 + }, + { + "epoch": 1.9641683103221563, + "grad_norm": 32.140189305396625, + "learning_rate": 7.290443692632281e-08, + "logits/chosen": 19.094688415527344, + "logits/rejected": 19.616283416748047, + "logps/chosen": -291.1233825683594, + "logps/rejected": -234.5458526611328, + "loss": 0.4942, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.3053131103515625, + "rewards/margins": 7.835725784301758, + "rewards/rejected": -15.141037940979004, + "sft_loss": 1.075373888015747, + "step": 1195 + }, + { + "epoch": 1.972386587771203, + "grad_norm": 13.526795062615003, + "learning_rate": 7.187608208656328e-08, + "logits/chosen": 16.982704162597656, + "logits/rejected": 17.547874450683594, + "logps/chosen": -293.3042297363281, + "logps/rejected": -233.2967987060547, + "loss": 0.3964, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.24399185180664, + "rewards/margins": 7.097829818725586, + "rewards/rejected": -15.341819763183594, + "sft_loss": 1.063591718673706, + "step": 1200 + }, + { + "epoch": 1.9806048652202497, + "grad_norm": 12.330320612053741, + "learning_rate": 7.085209674031618e-08, + "logits/chosen": 18.508739471435547, + "logits/rejected": 19.527912139892578, + "logps/chosen": -318.8953857421875, + "logps/rejected": -255.2642822265625, + "loss": 0.3766, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.808796405792236, + "rewards/margins": 8.834001541137695, + "rewards/rejected": -16.642797470092773, + "sft_loss": 1.0131335258483887, + "step": 1205 + }, + { + "epoch": 1.9888231426692964, + "grad_norm": 19.628735128907458, + "learning_rate": 6.983256511136442e-08, + "logits/chosen": 17.349624633789062, + "logits/rejected": 18.25617218017578, + "logps/chosen": -315.596923828125, + "logps/rejected": -252.95460510253906, + "loss": 0.3878, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.501006126403809, + "rewards/margins": 8.494573593139648, + "rewards/rejected": -16.995580673217773, + "sft_loss": 1.0632458925247192, + "step": 1210 + }, + { + "epoch": 1.997041420118343, + "grad_norm": 14.674802699510677, + "learning_rate": 6.881757105716831e-08, + "logits/chosen": 17.45104217529297, + "logits/rejected": 18.316680908203125, + "logps/chosen": -330.3178405761719, + "logps/rejected": -251.63551330566406, + "loss": 0.4009, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.423145294189453, + "rewards/margins": 8.154979705810547, + "rewards/rejected": -16.578125, + "sft_loss": 1.0945472717285156, + "step": 1215 + }, + { + "epoch": 2.0052596975673898, + "grad_norm": 12.624994593347873, + "learning_rate": 6.780719806196828e-08, + "logits/chosen": 17.815471649169922, + "logits/rejected": 19.435829162597656, + "logps/chosen": -326.4144287109375, + "logps/rejected": -260.8008117675781, + "loss": 0.4449, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.356893539428711, + "rewards/margins": 8.929654121398926, + "rewards/rejected": -17.286548614501953, + "sft_loss": 1.1082605123519897, + "step": 1220 + }, + { + "epoch": 2.0134779750164364, + "grad_norm": 12.725913199026877, + "learning_rate": 6.680152922991822e-08, + "logits/chosen": 16.1939754486084, + "logits/rejected": 17.380538940429688, + "logps/chosen": -282.47589111328125, + "logps/rejected": -237.52879333496094, + "loss": 0.3868, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.493667602539062, + "rewards/margins": 7.595485687255859, + "rewards/rejected": -16.089153289794922, + "sft_loss": 1.1127554178237915, + "step": 1225 + }, + { + "epoch": 2.021696252465483, + "grad_norm": 16.704211079520014, + "learning_rate": 6.580064727824994e-08, + "logits/chosen": 17.634016036987305, + "logits/rejected": 18.210420608520508, + "logps/chosen": -294.94793701171875, + "logps/rejected": -239.2569122314453, + "loss": 0.4093, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -8.08222770690918, + "rewards/margins": 7.951410293579102, + "rewards/rejected": -16.03363609313965, + "sft_loss": 1.0821824073791504, + "step": 1230 + }, + { + "epoch": 2.02991452991453, + "grad_norm": 11.063546418547208, + "learning_rate": 6.480463453046985e-08, + "logits/chosen": 18.466581344604492, + "logits/rejected": 18.895183563232422, + "logps/chosen": -304.6612243652344, + "logps/rejected": -241.4573974609375, + "loss": 0.4202, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.562366485595703, + "rewards/margins": 7.818039417266846, + "rewards/rejected": -16.38040542602539, + "sft_loss": 1.1190707683563232, + "step": 1235 + }, + { + "epoch": 2.0381328073635765, + "grad_norm": 14.946770498466886, + "learning_rate": 6.381357290958767e-08, + "logits/chosen": 16.804920196533203, + "logits/rejected": 17.867015838623047, + "logps/chosen": -296.5435485839844, + "logps/rejected": -246.7471466064453, + "loss": 0.3722, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.007586479187012, + "rewards/margins": 8.32363224029541, + "rewards/rejected": -16.33121681213379, + "sft_loss": 1.104773759841919, + "step": 1240 + }, + { + "epoch": 2.046351084812623, + "grad_norm": 11.140569334845633, + "learning_rate": 6.282754393137796e-08, + "logits/chosen": 17.95855140686035, + "logits/rejected": 18.640541076660156, + "logps/chosen": -310.16778564453125, + "logps/rejected": -239.66641235351562, + "loss": 0.4065, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.086520195007324, + "rewards/margins": 8.207837104797363, + "rewards/rejected": -16.294357299804688, + "sft_loss": 1.023207187652588, + "step": 1245 + }, + { + "epoch": 2.05456936226167, + "grad_norm": 15.822685116826385, + "learning_rate": 6.184662869767577e-08, + "logits/chosen": 17.26742172241211, + "logits/rejected": 17.335512161254883, + "logps/chosen": -328.2395324707031, + "logps/rejected": -263.0542297363281, + "loss": 0.4175, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.798872947692871, + "rewards/margins": 8.849559783935547, + "rewards/rejected": -17.648433685302734, + "sft_loss": 1.1304852962493896, + "step": 1250 + }, + { + "epoch": 2.0627876397107165, + "grad_norm": 17.368549612926913, + "learning_rate": 6.08709078897056e-08, + "logits/chosen": 17.57396125793457, + "logits/rejected": 17.95652198791504, + "logps/chosen": -302.7294006347656, + "logps/rejected": -251.41261291503906, + "loss": 0.4021, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.686173439025879, + "rewards/margins": 8.318988800048828, + "rewards/rejected": -17.005163192749023, + "sft_loss": 1.119976282119751, + "step": 1255 + }, + { + "epoch": 2.0710059171597632, + "grad_norm": 13.875960320644882, + "learning_rate": 5.990046176144551e-08, + "logits/chosen": 16.934846878051758, + "logits/rejected": 17.557884216308594, + "logps/chosen": -274.9892578125, + "logps/rejected": -239.31570434570312, + "loss": 0.4283, + "rewards/accuracies": 0.9100000262260437, + "rewards/chosen": -8.879440307617188, + "rewards/margins": 7.452706336975098, + "rewards/rejected": -16.3321475982666, + "sft_loss": 1.162746787071228, + "step": 1260 + }, + { + "epoch": 2.07922419460881, + "grad_norm": 12.354544231223421, + "learning_rate": 5.893537013302602e-08, + "logits/chosen": 17.52082061767578, + "logits/rejected": 18.2637939453125, + "logps/chosen": -304.33441162109375, + "logps/rejected": -243.52101135253906, + "loss": 0.4253, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.936227321624756, + "rewards/margins": 8.085640907287598, + "rewards/rejected": -16.021867752075195, + "sft_loss": 1.0547149181365967, + "step": 1265 + }, + { + "epoch": 2.0874424720578566, + "grad_norm": 14.221340160175023, + "learning_rate": 5.7975712384164795e-08, + "logits/chosen": 17.841602325439453, + "logits/rejected": 17.95541000366211, + "logps/chosen": -295.451416015625, + "logps/rejected": -230.86936950683594, + "loss": 0.4009, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -7.948279857635498, + "rewards/margins": 7.590776443481445, + "rewards/rejected": -15.539057731628418, + "sft_loss": 1.1430902481079102, + "step": 1270 + }, + { + "epoch": 2.0956607495069033, + "grad_norm": 15.060512661462361, + "learning_rate": 5.702156744763784e-08, + "logits/chosen": 17.457277297973633, + "logits/rejected": 18.601512908935547, + "logps/chosen": -286.6520080566406, + "logps/rejected": -236.4774627685547, + "loss": 0.4211, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.859719276428223, + "rewards/margins": 7.776011943817139, + "rewards/rejected": -15.635732650756836, + "sft_loss": 1.039507269859314, + "step": 1275 + }, + { + "epoch": 2.10387902695595, + "grad_norm": 14.43891440512856, + "learning_rate": 5.607301380278683e-08, + "logits/chosen": 17.887542724609375, + "logits/rejected": 18.098596572875977, + "logps/chosen": -287.3581848144531, + "logps/rejected": -228.4025421142578, + "loss": 0.4356, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -7.526928901672363, + "rewards/margins": 8.00684642791748, + "rewards/rejected": -15.533775329589844, + "sft_loss": 1.1267131567001343, + "step": 1280 + }, + { + "epoch": 2.1120973044049967, + "grad_norm": 12.750181563192855, + "learning_rate": 5.513012946906445e-08, + "logits/chosen": 17.97955322265625, + "logits/rejected": 18.05929183959961, + "logps/chosen": -319.4637145996094, + "logps/rejected": -245.5413818359375, + "loss": 0.3884, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.154685020446777, + "rewards/margins": 8.448837280273438, + "rewards/rejected": -16.6035213470459, + "sft_loss": 1.193272590637207, + "step": 1285 + }, + { + "epoch": 2.1203155818540433, + "grad_norm": 9.936573876560704, + "learning_rate": 5.419299199961708e-08, + "logits/chosen": 17.2838077545166, + "logits/rejected": 17.822799682617188, + "logps/chosen": -337.51031494140625, + "logps/rejected": -259.70428466796875, + "loss": 0.3565, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.9683146476745605, + "rewards/margins": 8.882369041442871, + "rewards/rejected": -16.850685119628906, + "sft_loss": 1.041199803352356, + "step": 1290 + }, + { + "epoch": 2.12853385930309, + "grad_norm": 13.38745866462026, + "learning_rate": 5.3261678474905785e-08, + "logits/chosen": 18.08312225341797, + "logits/rejected": 18.110692977905273, + "logps/chosen": -324.0693359375, + "logps/rejected": -256.90234375, + "loss": 0.391, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.347922325134277, + "rewards/margins": 8.93021011352539, + "rewards/rejected": -17.27813148498535, + "sft_loss": 1.1214524507522583, + "step": 1295 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 12.52755859911023, + "learning_rate": 5.2336265496366774e-08, + "logits/chosen": 16.553739547729492, + "logits/rejected": 18.280567169189453, + "logps/chosen": -298.98480224609375, + "logps/rejected": -247.38160705566406, + "loss": 0.3604, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.771576881408691, + "rewards/margins": 7.8479180335998535, + "rewards/rejected": -16.619495391845703, + "sft_loss": 1.1661113500595093, + "step": 1300 + }, + { + "epoch": 2.1449704142011834, + "grad_norm": 15.615159328078256, + "learning_rate": 5.141682918011055e-08, + "logits/chosen": 17.72024917602539, + "logits/rejected": 18.12508773803711, + "logps/chosen": -311.2801818847656, + "logps/rejected": -245.24436950683594, + "loss": 0.4611, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.237349510192871, + "rewards/margins": 8.371785163879395, + "rewards/rejected": -16.609132766723633, + "sft_loss": 1.1050708293914795, + "step": 1305 + }, + { + "epoch": 2.15318869165023, + "grad_norm": 9.121410431305465, + "learning_rate": 5.0503445150661306e-08, + "logits/chosen": 17.203432083129883, + "logits/rejected": 18.309484481811523, + "logps/chosen": -282.98101806640625, + "logps/rejected": -231.17942810058594, + "loss": 0.3828, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.479619979858398, + "rewards/margins": 7.617303371429443, + "rewards/rejected": -16.096921920776367, + "sft_loss": 1.110097050666809, + "step": 1310 + }, + { + "epoch": 2.1614069690992768, + "grad_norm": 16.777024727194785, + "learning_rate": 4.959618853473696e-08, + "logits/chosen": 16.61244010925293, + "logits/rejected": 17.989538192749023, + "logps/chosen": -302.3112487792969, + "logps/rejected": -246.63719177246094, + "loss": 0.4158, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.352829933166504, + "rewards/margins": 8.38165283203125, + "rewards/rejected": -16.734481811523438, + "sft_loss": 1.1263587474822998, + "step": 1315 + }, + { + "epoch": 2.1696252465483234, + "grad_norm": 12.912024759458012, + "learning_rate": 4.8695133955069564e-08, + "logits/chosen": 15.624103546142578, + "logits/rejected": 16.827468872070312, + "logps/chosen": -306.7551574707031, + "logps/rejected": -244.3481903076172, + "loss": 0.429, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.34555435180664, + "rewards/margins": 8.023102760314941, + "rewards/rejected": -16.3686580657959, + "sft_loss": 1.2823337316513062, + "step": 1320 + }, + { + "epoch": 2.17784352399737, + "grad_norm": 13.478297039710323, + "learning_rate": 4.780035552426787e-08, + "logits/chosen": 16.33539581298828, + "logits/rejected": 18.024782180786133, + "logps/chosen": -325.2061767578125, + "logps/rejected": -265.6727600097656, + "loss": 0.4175, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.308411598205566, + "rewards/margins": 9.635498046875, + "rewards/rejected": -17.943910598754883, + "sft_loss": 1.1859756708145142, + "step": 1325 + }, + { + "epoch": 2.186061801446417, + "grad_norm": 13.817488574864614, + "learning_rate": 4.691192683872129e-08, + "logits/chosen": 16.309165954589844, + "logits/rejected": 17.056123733520508, + "logps/chosen": -319.3105163574219, + "logps/rejected": -256.0588073730469, + "loss": 0.382, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.325318336486816, + "rewards/margins": 8.578317642211914, + "rewards/rejected": -16.903636932373047, + "sft_loss": 1.0492181777954102, + "step": 1330 + }, + { + "epoch": 2.1942800788954635, + "grad_norm": 10.242449837573213, + "learning_rate": 4.602992097254646e-08, + "logits/chosen": 17.743621826171875, + "logits/rejected": 19.387224197387695, + "logps/chosen": -307.1810607910156, + "logps/rejected": -254.61309814453125, + "loss": 0.3948, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.201935768127441, + "rewards/margins": 8.72970962524414, + "rewards/rejected": -16.9316463470459, + "sft_loss": 1.159468173980713, + "step": 1335 + }, + { + "epoch": 2.20249835634451, + "grad_norm": 12.840091970424348, + "learning_rate": 4.515441047157707e-08, + "logits/chosen": 17.517444610595703, + "logits/rejected": 18.110706329345703, + "logps/chosen": -303.7611083984375, + "logps/rejected": -246.00747680664062, + "loss": 0.4279, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.031211853027344, + "rewards/margins": 8.574084281921387, + "rewards/rejected": -16.605297088623047, + "sft_loss": 1.1109663248062134, + "step": 1340 + }, + { + "epoch": 2.210716633793557, + "grad_norm": 11.358689713775057, + "learning_rate": 4.428546734739666e-08, + "logits/chosen": 17.79754066467285, + "logits/rejected": 18.65445327758789, + "logps/chosen": -310.1402587890625, + "logps/rejected": -257.2119445800781, + "loss": 0.3393, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.179025650024414, + "rewards/margins": 9.184054374694824, + "rewards/rejected": -17.363079071044922, + "sft_loss": 1.040381669998169, + "step": 1345 + }, + { + "epoch": 2.2189349112426036, + "grad_norm": 10.0246369651475, + "learning_rate": 4.342316307141568e-08, + "logits/chosen": 15.378368377685547, + "logits/rejected": 17.601299285888672, + "logps/chosen": -293.3377380371094, + "logps/rejected": -252.94558715820312, + "loss": 0.4169, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.73454761505127, + "rewards/margins": 8.430728912353516, + "rewards/rejected": -17.16527557373047, + "sft_loss": 1.088813304901123, + "step": 1350 + }, + { + "epoch": 2.2271531886916502, + "grad_norm": 8.66405912578809, + "learning_rate": 4.256756856899299e-08, + "logits/chosen": 16.15410041809082, + "logits/rejected": 17.089345932006836, + "logps/chosen": -293.54864501953125, + "logps/rejected": -243.08554077148438, + "loss": 0.3688, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.237372398376465, + "rewards/margins": 8.213920593261719, + "rewards/rejected": -16.4512939453125, + "sft_loss": 1.1098147630691528, + "step": 1355 + }, + { + "epoch": 2.235371466140697, + "grad_norm": 15.953137176215671, + "learning_rate": 4.171875421360202e-08, + "logits/chosen": 16.227901458740234, + "logits/rejected": 16.872665405273438, + "logps/chosen": -329.6645202636719, + "logps/rejected": -257.57489013671875, + "loss": 0.4039, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.39266300201416, + "rewards/margins": 8.796185493469238, + "rewards/rejected": -17.1888484954834, + "sft_loss": 1.1166497468948364, + "step": 1360 + }, + { + "epoch": 2.2435897435897436, + "grad_norm": 9.812512910956865, + "learning_rate": 4.0876789821042606e-08, + "logits/chosen": 16.98467445373535, + "logits/rejected": 17.594194412231445, + "logps/chosen": -308.341064453125, + "logps/rejected": -250.00465393066406, + "loss": 0.3941, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.45275592803955, + "rewards/margins": 8.58054256439209, + "rewards/rejected": -17.03329849243164, + "sft_loss": 1.1679203510284424, + "step": 1365 + }, + { + "epoch": 2.2518080210387903, + "grad_norm": 11.249560857734895, + "learning_rate": 4.0041744643698585e-08, + "logits/chosen": 17.271631240844727, + "logits/rejected": 18.480789184570312, + "logps/chosen": -323.33148193359375, + "logps/rejected": -265.4918212890625, + "loss": 0.4133, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.759995460510254, + "rewards/margins": 9.08838176727295, + "rewards/rejected": -17.848377227783203, + "sft_loss": 1.1703903675079346, + "step": 1370 + }, + { + "epoch": 2.260026298487837, + "grad_norm": 13.168234484012684, + "learning_rate": 3.9213687364841514e-08, + "logits/chosen": 17.725706100463867, + "logits/rejected": 18.4434871673584, + "logps/chosen": -265.5625305175781, + "logps/rejected": -229.72801208496094, + "loss": 0.3827, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -7.455626964569092, + "rewards/margins": 8.284765243530273, + "rewards/rejected": -15.740392684936523, + "sft_loss": 1.061354160308838, + "step": 1375 + }, + { + "epoch": 2.2682445759368837, + "grad_norm": 10.595007690116647, + "learning_rate": 3.8392686092981716e-08, + "logits/chosen": 16.218524932861328, + "logits/rejected": 17.454858779907227, + "logps/chosen": -330.2020263671875, + "logps/rejected": -259.57513427734375, + "loss": 0.3713, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.325506210327148, + "rewards/margins": 8.986472129821777, + "rewards/rejected": -17.311979293823242, + "sft_loss": 1.1411256790161133, + "step": 1380 + }, + { + "epoch": 2.2764628533859304, + "grad_norm": 16.544764732871236, + "learning_rate": 3.757880835626601e-08, + "logits/chosen": 19.006175994873047, + "logits/rejected": 20.302326202392578, + "logps/chosen": -322.05242919921875, + "logps/rejected": -260.6827087402344, + "loss": 0.3984, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.566563606262207, + "rewards/margins": 8.706660270690918, + "rewards/rejected": -17.273221969604492, + "sft_loss": 1.01236891746521, + "step": 1385 + }, + { + "epoch": 2.284681130834977, + "grad_norm": 9.92900744936661, + "learning_rate": 3.677212109692364e-08, + "logits/chosen": 16.336091995239258, + "logits/rejected": 18.393173217773438, + "logps/chosen": -296.00811767578125, + "logps/rejected": -255.37149047851562, + "loss": 0.4114, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.0043363571167, + "rewards/margins": 9.217806816101074, + "rewards/rejected": -17.222143173217773, + "sft_loss": 1.1503466367721558, + "step": 1390 + }, + { + "epoch": 2.2928994082840237, + "grad_norm": 9.386107838289549, + "learning_rate": 3.597269066576017e-08, + "logits/chosen": 17.042190551757812, + "logits/rejected": 18.17107582092285, + "logps/chosen": -300.5311584472656, + "logps/rejected": -244.8414306640625, + "loss": 0.3695, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.179112434387207, + "rewards/margins": 8.550080299377441, + "rewards/rejected": -16.72919273376465, + "sft_loss": 1.1738831996917725, + "step": 1395 + }, + { + "epoch": 2.3011176857330704, + "grad_norm": 13.817759213393538, + "learning_rate": 3.518058281669996e-08, + "logits/chosen": 17.452651977539062, + "logits/rejected": 19.167875289916992, + "logps/chosen": -325.5849914550781, + "logps/rejected": -261.9805908203125, + "loss": 0.405, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.316492080688477, + "rewards/margins": 9.281232833862305, + "rewards/rejected": -17.59772491455078, + "sft_loss": 1.0759243965148926, + "step": 1400 + }, + { + "epoch": 2.309335963182117, + "grad_norm": 14.27386340226445, + "learning_rate": 3.439586270137797e-08, + "logits/chosen": 16.01079750061035, + "logits/rejected": 17.990955352783203, + "logps/chosen": -317.35968017578125, + "logps/rejected": -265.36737060546875, + "loss": 0.382, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.888944625854492, + "rewards/margins": 9.316179275512695, + "rewards/rejected": -18.205123901367188, + "sft_loss": 1.064568281173706, + "step": 1405 + }, + { + "epoch": 2.3175542406311638, + "grad_norm": 17.736685407866446, + "learning_rate": 3.3618594863780993e-08, + "logits/chosen": 18.37812042236328, + "logits/rejected": 19.024595260620117, + "logps/chosen": -319.2788391113281, + "logps/rejected": -255.89810180664062, + "loss": 0.3468, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.851397514343262, + "rewards/margins": 8.883750915527344, + "rewards/rejected": -17.73514747619629, + "sft_loss": 1.235966682434082, + "step": 1410 + }, + { + "epoch": 2.3257725180802105, + "grad_norm": 22.487095580329445, + "learning_rate": 3.2848843234938694e-08, + "logits/chosen": 17.141220092773438, + "logits/rejected": 17.714786529541016, + "logps/chosen": -302.5834045410156, + "logps/rejected": -254.14559936523438, + "loss": 0.36, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.972297668457031, + "rewards/margins": 8.797745704650879, + "rewards/rejected": -17.770044326782227, + "sft_loss": 1.1860109567642212, + "step": 1415 + }, + { + "epoch": 2.333990795529257, + "grad_norm": 11.596948370393193, + "learning_rate": 3.208667112766529e-08, + "logits/chosen": 17.32436752319336, + "logits/rejected": 18.515031814575195, + "logps/chosen": -312.43267822265625, + "logps/rejected": -266.10052490234375, + "loss": 0.3933, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -9.664741516113281, + "rewards/margins": 9.025125503540039, + "rewards/rejected": -18.689865112304688, + "sft_loss": 1.17525315284729, + "step": 1420 + }, + { + "epoch": 2.342209072978304, + "grad_norm": 16.78732289470905, + "learning_rate": 3.1332141231352194e-08, + "logits/chosen": 17.367273330688477, + "logits/rejected": 17.978761672973633, + "logps/chosen": -325.341552734375, + "logps/rejected": -261.8766784667969, + "loss": 0.3954, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -9.092870712280273, + "rewards/margins": 9.265833854675293, + "rewards/rejected": -18.358705520629883, + "sft_loss": 1.1345161199569702, + "step": 1425 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 13.214123565552589, + "learning_rate": 3.058531560681141e-08, + "logits/chosen": 18.152240753173828, + "logits/rejected": 19.055191040039062, + "logps/chosen": -327.43487548828125, + "logps/rejected": -266.76446533203125, + "loss": 0.3363, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.500913619995117, + "rewards/margins": 9.283426284790039, + "rewards/rejected": -17.784339904785156, + "sft_loss": 1.231545329093933, + "step": 1430 + }, + { + "epoch": 2.358645627876397, + "grad_norm": 22.300498596470074, + "learning_rate": 2.984625568117129e-08, + "logits/chosen": 18.67966079711914, + "logits/rejected": 19.73933982849121, + "logps/chosen": -334.677734375, + "logps/rejected": -265.227783203125, + "loss": 0.4029, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.572039604187012, + "rewards/margins": 8.896354675292969, + "rewards/rejected": -17.468393325805664, + "sft_loss": 1.1262859106063843, + "step": 1435 + }, + { + "epoch": 2.366863905325444, + "grad_norm": 37.074159109819185, + "learning_rate": 2.9115022242823862e-08, + "logits/chosen": 17.512964248657227, + "logits/rejected": 18.453014373779297, + "logps/chosen": -326.0170593261719, + "logps/rejected": -263.2306213378906, + "loss": 0.3968, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.473074913024902, + "rewards/margins": 9.212455749511719, + "rewards/rejected": -17.685529708862305, + "sft_loss": 1.092557430267334, + "step": 1440 + }, + { + "epoch": 2.3750821827744906, + "grad_norm": 11.887196623999731, + "learning_rate": 2.839167543642511e-08, + "logits/chosen": 17.14059066772461, + "logits/rejected": 18.407007217407227, + "logps/chosen": -291.7596435546875, + "logps/rejected": -250.99574279785156, + "loss": 0.4211, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.36557388305664, + "rewards/margins": 8.695883750915527, + "rewards/rejected": -17.06145668029785, + "sft_loss": 1.202438235282898, + "step": 1445 + }, + { + "epoch": 2.3833004602235373, + "grad_norm": 17.524490621614987, + "learning_rate": 2.7676274757947816e-08, + "logits/chosen": 18.85689926147461, + "logits/rejected": 19.545021057128906, + "logps/chosen": -287.7202453613281, + "logps/rejected": -244.56924438476562, + "loss": 0.3838, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.284355163574219, + "rewards/margins": 8.70119857788086, + "rewards/rejected": -16.985553741455078, + "sft_loss": 1.0111671686172485, + "step": 1450 + }, + { + "epoch": 2.391518737672584, + "grad_norm": 9.1542241365719, + "learning_rate": 2.696887904978819e-08, + "logits/chosen": 18.2181453704834, + "logits/rejected": 18.709545135498047, + "logps/chosen": -280.198974609375, + "logps/rejected": -226.15415954589844, + "loss": 0.4051, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.791984558105469, + "rewards/margins": 7.134130477905273, + "rewards/rejected": -15.926115989685059, + "sft_loss": 1.1695269346237183, + "step": 1455 + }, + { + "epoch": 2.3997370151216306, + "grad_norm": 13.076192251177769, + "learning_rate": 2.6269546495925886e-08, + "logits/chosen": 16.172388076782227, + "logits/rejected": 17.052417755126953, + "logps/chosen": -287.6596984863281, + "logps/rejected": -242.12660217285156, + "loss": 0.4246, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.466004371643066, + "rewards/margins": 8.168050765991211, + "rewards/rejected": -16.634056091308594, + "sft_loss": 1.1705952882766724, + "step": 1460 + }, + { + "epoch": 2.4079552925706773, + "grad_norm": 14.024390303705356, + "learning_rate": 2.5578334617138236e-08, + "logits/chosen": 17.606464385986328, + "logits/rejected": 18.12337303161621, + "logps/chosen": -301.743408203125, + "logps/rejected": -248.48464965820312, + "loss": 0.3833, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.655086517333984, + "rewards/margins": 8.575737953186035, + "rewards/rejected": -17.230825424194336, + "sft_loss": 1.08839750289917, + "step": 1465 + }, + { + "epoch": 2.416173570019724, + "grad_norm": 17.812699456228195, + "learning_rate": 2.489530026626932e-08, + "logits/chosen": 17.72669219970703, + "logits/rejected": 18.6758975982666, + "logps/chosen": -306.7005310058594, + "logps/rejected": -242.5311279296875, + "loss": 0.379, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.553206443786621, + "rewards/margins": 8.148569107055664, + "rewards/rejected": -16.70177459716797, + "sft_loss": 1.1815282106399536, + "step": 1470 + }, + { + "epoch": 2.4243918474687707, + "grad_norm": 8.680672775515832, + "learning_rate": 2.422049962355366e-08, + "logits/chosen": 18.41983413696289, + "logits/rejected": 19.47545623779297, + "logps/chosen": -282.099609375, + "logps/rejected": -238.36300659179688, + "loss": 0.3486, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.316095352172852, + "rewards/margins": 8.191699981689453, + "rewards/rejected": -16.507797241210938, + "sft_loss": 1.205697774887085, + "step": 1475 + }, + { + "epoch": 2.4326101249178174, + "grad_norm": 16.775370793665815, + "learning_rate": 2.3553988191995208e-08, + "logits/chosen": 16.783174514770508, + "logits/rejected": 18.405048370361328, + "logps/chosen": -304.1385498046875, + "logps/rejected": -256.7261047363281, + "loss": 0.3744, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.7383451461792, + "rewards/margins": 8.930658340454102, + "rewards/rejected": -17.669002532958984, + "sft_loss": 1.1059280633926392, + "step": 1480 + }, + { + "epoch": 2.440828402366864, + "grad_norm": 10.437808650182905, + "learning_rate": 2.2895820792802474e-08, + "logits/chosen": 16.727697372436523, + "logits/rejected": 17.59294891357422, + "logps/chosen": -328.55389404296875, + "logps/rejected": -269.7945251464844, + "loss": 0.3695, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -9.062349319458008, + "rewards/margins": 9.41024112701416, + "rewards/rejected": -18.472591400146484, + "sft_loss": 1.173682451248169, + "step": 1485 + }, + { + "epoch": 2.4490466798159107, + "grad_norm": 20.73492010593765, + "learning_rate": 2.2246051560879095e-08, + "logits/chosen": 16.899852752685547, + "logits/rejected": 17.82339096069336, + "logps/chosen": -338.22186279296875, + "logps/rejected": -279.0784912109375, + "loss": 0.4179, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -9.162134170532227, + "rewards/margins": 9.35285758972168, + "rewards/rejected": -18.514989852905273, + "sft_loss": 1.17171311378479, + "step": 1490 + }, + { + "epoch": 2.4572649572649574, + "grad_norm": 12.636170820327301, + "learning_rate": 2.160473394037149e-08, + "logits/chosen": 17.118467330932617, + "logits/rejected": 17.36690330505371, + "logps/chosen": -335.8661804199219, + "logps/rejected": -262.7174072265625, + "loss": 0.4504, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.756584167480469, + "rewards/margins": 8.854002952575684, + "rewards/rejected": -17.610586166381836, + "sft_loss": 1.1354836225509644, + "step": 1495 + }, + { + "epoch": 2.465483234714004, + "grad_norm": 11.838207356236568, + "learning_rate": 2.097192068027276e-08, + "logits/chosen": 16.54058837890625, + "logits/rejected": 17.930091857910156, + "logps/chosen": -329.2217712402344, + "logps/rejected": -270.410888671875, + "loss": 0.3262, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.5565185546875, + "rewards/margins": 9.7839937210083, + "rewards/rejected": -18.340513229370117, + "sft_loss": 1.0987026691436768, + "step": 1500 + }, + { + "epoch": 2.473701512163051, + "grad_norm": 17.261726485061967, + "learning_rate": 2.0347663830084182e-08, + "logits/chosen": 16.857637405395508, + "logits/rejected": 17.605924606323242, + "logps/chosen": -278.7782287597656, + "logps/rejected": -237.3050537109375, + "loss": 0.3978, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.520981788635254, + "rewards/margins": 7.912033557891846, + "rewards/rejected": -16.433013916015625, + "sft_loss": 1.1526176929473877, + "step": 1505 + }, + { + "epoch": 2.4819197896120975, + "grad_norm": 19.01971806956554, + "learning_rate": 1.9732014735534168e-08, + "logits/chosen": 17.1612606048584, + "logits/rejected": 17.63095474243164, + "logps/chosen": -307.8269958496094, + "logps/rejected": -234.46160888671875, + "loss": 0.4156, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.234945297241211, + "rewards/margins": 7.63665771484375, + "rewards/rejected": -15.871603012084961, + "sft_loss": 1.1170748472213745, + "step": 1510 + }, + { + "epoch": 2.490138067061144, + "grad_norm": 10.87683842585221, + "learning_rate": 1.9125024034354758e-08, + "logits/chosen": 17.20734214782715, + "logits/rejected": 17.946365356445312, + "logps/chosen": -312.4763488769531, + "logps/rejected": -246.2183837890625, + "loss": 0.351, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.035243034362793, + "rewards/margins": 8.52718734741211, + "rewards/rejected": -16.56243133544922, + "sft_loss": 1.071519374847412, + "step": 1515 + }, + { + "epoch": 2.498356344510191, + "grad_norm": 10.244386828979161, + "learning_rate": 1.85267416521169e-08, + "logits/chosen": 17.724872589111328, + "logits/rejected": 18.053852081298828, + "logps/chosen": -320.68597412109375, + "logps/rejected": -246.91893005371094, + "loss": 0.3733, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.582959175109863, + "rewards/margins": 8.986913681030273, + "rewards/rejected": -16.56987190246582, + "sft_loss": 1.0908424854278564, + "step": 1520 + }, + { + "epoch": 2.5065746219592375, + "grad_norm": 12.813362766851835, + "learning_rate": 1.793721679812389e-08, + "logits/chosen": 18.601253509521484, + "logits/rejected": 19.362607955932617, + "logps/chosen": -288.14776611328125, + "logps/rejected": -237.0640106201172, + "loss": 0.3857, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.49190616607666, + "rewards/margins": 7.810946464538574, + "rewards/rejected": -16.302852630615234, + "sft_loss": 1.091495394706726, + "step": 1525 + }, + { + "epoch": 2.5147928994082838, + "grad_norm": 11.364641270765484, + "learning_rate": 1.735649796136382e-08, + "logits/chosen": 15.785613059997559, + "logits/rejected": 17.070707321166992, + "logps/chosen": -324.2843017578125, + "logps/rejected": -258.7143859863281, + "loss": 0.3883, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -8.855399131774902, + "rewards/margins": 8.804574012756348, + "rewards/rejected": -17.65997314453125, + "sft_loss": 1.1961203813552856, + "step": 1530 + }, + { + "epoch": 2.523011176857331, + "grad_norm": 15.007717453848354, + "learning_rate": 1.678463290652142e-08, + "logits/chosen": 17.604642868041992, + "logits/rejected": 17.90863609313965, + "logps/chosen": -312.69024658203125, + "logps/rejected": -249.09962463378906, + "loss": 0.3626, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.328804016113281, + "rewards/margins": 8.493717193603516, + "rewards/rejected": -16.822521209716797, + "sft_loss": 1.157140851020813, + "step": 1535 + }, + { + "epoch": 2.531229454306377, + "grad_norm": 8.792843348493232, + "learning_rate": 1.6221668670049315e-08, + "logits/chosen": 16.296873092651367, + "logits/rejected": 17.479211807250977, + "logps/chosen": -327.1073303222656, + "logps/rejected": -271.3090515136719, + "loss": 0.3481, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.877535820007324, + "rewards/margins": 9.413492202758789, + "rewards/rejected": -18.291027069091797, + "sft_loss": 1.1908369064331055, + "step": 1540 + }, + { + "epoch": 2.5394477317554243, + "grad_norm": 14.557660052303598, + "learning_rate": 1.5667651556299178e-08, + "logits/chosen": 16.44731903076172, + "logits/rejected": 17.4537296295166, + "logps/chosen": -306.1639709472656, + "logps/rejected": -253.69247436523438, + "loss": 0.3531, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.432329177856445, + "rewards/margins": 8.949870109558105, + "rewards/rejected": -17.382200241088867, + "sft_loss": 1.114105463027954, + "step": 1545 + }, + { + "epoch": 2.5476660092044705, + "grad_norm": 15.415629759090677, + "learning_rate": 1.5122627133713262e-08, + "logits/chosen": 15.742711067199707, + "logits/rejected": 17.65005874633789, + "logps/chosen": -317.73675537109375, + "logps/rejected": -260.23907470703125, + "loss": 0.3849, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.840112209320068, + "rewards/margins": 9.513845443725586, + "rewards/rejected": -17.35395622253418, + "sft_loss": 1.292752981185913, + "step": 1550 + }, + { + "epoch": 2.5558842866535176, + "grad_norm": 11.677314306376815, + "learning_rate": 1.4586640231076226e-08, + "logits/chosen": 17.83001708984375, + "logits/rejected": 18.008840560913086, + "logps/chosen": -290.8938293457031, + "logps/rejected": -234.77801513671875, + "loss": 0.3699, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -8.604043006896973, + "rewards/margins": 7.615962028503418, + "rewards/rejected": -16.22000503540039, + "sft_loss": 1.1707122325897217, + "step": 1555 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 17.403632913196056, + "learning_rate": 1.405973493382806e-08, + "logits/chosen": 16.150592803955078, + "logits/rejected": 17.557065963745117, + "logps/chosen": -321.72802734375, + "logps/rejected": -270.1099548339844, + "loss": 0.3552, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -9.425326347351074, + "rewards/margins": 9.209266662597656, + "rewards/rejected": -18.634592056274414, + "sft_loss": 1.0887880325317383, + "step": 1560 + }, + { + "epoch": 2.572320841551611, + "grad_norm": 16.12695707285676, + "learning_rate": 1.3541954580437941e-08, + "logits/chosen": 18.370115280151367, + "logits/rejected": 18.63874626159668, + "logps/chosen": -321.462646484375, + "logps/rejected": -259.6288757324219, + "loss": 0.3254, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.813506126403809, + "rewards/margins": 9.208869934082031, + "rewards/rejected": -18.022377014160156, + "sft_loss": 1.0541073083877563, + "step": 1565 + }, + { + "epoch": 2.5805391190006572, + "grad_norm": 18.71581348868284, + "learning_rate": 1.3033341758839592e-08, + "logits/chosen": 16.9278621673584, + "logits/rejected": 17.87784767150879, + "logps/chosen": -333.1341552734375, + "logps/rejected": -271.1338195800781, + "loss": 0.4055, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -9.0064058303833, + "rewards/margins": 9.4508638381958, + "rewards/rejected": -18.4572696685791, + "sft_loss": 1.1667834520339966, + "step": 1570 + }, + { + "epoch": 2.5887573964497044, + "grad_norm": 11.817463136679503, + "learning_rate": 1.2533938302928329e-08, + "logits/chosen": 17.372867584228516, + "logits/rejected": 18.298500061035156, + "logps/chosen": -346.6560974121094, + "logps/rejected": -274.773681640625, + "loss": 0.3683, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.409444808959961, + "rewards/margins": 9.886656761169434, + "rewards/rejected": -18.296100616455078, + "sft_loss": 1.183761477470398, + "step": 1575 + }, + { + "epoch": 2.5969756738987506, + "grad_norm": 11.086694788731137, + "learning_rate": 1.2043785289120409e-08, + "logits/chosen": 16.920242309570312, + "logits/rejected": 18.36749839782715, + "logps/chosen": -333.09539794921875, + "logps/rejected": -272.066162109375, + "loss": 0.378, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.567721366882324, + "rewards/margins": 9.71126937866211, + "rewards/rejected": -18.278989791870117, + "sft_loss": 1.226511001586914, + "step": 1580 + }, + { + "epoch": 2.6051939513477977, + "grad_norm": 19.57395022687368, + "learning_rate": 1.1562923032974125e-08, + "logits/chosen": 17.482685089111328, + "logits/rejected": 18.186784744262695, + "logps/chosen": -336.7694396972656, + "logps/rejected": -273.9622497558594, + "loss": 0.3656, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.473450660705566, + "rewards/margins": 9.806720733642578, + "rewards/rejected": -18.280170440673828, + "sft_loss": 1.0997947454452515, + "step": 1585 + }, + { + "epoch": 2.613412228796844, + "grad_norm": 13.846460377871546, + "learning_rate": 1.1091391085874161e-08, + "logits/chosen": 17.66254425048828, + "logits/rejected": 17.869403839111328, + "logps/chosen": -355.09124755859375, + "logps/rejected": -262.7408142089844, + "loss": 0.3909, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.950118064880371, + "rewards/margins": 8.9635009765625, + "rewards/rejected": -17.913618087768555, + "sft_loss": 1.2338536977767944, + "step": 1590 + }, + { + "epoch": 2.621630506245891, + "grad_norm": 18.388530342654583, + "learning_rate": 1.06292282317781e-08, + "logits/chosen": 18.353347778320312, + "logits/rejected": 19.111572265625, + "logps/chosen": -293.8038024902344, + "logps/rejected": -240.29061889648438, + "loss": 0.3818, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.344436645507812, + "rewards/margins": 8.241558074951172, + "rewards/rejected": -16.58599281311035, + "sft_loss": 1.1101101636886597, + "step": 1595 + }, + { + "epoch": 2.6298487836949374, + "grad_norm": 48.6421585527008, + "learning_rate": 1.017647248402674e-08, + "logits/chosen": 17.27472686767578, + "logits/rejected": 17.775699615478516, + "logps/chosen": -338.6330871582031, + "logps/rejected": -265.278564453125, + "loss": 0.4384, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.897347450256348, + "rewards/margins": 8.987475395202637, + "rewards/rejected": -17.884824752807617, + "sft_loss": 1.1422169208526611, + "step": 1600 + }, + { + "epoch": 2.6380670611439845, + "grad_norm": 13.863026192467665, + "learning_rate": 9.733161082217223e-09, + "logits/chosen": 16.872806549072266, + "logits/rejected": 17.572965621948242, + "logps/chosen": -321.6798095703125, + "logps/rejected": -258.7831115722656, + "loss": 0.4032, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.822911262512207, + "rewards/margins": 8.982833862304688, + "rewards/rejected": -17.805744171142578, + "sft_loss": 1.125891923904419, + "step": 1605 + }, + { + "epoch": 2.6462853385930307, + "grad_norm": 15.402379291218823, + "learning_rate": 9.299330489140125e-09, + "logits/chosen": 17.64206314086914, + "logits/rejected": 18.37377166748047, + "logps/chosen": -285.738037109375, + "logps/rejected": -240.1550750732422, + "loss": 0.4197, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.605627059936523, + "rewards/margins": 8.010725021362305, + "rewards/rejected": -16.616352081298828, + "sft_loss": 1.0786948204040527, + "step": 1610 + }, + { + "epoch": 2.654503616042078, + "grad_norm": 11.137896212671778, + "learning_rate": 8.87501638778039e-09, + "logits/chosen": 16.587888717651367, + "logits/rejected": 17.759031295776367, + "logps/chosen": -309.4990539550781, + "logps/rejected": -254.31495666503906, + "loss": 0.4112, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.20348834991455, + "rewards/margins": 8.457581520080566, + "rewards/rejected": -17.66107177734375, + "sft_loss": 1.10163414478302, + "step": 1615 + }, + { + "epoch": 2.662721893491124, + "grad_norm": 10.357256991488983, + "learning_rate": 8.460253678382296e-09, + "logits/chosen": 17.529693603515625, + "logits/rejected": 18.570171356201172, + "logps/chosen": -337.939453125, + "logps/rejected": -269.9917297363281, + "loss": 0.3553, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.473341941833496, + "rewards/margins": 9.756902694702148, + "rewards/rejected": -18.230243682861328, + "sft_loss": 1.0737409591674805, + "step": 1620 + }, + { + "epoch": 2.6709401709401708, + "grad_norm": 12.585703695972256, + "learning_rate": 8.055076475578918e-09, + "logits/chosen": 17.500032424926758, + "logits/rejected": 18.32237434387207, + "logps/chosen": -326.6228942871094, + "logps/rejected": -261.5873107910156, + "loss": 0.3922, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.563287734985352, + "rewards/margins": 9.018136024475098, + "rewards/rejected": -17.581424713134766, + "sft_loss": 1.1417536735534668, + "step": 1625 + }, + { + "epoch": 2.6791584483892175, + "grad_norm": 12.800841299642682, + "learning_rate": 7.659518105586238e-09, + "logits/chosen": 16.294475555419922, + "logits/rejected": 18.111600875854492, + "logps/chosen": -335.0698547363281, + "logps/rejected": -275.7948913574219, + "loss": 0.3539, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.872127532958984, + "rewards/margins": 9.738655090332031, + "rewards/rejected": -18.610782623291016, + "sft_loss": 1.171600341796875, + "step": 1630 + }, + { + "epoch": 2.687376725838264, + "grad_norm": 11.437442046862925, + "learning_rate": 7.273611103461836e-09, + "logits/chosen": 17.347509384155273, + "logits/rejected": 18.348569869995117, + "logps/chosen": -303.4100341796875, + "logps/rejected": -250.9491729736328, + "loss": 0.3316, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.186358451843262, + "rewards/margins": 8.977690696716309, + "rewards/rejected": -17.164051055908203, + "sft_loss": 1.1324518918991089, + "step": 1635 + }, + { + "epoch": 2.695595003287311, + "grad_norm": 12.932581100678355, + "learning_rate": 6.897387210429067e-09, + "logits/chosen": 17.321182250976562, + "logits/rejected": 18.35422134399414, + "logps/chosen": -298.5028381347656, + "logps/rejected": -246.92356872558594, + "loss": 0.4056, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.715899467468262, + "rewards/margins": 8.367709159851074, + "rewards/rejected": -17.08361053466797, + "sft_loss": 1.143718957901001, + "step": 1640 + }, + { + "epoch": 2.7038132807363575, + "grad_norm": 14.723722025410018, + "learning_rate": 6.530877371266175e-09, + "logits/chosen": 16.489261627197266, + "logits/rejected": 17.733213424682617, + "logps/chosen": -305.61749267578125, + "logps/rejected": -256.1786804199219, + "loss": 0.3542, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.568675994873047, + "rewards/margins": 9.070348739624023, + "rewards/rejected": -17.639026641845703, + "sft_loss": 1.176300048828125, + "step": 1645 + }, + { + "epoch": 2.712031558185404, + "grad_norm": 22.923491412294727, + "learning_rate": 6.1741117317611196e-09, + "logits/chosen": 17.291810989379883, + "logits/rejected": 18.644412994384766, + "logps/chosen": -321.612060546875, + "logps/rejected": -269.1338195800781, + "loss": 0.4291, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -9.136231422424316, + "rewards/margins": 9.433501243591309, + "rewards/rejected": -18.569734573364258, + "sft_loss": 1.2353969812393188, + "step": 1650 + }, + { + "epoch": 2.720249835634451, + "grad_norm": 10.617946186080342, + "learning_rate": 5.827119636232017e-09, + "logits/chosen": 17.4252872467041, + "logits/rejected": 18.208906173706055, + "logps/chosen": -308.66943359375, + "logps/rejected": -251.34764099121094, + "loss": 0.4103, + "rewards/accuracies": 0.9300000071525574, + "rewards/chosen": -8.655915260314941, + "rewards/margins": 8.82339096069336, + "rewards/rejected": -17.479307174682617, + "sft_loss": 1.2225102186203003, + "step": 1655 + }, + { + "epoch": 2.7284681130834976, + "grad_norm": 9.888027224233095, + "learning_rate": 5.489929625113549e-09, + "logits/chosen": 16.691282272338867, + "logits/rejected": 17.671295166015625, + "logps/chosen": -328.8042297363281, + "logps/rejected": -267.9706726074219, + "loss": 0.4266, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.37188720703125, + "rewards/margins": 9.643902778625488, + "rewards/rejected": -18.015790939331055, + "sft_loss": 1.2559726238250732, + "step": 1660 + }, + { + "epoch": 2.7366863905325443, + "grad_norm": 20.805457290074077, + "learning_rate": 5.1625694326095506e-09, + "logits/chosen": 16.405752182006836, + "logits/rejected": 17.14948081970215, + "logps/chosen": -341.1684875488281, + "logps/rejected": -271.09710693359375, + "loss": 0.3332, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.78724193572998, + "rewards/margins": 9.57591724395752, + "rewards/rejected": -18.363157272338867, + "sft_loss": 1.0471839904785156, + "step": 1665 + }, + { + "epoch": 2.744904667981591, + "grad_norm": 13.812771677348046, + "learning_rate": 4.845065984411742e-09, + "logits/chosen": 16.383556365966797, + "logits/rejected": 17.95462989807129, + "logps/chosen": -331.20526123046875, + "logps/rejected": -279.220458984375, + "loss": 0.357, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -9.316568374633789, + "rewards/margins": 9.552423477172852, + "rewards/rejected": -18.86899185180664, + "sft_loss": 1.163619875907898, + "step": 1670 + }, + { + "epoch": 2.7531229454306376, + "grad_norm": 14.688896292238876, + "learning_rate": 4.5374453954851035e-09, + "logits/chosen": 18.362672805786133, + "logits/rejected": 19.01654815673828, + "logps/chosen": -307.0843200683594, + "logps/rejected": -246.02671813964844, + "loss": 0.3677, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.525540351867676, + "rewards/margins": 8.622610092163086, + "rewards/rejected": -17.148151397705078, + "sft_loss": 1.1697852611541748, + "step": 1675 + }, + { + "epoch": 2.7613412228796843, + "grad_norm": 15.151111907515142, + "learning_rate": 4.239732967919976e-09, + "logits/chosen": 18.35997772216797, + "logits/rejected": 18.519113540649414, + "logps/chosen": -283.6457214355469, + "logps/rejected": -239.9178466796875, + "loss": 0.3946, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.615645408630371, + "rewards/margins": 8.21639633178711, + "rewards/rejected": -16.832042694091797, + "sft_loss": 1.185640573501587, + "step": 1680 + }, + { + "epoch": 2.769559500328731, + "grad_norm": 8.993747704826987, + "learning_rate": 3.951953188850762e-09, + "logits/chosen": 15.838356018066406, + "logits/rejected": 17.58329963684082, + "logps/chosen": -300.3641662597656, + "logps/rejected": -254.8916473388672, + "loss": 0.3834, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.536845207214355, + "rewards/margins": 8.757308006286621, + "rewards/rejected": -17.294153213500977, + "sft_loss": 1.1382744312286377, + "step": 1685 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 13.655366152597214, + "learning_rate": 3.674129728442013e-09, + "logits/chosen": 17.68130874633789, + "logits/rejected": 19.080127716064453, + "logps/chosen": -268.5239562988281, + "logps/rejected": -229.50523376464844, + "loss": 0.3877, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.217788696289062, + "rewards/margins": 7.664586067199707, + "rewards/rejected": -15.882373809814453, + "sft_loss": 1.0555132627487183, + "step": 1690 + }, + { + "epoch": 2.7859960552268244, + "grad_norm": 11.562075341982874, + "learning_rate": 3.4062854379414694e-09, + "logits/chosen": 17.3222599029541, + "logits/rejected": 18.08160972595215, + "logps/chosen": -306.2829284667969, + "logps/rejected": -251.935546875, + "loss": 0.3607, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.6878023147583, + "rewards/margins": 8.51749038696289, + "rewards/rejected": -17.205289840698242, + "sft_loss": 1.1840558052062988, + "step": 1695 + }, + { + "epoch": 2.794214332675871, + "grad_norm": 25.80729521542422, + "learning_rate": 3.1484423478004563e-09, + "logits/chosen": 17.99493408203125, + "logits/rejected": 18.518619537353516, + "logps/chosen": -289.563232421875, + "logps/rejected": -243.09219360351562, + "loss": 0.4297, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.428235054016113, + "rewards/margins": 8.079200744628906, + "rewards/rejected": -16.50743865966797, + "sft_loss": 1.108068585395813, + "step": 1700 + }, + { + "epoch": 2.8024326101249177, + "grad_norm": 13.892592196473423, + "learning_rate": 2.9006216658619687e-09, + "logits/chosen": 16.929012298583984, + "logits/rejected": 17.672870635986328, + "logps/chosen": -313.2660217285156, + "logps/rejected": -257.04034423828125, + "loss": 0.399, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.524652481079102, + "rewards/margins": 9.001575469970703, + "rewards/rejected": -17.526227951049805, + "sft_loss": 1.1432716846466064, + "step": 1705 + }, + { + "epoch": 2.8106508875739644, + "grad_norm": 9.790469642612795, + "learning_rate": 2.6628437756162635e-09, + "logits/chosen": 17.310102462768555, + "logits/rejected": 18.04708480834961, + "logps/chosen": -293.9396057128906, + "logps/rejected": -240.7176971435547, + "loss": 0.3473, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.278569221496582, + "rewards/margins": 8.390737533569336, + "rewards/rejected": -16.669307708740234, + "sft_loss": 1.1700962781906128, + "step": 1710 + }, + { + "epoch": 2.818869165023011, + "grad_norm": 9.444898259948333, + "learning_rate": 2.435128234524228e-09, + "logits/chosen": 17.586627960205078, + "logits/rejected": 18.176280975341797, + "logps/chosen": -299.7925109863281, + "logps/rejected": -243.57485961914062, + "loss": 0.4067, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -8.356633186340332, + "rewards/margins": 8.317458152770996, + "rewards/rejected": -16.674091339111328, + "sft_loss": 1.1623938083648682, + "step": 1715 + }, + { + "epoch": 2.827087442472058, + "grad_norm": 11.302990178309454, + "learning_rate": 2.2174937724088877e-09, + "logits/chosen": 17.02381134033203, + "logits/rejected": 18.46286392211914, + "logps/chosen": -314.8418273925781, + "logps/rejected": -259.57745361328125, + "loss": 0.4069, + "rewards/accuracies": 0.9200000166893005, + "rewards/chosen": -8.880843162536621, + "rewards/margins": 8.848891258239746, + "rewards/rejected": -17.729736328125, + "sft_loss": 1.1079494953155518, + "step": 1720 + }, + { + "epoch": 2.8353057199211045, + "grad_norm": 9.711126487613186, + "learning_rate": 2.009958289914765e-09, + "logits/chosen": 17.012800216674805, + "logits/rejected": 18.349876403808594, + "logps/chosen": -321.7917175292969, + "logps/rejected": -270.1522216796875, + "loss": 0.3451, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.694296836853027, + "rewards/margins": 9.856916427612305, + "rewards/rejected": -18.551212310791016, + "sft_loss": 1.0486385822296143, + "step": 1725 + }, + { + "epoch": 2.843523997370151, + "grad_norm": 22.398743525886992, + "learning_rate": 1.8125388570355422e-09, + "logits/chosen": 16.76806640625, + "logits/rejected": 17.946535110473633, + "logps/chosen": -312.1168212890625, + "logps/rejected": -266.208984375, + "loss": 0.3337, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.700928688049316, + "rewards/margins": 9.464086532592773, + "rewards/rejected": -18.165014266967773, + "sft_loss": 1.162864327430725, + "step": 1730 + }, + { + "epoch": 2.851742274819198, + "grad_norm": 14.549076580676688, + "learning_rate": 1.6252517117101017e-09, + "logits/chosen": 16.1746768951416, + "logits/rejected": 17.028032302856445, + "logps/chosen": -316.6230773925781, + "logps/rejected": -258.9454345703125, + "loss": 0.4137, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.804771423339844, + "rewards/margins": 8.894736289978027, + "rewards/rejected": -17.699508666992188, + "sft_loss": 1.2625643014907837, + "step": 1735 + }, + { + "epoch": 2.8599605522682445, + "grad_norm": 22.305121337267558, + "learning_rate": 1.4481122584868582e-09, + "logits/chosen": 16.654598236083984, + "logits/rejected": 17.727828979492188, + "logps/chosen": -327.3823547363281, + "logps/rejected": -264.8335876464844, + "loss": 0.4201, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.304695129394531, + "rewards/margins": 8.695550918579102, + "rewards/rejected": -18.000246047973633, + "sft_loss": 1.1684330701828003, + "step": 1740 + }, + { + "epoch": 2.868178829717291, + "grad_norm": 12.445565014042396, + "learning_rate": 1.2811350672568138e-09, + "logits/chosen": 16.678804397583008, + "logits/rejected": 18.215984344482422, + "logps/chosen": -340.2626953125, + "logps/rejected": -277.87872314453125, + "loss": 0.4267, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.802419662475586, + "rewards/margins": 9.925808906555176, + "rewards/rejected": -18.728229522705078, + "sft_loss": 1.1311696767807007, + "step": 1745 + }, + { + "epoch": 2.876397107166338, + "grad_norm": 16.390316522695066, + "learning_rate": 1.1243338720550445e-09, + "logits/chosen": 16.955345153808594, + "logits/rejected": 18.02084732055664, + "logps/chosen": -291.6322937011719, + "logps/rejected": -249.6865234375, + "loss": 0.4018, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.541548728942871, + "rewards/margins": 8.891424179077148, + "rewards/rejected": -17.432973861694336, + "sft_loss": 1.122809648513794, + "step": 1750 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 12.452008236969373, + "learning_rate": 9.777215699311725e-10, + "logits/chosen": 17.285600662231445, + "logits/rejected": 18.065244674682617, + "logps/chosen": -304.11834716796875, + "logps/rejected": -252.4257049560547, + "loss": 0.3855, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.604863166809082, + "rewards/margins": 8.664654731750488, + "rewards/rejected": -17.269519805908203, + "sft_loss": 1.2225173711776733, + "step": 1755 + }, + { + "epoch": 2.8928336620644313, + "grad_norm": 26.02343316648693, + "learning_rate": 8.413102198885358e-10, + "logits/chosen": 15.385034561157227, + "logits/rejected": 16.86432456970215, + "logps/chosen": -327.46160888671875, + "logps/rejected": -264.8345031738281, + "loss": 0.4478, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.194596290588379, + "rewards/margins": 8.853148460388184, + "rewards/rejected": -18.047740936279297, + "sft_loss": 1.0643724203109741, + "step": 1760 + }, + { + "epoch": 2.901051939513478, + "grad_norm": 9.970098814112205, + "learning_rate": 7.151110418923134e-10, + "logits/chosen": 18.434673309326172, + "logits/rejected": 18.675090789794922, + "logps/chosen": -302.91534423828125, + "logps/rejected": -248.91583251953125, + "loss": 0.3988, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -8.768433570861816, + "rewards/margins": 8.766546249389648, + "rewards/rejected": -17.53498077392578, + "sft_loss": 1.1868294477462769, + "step": 1765 + }, + { + "epoch": 2.9092702169625246, + "grad_norm": 18.45143826968204, + "learning_rate": 5.991344159466672e-10, + "logits/chosen": 16.24605941772461, + "logits/rejected": 17.377365112304688, + "logps/chosen": -318.8271789550781, + "logps/rejected": -257.1405334472656, + "loss": 0.352, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.925313949584961, + "rewards/margins": 8.410311698913574, + "rewards/rejected": -17.335628509521484, + "sft_loss": 1.1228437423706055, + "step": 1770 + }, + { + "epoch": 2.9174884944115713, + "grad_norm": 12.923252042791281, + "learning_rate": 4.933898812409937e-10, + "logits/chosen": 16.73847198486328, + "logits/rejected": 17.230134963989258, + "logps/chosen": -338.15118408203125, + "logps/rejected": -271.0611267089844, + "loss": 0.3936, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -9.159259796142578, + "rewards/margins": 9.365001678466797, + "rewards/rejected": -18.524259567260742, + "sft_loss": 1.1974759101867676, + "step": 1775 + }, + { + "epoch": 2.925706771860618, + "grad_norm": 18.59092085629164, + "learning_rate": 3.978861353653301e-10, + "logits/chosen": 17.0466251373291, + "logits/rejected": 17.81385612487793, + "logps/chosen": -301.74603271484375, + "logps/rejected": -247.91571044921875, + "loss": 0.4187, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.685689926147461, + "rewards/margins": 8.343822479248047, + "rewards/rejected": -17.02951431274414, + "sft_loss": 1.086068034172058, + "step": 1780 + }, + { + "epoch": 2.9339250493096647, + "grad_norm": 16.48047435187235, + "learning_rate": 3.1263103359494005e-10, + "logits/chosen": 17.160581588745117, + "logits/rejected": 18.65143585205078, + "logps/chosen": -300.696533203125, + "logps/rejected": -245.1064453125, + "loss": 0.3632, + "rewards/accuracies": 0.9399999976158142, + "rewards/chosen": -7.98746395111084, + "rewards/margins": 8.9141206741333, + "rewards/rejected": -16.90158462524414, + "sft_loss": 1.4039214849472046, + "step": 1785 + }, + { + "epoch": 2.9421433267587114, + "grad_norm": 14.541496183664128, + "learning_rate": 2.3763158824419147e-10, + "logits/chosen": 16.89483642578125, + "logits/rejected": 17.82222557067871, + "logps/chosen": -328.6429748535156, + "logps/rejected": -268.05938720703125, + "loss": 0.3455, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.467479705810547, + "rewards/margins": 9.740607261657715, + "rewards/rejected": -18.208087921142578, + "sft_loss": 1.1161048412322998, + "step": 1790 + }, + { + "epoch": 2.950361604207758, + "grad_norm": 14.008459290888235, + "learning_rate": 1.728939680898517e-10, + "logits/chosen": 16.10931396484375, + "logits/rejected": 17.673229217529297, + "logps/chosen": -308.1393737792969, + "logps/rejected": -262.28009033203125, + "loss": 0.367, + "rewards/accuracies": 0.9900000095367432, + "rewards/chosen": -9.119178771972656, + "rewards/margins": 8.93433666229248, + "rewards/rejected": -18.05351448059082, + "sft_loss": 1.1963419914245605, + "step": 1795 + }, + { + "epoch": 2.9585798816568047, + "grad_norm": 16.666553214725845, + "learning_rate": 1.184234978636456e-10, + "logits/chosen": 16.49167823791504, + "logits/rejected": 17.332914352416992, + "logps/chosen": -282.5769348144531, + "logps/rejected": -248.25242614746094, + "loss": 0.3921, + "rewards/accuracies": 0.9599999785423279, + "rewards/chosen": -8.752376556396484, + "rewards/margins": 8.20280933380127, + "rewards/rejected": -16.955184936523438, + "sft_loss": 1.2729109525680542, + "step": 1800 + }, + { + "epoch": 2.9667981591058514, + "grad_norm": 15.626709598718353, + "learning_rate": 7.422465781431464e-11, + "logits/chosen": 16.95427894592285, + "logits/rejected": 17.818552017211914, + "logps/chosen": -329.6918640136719, + "logps/rejected": -266.5020446777344, + "loss": 0.3843, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.854848861694336, + "rewards/margins": 9.140162467956543, + "rewards/rejected": -17.995010375976562, + "sft_loss": 1.1496516466140747, + "step": 1805 + }, + { + "epoch": 2.975016436554898, + "grad_norm": 12.600231440275685, + "learning_rate": 4.030108333910598e-11, + "logits/chosen": 17.70891571044922, + "logits/rejected": 18.366714477539062, + "logps/chosen": -295.2488708496094, + "logps/rejected": -242.97634887695312, + "loss": 0.3819, + "rewards/accuracies": 0.9700000286102295, + "rewards/chosen": -8.695423126220703, + "rewards/margins": 8.07010269165039, + "rewards/rejected": -16.765525817871094, + "sft_loss": 1.1890416145324707, + "step": 1810 + }, + { + "epoch": 2.983234714003945, + "grad_norm": 14.846673339349834, + "learning_rate": 1.6655564684747713e-11, + "logits/chosen": 17.073108673095703, + "logits/rejected": 17.751785278320312, + "logps/chosen": -334.0798034667969, + "logps/rejected": -261.91644287109375, + "loss": 0.372, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.156329154968262, + "rewards/margins": 8.551715850830078, + "rewards/rejected": -17.708045959472656, + "sft_loss": 1.1268292665481567, + "step": 1815 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 9.077728581968316, + "learning_rate": 3.290046717979722e-12, + "logits/chosen": 16.00580406188965, + "logits/rejected": 16.491676330566406, + "logps/chosen": -320.336181640625, + "logps/rejected": -255.6234588623047, + "loss": 0.3871, + "rewards/accuracies": 0.9800000190734863, + "rewards/chosen": -7.969948768615723, + "rewards/margins": 9.396775245666504, + "rewards/rejected": -17.366724014282227, + "sft_loss": 1.16538667678833, + "step": 1820 + }, + { + "epoch": 2.998027613412229, + "step": 1824, + "total_flos": 287426369617920.0, + "train_loss": 0.5032803327368017, + "train_runtime": 76434.0426, + "train_samples_per_second": 1.433, + "train_steps_per_second": 0.024 + } + ], + "logging_steps": 5, + "max_steps": 1824, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 287426369617920.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}