diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11706 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8335, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 5.995203836930456e-09, + "logits/chosen": -2.424614667892456, + "logits/rejected": -1.9891018867492676, + "logps/chosen": -441.5737609863281, + "logps/rejected": -473.3967590332031, + "loss": 0.1361, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 5.995203836930456e-08, + "logits/chosen": -2.110199213027954, + "logits/rejected": -1.765876054763794, + "logps/chosen": -209.27218627929688, + "logps/rejected": -153.5750274658203, + "loss": 0.2066, + "rewards/accuracies": 0.3888888955116272, + "rewards/chosen": 0.0005430497694760561, + "rewards/margins": 0.0006039439467713237, + "rewards/rejected": -6.089422822697088e-05, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 1.1990407673860913e-07, + "logits/chosen": -1.9729121923446655, + "logits/rejected": -1.6711788177490234, + "logps/chosen": -187.25914001464844, + "logps/rejected": -146.9638671875, + "loss": 0.1876, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00047091051237657666, + "rewards/margins": -0.0006188965635374188, + "rewards/rejected": 0.00014798599295318127, + "step": 20 + }, + { + "epoch": 0.0, + "learning_rate": 1.7985611510791368e-07, + "logits/chosen": -2.093867063522339, + "logits/rejected": -1.7798885107040405, + "logps/chosen": -271.8372802734375, + "logps/rejected": -197.7427978515625, + "loss": 0.161, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0006424393504858017, + "rewards/margins": 0.0006549443351104856, + "rewards/rejected": -1.2505089216574561e-05, + "step": 30 + }, + { + "epoch": 0.0, + "learning_rate": 2.3980815347721825e-07, + "logits/chosen": -1.8111674785614014, + "logits/rejected": -1.651614785194397, + "logps/chosen": -180.64151000976562, + "logps/rejected": -205.8025360107422, + "loss": 0.1737, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0005901036784052849, + "rewards/margins": 0.0005529513582587242, + "rewards/rejected": 3.7152261938899755e-05, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 2.997601918465228e-07, + "logits/chosen": -2.04856538772583, + "logits/rejected": -1.7901275157928467, + "logps/chosen": -215.7578582763672, + "logps/rejected": -220.8831024169922, + "loss": 0.2306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00023345758381765336, + "rewards/margins": 8.568236808059737e-05, + "rewards/rejected": -0.00031913991551846266, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 3.5971223021582736e-07, + "logits/chosen": -2.021206855773926, + "logits/rejected": -1.532591462135315, + "logps/chosen": -217.2874298095703, + "logps/rejected": -155.38461303710938, + "loss": 0.1551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0003670873702503741, + "rewards/margins": 0.004133955575525761, + "rewards/rejected": -0.004501043353229761, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 4.1966426858513196e-07, + "logits/chosen": -2.06149959564209, + "logits/rejected": -1.6334540843963623, + "logps/chosen": -219.23593139648438, + "logps/rejected": -171.34017944335938, + "loss": 0.1949, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0024479026906192303, + "rewards/margins": 0.001299393828958273, + "rewards/rejected": -0.0037472962867468596, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 4.796163069544365e-07, + "logits/chosen": -2.0656137466430664, + "logits/rejected": -1.6331923007965088, + "logps/chosen": -287.973876953125, + "logps/rejected": -253.8162384033203, + "loss": 0.1885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.006006647367030382, + "rewards/margins": 0.0021041277796030045, + "rewards/rejected": -0.0081107746809721, + "step": 80 + }, + { + "epoch": 0.01, + "learning_rate": 5.39568345323741e-07, + "logits/chosen": -2.0038671493530273, + "logits/rejected": -1.5671374797821045, + "logps/chosen": -227.6986083984375, + "logps/rejected": -173.5839385986328, + "loss": 0.1738, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004840956535190344, + "rewards/margins": 0.010292068123817444, + "rewards/rejected": -0.015133025124669075, + "step": 90 + }, + { + "epoch": 0.01, + "learning_rate": 5.995203836930456e-07, + "logits/chosen": -1.8383821249008179, + "logits/rejected": -1.9401063919067383, + "logps/chosen": -162.48382568359375, + "logps/rejected": -229.74935913085938, + "loss": 0.1897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.013102886267006397, + "rewards/margins": 0.002711429027840495, + "rewards/rejected": -0.015814315527677536, + "step": 100 + }, + { + "epoch": 0.01, + "learning_rate": 6.594724220623502e-07, + "logits/chosen": -2.0046496391296387, + "logits/rejected": -1.519207239151001, + "logps/chosen": -167.69444274902344, + "logps/rejected": -131.64898681640625, + "loss": 0.2673, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02011437714099884, + "rewards/margins": 0.024061836302280426, + "rewards/rejected": -0.04417620971798897, + "step": 110 + }, + { + "epoch": 0.01, + "learning_rate": 7.194244604316547e-07, + "logits/chosen": -2.0810534954071045, + "logits/rejected": -1.8244158029556274, + "logps/chosen": -236.47250366210938, + "logps/rejected": -229.98471069335938, + "loss": 0.2358, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.011348506435751915, + "rewards/margins": 0.03258121386170387, + "rewards/rejected": -0.04392971843481064, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 7.793764988009593e-07, + "logits/chosen": -1.749682068824768, + "logits/rejected": -1.4685245752334595, + "logps/chosen": -212.50454711914062, + "logps/rejected": -210.17977905273438, + "loss": 0.2316, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.017982326447963715, + "rewards/margins": 0.043081801384687424, + "rewards/rejected": -0.06106413155794144, + "step": 130 + }, + { + "epoch": 0.02, + "learning_rate": 8.393285371702639e-07, + "logits/chosen": -1.8063485622406006, + "logits/rejected": -1.7413132190704346, + "logps/chosen": -159.96786499023438, + "logps/rejected": -218.73233032226562, + "loss": 0.1663, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.042347490787506104, + "rewards/margins": 0.051009368151426315, + "rewards/rejected": -0.09335686266422272, + "step": 140 + }, + { + "epoch": 0.02, + "learning_rate": 8.992805755395684e-07, + "logits/chosen": -1.974597692489624, + "logits/rejected": -1.5300289392471313, + "logps/chosen": -221.36166381835938, + "logps/rejected": -209.5833282470703, + "loss": 0.1912, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08563482016324997, + "rewards/margins": 0.06949006021022797, + "rewards/rejected": -0.15512490272521973, + "step": 150 + }, + { + "epoch": 0.02, + "learning_rate": 9.59232613908873e-07, + "logits/chosen": -1.9377422332763672, + "logits/rejected": -1.4812471866607666, + "logps/chosen": -214.61581420898438, + "logps/rejected": -158.63467407226562, + "loss": 0.19, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19673360884189606, + "rewards/margins": 0.11003688722848892, + "rewards/rejected": -0.3067705035209656, + "step": 160 + }, + { + "epoch": 0.02, + "learning_rate": 1.0191846522781776e-06, + "logits/chosen": -1.8835103511810303, + "logits/rejected": -1.6908140182495117, + "logps/chosen": -232.00540161132812, + "logps/rejected": -224.3048095703125, + "loss": 0.1984, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.32456761598587036, + "rewards/margins": 0.03511672466993332, + "rewards/rejected": -0.35968437790870667, + "step": 170 + }, + { + "epoch": 0.02, + "learning_rate": 1.079136690647482e-06, + "logits/chosen": -2.0397212505340576, + "logits/rejected": -1.7811028957366943, + "logps/chosen": -209.57711791992188, + "logps/rejected": -219.6051483154297, + "loss": 0.1525, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20149996876716614, + "rewards/margins": 0.10627492517232895, + "rewards/rejected": -0.3077749013900757, + "step": 180 + }, + { + "epoch": 0.02, + "learning_rate": 1.1390887290167866e-06, + "logits/chosen": -2.0456180572509766, + "logits/rejected": -1.645538091659546, + "logps/chosen": -298.3092346191406, + "logps/rejected": -260.2908020019531, + "loss": 0.0944, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1598762422800064, + "rewards/margins": 0.14658963680267334, + "rewards/rejected": -0.30646592378616333, + "step": 190 + }, + { + "epoch": 0.02, + "learning_rate": 1.1990407673860912e-06, + "logits/chosen": -2.1075217723846436, + "logits/rejected": -1.8570976257324219, + "logps/chosen": -279.629638671875, + "logps/rejected": -273.2984924316406, + "loss": 0.171, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.15068146586418152, + "rewards/margins": 0.04055650904774666, + "rewards/rejected": -0.19123797118663788, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 1.2589928057553958e-06, + "logits/chosen": -2.0482380390167236, + "logits/rejected": -1.786058783531189, + "logps/chosen": -206.5548858642578, + "logps/rejected": -211.97860717773438, + "loss": 0.1609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10160304605960846, + "rewards/margins": 0.0669463574886322, + "rewards/rejected": -0.16854938864707947, + "step": 210 + }, + { + "epoch": 0.03, + "learning_rate": 1.3189448441247004e-06, + "logits/chosen": -2.084348678588867, + "logits/rejected": -1.6052637100219727, + "logps/chosen": -251.0631866455078, + "logps/rejected": -206.6727752685547, + "loss": 0.1085, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13653235137462616, + "rewards/margins": 0.06996998935937881, + "rewards/rejected": -0.20650234818458557, + "step": 220 + }, + { + "epoch": 0.03, + "learning_rate": 1.378896882494005e-06, + "logits/chosen": -2.140784502029419, + "logits/rejected": -1.4730150699615479, + "logps/chosen": -233.042236328125, + "logps/rejected": -207.8023681640625, + "loss": 0.1887, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.13673333823680878, + "rewards/margins": 0.11993058770895004, + "rewards/rejected": -0.25666388869285583, + "step": 230 + }, + { + "epoch": 0.03, + "learning_rate": 1.4388489208633094e-06, + "logits/chosen": -1.9343608617782593, + "logits/rejected": -1.6981518268585205, + "logps/chosen": -254.0943145751953, + "logps/rejected": -275.8316650390625, + "loss": 0.1273, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13066630065441132, + "rewards/margins": 0.07596530020236969, + "rewards/rejected": -0.2066315859556198, + "step": 240 + }, + { + "epoch": 0.03, + "learning_rate": 1.4988009592326142e-06, + "logits/chosen": -1.952183485031128, + "logits/rejected": -1.8142400979995728, + "logps/chosen": -236.6318359375, + "logps/rejected": -268.00537109375, + "loss": 0.17, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22585809230804443, + "rewards/margins": 0.0762966051697731, + "rewards/rejected": -0.30215469002723694, + "step": 250 + }, + { + "epoch": 0.03, + "learning_rate": 1.5587529976019186e-06, + "logits/chosen": -1.9224618673324585, + "logits/rejected": -1.6340528726577759, + "logps/chosen": -196.9944305419922, + "logps/rejected": -205.39712524414062, + "loss": 0.2002, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2947540879249573, + "rewards/margins": 0.09202824532985687, + "rewards/rejected": -0.38678231835365295, + "step": 260 + }, + { + "epoch": 0.03, + "learning_rate": 1.618705035971223e-06, + "logits/chosen": -1.6605432033538818, + "logits/rejected": -1.3576246500015259, + "logps/chosen": -190.0495147705078, + "logps/rejected": -203.37741088867188, + "loss": 0.1713, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2563242018222809, + "rewards/margins": 0.13959848880767822, + "rewards/rejected": -0.3959227204322815, + "step": 270 + }, + { + "epoch": 0.03, + "learning_rate": 1.6786570743405278e-06, + "logits/chosen": -2.081996440887451, + "logits/rejected": -1.8335282802581787, + "logps/chosen": -206.75241088867188, + "logps/rejected": -198.82595825195312, + "loss": 0.1588, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2610263228416443, + "rewards/margins": 0.1122390478849411, + "rewards/rejected": -0.3732653856277466, + "step": 280 + }, + { + "epoch": 0.03, + "learning_rate": 1.7386091127098322e-06, + "logits/chosen": -2.127209186553955, + "logits/rejected": -1.662936806678772, + "logps/chosen": -278.27166748046875, + "logps/rejected": -271.78399658203125, + "loss": 0.1299, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3486752510070801, + "rewards/margins": 0.12807399034500122, + "rewards/rejected": -0.4767492413520813, + "step": 290 + }, + { + "epoch": 0.04, + "learning_rate": 1.7985611510791368e-06, + "logits/chosen": -1.844478964805603, + "logits/rejected": -1.6861326694488525, + "logps/chosen": -323.74169921875, + "logps/rejected": -350.01019287109375, + "loss": 0.154, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4493889808654785, + "rewards/margins": 0.10103818029165268, + "rewards/rejected": -0.5504271388053894, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 1.8585131894484414e-06, + "logits/chosen": -2.01519513130188, + "logits/rejected": -1.7900644540786743, + "logps/chosen": -268.0411071777344, + "logps/rejected": -255.30453491210938, + "loss": 0.1899, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4563392698764801, + "rewards/margins": 0.09228341281414032, + "rewards/rejected": -0.5486227869987488, + "step": 310 + }, + { + "epoch": 0.04, + "learning_rate": 1.918465227817746e-06, + "logits/chosen": -1.924393653869629, + "logits/rejected": -1.5130094289779663, + "logps/chosen": -186.55593872070312, + "logps/rejected": -182.58775329589844, + "loss": 0.166, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4110238552093506, + "rewards/margins": 0.16674764454364777, + "rewards/rejected": -0.5777715444564819, + "step": 320 + }, + { + "epoch": 0.04, + "learning_rate": 1.9784172661870504e-06, + "logits/chosen": -2.1037185192108154, + "logits/rejected": -1.9657186269760132, + "logps/chosen": -218.725830078125, + "logps/rejected": -252.91024780273438, + "loss": 0.1719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35161924362182617, + "rewards/margins": 0.0837181806564331, + "rewards/rejected": -0.4353373944759369, + "step": 330 + }, + { + "epoch": 0.04, + "learning_rate": 2.0383693045563552e-06, + "logits/chosen": -2.0712966918945312, + "logits/rejected": -1.9187800884246826, + "logps/chosen": -201.36724853515625, + "logps/rejected": -223.52041625976562, + "loss": 0.1685, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2922331988811493, + "rewards/margins": 0.11911450326442719, + "rewards/rejected": -0.41134771704673767, + "step": 340 + }, + { + "epoch": 0.04, + "learning_rate": 2.0983213429256596e-06, + "logits/chosen": -1.9400737285614014, + "logits/rejected": -1.7396290302276611, + "logps/chosen": -189.12892150878906, + "logps/rejected": -177.9490509033203, + "loss": 0.2454, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25958988070487976, + "rewards/margins": 0.03190717101097107, + "rewards/rejected": -0.29149705171585083, + "step": 350 + }, + { + "epoch": 0.04, + "learning_rate": 2.158273381294964e-06, + "logits/chosen": -2.010946035385132, + "logits/rejected": -1.7258754968643188, + "logps/chosen": -230.20706176757812, + "logps/rejected": -224.4259033203125, + "loss": 0.1737, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28419947624206543, + "rewards/margins": 0.1424468755722046, + "rewards/rejected": -0.4266463816165924, + "step": 360 + }, + { + "epoch": 0.04, + "learning_rate": 2.218225419664269e-06, + "logits/chosen": -1.7381842136383057, + "logits/rejected": -1.4838992357254028, + "logps/chosen": -234.2359619140625, + "logps/rejected": -248.5423583984375, + "loss": 0.0967, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3407961428165436, + "rewards/margins": 0.10874161869287491, + "rewards/rejected": -0.4495377540588379, + "step": 370 + }, + { + "epoch": 0.05, + "learning_rate": 2.2781774580335732e-06, + "logits/chosen": -2.022749423980713, + "logits/rejected": -1.4032447338104248, + "logps/chosen": -256.8328552246094, + "logps/rejected": -192.03329467773438, + "loss": 0.1595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2618308663368225, + "rewards/margins": 0.14256241917610168, + "rewards/rejected": -0.4043932855129242, + "step": 380 + }, + { + "epoch": 0.05, + "learning_rate": 2.3381294964028776e-06, + "logits/chosen": -1.7514533996582031, + "logits/rejected": -1.6247230768203735, + "logps/chosen": -214.0389862060547, + "logps/rejected": -226.91531372070312, + "loss": 0.1191, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2733401656150818, + "rewards/margins": 0.11437875032424927, + "rewards/rejected": -0.38771897554397583, + "step": 390 + }, + { + "epoch": 0.05, + "learning_rate": 2.3980815347721824e-06, + "logits/chosen": -1.9063535928726196, + "logits/rejected": -1.5726300477981567, + "logps/chosen": -222.97158813476562, + "logps/rejected": -203.55575561523438, + "loss": 0.15, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2823329567909241, + "rewards/margins": 0.119574174284935, + "rewards/rejected": -0.40190714597702026, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 2.458033573141487e-06, + "logits/chosen": -1.9088506698608398, + "logits/rejected": -1.5609016418457031, + "logps/chosen": -262.8750915527344, + "logps/rejected": -279.5753479003906, + "loss": 0.1665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3804900348186493, + "rewards/margins": 0.17719073593616486, + "rewards/rejected": -0.5576807856559753, + "step": 410 + }, + { + "epoch": 0.05, + "learning_rate": 2.5179856115107916e-06, + "logits/chosen": -1.9670276641845703, + "logits/rejected": -1.5786240100860596, + "logps/chosen": -247.9984588623047, + "logps/rejected": -261.22015380859375, + "loss": 0.103, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.306973397731781, + "rewards/margins": 0.17263731360435486, + "rewards/rejected": -0.4796106815338135, + "step": 420 + }, + { + "epoch": 0.05, + "learning_rate": 2.577937649880096e-06, + "logits/chosen": -1.9811060428619385, + "logits/rejected": -1.8827531337738037, + "logps/chosen": -216.9333953857422, + "logps/rejected": -262.21697998046875, + "loss": 0.2221, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.36211085319519043, + "rewards/margins": 0.09959669411182404, + "rewards/rejected": -0.46170753240585327, + "step": 430 + }, + { + "epoch": 0.05, + "learning_rate": 2.637889688249401e-06, + "logits/chosen": -2.250277042388916, + "logits/rejected": -1.69893479347229, + "logps/chosen": -390.80206298828125, + "logps/rejected": -284.90594482421875, + "loss": 0.1114, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28247594833374023, + "rewards/margins": 0.12760603427886963, + "rewards/rejected": -0.4100819528102875, + "step": 440 + }, + { + "epoch": 0.05, + "learning_rate": 2.6978417266187052e-06, + "logits/chosen": -1.8655027151107788, + "logits/rejected": -1.5759456157684326, + "logps/chosen": -273.0975036621094, + "logps/rejected": -323.8568420410156, + "loss": 0.1357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33328741788864136, + "rewards/margins": 0.1260485053062439, + "rewards/rejected": -0.45933595299720764, + "step": 450 + }, + { + "epoch": 0.06, + "learning_rate": 2.75779376498801e-06, + "logits/chosen": -2.09033203125, + "logits/rejected": -1.8335535526275635, + "logps/chosen": -276.64398193359375, + "logps/rejected": -283.84088134765625, + "loss": 0.1603, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2383461743593216, + "rewards/margins": 0.10929499566555023, + "rewards/rejected": -0.3476411700248718, + "step": 460 + }, + { + "epoch": 0.06, + "learning_rate": 2.8177458033573145e-06, + "logits/chosen": -1.972602128982544, + "logits/rejected": -1.670013427734375, + "logps/chosen": -201.99130249023438, + "logps/rejected": -196.4750518798828, + "loss": 0.1939, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2819952368736267, + "rewards/margins": 0.0845649391412735, + "rewards/rejected": -0.3665602207183838, + "step": 470 + }, + { + "epoch": 0.06, + "learning_rate": 2.877697841726619e-06, + "logits/chosen": -1.984262228012085, + "logits/rejected": -1.9005893468856812, + "logps/chosen": -280.5291442871094, + "logps/rejected": -247.97720336914062, + "loss": 0.1729, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.35804125666618347, + "rewards/margins": 0.015171018429100513, + "rewards/rejected": -0.3732122778892517, + "step": 480 + }, + { + "epoch": 0.06, + "learning_rate": 2.9376498800959237e-06, + "logits/chosen": -2.09686541557312, + "logits/rejected": -1.6708438396453857, + "logps/chosen": -238.1042022705078, + "logps/rejected": -228.37350463867188, + "loss": 0.1234, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3339739739894867, + "rewards/margins": 0.13829635083675385, + "rewards/rejected": -0.47227030992507935, + "step": 490 + }, + { + "epoch": 0.06, + "learning_rate": 2.9976019184652285e-06, + "logits/chosen": -1.8362640142440796, + "logits/rejected": -1.5808006525039673, + "logps/chosen": -247.06204223632812, + "logps/rejected": -234.0608673095703, + "loss": 0.1674, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24948985874652863, + "rewards/margins": 0.061165668070316315, + "rewards/rejected": -0.31065553426742554, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 3.0575539568345324e-06, + "logits/chosen": -2.108161449432373, + "logits/rejected": -1.9195600748062134, + "logps/chosen": -204.02523803710938, + "logps/rejected": -211.36361694335938, + "loss": 0.1593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23356959223747253, + "rewards/margins": 0.07123871147632599, + "rewards/rejected": -0.3048083186149597, + "step": 510 + }, + { + "epoch": 0.06, + "learning_rate": 3.1175059952038373e-06, + "logits/chosen": -1.8966169357299805, + "logits/rejected": -1.4281069040298462, + "logps/chosen": -257.47454833984375, + "logps/rejected": -211.0913543701172, + "loss": 0.1349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2831900119781494, + "rewards/margins": 0.1322624385356903, + "rewards/rejected": -0.41545242071151733, + "step": 520 + }, + { + "epoch": 0.06, + "learning_rate": 3.177458033573142e-06, + "logits/chosen": -2.059906482696533, + "logits/rejected": -1.6643825769424438, + "logps/chosen": -228.86172485351562, + "logps/rejected": -192.63961791992188, + "loss": 0.1881, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3060818314552307, + "rewards/margins": 0.1667357236146927, + "rewards/rejected": -0.4728175103664398, + "step": 530 + }, + { + "epoch": 0.06, + "learning_rate": 3.237410071942446e-06, + "logits/chosen": -1.9420620203018188, + "logits/rejected": -1.6259702444076538, + "logps/chosen": -239.92581176757812, + "logps/rejected": -244.3850555419922, + "loss": 0.1508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26758164167404175, + "rewards/margins": 0.09906923025846481, + "rewards/rejected": -0.36665090918540955, + "step": 540 + }, + { + "epoch": 0.07, + "learning_rate": 3.297362110311751e-06, + "logits/chosen": -2.0125441551208496, + "logits/rejected": -1.77133047580719, + "logps/chosen": -234.72738647460938, + "logps/rejected": -229.4871368408203, + "loss": 0.0946, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.36053597927093506, + "rewards/margins": 0.1191805824637413, + "rewards/rejected": -0.47971653938293457, + "step": 550 + }, + { + "epoch": 0.07, + "learning_rate": 3.3573141486810557e-06, + "logits/chosen": -1.9631010293960571, + "logits/rejected": -1.7263110876083374, + "logps/chosen": -264.1136169433594, + "logps/rejected": -245.3027801513672, + "loss": 0.1203, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.26941028237342834, + "rewards/margins": 0.09951646625995636, + "rewards/rejected": -0.3689267337322235, + "step": 560 + }, + { + "epoch": 0.07, + "learning_rate": 3.4172661870503596e-06, + "logits/chosen": -2.090639114379883, + "logits/rejected": -1.617297887802124, + "logps/chosen": -253.80575561523438, + "logps/rejected": -249.5729217529297, + "loss": 0.1359, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16714581847190857, + "rewards/margins": 0.15788118541240692, + "rewards/rejected": -0.3250270485877991, + "step": 570 + }, + { + "epoch": 0.07, + "learning_rate": 3.4772182254196645e-06, + "logits/chosen": -2.1856539249420166, + "logits/rejected": -1.682244896888733, + "logps/chosen": -291.01373291015625, + "logps/rejected": -219.6407470703125, + "loss": 0.2091, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12654975056648254, + "rewards/margins": 0.10359915345907211, + "rewards/rejected": -0.23014888167381287, + "step": 580 + }, + { + "epoch": 0.07, + "learning_rate": 3.5371702637889693e-06, + "logits/chosen": -1.8060592412948608, + "logits/rejected": -1.5262947082519531, + "logps/chosen": -242.6085968017578, + "logps/rejected": -213.41592407226562, + "loss": 0.1047, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.306922972202301, + "rewards/margins": 0.15780650079250336, + "rewards/rejected": -0.4647294580936432, + "step": 590 + }, + { + "epoch": 0.07, + "learning_rate": 3.5971223021582737e-06, + "logits/chosen": -1.937294602394104, + "logits/rejected": -1.5673654079437256, + "logps/chosen": -255.3214874267578, + "logps/rejected": -233.734619140625, + "loss": 0.1647, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5621557831764221, + "rewards/margins": 0.1001255139708519, + "rewards/rejected": -0.6622812747955322, + "step": 600 + }, + { + "epoch": 0.07, + "learning_rate": 3.657074340527578e-06, + "logits/chosen": -1.9981091022491455, + "logits/rejected": -1.8456417322158813, + "logps/chosen": -284.6530456542969, + "logps/rejected": -266.3650207519531, + "loss": 0.1632, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5589288473129272, + "rewards/margins": 0.10421963781118393, + "rewards/rejected": -0.6631485223770142, + "step": 610 + }, + { + "epoch": 0.07, + "learning_rate": 3.717026378896883e-06, + "logits/chosen": -1.7741851806640625, + "logits/rejected": -1.5398646593093872, + "logps/chosen": -253.56478881835938, + "logps/rejected": -286.1531066894531, + "loss": 0.1523, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6353442072868347, + "rewards/margins": 0.179083913564682, + "rewards/rejected": -0.8144281506538391, + "step": 620 + }, + { + "epoch": 0.08, + "learning_rate": 3.7769784172661873e-06, + "logits/chosen": -1.7653591632843018, + "logits/rejected": -1.5256303548812866, + "logps/chosen": -309.58636474609375, + "logps/rejected": -303.3143615722656, + "loss": 0.0848, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5480080842971802, + "rewards/margins": 0.16620375216007233, + "rewards/rejected": -0.7142117023468018, + "step": 630 + }, + { + "epoch": 0.08, + "learning_rate": 3.836930455635492e-06, + "logits/chosen": -1.8177156448364258, + "logits/rejected": -1.576643705368042, + "logps/chosen": -297.94183349609375, + "logps/rejected": -282.4010314941406, + "loss": 0.1422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5166171193122864, + "rewards/margins": 0.1333894580602646, + "rewards/rejected": -0.6500065922737122, + "step": 640 + }, + { + "epoch": 0.08, + "learning_rate": 3.896882494004797e-06, + "logits/chosen": -1.9186363220214844, + "logits/rejected": -1.5935009717941284, + "logps/chosen": -284.55377197265625, + "logps/rejected": -241.8046875, + "loss": 0.1217, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.34077686071395874, + "rewards/margins": 0.04739413410425186, + "rewards/rejected": -0.38817098736763, + "step": 650 + }, + { + "epoch": 0.08, + "learning_rate": 3.956834532374101e-06, + "logits/chosen": -1.8101074695587158, + "logits/rejected": -1.6274917125701904, + "logps/chosen": -198.5729522705078, + "logps/rejected": -253.2472686767578, + "loss": 0.1605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3132355511188507, + "rewards/margins": 0.13122674822807312, + "rewards/rejected": -0.44446223974227905, + "step": 660 + }, + { + "epoch": 0.08, + "learning_rate": 4.016786570743406e-06, + "logits/chosen": -1.9853845834732056, + "logits/rejected": -1.614189863204956, + "logps/chosen": -191.61289978027344, + "logps/rejected": -172.0723114013672, + "loss": 0.0808, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.32061904668807983, + "rewards/margins": 0.17392602562904358, + "rewards/rejected": -0.4945450723171234, + "step": 670 + }, + { + "epoch": 0.08, + "learning_rate": 4.0767386091127105e-06, + "logits/chosen": -1.895282506942749, + "logits/rejected": -1.4025895595550537, + "logps/chosen": -287.64459228515625, + "logps/rejected": -237.231201171875, + "loss": 0.1305, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.34143179655075073, + "rewards/margins": 0.1731572449207306, + "rewards/rejected": -0.5145890116691589, + "step": 680 + }, + { + "epoch": 0.08, + "learning_rate": 4.1366906474820145e-06, + "logits/chosen": -1.9859917163848877, + "logits/rejected": -1.811034917831421, + "logps/chosen": -267.02532958984375, + "logps/rejected": -248.8046417236328, + "loss": 0.1599, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4106379449367523, + "rewards/margins": 0.09379793703556061, + "rewards/rejected": -0.5044358968734741, + "step": 690 + }, + { + "epoch": 0.08, + "learning_rate": 4.196642685851319e-06, + "logits/chosen": -2.0239920616149902, + "logits/rejected": -1.7091875076293945, + "logps/chosen": -163.7296600341797, + "logps/rejected": -176.8492431640625, + "loss": 0.1985, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.22491566836833954, + "rewards/margins": 0.10180320590734482, + "rewards/rejected": -0.32671886682510376, + "step": 700 + }, + { + "epoch": 0.09, + "learning_rate": 4.256594724220624e-06, + "logits/chosen": -2.042534828186035, + "logits/rejected": -1.7031242847442627, + "logps/chosen": -243.00247192382812, + "logps/rejected": -238.3159637451172, + "loss": 0.1317, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22081628441810608, + "rewards/margins": 0.13557776808738708, + "rewards/rejected": -0.3563940227031708, + "step": 710 + }, + { + "epoch": 0.09, + "learning_rate": 4.316546762589928e-06, + "logits/chosen": -2.0568203926086426, + "logits/rejected": -1.7259998321533203, + "logps/chosen": -263.7857666015625, + "logps/rejected": -256.0942077636719, + "loss": 0.0914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3193433880805969, + "rewards/margins": 0.14199210703372955, + "rewards/rejected": -0.4613354802131653, + "step": 720 + }, + { + "epoch": 0.09, + "learning_rate": 4.376498800959233e-06, + "logits/chosen": -1.8908824920654297, + "logits/rejected": -1.599169135093689, + "logps/chosen": -250.081787109375, + "logps/rejected": -221.8983917236328, + "loss": 0.176, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42117080092430115, + "rewards/margins": 0.1439433991909027, + "rewards/rejected": -0.5651142001152039, + "step": 730 + }, + { + "epoch": 0.09, + "learning_rate": 4.436450839328538e-06, + "logits/chosen": -1.81271231174469, + "logits/rejected": -1.4758561849594116, + "logps/chosen": -237.4412841796875, + "logps/rejected": -207.1601104736328, + "loss": 0.189, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4822634756565094, + "rewards/margins": 0.16576936841011047, + "rewards/rejected": -0.6480327844619751, + "step": 740 + }, + { + "epoch": 0.09, + "learning_rate": 4.496402877697842e-06, + "logits/chosen": -2.2231035232543945, + "logits/rejected": -1.774987816810608, + "logps/chosen": -291.08026123046875, + "logps/rejected": -247.5023193359375, + "loss": 0.09, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4788135886192322, + "rewards/margins": 0.17467689514160156, + "rewards/rejected": -0.6534904837608337, + "step": 750 + }, + { + "epoch": 0.09, + "learning_rate": 4.5563549160671465e-06, + "logits/chosen": -1.9474821090698242, + "logits/rejected": -1.523559808731079, + "logps/chosen": -256.2854919433594, + "logps/rejected": -199.56689453125, + "loss": 0.2096, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.525553286075592, + "rewards/margins": 0.10277509689331055, + "rewards/rejected": -0.6283284425735474, + "step": 760 + }, + { + "epoch": 0.09, + "learning_rate": 4.616306954436451e-06, + "logits/chosen": -1.9608790874481201, + "logits/rejected": -1.600482702255249, + "logps/chosen": -234.0125274658203, + "logps/rejected": -219.60494995117188, + "loss": 0.1606, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.560684084892273, + "rewards/margins": 0.11568351089954376, + "rewards/rejected": -0.6763675808906555, + "step": 770 + }, + { + "epoch": 0.09, + "learning_rate": 4.676258992805755e-06, + "logits/chosen": -1.9117473363876343, + "logits/rejected": -1.6412442922592163, + "logps/chosen": -225.1289520263672, + "logps/rejected": -219.20339965820312, + "loss": 0.1631, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4210142195224762, + "rewards/margins": 0.12721626460552216, + "rewards/rejected": -0.5482303500175476, + "step": 780 + }, + { + "epoch": 0.09, + "learning_rate": 4.73621103117506e-06, + "logits/chosen": -1.8688926696777344, + "logits/rejected": -1.7802883386611938, + "logps/chosen": -220.6570281982422, + "logps/rejected": -236.92758178710938, + "loss": 0.1411, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.46808117628097534, + "rewards/margins": 0.0750129297375679, + "rewards/rejected": -0.5430941581726074, + "step": 790 + }, + { + "epoch": 0.1, + "learning_rate": 4.796163069544365e-06, + "logits/chosen": -2.0156643390655518, + "logits/rejected": -1.6202924251556396, + "logps/chosen": -296.9110412597656, + "logps/rejected": -225.12802124023438, + "loss": 0.1625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5204383730888367, + "rewards/margins": 0.12295063585042953, + "rewards/rejected": -0.6433890461921692, + "step": 800 + }, + { + "epoch": 0.1, + "learning_rate": 4.856115107913669e-06, + "logits/chosen": -1.9585834741592407, + "logits/rejected": -1.6052249670028687, + "logps/chosen": -224.037353515625, + "logps/rejected": -237.3111572265625, + "loss": 0.1651, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4183143675327301, + "rewards/margins": 0.10342450439929962, + "rewards/rejected": -0.5217388868331909, + "step": 810 + }, + { + "epoch": 0.1, + "learning_rate": 4.916067146282974e-06, + "logits/chosen": -2.0472521781921387, + "logits/rejected": -1.5536268949508667, + "logps/chosen": -236.7208251953125, + "logps/rejected": -195.6331787109375, + "loss": 0.137, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.36916661262512207, + "rewards/margins": 0.11299363523721695, + "rewards/rejected": -0.4821602702140808, + "step": 820 + }, + { + "epoch": 0.1, + "learning_rate": 4.9760191846522785e-06, + "logits/chosen": -1.9616940021514893, + "logits/rejected": -1.7898718118667603, + "logps/chosen": -208.87741088867188, + "logps/rejected": -228.8385467529297, + "loss": 0.1618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3800353407859802, + "rewards/margins": 0.12817123532295227, + "rewards/rejected": -0.5082066059112549, + "step": 830 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999921064257284e-06, + "logits/chosen": -1.8127410411834717, + "logits/rejected": -1.3970979452133179, + "logps/chosen": -278.39019775390625, + "logps/rejected": -293.3469543457031, + "loss": 0.1529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4711076319217682, + "rewards/margins": 0.15813672542572021, + "rewards/rejected": -0.629244327545166, + "step": 840 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999438680968e-06, + "logits/chosen": -1.7962703704833984, + "logits/rejected": -1.365013837814331, + "logps/chosen": -249.14828491210938, + "logps/rejected": -232.2996063232422, + "loss": 0.1558, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6188076734542847, + "rewards/margins": 0.18656638264656067, + "rewards/rejected": -0.8053741455078125, + "step": 850 + }, + { + "epoch": 0.1, + "learning_rate": 4.999851777603122e-06, + "logits/chosen": -1.9362220764160156, + "logits/rejected": -1.697091817855835, + "logps/chosen": -292.0437927246094, + "logps/rejected": -294.3341369628906, + "loss": 0.1133, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5470017194747925, + "rewards/margins": 0.12919080257415771, + "rewards/rejected": -0.6761925220489502, + "step": 860 + }, + { + "epoch": 0.1, + "learning_rate": 4.999715836560074e-06, + "logits/chosen": -1.907859206199646, + "logits/rejected": -1.5174684524536133, + "logps/chosen": -218.5150909423828, + "logps/rejected": -220.261474609375, + "loss": 0.2033, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.3629061281681061, + "rewards/margins": 0.12141053378582001, + "rewards/rejected": -0.4843166768550873, + "step": 870 + }, + { + "epoch": 0.11, + "learning_rate": 4.999536047352236e-06, + "logits/chosen": -1.909115195274353, + "logits/rejected": -1.6440328359603882, + "logps/chosen": -206.3199005126953, + "logps/rejected": -194.60552978515625, + "loss": 0.1926, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.3590332567691803, + "rewards/margins": 0.04666576534509659, + "rewards/rejected": -0.4056990146636963, + "step": 880 + }, + { + "epoch": 0.11, + "learning_rate": 4.999312413133335e-06, + "logits/chosen": -2.02655029296875, + "logits/rejected": -1.555143117904663, + "logps/chosen": -290.0566711425781, + "logps/rejected": -258.43951416015625, + "loss": 0.1552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21334341168403625, + "rewards/margins": 0.13854533433914185, + "rewards/rejected": -0.3518887460231781, + "step": 890 + }, + { + "epoch": 0.11, + "learning_rate": 4.999044937826198e-06, + "logits/chosen": -1.7870111465454102, + "logits/rejected": -1.3078781366348267, + "logps/chosen": -232.5644073486328, + "logps/rejected": -228.1163330078125, + "loss": 0.167, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22880849242210388, + "rewards/margins": 0.1350013017654419, + "rewards/rejected": -0.36380982398986816, + "step": 900 + }, + { + "epoch": 0.11, + "learning_rate": 4.998733626122679e-06, + "logits/chosen": -1.9320144653320312, + "logits/rejected": -1.7529146671295166, + "logps/chosen": -230.58480834960938, + "logps/rejected": -215.9065399169922, + "loss": 0.1188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2430393248796463, + "rewards/margins": 0.10922437906265259, + "rewards/rejected": -0.3522637188434601, + "step": 910 + }, + { + "epoch": 0.11, + "learning_rate": 4.998378483483577e-06, + "logits/chosen": -2.0543458461761475, + "logits/rejected": -1.576468586921692, + "logps/chosen": -215.76681518554688, + "logps/rejected": -143.02210998535156, + "loss": 0.1665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19551251828670502, + "rewards/margins": 0.15126529335975647, + "rewards/rejected": -0.3467778265476227, + "step": 920 + }, + { + "epoch": 0.11, + "learning_rate": 4.997979516138542e-06, + "logits/chosen": -1.7736568450927734, + "logits/rejected": -1.427062749862671, + "logps/chosen": -193.51675415039062, + "logps/rejected": -190.2722625732422, + "loss": 0.1579, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2125972956418991, + "rewards/margins": 0.12206624448299408, + "rewards/rejected": -0.3346635401248932, + "step": 930 + }, + { + "epoch": 0.11, + "learning_rate": 4.997536731085962e-06, + "logits/chosen": -2.076658248901367, + "logits/rejected": -1.6957495212554932, + "logps/chosen": -275.86004638671875, + "logps/rejected": -257.83856201171875, + "loss": 0.1156, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15670546889305115, + "rewards/margins": 0.1267530769109726, + "rewards/rejected": -0.28345853090286255, + "step": 940 + }, + { + "epoch": 0.11, + "learning_rate": 4.997050136092847e-06, + "logits/chosen": -1.9965837001800537, + "logits/rejected": -1.5267733335494995, + "logps/chosen": -234.33804321289062, + "logps/rejected": -168.39871215820312, + "loss": 0.1527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16364577412605286, + "rewards/margins": 0.10411280393600464, + "rewards/rejected": -0.2677585780620575, + "step": 950 + }, + { + "epoch": 0.12, + "learning_rate": 4.996519739694684e-06, + "logits/chosen": -1.9764916896820068, + "logits/rejected": -1.8170015811920166, + "logps/chosen": -261.2804260253906, + "logps/rejected": -244.72708129882812, + "loss": 0.1467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2282881736755371, + "rewards/margins": 0.0751405730843544, + "rewards/rejected": -0.3034287393093109, + "step": 960 + }, + { + "epoch": 0.12, + "learning_rate": 4.995945551195296e-06, + "logits/chosen": -1.8942501544952393, + "logits/rejected": -1.576650857925415, + "logps/chosen": -216.11184692382812, + "logps/rejected": -192.7135467529297, + "loss": 0.1492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2776258587837219, + "rewards/margins": 0.18120935559272766, + "rewards/rejected": -0.4588352143764496, + "step": 970 + }, + { + "epoch": 0.12, + "learning_rate": 4.995327580666672e-06, + "logits/chosen": -2.1704368591308594, + "logits/rejected": -1.4965741634368896, + "logps/chosen": -241.30697631835938, + "logps/rejected": -190.32650756835938, + "loss": 0.1123, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.27652624249458313, + "rewards/margins": 0.24555762112140656, + "rewards/rejected": -0.5220838785171509, + "step": 980 + }, + { + "epoch": 0.12, + "learning_rate": 4.994665838948792e-06, + "logits/chosen": -1.9886589050292969, + "logits/rejected": -1.8707554340362549, + "logps/chosen": -235.10440063476562, + "logps/rejected": -289.9771423339844, + "loss": 0.1256, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1920485645532608, + "rewards/margins": 0.11377612501382828, + "rewards/rejected": -0.3058246970176697, + "step": 990 + }, + { + "epoch": 0.12, + "learning_rate": 4.993960337649441e-06, + "logits/chosen": -2.0039737224578857, + "logits/rejected": -1.454641342163086, + "logps/chosen": -253.69454956054688, + "logps/rejected": -198.70730590820312, + "loss": 0.2516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21102575957775116, + "rewards/margins": 0.06709831953048706, + "rewards/rejected": -0.2781240940093994, + "step": 1000 + }, + { + "epoch": 0.12, + "learning_rate": 4.993211089144e-06, + "logits/chosen": -1.9574140310287476, + "logits/rejected": -1.4666115045547485, + "logps/chosen": -264.51251220703125, + "logps/rejected": -221.65234375, + "loss": 0.1448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2643020749092102, + "rewards/margins": 0.15589205920696259, + "rewards/rejected": -0.420194149017334, + "step": 1010 + }, + { + "epoch": 0.12, + "learning_rate": 4.992418106575232e-06, + "logits/chosen": -2.2091064453125, + "logits/rejected": -1.704564094543457, + "logps/chosen": -330.94915771484375, + "logps/rejected": -254.61880493164062, + "loss": 0.14, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.32784610986709595, + "rewards/margins": 0.08099902421236038, + "rewards/rejected": -0.40884512662887573, + "step": 1020 + }, + { + "epoch": 0.12, + "learning_rate": 4.9915814038530505e-06, + "logits/chosen": -2.091163396835327, + "logits/rejected": -1.8074891567230225, + "logps/chosen": -215.7232208251953, + "logps/rejected": -192.0024871826172, + "loss": 0.1666, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18186353147029877, + "rewards/margins": 0.08887914568185806, + "rewards/rejected": -0.2707426846027374, + "step": 1030 + }, + { + "epoch": 0.12, + "learning_rate": 4.990700995654274e-06, + "logits/chosen": -2.0555949211120605, + "logits/rejected": -1.72856867313385, + "logps/chosen": -213.6540069580078, + "logps/rejected": -205.2044677734375, + "loss": 0.1204, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35042351484298706, + "rewards/margins": 0.1402900069952011, + "rewards/rejected": -0.49071353673934937, + "step": 1040 + }, + { + "epoch": 0.13, + "learning_rate": 4.9897768974223726e-06, + "logits/chosen": -2.166123867034912, + "logits/rejected": -1.7946255207061768, + "logps/chosen": -231.53933715820312, + "logps/rejected": -211.31204223632812, + "loss": 0.178, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20556874573230743, + "rewards/margins": 0.16699795424938202, + "rewards/rejected": -0.37256669998168945, + "step": 1050 + }, + { + "epoch": 0.13, + "learning_rate": 4.9888091253671925e-06, + "logits/chosen": -2.0272960662841797, + "logits/rejected": -1.4488000869750977, + "logps/chosen": -219.529052734375, + "logps/rejected": -171.55601501464844, + "loss": 0.1258, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1403190791606903, + "rewards/margins": 0.1539257913827896, + "rewards/rejected": -0.2942448556423187, + "step": 1060 + }, + { + "epoch": 0.13, + "learning_rate": 4.9877976964646755e-06, + "logits/chosen": -2.0916085243225098, + "logits/rejected": -1.8679349422454834, + "logps/chosen": -252.82199096679688, + "logps/rejected": -225.4739532470703, + "loss": 0.1693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19687017798423767, + "rewards/margins": 0.07593067735433578, + "rewards/rejected": -0.27280086278915405, + "step": 1070 + }, + { + "epoch": 0.13, + "learning_rate": 4.986742628456559e-06, + "logits/chosen": -2.1024928092956543, + "logits/rejected": -1.534501075744629, + "logps/chosen": -257.09710693359375, + "logps/rejected": -182.4271240234375, + "loss": 0.179, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15705084800720215, + "rewards/margins": 0.11441371589899063, + "rewards/rejected": -0.27146458625793457, + "step": 1080 + }, + { + "epoch": 0.13, + "learning_rate": 4.985643939850063e-06, + "logits/chosen": -2.1568455696105957, + "logits/rejected": -1.6811933517456055, + "logps/chosen": -275.54144287109375, + "logps/rejected": -227.31417846679688, + "loss": 0.1206, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2653849124908447, + "rewards/margins": 0.15564067661762238, + "rewards/rejected": -0.4210255742073059, + "step": 1090 + }, + { + "epoch": 0.13, + "learning_rate": 4.984501649917573e-06, + "logits/chosen": -1.9597032070159912, + "logits/rejected": -1.5438346862792969, + "logps/chosen": -223.2861785888672, + "logps/rejected": -213.6127166748047, + "loss": 0.1441, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42509156465530396, + "rewards/margins": 0.1514272391796112, + "rewards/rejected": -0.5765187740325928, + "step": 1100 + }, + { + "epoch": 0.13, + "learning_rate": 4.98331577869629e-06, + "logits/chosen": -1.9955313205718994, + "logits/rejected": -1.7289683818817139, + "logps/chosen": -316.2930908203125, + "logps/rejected": -297.17388916015625, + "loss": 0.1012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4474318027496338, + "rewards/margins": 0.11068395525217056, + "rewards/rejected": -0.5581157207489014, + "step": 1110 + }, + { + "epoch": 0.13, + "learning_rate": 4.982086346987891e-06, + "logits/chosen": -1.8174870014190674, + "logits/rejected": -1.5796586275100708, + "logps/chosen": -252.7262725830078, + "logps/rejected": -244.24661254882812, + "loss": 0.1905, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3619054853916168, + "rewards/margins": 0.08687237650156021, + "rewards/rejected": -0.44877785444259644, + "step": 1120 + }, + { + "epoch": 0.14, + "learning_rate": 4.980813376358157e-06, + "logits/chosen": -1.8165006637573242, + "logits/rejected": -1.5728943347930908, + "logps/chosen": -238.3287353515625, + "logps/rejected": -264.8955993652344, + "loss": 0.1116, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4419601857662201, + "rewards/margins": 0.14847253262996674, + "rewards/rejected": -0.5904327630996704, + "step": 1130 + }, + { + "epoch": 0.14, + "learning_rate": 4.9794968891365955e-06, + "logits/chosen": -1.9940223693847656, + "logits/rejected": -1.602085828781128, + "logps/chosen": -283.71234130859375, + "logps/rejected": -260.02947998046875, + "loss": 0.1771, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42881909012794495, + "rewards/margins": 0.11037082970142365, + "rewards/rejected": -0.5391899347305298, + "step": 1140 + }, + { + "epoch": 0.14, + "learning_rate": 4.978136908416052e-06, + "logits/chosen": -2.128349781036377, + "logits/rejected": -1.6948553323745728, + "logps/chosen": -193.53390502929688, + "logps/rejected": -213.319091796875, + "loss": 0.1714, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41444897651672363, + "rewards/margins": 0.15515312552452087, + "rewards/rejected": -0.5696021318435669, + "step": 1150 + }, + { + "epoch": 0.14, + "learning_rate": 4.976733458052301e-06, + "logits/chosen": -2.0403997898101807, + "logits/rejected": -1.6035503149032593, + "logps/chosen": -197.5802001953125, + "logps/rejected": -190.8645782470703, + "loss": 0.1026, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46744871139526367, + "rewards/margins": 0.2037152796983719, + "rewards/rejected": -0.6711639165878296, + "step": 1160 + }, + { + "epoch": 0.14, + "learning_rate": 4.975286562663629e-06, + "logits/chosen": -2.0656538009643555, + "logits/rejected": -1.832098364830017, + "logps/chosen": -282.55438232421875, + "logps/rejected": -244.44912719726562, + "loss": 0.1259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4940463900566101, + "rewards/margins": 0.11086853593587875, + "rewards/rejected": -0.6049149036407471, + "step": 1170 + }, + { + "epoch": 0.14, + "learning_rate": 4.9737962476304045e-06, + "logits/chosen": -1.9830493927001953, + "logits/rejected": -1.647684097290039, + "logps/chosen": -220.96340942382812, + "logps/rejected": -208.0186767578125, + "loss": 0.1464, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.48578500747680664, + "rewards/margins": 0.15359191596508026, + "rewards/rejected": -0.6393769383430481, + "step": 1180 + }, + { + "epoch": 0.14, + "learning_rate": 4.972262539094633e-06, + "logits/chosen": -1.978539228439331, + "logits/rejected": -1.682488203048706, + "logps/chosen": -226.36874389648438, + "logps/rejected": -201.7385711669922, + "loss": 0.1508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38268616795539856, + "rewards/margins": 0.08130475878715515, + "rewards/rejected": -0.46399086713790894, + "step": 1190 + }, + { + "epoch": 0.14, + "learning_rate": 4.970685463959489e-06, + "logits/chosen": -2.1193437576293945, + "logits/rejected": -1.6852171421051025, + "logps/chosen": -204.7485809326172, + "logps/rejected": -179.21803283691406, + "loss": 0.1222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29113954305648804, + "rewards/margins": 0.18358632922172546, + "rewards/rejected": -0.4747259020805359, + "step": 1200 + }, + { + "epoch": 0.15, + "learning_rate": 4.969065049888861e-06, + "logits/chosen": -2.1825406551361084, + "logits/rejected": -1.6184136867523193, + "logps/chosen": -186.78878784179688, + "logps/rejected": -203.55813598632812, + "loss": 0.1486, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20323626697063446, + "rewards/margins": 0.19247011840343475, + "rewards/rejected": -0.3957063853740692, + "step": 1210 + }, + { + "epoch": 0.15, + "learning_rate": 4.9674013253068535e-06, + "logits/chosen": -2.1128785610198975, + "logits/rejected": -1.8958683013916016, + "logps/chosen": -235.69161987304688, + "logps/rejected": -235.4266357421875, + "loss": 0.1277, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.22614555060863495, + "rewards/margins": 0.1264582872390747, + "rewards/rejected": -0.35260388255119324, + "step": 1220 + }, + { + "epoch": 0.15, + "learning_rate": 4.96569431939729e-06, + "logits/chosen": -1.8693885803222656, + "logits/rejected": -1.6573035717010498, + "logps/chosen": -216.2715301513672, + "logps/rejected": -202.7554473876953, + "loss": 0.1266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3473548889160156, + "rewards/margins": 0.13557687401771545, + "rewards/rejected": -0.4829317033290863, + "step": 1230 + }, + { + "epoch": 0.15, + "learning_rate": 4.963944062103205e-06, + "logits/chosen": -2.0532517433166504, + "logits/rejected": -1.7747167348861694, + "logps/chosen": -238.31423950195312, + "logps/rejected": -231.4252166748047, + "loss": 0.163, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2758443355560303, + "rewards/margins": 0.12897524237632751, + "rewards/rejected": -0.4048195779323578, + "step": 1240 + }, + { + "epoch": 0.15, + "learning_rate": 4.9621505841263155e-06, + "logits/chosen": -1.9278017282485962, + "logits/rejected": -1.6317808628082275, + "logps/chosen": -209.70388793945312, + "logps/rejected": -204.15481567382812, + "loss": 0.1274, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28794723749160767, + "rewards/margins": 0.08738715946674347, + "rewards/rejected": -0.37533441185951233, + "step": 1250 + }, + { + "epoch": 0.15, + "learning_rate": 4.960313916926486e-06, + "logits/chosen": -1.9663488864898682, + "logits/rejected": -1.8491098880767822, + "logps/chosen": -225.07937622070312, + "logps/rejected": -231.40744018554688, + "loss": 0.1182, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.31935936212539673, + "rewards/margins": 0.10864345729351044, + "rewards/rejected": -0.42800283432006836, + "step": 1260 + }, + { + "epoch": 0.15, + "learning_rate": 4.958434092721172e-06, + "logits/chosen": -1.9907243251800537, + "logits/rejected": -1.6675913333892822, + "logps/chosen": -199.87515258789062, + "logps/rejected": -212.6792755126953, + "loss": 0.1512, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2891536056995392, + "rewards/margins": 0.11914797872304916, + "rewards/rejected": -0.40830159187316895, + "step": 1270 + }, + { + "epoch": 0.15, + "learning_rate": 4.956511144484858e-06, + "logits/chosen": -2.0224695205688477, + "logits/rejected": -1.5479836463928223, + "logps/chosen": -303.0483703613281, + "logps/rejected": -241.37060546875, + "loss": 0.1655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4082934856414795, + "rewards/margins": 0.11875990778207779, + "rewards/rejected": -0.5270534157752991, + "step": 1280 + }, + { + "epoch": 0.15, + "learning_rate": 4.954545105948479e-06, + "logits/chosen": -2.2034153938293457, + "logits/rejected": -2.0540757179260254, + "logps/chosen": -282.18048095703125, + "logps/rejected": -286.74310302734375, + "loss": 0.163, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4291737973690033, + "rewards/margins": 0.12008972465991974, + "rewards/rejected": -0.5492635369300842, + "step": 1290 + }, + { + "epoch": 0.16, + "learning_rate": 4.952536011598828e-06, + "logits/chosen": -1.9675910472869873, + "logits/rejected": -1.8413200378417969, + "logps/chosen": -218.7751922607422, + "logps/rejected": -267.6961669921875, + "loss": 0.2403, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3774875998497009, + "rewards/margins": 0.08021329343318939, + "rewards/rejected": -0.45770081877708435, + "step": 1300 + }, + { + "epoch": 0.16, + "learning_rate": 4.950483896677949e-06, + "logits/chosen": -1.9533389806747437, + "logits/rejected": -1.5656068325042725, + "logps/chosen": -268.5809020996094, + "logps/rejected": -251.406494140625, + "loss": 0.0979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43018245697021484, + "rewards/margins": 0.2314719408750534, + "rewards/rejected": -0.6616543531417847, + "step": 1310 + }, + { + "epoch": 0.16, + "learning_rate": 4.948388797182525e-06, + "logits/chosen": -1.9137376546859741, + "logits/rejected": -1.8603311777114868, + "logps/chosen": -179.66416931152344, + "logps/rejected": -240.1300506591797, + "loss": 0.2446, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3417154848575592, + "rewards/margins": 0.15184545516967773, + "rewards/rejected": -0.4935609698295593, + "step": 1320 + }, + { + "epoch": 0.16, + "learning_rate": 4.9462507498632404e-06, + "logits/chosen": -1.8757511377334595, + "logits/rejected": -1.556334376335144, + "logps/chosen": -170.3055419921875, + "logps/rejected": -155.99923706054688, + "loss": 0.1384, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.09021402895450592, + "rewards/margins": 0.061974525451660156, + "rewards/rejected": -0.15218856930732727, + "step": 1330 + }, + { + "epoch": 0.16, + "learning_rate": 4.944069792224138e-06, + "logits/chosen": -2.04624080657959, + "logits/rejected": -1.6785959005355835, + "logps/chosen": -259.4488525390625, + "logps/rejected": -208.564697265625, + "loss": 0.1482, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11900661140680313, + "rewards/margins": 0.09059344977140427, + "rewards/rejected": -0.2096000462770462, + "step": 1340 + }, + { + "epoch": 0.16, + "learning_rate": 4.941845962521961e-06, + "logits/chosen": -2.206084728240967, + "logits/rejected": -1.8071863651275635, + "logps/chosen": -174.04827880859375, + "logps/rejected": -162.29518127441406, + "loss": 0.1481, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22567129135131836, + "rewards/margins": 0.115481436252594, + "rewards/rejected": -0.34115272760391235, + "step": 1350 + }, + { + "epoch": 0.16, + "learning_rate": 4.939579299765485e-06, + "logits/chosen": -2.1437458992004395, + "logits/rejected": -1.828401803970337, + "logps/chosen": -189.3341522216797, + "logps/rejected": -232.9872283935547, + "loss": 0.1491, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3480433225631714, + "rewards/margins": 0.12997707724571228, + "rewards/rejected": -0.47802042961120605, + "step": 1360 + }, + { + "epoch": 0.16, + "learning_rate": 4.937269843714831e-06, + "logits/chosen": -1.8172667026519775, + "logits/rejected": -1.5740686655044556, + "logps/chosen": -230.97781372070312, + "logps/rejected": -243.69906616210938, + "loss": 0.1532, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3387555181980133, + "rewards/margins": 0.12509162724018097, + "rewards/rejected": -0.46384716033935547, + "step": 1370 + }, + { + "epoch": 0.17, + "learning_rate": 4.934917634880766e-06, + "logits/chosen": -1.8426252603530884, + "logits/rejected": -1.6131852865219116, + "logps/chosen": -208.56918334960938, + "logps/rejected": -232.096435546875, + "loss": 0.1422, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3319644033908844, + "rewards/margins": 0.11488902568817139, + "rewards/rejected": -0.4468534588813782, + "step": 1380 + }, + { + "epoch": 0.17, + "learning_rate": 4.932522714523996e-06, + "logits/chosen": -1.8893773555755615, + "logits/rejected": -1.7410329580307007, + "logps/chosen": -207.05514526367188, + "logps/rejected": -207.72817993164062, + "loss": 0.1441, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.40233850479125977, + "rewards/margins": 0.09210322797298431, + "rewards/rejected": -0.4944417476654053, + "step": 1390 + }, + { + "epoch": 0.17, + "learning_rate": 4.930085124654443e-06, + "logits/chosen": -2.0386033058166504, + "logits/rejected": -1.3505053520202637, + "logps/chosen": -369.05743408203125, + "logps/rejected": -262.7327575683594, + "loss": 0.144, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38073664903640747, + "rewards/margins": 0.19237622618675232, + "rewards/rejected": -0.5731129050254822, + "step": 1400 + }, + { + "epoch": 0.17, + "learning_rate": 4.927604908030503e-06, + "logits/chosen": -1.893441915512085, + "logits/rejected": -1.7213819026947021, + "logps/chosen": -203.71377563476562, + "logps/rejected": -231.38339233398438, + "loss": 0.2063, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2586003541946411, + "rewards/margins": 0.1567048281431198, + "rewards/rejected": -0.4153051972389221, + "step": 1410 + }, + { + "epoch": 0.17, + "learning_rate": 4.9250821081583e-06, + "logits/chosen": -1.9134151935577393, + "logits/rejected": -1.5941439867019653, + "logps/chosen": -238.3423309326172, + "logps/rejected": -239.90673828125, + "loss": 0.1188, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3424052298069, + "rewards/margins": 0.14903725683689117, + "rewards/rejected": -0.49144238233566284, + "step": 1420 + }, + { + "epoch": 0.17, + "learning_rate": 4.922516769290921e-06, + "logits/chosen": -2.1062850952148438, + "logits/rejected": -1.5940475463867188, + "logps/chosen": -285.4846496582031, + "logps/rejected": -267.6663818359375, + "loss": 0.0838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41442838311195374, + "rewards/margins": 0.16315698623657227, + "rewards/rejected": -0.5775853395462036, + "step": 1430 + }, + { + "epoch": 0.17, + "learning_rate": 4.919908936427643e-06, + "logits/chosen": -1.9641939401626587, + "logits/rejected": -1.6408805847167969, + "logps/chosen": -233.9027862548828, + "logps/rejected": -241.10244750976562, + "loss": 0.1349, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.31380248069763184, + "rewards/margins": 0.1510525941848755, + "rewards/rejected": -0.4648551344871521, + "step": 1440 + }, + { + "epoch": 0.17, + "learning_rate": 4.917258655313137e-06, + "logits/chosen": -1.9187742471694946, + "logits/rejected": -1.7553646564483643, + "logps/chosen": -183.12078857421875, + "logps/rejected": -215.263916015625, + "loss": 0.1463, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45299237966537476, + "rewards/margins": 0.14117896556854248, + "rewards/rejected": -0.5941713452339172, + "step": 1450 + }, + { + "epoch": 0.18, + "learning_rate": 4.914565972436677e-06, + "logits/chosen": -1.9224255084991455, + "logits/rejected": -1.568176031112671, + "logps/chosen": -243.716552734375, + "logps/rejected": -229.509765625, + "loss": 0.1259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.32048431038856506, + "rewards/margins": 0.1995118111371994, + "rewards/rejected": -0.5199961066246033, + "step": 1460 + }, + { + "epoch": 0.18, + "learning_rate": 4.911830935031308e-06, + "logits/chosen": -1.7421767711639404, + "logits/rejected": -1.6233383417129517, + "logps/chosen": -248.0158233642578, + "logps/rejected": -251.9866485595703, + "loss": 0.1802, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.37991154193878174, + "rewards/margins": 0.10848450660705566, + "rewards/rejected": -0.4883960783481598, + "step": 1470 + }, + { + "epoch": 0.18, + "learning_rate": 4.909053591073034e-06, + "logits/chosen": -1.8475421667099, + "logits/rejected": -1.5178005695343018, + "logps/chosen": -234.1063995361328, + "logps/rejected": -198.0278778076172, + "loss": 0.188, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21273323893547058, + "rewards/margins": 0.10500024259090424, + "rewards/rejected": -0.3177334666252136, + "step": 1480 + }, + { + "epoch": 0.18, + "learning_rate": 4.906233989279967e-06, + "logits/chosen": -2.128617525100708, + "logits/rejected": -1.6844680309295654, + "logps/chosen": -254.1748504638672, + "logps/rejected": -218.5316162109375, + "loss": 0.1283, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23462820053100586, + "rewards/margins": 0.07426507025957108, + "rewards/rejected": -0.30889326333999634, + "step": 1490 + }, + { + "epoch": 0.18, + "learning_rate": 4.903372179111473e-06, + "logits/chosen": -1.8551340103149414, + "logits/rejected": -1.7969564199447632, + "logps/chosen": -245.1498565673828, + "logps/rejected": -237.3717498779297, + "loss": 0.1349, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26474660634994507, + "rewards/margins": 0.06484004855155945, + "rewards/rejected": -0.3295866847038269, + "step": 1500 + }, + { + "epoch": 0.18, + "learning_rate": 4.900468210767309e-06, + "logits/chosen": -1.875862717628479, + "logits/rejected": -1.5309476852416992, + "logps/chosen": -220.53408813476562, + "logps/rejected": -184.7772674560547, + "loss": 0.2182, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20155379176139832, + "rewards/margins": 0.08043310791254044, + "rewards/rejected": -0.28198689222335815, + "step": 1510 + }, + { + "epoch": 0.18, + "learning_rate": 4.897522135186737e-06, + "logits/chosen": -2.022017478942871, + "logits/rejected": -1.7306649684906006, + "logps/chosen": -266.8653869628906, + "logps/rejected": -258.17962646484375, + "loss": 0.1517, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2780263423919678, + "rewards/margins": 0.12058179080486298, + "rewards/rejected": -0.39860814809799194, + "step": 1520 + }, + { + "epoch": 0.18, + "learning_rate": 4.894534004047635e-06, + "logits/chosen": -2.042154312133789, + "logits/rejected": -1.7138588428497314, + "logps/chosen": -322.60791015625, + "logps/rejected": -300.8125915527344, + "loss": 0.0724, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.37096601724624634, + "rewards/margins": 0.13116590678691864, + "rewards/rejected": -0.502131998538971, + "step": 1530 + }, + { + "epoch": 0.18, + "learning_rate": 4.891503869765586e-06, + "logits/chosen": -2.0083236694335938, + "logits/rejected": -1.7603801488876343, + "logps/chosen": -266.07330322265625, + "logps/rejected": -251.0727081298828, + "loss": 0.0935, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.27762866020202637, + "rewards/margins": 0.21909542381763458, + "rewards/rejected": -0.49672412872314453, + "step": 1540 + }, + { + "epoch": 0.19, + "learning_rate": 4.888431785492964e-06, + "logits/chosen": -2.025075912475586, + "logits/rejected": -1.55344820022583, + "logps/chosen": -198.4198455810547, + "logps/rejected": -193.17062377929688, + "loss": 0.1424, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22466003894805908, + "rewards/margins": 0.1589314192533493, + "rewards/rejected": -0.3835914731025696, + "step": 1550 + }, + { + "epoch": 0.19, + "learning_rate": 4.8853178051179965e-06, + "logits/chosen": -2.004646062850952, + "logits/rejected": -1.5767240524291992, + "logps/chosen": -275.49566650390625, + "logps/rejected": -211.9617462158203, + "loss": 0.1153, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24958455562591553, + "rewards/margins": 0.12301850318908691, + "rewards/rejected": -0.37260305881500244, + "step": 1560 + }, + { + "epoch": 0.19, + "learning_rate": 4.882161983263822e-06, + "logits/chosen": -1.9383262395858765, + "logits/rejected": -1.6864608526229858, + "logps/chosen": -215.1851348876953, + "logps/rejected": -219.91763305664062, + "loss": 0.1282, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21753136813640594, + "rewards/margins": 0.07913483679294586, + "rewards/rejected": -0.2966662049293518, + "step": 1570 + }, + { + "epoch": 0.19, + "learning_rate": 4.8789643752875315e-06, + "logits/chosen": -2.1681385040283203, + "logits/rejected": -1.5037636756896973, + "logps/chosen": -296.7191467285156, + "logps/rejected": -210.6960906982422, + "loss": 0.0908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.13743911683559418, + "rewards/margins": 0.14550727605819702, + "rewards/rejected": -0.2829464077949524, + "step": 1580 + }, + { + "epoch": 0.19, + "learning_rate": 4.875725037279197e-06, + "logits/chosen": -2.100879669189453, + "logits/rejected": -1.6887743473052979, + "logps/chosen": -275.7843322753906, + "logps/rejected": -254.24166870117188, + "loss": 0.106, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2630919814109802, + "rewards/margins": 0.1653970181941986, + "rewards/rejected": -0.4284890294075012, + "step": 1590 + }, + { + "epoch": 0.19, + "learning_rate": 4.8724440260608885e-06, + "logits/chosen": -2.100240707397461, + "logits/rejected": -1.6715034246444702, + "logps/chosen": -215.9812774658203, + "logps/rejected": -220.94790649414062, + "loss": 0.1162, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.312126100063324, + "rewards/margins": 0.20332340896129608, + "rewards/rejected": -0.515449583530426, + "step": 1600 + }, + { + "epoch": 0.19, + "learning_rate": 4.8691213991856755e-06, + "logits/chosen": -2.143995523452759, + "logits/rejected": -1.8109419345855713, + "logps/chosen": -219.9774932861328, + "logps/rejected": -207.9420623779297, + "loss": 0.1514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3066982924938202, + "rewards/margins": 0.12182126939296722, + "rewards/rejected": -0.4285196363925934, + "step": 1610 + }, + { + "epoch": 0.19, + "learning_rate": 4.8657572149366195e-06, + "logits/chosen": -2.0417563915252686, + "logits/rejected": -1.8656389713287354, + "logps/chosen": -240.6365203857422, + "logps/rejected": -234.8029022216797, + "loss": 0.1367, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2173691689968109, + "rewards/margins": 0.11171890795230865, + "rewards/rejected": -0.32908809185028076, + "step": 1620 + }, + { + "epoch": 0.2, + "learning_rate": 4.8623515323257496e-06, + "logits/chosen": -1.8849719762802124, + "logits/rejected": -1.764593482017517, + "logps/chosen": -237.77490234375, + "logps/rejected": -266.98089599609375, + "loss": 0.138, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3375057876110077, + "rewards/margins": 0.1129181832075119, + "rewards/rejected": -0.4504240155220032, + "step": 1630 + }, + { + "epoch": 0.2, + "learning_rate": 4.85890441109303e-06, + "logits/chosen": -1.9908783435821533, + "logits/rejected": -1.6857595443725586, + "logps/chosen": -254.00650024414062, + "logps/rejected": -248.4944610595703, + "loss": 0.1031, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35663676261901855, + "rewards/margins": 0.18786312639713287, + "rewards/rejected": -0.544499933719635, + "step": 1640 + }, + { + "epoch": 0.2, + "learning_rate": 4.855415911705308e-06, + "logits/chosen": -2.0321478843688965, + "logits/rejected": -1.9031927585601807, + "logps/chosen": -243.13436889648438, + "logps/rejected": -255.0236053466797, + "loss": 0.1124, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29369717836380005, + "rewards/margins": 0.11365096271038055, + "rewards/rejected": -0.4073481559753418, + "step": 1650 + }, + { + "epoch": 0.2, + "learning_rate": 4.851886095355259e-06, + "logits/chosen": -2.0734264850616455, + "logits/rejected": -1.5513179302215576, + "logps/chosen": -311.870849609375, + "logps/rejected": -246.34963989257812, + "loss": 0.3146, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44044867157936096, + "rewards/margins": 0.06915672868490219, + "rewards/rejected": -0.509605348110199, + "step": 1660 + }, + { + "epoch": 0.2, + "learning_rate": 4.848315023960308e-06, + "logits/chosen": -2.0195250511169434, + "logits/rejected": -1.5690950155258179, + "logps/chosen": -245.0861053466797, + "logps/rejected": -176.3268280029297, + "loss": 0.1332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.32303136587142944, + "rewards/margins": 0.14561942219734192, + "rewards/rejected": -0.46865081787109375, + "step": 1670 + }, + { + "epoch": 0.2, + "learning_rate": 4.844702760161546e-06, + "logits/chosen": -1.9474140405654907, + "logits/rejected": -1.498877763748169, + "logps/chosen": -214.76220703125, + "logps/rejected": -202.29042053222656, + "loss": 0.1529, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34490758180618286, + "rewards/margins": 0.16737958788871765, + "rewards/rejected": -0.5122871398925781, + "step": 1680 + }, + { + "epoch": 0.2, + "learning_rate": 4.841049367322631e-06, + "logits/chosen": -1.878689169883728, + "logits/rejected": -1.5493450164794922, + "logps/chosen": -263.93896484375, + "logps/rejected": -252.0777587890625, + "loss": 0.112, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32466429471969604, + "rewards/margins": 0.15016327798366547, + "rewards/rejected": -0.4748276174068451, + "step": 1690 + }, + { + "epoch": 0.2, + "learning_rate": 4.837354909528675e-06, + "logits/chosen": -1.8449478149414062, + "logits/rejected": -1.8139785528182983, + "logps/chosen": -187.8353271484375, + "logps/rejected": -203.24066162109375, + "loss": 0.2003, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4078170359134674, + "rewards/margins": 0.08712447434663773, + "rewards/rejected": -0.49494147300720215, + "step": 1700 + }, + { + "epoch": 0.21, + "learning_rate": 4.833619451585122e-06, + "logits/chosen": -1.8499475717544556, + "logits/rejected": -1.498587727546692, + "logps/chosen": -243.45242309570312, + "logps/rejected": -231.98251342773438, + "loss": 0.1135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3529844284057617, + "rewards/margins": 0.16045571863651276, + "rewards/rejected": -0.5134401321411133, + "step": 1710 + }, + { + "epoch": 0.21, + "learning_rate": 4.829843059016611e-06, + "logits/chosen": -1.81149423122406, + "logits/rejected": -1.4992173910140991, + "logps/chosen": -184.52310180664062, + "logps/rejected": -201.0103759765625, + "loss": 0.1903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.35175323486328125, + "rewards/margins": 0.14889448881149292, + "rewards/rejected": -0.5006477236747742, + "step": 1720 + }, + { + "epoch": 0.21, + "learning_rate": 4.826025798065823e-06, + "logits/chosen": -1.8906818628311157, + "logits/rejected": -1.7450675964355469, + "logps/chosen": -254.50241088867188, + "logps/rejected": -233.2667999267578, + "loss": 0.195, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4035833775997162, + "rewards/margins": 0.10915178060531616, + "rewards/rejected": -0.5127351880073547, + "step": 1730 + }, + { + "epoch": 0.21, + "learning_rate": 4.8221677356923255e-06, + "logits/chosen": -1.822003722190857, + "logits/rejected": -1.6503994464874268, + "logps/chosen": -194.3645782470703, + "logps/rejected": -234.00198364257812, + "loss": 0.19, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.42598286271095276, + "rewards/margins": 0.17088128626346588, + "rewards/rejected": -0.5968641638755798, + "step": 1740 + }, + { + "epoch": 0.21, + "learning_rate": 4.8182689395713925e-06, + "logits/chosen": -1.911811113357544, + "logits/rejected": -1.4322118759155273, + "logps/chosen": -241.7019500732422, + "logps/rejected": -241.4373016357422, + "loss": 0.1291, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4872862696647644, + "rewards/margins": 0.21816511452198029, + "rewards/rejected": -0.7054513692855835, + "step": 1750 + }, + { + "epoch": 0.21, + "learning_rate": 4.814329478092818e-06, + "logits/chosen": -2.0462019443511963, + "logits/rejected": -1.638604760169983, + "logps/chosen": -279.9619445800781, + "logps/rejected": -274.87408447265625, + "loss": 0.1234, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4822389483451843, + "rewards/margins": 0.10693428665399551, + "rewards/rejected": -0.5891731977462769, + "step": 1760 + }, + { + "epoch": 0.21, + "learning_rate": 4.810349420359722e-06, + "logits/chosen": -1.8592302799224854, + "logits/rejected": -1.4212658405303955, + "logps/chosen": -241.1697998046875, + "logps/rejected": -256.21026611328125, + "loss": 0.1746, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5237879753112793, + "rewards/margins": 0.18233174085617065, + "rewards/rejected": -0.70611971616745, + "step": 1770 + }, + { + "epoch": 0.21, + "learning_rate": 4.806328836187328e-06, + "logits/chosen": -1.9457238912582397, + "logits/rejected": -1.5514074563980103, + "logps/chosen": -258.5703430175781, + "logps/rejected": -231.8943634033203, + "loss": 0.1483, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.43616265058517456, + "rewards/margins": 0.18835784494876862, + "rewards/rejected": -0.624520480632782, + "step": 1780 + }, + { + "epoch": 0.21, + "learning_rate": 4.802267796101749e-06, + "logits/chosen": -1.8042892217636108, + "logits/rejected": -1.4870389699935913, + "logps/chosen": -296.75115966796875, + "logps/rejected": -246.88882446289062, + "loss": 0.1334, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48000073432922363, + "rewards/margins": 0.14861378073692322, + "rewards/rejected": -0.6286145448684692, + "step": 1790 + }, + { + "epoch": 0.22, + "learning_rate": 4.798166371338745e-06, + "logits/chosen": -1.9880012273788452, + "logits/rejected": -1.7663252353668213, + "logps/chosen": -252.8795623779297, + "logps/rejected": -283.4180603027344, + "loss": 0.1518, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3115619122982025, + "rewards/margins": 0.16061726212501526, + "rewards/rejected": -0.4721791744232178, + "step": 1800 + }, + { + "epoch": 0.22, + "learning_rate": 4.79402463384247e-06, + "logits/chosen": -1.9161767959594727, + "logits/rejected": -1.753260850906372, + "logps/chosen": -244.0667266845703, + "logps/rejected": -268.25164794921875, + "loss": 0.1697, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.299142062664032, + "rewards/margins": 0.11554409563541412, + "rewards/rejected": -0.4146861433982849, + "step": 1810 + }, + { + "epoch": 0.22, + "learning_rate": 4.78984265626422e-06, + "logits/chosen": -1.8210163116455078, + "logits/rejected": -1.640275001525879, + "logps/chosen": -195.47975158691406, + "logps/rejected": -204.36692810058594, + "loss": 0.1215, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2043401002883911, + "rewards/margins": 0.11669757217168808, + "rewards/rejected": -0.3210376501083374, + "step": 1820 + }, + { + "epoch": 0.22, + "learning_rate": 4.785620511961148e-06, + "logits/chosen": -2.0741043090820312, + "logits/rejected": -1.788116455078125, + "logps/chosen": -263.5960388183594, + "logps/rejected": -245.22525024414062, + "loss": 0.1444, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22155031561851501, + "rewards/margins": 0.10210974514484406, + "rewards/rejected": -0.3236600458621979, + "step": 1830 + }, + { + "epoch": 0.22, + "learning_rate": 4.781358274994985e-06, + "logits/chosen": -2.1329751014709473, + "logits/rejected": -1.7640917301177979, + "logps/chosen": -227.3170928955078, + "logps/rejected": -210.49746704101562, + "loss": 0.1666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2090965211391449, + "rewards/margins": 0.1362505704164505, + "rewards/rejected": -0.3453471064567566, + "step": 1840 + }, + { + "epoch": 0.22, + "learning_rate": 4.777056020130737e-06, + "logits/chosen": -2.188413143157959, + "logits/rejected": -1.6932777166366577, + "logps/chosen": -314.6272277832031, + "logps/rejected": -289.4335632324219, + "loss": 0.1244, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3082244098186493, + "rewards/margins": 0.21150562167167664, + "rewards/rejected": -0.5197300314903259, + "step": 1850 + }, + { + "epoch": 0.22, + "learning_rate": 4.772713822835374e-06, + "logits/chosen": -1.8019129037857056, + "logits/rejected": -1.4269254207611084, + "logps/chosen": -215.6371307373047, + "logps/rejected": -216.8037567138672, + "loss": 0.1824, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3367902636528015, + "rewards/margins": 0.19317738711833954, + "rewards/rejected": -0.5299676656723022, + "step": 1860 + }, + { + "epoch": 0.22, + "learning_rate": 4.768331759276506e-06, + "logits/chosen": -2.0352442264556885, + "logits/rejected": -1.8071515560150146, + "logps/chosen": -285.3484802246094, + "logps/rejected": -280.1053161621094, + "loss": 0.084, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29302871227264404, + "rewards/margins": 0.1241946592926979, + "rewards/rejected": -0.41722336411476135, + "step": 1870 + }, + { + "epoch": 0.23, + "learning_rate": 4.763909906321048e-06, + "logits/chosen": -1.9953176975250244, + "logits/rejected": -1.617118239402771, + "logps/chosen": -248.2249755859375, + "logps/rejected": -225.35562133789062, + "loss": 0.1598, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24114413559436798, + "rewards/margins": 0.1202860102057457, + "rewards/rejected": -0.36143016815185547, + "step": 1880 + }, + { + "epoch": 0.23, + "learning_rate": 4.759448341533872e-06, + "logits/chosen": -1.7579656839370728, + "logits/rejected": -1.539206624031067, + "logps/chosen": -255.2971649169922, + "logps/rejected": -270.1570739746094, + "loss": 0.1665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.401685893535614, + "rewards/margins": 0.19057750701904297, + "rewards/rejected": -0.5922634601593018, + "step": 1890 + }, + { + "epoch": 0.23, + "learning_rate": 4.754947143176445e-06, + "logits/chosen": -1.880765676498413, + "logits/rejected": -1.4084635972976685, + "logps/chosen": -198.8912353515625, + "logps/rejected": -170.9627227783203, + "loss": 0.1344, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.33145859837532043, + "rewards/margins": 0.16721078753471375, + "rewards/rejected": -0.4986693859100342, + "step": 1900 + }, + { + "epoch": 0.23, + "learning_rate": 4.750406390205456e-06, + "logits/chosen": -1.9753971099853516, + "logits/rejected": -1.8784997463226318, + "logps/chosen": -285.95343017578125, + "logps/rejected": -265.814453125, + "loss": 0.1751, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.28710874915122986, + "rewards/margins": 0.07001986354589462, + "rewards/rejected": -0.3571286201477051, + "step": 1910 + }, + { + "epoch": 0.23, + "learning_rate": 4.745826162271433e-06, + "logits/chosen": -2.2576117515563965, + "logits/rejected": -1.8165054321289062, + "logps/chosen": -266.011474609375, + "logps/rejected": -255.41897583007812, + "loss": 0.1212, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22302386164665222, + "rewards/margins": 0.15649394690990448, + "rewards/rejected": -0.3795178532600403, + "step": 1920 + }, + { + "epoch": 0.23, + "learning_rate": 4.741206539717343e-06, + "logits/chosen": -2.0917410850524902, + "logits/rejected": -1.5043323040008545, + "logps/chosen": -253.68002319335938, + "logps/rejected": -229.77005004882812, + "loss": 0.1366, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18172211945056915, + "rewards/margins": 0.23254887759685516, + "rewards/rejected": -0.41427096724510193, + "step": 1930 + }, + { + "epoch": 0.23, + "learning_rate": 4.736547603577185e-06, + "logits/chosen": -1.7454341650009155, + "logits/rejected": -1.6577666997909546, + "logps/chosen": -199.49789428710938, + "logps/rejected": -199.80279541015625, + "loss": 0.1921, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3379337191581726, + "rewards/margins": 0.12971019744873047, + "rewards/rejected": -0.4676439166069031, + "step": 1940 + }, + { + "epoch": 0.23, + "learning_rate": 4.731849435574568e-06, + "logits/chosen": -2.069859504699707, + "logits/rejected": -1.7830657958984375, + "logps/chosen": -232.3899688720703, + "logps/rejected": -232.227294921875, + "loss": 0.1621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2579636573791504, + "rewards/margins": 0.18101055920124054, + "rewards/rejected": -0.4389742314815521, + "step": 1950 + }, + { + "epoch": 0.24, + "learning_rate": 4.727112118121279e-06, + "logits/chosen": -2.024989366531372, + "logits/rejected": -1.809133768081665, + "logps/chosen": -225.9645538330078, + "logps/rejected": -218.0653533935547, + "loss": 0.1256, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27211037278175354, + "rewards/margins": 0.13195666670799255, + "rewards/rejected": -0.4040670394897461, + "step": 1960 + }, + { + "epoch": 0.24, + "learning_rate": 4.722335734315833e-06, + "logits/chosen": -1.990189790725708, + "logits/rejected": -1.5594546794891357, + "logps/chosen": -290.8906555175781, + "logps/rejected": -227.53616333007812, + "loss": 0.0908, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2791885733604431, + "rewards/margins": 0.1558622419834137, + "rewards/rejected": -0.4350507855415344, + "step": 1970 + }, + { + "epoch": 0.24, + "learning_rate": 4.7175203679420175e-06, + "logits/chosen": -1.9072424173355103, + "logits/rejected": -1.51438307762146, + "logps/chosen": -216.40524291992188, + "logps/rejected": -228.6339874267578, + "loss": 0.1292, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.37562626600265503, + "rewards/margins": 0.20719614624977112, + "rewards/rejected": -0.582822322845459, + "step": 1980 + }, + { + "epoch": 0.24, + "learning_rate": 4.712666103467428e-06, + "logits/chosen": -2.0311849117279053, + "logits/rejected": -1.8723666667938232, + "logps/chosen": -233.4354705810547, + "logps/rejected": -218.10659790039062, + "loss": 0.155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2636135220527649, + "rewards/margins": 0.1285993456840515, + "rewards/rejected": -0.3922128677368164, + "step": 1990 + }, + { + "epoch": 0.24, + "learning_rate": 4.707773026041975e-06, + "logits/chosen": -2.1100738048553467, + "logits/rejected": -1.8569968938827515, + "logps/chosen": -268.63885498046875, + "logps/rejected": -232.58578491210938, + "loss": 0.1733, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.27281227707862854, + "rewards/margins": 0.13495466113090515, + "rewards/rejected": -0.4077669084072113, + "step": 2000 + }, + { + "epoch": 0.24, + "learning_rate": 4.702841221496403e-06, + "logits/chosen": -1.991676688194275, + "logits/rejected": -1.5846397876739502, + "logps/chosen": -257.11932373046875, + "logps/rejected": -221.46017456054688, + "loss": 0.1592, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.24664053320884705, + "rewards/margins": 0.13256794214248657, + "rewards/rejected": -0.3792084753513336, + "step": 2010 + }, + { + "epoch": 0.24, + "learning_rate": 4.697870776340776e-06, + "logits/chosen": -2.2750840187072754, + "logits/rejected": -1.7563819885253906, + "logps/chosen": -235.4178009033203, + "logps/rejected": -190.757080078125, + "loss": 0.1656, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1487046480178833, + "rewards/margins": 0.1518375426530838, + "rewards/rejected": -0.3005422055721283, + "step": 2020 + }, + { + "epoch": 0.24, + "learning_rate": 4.692861777762963e-06, + "logits/chosen": -2.0295958518981934, + "logits/rejected": -1.5150290727615356, + "logps/chosen": -232.8603973388672, + "logps/rejected": -198.23338317871094, + "loss": 0.119, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18991556763648987, + "rewards/margins": 0.1996304988861084, + "rewards/rejected": -0.3895460069179535, + "step": 2030 + }, + { + "epoch": 0.24, + "learning_rate": 4.68781431362711e-06, + "logits/chosen": -2.090059518814087, + "logits/rejected": -1.5975253582000732, + "logps/chosen": -268.8729553222656, + "logps/rejected": -242.9998779296875, + "loss": 0.1739, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.427692174911499, + "rewards/margins": 0.1998465359210968, + "rewards/rejected": -0.6275386810302734, + "step": 2040 + }, + { + "epoch": 0.25, + "learning_rate": 4.6827284724720955e-06, + "logits/chosen": -2.05842924118042, + "logits/rejected": -1.7655296325683594, + "logps/chosen": -258.84442138671875, + "logps/rejected": -226.68350219726562, + "loss": 0.1977, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3383828103542328, + "rewards/margins": 0.13462142646312714, + "rewards/rejected": -0.4730042517185211, + "step": 2050 + }, + { + "epoch": 0.25, + "learning_rate": 4.677604343509981e-06, + "logits/chosen": -2.037433385848999, + "logits/rejected": -1.5807982683181763, + "logps/chosen": -238.64389038085938, + "logps/rejected": -213.24490356445312, + "loss": 0.1261, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2423369586467743, + "rewards/margins": 0.174465611577034, + "rewards/rejected": -0.4168025553226471, + "step": 2060 + }, + { + "epoch": 0.25, + "learning_rate": 4.672442016624444e-06, + "logits/chosen": -2.1892218589782715, + "logits/rejected": -1.862447738647461, + "logps/chosen": -290.03985595703125, + "logps/rejected": -240.31771850585938, + "loss": 0.1574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2680838704109192, + "rewards/margins": 0.11730837821960449, + "rewards/rejected": -0.3853922486305237, + "step": 2070 + }, + { + "epoch": 0.25, + "learning_rate": 4.6672415823692e-06, + "logits/chosen": -1.8660366535186768, + "logits/rejected": -1.5226314067840576, + "logps/chosen": -304.72833251953125, + "logps/rejected": -276.5460205078125, + "loss": 0.1431, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32084205746650696, + "rewards/margins": 0.15598034858703613, + "rewards/rejected": -0.4768224358558655, + "step": 2080 + }, + { + "epoch": 0.25, + "learning_rate": 4.662003131966418e-06, + "logits/chosen": -2.167304277420044, + "logits/rejected": -1.6622650623321533, + "logps/chosen": -235.3962860107422, + "logps/rejected": -217.3386688232422, + "loss": 0.1545, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2755299210548401, + "rewards/margins": 0.10942339897155762, + "rewards/rejected": -0.3849533200263977, + "step": 2090 + }, + { + "epoch": 0.25, + "learning_rate": 4.6567267573051176e-06, + "logits/chosen": -1.8638086318969727, + "logits/rejected": -1.7130645513534546, + "logps/chosen": -219.14736938476562, + "logps/rejected": -230.8884735107422, + "loss": 0.1861, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.282665878534317, + "rewards/margins": 0.05729461461305618, + "rewards/rejected": -0.339960515499115, + "step": 2100 + }, + { + "epoch": 0.25, + "learning_rate": 4.651412550939556e-06, + "logits/chosen": -2.023266553878784, + "logits/rejected": -1.448335886001587, + "logps/chosen": -222.55819702148438, + "logps/rejected": -192.7770538330078, + "loss": 0.1121, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2549906075000763, + "rewards/margins": 0.18708500266075134, + "rewards/rejected": -0.44207563996315, + "step": 2110 + }, + { + "epoch": 0.25, + "learning_rate": 4.646060606087608e-06, + "logits/chosen": -1.9137452840805054, + "logits/rejected": -1.637158751487732, + "logps/chosen": -258.3423767089844, + "logps/rejected": -228.5346221923828, + "loss": 0.1564, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.3416040241718292, + "rewards/margins": 0.14088527858257294, + "rewards/rejected": -0.4824892580509186, + "step": 2120 + }, + { + "epoch": 0.26, + "learning_rate": 4.640671016629129e-06, + "logits/chosen": -1.8286240100860596, + "logits/rejected": -1.660211205482483, + "logps/chosen": -262.9971008300781, + "logps/rejected": -271.8037109375, + "loss": 0.1143, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4632445275783539, + "rewards/margins": 0.1317237764596939, + "rewards/rejected": -0.594968318939209, + "step": 2130 + }, + { + "epoch": 0.26, + "learning_rate": 4.635243877104307e-06, + "logits/chosen": -2.006418466567993, + "logits/rejected": -1.716923475265503, + "logps/chosen": -250.9329376220703, + "logps/rejected": -260.55462646484375, + "loss": 0.1458, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.34360507130622864, + "rewards/margins": 0.0931440070271492, + "rewards/rejected": -0.43674907088279724, + "step": 2140 + }, + { + "epoch": 0.26, + "learning_rate": 4.629779282712007e-06, + "logits/chosen": -1.8346214294433594, + "logits/rejected": -1.4906994104385376, + "logps/chosen": -262.6698913574219, + "logps/rejected": -255.5764923095703, + "loss": 0.1224, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4487072825431824, + "rewards/margins": 0.2172963172197342, + "rewards/rejected": -0.6660035848617554, + "step": 2150 + }, + { + "epoch": 0.26, + "learning_rate": 4.6242773293080965e-06, + "logits/chosen": -2.074744462966919, + "logits/rejected": -1.5857570171356201, + "logps/chosen": -322.7615966796875, + "logps/rejected": -300.53790283203125, + "loss": 0.076, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47309666872024536, + "rewards/margins": 0.2397969663143158, + "rewards/rejected": -0.712893545627594, + "step": 2160 + }, + { + "epoch": 0.26, + "learning_rate": 4.618738113403772e-06, + "logits/chosen": -1.9601848125457764, + "logits/rejected": -1.3724687099456787, + "logps/chosen": -329.15814208984375, + "logps/rejected": -296.91790771484375, + "loss": 0.0874, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4118635058403015, + "rewards/margins": 0.20115897059440613, + "rewards/rejected": -0.6130224466323853, + "step": 2170 + }, + { + "epoch": 0.26, + "learning_rate": 4.613161732163857e-06, + "logits/chosen": -2.002680540084839, + "logits/rejected": -1.7260891199111938, + "logps/chosen": -218.44058227539062, + "logps/rejected": -206.19589233398438, + "loss": 0.1605, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4169228672981262, + "rewards/margins": 0.15920642018318176, + "rewards/rejected": -0.5761292576789856, + "step": 2180 + }, + { + "epoch": 0.26, + "learning_rate": 4.607548283405103e-06, + "logits/chosen": -2.2463881969451904, + "logits/rejected": -1.870919942855835, + "logps/chosen": -271.3766784667969, + "logps/rejected": -237.7291259765625, + "loss": 0.1687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3008974492549896, + "rewards/margins": 0.21275082230567932, + "rewards/rejected": -0.5136483311653137, + "step": 2190 + }, + { + "epoch": 0.26, + "learning_rate": 4.601897865594473e-06, + "logits/chosen": -2.1344265937805176, + "logits/rejected": -1.807756781578064, + "logps/chosen": -254.6204833984375, + "logps/rejected": -278.7408142089844, + "loss": 0.0969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2825480103492737, + "rewards/margins": 0.14841003715991974, + "rewards/rejected": -0.43095797300338745, + "step": 2200 + }, + { + "epoch": 0.27, + "learning_rate": 4.596210577847415e-06, + "logits/chosen": -1.8466428518295288, + "logits/rejected": -1.4773153066635132, + "logps/chosen": -221.1357421875, + "logps/rejected": -214.00961303710938, + "loss": 0.1446, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2894059717655182, + "rewards/margins": 0.18356752395629883, + "rewards/rejected": -0.4729735255241394, + "step": 2210 + }, + { + "epoch": 0.27, + "learning_rate": 4.590486519926118e-06, + "logits/chosen": -1.8348041772842407, + "logits/rejected": -1.76007080078125, + "logps/chosen": -247.6072540283203, + "logps/rejected": -254.64437866210938, + "loss": 0.1073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30001839995384216, + "rewards/margins": 0.1468794047832489, + "rewards/rejected": -0.44689780473709106, + "step": 2220 + }, + { + "epoch": 0.27, + "learning_rate": 4.584725792237772e-06, + "logits/chosen": -1.8341724872589111, + "logits/rejected": -1.4840484857559204, + "logps/chosen": -281.08843994140625, + "logps/rejected": -287.3544006347656, + "loss": 0.1452, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3432127833366394, + "rewards/margins": 0.16836020350456238, + "rewards/rejected": -0.5115729570388794, + "step": 2230 + }, + { + "epoch": 0.27, + "learning_rate": 4.578928495832795e-06, + "logits/chosen": -2.124887466430664, + "logits/rejected": -1.5507080554962158, + "logps/chosen": -280.98626708984375, + "logps/rejected": -227.08627319335938, + "loss": 0.1314, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.43157821893692017, + "rewards/margins": 0.19229252636432648, + "rewards/rejected": -0.6238707304000854, + "step": 2240 + }, + { + "epoch": 0.27, + "learning_rate": 4.57309473240307e-06, + "logits/chosen": -2.0037436485290527, + "logits/rejected": -1.5078222751617432, + "logps/chosen": -269.2857360839844, + "logps/rejected": -206.1649627685547, + "loss": 0.1401, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.32349663972854614, + "rewards/margins": 0.14150217175483704, + "rewards/rejected": -0.4649987816810608, + "step": 2250 + }, + { + "epoch": 0.27, + "learning_rate": 4.567224604280157e-06, + "logits/chosen": -1.7673028707504272, + "logits/rejected": -1.6784632205963135, + "logps/chosen": -184.74407958984375, + "logps/rejected": -243.2403106689453, + "loss": 0.1252, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.33205723762512207, + "rewards/margins": 0.1660684049129486, + "rewards/rejected": -0.49812570214271545, + "step": 2260 + }, + { + "epoch": 0.27, + "learning_rate": 4.561318214433499e-06, + "logits/chosen": -1.9934532642364502, + "logits/rejected": -1.8898242712020874, + "logps/chosen": -221.0784912109375, + "logps/rejected": -242.1062469482422, + "loss": 0.1536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.31267839670181274, + "rewards/margins": 0.1026720255613327, + "rewards/rejected": -0.41535043716430664, + "step": 2270 + }, + { + "epoch": 0.27, + "learning_rate": 4.555375666468613e-06, + "logits/chosen": -1.9682775735855103, + "logits/rejected": -1.6195480823516846, + "logps/chosen": -280.294677734375, + "logps/rejected": -256.6422119140625, + "loss": 0.198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.31996774673461914, + "rewards/margins": 0.1175389513373375, + "rewards/rejected": -0.43750667572021484, + "step": 2280 + }, + { + "epoch": 0.27, + "learning_rate": 4.549397064625275e-06, + "logits/chosen": -1.9350669384002686, + "logits/rejected": -1.8133732080459595, + "logps/chosen": -249.39791870117188, + "logps/rejected": -271.91839599609375, + "loss": 0.1115, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44019174575805664, + "rewards/margins": 0.09418530017137527, + "rewards/rejected": -0.5343769788742065, + "step": 2290 + }, + { + "epoch": 0.28, + "learning_rate": 4.543382513775696e-06, + "logits/chosen": -1.925415277481079, + "logits/rejected": -1.594972014427185, + "logps/chosen": -220.84228515625, + "logps/rejected": -211.1090545654297, + "loss": 0.1146, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29334786534309387, + "rewards/margins": 0.1734061986207962, + "rewards/rejected": -0.4667540490627289, + "step": 2300 + }, + { + "epoch": 0.28, + "learning_rate": 4.5373321194226736e-06, + "logits/chosen": -1.9605739116668701, + "logits/rejected": -1.6391212940216064, + "logps/chosen": -259.51397705078125, + "logps/rejected": -275.52398681640625, + "loss": 0.0944, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.35873842239379883, + "rewards/margins": 0.1706543266773224, + "rewards/rejected": -0.5293928384780884, + "step": 2310 + }, + { + "epoch": 0.28, + "learning_rate": 4.531245987697747e-06, + "logits/chosen": -2.164452075958252, + "logits/rejected": -1.8649688959121704, + "logps/chosen": -275.1927185058594, + "logps/rejected": -261.66571044921875, + "loss": 0.1149, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.30722159147262573, + "rewards/margins": 0.11033214628696442, + "rewards/rejected": -0.4175536632537842, + "step": 2320 + }, + { + "epoch": 0.28, + "learning_rate": 4.525124225359332e-06, + "logits/chosen": -2.0339162349700928, + "logits/rejected": -1.7141647338867188, + "logps/chosen": -264.71923828125, + "logps/rejected": -232.3660125732422, + "loss": 0.1714, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36529964208602905, + "rewards/margins": 0.15927986800670624, + "rewards/rejected": -0.5245795249938965, + "step": 2330 + }, + { + "epoch": 0.28, + "learning_rate": 4.518966939790854e-06, + "logits/chosen": -2.047182083129883, + "logits/rejected": -1.7538772821426392, + "logps/chosen": -294.2006530761719, + "logps/rejected": -253.27822875976562, + "loss": 0.2142, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3689686954021454, + "rewards/margins": 0.10386856645345688, + "rewards/rejected": -0.47283726930618286, + "step": 2340 + }, + { + "epoch": 0.28, + "learning_rate": 4.512774238998858e-06, + "logits/chosen": -1.9125845432281494, + "logits/rejected": -1.6135631799697876, + "logps/chosen": -214.25314331054688, + "logps/rejected": -214.7703399658203, + "loss": 0.1415, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4276903569698334, + "rewards/margins": 0.14196541905403137, + "rewards/rejected": -0.56965571641922, + "step": 2350 + }, + { + "epoch": 0.28, + "learning_rate": 4.506546231611116e-06, + "logits/chosen": -1.8613827228546143, + "logits/rejected": -1.7953588962554932, + "logps/chosen": -261.28729248046875, + "logps/rejected": -297.7659606933594, + "loss": 0.0916, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47510260343551636, + "rewards/margins": 0.18565957248210907, + "rewards/rejected": -0.6607621908187866, + "step": 2360 + }, + { + "epoch": 0.28, + "learning_rate": 4.500283026874724e-06, + "logits/chosen": -2.1421940326690674, + "logits/rejected": -1.7659165859222412, + "logps/chosen": -278.3591613769531, + "logps/rejected": -258.3917236328125, + "loss": 0.1477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.37536367774009705, + "rewards/margins": 0.14370563626289368, + "rewards/rejected": -0.5190693140029907, + "step": 2370 + }, + { + "epoch": 0.29, + "learning_rate": 4.493984734654184e-06, + "logits/chosen": -2.0281529426574707, + "logits/rejected": -1.8278974294662476, + "logps/chosen": -231.8653106689453, + "logps/rejected": -220.0163116455078, + "loss": 0.1186, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3119848370552063, + "rewards/margins": 0.14551366865634918, + "rewards/rejected": -0.4574984908103943, + "step": 2380 + }, + { + "epoch": 0.29, + "learning_rate": 4.487651465429475e-06, + "logits/chosen": -2.177546501159668, + "logits/rejected": -1.955910086631775, + "logps/chosen": -248.14846801757812, + "logps/rejected": -242.93801879882812, + "loss": 0.2459, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27779582142829895, + "rewards/margins": 0.20447003841400146, + "rewards/rejected": -0.4822658598423004, + "step": 2390 + }, + { + "epoch": 0.29, + "learning_rate": 4.481283330294118e-06, + "logits/chosen": -1.8666213750839233, + "logits/rejected": -1.3665393590927124, + "logps/chosen": -222.7278594970703, + "logps/rejected": -199.8502655029297, + "loss": 0.1838, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.35523027181625366, + "rewards/margins": 0.17108853161334991, + "rewards/rejected": -0.5263187885284424, + "step": 2400 + }, + { + "epoch": 0.29, + "learning_rate": 4.474880440953224e-06, + "logits/chosen": -1.9999549388885498, + "logits/rejected": -1.8370367288589478, + "logps/chosen": -192.89273071289062, + "logps/rejected": -223.0652618408203, + "loss": 0.076, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2936464548110962, + "rewards/margins": 0.11866404116153717, + "rewards/rejected": -0.41231051087379456, + "step": 2410 + }, + { + "epoch": 0.29, + "learning_rate": 4.468442909721541e-06, + "logits/chosen": -1.9979664087295532, + "logits/rejected": -1.8018696308135986, + "logps/chosen": -215.09585571289062, + "logps/rejected": -227.23757934570312, + "loss": 0.1393, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29200148582458496, + "rewards/margins": 0.10804203897714615, + "rewards/rejected": -0.4000435471534729, + "step": 2420 + }, + { + "epoch": 0.29, + "learning_rate": 4.4619708495214735e-06, + "logits/chosen": -2.1377148628234863, + "logits/rejected": -1.6982520818710327, + "logps/chosen": -303.42315673828125, + "logps/rejected": -226.8529815673828, + "loss": 0.1636, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2613959312438965, + "rewards/margins": 0.09153415262699127, + "rewards/rejected": -0.35293012857437134, + "step": 2430 + }, + { + "epoch": 0.29, + "learning_rate": 4.455464373881112e-06, + "logits/chosen": -1.9143394231796265, + "logits/rejected": -1.7412408590316772, + "logps/chosen": -237.32070922851562, + "logps/rejected": -219.79324340820312, + "loss": 0.1836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26246917247772217, + "rewards/margins": 0.10806657373905182, + "rewards/rejected": -0.3705357313156128, + "step": 2440 + }, + { + "epoch": 0.29, + "learning_rate": 4.4489235969322355e-06, + "logits/chosen": -2.121340751647949, + "logits/rejected": -1.8871597051620483, + "logps/chosen": -190.05088806152344, + "logps/rejected": -195.73678588867188, + "loss": 0.1619, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3710538446903229, + "rewards/margins": 0.08781547844409943, + "rewards/rejected": -0.4588693082332611, + "step": 2450 + }, + { + "epoch": 0.3, + "learning_rate": 4.442348633408312e-06, + "logits/chosen": -1.9419047832489014, + "logits/rejected": -1.5559477806091309, + "logps/chosen": -200.26173400878906, + "logps/rejected": -201.53518676757812, + "loss": 0.2091, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4545482099056244, + "rewards/margins": 0.18632353842258453, + "rewards/rejected": -0.6408717632293701, + "step": 2460 + }, + { + "epoch": 0.3, + "learning_rate": 4.435739598642484e-06, + "logits/chosen": -2.016679525375366, + "logits/rejected": -1.7562650442123413, + "logps/chosen": -288.39404296875, + "logps/rejected": -258.4088439941406, + "loss": 0.0716, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3196231424808502, + "rewards/margins": 0.10171397030353546, + "rewards/rejected": -0.4213371276855469, + "step": 2470 + }, + { + "epoch": 0.3, + "learning_rate": 4.429096608565547e-06, + "logits/chosen": -1.806492805480957, + "logits/rejected": -1.3850048780441284, + "logps/chosen": -249.5767364501953, + "logps/rejected": -222.6543426513672, + "loss": 0.1036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3670658767223358, + "rewards/margins": 0.16034328937530518, + "rewards/rejected": -0.5274091958999634, + "step": 2480 + }, + { + "epoch": 0.3, + "learning_rate": 4.422419779703916e-06, + "logits/chosen": -2.2245254516601562, + "logits/rejected": -1.865247130393982, + "logps/chosen": -208.52560424804688, + "logps/rejected": -190.5004425048828, + "loss": 0.1605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3057347238063812, + "rewards/margins": 0.11808328330516815, + "rewards/rejected": -0.42381802201271057, + "step": 2490 + }, + { + "epoch": 0.3, + "learning_rate": 4.415709229177579e-06, + "logits/chosen": -2.0980026721954346, + "logits/rejected": -1.7750365734100342, + "logps/chosen": -243.53494262695312, + "logps/rejected": -276.8687438964844, + "loss": 0.1665, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.253294974565506, + "rewards/margins": 0.14929592609405518, + "rewards/rejected": -0.40259090065956116, + "step": 2500 + }, + { + "epoch": 0.3, + "learning_rate": 4.408965074698048e-06, + "logits/chosen": -2.0569772720336914, + "logits/rejected": -1.7046623229980469, + "logps/chosen": -227.9197998046875, + "logps/rejected": -218.9676055908203, + "loss": 0.1427, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.26669183373451233, + "rewards/margins": 0.17199033498764038, + "rewards/rejected": -0.4386821687221527, + "step": 2510 + }, + { + "epoch": 0.3, + "learning_rate": 4.402187434566286e-06, + "logits/chosen": -1.852573037147522, + "logits/rejected": -1.9005470275878906, + "logps/chosen": -230.39346313476562, + "logps/rejected": -252.2911376953125, + "loss": 0.1693, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3414091169834137, + "rewards/margins": 0.12495288997888565, + "rewards/rejected": -0.46636199951171875, + "step": 2520 + }, + { + "epoch": 0.3, + "learning_rate": 4.395376427670641e-06, + "logits/chosen": -1.8688786029815674, + "logits/rejected": -1.7985941171646118, + "logps/chosen": -279.96453857421875, + "logps/rejected": -322.30328369140625, + "loss": 0.0915, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4272969365119934, + "rewards/margins": 0.16690507531166077, + "rewards/rejected": -0.5942019820213318, + "step": 2530 + }, + { + "epoch": 0.3, + "learning_rate": 4.388532173484754e-06, + "logits/chosen": -2.0615181922912598, + "logits/rejected": -1.5270709991455078, + "logps/chosen": -265.37591552734375, + "logps/rejected": -240.03640747070312, + "loss": 0.1222, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.28391411900520325, + "rewards/margins": 0.16852129995822906, + "rewards/rejected": -0.4524354040622711, + "step": 2540 + }, + { + "epoch": 0.31, + "learning_rate": 4.381654792065464e-06, + "logits/chosen": -2.1687228679656982, + "logits/rejected": -1.6613953113555908, + "logps/chosen": -320.2466735839844, + "logps/rejected": -226.2439727783203, + "loss": 0.1973, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3444002866744995, + "rewards/margins": 0.19264493882656097, + "rewards/rejected": -0.5370452404022217, + "step": 2550 + }, + { + "epoch": 0.31, + "learning_rate": 4.374744404050706e-06, + "logits/chosen": -2.1842730045318604, + "logits/rejected": -1.622300386428833, + "logps/chosen": -257.1001892089844, + "logps/rejected": -265.92913818359375, + "loss": 0.1575, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.26375168561935425, + "rewards/margins": 0.22742655873298645, + "rewards/rejected": -0.4911782145500183, + "step": 2560 + }, + { + "epoch": 0.31, + "learning_rate": 4.367801130657391e-06, + "logits/chosen": -2.060206890106201, + "logits/rejected": -1.610399603843689, + "logps/chosen": -314.66949462890625, + "logps/rejected": -269.8716125488281, + "loss": 0.1212, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.37169474363327026, + "rewards/margins": 0.1514434516429901, + "rewards/rejected": -0.523138165473938, + "step": 2570 + }, + { + "epoch": 0.31, + "learning_rate": 4.3608250936792816e-06, + "logits/chosen": -2.1835896968841553, + "logits/rejected": -1.7747691869735718, + "logps/chosen": -266.17095947265625, + "logps/rejected": -240.68453979492188, + "loss": 0.1395, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.31101933121681213, + "rewards/margins": 0.16181661188602448, + "rewards/rejected": -0.4728359580039978, + "step": 2580 + }, + { + "epoch": 0.31, + "learning_rate": 4.353816415484853e-06, + "logits/chosen": -2.2123303413391113, + "logits/rejected": -1.7858177423477173, + "logps/chosen": -268.2467956542969, + "logps/rejected": -238.09500122070312, + "loss": 0.1023, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.394400417804718, + "rewards/margins": 0.14999434351921082, + "rewards/rejected": -0.5443947911262512, + "step": 2590 + }, + { + "epoch": 0.31, + "learning_rate": 4.346775219015152e-06, + "logits/chosen": -2.0210156440734863, + "logits/rejected": -1.6547054052352905, + "logps/chosen": -292.4082946777344, + "logps/rejected": -289.0941467285156, + "loss": 0.1894, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4957256317138672, + "rewards/margins": 0.1184227466583252, + "rewards/rejected": -0.6141483783721924, + "step": 2600 + }, + { + "epoch": 0.31, + "learning_rate": 4.339701627781633e-06, + "logits/chosen": -1.906998634338379, + "logits/rejected": -1.7141332626342773, + "logps/chosen": -258.7335205078125, + "logps/rejected": -251.07754516601562, + "loss": 0.0969, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3772934079170227, + "rewards/margins": 0.09553851187229156, + "rewards/rejected": -0.47283196449279785, + "step": 2610 + }, + { + "epoch": 0.31, + "learning_rate": 4.332595765863998e-06, + "logits/chosen": -1.8866933584213257, + "logits/rejected": -1.5196045637130737, + "logps/chosen": -179.44906616210938, + "logps/rejected": -181.3992156982422, + "loss": 0.0869, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.376947820186615, + "rewards/margins": 0.14659300446510315, + "rewards/rejected": -0.5235407948493958, + "step": 2620 + }, + { + "epoch": 0.32, + "learning_rate": 4.325457757908016e-06, + "logits/chosen": -1.9919402599334717, + "logits/rejected": -1.5705921649932861, + "logps/chosen": -260.714111328125, + "logps/rejected": -241.1851043701172, + "loss": 0.1377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39705362915992737, + "rewards/margins": 0.15993081033229828, + "rewards/rejected": -0.5569844841957092, + "step": 2630 + }, + { + "epoch": 0.32, + "learning_rate": 4.3182877291233395e-06, + "logits/chosen": -1.9707670211791992, + "logits/rejected": -1.495273232460022, + "logps/chosen": -213.5478515625, + "logps/rejected": -201.37350463867188, + "loss": 0.1589, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3921014368534088, + "rewards/margins": 0.1725640594959259, + "rewards/rejected": -0.5646654367446899, + "step": 2640 + }, + { + "epoch": 0.32, + "learning_rate": 4.311085805281306e-06, + "logits/chosen": -1.9549287557601929, + "logits/rejected": -1.630378007888794, + "logps/chosen": -322.6145935058594, + "logps/rejected": -297.1085205078125, + "loss": 0.1446, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41820430755615234, + "rewards/margins": 0.1227729320526123, + "rewards/rejected": -0.5409771800041199, + "step": 2650 + }, + { + "epoch": 0.32, + "learning_rate": 4.303852112712731e-06, + "logits/chosen": -2.088381767272949, + "logits/rejected": -1.612980604171753, + "logps/chosen": -291.43255615234375, + "logps/rejected": -240.37240600585938, + "loss": 0.0888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3984852731227875, + "rewards/margins": 0.1815904676914215, + "rewards/rejected": -0.580075740814209, + "step": 2660 + }, + { + "epoch": 0.32, + "learning_rate": 4.2965867783056965e-06, + "logits/chosen": -2.1672182083129883, + "logits/rejected": -1.5168484449386597, + "logps/chosen": -244.7864990234375, + "logps/rejected": -220.2649688720703, + "loss": 0.0939, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.24389946460723877, + "rewards/margins": 0.23934423923492432, + "rewards/rejected": -0.4832437038421631, + "step": 2670 + }, + { + "epoch": 0.32, + "learning_rate": 4.289289929503319e-06, + "logits/chosen": -1.6916240453720093, + "logits/rejected": -1.7404859066009521, + "logps/chosen": -278.12933349609375, + "logps/rejected": -313.3092956542969, + "loss": 0.1081, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38156622648239136, + "rewards/margins": 0.08456975966691971, + "rewards/rejected": -0.4661359190940857, + "step": 2680 + }, + { + "epoch": 0.32, + "learning_rate": 4.28196169430152e-06, + "logits/chosen": -2.0158498287200928, + "logits/rejected": -1.6960630416870117, + "logps/chosen": -214.83908081054688, + "logps/rejected": -203.30886840820312, + "loss": 0.1377, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.23143234848976135, + "rewards/margins": 0.13869646191596985, + "rewards/rejected": -0.3701288104057312, + "step": 2690 + }, + { + "epoch": 0.32, + "learning_rate": 4.274602201246775e-06, + "logits/chosen": -2.104879856109619, + "logits/rejected": -1.873944878578186, + "logps/chosen": -237.60867309570312, + "logps/rejected": -253.7798309326172, + "loss": 0.1325, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32146531343460083, + "rewards/margins": 0.13118143379688263, + "rewards/rejected": -0.45264673233032227, + "step": 2700 + }, + { + "epoch": 0.33, + "learning_rate": 4.267211579433865e-06, + "logits/chosen": -2.124311923980713, + "logits/rejected": -1.6988563537597656, + "logps/chosen": -248.6872100830078, + "logps/rejected": -259.9998779296875, + "loss": 0.123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2223828136920929, + "rewards/margins": 0.22472596168518066, + "rewards/rejected": -0.44710874557495117, + "step": 2710 + }, + { + "epoch": 0.33, + "learning_rate": 4.259789958503606e-06, + "logits/chosen": -1.808075189590454, + "logits/rejected": -1.4258639812469482, + "logps/chosen": -288.0134582519531, + "logps/rejected": -270.99285888671875, + "loss": 0.1181, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48417121171951294, + "rewards/margins": 0.1431351900100708, + "rewards/rejected": -0.627306342124939, + "step": 2720 + }, + { + "epoch": 0.33, + "learning_rate": 4.252337468640578e-06, + "logits/chosen": -1.8779484033584595, + "logits/rejected": -1.4368770122528076, + "logps/chosen": -182.4998321533203, + "logps/rejected": -176.48782348632812, + "loss": 0.1357, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3015367388725281, + "rewards/margins": 0.1921067237854004, + "rewards/rejected": -0.49364346265792847, + "step": 2730 + }, + { + "epoch": 0.33, + "learning_rate": 4.244854240570844e-06, + "logits/chosen": -1.8997386693954468, + "logits/rejected": -1.638164758682251, + "logps/chosen": -261.68792724609375, + "logps/rejected": -275.615478515625, + "loss": 0.1552, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.38944321870803833, + "rewards/margins": 0.14735980331897736, + "rewards/rejected": -0.536803126335144, + "step": 2740 + }, + { + "epoch": 0.33, + "learning_rate": 4.237340405559648e-06, + "logits/chosen": -2.111983060836792, + "logits/rejected": -1.8002418279647827, + "logps/chosen": -276.4152526855469, + "logps/rejected": -253.03970336914062, + "loss": 0.1451, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43595629930496216, + "rewards/margins": 0.1535700559616089, + "rewards/rejected": -0.589526355266571, + "step": 2750 + }, + { + "epoch": 0.33, + "learning_rate": 4.229796095409124e-06, + "logits/chosen": -1.9869279861450195, + "logits/rejected": -1.6609262228012085, + "logps/chosen": -243.32666015625, + "logps/rejected": -212.46484375, + "loss": 0.1604, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41807931661605835, + "rewards/margins": 0.15857204794883728, + "rewards/rejected": -0.5766514539718628, + "step": 2760 + }, + { + "epoch": 0.33, + "learning_rate": 4.222221442455975e-06, + "logits/chosen": -1.7951889038085938, + "logits/rejected": -1.6626970767974854, + "logps/chosen": -270.48785400390625, + "logps/rejected": -270.6944274902344, + "loss": 0.1321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5235955119132996, + "rewards/margins": 0.13521410524845123, + "rewards/rejected": -0.6588095426559448, + "step": 2770 + }, + { + "epoch": 0.33, + "learning_rate": 4.2146165795691565e-06, + "logits/chosen": -2.0419421195983887, + "logits/rejected": -1.6905419826507568, + "logps/chosen": -274.6120910644531, + "logps/rejected": -232.2598876953125, + "loss": 0.1655, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5508221387863159, + "rewards/margins": 0.0936468318104744, + "rewards/rejected": -0.6444690823554993, + "step": 2780 + }, + { + "epoch": 0.33, + "learning_rate": 4.206981640147543e-06, + "logits/chosen": -1.9829498529434204, + "logits/rejected": -1.5640804767608643, + "logps/chosen": -227.0321044921875, + "logps/rejected": -209.35629272460938, + "loss": 0.1261, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38011568784713745, + "rewards/margins": 0.20434486865997314, + "rewards/rejected": -0.5844606161117554, + "step": 2790 + }, + { + "epoch": 0.34, + "learning_rate": 4.199316758117592e-06, + "logits/chosen": -1.7994956970214844, + "logits/rejected": -1.328776240348816, + "logps/chosen": -219.972900390625, + "logps/rejected": -209.31918334960938, + "loss": 0.1531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39323854446411133, + "rewards/margins": 0.17654483020305634, + "rewards/rejected": -0.5697833895683289, + "step": 2800 + }, + { + "epoch": 0.34, + "learning_rate": 4.191622067930987e-06, + "logits/chosen": -1.9456312656402588, + "logits/rejected": -1.5701932907104492, + "logps/chosen": -287.2892761230469, + "logps/rejected": -293.2528076171875, + "loss": 0.0978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5495396852493286, + "rewards/margins": 0.13945366442203522, + "rewards/rejected": -0.6889933943748474, + "step": 2810 + }, + { + "epoch": 0.34, + "learning_rate": 4.1838977045622884e-06, + "logits/chosen": -2.122058629989624, + "logits/rejected": -1.8984161615371704, + "logps/chosen": -296.1750793457031, + "logps/rejected": -291.369384765625, + "loss": 0.1468, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5357397794723511, + "rewards/margins": 0.10289420932531357, + "rewards/rejected": -0.6386340260505676, + "step": 2820 + }, + { + "epoch": 0.34, + "learning_rate": 4.1761438035065624e-06, + "logits/chosen": -1.9847745895385742, + "logits/rejected": -1.5842864513397217, + "logps/chosen": -265.0511779785156, + "logps/rejected": -265.59912109375, + "loss": 0.171, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4800674319267273, + "rewards/margins": 0.20678965747356415, + "rewards/rejected": -0.6868571639060974, + "step": 2830 + }, + { + "epoch": 0.34, + "learning_rate": 4.168360500777e-06, + "logits/chosen": -1.9825668334960938, + "logits/rejected": -1.8264620304107666, + "logps/chosen": -270.49163818359375, + "logps/rejected": -263.13958740234375, + "loss": 0.1304, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6147781014442444, + "rewards/margins": 0.15809233486652374, + "rewards/rejected": -0.7728704214096069, + "step": 2840 + }, + { + "epoch": 0.34, + "learning_rate": 4.160547932902536e-06, + "logits/chosen": -1.998263955116272, + "logits/rejected": -1.4734325408935547, + "logps/chosen": -305.92901611328125, + "logps/rejected": -268.21624755859375, + "loss": 0.1526, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6971688866615295, + "rewards/margins": 0.1615949124097824, + "rewards/rejected": -0.8587638139724731, + "step": 2850 + }, + { + "epoch": 0.34, + "learning_rate": 4.152706236925453e-06, + "logits/chosen": -1.8893840312957764, + "logits/rejected": -1.5303113460540771, + "logps/chosen": -264.53741455078125, + "logps/rejected": -240.29244995117188, + "loss": 0.1553, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7637086510658264, + "rewards/margins": 0.09846861660480499, + "rewards/rejected": -0.8621772527694702, + "step": 2860 + }, + { + "epoch": 0.34, + "learning_rate": 4.144835550398977e-06, + "logits/chosen": -2.0382955074310303, + "logits/rejected": -1.6921682357788086, + "logps/chosen": -293.18017578125, + "logps/rejected": -257.85516357421875, + "loss": 0.1353, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5966418385505676, + "rewards/margins": 0.15557792782783508, + "rewards/rejected": -0.7522197961807251, + "step": 2870 + }, + { + "epoch": 0.35, + "learning_rate": 4.136936011384864e-06, + "logits/chosen": -1.9725558757781982, + "logits/rejected": -1.6349289417266846, + "logps/chosen": -282.499267578125, + "logps/rejected": -256.969970703125, + "loss": 0.1057, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5478484034538269, + "rewards/margins": 0.13796645402908325, + "rewards/rejected": -0.6858149170875549, + "step": 2880 + }, + { + "epoch": 0.35, + "learning_rate": 4.129007758450982e-06, + "logits/chosen": -1.8872253894805908, + "logits/rejected": -1.4205682277679443, + "logps/chosen": -262.5359802246094, + "logps/rejected": -235.87744140625, + "loss": 0.1245, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6125961542129517, + "rewards/margins": 0.20152945816516876, + "rewards/rejected": -0.8141257166862488, + "step": 2890 + }, + { + "epoch": 0.35, + "learning_rate": 4.121050930668871e-06, + "logits/chosen": -2.047837734222412, + "logits/rejected": -1.848854422569275, + "logps/chosen": -243.146728515625, + "logps/rejected": -233.36685180664062, + "loss": 0.1355, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4853152334690094, + "rewards/margins": 0.10266610234975815, + "rewards/rejected": -0.5879813432693481, + "step": 2900 + }, + { + "epoch": 0.35, + "learning_rate": 4.113065667611319e-06, + "logits/chosen": -2.1213645935058594, + "logits/rejected": -1.6058502197265625, + "logps/chosen": -282.0854187011719, + "logps/rejected": -236.03237915039062, + "loss": 0.133, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5683524012565613, + "rewards/margins": 0.13890430331230164, + "rewards/rejected": -0.7072567939758301, + "step": 2910 + }, + { + "epoch": 0.35, + "learning_rate": 4.105052109349896e-06, + "logits/chosen": -1.961520791053772, + "logits/rejected": -1.6209933757781982, + "logps/chosen": -235.26229858398438, + "logps/rejected": -211.75735473632812, + "loss": 0.1818, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5107260942459106, + "rewards/margins": 0.1078595519065857, + "rewards/rejected": -0.6185856461524963, + "step": 2920 + }, + { + "epoch": 0.35, + "learning_rate": 4.097010396452511e-06, + "logits/chosen": -1.7602100372314453, + "logits/rejected": -1.5986255407333374, + "logps/chosen": -221.6525115966797, + "logps/rejected": -236.3260955810547, + "loss": 0.099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5530390739440918, + "rewards/margins": 0.15462610125541687, + "rewards/rejected": -0.7076650857925415, + "step": 2930 + }, + { + "epoch": 0.35, + "learning_rate": 4.088940669980936e-06, + "logits/chosen": -1.833754301071167, + "logits/rejected": -1.4045370817184448, + "logps/chosen": -229.57437133789062, + "logps/rejected": -231.51205444335938, + "loss": 0.1555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5543310046195984, + "rewards/margins": 0.21897678077220917, + "rewards/rejected": -0.7733078002929688, + "step": 2940 + }, + { + "epoch": 0.35, + "learning_rate": 4.080843071488343e-06, + "logits/chosen": -1.7528629302978516, + "logits/rejected": -1.5004993677139282, + "logps/chosen": -324.9387512207031, + "logps/rejected": -295.79315185546875, + "loss": 0.1262, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6013875007629395, + "rewards/margins": 0.061654604971408844, + "rewards/rejected": -0.6630421876907349, + "step": 2950 + }, + { + "epoch": 0.36, + "learning_rate": 4.072717743016807e-06, + "logits/chosen": -1.9874687194824219, + "logits/rejected": -1.7957178354263306, + "logps/chosen": -243.137939453125, + "logps/rejected": -279.1502685546875, + "loss": 0.1195, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.45743948221206665, + "rewards/margins": 0.1679219901561737, + "rewards/rejected": -0.6253615021705627, + "step": 2960 + }, + { + "epoch": 0.36, + "learning_rate": 4.064564827094827e-06, + "logits/chosen": -2.1176095008850098, + "logits/rejected": -1.8404308557510376, + "logps/chosen": -246.0503692626953, + "logps/rejected": -247.1431121826172, + "loss": 0.1031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41762226819992065, + "rewards/margins": 0.1749526411294937, + "rewards/rejected": -0.592574954032898, + "step": 2970 + }, + { + "epoch": 0.36, + "learning_rate": 4.056384466734819e-06, + "logits/chosen": -1.7445135116577148, + "logits/rejected": -1.2714130878448486, + "logps/chosen": -256.58575439453125, + "logps/rejected": -227.7979736328125, + "loss": 0.1514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5112208127975464, + "rewards/margins": 0.19044998288154602, + "rewards/rejected": -0.70167076587677, + "step": 2980 + }, + { + "epoch": 0.36, + "learning_rate": 4.048176805430608e-06, + "logits/chosen": -1.8863022327423096, + "logits/rejected": -1.7394742965698242, + "logps/chosen": -262.2151794433594, + "logps/rejected": -257.8638000488281, + "loss": 0.1186, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4821039140224457, + "rewards/margins": 0.17492111027240753, + "rewards/rejected": -0.6570249795913696, + "step": 2990 + }, + { + "epoch": 0.36, + "learning_rate": 4.039941987154913e-06, + "logits/chosen": -2.089552879333496, + "logits/rejected": -1.5143569707870483, + "logps/chosen": -255.2841339111328, + "logps/rejected": -214.74667358398438, + "loss": 0.1349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41421470046043396, + "rewards/margins": 0.21286948025226593, + "rewards/rejected": -0.6270841360092163, + "step": 3000 + }, + { + "epoch": 0.36, + "learning_rate": 4.031680156356822e-06, + "logits/chosen": -2.152740478515625, + "logits/rejected": -1.648754358291626, + "logps/chosen": -298.00860595703125, + "logps/rejected": -279.27215576171875, + "loss": 0.088, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.45244675874710083, + "rewards/margins": 0.17495819926261902, + "rewards/rejected": -0.6274049282073975, + "step": 3010 + }, + { + "epoch": 0.36, + "learning_rate": 4.023391457959253e-06, + "logits/chosen": -1.9636989831924438, + "logits/rejected": -1.5016404390335083, + "logps/chosen": -223.6481475830078, + "logps/rejected": -208.9552001953125, + "loss": 0.1553, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3230467140674591, + "rewards/margins": 0.15630824863910675, + "rewards/rejected": -0.47935494780540466, + "step": 3020 + }, + { + "epoch": 0.36, + "learning_rate": 4.015076037356419e-06, + "logits/chosen": -1.778830885887146, + "logits/rejected": -1.504024624824524, + "logps/chosen": -261.44805908203125, + "logps/rejected": -237.22036743164062, + "loss": 0.2152, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4342494606971741, + "rewards/margins": 0.06908075511455536, + "rewards/rejected": -0.5033301711082458, + "step": 3030 + }, + { + "epoch": 0.36, + "learning_rate": 4.006734040411272e-06, + "logits/chosen": -1.8755178451538086, + "logits/rejected": -1.488073706626892, + "logps/chosen": -233.17788696289062, + "logps/rejected": -202.04881286621094, + "loss": 0.1823, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4887026846408844, + "rewards/margins": 0.13193130493164062, + "rewards/rejected": -0.6206339597702026, + "step": 3040 + }, + { + "epoch": 0.37, + "learning_rate": 3.998365613452947e-06, + "logits/chosen": -1.744222640991211, + "logits/rejected": -1.7371858358383179, + "logps/chosen": -213.4022979736328, + "logps/rejected": -271.8200378417969, + "loss": 0.1179, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.43451786041259766, + "rewards/margins": 0.1312752515077591, + "rewards/rejected": -0.5657930970191956, + "step": 3050 + }, + { + "epoch": 0.37, + "learning_rate": 3.9899709032741955e-06, + "logits/chosen": -2.135042190551758, + "logits/rejected": -1.7216142416000366, + "logps/chosen": -226.56991577148438, + "logps/rejected": -227.9345245361328, + "loss": 0.1873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.442862331867218, + "rewards/margins": 0.20581674575805664, + "rewards/rejected": -0.6486790776252747, + "step": 3060 + }, + { + "epoch": 0.37, + "learning_rate": 3.981550057128809e-06, + "logits/chosen": -2.0724985599517822, + "logits/rejected": -1.5731353759765625, + "logps/chosen": -249.2626953125, + "logps/rejected": -205.86062622070312, + "loss": 0.0977, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3592108488082886, + "rewards/margins": 0.1588134914636612, + "rewards/rejected": -0.5180243253707886, + "step": 3070 + }, + { + "epoch": 0.37, + "learning_rate": 3.973103222729037e-06, + "logits/chosen": -1.9891624450683594, + "logits/rejected": -1.8182249069213867, + "logps/chosen": -238.1395263671875, + "logps/rejected": -248.2894744873047, + "loss": 0.1503, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3745599687099457, + "rewards/margins": 0.1474495679140091, + "rewards/rejected": -0.522009551525116, + "step": 3080 + }, + { + "epoch": 0.37, + "learning_rate": 3.964630548242997e-06, + "logits/chosen": -1.7449464797973633, + "logits/rejected": -1.3936296701431274, + "logps/chosen": -234.7018585205078, + "logps/rejected": -203.70974731445312, + "loss": 0.1525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3542986512184143, + "rewards/margins": 0.15336424112319946, + "rewards/rejected": -0.5076628923416138, + "step": 3090 + }, + { + "epoch": 0.37, + "learning_rate": 3.956132182292071e-06, + "logits/chosen": -1.9436609745025635, + "logits/rejected": -1.6176378726959229, + "logps/chosen": -306.6236572265625, + "logps/rejected": -285.08624267578125, + "loss": 0.1063, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48512548208236694, + "rewards/margins": 0.1552090346813202, + "rewards/rejected": -0.6403344869613647, + "step": 3100 + }, + { + "epoch": 0.37, + "learning_rate": 3.947608273948305e-06, + "logits/chosen": -1.9343887567520142, + "logits/rejected": -1.7104957103729248, + "logps/chosen": -197.42628479003906, + "logps/rejected": -188.55636596679688, + "loss": 0.1288, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4073019027709961, + "rewards/margins": 0.13703104853630066, + "rewards/rejected": -0.5443329811096191, + "step": 3110 + }, + { + "epoch": 0.37, + "learning_rate": 3.939058972731788e-06, + "logits/chosen": -2.057648181915283, + "logits/rejected": -1.7952289581298828, + "logps/chosen": -184.43569946289062, + "logps/rejected": -189.1503143310547, + "loss": 0.156, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3685847818851471, + "rewards/margins": 0.1822405755519867, + "rewards/rejected": -0.5508254170417786, + "step": 3120 + }, + { + "epoch": 0.38, + "learning_rate": 3.9304844286080356e-06, + "logits/chosen": -1.9299640655517578, + "logits/rejected": -1.5476219654083252, + "logps/chosen": -265.6641540527344, + "logps/rejected": -238.1685791015625, + "loss": 0.0987, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4248623847961426, + "rewards/margins": 0.14193350076675415, + "rewards/rejected": -0.566795825958252, + "step": 3130 + }, + { + "epoch": 0.38, + "learning_rate": 3.921884791985351e-06, + "logits/chosen": -2.0945184230804443, + "logits/rejected": -1.710710883140564, + "logps/chosen": -289.3420715332031, + "logps/rejected": -286.7973327636719, + "loss": 0.1331, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41171398758888245, + "rewards/margins": 0.18646354973316193, + "rewards/rejected": -0.5981774926185608, + "step": 3140 + }, + { + "epoch": 0.38, + "learning_rate": 3.913260213712195e-06, + "logits/chosen": -2.005298614501953, + "logits/rejected": -1.6120986938476562, + "logps/chosen": -271.31695556640625, + "logps/rejected": -271.58599853515625, + "loss": 0.1618, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.39514026045799255, + "rewards/margins": 0.18233473598957062, + "rewards/rejected": -0.5774749517440796, + "step": 3150 + }, + { + "epoch": 0.38, + "learning_rate": 3.9046108450745365e-06, + "logits/chosen": -1.9153554439544678, + "logits/rejected": -1.6038618087768555, + "logps/chosen": -244.7465057373047, + "logps/rejected": -224.35009765625, + "loss": 0.1676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4914848804473877, + "rewards/margins": 0.13527749478816986, + "rewards/rejected": -0.626762330532074, + "step": 3160 + }, + { + "epoch": 0.38, + "learning_rate": 3.895936837793195e-06, + "logits/chosen": -2.1196136474609375, + "logits/rejected": -1.9197361469268799, + "logps/chosen": -272.1467590332031, + "logps/rejected": -282.51373291015625, + "loss": 0.1016, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3920500576496124, + "rewards/margins": 0.1278650313615799, + "rewards/rejected": -0.5199151039123535, + "step": 3170 + }, + { + "epoch": 0.38, + "learning_rate": 3.887238344021187e-06, + "logits/chosen": -1.9512029886245728, + "logits/rejected": -1.5371229648590088, + "logps/chosen": -229.00180053710938, + "logps/rejected": -227.99154663085938, + "loss": 0.1223, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4040389060974121, + "rewards/margins": 0.25474390387535095, + "rewards/rejected": -0.6587827801704407, + "step": 3180 + }, + { + "epoch": 0.38, + "learning_rate": 3.878515516341051e-06, + "logits/chosen": -1.8965469598770142, + "logits/rejected": -1.5892069339752197, + "logps/chosen": -312.53717041015625, + "logps/rejected": -321.0782165527344, + "loss": 0.1367, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38909637928009033, + "rewards/margins": 0.18783049285411835, + "rewards/rejected": -0.5769269466400146, + "step": 3190 + }, + { + "epoch": 0.38, + "learning_rate": 3.869768507762174e-06, + "logits/chosen": -1.8793041706085205, + "logits/rejected": -1.5246042013168335, + "logps/chosen": -206.9381866455078, + "logps/rejected": -187.6604461669922, + "loss": 0.1119, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5590367913246155, + "rewards/margins": 0.09890522062778473, + "rewards/rejected": -0.6579420566558838, + "step": 3200 + }, + { + "epoch": 0.39, + "learning_rate": 3.860997471718103e-06, + "logits/chosen": -2.1226069927215576, + "logits/rejected": -1.4512499570846558, + "logps/chosen": -252.26736450195312, + "logps/rejected": -193.675048828125, + "loss": 0.1548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3612119257450104, + "rewards/margins": 0.20598213374614716, + "rewards/rejected": -0.5671939849853516, + "step": 3210 + }, + { + "epoch": 0.39, + "learning_rate": 3.852202562063861e-06, + "logits/chosen": -1.9937137365341187, + "logits/rejected": -1.6922187805175781, + "logps/chosen": -273.3023376464844, + "logps/rejected": -247.2145233154297, + "loss": 0.1343, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48126834630966187, + "rewards/margins": 0.08357418328523636, + "rewards/rejected": -0.5648424625396729, + "step": 3220 + }, + { + "epoch": 0.39, + "learning_rate": 3.843383933073243e-06, + "logits/chosen": -1.9415899515151978, + "logits/rejected": -1.546696424484253, + "logps/chosen": -264.92291259765625, + "logps/rejected": -254.01150512695312, + "loss": 0.1317, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4355335235595703, + "rewards/margins": 0.15428543090820312, + "rewards/rejected": -0.5898188948631287, + "step": 3230 + }, + { + "epoch": 0.39, + "learning_rate": 3.834541739436111e-06, + "logits/chosen": -2.0209739208221436, + "logits/rejected": -1.7102893590927124, + "logps/chosen": -217.0830078125, + "logps/rejected": -216.00875854492188, + "loss": 0.1922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3377262353897095, + "rewards/margins": 0.18450435996055603, + "rewards/rejected": -0.5222306251525879, + "step": 3240 + }, + { + "epoch": 0.39, + "learning_rate": 3.82567613625568e-06, + "logits/chosen": -2.1100401878356934, + "logits/rejected": -2.0386178493499756, + "logps/chosen": -306.51043701171875, + "logps/rejected": -312.2314758300781, + "loss": 0.087, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.28856438398361206, + "rewards/margins": 0.09022587537765503, + "rewards/rejected": -0.3787902593612671, + "step": 3250 + }, + { + "epoch": 0.39, + "learning_rate": 3.816787279045796e-06, + "logits/chosen": -1.8298437595367432, + "logits/rejected": -1.4992341995239258, + "logps/chosen": -182.92562866210938, + "logps/rejected": -192.92718505859375, + "loss": 0.1166, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2565813660621643, + "rewards/margins": 0.21510104835033417, + "rewards/rejected": -0.47168245911598206, + "step": 3260 + }, + { + "epoch": 0.39, + "learning_rate": 3.807875323728216e-06, + "logits/chosen": -2.188213586807251, + "logits/rejected": -1.716449499130249, + "logps/chosen": -218.6008758544922, + "logps/rejected": -216.4799041748047, + "loss": 0.1432, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24077515304088593, + "rewards/margins": 0.20533184707164764, + "rewards/rejected": -0.44610700011253357, + "step": 3270 + }, + { + "epoch": 0.39, + "learning_rate": 3.7989404266298614e-06, + "logits/chosen": -1.775099515914917, + "logits/rejected": -1.7529404163360596, + "logps/chosen": -209.0091094970703, + "logps/rejected": -222.7781982421875, + "loss": 0.1195, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41656428575515747, + "rewards/margins": 0.1831093728542328, + "rewards/rejected": -0.5996736288070679, + "step": 3280 + }, + { + "epoch": 0.39, + "learning_rate": 3.7899827444800824e-06, + "logits/chosen": -1.975610375404358, + "logits/rejected": -1.701148271560669, + "logps/chosen": -320.1239929199219, + "logps/rejected": -338.82916259765625, + "loss": 0.1189, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5537024736404419, + "rewards/margins": 0.1302124708890915, + "rewards/rejected": -0.6839149594306946, + "step": 3290 + }, + { + "epoch": 0.4, + "learning_rate": 3.7810024344079087e-06, + "logits/chosen": -1.9031383991241455, + "logits/rejected": -1.6330820322036743, + "logps/chosen": -293.07061767578125, + "logps/rejected": -310.0358581542969, + "loss": 0.135, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7108369469642639, + "rewards/margins": 0.24567703902721405, + "rewards/rejected": -0.9565140008926392, + "step": 3300 + }, + { + "epoch": 0.4, + "learning_rate": 3.7719996539392934e-06, + "logits/chosen": -1.9635547399520874, + "logits/rejected": -1.790226697921753, + "logps/chosen": -293.0434875488281, + "logps/rejected": -275.4864501953125, + "loss": 0.1856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7340956926345825, + "rewards/margins": 0.12796764075756073, + "rewards/rejected": -0.8620632886886597, + "step": 3310 + }, + { + "epoch": 0.4, + "learning_rate": 3.7629745609943454e-06, + "logits/chosen": -1.8187742233276367, + "logits/rejected": -1.5776069164276123, + "logps/chosen": -245.385498046875, + "logps/rejected": -267.95513916015625, + "loss": 0.2146, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7081323266029358, + "rewards/margins": 0.12336601316928864, + "rewards/rejected": -0.8314983248710632, + "step": 3320 + }, + { + "epoch": 0.4, + "learning_rate": 3.7539273138845646e-06, + "logits/chosen": -1.7952165603637695, + "logits/rejected": -1.5672744512557983, + "logps/chosen": -287.3976135253906, + "logps/rejected": -305.989013671875, + "loss": 0.121, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.774767279624939, + "rewards/margins": 0.18739402294158936, + "rewards/rejected": -0.9621612429618835, + "step": 3330 + }, + { + "epoch": 0.4, + "learning_rate": 3.744858071310063e-06, + "logits/chosen": -1.732142686843872, + "logits/rejected": -1.4198137521743774, + "logps/chosen": -244.3140869140625, + "logps/rejected": -244.147705078125, + "loss": 0.1903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7413711547851562, + "rewards/margins": 0.11797042936086655, + "rewards/rejected": -0.8593416213989258, + "step": 3340 + }, + { + "epoch": 0.4, + "learning_rate": 3.7357669923567796e-06, + "logits/chosen": -2.07377290725708, + "logits/rejected": -1.57345449924469, + "logps/chosen": -321.8138427734375, + "logps/rejected": -300.8492736816406, + "loss": 0.1081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.709524929523468, + "rewards/margins": 0.2038944661617279, + "rewards/rejected": -0.9134193658828735, + "step": 3350 + }, + { + "epoch": 0.4, + "learning_rate": 3.726654236493693e-06, + "logits/chosen": -1.7073522806167603, + "logits/rejected": -1.2896873950958252, + "logps/chosen": -241.73583984375, + "logps/rejected": -225.9768524169922, + "loss": 0.1198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7038867473602295, + "rewards/margins": 0.18675477802753448, + "rewards/rejected": -0.8906415700912476, + "step": 3360 + }, + { + "epoch": 0.4, + "learning_rate": 3.71751996357002e-06, + "logits/chosen": -1.9721448421478271, + "logits/rejected": -1.5171587467193604, + "logps/chosen": -273.17547607421875, + "logps/rejected": -271.9955749511719, + "loss": 0.1155, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6923592686653137, + "rewards/margins": 0.09898529201745987, + "rewards/rejected": -0.7913444638252258, + "step": 3370 + }, + { + "epoch": 0.41, + "learning_rate": 3.7083643338124148e-06, + "logits/chosen": -1.9171966314315796, + "logits/rejected": -1.3887364864349365, + "logps/chosen": -230.0151824951172, + "logps/rejected": -235.19412231445312, + "loss": 0.1357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6308478116989136, + "rewards/margins": 0.2189496010541916, + "rewards/rejected": -0.8497973680496216, + "step": 3380 + }, + { + "epoch": 0.41, + "learning_rate": 3.6991875078221566e-06, + "logits/chosen": -1.9821815490722656, + "logits/rejected": -1.4464524984359741, + "logps/chosen": -298.43017578125, + "logps/rejected": -268.8391418457031, + "loss": 0.1016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6277667284011841, + "rewards/margins": 0.22099390625953674, + "rewards/rejected": -0.8487606048583984, + "step": 3390 + }, + { + "epoch": 0.41, + "learning_rate": 3.6899896465723352e-06, + "logits/chosen": -1.9020191431045532, + "logits/rejected": -1.55315101146698, + "logps/chosen": -225.8837890625, + "logps/rejected": -174.9435272216797, + "loss": 0.1186, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.46708327531814575, + "rewards/margins": 0.11991620063781738, + "rewards/rejected": -0.5869995355606079, + "step": 3400 + }, + { + "epoch": 0.41, + "learning_rate": 3.6807709114050224e-06, + "logits/chosen": -1.8173465728759766, + "logits/rejected": -1.6795600652694702, + "logps/chosen": -272.4085388183594, + "logps/rejected": -300.34197998046875, + "loss": 0.1344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6119049787521362, + "rewards/margins": 0.07957009226083755, + "rewards/rejected": -0.6914750337600708, + "step": 3410 + }, + { + "epoch": 0.41, + "learning_rate": 3.6715314640284465e-06, + "logits/chosen": -1.95268976688385, + "logits/rejected": -1.4205843210220337, + "logps/chosen": -284.33819580078125, + "logps/rejected": -281.9459533691406, + "loss": 0.1827, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6660246253013611, + "rewards/margins": 0.1357964128255844, + "rewards/rejected": -0.8018211126327515, + "step": 3420 + }, + { + "epoch": 0.41, + "learning_rate": 3.6622714665141555e-06, + "logits/chosen": -1.760504961013794, + "logits/rejected": -1.5992462635040283, + "logps/chosen": -255.13363647460938, + "logps/rejected": -276.85931396484375, + "loss": 0.1626, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6074542999267578, + "rewards/margins": 0.1671931892633438, + "rewards/rejected": -0.7746474742889404, + "step": 3430 + }, + { + "epoch": 0.41, + "learning_rate": 3.6529910812941688e-06, + "logits/chosen": -1.9736906290054321, + "logits/rejected": -1.5405575037002563, + "logps/chosen": -306.52337646484375, + "logps/rejected": -294.29608154296875, + "loss": 0.0816, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6215208768844604, + "rewards/margins": 0.19987761974334717, + "rewards/rejected": -0.8213985562324524, + "step": 3440 + }, + { + "epoch": 0.41, + "learning_rate": 3.6436904711581358e-06, + "logits/chosen": -1.7905645370483398, + "logits/rejected": -1.4196887016296387, + "logps/chosen": -263.2611389160156, + "logps/rejected": -266.86700439453125, + "loss": 0.108, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.576808750629425, + "rewards/margins": 0.20972958207130432, + "rewards/rejected": -0.7865381240844727, + "step": 3450 + }, + { + "epoch": 0.42, + "learning_rate": 3.6343697992504745e-06, + "logits/chosen": -1.8011541366577148, + "logits/rejected": -1.528407096862793, + "logps/chosen": -260.6270446777344, + "logps/rejected": -244.4331817626953, + "loss": 0.1388, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6323608160018921, + "rewards/margins": 0.14937646687030792, + "rewards/rejected": -0.7817373275756836, + "step": 3460 + }, + { + "epoch": 0.42, + "learning_rate": 3.6250292290675103e-06, + "logits/chosen": -1.8209354877471924, + "logits/rejected": -1.6716169118881226, + "logps/chosen": -279.5582275390625, + "logps/rejected": -250.3738250732422, + "loss": 0.2005, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.6096035242080688, + "rewards/margins": 0.018474172800779343, + "rewards/rejected": -0.6280776858329773, + "step": 3470 + }, + { + "epoch": 0.42, + "learning_rate": 3.6156689244546135e-06, + "logits/chosen": -1.9151493310928345, + "logits/rejected": -1.635745644569397, + "logps/chosen": -304.9268493652344, + "logps/rejected": -309.64410400390625, + "loss": 0.1445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5671188831329346, + "rewards/margins": 0.12486696243286133, + "rewards/rejected": -0.6919858455657959, + "step": 3480 + }, + { + "epoch": 0.42, + "learning_rate": 3.606289049603317e-06, + "logits/chosen": -1.9070123434066772, + "logits/rejected": -1.632115125656128, + "logps/chosen": -217.40640258789062, + "logps/rejected": -266.7468566894531, + "loss": 0.1873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5615465044975281, + "rewards/margins": 0.10746095329523087, + "rewards/rejected": -0.6690074801445007, + "step": 3490 + }, + { + "epoch": 0.42, + "learning_rate": 3.596889769048442e-06, + "logits/chosen": -1.9706356525421143, + "logits/rejected": -1.8182321786880493, + "logps/chosen": -245.6522674560547, + "logps/rejected": -260.2083740234375, + "loss": 0.1345, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4987486004829407, + "rewards/margins": 0.12167295068502426, + "rewards/rejected": -0.6204215288162231, + "step": 3500 + }, + { + "epoch": 0.42, + "learning_rate": 3.587471247665211e-06, + "logits/chosen": -1.870273232460022, + "logits/rejected": -1.4389684200286865, + "logps/chosen": -282.3717346191406, + "logps/rejected": -282.10955810546875, + "loss": 0.1518, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5567461252212524, + "rewards/margins": 0.15303365886211395, + "rewards/rejected": -0.7097797393798828, + "step": 3510 + }, + { + "epoch": 0.42, + "learning_rate": 3.578033650666354e-06, + "logits/chosen": -1.9102929830551147, + "logits/rejected": -1.6725549697875977, + "logps/chosen": -269.2619323730469, + "logps/rejected": -265.96185302734375, + "loss": 0.1345, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5339959859848022, + "rewards/margins": 0.11029736697673798, + "rewards/rejected": -0.6442933678627014, + "step": 3520 + }, + { + "epoch": 0.42, + "learning_rate": 3.56857714359921e-06, + "logits/chosen": -1.9303925037384033, + "logits/rejected": -1.450552225112915, + "logps/chosen": -314.16229248046875, + "logps/rejected": -267.4084167480469, + "loss": 0.1383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5251078605651855, + "rewards/margins": 0.17124128341674805, + "rewards/rejected": -0.6963491439819336, + "step": 3530 + }, + { + "epoch": 0.42, + "learning_rate": 3.5591018923428273e-06, + "logits/chosen": -1.821260690689087, + "logits/rejected": -1.5743091106414795, + "logps/chosen": -229.2704620361328, + "logps/rejected": -216.47482299804688, + "loss": 0.1376, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.45546379685401917, + "rewards/margins": 0.09873731434345245, + "rewards/rejected": -0.5542011857032776, + "step": 3540 + }, + { + "epoch": 0.43, + "learning_rate": 3.5496080631050494e-06, + "logits/chosen": -1.9756050109863281, + "logits/rejected": -1.7553752660751343, + "logps/chosen": -242.6219940185547, + "logps/rejected": -246.052978515625, + "loss": 0.1605, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.44631749391555786, + "rewards/margins": 0.12928064167499542, + "rewards/rejected": -0.5755981206893921, + "step": 3550 + }, + { + "epoch": 0.43, + "learning_rate": 3.5400958224196e-06, + "logits/chosen": -1.7444331645965576, + "logits/rejected": -1.646104097366333, + "logps/chosen": -219.2833251953125, + "logps/rejected": -236.79434204101562, + "loss": 0.1119, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3898042142391205, + "rewards/margins": 0.09327594935894012, + "rewards/rejected": -0.4830802083015442, + "step": 3560 + }, + { + "epoch": 0.43, + "learning_rate": 3.5305653371431635e-06, + "logits/chosen": -1.8823859691619873, + "logits/rejected": -1.5607668161392212, + "logps/chosen": -255.4270782470703, + "logps/rejected": -250.3773651123047, + "loss": 0.122, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.502920925617218, + "rewards/margins": 0.15608903765678406, + "rewards/rejected": -0.6590099930763245, + "step": 3570 + }, + { + "epoch": 0.43, + "learning_rate": 3.52101677445246e-06, + "logits/chosen": -1.8646084070205688, + "logits/rejected": -1.5643223524093628, + "logps/chosen": -284.89697265625, + "logps/rejected": -273.821044921875, + "loss": 0.112, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5394625067710876, + "rewards/margins": 0.147272527217865, + "rewards/rejected": -0.6867350339889526, + "step": 3580 + }, + { + "epoch": 0.43, + "learning_rate": 3.5114503018413055e-06, + "logits/chosen": -2.060659408569336, + "logits/rejected": -1.689171552658081, + "logps/chosen": -249.76150512695312, + "logps/rejected": -236.51171875, + "loss": 0.1114, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4521172046661377, + "rewards/margins": 0.12867510318756104, + "rewards/rejected": -0.5807923078536987, + "step": 3590 + }, + { + "epoch": 0.43, + "learning_rate": 3.5018660871176815e-06, + "logits/chosen": -2.1247520446777344, + "logits/rejected": -1.6458734273910522, + "logps/chosen": -317.2967224121094, + "logps/rejected": -260.74072265625, + "loss": 0.1181, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4817916750907898, + "rewards/margins": 0.13223214447498322, + "rewards/rejected": -0.6140238046646118, + "step": 3600 + }, + { + "epoch": 0.43, + "learning_rate": 3.4922642984007888e-06, + "logits/chosen": -1.8655788898468018, + "logits/rejected": -1.3581578731536865, + "logps/chosen": -288.52496337890625, + "logps/rejected": -243.9176483154297, + "loss": 0.2081, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5236949324607849, + "rewards/margins": 0.24149084091186523, + "rewards/rejected": -0.7651858329772949, + "step": 3610 + }, + { + "epoch": 0.43, + "learning_rate": 3.4826451041180963e-06, + "logits/chosen": -1.8614925146102905, + "logits/rejected": -1.6801944971084595, + "logps/chosen": -224.9128875732422, + "logps/rejected": -241.3170928955078, + "loss": 0.1374, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49704447388648987, + "rewards/margins": 0.1276617795228958, + "rewards/rejected": -0.6247062683105469, + "step": 3620 + }, + { + "epoch": 0.44, + "learning_rate": 3.4730086730023904e-06, + "logits/chosen": -1.9381475448608398, + "logits/rejected": -1.6607654094696045, + "logps/chosen": -270.886474609375, + "logps/rejected": -252.22116088867188, + "loss": 0.1707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3630528151988983, + "rewards/margins": 0.13112984597682953, + "rewards/rejected": -0.49418267607688904, + "step": 3630 + }, + { + "epoch": 0.44, + "learning_rate": 3.4633551740888122e-06, + "logits/chosen": -2.1135964393615723, + "logits/rejected": -1.4086754322052002, + "logps/chosen": -322.132568359375, + "logps/rejected": -269.0862731933594, + "loss": 0.0716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3996971547603607, + "rewards/margins": 0.2393627166748047, + "rewards/rejected": -0.639059841632843, + "step": 3640 + }, + { + "epoch": 0.44, + "learning_rate": 3.4536847767118926e-06, + "logits/chosen": -1.9193264245986938, + "logits/rejected": -1.5788618326187134, + "logps/chosen": -240.32687377929688, + "logps/rejected": -219.1731414794922, + "loss": 0.1606, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4481803774833679, + "rewards/margins": 0.13924534618854523, + "rewards/rejected": -0.587425708770752, + "step": 3650 + }, + { + "epoch": 0.44, + "learning_rate": 3.443997650502586e-06, + "logits/chosen": -1.7943519353866577, + "logits/rejected": -1.4917861223220825, + "logps/chosen": -238.60647583007812, + "logps/rejected": -199.71517944335938, + "loss": 0.1366, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5198334455490112, + "rewards/margins": 0.13573278486728668, + "rewards/rejected": -0.6555660963058472, + "step": 3660 + }, + { + "epoch": 0.44, + "learning_rate": 3.434293965385287e-06, + "logits/chosen": -1.852033257484436, + "logits/rejected": -1.650202989578247, + "logps/chosen": -262.7406311035156, + "logps/rejected": -232.7073211669922, + "loss": 0.1364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40459275245666504, + "rewards/margins": 0.13505297899246216, + "rewards/rejected": -0.5396457314491272, + "step": 3670 + }, + { + "epoch": 0.44, + "learning_rate": 3.4245738915748584e-06, + "logits/chosen": -2.122192621231079, + "logits/rejected": -1.8862508535385132, + "logps/chosen": -264.1258544921875, + "logps/rejected": -277.42730712890625, + "loss": 0.1551, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42287102341651917, + "rewards/margins": 0.10563336312770844, + "rewards/rejected": -0.5285043716430664, + "step": 3680 + }, + { + "epoch": 0.44, + "learning_rate": 3.4148375995736395e-06, + "logits/chosen": -1.9229469299316406, + "logits/rejected": -1.524235486984253, + "logps/chosen": -297.52252197265625, + "logps/rejected": -271.72381591796875, + "loss": 0.1194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6332093477249146, + "rewards/margins": 0.17999136447906494, + "rewards/rejected": -0.8132007718086243, + "step": 3690 + }, + { + "epoch": 0.44, + "learning_rate": 3.4050852601684563e-06, + "logits/chosen": -1.7078931331634521, + "logits/rejected": -1.2806559801101685, + "logps/chosen": -242.444091796875, + "logps/rejected": -239.5855712890625, + "loss": 0.1546, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5890045762062073, + "rewards/margins": 0.20271596312522888, + "rewards/rejected": -0.7917205095291138, + "step": 3700 + }, + { + "epoch": 0.45, + "learning_rate": 3.3953170444276283e-06, + "logits/chosen": -2.0124032497406006, + "logits/rejected": -1.6335124969482422, + "logps/chosen": -294.7106628417969, + "logps/rejected": -276.8992004394531, + "loss": 0.094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5468276143074036, + "rewards/margins": 0.18210643529891968, + "rewards/rejected": -0.728934109210968, + "step": 3710 + }, + { + "epoch": 0.45, + "learning_rate": 3.385533123697966e-06, + "logits/chosen": -1.6806570291519165, + "logits/rejected": -1.582833170890808, + "logps/chosen": -250.8227081298828, + "logps/rejected": -278.7505187988281, + "loss": 0.1042, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5766712427139282, + "rewards/margins": 0.1681402027606964, + "rewards/rejected": -0.744811475276947, + "step": 3720 + }, + { + "epoch": 0.45, + "learning_rate": 3.375733669601763e-06, + "logits/chosen": -1.9780842065811157, + "logits/rejected": -1.5905098915100098, + "logps/chosen": -307.57269287109375, + "logps/rejected": -258.7306213378906, + "loss": 0.1604, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6550931930541992, + "rewards/margins": 0.08839500695466995, + "rewards/rejected": -0.7434881925582886, + "step": 3730 + }, + { + "epoch": 0.45, + "learning_rate": 3.3659188540337884e-06, + "logits/chosen": -2.0141289234161377, + "logits/rejected": -1.7356878519058228, + "logps/chosen": -234.14389038085938, + "logps/rejected": -253.2290496826172, + "loss": 0.1203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.424830824136734, + "rewards/margins": 0.12162800878286362, + "rewards/rejected": -0.5464588403701782, + "step": 3740 + }, + { + "epoch": 0.45, + "learning_rate": 3.3560888491582736e-06, + "logits/chosen": -1.877969741821289, + "logits/rejected": -1.6804864406585693, + "logps/chosen": -217.326171875, + "logps/rejected": -254.08627319335938, + "loss": 0.1314, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4663441777229309, + "rewards/margins": 0.21716144680976868, + "rewards/rejected": -0.683505654335022, + "step": 3750 + }, + { + "epoch": 0.45, + "learning_rate": 3.3462438274058856e-06, + "logits/chosen": -1.7631876468658447, + "logits/rejected": -1.5371801853179932, + "logps/chosen": -281.08563232421875, + "logps/rejected": -305.8494873046875, + "loss": 0.1094, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5819532871246338, + "rewards/margins": 0.18844255805015564, + "rewards/rejected": -0.770395815372467, + "step": 3760 + }, + { + "epoch": 0.45, + "learning_rate": 3.3363839614707094e-06, + "logits/chosen": -1.8546764850616455, + "logits/rejected": -1.6666762828826904, + "logps/chosen": -333.0240783691406, + "logps/rejected": -356.2004699707031, + "loss": 0.1544, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5481715798377991, + "rewards/margins": 0.1419263780117035, + "rewards/rejected": -0.6900979280471802, + "step": 3770 + }, + { + "epoch": 0.45, + "learning_rate": 3.326509424307214e-06, + "logits/chosen": -1.9387702941894531, + "logits/rejected": -1.7059547901153564, + "logps/chosen": -264.86865234375, + "logps/rejected": -257.7136535644531, + "loss": 0.1748, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.57854825258255, + "rewards/margins": 0.1645454317331314, + "rewards/rejected": -0.7430936694145203, + "step": 3780 + }, + { + "epoch": 0.45, + "learning_rate": 3.3166203891272204e-06, + "logits/chosen": -2.0821845531463623, + "logits/rejected": -1.6988433599472046, + "logps/chosen": -336.27447509765625, + "logps/rejected": -306.909912109375, + "loss": 0.1274, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48699769377708435, + "rewards/margins": 0.19956621527671814, + "rewards/rejected": -0.686564028263092, + "step": 3790 + }, + { + "epoch": 0.46, + "learning_rate": 3.306717029396863e-06, + "logits/chosen": -1.900738000869751, + "logits/rejected": -1.6325021982192993, + "logps/chosen": -303.42596435546875, + "logps/rejected": -268.81109619140625, + "loss": 0.156, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5264959335327148, + "rewards/margins": 0.08770108968019485, + "rewards/rejected": -0.6141969561576843, + "step": 3800 + }, + { + "epoch": 0.46, + "learning_rate": 3.2967995188335487e-06, + "logits/chosen": -2.0487513542175293, + "logits/rejected": -1.877976655960083, + "logps/chosen": -187.6756134033203, + "logps/rejected": -191.01234436035156, + "loss": 0.1496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33429086208343506, + "rewards/margins": 0.13553500175476074, + "rewards/rejected": -0.4698258936405182, + "step": 3810 + }, + { + "epoch": 0.46, + "learning_rate": 3.2868680314029056e-06, + "logits/chosen": -2.096329689025879, + "logits/rejected": -1.7965996265411377, + "logps/chosen": -288.78558349609375, + "logps/rejected": -270.60418701171875, + "loss": 0.1086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39450058341026306, + "rewards/margins": 0.10876335948705673, + "rewards/rejected": -0.5032640099525452, + "step": 3820 + }, + { + "epoch": 0.46, + "learning_rate": 3.2769227413157346e-06, + "logits/chosen": -1.8762671947479248, + "logits/rejected": -1.6266272068023682, + "logps/chosen": -268.38555908203125, + "logps/rejected": -219.3870849609375, + "loss": 0.1707, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29574501514434814, + "rewards/margins": 0.18245580792427063, + "rewards/rejected": -0.47820085287094116, + "step": 3830 + }, + { + "epoch": 0.46, + "learning_rate": 3.266963823024951e-06, + "logits/chosen": -1.7480850219726562, + "logits/rejected": -1.5056158304214478, + "logps/chosen": -233.0159149169922, + "logps/rejected": -227.66030883789062, + "loss": 0.1303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3971914052963257, + "rewards/margins": 0.13690955936908722, + "rewards/rejected": -0.5341008901596069, + "step": 3840 + }, + { + "epoch": 0.46, + "learning_rate": 3.2569914512225294e-06, + "logits/chosen": -2.3906702995300293, + "logits/rejected": -1.7254350185394287, + "logps/chosen": -289.9384460449219, + "logps/rejected": -234.61758422851562, + "loss": 0.2108, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.31785309314727783, + "rewards/margins": 0.15459677577018738, + "rewards/rejected": -0.47244992852211, + "step": 3850 + }, + { + "epoch": 0.46, + "learning_rate": 3.2470058008364335e-06, + "logits/chosen": -1.940606713294983, + "logits/rejected": -1.567697286605835, + "logps/chosen": -300.01751708984375, + "logps/rejected": -281.2144775390625, + "loss": 0.1124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48682618141174316, + "rewards/margins": 0.1738821119070053, + "rewards/rejected": -0.6607083082199097, + "step": 3860 + }, + { + "epoch": 0.46, + "learning_rate": 3.2370070470275493e-06, + "logits/chosen": -1.9613168239593506, + "logits/rejected": -1.6750881671905518, + "logps/chosen": -239.41580200195312, + "logps/rejected": -267.15350341796875, + "loss": 0.1649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.514406681060791, + "rewards/margins": 0.1650439351797104, + "rewards/rejected": -0.6794506311416626, + "step": 3870 + }, + { + "epoch": 0.47, + "learning_rate": 3.226995365186616e-06, + "logits/chosen": -1.8612607717514038, + "logits/rejected": -1.55705988407135, + "logps/chosen": -228.90213012695312, + "logps/rejected": -198.01766967773438, + "loss": 0.199, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4910704493522644, + "rewards/margins": 0.11654232442378998, + "rewards/rejected": -0.6076127290725708, + "step": 3880 + }, + { + "epoch": 0.47, + "learning_rate": 3.216970930931144e-06, + "logits/chosen": -2.0845413208007812, + "logits/rejected": -1.7625007629394531, + "logps/chosen": -227.1043243408203, + "logps/rejected": -240.2063751220703, + "loss": 0.1225, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4222134053707123, + "rewards/margins": 0.11798451095819473, + "rewards/rejected": -0.5401979684829712, + "step": 3890 + }, + { + "epoch": 0.47, + "learning_rate": 3.2069339201023398e-06, + "logits/chosen": -2.0071322917938232, + "logits/rejected": -1.961846947669983, + "logps/chosen": -281.2358703613281, + "logps/rejected": -288.28924560546875, + "loss": 0.1283, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5697019100189209, + "rewards/margins": 0.07494824379682541, + "rewards/rejected": -0.6446502208709717, + "step": 3900 + }, + { + "epoch": 0.47, + "learning_rate": 3.196884508762016e-06, + "logits/chosen": -1.7892014980316162, + "logits/rejected": -1.560509204864502, + "logps/chosen": -263.98577880859375, + "logps/rejected": -225.89773559570312, + "loss": 0.2099, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5703567266464233, + "rewards/margins": 0.1076982244849205, + "rewards/rejected": -0.6780549883842468, + "step": 3910 + }, + { + "epoch": 0.47, + "learning_rate": 3.186822873189508e-06, + "logits/chosen": -1.8385652303695679, + "logits/rejected": -1.474015712738037, + "logps/chosen": -245.1987762451172, + "logps/rejected": -248.1842498779297, + "loss": 0.1091, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5351457595825195, + "rewards/margins": 0.19029943645000458, + "rewards/rejected": -0.7254451513290405, + "step": 3920 + }, + { + "epoch": 0.47, + "learning_rate": 3.1767491898785795e-06, + "logits/chosen": -2.093048572540283, + "logits/rejected": -1.6442959308624268, + "logps/chosen": -260.1977233886719, + "logps/rejected": -194.9726104736328, + "loss": 0.1783, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4470372200012207, + "rewards/margins": 0.11479449272155762, + "rewards/rejected": -0.5618317127227783, + "step": 3930 + }, + { + "epoch": 0.47, + "learning_rate": 3.166663635534325e-06, + "logits/chosen": -1.9069023132324219, + "logits/rejected": -1.7728145122528076, + "logps/chosen": -251.0937957763672, + "logps/rejected": -275.21026611328125, + "loss": 0.0791, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.4740324914455414, + "rewards/margins": 0.09531258046627045, + "rewards/rejected": -0.5693451166152954, + "step": 3940 + }, + { + "epoch": 0.47, + "learning_rate": 3.1565663870700735e-06, + "logits/chosen": -1.7941697835922241, + "logits/rejected": -1.6212133169174194, + "logps/chosen": -257.2917785644531, + "logps/rejected": -283.5328369140625, + "loss": 0.1458, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6217584609985352, + "rewards/margins": 0.12172119319438934, + "rewards/rejected": -0.7434796690940857, + "step": 3950 + }, + { + "epoch": 0.48, + "learning_rate": 3.1464576216042832e-06, + "logits/chosen": -1.9793428182601929, + "logits/rejected": -1.592930555343628, + "logps/chosen": -305.0171813964844, + "logps/rejected": -260.90753173828125, + "loss": 0.1064, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6241774559020996, + "rewards/margins": 0.1830337643623352, + "rewards/rejected": -0.8072112202644348, + "step": 3960 + }, + { + "epoch": 0.48, + "learning_rate": 3.1363375164574343e-06, + "logits/chosen": -1.9784101247787476, + "logits/rejected": -1.7758643627166748, + "logps/chosen": -251.56320190429688, + "logps/rejected": -248.0037384033203, + "loss": 0.1837, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.556761622428894, + "rewards/margins": 0.11395516246557236, + "rewards/rejected": -0.6707167625427246, + "step": 3970 + }, + { + "epoch": 0.48, + "learning_rate": 3.126206249148921e-06, + "logits/chosen": -1.7647323608398438, + "logits/rejected": -1.493837594985962, + "logps/chosen": -300.1401062011719, + "logps/rejected": -318.6617431640625, + "loss": 0.1218, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6313523054122925, + "rewards/margins": 0.19563212990760803, + "rewards/rejected": -0.8269845247268677, + "step": 3980 + }, + { + "epoch": 0.48, + "learning_rate": 3.1160639973939337e-06, + "logits/chosen": -2.1163768768310547, + "logits/rejected": -1.8096988201141357, + "logps/chosen": -311.0527038574219, + "logps/rejected": -290.04046630859375, + "loss": 0.2148, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5415524244308472, + "rewards/margins": 0.08402875810861588, + "rewards/rejected": -0.6255810856819153, + "step": 3990 + }, + { + "epoch": 0.48, + "learning_rate": 3.105910939100345e-06, + "logits/chosen": -2.17586088180542, + "logits/rejected": -1.680841088294983, + "logps/chosen": -301.86749267578125, + "logps/rejected": -298.6219177246094, + "loss": 0.1436, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5832773447036743, + "rewards/margins": 0.17088885605335236, + "rewards/rejected": -0.7541662454605103, + "step": 4000 + }, + { + "epoch": 0.48, + "learning_rate": 3.095747252365588e-06, + "logits/chosen": -1.8582245111465454, + "logits/rejected": -1.5364271402359009, + "logps/chosen": -283.0442810058594, + "logps/rejected": -277.89788818359375, + "loss": 0.1558, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5722718834877014, + "rewards/margins": 0.09478892385959625, + "rewards/rejected": -0.6670608520507812, + "step": 4010 + }, + { + "epoch": 0.48, + "learning_rate": 3.0855731154735326e-06, + "logits/chosen": -1.6970354318618774, + "logits/rejected": -1.434828281402588, + "logps/chosen": -237.4835662841797, + "logps/rejected": -244.4088134765625, + "loss": 0.1824, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6819806694984436, + "rewards/margins": 0.17408792674541473, + "rewards/rejected": -0.8560686111450195, + "step": 4020 + }, + { + "epoch": 0.48, + "learning_rate": 3.0753887068913545e-06, + "logits/chosen": -1.896554946899414, + "logits/rejected": -1.6122684478759766, + "logps/chosen": -268.3304748535156, + "logps/rejected": -254.9834747314453, + "loss": 0.1759, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5617300271987915, + "rewards/margins": 0.12652353942394257, + "rewards/rejected": -0.6882535219192505, + "step": 4030 + }, + { + "epoch": 0.48, + "learning_rate": 3.0651942052664117e-06, + "logits/chosen": -1.7737739086151123, + "logits/rejected": -1.4258487224578857, + "logps/chosen": -292.5776672363281, + "logps/rejected": -271.95257568359375, + "loss": 0.1443, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5280343890190125, + "rewards/margins": 0.165075421333313, + "rewards/rejected": -0.6931098699569702, + "step": 4040 + }, + { + "epoch": 0.49, + "learning_rate": 3.0549897894231058e-06, + "logits/chosen": -1.9711172580718994, + "logits/rejected": -1.7488377094268799, + "logps/chosen": -313.0440368652344, + "logps/rejected": -290.74798583984375, + "loss": 0.0978, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5002248883247375, + "rewards/margins": 0.10817272961139679, + "rewards/rejected": -0.6083976626396179, + "step": 4050 + }, + { + "epoch": 0.49, + "learning_rate": 3.0447756383597438e-06, + "logits/chosen": -1.9547706842422485, + "logits/rejected": -1.4798098802566528, + "logps/chosen": -224.11904907226562, + "logps/rejected": -188.64633178710938, + "loss": 0.1618, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5285110473632812, + "rewards/margins": 0.14819425344467163, + "rewards/rejected": -0.6767052412033081, + "step": 4060 + }, + { + "epoch": 0.49, + "learning_rate": 3.034551931245404e-06, + "logits/chosen": -1.902276635169983, + "logits/rejected": -1.4849553108215332, + "logps/chosen": -358.1360168457031, + "logps/rejected": -280.324951171875, + "loss": 0.1335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5318064093589783, + "rewards/margins": 0.16225329041481018, + "rewards/rejected": -0.6940596699714661, + "step": 4070 + }, + { + "epoch": 0.49, + "learning_rate": 3.0243188474167884e-06, + "logits/chosen": -1.960026502609253, + "logits/rejected": -1.5699583292007446, + "logps/chosen": -248.52590942382812, + "logps/rejected": -228.6545867919922, + "loss": 0.1788, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39242681860923767, + "rewards/margins": 0.1711881309747696, + "rewards/rejected": -0.5636149644851685, + "step": 4080 + }, + { + "epoch": 0.49, + "learning_rate": 3.014076566375078e-06, + "logits/chosen": -2.0379650592803955, + "logits/rejected": -1.8301384449005127, + "logps/chosen": -295.1788635253906, + "logps/rejected": -255.16983032226562, + "loss": 0.1771, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.46374598145484924, + "rewards/margins": 0.09922705590724945, + "rewards/rejected": -0.5629730224609375, + "step": 4090 + }, + { + "epoch": 0.49, + "learning_rate": 3.003825267782785e-06, + "logits/chosen": -2.1784117221832275, + "logits/rejected": -1.8092330694198608, + "logps/chosen": -222.7117156982422, + "logps/rejected": -212.63888549804688, + "loss": 0.082, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3608611524105072, + "rewards/margins": 0.22283951938152313, + "rewards/rejected": -0.5837006568908691, + "step": 4100 + }, + { + "epoch": 0.49, + "learning_rate": 2.993565131460602e-06, + "logits/chosen": -1.8552563190460205, + "logits/rejected": -1.5919392108917236, + "logps/chosen": -258.01837158203125, + "logps/rejected": -256.6094665527344, + "loss": 0.1408, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39110246300697327, + "rewards/margins": 0.14050598442554474, + "rewards/rejected": -0.5316083431243896, + "step": 4110 + }, + { + "epoch": 0.49, + "learning_rate": 2.9832963373842434e-06, + "logits/chosen": -1.8685518503189087, + "logits/rejected": -1.6710237264633179, + "logps/chosen": -225.02157592773438, + "logps/rejected": -231.95175170898438, + "loss": 0.1088, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4149894714355469, + "rewards/margins": 0.11046002060174942, + "rewards/rejected": -0.5254494547843933, + "step": 4120 + }, + { + "epoch": 0.5, + "learning_rate": 2.973019065681294e-06, + "logits/chosen": -1.994270920753479, + "logits/rejected": -1.5915766954421997, + "logps/chosen": -237.4359893798828, + "logps/rejected": -215.31338500976562, + "loss": 0.1742, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4488137364387512, + "rewards/margins": 0.1414080411195755, + "rewards/rejected": -0.5902218222618103, + "step": 4130 + }, + { + "epoch": 0.5, + "learning_rate": 2.9627334966280474e-06, + "logits/chosen": -2.0599796772003174, + "logits/rejected": -1.6930656433105469, + "logps/chosen": -272.7281799316406, + "logps/rejected": -230.94287109375, + "loss": 0.1445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4368261396884918, + "rewards/margins": 0.08637617528438568, + "rewards/rejected": -0.5232023000717163, + "step": 4140 + }, + { + "epoch": 0.5, + "learning_rate": 2.952439810646341e-06, + "logits/chosen": -1.9677894115447998, + "logits/rejected": -1.6096127033233643, + "logps/chosen": -250.97988891601562, + "logps/rejected": -230.15469360351562, + "loss": 0.1327, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4640297293663025, + "rewards/margins": 0.13882431387901306, + "rewards/rejected": -0.6028540134429932, + "step": 4150 + }, + { + "epoch": 0.5, + "learning_rate": 2.942138188300394e-06, + "logits/chosen": -1.9296554327011108, + "logits/rejected": -1.5421892404556274, + "logps/chosen": -257.2073974609375, + "logps/rejected": -258.81756591796875, + "loss": 0.1773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5215272307395935, + "rewards/margins": 0.17221274971961975, + "rewards/rejected": -0.6937400102615356, + "step": 4160 + }, + { + "epoch": 0.5, + "learning_rate": 2.931828810293642e-06, + "logits/chosen": -2.06691312789917, + "logits/rejected": -1.559309720993042, + "logps/chosen": -252.13491821289062, + "logps/rejected": -233.42626953125, + "loss": 0.1603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46336793899536133, + "rewards/margins": 0.18448057770729065, + "rewards/rejected": -0.6478484869003296, + "step": 4170 + }, + { + "epoch": 0.5, + "learning_rate": 2.92151185746556e-06, + "logits/chosen": -1.9915742874145508, + "logits/rejected": -1.6672182083129883, + "logps/chosen": -277.6402893066406, + "logps/rejected": -279.9922790527344, + "loss": 0.1693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5202735066413879, + "rewards/margins": 0.10429404675960541, + "rewards/rejected": -0.6245675683021545, + "step": 4180 + }, + { + "epoch": 0.5, + "learning_rate": 2.911187510788498e-06, + "logits/chosen": -1.9717572927474976, + "logits/rejected": -1.7132648229599, + "logps/chosen": -282.0147399902344, + "logps/rejected": -253.1018524169922, + "loss": 0.1024, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41674551367759705, + "rewards/margins": 0.13815635442733765, + "rewards/rejected": -0.5549019575119019, + "step": 4190 + }, + { + "epoch": 0.5, + "learning_rate": 2.9008559513645033e-06, + "logits/chosen": -1.9843193292617798, + "logits/rejected": -1.7526146173477173, + "logps/chosen": -264.8548278808594, + "logps/rejected": -241.67941284179688, + "loss": 0.1593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43674373626708984, + "rewards/margins": 0.1286846250295639, + "rewards/rejected": -0.5654283761978149, + "step": 4200 + }, + { + "epoch": 0.51, + "learning_rate": 2.890517360422144e-06, + "logits/chosen": -1.9424211978912354, + "logits/rejected": -1.694551706314087, + "logps/chosen": -256.5013732910156, + "logps/rejected": -248.4475860595703, + "loss": 0.1429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4047975540161133, + "rewards/margins": 0.13096585869789124, + "rewards/rejected": -0.5357634425163269, + "step": 4210 + }, + { + "epoch": 0.51, + "learning_rate": 2.880171919313327e-06, + "logits/chosen": -1.9947586059570312, + "logits/rejected": -1.516392469406128, + "logps/chosen": -261.6585388183594, + "logps/rejected": -189.2516632080078, + "loss": 0.0992, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3641483783721924, + "rewards/margins": 0.1547776162624359, + "rewards/rejected": -0.5189260244369507, + "step": 4220 + }, + { + "epoch": 0.51, + "learning_rate": 2.869819809510125e-06, + "logits/chosen": -1.9108898639678955, + "logits/rejected": -1.6370693445205688, + "logps/chosen": -228.65298461914062, + "logps/rejected": -231.56832885742188, + "loss": 0.1458, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.44458943605422974, + "rewards/margins": 0.1396978348493576, + "rewards/rejected": -0.5842872858047485, + "step": 4230 + }, + { + "epoch": 0.51, + "learning_rate": 2.8594612126015825e-06, + "logits/chosen": -2.0889339447021484, + "logits/rejected": -1.7717602252960205, + "logps/chosen": -264.8407897949219, + "logps/rejected": -302.21026611328125, + "loss": 0.1162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3832109570503235, + "rewards/margins": 0.19960837066173553, + "rewards/rejected": -0.5828193426132202, + "step": 4240 + }, + { + "epoch": 0.51, + "learning_rate": 2.84909631029054e-06, + "logits/chosen": -1.8553768396377563, + "logits/rejected": -1.621514081954956, + "logps/chosen": -263.96429443359375, + "logps/rejected": -288.17620849609375, + "loss": 0.1611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6084156036376953, + "rewards/margins": 0.16580604016780853, + "rewards/rejected": -0.7742215991020203, + "step": 4250 + }, + { + "epoch": 0.51, + "learning_rate": 2.838725284390441e-06, + "logits/chosen": -1.8268849849700928, + "logits/rejected": -1.6455726623535156, + "logps/chosen": -269.8697204589844, + "logps/rejected": -297.85150146484375, + "loss": 0.112, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5495954751968384, + "rewards/margins": 0.19235818088054657, + "rewards/rejected": -0.7419536113739014, + "step": 4260 + }, + { + "epoch": 0.51, + "learning_rate": 2.828348316822144e-06, + "logits/chosen": -1.8502800464630127, + "logits/rejected": -1.6670821905136108, + "logps/chosen": -209.65640258789062, + "logps/rejected": -273.5523986816406, + "loss": 0.113, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46271175146102905, + "rewards/margins": 0.1825498640537262, + "rewards/rejected": -0.6452616453170776, + "step": 4270 + }, + { + "epoch": 0.51, + "learning_rate": 2.817965589610733e-06, + "logits/chosen": -1.8152210712432861, + "logits/rejected": -1.5677975416183472, + "logps/chosen": -217.03182983398438, + "logps/rejected": -245.7638702392578, + "loss": 0.134, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5718420743942261, + "rewards/margins": 0.1416967660188675, + "rewards/rejected": -0.7135388255119324, + "step": 4280 + }, + { + "epoch": 0.51, + "learning_rate": 2.807577284882324e-06, + "logits/chosen": -1.8648059368133545, + "logits/rejected": -1.4172070026397705, + "logps/chosen": -214.18099975585938, + "logps/rejected": -227.8682861328125, + "loss": 0.1346, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.47108086943626404, + "rewards/margins": 0.21896126866340637, + "rewards/rejected": -0.6900421380996704, + "step": 4290 + }, + { + "epoch": 0.52, + "learning_rate": 2.797183584860867e-06, + "logits/chosen": -1.9184010028839111, + "logits/rejected": -1.5958585739135742, + "logps/chosen": -201.8664093017578, + "logps/rejected": -200.3294219970703, + "loss": 0.1953, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3741530776023865, + "rewards/margins": 0.12563326954841614, + "rewards/rejected": -0.499786376953125, + "step": 4300 + }, + { + "epoch": 0.52, + "learning_rate": 2.7867846718649538e-06, + "logits/chosen": -1.7121245861053467, + "logits/rejected": -1.4752973318099976, + "logps/chosen": -242.7071533203125, + "logps/rejected": -287.45587158203125, + "loss": 0.1076, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39267921447753906, + "rewards/margins": 0.21084478497505188, + "rewards/rejected": -0.6035240292549133, + "step": 4310 + }, + { + "epoch": 0.52, + "learning_rate": 2.7763807283046195e-06, + "logits/chosen": -2.0703561305999756, + "logits/rejected": -1.8529870510101318, + "logps/chosen": -213.96029663085938, + "logps/rejected": -224.46817016601562, + "loss": 0.1417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37636810541152954, + "rewards/margins": 0.15098202228546143, + "rewards/rejected": -0.527350127696991, + "step": 4320 + }, + { + "epoch": 0.52, + "learning_rate": 2.76597193667814e-06, + "logits/chosen": -2.061995029449463, + "logits/rejected": -1.6790978908538818, + "logps/chosen": -291.52508544921875, + "logps/rejected": -278.72088623046875, + "loss": 0.1316, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.35219401121139526, + "rewards/margins": 0.10760994255542755, + "rewards/rejected": -0.45980390906333923, + "step": 4330 + }, + { + "epoch": 0.52, + "learning_rate": 2.7555584795688328e-06, + "logits/chosen": -1.9189672470092773, + "logits/rejected": -1.6146653890609741, + "logps/chosen": -249.65347290039062, + "logps/rejected": -247.68115234375, + "loss": 0.1578, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.36561039090156555, + "rewards/margins": 0.15556563436985016, + "rewards/rejected": -0.5211759805679321, + "step": 4340 + }, + { + "epoch": 0.52, + "learning_rate": 2.7451405396418544e-06, + "logits/chosen": -1.940999984741211, + "logits/rejected": -1.4690072536468506, + "logps/chosen": -244.7572021484375, + "logps/rejected": -207.64208984375, + "loss": 0.1639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.38119882345199585, + "rewards/margins": 0.1026213988661766, + "rewards/rejected": -0.48382019996643066, + "step": 4350 + }, + { + "epoch": 0.52, + "learning_rate": 2.734718299640994e-06, + "logits/chosen": -2.1663918495178223, + "logits/rejected": -1.8815300464630127, + "logps/chosen": -250.9813232421875, + "logps/rejected": -251.88296508789062, + "loss": 0.1574, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36953067779541016, + "rewards/margins": 0.12235681712627411, + "rewards/rejected": -0.4918874204158783, + "step": 4360 + }, + { + "epoch": 0.52, + "learning_rate": 2.724291942385472e-06, + "logits/chosen": -2.146113395690918, + "logits/rejected": -1.609834909439087, + "logps/chosen": -304.42132568359375, + "logps/rejected": -268.470947265625, + "loss": 0.0921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.35676488280296326, + "rewards/margins": 0.1696079671382904, + "rewards/rejected": -0.5263728499412537, + "step": 4370 + }, + { + "epoch": 0.53, + "learning_rate": 2.713861650766729e-06, + "logits/chosen": -1.9884231090545654, + "logits/rejected": -1.5890326499938965, + "logps/chosen": -239.7928924560547, + "logps/rejected": -229.9420623779297, + "loss": 0.1299, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.38828951120376587, + "rewards/margins": 0.1986490935087204, + "rewards/rejected": -0.5869385600090027, + "step": 4380 + }, + { + "epoch": 0.53, + "learning_rate": 2.703427607745219e-06, + "logits/chosen": -2.1583807468414307, + "logits/rejected": -1.7095534801483154, + "logps/chosen": -290.33868408203125, + "logps/rejected": -269.4275817871094, + "loss": 0.1828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3558133542537689, + "rewards/margins": 0.13851602375507355, + "rewards/rejected": -0.4943293631076813, + "step": 4390 + }, + { + "epoch": 0.53, + "learning_rate": 2.6929899963472005e-06, + "logits/chosen": -1.947405219078064, + "logits/rejected": -1.5273383855819702, + "logps/chosen": -235.3554229736328, + "logps/rejected": -215.59786987304688, + "loss": 0.1119, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.35831964015960693, + "rewards/margins": 0.1973123699426651, + "rewards/rejected": -0.5556319952011108, + "step": 4400 + }, + { + "epoch": 0.53, + "learning_rate": 2.6825489996615278e-06, + "logits/chosen": -1.8226381540298462, + "logits/rejected": -1.5650604963302612, + "logps/chosen": -223.3660888671875, + "logps/rejected": -215.26327514648438, + "loss": 0.1294, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4000251293182373, + "rewards/margins": 0.13242687284946442, + "rewards/rejected": -0.5324519872665405, + "step": 4410 + }, + { + "epoch": 0.53, + "learning_rate": 2.6721048008364343e-06, + "logits/chosen": -1.968601942062378, + "logits/rejected": -1.527261734008789, + "logps/chosen": -263.6767578125, + "logps/rejected": -242.66275024414062, + "loss": 0.1512, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.357761949300766, + "rewards/margins": 0.21579334139823914, + "rewards/rejected": -0.5735553503036499, + "step": 4420 + }, + { + "epoch": 0.53, + "learning_rate": 2.6616575830763247e-06, + "logits/chosen": -2.044994831085205, + "logits/rejected": -1.5942163467407227, + "logps/chosen": -242.64852905273438, + "logps/rejected": -245.306640625, + "loss": 0.1581, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3358486294746399, + "rewards/margins": 0.13670720160007477, + "rewards/rejected": -0.47255581617355347, + "step": 4430 + }, + { + "epoch": 0.53, + "learning_rate": 2.651207529638561e-06, + "logits/chosen": -1.7535009384155273, + "logits/rejected": -1.373928189277649, + "logps/chosen": -261.17889404296875, + "logps/rejected": -223.257080078125, + "loss": 0.116, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29363709688186646, + "rewards/margins": 0.1494956910610199, + "rewards/rejected": -0.44313281774520874, + "step": 4440 + }, + { + "epoch": 0.53, + "learning_rate": 2.640754823830242e-06, + "logits/chosen": -2.192082405090332, + "logits/rejected": -1.9141347408294678, + "logps/chosen": -312.79693603515625, + "logps/rejected": -237.3318634033203, + "loss": 0.1115, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32631200551986694, + "rewards/margins": 0.0922769159078598, + "rewards/rejected": -0.4185889661312103, + "step": 4450 + }, + { + "epoch": 0.54, + "learning_rate": 2.6302996490049983e-06, + "logits/chosen": -2.042506456375122, + "logits/rejected": -1.6036508083343506, + "logps/chosen": -254.78213500976562, + "logps/rejected": -255.03018188476562, + "loss": 0.1262, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.35550904273986816, + "rewards/margins": 0.10495875775814056, + "rewards/rejected": -0.4604678153991699, + "step": 4460 + }, + { + "epoch": 0.54, + "learning_rate": 2.619842188559765e-06, + "logits/chosen": -1.9425427913665771, + "logits/rejected": -1.5557024478912354, + "logps/chosen": -186.65274047851562, + "logps/rejected": -203.18185424804688, + "loss": 0.1508, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.35145506262779236, + "rewards/margins": 0.2223198413848877, + "rewards/rejected": -0.5737749338150024, + "step": 4470 + }, + { + "epoch": 0.54, + "learning_rate": 2.609382625931575e-06, + "logits/chosen": -1.9245996475219727, + "logits/rejected": -1.6936094760894775, + "logps/chosen": -244.86160278320312, + "logps/rejected": -269.80584716796875, + "loss": 0.1606, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4344099164009094, + "rewards/margins": 0.16771261394023895, + "rewards/rejected": -0.6021225452423096, + "step": 4480 + }, + { + "epoch": 0.54, + "learning_rate": 2.59892114459433e-06, + "logits/chosen": -1.9008939266204834, + "logits/rejected": -1.969109296798706, + "logps/chosen": -238.5630340576172, + "logps/rejected": -280.3179931640625, + "loss": 0.1689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.40993937849998474, + "rewards/margins": 0.12407805770635605, + "rewards/rejected": -0.5340174436569214, + "step": 4490 + }, + { + "epoch": 0.54, + "learning_rate": 2.588457928055592e-06, + "logits/chosen": -1.6659586429595947, + "logits/rejected": -1.2960065603256226, + "logps/chosen": -255.4331512451172, + "logps/rejected": -236.24594116210938, + "loss": 0.1008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39769870042800903, + "rewards/margins": 0.24053025245666504, + "rewards/rejected": -0.6382290124893188, + "step": 4500 + }, + { + "epoch": 0.54, + "learning_rate": 2.5779931598533624e-06, + "logits/chosen": -1.9211695194244385, + "logits/rejected": -1.555855393409729, + "logps/chosen": -265.3733215332031, + "logps/rejected": -246.8184814453125, + "loss": 0.1628, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4115406572818756, + "rewards/margins": 0.12964625656604767, + "rewards/rejected": -0.5411869287490845, + "step": 4510 + }, + { + "epoch": 0.54, + "learning_rate": 2.567527023552857e-06, + "logits/chosen": -1.9409809112548828, + "logits/rejected": -1.6917556524276733, + "logps/chosen": -309.97760009765625, + "logps/rejected": -264.5470886230469, + "loss": 0.0704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4299238324165344, + "rewards/margins": 0.14857104420661926, + "rewards/rejected": -0.5784948468208313, + "step": 4520 + }, + { + "epoch": 0.54, + "learning_rate": 2.5570597027432907e-06, + "logits/chosen": -1.9963619709014893, + "logits/rejected": -1.5232570171356201, + "logps/chosen": -248.9495391845703, + "logps/rejected": -209.61312866210938, + "loss": 0.1608, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4026300311088562, + "rewards/margins": 0.1366431713104248, + "rewards/rejected": -0.5392731428146362, + "step": 4530 + }, + { + "epoch": 0.54, + "learning_rate": 2.5465913810346575e-06, + "logits/chosen": -1.7939636707305908, + "logits/rejected": -1.6163822412490845, + "logps/chosen": -263.76019287109375, + "logps/rejected": -286.8879699707031, + "loss": 0.1451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4445052146911621, + "rewards/margins": 0.14057457447052002, + "rewards/rejected": -0.5850798487663269, + "step": 4540 + }, + { + "epoch": 0.55, + "learning_rate": 2.536122242054507e-06, + "logits/chosen": -1.9959796667099, + "logits/rejected": -1.377286672592163, + "logps/chosen": -249.66006469726562, + "logps/rejected": -221.0861358642578, + "loss": 0.1297, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3975772261619568, + "rewards/margins": 0.19428391754627228, + "rewards/rejected": -0.5918611288070679, + "step": 4550 + }, + { + "epoch": 0.55, + "learning_rate": 2.525652469444727e-06, + "logits/chosen": -2.10296368598938, + "logits/rejected": -1.6731036901474, + "logps/chosen": -211.2078857421875, + "logps/rejected": -196.8761444091797, + "loss": 0.1252, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3557474613189697, + "rewards/margins": 0.14848320186138153, + "rewards/rejected": -0.5042306184768677, + "step": 4560 + }, + { + "epoch": 0.55, + "learning_rate": 2.5151822468583165e-06, + "logits/chosen": -1.8910295963287354, + "logits/rejected": -1.441156268119812, + "logps/chosen": -235.1863250732422, + "logps/rejected": -203.9948272705078, + "loss": 0.0839, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3970240652561188, + "rewards/margins": 0.21173615753650665, + "rewards/rejected": -0.6087601780891418, + "step": 4570 + }, + { + "epoch": 0.55, + "learning_rate": 2.5047117579561703e-06, + "logits/chosen": -1.858319878578186, + "logits/rejected": -1.6645057201385498, + "logps/chosen": -318.26129150390625, + "logps/rejected": -315.4615478515625, + "loss": 0.1405, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5496889352798462, + "rewards/margins": 0.14965248107910156, + "rewards/rejected": -0.6993414163589478, + "step": 4580 + }, + { + "epoch": 0.55, + "learning_rate": 2.494241186403854e-06, + "logits/chosen": -2.0156023502349854, + "logits/rejected": -1.8666023015975952, + "logps/chosen": -204.9974365234375, + "logps/rejected": -199.46676635742188, + "loss": 0.2009, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.38789471983909607, + "rewards/margins": 0.09470699727535248, + "rewards/rejected": -0.48260173201560974, + "step": 4590 + }, + { + "epoch": 0.55, + "learning_rate": 2.4837707158683833e-06, + "logits/chosen": -1.733515739440918, + "logits/rejected": -1.5892311334609985, + "logps/chosen": -248.87283325195312, + "logps/rejected": -265.2043762207031, + "loss": 0.1352, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5136191248893738, + "rewards/margins": 0.11203992366790771, + "rewards/rejected": -0.6256589889526367, + "step": 4600 + }, + { + "epoch": 0.55, + "learning_rate": 2.473300530015e-06, + "logits/chosen": -2.1996002197265625, + "logits/rejected": -1.857081651687622, + "logps/chosen": -279.1620788574219, + "logps/rejected": -275.622314453125, + "loss": 0.1569, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3480737805366516, + "rewards/margins": 0.11938655376434326, + "rewards/rejected": -0.4674603343009949, + "step": 4610 + }, + { + "epoch": 0.55, + "learning_rate": 2.4628308125039557e-06, + "logits/chosen": -1.8926032781600952, + "logits/rejected": -1.5367896556854248, + "logps/chosen": -305.69732666015625, + "logps/rejected": -276.2022705078125, + "loss": 0.1795, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41959348320961, + "rewards/margins": 0.16269411146640778, + "rewards/rejected": -0.582287609577179, + "step": 4620 + }, + { + "epoch": 0.56, + "learning_rate": 2.452361746987284e-06, + "logits/chosen": -1.8755619525909424, + "logits/rejected": -1.7180248498916626, + "logps/chosen": -267.4087829589844, + "logps/rejected": -282.9756774902344, + "loss": 0.0926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5185616612434387, + "rewards/margins": 0.11375057697296143, + "rewards/rejected": -0.6323122978210449, + "step": 4630 + }, + { + "epoch": 0.56, + "learning_rate": 2.4418935171055818e-06, + "logits/chosen": -1.9167985916137695, + "logits/rejected": -1.6408929824829102, + "logps/chosen": -202.68295288085938, + "logps/rejected": -215.5853729248047, + "loss": 0.1187, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.41076311469078064, + "rewards/margins": 0.12577161192893982, + "rewards/rejected": -0.5365347266197205, + "step": 4640 + }, + { + "epoch": 0.56, + "learning_rate": 2.43142630648479e-06, + "logits/chosen": -1.941982626914978, + "logits/rejected": -1.627986192703247, + "logps/chosen": -308.0236511230469, + "logps/rejected": -356.6058349609375, + "loss": 0.0804, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5076587796211243, + "rewards/margins": 0.15822356939315796, + "rewards/rejected": -0.665882408618927, + "step": 4650 + }, + { + "epoch": 0.56, + "learning_rate": 2.4209602987329685e-06, + "logits/chosen": -1.7499468326568604, + "logits/rejected": -1.2955760955810547, + "logps/chosen": -229.9163055419922, + "logps/rejected": -189.3719024658203, + "loss": 0.0907, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.426525354385376, + "rewards/margins": 0.18848739564418793, + "rewards/rejected": -0.6150127649307251, + "step": 4660 + }, + { + "epoch": 0.56, + "learning_rate": 2.410495677437076e-06, + "logits/chosen": -1.9118763208389282, + "logits/rejected": -1.8392162322998047, + "logps/chosen": -226.6531219482422, + "logps/rejected": -220.9828338623047, + "loss": 0.1856, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3598722815513611, + "rewards/margins": 0.13947324454784393, + "rewards/rejected": -0.49934548139572144, + "step": 4670 + }, + { + "epoch": 0.56, + "learning_rate": 2.400032626159756e-06, + "logits/chosen": -1.9028289318084717, + "logits/rejected": -1.7155911922454834, + "logps/chosen": -224.9667510986328, + "logps/rejected": -226.68728637695312, + "loss": 0.1633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3874308168888092, + "rewards/margins": 0.13661661744117737, + "rewards/rejected": -0.5240474343299866, + "step": 4680 + }, + { + "epoch": 0.56, + "learning_rate": 2.3895713284361065e-06, + "logits/chosen": -2.0748324394226074, + "logits/rejected": -1.5768488645553589, + "logps/chosen": -245.1274871826172, + "logps/rejected": -222.86328125, + "loss": 0.1049, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3151063919067383, + "rewards/margins": 0.19956035912036896, + "rewards/rejected": -0.514666736125946, + "step": 4690 + }, + { + "epoch": 0.56, + "learning_rate": 2.3791119677704676e-06, + "logits/chosen": -2.194417715072632, + "logits/rejected": -1.6311323642730713, + "logps/chosen": -287.04107666015625, + "logps/rejected": -247.39163208007812, + "loss": 0.1247, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.31668201088905334, + "rewards/margins": 0.18119558691978455, + "rewards/rejected": -0.4978775978088379, + "step": 4700 + }, + { + "epoch": 0.57, + "learning_rate": 2.3686547276332046e-06, + "logits/chosen": -2.08101487159729, + "logits/rejected": -1.6385080814361572, + "logps/chosen": -264.02655029296875, + "logps/rejected": -233.8690948486328, + "loss": 0.1344, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40405863523483276, + "rewards/margins": 0.16924302279949188, + "rewards/rejected": -0.5733017325401306, + "step": 4710 + }, + { + "epoch": 0.57, + "learning_rate": 2.3581997914574807e-06, + "logits/chosen": -1.9531478881835938, + "logits/rejected": -1.559780240058899, + "logps/chosen": -237.64059448242188, + "logps/rejected": -223.76565551757812, + "loss": 0.1204, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.39436954259872437, + "rewards/margins": 0.16357873380184174, + "rewards/rejected": -0.5579482913017273, + "step": 4720 + }, + { + "epoch": 0.57, + "learning_rate": 2.3477473426360463e-06, + "logits/chosen": -2.1687614917755127, + "logits/rejected": -1.6831060647964478, + "logps/chosen": -255.7380828857422, + "logps/rejected": -224.1558837890625, + "loss": 0.1026, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3829661011695862, + "rewards/margins": 0.23224039375782013, + "rewards/rejected": -0.6152064800262451, + "step": 4730 + }, + { + "epoch": 0.57, + "learning_rate": 2.337297564518024e-06, + "logits/chosen": -2.1666550636291504, + "logits/rejected": -1.8726260662078857, + "logps/chosen": -303.0262451171875, + "logps/rejected": -272.85894775390625, + "loss": 0.1091, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3961569666862488, + "rewards/margins": 0.1607007533311844, + "rewards/rejected": -0.556857705116272, + "step": 4740 + }, + { + "epoch": 0.57, + "learning_rate": 2.326850640405684e-06, + "logits/chosen": -1.865952730178833, + "logits/rejected": -1.3388252258300781, + "logps/chosen": -331.45611572265625, + "logps/rejected": -297.70074462890625, + "loss": 0.0822, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35268011689186096, + "rewards/margins": 0.2889634668827057, + "rewards/rejected": -0.6416435837745667, + "step": 4750 + }, + { + "epoch": 0.57, + "learning_rate": 2.3164067535512353e-06, + "logits/chosen": -1.8777456283569336, + "logits/rejected": -1.4981722831726074, + "logps/chosen": -285.3711853027344, + "logps/rejected": -237.86270141601562, + "loss": 0.125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4790197014808655, + "rewards/margins": 0.15934725105762482, + "rewards/rejected": -0.6383669972419739, + "step": 4760 + }, + { + "epoch": 0.57, + "learning_rate": 2.3059660871536123e-06, + "logits/chosen": -1.6866099834442139, + "logits/rejected": -1.4525415897369385, + "logps/chosen": -237.55563354492188, + "logps/rejected": -251.5872039794922, + "loss": 0.1271, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5034652948379517, + "rewards/margins": 0.1651829183101654, + "rewards/rejected": -0.6686481833457947, + "step": 4770 + }, + { + "epoch": 0.57, + "learning_rate": 2.2955288243552543e-06, + "logits/chosen": -2.0782809257507324, + "logits/rejected": -1.6525154113769531, + "logps/chosen": -335.6650085449219, + "logps/rejected": -239.81674194335938, + "loss": 0.1245, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4862341284751892, + "rewards/margins": 0.12166903167963028, + "rewards/rejected": -0.6079031229019165, + "step": 4780 + }, + { + "epoch": 0.57, + "learning_rate": 2.285095148238899e-06, + "logits/chosen": -1.9941129684448242, + "logits/rejected": -1.7789087295532227, + "logps/chosen": -281.5749206542969, + "logps/rejected": -266.099609375, + "loss": 0.1526, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4684416651725769, + "rewards/margins": 0.17475393414497375, + "rewards/rejected": -0.643195629119873, + "step": 4790 + }, + { + "epoch": 0.58, + "learning_rate": 2.2746652418243714e-06, + "logits/chosen": -2.0029962062835693, + "logits/rejected": -1.7494831085205078, + "logps/chosen": -326.5981140136719, + "logps/rejected": -310.0174255371094, + "loss": 0.0976, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5547425150871277, + "rewards/margins": 0.10356296598911285, + "rewards/rejected": -0.6583055257797241, + "step": 4800 + }, + { + "epoch": 0.58, + "learning_rate": 2.2642392880653677e-06, + "logits/chosen": -1.9393142461776733, + "logits/rejected": -1.916164755821228, + "logps/chosen": -261.62750244140625, + "logps/rejected": -236.9197235107422, + "loss": 0.1388, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.49896836280822754, + "rewards/margins": 0.07649464905261993, + "rewards/rejected": -0.5754629969596863, + "step": 4810 + }, + { + "epoch": 0.58, + "learning_rate": 2.25381746984625e-06, + "logits/chosen": -1.9654737710952759, + "logits/rejected": -1.6573346853256226, + "logps/chosen": -262.6412658691406, + "logps/rejected": -285.51361083984375, + "loss": 0.1336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49716243147850037, + "rewards/margins": 0.175074964761734, + "rewards/rejected": -0.6722373366355896, + "step": 4820 + }, + { + "epoch": 0.58, + "learning_rate": 2.2433999699788404e-06, + "logits/chosen": -2.004723072052002, + "logits/rejected": -1.700979471206665, + "logps/chosen": -265.5523986816406, + "logps/rejected": -228.4165496826172, + "loss": 0.124, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5388234257698059, + "rewards/margins": 0.11548347771167755, + "rewards/rejected": -0.654306948184967, + "step": 4830 + }, + { + "epoch": 0.58, + "learning_rate": 2.2329869711992093e-06, + "logits/chosen": -1.9097673892974854, + "logits/rejected": -1.7621214389801025, + "logps/chosen": -229.69906616210938, + "logps/rejected": -267.36273193359375, + "loss": 0.119, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5158060789108276, + "rewards/margins": 0.14432090520858765, + "rewards/rejected": -0.6601270437240601, + "step": 4840 + }, + { + "epoch": 0.58, + "learning_rate": 2.2225786561644724e-06, + "logits/chosen": -1.7414562702178955, + "logits/rejected": -1.63266921043396, + "logps/chosen": -258.82373046875, + "logps/rejected": -264.71661376953125, + "loss": 0.0976, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5453828573226929, + "rewards/margins": 0.16923405230045319, + "rewards/rejected": -0.7146168351173401, + "step": 4850 + }, + { + "epoch": 0.58, + "learning_rate": 2.212175207449589e-06, + "logits/chosen": -1.9261242151260376, + "logits/rejected": -1.4300401210784912, + "logps/chosen": -220.4147186279297, + "logps/rejected": -217.1300811767578, + "loss": 0.1082, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5556666254997253, + "rewards/margins": 0.23020467162132263, + "rewards/rejected": -0.7858712077140808, + "step": 4860 + }, + { + "epoch": 0.58, + "learning_rate": 2.2017768075441544e-06, + "logits/chosen": -1.9333302974700928, + "logits/rejected": -1.7991241216659546, + "logps/chosen": -260.7289123535156, + "logps/rejected": -277.58941650390625, + "loss": 0.089, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6178120374679565, + "rewards/margins": 0.15292315185070038, + "rewards/rejected": -0.7707351446151733, + "step": 4870 + }, + { + "epoch": 0.59, + "learning_rate": 2.191383638849201e-06, + "logits/chosen": -1.6100937128067017, + "logits/rejected": -1.4786584377288818, + "logps/chosen": -224.05224609375, + "logps/rejected": -254.0164031982422, + "loss": 0.1265, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5358814597129822, + "rewards/margins": 0.21690325438976288, + "rewards/rejected": -0.7527847290039062, + "step": 4880 + }, + { + "epoch": 0.59, + "learning_rate": 2.180995883674003e-06, + "logits/chosen": -2.0832412242889404, + "logits/rejected": -1.8438653945922852, + "logps/chosen": -301.72808837890625, + "logps/rejected": -263.3715515136719, + "loss": 0.1293, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5527404546737671, + "rewards/margins": 0.11939278990030289, + "rewards/rejected": -0.6721332669258118, + "step": 4890 + }, + { + "epoch": 0.59, + "learning_rate": 2.1706137242328708e-06, + "logits/chosen": -1.8641271591186523, + "logits/rejected": -1.7302074432373047, + "logps/chosen": -223.2054443359375, + "logps/rejected": -246.4233856201172, + "loss": 0.1346, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5136536359786987, + "rewards/margins": 0.13826540112495422, + "rewards/rejected": -0.6519190669059753, + "step": 4900 + }, + { + "epoch": 0.59, + "learning_rate": 2.1602373426419593e-06, + "logits/chosen": -2.0203075408935547, + "logits/rejected": -1.7125927209854126, + "logps/chosen": -247.02804565429688, + "logps/rejected": -248.95193481445312, + "loss": 0.156, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5652672648429871, + "rewards/margins": 0.21123281121253967, + "rewards/rejected": -0.7765001058578491, + "step": 4910 + }, + { + "epoch": 0.59, + "learning_rate": 2.149866920916075e-06, + "logits/chosen": -1.9118244647979736, + "logits/rejected": -1.6481826305389404, + "logps/chosen": -287.49969482421875, + "logps/rejected": -266.9239196777344, + "loss": 0.098, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.507907509803772, + "rewards/margins": 0.15150879323482513, + "rewards/rejected": -0.6594163179397583, + "step": 4920 + }, + { + "epoch": 0.59, + "learning_rate": 2.1395026409654776e-06, + "logits/chosen": -2.052753448486328, + "logits/rejected": -1.707918405532837, + "logps/chosen": -287.81787109375, + "logps/rejected": -250.28988647460938, + "loss": 0.1311, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5523009896278381, + "rewards/margins": 0.10048754513263702, + "rewards/rejected": -0.6527885794639587, + "step": 4930 + }, + { + "epoch": 0.59, + "learning_rate": 2.129144684592694e-06, + "logits/chosen": -1.8895385265350342, + "logits/rejected": -1.4638690948486328, + "logps/chosen": -229.0388946533203, + "logps/rejected": -215.0070343017578, + "loss": 0.1343, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5384308695793152, + "rewards/margins": 0.1393643468618393, + "rewards/rejected": -0.6777952909469604, + "step": 4940 + }, + { + "epoch": 0.59, + "learning_rate": 2.1187932334893282e-06, + "logits/chosen": -2.058537721633911, + "logits/rejected": -1.811730146408081, + "logps/chosen": -238.6881866455078, + "logps/rejected": -239.40774536132812, + "loss": 0.1401, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5211200714111328, + "rewards/margins": 0.11581333726644516, + "rewards/rejected": -0.636933445930481, + "step": 4950 + }, + { + "epoch": 0.6, + "learning_rate": 2.1084484692328726e-06, + "logits/chosen": -1.8077147006988525, + "logits/rejected": -1.6539599895477295, + "logps/chosen": -324.1515808105469, + "logps/rejected": -339.98333740234375, + "loss": 0.0679, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.667458176612854, + "rewards/margins": 0.1506289839744568, + "rewards/rejected": -0.8180869817733765, + "step": 4960 + }, + { + "epoch": 0.6, + "learning_rate": 2.0981105732835227e-06, + "logits/chosen": -1.9896256923675537, + "logits/rejected": -1.4527660608291626, + "logps/chosen": -269.33746337890625, + "logps/rejected": -214.31729125976562, + "loss": 0.1888, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5238735675811768, + "rewards/margins": 0.20568545162677765, + "rewards/rejected": -0.729559063911438, + "step": 4970 + }, + { + "epoch": 0.6, + "learning_rate": 2.087779726980999e-06, + "logits/chosen": -2.0337650775909424, + "logits/rejected": -1.6298131942749023, + "logps/chosen": -297.9162292480469, + "logps/rejected": -280.47637939453125, + "loss": 0.0769, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5461810827255249, + "rewards/margins": 0.18490315973758698, + "rewards/rejected": -0.7310842275619507, + "step": 4980 + }, + { + "epoch": 0.6, + "learning_rate": 2.077456111541359e-06, + "logits/chosen": -1.915443778038025, + "logits/rejected": -1.500583291053772, + "logps/chosen": -290.4164733886719, + "logps/rejected": -241.181396484375, + "loss": 0.1659, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49130645394325256, + "rewards/margins": 0.18306657671928406, + "rewards/rejected": -0.6743730306625366, + "step": 4990 + }, + { + "epoch": 0.6, + "learning_rate": 2.067139908053821e-06, + "logits/chosen": -2.1264405250549316, + "logits/rejected": -1.8090966939926147, + "logps/chosen": -281.15185546875, + "logps/rejected": -267.20489501953125, + "loss": 0.1224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43849506974220276, + "rewards/margins": 0.1732708215713501, + "rewards/rejected": -0.6117658019065857, + "step": 5000 + }, + { + "epoch": 0.6, + "learning_rate": 2.056831297477592e-06, + "logits/chosen": -1.8897788524627686, + "logits/rejected": -1.8385932445526123, + "logps/chosen": -329.4559020996094, + "logps/rejected": -293.82940673828125, + "loss": 0.136, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.678800106048584, + "rewards/margins": 0.08193562924861908, + "rewards/rejected": -0.7607358694076538, + "step": 5010 + }, + { + "epoch": 0.6, + "learning_rate": 2.046530460638687e-06, + "logits/chosen": -2.099050760269165, + "logits/rejected": -1.724473237991333, + "logps/chosen": -318.06732177734375, + "logps/rejected": -279.4828796386719, + "loss": 0.1347, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5295699834823608, + "rewards/margins": 0.18352551758289337, + "rewards/rejected": -0.7130955457687378, + "step": 5020 + }, + { + "epoch": 0.6, + "learning_rate": 2.036237578226761e-06, + "logits/chosen": -1.7614809274673462, + "logits/rejected": -1.4049193859100342, + "logps/chosen": -224.44375610351562, + "logps/rejected": -221.57180786132812, + "loss": 0.1367, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5156166553497314, + "rewards/margins": 0.1933528184890747, + "rewards/rejected": -0.7089694738388062, + "step": 5030 + }, + { + "epoch": 0.6, + "learning_rate": 2.0259528307919385e-06, + "logits/chosen": -2.024557590484619, + "logits/rejected": -1.4408557415008545, + "logps/chosen": -298.25, + "logps/rejected": -232.6403045654297, + "loss": 0.149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.43420344591140747, + "rewards/margins": 0.20419923961162567, + "rewards/rejected": -0.6384027004241943, + "step": 5040 + }, + { + "epoch": 0.61, + "learning_rate": 2.015676398741644e-06, + "logits/chosen": -1.9110151529312134, + "logits/rejected": -1.3820592164993286, + "logps/chosen": -328.24456787109375, + "logps/rejected": -274.53765869140625, + "loss": 0.1629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5336133241653442, + "rewards/margins": 0.20271578431129456, + "rewards/rejected": -0.7363291382789612, + "step": 5050 + }, + { + "epoch": 0.61, + "learning_rate": 2.005408462337443e-06, + "logits/chosen": -2.0011394023895264, + "logits/rejected": -1.5512298345565796, + "logps/chosen": -250.78280639648438, + "logps/rejected": -245.1263885498047, + "loss": 0.1404, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.49497905373573303, + "rewards/margins": 0.21909329295158386, + "rewards/rejected": -0.7140722870826721, + "step": 5060 + }, + { + "epoch": 0.61, + "learning_rate": 1.9951492016918745e-06, + "logits/chosen": -1.9097583293914795, + "logits/rejected": -1.5266954898834229, + "logps/chosen": -195.3928985595703, + "logps/rejected": -203.9522247314453, + "loss": 0.1487, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5452216863632202, + "rewards/margins": 0.17070701718330383, + "rewards/rejected": -0.7159286737442017, + "step": 5070 + }, + { + "epoch": 0.61, + "learning_rate": 1.984898796765294e-06, + "logits/chosen": -1.871835470199585, + "logits/rejected": -1.4449571371078491, + "logps/chosen": -198.09242248535156, + "logps/rejected": -197.41738891601562, + "loss": 0.1232, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4947785437107086, + "rewards/margins": 0.17074665427207947, + "rewards/rejected": -0.6655251979827881, + "step": 5080 + }, + { + "epoch": 0.61, + "learning_rate": 1.974657427362717e-06, + "logits/chosen": -1.817120909690857, + "logits/rejected": -1.635406494140625, + "logps/chosen": -292.73358154296875, + "logps/rejected": -272.16900634765625, + "loss": 0.1009, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5638500452041626, + "rewards/margins": 0.10627535730600357, + "rewards/rejected": -0.670125424861908, + "step": 5090 + }, + { + "epoch": 0.61, + "learning_rate": 1.9644252731306653e-06, + "logits/chosen": -1.8737514019012451, + "logits/rejected": -1.5180460214614868, + "logps/chosen": -367.1842956542969, + "logps/rejected": -313.0085144042969, + "loss": 0.1218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6011655330657959, + "rewards/margins": 0.1648952215909958, + "rewards/rejected": -0.7660607099533081, + "step": 5100 + }, + { + "epoch": 0.61, + "learning_rate": 1.954202513554013e-06, + "logits/chosen": -1.9836708307266235, + "logits/rejected": -1.797654390335083, + "logps/chosen": -243.0941162109375, + "logps/rejected": -266.05535888671875, + "loss": 0.156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4675242006778717, + "rewards/margins": 0.17585307359695435, + "rewards/rejected": -0.6433773636817932, + "step": 5110 + }, + { + "epoch": 0.61, + "learning_rate": 1.943989327952841e-06, + "logits/chosen": -2.0395166873931885, + "logits/rejected": -1.6357864141464233, + "logps/chosen": -350.53411865234375, + "logps/rejected": -324.0770568847656, + "loss": 0.0882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5562902092933655, + "rewards/margins": 0.15983954071998596, + "rewards/rejected": -0.7161296606063843, + "step": 5120 + }, + { + "epoch": 0.62, + "learning_rate": 1.9337858954792917e-06, + "logits/chosen": -1.8916152715682983, + "logits/rejected": -1.6612498760223389, + "logps/chosen": -255.1393280029297, + "logps/rejected": -268.5872802734375, + "loss": 0.1109, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5940229892730713, + "rewards/margins": 0.24639251828193665, + "rewards/rejected": -0.8404154777526855, + "step": 5130 + }, + { + "epoch": 0.62, + "learning_rate": 1.9235923951144246e-06, + "logits/chosen": -1.9813220500946045, + "logits/rejected": -1.6410853862762451, + "logps/chosen": -291.50323486328125, + "logps/rejected": -276.9924011230469, + "loss": 0.1087, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.531834602355957, + "rewards/margins": 0.15474644303321838, + "rewards/rejected": -0.686581015586853, + "step": 5140 + }, + { + "epoch": 0.62, + "learning_rate": 1.9134090056650764e-06, + "logits/chosen": -2.085635185241699, + "logits/rejected": -1.7884467840194702, + "logps/chosen": -284.26025390625, + "logps/rejected": -268.23699951171875, + "loss": 0.1903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5530114769935608, + "rewards/margins": 0.16405172646045685, + "rewards/rejected": -0.7170631885528564, + "step": 5150 + }, + { + "epoch": 0.62, + "learning_rate": 1.9032359057607272e-06, + "logits/chosen": -2.1484217643737793, + "logits/rejected": -1.5867314338684082, + "logps/chosen": -338.3865661621094, + "logps/rejected": -318.114990234375, + "loss": 0.0867, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4841574728488922, + "rewards/margins": 0.2181321382522583, + "rewards/rejected": -0.7022895812988281, + "step": 5160 + }, + { + "epoch": 0.62, + "learning_rate": 1.8930732738503652e-06, + "logits/chosen": -1.991681694984436, + "logits/rejected": -1.7101647853851318, + "logps/chosen": -253.91015625, + "logps/rejected": -210.7447052001953, + "loss": 0.1055, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5010813474655151, + "rewards/margins": 0.16278687119483948, + "rewards/rejected": -0.6638683080673218, + "step": 5170 + }, + { + "epoch": 0.62, + "learning_rate": 1.8829212881993553e-06, + "logits/chosen": -2.1374099254608154, + "logits/rejected": -1.8165279626846313, + "logps/chosen": -280.1686096191406, + "logps/rejected": -253.91220092773438, + "loss": 0.0657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.49169284105300903, + "rewards/margins": 0.12237177044153214, + "rewards/rejected": -0.6140645742416382, + "step": 5180 + }, + { + "epoch": 0.62, + "learning_rate": 1.872780126886316e-06, + "logits/chosen": -2.1169066429138184, + "logits/rejected": -1.6588671207427979, + "logps/chosen": -282.9520568847656, + "logps/rejected": -245.0965118408203, + "loss": 0.1101, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4680394232273102, + "rewards/margins": 0.19195261597633362, + "rewards/rejected": -0.659991979598999, + "step": 5190 + }, + { + "epoch": 0.62, + "learning_rate": 1.8626499677999915e-06, + "logits/chosen": -1.8921172618865967, + "logits/rejected": -1.8109171390533447, + "logps/chosen": -260.7506408691406, + "logps/rejected": -277.9801025390625, + "loss": 0.132, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5560612082481384, + "rewards/margins": 0.13018205761909485, + "rewards/rejected": -0.6862432360649109, + "step": 5200 + }, + { + "epoch": 0.63, + "learning_rate": 1.8525309886361332e-06, + "logits/chosen": -1.9643144607543945, + "logits/rejected": -1.4548659324645996, + "logps/chosen": -215.2502899169922, + "logps/rejected": -212.14553833007812, + "loss": 0.1047, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.5360875725746155, + "rewards/margins": 0.2591664493083954, + "rewards/rejected": -0.7952540516853333, + "step": 5210 + }, + { + "epoch": 0.63, + "learning_rate": 1.8424233668943844e-06, + "logits/chosen": -1.8108913898468018, + "logits/rejected": -1.6711933612823486, + "logps/chosen": -245.8279571533203, + "logps/rejected": -260.3638610839844, + "loss": 0.1462, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5376258492469788, + "rewards/margins": 0.17290274798870087, + "rewards/rejected": -0.710528552532196, + "step": 5220 + }, + { + "epoch": 0.63, + "learning_rate": 1.8323272798751629e-06, + "logits/chosen": -1.8469750881195068, + "logits/rejected": -1.6417795419692993, + "logps/chosen": -267.5218811035156, + "logps/rejected": -255.52890014648438, + "loss": 0.1275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6079937219619751, + "rewards/margins": 0.1432160884141922, + "rewards/rejected": -0.7512098550796509, + "step": 5230 + }, + { + "epoch": 0.63, + "learning_rate": 1.822242904676552e-06, + "logits/chosen": -1.8902513980865479, + "logits/rejected": -1.6447397470474243, + "logps/chosen": -224.96902465820312, + "logps/rejected": -236.59140014648438, + "loss": 0.0791, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5813908576965332, + "rewards/margins": 0.11333571374416351, + "rewards/rejected": -0.6947265863418579, + "step": 5240 + }, + { + "epoch": 0.63, + "learning_rate": 1.8121704181911989e-06, + "logits/chosen": -2.0475192070007324, + "logits/rejected": -1.7533676624298096, + "logps/chosen": -322.51141357421875, + "logps/rejected": -286.36376953125, + "loss": 0.1171, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5761255025863647, + "rewards/margins": 0.12248637527227402, + "rewards/rejected": -0.698611855506897, + "step": 5250 + }, + { + "epoch": 0.63, + "learning_rate": 1.8021099971032046e-06, + "logits/chosen": -1.731256127357483, + "logits/rejected": -1.3307876586914062, + "logps/chosen": -248.487060546875, + "logps/rejected": -219.0608367919922, + "loss": 0.1045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5520157217979431, + "rewards/margins": 0.1579941362142563, + "rewards/rejected": -0.7100099325180054, + "step": 5260 + }, + { + "epoch": 0.63, + "learning_rate": 1.7920618178850269e-06, + "logits/chosen": -2.0428383350372314, + "logits/rejected": -1.7901527881622314, + "logps/chosen": -307.78533935546875, + "logps/rejected": -285.2147521972656, + "loss": 0.1015, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5738735795021057, + "rewards/margins": 0.16959087550640106, + "rewards/rejected": -0.743464469909668, + "step": 5270 + }, + { + "epoch": 0.63, + "learning_rate": 1.7820260567943904e-06, + "logits/chosen": -1.903365135192871, + "logits/rejected": -1.7534162998199463, + "logps/chosen": -185.64505004882812, + "logps/rejected": -201.76344299316406, + "loss": 0.1676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.574264407157898, + "rewards/margins": 0.13275280594825745, + "rewards/rejected": -0.7070172429084778, + "step": 5280 + }, + { + "epoch": 0.63, + "learning_rate": 1.7720028898711852e-06, + "logits/chosen": -1.8674421310424805, + "logits/rejected": -1.413944959640503, + "logps/chosen": -263.2582092285156, + "logps/rejected": -235.0648956298828, + "loss": 0.137, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.491985946893692, + "rewards/margins": 0.17603492736816406, + "rewards/rejected": -0.6680207848548889, + "step": 5290 + }, + { + "epoch": 0.64, + "learning_rate": 1.7619924929343857e-06, + "logits/chosen": -1.919923186302185, + "logits/rejected": -1.7005188465118408, + "logps/chosen": -273.3664855957031, + "logps/rejected": -306.89056396484375, + "loss": 0.1344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4788680076599121, + "rewards/margins": 0.18705452978610992, + "rewards/rejected": -0.6659225225448608, + "step": 5300 + }, + { + "epoch": 0.64, + "learning_rate": 1.7519950415789661e-06, + "logits/chosen": -1.7585570812225342, + "logits/rejected": -1.5700247287750244, + "logps/chosen": -251.32113647460938, + "logps/rejected": -301.25079345703125, + "loss": 0.1677, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4546022415161133, + "rewards/margins": 0.16988904774188995, + "rewards/rejected": -0.624491274356842, + "step": 5310 + }, + { + "epoch": 0.64, + "learning_rate": 1.7420107111728167e-06, + "logits/chosen": -1.8963468074798584, + "logits/rejected": -1.7362186908721924, + "logps/chosen": -206.5269317626953, + "logps/rejected": -223.75009155273438, + "loss": 0.08, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.39686816930770874, + "rewards/margins": 0.15394839644432068, + "rewards/rejected": -0.550816535949707, + "step": 5320 + }, + { + "epoch": 0.64, + "learning_rate": 1.7320396768536695e-06, + "logits/chosen": -1.9675251245498657, + "logits/rejected": -1.5709375143051147, + "logps/chosen": -268.6692199707031, + "logps/rejected": -247.16464233398438, + "loss": 0.0934, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4870396554470062, + "rewards/margins": 0.1673925369977951, + "rewards/rejected": -0.6544321775436401, + "step": 5330 + }, + { + "epoch": 0.64, + "learning_rate": 1.7220821135260301e-06, + "logits/chosen": -1.836387038230896, + "logits/rejected": -1.3881337642669678, + "logps/chosen": -240.44357299804688, + "logps/rejected": -221.38931274414062, + "loss": 0.0791, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5035630464553833, + "rewards/margins": 0.15110139548778534, + "rewards/rejected": -0.6546644568443298, + "step": 5340 + }, + { + "epoch": 0.64, + "learning_rate": 1.7121381958581018e-06, + "logits/chosen": -2.0114264488220215, + "logits/rejected": -1.6229709386825562, + "logps/chosen": -298.6057434082031, + "logps/rejected": -229.20565795898438, + "loss": 0.1362, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5573118925094604, + "rewards/margins": 0.11680523306131363, + "rewards/rejected": -0.6741170883178711, + "step": 5350 + }, + { + "epoch": 0.64, + "learning_rate": 1.7022080982787259e-06, + "logits/chosen": -1.8884429931640625, + "logits/rejected": -1.5066940784454346, + "logps/chosen": -274.16937255859375, + "logps/rejected": -251.51052856445312, + "loss": 0.1055, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5535235404968262, + "rewards/margins": 0.13713331520557404, + "rewards/rejected": -0.6906567811965942, + "step": 5360 + }, + { + "epoch": 0.64, + "learning_rate": 1.692291994974326e-06, + "logits/chosen": -1.9273033142089844, + "logits/rejected": -1.4788744449615479, + "logps/chosen": -301.24896240234375, + "logps/rejected": -266.068115234375, + "loss": 0.1223, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4468786120414734, + "rewards/margins": 0.1857236623764038, + "rewards/rejected": -0.6326022148132324, + "step": 5370 + }, + { + "epoch": 0.65, + "learning_rate": 1.682390059885845e-06, + "logits/chosen": -1.962938904762268, + "logits/rejected": -1.4628360271453857, + "logps/chosen": -287.84283447265625, + "logps/rejected": -222.7774658203125, + "loss": 0.0968, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5061804056167603, + "rewards/margins": 0.22085240483283997, + "rewards/rejected": -0.7270327806472778, + "step": 5380 + }, + { + "epoch": 0.65, + "learning_rate": 1.6725024667056965e-06, + "logits/chosen": -1.8040755987167358, + "logits/rejected": -1.4079840183258057, + "logps/chosen": -270.97686767578125, + "logps/rejected": -205.47982788085938, + "loss": 0.1878, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4969252645969391, + "rewards/margins": 0.0869857668876648, + "rewards/rejected": -0.5839110612869263, + "step": 5390 + }, + { + "epoch": 0.65, + "learning_rate": 1.6626293888747238e-06, + "logits/chosen": -1.9853794574737549, + "logits/rejected": -1.4653469324111938, + "logps/chosen": -268.05926513671875, + "logps/rejected": -262.09197998046875, + "loss": 0.1083, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47049784660339355, + "rewards/margins": 0.1899462640285492, + "rewards/rejected": -0.6604441404342651, + "step": 5400 + }, + { + "epoch": 0.65, + "learning_rate": 1.652770999579148e-06, + "logits/chosen": -1.9712364673614502, + "logits/rejected": -1.6718246936798096, + "logps/chosen": -248.3401336669922, + "logps/rejected": -259.27471923828125, + "loss": 0.1179, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4843429923057556, + "rewards/margins": 0.15328797698020935, + "rewards/rejected": -0.6376310586929321, + "step": 5410 + }, + { + "epoch": 0.65, + "learning_rate": 1.6429274717475358e-06, + "logits/chosen": -1.8927046060562134, + "logits/rejected": -1.5739778280258179, + "logps/chosen": -279.84503173828125, + "logps/rejected": -235.696533203125, + "loss": 0.0874, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4386516213417053, + "rewards/margins": 0.19288429617881775, + "rewards/rejected": -0.6315358877182007, + "step": 5420 + }, + { + "epoch": 0.65, + "learning_rate": 1.6330989780477673e-06, + "logits/chosen": -1.8618462085723877, + "logits/rejected": -1.5246363878250122, + "logps/chosen": -253.5839385986328, + "logps/rejected": -251.325927734375, + "loss": 0.151, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5161840319633484, + "rewards/margins": 0.14887337386608124, + "rewards/rejected": -0.665057361125946, + "step": 5430 + }, + { + "epoch": 0.65, + "learning_rate": 1.6232856908840033e-06, + "logits/chosen": -2.285269260406494, + "logits/rejected": -1.713772177696228, + "logps/chosen": -263.0864562988281, + "logps/rejected": -217.45498657226562, + "loss": 0.0977, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4418443739414215, + "rewards/margins": 0.17126531898975372, + "rewards/rejected": -0.6131097078323364, + "step": 5440 + }, + { + "epoch": 0.65, + "learning_rate": 1.613487782393661e-06, + "logits/chosen": -1.9873231649398804, + "logits/rejected": -1.5774824619293213, + "logps/chosen": -259.41546630859375, + "logps/rejected": -270.5675048828125, + "loss": 0.122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4419049322605133, + "rewards/margins": 0.1639256477355957, + "rewards/rejected": -0.6058306097984314, + "step": 5450 + }, + { + "epoch": 0.66, + "learning_rate": 1.6037054244444007e-06, + "logits/chosen": -1.9209073781967163, + "logits/rejected": -1.675719976425171, + "logps/chosen": -248.13119506835938, + "logps/rejected": -253.267822265625, + "loss": 0.1288, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5143054127693176, + "rewards/margins": 0.1403733789920807, + "rewards/rejected": -0.6546787023544312, + "step": 5460 + }, + { + "epoch": 0.66, + "learning_rate": 1.593938788631103e-06, + "logits/chosen": -1.6965789794921875, + "logits/rejected": -1.3882747888565063, + "logps/chosen": -236.5738067626953, + "logps/rejected": -283.84869384765625, + "loss": 0.0971, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4736576974391937, + "rewards/margins": 0.199508398771286, + "rewards/rejected": -0.6731661558151245, + "step": 5470 + }, + { + "epoch": 0.66, + "learning_rate": 1.5841880462728626e-06, + "logits/chosen": -1.8833509683609009, + "logits/rejected": -1.6429067850112915, + "logps/chosen": -280.9195861816406, + "logps/rejected": -278.397705078125, + "loss": 0.1557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.46772581338882446, + "rewards/margins": 0.1507682502269745, + "rewards/rejected": -0.6184940934181213, + "step": 5480 + }, + { + "epoch": 0.66, + "learning_rate": 1.5744533684099861e-06, + "logits/chosen": -2.0979132652282715, + "logits/rejected": -1.699033498764038, + "logps/chosen": -264.59173583984375, + "logps/rejected": -252.0912628173828, + "loss": 0.164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4788171648979187, + "rewards/margins": 0.12794797122478485, + "rewards/rejected": -0.6067651510238647, + "step": 5490 + }, + { + "epoch": 0.66, + "learning_rate": 1.5647349258009857e-06, + "logits/chosen": -1.7671406269073486, + "logits/rejected": -1.574204683303833, + "logps/chosen": -282.95458984375, + "logps/rejected": -298.6538391113281, + "loss": 0.0751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5590416193008423, + "rewards/margins": 0.1640872210264206, + "rewards/rejected": -0.7231289148330688, + "step": 5500 + }, + { + "epoch": 0.66, + "learning_rate": 1.555032888919586e-06, + "logits/chosen": -1.686753511428833, + "logits/rejected": -1.4295735359191895, + "logps/chosen": -264.7411804199219, + "logps/rejected": -253.6551971435547, + "loss": 0.2432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5408766269683838, + "rewards/margins": 0.15903475880622864, + "rewards/rejected": -0.69991135597229, + "step": 5510 + }, + { + "epoch": 0.66, + "learning_rate": 1.5453474279517383e-06, + "logits/chosen": -1.805437684059143, + "logits/rejected": -1.6262556314468384, + "logps/chosen": -240.7580108642578, + "logps/rejected": -236.3534393310547, + "loss": 0.1229, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.500167727470398, + "rewards/margins": 0.12385289371013641, + "rewards/rejected": -0.6240206360816956, + "step": 5520 + }, + { + "epoch": 0.66, + "learning_rate": 1.5356787127926285e-06, + "logits/chosen": -1.9110714197158813, + "logits/rejected": -1.4245280027389526, + "logps/chosen": -316.56072998046875, + "logps/rejected": -266.979248046875, + "loss": 0.0896, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4973506033420563, + "rewards/margins": 0.21019454300403595, + "rewards/rejected": -0.7075451612472534, + "step": 5530 + }, + { + "epoch": 0.66, + "learning_rate": 1.526026913043699e-06, + "logits/chosen": -1.7721723318099976, + "logits/rejected": -1.6717960834503174, + "logps/chosen": -231.5484161376953, + "logps/rejected": -236.1089324951172, + "loss": 0.0822, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5355015397071838, + "rewards/margins": 0.13346286118030548, + "rewards/rejected": -0.6689643859863281, + "step": 5540 + }, + { + "epoch": 0.67, + "learning_rate": 1.5163921980096791e-06, + "logits/chosen": -1.8417619466781616, + "logits/rejected": -1.7490192651748657, + "logps/chosen": -259.3507080078125, + "logps/rejected": -285.2629699707031, + "loss": 0.1187, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5752219557762146, + "rewards/margins": 0.11573759466409683, + "rewards/rejected": -0.6909595727920532, + "step": 5550 + }, + { + "epoch": 0.67, + "learning_rate": 1.5067747366956065e-06, + "logits/chosen": -2.117729663848877, + "logits/rejected": -1.7773869037628174, + "logps/chosen": -260.44342041015625, + "logps/rejected": -221.59371948242188, + "loss": 0.1647, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49646344780921936, + "rewards/margins": 0.16098496317863464, + "rewards/rejected": -0.657448410987854, + "step": 5560 + }, + { + "epoch": 0.67, + "learning_rate": 1.4971746978038671e-06, + "logits/chosen": -1.8527145385742188, + "logits/rejected": -1.7836803197860718, + "logps/chosen": -257.3758850097656, + "logps/rejected": -278.2964782714844, + "loss": 0.1172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4473143517971039, + "rewards/margins": 0.14970967173576355, + "rewards/rejected": -0.5970240235328674, + "step": 5570 + }, + { + "epoch": 0.67, + "learning_rate": 1.4875922497312384e-06, + "logits/chosen": -1.802384376525879, + "logits/rejected": -1.3964884281158447, + "logps/chosen": -257.4482421875, + "logps/rejected": -256.9828796386719, + "loss": 0.0714, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4673423767089844, + "rewards/margins": 0.22057469189167023, + "rewards/rejected": -0.6879170536994934, + "step": 5580 + }, + { + "epoch": 0.67, + "learning_rate": 1.4780275605659308e-06, + "logits/chosen": -1.9443477392196655, + "logits/rejected": -1.470523476600647, + "logps/chosen": -216.4662628173828, + "logps/rejected": -213.83154296875, + "loss": 0.1033, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.47214627265930176, + "rewards/margins": 0.24475538730621338, + "rewards/rejected": -0.7169016599655151, + "step": 5590 + }, + { + "epoch": 0.67, + "learning_rate": 1.46848079808464e-06, + "logits/chosen": -1.811112642288208, + "logits/rejected": -1.5633313655853271, + "logps/chosen": -286.9377746582031, + "logps/rejected": -256.46368408203125, + "loss": 0.1492, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4783618450164795, + "rewards/margins": 0.16989843547344208, + "rewards/rejected": -0.648260235786438, + "step": 5600 + }, + { + "epoch": 0.67, + "learning_rate": 1.4589521297496085e-06, + "logits/chosen": -1.9072492122650146, + "logits/rejected": -1.6674668788909912, + "logps/chosen": -287.529052734375, + "logps/rejected": -309.4631042480469, + "loss": 0.1368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.46460795402526855, + "rewards/margins": 0.12597152590751648, + "rewards/rejected": -0.5905795097351074, + "step": 5610 + }, + { + "epoch": 0.67, + "learning_rate": 1.4494417227056811e-06, + "logits/chosen": -1.9489076137542725, + "logits/rejected": -1.5660401582717896, + "logps/chosen": -236.4983673095703, + "logps/rejected": -253.90518188476562, + "loss": 0.0872, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41152167320251465, + "rewards/margins": 0.21388018131256104, + "rewards/rejected": -0.6254019737243652, + "step": 5620 + }, + { + "epoch": 0.68, + "learning_rate": 1.4399497437773786e-06, + "logits/chosen": -1.9147542715072632, + "logits/rejected": -1.4943046569824219, + "logps/chosen": -272.55987548828125, + "logps/rejected": -264.3128967285156, + "loss": 0.1259, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5410269498825073, + "rewards/margins": 0.19129455089569092, + "rewards/rejected": -0.732321560382843, + "step": 5630 + }, + { + "epoch": 0.68, + "learning_rate": 1.4304763594659694e-06, + "logits/chosen": -2.035388946533203, + "logits/rejected": -1.5075829029083252, + "logps/chosen": -302.56365966796875, + "logps/rejected": -257.10601806640625, + "loss": 0.1725, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49816712737083435, + "rewards/margins": 0.18971143662929535, + "rewards/rejected": -0.6878786087036133, + "step": 5640 + }, + { + "epoch": 0.68, + "learning_rate": 1.4210217359465483e-06, + "logits/chosen": -2.001213312149048, + "logits/rejected": -1.7480404376983643, + "logps/chosen": -250.2290496826172, + "logps/rejected": -263.33099365234375, + "loss": 0.1514, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43809619545936584, + "rewards/margins": 0.09596999734640121, + "rewards/rejected": -0.5340661406517029, + "step": 5650 + }, + { + "epoch": 0.68, + "learning_rate": 1.4115860390651204e-06, + "logits/chosen": -1.8610761165618896, + "logits/rejected": -1.242117166519165, + "logps/chosen": -272.94488525390625, + "logps/rejected": -214.08584594726562, + "loss": 0.1465, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5517784953117371, + "rewards/margins": 0.17574277520179749, + "rewards/rejected": -0.7275213003158569, + "step": 5660 + }, + { + "epoch": 0.68, + "learning_rate": 1.4021694343356992e-06, + "logits/chosen": -2.0516114234924316, + "logits/rejected": -1.6413084268569946, + "logps/chosen": -243.1460723876953, + "logps/rejected": -240.1009063720703, + "loss": 0.1033, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4688890874385834, + "rewards/margins": 0.14948108792304993, + "rewards/rejected": -0.6183701753616333, + "step": 5670 + }, + { + "epoch": 0.68, + "learning_rate": 1.3927720869373912e-06, + "logits/chosen": -1.7400707006454468, + "logits/rejected": -1.5164529085159302, + "logps/chosen": -283.2667236328125, + "logps/rejected": -289.87200927734375, + "loss": 0.0951, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46899938583374023, + "rewards/margins": 0.15114405751228333, + "rewards/rejected": -0.6201435327529907, + "step": 5680 + }, + { + "epoch": 0.68, + "learning_rate": 1.383394161711509e-06, + "logits/chosen": -1.7821184396743774, + "logits/rejected": -1.5143952369689941, + "logps/chosen": -243.70639038085938, + "logps/rejected": -248.849365234375, + "loss": 0.0863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5013656616210938, + "rewards/margins": 0.18036916851997375, + "rewards/rejected": -0.6817347407341003, + "step": 5690 + }, + { + "epoch": 0.68, + "learning_rate": 1.3740358231586752e-06, + "logits/chosen": -1.9248117208480835, + "logits/rejected": -1.6384576559066772, + "logps/chosen": -238.166015625, + "logps/rejected": -213.8789825439453, + "loss": 0.1689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5257433652877808, + "rewards/margins": 0.1685757339000702, + "rewards/rejected": -0.6943190693855286, + "step": 5700 + }, + { + "epoch": 0.69, + "learning_rate": 1.3646972354359379e-06, + "logits/chosen": -2.0671422481536865, + "logits/rejected": -1.6382163763046265, + "logps/chosen": -248.79116821289062, + "logps/rejected": -239.56393432617188, + "loss": 0.1352, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5227428674697876, + "rewards/margins": 0.1494591385126114, + "rewards/rejected": -0.672201931476593, + "step": 5710 + }, + { + "epoch": 0.69, + "learning_rate": 1.3553785623538873e-06, + "logits/chosen": -1.8637508153915405, + "logits/rejected": -1.6675211191177368, + "logps/chosen": -217.97286987304688, + "logps/rejected": -246.8916015625, + "loss": 0.1261, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5041235685348511, + "rewards/margins": 0.16983681917190552, + "rewards/rejected": -0.6739604473114014, + "step": 5720 + }, + { + "epoch": 0.69, + "learning_rate": 1.346079967373792e-06, + "logits/chosen": -1.684739351272583, + "logits/rejected": -1.6077110767364502, + "logps/chosen": -205.7179718017578, + "logps/rejected": -213.598388671875, + "loss": 0.0877, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4919508099555969, + "rewards/margins": 0.15389500558376312, + "rewards/rejected": -0.6458457708358765, + "step": 5730 + }, + { + "epoch": 0.69, + "learning_rate": 1.3368016136047194e-06, + "logits/chosen": -1.7886161804199219, + "logits/rejected": -1.5077216625213623, + "logps/chosen": -311.54486083984375, + "logps/rejected": -265.2232971191406, + "loss": 0.1852, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5703347325325012, + "rewards/margins": 0.1437651813030243, + "rewards/rejected": -0.7140999436378479, + "step": 5740 + }, + { + "epoch": 0.69, + "learning_rate": 1.3275436638006838e-06, + "logits/chosen": -1.9010818004608154, + "logits/rejected": -1.6132938861846924, + "logps/chosen": -279.0166320800781, + "logps/rejected": -293.9725646972656, + "loss": 0.1197, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5455999374389648, + "rewards/margins": 0.1750425398349762, + "rewards/rejected": -0.7206425070762634, + "step": 5750 + }, + { + "epoch": 0.69, + "learning_rate": 1.3183062803577872e-06, + "logits/chosen": -1.9034898281097412, + "logits/rejected": -1.5641247034072876, + "logps/chosen": -227.96151733398438, + "logps/rejected": -215.01339721679688, + "loss": 0.1401, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.5809696316719055, + "rewards/margins": 0.2188103199005127, + "rewards/rejected": -0.7997799515724182, + "step": 5760 + }, + { + "epoch": 0.69, + "learning_rate": 1.3090896253113736e-06, + "logits/chosen": -1.8766626119613647, + "logits/rejected": -1.700510025024414, + "logps/chosen": -258.7127380371094, + "logps/rejected": -250.1974334716797, + "loss": 0.1517, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5055121183395386, + "rewards/margins": 0.1273249089717865, + "rewards/rejected": -0.6328369379043579, + "step": 5770 + }, + { + "epoch": 0.69, + "learning_rate": 1.2998938603331796e-06, + "logits/chosen": -1.8572345972061157, + "logits/rejected": -1.651341199874878, + "logps/chosen": -255.7646026611328, + "logps/rejected": -279.30694580078125, + "loss": 0.0922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6293990612030029, + "rewards/margins": 0.12281368672847748, + "rewards/rejected": -0.7522127032279968, + "step": 5780 + }, + { + "epoch": 0.69, + "learning_rate": 1.2907191467285118e-06, + "logits/chosen": -1.9182488918304443, + "logits/rejected": -1.6267467737197876, + "logps/chosen": -266.6481018066406, + "logps/rejected": -413.2779235839844, + "loss": 5.1685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47287482023239136, + "rewards/margins": 0.5273378491401672, + "rewards/rejected": -1.0002126693725586, + "step": 5790 + }, + { + "epoch": 0.7, + "learning_rate": 1.2815656454334013e-06, + "logits/chosen": -1.9395920038223267, + "logits/rejected": -1.8263431787490845, + "logps/chosen": -256.23663330078125, + "logps/rejected": -270.293701171875, + "loss": 0.1378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4917454719543457, + "rewards/margins": 0.15827788412570953, + "rewards/rejected": -0.650023341178894, + "step": 5800 + }, + { + "epoch": 0.7, + "learning_rate": 1.272433517011793e-06, + "logits/chosen": -1.9891746044158936, + "logits/rejected": -1.7108840942382812, + "logps/chosen": -311.8459167480469, + "logps/rejected": -299.76654052734375, + "loss": 0.0751, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.48035064339637756, + "rewards/margins": 0.1297578364610672, + "rewards/rejected": -0.6101084351539612, + "step": 5810 + }, + { + "epoch": 0.7, + "learning_rate": 1.2633229216527235e-06, + "logits/chosen": -1.9504735469818115, + "logits/rejected": -1.4860206842422485, + "logps/chosen": -230.38272094726562, + "logps/rejected": -224.21981811523438, + "loss": 0.1323, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4245625138282776, + "rewards/margins": 0.2091328203678131, + "rewards/rejected": -0.6336953043937683, + "step": 5820 + }, + { + "epoch": 0.7, + "learning_rate": 1.254234019167514e-06, + "logits/chosen": -1.9143810272216797, + "logits/rejected": -1.5591933727264404, + "logps/chosen": -283.6191711425781, + "logps/rejected": -264.5260009765625, + "loss": 0.0793, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4440692067146301, + "rewards/margins": 0.2215677946805954, + "rewards/rejected": -0.6656370759010315, + "step": 5830 + }, + { + "epoch": 0.7, + "learning_rate": 1.24516696898696e-06, + "logits/chosen": -1.9349048137664795, + "logits/rejected": -1.555820345878601, + "logps/chosen": -267.323486328125, + "logps/rejected": -289.51947021484375, + "loss": 0.1085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5123074054718018, + "rewards/margins": 0.20973484218120575, + "rewards/rejected": -0.7220422029495239, + "step": 5840 + }, + { + "epoch": 0.7, + "learning_rate": 1.2361219301585487e-06, + "logits/chosen": -2.0896143913269043, + "logits/rejected": -1.63728928565979, + "logps/chosen": -277.4181213378906, + "logps/rejected": -244.1571044921875, + "loss": 0.111, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4161832332611084, + "rewards/margins": 0.1889437586069107, + "rewards/rejected": -0.6051269769668579, + "step": 5850 + }, + { + "epoch": 0.7, + "learning_rate": 1.2270990613436522e-06, + "logits/chosen": -1.9406229257583618, + "logits/rejected": -1.6228440999984741, + "logps/chosen": -229.52969360351562, + "logps/rejected": -256.2364196777344, + "loss": 0.1576, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4235209822654724, + "rewards/margins": 0.12438831478357315, + "rewards/rejected": -0.5479093194007874, + "step": 5860 + }, + { + "epoch": 0.7, + "learning_rate": 1.2180985208147571e-06, + "logits/chosen": -1.9510200023651123, + "logits/rejected": -1.7511215209960938, + "logps/chosen": -213.273681640625, + "logps/rejected": -237.1942596435547, + "loss": 0.0932, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39561378955841064, + "rewards/margins": 0.13690926134586334, + "rewards/rejected": -0.5325230360031128, + "step": 5870 + }, + { + "epoch": 0.71, + "learning_rate": 1.2091204664526831e-06, + "logits/chosen": -2.0162253379821777, + "logits/rejected": -1.6138890981674194, + "logps/chosen": -287.21746826171875, + "logps/rejected": -234.35693359375, + "loss": 0.184, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4287000596523285, + "rewards/margins": 0.1497940570116043, + "rewards/rejected": -0.578494131565094, + "step": 5880 + }, + { + "epoch": 0.71, + "learning_rate": 1.2001650557438143e-06, + "logits/chosen": -2.028672695159912, + "logits/rejected": -1.7924978733062744, + "logps/chosen": -294.2822265625, + "logps/rejected": -278.37652587890625, + "loss": 0.1596, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44275110960006714, + "rewards/margins": 0.11968086659908295, + "rewards/rejected": -0.5624319911003113, + "step": 5890 + }, + { + "epoch": 0.71, + "learning_rate": 1.1912324457773336e-06, + "logits/chosen": -2.0378835201263428, + "logits/rejected": -1.7147912979125977, + "logps/chosen": -240.72216796875, + "logps/rejected": -268.4493713378906, + "loss": 0.0914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39333948493003845, + "rewards/margins": 0.19647042453289032, + "rewards/rejected": -0.5898098945617676, + "step": 5900 + }, + { + "epoch": 0.71, + "learning_rate": 1.182322793242476e-06, + "logits/chosen": -2.0833020210266113, + "logits/rejected": -1.8951669931411743, + "logps/chosen": -225.5447540283203, + "logps/rejected": -256.537109375, + "loss": 0.1383, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.431450754404068, + "rewards/margins": 0.16372133791446686, + "rewards/rejected": -0.5951720476150513, + "step": 5910 + }, + { + "epoch": 0.71, + "learning_rate": 1.1734362544257686e-06, + "logits/chosen": -1.8667224645614624, + "logits/rejected": -1.4377329349517822, + "logps/chosen": -305.6443786621094, + "logps/rejected": -238.24801635742188, + "loss": 0.1093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3690487742424011, + "rewards/margins": 0.14427319169044495, + "rewards/rejected": -0.5133219957351685, + "step": 5920 + }, + { + "epoch": 0.71, + "learning_rate": 1.1645729852082977e-06, + "logits/chosen": -2.2229387760162354, + "logits/rejected": -1.7280595302581787, + "logps/chosen": -246.1704559326172, + "logps/rejected": -238.6482391357422, + "loss": 0.1897, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4612872004508972, + "rewards/margins": 0.18152639269828796, + "rewards/rejected": -0.6428135633468628, + "step": 5930 + }, + { + "epoch": 0.71, + "learning_rate": 1.1557331410629708e-06, + "logits/chosen": -2.273224115371704, + "logits/rejected": -1.5956547260284424, + "logps/chosen": -271.32366943359375, + "logps/rejected": -216.1154327392578, + "loss": 0.1183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39185529947280884, + "rewards/margins": 0.1586187481880188, + "rewards/rejected": -0.5504740476608276, + "step": 5940 + }, + { + "epoch": 0.71, + "learning_rate": 1.1469168770517913e-06, + "logits/chosen": -2.2409141063690186, + "logits/rejected": -1.7641382217407227, + "logps/chosen": -249.9724578857422, + "logps/rejected": -213.88729858398438, + "loss": 0.123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4436149001121521, + "rewards/margins": 0.15342697501182556, + "rewards/rejected": -0.5970418453216553, + "step": 5950 + }, + { + "epoch": 0.72, + "learning_rate": 1.1381243478231336e-06, + "logits/chosen": -2.1302855014801025, + "logits/rejected": -1.7275043725967407, + "logps/chosen": -295.96807861328125, + "logps/rejected": -232.9789276123047, + "loss": 0.1576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3948201537132263, + "rewards/margins": 0.1410309374332428, + "rewards/rejected": -0.5358511209487915, + "step": 5960 + }, + { + "epoch": 0.72, + "learning_rate": 1.1293557076090403e-06, + "logits/chosen": -2.055603504180908, + "logits/rejected": -1.6448357105255127, + "logps/chosen": -270.03485107421875, + "logps/rejected": -257.23333740234375, + "loss": 0.1255, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3696318566799164, + "rewards/margins": 0.16263523697853088, + "rewards/rejected": -0.5322670936584473, + "step": 5970 + }, + { + "epoch": 0.72, + "learning_rate": 1.1206111102225043e-06, + "logits/chosen": -2.016026020050049, + "logits/rejected": -1.7732445001602173, + "logps/chosen": -322.04571533203125, + "logps/rejected": -314.3202819824219, + "loss": 0.1256, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.43497371673583984, + "rewards/margins": 0.15854512155056, + "rewards/rejected": -0.593518853187561, + "step": 5980 + }, + { + "epoch": 0.72, + "learning_rate": 1.1118907090547805e-06, + "logits/chosen": -2.1408379077911377, + "logits/rejected": -1.6741759777069092, + "logps/chosen": -290.5514221191406, + "logps/rejected": -262.75262451171875, + "loss": 0.099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.45133018493652344, + "rewards/margins": 0.22001805901527405, + "rewards/rejected": -0.6713482737541199, + "step": 5990 + }, + { + "epoch": 0.72, + "learning_rate": 1.1031946570726912e-06, + "logits/chosen": -1.9711778163909912, + "logits/rejected": -1.855182409286499, + "logps/chosen": -278.8828430175781, + "logps/rejected": -290.8421325683594, + "loss": 0.1164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5591145157814026, + "rewards/margins": 0.11836342513561249, + "rewards/rejected": -0.6774778962135315, + "step": 6000 + }, + { + "epoch": 0.72, + "learning_rate": 1.094523106815944e-06, + "logits/chosen": -1.855329155921936, + "logits/rejected": -1.5584124326705933, + "logps/chosen": -267.0565185546875, + "logps/rejected": -273.24310302734375, + "loss": 0.1452, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.44338005781173706, + "rewards/margins": 0.17573294043540955, + "rewards/rejected": -0.6191130876541138, + "step": 6010 + }, + { + "epoch": 0.72, + "learning_rate": 1.0858762103944511e-06, + "logits/chosen": -1.87862229347229, + "logits/rejected": -1.6584688425064087, + "logps/chosen": -280.62738037109375, + "logps/rejected": -262.09942626953125, + "loss": 0.125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5238662362098694, + "rewards/margins": 0.09647830575704575, + "rewards/rejected": -0.6203445196151733, + "step": 6020 + }, + { + "epoch": 0.72, + "learning_rate": 1.0772541194856732e-06, + "logits/chosen": -2.123035430908203, + "logits/rejected": -1.6155083179473877, + "logps/chosen": -321.01776123046875, + "logps/rejected": -279.3617858886719, + "loss": 0.0745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.42041999101638794, + "rewards/margins": 0.16421575844287872, + "rewards/rejected": -0.5846357941627502, + "step": 6030 + }, + { + "epoch": 0.72, + "learning_rate": 1.068656985331943e-06, + "logits/chosen": -1.9696871042251587, + "logits/rejected": -1.6894041299819946, + "logps/chosen": -253.11233520507812, + "logps/rejected": -267.6400146484375, + "loss": 0.1156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48150572180747986, + "rewards/margins": 0.17806780338287354, + "rewards/rejected": -0.6595736145973206, + "step": 6040 + }, + { + "epoch": 0.73, + "learning_rate": 1.060084958737825e-06, + "logits/chosen": -1.990724802017212, + "logits/rejected": -1.446575403213501, + "logps/chosen": -237.4898681640625, + "logps/rejected": -233.758056640625, + "loss": 0.1, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3963894546031952, + "rewards/margins": 0.20723167061805725, + "rewards/rejected": -0.6036210656166077, + "step": 6050 + }, + { + "epoch": 0.73, + "learning_rate": 1.0515381900674643e-06, + "logits/chosen": -2.1221401691436768, + "logits/rejected": -1.7900186777114868, + "logps/chosen": -257.96722412109375, + "logps/rejected": -280.8104553222656, + "loss": 0.1199, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4584527611732483, + "rewards/margins": 0.16997918486595154, + "rewards/rejected": -0.6284319162368774, + "step": 6060 + }, + { + "epoch": 0.73, + "learning_rate": 1.04301682924195e-06, + "logits/chosen": -1.9626433849334717, + "logits/rejected": -1.4727249145507812, + "logps/chosen": -207.9431915283203, + "logps/rejected": -195.52149963378906, + "loss": 0.1507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4559662938117981, + "rewards/margins": 0.22094134986400604, + "rewards/rejected": -0.6769076585769653, + "step": 6070 + }, + { + "epoch": 0.73, + "learning_rate": 1.034521025736686e-06, + "logits/chosen": -1.9479316473007202, + "logits/rejected": -1.5621986389160156, + "logps/chosen": -225.77102661132812, + "logps/rejected": -238.942138671875, + "loss": 0.1602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4776443839073181, + "rewards/margins": 0.16333012282848358, + "rewards/rejected": -0.6409745216369629, + "step": 6080 + }, + { + "epoch": 0.73, + "learning_rate": 1.0260509285787694e-06, + "logits/chosen": -2.129117250442505, + "logits/rejected": -1.501849889755249, + "logps/chosen": -253.2616729736328, + "logps/rejected": -219.7965545654297, + "loss": 0.1035, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4091704785823822, + "rewards/margins": 0.18820294737815857, + "rewards/rejected": -0.597373366355896, + "step": 6090 + }, + { + "epoch": 0.73, + "learning_rate": 1.0176066863443726e-06, + "logits/chosen": -1.8864481449127197, + "logits/rejected": -1.5883742570877075, + "logps/chosen": -257.90185546875, + "logps/rejected": -220.6878662109375, + "loss": 0.1418, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45451027154922485, + "rewards/margins": 0.12341739982366562, + "rewards/rejected": -0.5779277086257935, + "step": 6100 + }, + { + "epoch": 0.73, + "learning_rate": 1.0091884471561424e-06, + "logits/chosen": -1.8764444589614868, + "logits/rejected": -1.63616943359375, + "logps/chosen": -261.2283020019531, + "logps/rejected": -244.85128784179688, + "loss": 0.1435, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4946800172328949, + "rewards/margins": 0.1375671923160553, + "rewards/rejected": -0.632247269153595, + "step": 6110 + }, + { + "epoch": 0.73, + "learning_rate": 1.0007963586806e-06, + "logits/chosen": -1.905747652053833, + "logits/rejected": -1.4888708591461182, + "logps/chosen": -284.759033203125, + "logps/rejected": -269.97320556640625, + "loss": 0.1243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5422388315200806, + "rewards/margins": 0.15825456380844116, + "rewards/rejected": -0.700493335723877, + "step": 6120 + }, + { + "epoch": 0.74, + "learning_rate": 9.924305681255484e-07, + "logits/chosen": -1.8924305438995361, + "logits/rejected": -1.4120725393295288, + "logps/chosen": -261.49359130859375, + "logps/rejected": -246.7588653564453, + "loss": 0.1309, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4799385964870453, + "rewards/margins": 0.22691066563129425, + "rewards/rejected": -0.7068492770195007, + "step": 6130 + }, + { + "epoch": 0.74, + "learning_rate": 9.840912222374932e-07, + "logits/chosen": -2.065091609954834, + "logits/rejected": -1.551511287689209, + "logps/chosen": -271.27764892578125, + "logps/rejected": -250.86550903320312, + "loss": 0.1203, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.465266615152359, + "rewards/margins": 0.19768501818180084, + "rewards/rejected": -0.6629515886306763, + "step": 6140 + }, + { + "epoch": 0.74, + "learning_rate": 9.757784672990668e-07, + "logits/chosen": -1.8214833736419678, + "logits/rejected": -1.3881083726882935, + "logps/chosen": -256.01123046875, + "logps/rejected": -230.39895629882812, + "loss": 0.1763, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5507286787033081, + "rewards/margins": 0.19404050707817078, + "rewards/rejected": -0.7447691559791565, + "step": 6150 + }, + { + "epoch": 0.74, + "learning_rate": 9.674924491264632e-07, + "logits/chosen": -1.8416248559951782, + "logits/rejected": -1.6367276906967163, + "logps/chosen": -219.26351928710938, + "logps/rejected": -215.86849975585938, + "loss": 0.1109, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4397053122520447, + "rewards/margins": 0.14263916015625, + "rewards/rejected": -0.5823444724082947, + "step": 6160 + }, + { + "epoch": 0.74, + "learning_rate": 9.59233313066878e-07, + "logits/chosen": -2.089197874069214, + "logits/rejected": -1.6182596683502197, + "logps/chosen": -260.54376220703125, + "logps/rejected": -253.81912231445312, + "loss": 0.1007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44585880637168884, + "rewards/margins": 0.17450013756752014, + "rewards/rejected": -0.620358943939209, + "step": 6170 + }, + { + "epoch": 0.74, + "learning_rate": 9.510012039959632e-07, + "logits/chosen": -1.9944250583648682, + "logits/rejected": -1.6469684839248657, + "logps/chosen": -285.20440673828125, + "logps/rejected": -260.31121826171875, + "loss": 0.1213, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4487648904323578, + "rewards/margins": 0.14048054814338684, + "rewards/rejected": -0.5892454981803894, + "step": 6180 + }, + { + "epoch": 0.74, + "learning_rate": 9.427962663152821e-07, + "logits/chosen": -1.9396718740463257, + "logits/rejected": -1.632127046585083, + "logps/chosen": -305.9094543457031, + "logps/rejected": -257.2548828125, + "loss": 0.1065, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.46664246916770935, + "rewards/margins": 0.15920694172382355, + "rewards/rejected": -0.6258494257926941, + "step": 6190 + }, + { + "epoch": 0.74, + "learning_rate": 9.346186439497778e-07, + "logits/chosen": -1.9716598987579346, + "logits/rejected": -1.637711524963379, + "logps/chosen": -238.27755737304688, + "logps/rejected": -227.07540893554688, + "loss": 0.1825, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.43801671266555786, + "rewards/margins": 0.13967742025852203, + "rewards/rejected": -0.5776941180229187, + "step": 6200 + }, + { + "epoch": 0.75, + "learning_rate": 9.264684803452484e-07, + "logits/chosen": -1.9573974609375, + "logits/rejected": -1.6610110998153687, + "logps/chosen": -292.9803771972656, + "logps/rejected": -284.8642578125, + "loss": 0.1376, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4125700891017914, + "rewards/margins": 0.1055794209241867, + "rewards/rejected": -0.5181494951248169, + "step": 6210 + }, + { + "epoch": 0.75, + "learning_rate": 9.183459184658317e-07, + "logits/chosen": -1.8743131160736084, + "logits/rejected": -1.5913054943084717, + "logps/chosen": -259.7023010253906, + "logps/rejected": -258.9094543457031, + "loss": 0.105, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.45578956604003906, + "rewards/margins": 0.11757218837738037, + "rewards/rejected": -0.5733617544174194, + "step": 6220 + }, + { + "epoch": 0.75, + "learning_rate": 9.102511007914924e-07, + "logits/chosen": -1.9219213724136353, + "logits/rejected": -1.4633852243423462, + "logps/chosen": -210.48403930664062, + "logps/rejected": -200.42410278320312, + "loss": 0.126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4389539659023285, + "rewards/margins": 0.202097088098526, + "rewards/rejected": -0.6410510540008545, + "step": 6230 + }, + { + "epoch": 0.75, + "learning_rate": 9.021841693155343e-07, + "logits/chosen": -2.061584234237671, + "logits/rejected": -1.6734384298324585, + "logps/chosen": -264.5237121582031, + "logps/rejected": -252.5099334716797, + "loss": 0.1233, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39105328917503357, + "rewards/margins": 0.18605628609657288, + "rewards/rejected": -0.5771095752716064, + "step": 6240 + }, + { + "epoch": 0.75, + "learning_rate": 8.94145265542094e-07, + "logits/chosen": -2.173652172088623, + "logits/rejected": -1.8842623233795166, + "logps/chosen": -311.49774169921875, + "logps/rejected": -285.50506591796875, + "loss": 0.0915, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3695235252380371, + "rewards/margins": 0.18801763653755188, + "rewards/rejected": -0.5575411915779114, + "step": 6250 + }, + { + "epoch": 0.75, + "learning_rate": 8.861345304836727e-07, + "logits/chosen": -1.937359094619751, + "logits/rejected": -1.8544925451278687, + "logps/chosen": -278.1236877441406, + "logps/rejected": -304.86962890625, + "loss": 0.1114, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5378960371017456, + "rewards/margins": 0.09365091472864151, + "rewards/rejected": -0.6315470933914185, + "step": 6260 + }, + { + "epoch": 0.75, + "learning_rate": 8.781521046586541e-07, + "logits/chosen": -1.9989734888076782, + "logits/rejected": -1.5364919900894165, + "logps/chosen": -244.04989624023438, + "logps/rejected": -233.1022491455078, + "loss": 0.136, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4168265461921692, + "rewards/margins": 0.18471379578113556, + "rewards/rejected": -0.6015402674674988, + "step": 6270 + }, + { + "epoch": 0.75, + "learning_rate": 8.701981280888444e-07, + "logits/chosen": -1.8424322605133057, + "logits/rejected": -1.6274335384368896, + "logps/chosen": -247.665771484375, + "logps/rejected": -262.58160400390625, + "loss": 0.1512, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3954276442527771, + "rewards/margins": 0.15929386019706726, + "rewards/rejected": -0.554721474647522, + "step": 6280 + }, + { + "epoch": 0.75, + "learning_rate": 8.622727402970097e-07, + "logits/chosen": -1.7672450542449951, + "logits/rejected": -1.7314002513885498, + "logps/chosen": -255.74935913085938, + "logps/rejected": -306.3473205566406, + "loss": 0.0826, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47890645265579224, + "rewards/margins": 0.13192898035049438, + "rewards/rejected": -0.6108353734016418, + "step": 6290 + }, + { + "epoch": 0.76, + "learning_rate": 8.543760803044393e-07, + "logits/chosen": -1.9199352264404297, + "logits/rejected": -1.465319275856018, + "logps/chosen": -249.533447265625, + "logps/rejected": -250.07919311523438, + "loss": 0.1434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4657590389251709, + "rewards/margins": 0.20879296958446503, + "rewards/rejected": -0.6745520830154419, + "step": 6300 + }, + { + "epoch": 0.76, + "learning_rate": 8.465082866284951e-07, + "logits/chosen": -2.023472547531128, + "logits/rejected": -1.5475150346755981, + "logps/chosen": -259.3255310058594, + "logps/rejected": -234.91415405273438, + "loss": 0.1315, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4470517635345459, + "rewards/margins": 0.19469048082828522, + "rewards/rejected": -0.6417423486709595, + "step": 6310 + }, + { + "epoch": 0.76, + "learning_rate": 8.386694972801904e-07, + "logits/chosen": -1.8993467092514038, + "logits/rejected": -1.540050745010376, + "logps/chosen": -270.31182861328125, + "logps/rejected": -249.36618041992188, + "loss": 0.1409, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4989251494407654, + "rewards/margins": 0.16831564903259277, + "rewards/rejected": -0.6672407984733582, + "step": 6320 + }, + { + "epoch": 0.76, + "learning_rate": 8.308598497617648e-07, + "logits/chosen": -1.906795859336853, + "logits/rejected": -1.6299690008163452, + "logps/chosen": -175.07479858398438, + "logps/rejected": -191.7357635498047, + "loss": 0.0714, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4141884446144104, + "rewards/margins": 0.1404605209827423, + "rewards/rejected": -0.5546489953994751, + "step": 6330 + }, + { + "epoch": 0.76, + "learning_rate": 8.230794810642753e-07, + "logits/chosen": -1.9722293615341187, + "logits/rejected": -1.5651670694351196, + "logps/chosen": -290.1768798828125, + "logps/rejected": -253.2939910888672, + "loss": 0.1167, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4297245442867279, + "rewards/margins": 0.1225515827536583, + "rewards/rejected": -0.5522761344909668, + "step": 6340 + }, + { + "epoch": 0.76, + "learning_rate": 8.153285276651876e-07, + "logits/chosen": -2.1099610328674316, + "logits/rejected": -1.8138777017593384, + "logps/chosen": -228.72891235351562, + "logps/rejected": -261.8756103515625, + "loss": 0.0897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.36999136209487915, + "rewards/margins": 0.14173254370689392, + "rewards/rejected": -0.5117239356040955, + "step": 6350 + }, + { + "epoch": 0.76, + "learning_rate": 8.076071255259918e-07, + "logits/chosen": -1.9893842935562134, + "logits/rejected": -1.562839150428772, + "logps/chosen": -248.32656860351562, + "logps/rejected": -230.19082641601562, + "loss": 0.1012, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4931401312351227, + "rewards/margins": 0.16243189573287964, + "rewards/rejected": -0.6555719971656799, + "step": 6360 + }, + { + "epoch": 0.76, + "learning_rate": 7.999154100898063e-07, + "logits/chosen": -1.8441355228424072, + "logits/rejected": -1.6876119375228882, + "logps/chosen": -210.8905792236328, + "logps/rejected": -267.10882568359375, + "loss": 0.0825, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.513385534286499, + "rewards/margins": 0.17421071231365204, + "rewards/rejected": -0.6875962018966675, + "step": 6370 + }, + { + "epoch": 0.77, + "learning_rate": 7.922535162790095e-07, + "logits/chosen": -2.086892604827881, + "logits/rejected": -1.8276185989379883, + "logps/chosen": -234.5299072265625, + "logps/rejected": -250.52059936523438, + "loss": 0.1293, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4303308129310608, + "rewards/margins": 0.16687455773353577, + "rewards/rejected": -0.5972053408622742, + "step": 6380 + }, + { + "epoch": 0.77, + "learning_rate": 7.846215784928721e-07, + "logits/chosen": -2.0581459999084473, + "logits/rejected": -1.7542043924331665, + "logps/chosen": -229.604736328125, + "logps/rejected": -245.43185424804688, + "loss": 0.1118, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43111807107925415, + "rewards/margins": 0.1881767064332962, + "rewards/rejected": -0.6192947626113892, + "step": 6390 + }, + { + "epoch": 0.77, + "learning_rate": 7.770197306051968e-07, + "logits/chosen": -2.1675782203674316, + "logits/rejected": -1.4943665266036987, + "logps/chosen": -267.9649963378906, + "logps/rejected": -236.1041259765625, + "loss": 0.1182, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42259639501571655, + "rewards/margins": 0.24217908084392548, + "rewards/rejected": -0.664775550365448, + "step": 6400 + }, + { + "epoch": 0.77, + "learning_rate": 7.694481059619705e-07, + "logits/chosen": -1.932381272315979, + "logits/rejected": -1.6388800144195557, + "logps/chosen": -246.7991943359375, + "logps/rejected": -249.91943359375, + "loss": 0.1273, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4872209429740906, + "rewards/margins": 0.18630096316337585, + "rewards/rejected": -0.6735219955444336, + "step": 6410 + }, + { + "epoch": 0.77, + "learning_rate": 7.619068373790306e-07, + "logits/chosen": -2.089247226715088, + "logits/rejected": -1.614682912826538, + "logps/chosen": -282.35125732421875, + "logps/rejected": -271.75067138671875, + "loss": 0.129, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44997042417526245, + "rewards/margins": 0.1696648895740509, + "rewards/rejected": -0.6196353435516357, + "step": 6420 + }, + { + "epoch": 0.77, + "learning_rate": 7.543960571397257e-07, + "logits/chosen": -2.0346500873565674, + "logits/rejected": -1.7321970462799072, + "logps/chosen": -237.32608032226562, + "logps/rejected": -245.86557006835938, + "loss": 0.0754, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46555739641189575, + "rewards/margins": 0.16662751138210297, + "rewards/rejected": -0.6321848630905151, + "step": 6430 + }, + { + "epoch": 0.77, + "learning_rate": 7.469158969926038e-07, + "logits/chosen": -2.0735549926757812, + "logits/rejected": -1.6884253025054932, + "logps/chosen": -263.26898193359375, + "logps/rejected": -264.8492126464844, + "loss": 0.074, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4788509011268616, + "rewards/margins": 0.14806295931339264, + "rewards/rejected": -0.6269139051437378, + "step": 6440 + }, + { + "epoch": 0.77, + "learning_rate": 7.39466488149097e-07, + "logits/chosen": -2.0731894969940186, + "logits/rejected": -1.5555330514907837, + "logps/chosen": -243.86892700195312, + "logps/rejected": -223.0145721435547, + "loss": 0.1347, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41140609979629517, + "rewards/margins": 0.2104901522397995, + "rewards/rejected": -0.6218962073326111, + "step": 6450 + }, + { + "epoch": 0.78, + "learning_rate": 7.320479612812218e-07, + "logits/chosen": -2.017112970352173, + "logits/rejected": -1.5632555484771729, + "logps/chosen": -207.18191528320312, + "logps/rejected": -199.11024475097656, + "loss": 0.0694, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3916897475719452, + "rewards/margins": 0.14873406291007996, + "rewards/rejected": -0.5404238700866699, + "step": 6460 + }, + { + "epoch": 0.78, + "learning_rate": 7.246604465192825e-07, + "logits/chosen": -1.9492496252059937, + "logits/rejected": -1.3995827436447144, + "logps/chosen": -259.1231994628906, + "logps/rejected": -208.67135620117188, + "loss": 0.124, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49213799834251404, + "rewards/margins": 0.2363675832748413, + "rewards/rejected": -0.7285054922103882, + "step": 6470 + }, + { + "epoch": 0.78, + "learning_rate": 7.173040734495973e-07, + "logits/chosen": -1.8647918701171875, + "logits/rejected": -1.5196516513824463, + "logps/chosen": -291.6728210449219, + "logps/rejected": -321.27923583984375, + "loss": 0.134, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.504711925983429, + "rewards/margins": 0.14177486300468445, + "rewards/rejected": -0.646486759185791, + "step": 6480 + }, + { + "epoch": 0.78, + "learning_rate": 7.099789711122149e-07, + "logits/chosen": -2.0234179496765137, + "logits/rejected": -1.6390674114227295, + "logps/chosen": -296.9944763183594, + "logps/rejected": -274.94488525390625, + "loss": 0.1205, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4782020151615143, + "rewards/margins": 0.15751081705093384, + "rewards/rejected": -0.6357128024101257, + "step": 6490 + }, + { + "epoch": 0.78, + "learning_rate": 7.02685267998659e-07, + "logits/chosen": -1.8039512634277344, + "logits/rejected": -1.610396146774292, + "logps/chosen": -217.3019256591797, + "logps/rejected": -230.8792724609375, + "loss": 0.1698, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.42607298493385315, + "rewards/margins": 0.10820464789867401, + "rewards/rejected": -0.534277617931366, + "step": 6500 + }, + { + "epoch": 0.78, + "learning_rate": 6.954230920496702e-07, + "logits/chosen": -1.9449794292449951, + "logits/rejected": -1.672254204750061, + "logps/chosen": -209.4949188232422, + "logps/rejected": -234.17465209960938, + "loss": 0.0954, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5420662760734558, + "rewards/margins": 0.2034389078617096, + "rewards/rejected": -0.7455052733421326, + "step": 6510 + }, + { + "epoch": 0.78, + "learning_rate": 6.881925706529641e-07, + "logits/chosen": -2.1921558380126953, + "logits/rejected": -1.6133739948272705, + "logps/chosen": -253.46450805664062, + "logps/rejected": -225.8391571044922, + "loss": 0.0943, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.47693949937820435, + "rewards/margins": 0.17005819082260132, + "rewards/rejected": -0.6469976305961609, + "step": 6520 + }, + { + "epoch": 0.78, + "learning_rate": 6.809938306409925e-07, + "logits/chosen": -1.8478351831436157, + "logits/rejected": -1.6184707880020142, + "logps/chosen": -254.103515625, + "logps/rejected": -244.87991333007812, + "loss": 0.0911, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.474212646484375, + "rewards/margins": 0.16678480803966522, + "rewards/rejected": -0.6409973502159119, + "step": 6530 + }, + { + "epoch": 0.78, + "learning_rate": 6.738269982887266e-07, + "logits/chosen": -2.0551493167877197, + "logits/rejected": -1.6460390090942383, + "logps/chosen": -320.0547790527344, + "logps/rejected": -268.46875, + "loss": 0.1129, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4602360725402832, + "rewards/margins": 0.18921080231666565, + "rewards/rejected": -0.6494468450546265, + "step": 6540 + }, + { + "epoch": 0.79, + "learning_rate": 6.66692199311432e-07, + "logits/chosen": -2.0328869819641113, + "logits/rejected": -1.675079584121704, + "logps/chosen": -289.8661193847656, + "logps/rejected": -240.75991821289062, + "loss": 0.1209, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42221537232398987, + "rewards/margins": 0.1419004648923874, + "rewards/rejected": -0.5641158819198608, + "step": 6550 + }, + { + "epoch": 0.79, + "learning_rate": 6.595895588624717e-07, + "logits/chosen": -2.185662269592285, + "logits/rejected": -1.7335224151611328, + "logps/chosen": -257.2197570800781, + "logps/rejected": -243.74807739257812, + "loss": 0.1239, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.36936455965042114, + "rewards/margins": 0.1940496563911438, + "rewards/rejected": -0.5634142756462097, + "step": 6560 + }, + { + "epoch": 0.79, + "learning_rate": 6.525192015311069e-07, + "logits/chosen": -2.069929599761963, + "logits/rejected": -1.7172218561172485, + "logps/chosen": -269.12603759765625, + "logps/rejected": -264.0863952636719, + "loss": 0.085, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.456916481256485, + "rewards/margins": 0.16084381937980652, + "rewards/rejected": -0.6177603006362915, + "step": 6570 + }, + { + "epoch": 0.79, + "learning_rate": 6.454812513403127e-07, + "logits/chosen": -2.1930034160614014, + "logits/rejected": -1.7271572351455688, + "logps/chosen": -229.4690399169922, + "logps/rejected": -206.50888061523438, + "loss": 0.105, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4114112854003906, + "rewards/margins": 0.1248263344168663, + "rewards/rejected": -0.5362376570701599, + "step": 6580 + }, + { + "epoch": 0.79, + "learning_rate": 6.384758317445991e-07, + "logits/chosen": -1.9955850839614868, + "logits/rejected": -1.3841392993927002, + "logps/chosen": -287.55853271484375, + "logps/rejected": -227.1190948486328, + "loss": 0.1237, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.39280954003334045, + "rewards/margins": 0.25686582922935486, + "rewards/rejected": -0.6496754288673401, + "step": 6590 + }, + { + "epoch": 0.79, + "learning_rate": 6.31503065627854e-07, + "logits/chosen": -1.863050103187561, + "logits/rejected": -1.605564832687378, + "logps/chosen": -265.7858581542969, + "logps/rejected": -288.51629638671875, + "loss": 0.1061, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.48526960611343384, + "rewards/margins": 0.19922366738319397, + "rewards/rejected": -0.684493362903595, + "step": 6600 + }, + { + "epoch": 0.79, + "learning_rate": 6.245630753011767e-07, + "logits/chosen": -2.018514633178711, + "logits/rejected": -1.6095269918441772, + "logps/chosen": -293.40185546875, + "logps/rejected": -261.62066650390625, + "loss": 0.1041, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.39809244871139526, + "rewards/margins": 0.2280418872833252, + "rewards/rejected": -0.6261343955993652, + "step": 6610 + }, + { + "epoch": 0.79, + "learning_rate": 6.176559825007408e-07, + "logits/chosen": -2.115142345428467, + "logits/rejected": -1.8222726583480835, + "logps/chosen": -286.07147216796875, + "logps/rejected": -268.15667724609375, + "loss": 0.0929, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4131897985935211, + "rewards/margins": 0.17083369195461273, + "rewards/rejected": -0.5840234756469727, + "step": 6620 + }, + { + "epoch": 0.8, + "learning_rate": 6.107819083856559e-07, + "logits/chosen": -2.1033377647399902, + "logits/rejected": -1.6299479007720947, + "logps/chosen": -292.8658142089844, + "logps/rejected": -253.60104370117188, + "loss": 0.1869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4031642973423004, + "rewards/margins": 0.14742298424243927, + "rewards/rejected": -0.5505872964859009, + "step": 6630 + }, + { + "epoch": 0.8, + "learning_rate": 6.039409735358418e-07, + "logits/chosen": -1.938940405845642, + "logits/rejected": -1.6774669885635376, + "logps/chosen": -264.48992919921875, + "logps/rejected": -243.3132781982422, + "loss": 0.0917, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.4583476483821869, + "rewards/margins": 0.227961927652359, + "rewards/rejected": -0.6863095164299011, + "step": 6640 + }, + { + "epoch": 0.8, + "learning_rate": 5.971332979499112e-07, + "logits/chosen": -1.9997894763946533, + "logits/rejected": -1.6866439580917358, + "logps/chosen": -228.4779815673828, + "logps/rejected": -202.27304077148438, + "loss": 0.0927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4338861405849457, + "rewards/margins": 0.186149001121521, + "rewards/rejected": -0.6200351715087891, + "step": 6650 + }, + { + "epoch": 0.8, + "learning_rate": 5.903590010430732e-07, + "logits/chosen": -1.9610633850097656, + "logits/rejected": -1.4865853786468506, + "logps/chosen": -247.3853302001953, + "logps/rejected": -217.017333984375, + "loss": 0.1559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4397171139717102, + "rewards/margins": 0.16036757826805115, + "rewards/rejected": -0.600084662437439, + "step": 6660 + }, + { + "epoch": 0.8, + "learning_rate": 5.836182016450273e-07, + "logits/chosen": -1.881838083267212, + "logits/rejected": -1.6120822429656982, + "logps/chosen": -277.1618957519531, + "logps/rejected": -234.37094116210938, + "loss": 0.1946, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4851130545139313, + "rewards/margins": 0.14165827631950378, + "rewards/rejected": -0.6267713308334351, + "step": 6670 + }, + { + "epoch": 0.8, + "learning_rate": 5.769110179978874e-07, + "logits/chosen": -2.084548234939575, + "logits/rejected": -1.870194435119629, + "logps/chosen": -213.6473388671875, + "logps/rejected": -241.52255249023438, + "loss": 0.1835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.41156840324401855, + "rewards/margins": 0.1030372828245163, + "rewards/rejected": -0.514605700969696, + "step": 6680 + }, + { + "epoch": 0.8, + "learning_rate": 5.702375677541037e-07, + "logits/chosen": -1.8728317022323608, + "logits/rejected": -1.7525784969329834, + "logps/chosen": -246.52294921875, + "logps/rejected": -250.93368530273438, + "loss": 0.0981, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4282185435295105, + "rewards/margins": 0.15069648623466492, + "rewards/rejected": -0.578914999961853, + "step": 6690 + }, + { + "epoch": 0.8, + "learning_rate": 5.635979679744006e-07, + "logits/chosen": -1.756136178970337, + "logits/rejected": -1.4906980991363525, + "logps/chosen": -233.78359985351562, + "logps/rejected": -230.1853485107422, + "loss": 0.0827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4131496548652649, + "rewards/margins": 0.11998526751995087, + "rewards/rejected": -0.533134937286377, + "step": 6700 + }, + { + "epoch": 0.81, + "learning_rate": 5.569923351257223e-07, + "logits/chosen": -1.9852508306503296, + "logits/rejected": -1.7096633911132812, + "logps/chosen": -241.21292114257812, + "logps/rejected": -279.4994201660156, + "loss": 0.1367, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38888391852378845, + "rewards/margins": 0.18084433674812317, + "rewards/rejected": -0.5697282552719116, + "step": 6710 + }, + { + "epoch": 0.81, + "learning_rate": 5.504207850791912e-07, + "logits/chosen": -1.9846687316894531, + "logits/rejected": -1.4948934316635132, + "logps/chosen": -277.4078674316406, + "logps/rejected": -225.47042846679688, + "loss": 0.1489, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4744594097137451, + "rewards/margins": 0.17791931331157684, + "rewards/rejected": -0.6523788571357727, + "step": 6720 + }, + { + "epoch": 0.81, + "learning_rate": 5.438834331080725e-07, + "logits/chosen": -2.0223140716552734, + "logits/rejected": -1.8239043951034546, + "logps/chosen": -246.28005981445312, + "logps/rejected": -258.4651794433594, + "loss": 0.145, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43237772583961487, + "rewards/margins": 0.14295579493045807, + "rewards/rejected": -0.5753334760665894, + "step": 6730 + }, + { + "epoch": 0.81, + "learning_rate": 5.373803938857558e-07, + "logits/chosen": -1.9347299337387085, + "logits/rejected": -1.681318998336792, + "logps/chosen": -266.93133544921875, + "logps/rejected": -230.0156707763672, + "loss": 0.2025, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5939790606498718, + "rewards/margins": 0.11487730592489243, + "rewards/rejected": -0.708856463432312, + "step": 6740 + }, + { + "epoch": 0.81, + "learning_rate": 5.309117814837409e-07, + "logits/chosen": -2.084141731262207, + "logits/rejected": -1.590496301651001, + "logps/chosen": -245.8157501220703, + "logps/rejected": -212.6662139892578, + "loss": 0.1119, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33361050486564636, + "rewards/margins": 0.17130649089813232, + "rewards/rejected": -0.5049170255661011, + "step": 6750 + }, + { + "epoch": 0.81, + "learning_rate": 5.244777093696385e-07, + "logits/chosen": -2.146206855773926, + "logits/rejected": -1.6665055751800537, + "logps/chosen": -225.45962524414062, + "logps/rejected": -228.45425415039062, + "loss": 0.0773, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4145995080471039, + "rewards/margins": 0.2185206413269043, + "rewards/rejected": -0.6331201791763306, + "step": 6760 + }, + { + "epoch": 0.81, + "learning_rate": 5.180782904051787e-07, + "logits/chosen": -1.8693435192108154, + "logits/rejected": -1.7655032873153687, + "logps/chosen": -248.69705200195312, + "logps/rejected": -256.6230773925781, + "loss": 0.1171, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44969773292541504, + "rewards/margins": 0.13754042983055115, + "rewards/rejected": -0.5872381925582886, + "step": 6770 + }, + { + "epoch": 0.81, + "learning_rate": 5.117136368442322e-07, + "logits/chosen": -1.9687871932983398, + "logits/rejected": -1.5913883447647095, + "logps/chosen": -216.0896759033203, + "logps/rejected": -212.8240203857422, + "loss": 0.1145, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47843655943870544, + "rewards/margins": 0.14962342381477356, + "rewards/rejected": -0.628059983253479, + "step": 6780 + }, + { + "epoch": 0.81, + "learning_rate": 5.053838603308403e-07, + "logits/chosen": -2.2426624298095703, + "logits/rejected": -1.8825994729995728, + "logps/chosen": -305.18011474609375, + "logps/rejected": -311.623046875, + "loss": 0.1362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4912681579589844, + "rewards/margins": 0.1363973617553711, + "rewards/rejected": -0.6276654601097107, + "step": 6790 + }, + { + "epoch": 0.82, + "learning_rate": 4.99089071897256e-07, + "logits/chosen": -1.974311113357544, + "logits/rejected": -1.5755977630615234, + "logps/chosen": -247.4668731689453, + "logps/rejected": -246.3483123779297, + "loss": 0.1313, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4030587077140808, + "rewards/margins": 0.2043962925672531, + "rewards/rejected": -0.6074550151824951, + "step": 6800 + }, + { + "epoch": 0.82, + "learning_rate": 4.92829381961999e-07, + "logits/chosen": -1.7325855493545532, + "logits/rejected": -1.5258713960647583, + "logps/chosen": -243.26107788085938, + "logps/rejected": -243.32833862304688, + "loss": 0.0918, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.47821131348609924, + "rewards/margins": 0.18177253007888794, + "rewards/rejected": -0.6599838733673096, + "step": 6810 + }, + { + "epoch": 0.82, + "learning_rate": 4.866049003279163e-07, + "logits/chosen": -1.9763206243515015, + "logits/rejected": -1.603864073753357, + "logps/chosen": -245.575439453125, + "logps/rejected": -217.13046264648438, + "loss": 0.1529, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4021136164665222, + "rewards/margins": 0.12271346896886826, + "rewards/rejected": -0.5248271226882935, + "step": 6820 + }, + { + "epoch": 0.82, + "learning_rate": 4.80415736180257e-07, + "logits/chosen": -1.7185128927230835, + "logits/rejected": -1.5524407625198364, + "logps/chosen": -206.218017578125, + "logps/rejected": -224.42819213867188, + "loss": 0.1003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.502429187297821, + "rewards/margins": 0.15189214050769806, + "rewards/rejected": -0.6543213129043579, + "step": 6830 + }, + { + "epoch": 0.82, + "learning_rate": 4.7426199808475735e-07, + "logits/chosen": -1.9034755229949951, + "logits/rejected": -1.6419847011566162, + "logps/chosen": -266.4277648925781, + "logps/rejected": -252.0305633544922, + "loss": 0.1589, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.46889528632164, + "rewards/margins": 0.16565118730068207, + "rewards/rejected": -0.6345464587211609, + "step": 6840 + }, + { + "epoch": 0.82, + "learning_rate": 4.6814379398573613e-07, + "logits/chosen": -1.9863262176513672, + "logits/rejected": -1.6904399394989014, + "logps/chosen": -288.7919921875, + "logps/rejected": -317.627197265625, + "loss": 0.1441, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4666666090488434, + "rewards/margins": 0.15177717804908752, + "rewards/rejected": -0.6184438467025757, + "step": 6850 + }, + { + "epoch": 0.82, + "learning_rate": 4.6206123120419944e-07, + "logits/chosen": -1.7895174026489258, + "logits/rejected": -1.514021635055542, + "logps/chosen": -262.0445251464844, + "logps/rejected": -283.6744384765625, + "loss": 0.1085, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49015116691589355, + "rewards/margins": 0.18592293560504913, + "rewards/rejected": -0.6760741472244263, + "step": 6860 + }, + { + "epoch": 0.82, + "learning_rate": 4.5601441643596145e-07, + "logits/chosen": -1.9775257110595703, + "logits/rejected": -1.5629953145980835, + "logps/chosen": -260.10284423828125, + "logps/rejected": -243.96127319335938, + "loss": 0.126, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4571099877357483, + "rewards/margins": 0.18302378058433533, + "rewards/rejected": -0.6401337385177612, + "step": 6870 + }, + { + "epoch": 0.83, + "learning_rate": 4.500034557497709e-07, + "logits/chosen": -1.863673448562622, + "logits/rejected": -1.4204473495483398, + "logps/chosen": -292.68621826171875, + "logps/rejected": -247.7226104736328, + "loss": 0.1288, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5655902624130249, + "rewards/margins": 0.15709365904331207, + "rewards/rejected": -0.7226839661598206, + "step": 6880 + }, + { + "epoch": 0.83, + "learning_rate": 4.4402845458545037e-07, + "logits/chosen": -1.9163280725479126, + "logits/rejected": -1.6178480386734009, + "logps/chosen": -260.6136169433594, + "logps/rejected": -262.7823181152344, + "loss": 0.0979, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5299848318099976, + "rewards/margins": 0.15895573794841766, + "rewards/rejected": -0.6889406442642212, + "step": 6890 + }, + { + "epoch": 0.83, + "learning_rate": 4.380895177520475e-07, + "logits/chosen": -2.119663953781128, + "logits/rejected": -1.4375776052474976, + "logps/chosen": -320.13934326171875, + "logps/rejected": -282.3840026855469, + "loss": 0.1025, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48634710907936096, + "rewards/margins": 0.1542474776506424, + "rewards/rejected": -0.6405946612358093, + "step": 6900 + }, + { + "epoch": 0.83, + "learning_rate": 4.3218674942599655e-07, + "logits/chosen": -1.989381194114685, + "logits/rejected": -1.6404857635498047, + "logps/chosen": -266.8316955566406, + "logps/rejected": -250.7163848876953, + "loss": 0.1453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3762025237083435, + "rewards/margins": 0.18678084015846252, + "rewards/rejected": -0.5629833936691284, + "step": 6910 + }, + { + "epoch": 0.83, + "learning_rate": 4.263202531492877e-07, + "logits/chosen": -1.9917621612548828, + "logits/rejected": -1.7141485214233398, + "logps/chosen": -256.47882080078125, + "logps/rejected": -231.6102752685547, + "loss": 0.1329, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5046517252922058, + "rewards/margins": 0.1110520213842392, + "rewards/rejected": -0.6157038807868958, + "step": 6920 + }, + { + "epoch": 0.83, + "learning_rate": 4.204901318276586e-07, + "logits/chosen": -2.0761396884918213, + "logits/rejected": -1.6558525562286377, + "logps/chosen": -301.8915710449219, + "logps/rejected": -294.1426086425781, + "loss": 0.4635, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.49290648102760315, + "rewards/margins": 0.2808048725128174, + "rewards/rejected": -0.7737113237380981, + "step": 6930 + }, + { + "epoch": 0.83, + "learning_rate": 4.146964877287804e-07, + "logits/chosen": -2.0482866764068604, + "logits/rejected": -1.5608055591583252, + "logps/chosen": -353.6920166015625, + "logps/rejected": -292.26177978515625, + "loss": 0.1426, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.44764071702957153, + "rewards/margins": 0.165061354637146, + "rewards/rejected": -0.6127020716667175, + "step": 6940 + }, + { + "epoch": 0.83, + "learning_rate": 4.089394224804691e-07, + "logits/chosen": -2.0461716651916504, + "logits/rejected": -1.643463373184204, + "logps/chosen": -236.15139770507812, + "logps/rejected": -208.002685546875, + "loss": 0.1336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4394107460975647, + "rewards/margins": 0.15235844254493713, + "rewards/rejected": -0.5917690992355347, + "step": 6950 + }, + { + "epoch": 0.84, + "learning_rate": 4.032190370689018e-07, + "logits/chosen": -2.036041021347046, + "logits/rejected": -1.5846761465072632, + "logps/chosen": -270.18865966796875, + "logps/rejected": -221.50357055664062, + "loss": 0.1346, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.48520708084106445, + "rewards/margins": 0.1285451054573059, + "rewards/rejected": -0.6137521862983704, + "step": 6960 + }, + { + "epoch": 0.84, + "learning_rate": 3.9753543183684573e-07, + "logits/chosen": -1.8880681991577148, + "logits/rejected": -1.741532564163208, + "logps/chosen": -290.96209716796875, + "logps/rejected": -323.4405212402344, + "loss": 0.1471, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5335294008255005, + "rewards/margins": 0.1267957240343094, + "rewards/rejected": -0.6603251099586487, + "step": 6970 + }, + { + "epoch": 0.84, + "learning_rate": 3.9188870648189437e-07, + "logits/chosen": -2.0555896759033203, + "logits/rejected": -1.7752418518066406, + "logps/chosen": -273.6790466308594, + "logps/rejected": -279.67437744140625, + "loss": 0.1747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5123944282531738, + "rewards/margins": 0.10983245074748993, + "rewards/rejected": -0.622226893901825, + "step": 6980 + }, + { + "epoch": 0.84, + "learning_rate": 3.862789600547268e-07, + "logits/chosen": -2.072603702545166, + "logits/rejected": -1.5208203792572021, + "logps/chosen": -228.24551391601562, + "logps/rejected": -198.6292724609375, + "loss": 0.1493, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.46002787351608276, + "rewards/margins": 0.15952149033546448, + "rewards/rejected": -0.6195493936538696, + "step": 6990 + }, + { + "epoch": 0.84, + "learning_rate": 3.8070629095736e-07, + "logits/chosen": -2.0098140239715576, + "logits/rejected": -1.8628854751586914, + "logps/chosen": -278.3236389160156, + "logps/rejected": -290.70135498046875, + "loss": 0.1144, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47310394048690796, + "rewards/margins": 0.1412026733160019, + "rewards/rejected": -0.6143065690994263, + "step": 7000 + }, + { + "epoch": 0.84, + "learning_rate": 3.7517079694143145e-07, + "logits/chosen": -1.8572998046875, + "logits/rejected": -1.6057090759277344, + "logps/chosen": -219.03427124023438, + "logps/rejected": -238.3125762939453, + "loss": 0.1321, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3572883903980255, + "rewards/margins": 0.18927468359470367, + "rewards/rejected": -0.5465630292892456, + "step": 7010 + }, + { + "epoch": 0.84, + "learning_rate": 3.696725751064778e-07, + "logits/chosen": -1.8692944049835205, + "logits/rejected": -1.6396774053573608, + "logps/chosen": -249.4277801513672, + "logps/rejected": -236.60397338867188, + "loss": 0.1562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.41915005445480347, + "rewards/margins": 0.17370714247226715, + "rewards/rejected": -0.5928572416305542, + "step": 7020 + }, + { + "epoch": 0.84, + "learning_rate": 3.6421172189823884e-07, + "logits/chosen": -2.1776063442230225, + "logits/rejected": -1.8611905574798584, + "logps/chosen": -308.03680419921875, + "logps/rejected": -264.7606506347656, + "loss": 0.1133, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44754162430763245, + "rewards/margins": 0.1368330419063568, + "rewards/rejected": -0.5843747854232788, + "step": 7030 + }, + { + "epoch": 0.84, + "learning_rate": 3.587883331069575e-07, + "logits/chosen": -1.870141625404358, + "logits/rejected": -1.6269527673721313, + "logps/chosen": -300.21435546875, + "logps/rejected": -270.22515869140625, + "loss": 0.1038, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5208837389945984, + "rewards/margins": 0.10919564962387085, + "rewards/rejected": -0.630079448223114, + "step": 7040 + }, + { + "epoch": 0.85, + "learning_rate": 3.5340250386570547e-07, + "logits/chosen": -1.9647839069366455, + "logits/rejected": -1.6906397342681885, + "logps/chosen": -289.46002197265625, + "logps/rejected": -272.2466735839844, + "loss": 0.1131, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5417642593383789, + "rewards/margins": 0.15753653645515442, + "rewards/rejected": -0.6993007063865662, + "step": 7050 + }, + { + "epoch": 0.85, + "learning_rate": 3.480543286487126e-07, + "logits/chosen": -2.036736249923706, + "logits/rejected": -1.7443698644638062, + "logps/chosen": -250.82235717773438, + "logps/rejected": -267.0654602050781, + "loss": 0.111, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4719162881374359, + "rewards/margins": 0.19385971128940582, + "rewards/rejected": -0.6657760143280029, + "step": 7060 + }, + { + "epoch": 0.85, + "learning_rate": 3.4274390126971035e-07, + "logits/chosen": -1.9663559198379517, + "logits/rejected": -1.701615571975708, + "logps/chosen": -236.70419311523438, + "logps/rejected": -215.01290893554688, + "loss": 0.2167, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4362650513648987, + "rewards/margins": 0.10641946643590927, + "rewards/rejected": -0.5426844358444214, + "step": 7070 + }, + { + "epoch": 0.85, + "learning_rate": 3.374713148802827e-07, + "logits/chosen": -2.056093692779541, + "logits/rejected": -1.5538482666015625, + "logps/chosen": -264.41412353515625, + "logps/rejected": -251.5968780517578, + "loss": 0.1848, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.46888089179992676, + "rewards/margins": 0.1688876450061798, + "rewards/rejected": -0.6377686262130737, + "step": 7080 + }, + { + "epoch": 0.85, + "learning_rate": 3.3223666196823963e-07, + "logits/chosen": -2.1422367095947266, + "logits/rejected": -1.6432521343231201, + "logps/chosen": -332.3525695800781, + "logps/rejected": -257.8883361816406, + "loss": 0.139, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5275644063949585, + "rewards/margins": 0.13949260115623474, + "rewards/rejected": -0.6670569181442261, + "step": 7090 + }, + { + "epoch": 0.85, + "learning_rate": 3.27040034355986e-07, + "logits/chosen": -1.8350646495819092, + "logits/rejected": -1.7812795639038086, + "logps/chosen": -248.4701690673828, + "logps/rejected": -259.24176025390625, + "loss": 0.1682, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.523715615272522, + "rewards/margins": 0.1881420910358429, + "rewards/rejected": -0.7118576765060425, + "step": 7100 + }, + { + "epoch": 0.85, + "learning_rate": 3.218815231989167e-07, + "logits/chosen": -2.013810157775879, + "logits/rejected": -1.7800830602645874, + "logps/chosen": -269.4825744628906, + "logps/rejected": -250.37014770507812, + "loss": 0.1172, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47187668085098267, + "rewards/margins": 0.1158447265625, + "rewards/rejected": -0.5877213478088379, + "step": 7110 + }, + { + "epoch": 0.85, + "learning_rate": 3.1676121898381597e-07, + "logits/chosen": -1.7077372074127197, + "logits/rejected": -1.537479281425476, + "logps/chosen": -283.3507385253906, + "logps/rejected": -296.3389892578125, + "loss": 0.0994, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4422314167022705, + "rewards/margins": 0.15675051510334015, + "rewards/rejected": -0.5989819765090942, + "step": 7120 + }, + { + "epoch": 0.86, + "learning_rate": 3.1167921152727096e-07, + "logits/chosen": -1.9554319381713867, + "logits/rejected": -1.6862990856170654, + "logps/chosen": -265.57684326171875, + "logps/rejected": -236.5033416748047, + "loss": 0.1686, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41017451882362366, + "rewards/margins": 0.1357969343662262, + "rewards/rejected": -0.5459714531898499, + "step": 7130 + }, + { + "epoch": 0.86, + "learning_rate": 3.066355899740925e-07, + "logits/chosen": -1.9464342594146729, + "logits/rejected": -1.6717723608016968, + "logps/chosen": -255.412841796875, + "logps/rejected": -277.73638916015625, + "loss": 0.0925, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.48311883211135864, + "rewards/margins": 0.16044536232948303, + "rewards/rejected": -0.6435642242431641, + "step": 7140 + }, + { + "epoch": 0.86, + "learning_rate": 3.0163044279575865e-07, + "logits/chosen": -2.0800702571868896, + "logits/rejected": -1.6157634258270264, + "logps/chosen": -273.6175231933594, + "logps/rejected": -198.8623809814453, + "loss": 0.1335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4554726481437683, + "rewards/margins": 0.1640358418226242, + "rewards/rejected": -0.6195084452629089, + "step": 7150 + }, + { + "epoch": 0.86, + "learning_rate": 2.966638577888548e-07, + "logits/chosen": -1.9913661479949951, + "logits/rejected": -1.7240318059921265, + "logps/chosen": -269.86920166015625, + "logps/rejected": -281.88836669921875, + "loss": 0.1086, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.5008378028869629, + "rewards/margins": 0.14693915843963623, + "rewards/rejected": -0.6477769613265991, + "step": 7160 + }, + { + "epoch": 0.86, + "learning_rate": 2.917359220735386e-07, + "logits/chosen": -1.7666356563568115, + "logits/rejected": -1.6807903051376343, + "logps/chosen": -212.5333709716797, + "logps/rejected": -204.54331970214844, + "loss": 0.1653, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5168853998184204, + "rewards/margins": 0.08568285405635834, + "rewards/rejected": -0.6025682687759399, + "step": 7170 + }, + { + "epoch": 0.86, + "learning_rate": 2.8684672209201067e-07, + "logits/chosen": -1.8947250843048096, + "logits/rejected": -1.5195536613464355, + "logps/chosen": -257.7271423339844, + "logps/rejected": -226.953125, + "loss": 0.1413, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.48005929589271545, + "rewards/margins": 0.11954379081726074, + "rewards/rejected": -0.5996031165122986, + "step": 7180 + }, + { + "epoch": 0.86, + "learning_rate": 2.819963436069986e-07, + "logits/chosen": -2.003467321395874, + "logits/rejected": -1.6604106426239014, + "logps/chosen": -301.46466064453125, + "logps/rejected": -248.64669799804688, + "loss": 0.1251, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47957324981689453, + "rewards/margins": 0.09554027020931244, + "rewards/rejected": -0.5751134753227234, + "step": 7190 + }, + { + "epoch": 0.86, + "learning_rate": 2.771848717002498e-07, + "logits/chosen": -1.7738151550292969, + "logits/rejected": -1.6896775960922241, + "logps/chosen": -236.6964569091797, + "logps/rejected": -270.410888671875, + "loss": 0.1098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5385271310806274, + "rewards/margins": 0.1548021137714386, + "rewards/rejected": -0.6933292746543884, + "step": 7200 + }, + { + "epoch": 0.87, + "learning_rate": 2.724123907710444e-07, + "logits/chosen": -1.7751373052597046, + "logits/rejected": -1.565288782119751, + "logps/chosen": -195.97508239746094, + "logps/rejected": -213.2833251953125, + "loss": 0.1312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.466492235660553, + "rewards/margins": 0.1685236245393753, + "rewards/rejected": -0.6350158452987671, + "step": 7210 + }, + { + "epoch": 0.87, + "learning_rate": 2.6767898453470886e-07, + "logits/chosen": -2.038952350616455, + "logits/rejected": -1.636182427406311, + "logps/chosen": -237.19473266601562, + "logps/rejected": -223.7322235107422, + "loss": 0.1162, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.383628785610199, + "rewards/margins": 0.1849764585494995, + "rewards/rejected": -0.5686052441596985, + "step": 7220 + }, + { + "epoch": 0.87, + "learning_rate": 2.629847360211518e-07, + "logits/chosen": -1.9980814456939697, + "logits/rejected": -1.6197658777236938, + "logps/chosen": -247.9432830810547, + "logps/rejected": -251.46630859375, + "loss": 0.0962, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4834599494934082, + "rewards/margins": 0.15811513364315033, + "rewards/rejected": -0.6415750980377197, + "step": 7230 + }, + { + "epoch": 0.87, + "learning_rate": 2.5832972757340565e-07, + "logits/chosen": -2.032080888748169, + "logits/rejected": -1.8052698373794556, + "logps/chosen": -242.31698608398438, + "logps/rejected": -257.6308898925781, + "loss": 0.142, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.44113120436668396, + "rewards/margins": 0.11361245810985565, + "rewards/rejected": -0.5547436475753784, + "step": 7240 + }, + { + "epoch": 0.87, + "learning_rate": 2.53714040846183e-07, + "logits/chosen": -1.9440562725067139, + "logits/rejected": -1.5986840724945068, + "logps/chosen": -284.2532958984375, + "logps/rejected": -236.9690704345703, + "loss": 0.1171, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3770473003387451, + "rewards/margins": 0.23315231502056122, + "rewards/rejected": -0.6101996302604675, + "step": 7250 + }, + { + "epoch": 0.87, + "learning_rate": 2.491377568044434e-07, + "logits/chosen": -2.0843589305877686, + "logits/rejected": -1.6826503276824951, + "logps/chosen": -314.3594665527344, + "logps/rejected": -272.8559875488281, + "loss": 0.0734, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5402888059616089, + "rewards/margins": 0.14350660145282745, + "rewards/rejected": -0.6837953925132751, + "step": 7260 + }, + { + "epoch": 0.87, + "learning_rate": 2.4460095572197476e-07, + "logits/chosen": -2.0028257369995117, + "logits/rejected": -1.6907081604003906, + "logps/chosen": -238.27001953125, + "logps/rejected": -250.3758544921875, + "loss": 0.1027, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4421817660331726, + "rewards/margins": 0.1625339388847351, + "rewards/rejected": -0.6047157049179077, + "step": 7270 + }, + { + "epoch": 0.87, + "learning_rate": 2.401037171799819e-07, + "logits/chosen": -1.9249871969223022, + "logits/rejected": -1.5126729011535645, + "logps/chosen": -267.6981506347656, + "logps/rejected": -230.08407592773438, + "loss": 0.1672, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3991939127445221, + "rewards/margins": 0.1651068925857544, + "rewards/rejected": -0.5643008351325989, + "step": 7280 + }, + { + "epoch": 0.87, + "learning_rate": 2.3564612006569482e-07, + "logits/chosen": -2.02858304977417, + "logits/rejected": -1.7860488891601562, + "logps/chosen": -264.84002685546875, + "logps/rejected": -264.8438415527344, + "loss": 0.1607, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45592936873435974, + "rewards/margins": 0.1130853146314621, + "rewards/rejected": -0.5690146684646606, + "step": 7290 + }, + { + "epoch": 0.88, + "learning_rate": 2.3122824257098275e-07, + "logits/chosen": -1.7619152069091797, + "logits/rejected": -1.4626259803771973, + "logps/chosen": -230.26889038085938, + "logps/rejected": -224.6363525390625, + "loss": 0.0667, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4507187008857727, + "rewards/margins": 0.16845285892486572, + "rewards/rejected": -0.6191716194152832, + "step": 7300 + }, + { + "epoch": 0.88, + "learning_rate": 2.2685016219098187e-07, + "logits/chosen": -2.1385879516601562, + "logits/rejected": -1.5403960943222046, + "logps/chosen": -266.4300231933594, + "logps/rejected": -216.80722045898438, + "loss": 0.1237, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.39975208044052124, + "rewards/margins": 0.22581128776073456, + "rewards/rejected": -0.625563383102417, + "step": 7310 + }, + { + "epoch": 0.88, + "learning_rate": 2.2251195572273758e-07, + "logits/chosen": -2.1694719791412354, + "logits/rejected": -1.607452154159546, + "logps/chosen": -341.0927734375, + "logps/rejected": -272.94464111328125, + "loss": 0.0914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4034740924835205, + "rewards/margins": 0.19167360663414001, + "rewards/rejected": -0.5951477289199829, + "step": 7320 + }, + { + "epoch": 0.88, + "learning_rate": 2.18213699263857e-07, + "logits/chosen": -1.9197025299072266, + "logits/rejected": -1.6683180332183838, + "logps/chosen": -291.236572265625, + "logps/rejected": -316.86737060546875, + "loss": 0.0876, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.47617173194885254, + "rewards/margins": 0.2178315371274948, + "rewards/rejected": -0.6940032839775085, + "step": 7330 + }, + { + "epoch": 0.88, + "learning_rate": 2.1395546821117192e-07, + "logits/chosen": -1.8608232736587524, + "logits/rejected": -1.577675461769104, + "logps/chosen": -279.8750915527344, + "logps/rejected": -252.7456817626953, + "loss": 0.1519, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4677848815917969, + "rewards/margins": 0.1569843590259552, + "rewards/rejected": -0.6247692108154297, + "step": 7340 + }, + { + "epoch": 0.88, + "learning_rate": 2.097373372594197e-07, + "logits/chosen": -2.017251968383789, + "logits/rejected": -1.649173378944397, + "logps/chosen": -284.208984375, + "logps/rejected": -264.468505859375, + "loss": 0.1668, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47412624955177307, + "rewards/margins": 0.12772853672504425, + "rewards/rejected": -0.6018548011779785, + "step": 7350 + }, + { + "epoch": 0.88, + "learning_rate": 2.0555938039993145e-07, + "logits/chosen": -2.207703113555908, + "logits/rejected": -1.7608064413070679, + "logps/chosen": -317.61566162109375, + "logps/rejected": -273.2945861816406, + "loss": 0.1079, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3838236629962921, + "rewards/margins": 0.15046364068984985, + "rewards/rejected": -0.5342873334884644, + "step": 7360 + }, + { + "epoch": 0.88, + "learning_rate": 2.0142167091933368e-07, + "logits/chosen": -1.8200185298919678, + "logits/rejected": -1.7389323711395264, + "logps/chosen": -233.045654296875, + "logps/rejected": -263.55535888671875, + "loss": 0.1277, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4523780941963196, + "rewards/margins": 0.1272934377193451, + "rewards/rejected": -0.5796715617179871, + "step": 7370 + }, + { + "epoch": 0.89, + "learning_rate": 1.973242813982626e-07, + "logits/chosen": -1.7422631978988647, + "logits/rejected": -1.4749271869659424, + "logps/chosen": -223.44418334960938, + "logps/rejected": -217.0834503173828, + "loss": 0.1286, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.490098237991333, + "rewards/margins": 0.15991918742656708, + "rewards/rejected": -0.6500174403190613, + "step": 7380 + }, + { + "epoch": 0.89, + "learning_rate": 1.932672837100924e-07, + "logits/chosen": -2.1760799884796143, + "logits/rejected": -1.4984136819839478, + "logps/chosen": -262.04400634765625, + "logps/rejected": -240.68911743164062, + "loss": 0.1252, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.49469202756881714, + "rewards/margins": 0.19546754658222198, + "rewards/rejected": -0.6901595592498779, + "step": 7390 + }, + { + "epoch": 0.89, + "learning_rate": 1.8925074901967406e-07, + "logits/chosen": -2.032710313796997, + "logits/rejected": -1.4763513803482056, + "logps/chosen": -284.44317626953125, + "logps/rejected": -255.60302734375, + "loss": 0.0695, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4748278558254242, + "rewards/margins": 0.16811513900756836, + "rewards/rejected": -0.6429430246353149, + "step": 7400 + }, + { + "epoch": 0.89, + "learning_rate": 1.8527474778208458e-07, + "logits/chosen": -1.841803789138794, + "logits/rejected": -1.7014901638031006, + "logps/chosen": -180.16915893554688, + "logps/rejected": -192.54147338867188, + "loss": 0.1502, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42819100618362427, + "rewards/margins": 0.07826290279626846, + "rewards/rejected": -0.5064539313316345, + "step": 7410 + }, + { + "epoch": 0.89, + "learning_rate": 1.813393497413951e-07, + "logits/chosen": -1.9294426441192627, + "logits/rejected": -1.611358880996704, + "logps/chosen": -286.4609680175781, + "logps/rejected": -255.0117645263672, + "loss": 0.0994, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4472865164279938, + "rewards/margins": 0.13087594509124756, + "rewards/rejected": -0.5781623721122742, + "step": 7420 + }, + { + "epoch": 0.89, + "learning_rate": 1.7744462392944472e-07, + "logits/chosen": -2.1310625076293945, + "logits/rejected": -1.6776885986328125, + "logps/chosen": -288.6611328125, + "logps/rejected": -262.71453857421875, + "loss": 0.1262, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.47246164083480835, + "rewards/margins": 0.1253913938999176, + "rewards/rejected": -0.5978530049324036, + "step": 7430 + }, + { + "epoch": 0.89, + "learning_rate": 1.7359063866463048e-07, + "logits/chosen": -2.0324885845184326, + "logits/rejected": -1.6409807205200195, + "logps/chosen": -239.01052856445312, + "logps/rejected": -202.0733642578125, + "loss": 0.1716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3990083932876587, + "rewards/margins": 0.19792206585407257, + "rewards/rejected": -0.5969304442405701, + "step": 7440 + }, + { + "epoch": 0.89, + "learning_rate": 1.6977746155070946e-07, + "logits/chosen": -1.877323865890503, + "logits/rejected": -1.8759233951568604, + "logps/chosen": -244.84945678710938, + "logps/rejected": -282.86456298828125, + "loss": 0.1464, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4934001564979553, + "rewards/margins": 0.12719208002090454, + "rewards/rejected": -0.6205921769142151, + "step": 7450 + }, + { + "epoch": 0.9, + "learning_rate": 1.6600515947561207e-07, + "logits/chosen": -2.076573133468628, + "logits/rejected": -1.5323445796966553, + "logps/chosen": -282.5911560058594, + "logps/rejected": -220.7271728515625, + "loss": 0.1678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4402221739292145, + "rewards/margins": 0.16551920771598816, + "rewards/rejected": -0.6057413220405579, + "step": 7460 + }, + { + "epoch": 0.9, + "learning_rate": 1.6227379861026738e-07, + "logits/chosen": -2.02341365814209, + "logits/rejected": -1.6644586324691772, + "logps/chosen": -255.2810821533203, + "logps/rejected": -251.2646484375, + "loss": 0.151, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4583802819252014, + "rewards/margins": 0.15048515796661377, + "rewards/rejected": -0.60886549949646, + "step": 7470 + }, + { + "epoch": 0.9, + "learning_rate": 1.5858344440744745e-07, + "logits/chosen": -2.074061632156372, + "logits/rejected": -1.6793878078460693, + "logps/chosen": -275.97467041015625, + "logps/rejected": -275.2462463378906, + "loss": 0.128, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4571126103401184, + "rewards/margins": 0.14766040444374084, + "rewards/rejected": -0.6047729253768921, + "step": 7480 + }, + { + "epoch": 0.9, + "learning_rate": 1.5493416160061254e-07, + "logits/chosen": -2.1556544303894043, + "logits/rejected": -1.7006380558013916, + "logps/chosen": -289.0265197753906, + "logps/rejected": -254.0576171875, + "loss": 0.179, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4663164019584656, + "rewards/margins": 0.13743355870246887, + "rewards/rejected": -0.6037499904632568, + "step": 7490 + }, + { + "epoch": 0.9, + "learning_rate": 1.5132601420278086e-07, + "logits/chosen": -1.9497236013412476, + "logits/rejected": -1.6325147151947021, + "logps/chosen": -267.0816955566406, + "logps/rejected": -226.15576171875, + "loss": 0.0762, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.47710466384887695, + "rewards/margins": 0.14392545819282532, + "rewards/rejected": -0.6210300326347351, + "step": 7500 + }, + { + "epoch": 0.9, + "learning_rate": 1.4775906550540287e-07, + "logits/chosen": -1.7985035181045532, + "logits/rejected": -1.484886884689331, + "logps/chosen": -217.06076049804688, + "logps/rejected": -206.60055541992188, + "loss": 0.1064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.443693071603775, + "rewards/margins": 0.13600948452949524, + "rewards/rejected": -0.5797025561332703, + "step": 7510 + }, + { + "epoch": 0.9, + "learning_rate": 1.4423337807725286e-07, + "logits/chosen": -1.957148790359497, + "logits/rejected": -1.8867321014404297, + "logps/chosen": -207.6150360107422, + "logps/rejected": -233.46658325195312, + "loss": 0.0856, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4386165142059326, + "rewards/margins": 0.1362169086933136, + "rewards/rejected": -0.5748334527015686, + "step": 7520 + }, + { + "epoch": 0.9, + "learning_rate": 1.4074901376332855e-07, + "logits/chosen": -1.9299287796020508, + "logits/rejected": -1.9098894596099854, + "logps/chosen": -274.15850830078125, + "logps/rejected": -299.91265869140625, + "loss": 0.0883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47814321517944336, + "rewards/margins": 0.13489031791687012, + "rewards/rejected": -0.6130335927009583, + "step": 7530 + }, + { + "epoch": 0.9, + "learning_rate": 1.3730603368377088e-07, + "logits/chosen": -1.915826439857483, + "logits/rejected": -1.7198493480682373, + "logps/chosen": -296.31182861328125, + "logps/rejected": -286.13592529296875, + "loss": 0.0992, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4671846926212311, + "rewards/margins": 0.1536525934934616, + "rewards/rejected": -0.6208373308181763, + "step": 7540 + }, + { + "epoch": 0.91, + "learning_rate": 1.3390449823278666e-07, + "logits/chosen": -1.976479172706604, + "logits/rejected": -1.7112070322036743, + "logps/chosen": -309.96136474609375, + "logps/rejected": -279.93597412109375, + "loss": 0.2023, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4522860646247864, + "rewards/margins": 0.08242306113243103, + "rewards/rejected": -0.5347092151641846, + "step": 7550 + }, + { + "epoch": 0.91, + "learning_rate": 1.3054446707759323e-07, + "logits/chosen": -2.088073968887329, + "logits/rejected": -1.6631252765655518, + "logps/chosen": -269.19268798828125, + "logps/rejected": -219.1171112060547, + "loss": 0.0938, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4191514551639557, + "rewards/margins": 0.1780991554260254, + "rewards/rejected": -0.5972505807876587, + "step": 7560 + }, + { + "epoch": 0.91, + "learning_rate": 1.2722599915736962e-07, + "logits/chosen": -1.9217841625213623, + "logits/rejected": -1.4964293241500854, + "logps/chosen": -215.7314910888672, + "logps/rejected": -203.1638641357422, + "loss": 0.161, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4369204640388489, + "rewards/margins": 0.19439806044101715, + "rewards/rejected": -0.6313184499740601, + "step": 7570 + }, + { + "epoch": 0.91, + "learning_rate": 1.2394915268222423e-07, + "logits/chosen": -1.7211532592773438, + "logits/rejected": -1.4849998950958252, + "logps/chosen": -253.0435028076172, + "logps/rejected": -240.583740234375, + "loss": 0.1213, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45572876930236816, + "rewards/margins": 0.12647785246372223, + "rewards/rejected": -0.5822066068649292, + "step": 7580 + }, + { + "epoch": 0.91, + "learning_rate": 1.2071398513217118e-07, + "logits/chosen": -1.8722648620605469, + "logits/rejected": -1.4593003988265991, + "logps/chosen": -299.0888366699219, + "logps/rejected": -245.06686401367188, + "loss": 0.1898, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.45327168703079224, + "rewards/margins": 0.16469185054302216, + "rewards/rejected": -0.6179635524749756, + "step": 7590 + }, + { + "epoch": 0.91, + "learning_rate": 1.1752055325612605e-07, + "logits/chosen": -2.0962700843811035, + "logits/rejected": -1.614005446434021, + "logps/chosen": -284.02618408203125, + "logps/rejected": -245.89785766601562, + "loss": 0.1521, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3982982933521271, + "rewards/margins": 0.14497140049934387, + "rewards/rejected": -0.5432697534561157, + "step": 7600 + }, + { + "epoch": 0.91, + "learning_rate": 1.143689130709058e-07, + "logits/chosen": -1.713621735572815, + "logits/rejected": -1.4907618761062622, + "logps/chosen": -288.0763854980469, + "logps/rejected": -314.97869873046875, + "loss": 0.067, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4883494973182678, + "rewards/margins": 0.1738799810409546, + "rewards/rejected": -0.6622294187545776, + "step": 7610 + }, + { + "epoch": 0.91, + "learning_rate": 1.1125911986025001e-07, + "logits/chosen": -1.7698646783828735, + "logits/rejected": -1.6841932535171509, + "logps/chosen": -350.5254211425781, + "logps/rejected": -300.12103271484375, + "loss": 0.1248, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5203927755355835, + "rewards/margins": 0.08340970426797867, + "rewards/rejected": -0.603802502155304, + "step": 7620 + }, + { + "epoch": 0.92, + "learning_rate": 1.0819122817384897e-07, + "logits/chosen": -1.9861576557159424, + "logits/rejected": -1.5860307216644287, + "logps/chosen": -254.307861328125, + "logps/rejected": -279.30615234375, + "loss": 0.1191, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5575435757637024, + "rewards/margins": 0.1310313642024994, + "rewards/rejected": -0.6885749101638794, + "step": 7630 + }, + { + "epoch": 0.92, + "learning_rate": 1.0516529182638819e-07, + "logits/chosen": -1.825350046157837, + "logits/rejected": -1.5633935928344727, + "logps/chosen": -293.57757568359375, + "logps/rejected": -316.1748962402344, + "loss": 0.1375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5138299465179443, + "rewards/margins": 0.13412019610404968, + "rewards/rejected": -0.6479502320289612, + "step": 7640 + }, + { + "epoch": 0.92, + "learning_rate": 1.0218136389660211e-07, + "logits/chosen": -1.9528892040252686, + "logits/rejected": -1.6495583057403564, + "logps/chosen": -300.08294677734375, + "logps/rejected": -279.8624572753906, + "loss": 0.1515, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.441044420003891, + "rewards/margins": 0.08407802134752274, + "rewards/rejected": -0.5251224040985107, + "step": 7650 + }, + { + "epoch": 0.92, + "learning_rate": 9.923949672634714e-08, + "logits/chosen": -1.7535518407821655, + "logits/rejected": -1.5248991250991821, + "logps/chosen": -286.5433654785156, + "logps/rejected": -296.0802001953125, + "loss": 0.148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.519503116607666, + "rewards/margins": 0.18411189317703247, + "rewards/rejected": -0.7036150097846985, + "step": 7660 + }, + { + "epoch": 0.92, + "learning_rate": 9.633974191967794e-08, + "logits/chosen": -1.9656978845596313, + "logits/rejected": -1.546311616897583, + "logps/chosen": -251.5748748779297, + "logps/rejected": -249.95162963867188, + "loss": 0.1335, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3971438705921173, + "rewards/margins": 0.2471657246351242, + "rewards/rejected": -0.6443095207214355, + "step": 7670 + }, + { + "epoch": 0.92, + "learning_rate": 9.348215034194752e-08, + "logits/chosen": -1.918678879737854, + "logits/rejected": -1.3947335481643677, + "logps/chosen": -295.56622314453125, + "logps/rejected": -266.44805908203125, + "loss": 0.103, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43143653869628906, + "rewards/margins": 0.18379929661750793, + "rewards/rejected": -0.6152359247207642, + "step": 7680 + }, + { + "epoch": 0.92, + "learning_rate": 9.066677211891195e-08, + "logits/chosen": -1.9028263092041016, + "logits/rejected": -1.4752349853515625, + "logps/chosen": -219.3140869140625, + "logps/rejected": -224.39013671875, + "loss": 0.1199, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.47012224793434143, + "rewards/margins": 0.2023288756608963, + "rewards/rejected": -0.6724511384963989, + "step": 7690 + }, + { + "epoch": 0.92, + "learning_rate": 8.789365663585208e-08, + "logits/chosen": -2.116147518157959, + "logits/rejected": -1.8864881992340088, + "logps/chosen": -283.5893249511719, + "logps/rejected": -234.08224487304688, + "loss": 0.1225, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4223853051662445, + "rewards/margins": 0.1264767050743103, + "rewards/rejected": -0.5488620400428772, + "step": 7700 + }, + { + "epoch": 0.93, + "learning_rate": 8.516285253670597e-08, + "logits/chosen": -1.9596683979034424, + "logits/rejected": -1.6173160076141357, + "logps/chosen": -230.27188110351562, + "logps/rejected": -205.488525390625, + "loss": 0.1671, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5056071281433105, + "rewards/margins": 0.17759716510772705, + "rewards/rejected": -0.6832043528556824, + "step": 7710 + }, + { + "epoch": 0.93, + "learning_rate": 8.247440772321924e-08, + "logits/chosen": -1.8863664865493774, + "logits/rejected": -1.7962907552719116, + "logps/chosen": -265.54937744140625, + "logps/rejected": -279.07379150390625, + "loss": 0.1479, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5093497037887573, + "rewards/margins": 0.14122812449932098, + "rewards/rejected": -0.6505778431892395, + "step": 7720 + }, + { + "epoch": 0.93, + "learning_rate": 7.982836935409938e-08, + "logits/chosen": -1.9433815479278564, + "logits/rejected": -1.6725490093231201, + "logps/chosen": -274.528564453125, + "logps/rejected": -256.54766845703125, + "loss": 0.1345, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5039865970611572, + "rewards/margins": 0.16849592328071594, + "rewards/rejected": -0.6724825501441956, + "step": 7730 + }, + { + "epoch": 0.93, + "learning_rate": 7.722478384419335e-08, + "logits/chosen": -1.8644914627075195, + "logits/rejected": -1.4965304136276245, + "logps/chosen": -278.33746337890625, + "logps/rejected": -243.71817016601562, + "loss": 0.1328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49178391695022583, + "rewards/margins": 0.12286220490932465, + "rewards/rejected": -0.6146460771560669, + "step": 7740 + }, + { + "epoch": 0.93, + "learning_rate": 7.466369686367075e-08, + "logits/chosen": -2.0346133708953857, + "logits/rejected": -1.6774375438690186, + "logps/chosen": -293.4043273925781, + "logps/rejected": -247.53579711914062, + "loss": 0.1475, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.46793246269226074, + "rewards/margins": 0.09048617631196976, + "rewards/rejected": -0.5584186315536499, + "step": 7750 + }, + { + "epoch": 0.93, + "learning_rate": 7.21451533372236e-08, + "logits/chosen": -1.9626423120498657, + "logits/rejected": -1.580739974975586, + "logps/chosen": -258.93060302734375, + "logps/rejected": -241.1245574951172, + "loss": 0.1536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47752898931503296, + "rewards/margins": 0.1563757061958313, + "rewards/rejected": -0.6339046955108643, + "step": 7760 + }, + { + "epoch": 0.93, + "learning_rate": 6.966919744327783e-08, + "logits/chosen": -2.0937347412109375, + "logits/rejected": -1.5341440439224243, + "logps/chosen": -341.6336669921875, + "logps/rejected": -260.72589111328125, + "loss": 0.1603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42314642667770386, + "rewards/margins": 0.19263425469398499, + "rewards/rejected": -0.6157806515693665, + "step": 7770 + }, + { + "epoch": 0.93, + "learning_rate": 6.723587261321912e-08, + "logits/chosen": -1.9547611474990845, + "logits/rejected": -1.642380714416504, + "logps/chosen": -297.21649169921875, + "logps/rejected": -273.52459716796875, + "loss": 0.1617, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4811842441558838, + "rewards/margins": 0.11549937725067139, + "rewards/rejected": -0.5966835618019104, + "step": 7780 + }, + { + "epoch": 0.93, + "learning_rate": 6.484522153063056e-08, + "logits/chosen": -1.9168577194213867, + "logits/rejected": -1.5155632495880127, + "logps/chosen": -225.7365264892578, + "logps/rejected": -204.46615600585938, + "loss": 0.1319, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4766554832458496, + "rewards/margins": 0.1555982381105423, + "rewards/rejected": -0.6322537064552307, + "step": 7790 + }, + { + "epoch": 0.94, + "learning_rate": 6.249728613054313e-08, + "logits/chosen": -1.7597625255584717, + "logits/rejected": -1.5485173463821411, + "logps/chosen": -264.35321044921875, + "logps/rejected": -273.6183166503906, + "loss": 0.1479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.48368602991104126, + "rewards/margins": 0.07144041359424591, + "rewards/rejected": -0.5551263689994812, + "step": 7800 + }, + { + "epoch": 0.94, + "learning_rate": 6.01921075987022e-08, + "logits/chosen": -1.9097301959991455, + "logits/rejected": -1.186971664428711, + "logps/chosen": -246.1496124267578, + "logps/rejected": -194.6399688720703, + "loss": 0.1488, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4605187773704529, + "rewards/margins": 0.23739011585712433, + "rewards/rejected": -0.697908878326416, + "step": 7810 + }, + { + "epoch": 0.94, + "learning_rate": 5.7929726370843096e-08, + "logits/chosen": -2.1597225666046143, + "logits/rejected": -1.8278968334197998, + "logps/chosen": -254.0515594482422, + "logps/rejected": -271.48236083984375, + "loss": 0.1766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4439367651939392, + "rewards/margins": 0.2024269998073578, + "rewards/rejected": -0.6463637351989746, + "step": 7820 + }, + { + "epoch": 0.94, + "learning_rate": 5.5710182131981927e-08, + "logits/chosen": -1.99951171875, + "logits/rejected": -1.7067577838897705, + "logps/chosen": -238.32467651367188, + "logps/rejected": -256.7763671875, + "loss": 0.1304, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39582788944244385, + "rewards/margins": 0.21840229630470276, + "rewards/rejected": -0.614230215549469, + "step": 7830 + }, + { + "epoch": 0.94, + "learning_rate": 5.3533513815721694e-08, + "logits/chosen": -2.118350028991699, + "logits/rejected": -1.6194887161254883, + "logps/chosen": -272.92449951171875, + "logps/rejected": -227.6458282470703, + "loss": 0.0858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43067464232444763, + "rewards/margins": 0.21112823486328125, + "rewards/rejected": -0.6418029069900513, + "step": 7840 + }, + { + "epoch": 0.94, + "learning_rate": 5.1399759603565916e-08, + "logits/chosen": -2.176056385040283, + "logits/rejected": -1.72799551486969, + "logps/chosen": -259.24176025390625, + "logps/rejected": -220.6403350830078, + "loss": 0.1577, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.41245096921920776, + "rewards/margins": 0.16166391968727112, + "rewards/rejected": -0.5741148591041565, + "step": 7850 + }, + { + "epoch": 0.94, + "learning_rate": 4.930895692425192e-08, + "logits/chosen": -2.08243989944458, + "logits/rejected": -1.6198593378067017, + "logps/chosen": -313.93475341796875, + "logps/rejected": -302.1040344238281, + "loss": 0.0815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48940593004226685, + "rewards/margins": 0.1875011920928955, + "rewards/rejected": -0.6769071817398071, + "step": 7860 + }, + { + "epoch": 0.94, + "learning_rate": 4.726114245309249e-08, + "logits/chosen": -2.1306631565093994, + "logits/rejected": -1.8443044424057007, + "logps/chosen": -260.3841552734375, + "logps/rejected": -240.7478790283203, + "loss": 0.084, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4020455777645111, + "rewards/margins": 0.11921534687280655, + "rewards/rejected": -0.5212609171867371, + "step": 7870 + }, + { + "epoch": 0.95, + "learning_rate": 4.5256352111333334e-08, + "logits/chosen": -2.197538137435913, + "logits/rejected": -1.9828275442123413, + "logps/chosen": -265.1056213378906, + "logps/rejected": -240.36581420898438, + "loss": 0.1341, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.41930776834487915, + "rewards/margins": 0.10385145992040634, + "rewards/rejected": -0.5231592059135437, + "step": 7880 + }, + { + "epoch": 0.95, + "learning_rate": 4.32946210655219e-08, + "logits/chosen": -1.879839539527893, + "logits/rejected": -1.6344287395477295, + "logps/chosen": -289.47540283203125, + "logps/rejected": -303.8387756347656, + "loss": 0.0844, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4271796643733978, + "rewards/margins": 0.12638108432292938, + "rewards/rejected": -0.553560733795166, + "step": 7890 + }, + { + "epoch": 0.95, + "learning_rate": 4.137598372689289e-08, + "logits/chosen": -1.9711477756500244, + "logits/rejected": -1.640442132949829, + "logps/chosen": -289.0892639160156, + "logps/rejected": -256.19390869140625, + "loss": 0.1124, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.5559543967247009, + "rewards/margins": 0.06960447132587433, + "rewards/rejected": -0.6255587935447693, + "step": 7900 + }, + { + "epoch": 0.95, + "learning_rate": 3.950047375076177e-08, + "logits/chosen": -1.9874846935272217, + "logits/rejected": -1.746363639831543, + "logps/chosen": -253.0285186767578, + "logps/rejected": -267.914306640625, + "loss": 0.151, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4185236096382141, + "rewards/margins": 0.170128732919693, + "rewards/rejected": -0.5886522531509399, + "step": 7910 + }, + { + "epoch": 0.95, + "learning_rate": 3.7668124035936395e-08, + "logits/chosen": -1.8806092739105225, + "logits/rejected": -1.7465565204620361, + "logps/chosen": -281.9869384765625, + "logps/rejected": -293.6903381347656, + "loss": 0.1098, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48278847336769104, + "rewards/margins": 0.16966548562049866, + "rewards/rejected": -0.6524539589881897, + "step": 7920 + }, + { + "epoch": 0.95, + "learning_rate": 3.587896672413882e-08, + "logits/chosen": -1.954755187034607, + "logits/rejected": -1.6159816980361938, + "logps/chosen": -330.97698974609375, + "logps/rejected": -257.1192932128906, + "loss": 0.1392, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4463822841644287, + "rewards/margins": 0.12165029346942902, + "rewards/rejected": -0.5680325031280518, + "step": 7930 + }, + { + "epoch": 0.95, + "learning_rate": 3.413303319944244e-08, + "logits/chosen": -1.8800256252288818, + "logits/rejected": -1.666426420211792, + "logps/chosen": -255.54055786132812, + "logps/rejected": -272.89056396484375, + "loss": 0.1096, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5043514370918274, + "rewards/margins": 0.13485528528690338, + "rewards/rejected": -0.6392067670822144, + "step": 7940 + }, + { + "epoch": 0.95, + "learning_rate": 3.243035408772077e-08, + "logits/chosen": -1.8116118907928467, + "logits/rejected": -1.465587854385376, + "logps/chosen": -263.7826232910156, + "logps/rejected": -215.6946563720703, + "loss": 0.1744, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.47774848341941833, + "rewards/margins": 0.16809609532356262, + "rewards/rejected": -0.645844578742981, + "step": 7950 + }, + { + "epoch": 0.96, + "learning_rate": 3.077095925611007e-08, + "logits/chosen": -1.8181393146514893, + "logits/rejected": -1.7243306636810303, + "logps/chosen": -255.312255859375, + "logps/rejected": -261.9159851074219, + "loss": 0.1284, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.49375930428504944, + "rewards/margins": 0.1126602292060852, + "rewards/rejected": -0.6064194440841675, + "step": 7960 + }, + { + "epoch": 0.96, + "learning_rate": 2.915487781248616e-08, + "logits/chosen": -2.0126254558563232, + "logits/rejected": -1.7640451192855835, + "logps/chosen": -258.76007080078125, + "logps/rejected": -300.79937744140625, + "loss": 0.0698, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.43722277879714966, + "rewards/margins": 0.21826669573783875, + "rewards/rejected": -0.6554895043373108, + "step": 7970 + }, + { + "epoch": 0.96, + "learning_rate": 2.7582138104953748e-08, + "logits/chosen": -1.9624278545379639, + "logits/rejected": -1.6752490997314453, + "logps/chosen": -219.3885955810547, + "logps/rejected": -218.8914794921875, + "loss": 0.1453, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4231085181236267, + "rewards/margins": 0.1884673833847046, + "rewards/rejected": -0.6115759015083313, + "step": 7980 + }, + { + "epoch": 0.96, + "learning_rate": 2.6052767721348184e-08, + "logits/chosen": -2.087956666946411, + "logits/rejected": -1.7118041515350342, + "logps/chosen": -260.5646667480469, + "logps/rejected": -232.9169921875, + "loss": 0.1097, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46867817640304565, + "rewards/margins": 0.12946224212646484, + "rewards/rejected": -0.5981403589248657, + "step": 7990 + }, + { + "epoch": 0.96, + "learning_rate": 2.4566793488752795e-08, + "logits/chosen": -1.9538524150848389, + "logits/rejected": -1.9380991458892822, + "logps/chosen": -240.32583618164062, + "logps/rejected": -280.9888000488281, + "loss": 0.1325, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5332067608833313, + "rewards/margins": 0.08598224818706512, + "rewards/rejected": -0.6191889643669128, + "step": 8000 + }, + { + "epoch": 0.96, + "learning_rate": 2.3124241473027333e-08, + "logits/chosen": -1.9650394916534424, + "logits/rejected": -1.666481375694275, + "logps/chosen": -229.7511444091797, + "logps/rejected": -257.80889892578125, + "loss": 0.1788, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.44220954179763794, + "rewards/margins": 0.14231547713279724, + "rewards/rejected": -0.5845250487327576, + "step": 8010 + }, + { + "epoch": 0.96, + "learning_rate": 2.1725136978351934e-08, + "logits/chosen": -2.1228089332580566, + "logits/rejected": -1.5950334072113037, + "logps/chosen": -295.29400634765625, + "logps/rejected": -244.79330444335938, + "loss": 0.1059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36449581384658813, + "rewards/margins": 0.2231093943119049, + "rewards/rejected": -0.5876051783561707, + "step": 8020 + }, + { + "epoch": 0.96, + "learning_rate": 2.036950454678166e-08, + "logits/chosen": -2.112056016921997, + "logits/rejected": -1.595947504043579, + "logps/chosen": -281.60601806640625, + "logps/rejected": -243.0486297607422, + "loss": 0.1566, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3312150835990906, + "rewards/margins": 0.19925157725811005, + "rewards/rejected": -0.530466616153717, + "step": 8030 + }, + { + "epoch": 0.96, + "learning_rate": 1.9057367957817096e-08, + "logits/chosen": -1.9058849811553955, + "logits/rejected": -1.4319086074829102, + "logps/chosen": -244.7132568359375, + "logps/rejected": -223.8097686767578, + "loss": 0.09, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38957375288009644, + "rewards/margins": 0.19439074397087097, + "rewards/rejected": -0.583964467048645, + "step": 8040 + }, + { + "epoch": 0.97, + "learning_rate": 1.778875022798693e-08, + "logits/chosen": -1.5648739337921143, + "logits/rejected": -1.51466965675354, + "logps/chosen": -189.83966064453125, + "logps/rejected": -232.4983367919922, + "loss": 0.1424, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5467423796653748, + "rewards/margins": 0.11828476190567017, + "rewards/rejected": -0.6650272011756897, + "step": 8050 + }, + { + "epoch": 0.97, + "learning_rate": 1.6563673610444363e-08, + "logits/chosen": -2.011043071746826, + "logits/rejected": -1.761691689491272, + "logps/chosen": -261.29498291015625, + "logps/rejected": -261.8739929199219, + "loss": 0.0942, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5379740595817566, + "rewards/margins": 0.1529974639415741, + "rewards/rejected": -0.6909714937210083, + "step": 8060 + }, + { + "epoch": 0.97, + "learning_rate": 1.5382159594576616e-08, + "logits/chosen": -1.802074670791626, + "logits/rejected": -1.4913840293884277, + "logps/chosen": -231.8621368408203, + "logps/rejected": -263.0806884765625, + "loss": 0.1387, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5238322019577026, + "rewards/margins": 0.1918632835149765, + "rewards/rejected": -0.7156955003738403, + "step": 8070 + }, + { + "epoch": 0.97, + "learning_rate": 1.424422890562771e-08, + "logits/chosen": -2.1378629207611084, + "logits/rejected": -1.9434757232666016, + "logps/chosen": -232.4508514404297, + "logps/rejected": -239.05911254882812, + "loss": 0.1501, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.42561620473861694, + "rewards/margins": 0.15531761944293976, + "rewards/rejected": -0.5809338688850403, + "step": 8080 + }, + { + "epoch": 0.97, + "learning_rate": 1.3149901504335706e-08, + "logits/chosen": -1.9773696660995483, + "logits/rejected": -1.4441204071044922, + "logps/chosen": -237.9873504638672, + "logps/rejected": -219.8182373046875, + "loss": 0.1467, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.43333473801612854, + "rewards/margins": 0.14528748393058777, + "rewards/rejected": -0.5786222219467163, + "step": 8090 + }, + { + "epoch": 0.97, + "learning_rate": 1.2099196586581596e-08, + "logits/chosen": -1.9610668420791626, + "logits/rejected": -1.8130521774291992, + "logps/chosen": -229.5089569091797, + "logps/rejected": -231.4257354736328, + "loss": 0.1316, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.501410722732544, + "rewards/margins": 0.13243083655834198, + "rewards/rejected": -0.6338415741920471, + "step": 8100 + }, + { + "epoch": 0.97, + "learning_rate": 1.1092132583053472e-08, + "logits/chosen": -1.984513521194458, + "logits/rejected": -1.6125694513320923, + "logps/chosen": -297.69390869140625, + "logps/rejected": -272.7558898925781, + "loss": 0.0944, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4865299165248871, + "rewards/margins": 0.19219791889190674, + "rewards/rejected": -0.6787278056144714, + "step": 8110 + }, + { + "epoch": 0.97, + "learning_rate": 1.0128727158922603e-08, + "logits/chosen": -1.8715741634368896, + "logits/rejected": -1.7751662731170654, + "logps/chosen": -242.9435577392578, + "logps/rejected": -240.58901977539062, + "loss": 0.1389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4711694121360779, + "rewards/margins": 0.08951371163129807, + "rewards/rejected": -0.5606831312179565, + "step": 8120 + }, + { + "epoch": 0.98, + "learning_rate": 9.20899721353341e-09, + "logits/chosen": -2.031369924545288, + "logits/rejected": -1.8035614490509033, + "logps/chosen": -258.4107971191406, + "logps/rejected": -323.7535705566406, + "loss": 0.1405, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4729674458503723, + "rewards/margins": 0.15468671917915344, + "rewards/rejected": -0.6276541948318481, + "step": 8130 + }, + { + "epoch": 0.98, + "learning_rate": 8.332958880108155e-09, + "logits/chosen": -2.0815842151641846, + "logits/rejected": -1.6333885192871094, + "logps/chosen": -278.9797668457031, + "logps/rejected": -250.55886840820312, + "loss": 0.1568, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3721701502799988, + "rewards/margins": 0.18021699786186218, + "rewards/rejected": -0.5523871183395386, + "step": 8140 + }, + { + "epoch": 0.98, + "learning_rate": 7.500627525462711e-09, + "logits/chosen": -2.008918285369873, + "logits/rejected": -1.5942274332046509, + "logps/chosen": -286.1038513183594, + "logps/rejected": -229.10812377929688, + "loss": 0.1275, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4887969493865967, + "rewards/margins": 0.1278802454471588, + "rewards/rejected": -0.6166771650314331, + "step": 8150 + }, + { + "epoch": 0.98, + "learning_rate": 6.712017749737343e-09, + "logits/chosen": -2.0941195487976074, + "logits/rejected": -1.3795114755630493, + "logps/chosen": -307.4806823730469, + "logps/rejected": -249.28445434570312, + "loss": 0.1405, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4613245129585266, + "rewards/margins": 0.1985275000333786, + "rewards/rejected": -0.6598520278930664, + "step": 8160 + }, + { + "epoch": 0.98, + "learning_rate": 5.96714338614135e-09, + "logits/chosen": -2.215099334716797, + "logits/rejected": -1.6631901264190674, + "logps/chosen": -375.0947265625, + "logps/rejected": -313.10205078125, + "loss": 0.1627, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.39094269275665283, + "rewards/margins": 0.1836312711238861, + "rewards/rejected": -0.5745739936828613, + "step": 8170 + }, + { + "epoch": 0.98, + "learning_rate": 5.266017500709098e-09, + "logits/chosen": -2.128441095352173, + "logits/rejected": -1.8798637390136719, + "logps/chosen": -252.12783813476562, + "logps/rejected": -264.87506103515625, + "loss": 0.1501, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4046745300292969, + "rewards/margins": 0.11516742408275604, + "rewards/rejected": -0.5198420286178589, + "step": 8180 + }, + { + "epoch": 0.98, + "learning_rate": 4.608652392072144e-09, + "logits/chosen": -2.095583438873291, + "logits/rejected": -1.7360079288482666, + "logps/chosen": -267.0264587402344, + "logps/rejected": -242.8651885986328, + "loss": 0.1365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4499017596244812, + "rewards/margins": 0.15220007300376892, + "rewards/rejected": -0.6021018028259277, + "step": 8190 + }, + { + "epoch": 0.98, + "learning_rate": 3.995059591242467e-09, + "logits/chosen": -2.0483319759368896, + "logits/rejected": -1.7100093364715576, + "logps/chosen": -344.25360107421875, + "logps/rejected": -320.6583251953125, + "loss": 0.113, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4792884886264801, + "rewards/margins": 0.13656087219715118, + "rewards/rejected": -0.6158494353294373, + "step": 8200 + }, + { + "epoch": 0.99, + "learning_rate": 3.4252498614106843e-09, + "logits/chosen": -1.996120810508728, + "logits/rejected": -1.5770938396453857, + "logps/chosen": -301.490478515625, + "logps/rejected": -247.4650421142578, + "loss": 0.1302, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.49081936478614807, + "rewards/margins": 0.14099135994911194, + "rewards/rejected": -0.6318107843399048, + "step": 8210 + }, + { + "epoch": 0.99, + "learning_rate": 2.8992331977570343e-09, + "logits/chosen": -2.1005160808563232, + "logits/rejected": -1.9152495861053467, + "logps/chosen": -269.256591796875, + "logps/rejected": -259.864501953125, + "loss": 0.1759, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4604567885398865, + "rewards/margins": 0.09413363039493561, + "rewards/rejected": -0.5545904636383057, + "step": 8220 + }, + { + "epoch": 0.99, + "learning_rate": 2.4170188272770736e-09, + "logits/chosen": -2.0771050453186035, + "logits/rejected": -1.7211967706680298, + "logps/chosen": -322.2476501464844, + "logps/rejected": -295.37115478515625, + "loss": 0.1513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.380420446395874, + "rewards/margins": 0.18110953271389008, + "rewards/rejected": -0.5615299940109253, + "step": 8230 + }, + { + "epoch": 0.99, + "learning_rate": 1.9786152086181955e-09, + "logits/chosen": -2.000828266143799, + "logits/rejected": -1.55103600025177, + "logps/chosen": -267.5838928222656, + "logps/rejected": -248.3422393798828, + "loss": 0.1141, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4607185423374176, + "rewards/margins": 0.19947698712348938, + "rewards/rejected": -0.660195529460907, + "step": 8240 + }, + { + "epoch": 0.99, + "learning_rate": 1.5840300319316937e-09, + "logits/chosen": -1.786924123764038, + "logits/rejected": -1.3054568767547607, + "logps/chosen": -270.92205810546875, + "logps/rejected": -260.0880126953125, + "loss": 0.1442, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.541989266872406, + "rewards/margins": 0.18230721354484558, + "rewards/rejected": -0.7242964506149292, + "step": 8250 + }, + { + "epoch": 0.99, + "learning_rate": 1.23327021873898e-09, + "logits/chosen": -2.0959393978118896, + "logits/rejected": -1.521923303604126, + "logps/chosen": -264.62353515625, + "logps/rejected": -225.8861846923828, + "loss": 0.1201, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37499696016311646, + "rewards/margins": 0.20748789608478546, + "rewards/rejected": -0.5824848413467407, + "step": 8260 + }, + { + "epoch": 0.99, + "learning_rate": 9.263419218089042e-10, + "logits/chosen": -1.8493993282318115, + "logits/rejected": -1.476485013961792, + "logps/chosen": -253.7661895751953, + "logps/rejected": -263.26422119140625, + "loss": 0.1452, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4299536645412445, + "rewards/margins": 0.20787473022937775, + "rewards/rejected": -0.6378284692764282, + "step": 8270 + }, + { + "epoch": 0.99, + "learning_rate": 6.632505250506183e-10, + "logits/chosen": -2.12581205368042, + "logits/rejected": -1.7464358806610107, + "logps/chosen": -305.6679382324219, + "logps/rejected": -250.2874298095703, + "loss": 0.1467, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44572052359580994, + "rewards/margins": 0.15005187690258026, + "rewards/rejected": -0.595772385597229, + "step": 8280 + }, + { + "epoch": 0.99, + "learning_rate": 4.440006434183741e-10, + "logits/chosen": -1.8823049068450928, + "logits/rejected": -1.5557067394256592, + "logps/chosen": -312.54608154296875, + "logps/rejected": -316.3292541503906, + "loss": 0.1481, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5499744415283203, + "rewards/margins": 0.11392368376255035, + "rewards/rejected": -0.663898229598999, + "step": 8290 + }, + { + "epoch": 1.0, + "learning_rate": 2.6859612283186567e-10, + "logits/chosen": -2.018319845199585, + "logits/rejected": -1.6919094324111938, + "logps/chosen": -225.6822052001953, + "logps/rejected": -243.1230926513672, + "loss": 0.1243, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4279976785182953, + "rewards/margins": 0.1937953531742096, + "rewards/rejected": -0.6217929720878601, + "step": 8300 + }, + { + "epoch": 1.0, + "learning_rate": 1.370400401065619e-10, + "logits/chosen": -1.9666255712509155, + "logits/rejected": -1.7945621013641357, + "logps/chosen": -199.1581573486328, + "logps/rejected": -203.54124450683594, + "loss": 0.1544, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5227402448654175, + "rewards/margins": 0.12387014925479889, + "rewards/rejected": -0.6466103792190552, + "step": 8310 + }, + { + "epoch": 1.0, + "learning_rate": 4.933470290263698e-11, + "logits/chosen": -2.083996295928955, + "logits/rejected": -1.7333097457885742, + "logps/chosen": -277.2462463378906, + "logps/rejected": -264.87225341796875, + "loss": 0.155, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4118651747703552, + "rewards/margins": 0.1753673553466797, + "rewards/rejected": -0.5872325301170349, + "step": 8320 + }, + { + "epoch": 1.0, + "learning_rate": 5.481649681671197e-12, + "logits/chosen": -1.8581920862197876, + "logits/rejected": -1.672580361366272, + "logps/chosen": -255.0128173828125, + "logps/rejected": -287.12408447265625, + "loss": 0.1253, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5130528211593628, + "rewards/margins": 0.1296093761920929, + "rewards/rejected": -0.6426622867584229, + "step": 8330 + }, + { + "epoch": 1.0, + "step": 8335, + "total_flos": 0.0, + "train_loss": 0.14340321629899808, + "train_runtime": 34860.3009, + "train_samples_per_second": 0.956, + "train_steps_per_second": 0.239 + } + ], + "logging_steps": 10, + "max_steps": 8335, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}