{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994767137624281, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.208333333333333e-09, "logits/chosen": -2.919764995574951, "logits/rejected": -2.686896800994873, "logps/chosen": -229.94229125976562, "logps/rejected": -214.70114135742188, "loss": 0.9741, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.6810548305511475, "logits/rejected": -2.709120035171509, "logps/chosen": -295.81451416015625, "logps/rejected": -250.5977325439453, "loss": 0.9742, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0003499284212011844, "rewards/margins": 0.0001925795222632587, "rewards/rejected": 0.00015734886983409524, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.6195099353790283, "logits/rejected": -2.625662088394165, "logps/chosen": -271.3158264160156, "logps/rejected": -246.94711303710938, "loss": 0.9738, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00036363088293001056, "rewards/margins": 0.0009133815765380859, "rewards/rejected": -0.0005497508682310581, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.7032079696655273, "logits/rejected": -2.666191577911377, "logps/chosen": -278.3299865722656, "logps/rejected": -254.5498809814453, "loss": 0.9731, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0010723542654886842, "rewards/margins": 0.0018442096188664436, "rewards/rejected": -0.0007718555280007422, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6499533653259277, "logits/rejected": -2.6374642848968506, "logps/chosen": -273.9149475097656, "logps/rejected": -237.7373504638672, "loss": 0.9713, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.003091720398515463, "rewards/margins": 0.005813647527247667, "rewards/rejected": -0.002721927361562848, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.6743884086608887, "logits/rejected": -2.6382362842559814, "logps/chosen": -296.06744384765625, "logps/rejected": -274.7203063964844, "loss": 0.9657, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.009085027500987053, "rewards/margins": 0.016131814569234848, "rewards/rejected": -0.007046787533909082, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.630288600921631, "logits/rejected": -2.6329100131988525, "logps/chosen": -285.3112487792969, "logps/rejected": -274.3540954589844, "loss": 0.9577, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.02476242184638977, "rewards/margins": 0.0284078661352396, "rewards/rejected": -0.0036454431246966124, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.6492714881896973, "logits/rejected": -2.6846468448638916, "logps/chosen": -310.99761962890625, "logps/rejected": -290.5529479980469, "loss": 0.9406, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03506559133529663, "rewards/margins": 0.05608881637454033, "rewards/rejected": -0.021023228764533997, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.523487091064453, "logits/rejected": -2.464901924133301, "logps/chosen": -304.66845703125, "logps/rejected": -281.81732177734375, "loss": 0.9095, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.04107608273625374, "rewards/margins": 0.11836276948451996, "rewards/rejected": -0.1594388782978058, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.5307559967041016, "logits/rejected": -2.5012693405151367, "logps/chosen": -292.33392333984375, "logps/rejected": -284.6061096191406, "loss": 0.8948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.00477323355153203, "rewards/margins": 0.17751149833202362, "rewards/rejected": -0.18228471279144287, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.999732492681437e-07, "logits/chosen": -2.501075267791748, "logits/rejected": -2.491670846939087, "logps/chosen": -340.92401123046875, "logps/rejected": -345.025390625, "loss": 0.8957, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.35498708486557007, "rewards/margins": 0.17348773777484894, "rewards/rejected": -0.5284748077392578, "step": 100 }, { "epoch": 0.1, "eval_logits/chosen": -2.54604434967041, "eval_logits/rejected": -2.5107295513153076, "eval_logps/chosen": -336.2060241699219, "eval_logps/rejected": -330.2667541503906, "eval_loss": 0.9028440117835999, "eval_rewards/accuracies": 0.6904761791229248, "eval_rewards/chosen": -0.5209627151489258, "eval_rewards/margins": 0.1639193296432495, "eval_rewards/rejected": -0.6848820447921753, "eval_runtime": 245.63, "eval_samples_per_second": 8.142, "eval_steps_per_second": 0.256, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.996723692767926e-07, "logits/chosen": -2.258904218673706, "logits/rejected": -2.185375452041626, "logps/chosen": -319.14654541015625, "logps/rejected": -304.89739990234375, "loss": 0.8703, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5882319211959839, "rewards/margins": 0.2905711531639099, "rewards/rejected": -0.8788030743598938, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.990375746213598e-07, "logits/chosen": -1.46225106716156, "logits/rejected": -1.342179775238037, "logps/chosen": -348.0282897949219, "logps/rejected": -337.330078125, "loss": 0.8274, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.42451825737953186, "rewards/margins": 0.41508832573890686, "rewards/rejected": -0.8396065831184387, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.980697142834314e-07, "logits/chosen": -1.1638177633285522, "logits/rejected": -0.918566107749939, "logps/chosen": -371.7701721191406, "logps/rejected": -366.410400390625, "loss": 0.8256, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5147355198860168, "rewards/margins": 0.4547084867954254, "rewards/rejected": -0.9694439172744751, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.967700826904229e-07, "logits/chosen": 0.09236583858728409, "logits/rejected": 0.07060788571834564, "logps/chosen": -294.2372741699219, "logps/rejected": -336.0912170410156, "loss": 0.771, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5445362329483032, "rewards/margins": 0.5613424181938171, "rewards/rejected": -1.1058785915374756, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.951404179843962e-07, "logits/chosen": 0.7083513140678406, "logits/rejected": 0.5464950203895569, "logps/chosen": -364.0424499511719, "logps/rejected": -411.08209228515625, "loss": 0.8494, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8699826002120972, "rewards/margins": 0.39457255601882935, "rewards/rejected": -1.2645552158355713, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.931828996974498e-07, "logits/chosen": 0.4938809871673584, "logits/rejected": 0.9410598874092102, "logps/chosen": -424.72430419921875, "logps/rejected": -468.349609375, "loss": 0.7631, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2891677618026733, "rewards/margins": 0.488609254360199, "rewards/rejected": -1.777777075767517, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.909001458367866e-07, "logits/chosen": 0.14566074311733246, "logits/rejected": 0.20485401153564453, "logps/chosen": -388.87158203125, "logps/rejected": -427.6263732910156, "loss": 0.7772, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1580675840377808, "rewards/margins": 0.6463179588317871, "rewards/rejected": -1.8043855428695679, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.882952093833627e-07, "logits/chosen": 0.1968054324388504, "logits/rejected": 0.5141702890396118, "logps/chosen": -351.22821044921875, "logps/rejected": -409.0249938964844, "loss": 0.7395, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.02028489112854, "rewards/margins": 0.7496393322944641, "rewards/rejected": -1.7699241638183594, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.853715742087946e-07, "logits/chosen": 0.310161292552948, "logits/rejected": 0.9174222946166992, "logps/chosen": -406.7144775390625, "logps/rejected": -464.5382385253906, "loss": 0.7373, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2787965536117554, "rewards/margins": 0.8055577278137207, "rewards/rejected": -2.0843544006347656, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.821331504159906e-07, "logits/chosen": 0.49894601106643677, "logits/rejected": 1.0194227695465088, "logps/chosen": -405.57330322265625, "logps/rejected": -467.26641845703125, "loss": 0.7658, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1890487670898438, "rewards/margins": 0.7286871671676636, "rewards/rejected": -1.9177358150482178, "step": 200 }, { "epoch": 0.21, "eval_logits/chosen": -0.152938574552536, "eval_logits/rejected": 0.33469492197036743, "eval_logps/chosen": -378.24761962890625, "eval_logps/rejected": -431.1014709472656, "eval_loss": 0.7649896144866943, "eval_rewards/accuracies": 0.7460317611694336, "eval_rewards/chosen": -0.941378653049469, "eval_rewards/margins": 0.7518512010574341, "eval_rewards/rejected": -1.6932299137115479, "eval_runtime": 243.5152, "eval_samples_per_second": 8.213, "eval_steps_per_second": 0.259, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.785842691097342e-07, "logits/chosen": -0.3292608857154846, "logits/rejected": 0.1720762550830841, "logps/chosen": -389.8594665527344, "logps/rejected": -401.64581298828125, "loss": 0.7634, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8458383679389954, "rewards/margins": 0.6316131353378296, "rewards/rejected": -1.4774516820907593, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7472967660421603e-07, "logits/chosen": 0.5375509858131409, "logits/rejected": 0.9775497317314148, "logps/chosen": -387.09014892578125, "logps/rejected": -446.6785583496094, "loss": 0.7559, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9755613207817078, "rewards/margins": 0.7280157208442688, "rewards/rejected": -1.7035770416259766, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.705745280752585e-07, "logits/chosen": 0.5464267134666443, "logits/rejected": 1.0262590646743774, "logps/chosen": -444.06072998046875, "logps/rejected": -483.55926513671875, "loss": 0.7417, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5034980773925781, "rewards/margins": 0.8456419706344604, "rewards/rejected": -2.349139928817749, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.6612438066572555e-07, "logits/chosen": 0.6961275935173035, "logits/rejected": 1.6300386190414429, "logps/chosen": -418.9384765625, "logps/rejected": -454.033935546875, "loss": 0.6919, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4168249368667603, "rewards/margins": 0.863106369972229, "rewards/rejected": -2.2799313068389893, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.6138518605333664e-07, "logits/chosen": -0.6623570919036865, "logits/rejected": -0.1930474489927292, "logps/chosen": -347.79315185546875, "logps/rejected": -429.45465087890625, "loss": 0.7378, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8020486831665039, "rewards/margins": 0.6783057451248169, "rewards/rejected": -1.4803544282913208, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.5636328249082514e-07, "logits/chosen": -1.4540965557098389, "logits/rejected": -0.7168424725532532, "logps/chosen": -342.86102294921875, "logps/rejected": -387.8767395019531, "loss": 0.7447, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5879959464073181, "rewards/margins": 0.7121099233627319, "rewards/rejected": -1.3001058101654053, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.510653863290871e-07, "logits/chosen": -0.18550051748752594, "logits/rejected": 0.8060259819030762, "logps/chosen": -402.1506042480469, "logps/rejected": -452.3603515625, "loss": 0.7185, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.002969741821289, "rewards/margins": 1.018422245979309, "rewards/rejected": -2.0213921070098877, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.4549858303465737e-07, "logits/chosen": 0.21997830271720886, "logits/rejected": 0.8937602043151855, "logps/chosen": -420.73773193359375, "logps/rejected": -503.6924743652344, "loss": 0.6991, "rewards/accuracies": 0.78125, "rewards/chosen": -1.295493721961975, "rewards/margins": 0.9076651334762573, "rewards/rejected": -2.2031588554382324, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.396703177135261e-07, "logits/chosen": 0.11293928325176239, "logits/rejected": 0.5330738425254822, "logps/chosen": -414.304931640625, "logps/rejected": -458.75946044921875, "loss": 0.7064, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2927477359771729, "rewards/margins": 0.7714477777481079, "rewards/rejected": -2.0641958713531494, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.335883851539693e-07, "logits/chosen": 0.7407528162002563, "logits/rejected": 1.589734435081482, "logps/chosen": -405.79022216796875, "logps/rejected": -488.19036865234375, "loss": 0.7079, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.223577857017517, "rewards/margins": 1.1488986015319824, "rewards/rejected": -2.372476816177368, "step": 300 }, { "epoch": 0.31, "eval_logits/chosen": 0.87442946434021, "eval_logits/rejected": 1.8370469808578491, "eval_logps/chosen": -422.4754333496094, "eval_logps/rejected": -510.4591064453125, "eval_loss": 0.7289105653762817, "eval_rewards/accuracies": 0.7559523582458496, "eval_rewards/chosen": -1.383657455444336, "eval_rewards/margins": 1.1031482219696045, "eval_rewards/rejected": -2.4868052005767822, "eval_runtime": 242.4112, "eval_samples_per_second": 8.25, "eval_steps_per_second": 0.26, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.272609194017105e-07, "logits/chosen": 0.7091141939163208, "logits/rejected": 1.7715873718261719, "logps/chosen": -404.2896728515625, "logps/rejected": -514.3563842773438, "loss": 0.6597, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2207536697387695, "rewards/margins": 1.231592059135437, "rewards/rejected": -2.452345848083496, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.2069638288135547e-07, "logits/chosen": 0.6078277826309204, "logits/rejected": 1.4416134357452393, "logps/chosen": -417.4878845214844, "logps/rejected": -491.2012634277344, "loss": 0.7219, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.358330249786377, "rewards/margins": 0.8599346280097961, "rewards/rejected": -2.2182650566101074, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.139035550786494e-07, "logits/chosen": 0.2845739424228668, "logits/rejected": 0.876905620098114, "logps/chosen": -394.2958068847656, "logps/rejected": -421.72393798828125, "loss": 0.7567, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.147202730178833, "rewards/margins": 0.7379333972930908, "rewards/rejected": -1.8851358890533447, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.0689152079869306e-07, "logits/chosen": -0.5365289449691772, "logits/rejected": 0.34862279891967773, "logps/chosen": -342.41485595703125, "logps/rejected": -388.524658203125, "loss": 0.776, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9666112065315247, "rewards/margins": 0.6763932108879089, "rewards/rejected": -1.6430044174194336, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.99669658015821e-07, "logits/chosen": 0.45323339104652405, "logits/rejected": 0.7332956194877625, "logps/chosen": -399.76519775390625, "logps/rejected": -506.49658203125, "loss": 0.7213, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.27635657787323, "rewards/margins": 0.998461127281189, "rewards/rejected": -2.2748172283172607, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.92247625331392e-07, "logits/chosen": 0.6511309742927551, "logits/rejected": 1.2110098600387573, "logps/chosen": -422.53778076171875, "logps/rejected": -472.4964294433594, "loss": 0.7003, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3980082273483276, "rewards/margins": 0.8997832536697388, "rewards/rejected": -2.2977914810180664, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.846353490562664e-07, "logits/chosen": 0.30174368619918823, "logits/rejected": 0.9621099233627319, "logps/chosen": -372.94635009765625, "logps/rejected": -497.25714111328125, "loss": 0.6659, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3280470371246338, "rewards/margins": 1.1047414541244507, "rewards/rejected": -2.432788372039795, "step": 370 }, { "epoch": 0.4, "learning_rate": 3.768430099352445e-07, "logits/chosen": -0.1737074851989746, "logits/rejected": 1.1676225662231445, "logps/chosen": -460.0953063964844, "logps/rejected": -537.3989868164062, "loss": 0.6788, "rewards/accuracies": 0.75, "rewards/chosen": -1.6348745822906494, "rewards/margins": 1.1145647764205933, "rewards/rejected": -2.7494394779205322, "step": 380 }, { "epoch": 0.41, "learning_rate": 3.6888102953122304e-07, "logits/chosen": -0.2348991185426712, "logits/rejected": 0.7535260915756226, "logps/chosen": -421.128173828125, "logps/rejected": -482.41656494140625, "loss": 0.7027, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4536683559417725, "rewards/margins": 1.019152045249939, "rewards/rejected": -2.47282075881958, "step": 390 }, { "epoch": 0.42, "learning_rate": 3.607600562872785e-07, "logits/chosen": -0.24078145623207092, "logits/rejected": 0.8144145011901855, "logps/chosen": -454.4244689941406, "logps/rejected": -500.9457092285156, "loss": 0.6806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5004165172576904, "rewards/margins": 0.854039192199707, "rewards/rejected": -2.3544554710388184, "step": 400 }, { "epoch": 0.42, "eval_logits/chosen": 0.09922664612531662, "eval_logits/rejected": 1.2713433504104614, "eval_logps/chosen": -416.9630432128906, "eval_logps/rejected": -503.67401123046875, "eval_loss": 0.7040213346481323, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -1.3285325765609741, "eval_rewards/margins": 1.0904221534729004, "eval_rewards/rejected": -2.418954849243164, "eval_runtime": 242.7344, "eval_samples_per_second": 8.239, "eval_steps_per_second": 0.26, "step": 400 }, { "epoch": 0.43, "learning_rate": 3.5249095128531856e-07, "logits/chosen": -0.06878291815519333, "logits/rejected": 1.236665964126587, "logps/chosen": -443.3794860839844, "logps/rejected": -522.2283325195312, "loss": 0.7085, "rewards/accuracies": 0.75, "rewards/chosen": -1.2952146530151367, "rewards/margins": 1.0169426202774048, "rewards/rejected": -2.312157154083252, "step": 410 }, { "epoch": 0.44, "learning_rate": 3.4408477372034736e-07, "logits/chosen": 0.041589152067899704, "logits/rejected": 1.3509619235992432, "logps/chosen": -390.40008544921875, "logps/rejected": -440.0779724121094, "loss": 0.7102, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2629300355911255, "rewards/margins": 0.8384572863578796, "rewards/rejected": -2.1013875007629395, "step": 420 }, { "epoch": 0.45, "learning_rate": 3.3555276610977276e-07, "logits/chosen": 0.06685711443424225, "logits/rejected": 1.2138116359710693, "logps/chosen": -372.45196533203125, "logps/rejected": -431.705078125, "loss": 0.7376, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1025346517562866, "rewards/margins": 0.8204299211502075, "rewards/rejected": -1.9229644536972046, "step": 430 }, { "epoch": 0.46, "learning_rate": 3.269063392575352e-07, "logits/chosen": 0.9198445081710815, "logits/rejected": 0.9934859275817871, "logps/chosen": -396.35968017578125, "logps/rejected": -476.847900390625, "loss": 0.6966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3736763000488281, "rewards/margins": 0.9061026573181152, "rewards/rejected": -2.2797789573669434, "step": 440 }, { "epoch": 0.47, "learning_rate": 3.1815705699316964e-07, "logits/chosen": 0.9360873103141785, "logits/rejected": 1.5897537469863892, "logps/chosen": -401.81207275390625, "logps/rejected": -485.36041259765625, "loss": 0.715, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3765710592269897, "rewards/margins": 1.040223479270935, "rewards/rejected": -2.416795015335083, "step": 450 }, { "epoch": 0.48, "learning_rate": 3.0931662070620794e-07, "logits/chosen": 0.9600452184677124, "logits/rejected": 2.169283866882324, "logps/chosen": -413.7256774902344, "logps/rejected": -508.83892822265625, "loss": 0.6982, "rewards/accuracies": 0.75, "rewards/chosen": -1.4869290590286255, "rewards/margins": 1.0339877605438232, "rewards/rejected": -2.5209171772003174, "step": 460 }, { "epoch": 0.49, "learning_rate": 3.003968536966078e-07, "logits/chosen": 1.2197582721710205, "logits/rejected": 2.1600019931793213, "logps/chosen": -441.291748046875, "logps/rejected": -515.36083984375, "loss": 0.6864, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4309029579162598, "rewards/margins": 1.1178550720214844, "rewards/rejected": -2.548758029937744, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.9140968536213693e-07, "logits/chosen": 1.2262110710144043, "logits/rejected": 2.440544843673706, "logps/chosen": -372.9617919921875, "logps/rejected": -461.28765869140625, "loss": 0.7197, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4022343158721924, "rewards/margins": 0.9221154451370239, "rewards/rejected": -2.324349880218506, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.823671352438608e-07, "logits/chosen": 1.2201405763626099, "logits/rejected": 2.058537721633911, "logps/chosen": -409.15814208984375, "logps/rejected": -467.56573486328125, "loss": 0.6978, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.334566354751587, "rewards/margins": 0.8473381996154785, "rewards/rejected": -2.1819043159484863, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.73281296951072e-07, "logits/chosen": 1.904309868812561, "logits/rejected": 2.613346576690674, "logps/chosen": -433.7508850097656, "logps/rejected": -524.71044921875, "loss": 0.7129, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.621690034866333, "rewards/margins": 1.130249261856079, "rewards/rejected": -2.751939058303833, "step": 500 }, { "epoch": 0.52, "eval_logits/chosen": 1.4090828895568848, "eval_logits/rejected": 2.334313154220581, "eval_logps/chosen": -430.316650390625, "eval_logps/rejected": -514.4609375, "eval_loss": 0.6979612112045288, "eval_rewards/accuracies": 0.7440476417541504, "eval_rewards/chosen": -1.4620689153671265, "eval_rewards/margins": 1.0647554397583008, "eval_rewards/rejected": -2.5268242359161377, "eval_runtime": 243.0213, "eval_samples_per_second": 8.23, "eval_steps_per_second": 0.259, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.641643219871597e-07, "logits/chosen": 1.5503180027008057, "logits/rejected": 2.431549072265625, "logps/chosen": -441.70721435546875, "logps/rejected": -510.1366271972656, "loss": 0.6483, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.403807282447815, "rewards/margins": 1.1817026138305664, "rewards/rejected": -2.585510015487671, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.550284034980507e-07, "logits/chosen": 1.6724971532821655, "logits/rejected": 2.7851836681365967, "logps/chosen": -426.255859375, "logps/rejected": -528.8610229492188, "loss": 0.701, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6710426807403564, "rewards/margins": 1.100687861442566, "rewards/rejected": -2.771730661392212, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.4588575996495794e-07, "logits/chosen": 1.8039824962615967, "logits/rejected": 2.613548755645752, "logps/chosen": -449.572265625, "logps/rejected": -537.4893798828125, "loss": 0.6894, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.654245376586914, "rewards/margins": 1.1508926153182983, "rewards/rejected": -2.805138111114502, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.367486188632446e-07, "logits/chosen": 1.2713382244110107, "logits/rejected": 2.1325278282165527, "logps/chosen": -456.769775390625, "logps/rejected": -589.4976806640625, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": -1.6260936260223389, "rewards/margins": 1.2662837505340576, "rewards/rejected": -2.8923773765563965, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.276292003092593e-07, "logits/chosen": 1.5338995456695557, "logits/rejected": 2.471559524536133, "logps/chosen": -422.6114807128906, "logps/rejected": -517.1149291992188, "loss": 0.7116, "rewards/accuracies": 0.75, "rewards/chosen": -1.5288379192352295, "rewards/margins": 1.1625785827636719, "rewards/rejected": -2.6914165019989014, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.185397007170141e-07, "logits/chosen": 1.3305785655975342, "logits/rejected": 1.9697010517120361, "logps/chosen": -409.57635498046875, "logps/rejected": -467.7806091308594, "loss": 0.7032, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4281054735183716, "rewards/margins": 0.915216326713562, "rewards/rejected": -2.3433218002319336, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.094922764865619e-07, "logits/chosen": 1.1964863538742065, "logits/rejected": 2.188833236694336, "logps/chosen": -427.9024353027344, "logps/rejected": -497.96917724609375, "loss": 0.7036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5176267623901367, "rewards/margins": 0.8729730844497681, "rewards/rejected": -2.3905997276306152, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.0049902774588797e-07, "logits/chosen": 1.2527071237564087, "logits/rejected": 2.2398314476013184, "logps/chosen": -436.9583435058594, "logps/rejected": -504.8207092285156, "loss": 0.6924, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6303842067718506, "rewards/margins": 1.0391124486923218, "rewards/rejected": -2.669496774673462, "step": 580 }, { "epoch": 0.62, "learning_rate": 1.9157198216806238e-07, "logits/chosen": 0.6701461672782898, "logits/rejected": 1.8555580377578735, "logps/chosen": -405.43902587890625, "logps/rejected": -496.9740295410156, "loss": 0.6922, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.323939561843872, "rewards/margins": 0.8526731729507446, "rewards/rejected": -2.176612615585327, "step": 590 }, { "epoch": 0.63, "learning_rate": 1.8272307888529274e-07, "logits/chosen": 0.462153822183609, "logits/rejected": 1.8794240951538086, "logps/chosen": -451.150634765625, "logps/rejected": -545.4595947265625, "loss": 0.6636, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2517478466033936, "rewards/margins": 1.0819759368896484, "rewards/rejected": -2.333723783493042, "step": 600 }, { "epoch": 0.63, "eval_logits/chosen": 0.7470372319221497, "eval_logits/rejected": 2.2081830501556396, "eval_logps/chosen": -417.3849792480469, "eval_logps/rejected": -513.6627197265625, "eval_loss": 0.6876600980758667, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.3327523469924927, "eval_rewards/margins": 1.1860896348953247, "eval_rewards/rejected": -2.5188419818878174, "eval_runtime": 244.0904, "eval_samples_per_second": 8.194, "eval_steps_per_second": 0.258, "step": 600 }, { "epoch": 0.64, "learning_rate": 1.7396415252139288e-07, "logits/chosen": 1.345348834991455, "logits/rejected": 2.903435707092285, "logps/chosen": -418.60797119140625, "logps/rejected": -482.93035888671875, "loss": 0.6686, "rewards/accuracies": 0.75, "rewards/chosen": -1.4358808994293213, "rewards/margins": 1.1490730047225952, "rewards/rejected": -2.584954261779785, "step": 610 }, { "epoch": 0.65, "learning_rate": 1.6530691736402316e-07, "logits/chosen": 2.0353918075561523, "logits/rejected": 3.255995512008667, "logps/chosen": -455.14959716796875, "logps/rejected": -525.127685546875, "loss": 0.6894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.831215500831604, "rewards/margins": 1.118740200996399, "rewards/rejected": -2.949955701828003, "step": 620 }, { "epoch": 0.66, "learning_rate": 1.5676295169786864e-07, "logits/chosen": 2.8849244117736816, "logits/rejected": 3.68190336227417, "logps/chosen": -441.6712951660156, "logps/rejected": -579.8939819335938, "loss": 0.6633, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.800830602645874, "rewards/margins": 1.4441092014312744, "rewards/rejected": -3.2449398040771484, "step": 630 }, { "epoch": 0.67, "learning_rate": 1.483436823197092e-07, "logits/chosen": 1.9812686443328857, "logits/rejected": 2.8578484058380127, "logps/chosen": -427.90460205078125, "logps/rejected": -523.0911865234375, "loss": 0.6953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.432652235031128, "rewards/margins": 1.2259643077850342, "rewards/rejected": -2.658616542816162, "step": 640 }, { "epoch": 0.68, "learning_rate": 1.4006036925609243e-07, "logits/chosen": 1.2402979135513306, "logits/rejected": 2.491854190826416, "logps/chosen": -458.84344482421875, "logps/rejected": -508.12847900390625, "loss": 0.6884, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5252853631973267, "rewards/margins": 0.9278051257133484, "rewards/rejected": -2.4530904293060303, "step": 650 }, { "epoch": 0.69, "learning_rate": 1.319240907040458e-07, "logits/chosen": 1.5803894996643066, "logits/rejected": 2.081526756286621, "logps/chosen": -454.36309814453125, "logps/rejected": -526.6981201171875, "loss": 0.6824, "rewards/accuracies": 0.75, "rewards/chosen": -1.5280418395996094, "rewards/margins": 1.0684349536895752, "rewards/rejected": -2.5964770317077637, "step": 660 }, { "epoch": 0.7, "learning_rate": 1.239457282149695e-07, "logits/chosen": 1.9639816284179688, "logits/rejected": 3.2135062217712402, "logps/chosen": -435.85888671875, "logps/rejected": -534.4019775390625, "loss": 0.6736, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5270483493804932, "rewards/margins": 1.1297345161437988, "rewards/rejected": -2.656782627105713, "step": 670 }, { "epoch": 0.71, "learning_rate": 1.1613595214152711e-07, "logits/chosen": 2.091609477996826, "logits/rejected": 2.95405912399292, "logps/chosen": -404.2757873535156, "logps/rejected": -456.81732177734375, "loss": 0.7108, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4280402660369873, "rewards/margins": 0.7583026885986328, "rewards/rejected": -2.186342716217041, "step": 680 }, { "epoch": 0.72, "learning_rate": 1.0850520736699362e-07, "logits/chosen": 1.712774634361267, "logits/rejected": 3.230499267578125, "logps/chosen": -395.8936767578125, "logps/rejected": -499.32965087890625, "loss": 0.655, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3443926572799683, "rewards/margins": 1.2300078868865967, "rewards/rejected": -2.5744004249572754, "step": 690 }, { "epoch": 0.73, "learning_rate": 1.0106369933615042e-07, "logits/chosen": 2.6943306922912598, "logits/rejected": 4.300943851470947, "logps/chosen": -449.5069274902344, "logps/rejected": -574.8884887695312, "loss": 0.6217, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9530900716781616, "rewards/margins": 1.3479530811309814, "rewards/rejected": -3.3010432720184326, "step": 700 }, { "epoch": 0.73, "eval_logits/chosen": 2.593170166015625, "eval_logits/rejected": 3.816297769546509, "eval_logps/chosen": -473.18865966796875, "eval_logps/rejected": -579.6353759765625, "eval_loss": 0.6762357354164124, "eval_rewards/accuracies": 0.7698412537574768, "eval_rewards/chosen": -1.8907891511917114, "eval_rewards/margins": 1.2877792119979858, "eval_rewards/rejected": -3.1785686016082764, "eval_runtime": 244.2095, "eval_samples_per_second": 8.19, "eval_steps_per_second": 0.258, "step": 700 }, { "epoch": 0.74, "learning_rate": 9.382138040640714e-08, "logits/chosen": 2.803864002227783, "logits/rejected": 3.797267198562622, "logps/chosen": -505.75677490234375, "logps/rejected": -552.4486083984375, "loss": 0.6429, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0083601474761963, "rewards/margins": 1.1107932329177856, "rewards/rejected": -3.1191532611846924, "step": 710 }, { "epoch": 0.75, "learning_rate": 8.678793653740632e-08, "logits/chosen": 3.4752883911132812, "logits/rejected": 4.102308750152588, "logps/chosen": -513.325439453125, "logps/rejected": -616.204345703125, "loss": 0.6616, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2190704345703125, "rewards/margins": 1.2776607275009155, "rewards/rejected": -3.4967312812805176, "step": 720 }, { "epoch": 0.76, "learning_rate": 7.997277433690983e-08, "logits/chosen": 3.118082046508789, "logits/rejected": 4.068647861480713, "logps/chosen": -515.4818725585938, "logps/rejected": -578.458984375, "loss": 0.6692, "rewards/accuracies": 0.78125, "rewards/chosen": -2.115870475769043, "rewards/margins": 1.1491236686706543, "rewards/rejected": -3.2649941444396973, "step": 730 }, { "epoch": 0.77, "learning_rate": 7.338500848029602e-08, "logits/chosen": 3.267651319503784, "logits/rejected": 4.356374263763428, "logps/chosen": -517.941650390625, "logps/rejected": -612.4710693359375, "loss": 0.6626, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.0364277362823486, "rewards/margins": 1.4974032640457153, "rewards/rejected": -3.5338311195373535, "step": 740 }, { "epoch": 0.78, "learning_rate": 6.70334495204884e-08, "logits/chosen": 2.9188966751098633, "logits/rejected": 3.9505248069763184, "logps/chosen": -495.83734130859375, "logps/rejected": -612.5488891601562, "loss": 0.6476, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.124872922897339, "rewards/margins": 1.2153751850128174, "rewards/rejected": -3.3402485847473145, "step": 750 }, { "epoch": 0.8, "learning_rate": 6.092659210462231e-08, "logits/chosen": 3.145782709121704, "logits/rejected": 3.7394192218780518, "logps/chosen": -498.886474609375, "logps/rejected": -587.1813354492188, "loss": 0.6186, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2077949047088623, "rewards/margins": 1.1572462320327759, "rewards/rejected": -3.3650412559509277, "step": 760 }, { "epoch": 0.81, "learning_rate": 5.507260361320737e-08, "logits/chosen": 3.364577531814575, "logits/rejected": 4.243520259857178, "logps/chosen": -541.8561401367188, "logps/rejected": -659.7272338867188, "loss": 0.6917, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.4785799980163574, "rewards/margins": 1.123822569847107, "rewards/rejected": -3.602402448654175, "step": 770 }, { "epoch": 0.82, "learning_rate": 4.947931323697982e-08, "logits/chosen": 3.392416477203369, "logits/rejected": 4.268471717834473, "logps/chosen": -480.16455078125, "logps/rejected": -572.0571899414062, "loss": 0.6843, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2396440505981445, "rewards/margins": 1.116714596748352, "rewards/rejected": -3.356358766555786, "step": 780 }, { "epoch": 0.83, "learning_rate": 4.415420150605398e-08, "logits/chosen": 2.9198615550994873, "logits/rejected": 3.857909679412842, "logps/chosen": -536.2203369140625, "logps/rejected": -663.2730102539062, "loss": 0.6706, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.358766794204712, "rewards/margins": 1.428501844406128, "rewards/rejected": -3.787268877029419, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.9104390285376374e-08, "logits/chosen": 2.381801128387451, "logits/rejected": 4.157925605773926, "logps/chosen": -550.6214599609375, "logps/rejected": -629.8056030273438, "loss": 0.6418, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0941646099090576, "rewards/margins": 1.3854446411132812, "rewards/rejected": -3.4796090126037598, "step": 800 }, { "epoch": 0.84, "eval_logits/chosen": 2.6092491149902344, "eval_logits/rejected": 3.865464448928833, "eval_logps/chosen": -494.04217529296875, "eval_logps/rejected": -602.0606689453125, "eval_loss": 0.6711603403091431, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": -2.0993239879608154, "eval_rewards/margins": 1.3034968376159668, "eval_rewards/rejected": -3.4028208255767822, "eval_runtime": 243.1214, "eval_samples_per_second": 8.226, "eval_steps_per_second": 0.259, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.433663324986208e-08, "logits/chosen": 3.123400926589966, "logits/rejected": 4.341358184814453, "logps/chosen": -507.13848876953125, "logps/rejected": -566.4569091796875, "loss": 0.6669, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2766828536987305, "rewards/margins": 0.9975360035896301, "rewards/rejected": -3.274219036102295, "step": 810 }, { "epoch": 0.86, "learning_rate": 2.9857306851953897e-08, "logits/chosen": 3.36864972114563, "logits/rejected": 3.826308488845825, "logps/chosen": -454.4519958496094, "logps/rejected": -553.3275756835938, "loss": 0.7048, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0178139209747314, "rewards/margins": 1.1636488437652588, "rewards/rejected": -3.1814627647399902, "step": 820 }, { "epoch": 0.87, "learning_rate": 2.567240179368185e-08, "logits/chosen": 2.7587532997131348, "logits/rejected": 3.9622387886047363, "logps/chosen": -461.407470703125, "logps/rejected": -577.1260986328125, "loss": 0.6538, "rewards/accuracies": 0.78125, "rewards/chosen": -2.125457286834717, "rewards/margins": 1.223615050315857, "rewards/rejected": -3.3490726947784424, "step": 830 }, { "epoch": 0.88, "learning_rate": 2.1787515014630357e-08, "logits/chosen": 3.194129228591919, "logits/rejected": 3.398770809173584, "logps/chosen": -526.1636962890625, "logps/rejected": -602.7716064453125, "loss": 0.7025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1302103996276855, "rewards/margins": 1.0573285818099976, "rewards/rejected": -3.1875391006469727, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.820784220652766e-08, "logits/chosen": 2.8333277702331543, "logits/rejected": 3.949988842010498, "logps/chosen": -497.5380859375, "logps/rejected": -547.4320068359375, "loss": 0.6689, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9726041555404663, "rewards/margins": 1.0516941547393799, "rewards/rejected": -3.0242981910705566, "step": 850 }, { "epoch": 0.9, "learning_rate": 1.4938170864468636e-08, "logits/chosen": 2.8082051277160645, "logits/rejected": 4.054238796234131, "logps/chosen": -488.4153747558594, "logps/rejected": -592.6768798828125, "loss": 0.6703, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0998573303222656, "rewards/margins": 1.3178421258926392, "rewards/rejected": -3.4176993370056152, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.1982873884064465e-08, "logits/chosen": 2.500764846801758, "logits/rejected": 3.581740140914917, "logps/chosen": -425.3038024902344, "logps/rejected": -571.4251708984375, "loss": 0.6579, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7831649780273438, "rewards/margins": 1.468294382095337, "rewards/rejected": -3.2514591217041016, "step": 870 }, { "epoch": 0.92, "learning_rate": 9.345903713082304e-09, "logits/chosen": 2.309072971343994, "logits/rejected": 3.692427158355713, "logps/chosen": -488.32049560546875, "logps/rejected": -586.3157958984375, "loss": 0.6445, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1420960426330566, "rewards/margins": 1.187731146812439, "rewards/rejected": -3.329827070236206, "step": 880 }, { "epoch": 0.93, "learning_rate": 7.030787065396865e-09, "logits/chosen": 2.455681800842285, "logits/rejected": 3.950096607208252, "logps/chosen": -487.016357421875, "logps/rejected": -593.2111206054688, "loss": 0.701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2053864002227783, "rewards/margins": 1.07200026512146, "rewards/rejected": -3.277386426925659, "step": 890 }, { "epoch": 0.94, "learning_rate": 5.04062020432286e-09, "logits/chosen": 2.607274293899536, "logits/rejected": 3.9918441772460938, "logps/chosen": -505.3465881347656, "logps/rejected": -603.4442138671875, "loss": 0.6678, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0696864128112793, "rewards/margins": 1.1221836805343628, "rewards/rejected": -3.1918704509735107, "step": 900 }, { "epoch": 0.94, "eval_logits/chosen": 2.4517972469329834, "eval_logits/rejected": 3.733224630355835, "eval_logps/chosen": -487.1844482421875, "eval_logps/rejected": -594.1102905273438, "eval_loss": 0.6716480851173401, "eval_rewards/accuracies": 0.7638888955116272, "eval_rewards/chosen": -2.030747175216675, "eval_rewards/margins": 1.2925708293914795, "eval_rewards/rejected": -3.323317766189575, "eval_runtime": 244.2155, "eval_samples_per_second": 8.189, "eval_steps_per_second": 0.258, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.3780648016376866e-09, "logits/chosen": 2.913886308670044, "logits/rejected": 4.131613731384277, "logps/chosen": -459.2386779785156, "logps/rejected": -578.1640014648438, "loss": 0.6622, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1169772148132324, "rewards/margins": 1.2496932744979858, "rewards/rejected": -3.3666698932647705, "step": 910 }, { "epoch": 0.96, "learning_rate": 2.0453443778310766e-09, "logits/chosen": 2.6672825813293457, "logits/rejected": 3.8567919731140137, "logps/chosen": -503.61395263671875, "logps/rejected": -597.9525756835938, "loss": 0.6413, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.079343557357788, "rewards/margins": 1.3192239999771118, "rewards/rejected": -3.3985676765441895, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.0442413283435758e-09, "logits/chosen": 2.472501754760742, "logits/rejected": 3.5483956336975098, "logps/chosen": -481.59515380859375, "logps/rejected": -590.560791015625, "loss": 0.6391, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0882575511932373, "rewards/margins": 1.377394676208496, "rewards/rejected": -3.4656529426574707, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.760945397705828e-10, "logits/chosen": 2.6073191165924072, "logits/rejected": 3.743886947631836, "logps/chosen": -536.0969848632812, "logps/rejected": -624.2562255859375, "loss": 0.6442, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.063697338104248, "rewards/margins": 1.2653493881225586, "rewards/rejected": -3.3290467262268066, "step": 940 }, { "epoch": 0.99, "learning_rate": 4.17975992204056e-11, "logits/chosen": 2.321948766708374, "logits/rejected": 3.977466583251953, "logps/chosen": -496.20025634765625, "logps/rejected": -609.7675170898438, "loss": 0.6933, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.116821527481079, "rewards/margins": 1.348793387413025, "rewards/rejected": -3.4656143188476562, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.7263204834224042, "train_runtime": 20734.7169, "train_samples_per_second": 2.948, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000000, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }