{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 62, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016260162601626018, "grad_norm": 19.880552291870117, "learning_rate": 2e-05, "logits/chosen": 0.20684528350830078, "logits/rejected": 0.4346590042114258, "logps/chosen": -777.121826171875, "logps/rejected": -997.1637573242188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032520325203252036, "grad_norm": 20.27885627746582, "learning_rate": 4e-05, "logits/chosen": 0.12451896071434021, "logits/rejected": 0.3398062586784363, "logps/chosen": -841.6675415039062, "logps/rejected": -988.1629638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.04878048780487805, "grad_norm": 390.8882141113281, "learning_rate": 6e-05, "logits/chosen": 0.14335429668426514, "logits/rejected": 0.32437634468078613, "logps/chosen": -876.8231811523438, "logps/rejected": -1356.0509033203125, "loss": 0.6706, "rewards/accuracies": 0.25, "rewards/chosen": -0.12680970132350922, "rewards/margins": -0.06611938774585724, "rewards/rejected": -0.06069030612707138, "step": 3 }, { "epoch": 0.06504065040650407, "grad_norm": 21.47028923034668, "learning_rate": 8e-05, "logits/chosen": 0.7833376526832581, "logits/rejected": 1.1811182498931885, "logps/chosen": -1178.9454345703125, "logps/rejected": -974.9606323242188, "loss": 0.6883, "rewards/accuracies": 0.25, "rewards/chosen": -0.11406403034925461, "rewards/margins": -0.005326844751834869, "rewards/rejected": -0.10873718559741974, "step": 4 }, { "epoch": 0.08130081300813008, "grad_norm": 40.24486541748047, "learning_rate": 0.0001, "logits/chosen": -0.44922593235969543, "logits/rejected": -0.6411373019218445, "logps/chosen": -559.5548706054688, "logps/rejected": -1254.8680419921875, "loss": 0.4832, "rewards/accuracies": 1.0, "rewards/chosen": -0.34520798921585083, "rewards/margins": 0.4895774722099304, "rewards/rejected": -0.834785521030426, "step": 5 }, { "epoch": 0.0975609756097561, "grad_norm": 16.58538818359375, "learning_rate": 0.00012, "logits/chosen": 0.9809624552726746, "logits/rejected": 1.187626838684082, "logps/chosen": -757.462158203125, "logps/rejected": -1020.3145141601562, "loss": 0.4292, "rewards/accuracies": 1.0, "rewards/chosen": -0.2485191375017166, "rewards/margins": 0.7915412783622742, "rewards/rejected": -1.0400605201721191, "step": 6 }, { "epoch": 0.11382113821138211, "grad_norm": 18.358051300048828, "learning_rate": 0.00014, "logits/chosen": 1.6894466876983643, "logits/rejected": 1.6828027963638306, "logps/chosen": -1125.97412109375, "logps/rejected": -877.0285034179688, "loss": 0.3812, "rewards/accuracies": 0.75, "rewards/chosen": -0.9222716689109802, "rewards/margins": 0.32721251249313354, "rewards/rejected": -1.2494843006134033, "step": 7 }, { "epoch": 0.13008130081300814, "grad_norm": 163.26919555664062, "learning_rate": 0.00016, "logits/chosen": -0.45762500166893005, "logits/rejected": -0.5206366777420044, "logps/chosen": -705.5869750976562, "logps/rejected": -1347.400390625, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": -3.067340850830078, "rewards/margins": 3.900920867919922, "rewards/rejected": -6.968262195587158, "step": 8 }, { "epoch": 0.14634146341463414, "grad_norm": 5.863889217376709, "learning_rate": 0.00018, "logits/chosen": 0.2462751269340515, "logits/rejected": 0.21955497562885284, "logps/chosen": -619.6600341796875, "logps/rejected": -1208.003662109375, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": -2.7182769775390625, "rewards/margins": 8.603934288024902, "rewards/rejected": -11.322211265563965, "step": 9 }, { "epoch": 0.16260162601626016, "grad_norm": 0.6885181665420532, "learning_rate": 0.0002, "logits/chosen": 1.1071248054504395, "logits/rejected": 1.1347391605377197, "logps/chosen": -877.805419921875, "logps/rejected": -1244.745849609375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -5.3332839012146, "rewards/margins": 10.358970642089844, "rewards/rejected": -15.692255020141602, "step": 10 }, { "epoch": 0.17886178861788618, "grad_norm": 2.558082103729248, "learning_rate": 0.00019996135574945544, "logits/chosen": 0.24951541423797607, "logits/rejected": 0.2528836727142334, "logps/chosen": -740.1439208984375, "logps/rejected": -1265.59814453125, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.989352226257324, "rewards/margins": 19.463153839111328, "rewards/rejected": -27.45250701904297, "step": 11 }, { "epoch": 0.1951219512195122, "grad_norm": 0.0005222362815402448, "learning_rate": 0.0001998454528653836, "logits/chosen": 0.6122381687164307, "logits/rejected": 0.8588502407073975, "logps/chosen": -879.779296875, "logps/rejected": -1585.720947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.228717803955078, "rewards/margins": 32.099365234375, "rewards/rejected": -50.32808303833008, "step": 12 }, { "epoch": 0.21138211382113822, "grad_norm": 3.927712168660946e-05, "learning_rate": 0.00019965238092738643, "logits/chosen": 1.1087465286254883, "logits/rejected": 1.5179497003555298, "logps/chosen": -1257.50830078125, "logps/rejected": -1163.919677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.7935791015625, "rewards/margins": 20.931385040283203, "rewards/rejected": -36.72496032714844, "step": 13 }, { "epoch": 0.22764227642276422, "grad_norm": 0.21046003699302673, "learning_rate": 0.0001993822891578708, "logits/chosen": 0.23910227417945862, "logits/rejected": 0.31048309803009033, "logps/chosen": -1491.3905029296875, "logps/rejected": -2108.9990234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -56.71916198730469, "rewards/margins": 42.71849822998047, "rewards/rejected": -99.43765258789062, "step": 14 }, { "epoch": 0.24390243902439024, "grad_norm": 591.9841918945312, "learning_rate": 0.0001990353863067169, "logits/chosen": 0.5623903870582581, "logits/rejected": 0.6063950061798096, "logps/chosen": -1970.40576171875, "logps/rejected": -2018.9765625, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": -86.55944061279297, "rewards/margins": 29.65001106262207, "rewards/rejected": -116.2094497680664, "step": 15 }, { "epoch": 0.2601626016260163, "grad_norm": 90.19036865234375, "learning_rate": 0.00019861194048993863, "logits/chosen": 0.6143627166748047, "logits/rejected": 0.7420700788497925, "logps/chosen": -1821.3201904296875, "logps/rejected": -1930.827880859375, "loss": 1.0906, "rewards/accuracies": 0.75, "rewards/chosen": -76.42454528808594, "rewards/margins": 28.595970153808594, "rewards/rejected": -105.02052307128906, "step": 16 }, { "epoch": 0.2764227642276423, "grad_norm": 0.0009420510032214224, "learning_rate": 0.0001981122789824607, "logits/chosen": 0.20949414372444153, "logits/rejected": 0.1935410499572754, "logps/chosen": -1610.02783203125, "logps/rejected": -2431.318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -65.77059936523438, "rewards/margins": 73.17414855957031, "rewards/rejected": -138.94476318359375, "step": 17 }, { "epoch": 0.2926829268292683, "grad_norm": 132.33953857421875, "learning_rate": 0.00019753678796517282, "logits/chosen": 0.728495717048645, "logits/rejected": 1.0449868440628052, "logps/chosen": -1515.9527587890625, "logps/rejected": -1517.2254638671875, "loss": 2.6435, "rewards/accuracies": 0.5, "rewards/chosen": -61.27394104003906, "rewards/margins": 20.481342315673828, "rewards/rejected": -81.75528717041016, "step": 18 }, { "epoch": 0.3089430894308943, "grad_norm": 0.00032979066600091755, "learning_rate": 0.00019688591222645607, "logits/chosen": 0.8106945753097534, "logits/rejected": 0.6099438071250916, "logps/chosen": -1138.11767578125, "logps/rejected": -1558.903076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -46.01788330078125, "rewards/margins": 41.312171936035156, "rewards/rejected": -87.33006286621094, "step": 19 }, { "epoch": 0.3252032520325203, "grad_norm": 0.22872093319892883, "learning_rate": 0.0001961601548184129, "logits/chosen": -0.05689544230699539, "logits/rejected": 0.0633389949798584, "logps/chosen": -1466.4468994140625, "logps/rejected": -2267.798828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -76.84449005126953, "rewards/margins": 48.28419494628906, "rewards/rejected": -125.12869262695312, "step": 20 }, { "epoch": 0.34146341463414637, "grad_norm": 1.10204017162323, "learning_rate": 0.00019536007666806556, "logits/chosen": 0.5605583786964417, "logits/rejected": 0.45388907194137573, "logps/chosen": -1369.92529296875, "logps/rejected": -1706.2607421875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -33.74466323852539, "rewards/margins": 45.32139587402344, "rewards/rejected": -79.06605529785156, "step": 21 }, { "epoch": 0.35772357723577236, "grad_norm": 0.7084241509437561, "learning_rate": 0.0001944862961438239, "logits/chosen": 0.7291379570960999, "logits/rejected": 0.9067746996879578, "logps/chosen": -998.4527587890625, "logps/rejected": -1456.096923828125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -19.574996948242188, "rewards/margins": 45.93708038330078, "rewards/rejected": -65.51207733154297, "step": 22 }, { "epoch": 0.37398373983739835, "grad_norm": 3.134854793548584, "learning_rate": 0.00019353948857755803, "logits/chosen": 0.9795281887054443, "logits/rejected": 0.8698853850364685, "logps/chosen": -1127.320068359375, "logps/rejected": -1399.870849609375, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -28.826623916625977, "rewards/margins": 29.93848419189453, "rewards/rejected": -58.765106201171875, "step": 23 }, { "epoch": 0.3902439024390244, "grad_norm": 2.085594654083252, "learning_rate": 0.00019252038574264405, "logits/chosen": 0.17023050785064697, "logits/rejected": -0.1173945814371109, "logps/chosen": -1615.32568359375, "logps/rejected": -2291.47509765625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -82.27009582519531, "rewards/margins": 44.62742614746094, "rewards/rejected": -126.89752197265625, "step": 24 }, { "epoch": 0.4065040650406504, "grad_norm": 7.152135367505252e-05, "learning_rate": 0.00019142977528838762, "logits/chosen": 0.6659821271896362, "logits/rejected": 0.6975608468055725, "logps/chosen": -1023.6649169921875, "logps/rejected": -1710.140380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.36669921875, "rewards/margins": 49.14038848876953, "rewards/rejected": -82.50708770751953, "step": 25 }, { "epoch": 0.42276422764227645, "grad_norm": 2.22769040192361e-06, "learning_rate": 0.00019026850013126157, "logits/chosen": -0.624580442905426, "logits/rejected": -0.42581236362457275, "logps/chosen": -1117.0599365234375, "logps/rejected": -2134.2626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -57.8393669128418, "rewards/margins": 44.58246994018555, "rewards/rejected": -102.42182922363281, "step": 26 }, { "epoch": 0.43902439024390244, "grad_norm": 0.7476986050605774, "learning_rate": 0.00018903745780342839, "logits/chosen": 0.17943906784057617, "logits/rejected": 0.21112221479415894, "logps/chosen": -1208.960205078125, "logps/rejected": -1999.635009765625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -55.38972473144531, "rewards/margins": 40.17228317260742, "rewards/rejected": -95.56201171875, "step": 27 }, { "epoch": 0.45528455284552843, "grad_norm": 0.6162808537483215, "learning_rate": 0.00018773759975905098, "logits/chosen": 0.15270072221755981, "logits/rejected": 0.32134106755256653, "logps/chosen": -1206.7701416015625, "logps/rejected": -2007.0269775390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -38.11735916137695, "rewards/margins": 50.446754455566406, "rewards/rejected": -88.5641098022461, "step": 28 }, { "epoch": 0.4715447154471545, "grad_norm": 8.754213354222884e-07, "learning_rate": 0.0001863699306389282, "logits/chosen": 0.8678311109542847, "logits/rejected": 0.8028951287269592, "logps/chosen": -1161.56591796875, "logps/rejected": -1967.0069580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.882237434387207, "rewards/margins": 65.84603881835938, "rewards/rejected": -81.72827911376953, "step": 29 }, { "epoch": 0.4878048780487805, "grad_norm": 0.0023462281096726656, "learning_rate": 0.00018493550749402278, "logits/chosen": 1.54906165599823, "logits/rejected": 1.6790410280227661, "logps/chosen": -951.4666748046875, "logps/rejected": -1339.60107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.993054389953613, "rewards/margins": 40.59773635864258, "rewards/rejected": -47.590789794921875, "step": 30 }, { "epoch": 0.5040650406504065, "grad_norm": 0.00014203626778908074, "learning_rate": 0.00018343543896848273, "logits/chosen": 1.832588791847229, "logits/rejected": 1.6241607666015625, "logps/chosen": -1032.7232666015625, "logps/rejected": -1197.1595458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.2398042678833, "rewards/margins": 28.274524688720703, "rewards/rejected": -42.51432800292969, "step": 31 }, { "epoch": 0.5203252032520326, "grad_norm": 2.814833402633667, "learning_rate": 0.00018187088444278674, "logits/chosen": 2.1444239616394043, "logits/rejected": 1.8101916313171387, "logps/chosen": -874.6080322265625, "logps/rejected": -1012.015625, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -13.471307754516602, "rewards/margins": 20.194053649902344, "rewards/rejected": -33.66536331176758, "step": 32 }, { "epoch": 0.5365853658536586, "grad_norm": 0.06849005818367004, "learning_rate": 0.00018024305313767646, "logits/chosen": 1.9995535612106323, "logits/rejected": 1.8331811428070068, "logps/chosen": -1230.6785888671875, "logps/rejected": -1346.717041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.62438678741455, "rewards/margins": 31.655826568603516, "rewards/rejected": -42.280216217041016, "step": 33 }, { "epoch": 0.5528455284552846, "grad_norm": 0.01905296929180622, "learning_rate": 0.00017855320317956784, "logits/chosen": 1.1833341121673584, "logits/rejected": 1.240072250366211, "logps/chosen": -841.6439208984375, "logps/rejected": -1193.967041015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.020572662353516, "rewards/margins": 28.115928649902344, "rewards/rejected": -43.136505126953125, "step": 34 }, { "epoch": 0.5691056910569106, "grad_norm": 1.866630009317305e-05, "learning_rate": 0.0001768026406281642, "logits/chosen": 1.0859436988830566, "logits/rejected": 1.226615309715271, "logps/chosen": -1046.376708984375, "logps/rejected": -1418.09228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.104580879211426, "rewards/margins": 34.29302978515625, "rewards/rejected": -47.397613525390625, "step": 35 }, { "epoch": 0.5853658536585366, "grad_norm": 0.0032898751087486744, "learning_rate": 0.00017499271846702213, "logits/chosen": -0.23074638843536377, "logits/rejected": -0.09211879968643188, "logps/chosen": -1246.923095703125, "logps/rejected": -2060.51123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -44.84193801879883, "rewards/margins": 45.95753479003906, "rewards/rejected": -90.79946899414062, "step": 36 }, { "epoch": 0.6016260162601627, "grad_norm": 0.008372440002858639, "learning_rate": 0.00017312483555785086, "logits/chosen": 0.5074482560157776, "logits/rejected": 0.48830437660217285, "logps/chosen": -920.7339477539062, "logps/rejected": -1666.024658203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -18.29103660583496, "rewards/margins": 32.98884582519531, "rewards/rejected": -51.27988052368164, "step": 37 }, { "epoch": 0.6178861788617886, "grad_norm": 0.0008834120817482471, "learning_rate": 0.00017120043555935298, "logits/chosen": 1.3600270748138428, "logits/rejected": 1.2087562084197998, "logps/chosen": -1251.687744140625, "logps/rejected": -1775.605224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.525299072265625, "rewards/margins": 45.839603424072266, "rewards/rejected": -65.36489868164062, "step": 38 }, { "epoch": 0.6341463414634146, "grad_norm": 9.272828901885077e-05, "learning_rate": 0.00016922100581144228, "logits/chosen": 1.4009983539581299, "logits/rejected": 1.2046518325805664, "logps/chosen": -1155.6650390625, "logps/rejected": -1281.83740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.521747589111328, "rewards/margins": 24.7418155670166, "rewards/rejected": -41.2635612487793, "step": 39 }, { "epoch": 0.6504065040650406, "grad_norm": 0.0009182749781757593, "learning_rate": 0.00016718807618570106, "logits/chosen": 1.3781325817108154, "logits/rejected": 1.565840244293213, "logps/chosen": -1133.72216796875, "logps/rejected": -1346.7265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.05687427520752, "rewards/margins": 18.654136657714844, "rewards/rejected": -27.711009979248047, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 0.004382506478577852, "learning_rate": 0.00016510321790296525, "logits/chosen": 1.1266183853149414, "logits/rejected": 1.2493317127227783, "logps/chosen": -926.239501953125, "logps/rejected": -1293.30322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.177988052368164, "rewards/margins": 22.40888786315918, "rewards/rejected": -33.586875915527344, "step": 41 }, { "epoch": 0.6829268292682927, "grad_norm": 0.15565475821495056, "learning_rate": 0.00016296804231895142, "logits/chosen": 1.099910020828247, "logits/rejected": 0.820236086845398, "logps/chosen": -626.5668334960938, "logps/rejected": -1386.260498046875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -10.778373718261719, "rewards/margins": 27.383846282958984, "rewards/rejected": -38.16221618652344, "step": 42 }, { "epoch": 0.6991869918699187, "grad_norm": 3.971878322772682e-05, "learning_rate": 0.00016078419967886402, "logits/chosen": 1.4016125202178955, "logits/rejected": 1.5134223699569702, "logps/chosen": -1066.9713134765625, "logps/rejected": -1517.39208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.4629487991333, "rewards/margins": 27.75263214111328, "rewards/rejected": -39.215576171875, "step": 43 }, { "epoch": 0.7154471544715447, "grad_norm": 0.004684010986238718, "learning_rate": 0.00015855337784194577, "logits/chosen": 1.989326000213623, "logits/rejected": 2.3816940784454346, "logps/chosen": -956.5921630859375, "logps/rejected": -1014.5316162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.150079727172852, "rewards/margins": 12.83597183227539, "rewards/rejected": -18.986051559448242, "step": 44 }, { "epoch": 0.7317073170731707, "grad_norm": 0.03292777016758919, "learning_rate": 0.00015627730097695638, "logits/chosen": 2.072270631790161, "logits/rejected": 2.0922999382019043, "logps/chosen": -1218.990478515625, "logps/rejected": -1251.8980712890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.599820137023926, "rewards/margins": 19.980201721191406, "rewards/rejected": -27.580020904541016, "step": 45 }, { "epoch": 0.7479674796747967, "grad_norm": 0.06399545818567276, "learning_rate": 0.00015395772822958845, "logits/chosen": 1.245821475982666, "logits/rejected": 1.3717162609100342, "logps/chosen": -960.6263427734375, "logps/rejected": -1502.2239990234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -8.884254455566406, "rewards/margins": 28.055803298950195, "rewards/rejected": -36.94005584716797, "step": 46 }, { "epoch": 0.7642276422764228, "grad_norm": 0.022615160793066025, "learning_rate": 0.0001515964523628501, "logits/chosen": 1.4772993326187134, "logits/rejected": 1.3233076333999634, "logps/chosen": -900.41552734375, "logps/rejected": -1422.0224609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.169479370117188, "rewards/margins": 29.0593204498291, "rewards/rejected": -37.228797912597656, "step": 47 }, { "epoch": 0.7804878048780488, "grad_norm": 0.7834580540657043, "learning_rate": 0.00014919529837146528, "logits/chosen": 2.019958019256592, "logits/rejected": 2.0058090686798096, "logps/chosen": -908.94970703125, "logps/rejected": -1153.9830322265625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -10.564983367919922, "rewards/margins": 15.311219215393066, "rewards/rejected": -25.87619972229004, "step": 48 }, { "epoch": 0.7967479674796748, "grad_norm": 0.0006066004862077534, "learning_rate": 0.0001467561220713628, "logits/chosen": 1.297697901725769, "logits/rejected": 1.5303912162780762, "logps/chosen": -1167.181640625, "logps/rejected": -1485.501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.699865341186523, "rewards/margins": 47.49958801269531, "rewards/rejected": -59.19945526123047, "step": 49 }, { "epoch": 0.8130081300813008, "grad_norm": 0.03268749639391899, "learning_rate": 0.00014428080866534396, "logits/chosen": 0.707965612411499, "logits/rejected": 0.7305536866188049, "logps/chosen": -1051.2691650390625, "logps/rejected": -1463.647705078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.360027313232422, "rewards/margins": 24.690279006958008, "rewards/rejected": -39.05030822753906, "step": 50 }, { "epoch": 0.8292682926829268, "grad_norm": 0.06594517827033997, "learning_rate": 0.00014177127128603745, "logits/chosen": 1.219120740890503, "logits/rejected": 1.2810195684432983, "logps/chosen": -1020.8298950195312, "logps/rejected": -1290.2015380859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.565038681030273, "rewards/margins": 20.74908447265625, "rewards/rejected": -33.314125061035156, "step": 51 }, { "epoch": 0.8455284552845529, "grad_norm": 0.008960689418017864, "learning_rate": 0.0001392294495172681, "logits/chosen": 0.49424344301223755, "logits/rejected": 0.4817698895931244, "logps/chosen": -988.3806762695312, "logps/rejected": -1388.4130859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -14.987248420715332, "rewards/margins": 38.28583908081055, "rewards/rejected": -53.27308654785156, "step": 52 }, { "epoch": 0.8617886178861789, "grad_norm": 4.988933142158203e-07, "learning_rate": 0.0001366573078949813, "logits/chosen": -0.09240919351577759, "logits/rejected": -0.1942935436964035, "logps/chosen": -863.5594482421875, "logps/rejected": -1951.684814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.636280059814453, "rewards/margins": 39.47431182861328, "rewards/rejected": -61.110591888427734, "step": 53 }, { "epoch": 0.8780487804878049, "grad_norm": 0.36996814608573914, "learning_rate": 0.00013405683438888282, "logits/chosen": 1.8010693788528442, "logits/rejected": 1.9799494743347168, "logps/chosen": -1090.9835205078125, "logps/rejected": -1244.3988037109375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -10.118224143981934, "rewards/margins": 23.42540740966797, "rewards/rejected": -33.54362869262695, "step": 54 }, { "epoch": 0.8943089430894309, "grad_norm": 0.0004369132802821696, "learning_rate": 0.00013143003886596669, "logits/chosen": 1.255205750465393, "logits/rejected": 1.1578245162963867, "logps/chosen": -1015.79541015625, "logps/rejected": -1361.6103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.066598892211914, "rewards/margins": 27.31325340270996, "rewards/rejected": -45.379852294921875, "step": 55 }, { "epoch": 0.9105691056910569, "grad_norm": 3.5815644423564663e-06, "learning_rate": 0.00012877895153711935, "logits/chosen": 0.5448588132858276, "logits/rejected": 0.6314257383346558, "logps/chosen": -1082.805908203125, "logps/rejected": -1538.261962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.810945510864258, "rewards/margins": 29.520732879638672, "rewards/rejected": -53.3316764831543, "step": 56 }, { "epoch": 0.926829268292683, "grad_norm": 58.86332702636719, "learning_rate": 0.00012610562138799978, "logits/chosen": 1.9793856143951416, "logits/rejected": 2.0082552433013916, "logps/chosen": -1352.8492431640625, "logps/rejected": -1265.2257080078125, "loss": 0.3774, "rewards/accuracies": 0.75, "rewards/chosen": -20.378952026367188, "rewards/margins": 17.73773193359375, "rewards/rejected": -38.1166877746582, "step": 57 }, { "epoch": 0.943089430894309, "grad_norm": 5.57162458392213e-08, "learning_rate": 0.0001234121145954094, "logits/chosen": 0.7738958597183228, "logits/rejected": 0.6971035599708557, "logps/chosen": -927.3837280273438, "logps/rejected": -1710.65771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.810049057006836, "rewards/margins": 38.65287780761719, "rewards/rejected": -56.462928771972656, "step": 58 }, { "epoch": 0.959349593495935, "grad_norm": 0.10466321557760239, "learning_rate": 0.00012070051293037492, "logits/chosen": 1.3470133543014526, "logits/rejected": 1.3975563049316406, "logps/chosen": -1097.9437255859375, "logps/rejected": -1693.154541015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -20.652606964111328, "rewards/margins": 36.89767074584961, "rewards/rejected": -57.55027770996094, "step": 59 }, { "epoch": 0.975609756097561, "grad_norm": 2.4582501282566227e-05, "learning_rate": 0.00011797291214917881, "logits/chosen": 1.379901647567749, "logits/rejected": 1.2993323802947998, "logps/chosen": -1204.1943359375, "logps/rejected": -1411.241455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.423160552978516, "rewards/margins": 26.866172790527344, "rewards/rejected": -46.28933334350586, "step": 60 }, { "epoch": 0.991869918699187, "grad_norm": 7.934165478218347e-05, "learning_rate": 0.0001152314203735805, "logits/chosen": 1.951298713684082, "logits/rejected": 2.0110878944396973, "logps/chosen": -1275.750732421875, "logps/rejected": -1257.931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.708940505981445, "rewards/margins": 21.205249786376953, "rewards/rejected": -37.914188385009766, "step": 61 }, { "epoch": 1.0, "grad_norm": 2.9418702141015274e-08, "learning_rate": 0.00011247815646148087, "logits/chosen": 1.219478964805603, "logits/rejected": 1.4597835540771484, "logps/chosen": -1298.3076171875, "logps/rejected": -1700.546142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.570446014404297, "rewards/margins": 39.88042449951172, "rewards/rejected": -66.45086669921875, "step": 62 } ], "logging_steps": 1, "max_steps": 123, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }