{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 5510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 41.204517024742586, "learning_rate": 2.7223230490018146e-10, "logits/chosen": -1.0541447401046753, "logits/rejected": -0.7520447373390198, "logps/chosen": -60.40666961669922, "logps/rejected": -106.31614685058594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 46.36022550961656, "learning_rate": 2.722323049001815e-09, "logits/chosen": -1.6423306465148926, "logits/rejected": -1.0551129579544067, "logps/chosen": -118.1044921875, "logps/rejected": -147.71112060546875, "loss": 0.6932, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.00048234747373498976, "rewards/margins": -0.009559460915625095, "rewards/rejected": 0.01004180870950222, "step": 10 }, { "epoch": 0.01, "grad_norm": 44.73280646725986, "learning_rate": 5.44464609800363e-09, "logits/chosen": -1.3452703952789307, "logits/rejected": -1.4189544916152954, "logps/chosen": -100.15978240966797, "logps/rejected": -120.36905670166016, "loss": 0.6934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005216827150434256, "rewards/margins": 0.023615092039108276, "rewards/rejected": -0.02883191965520382, "step": 20 }, { "epoch": 0.01, "grad_norm": 47.03882983712326, "learning_rate": 8.166969147005445e-09, "logits/chosen": -1.8659166097640991, "logits/rejected": -1.3837544918060303, "logps/chosen": -97.74725341796875, "logps/rejected": -225.3568878173828, "loss": 0.6933, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.01882881112396717, "rewards/margins": 0.012613847851753235, "rewards/rejected": 0.0062149628065526485, "step": 30 }, { "epoch": 0.01, "grad_norm": 51.31627563719448, "learning_rate": 1.088929219600726e-08, "logits/chosen": -1.726585030555725, "logits/rejected": -1.5602327585220337, "logps/chosen": -117.8109359741211, "logps/rejected": -127.14540100097656, "loss": 0.6942, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006876278668642044, "rewards/margins": 0.018800906836986542, "rewards/rejected": -0.025677183642983437, "step": 40 }, { "epoch": 0.02, "grad_norm": 41.50488252094809, "learning_rate": 1.3611615245009074e-08, "logits/chosen": -1.8296377658843994, "logits/rejected": -1.4772355556488037, "logps/chosen": -97.29546356201172, "logps/rejected": -119.9217758178711, "loss": 0.6936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01763664372265339, "rewards/margins": 0.02796524204313755, "rewards/rejected": -0.010328599251806736, "step": 50 }, { "epoch": 0.02, "grad_norm": 43.88658635517158, "learning_rate": 1.633393829401089e-08, "logits/chosen": -1.7242791652679443, "logits/rejected": -1.7074722051620483, "logps/chosen": -93.03128814697266, "logps/rejected": -110.11827087402344, "loss": 0.6967, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.006739559583365917, "rewards/margins": -0.004187393002212048, "rewards/rejected": 0.010926952585577965, "step": 60 }, { "epoch": 0.03, "grad_norm": 45.67398430745054, "learning_rate": 1.90562613430127e-08, "logits/chosen": -2.0839061737060547, "logits/rejected": -1.6974592208862305, "logps/chosen": -89.07298278808594, "logps/rejected": -121.43327331542969, "loss": 0.6934, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.018308373168110847, "rewards/margins": 0.026240844279527664, "rewards/rejected": -0.007932471111416817, "step": 70 }, { "epoch": 0.03, "grad_norm": 48.69573234717181, "learning_rate": 2.177858439201452e-08, "logits/chosen": -1.2379311323165894, "logits/rejected": -1.0085999965667725, "logps/chosen": -92.49485778808594, "logps/rejected": -110.10205078125, "loss": 0.6901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.012255726382136345, "rewards/margins": 0.024808084592223167, "rewards/rejected": -0.012552358210086823, "step": 80 }, { "epoch": 0.03, "grad_norm": 41.975191835891444, "learning_rate": 2.4500907441016332e-08, "logits/chosen": -1.6204273700714111, "logits/rejected": -1.3888098001480103, "logps/chosen": -96.46438598632812, "logps/rejected": -131.0227508544922, "loss": 0.694, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0034812160301953554, "rewards/margins": -0.005814972333610058, "rewards/rejected": 0.002333755139261484, "step": 90 }, { "epoch": 0.04, "grad_norm": 42.27700827156122, "learning_rate": 2.7223230490018148e-08, "logits/chosen": -1.6626800298690796, "logits/rejected": -1.4588112831115723, "logps/chosen": -84.01578521728516, "logps/rejected": -111.53340911865234, "loss": 0.6953, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.015042876824736595, "rewards/margins": 0.0040522003546357155, "rewards/rejected": 0.01099067647010088, "step": 100 }, { "epoch": 0.04, "eval_logits/chosen": -2.501953125, "eval_logits/rejected": -2.2406537532806396, "eval_logps/chosen": -99.84339141845703, "eval_logps/rejected": -125.21623992919922, "eval_loss": 0.6942028403282166, "eval_rewards/accuracies": 0.5357142686843872, "eval_rewards/chosen": 0.011287148110568523, "eval_rewards/margins": -0.0042336308397352695, "eval_rewards/rejected": 0.015520776621997356, "eval_runtime": 73.7375, "eval_samples_per_second": 12.07, "eval_steps_per_second": 0.19, "step": 100 }, { "epoch": 0.04, "grad_norm": 46.94348163244785, "learning_rate": 2.994555353901996e-08, "logits/chosen": -1.4403226375579834, "logits/rejected": -1.123073935508728, "logps/chosen": -107.60018157958984, "logps/rejected": -231.7578582763672, "loss": 0.6892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009284459054470062, "rewards/margins": 0.01384044624865055, "rewards/rejected": -0.004555988125503063, "step": 110 }, { "epoch": 0.04, "grad_norm": 46.81740301639235, "learning_rate": 3.266787658802178e-08, "logits/chosen": -1.4798848628997803, "logits/rejected": -1.3624470233917236, "logps/chosen": -94.34761047363281, "logps/rejected": -120.3208999633789, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.016081811860203743, "rewards/margins": 0.002866668626666069, "rewards/rejected": 0.0132151423022151, "step": 120 }, { "epoch": 0.05, "grad_norm": 46.08791439590883, "learning_rate": 3.539019963702359e-08, "logits/chosen": -1.9895107746124268, "logits/rejected": -1.281200885772705, "logps/chosen": -95.02359771728516, "logps/rejected": -131.6729736328125, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02468906342983246, "rewards/margins": -0.0023543343413621187, "rewards/rejected": 0.027043398469686508, "step": 130 }, { "epoch": 0.05, "grad_norm": 48.41164277694124, "learning_rate": 3.81125226860254e-08, "logits/chosen": -1.8839794397354126, "logits/rejected": -1.5620959997177124, "logps/chosen": -97.3235855102539, "logps/rejected": -101.75816345214844, "loss": 0.6896, "rewards/accuracies": 0.25, "rewards/chosen": 0.023149680346250534, "rewards/margins": -0.027130965143442154, "rewards/rejected": 0.050280649214982986, "step": 140 }, { "epoch": 0.05, "grad_norm": 42.39632204422603, "learning_rate": 4.083484573502722e-08, "logits/chosen": -1.9134677648544312, "logits/rejected": -1.6838871240615845, "logps/chosen": -72.68460845947266, "logps/rejected": -104.17420959472656, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 0.04995592683553696, "rewards/margins": 0.012682494707405567, "rewards/rejected": 0.03727342560887337, "step": 150 }, { "epoch": 0.06, "grad_norm": 45.64533725609479, "learning_rate": 4.355716878402904e-08, "logits/chosen": -1.3503320217132568, "logits/rejected": -1.1550263166427612, "logps/chosen": -110.38504791259766, "logps/rejected": -130.46852111816406, "loss": 0.69, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.05299491807818413, "rewards/margins": -0.0019288047915324569, "rewards/rejected": 0.05492372438311577, "step": 160 }, { "epoch": 0.06, "grad_norm": 41.18460830204607, "learning_rate": 4.627949183303085e-08, "logits/chosen": -1.496957778930664, "logits/rejected": -0.9095319509506226, "logps/chosen": -78.29255676269531, "logps/rejected": -132.0447540283203, "loss": 0.6846, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.059414975345134735, "rewards/margins": 0.02102474495768547, "rewards/rejected": 0.03839023411273956, "step": 170 }, { "epoch": 0.07, "grad_norm": 41.49152413703793, "learning_rate": 4.9001814882032664e-08, "logits/chosen": -1.5343455076217651, "logits/rejected": -1.8757272958755493, "logps/chosen": -103.69041442871094, "logps/rejected": -111.36357116699219, "loss": 0.687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07798389345407486, "rewards/margins": 0.0013759995345026255, "rewards/rejected": 0.07660789787769318, "step": 180 }, { "epoch": 0.07, "grad_norm": 42.39863370578702, "learning_rate": 5.172413793103448e-08, "logits/chosen": -1.6158069372177124, "logits/rejected": -0.9124865531921387, "logps/chosen": -100.64363861083984, "logps/rejected": -135.06634521484375, "loss": 0.6837, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.09579858928918839, "rewards/margins": 0.01044096052646637, "rewards/rejected": 0.08535762131214142, "step": 190 }, { "epoch": 0.07, "grad_norm": 39.9911934894391, "learning_rate": 5.4446460980036295e-08, "logits/chosen": -2.3459877967834473, "logits/rejected": -1.5009344816207886, "logps/chosen": -82.8287353515625, "logps/rejected": -124.65399169921875, "loss": 0.6799, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.14615079760551453, "rewards/margins": -0.0005797408521175385, "rewards/rejected": 0.14673054218292236, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": -2.5021491050720215, "eval_logits/rejected": -2.2421486377716064, "eval_logps/chosen": -98.44856262207031, "eval_logps/rejected": -124.38037109375, "eval_loss": 0.6807048916816711, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": 0.15076985955238342, "eval_rewards/margins": 0.05166209489107132, "eval_rewards/rejected": 0.0991077572107315, "eval_runtime": 71.9557, "eval_samples_per_second": 12.369, "eval_steps_per_second": 0.195, "step": 200 }, { "epoch": 0.08, "grad_norm": 45.974517848209416, "learning_rate": 5.716878402903811e-08, "logits/chosen": -2.038114070892334, "logits/rejected": -1.6292145252227783, "logps/chosen": -95.0359115600586, "logps/rejected": -117.90171813964844, "loss": 0.6816, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.15096978843212128, "rewards/margins": -0.013230574317276478, "rewards/rejected": 0.16420036554336548, "step": 210 }, { "epoch": 0.08, "grad_norm": 44.619784920786046, "learning_rate": 5.989110707803992e-08, "logits/chosen": -2.296483039855957, "logits/rejected": -1.9715936183929443, "logps/chosen": -93.15199279785156, "logps/rejected": -127.34361267089844, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": 0.20922093093395233, "rewards/margins": 0.046642549335956573, "rewards/rejected": 0.16257838904857635, "step": 220 }, { "epoch": 0.08, "grad_norm": 44.88970750413813, "learning_rate": 6.261343012704174e-08, "logits/chosen": -1.450714349746704, "logits/rejected": -0.9239526987075806, "logps/chosen": -85.76863098144531, "logps/rejected": -121.76054382324219, "loss": 0.6702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1901402473449707, "rewards/margins": 0.05737558752298355, "rewards/rejected": 0.13276468217372894, "step": 230 }, { "epoch": 0.09, "grad_norm": 39.31008777722159, "learning_rate": 6.533575317604356e-08, "logits/chosen": -1.9880611896514893, "logits/rejected": -1.3975508213043213, "logps/chosen": -99.23193359375, "logps/rejected": -116.69496154785156, "loss": 0.6673, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2814417779445648, "rewards/margins": 0.020699840039014816, "rewards/rejected": 0.2607419490814209, "step": 240 }, { "epoch": 0.09, "grad_norm": 43.800148674203605, "learning_rate": 6.805807622504536e-08, "logits/chosen": -1.808105230331421, "logits/rejected": -1.5273463726043701, "logps/chosen": -86.12389373779297, "logps/rejected": -122.4721908569336, "loss": 0.666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2851946949958801, "rewards/margins": 0.03660944104194641, "rewards/rejected": 0.24858525395393372, "step": 250 }, { "epoch": 0.09, "grad_norm": 43.84613473195863, "learning_rate": 7.078039927404718e-08, "logits/chosen": -2.0416717529296875, "logits/rejected": -1.812233567237854, "logps/chosen": -118.04366302490234, "logps/rejected": -118.3464126586914, "loss": 0.6663, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3446394205093384, "rewards/margins": 0.07985617220401764, "rewards/rejected": 0.26478323340415955, "step": 260 }, { "epoch": 0.1, "grad_norm": 45.46886488583846, "learning_rate": 7.3502722323049e-08, "logits/chosen": -2.089078664779663, "logits/rejected": -1.5567834377288818, "logps/chosen": -103.9489974975586, "logps/rejected": -151.25582885742188, "loss": 0.651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3747584819793701, "rewards/margins": 0.04969606548547745, "rewards/rejected": 0.32506245374679565, "step": 270 }, { "epoch": 0.1, "grad_norm": 46.872473628922336, "learning_rate": 7.62250453720508e-08, "logits/chosen": -1.9163455963134766, "logits/rejected": -1.490391492843628, "logps/chosen": -79.95561218261719, "logps/rejected": -128.16030883789062, "loss": 0.6558, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5081700682640076, "rewards/margins": 0.14635446667671204, "rewards/rejected": 0.36181557178497314, "step": 280 }, { "epoch": 0.11, "grad_norm": 40.264154663026744, "learning_rate": 7.894736842105262e-08, "logits/chosen": -1.999222755432129, "logits/rejected": -2.104238986968994, "logps/chosen": -96.35908508300781, "logps/rejected": -113.0312728881836, "loss": 0.6477, "rewards/accuracies": 0.75, "rewards/chosen": 0.603106677532196, "rewards/margins": 0.18604376912117004, "rewards/rejected": 0.4170629382133484, "step": 290 }, { "epoch": 0.11, "grad_norm": 38.393082680911135, "learning_rate": 8.166969147005444e-08, "logits/chosen": -2.011298894882202, "logits/rejected": -1.8035573959350586, "logps/chosen": -101.60588836669922, "logps/rejected": -126.2813491821289, "loss": 0.6328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5809694528579712, "rewards/margins": 0.10704417526721954, "rewards/rejected": 0.4739252030849457, "step": 300 }, { "epoch": 0.11, "eval_logits/chosen": -2.5139808654785156, "eval_logits/rejected": -2.253553628921509, "eval_logps/chosen": -93.76677703857422, "eval_logps/rejected": -121.03182220458984, "eval_loss": 0.6357866525650024, "eval_rewards/accuracies": 0.8214285969734192, "eval_rewards/chosen": 0.6189486384391785, "eval_rewards/margins": 0.18498651683330536, "eval_rewards/rejected": 0.4339621067047119, "eval_runtime": 72.0138, "eval_samples_per_second": 12.359, "eval_steps_per_second": 0.194, "step": 300 }, { "epoch": 0.11, "grad_norm": 41.32014110656028, "learning_rate": 8.439201451905626e-08, "logits/chosen": -1.7809641361236572, "logits/rejected": -1.4835566282272339, "logps/chosen": -91.62686157226562, "logps/rejected": -160.69992065429688, "loss": 0.6213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6366099715232849, "rewards/margins": 0.10506564378738403, "rewards/rejected": 0.5315443277359009, "step": 310 }, { "epoch": 0.12, "grad_norm": 40.20352730538307, "learning_rate": 8.711433756805808e-08, "logits/chosen": -1.3174822330474854, "logits/rejected": -0.9151695370674133, "logps/chosen": -76.4534912109375, "logps/rejected": -106.75065612792969, "loss": 0.6352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6975838541984558, "rewards/margins": 0.11969652026891708, "rewards/rejected": 0.5778872966766357, "step": 320 }, { "epoch": 0.12, "grad_norm": 44.10982454650109, "learning_rate": 8.983666061705989e-08, "logits/chosen": -1.7297052145004272, "logits/rejected": -1.1650458574295044, "logps/chosen": -95.3366470336914, "logps/rejected": -113.1480712890625, "loss": 0.6209, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8798357844352722, "rewards/margins": 0.10148117691278458, "rewards/rejected": 0.7783547043800354, "step": 330 }, { "epoch": 0.12, "grad_norm": 39.48829529957203, "learning_rate": 9.25589836660617e-08, "logits/chosen": -1.7264535427093506, "logits/rejected": -1.7544937133789062, "logps/chosen": -100.7391586303711, "logps/rejected": -132.95297241210938, "loss": 0.6138, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7276839017868042, "rewards/margins": 0.1849050223827362, "rewards/rejected": 0.5427789092063904, "step": 340 }, { "epoch": 0.13, "grad_norm": 40.75750118936868, "learning_rate": 9.528130671506351e-08, "logits/chosen": -1.9755557775497437, "logits/rejected": -1.5455005168914795, "logps/chosen": -92.50463104248047, "logps/rejected": -114.50557708740234, "loss": 0.6187, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.886059582233429, "rewards/margins": 0.1289076954126358, "rewards/rejected": 0.7571519613265991, "step": 350 }, { "epoch": 0.13, "grad_norm": 42.45802253147107, "learning_rate": 9.800362976406533e-08, "logits/chosen": -1.9283500909805298, "logits/rejected": -1.5053136348724365, "logps/chosen": -82.00525665283203, "logps/rejected": -104.69712829589844, "loss": 0.6127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1786202192306519, "rewards/margins": 0.27130889892578125, "rewards/rejected": 0.9073113203048706, "step": 360 }, { "epoch": 0.13, "grad_norm": 39.734876260743064, "learning_rate": 1.0072595281306713e-07, "logits/chosen": -2.2132327556610107, "logits/rejected": -1.904916524887085, "logps/chosen": -72.9496841430664, "logps/rejected": -108.14723205566406, "loss": 0.605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2401649951934814, "rewards/margins": 0.1573180854320526, "rewards/rejected": 1.0828468799591064, "step": 370 }, { "epoch": 0.14, "grad_norm": 39.83612602137503, "learning_rate": 1.0344827586206897e-07, "logits/chosen": -2.0371642112731934, "logits/rejected": -1.9173485040664673, "logps/chosen": -83.82426452636719, "logps/rejected": -97.99388885498047, "loss": 0.5913, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2351922988891602, "rewards/margins": 0.4066685140132904, "rewards/rejected": 0.8285236358642578, "step": 380 }, { "epoch": 0.14, "grad_norm": 42.576323725278016, "learning_rate": 1.0617059891107078e-07, "logits/chosen": -2.1276021003723145, "logits/rejected": -2.370884418487549, "logps/chosen": -97.64468383789062, "logps/rejected": -113.1034164428711, "loss": 0.6069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.331588864326477, "rewards/margins": 0.19395257532596588, "rewards/rejected": 1.1376360654830933, "step": 390 }, { "epoch": 0.15, "grad_norm": 44.1343637917746, "learning_rate": 1.0889292196007259e-07, "logits/chosen": -1.8790748119354248, "logits/rejected": -1.5620168447494507, "logps/chosen": -72.6387710571289, "logps/rejected": -154.55137634277344, "loss": 0.5964, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3714253902435303, "rewards/margins": 0.4154202938079834, "rewards/rejected": 0.9560050964355469, "step": 400 }, { "epoch": 0.15, "eval_logits/chosen": -2.5321178436279297, "eval_logits/rejected": -2.2736294269561768, "eval_logps/chosen": -86.19153594970703, "eval_logps/rejected": -116.01555633544922, "eval_loss": 0.5837011337280273, "eval_rewards/accuracies": 0.7678571343421936, "eval_rewards/chosen": 1.3764731884002686, "eval_rewards/margins": 0.44088268280029297, "eval_rewards/rejected": 0.9355906844139099, "eval_runtime": 71.9235, "eval_samples_per_second": 12.374, "eval_steps_per_second": 0.195, "step": 400 }, { "epoch": 0.15, "grad_norm": 32.5866475732278, "learning_rate": 1.1161524500907441e-07, "logits/chosen": -2.1084635257720947, "logits/rejected": -1.6532913446426392, "logps/chosen": -71.08902740478516, "logps/rejected": -119.5894546508789, "loss": 0.5579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3634411096572876, "rewards/margins": 0.15741710364818573, "rewards/rejected": 1.206023931503296, "step": 410 }, { "epoch": 0.15, "grad_norm": 51.63908130905885, "learning_rate": 1.1433756805807621e-07, "logits/chosen": -1.8392841815948486, "logits/rejected": -1.5305787324905396, "logps/chosen": -64.91630554199219, "logps/rejected": -90.24629974365234, "loss": 0.5809, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4629342555999756, "rewards/margins": 0.21032755076885223, "rewards/rejected": 1.2526066303253174, "step": 420 }, { "epoch": 0.16, "grad_norm": 42.575288667126856, "learning_rate": 1.1705989110707803e-07, "logits/chosen": -1.7529628276824951, "logits/rejected": -1.451038122177124, "logps/chosen": -81.2251205444336, "logps/rejected": -142.9680938720703, "loss": 0.5776, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.41762375831604, "rewards/margins": 0.20172078907489777, "rewards/rejected": 1.2159030437469482, "step": 430 }, { "epoch": 0.16, "grad_norm": 37.38028775120078, "learning_rate": 1.1978221415607984e-07, "logits/chosen": -1.7141106128692627, "logits/rejected": -1.342930555343628, "logps/chosen": -74.53529357910156, "logps/rejected": -107.2362060546875, "loss": 0.5626, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7542121410369873, "rewards/margins": 0.4075583517551422, "rewards/rejected": 1.3466538190841675, "step": 440 }, { "epoch": 0.16, "grad_norm": 36.24749956558924, "learning_rate": 1.2250453720508167e-07, "logits/chosen": -1.9711906909942627, "logits/rejected": -1.221478819847107, "logps/chosen": -103.87667083740234, "logps/rejected": -135.00277709960938, "loss": 0.5604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.4585211277008057, "rewards/margins": 0.28392547369003296, "rewards/rejected": 1.1745957136154175, "step": 450 }, { "epoch": 0.17, "grad_norm": 42.96091232839455, "learning_rate": 1.2522686025408348e-07, "logits/chosen": -1.7234961986541748, "logits/rejected": -1.5666788816452026, "logps/chosen": -94.36329650878906, "logps/rejected": -109.29872131347656, "loss": 0.5283, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.7453796863555908, "rewards/margins": 0.3029170632362366, "rewards/rejected": 1.442462682723999, "step": 460 }, { "epoch": 0.17, "grad_norm": 48.636321311117, "learning_rate": 1.279491833030853e-07, "logits/chosen": -2.4974513053894043, "logits/rejected": -1.580528736114502, "logps/chosen": -93.00968170166016, "logps/rejected": -127.61222076416016, "loss": 0.5586, "rewards/accuracies": 0.75, "rewards/chosen": 1.8251793384552002, "rewards/margins": 0.470356285572052, "rewards/rejected": 1.354823112487793, "step": 470 }, { "epoch": 0.17, "grad_norm": 35.14478185239256, "learning_rate": 1.3067150635208711e-07, "logits/chosen": -1.878103256225586, "logits/rejected": -1.7810052633285522, "logps/chosen": -64.92475891113281, "logps/rejected": -95.4583511352539, "loss": 0.5455, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.9156177043914795, "rewards/margins": 0.21928730607032776, "rewards/rejected": 1.6963306665420532, "step": 480 }, { "epoch": 0.18, "grad_norm": 41.07425605129, "learning_rate": 1.3339382940108892e-07, "logits/chosen": -1.8308073282241821, "logits/rejected": -1.6511573791503906, "logps/chosen": -94.47706604003906, "logps/rejected": -142.56326293945312, "loss": 0.5669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.7191970348358154, "rewards/margins": 0.4305998682975769, "rewards/rejected": 1.2885972261428833, "step": 490 }, { "epoch": 0.18, "grad_norm": 40.72644317271578, "learning_rate": 1.3611615245009072e-07, "logits/chosen": -1.8288648128509521, "logits/rejected": -1.6132848262786865, "logps/chosen": -84.89088439941406, "logps/rejected": -141.57489013671875, "loss": 0.5567, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.799727201461792, "rewards/margins": 0.7182740569114685, "rewards/rejected": 1.0814530849456787, "step": 500 }, { "epoch": 0.18, "eval_logits/chosen": -2.56964373588562, "eval_logits/rejected": -2.3121345043182373, "eval_logps/chosen": -81.02156829833984, "eval_logps/rejected": -113.64513397216797, "eval_loss": 0.5355415940284729, "eval_rewards/accuracies": 0.8392857313156128, "eval_rewards/chosen": 1.8934696912765503, "eval_rewards/margins": 0.7208380699157715, "eval_rewards/rejected": 1.1726313829421997, "eval_runtime": 71.9101, "eval_samples_per_second": 12.377, "eval_steps_per_second": 0.195, "step": 500 }, { "epoch": 0.19, "grad_norm": 31.50812715169961, "learning_rate": 1.3883847549909256e-07, "logits/chosen": -2.6629555225372314, "logits/rejected": -1.8640201091766357, "logps/chosen": -64.94083404541016, "logps/rejected": -131.4000701904297, "loss": 0.5287, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.017392635345459, "rewards/margins": 0.5965152978897095, "rewards/rejected": 1.4208776950836182, "step": 510 }, { "epoch": 0.19, "grad_norm": 41.001026190013995, "learning_rate": 1.4156079854809436e-07, "logits/chosen": -2.0135555267333984, "logits/rejected": -2.3467681407928467, "logps/chosen": -83.2616958618164, "logps/rejected": -96.27758026123047, "loss": 0.5086, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7922933101654053, "rewards/margins": 0.6331424117088318, "rewards/rejected": 1.1591508388519287, "step": 520 }, { "epoch": 0.19, "grad_norm": 43.50803723743233, "learning_rate": 1.442831215970962e-07, "logits/chosen": -2.3700008392333984, "logits/rejected": -1.9721641540527344, "logps/chosen": -65.375, "logps/rejected": -107.3283920288086, "loss": 0.5664, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.932377576828003, "rewards/margins": 0.5669600367546082, "rewards/rejected": 1.36541748046875, "step": 530 }, { "epoch": 0.2, "grad_norm": 36.6257172454756, "learning_rate": 1.47005444646098e-07, "logits/chosen": -2.7627670764923096, "logits/rejected": -2.018375873565674, "logps/chosen": -66.04200744628906, "logps/rejected": -90.71024322509766, "loss": 0.5114, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.1180334091186523, "rewards/margins": 0.621078610420227, "rewards/rejected": 1.4969549179077148, "step": 540 }, { "epoch": 0.2, "grad_norm": 35.991193860058296, "learning_rate": 1.497277676950998e-07, "logits/chosen": -1.3566482067108154, "logits/rejected": -1.3935582637786865, "logps/chosen": -77.91327667236328, "logps/rejected": -97.44300079345703, "loss": 0.5308, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.9388548135757446, "rewards/margins": 0.38032227754592896, "rewards/rejected": 1.5585325956344604, "step": 550 }, { "epoch": 0.2, "grad_norm": 48.06228927830701, "learning_rate": 1.49998780935628e-07, "logits/chosen": -1.7465827465057373, "logits/rejected": -1.4473216533660889, "logps/chosen": -70.13250732421875, "logps/rejected": -105.08538818359375, "loss": 0.4779, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5702747106552124, "rewards/margins": 0.8154007196426392, "rewards/rejected": 0.7548739910125732, "step": 560 }, { "epoch": 0.21, "grad_norm": 38.75394797505258, "learning_rate": 1.4999456693682575e-07, "logits/chosen": -2.4446985721588135, "logits/rejected": -1.8925511837005615, "logps/chosen": -71.6372299194336, "logps/rejected": -103.564697265625, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": 2.205054759979248, "rewards/margins": 0.5576547384262085, "rewards/rejected": 1.647400140762329, "step": 570 }, { "epoch": 0.21, "grad_norm": 37.78034263492822, "learning_rate": 1.4998734312249918e-07, "logits/chosen": -1.8992903232574463, "logits/rejected": -1.5952703952789307, "logps/chosen": -83.2913589477539, "logps/rejected": -120.90938568115234, "loss": 0.4891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9770288467407227, "rewards/margins": 0.7018577456474304, "rewards/rejected": 1.2751710414886475, "step": 580 }, { "epoch": 0.21, "grad_norm": 37.211325491550305, "learning_rate": 1.4997710978256735e-07, "logits/chosen": -1.94363272190094, "logits/rejected": -1.9168386459350586, "logps/chosen": -76.82261657714844, "logps/rejected": -120.50138092041016, "loss": 0.5175, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.0862905979156494, "rewards/margins": 0.5364335775375366, "rewards/rejected": 1.5498571395874023, "step": 590 }, { "epoch": 0.22, "grad_norm": 38.42991397247903, "learning_rate": 1.4996386732773285e-07, "logits/chosen": -1.6784331798553467, "logits/rejected": -1.4212602376937866, "logps/chosen": -78.51207733154297, "logps/rejected": -97.93910217285156, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": 1.9482488632202148, "rewards/margins": 0.5499599575996399, "rewards/rejected": 1.3982888460159302, "step": 600 }, { "epoch": 0.22, "eval_logits/chosen": -2.631465435028076, "eval_logits/rejected": -2.3776392936706543, "eval_logps/chosen": -79.7362289428711, "eval_logps/rejected": -114.89885711669922, "eval_loss": 0.4917795658111572, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 2.022002696990967, "eval_rewards/margins": 0.9747439026832581, "eval_rewards/rejected": 1.047258734703064, "eval_runtime": 71.9087, "eval_samples_per_second": 12.377, "eval_steps_per_second": 0.195, "step": 600 }, { "epoch": 0.22, "grad_norm": 48.91223436910098, "learning_rate": 1.4994761628946546e-07, "logits/chosen": -1.6232852935791016, "logits/rejected": -1.4054292440414429, "logps/chosen": -76.1790542602539, "logps/rejected": -125.78961181640625, "loss": 0.4929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.940302848815918, "rewards/margins": 0.9549075961112976, "rewards/rejected": 0.9853953123092651, "step": 610 }, { "epoch": 0.23, "grad_norm": 41.70252788830195, "learning_rate": 1.499283573199808e-07, "logits/chosen": -1.7214891910552979, "logits/rejected": -1.2875360250473022, "logps/chosen": -73.24628448486328, "logps/rejected": -86.8110122680664, "loss": 0.4935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8633216619491577, "rewards/margins": 0.7748242020606995, "rewards/rejected": 1.0884974002838135, "step": 620 }, { "epoch": 0.23, "grad_norm": 38.30334820196022, "learning_rate": 1.499060911922141e-07, "logits/chosen": -1.616151213645935, "logits/rejected": -1.6390151977539062, "logps/chosen": -68.93297576904297, "logps/rejected": -113.6138687133789, "loss": 0.4795, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.810837984085083, "rewards/margins": 0.38780540227890015, "rewards/rejected": 1.423032522201538, "step": 630 }, { "epoch": 0.23, "grad_norm": 34.79521166345824, "learning_rate": 1.498808187997893e-07, "logits/chosen": -1.9353692531585693, "logits/rejected": -1.5537188053131104, "logps/chosen": -97.22018432617188, "logps/rejected": -153.95028686523438, "loss": 0.5006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8405921459197998, "rewards/margins": 1.0561001300811768, "rewards/rejected": 0.7844920754432678, "step": 640 }, { "epoch": 0.24, "grad_norm": 94.46732088298053, "learning_rate": 1.4985254115698304e-07, "logits/chosen": -2.263124704360962, "logits/rejected": -1.8345104455947876, "logps/chosen": -73.39897155761719, "logps/rejected": -92.21670532226562, "loss": 0.4917, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.1108241081237793, "rewards/margins": 0.9142869114875793, "rewards/rejected": 1.1965371370315552, "step": 650 }, { "epoch": 0.24, "grad_norm": 33.786235958694014, "learning_rate": 1.4982125939868402e-07, "logits/chosen": -2.4463446140289307, "logits/rejected": -1.6779578924179077, "logps/chosen": -79.39064025878906, "logps/rejected": -132.6228790283203, "loss": 0.4479, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.3004369735717773, "rewards/margins": 1.3063924312591553, "rewards/rejected": 0.9940446019172668, "step": 660 }, { "epoch": 0.24, "grad_norm": 35.539345812176975, "learning_rate": 1.4978697478034753e-07, "logits/chosen": -2.2131102085113525, "logits/rejected": -1.8831088542938232, "logps/chosen": -67.44413757324219, "logps/rejected": -113.38688659667969, "loss": 0.4648, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.094628095626831, "rewards/margins": 0.9700366854667664, "rewards/rejected": 1.124591588973999, "step": 670 }, { "epoch": 0.25, "grad_norm": 38.83509762475066, "learning_rate": 1.497496886779449e-07, "logits/chosen": -2.4378554821014404, "logits/rejected": -1.9790445566177368, "logps/chosen": -103.16511535644531, "logps/rejected": -128.8167266845703, "loss": 0.4337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8627456426620483, "rewards/margins": 0.8879152536392212, "rewards/rejected": 0.9748304486274719, "step": 680 }, { "epoch": 0.25, "grad_norm": 41.327033787846, "learning_rate": 1.497094025879084e-07, "logits/chosen": -1.8331416845321655, "logits/rejected": -1.4264384508132935, "logps/chosen": -80.54314422607422, "logps/rejected": -137.10955810546875, "loss": 0.4143, "rewards/accuracies": 0.75, "rewards/chosen": 1.4691455364227295, "rewards/margins": 0.8567919731140137, "rewards/rejected": 0.612353503704071, "step": 690 }, { "epoch": 0.25, "grad_norm": 38.09387686336411, "learning_rate": 1.4966611812707116e-07, "logits/chosen": -2.376185894012451, "logits/rejected": -1.9318294525146484, "logps/chosen": -67.70744323730469, "logps/rejected": -106.6443099975586, "loss": 0.4468, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5539474487304688, "rewards/margins": 1.1218903064727783, "rewards/rejected": 1.43205726146698, "step": 700 }, { "epoch": 0.25, "eval_logits/chosen": -2.6510469913482666, "eval_logits/rejected": -2.410930871963501, "eval_logps/chosen": -79.01429748535156, "eval_logps/rejected": -117.04235076904297, "eval_loss": 0.45572710037231445, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 2.0941965579986572, "eval_rewards/margins": 1.2612864971160889, "eval_rewards/rejected": 0.832909882068634, "eval_runtime": 72.043, "eval_samples_per_second": 12.354, "eval_steps_per_second": 0.194, "step": 700 }, { "epoch": 0.26, "grad_norm": 34.41497827448044, "learning_rate": 1.4961983703260222e-07, "logits/chosen": -2.285163402557373, "logits/rejected": -1.6250555515289307, "logps/chosen": -82.45291137695312, "logps/rejected": -162.6754913330078, "loss": 0.4388, "rewards/accuracies": 0.75, "rewards/chosen": 1.6247352361679077, "rewards/margins": 1.2791966199874878, "rewards/rejected": 0.34553852677345276, "step": 710 }, { "epoch": 0.26, "grad_norm": 38.939975049466675, "learning_rate": 1.495705611619369e-07, "logits/chosen": -2.8705544471740723, "logits/rejected": -1.9211571216583252, "logps/chosen": -52.576690673828125, "logps/rejected": -104.49955749511719, "loss": 0.4632, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.5734798908233643, "rewards/margins": 1.8182973861694336, "rewards/rejected": 0.755182683467865, "step": 720 }, { "epoch": 0.26, "grad_norm": 37.237058738428075, "learning_rate": 1.4951829249270223e-07, "logits/chosen": -2.2160234451293945, "logits/rejected": -1.7666336297988892, "logps/chosen": -94.98374938964844, "logps/rejected": -155.47586059570312, "loss": 0.4477, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8300750255584717, "rewards/margins": 0.9866692423820496, "rewards/rejected": 0.8434059023857117, "step": 730 }, { "epoch": 0.27, "grad_norm": 42.247815955525425, "learning_rate": 1.4946303312263751e-07, "logits/chosen": -2.1757843494415283, "logits/rejected": -1.8440332412719727, "logps/chosen": -72.95508575439453, "logps/rejected": -113.18672943115234, "loss": 0.4715, "rewards/accuracies": 0.75, "rewards/chosen": 2.1009631156921387, "rewards/margins": 1.115952491760254, "rewards/rejected": 0.9850105047225952, "step": 740 }, { "epoch": 0.27, "grad_norm": 32.89732640498844, "learning_rate": 1.4940478526951018e-07, "logits/chosen": -1.856689453125, "logits/rejected": -1.4313759803771973, "logps/chosen": -82.71755981445312, "logps/rejected": -135.11648559570312, "loss": 0.426, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.647456407546997, "rewards/margins": 1.2215721607208252, "rewards/rejected": 0.425883948802948, "step": 750 }, { "epoch": 0.28, "grad_norm": 39.76459644604554, "learning_rate": 1.4934355127102686e-07, "logits/chosen": -2.3893539905548096, "logits/rejected": -1.5895264148712158, "logps/chosen": -78.76719665527344, "logps/rejected": -160.42080688476562, "loss": 0.4394, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.3389594554901123, "rewards/margins": 1.721954345703125, "rewards/rejected": 0.6170052289962769, "step": 760 }, { "epoch": 0.28, "grad_norm": 30.768392184975625, "learning_rate": 1.492793335847394e-07, "logits/chosen": -2.251796245574951, "logits/rejected": -1.5280625820159912, "logps/chosen": -63.670433044433594, "logps/rejected": -124.48072814941406, "loss": 0.4001, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.670900583267212, "rewards/margins": 1.3456604480743408, "rewards/rejected": 0.3252400755882263, "step": 770 }, { "epoch": 0.28, "grad_norm": 35.26526799612695, "learning_rate": 1.4921213478794637e-07, "logits/chosen": -2.0840353965759277, "logits/rejected": -1.9413776397705078, "logps/chosen": -73.97483825683594, "logps/rejected": -113.29158020019531, "loss": 0.4185, "rewards/accuracies": 0.75, "rewards/chosen": 1.834567666053772, "rewards/margins": 1.0754048824310303, "rewards/rejected": 0.7591627240180969, "step": 780 }, { "epoch": 0.29, "grad_norm": 42.37614388323698, "learning_rate": 1.4914195757758955e-07, "logits/chosen": -2.1000962257385254, "logits/rejected": -2.3834102153778076, "logps/chosen": -76.1771240234375, "logps/rejected": -88.94342803955078, "loss": 0.4397, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.1757988929748535, "rewards/margins": 1.1148507595062256, "rewards/rejected": 1.060948133468628, "step": 790 }, { "epoch": 0.29, "grad_norm": 37.69381725528037, "learning_rate": 1.4906880477014573e-07, "logits/chosen": -2.2119500637054443, "logits/rejected": -1.4499582052230835, "logps/chosen": -62.1577033996582, "logps/rejected": -163.81442260742188, "loss": 0.4445, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.9283027648925781, "rewards/margins": 1.5437467098236084, "rewards/rejected": 0.3845560550689697, "step": 800 }, { "epoch": 0.29, "eval_logits/chosen": -2.6051876544952393, "eval_logits/rejected": -2.3914036750793457, "eval_logps/chosen": -83.42094421386719, "eval_logps/rejected": -123.46272277832031, "eval_loss": 0.43243297934532166, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.6535331010818481, "eval_rewards/margins": 1.4626604318618774, "eval_rewards/rejected": 0.1908726841211319, "eval_runtime": 71.9896, "eval_samples_per_second": 12.363, "eval_steps_per_second": 0.194, "step": 800 }, { "epoch": 0.29, "grad_norm": 35.55693000649927, "learning_rate": 1.489926793015137e-07, "logits/chosen": -2.137413501739502, "logits/rejected": -1.9606491327285767, "logps/chosen": -69.8790512084961, "logps/rejected": -97.57211303710938, "loss": 0.3881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 2.0780739784240723, "rewards/margins": 1.4736759662628174, "rewards/rejected": 0.6043978333473206, "step": 810 }, { "epoch": 0.3, "grad_norm": 37.02292806830243, "learning_rate": 1.489135842268963e-07, "logits/chosen": -2.0538628101348877, "logits/rejected": -2.0753753185272217, "logps/chosen": -106.13938903808594, "logps/rejected": -144.13514709472656, "loss": 0.4145, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3868672847747803, "rewards/margins": 1.0619122982025146, "rewards/rejected": 0.3249548673629761, "step": 820 }, { "epoch": 0.3, "grad_norm": 40.671728265294014, "learning_rate": 1.4883152272067798e-07, "logits/chosen": -1.9181344509124756, "logits/rejected": -1.6507272720336914, "logps/chosen": -99.54740905761719, "logps/rejected": -119.50572204589844, "loss": 0.4404, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.4642399549484253, "rewards/margins": 0.9573423266410828, "rewards/rejected": 0.5068976283073425, "step": 830 }, { "epoch": 0.3, "grad_norm": 28.025590997376412, "learning_rate": 1.487464980762972e-07, "logits/chosen": -2.1211495399475098, "logits/rejected": -1.789926528930664, "logps/chosen": -76.32536315917969, "logps/rejected": -180.9839324951172, "loss": 0.393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8189208507537842, "rewards/margins": 1.659450888633728, "rewards/rejected": 0.15946999192237854, "step": 840 }, { "epoch": 0.31, "grad_norm": 42.494743726734264, "learning_rate": 1.4865851370611445e-07, "logits/chosen": -1.7717339992523193, "logits/rejected": -1.7804222106933594, "logps/chosen": -94.37425994873047, "logps/rejected": -134.35830688476562, "loss": 0.3825, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.593718409538269, "rewards/margins": 1.334201455116272, "rewards/rejected": 0.25951701402664185, "step": 850 }, { "epoch": 0.31, "grad_norm": 47.60353919509746, "learning_rate": 1.4856757314127514e-07, "logits/chosen": -1.930068016052246, "logits/rejected": -1.686745285987854, "logps/chosen": -87.13040161132812, "logps/rejected": -108.21986389160156, "loss": 0.3977, "rewards/accuracies": 0.75, "rewards/chosen": 1.576547622680664, "rewards/margins": 1.2103341817855835, "rewards/rejected": 0.3662133812904358, "step": 860 }, { "epoch": 0.32, "grad_norm": 55.2942123107171, "learning_rate": 1.4847368003156803e-07, "logits/chosen": -1.7101478576660156, "logits/rejected": -1.5123283863067627, "logps/chosen": -80.416259765625, "logps/rejected": -92.58718872070312, "loss": 0.4117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.8133132457733154, "rewards/margins": 1.0088872909545898, "rewards/rejected": 0.8044260144233704, "step": 870 }, { "epoch": 0.32, "grad_norm": 35.816541721556696, "learning_rate": 1.483768381452786e-07, "logits/chosen": -2.319746732711792, "logits/rejected": -1.79997980594635, "logps/chosen": -87.76374816894531, "logps/rejected": -112.37357330322266, "loss": 0.3523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.6848220825195312, "rewards/margins": 0.9358736276626587, "rewards/rejected": 0.7489483952522278, "step": 880 }, { "epoch": 0.32, "grad_norm": 36.93257214414797, "learning_rate": 1.482770513690379e-07, "logits/chosen": -1.800686240196228, "logits/rejected": -1.3662943840026855, "logps/chosen": -88.72349548339844, "logps/rejected": -132.239501953125, "loss": 0.3748, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5943005084991455, "rewards/margins": 1.0010852813720703, "rewards/rejected": 0.5932152271270752, "step": 890 }, { "epoch": 0.33, "grad_norm": 32.655789909757885, "learning_rate": 1.4817432370766656e-07, "logits/chosen": -1.7111234664916992, "logits/rejected": -1.62754225730896, "logps/chosen": -72.77391052246094, "logps/rejected": -117.9988021850586, "loss": 0.4219, "rewards/accuracies": 1.0, "rewards/chosen": 2.0136775970458984, "rewards/margins": 1.9716641902923584, "rewards/rejected": 0.042013369500637054, "step": 900 }, { "epoch": 0.33, "eval_logits/chosen": -2.572319507598877, "eval_logits/rejected": -2.3702971935272217, "eval_logps/chosen": -83.57896423339844, "eval_logps/rejected": -125.93928527832031, "eval_loss": 0.4064531624317169, "eval_rewards/accuracies": 0.9107142686843872, "eval_rewards/chosen": 1.6377297639846802, "eval_rewards/margins": 1.694512963294983, "eval_rewards/rejected": -0.056782953441143036, "eval_runtime": 71.9888, "eval_samples_per_second": 12.363, "eval_steps_per_second": 0.194, "step": 900 }, { "epoch": 0.33, "grad_norm": 34.63562171188158, "learning_rate": 1.4806865928401402e-07, "logits/chosen": -2.034156084060669, "logits/rejected": -1.3545467853546143, "logps/chosen": -63.69415283203125, "logps/rejected": -122.93882751464844, "loss": 0.3752, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.400649309158325, "rewards/margins": 2.036404609680176, "rewards/rejected": 0.36424484848976135, "step": 910 }, { "epoch": 0.33, "grad_norm": 51.406692521659814, "learning_rate": 1.4796006233879314e-07, "logits/chosen": -1.6481889486312866, "logits/rejected": -1.8775209188461304, "logps/chosen": -105.07774353027344, "logps/rejected": -128.53115844726562, "loss": 0.4173, "rewards/accuracies": 0.75, "rewards/chosen": 1.0885381698608398, "rewards/margins": 0.976910412311554, "rewards/rejected": 0.11162771284580231, "step": 920 }, { "epoch": 0.34, "grad_norm": 36.79775770964529, "learning_rate": 1.4784853723040993e-07, "logits/chosen": -1.9353240728378296, "logits/rejected": -1.7171472311019897, "logps/chosen": -96.37171173095703, "logps/rejected": -142.31790161132812, "loss": 0.4281, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0895094871520996, "rewards/margins": 1.8335977792739868, "rewards/rejected": -0.7440882921218872, "step": 930 }, { "epoch": 0.34, "grad_norm": 41.812337440587434, "learning_rate": 1.4773408843478865e-07, "logits/chosen": -2.2208828926086426, "logits/rejected": -1.9805580377578735, "logps/chosen": -97.20782470703125, "logps/rejected": -128.40142822265625, "loss": 0.3873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3034839630126953, "rewards/margins": 1.4772690534591675, "rewards/rejected": -0.17378509044647217, "step": 940 }, { "epoch": 0.34, "grad_norm": 42.4893277983641, "learning_rate": 1.4761672054519223e-07, "logits/chosen": -2.1439785957336426, "logits/rejected": -2.007866144180298, "logps/chosen": -98.85569763183594, "logps/rejected": -139.9319305419922, "loss": 0.3826, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.8108327388763428, "rewards/margins": 1.2574350833892822, "rewards/rejected": 0.5533978343009949, "step": 950 }, { "epoch": 0.35, "grad_norm": 33.46468899519379, "learning_rate": 1.4749643827203783e-07, "logits/chosen": -1.9708385467529297, "logits/rejected": -1.5546438694000244, "logps/chosen": -93.60973358154297, "logps/rejected": -142.23947143554688, "loss": 0.3418, "rewards/accuracies": 0.75, "rewards/chosen": 1.3536150455474854, "rewards/margins": 1.6791467666625977, "rewards/rejected": -0.32553163170814514, "step": 960 }, { "epoch": 0.35, "grad_norm": 37.27542798597497, "learning_rate": 1.4737324644270786e-07, "logits/chosen": -2.059234142303467, "logits/rejected": -2.1984705924987793, "logps/chosen": -100.48094177246094, "logps/rejected": -111.93769836425781, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": 1.6639249324798584, "rewards/margins": 1.1406528949737549, "rewards/rejected": 0.5232721567153931, "step": 970 }, { "epoch": 0.36, "grad_norm": 41.082132677027026, "learning_rate": 1.4724715000135616e-07, "logits/chosen": -1.8186811208724976, "logits/rejected": -1.4485307931900024, "logps/chosen": -99.40641784667969, "logps/rejected": -124.72676086425781, "loss": 0.4518, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.4649403095245361, "rewards/margins": 1.2863953113555908, "rewards/rejected": 0.1785450279712677, "step": 980 }, { "epoch": 0.36, "grad_norm": 45.98733730441951, "learning_rate": 1.4711815400870976e-07, "logits/chosen": -1.4492137432098389, "logits/rejected": -1.3497685194015503, "logps/chosen": -80.22025299072266, "logps/rejected": -121.85737609863281, "loss": 0.3912, "rewards/accuracies": 0.75, "rewards/chosen": 1.7067760229110718, "rewards/margins": 2.1301562786102295, "rewards/rejected": -0.42338013648986816, "step": 990 }, { "epoch": 0.36, "grad_norm": 38.58236345723491, "learning_rate": 1.4698626364186557e-07, "logits/chosen": -2.125136137008667, "logits/rejected": -1.753357172012329, "logps/chosen": -95.4744644165039, "logps/rejected": -130.9934539794922, "loss": 0.4453, "rewards/accuracies": 0.75, "rewards/chosen": 1.6447858810424805, "rewards/margins": 1.4352115392684937, "rewards/rejected": 0.2095741331577301, "step": 1000 }, { "epoch": 0.36, "eval_logits/chosen": -2.6885669231414795, "eval_logits/rejected": -2.4480230808258057, "eval_logps/chosen": -84.5037841796875, "eval_logps/rejected": -127.53032684326172, "eval_loss": 0.38953348994255066, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": 1.545248031616211, "eval_rewards/margins": 1.7611348628997803, "eval_rewards/rejected": -0.2158866971731186, "eval_runtime": 71.9853, "eval_samples_per_second": 12.364, "eval_steps_per_second": 0.194, "step": 1000 }, { "epoch": 0.37, "grad_norm": 37.010975459317685, "learning_rate": 1.4685148419408265e-07, "logits/chosen": -2.1752285957336426, "logits/rejected": -1.78762686252594, "logps/chosen": -85.75431060791016, "logps/rejected": -120.37774658203125, "loss": 0.3768, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1561453342437744, "rewards/margins": 1.0167858600616455, "rewards/rejected": 0.1393592357635498, "step": 1010 }, { "epoch": 0.37, "grad_norm": 35.576239317002795, "learning_rate": 1.4671382107456988e-07, "logits/chosen": -1.7128040790557861, "logits/rejected": -1.5929228067398071, "logps/chosen": -87.97055053710938, "logps/rejected": -114.09513092041016, "loss": 0.363, "rewards/accuracies": 0.75, "rewards/chosen": 1.064514398574829, "rewards/margins": 0.6555222272872925, "rewards/rejected": 0.408992201089859, "step": 1020 }, { "epoch": 0.37, "grad_norm": 48.8875893207821, "learning_rate": 1.465732798082687e-07, "logits/chosen": -1.8439133167266846, "logits/rejected": -1.883114218711853, "logps/chosen": -83.29924011230469, "logps/rejected": -107.3209228515625, "loss": 0.3854, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5590392351150513, "rewards/margins": 1.844124436378479, "rewards/rejected": -0.2850852608680725, "step": 1030 }, { "epoch": 0.38, "grad_norm": 37.219215466416436, "learning_rate": 1.4642986603563156e-07, "logits/chosen": -2.579458713531494, "logits/rejected": -2.0807945728302, "logps/chosen": -63.25598907470703, "logps/rejected": -125.60914611816406, "loss": 0.39, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9264271259307861, "rewards/margins": 1.684195876121521, "rewards/rejected": 0.2422313243150711, "step": 1040 }, { "epoch": 0.38, "grad_norm": 34.650758595758035, "learning_rate": 1.4628358551239537e-07, "logits/chosen": -2.4567530155181885, "logits/rejected": -1.7575896978378296, "logps/chosen": -67.78668975830078, "logps/rejected": -145.57591247558594, "loss": 0.3579, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.0559427738189697, "rewards/margins": 1.9872772693634033, "rewards/rejected": 0.06866538524627686, "step": 1050 }, { "epoch": 0.38, "grad_norm": 28.530221009217506, "learning_rate": 1.461344441093506e-07, "logits/chosen": -1.84628164768219, "logits/rejected": -1.4732797145843506, "logps/chosen": -92.16481018066406, "logps/rejected": -120.20820617675781, "loss": 0.3983, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2543227672576904, "rewards/margins": 0.7427952885627747, "rewards/rejected": 0.5115275979042053, "step": 1060 }, { "epoch": 0.39, "grad_norm": 39.23433854659703, "learning_rate": 1.4598244781210573e-07, "logits/chosen": -2.2004318237304688, "logits/rejected": -1.6184364557266235, "logps/chosen": -83.8439712524414, "logps/rejected": -131.82962036132812, "loss": 0.3871, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4584438800811768, "rewards/margins": 1.8565394878387451, "rewards/rejected": -0.39809566736221313, "step": 1070 }, { "epoch": 0.39, "grad_norm": 35.45066510618733, "learning_rate": 1.4582760272084676e-07, "logits/chosen": -1.8293546438217163, "logits/rejected": -1.3871347904205322, "logps/chosen": -76.8940658569336, "logps/rejected": -131.3907470703125, "loss": 0.3741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.866342306137085, "rewards/margins": 1.9369169473648071, "rewards/rejected": -0.07057473063468933, "step": 1080 }, { "epoch": 0.4, "grad_norm": 37.751293634401364, "learning_rate": 1.4566991505009272e-07, "logits/chosen": -2.4931588172912598, "logits/rejected": -2.37707781791687, "logps/chosen": -91.82124328613281, "logps/rejected": -113.77565002441406, "loss": 0.3502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.9339624643325806, "rewards/margins": 0.9942086338996887, "rewards/rejected": 0.9397537112236023, "step": 1090 }, { "epoch": 0.4, "grad_norm": 68.79835073894007, "learning_rate": 1.4550939112844606e-07, "logits/chosen": -2.0712924003601074, "logits/rejected": -1.6202366352081299, "logps/chosen": -62.410888671875, "logps/rejected": -119.64176940917969, "loss": 0.372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7587225437164307, "rewards/margins": 2.3271777629852295, "rewards/rejected": -0.5684553980827332, "step": 1100 }, { "epoch": 0.4, "eval_logits/chosen": -2.6049461364746094, "eval_logits/rejected": -2.3804662227630615, "eval_logps/chosen": -83.14766693115234, "eval_logps/rejected": -129.16748046875, "eval_loss": 0.373942106962204, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.6808584928512573, "eval_rewards/margins": 2.0604610443115234, "eval_rewards/rejected": -0.37960249185562134, "eval_runtime": 71.9703, "eval_samples_per_second": 12.366, "eval_steps_per_second": 0.195, "step": 1100 }, { "epoch": 0.4, "grad_norm": 49.198262614504216, "learning_rate": 1.453460373983387e-07, "logits/chosen": -2.2133469581604004, "logits/rejected": -1.7755333185195923, "logps/chosen": -86.2526626586914, "logps/rejected": -125.26475524902344, "loss": 0.4013, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4716092348098755, "rewards/margins": 2.0779099464416504, "rewards/rejected": -0.606300950050354, "step": 1110 }, { "epoch": 0.41, "grad_norm": 26.816352398316827, "learning_rate": 1.451798604157734e-07, "logits/chosen": -2.0376839637756348, "logits/rejected": -1.5721218585968018, "logps/chosen": -62.28696823120117, "logps/rejected": -122.70638275146484, "loss": 0.3371, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.9331592321395874, "rewards/margins": 1.9306316375732422, "rewards/rejected": 0.0025276304222643375, "step": 1120 }, { "epoch": 0.41, "grad_norm": 30.305040452259234, "learning_rate": 1.4501086685006087e-07, "logits/chosen": -2.2330105304718018, "logits/rejected": -1.6862401962280273, "logps/chosen": -61.83301544189453, "logps/rejected": -138.49703979492188, "loss": 0.3442, "rewards/accuracies": 1.0, "rewards/chosen": 1.9937431812286377, "rewards/margins": 2.239272356033325, "rewards/rejected": -0.2455294132232666, "step": 1130 }, { "epoch": 0.41, "grad_norm": 37.935713309108, "learning_rate": 1.4483906348355185e-07, "logits/chosen": -1.9341682195663452, "logits/rejected": -1.8443982601165771, "logps/chosen": -88.57161712646484, "logps/rejected": -138.91732788085938, "loss": 0.3578, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5548217296600342, "rewards/margins": 1.6803414821624756, "rewards/rejected": -0.1255197376012802, "step": 1140 }, { "epoch": 0.42, "grad_norm": 41.30231239743997, "learning_rate": 1.4466445721136494e-07, "logits/chosen": -1.9577938318252563, "logits/rejected": -1.6811765432357788, "logps/chosen": -83.18925476074219, "logps/rejected": -151.69259643554688, "loss": 0.3935, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.5005887746810913, "rewards/margins": 1.946763038635254, "rewards/rejected": -0.4461742043495178, "step": 1150 }, { "epoch": 0.42, "grad_norm": 47.452759957888006, "learning_rate": 1.444870550411101e-07, "logits/chosen": -2.1157777309417725, "logits/rejected": -1.8774683475494385, "logps/chosen": -72.8653793334961, "logps/rejected": -136.59445190429688, "loss": 0.3984, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7430633306503296, "rewards/margins": 1.85836660861969, "rewards/rejected": -0.11530301719903946, "step": 1160 }, { "epoch": 0.42, "grad_norm": 27.935593390748892, "learning_rate": 1.443068640926072e-07, "logits/chosen": -1.8945796489715576, "logits/rejected": -2.1599068641662598, "logps/chosen": -57.86497116088867, "logps/rejected": -86.31175231933594, "loss": 0.3544, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.740037202835083, "rewards/margins": 1.9110946655273438, "rewards/rejected": -0.171057790517807, "step": 1170 }, { "epoch": 0.43, "grad_norm": 40.028733125905276, "learning_rate": 1.4412389159760025e-07, "logits/chosen": -2.241224527359009, "logits/rejected": -1.6412010192871094, "logps/chosen": -76.12211608886719, "logps/rejected": -164.74478149414062, "loss": 0.3165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4326903820037842, "rewards/margins": 2.377753496170044, "rewards/rejected": -0.9450632333755493, "step": 1180 }, { "epoch": 0.43, "grad_norm": 47.0201653078679, "learning_rate": 1.439381448994673e-07, "logits/chosen": -2.094806432723999, "logits/rejected": -1.7619339227676392, "logps/chosen": -80.14140319824219, "logps/rejected": -172.28895568847656, "loss": 0.3704, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.4097177982330322, "rewards/margins": 2.00710129737854, "rewards/rejected": -0.5973835587501526, "step": 1190 }, { "epoch": 0.44, "grad_norm": 45.538770655344166, "learning_rate": 1.4374963145292563e-07, "logits/chosen": -2.0040104389190674, "logits/rejected": -1.7236404418945312, "logps/chosen": -78.05459594726562, "logps/rejected": -118.63360595703125, "loss": 0.3191, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.8405094146728516, "rewards/margins": 1.999542236328125, "rewards/rejected": -0.15903277695178986, "step": 1200 }, { "epoch": 0.44, "eval_logits/chosen": -2.6045000553131104, "eval_logits/rejected": -2.3742053508758545, "eval_logps/chosen": -86.77649688720703, "eval_logps/rejected": -132.74559020996094, "eval_loss": 0.3634836673736572, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 1.3179771900177002, "eval_rewards/margins": 2.0553901195526123, "eval_rewards/rejected": -0.7374131083488464, "eval_runtime": 71.874, "eval_samples_per_second": 12.383, "eval_steps_per_second": 0.195, "step": 1200 }, { "epoch": 0.44, "grad_norm": 32.36206179132804, "learning_rate": 1.4355835882373265e-07, "logits/chosen": -1.9400993585586548, "logits/rejected": -1.7014987468719482, "logps/chosen": -102.5603256225586, "logps/rejected": -158.36917114257812, "loss": 0.3049, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8652098774909973, "rewards/margins": 1.5464653968811035, "rewards/rejected": -0.6812552809715271, "step": 1210 }, { "epoch": 0.44, "grad_norm": 42.82494337547035, "learning_rate": 1.433643346883822e-07, "logits/chosen": -2.0089356899261475, "logits/rejected": -1.5017987489700317, "logps/chosen": -91.1595687866211, "logps/rejected": -135.00486755371094, "loss": 0.3278, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4155638217926025, "rewards/margins": 1.6915414333343506, "rewards/rejected": -0.27597787976264954, "step": 1220 }, { "epoch": 0.45, "grad_norm": 56.19237380245196, "learning_rate": 1.4316756683379635e-07, "logits/chosen": -1.520774483680725, "logits/rejected": -1.4852197170257568, "logps/chosen": -90.1426010131836, "logps/rejected": -124.76286315917969, "loss": 0.3839, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9303890466690063, "rewards/margins": 2.0999183654785156, "rewards/rejected": -0.16952911019325256, "step": 1230 }, { "epoch": 0.45, "grad_norm": 51.35397866660894, "learning_rate": 1.4296806315701312e-07, "logits/chosen": -2.2964136600494385, "logits/rejected": -1.7400308847427368, "logps/chosen": -50.99583053588867, "logps/rejected": -114.6817626953125, "loss": 0.3506, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.290008783340454, "rewards/margins": 2.5690829753875732, "rewards/rejected": -0.27907437086105347, "step": 1240 }, { "epoch": 0.45, "grad_norm": 36.79699061086273, "learning_rate": 1.427658316648694e-07, "logits/chosen": -2.083348512649536, "logits/rejected": -1.4933042526245117, "logps/chosen": -73.86151885986328, "logps/rejected": -149.9459991455078, "loss": 0.3594, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.754785180091858, "rewards/margins": 2.0723814964294434, "rewards/rejected": -0.3175961375236511, "step": 1250 }, { "epoch": 0.46, "grad_norm": 61.77575626305043, "learning_rate": 1.4256088047367958e-07, "logits/chosen": -2.3271241188049316, "logits/rejected": -2.1036393642425537, "logps/chosen": -58.489654541015625, "logps/rejected": -135.50341796875, "loss": 0.3875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.7433359622955322, "rewards/margins": 1.8952022790908813, "rewards/rejected": -0.1518661081790924, "step": 1260 }, { "epoch": 0.46, "grad_norm": 45.208373613406316, "learning_rate": 1.423532178089099e-07, "logits/chosen": -2.00785756111145, "logits/rejected": -2.118194341659546, "logps/chosen": -87.7576904296875, "logps/rejected": -121.8628158569336, "loss": 0.2697, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2886258363723755, "rewards/margins": 2.014240026473999, "rewards/rejected": -0.7256141901016235, "step": 1270 }, { "epoch": 0.46, "grad_norm": 35.53690989690113, "learning_rate": 1.421428520048482e-07, "logits/chosen": -1.6762386560440063, "logits/rejected": -1.9535388946533203, "logps/chosen": -91.85359191894531, "logps/rejected": -145.9041290283203, "loss": 0.3821, "rewards/accuracies": 0.75, "rewards/chosen": 1.8587366342544556, "rewards/margins": 1.5649387836456299, "rewards/rejected": 0.2937980592250824, "step": 1280 }, { "epoch": 0.47, "grad_norm": 33.054145363402554, "learning_rate": 1.419297915042697e-07, "logits/chosen": -2.3081910610198975, "logits/rejected": -1.6348092555999756, "logps/chosen": -73.78157806396484, "logps/rejected": -153.98765563964844, "loss": 0.3255, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.926365852355957, "rewards/margins": 3.0679659843444824, "rewards/rejected": -1.1416000127792358, "step": 1290 }, { "epoch": 0.47, "grad_norm": 35.78921822753497, "learning_rate": 1.4171404485809776e-07, "logits/chosen": -1.6090662479400635, "logits/rejected": -1.2726771831512451, "logps/chosen": -80.95599365234375, "logps/rejected": -132.03736877441406, "loss": 0.3319, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9143034219741821, "rewards/margins": 1.9614051580429077, "rewards/rejected": -0.04710172861814499, "step": 1300 }, { "epoch": 0.47, "eval_logits/chosen": -2.6384994983673096, "eval_logits/rejected": -2.401343822479248, "eval_logps/chosen": -80.92295837402344, "eval_logps/rejected": -127.90816497802734, "eval_loss": 0.35564500093460083, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 1.9033304452896118, "eval_rewards/margins": 2.1570017337799072, "eval_rewards/rejected": -0.25367113947868347, "eval_runtime": 72.0772, "eval_samples_per_second": 12.348, "eval_steps_per_second": 0.194, "step": 1300 }, { "epoch": 0.48, "grad_norm": 41.95871729273735, "learning_rate": 1.4149562072506109e-07, "logits/chosen": -1.6684595346450806, "logits/rejected": -1.6990349292755127, "logps/chosen": -106.24676513671875, "logps/rejected": -139.57778930664062, "loss": 0.3228, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.357491135597229, "rewards/margins": 1.8578159809112549, "rewards/rejected": -0.5003247261047363, "step": 1310 }, { "epoch": 0.48, "grad_norm": 29.34145064992561, "learning_rate": 1.4127452787134597e-07, "logits/chosen": -2.3935904502868652, "logits/rejected": -1.9407179355621338, "logps/chosen": -75.06128692626953, "logps/rejected": -170.85089111328125, "loss": 0.3365, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.364776372909546, "rewards/margins": 1.6955333948135376, "rewards/rejected": -0.33075690269470215, "step": 1320 }, { "epoch": 0.48, "grad_norm": 54.390170546748955, "learning_rate": 1.4105077517024458e-07, "logits/chosen": -2.1387906074523926, "logits/rejected": -1.6588388681411743, "logps/chosen": -64.23258972167969, "logps/rejected": -124.39351654052734, "loss": 0.3544, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8940436840057373, "rewards/margins": 2.4280219078063965, "rewards/rejected": -0.5339787602424622, "step": 1330 }, { "epoch": 0.49, "grad_norm": 41.328588380198035, "learning_rate": 1.4082437160179884e-07, "logits/chosen": -1.7820161581039429, "logits/rejected": -1.5780375003814697, "logps/chosen": -76.89183044433594, "logps/rejected": -122.71630859375, "loss": 0.3367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.836066484451294, "rewards/margins": 1.8271701335906982, "rewards/rejected": 0.008896279148757458, "step": 1340 }, { "epoch": 0.49, "grad_norm": 36.692279705097874, "learning_rate": 1.4059532625243992e-07, "logits/chosen": -1.921148657798767, "logits/rejected": -1.6129744052886963, "logps/chosen": -66.90629577636719, "logps/rejected": -127.26859283447266, "loss": 0.37, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4975523948669434, "rewards/margins": 1.937352180480957, "rewards/rejected": -0.43979987502098083, "step": 1350 }, { "epoch": 0.49, "grad_norm": 30.243381015544752, "learning_rate": 1.403636483146238e-07, "logits/chosen": -2.2570159435272217, "logits/rejected": -1.9846521615982056, "logps/chosen": -79.24600982666016, "logps/rejected": -115.38359069824219, "loss": 0.3979, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5120487213134766, "rewards/margins": 1.4373055696487427, "rewards/rejected": 0.07474327087402344, "step": 1360 }, { "epoch": 0.5, "grad_norm": 44.64162084878288, "learning_rate": 1.40129347086462e-07, "logits/chosen": -2.0078492164611816, "logits/rejected": -1.9076550006866455, "logps/chosen": -102.33550262451172, "logps/rejected": -123.74501037597656, "loss": 0.3483, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0897409915924072, "rewards/margins": 1.6122095584869385, "rewards/rejected": -0.5224683880805969, "step": 1370 }, { "epoch": 0.5, "grad_norm": 39.12577532028238, "learning_rate": 1.3989243197134876e-07, "logits/chosen": -2.399981737136841, "logits/rejected": -1.599557876586914, "logps/chosen": -82.2318115234375, "logps/rejected": -135.64981079101562, "loss": 0.3279, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5288795232772827, "rewards/margins": 1.9530413150787354, "rewards/rejected": -0.42416173219680786, "step": 1380 }, { "epoch": 0.5, "grad_norm": 36.85635152662901, "learning_rate": 1.396529124775834e-07, "logits/chosen": -2.5128228664398193, "logits/rejected": -2.076643705368042, "logps/chosen": -105.89326477050781, "logps/rejected": -146.3048553466797, "loss": 0.3094, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3551980257034302, "rewards/margins": 1.9240223169326782, "rewards/rejected": -0.5688244700431824, "step": 1390 }, { "epoch": 0.51, "grad_norm": 46.84425675735352, "learning_rate": 1.394107982179888e-07, "logits/chosen": -1.8977441787719727, "logits/rejected": -2.266324996948242, "logps/chosen": -78.88629913330078, "logps/rejected": -171.7329559326172, "loss": 0.3469, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.2071471214294434, "rewards/margins": 1.8551435470581055, "rewards/rejected": -0.6479963064193726, "step": 1400 }, { "epoch": 0.51, "eval_logits/chosen": -2.633074998855591, "eval_logits/rejected": -2.3969056606292725, "eval_logps/chosen": -84.95893096923828, "eval_logps/rejected": -132.71334838867188, "eval_loss": 0.3462066352367401, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.4997329711914062, "eval_rewards/margins": 2.2339236736297607, "eval_rewards/rejected": -0.7341909408569336, "eval_runtime": 71.9731, "eval_samples_per_second": 12.366, "eval_steps_per_second": 0.195, "step": 1400 }, { "epoch": 0.51, "grad_norm": 28.479823112134607, "learning_rate": 1.3916609890952566e-07, "logits/chosen": -1.909106969833374, "logits/rejected": -1.6904758214950562, "logps/chosen": -82.46297454833984, "logps/rejected": -124.34074401855469, "loss": 0.319, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.6145960092544556, "rewards/margins": 2.080094575881958, "rewards/rejected": -0.4654986262321472, "step": 1410 }, { "epoch": 0.52, "grad_norm": 39.355457607430694, "learning_rate": 1.3891882437290242e-07, "logits/chosen": -1.5750586986541748, "logits/rejected": -1.6994893550872803, "logps/chosen": -82.91178894042969, "logps/rejected": -111.0701675415039, "loss": 0.4042, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.5502707958221436, "rewards/margins": 1.4318604469299316, "rewards/rejected": 0.1184103712439537, "step": 1420 }, { "epoch": 0.52, "grad_norm": 43.081802728687556, "learning_rate": 1.386689845321812e-07, "logits/chosen": -2.426974058151245, "logits/rejected": -1.7977014780044556, "logps/chosen": -79.90650939941406, "logps/rejected": -139.31407165527344, "loss": 0.3121, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8190940618515015, "rewards/margins": 2.4520251750946045, "rewards/rejected": -0.6329307556152344, "step": 1430 }, { "epoch": 0.52, "grad_norm": 38.9495726615902, "learning_rate": 1.384165894143794e-07, "logits/chosen": -1.8546695709228516, "logits/rejected": -1.7970068454742432, "logps/chosen": -105.05964660644531, "logps/rejected": -128.0200653076172, "loss": 0.2894, "rewards/accuracies": 0.75, "rewards/chosen": 1.3824260234832764, "rewards/margins": 1.8351023197174072, "rewards/rejected": -0.45267629623413086, "step": 1440 }, { "epoch": 0.53, "grad_norm": 36.527576333516, "learning_rate": 1.381616491490674e-07, "logits/chosen": -2.3374876976013184, "logits/rejected": -2.048158884048462, "logps/chosen": -73.65496063232422, "logps/rejected": -159.6743927001953, "loss": 0.3077, "rewards/accuracies": 0.75, "rewards/chosen": 1.549155592918396, "rewards/margins": 2.265070915222168, "rewards/rejected": -0.7159156799316406, "step": 1450 }, { "epoch": 0.53, "grad_norm": 37.33453867519459, "learning_rate": 1.3790417396796205e-07, "logits/chosen": -1.6809660196304321, "logits/rejected": -1.6903836727142334, "logps/chosen": -92.21663665771484, "logps/rejected": -123.54522705078125, "loss": 0.3688, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5740840435028076, "rewards/margins": 2.4830660820007324, "rewards/rejected": -0.9089819192886353, "step": 1460 }, { "epoch": 0.53, "grad_norm": 30.137594905849312, "learning_rate": 1.376441742045158e-07, "logits/chosen": -1.7785037755966187, "logits/rejected": -1.5067006349563599, "logps/chosen": -88.78409576416016, "logps/rejected": -144.81686401367188, "loss": 0.2812, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1298682689666748, "rewards/margins": 2.1699564456939697, "rewards/rejected": -1.0400878190994263, "step": 1470 }, { "epoch": 0.54, "grad_norm": 45.12639127949661, "learning_rate": 1.3738166029350223e-07, "logits/chosen": -2.215520143508911, "logits/rejected": -1.8298145532608032, "logps/chosen": -78.92552185058594, "logps/rejected": -147.58212280273438, "loss": 0.361, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8534799814224243, "rewards/margins": 2.4630656242370605, "rewards/rejected": -0.6095854640007019, "step": 1480 }, { "epoch": 0.54, "grad_norm": 30.18520757137706, "learning_rate": 1.3711664277059714e-07, "logits/chosen": -1.3158643245697021, "logits/rejected": -1.4519625902175903, "logps/chosen": -84.15853118896484, "logps/rejected": -140.84286499023438, "loss": 0.3058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9669021368026733, "rewards/margins": 2.4468729496002197, "rewards/rejected": -0.4799710810184479, "step": 1490 }, { "epoch": 0.54, "grad_norm": 39.46537040194847, "learning_rate": 1.3684913227195577e-07, "logits/chosen": -2.1749026775360107, "logits/rejected": -1.9646120071411133, "logps/chosen": -85.54203796386719, "logps/rejected": -132.18484497070312, "loss": 0.2976, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.073870062828064, "rewards/margins": 1.4576705694198608, "rewards/rejected": -0.3838004171848297, "step": 1500 }, { "epoch": 0.54, "eval_logits/chosen": -2.6264164447784424, "eval_logits/rejected": -2.3935818672180176, "eval_logps/chosen": -86.55225372314453, "eval_logps/rejected": -134.98130798339844, "eval_loss": 0.3363237977027893, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.3404006958007812, "eval_rewards/margins": 2.3013861179351807, "eval_rewards/rejected": -0.960985541343689, "eval_runtime": 71.9646, "eval_samples_per_second": 12.367, "eval_steps_per_second": 0.195, "step": 1500 }, { "epoch": 0.55, "grad_norm": 46.836051305920115, "learning_rate": 1.365791395337859e-07, "logits/chosen": -1.5352973937988281, "logits/rejected": -1.6385904550552368, "logps/chosen": -79.53118896484375, "logps/rejected": -113.24952697753906, "loss": 0.3002, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4074381589889526, "rewards/margins": 2.091799259185791, "rewards/rejected": -0.6843612790107727, "step": 1510 }, { "epoch": 0.55, "grad_norm": 31.950241265489773, "learning_rate": 1.3630667539191687e-07, "logits/chosen": -1.526102066040039, "logits/rejected": -1.5050265789031982, "logps/chosen": -98.58650207519531, "logps/rejected": -133.04754638671875, "loss": 0.3186, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1431519985198975, "rewards/margins": 1.964644193649292, "rewards/rejected": -0.8214923739433289, "step": 1520 }, { "epoch": 0.56, "grad_norm": 40.788510789631786, "learning_rate": 1.3603175078136497e-07, "logits/chosen": -1.8123880624771118, "logits/rejected": -1.3758784532546997, "logps/chosen": -68.65928649902344, "logps/rejected": -116.91877746582031, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": 1.3894504308700562, "rewards/margins": 2.661546468734741, "rewards/rejected": -1.2720959186553955, "step": 1530 }, { "epoch": 0.56, "grad_norm": 46.155078059787826, "learning_rate": 1.3575437673589428e-07, "logits/chosen": -1.7942472696304321, "logits/rejected": -1.867285132408142, "logps/chosen": -94.3653793334961, "logps/rejected": -112.46723937988281, "loss": 0.3213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7745468020439148, "rewards/margins": 1.4602556228637695, "rewards/rejected": -0.6857088804244995, "step": 1540 }, { "epoch": 0.56, "grad_norm": 39.375049143103645, "learning_rate": 1.3547456438757397e-07, "logits/chosen": -2.056170701980591, "logits/rejected": -1.2939419746398926, "logps/chosen": -89.32717895507812, "logps/rejected": -133.4949493408203, "loss": 0.3142, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.4916505813598633, "rewards/margins": 1.9974079132080078, "rewards/rejected": -0.5057573914527893, "step": 1550 }, { "epoch": 0.57, "grad_norm": 32.64828907806418, "learning_rate": 1.3519232496633152e-07, "logits/chosen": -2.2024412155151367, "logits/rejected": -1.6808528900146484, "logps/chosen": -64.60079193115234, "logps/rejected": -127.1625747680664, "loss": 0.3165, "rewards/accuracies": 1.0, "rewards/chosen": 1.7459484338760376, "rewards/margins": 2.624204158782959, "rewards/rejected": -0.8782557249069214, "step": 1560 }, { "epoch": 0.57, "grad_norm": 37.74052323913691, "learning_rate": 1.3490766979950211e-07, "logits/chosen": -2.4449658393859863, "logits/rejected": -2.1780996322631836, "logps/chosen": -63.970916748046875, "logps/rejected": -116.03575134277344, "loss": 0.285, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.5420334339141846, "rewards/margins": 2.1397767066955566, "rewards/rejected": -0.5977433919906616, "step": 1570 }, { "epoch": 0.57, "grad_norm": 38.57331331754987, "learning_rate": 1.3462061031137382e-07, "logits/chosen": -1.8334707021713257, "logits/rejected": -1.6402419805526733, "logps/chosen": -81.07013702392578, "logps/rejected": -196.51010131835938, "loss": 0.3205, "rewards/accuracies": 1.0, "rewards/chosen": 1.801913857460022, "rewards/margins": 3.0623841285705566, "rewards/rejected": -1.2604701519012451, "step": 1580 }, { "epoch": 0.58, "grad_norm": 35.786587394552, "learning_rate": 1.3433115802272929e-07, "logits/chosen": -1.9113935232162476, "logits/rejected": -1.5720059871673584, "logps/chosen": -72.0071792602539, "logps/rejected": -155.9192352294922, "loss": 0.2904, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8154712915420532, "rewards/margins": 2.6344165802001953, "rewards/rejected": -0.8189449310302734, "step": 1590 }, { "epoch": 0.58, "grad_norm": 38.20450363937764, "learning_rate": 1.3403932455038328e-07, "logits/chosen": -2.0501441955566406, "logits/rejected": -1.8184171915054321, "logps/chosen": -86.4954833984375, "logps/rejected": -138.66053771972656, "loss": 0.2839, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.814734697341919, "rewards/margins": 2.979191303253174, "rewards/rejected": -1.1644567251205444, "step": 1600 }, { "epoch": 0.58, "eval_logits/chosen": -2.5862371921539307, "eval_logits/rejected": -2.3639228343963623, "eval_logps/chosen": -84.44693756103516, "eval_logps/rejected": -134.8124237060547, "eval_loss": 0.3325304090976715, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 1.5509331226348877, "eval_rewards/margins": 2.4950315952301025, "eval_rewards/rejected": -0.9440980553627014, "eval_runtime": 71.9558, "eval_samples_per_second": 12.369, "eval_steps_per_second": 0.195, "step": 1600 }, { "epoch": 0.58, "grad_norm": 35.827979868952966, "learning_rate": 1.3374512160671644e-07, "logits/chosen": -1.5930265188217163, "logits/rejected": -1.6432759761810303, "logps/chosen": -101.54774475097656, "logps/rejected": -133.1485595703125, "loss": 0.3038, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5359328389167786, "rewards/margins": 1.9368603229522705, "rewards/rejected": -1.4009274244308472, "step": 1610 }, { "epoch": 0.59, "grad_norm": 30.92177760123355, "learning_rate": 1.3344856099920526e-07, "logits/chosen": -1.9082187414169312, "logits/rejected": -1.8227027654647827, "logps/chosen": -100.11506652832031, "logps/rejected": -131.5067596435547, "loss": 0.3262, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2241116762161255, "rewards/margins": 1.7369029521942139, "rewards/rejected": -0.5127913355827332, "step": 1620 }, { "epoch": 0.59, "grad_norm": 37.15625275272501, "learning_rate": 1.3314965462994826e-07, "logits/chosen": -1.8317235708236694, "logits/rejected": -1.373651146888733, "logps/chosen": -88.5484848022461, "logps/rejected": -131.82040405273438, "loss": 0.2875, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9509607553482056, "rewards/margins": 2.071371555328369, "rewards/rejected": -1.120410680770874, "step": 1630 }, { "epoch": 0.6, "grad_norm": 45.37302624095175, "learning_rate": 1.3284841449518813e-07, "logits/chosen": -1.686141014099121, "logits/rejected": -1.2155983448028564, "logps/chosen": -99.9353256225586, "logps/rejected": -144.32980346679688, "loss": 0.3014, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9641411900520325, "rewards/margins": 1.8706451654434204, "rewards/rejected": -0.9065039753913879, "step": 1640 }, { "epoch": 0.6, "grad_norm": 55.85191593624814, "learning_rate": 1.3254485268483055e-07, "logits/chosen": -1.7119086980819702, "logits/rejected": -1.548729419708252, "logps/chosen": -92.55009460449219, "logps/rejected": -211.9893341064453, "loss": 0.3211, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1339528560638428, "rewards/margins": 2.2448153495788574, "rewards/rejected": -1.1108627319335938, "step": 1650 }, { "epoch": 0.6, "grad_norm": 36.20383771458562, "learning_rate": 1.3223898138195864e-07, "logits/chosen": -2.340451240539551, "logits/rejected": -1.6691877841949463, "logps/chosen": -57.80481719970703, "logps/rejected": -119.7285385131836, "loss": 0.2937, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.0053391456604004, "rewards/margins": 2.7729382514953613, "rewards/rejected": -0.7675992250442505, "step": 1660 }, { "epoch": 0.61, "grad_norm": 33.03559928672513, "learning_rate": 1.319308128623443e-07, "logits/chosen": -1.8629367351531982, "logits/rejected": -1.5715751647949219, "logps/chosen": -94.56365203857422, "logps/rejected": -130.30677795410156, "loss": 0.2732, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7679950594902039, "rewards/margins": 2.2738566398620605, "rewards/rejected": -1.5058612823486328, "step": 1670 }, { "epoch": 0.61, "grad_norm": 39.357859474732834, "learning_rate": 1.3162035949395548e-07, "logits/chosen": -1.9703582525253296, "logits/rejected": -1.6913745403289795, "logps/chosen": -68.82579040527344, "logps/rejected": -130.0358123779297, "loss": 0.3041, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3682914972305298, "rewards/margins": 2.242696762084961, "rewards/rejected": -0.874405562877655, "step": 1680 }, { "epoch": 0.61, "grad_norm": 44.19717913326496, "learning_rate": 1.3130763373645956e-07, "logits/chosen": -2.4028756618499756, "logits/rejected": -1.5053786039352417, "logps/chosen": -93.85604095458984, "logps/rejected": -234.1709747314453, "loss": 0.3071, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.169268250465393, "rewards/margins": 3.234970808029175, "rewards/rejected": -2.0657026767730713, "step": 1690 }, { "epoch": 0.62, "grad_norm": 30.436503628615085, "learning_rate": 1.309926481407237e-07, "logits/chosen": -2.1171786785125732, "logits/rejected": -1.6411173343658447, "logps/chosen": -81.68711853027344, "logps/rejected": -157.5670166015625, "loss": 0.3095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.531718373298645, "rewards/margins": 2.767521858215332, "rewards/rejected": -1.2358036041259766, "step": 1700 }, { "epoch": 0.62, "eval_logits/chosen": -2.6024363040924072, "eval_logits/rejected": -2.381307601928711, "eval_logps/chosen": -88.0004653930664, "eval_logps/rejected": -138.6060028076172, "eval_loss": 0.3237309455871582, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.1955795288085938, "eval_rewards/margins": 2.519036054611206, "eval_rewards/rejected": -1.3234565258026123, "eval_runtime": 71.9205, "eval_samples_per_second": 12.375, "eval_steps_per_second": 0.195, "step": 1700 }, { "epoch": 0.62, "grad_norm": 36.27177663241429, "learning_rate": 1.3067541534831074e-07, "logits/chosen": -2.4922726154327393, "logits/rejected": -1.9374639987945557, "logps/chosen": -95.45484161376953, "logps/rejected": -176.23080444335938, "loss": 0.3039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.721973180770874, "rewards/margins": 2.6191511154174805, "rewards/rejected": -1.897178053855896, "step": 1710 }, { "epoch": 0.62, "grad_norm": 32.7184977007972, "learning_rate": 1.303559480909721e-07, "logits/chosen": -2.122936487197876, "logits/rejected": -1.8037102222442627, "logps/chosen": -88.16950988769531, "logps/rejected": -144.52603149414062, "loss": 0.2703, "rewards/accuracies": 0.75, "rewards/chosen": 1.2837637662887573, "rewards/margins": 2.390108585357666, "rewards/rejected": -1.1063446998596191, "step": 1720 }, { "epoch": 0.63, "grad_norm": 29.34834638089634, "learning_rate": 1.3003425919013677e-07, "logits/chosen": -1.9752366542816162, "logits/rejected": -1.5035978555679321, "logps/chosen": -87.86656188964844, "logps/rejected": -164.7413330078125, "loss": 0.2606, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7725374698638916, "rewards/margins": 3.28916597366333, "rewards/rejected": -1.5166288614273071, "step": 1730 }, { "epoch": 0.63, "grad_norm": 36.999465061787745, "learning_rate": 1.2971036155639654e-07, "logits/chosen": -1.8440446853637695, "logits/rejected": -1.6909431219100952, "logps/chosen": -92.90309143066406, "logps/rejected": -143.26986694335938, "loss": 0.3133, "rewards/accuracies": 1.0, "rewards/chosen": 1.0839173793792725, "rewards/margins": 2.5392398834228516, "rewards/rejected": -1.455322504043579, "step": 1740 }, { "epoch": 0.64, "grad_norm": 38.30391165242112, "learning_rate": 1.293842681889882e-07, "logits/chosen": -2.0229029655456543, "logits/rejected": -1.4855544567108154, "logps/chosen": -93.77799987792969, "logps/rejected": -204.0419464111328, "loss": 0.3124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2000395059585571, "rewards/margins": 2.4565627574920654, "rewards/rejected": -1.2565232515335083, "step": 1750 }, { "epoch": 0.64, "grad_norm": 41.551439437509096, "learning_rate": 1.290559921752715e-07, "logits/chosen": -2.3583412170410156, "logits/rejected": -2.0657455921173096, "logps/chosen": -85.60011291503906, "logps/rejected": -131.65814208984375, "loss": 0.3016, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3455216884613037, "rewards/margins": 2.5204150676727295, "rewards/rejected": -1.1748934984207153, "step": 1760 }, { "epoch": 0.64, "grad_norm": 30.177779462195463, "learning_rate": 1.287255466902041e-07, "logits/chosen": -1.8713395595550537, "logits/rejected": -1.5737354755401611, "logps/chosen": -73.5456314086914, "logps/rejected": -161.8707275390625, "loss": 0.2684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3379333019256592, "rewards/margins": 3.22587251663208, "rewards/rejected": -1.887939214706421, "step": 1770 }, { "epoch": 0.65, "grad_norm": 42.266893865591655, "learning_rate": 1.2839294499581266e-07, "logits/chosen": -1.8609529733657837, "logits/rejected": -1.9660890102386475, "logps/chosen": -90.25117492675781, "logps/rejected": -139.0089111328125, "loss": 0.2725, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4452346563339233, "rewards/margins": 2.1682021617889404, "rewards/rejected": -0.7229675054550171, "step": 1780 }, { "epoch": 0.65, "grad_norm": 45.909064669287716, "learning_rate": 1.280582004406608e-07, "logits/chosen": -2.2296082973480225, "logits/rejected": -1.7719411849975586, "logps/chosen": -91.33625793457031, "logps/rejected": -132.76153564453125, "loss": 0.3173, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3035770654678345, "rewards/margins": 2.369469165802002, "rewards/rejected": -1.0658921003341675, "step": 1790 }, { "epoch": 0.65, "grad_norm": 38.096350658013705, "learning_rate": 1.2772132645931315e-07, "logits/chosen": -1.8231449127197266, "logits/rejected": -1.4982807636260986, "logps/chosen": -77.42939758300781, "logps/rejected": -142.36465454101562, "loss": 0.2593, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.426496982574463, "rewards/margins": 2.347156047821045, "rewards/rejected": -0.9206587672233582, "step": 1800 }, { "epoch": 0.65, "eval_logits/chosen": -2.613116502761841, "eval_logits/rejected": -2.38425612449646, "eval_logps/chosen": -87.31402587890625, "eval_logps/rejected": -137.7794952392578, "eval_loss": 0.3188331723213196, "eval_rewards/accuracies": 0.8392857313156128, "eval_rewards/chosen": 1.2642244100570679, "eval_rewards/margins": 2.5050292015075684, "eval_rewards/rejected": -1.240804672241211, "eval_runtime": 71.9246, "eval_samples_per_second": 12.374, "eval_steps_per_second": 0.195, "step": 1800 }, { "epoch": 0.66, "grad_norm": 39.50184113749164, "learning_rate": 1.273823365717963e-07, "logits/chosen": -1.7294566631317139, "logits/rejected": -1.341367244720459, "logps/chosen": -74.58464050292969, "logps/rejected": -133.0430450439453, "loss": 0.3023, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.7522623538970947, "rewards/margins": 2.6015543937683105, "rewards/rejected": -0.8492921590805054, "step": 1810 }, { "epoch": 0.66, "grad_norm": 42.89748213165641, "learning_rate": 1.270412443830562e-07, "logits/chosen": -1.636803388595581, "logits/rejected": -1.4411513805389404, "logps/chosen": -90.65261840820312, "logps/rejected": -148.75440979003906, "loss": 0.2737, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4128892421722412, "rewards/margins": 2.5060081481933594, "rewards/rejected": -1.0931187868118286, "step": 1820 }, { "epoch": 0.66, "grad_norm": 49.9218877269305, "learning_rate": 1.2669806358241194e-07, "logits/chosen": -1.672900915145874, "logits/rejected": -1.657602310180664, "logps/chosen": -102.61698150634766, "logps/rejected": -124.9526138305664, "loss": 0.3258, "rewards/accuracies": 0.75, "rewards/chosen": 1.2539775371551514, "rewards/margins": 1.9778436422348022, "rewards/rejected": -0.7238659858703613, "step": 1830 }, { "epoch": 0.67, "grad_norm": 52.08013573210232, "learning_rate": 1.2635280794300674e-07, "logits/chosen": -1.9965959787368774, "logits/rejected": -1.7998332977294922, "logps/chosen": -89.28053283691406, "logps/rejected": -126.0293960571289, "loss": 0.2968, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1432512998580933, "rewards/margins": 1.8423658609390259, "rewards/rejected": -0.6991146802902222, "step": 1840 }, { "epoch": 0.67, "grad_norm": 57.840988304922966, "learning_rate": 1.260054913212547e-07, "logits/chosen": -2.127389907836914, "logits/rejected": -1.7502504587173462, "logps/chosen": -71.455078125, "logps/rejected": -115.15047454833984, "loss": 0.3373, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6597360372543335, "rewards/margins": 2.4347267150878906, "rewards/rejected": -0.7749907970428467, "step": 1850 }, { "epoch": 0.68, "grad_norm": 53.530462845694025, "learning_rate": 1.2565612765628513e-07, "logits/chosen": -1.9101688861846924, "logits/rejected": -1.6512504816055298, "logps/chosen": -74.90973663330078, "logps/rejected": -138.97523498535156, "loss": 0.3027, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8204338550567627, "rewards/margins": 2.453209161758423, "rewards/rejected": -0.6327755451202393, "step": 1860 }, { "epoch": 0.68, "grad_norm": 37.85257138685388, "learning_rate": 1.2530473096938278e-07, "logits/chosen": -1.8988683223724365, "logits/rejected": -1.6503827571868896, "logps/chosen": -79.6770248413086, "logps/rejected": -143.09071350097656, "loss": 0.3155, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.498981237411499, "rewards/margins": 2.404602527618408, "rewards/rejected": -0.9056210517883301, "step": 1870 }, { "epoch": 0.68, "grad_norm": 50.64293606347689, "learning_rate": 1.2495131536342536e-07, "logits/chosen": -2.128816843032837, "logits/rejected": -1.7718677520751953, "logps/chosen": -86.67499542236328, "logps/rejected": -179.9734344482422, "loss": 0.2966, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5343300104141235, "rewards/margins": 2.9166336059570312, "rewards/rejected": -1.3823034763336182, "step": 1880 }, { "epoch": 0.69, "grad_norm": 37.57626214701421, "learning_rate": 1.245958950223174e-07, "logits/chosen": -1.349810242652893, "logits/rejected": -1.1940224170684814, "logps/chosen": -87.54304504394531, "logps/rejected": -124.6638412475586, "loss": 0.3138, "rewards/accuracies": 0.75, "rewards/chosen": 0.8737591505050659, "rewards/margins": 1.981792688369751, "rewards/rejected": -1.1080334186553955, "step": 1890 }, { "epoch": 0.69, "grad_norm": 37.541514062279354, "learning_rate": 1.24238484210421e-07, "logits/chosen": -2.122223138809204, "logits/rejected": -1.972744345664978, "logps/chosen": -123.90694427490234, "logps/rejected": -174.65133666992188, "loss": 0.2394, "rewards/accuracies": 0.75, "rewards/chosen": 0.4691895842552185, "rewards/margins": 1.7866857051849365, "rewards/rejected": -1.3174960613250732, "step": 1900 }, { "epoch": 0.69, "eval_logits/chosen": -2.6142075061798096, "eval_logits/rejected": -2.388277053833008, "eval_logps/chosen": -86.47371673583984, "eval_logps/rejected": -137.2868194580078, "eval_loss": 0.3111128807067871, "eval_rewards/accuracies": 0.8392857313156128, "eval_rewards/chosen": 1.3482542037963867, "eval_rewards/margins": 2.539792776107788, "eval_rewards/rejected": -1.1915382146835327, "eval_runtime": 71.9156, "eval_samples_per_second": 12.376, "eval_steps_per_second": 0.195, "step": 1900 }, { "epoch": 0.69, "grad_norm": 48.134112820259595, "learning_rate": 1.2387909727198345e-07, "logits/chosen": -2.399280309677124, "logits/rejected": -2.280487537384033, "logps/chosen": -97.30809020996094, "logps/rejected": -159.677734375, "loss": 0.2845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0647090673446655, "rewards/margins": 1.8406950235366821, "rewards/rejected": -0.7759860754013062, "step": 1910 }, { "epoch": 0.7, "grad_norm": 49.91211298121504, "learning_rate": 1.2351774863056134e-07, "logits/chosen": -2.3767571449279785, "logits/rejected": -1.682460069656372, "logps/chosen": -62.444793701171875, "logps/rejected": -145.32229614257812, "loss": 0.2989, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9718739986419678, "rewards/margins": 3.5472664833068848, "rewards/rejected": -1.575392246246338, "step": 1920 }, { "epoch": 0.7, "grad_norm": 29.38061476258355, "learning_rate": 1.2315445278844197e-07, "logits/chosen": -2.427912950515747, "logits/rejected": -1.9351403713226318, "logps/chosen": -73.01206970214844, "logps/rejected": -209.5726776123047, "loss": 0.2909, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7785199880599976, "rewards/margins": 2.738701581954956, "rewards/rejected": -0.9601818323135376, "step": 1930 }, { "epoch": 0.7, "grad_norm": 26.75959860175755, "learning_rate": 1.227892243260611e-07, "logits/chosen": -2.469193696975708, "logits/rejected": -1.6766465902328491, "logps/chosen": -56.01939010620117, "logps/rejected": -119.8520736694336, "loss": 0.3141, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8703302145004272, "rewards/margins": 2.945542335510254, "rewards/rejected": -1.0752121210098267, "step": 1940 }, { "epoch": 0.71, "grad_norm": 38.461850874562536, "learning_rate": 1.224220779014178e-07, "logits/chosen": -1.9026371240615845, "logits/rejected": -1.4415004253387451, "logps/chosen": -97.89253997802734, "logps/rejected": -149.56936645507812, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6727228164672852, "rewards/margins": 2.737921953201294, "rewards/rejected": -1.0651990175247192, "step": 1950 }, { "epoch": 0.71, "grad_norm": 31.657055351360636, "learning_rate": 1.2205302824948634e-07, "logits/chosen": -2.2927825450897217, "logits/rejected": -1.8084007501602173, "logps/chosen": -80.52983093261719, "logps/rejected": -156.01187133789062, "loss": 0.3306, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.4960755109786987, "rewards/margins": 2.939119338989258, "rewards/rejected": -1.4430434703826904, "step": 1960 }, { "epoch": 0.72, "grad_norm": 50.459905943712606, "learning_rate": 1.2168209018162456e-07, "logits/chosen": -2.1764161586761475, "logits/rejected": -1.9086568355560303, "logps/chosen": -97.29804992675781, "logps/rejected": -158.05215454101562, "loss": 0.2753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2845994234085083, "rewards/margins": 2.076698064804077, "rewards/rejected": -0.7920988202095032, "step": 1970 }, { "epoch": 0.72, "grad_norm": 34.05499830411277, "learning_rate": 1.2130927858497966e-07, "logits/chosen": -1.833762526512146, "logits/rejected": -1.881530523300171, "logps/chosen": -108.65325927734375, "logps/rejected": -144.40902709960938, "loss": 0.3155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.106397271156311, "rewards/margins": 2.1092371940612793, "rewards/rejected": -1.0028399229049683, "step": 1980 }, { "epoch": 0.72, "grad_norm": 18.84745830969425, "learning_rate": 1.209346084218906e-07, "logits/chosen": -1.7890899181365967, "logits/rejected": -1.7402870655059814, "logps/chosen": -86.61128234863281, "logps/rejected": -116.84139251708984, "loss": 0.3061, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.6292413473129272, "rewards/margins": 2.007689952850342, "rewards/rejected": -0.3784485161304474, "step": 1990 }, { "epoch": 0.73, "grad_norm": 48.3959022523091, "learning_rate": 1.2055809472928762e-07, "logits/chosen": -1.7730016708374023, "logits/rejected": -1.6386531591415405, "logps/chosen": -79.35774230957031, "logps/rejected": -126.07957458496094, "loss": 0.3234, "rewards/accuracies": 1.0, "rewards/chosen": 1.5467567443847656, "rewards/margins": 2.7957520484924316, "rewards/rejected": -1.2489951848983765, "step": 2000 }, { "epoch": 0.73, "eval_logits/chosen": -2.596472978591919, "eval_logits/rejected": -2.3702468872070312, "eval_logps/chosen": -86.58128356933594, "eval_logps/rejected": -138.30990600585938, "eval_loss": 0.3054315149784088, "eval_rewards/accuracies": 0.8392857313156128, "eval_rewards/chosen": 1.3374978303909302, "eval_rewards/margins": 2.6313438415527344, "eval_rewards/rejected": -1.2938461303710938, "eval_runtime": 71.9938, "eval_samples_per_second": 12.362, "eval_steps_per_second": 0.194, "step": 2000 }, { "epoch": 0.73, "grad_norm": 53.09826173444019, "learning_rate": 1.2017975261808887e-07, "logits/chosen": -1.9460725784301758, "logits/rejected": -1.7900736331939697, "logps/chosen": -103.51519775390625, "logps/rejected": -141.5853271484375, "loss": 0.3128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3981924057006836, "rewards/margins": 2.500655174255371, "rewards/rejected": -1.1024627685546875, "step": 2010 }, { "epoch": 0.73, "grad_norm": 39.783453163990046, "learning_rate": 1.1979959727259367e-07, "logits/chosen": -1.8045324087142944, "logits/rejected": -1.6309808492660522, "logps/chosen": -92.61442565917969, "logps/rejected": -154.93099975585938, "loss": 0.2678, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2143778800964355, "rewards/margins": 2.35176420211792, "rewards/rejected": -1.137386441230774, "step": 2020 }, { "epoch": 0.74, "grad_norm": 39.81682054465197, "learning_rate": 1.1941764394987351e-07, "logits/chosen": -1.738677740097046, "logits/rejected": -1.5726416110992432, "logps/chosen": -82.56022644042969, "logps/rejected": -151.69943237304688, "loss": 0.2551, "rewards/accuracies": 0.75, "rewards/chosen": 1.240398645401001, "rewards/margins": 2.4828732013702393, "rewards/rejected": -1.2424746751785278, "step": 2030 }, { "epoch": 0.74, "grad_norm": 37.01944950877088, "learning_rate": 1.1903390797915929e-07, "logits/chosen": -2.1611745357513428, "logits/rejected": -1.4224140644073486, "logps/chosen": -80.29561614990234, "logps/rejected": -215.88174438476562, "loss": 0.2887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6173490285873413, "rewards/margins": 3.720074415206909, "rewards/rejected": -2.1027255058288574, "step": 2040 }, { "epoch": 0.74, "grad_norm": 40.82628805207018, "learning_rate": 1.1864840476122647e-07, "logits/chosen": -1.8892196416854858, "logits/rejected": -1.9005800485610962, "logps/chosen": -69.93404388427734, "logps/rejected": -123.88677978515625, "loss": 0.2882, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8273292779922485, "rewards/margins": 3.182246685028076, "rewards/rejected": -1.354917287826538, "step": 2050 }, { "epoch": 0.75, "grad_norm": 25.39556496888905, "learning_rate": 1.1826114976777678e-07, "logits/chosen": -1.7499809265136719, "logits/rejected": -1.4074931144714355, "logps/chosen": -91.85431671142578, "logps/rejected": -170.28402709960938, "loss": 0.2753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5811419486999512, "rewards/margins": 3.3795552253723145, "rewards/rejected": -1.7984129190444946, "step": 2060 }, { "epoch": 0.75, "grad_norm": 27.81061856916663, "learning_rate": 1.1787215854081724e-07, "logits/chosen": -2.056485652923584, "logits/rejected": -1.6245391368865967, "logps/chosen": -80.32780456542969, "logps/rejected": -132.90272521972656, "loss": 0.3135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4287405014038086, "rewards/margins": 2.6631085872650146, "rewards/rejected": -1.2343682050704956, "step": 2070 }, { "epoch": 0.75, "grad_norm": 30.567746366717504, "learning_rate": 1.1748144669203663e-07, "logits/chosen": -1.631363868713379, "logits/rejected": -1.6502500772476196, "logps/chosen": -96.3510971069336, "logps/rejected": -155.26380920410156, "loss": 0.2693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1531099081039429, "rewards/margins": 2.349030017852783, "rewards/rejected": -1.1959199905395508, "step": 2080 }, { "epoch": 0.76, "grad_norm": 40.52562795601512, "learning_rate": 1.1708902990217868e-07, "logits/chosen": -1.8120248317718506, "logits/rejected": -1.6999822854995728, "logps/chosen": -73.98417663574219, "logps/rejected": -166.10572814941406, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": 0.971939742565155, "rewards/margins": 3.178133487701416, "rewards/rejected": -2.206193685531616, "step": 2090 }, { "epoch": 0.76, "grad_norm": 42.93005039151693, "learning_rate": 1.166949239204129e-07, "logits/chosen": -2.2130112648010254, "logits/rejected": -1.7474415302276611, "logps/chosen": -89.34376525878906, "logps/rejected": -188.7493133544922, "loss": 0.2532, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4905040264129639, "rewards/margins": 3.4080872535705566, "rewards/rejected": -1.9175831079483032, "step": 2100 }, { "epoch": 0.76, "eval_logits/chosen": -2.5582356452941895, "eval_logits/rejected": -2.3411316871643066, "eval_logps/chosen": -86.91837310791016, "eval_logps/rejected": -139.59432983398438, "eval_loss": 0.30377882719039917, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 1.303788423538208, "eval_rewards/margins": 2.72607684135437, "eval_rewards/rejected": -1.4222885370254517, "eval_runtime": 71.8831, "eval_samples_per_second": 12.381, "eval_steps_per_second": 0.195, "step": 2100 }, { "epoch": 0.77, "grad_norm": 53.74378080351859, "learning_rate": 1.1629914456370243e-07, "logits/chosen": -1.6835792064666748, "logits/rejected": -1.5631754398345947, "logps/chosen": -79.44068908691406, "logps/rejected": -109.39453125, "loss": 0.3213, "rewards/accuracies": 0.75, "rewards/chosen": 0.8239709734916687, "rewards/margins": 1.709472417831421, "rewards/rejected": -0.885501503944397, "step": 2110 }, { "epoch": 0.77, "grad_norm": 45.76775103982164, "learning_rate": 1.1590170771616929e-07, "logits/chosen": -2.1422882080078125, "logits/rejected": -2.01249361038208, "logps/chosen": -70.21590423583984, "logps/rejected": -106.15130615234375, "loss": 0.2981, "rewards/accuracies": 0.75, "rewards/chosen": 1.9929507970809937, "rewards/margins": 1.7180888652801514, "rewards/rejected": 0.27486199140548706, "step": 2120 }, { "epoch": 0.77, "grad_norm": 34.522061772377945, "learning_rate": 1.1550262932845691e-07, "logits/chosen": -1.712194800376892, "logits/rejected": -1.602046251296997, "logps/chosen": -87.20169830322266, "logps/rejected": -150.86685180664062, "loss": 0.2747, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7278430461883545, "rewards/margins": 2.8564956188201904, "rewards/rejected": -1.1286523342132568, "step": 2130 }, { "epoch": 0.78, "grad_norm": 54.87111432142923, "learning_rate": 1.1510192541708984e-07, "logits/chosen": -1.5016837120056152, "logits/rejected": -1.4684686660766602, "logps/chosen": -72.81539916992188, "logps/rejected": -142.9152374267578, "loss": 0.3061, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.2450437545776367, "rewards/margins": 3.1693520545959473, "rewards/rejected": -0.9243084192276001, "step": 2140 }, { "epoch": 0.78, "grad_norm": 50.33350490881025, "learning_rate": 1.1469961206383114e-07, "logits/chosen": -2.383873462677002, "logits/rejected": -2.1281042098999023, "logps/chosen": -93.19439697265625, "logps/rejected": -159.3778076171875, "loss": 0.2755, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0546815395355225, "rewards/margins": 2.3399760723114014, "rewards/rejected": -1.285294532775879, "step": 2150 }, { "epoch": 0.78, "grad_norm": 29.94921093499752, "learning_rate": 1.1429570541503681e-07, "logits/chosen": -1.880446434020996, "logits/rejected": -1.4601715803146362, "logps/chosen": -97.71735382080078, "logps/rejected": -151.2371368408203, "loss": 0.2765, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2635929584503174, "rewards/margins": 3.5359420776367188, "rewards/rejected": -2.2723495960235596, "step": 2160 }, { "epoch": 0.79, "grad_norm": 38.26921815972567, "learning_rate": 1.1389022168100782e-07, "logits/chosen": -1.897870659828186, "logits/rejected": -1.5864663124084473, "logps/chosen": -91.51280212402344, "logps/rejected": -171.35614013671875, "loss": 0.2715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0029170513153076, "rewards/margins": 2.7681167125701904, "rewards/rejected": -1.7651996612548828, "step": 2170 }, { "epoch": 0.79, "grad_norm": 25.55797410044776, "learning_rate": 1.1348317713533955e-07, "logits/chosen": -1.8892625570297241, "logits/rejected": -1.941860556602478, "logps/chosen": -112.3303451538086, "logps/rejected": -152.99148559570312, "loss": 0.2756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2206674814224243, "rewards/margins": 2.4190514087677, "rewards/rejected": -1.1983839273452759, "step": 2180 }, { "epoch": 0.79, "grad_norm": 35.1411721341359, "learning_rate": 1.1307458811426865e-07, "logits/chosen": -2.288433313369751, "logits/rejected": -1.941382646560669, "logps/chosen": -117.0739517211914, "logps/rejected": -211.00942993164062, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4456199109554291, "rewards/margins": 3.088353395462036, "rewards/rejected": -2.642733335494995, "step": 2190 }, { "epoch": 0.8, "grad_norm": 33.568721302560235, "learning_rate": 1.1266447101601738e-07, "logits/chosen": -2.234501361846924, "logits/rejected": -1.5646653175354004, "logps/chosen": -85.79940795898438, "logps/rejected": -137.91712951660156, "loss": 0.2862, "rewards/accuracies": 0.75, "rewards/chosen": 0.6818853616714478, "rewards/margins": 1.9311177730560303, "rewards/rejected": -1.249232530593872, "step": 2200 }, { "epoch": 0.8, "eval_logits/chosen": -2.611751079559326, "eval_logits/rejected": -2.3868792057037354, "eval_logps/chosen": -90.76590728759766, "eval_logps/rejected": -142.14036560058594, "eval_loss": 0.29903972148895264, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.9190365672111511, "eval_rewards/margins": 2.59592866897583, "eval_rewards/rejected": -1.6768922805786133, "eval_runtime": 71.9273, "eval_samples_per_second": 12.374, "eval_steps_per_second": 0.195, "step": 2200 }, { "epoch": 0.8, "grad_norm": 40.79977336139254, "learning_rate": 1.1225284230013554e-07, "logits/chosen": -1.9493424892425537, "logits/rejected": -1.7713226079940796, "logps/chosen": -127.34124755859375, "logps/rejected": -142.61746215820312, "loss": 0.2739, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7090291976928711, "rewards/margins": 2.365572452545166, "rewards/rejected": -1.656543493270874, "step": 2210 }, { "epoch": 0.81, "grad_norm": 42.792407711246895, "learning_rate": 1.118397184868399e-07, "logits/chosen": -1.76601243019104, "logits/rejected": -1.4212825298309326, "logps/chosen": -80.1509780883789, "logps/rejected": -130.9561004638672, "loss": 0.2638, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3400588035583496, "rewards/margins": 2.613032817840576, "rewards/rejected": -1.272973895072937, "step": 2220 }, { "epoch": 0.81, "grad_norm": 32.9359485242729, "learning_rate": 1.1142511615635106e-07, "logits/chosen": -2.112626791000366, "logits/rejected": -1.8563470840454102, "logps/chosen": -96.79916381835938, "logps/rejected": -137.69740295410156, "loss": 0.271, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.3981750011444092, "rewards/margins": 2.080976963043213, "rewards/rejected": -0.6828019618988037, "step": 2230 }, { "epoch": 0.81, "grad_norm": 31.24730889188293, "learning_rate": 1.1100905194822817e-07, "logits/chosen": -2.2185370922088623, "logits/rejected": -1.9021923542022705, "logps/chosen": -100.98394775390625, "logps/rejected": -165.42442321777344, "loss": 0.2959, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7293106913566589, "rewards/margins": 2.7436270713806152, "rewards/rejected": -2.0143163204193115, "step": 2240 }, { "epoch": 0.82, "grad_norm": 29.657646636917896, "learning_rate": 1.1059154256070101e-07, "logits/chosen": -2.116981029510498, "logits/rejected": -1.6903644800186157, "logps/chosen": -62.789337158203125, "logps/rejected": -139.08433532714844, "loss": 0.2711, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.8867229223251343, "rewards/margins": 3.3395798206329346, "rewards/rejected": -1.4528568983078003, "step": 2250 }, { "epoch": 0.82, "grad_norm": 42.14718531203437, "learning_rate": 1.101726047499999e-07, "logits/chosen": -1.7964967489242554, "logits/rejected": -1.3701756000518799, "logps/chosen": -84.9123306274414, "logps/rejected": -195.67616271972656, "loss": 0.295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1344997882843018, "rewards/margins": 2.1091301441192627, "rewards/rejected": -0.9746300578117371, "step": 2260 }, { "epoch": 0.82, "grad_norm": 44.98425619656123, "learning_rate": 1.0975225532968324e-07, "logits/chosen": -2.183009386062622, "logits/rejected": -1.8169399499893188, "logps/chosen": -71.13068389892578, "logps/rejected": -153.55203247070312, "loss": 0.2318, "rewards/accuracies": 1.0, "rewards/chosen": 1.66214919090271, "rewards/margins": 2.8192644119262695, "rewards/rejected": -1.1571152210235596, "step": 2270 }, { "epoch": 0.83, "grad_norm": 38.49859201574968, "learning_rate": 1.0933051116996251e-07, "logits/chosen": -2.2806153297424316, "logits/rejected": -1.6658601760864258, "logps/chosen": -87.88021850585938, "logps/rejected": -157.28659057617188, "loss": 0.2386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.618966817855835, "rewards/margins": 3.0835256576538086, "rewards/rejected": -1.4645588397979736, "step": 2280 }, { "epoch": 0.83, "grad_norm": 29.80803011600754, "learning_rate": 1.089073891970255e-07, "logits/chosen": -2.1777398586273193, "logits/rejected": -1.9328248500823975, "logps/chosen": -95.84873962402344, "logps/rejected": -143.92617797851562, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 1.5284852981567383, "rewards/margins": 3.2201919555664062, "rewards/rejected": -1.6917064189910889, "step": 2290 }, { "epoch": 0.83, "grad_norm": 38.40290299546086, "learning_rate": 1.0848290639235677e-07, "logits/chosen": -2.0058891773223877, "logits/rejected": -1.5227512121200562, "logps/chosen": -52.72210693359375, "logps/rejected": -111.29896545410156, "loss": 0.2972, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.021421432495117, "rewards/margins": 2.641392230987549, "rewards/rejected": -0.6199706196784973, "step": 2300 }, { "epoch": 0.83, "eval_logits/chosen": -2.568045139312744, "eval_logits/rejected": -2.3515257835388184, "eval_logps/chosen": -86.87254333496094, "eval_logps/rejected": -140.3467559814453, "eval_loss": 0.29616376757621765, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 1.308371901512146, "eval_rewards/margins": 2.8059024810791016, "eval_rewards/rejected": -1.4975303411483765, "eval_runtime": 71.9446, "eval_samples_per_second": 12.371, "eval_steps_per_second": 0.195, "step": 2300 }, { "epoch": 0.84, "grad_norm": 32.78552611053422, "learning_rate": 1.0805707979205626e-07, "logits/chosen": -1.8444101810455322, "logits/rejected": -1.6570831537246704, "logps/chosen": -82.16907501220703, "logps/rejected": -161.206787109375, "loss": 0.2583, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7756086587905884, "rewards/margins": 3.3080837726593018, "rewards/rejected": -1.532475233078003, "step": 2310 }, { "epoch": 0.84, "grad_norm": 49.48891626378323, "learning_rate": 1.0762992648615549e-07, "logits/chosen": -2.2481982707977295, "logits/rejected": -1.626535415649414, "logps/chosen": -60.07440185546875, "logps/rejected": -154.71829223632812, "loss": 0.2713, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.572458028793335, "rewards/margins": 2.8353686332702637, "rewards/rejected": -1.2629106044769287, "step": 2320 }, { "epoch": 0.85, "grad_norm": 32.324264844561206, "learning_rate": 1.0720146361793166e-07, "logits/chosen": -1.3140501976013184, "logits/rejected": -1.3236348628997803, "logps/chosen": -84.41221618652344, "logps/rejected": -151.84896850585938, "loss": 0.2769, "rewards/accuracies": 1.0, "rewards/chosen": 1.4236546754837036, "rewards/margins": 3.031369686126709, "rewards/rejected": -1.6077148914337158, "step": 2330 }, { "epoch": 0.85, "grad_norm": 33.985147386077095, "learning_rate": 1.0677170838321969e-07, "logits/chosen": -2.1076178550720215, "logits/rejected": -1.5049619674682617, "logps/chosen": -63.42456817626953, "logps/rejected": -135.26893615722656, "loss": 0.3078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8713020086288452, "rewards/margins": 3.84709095954895, "rewards/rejected": -1.9757890701293945, "step": 2340 }, { "epoch": 0.85, "grad_norm": 28.49155730270651, "learning_rate": 1.0634067802972204e-07, "logits/chosen": -2.251544713973999, "logits/rejected": -1.766570806503296, "logps/chosen": -59.371925354003906, "logps/rejected": -96.25486755371094, "loss": 0.2679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6637605428695679, "rewards/margins": 1.9050308465957642, "rewards/rejected": -0.24127018451690674, "step": 2350 }, { "epoch": 0.86, "grad_norm": 46.79774181330583, "learning_rate": 1.0590838985631653e-07, "logits/chosen": -1.972684621810913, "logits/rejected": -1.8693819046020508, "logps/chosen": -81.91486358642578, "logps/rejected": -135.78457641601562, "loss": 0.2746, "rewards/accuracies": 1.0, "rewards/chosen": 1.4232879877090454, "rewards/margins": 3.040739059448242, "rewards/rejected": -1.6174514293670654, "step": 2360 }, { "epoch": 0.86, "grad_norm": 45.90541036194016, "learning_rate": 1.0547486121236202e-07, "logits/chosen": -2.168940305709839, "logits/rejected": -1.6793582439422607, "logps/chosen": -85.18965148925781, "logps/rejected": -126.7214126586914, "loss": 0.25, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3474589586257935, "rewards/margins": 2.820422649383545, "rewards/rejected": -1.472963571548462, "step": 2370 }, { "epoch": 0.86, "grad_norm": 32.321114428237806, "learning_rate": 1.0504010949700214e-07, "logits/chosen": -2.089189052581787, "logits/rejected": -1.6855659484863281, "logps/chosen": -106.45992279052734, "logps/rejected": -163.7842559814453, "loss": 0.2809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.0083647966384888, "rewards/margins": 2.169463872909546, "rewards/rejected": -1.1610991954803467, "step": 2380 }, { "epoch": 0.87, "grad_norm": 29.03655328835733, "learning_rate": 1.0460415215846706e-07, "logits/chosen": -1.7703535556793213, "logits/rejected": -1.6672289371490479, "logps/chosen": -78.27106475830078, "logps/rejected": -119.7970962524414, "loss": 0.3083, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5671287775039673, "rewards/margins": 2.6943907737731934, "rewards/rejected": -1.1272616386413574, "step": 2390 }, { "epoch": 0.87, "grad_norm": 40.17380893976502, "learning_rate": 1.0416700669337309e-07, "logits/chosen": -1.7051159143447876, "logits/rejected": -1.4203988313674927, "logps/chosen": -81.63056945800781, "logps/rejected": -137.97152709960938, "loss": 0.2819, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8759418725967407, "rewards/margins": 2.198500156402588, "rewards/rejected": -1.3225582838058472, "step": 2400 }, { "epoch": 0.87, "eval_logits/chosen": -2.5985686779022217, "eval_logits/rejected": -2.3792624473571777, "eval_logps/chosen": -89.82866668701172, "eval_logps/rejected": -142.37832641601562, "eval_loss": 0.29318711161613464, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 1.0127601623535156, "eval_rewards/margins": 2.7134480476379395, "eval_rewards/rejected": -1.7006880044937134, "eval_runtime": 71.9548, "eval_samples_per_second": 12.369, "eval_steps_per_second": 0.195, "step": 2400 }, { "epoch": 0.87, "grad_norm": 33.42712229281822, "learning_rate": 1.0372869064602057e-07, "logits/chosen": -1.7892534732818604, "logits/rejected": -1.1852535009384155, "logps/chosen": -83.11079406738281, "logps/rejected": -178.16806030273438, "loss": 0.2442, "rewards/accuracies": 1.0, "rewards/chosen": 1.063844084739685, "rewards/margins": 2.93255352973938, "rewards/rejected": -1.8687092065811157, "step": 2410 }, { "epoch": 0.88, "grad_norm": 35.05254862902216, "learning_rate": 1.032892216076898e-07, "logits/chosen": -1.8080724477767944, "logits/rejected": -2.15615177154541, "logps/chosen": -103.28878021240234, "logps/rejected": -124.3226089477539, "loss": 0.2772, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9695222973823547, "rewards/margins": 1.7081365585327148, "rewards/rejected": -0.7386142611503601, "step": 2420 }, { "epoch": 0.88, "grad_norm": 36.27734663918374, "learning_rate": 1.0284861721593486e-07, "logits/chosen": -1.5924618244171143, "logits/rejected": -1.227879285812378, "logps/chosen": -101.4988784790039, "logps/rejected": -178.65904235839844, "loss": 0.2531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.11053466796875, "rewards/margins": 3.443126678466797, "rewards/rejected": -2.3325917720794678, "step": 2430 }, { "epoch": 0.89, "grad_norm": 24.345274978234222, "learning_rate": 1.0240689515387594e-07, "logits/chosen": -1.6779060363769531, "logits/rejected": -1.6044038534164429, "logps/chosen": -79.8328857421875, "logps/rejected": -146.0101776123047, "loss": 0.2333, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1275861263275146, "rewards/margins": 3.003176212310791, "rewards/rejected": -1.8755900859832764, "step": 2440 }, { "epoch": 0.89, "grad_norm": 38.18691013378142, "learning_rate": 1.0196407314948948e-07, "logits/chosen": -2.0358641147613525, "logits/rejected": -1.629227638244629, "logps/chosen": -68.20674896240234, "logps/rejected": -162.77389526367188, "loss": 0.2482, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.680604338645935, "rewards/margins": 3.843975067138672, "rewards/rejected": -2.1633706092834473, "step": 2450 }, { "epoch": 0.89, "grad_norm": 48.20820417201082, "learning_rate": 1.015201689748969e-07, "logits/chosen": -1.8446376323699951, "logits/rejected": -1.5708643198013306, "logps/chosen": -71.98088836669922, "logps/rejected": -119.35798645019531, "loss": 0.2679, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3860020637512207, "rewards/margins": 2.888756275177002, "rewards/rejected": -1.5027542114257812, "step": 2460 }, { "epoch": 0.9, "grad_norm": 54.24613191962403, "learning_rate": 1.0107520044565107e-07, "logits/chosen": -1.6053078174591064, "logits/rejected": -1.7150065898895264, "logps/chosen": -95.47930908203125, "logps/rejected": -129.6278839111328, "loss": 0.2346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3713977336883545, "rewards/margins": 1.8234504461288452, "rewards/rejected": -0.45205289125442505, "step": 2470 }, { "epoch": 0.9, "grad_norm": 33.14197724163633, "learning_rate": 1.0062918542002149e-07, "logits/chosen": -1.7330402135849, "logits/rejected": -1.5148823261260986, "logps/chosen": -90.85403442382812, "logps/rejected": -120.53018951416016, "loss": 0.291, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1606512069702148, "rewards/margins": 1.8089545965194702, "rewards/rejected": -0.6483034491539001, "step": 2480 }, { "epoch": 0.9, "grad_norm": 37.842170372492035, "learning_rate": 1.0018214179827752e-07, "logits/chosen": -1.585915207862854, "logits/rejected": -1.2993358373641968, "logps/chosen": -100.29901123046875, "logps/rejected": -149.75497436523438, "loss": 0.2526, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6282235980033875, "rewards/margins": 2.4122531414031982, "rewards/rejected": -1.784029245376587, "step": 2490 }, { "epoch": 0.91, "grad_norm": 47.99587426386816, "learning_rate": 9.973408752196995e-08, "logits/chosen": -1.995279312133789, "logits/rejected": -1.883319616317749, "logps/chosen": -85.00283813476562, "logps/rejected": -133.04844665527344, "loss": 0.2523, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3437588214874268, "rewards/margins": 2.9578824043273926, "rewards/rejected": -1.6141233444213867, "step": 2500 }, { "epoch": 0.91, "eval_logits/chosen": -2.529489755630493, "eval_logits/rejected": -2.3238425254821777, "eval_logps/chosen": -87.53929901123047, "eval_logps/rejected": -142.36253356933594, "eval_loss": 0.2887308895587921, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 1.2416969537734985, "eval_rewards/margins": 2.9408047199249268, "eval_rewards/rejected": -1.6991075277328491, "eval_runtime": 71.9193, "eval_samples_per_second": 12.375, "eval_steps_per_second": 0.195, "step": 2500 }, { "epoch": 0.91, "grad_norm": 43.66156972073303, "learning_rate": 9.9285040573211e-08, "logits/chosen": -1.346799612045288, "logits/rejected": -1.2726528644561768, "logps/chosen": -84.28345489501953, "logps/rejected": -163.90647888183594, "loss": 0.2379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.04119873046875, "rewards/margins": 2.8771090507507324, "rewards/rejected": -1.8359102010726929, "step": 2510 }, { "epoch": 0.91, "grad_norm": 29.983525537713383, "learning_rate": 9.883501897395255e-08, "logits/chosen": -2.4633562564849854, "logits/rejected": -1.8329979181289673, "logps/chosen": -77.02892303466797, "logps/rejected": -148.3466796875, "loss": 0.309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.9383471012115479, "rewards/margins": 3.8611648082733154, "rewards/rejected": -1.9228184223175049, "step": 2520 }, { "epoch": 0.92, "grad_norm": 54.586263258672254, "learning_rate": 9.838404078526287e-08, "logits/chosen": -1.8038403987884521, "logits/rejected": -1.38016676902771, "logps/chosen": -81.94984436035156, "logps/rejected": -152.57546997070312, "loss": 0.2406, "rewards/accuracies": 1.0, "rewards/chosen": 1.466467022895813, "rewards/margins": 3.583028793334961, "rewards/rejected": -2.1165618896484375, "step": 2530 }, { "epoch": 0.92, "grad_norm": 34.64469687487794, "learning_rate": 9.79321241066019e-08, "logits/chosen": -1.7441418170928955, "logits/rejected": -1.8793613910675049, "logps/chosen": -80.00519561767578, "logps/rejected": -135.61685180664062, "loss": 0.2527, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3446810245513916, "rewards/margins": 2.6392345428466797, "rewards/rejected": -1.2945533990859985, "step": 2540 }, { "epoch": 0.93, "grad_norm": 52.236703640195636, "learning_rate": 9.747928707509452e-08, "logits/chosen": -2.3760972023010254, "logits/rejected": -1.7141082286834717, "logps/chosen": -70.12747955322266, "logps/rejected": -142.975341796875, "loss": 0.2482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.4635976552963257, "rewards/margins": 3.061598300933838, "rewards/rejected": -1.5980006456375122, "step": 2550 }, { "epoch": 0.93, "grad_norm": 54.072926051989626, "learning_rate": 9.702554786480307e-08, "logits/chosen": -2.1945765018463135, "logits/rejected": -1.9323997497558594, "logps/chosen": -109.21195983886719, "logps/rejected": -147.5604705810547, "loss": 0.2711, "rewards/accuracies": 0.75, "rewards/chosen": 0.7500441670417786, "rewards/margins": 1.275587558746338, "rewards/rejected": -0.5255435109138489, "step": 2560 }, { "epoch": 0.93, "grad_norm": 28.839686251981885, "learning_rate": 9.657092468599762e-08, "logits/chosen": -1.9283406734466553, "logits/rejected": -1.9637447595596313, "logps/chosen": -73.35202026367188, "logps/rejected": -218.9588165283203, "loss": 0.2294, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6155729293823242, "rewards/margins": 3.6671338081359863, "rewards/rejected": -2.051560640335083, "step": 2570 }, { "epoch": 0.94, "grad_norm": 23.167116682862034, "learning_rate": 9.611543578442521e-08, "logits/chosen": -2.004908800125122, "logits/rejected": -1.8643825054168701, "logps/chosen": -73.96257019042969, "logps/rejected": -120.63322448730469, "loss": 0.2663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1677210330963135, "rewards/margins": 1.7092103958129883, "rewards/rejected": -0.5414894819259644, "step": 2580 }, { "epoch": 0.94, "grad_norm": 37.61703553720983, "learning_rate": 9.565909944057774e-08, "logits/chosen": -1.8392232656478882, "logits/rejected": -1.622847318649292, "logps/chosen": -98.19245910644531, "logps/rejected": -152.8502197265625, "loss": 0.2644, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2672936916351318, "rewards/margins": 2.7811713218688965, "rewards/rejected": -1.513877511024475, "step": 2590 }, { "epoch": 0.94, "grad_norm": 27.461231350463876, "learning_rate": 9.520193396895798e-08, "logits/chosen": -1.9339004755020142, "logits/rejected": -1.5946998596191406, "logps/chosen": -107.6896743774414, "logps/rejected": -166.02828979492188, "loss": 0.2534, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9528090357780457, "rewards/margins": 3.083143949508667, "rewards/rejected": -2.1303353309631348, "step": 2600 }, { "epoch": 0.94, "eval_logits/chosen": -2.55859375, "eval_logits/rejected": -2.3451855182647705, "eval_logps/chosen": -89.4637680053711, "eval_logps/rejected": -143.6739959716797, "eval_loss": 0.2875581979751587, "eval_rewards/accuracies": 0.8214285969734192, "eval_rewards/chosen": 1.0492494106292725, "eval_rewards/margins": 2.8795037269592285, "eval_rewards/rejected": -1.8302545547485352, "eval_runtime": 71.9243, "eval_samples_per_second": 12.374, "eval_steps_per_second": 0.195, "step": 2600 }, { "epoch": 0.95, "grad_norm": 49.8667964660086, "learning_rate": 9.474395771734493e-08, "logits/chosen": -1.8184125423431396, "logits/rejected": -1.5943410396575928, "logps/chosen": -81.92977905273438, "logps/rejected": -143.59848022460938, "loss": 0.284, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8414875268936157, "rewards/margins": 3.2987303733825684, "rewards/rejected": -2.457242727279663, "step": 2610 }, { "epoch": 0.95, "grad_norm": 30.852377454296445, "learning_rate": 9.428518906605715e-08, "logits/chosen": -1.6976007223129272, "logits/rejected": -2.0197830200195312, "logps/chosen": -77.92415618896484, "logps/rejected": -120.9302749633789, "loss": 0.2503, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2503085136413574, "rewards/margins": 2.8966543674468994, "rewards/rejected": -1.646345853805542, "step": 2620 }, { "epoch": 0.95, "grad_norm": 44.68222855190559, "learning_rate": 9.382564642721517e-08, "logits/chosen": -2.2393205165863037, "logits/rejected": -1.8123022317886353, "logps/chosen": -104.79426574707031, "logps/rejected": -157.04183959960938, "loss": 0.2891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9819310307502747, "rewards/margins": 2.653965473175049, "rewards/rejected": -1.6720342636108398, "step": 2630 }, { "epoch": 0.96, "grad_norm": 49.38339013216613, "learning_rate": 9.336534824400267e-08, "logits/chosen": -1.8587281703948975, "logits/rejected": -1.5764485597610474, "logps/chosen": -72.25798797607422, "logps/rejected": -129.2059783935547, "loss": 0.2643, "rewards/accuracies": 1.0, "rewards/chosen": 1.856530785560608, "rewards/margins": 3.2116611003875732, "rewards/rejected": -1.3551304340362549, "step": 2640 }, { "epoch": 0.96, "grad_norm": 38.226700349391514, "learning_rate": 9.290431298992605e-08, "logits/chosen": -2.0723516941070557, "logits/rejected": -1.8886346817016602, "logps/chosen": -73.31155395507812, "logps/rejected": -133.17425537109375, "loss": 0.2182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8224437236785889, "rewards/margins": 2.681816577911377, "rewards/rejected": -0.8593727946281433, "step": 2650 }, { "epoch": 0.97, "grad_norm": 27.790126692612404, "learning_rate": 9.244255916807328e-08, "logits/chosen": -1.9878448247909546, "logits/rejected": -1.7479631900787354, "logps/chosen": -75.09999084472656, "logps/rejected": -106.42635345458984, "loss": 0.2358, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3537728786468506, "rewards/margins": 2.176225423812866, "rewards/rejected": -0.8224524259567261, "step": 2660 }, { "epoch": 0.97, "grad_norm": 45.32065506760666, "learning_rate": 9.19801053103711e-08, "logits/chosen": -2.0774807929992676, "logits/rejected": -1.564305305480957, "logps/chosen": -108.69667053222656, "logps/rejected": -161.80776977539062, "loss": 0.2735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1085962057113647, "rewards/margins": 2.628162145614624, "rewards/rejected": -1.5195659399032593, "step": 2670 }, { "epoch": 0.97, "grad_norm": 42.93481884745555, "learning_rate": 9.151696997684133e-08, "logits/chosen": -1.8935344219207764, "logits/rejected": -1.3089290857315063, "logps/chosen": -82.37386322021484, "logps/rejected": -136.06396484375, "loss": 0.3265, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3590657711029053, "rewards/margins": 2.706857681274414, "rewards/rejected": -1.3477914333343506, "step": 2680 }, { "epoch": 0.98, "grad_norm": 35.54137712720915, "learning_rate": 9.105317175485603e-08, "logits/chosen": -1.7412023544311523, "logits/rejected": -1.2009631395339966, "logps/chosen": -100.62733459472656, "logps/rejected": -161.97164916992188, "loss": 0.2636, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8465662002563477, "rewards/margins": 2.8156447410583496, "rewards/rejected": -1.9690784215927124, "step": 2690 }, { "epoch": 0.98, "grad_norm": 28.86922135565408, "learning_rate": 9.058872925839145e-08, "logits/chosen": -1.6413863897323608, "logits/rejected": -1.4907596111297607, "logps/chosen": -95.3847427368164, "logps/rejected": -137.19778442382812, "loss": 0.2065, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9992202520370483, "rewards/margins": 2.450819969177246, "rewards/rejected": -1.4515998363494873, "step": 2700 }, { "epoch": 0.98, "eval_logits/chosen": -2.584843158721924, "eval_logits/rejected": -2.3644392490386963, "eval_logps/chosen": -91.23661804199219, "eval_logps/rejected": -144.7612762451172, "eval_loss": 0.28060293197631836, "eval_rewards/accuracies": 0.8571428656578064, "eval_rewards/chosen": 0.8719648718833923, "eval_rewards/margins": 2.8109474182128906, "eval_rewards/rejected": -1.9389822483062744, "eval_runtime": 72.0935, "eval_samples_per_second": 12.345, "eval_steps_per_second": 0.194, "step": 2700 }, { "epoch": 0.98, "grad_norm": 33.738621565238745, "learning_rate": 9.012366112728104e-08, "logits/chosen": -2.308584213256836, "logits/rejected": -1.8555558919906616, "logps/chosen": -87.93582153320312, "logps/rejected": -192.4984130859375, "loss": 0.2612, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.67702716588974, "rewards/margins": 2.732879161834717, "rewards/rejected": -2.055852174758911, "step": 2710 }, { "epoch": 0.99, "grad_norm": 27.441037937773388, "learning_rate": 8.96579860264673e-08, "logits/chosen": -1.98538339138031, "logits/rejected": -1.8649771213531494, "logps/chosen": -95.34701538085938, "logps/rejected": -144.11663818359375, "loss": 0.2566, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9213436841964722, "rewards/margins": 2.6342453956604004, "rewards/rejected": -1.7129017114639282, "step": 2720 }, { "epoch": 0.99, "grad_norm": 30.78636513438412, "learning_rate": 8.919172264525274e-08, "logits/chosen": -1.7649853229522705, "logits/rejected": -1.6394439935684204, "logps/chosen": -93.97325134277344, "logps/rejected": -128.0108642578125, "loss": 0.2684, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8014779090881348, "rewards/margins": 2.1937363147735596, "rewards/rejected": -1.3922584056854248, "step": 2730 }, { "epoch": 0.99, "grad_norm": 31.448648707859167, "learning_rate": 8.872488969654978e-08, "logits/chosen": -1.7622663974761963, "logits/rejected": -1.6264762878417969, "logps/chosen": -67.11592102050781, "logps/rejected": -124.63407897949219, "loss": 0.2686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3815752267837524, "rewards/margins": 2.803210735321045, "rewards/rejected": -1.4216358661651611, "step": 2740 }, { "epoch": 1.0, "grad_norm": 39.421832880751474, "learning_rate": 8.825750591612973e-08, "logits/chosen": -2.0863993167877197, "logits/rejected": -2.0298314094543457, "logps/chosen": -88.62030029296875, "logps/rejected": -142.9340057373047, "loss": 0.261, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.233635663986206, "rewards/margins": 2.6705703735351562, "rewards/rejected": -1.4369348287582397, "step": 2750 }, { "epoch": 1.0, "grad_norm": 24.481557561032986, "learning_rate": 8.778959006187086e-08, "logits/chosen": -2.1946396827697754, "logits/rejected": -2.0280699729919434, "logps/chosen": -109.33675384521484, "logps/rejected": -159.2801055908203, "loss": 0.1921, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.6714259386062622, "rewards/margins": 1.6729438304901123, "rewards/rejected": -1.0015180110931396, "step": 2760 }, { "epoch": 1.01, "grad_norm": 33.3000860606884, "learning_rate": 8.732116091300562e-08, "logits/chosen": -2.672719955444336, "logits/rejected": -2.3178205490112305, "logps/chosen": -108.69718170166016, "logps/rejected": -154.65037536621094, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": 0.7940966486930847, "rewards/margins": 2.4711570739746094, "rewards/rejected": -1.6770607233047485, "step": 2770 }, { "epoch": 1.01, "grad_norm": 33.433009311142335, "learning_rate": 8.685223726936686e-08, "logits/chosen": -1.885528802871704, "logits/rejected": -1.5169920921325684, "logps/chosen": -77.01846313476562, "logps/rejected": -153.978759765625, "loss": 0.171, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.206106185913086, "rewards/margins": 3.5677733421325684, "rewards/rejected": -2.3616671562194824, "step": 2780 }, { "epoch": 1.01, "grad_norm": 36.29017775178196, "learning_rate": 8.638283795063337e-08, "logits/chosen": -1.7784044742584229, "logits/rejected": -1.95382559299469, "logps/chosen": -102.05876922607422, "logps/rejected": -136.20278930664062, "loss": 0.1558, "rewards/accuracies": 1.0, "rewards/chosen": 1.0153976678848267, "rewards/margins": 2.9986801147460938, "rewards/rejected": -1.9832820892333984, "step": 2790 }, { "epoch": 1.02, "grad_norm": 29.62169456800005, "learning_rate": 8.591298179557462e-08, "logits/chosen": -1.6251239776611328, "logits/rejected": -1.2165063619613647, "logps/chosen": -86.18495178222656, "logps/rejected": -161.23684692382812, "loss": 0.1669, "rewards/accuracies": 1.0, "rewards/chosen": 1.3276035785675049, "rewards/margins": 4.416350364685059, "rewards/rejected": -3.0887465476989746, "step": 2800 }, { "epoch": 1.02, "eval_logits/chosen": -2.437558650970459, "eval_logits/rejected": -2.259409189224243, "eval_logps/chosen": -88.76131439208984, "eval_logps/rejected": -145.60667419433594, "eval_loss": 0.2792236804962158, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 1.1194943189620972, "eval_rewards/margins": 3.143015146255493, "eval_rewards/rejected": -2.0235207080841064, "eval_runtime": 71.9382, "eval_samples_per_second": 12.372, "eval_steps_per_second": 0.195, "step": 2800 }, { "epoch": 1.02, "grad_norm": 25.67903585470204, "learning_rate": 8.544268766129463e-08, "logits/chosen": -1.636718988418579, "logits/rejected": -1.3525017499923706, "logps/chosen": -91.5226821899414, "logps/rejected": -152.884033203125, "loss": 0.1626, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0812289714813232, "rewards/margins": 3.153066873550415, "rewards/rejected": -2.0718374252319336, "step": 2810 }, { "epoch": 1.02, "grad_norm": 23.17233872525318, "learning_rate": 8.497197442247518e-08, "logits/chosen": -2.255305051803589, "logits/rejected": -1.3946579694747925, "logps/chosen": -73.07662963867188, "logps/rejected": -160.0610809326172, "loss": 0.1865, "rewards/accuracies": 1.0, "rewards/chosen": 1.7297379970550537, "rewards/margins": 3.913317918777466, "rewards/rejected": -2.183579683303833, "step": 2820 }, { "epoch": 1.03, "grad_norm": 38.41529064635961, "learning_rate": 8.45008609706183e-08, "logits/chosen": -1.6799609661102295, "logits/rejected": -1.7947546243667603, "logps/chosen": -98.69215393066406, "logps/rejected": -176.2200469970703, "loss": 0.1551, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3261483907699585, "rewards/margins": 2.8912904262542725, "rewards/rejected": -1.565142273902893, "step": 2830 }, { "epoch": 1.03, "grad_norm": 33.65938875922252, "learning_rate": 8.402936621328809e-08, "logits/chosen": -1.584078073501587, "logits/rejected": -1.5029069185256958, "logps/chosen": -78.0562515258789, "logps/rejected": -129.6306915283203, "loss": 0.1773, "rewards/accuracies": 0.75, "rewards/chosen": 1.175910472869873, "rewards/margins": 3.317577838897705, "rewards/rejected": -2.141667366027832, "step": 2840 }, { "epoch": 1.03, "grad_norm": 28.882074958874135, "learning_rate": 8.355750907335185e-08, "logits/chosen": -1.3962595462799072, "logits/rejected": -1.2154072523117065, "logps/chosen": -97.38468933105469, "logps/rejected": -141.413330078125, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": 0.9436796307563782, "rewards/margins": 2.901923418045044, "rewards/rejected": -1.9582436084747314, "step": 2850 }, { "epoch": 1.04, "grad_norm": 48.57490152838243, "learning_rate": 8.308530848822072e-08, "logits/chosen": -1.322256088256836, "logits/rejected": -1.3967430591583252, "logps/chosen": -69.242919921875, "logps/rejected": -106.2719955444336, "loss": 0.167, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6083996295928955, "rewards/margins": 3.0584025382995605, "rewards/rejected": -1.4500024318695068, "step": 2860 }, { "epoch": 1.04, "grad_norm": 33.604690884057646, "learning_rate": 8.261278340908956e-08, "logits/chosen": -2.133439302444458, "logits/rejected": -1.5575885772705078, "logps/chosen": -82.22913360595703, "logps/rejected": -155.5948028564453, "loss": 0.1388, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6941452026367188, "rewards/margins": 3.734036922454834, "rewards/rejected": -2.0398921966552734, "step": 2870 }, { "epoch": 1.05, "grad_norm": 39.178640280039765, "learning_rate": 8.213995280017641e-08, "logits/chosen": -2.022156238555908, "logits/rejected": -1.857143759727478, "logps/chosen": -85.9186019897461, "logps/rejected": -166.30189514160156, "loss": 0.1486, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.53517484664917, "rewards/margins": 4.04697322845459, "rewards/rejected": -2.511798858642578, "step": 2880 }, { "epoch": 1.05, "grad_norm": 34.78141019059118, "learning_rate": 8.166683563796132e-08, "logits/chosen": -1.9229103326797485, "logits/rejected": -2.1193885803222656, "logps/chosen": -80.5084228515625, "logps/rejected": -117.29715728759766, "loss": 0.2072, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6038023233413696, "rewards/margins": 2.8986401557922363, "rewards/rejected": -1.2948377132415771, "step": 2890 }, { "epoch": 1.05, "grad_norm": 55.9854636653751, "learning_rate": 8.119345091042493e-08, "logits/chosen": -1.9403820037841797, "logits/rejected": -1.5053958892822266, "logps/chosen": -78.80266571044922, "logps/rejected": -152.7443389892578, "loss": 0.2042, "rewards/accuracies": 1.0, "rewards/chosen": 1.886501669883728, "rewards/margins": 4.484723091125488, "rewards/rejected": -2.59822154045105, "step": 2900 }, { "epoch": 1.05, "eval_logits/chosen": -2.46332049369812, "eval_logits/rejected": -2.278831958770752, "eval_logps/chosen": -90.24932861328125, "eval_logps/rejected": -147.7169189453125, "eval_loss": 0.2783988416194916, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.9706932306289673, "eval_rewards/margins": 3.205240249633789, "eval_rewards/rejected": -2.2345468997955322, "eval_runtime": 71.9005, "eval_samples_per_second": 12.378, "eval_steps_per_second": 0.195, "step": 2900 }, { "epoch": 1.06, "grad_norm": 19.853743178316083, "learning_rate": 8.071981761628615e-08, "logits/chosen": -2.0366461277008057, "logits/rejected": -1.6515051126480103, "logps/chosen": -108.37039947509766, "logps/rejected": -271.4614562988281, "loss": 0.1636, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.959917426109314, "rewards/margins": 4.299803256988525, "rewards/rejected": -3.33988618850708, "step": 2910 }, { "epoch": 1.06, "grad_norm": 36.91632833855024, "learning_rate": 8.024595476423992e-08, "logits/chosen": -1.8184095621109009, "logits/rejected": -1.9267457723617554, "logps/chosen": -84.40902709960938, "logps/rejected": -157.05877685546875, "loss": 0.1462, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5019279718399048, "rewards/margins": 3.4105846881866455, "rewards/rejected": -1.9086570739746094, "step": 2920 }, { "epoch": 1.06, "grad_norm": 33.58138147718405, "learning_rate": 7.977188137219414e-08, "logits/chosen": -1.3525230884552002, "logits/rejected": -1.4422528743743896, "logps/chosen": -77.68660736083984, "logps/rejected": -128.90638732910156, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 1.6080999374389648, "rewards/margins": 3.3746752738952637, "rewards/rejected": -1.7665754556655884, "step": 2930 }, { "epoch": 1.07, "grad_norm": 32.99013154464336, "learning_rate": 7.929761646650649e-08, "logits/chosen": -2.245877504348755, "logits/rejected": -1.9841814041137695, "logps/chosen": -96.16056823730469, "logps/rejected": -256.2076721191406, "loss": 0.1878, "rewards/accuracies": 1.0, "rewards/chosen": 1.466813564300537, "rewards/margins": 4.286352157592773, "rewards/rejected": -2.8195388317108154, "step": 2940 }, { "epoch": 1.07, "grad_norm": 30.967043967811318, "learning_rate": 7.882317908122083e-08, "logits/chosen": -1.4433786869049072, "logits/rejected": -1.1254138946533203, "logps/chosen": -98.50581359863281, "logps/rejected": -170.2156982421875, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 0.9996970891952515, "rewards/margins": 3.900161027908325, "rewards/rejected": -2.900463819503784, "step": 2950 }, { "epoch": 1.07, "grad_norm": 21.50191304483852, "learning_rate": 7.834858825730326e-08, "logits/chosen": -1.6459989547729492, "logits/rejected": -1.7681890726089478, "logps/chosen": -95.52315521240234, "logps/rejected": -140.78799438476562, "loss": 0.1528, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9920917749404907, "rewards/margins": 3.2807719707489014, "rewards/rejected": -2.2886805534362793, "step": 2960 }, { "epoch": 1.08, "grad_norm": 17.260479958583524, "learning_rate": 7.787386304187798e-08, "logits/chosen": -1.4531595706939697, "logits/rejected": -1.3701298236846924, "logps/chosen": -93.404541015625, "logps/rejected": -135.31427001953125, "loss": 0.171, "rewards/accuracies": 1.0, "rewards/chosen": 1.0154047012329102, "rewards/margins": 3.056318998336792, "rewards/rejected": -2.040914297103882, "step": 2970 }, { "epoch": 1.08, "grad_norm": 23.025875175350215, "learning_rate": 7.739902248746283e-08, "logits/chosen": -1.7231334447860718, "logits/rejected": -1.83186936378479, "logps/chosen": -107.64973449707031, "logps/rejected": -145.9791717529297, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 1.5788373947143555, "rewards/margins": 3.6469104290008545, "rewards/rejected": -2.068073034286499, "step": 2980 }, { "epoch": 1.09, "grad_norm": 32.06528399748813, "learning_rate": 7.692408565120458e-08, "logits/chosen": -2.0466766357421875, "logits/rejected": -1.4406808614730835, "logps/chosen": -95.41911315917969, "logps/rejected": -297.7329406738281, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 1.45736563205719, "rewards/margins": 4.8933281898498535, "rewards/rejected": -3.435962200164795, "step": 2990 }, { "epoch": 1.09, "grad_norm": 31.87542686599211, "learning_rate": 7.64490715941142e-08, "logits/chosen": -2.2173619270324707, "logits/rejected": -1.7960357666015625, "logps/chosen": -85.83885192871094, "logps/rejected": -169.8695831298828, "loss": 0.1529, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.137475609779358, "rewards/margins": 4.046895503997803, "rewards/rejected": -2.9094200134277344, "step": 3000 }, { "epoch": 1.09, "eval_logits/chosen": -2.4170167446136475, "eval_logits/rejected": -2.2436559200286865, "eval_logps/chosen": -89.03692626953125, "eval_logps/rejected": -148.1836395263672, "eval_loss": 0.27793529629707336, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.0919331312179565, "eval_rewards/margins": 3.373152256011963, "eval_rewards/rejected": -2.281219720840454, "eval_runtime": 71.967, "eval_samples_per_second": 12.367, "eval_steps_per_second": 0.195, "step": 3000 }, { "epoch": 1.09, "grad_norm": 30.683194890069508, "learning_rate": 7.597399938030184e-08, "logits/chosen": -1.7612870931625366, "logits/rejected": -1.8522275686264038, "logps/chosen": -73.18524169921875, "logps/rejected": -130.78829956054688, "loss": 0.1392, "rewards/accuracies": 1.0, "rewards/chosen": 1.7518160343170166, "rewards/margins": 3.613239288330078, "rewards/rejected": -1.861423134803772, "step": 3010 }, { "epoch": 1.1, "grad_norm": 12.524140781131472, "learning_rate": 7.549888807621168e-08, "logits/chosen": -1.6050277948379517, "logits/rejected": -1.5487821102142334, "logps/chosen": -84.5078125, "logps/rejected": -174.291259765625, "loss": 0.1498, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5817402601242065, "rewards/margins": 3.690995693206787, "rewards/rejected": -2.10925555229187, "step": 3020 }, { "epoch": 1.1, "grad_norm": 19.057459821446617, "learning_rate": 7.502375674985675e-08, "logits/chosen": -2.2322306632995605, "logits/rejected": -1.8400142192840576, "logps/chosen": -94.57080078125, "logps/rejected": -167.90066528320312, "loss": 0.197, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.6182721853256226, "rewards/margins": 3.7063040733337402, "rewards/rejected": -2.0880320072174072, "step": 3030 }, { "epoch": 1.1, "grad_norm": 39.27950520182761, "learning_rate": 7.454862447005359e-08, "logits/chosen": -2.199615001678467, "logits/rejected": -1.5867798328399658, "logps/chosen": -83.38591766357422, "logps/rejected": -181.61129760742188, "loss": 0.1505, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.513109564781189, "rewards/margins": 5.107856750488281, "rewards/rejected": -3.594747543334961, "step": 3040 }, { "epoch": 1.11, "grad_norm": 29.496269378804925, "learning_rate": 7.407351030565711e-08, "logits/chosen": -2.2386364936828613, "logits/rejected": -1.7992044687271118, "logps/chosen": -70.04452514648438, "logps/rejected": -168.61111450195312, "loss": 0.1842, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.804783582687378, "rewards/margins": 4.483792781829834, "rewards/rejected": -2.679008960723877, "step": 3050 }, { "epoch": 1.11, "grad_norm": 28.123692890471585, "learning_rate": 7.359843332479512e-08, "logits/chosen": -1.5445009469985962, "logits/rejected": -1.632628083229065, "logps/chosen": -97.18016815185547, "logps/rejected": -156.12501525878906, "loss": 0.1522, "rewards/accuracies": 1.0, "rewards/chosen": 0.9871312975883484, "rewards/margins": 3.336866855621338, "rewards/rejected": -2.3497352600097656, "step": 3060 }, { "epoch": 1.11, "grad_norm": 24.98966890041787, "learning_rate": 7.312341259410308e-08, "logits/chosen": -1.8162829875946045, "logits/rejected": -1.6895732879638672, "logps/chosen": -82.59261322021484, "logps/rejected": -126.13545227050781, "loss": 0.1533, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7326328754425049, "rewards/margins": 3.6410274505615234, "rewards/rejected": -1.9083948135375977, "step": 3070 }, { "epoch": 1.12, "grad_norm": 38.009951638634, "learning_rate": 7.264846717795899e-08, "logits/chosen": -1.9883182048797607, "logits/rejected": -1.5640618801116943, "logps/chosen": -79.90910339355469, "logps/rejected": -204.0009765625, "loss": 0.1641, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5611870288848877, "rewards/margins": 3.625385284423828, "rewards/rejected": -2.0641980171203613, "step": 3080 }, { "epoch": 1.12, "grad_norm": 37.92405502481002, "learning_rate": 7.217361613771814e-08, "logits/chosen": -1.5013178586959839, "logits/rejected": -1.6586967706680298, "logps/chosen": -74.64991760253906, "logps/rejected": -133.19850158691406, "loss": 0.1487, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.388861894607544, "rewards/margins": 3.804910659790039, "rewards/rejected": -2.416048765182495, "step": 3090 }, { "epoch": 1.13, "grad_norm": 33.70546037656875, "learning_rate": 7.16988785309482e-08, "logits/chosen": -1.7027190923690796, "logits/rejected": -1.2969590425491333, "logps/chosen": -91.05853271484375, "logps/rejected": -159.3774871826172, "loss": 0.1675, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.82365882396698, "rewards/margins": 4.186999320983887, "rewards/rejected": -3.363340377807617, "step": 3100 }, { "epoch": 1.13, "eval_logits/chosen": -2.3937642574310303, "eval_logits/rejected": -2.2311301231384277, "eval_logps/chosen": -88.40351104736328, "eval_logps/rejected": -148.453857421875, "eval_loss": 0.2778138816356659, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 1.155274748802185, "eval_rewards/margins": 3.4635140895843506, "eval_rewards/rejected": -2.308239459991455, "eval_runtime": 71.9837, "eval_samples_per_second": 12.364, "eval_steps_per_second": 0.194, "step": 3100 }, { "epoch": 1.13, "grad_norm": 10.977394963530008, "learning_rate": 7.122427341066431e-08, "logits/chosen": -1.7731870412826538, "logits/rejected": -1.5993355512619019, "logps/chosen": -94.86338806152344, "logps/rejected": -159.97232055664062, "loss": 0.1694, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6271803975105286, "rewards/margins": 3.4266326427459717, "rewards/rejected": -2.799452304840088, "step": 3110 }, { "epoch": 1.13, "grad_norm": 35.6693717697895, "learning_rate": 7.074981982456438e-08, "logits/chosen": -1.6707003116607666, "logits/rejected": -1.3329427242279053, "logps/chosen": -74.29015350341797, "logps/rejected": -126.54850769042969, "loss": 0.1541, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1601439714431763, "rewards/margins": 3.103707790374756, "rewards/rejected": -1.9435638189315796, "step": 3120 }, { "epoch": 1.14, "grad_norm": 26.223529400996284, "learning_rate": 7.027553681426475e-08, "logits/chosen": -1.6821495294570923, "logits/rejected": -1.6677749156951904, "logps/chosen": -69.13279724121094, "logps/rejected": -125.01466369628906, "loss": 0.1206, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.598923683166504, "rewards/margins": 3.550352096557617, "rewards/rejected": -1.951428771018982, "step": 3130 }, { "epoch": 1.14, "grad_norm": 33.00723755782217, "learning_rate": 6.980144341453587e-08, "logits/chosen": -1.8480640649795532, "logits/rejected": -1.4870339632034302, "logps/chosen": -93.90699768066406, "logps/rejected": -151.79629516601562, "loss": 0.1937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6560937166213989, "rewards/margins": 3.1022205352783203, "rewards/rejected": -2.446126937866211, "step": 3140 }, { "epoch": 1.14, "grad_norm": 28.329720861751795, "learning_rate": 6.932755865253842e-08, "logits/chosen": -2.052046537399292, "logits/rejected": -1.551355242729187, "logps/chosen": -79.02404022216797, "logps/rejected": -138.8905792236328, "loss": 0.1736, "rewards/accuracies": 1.0, "rewards/chosen": 1.4904524087905884, "rewards/margins": 3.604060411453247, "rewards/rejected": -2.113607883453369, "step": 3150 }, { "epoch": 1.15, "grad_norm": 22.857016307017492, "learning_rate": 6.885390154705964e-08, "logits/chosen": -1.932428002357483, "logits/rejected": -1.4152684211730957, "logps/chosen": -98.20048522949219, "logps/rejected": -169.7655792236328, "loss": 0.1481, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0732301473617554, "rewards/margins": 4.125487327575684, "rewards/rejected": -3.0522565841674805, "step": 3160 }, { "epoch": 1.15, "grad_norm": 27.862330812022716, "learning_rate": 6.838049110775007e-08, "logits/chosen": -1.6845344305038452, "logits/rejected": -1.5890744924545288, "logps/chosen": -98.02180480957031, "logps/rejected": -128.43356323242188, "loss": 0.1365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3146222829818726, "rewards/margins": 2.931612014770508, "rewards/rejected": -1.6169898509979248, "step": 3170 }, { "epoch": 1.15, "grad_norm": 29.94290227789718, "learning_rate": 6.790734633436058e-08, "logits/chosen": -1.99330735206604, "logits/rejected": -1.2950857877731323, "logps/chosen": -99.69730377197266, "logps/rejected": -170.60195922851562, "loss": 0.1448, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1823816299438477, "rewards/margins": 3.796720504760742, "rewards/rejected": -2.6143388748168945, "step": 3180 }, { "epoch": 1.16, "grad_norm": 64.10187404062623, "learning_rate": 6.743448621597989e-08, "logits/chosen": -1.8239631652832031, "logits/rejected": -1.8418114185333252, "logps/chosen": -100.65877532958984, "logps/rejected": -151.15411376953125, "loss": 0.131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8053275942802429, "rewards/margins": 3.210911989212036, "rewards/rejected": -2.4055840969085693, "step": 3190 }, { "epoch": 1.16, "grad_norm": 26.247231746645152, "learning_rate": 6.696192973027241e-08, "logits/chosen": -2.037968158721924, "logits/rejected": -1.6115257740020752, "logps/chosen": -73.27265930175781, "logps/rejected": -159.865966796875, "loss": 0.1542, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6886720657348633, "rewards/margins": 3.8989975452423096, "rewards/rejected": -2.2103257179260254, "step": 3200 }, { "epoch": 1.16, "eval_logits/chosen": -2.4345650672912598, "eval_logits/rejected": -2.2589755058288574, "eval_logps/chosen": -90.60651397705078, "eval_logps/rejected": -151.0882110595703, "eval_loss": 0.27637749910354614, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.9349749684333801, "eval_rewards/margins": 3.50665283203125, "eval_rewards/rejected": -2.5716776847839355, "eval_runtime": 71.9022, "eval_samples_per_second": 12.378, "eval_steps_per_second": 0.195, "step": 3200 }, { "epoch": 1.17, "grad_norm": 26.043972128745228, "learning_rate": 6.648969584271668e-08, "logits/chosen": -1.7389099597930908, "logits/rejected": -1.7843681573867798, "logps/chosen": -92.87619018554688, "logps/rejected": -143.67486572265625, "loss": 0.155, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.270666241645813, "rewards/margins": 3.2985167503356934, "rewards/rejected": -2.0278501510620117, "step": 3210 }, { "epoch": 1.17, "grad_norm": 48.1935636258946, "learning_rate": 6.601780350584408e-08, "logits/chosen": -2.341850757598877, "logits/rejected": -2.0155091285705566, "logps/chosen": -91.69564819335938, "logps/rejected": -173.34188842773438, "loss": 0.1687, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.45611572265625, "rewards/margins": 4.184123516082764, "rewards/rejected": -2.7280075550079346, "step": 3220 }, { "epoch": 1.17, "grad_norm": 23.817770856670688, "learning_rate": 6.55462716584783e-08, "logits/chosen": -1.6578280925750732, "logits/rejected": -1.547189712524414, "logps/chosen": -70.36898803710938, "logps/rejected": -163.3883819580078, "loss": 0.1816, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5254313945770264, "rewards/margins": 4.97360897064209, "rewards/rejected": -3.4481780529022217, "step": 3230 }, { "epoch": 1.18, "grad_norm": 30.43724197841503, "learning_rate": 6.507511922497525e-08, "logits/chosen": -1.9906114339828491, "logits/rejected": -1.7292976379394531, "logps/chosen": -70.70221710205078, "logps/rejected": -145.86181640625, "loss": 0.1662, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5770456790924072, "rewards/margins": 4.609046459197998, "rewards/rejected": -3.03200101852417, "step": 3240 }, { "epoch": 1.18, "grad_norm": 30.164526587495324, "learning_rate": 6.460436511446348e-08, "logits/chosen": -1.9928280115127563, "logits/rejected": -1.7679364681243896, "logps/chosen": -100.5902328491211, "logps/rejected": -178.82383728027344, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 0.6832008361816406, "rewards/margins": 4.548179626464844, "rewards/rejected": -3.864978790283203, "step": 3250 }, { "epoch": 1.18, "grad_norm": 52.18169505988799, "learning_rate": 6.413402822008541e-08, "logits/chosen": -2.0655717849731445, "logits/rejected": -1.3271939754486084, "logps/chosen": -87.31197357177734, "logps/rejected": -160.46385192871094, "loss": 0.1905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8497316241264343, "rewards/margins": 3.820449113845825, "rewards/rejected": -2.9707179069519043, "step": 3260 }, { "epoch": 1.19, "grad_norm": 28.76843973839471, "learning_rate": 6.366412741823888e-08, "logits/chosen": -1.6622741222381592, "logits/rejected": -1.776906967163086, "logps/chosen": -94.9710693359375, "logps/rejected": -128.8150634765625, "loss": 0.1917, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5094915628433228, "rewards/margins": 3.212104082107544, "rewards/rejected": -1.7026126384735107, "step": 3270 }, { "epoch": 1.19, "grad_norm": 29.1350717468671, "learning_rate": 6.31946815678198e-08, "logits/chosen": -2.0058090686798096, "logits/rejected": -1.7673218250274658, "logps/chosen": -95.3953857421875, "logps/rejected": -176.0774688720703, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 1.4662463665008545, "rewards/margins": 4.142214775085449, "rewards/rejected": -2.6759676933288574, "step": 3280 }, { "epoch": 1.19, "grad_norm": 27.79208712883472, "learning_rate": 6.272570950946508e-08, "logits/chosen": -2.106937885284424, "logits/rejected": -1.6478996276855469, "logps/chosen": -85.5794906616211, "logps/rejected": -173.1703338623047, "loss": 0.1716, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1876065731048584, "rewards/margins": 4.5130767822265625, "rewards/rejected": -3.325469970703125, "step": 3290 }, { "epoch": 1.2, "grad_norm": 26.268117829624686, "learning_rate": 6.225723006479663e-08, "logits/chosen": -2.263885259628296, "logits/rejected": -1.6747421026229858, "logps/chosen": -63.94829559326172, "logps/rejected": -163.29486083984375, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 1.9152793884277344, "rewards/margins": 4.830869197845459, "rewards/rejected": -2.9155895709991455, "step": 3300 }, { "epoch": 1.2, "eval_logits/chosen": -2.4307594299316406, "eval_logits/rejected": -2.257986545562744, "eval_logps/chosen": -91.0833969116211, "eval_logps/rejected": -151.17442321777344, "eval_loss": 0.27279049158096313, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.887286365032196, "eval_rewards/margins": 3.4675843715667725, "eval_rewards/rejected": -2.5802979469299316, "eval_runtime": 71.844, "eval_samples_per_second": 12.388, "eval_steps_per_second": 0.195, "step": 3300 }, { "epoch": 1.2, "grad_norm": 20.771596605110865, "learning_rate": 6.178926203566588e-08, "logits/chosen": -1.9085609912872314, "logits/rejected": -1.6423466205596924, "logps/chosen": -88.54236602783203, "logps/rejected": -151.7003631591797, "loss": 0.1747, "rewards/accuracies": 1.0, "rewards/chosen": 1.4402354955673218, "rewards/margins": 4.408102989196777, "rewards/rejected": -2.9678680896759033, "step": 3310 }, { "epoch": 1.21, "grad_norm": 23.09159131359664, "learning_rate": 6.132182420339918e-08, "logits/chosen": -1.6666587591171265, "logits/rejected": -1.2324297428131104, "logps/chosen": -81.8795166015625, "logps/rejected": -138.54940795898438, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 1.4413508176803589, "rewards/margins": 3.961609363555908, "rewards/rejected": -2.5202584266662598, "step": 3320 }, { "epoch": 1.21, "grad_norm": 33.84562565815576, "learning_rate": 6.085493532804413e-08, "logits/chosen": -2.300516128540039, "logits/rejected": -2.122788906097412, "logps/chosen": -86.87980651855469, "logps/rejected": -168.46022033691406, "loss": 0.1544, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0005496740341187, "rewards/margins": 3.9717116355895996, "rewards/rejected": -2.9711620807647705, "step": 3330 }, { "epoch": 1.21, "grad_norm": 14.408331018978265, "learning_rate": 6.03886141476166e-08, "logits/chosen": -2.2469568252563477, "logits/rejected": -1.8382642269134521, "logps/chosen": -81.93803405761719, "logps/rejected": -163.33473205566406, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 1.7893930673599243, "rewards/margins": 4.628080368041992, "rewards/rejected": -2.8386874198913574, "step": 3340 }, { "epoch": 1.22, "grad_norm": 33.15124696775361, "learning_rate": 5.992287937734873e-08, "logits/chosen": -2.054621934890747, "logits/rejected": -1.8777787685394287, "logps/chosen": -84.07951354980469, "logps/rejected": -179.1838836669922, "loss": 0.144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4575679302215576, "rewards/margins": 4.167536735534668, "rewards/rejected": -2.7099688053131104, "step": 3350 }, { "epoch": 1.22, "grad_norm": 32.145978602477854, "learning_rate": 5.9457749708937756e-08, "logits/chosen": -1.9643110036849976, "logits/rejected": -1.795340895652771, "logps/chosen": -82.72921752929688, "logps/rejected": -128.0702667236328, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 0.9123824238777161, "rewards/margins": 3.376009464263916, "rewards/rejected": -2.4636270999908447, "step": 3360 }, { "epoch": 1.22, "grad_norm": 49.92991955367269, "learning_rate": 5.8993243809795915e-08, "logits/chosen": -1.8667415380477905, "logits/rejected": -1.3340458869934082, "logps/chosen": -91.13977813720703, "logps/rejected": -158.5907440185547, "loss": 0.1547, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.49219757318496704, "rewards/margins": 3.9671154022216797, "rewards/rejected": -3.4749176502227783, "step": 3370 }, { "epoch": 1.23, "grad_norm": 31.770406094051957, "learning_rate": 5.852938032230126e-08, "logits/chosen": -2.3319461345672607, "logits/rejected": -1.882067322731018, "logps/chosen": -76.39924621582031, "logps/rejected": -176.21929931640625, "loss": 0.1699, "rewards/accuracies": 1.0, "rewards/chosen": 2.2963407039642334, "rewards/margins": 6.4765625, "rewards/rejected": -4.180222034454346, "step": 3380 }, { "epoch": 1.23, "grad_norm": 21.3080930046784, "learning_rate": 5.806617786304937e-08, "logits/chosen": -1.8695051670074463, "logits/rejected": -1.3510863780975342, "logps/chosen": -86.91441345214844, "logps/rejected": -250.5083770751953, "loss": 0.1788, "rewards/accuracies": 1.0, "rewards/chosen": 0.846575140953064, "rewards/margins": 5.218047142028809, "rewards/rejected": -4.371471405029297, "step": 3390 }, { "epoch": 1.23, "grad_norm": 41.88504763461129, "learning_rate": 5.760365502210634e-08, "logits/chosen": -1.7990360260009766, "logits/rejected": -1.9284346103668213, "logps/chosen": -91.82389068603516, "logps/rejected": -152.16659545898438, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 1.4950382709503174, "rewards/margins": 4.346599578857422, "rewards/rejected": -2.8515613079071045, "step": 3400 }, { "epoch": 1.23, "eval_logits/chosen": -2.414144277572632, "eval_logits/rejected": -2.243586778640747, "eval_logps/chosen": -91.0235366821289, "eval_logps/rejected": -152.044921875, "eval_loss": 0.26993635296821594, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.8932717442512512, "eval_rewards/margins": 3.5606191158294678, "eval_rewards/rejected": -2.6673471927642822, "eval_runtime": 71.993, "eval_samples_per_second": 12.362, "eval_steps_per_second": 0.194, "step": 3400 }, { "epoch": 1.24, "grad_norm": 25.05732311627679, "learning_rate": 5.7141830362262514e-08, "logits/chosen": -1.7883466482162476, "logits/rejected": -1.5041227340698242, "logps/chosen": -86.22000885009766, "logps/rejected": -157.66397094726562, "loss": 0.1676, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7772796154022217, "rewards/margins": 4.851300239562988, "rewards/rejected": -3.0740203857421875, "step": 3410 }, { "epoch": 1.24, "grad_norm": 44.30312099667292, "learning_rate": 5.6680722418287674e-08, "logits/chosen": -2.064953565597534, "logits/rejected": -1.8630918264389038, "logps/chosen": -60.13167190551758, "logps/rejected": -247.2148895263672, "loss": 0.1305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.1393609046936035, "rewards/margins": 4.484022617340088, "rewards/rejected": -2.3446614742279053, "step": 3420 }, { "epoch": 1.25, "grad_norm": 48.64244001308947, "learning_rate": 5.622034969618704e-08, "logits/chosen": -1.976629614830017, "logits/rejected": -1.4999226331710815, "logps/chosen": -77.93717956542969, "logps/rejected": -151.5218963623047, "loss": 0.1785, "rewards/accuracies": 1.0, "rewards/chosen": 1.5172656774520874, "rewards/margins": 3.7420005798339844, "rewards/rejected": -2.2247347831726074, "step": 3430 }, { "epoch": 1.25, "grad_norm": 35.51662255328931, "learning_rate": 5.576073067245862e-08, "logits/chosen": -1.8411935567855835, "logits/rejected": -1.700330376625061, "logps/chosen": -67.32380676269531, "logps/rejected": -140.62051391601562, "loss": 0.167, "rewards/accuracies": 1.0, "rewards/chosen": 1.6732587814331055, "rewards/margins": 4.312959671020508, "rewards/rejected": -2.6397011280059814, "step": 3440 }, { "epoch": 1.25, "grad_norm": 42.40486506801471, "learning_rate": 5.530188379335166e-08, "logits/chosen": -1.5920283794403076, "logits/rejected": -1.1166603565216064, "logps/chosen": -70.99920654296875, "logps/rejected": -169.20806884765625, "loss": 0.1756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735821843147278, "rewards/margins": 5.562444686889648, "rewards/rejected": -4.6888628005981445, "step": 3450 }, { "epoch": 1.26, "grad_norm": 26.913972282878095, "learning_rate": 5.4843827474126274e-08, "logits/chosen": -1.9140102863311768, "logits/rejected": -1.400821328163147, "logps/chosen": -74.26724243164062, "logps/rejected": -132.2233123779297, "loss": 0.1374, "rewards/accuracies": 1.0, "rewards/chosen": 1.5520453453063965, "rewards/margins": 4.453476905822754, "rewards/rejected": -2.90143084526062, "step": 3460 }, { "epoch": 1.26, "grad_norm": 26.13373331897645, "learning_rate": 5.438658009831448e-08, "logits/chosen": -1.73845636844635, "logits/rejected": -1.6531593799591064, "logps/chosen": -96.46147918701172, "logps/rejected": -160.9883575439453, "loss": 0.1403, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2454642057418823, "rewards/margins": 4.1239237785339355, "rewards/rejected": -2.8784596920013428, "step": 3470 }, { "epoch": 1.26, "grad_norm": 34.42235214848031, "learning_rate": 5.39301600169823e-08, "logits/chosen": -1.9057807922363281, "logits/rejected": -1.927220344543457, "logps/chosen": -115.20938873291016, "logps/rejected": -181.9356231689453, "loss": 0.1796, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7145345211029053, "rewards/margins": 4.270758152008057, "rewards/rejected": -3.5562233924865723, "step": 3480 }, { "epoch": 1.27, "grad_norm": 20.971370688650854, "learning_rate": 5.347458554799332e-08, "logits/chosen": -1.615875005722046, "logits/rejected": -1.6177091598510742, "logps/chosen": -118.13121032714844, "logps/rejected": -160.26910400390625, "loss": 0.163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0960075855255127, "rewards/margins": 3.6775288581848145, "rewards/rejected": -2.581521511077881, "step": 3490 }, { "epoch": 1.27, "grad_norm": 27.13239221195734, "learning_rate": 5.301987497527353e-08, "logits/chosen": -2.2539658546447754, "logits/rejected": -1.4924699068069458, "logps/chosen": -91.34033203125, "logps/rejected": -181.88430786132812, "loss": 0.1526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.448373556137085, "rewards/margins": 4.606696128845215, "rewards/rejected": -3.1583220958709717, "step": 3500 }, { "epoch": 1.27, "eval_logits/chosen": -2.421151876449585, "eval_logits/rejected": -2.2513325214385986, "eval_logps/chosen": -91.84569549560547, "eval_logps/rejected": -153.45870971679688, "eval_loss": 0.2666407823562622, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.8110561370849609, "eval_rewards/margins": 3.6197829246520996, "eval_rewards/rejected": -2.8087267875671387, "eval_runtime": 71.9717, "eval_samples_per_second": 12.366, "eval_steps_per_second": 0.195, "step": 3500 }, { "epoch": 1.27, "grad_norm": 29.49364758024353, "learning_rate": 5.256604654807742e-08, "logits/chosen": -1.7953096628189087, "logits/rejected": -1.669303297996521, "logps/chosen": -90.09492492675781, "logps/rejected": -149.954833984375, "loss": 0.1912, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0876281261444092, "rewards/margins": 3.9230809211730957, "rewards/rejected": -2.8354527950286865, "step": 3510 }, { "epoch": 1.28, "grad_norm": 41.572390412456194, "learning_rate": 5.21131184802557e-08, "logits/chosen": -2.22542142868042, "logits/rejected": -1.8966829776763916, "logps/chosen": -83.38715362548828, "logps/rejected": -141.80059814453125, "loss": 0.1877, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1568256616592407, "rewards/margins": 3.453406572341919, "rewards/rejected": -2.2965807914733887, "step": 3520 }, { "epoch": 1.28, "grad_norm": 24.565593605623125, "learning_rate": 5.166110894952426e-08, "logits/chosen": -1.9109958410263062, "logits/rejected": -1.4489656686782837, "logps/chosen": -58.008819580078125, "logps/rejected": -191.261474609375, "loss": 0.1478, "rewards/accuracies": 1.0, "rewards/chosen": 1.9802272319793701, "rewards/margins": 6.276611804962158, "rewards/rejected": -4.296384811401367, "step": 3530 }, { "epoch": 1.28, "grad_norm": 53.61199506683095, "learning_rate": 5.1210036096734595e-08, "logits/chosen": -1.5364563465118408, "logits/rejected": -1.3872387409210205, "logps/chosen": -107.84410095214844, "logps/rejected": -160.59361267089844, "loss": 0.1401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5507475137710571, "rewards/margins": 3.1839842796325684, "rewards/rejected": -2.633236885070801, "step": 3540 }, { "epoch": 1.29, "grad_norm": 28.74282956678025, "learning_rate": 5.0759918025145814e-08, "logits/chosen": -2.2165632247924805, "logits/rejected": -1.650313138961792, "logps/chosen": -93.6681137084961, "logps/rejected": -145.92111206054688, "loss": 0.1582, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3355414867401123, "rewards/margins": 4.038471698760986, "rewards/rejected": -2.702929973602295, "step": 3550 }, { "epoch": 1.29, "grad_norm": 26.165374741112064, "learning_rate": 5.031077279969797e-08, "logits/chosen": -1.9472557306289673, "logits/rejected": -1.7647784948349, "logps/chosen": -76.48893737792969, "logps/rejected": -135.52386474609375, "loss": 0.1404, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.397589921951294, "rewards/margins": 4.449340343475342, "rewards/rejected": -3.051750898361206, "step": 3560 }, { "epoch": 1.3, "grad_norm": 36.82965132004594, "learning_rate": 4.9862618446287206e-08, "logits/chosen": -1.9602140188217163, "logits/rejected": -1.6700010299682617, "logps/chosen": -72.4864730834961, "logps/rejected": -136.77584838867188, "loss": 0.171, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6417019367218018, "rewards/margins": 3.702420473098755, "rewards/rejected": -2.060718536376953, "step": 3570 }, { "epoch": 1.3, "grad_norm": 24.181445490171278, "learning_rate": 4.9415472951042175e-08, "logits/chosen": -1.9628846645355225, "logits/rejected": -1.5444905757904053, "logps/chosen": -81.23819732666016, "logps/rejected": -182.43032836914062, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 1.530139684677124, "rewards/margins": 4.815797328948975, "rewards/rejected": -3.2856578826904297, "step": 3580 }, { "epoch": 1.3, "grad_norm": 45.8781810453152, "learning_rate": 4.8969354259602245e-08, "logits/chosen": -1.7887170314788818, "logits/rejected": -1.6762058734893799, "logps/chosen": -87.67835998535156, "logps/rejected": -148.01345825195312, "loss": 0.1509, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3987677097320557, "rewards/margins": 4.317151069641113, "rewards/rejected": -2.9183835983276367, "step": 3590 }, { "epoch": 1.31, "grad_norm": 46.13081539436112, "learning_rate": 4.85242802763973e-08, "logits/chosen": -1.9573310613632202, "logits/rejected": -1.7241952419281006, "logps/chosen": -68.08113861083984, "logps/rejected": -126.60243225097656, "loss": 0.1819, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.9216814041137695, "rewards/margins": 4.243165969848633, "rewards/rejected": -2.321484327316284, "step": 3600 }, { "epoch": 1.31, "eval_logits/chosen": -2.4482152462005615, "eval_logits/rejected": -2.2676963806152344, "eval_logps/chosen": -91.19036865234375, "eval_logps/rejected": -152.58682250976562, "eval_loss": 0.26565536856651306, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.8765901327133179, "eval_rewards/margins": 3.5981290340423584, "eval_rewards/rejected": -2.721538782119751, "eval_runtime": 71.9607, "eval_samples_per_second": 12.368, "eval_steps_per_second": 0.195, "step": 3600 }, { "epoch": 1.31, "grad_norm": 23.242650326252527, "learning_rate": 4.808026886392907e-08, "logits/chosen": -1.5748409032821655, "logits/rejected": -1.6680151224136353, "logps/chosen": -97.38771057128906, "logps/rejected": -131.87599182128906, "loss": 0.1103, "rewards/accuracies": 1.0, "rewards/chosen": 1.183221459388733, "rewards/margins": 2.838090419769287, "rewards/rejected": -1.654868721961975, "step": 3610 }, { "epoch": 1.31, "grad_norm": 20.775972068583737, "learning_rate": 4.763733784205434e-08, "logits/chosen": -2.157090902328491, "logits/rejected": -1.7526382207870483, "logps/chosen": -83.79398345947266, "logps/rejected": -138.4102020263672, "loss": 0.1308, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2044012546539307, "rewards/margins": 3.7157578468322754, "rewards/rejected": -2.5113563537597656, "step": 3620 }, { "epoch": 1.32, "grad_norm": 27.624631894217778, "learning_rate": 4.7195504987269736e-08, "logits/chosen": -1.8666884899139404, "logits/rejected": -1.962099313735962, "logps/chosen": -92.00392150878906, "logps/rejected": -265.90936279296875, "loss": 0.1808, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.662792444229126, "rewards/margins": 4.868588924407959, "rewards/rejected": -3.205796718597412, "step": 3630 }, { "epoch": 1.32, "grad_norm": 35.3219065228648, "learning_rate": 4.6754788031998294e-08, "logits/chosen": -2.1062629222869873, "logits/rejected": -1.9411357641220093, "logps/chosen": -81.11627960205078, "logps/rejected": -207.1051483154297, "loss": 0.1747, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1783745288848877, "rewards/margins": 4.6277360916137695, "rewards/rejected": -3.4493613243103027, "step": 3640 }, { "epoch": 1.32, "grad_norm": 15.460191475491241, "learning_rate": 4.631520466387777e-08, "logits/chosen": -1.867004632949829, "logits/rejected": -1.6915152072906494, "logps/chosen": -110.6289291381836, "logps/rejected": -156.9150390625, "loss": 0.1242, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2745405435562134, "rewards/margins": 3.7722878456115723, "rewards/rejected": -2.4977474212646484, "step": 3650 }, { "epoch": 1.33, "grad_norm": 35.15290230831595, "learning_rate": 4.587677252505077e-08, "logits/chosen": -1.8647207021713257, "logits/rejected": -1.6314834356307983, "logps/chosen": -82.48086547851562, "logps/rejected": -158.58407592773438, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 1.5844959020614624, "rewards/margins": 4.390726566314697, "rewards/rejected": -2.806230068206787, "step": 3660 }, { "epoch": 1.33, "grad_norm": 31.64353347707315, "learning_rate": 4.5439509211456734e-08, "logits/chosen": -2.0931248664855957, "logits/rejected": -1.6264079809188843, "logps/chosen": -62.890602111816406, "logps/rejected": -150.46688842773438, "loss": 0.1504, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1719433069229126, "rewards/margins": 4.196671009063721, "rewards/rejected": -3.0247273445129395, "step": 3670 }, { "epoch": 1.34, "grad_norm": 14.71173714786521, "learning_rate": 4.500343227212572e-08, "logits/chosen": -1.7733246088027954, "logits/rejected": -2.041614294052124, "logps/chosen": -123.92191314697266, "logps/rejected": -155.9501953125, "loss": 0.1176, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6424921154975891, "rewards/margins": 3.105193853378296, "rewards/rejected": -2.4627020359039307, "step": 3680 }, { "epoch": 1.34, "grad_norm": 24.247587972420103, "learning_rate": 4.4568559208474127e-08, "logits/chosen": -1.6494388580322266, "logits/rejected": -1.4123470783233643, "logps/chosen": -117.4607162475586, "logps/rejected": -177.09315490722656, "loss": 0.1495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.41657859086990356, "rewards/margins": 4.236758232116699, "rewards/rejected": -3.8201801776885986, "step": 3690 }, { "epoch": 1.34, "grad_norm": 26.64386939815121, "learning_rate": 4.4134907473602205e-08, "logits/chosen": -2.0543887615203857, "logits/rejected": -1.703619360923767, "logps/chosen": -76.43885803222656, "logps/rejected": -196.35208129882812, "loss": 0.1192, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2960059642791748, "rewards/margins": 4.0435991287231445, "rewards/rejected": -2.747593641281128, "step": 3700 }, { "epoch": 1.34, "eval_logits/chosen": -2.443880319595337, "eval_logits/rejected": -2.264932155609131, "eval_logps/chosen": -91.3131332397461, "eval_logps/rejected": -153.05369567871094, "eval_loss": 0.26342639327049255, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.8643126487731934, "eval_rewards/margins": 3.6325364112854004, "eval_rewards/rejected": -2.768224000930786, "eval_runtime": 71.8523, "eval_samples_per_second": 12.387, "eval_steps_per_second": 0.195, "step": 3700 }, { "epoch": 1.35, "grad_norm": 40.03721241840345, "learning_rate": 4.370249447159372e-08, "logits/chosen": -1.892690896987915, "logits/rejected": -1.5442091226577759, "logps/chosen": -105.28565979003906, "logps/rejected": -149.9592742919922, "loss": 0.1496, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7859503030776978, "rewards/margins": 3.7989330291748047, "rewards/rejected": -3.0129826068878174, "step": 3710 }, { "epoch": 1.35, "grad_norm": 35.14173667461832, "learning_rate": 4.32713375568174e-08, "logits/chosen": -1.6806223392486572, "logits/rejected": -1.7743151187896729, "logps/chosen": -96.13143157958984, "logps/rejected": -149.65716552734375, "loss": 0.1656, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.204106092453003, "rewards/margins": 3.2453396320343018, "rewards/rejected": -2.041234016418457, "step": 3720 }, { "epoch": 1.35, "grad_norm": 18.483200235512868, "learning_rate": 4.284145403323043e-08, "logits/chosen": -1.508544683456421, "logits/rejected": -1.6680357456207275, "logps/chosen": -111.27012634277344, "logps/rejected": -159.14892578125, "loss": 0.1446, "rewards/accuracies": 1.0, "rewards/chosen": 0.998451828956604, "rewards/margins": 4.472312927246094, "rewards/rejected": -3.4738609790802, "step": 3730 }, { "epoch": 1.36, "grad_norm": 36.84141617747612, "learning_rate": 4.2412861153684e-08, "logits/chosen": -1.8184179067611694, "logits/rejected": -1.6679880619049072, "logps/chosen": -99.74363708496094, "logps/rejected": -159.52243041992188, "loss": 0.1462, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8888320922851562, "rewards/margins": 3.3356125354766846, "rewards/rejected": -2.4467806816101074, "step": 3740 }, { "epoch": 1.36, "grad_norm": 20.65082234776276, "learning_rate": 4.198557611923083e-08, "logits/chosen": -1.5405653715133667, "logits/rejected": -1.5137519836425781, "logps/chosen": -83.92024230957031, "logps/rejected": -171.1388702392578, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": 1.2103972434997559, "rewards/margins": 5.250774383544922, "rewards/rejected": -4.040377616882324, "step": 3750 }, { "epoch": 1.36, "grad_norm": 25.88209046742063, "learning_rate": 4.1559616078434946e-08, "logits/chosen": -2.0902841091156006, "logits/rejected": -1.6206400394439697, "logps/chosen": -79.99602508544922, "logps/rejected": -223.35891723632812, "loss": 0.1586, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2605037689208984, "rewards/margins": 4.679810047149658, "rewards/rejected": -3.419306993484497, "step": 3760 }, { "epoch": 1.37, "grad_norm": 31.138553957115032, "learning_rate": 4.113499812668331e-08, "logits/chosen": -2.0036697387695312, "logits/rejected": -1.8283746242523193, "logps/chosen": -75.56672668457031, "logps/rejected": -130.79147338867188, "loss": 0.1369, "rewards/accuracies": 1.0, "rewards/chosen": 0.610881507396698, "rewards/margins": 3.6262385845184326, "rewards/rejected": -3.015357255935669, "step": 3770 }, { "epoch": 1.37, "grad_norm": 22.960729088747243, "learning_rate": 4.071173930549979e-08, "logits/chosen": -1.7299280166625977, "logits/rejected": -1.4976304769515991, "logps/chosen": -94.44389343261719, "logps/rejected": -154.29798889160156, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 0.5378260612487793, "rewards/margins": 3.7743492126464844, "rewards/rejected": -3.236523151397705, "step": 3780 }, { "epoch": 1.38, "grad_norm": 28.61684578895429, "learning_rate": 4.0289856601861285e-08, "logits/chosen": -1.8740425109863281, "logits/rejected": -1.5999120473861694, "logps/chosen": -87.98422241210938, "logps/rejected": -153.87075805664062, "loss": 0.1542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4116048216819763, "rewards/margins": 3.741361141204834, "rewards/rejected": -3.32975697517395, "step": 3790 }, { "epoch": 1.38, "grad_norm": 35.76212329882805, "learning_rate": 3.9869366947515747e-08, "logits/chosen": -2.0777387619018555, "logits/rejected": -1.9436269998550415, "logps/chosen": -91.76081848144531, "logps/rejected": -149.74957275390625, "loss": 0.144, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5345284938812256, "rewards/margins": 4.457833290100098, "rewards/rejected": -2.923304557800293, "step": 3800 }, { "epoch": 1.38, "eval_logits/chosen": -2.445099115371704, "eval_logits/rejected": -2.2687337398529053, "eval_logps/chosen": -92.02857971191406, "eval_logps/rejected": -154.06690979003906, "eval_loss": 0.263904869556427, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7927670478820801, "eval_rewards/margins": 3.662313938140869, "eval_rewards/rejected": -2.869547128677368, "eval_runtime": 72.6671, "eval_samples_per_second": 12.248, "eval_steps_per_second": 0.193, "step": 3800 }, { "epoch": 1.38, "grad_norm": 34.70092608089086, "learning_rate": 3.945028721830289e-08, "logits/chosen": -1.8025611639022827, "logits/rejected": -1.4805386066436768, "logps/chosen": -94.7345962524414, "logps/rejected": -197.67295837402344, "loss": 0.1849, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.3399019241333008, "rewards/margins": 4.401653289794922, "rewards/rejected": -3.061751127243042, "step": 3810 }, { "epoch": 1.39, "grad_norm": 35.29839166964984, "learning_rate": 3.903263423347678e-08, "logits/chosen": -1.9683891534805298, "logits/rejected": -1.4604440927505493, "logps/chosen": -89.45817565917969, "logps/rejected": -148.0181121826172, "loss": 0.1761, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6148315072059631, "rewards/margins": 3.239147186279297, "rewards/rejected": -2.624316453933716, "step": 3820 }, { "epoch": 1.39, "grad_norm": 39.48429335160676, "learning_rate": 3.8616424755030845e-08, "logits/chosen": -1.5986571311950684, "logits/rejected": -1.3868935108184814, "logps/chosen": -101.1136474609375, "logps/rejected": -143.46604919433594, "loss": 0.1891, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8768807649612427, "rewards/margins": 3.3683598041534424, "rewards/rejected": -2.4914793968200684, "step": 3830 }, { "epoch": 1.39, "grad_norm": 49.23729367857185, "learning_rate": 3.820167548702516e-08, "logits/chosen": -1.5271122455596924, "logits/rejected": -1.0103445053100586, "logps/chosen": -68.83717346191406, "logps/rejected": -165.7958984375, "loss": 0.1597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0218682289123535, "rewards/margins": 5.000029563903809, "rewards/rejected": -3.978161334991455, "step": 3840 }, { "epoch": 1.4, "grad_norm": 24.868995408783775, "learning_rate": 3.778840307491595e-08, "logits/chosen": -1.700553297996521, "logits/rejected": -1.1538689136505127, "logps/chosen": -91.43379211425781, "logps/rejected": -162.2164306640625, "loss": 0.1497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1438677310943604, "rewards/margins": 4.128595352172852, "rewards/rejected": -2.9847278594970703, "step": 3850 }, { "epoch": 1.4, "grad_norm": 27.603218968201812, "learning_rate": 3.737662410488772e-08, "logits/chosen": -1.768294334411621, "logits/rejected": -1.8469886779785156, "logps/chosen": -95.86221313476562, "logps/rejected": -161.03114318847656, "loss": 0.1792, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.108917236328125, "rewards/margins": 4.2280168533325195, "rewards/rejected": -3.1190993785858154, "step": 3860 }, { "epoch": 1.4, "grad_norm": 36.9707293925801, "learning_rate": 3.696635510318747e-08, "logits/chosen": -2.072791814804077, "logits/rejected": -1.609438180923462, "logps/chosen": -96.32298278808594, "logps/rejected": -139.48504638671875, "loss": 0.1769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.095580816268921, "rewards/margins": 2.998603105545044, "rewards/rejected": -1.9030221700668335, "step": 3870 }, { "epoch": 1.41, "grad_norm": 18.130530465583483, "learning_rate": 3.655761253546142e-08, "logits/chosen": -1.739894151687622, "logits/rejected": -1.5710958242416382, "logps/chosen": -73.37211608886719, "logps/rejected": -147.71841430664062, "loss": 0.1407, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7478740215301514, "rewards/margins": 4.458775520324707, "rewards/rejected": -2.7109017372131348, "step": 3880 }, { "epoch": 1.41, "grad_norm": 41.20250396110067, "learning_rate": 3.6150412806094344e-08, "logits/chosen": -2.221762180328369, "logits/rejected": -1.7284727096557617, "logps/chosen": -87.04502868652344, "logps/rejected": -178.15328979492188, "loss": 0.1511, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5952017307281494, "rewards/margins": 4.024810791015625, "rewards/rejected": -3.4296088218688965, "step": 3890 }, { "epoch": 1.42, "grad_norm": 30.898435410912892, "learning_rate": 3.574477225755092e-08, "logits/chosen": -1.9189503192901611, "logits/rejected": -1.6017463207244873, "logps/chosen": -91.14994049072266, "logps/rejected": -140.59140014648438, "loss": 0.1603, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.5078619718551636, "rewards/margins": 3.65142560005188, "rewards/rejected": -2.143563747406006, "step": 3900 }, { "epoch": 1.42, "eval_logits/chosen": -2.447599411010742, "eval_logits/rejected": -2.271988868713379, "eval_logps/chosen": -89.99532318115234, "eval_logps/rejected": -152.36782836914062, "eval_loss": 0.2630765736103058, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": 0.9960936903953552, "eval_rewards/margins": 3.695732831954956, "eval_rewards/rejected": -2.699639081954956, "eval_runtime": 71.5499, "eval_samples_per_second": 12.439, "eval_steps_per_second": 0.196, "step": 3900 }, { "epoch": 1.42, "grad_norm": 15.623201016392487, "learning_rate": 3.534070716972011e-08, "logits/chosen": -1.98562490940094, "logits/rejected": -1.7183992862701416, "logps/chosen": -67.69291687011719, "logps/rejected": -132.70785522460938, "loss": 0.1543, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.898485541343689, "rewards/margins": 4.570878028869629, "rewards/rejected": -2.6723921298980713, "step": 3910 }, { "epoch": 1.42, "grad_norm": 29.502016145046106, "learning_rate": 3.493823375926165e-08, "logits/chosen": -2.040684938430786, "logits/rejected": -1.617761254310608, "logps/chosen": -94.72434997558594, "logps/rejected": -180.46612548828125, "loss": 0.1709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4578778743743896, "rewards/margins": 4.434561729431152, "rewards/rejected": -2.9766833782196045, "step": 3920 }, { "epoch": 1.43, "grad_norm": 43.61044884590936, "learning_rate": 3.4537368178955237e-08, "logits/chosen": -2.5114989280700684, "logits/rejected": -1.929688811302185, "logps/chosen": -63.439727783203125, "logps/rejected": -167.50003051757812, "loss": 0.1306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4715431928634644, "rewards/margins": 5.039919853210449, "rewards/rejected": -3.5683765411376953, "step": 3930 }, { "epoch": 1.43, "grad_norm": 31.861912930744854, "learning_rate": 3.4138126517052315e-08, "logits/chosen": -2.129574775695801, "logits/rejected": -2.0275707244873047, "logps/chosen": -91.8274917602539, "logps/rejected": -152.502685546875, "loss": 0.1992, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9415672421455383, "rewards/margins": 3.8349013328552246, "rewards/rejected": -2.893334150314331, "step": 3940 }, { "epoch": 1.43, "grad_norm": 10.50472857948733, "learning_rate": 3.374052479663024e-08, "logits/chosen": -1.59491765499115, "logits/rejected": -1.3649214506149292, "logps/chosen": -104.72843933105469, "logps/rejected": -193.97207641601562, "loss": 0.1731, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1260311603546143, "rewards/margins": 5.22696590423584, "rewards/rejected": -4.1009345054626465, "step": 3950 }, { "epoch": 1.44, "grad_norm": 23.4008383750431, "learning_rate": 3.334457897494941e-08, "logits/chosen": -1.9443241357803345, "logits/rejected": -1.6959731578826904, "logps/chosen": -85.87373352050781, "logps/rejected": -158.98065185546875, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 1.3743432760238647, "rewards/margins": 4.179642677307129, "rewards/rejected": -2.8052992820739746, "step": 3960 }, { "epoch": 1.44, "grad_norm": 33.938523290472, "learning_rate": 3.29503049428127e-08, "logits/chosen": -1.9471511840820312, "logits/rejected": -1.7361555099487305, "logps/chosen": -84.15934753417969, "logps/rejected": -153.57553100585938, "loss": 0.1711, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.410812497138977, "rewards/margins": 4.902033805847168, "rewards/rejected": -3.4912219047546387, "step": 3970 }, { "epoch": 1.44, "grad_norm": 42.88422707114712, "learning_rate": 3.255771852392775e-08, "logits/chosen": -1.8716990947723389, "logits/rejected": -1.7490049600601196, "logps/chosen": -93.93757629394531, "logps/rejected": -118.72102355957031, "loss": 0.1712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7341419458389282, "rewards/margins": 2.5488927364349365, "rewards/rejected": -1.8147506713867188, "step": 3980 }, { "epoch": 1.45, "grad_norm": 31.96232509403753, "learning_rate": 3.2166835474271995e-08, "logits/chosen": -2.2499547004699707, "logits/rejected": -1.7057373523712158, "logps/chosen": -70.68228912353516, "logps/rejected": -136.07669067382812, "loss": 0.1381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8701726794242859, "rewards/margins": 2.541351556777954, "rewards/rejected": -1.671179175376892, "step": 3990 }, { "epoch": 1.45, "grad_norm": 33.013579048970634, "learning_rate": 3.177767148146004e-08, "logits/chosen": -1.196561336517334, "logits/rejected": -0.9995512962341309, "logps/chosen": -89.14258575439453, "logps/rejected": -131.5675048828125, "loss": 0.2054, "rewards/accuracies": 1.0, "rewards/chosen": 0.6014581918716431, "rewards/margins": 2.8417820930480957, "rewards/rejected": -2.240324020385742, "step": 4000 }, { "epoch": 1.45, "eval_logits/chosen": -2.5068588256835938, "eval_logits/rejected": -2.316378116607666, "eval_logps/chosen": -92.20320129394531, "eval_logps/rejected": -153.47879028320312, "eval_loss": 0.25913339853286743, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7753064036369324, "eval_rewards/margins": 3.5860416889190674, "eval_rewards/rejected": -2.8107354640960693, "eval_runtime": 71.5791, "eval_samples_per_second": 12.434, "eval_steps_per_second": 0.196, "step": 4000 }, { "epoch": 1.46, "grad_norm": 29.18650004448433, "learning_rate": 3.139024216411438e-08, "logits/chosen": -1.9775199890136719, "logits/rejected": -1.7694860696792603, "logps/chosen": -74.32111358642578, "logps/rejected": -114.33467102050781, "loss": 0.137, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9357797503471375, "rewards/margins": 3.2922019958496094, "rewards/rejected": -2.356421947479248, "step": 4010 }, { "epoch": 1.46, "grad_norm": 29.240217688028995, "learning_rate": 3.100456307123838e-08, "logits/chosen": -2.0903449058532715, "logits/rejected": -1.7641003131866455, "logps/chosen": -81.42185974121094, "logps/rejected": -165.4561309814453, "loss": 0.1658, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1881463527679443, "rewards/margins": 4.836016654968262, "rewards/rejected": -3.6478705406188965, "step": 4020 }, { "epoch": 1.46, "grad_norm": 30.89971434639723, "learning_rate": 3.062064968159231e-08, "logits/chosen": -2.3161113262176514, "logits/rejected": -1.780773401260376, "logps/chosen": -84.45279693603516, "logps/rejected": -165.2125701904297, "loss": 0.1622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.193415880203247, "rewards/margins": 4.027709484100342, "rewards/rejected": -2.834293842315674, "step": 4030 }, { "epoch": 1.47, "grad_norm": 17.28427871490954, "learning_rate": 3.023851740307201e-08, "logits/chosen": -2.0731730461120605, "logits/rejected": -1.6455036401748657, "logps/chosen": -110.5876693725586, "logps/rejected": -180.8351287841797, "loss": 0.1633, "rewards/accuracies": 1.0, "rewards/chosen": 0.24699239432811737, "rewards/margins": 3.7656092643737793, "rewards/rejected": -3.5186171531677246, "step": 4040 }, { "epoch": 1.47, "grad_norm": 25.73309666117762, "learning_rate": 2.9858181572090675e-08, "logits/chosen": -1.7583844661712646, "logits/rejected": -1.8522754907608032, "logps/chosen": -94.708251953125, "logps/rejected": -162.52867126464844, "loss": 0.1325, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0252021551132202, "rewards/margins": 3.6236395835876465, "rewards/rejected": -2.5984373092651367, "step": 4050 }, { "epoch": 1.47, "grad_norm": 31.111251255931403, "learning_rate": 2.9479657452963256e-08, "logits/chosen": -2.248737096786499, "logits/rejected": -1.7086880207061768, "logps/chosen": -104.32584381103516, "logps/rejected": -170.23004150390625, "loss": 0.151, "rewards/accuracies": 1.0, "rewards/chosen": 0.08391883969306946, "rewards/margins": 3.075791835784912, "rewards/rejected": -2.991872787475586, "step": 4060 }, { "epoch": 1.48, "grad_norm": 18.513737492370375, "learning_rate": 2.910296023729384e-08, "logits/chosen": -1.4177262783050537, "logits/rejected": -1.364580750465393, "logps/chosen": -89.30236053466797, "logps/rejected": -147.00576782226562, "loss": 0.1544, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0421960353851318, "rewards/margins": 3.4464340209960938, "rewards/rejected": -2.404238224029541, "step": 4070 }, { "epoch": 1.48, "grad_norm": 45.87936243907442, "learning_rate": 2.8728105043365984e-08, "logits/chosen": -1.8459665775299072, "logits/rejected": -1.5093791484832764, "logps/chosen": -97.5422592163086, "logps/rejected": -131.14620971679688, "loss": 0.1398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9450712203979492, "rewards/margins": 3.339611530303955, "rewards/rejected": -2.394540309906006, "step": 4080 }, { "epoch": 1.48, "grad_norm": 27.364987131956532, "learning_rate": 2.8355106915535932e-08, "logits/chosen": -1.9753601551055908, "logits/rejected": -1.554747462272644, "logps/chosen": -72.57929229736328, "logps/rejected": -136.12213134765625, "loss": 0.135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8972055315971375, "rewards/margins": 4.053958415985107, "rewards/rejected": -3.156752824783325, "step": 4090 }, { "epoch": 1.49, "grad_norm": 32.389832751742276, "learning_rate": 2.798398082362886e-08, "logits/chosen": -1.6929162740707397, "logits/rejected": -1.7215791940689087, "logps/chosen": -115.38690185546875, "logps/rejected": -146.203125, "loss": 0.1413, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.22363097965717316, "rewards/margins": 3.6949760913848877, "rewards/rejected": -3.4713454246520996, "step": 4100 }, { "epoch": 1.49, "eval_logits/chosen": -2.485478639602661, "eval_logits/rejected": -2.2992606163024902, "eval_logps/chosen": -91.98760223388672, "eval_logps/rejected": -154.0671844482422, "eval_loss": 0.26037731766700745, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.796866238117218, "eval_rewards/margins": 3.6664421558380127, "eval_rewards/rejected": -2.8695759773254395, "eval_runtime": 71.6081, "eval_samples_per_second": 12.429, "eval_steps_per_second": 0.196, "step": 4100 }, { "epoch": 1.49, "grad_norm": 14.2142969265953, "learning_rate": 2.761474166233805e-08, "logits/chosen": -1.9014062881469727, "logits/rejected": -1.554809331893921, "logps/chosen": -91.71360778808594, "logps/rejected": -198.91380310058594, "loss": 0.1107, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8375331163406372, "rewards/margins": 5.089506149291992, "rewards/rejected": -4.2519731521606445, "step": 4110 }, { "epoch": 1.5, "grad_norm": 36.46497820122211, "learning_rate": 2.724740425062714e-08, "logits/chosen": -1.5665128231048584, "logits/rejected": -1.6638376712799072, "logps/chosen": -97.43113708496094, "logps/rejected": -185.1346893310547, "loss": 0.1309, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.911544680595398, "rewards/margins": 4.338427543640137, "rewards/rejected": -3.4268829822540283, "step": 4120 }, { "epoch": 1.5, "grad_norm": 50.17754349555712, "learning_rate": 2.6881983331135378e-08, "logits/chosen": -1.887635588645935, "logits/rejected": -1.7573896646499634, "logps/chosen": -80.06953430175781, "logps/rejected": -133.02284240722656, "loss": 0.1722, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.585761547088623, "rewards/margins": 3.1335628032684326, "rewards/rejected": -2.5478012561798096, "step": 4130 }, { "epoch": 1.5, "grad_norm": 22.44755921748364, "learning_rate": 2.6518493569585857e-08, "logits/chosen": -1.984948754310608, "logits/rejected": -2.0342633724212646, "logps/chosen": -114.79627990722656, "logps/rejected": -142.2074432373047, "loss": 0.199, "rewards/accuracies": 0.75, "rewards/chosen": 0.6468585729598999, "rewards/margins": 2.1931614875793457, "rewards/rejected": -1.546303153038025, "step": 4140 }, { "epoch": 1.51, "grad_norm": 24.955293029439613, "learning_rate": 2.6156949554197095e-08, "logits/chosen": -2.307645797729492, "logits/rejected": -1.599973201751709, "logps/chosen": -90.35932922363281, "logps/rejected": -148.82672119140625, "loss": 0.1611, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5787138938903809, "rewards/margins": 3.0710272789001465, "rewards/rejected": -2.492313861846924, "step": 4150 }, { "epoch": 1.51, "grad_norm": 36.71467993455758, "learning_rate": 2.5797365795097407e-08, "logits/chosen": -1.9828819036483765, "logits/rejected": -1.6299062967300415, "logps/chosen": -103.05985260009766, "logps/rejected": -187.84640502929688, "loss": 0.1561, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7202814817428589, "rewards/margins": 4.248810768127441, "rewards/rejected": -3.528529405593872, "step": 4160 }, { "epoch": 1.51, "grad_norm": 25.40867584669409, "learning_rate": 2.543975672374264e-08, "logits/chosen": -1.7713234424591064, "logits/rejected": -1.78484308719635, "logps/chosen": -91.5001220703125, "logps/rejected": -149.6800079345703, "loss": 0.1647, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5808213353157043, "rewards/margins": 3.1474668979644775, "rewards/rejected": -2.566645383834839, "step": 4170 }, { "epoch": 1.52, "grad_norm": 19.8252963506834, "learning_rate": 2.5084136692336926e-08, "logits/chosen": -1.3749628067016602, "logits/rejected": -1.4281375408172607, "logps/chosen": -70.6727066040039, "logps/rejected": -277.53790283203125, "loss": 0.1486, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1151177883148193, "rewards/margins": 4.473038196563721, "rewards/rejected": -3.3579201698303223, "step": 4180 }, { "epoch": 1.52, "grad_norm": 22.788082275545953, "learning_rate": 2.4730519973256725e-08, "logits/chosen": -1.382116436958313, "logits/rejected": -1.2883073091506958, "logps/chosen": -77.42723083496094, "logps/rejected": -168.14895629882812, "loss": 0.1365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9808204770088196, "rewards/margins": 4.480748176574707, "rewards/rejected": -3.499927520751953, "step": 4190 }, { "epoch": 1.52, "grad_norm": 17.55083100932304, "learning_rate": 2.4378920758477996e-08, "logits/chosen": -2.1403841972351074, "logits/rejected": -1.4080195426940918, "logps/chosen": -69.2994384765625, "logps/rejected": -133.26760864257812, "loss": 0.1498, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.458009958267212, "rewards/margins": 3.704465389251709, "rewards/rejected": -2.246455669403076, "step": 4200 }, { "epoch": 1.52, "eval_logits/chosen": -2.486311435699463, "eval_logits/rejected": -2.3020877838134766, "eval_logps/chosen": -91.95867919921875, "eval_logps/rejected": -154.05789184570312, "eval_loss": 0.2579483687877655, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7997574210166931, "eval_rewards/margins": 3.6684017181396484, "eval_rewards/rejected": -2.8686444759368896, "eval_runtime": 71.505, "eval_samples_per_second": 12.447, "eval_steps_per_second": 0.196, "step": 4200 }, { "epoch": 1.53, "grad_norm": 35.97397604275955, "learning_rate": 2.4029353159006606e-08, "logits/chosen": -1.889622688293457, "logits/rejected": -1.7178815603256226, "logps/chosen": -70.93376159667969, "logps/rejected": -144.45614624023438, "loss": 0.1486, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.563561201095581, "rewards/margins": 4.030331611633301, "rewards/rejected": -2.466770648956299, "step": 4210 }, { "epoch": 1.53, "grad_norm": 25.472759515203542, "learning_rate": 2.368183120431205e-08, "logits/chosen": -1.6090589761734009, "logits/rejected": -1.5134804248809814, "logps/chosen": -93.47647857666016, "logps/rejected": -154.41531372070312, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": 1.2347394227981567, "rewards/margins": 4.24044942855835, "rewards/rejected": -3.0057103633880615, "step": 4220 }, { "epoch": 1.54, "grad_norm": 26.748913628743217, "learning_rate": 2.3336368841764356e-08, "logits/chosen": -1.6902498006820679, "logits/rejected": -1.2772510051727295, "logps/chosen": -72.32806396484375, "logps/rejected": -127.91917419433594, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 1.5376206636428833, "rewards/margins": 3.2143516540527344, "rewards/rejected": -1.676730751991272, "step": 4230 }, { "epoch": 1.54, "grad_norm": 38.70782633546678, "learning_rate": 2.2992979936074264e-08, "logits/chosen": -2.003537654876709, "logits/rejected": -1.539896011352539, "logps/chosen": -107.56465148925781, "logps/rejected": -155.28176879882812, "loss": 0.1745, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1425919532775879, "rewards/margins": 2.318450927734375, "rewards/rejected": -2.461042881011963, "step": 4240 }, { "epoch": 1.54, "grad_norm": 29.839188839127143, "learning_rate": 2.2651678268736942e-08, "logits/chosen": -1.5676755905151367, "logits/rejected": -1.5963351726531982, "logps/chosen": -117.62342834472656, "logps/rejected": -148.70346069335938, "loss": 0.1028, "rewards/accuracies": 1.0, "rewards/chosen": 0.9985088109970093, "rewards/margins": 3.8390114307403564, "rewards/rejected": -2.840503215789795, "step": 4250 }, { "epoch": 1.55, "grad_norm": 19.326703363193936, "learning_rate": 2.2312477537478763e-08, "logits/chosen": -2.033979892730713, "logits/rejected": -1.5905263423919678, "logps/chosen": -78.28575134277344, "logps/rejected": -167.78639221191406, "loss": 0.1449, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2223570346832275, "rewards/margins": 4.399596214294434, "rewards/rejected": -3.1772396564483643, "step": 4260 }, { "epoch": 1.55, "grad_norm": 34.56110655069492, "learning_rate": 2.1975391355707567e-08, "logits/chosen": -2.090172529220581, "logits/rejected": -1.7063558101654053, "logps/chosen": -69.6578369140625, "logps/rejected": -146.59194946289062, "loss": 0.1482, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4781410694122314, "rewards/margins": 4.314321994781494, "rewards/rejected": -2.836181163787842, "step": 4270 }, { "epoch": 1.55, "grad_norm": 37.15442072437241, "learning_rate": 2.164043325196635e-08, "logits/chosen": -1.6971238851547241, "logits/rejected": -1.8592636585235596, "logps/chosen": -82.22840881347656, "logps/rejected": -163.99058532714844, "loss": 0.134, "rewards/accuracies": 1.0, "rewards/chosen": 1.3080239295959473, "rewards/margins": 3.911048412322998, "rewards/rejected": -2.60302472114563, "step": 4280 }, { "epoch": 1.56, "grad_norm": 22.652340412985147, "learning_rate": 2.13076166693903e-08, "logits/chosen": -2.017364501953125, "logits/rejected": -1.9596898555755615, "logps/chosen": -100.14933776855469, "logps/rejected": -186.43014526367188, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 1.3219308853149414, "rewards/margins": 4.975711822509766, "rewards/rejected": -3.653780698776245, "step": 4290 }, { "epoch": 1.56, "grad_norm": 26.988430640272753, "learning_rate": 2.0976954965167228e-08, "logits/chosen": -2.23913311958313, "logits/rejected": -2.0381526947021484, "logps/chosen": -100.4576644897461, "logps/rejected": -205.36434936523438, "loss": 0.174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9682433009147644, "rewards/margins": 4.4989728927612305, "rewards/rejected": -3.5307304859161377, "step": 4300 }, { "epoch": 1.56, "eval_logits/chosen": -2.39996600151062, "eval_logits/rejected": -2.239955186843872, "eval_logps/chosen": -92.15650939941406, "eval_logps/rejected": -155.64747619628906, "eval_loss": 0.26092293858528137, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7799752354621887, "eval_rewards/margins": 3.8075778484344482, "eval_rewards/rejected": -3.0276029109954834, "eval_runtime": 71.3862, "eval_samples_per_second": 12.467, "eval_steps_per_second": 0.196, "step": 4300 }, { "epoch": 1.56, "grad_norm": 20.13961405134126, "learning_rate": 2.064846141000156e-08, "logits/chosen": -1.785154104232788, "logits/rejected": -1.782621145248413, "logps/chosen": -100.51982116699219, "logps/rejected": -170.2032012939453, "loss": 0.1628, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1105843782424927, "rewards/margins": 4.3706769943237305, "rewards/rejected": -3.260092258453369, "step": 4310 }, { "epoch": 1.57, "grad_norm": 26.57133171694419, "learning_rate": 2.0322149187581696e-08, "logits/chosen": -2.1262426376342773, "logits/rejected": -1.7420053482055664, "logps/chosen": -85.6638412475586, "logps/rejected": -192.35430908203125, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 0.9371218681335449, "rewards/margins": 5.653896331787109, "rewards/rejected": -4.7167744636535645, "step": 4320 }, { "epoch": 1.57, "grad_norm": 30.486425514760217, "learning_rate": 1.9998031394050925e-08, "logits/chosen": -2.0862393379211426, "logits/rejected": -1.73501718044281, "logps/chosen": -110.37203216552734, "logps/rejected": -184.22003173828125, "loss": 0.1412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.701205849647522, "rewards/margins": 4.286178112030029, "rewards/rejected": -3.584972381591797, "step": 4330 }, { "epoch": 1.58, "grad_norm": 29.613748447060853, "learning_rate": 1.9676121037481733e-08, "logits/chosen": -1.8443056344985962, "logits/rejected": -1.37973153591156, "logps/chosen": -87.62504577636719, "logps/rejected": -148.2349090576172, "loss": 0.1641, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0517264604568481, "rewards/margins": 3.312018632888794, "rewards/rejected": -2.2602920532226562, "step": 4340 }, { "epoch": 1.58, "grad_norm": 38.941503925465945, "learning_rate": 1.935643103735389e-08, "logits/chosen": -2.3647444248199463, "logits/rejected": -1.6877624988555908, "logps/chosen": -78.06304168701172, "logps/rejected": -170.48617553710938, "loss": 0.1479, "rewards/accuracies": 1.0, "rewards/chosen": 1.1019824743270874, "rewards/margins": 4.5098443031311035, "rewards/rejected": -3.4078621864318848, "step": 4350 }, { "epoch": 1.58, "grad_norm": 50.92032810215081, "learning_rate": 1.9038974224035848e-08, "logits/chosen": -1.945586919784546, "logits/rejected": -1.535428762435913, "logps/chosen": -93.87750244140625, "logps/rejected": -185.80007934570312, "loss": 0.1745, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7316128611564636, "rewards/margins": 3.479790210723877, "rewards/rejected": -2.7481772899627686, "step": 4360 }, { "epoch": 1.59, "grad_norm": 33.56498772715042, "learning_rate": 1.8723763338269824e-08, "logits/chosen": -2.1573054790496826, "logits/rejected": -1.6914573907852173, "logps/chosen": -97.86043548583984, "logps/rejected": -163.97647094726562, "loss": 0.1815, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.35831624269485474, "rewards/margins": 3.7290472984313965, "rewards/rejected": -3.3707308769226074, "step": 4370 }, { "epoch": 1.59, "grad_norm": 26.65376761598817, "learning_rate": 1.8410811030660466e-08, "logits/chosen": -1.8529369831085205, "logits/rejected": -1.8225460052490234, "logps/chosen": -126.21983337402344, "logps/rejected": -171.356689453125, "loss": 0.1578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.27563539147377014, "rewards/margins": 3.433450222015381, "rewards/rejected": -3.1578147411346436, "step": 4380 }, { "epoch": 1.59, "grad_norm": 41.658972724370116, "learning_rate": 1.810012986116715e-08, "logits/chosen": -2.1115715503692627, "logits/rejected": -1.5005934238433838, "logps/chosen": -78.10107421875, "logps/rejected": -153.611572265625, "loss": 0.1355, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1123895645141602, "rewards/margins": 4.308642387390137, "rewards/rejected": -3.1962530612945557, "step": 4390 }, { "epoch": 1.6, "grad_norm": 23.104818947703635, "learning_rate": 1.7791732298599888e-08, "logits/chosen": -1.8663402795791626, "logits/rejected": -1.7844841480255127, "logps/chosen": -83.04045104980469, "logps/rejected": -153.29029846191406, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/chosen": 1.5516669750213623, "rewards/margins": 4.48703670501709, "rewards/rejected": -2.9353702068328857, "step": 4400 }, { "epoch": 1.6, "eval_logits/chosen": -2.4757511615753174, "eval_logits/rejected": -2.2932116985321045, "eval_logps/chosen": -92.66739654541016, "eval_logps/rejected": -155.24755859375, "eval_loss": 0.2576039731502533, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7288867235183716, "eval_rewards/margins": 3.716498851776123, "eval_rewards/rejected": -2.987612009048462, "eval_runtime": 71.4688, "eval_samples_per_second": 12.453, "eval_steps_per_second": 0.196, "step": 4400 }, { "epoch": 1.6, "grad_norm": 30.602138050700304, "learning_rate": 1.7485630720118904e-08, "logits/chosen": -1.7544384002685547, "logits/rejected": -1.5731135606765747, "logps/chosen": -95.06303405761719, "logps/rejected": -155.60504150390625, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 0.6706847548484802, "rewards/margins": 3.354102373123169, "rewards/rejected": -2.683417320251465, "step": 4410 }, { "epoch": 1.6, "grad_norm": 24.591287815914352, "learning_rate": 1.7181837410737932e-08, "logits/chosen": -1.6455243825912476, "logits/rejected": -1.8114473819732666, "logps/chosen": -91.33721923828125, "logps/rejected": -158.2550811767578, "loss": 0.1183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6706674098968506, "rewards/margins": 3.6940701007843018, "rewards/rejected": -2.023402690887451, "step": 4420 }, { "epoch": 1.61, "grad_norm": 28.285812502384303, "learning_rate": 1.688036456283108e-08, "logits/chosen": -1.4960906505584717, "logits/rejected": -1.562246322631836, "logps/chosen": -92.74504089355469, "logps/rejected": -129.95484924316406, "loss": 0.1344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7566136121749878, "rewards/margins": 3.488935947418213, "rewards/rejected": -2.7323222160339355, "step": 4430 }, { "epoch": 1.61, "grad_norm": 21.99612078068548, "learning_rate": 1.6581224275643602e-08, "logits/chosen": -2.4308037757873535, "logits/rejected": -2.031317710876465, "logps/chosen": -81.02774810791016, "logps/rejected": -149.41329956054688, "loss": 0.124, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0930092334747314, "rewards/margins": 3.704552173614502, "rewards/rejected": -2.6115427017211914, "step": 4440 }, { "epoch": 1.62, "grad_norm": 37.76234741898126, "learning_rate": 1.6284428554806282e-08, "logits/chosen": -1.8159290552139282, "logits/rejected": -1.7262006998062134, "logps/chosen": -84.27841186523438, "logps/rejected": -133.88829040527344, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": 0.7679897546768188, "rewards/margins": 3.5734684467315674, "rewards/rejected": -2.805478572845459, "step": 4450 }, { "epoch": 1.62, "grad_norm": 20.751714992592504, "learning_rate": 1.598998931185358e-08, "logits/chosen": -1.6911875009536743, "logits/rejected": -1.4998903274536133, "logps/chosen": -85.60092163085938, "logps/rejected": -157.67355346679688, "loss": 0.1644, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.2675038576126099, "rewards/margins": 3.786600112915039, "rewards/rejected": -2.5190961360931396, "step": 4460 }, { "epoch": 1.62, "grad_norm": 31.961939419295213, "learning_rate": 1.5697918363745567e-08, "logits/chosen": -1.8280389308929443, "logits/rejected": -1.5867749452590942, "logps/chosen": -108.87247467041016, "logps/rejected": -214.6339111328125, "loss": 0.1564, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9745191335678101, "rewards/margins": 5.607847213745117, "rewards/rejected": -4.633328437805176, "step": 4470 }, { "epoch": 1.63, "grad_norm": 38.17110938660031, "learning_rate": 1.5408227432393714e-08, "logits/chosen": -2.4660305976867676, "logits/rejected": -1.5764108896255493, "logps/chosen": -89.4808120727539, "logps/rejected": -156.589111328125, "loss": 0.1159, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1624079942703247, "rewards/margins": 3.693568706512451, "rewards/rejected": -2.531160593032837, "step": 4480 }, { "epoch": 1.63, "grad_norm": 36.10544360692532, "learning_rate": 1.5120928144190412e-08, "logits/chosen": -1.9956271648406982, "logits/rejected": -1.7658560276031494, "logps/chosen": -117.41062927246094, "logps/rejected": -145.37120056152344, "loss": 0.1571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.006312882993370295, "rewards/margins": 2.9415173530578613, "rewards/rejected": -2.93520450592041, "step": 4490 }, { "epoch": 1.63, "grad_norm": 27.729025187636683, "learning_rate": 1.483603202954238e-08, "logits/chosen": -1.9484901428222656, "logits/rejected": -1.8669170141220093, "logps/chosen": -92.9903335571289, "logps/rejected": -148.825927734375, "loss": 0.1424, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7940378189086914, "rewards/margins": 3.46022367477417, "rewards/rejected": -2.6661858558654785, "step": 4500 }, { "epoch": 1.63, "eval_logits/chosen": -2.4859211444854736, "eval_logits/rejected": -2.302330732345581, "eval_logps/chosen": -92.06935119628906, "eval_logps/rejected": -154.83358764648438, "eval_loss": 0.2585107982158661, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7886915802955627, "eval_rewards/margins": 3.734905242919922, "eval_rewards/rejected": -2.9462132453918457, "eval_runtime": 71.5035, "eval_samples_per_second": 12.447, "eval_steps_per_second": 0.196, "step": 4500 }, { "epoch": 1.64, "grad_norm": 20.03094242415938, "learning_rate": 1.4553550522407868e-08, "logits/chosen": -1.8428974151611328, "logits/rejected": -1.440744161605835, "logps/chosen": -84.1056137084961, "logps/rejected": -152.34756469726562, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 1.6048812866210938, "rewards/margins": 4.184465408325195, "rewards/rejected": -2.5795845985412598, "step": 4510 }, { "epoch": 1.64, "grad_norm": 34.20593083265495, "learning_rate": 1.4273494959837854e-08, "logits/chosen": -1.979557752609253, "logits/rejected": -1.8085553646087646, "logps/chosen": -87.48872375488281, "logps/rejected": -144.94422912597656, "loss": 0.1596, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7250168323516846, "rewards/margins": 3.676483154296875, "rewards/rejected": -1.9514662027359009, "step": 4520 }, { "epoch": 1.64, "grad_norm": 15.481458090321123, "learning_rate": 1.3995876581520893e-08, "logits/chosen": -1.636747121810913, "logits/rejected": -1.7493568658828735, "logps/chosen": -104.45011901855469, "logps/rejected": -178.22219848632812, "loss": 0.1425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7096387147903442, "rewards/margins": 4.242130756378174, "rewards/rejected": -3.5324923992156982, "step": 4530 }, { "epoch": 1.65, "grad_norm": 26.973245248163447, "learning_rate": 1.3720706529332202e-08, "logits/chosen": -2.262129545211792, "logits/rejected": -1.6040267944335938, "logps/chosen": -77.4686279296875, "logps/rejected": -172.17333984375, "loss": 0.1379, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.497057318687439, "rewards/margins": 4.911464691162109, "rewards/rejected": -3.414407253265381, "step": 4540 }, { "epoch": 1.65, "grad_norm": 24.948661161132325, "learning_rate": 1.3447995846886393e-08, "logits/chosen": -1.9659942388534546, "logits/rejected": -1.7948821783065796, "logps/chosen": -74.55500793457031, "logps/rejected": -173.8816680908203, "loss": 0.149, "rewards/accuracies": 1.0, "rewards/chosen": 1.5743433237075806, "rewards/margins": 5.000136375427246, "rewards/rejected": -3.425793409347534, "step": 4550 }, { "epoch": 1.66, "grad_norm": 27.776188946775548, "learning_rate": 1.317775547909426e-08, "logits/chosen": -1.6119670867919922, "logits/rejected": -1.385824203491211, "logps/chosen": -96.07972717285156, "logps/rejected": -175.02867126464844, "loss": 0.1963, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7563056945800781, "rewards/margins": 3.682525634765625, "rewards/rejected": -2.926219940185547, "step": 4560 }, { "epoch": 1.66, "grad_norm": 43.370687625677974, "learning_rate": 1.2909996271723539e-08, "logits/chosen": -2.330936908721924, "logits/rejected": -1.7249195575714111, "logps/chosen": -81.22795104980469, "logps/rejected": -180.45004272460938, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 1.1179533004760742, "rewards/margins": 4.484683990478516, "rewards/rejected": -3.3667304515838623, "step": 4570 }, { "epoch": 1.66, "grad_norm": 52.192177435639636, "learning_rate": 1.2644728970963616e-08, "logits/chosen": -2.1745333671569824, "logits/rejected": -1.5888748168945312, "logps/chosen": -76.57296752929688, "logps/rejected": -189.40585327148438, "loss": 0.1714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.506654143333435, "rewards/margins": 5.129171371459961, "rewards/rejected": -3.6225173473358154, "step": 4580 }, { "epoch": 1.67, "grad_norm": 15.461194155069908, "learning_rate": 1.2381964222994248e-08, "logits/chosen": -1.6822162866592407, "logits/rejected": -1.4808707237243652, "logps/chosen": -88.69744110107422, "logps/rejected": -152.0595245361328, "loss": 0.1573, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7915393710136414, "rewards/margins": 4.33894157409668, "rewards/rejected": -3.5474014282226562, "step": 4590 }, { "epoch": 1.67, "grad_norm": 30.84506660543241, "learning_rate": 1.2121712573558262e-08, "logits/chosen": -1.8852580785751343, "logits/rejected": -1.933457374572754, "logps/chosen": -88.162841796875, "logps/rejected": -145.27273559570312, "loss": 0.1531, "rewards/accuracies": 1.0, "rewards/chosen": 1.409010410308838, "rewards/margins": 4.220501899719238, "rewards/rejected": -2.8114914894104004, "step": 4600 }, { "epoch": 1.67, "eval_logits/chosen": -2.4970266819000244, "eval_logits/rejected": -2.308969020843506, "eval_logps/chosen": -92.29559326171875, "eval_logps/rejected": -154.68136596679688, "eval_loss": 0.257046639919281, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7660664319992065, "eval_rewards/margins": 3.6970560550689697, "eval_rewards/rejected": -2.930989980697632, "eval_runtime": 71.5124, "eval_samples_per_second": 12.445, "eval_steps_per_second": 0.196, "step": 4600 }, { "epoch": 1.67, "grad_norm": 32.065023239868225, "learning_rate": 1.1863984467538368e-08, "logits/chosen": -1.5617568492889404, "logits/rejected": -1.5484387874603271, "logps/chosen": -84.5037612915039, "logps/rejected": -140.86459350585938, "loss": 0.1778, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0827562808990479, "rewards/margins": 4.16703987121582, "rewards/rejected": -3.0842833518981934, "step": 4610 }, { "epoch": 1.68, "grad_norm": 40.53992535432375, "learning_rate": 1.1608790248537947e-08, "logits/chosen": -1.849543809890747, "logits/rejected": -1.3818111419677734, "logps/chosen": -98.61241912841797, "logps/rejected": -204.64102172851562, "loss": 0.1815, "rewards/accuracies": 1.0, "rewards/chosen": 0.39084136486053467, "rewards/margins": 4.119144916534424, "rewards/rejected": -3.7283034324645996, "step": 4620 }, { "epoch": 1.68, "grad_norm": 44.60677557796461, "learning_rate": 1.1356140158465846e-08, "logits/chosen": -1.6492226123809814, "logits/rejected": -1.5845674276351929, "logps/chosen": -99.52052307128906, "logps/rejected": -152.729248046875, "loss": 0.152, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.465355783700943, "rewards/margins": 3.311537981033325, "rewards/rejected": -2.846181631088257, "step": 4630 }, { "epoch": 1.68, "grad_norm": 38.975584748722106, "learning_rate": 1.1106044337125478e-08, "logits/chosen": -1.9782928228378296, "logits/rejected": -1.9134845733642578, "logps/chosen": -93.96483612060547, "logps/rejected": -147.70205688476562, "loss": 0.1498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3415434956550598, "rewards/margins": 3.8670287132263184, "rewards/rejected": -3.5254852771759033, "step": 4640 }, { "epoch": 1.69, "grad_norm": 30.98909384785237, "learning_rate": 1.0858512821807742e-08, "logits/chosen": -1.6544148921966553, "logits/rejected": -1.434407114982605, "logps/chosen": -97.20882415771484, "logps/rejected": -143.4949188232422, "loss": 0.184, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9800602793693542, "rewards/margins": 2.9634454250335693, "rewards/rejected": -1.9833850860595703, "step": 4650 }, { "epoch": 1.69, "grad_norm": 37.51557850755165, "learning_rate": 1.0613555546888275e-08, "logits/chosen": -1.5016518831253052, "logits/rejected": -1.68484628200531, "logps/chosen": -88.76266479492188, "logps/rejected": -165.8483123779297, "loss": 0.1375, "rewards/accuracies": 1.0, "rewards/chosen": 0.8447409868240356, "rewards/margins": 4.642016887664795, "rewards/rejected": -3.797276020050049, "step": 4660 }, { "epoch": 1.7, "grad_norm": 38.74297206743728, "learning_rate": 1.0371182343428694e-08, "logits/chosen": -1.381477952003479, "logits/rejected": -2.027268171310425, "logps/chosen": -98.62785339355469, "logps/rejected": -132.6935577392578, "loss": 0.1417, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.7803923487663269, "rewards/margins": 3.797145366668701, "rewards/rejected": -3.0167534351348877, "step": 4670 }, { "epoch": 1.7, "grad_norm": 37.59159294477782, "learning_rate": 1.0131402938782063e-08, "logits/chosen": -1.829708456993103, "logits/rejected": -1.5681079626083374, "logps/chosen": -89.08270263671875, "logps/rejected": -157.6560821533203, "loss": 0.1658, "rewards/accuracies": 1.0, "rewards/chosen": 1.0118838548660278, "rewards/margins": 4.21151065826416, "rewards/rejected": -3.19962739944458, "step": 4680 }, { "epoch": 1.7, "grad_norm": 18.933900384534923, "learning_rate": 9.894226956202484e-09, "logits/chosen": -1.6854585409164429, "logits/rejected": -1.6041696071624756, "logps/chosen": -87.95982360839844, "logps/rejected": -164.80075073242188, "loss": 0.1541, "rewards/accuracies": 1.0, "rewards/chosen": 1.1225395202636719, "rewards/margins": 4.531619071960449, "rewards/rejected": -3.4090800285339355, "step": 4690 }, { "epoch": 1.71, "grad_norm": 25.177344401658505, "learning_rate": 9.659663914458913e-09, "logits/chosen": -1.7834850549697876, "logits/rejected": -1.7646923065185547, "logps/chosen": -72.14763641357422, "logps/rejected": -126.3357162475586, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 1.4011167287826538, "rewards/margins": 3.857720136642456, "rewards/rejected": -2.4566032886505127, "step": 4700 }, { "epoch": 1.71, "eval_logits/chosen": -2.4951701164245605, "eval_logits/rejected": -2.3085877895355225, "eval_logps/chosen": -93.2258071899414, "eval_logps/rejected": -155.58546447753906, "eval_loss": 0.25638025999069214, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.6730464100837708, "eval_rewards/margins": 3.6944470405578613, "eval_rewards/rejected": -3.0214006900787354, "eval_runtime": 71.5377, "eval_samples_per_second": 12.441, "eval_steps_per_second": 0.196, "step": 4700 }, { "epoch": 1.71, "grad_norm": 36.30094639660896, "learning_rate": 9.427723227453092e-09, "logits/chosen": -2.1383352279663086, "logits/rejected": -1.7759078741073608, "logps/chosen": -97.42940521240234, "logps/rejected": -208.8566131591797, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": 0.45860522985458374, "rewards/margins": 4.734647274017334, "rewards/rejected": -4.276042461395264, "step": 4710 }, { "epoch": 1.71, "grad_norm": 46.07295378551623, "learning_rate": 9.198414203841732e-09, "logits/chosen": -2.0975241661071777, "logits/rejected": -1.4306576251983643, "logps/chosen": -75.3366928100586, "logps/rejected": -164.50413513183594, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 1.5581697225570679, "rewards/margins": 5.427392482757568, "rewards/rejected": -3.869223117828369, "step": 4720 }, { "epoch": 1.72, "grad_norm": 42.391432305833874, "learning_rate": 8.971746046662982e-09, "logits/chosen": -2.13923978805542, "logits/rejected": -1.8475595712661743, "logps/chosen": -85.93409729003906, "logps/rejected": -238.55447387695312, "loss": 0.1795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9917472004890442, "rewards/margins": 4.265665054321289, "rewards/rejected": -3.2739181518554688, "step": 4730 }, { "epoch": 1.72, "grad_norm": 41.9386180805472, "learning_rate": 8.747727852967013e-09, "logits/chosen": -1.9976091384887695, "logits/rejected": -1.6085163354873657, "logps/chosen": -85.17452239990234, "logps/rejected": -167.8629608154297, "loss": 0.1809, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7708709239959717, "rewards/margins": 4.317328929901123, "rewards/rejected": -3.5464580059051514, "step": 4740 }, { "epoch": 1.72, "grad_norm": 27.55797286092099, "learning_rate": 8.526368613450938e-09, "logits/chosen": -1.8709558248519897, "logits/rejected": -1.3762298822402954, "logps/chosen": -77.7394790649414, "logps/rejected": -249.26089477539062, "loss": 0.137, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0649783611297607, "rewards/margins": 5.4183220863342285, "rewards/rejected": -4.353343963623047, "step": 4750 }, { "epoch": 1.73, "grad_norm": 37.03238003499655, "learning_rate": 8.307677212098013e-09, "logits/chosen": -2.055666446685791, "logits/rejected": -1.7216275930404663, "logps/chosen": -78.11017608642578, "logps/rejected": -145.96435546875, "loss": 0.1323, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7488200664520264, "rewards/margins": 4.145888328552246, "rewards/rejected": -2.3970682621002197, "step": 4760 }, { "epoch": 1.73, "grad_norm": 14.394014565941422, "learning_rate": 8.091662425821027e-09, "logits/chosen": -2.1242330074310303, "logits/rejected": -1.989148736000061, "logps/chosen": -96.9617691040039, "logps/rejected": -156.48365783691406, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 1.1661460399627686, "rewards/margins": 4.207326889038086, "rewards/rejected": -3.0411806106567383, "step": 4770 }, { "epoch": 1.74, "grad_norm": 45.068535611865215, "learning_rate": 7.878332924110114e-09, "logits/chosen": -1.6985795497894287, "logits/rejected": -1.4704097509384155, "logps/chosen": -105.20362854003906, "logps/rejected": -165.98757934570312, "loss": 0.1946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.41264334321022034, "rewards/margins": 3.4486331939697266, "rewards/rejected": -3.035989761352539, "step": 4780 }, { "epoch": 1.74, "grad_norm": 32.28722287206916, "learning_rate": 7.66769726868476e-09, "logits/chosen": -2.389949321746826, "logits/rejected": -1.8705673217773438, "logps/chosen": -78.36787414550781, "logps/rejected": -172.9830780029297, "loss": 0.1543, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4516584873199463, "rewards/margins": 4.765595436096191, "rewards/rejected": -3.313936948776245, "step": 4790 }, { "epoch": 1.74, "grad_norm": 27.305664645978823, "learning_rate": 7.459763913150232e-09, "logits/chosen": -1.8216984272003174, "logits/rejected": -1.5844495296478271, "logps/chosen": -94.18133544921875, "logps/rejected": -148.17559814453125, "loss": 0.1277, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8979905843734741, "rewards/margins": 4.38170051574707, "rewards/rejected": -3.483710527420044, "step": 4800 }, { "epoch": 1.74, "eval_logits/chosen": -2.5054409503936768, "eval_logits/rejected": -2.3161699771881104, "eval_logps/chosen": -93.1070327758789, "eval_logps/rejected": -155.18020629882812, "eval_loss": 0.2575148642063141, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.6849234700202942, "eval_rewards/margins": 3.665799856185913, "eval_rewards/rejected": -2.9808764457702637, "eval_runtime": 71.4935, "eval_samples_per_second": 12.449, "eval_steps_per_second": 0.196, "step": 4800 }, { "epoch": 1.75, "grad_norm": 33.21624232483828, "learning_rate": 7.254541202658298e-09, "logits/chosen": -1.866506576538086, "logits/rejected": -1.7012319564819336, "logps/chosen": -72.63794708251953, "logps/rejected": -140.51022338867188, "loss": 0.1671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9564884305000305, "rewards/margins": 3.6187520027160645, "rewards/rejected": -2.6622631549835205, "step": 4810 }, { "epoch": 1.75, "grad_norm": 33.30055773008314, "learning_rate": 7.052037373572247e-09, "logits/chosen": -2.050233840942383, "logits/rejected": -1.8784773349761963, "logps/chosen": -76.64759826660156, "logps/rejected": -121.65924072265625, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 1.1043304204940796, "rewards/margins": 3.284348249435425, "rewards/rejected": -2.1800179481506348, "step": 4820 }, { "epoch": 1.75, "grad_norm": 30.226579991502298, "learning_rate": 6.8522605531363995e-09, "logits/chosen": -2.1329455375671387, "logits/rejected": -1.5537135601043701, "logps/chosen": -81.99913787841797, "logps/rejected": -166.19219970703125, "loss": 0.1238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3520516157150269, "rewards/margins": 4.49784517288208, "rewards/rejected": -3.1457934379577637, "step": 4830 }, { "epoch": 1.76, "grad_norm": 20.76586446781195, "learning_rate": 6.655218759149936e-09, "logits/chosen": -1.8627560138702393, "logits/rejected": -1.4420944452285767, "logps/chosen": -81.84294128417969, "logps/rejected": -161.7127227783203, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": 0.9598122835159302, "rewards/margins": 3.9863998889923096, "rewards/rejected": -3.026587963104248, "step": 4840 }, { "epoch": 1.76, "grad_norm": 39.39440151277521, "learning_rate": 6.460919899645045e-09, "logits/chosen": -1.4853651523590088, "logits/rejected": -1.0775649547576904, "logps/chosen": -79.50499725341797, "logps/rejected": -160.72036743164062, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 1.1241073608398438, "rewards/margins": 4.072818756103516, "rewards/rejected": -2.948711633682251, "step": 4850 }, { "epoch": 1.76, "grad_norm": 28.117092220468667, "learning_rate": 6.2693717725696064e-09, "logits/chosen": -2.3298912048339844, "logits/rejected": -1.7818387746810913, "logps/chosen": -81.07538604736328, "logps/rejected": -149.2225341796875, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": 1.2655856609344482, "rewards/margins": 4.464354038238525, "rewards/rejected": -3.198768138885498, "step": 4860 }, { "epoch": 1.77, "grad_norm": 27.103505885216574, "learning_rate": 6.080582065474191e-09, "logits/chosen": -1.792157769203186, "logits/rejected": -1.6782718896865845, "logps/chosen": -82.19720458984375, "logps/rejected": -142.2840118408203, "loss": 0.181, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.406231164932251, "rewards/margins": 3.7717814445495605, "rewards/rejected": -2.365550994873047, "step": 4870 }, { "epoch": 1.77, "grad_norm": 30.313103414178183, "learning_rate": 5.8945583552035664e-09, "logits/chosen": -2.15095853805542, "logits/rejected": -1.6400353908538818, "logps/chosen": -76.02684783935547, "logps/rejected": -157.7095947265625, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 1.7171390056610107, "rewards/margins": 4.5480499267578125, "rewards/rejected": -2.8309109210968018, "step": 4880 }, { "epoch": 1.77, "grad_norm": 21.326128946676743, "learning_rate": 5.711308107592575e-09, "logits/chosen": -2.1534788608551025, "logits/rejected": -1.7716478109359741, "logps/chosen": -78.25006103515625, "logps/rejected": -139.49354553222656, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134289264678955, "rewards/margins": 3.7863471508026123, "rewards/rejected": -2.7729179859161377, "step": 4890 }, { "epoch": 1.78, "grad_norm": 23.675706025969955, "learning_rate": 5.530838677166514e-09, "logits/chosen": -2.062622308731079, "logits/rejected": -1.7914386987686157, "logps/chosen": -71.51246643066406, "logps/rejected": -151.47512817382812, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 1.6398112773895264, "rewards/margins": 4.577850341796875, "rewards/rejected": -2.9380390644073486, "step": 4900 }, { "epoch": 1.78, "eval_logits/chosen": -2.498281478881836, "eval_logits/rejected": -2.3098886013031006, "eval_logps/chosen": -92.86518859863281, "eval_logps/rejected": -155.0869140625, "eval_loss": 0.25684645771980286, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7091068625450134, "eval_rewards/margins": 3.680652618408203, "eval_rewards/rejected": -2.971545934677124, "eval_runtime": 71.3383, "eval_samples_per_second": 12.476, "eval_steps_per_second": 0.196, "step": 4900 }, { "epoch": 1.78, "grad_norm": 38.69506032631391, "learning_rate": 5.353157306845987e-09, "logits/chosen": -1.927911400794983, "logits/rejected": -1.6266618967056274, "logps/chosen": -102.5804443359375, "logps/rejected": -156.56808471679688, "loss": 0.1485, "rewards/accuracies": 1.0, "rewards/chosen": 0.6019805669784546, "rewards/margins": 3.804718017578125, "rewards/rejected": -3.202737331390381, "step": 4910 }, { "epoch": 1.79, "grad_norm": 13.185460602842793, "learning_rate": 5.178271127656184e-09, "logits/chosen": -1.6339209079742432, "logits/rejected": -1.4754202365875244, "logps/chosen": -77.33897399902344, "logps/rejected": -137.8895721435547, "loss": 0.1252, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3045456409454346, "rewards/margins": 4.217949867248535, "rewards/rejected": -2.9134037494659424, "step": 4920 }, { "epoch": 1.79, "grad_norm": 19.230753940442828, "learning_rate": 5.006187158440717e-09, "logits/chosen": -1.7411714792251587, "logits/rejected": -1.4912729263305664, "logps/chosen": -79.78264617919922, "logps/rejected": -136.29086303710938, "loss": 0.1605, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1097663640975952, "rewards/margins": 3.9273438453674316, "rewards/rejected": -2.817577838897705, "step": 4930 }, { "epoch": 1.79, "grad_norm": 46.07395788437603, "learning_rate": 4.8369123055799295e-09, "logits/chosen": -1.8338695764541626, "logits/rejected": -1.4250398874282837, "logps/chosen": -88.02108001708984, "logps/rejected": -194.01132202148438, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 1.0556516647338867, "rewards/margins": 5.144927024841309, "rewards/rejected": -4.0892744064331055, "step": 4940 }, { "epoch": 1.8, "grad_norm": 13.001369713867522, "learning_rate": 4.67045336271368e-09, "logits/chosen": -1.945007085800171, "logits/rejected": -1.674750566482544, "logps/chosen": -85.85972595214844, "logps/rejected": -164.24606323242188, "loss": 0.1525, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.063456654548645, "rewards/margins": 4.070924758911133, "rewards/rejected": -3.0074684619903564, "step": 4950 }, { "epoch": 1.8, "grad_norm": 36.531934974949316, "learning_rate": 4.506817010468731e-09, "logits/chosen": -1.7169277667999268, "logits/rejected": -1.929560899734497, "logps/chosen": -106.58872985839844, "logps/rejected": -153.10482788085938, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 0.9283292889595032, "rewards/margins": 3.2342593669891357, "rewards/rejected": -2.3059301376342773, "step": 4960 }, { "epoch": 1.8, "grad_norm": 21.121804389411686, "learning_rate": 4.346009816190596e-09, "logits/chosen": -1.7104358673095703, "logits/rejected": -1.638135313987732, "logps/chosen": -96.46279907226562, "logps/rejected": -162.83309936523438, "loss": 0.1416, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8155636787414551, "rewards/margins": 4.107938766479492, "rewards/rejected": -3.292375087738037, "step": 4970 }, { "epoch": 1.81, "grad_norm": 25.432800607703953, "learning_rate": 4.188038233680005e-09, "logits/chosen": -2.3670763969421387, "logits/rejected": -1.915074348449707, "logps/chosen": -76.34284210205078, "logps/rejected": -146.34146118164062, "loss": 0.1577, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.011473536491394, "rewards/margins": 3.7747600078582764, "rewards/rejected": -2.763286590576172, "step": 4980 }, { "epoch": 1.81, "grad_norm": 31.52925521872735, "learning_rate": 4.032908602933835e-09, "logits/chosen": -1.9845958948135376, "logits/rejected": -1.6368509531021118, "logps/chosen": -78.15166473388672, "logps/rejected": -183.4367218017578, "loss": 0.1362, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.296852946281433, "rewards/margins": 5.180371284484863, "rewards/rejected": -3.8835182189941406, "step": 4990 }, { "epoch": 1.81, "grad_norm": 24.267438377007352, "learning_rate": 3.880627149890725e-09, "logits/chosen": -1.933571219444275, "logits/rejected": -1.8695943355560303, "logps/chosen": -86.22135925292969, "logps/rejected": -128.43087768554688, "loss": 0.1273, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3141534328460693, "rewards/margins": 3.2799339294433594, "rewards/rejected": -1.9657806158065796, "step": 5000 }, { "epoch": 1.81, "eval_logits/chosen": -2.4975826740264893, "eval_logits/rejected": -2.3100476264953613, "eval_logps/chosen": -92.64019775390625, "eval_logps/rejected": -155.10391235351562, "eval_loss": 0.25643154978752136, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7316060066223145, "eval_rewards/margins": 3.70485258102417, "eval_rewards/rejected": -2.9732463359832764, "eval_runtime": 71.5276, "eval_samples_per_second": 12.443, "eval_steps_per_second": 0.196, "step": 5000 }, { "epoch": 1.82, "grad_norm": 23.454697098867037, "learning_rate": 3.731199986181161e-09, "logits/chosen": -1.693359375, "logits/rejected": -1.7281681299209595, "logps/chosen": -86.59770202636719, "logps/rejected": -154.07138061523438, "loss": 0.1429, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8508288264274597, "rewards/margins": 3.944756031036377, "rewards/rejected": -3.0939269065856934, "step": 5010 }, { "epoch": 1.82, "grad_norm": 21.363949418831528, "learning_rate": 3.5846331088821848e-09, "logits/chosen": -1.7737480401992798, "logits/rejected": -1.9281337261199951, "logps/chosen": -82.63226318359375, "logps/rejected": -151.45726013183594, "loss": 0.1787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7113445401191711, "rewards/margins": 3.4898841381073, "rewards/rejected": -2.7785401344299316, "step": 5020 }, { "epoch": 1.83, "grad_norm": 49.695952302649566, "learning_rate": 3.440932400276758e-09, "logits/chosen": -2.308803081512451, "logits/rejected": -2.2135608196258545, "logps/chosen": -83.8191909790039, "logps/rejected": -141.08892822265625, "loss": 0.1696, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9245914220809937, "rewards/margins": 3.2115063667297363, "rewards/rejected": -2.286914825439453, "step": 5030 }, { "epoch": 1.83, "grad_norm": 40.81068471888173, "learning_rate": 3.300103627617656e-09, "logits/chosen": -1.7834829092025757, "logits/rejected": -1.9567530155181885, "logps/chosen": -70.00810241699219, "logps/rejected": -122.92684173583984, "loss": 0.1558, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0573456287384033, "rewards/margins": 4.120889186859131, "rewards/rejected": -3.0635437965393066, "step": 5040 }, { "epoch": 1.83, "grad_norm": 21.938277111569853, "learning_rate": 3.162152442895996e-09, "logits/chosen": -1.793460488319397, "logits/rejected": -1.8442881107330322, "logps/chosen": -101.73194885253906, "logps/rejected": -148.40234375, "loss": 0.1295, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0204694271087646, "rewards/margins": 3.785799503326416, "rewards/rejected": -2.7653300762176514, "step": 5050 }, { "epoch": 1.84, "grad_norm": 37.435753201875194, "learning_rate": 3.0270843826143837e-09, "logits/chosen": -1.8540923595428467, "logits/rejected": -1.9534356594085693, "logps/chosen": -107.13368225097656, "logps/rejected": -207.95645141601562, "loss": 0.1731, "rewards/accuracies": 1.0, "rewards/chosen": 1.3544046878814697, "rewards/margins": 4.78790807723999, "rewards/rejected": -3.4335033893585205, "step": 5060 }, { "epoch": 1.84, "grad_norm": 19.208193674331728, "learning_rate": 2.894904867564793e-09, "logits/chosen": -1.6729189157485962, "logits/rejected": -1.791587471961975, "logps/chosen": -92.18028259277344, "logps/rejected": -181.7095947265625, "loss": 0.1094, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4680941104888916, "rewards/margins": 5.646910190582275, "rewards/rejected": -4.178815841674805, "step": 5070 }, { "epoch": 1.84, "grad_norm": 29.97158214233071, "learning_rate": 2.765619202610939e-09, "logits/chosen": -1.8277852535247803, "logits/rejected": -1.64214289188385, "logps/chosen": -70.78485870361328, "logps/rejected": -141.9372100830078, "loss": 0.1758, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2488759756088257, "rewards/margins": 3.880674362182617, "rewards/rejected": -2.631798505783081, "step": 5080 }, { "epoch": 1.85, "grad_norm": 17.74181700068005, "learning_rate": 2.639232576475364e-09, "logits/chosen": -1.552199125289917, "logits/rejected": -1.4841490983963013, "logps/chosen": -81.1534652709961, "logps/rejected": -125.95042419433594, "loss": 0.1427, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3184212446212769, "rewards/margins": 3.3990256786346436, "rewards/rejected": -2.080604314804077, "step": 5090 }, { "epoch": 1.85, "grad_norm": 22.318813582652908, "learning_rate": 2.5157500615312577e-09, "logits/chosen": -1.8579572439193726, "logits/rejected": -2.0184388160705566, "logps/chosen": -89.17039489746094, "logps/rejected": -150.26077270507812, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": 1.099916934967041, "rewards/margins": 4.67440938949585, "rewards/rejected": -3.5744922161102295, "step": 5100 }, { "epoch": 1.85, "eval_logits/chosen": -2.479732036590576, "eval_logits/rejected": -2.2966747283935547, "eval_logps/chosen": -92.56282806396484, "eval_logps/rejected": -155.0851593017578, "eval_loss": 0.2575416564941406, "eval_rewards/accuracies": 0.9107142686843872, "eval_rewards/chosen": 0.7393438220024109, "eval_rewards/margins": 3.7107155323028564, "eval_rewards/rejected": -2.9713714122772217, "eval_runtime": 71.474, "eval_samples_per_second": 12.452, "eval_steps_per_second": 0.196, "step": 5100 }, { "epoch": 1.85, "grad_norm": 44.029689486297656, "learning_rate": 2.395176613598815e-09, "logits/chosen": -1.909277319908142, "logits/rejected": -1.5398352146148682, "logps/chosen": -100.70805358886719, "logps/rejected": -156.3821563720703, "loss": 0.1731, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.70123291015625, "rewards/margins": 3.1760170459747314, "rewards/rejected": -2.4747843742370605, "step": 5110 }, { "epoch": 1.86, "grad_norm": 29.264329504378058, "learning_rate": 2.2775170717463902e-09, "logits/chosen": -2.2539279460906982, "logits/rejected": -2.0020687580108643, "logps/chosen": -96.94004821777344, "logps/rejected": -158.09829711914062, "loss": 0.1387, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7452301383018494, "rewards/margins": 4.1389360427856445, "rewards/rejected": -3.3937058448791504, "step": 5120 }, { "epoch": 1.86, "grad_norm": 25.34055019816174, "learning_rate": 2.1627761580962687e-09, "logits/chosen": -2.1744818687438965, "logits/rejected": -1.969853162765503, "logps/chosen": -87.83367156982422, "logps/rejected": -131.83323669433594, "loss": 0.1646, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7479078769683838, "rewards/margins": 2.210872173309326, "rewards/rejected": -1.4629642963409424, "step": 5130 }, { "epoch": 1.87, "grad_norm": 28.764305762206437, "learning_rate": 2.0509584776351506e-09, "logits/chosen": -1.656760573387146, "logits/rejected": -1.4122374057769775, "logps/chosen": -72.59513092041016, "logps/rejected": -148.3843994140625, "loss": 0.1104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1340370178222656, "rewards/margins": 4.139061450958252, "rewards/rejected": -3.0050246715545654, "step": 5140 }, { "epoch": 1.87, "grad_norm": 25.193191046247687, "learning_rate": 1.942068518029333e-09, "logits/chosen": -2.1100473403930664, "logits/rejected": -1.478046178817749, "logps/chosen": -74.53004455566406, "logps/rejected": -263.0050964355469, "loss": 0.1563, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2791024446487427, "rewards/margins": 4.334400177001953, "rewards/rejected": -3.055298089981079, "step": 5150 }, { "epoch": 1.87, "grad_norm": 38.53224961606162, "learning_rate": 1.8361106494445943e-09, "logits/chosen": -2.0669713020324707, "logits/rejected": -1.703884482383728, "logps/chosen": -94.36878967285156, "logps/rejected": -157.6041717529297, "loss": 0.1447, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8586713671684265, "rewards/margins": 3.7407805919647217, "rewards/rejected": -2.8821091651916504, "step": 5160 }, { "epoch": 1.88, "grad_norm": 43.063554524576524, "learning_rate": 1.7330891243708456e-09, "logits/chosen": -1.6464707851409912, "logits/rejected": -1.8101832866668701, "logps/chosen": -121.72802734375, "logps/rejected": -158.669677734375, "loss": 0.1641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5665308237075806, "rewards/margins": 3.349714756011963, "rewards/rejected": -2.7831835746765137, "step": 5170 }, { "epoch": 1.88, "grad_norm": 40.989160241743704, "learning_rate": 1.633008077451406e-09, "logits/chosen": -2.263235569000244, "logits/rejected": -2.035400867462158, "logps/chosen": -78.44254302978516, "logps/rejected": -152.73580932617188, "loss": 0.1752, "rewards/accuracies": 1.0, "rewards/chosen": 1.6941092014312744, "rewards/margins": 4.51024055480957, "rewards/rejected": -2.816131353378296, "step": 5180 }, { "epoch": 1.88, "grad_norm": 35.64059838178094, "learning_rate": 1.5358715253170785e-09, "logits/chosen": -1.6528642177581787, "logits/rejected": -1.7316625118255615, "logps/chosen": -97.46934509277344, "logps/rejected": -128.89291381835938, "loss": 0.1489, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1307111978530884, "rewards/margins": 3.048318386077881, "rewards/rejected": -1.917607069015503, "step": 5190 }, { "epoch": 1.89, "grad_norm": 21.32011780611291, "learning_rate": 1.4416833664249867e-09, "logits/chosen": -1.8180415630340576, "logits/rejected": -1.7365614175796509, "logps/chosen": -99.86917877197266, "logps/rejected": -146.1660919189453, "loss": 0.1752, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.311732530593872, "rewards/margins": 3.4762260913848877, "rewards/rejected": -2.1644930839538574, "step": 5200 }, { "epoch": 1.89, "eval_logits/chosen": -2.4827630519866943, "eval_logits/rejected": -2.2978460788726807, "eval_logps/chosen": -92.51727294921875, "eval_logps/rejected": -155.1031036376953, "eval_loss": 0.2567766308784485, "eval_rewards/accuracies": 0.9107142686843872, "eval_rewards/chosen": 0.7438983917236328, "eval_rewards/margins": 3.717064380645752, "eval_rewards/rejected": -2.973165512084961, "eval_runtime": 71.4535, "eval_samples_per_second": 12.456, "eval_steps_per_second": 0.196, "step": 5200 }, { "epoch": 1.89, "grad_norm": 14.906342368048502, "learning_rate": 1.3504473809020673e-09, "logits/chosen": -1.907004714012146, "logits/rejected": -1.4068089723587036, "logps/chosen": -86.4648208618164, "logps/rejected": -206.89736938476562, "loss": 0.1684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7831238508224487, "rewards/margins": 4.615767478942871, "rewards/rejected": -3.8326430320739746, "step": 5210 }, { "epoch": 1.89, "grad_norm": 34.29482897768453, "learning_rate": 1.2621672303933739e-09, "logits/chosen": -1.811163306236267, "logits/rejected": -1.6804473400115967, "logps/chosen": -95.26229858398438, "logps/rejected": -157.65988159179688, "loss": 0.1628, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.0265977382659912, "rewards/margins": 3.7031357288360596, "rewards/rejected": -2.6765377521514893, "step": 5220 }, { "epoch": 1.9, "grad_norm": 28.11537513046596, "learning_rate": 1.1768464579151373e-09, "logits/chosen": -1.8990215063095093, "logits/rejected": -1.9127355813980103, "logps/chosen": -82.57646179199219, "logps/rejected": -136.97134399414062, "loss": 0.1398, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8750249743461609, "rewards/margins": 3.5186927318573, "rewards/rejected": -2.643667697906494, "step": 5230 }, { "epoch": 1.9, "grad_norm": 9.966571219874533, "learning_rate": 1.0944884877125527e-09, "logits/chosen": -1.402052640914917, "logits/rejected": -1.1487385034561157, "logps/chosen": -104.62020111083984, "logps/rejected": -200.74160766601562, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": 0.388352632522583, "rewards/margins": 4.531402111053467, "rewards/rejected": -4.143049716949463, "step": 5240 }, { "epoch": 1.91, "grad_norm": 20.5792937534979, "learning_rate": 1.0150966251223664e-09, "logits/chosen": -2.0096936225891113, "logits/rejected": -1.805040717124939, "logps/chosen": -64.17274475097656, "logps/rejected": -122.17179870605469, "loss": 0.1306, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0267283916473389, "rewards/margins": 4.174807071685791, "rewards/rejected": -3.1480789184570312, "step": 5250 }, { "epoch": 1.91, "grad_norm": 20.585526219504107, "learning_rate": 9.386740564401808e-10, "logits/chosen": -2.2588255405426025, "logits/rejected": -1.8842235803604126, "logps/chosen": -115.33465576171875, "logps/rejected": -176.01412963867188, "loss": 0.1458, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2555707693099976, "rewards/margins": 4.751716613769531, "rewards/rejected": -3.4961459636688232, "step": 5260 }, { "epoch": 1.91, "grad_norm": 34.226305800359405, "learning_rate": 8.652238487926566e-10, "logits/chosen": -1.9869095087051392, "logits/rejected": -2.1987595558166504, "logps/chosen": -92.55632781982422, "logps/rejected": -212.2284393310547, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0699260234832764, "rewards/margins": 3.5520572662353516, "rewards/rejected": -2.4821314811706543, "step": 5270 }, { "epoch": 1.92, "grad_norm": 36.554661458368045, "learning_rate": 7.947489500143206e-10, "logits/chosen": -2.079451560974121, "logits/rejected": -1.7273361682891846, "logps/chosen": -59.18944549560547, "logps/rejected": -125.39189147949219, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": 1.8779332637786865, "rewards/margins": 3.9232311248779297, "rewards/rejected": -2.045297622680664, "step": 5280 }, { "epoch": 1.92, "grad_norm": 29.3453328036471, "learning_rate": 7.272521885293343e-10, "logits/chosen": -1.7185804843902588, "logits/rejected": -1.2968146800994873, "logps/chosen": -69.44091033935547, "logps/rejected": -128.18624877929688, "loss": 0.1573, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9322481155395508, "rewards/margins": 3.3514316082000732, "rewards/rejected": -2.4191837310791016, "step": 5290 }, { "epoch": 1.92, "grad_norm": 19.154161212924226, "learning_rate": 6.627362732379433e-10, "logits/chosen": -1.8239357471466064, "logits/rejected": -1.7603832483291626, "logps/chosen": -96.85991668701172, "logps/rejected": -174.81370544433594, "loss": 0.1235, "rewards/accuracies": 1.0, "rewards/chosen": 0.8014510869979858, "rewards/margins": 3.704523801803589, "rewards/rejected": -2.9030728340148926, "step": 5300 }, { "epoch": 1.92, "eval_logits/chosen": -2.494673490524292, "eval_logits/rejected": -2.307727336883545, "eval_logps/chosen": -92.54339599609375, "eval_logps/rejected": -155.1482696533203, "eval_loss": 0.25662314891815186, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7412868142127991, "eval_rewards/margins": 3.718968629837036, "eval_rewards/rejected": -2.97768235206604, "eval_runtime": 71.3827, "eval_samples_per_second": 12.468, "eval_steps_per_second": 0.196, "step": 5300 }, { "epoch": 1.93, "grad_norm": 37.14417488124076, "learning_rate": 6.012037934077563e-10, "logits/chosen": -1.4998286962509155, "logits/rejected": -1.2366199493408203, "logps/chosen": -93.3680648803711, "logps/rejected": -167.04367065429688, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 0.9477823972702026, "rewards/margins": 3.8279433250427246, "rewards/rejected": -2.8801610469818115, "step": 5310 }, { "epoch": 1.93, "grad_norm": 36.70052172928171, "learning_rate": 5.426572185698363e-10, "logits/chosen": -2.037949323654175, "logits/rejected": -1.7177932262420654, "logps/chosen": -88.7237548828125, "logps/rejected": -163.99179077148438, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 1.0409705638885498, "rewards/margins": 3.7800567150115967, "rewards/rejected": -2.7390859127044678, "step": 5320 }, { "epoch": 1.93, "grad_norm": 57.51895033093402, "learning_rate": 4.870988984196134e-10, "logits/chosen": -1.5953149795532227, "logits/rejected": -1.3189541101455688, "logps/chosen": -92.99564361572266, "logps/rejected": -165.5485382080078, "loss": 0.1624, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.016343355178833, "rewards/margins": 4.139420509338379, "rewards/rejected": -3.123077392578125, "step": 5330 }, { "epoch": 1.94, "grad_norm": 29.612269350826534, "learning_rate": 4.345310627225179e-10, "logits/chosen": -2.3673906326293945, "logits/rejected": -1.5509955883026123, "logps/chosen": -72.61958312988281, "logps/rejected": -196.7791290283203, "loss": 0.1208, "rewards/accuracies": 1.0, "rewards/chosen": 1.651362419128418, "rewards/margins": 6.749650478363037, "rewards/rejected": -5.098288536071777, "step": 5340 }, { "epoch": 1.94, "grad_norm": 13.368844959539826, "learning_rate": 3.849558212245696e-10, "logits/chosen": -1.5998425483703613, "logits/rejected": -1.4031832218170166, "logps/chosen": -85.34275817871094, "logps/rejected": -154.41502380371094, "loss": 0.1364, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0656564235687256, "rewards/margins": 3.6739463806152344, "rewards/rejected": -2.608290195465088, "step": 5350 }, { "epoch": 1.95, "grad_norm": 32.464438755302695, "learning_rate": 3.3837516356764464e-10, "logits/chosen": -2.0724587440490723, "logits/rejected": -1.6080551147460938, "logps/chosen": -91.76570129394531, "logps/rejected": -153.9037628173828, "loss": 0.1588, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.46443551778793335, "rewards/margins": 2.967500925064087, "rewards/rejected": -2.503065586090088, "step": 5360 }, { "epoch": 1.95, "grad_norm": 17.335570454742644, "learning_rate": 2.947909592096648e-10, "logits/chosen": -1.7819137573242188, "logits/rejected": -1.6901859045028687, "logps/chosen": -83.9917221069336, "logps/rejected": -155.39439392089844, "loss": 0.17, "rewards/accuracies": 1.0, "rewards/chosen": 1.1832668781280518, "rewards/margins": 4.525895118713379, "rewards/rejected": -3.3426289558410645, "step": 5370 }, { "epoch": 1.95, "grad_norm": 24.93213979968397, "learning_rate": 2.542049573495325e-10, "logits/chosen": -1.923814058303833, "logits/rejected": -1.7243621349334717, "logps/chosen": -109.0640869140625, "logps/rejected": -164.2493438720703, "loss": 0.1342, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.23761026561260223, "rewards/margins": 3.5649254322052, "rewards/rejected": -3.327315092086792, "step": 5380 }, { "epoch": 1.96, "grad_norm": 25.680467090159638, "learning_rate": 2.166187868569619e-10, "logits/chosen": -1.646674394607544, "logits/rejected": -1.8654903173446655, "logps/chosen": -107.28630065917969, "logps/rejected": -148.73915100097656, "loss": 0.1411, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.48937278985977173, "rewards/margins": 3.2133946418762207, "rewards/rejected": -2.7240214347839355, "step": 5390 }, { "epoch": 1.96, "grad_norm": 26.62093298807946, "learning_rate": 1.8203395620708107e-10, "logits/chosen": -2.068347215652466, "logits/rejected": -1.635754942893982, "logps/chosen": -80.6448745727539, "logps/rejected": -183.50900268554688, "loss": 0.1288, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8882523775100708, "rewards/margins": 4.448118209838867, "rewards/rejected": -3.559866428375244, "step": 5400 }, { "epoch": 1.96, "eval_logits/chosen": -2.488870143890381, "eval_logits/rejected": -2.304009199142456, "eval_logps/chosen": -92.57149505615234, "eval_logps/rejected": -155.0608673095703, "eval_loss": 0.2569134533405304, "eval_rewards/accuracies": 0.9107142686843872, "eval_rewards/chosen": 0.738475501537323, "eval_rewards/margins": 3.7074153423309326, "eval_rewards/rejected": -2.968940258026123, "eval_runtime": 71.568, "eval_samples_per_second": 12.436, "eval_steps_per_second": 0.196, "step": 5400 }, { "epoch": 1.96, "grad_norm": 60.89983951450628, "learning_rate": 1.5045185341992228e-10, "logits/chosen": -2.275635242462158, "logits/rejected": -2.2029271125793457, "logps/chosen": -95.09009552001953, "logps/rejected": -132.654296875, "loss": 0.1563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0345149040222168, "rewards/margins": 3.2276673316955566, "rewards/rejected": -2.193152666091919, "step": 5410 }, { "epoch": 1.97, "grad_norm": 42.30577726329947, "learning_rate": 1.218737460046748e-10, "logits/chosen": -1.8863006830215454, "logits/rejected": -1.45746910572052, "logps/chosen": -86.98088836669922, "logps/rejected": -162.14358520507812, "loss": 0.1412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4029314517974854, "rewards/margins": 4.1309075355529785, "rewards/rejected": -2.727975368499756, "step": 5420 }, { "epoch": 1.97, "grad_norm": 22.851908976691202, "learning_rate": 9.630078090883398e-11, "logits/chosen": -2.069901943206787, "logits/rejected": -1.7926028966903687, "logps/chosen": -121.98948669433594, "logps/rejected": -309.41168212890625, "loss": 0.1431, "rewards/accuracies": 1.0, "rewards/chosen": -0.005540943238884211, "rewards/margins": 4.165543556213379, "rewards/rejected": -4.171084403991699, "step": 5430 }, { "epoch": 1.97, "grad_norm": 24.450065176253204, "learning_rate": 7.373398447218792e-11, "logits/chosen": -1.7372915744781494, "logits/rejected": -1.5503891706466675, "logps/chosen": -108.78375244140625, "logps/rejected": -175.95468139648438, "loss": 0.1728, "rewards/accuracies": 1.0, "rewards/chosen": 1.1438463926315308, "rewards/margins": 4.212247848510742, "rewards/rejected": -3.06840181350708, "step": 5440 }, { "epoch": 1.98, "grad_norm": 13.790530961269235, "learning_rate": 5.417426238560896e-11, "logits/chosen": -1.820669412612915, "logits/rejected": -1.7920053005218506, "logps/chosen": -83.18289184570312, "logps/rejected": -164.08082580566406, "loss": 0.1263, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9669841527938843, "rewards/margins": 4.896847724914551, "rewards/rejected": -3.9298641681671143, "step": 5450 }, { "epoch": 1.98, "grad_norm": 21.370222843228014, "learning_rate": 3.7622399654682614e-11, "logits/chosen": -2.2546284198760986, "logits/rejected": -1.9654285907745361, "logps/chosen": -91.28435516357422, "logps/rejected": -227.45193481445312, "loss": 0.1813, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5948358774185181, "rewards/margins": 4.2998552322387695, "rewards/rejected": -3.705019474029541, "step": 5460 }, { "epoch": 1.99, "grad_norm": 19.65088303473795, "learning_rate": 2.4079060568257813e-11, "logits/chosen": -1.8808691501617432, "logits/rejected": -1.6000111103057861, "logps/chosen": -88.05937957763672, "logps/rejected": -188.5884552001953, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": 1.3033487796783447, "rewards/margins": 4.862802982330322, "rewards/rejected": -3.5594539642333984, "step": 5470 }, { "epoch": 1.99, "grad_norm": 27.89489697549713, "learning_rate": 1.354478867173492e-11, "logits/chosen": -1.8578226566314697, "logits/rejected": -1.4207289218902588, "logps/chosen": -113.3355941772461, "logps/rejected": -168.69332885742188, "loss": 0.1488, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.764236330986023, "rewards/margins": 3.6643853187561035, "rewards/rejected": -2.900148868560791, "step": 5480 }, { "epoch": 1.99, "grad_norm": 20.734274017785214, "learning_rate": 6.020006745274808e-12, "logits/chosen": -2.122195243835449, "logits/rejected": -1.6640615463256836, "logps/chosen": -59.822662353515625, "logps/rejected": -172.18661499023438, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 1.6453478336334229, "rewards/margins": 4.6514892578125, "rewards/rejected": -3.006141185760498, "step": 5490 }, { "epoch": 2.0, "grad_norm": 45.552319616907596, "learning_rate": 1.5050167868208009e-12, "logits/chosen": -1.6916072368621826, "logits/rejected": -1.7415540218353271, "logps/chosen": -121.29073333740234, "logps/rejected": -157.93690490722656, "loss": 0.145, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7984371185302734, "rewards/margins": 4.016904354095459, "rewards/rejected": -3.2184669971466064, "step": 5500 }, { "epoch": 2.0, "eval_logits/chosen": -2.4879519939422607, "eval_logits/rejected": -2.303224802017212, "eval_logps/chosen": -92.59040069580078, "eval_logps/rejected": -155.1884002685547, "eval_loss": 0.2571623623371124, "eval_rewards/accuracies": 0.8928571343421936, "eval_rewards/chosen": 0.7365875840187073, "eval_rewards/margins": 3.7182834148406982, "eval_rewards/rejected": -2.9816958904266357, "eval_runtime": 71.3635, "eval_samples_per_second": 12.471, "eval_steps_per_second": 0.196, "step": 5500 }, { "epoch": 2.0, "grad_norm": 18.200273756614273, "learning_rate": 0.0, "logits/chosen": -1.8841540813446045, "logits/rejected": -1.5108808279037476, "logps/chosen": -88.10948181152344, "logps/rejected": -193.3212127685547, "loss": 0.1231, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3105828762054443, "rewards/margins": 4.1531171798706055, "rewards/rejected": -2.842533826828003, "step": 5510 }, { "epoch": 2.0, "step": 5510, "total_flos": 0.0, "train_loss": 0.2711078498627443, "train_runtime": 49070.973, "train_samples_per_second": 3.592, "train_steps_per_second": 0.112 } ], "logging_steps": 10, "max_steps": 5510, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }