{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026171159382360636, "grad_norm": 1.999703049659729, "learning_rate": 1.3054830287206266e-09, "logits/chosen": -2.9875593185424805, "logits/rejected": -2.936753749847412, "logps/chosen": -307.4898681640625, "logps/rejected": -392.088623046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0026171159382360636, "grad_norm": 1.9285504817962646, "learning_rate": 1.3054830287206264e-08, "logits/chosen": -2.8448944091796875, "logits/rejected": -2.83210825920105, "logps/chosen": -299.1453857421875, "logps/rejected": -260.9873352050781, "loss": 0.693, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": -0.00014580304559785873, "rewards/margins": 0.0003282717370893806, "rewards/rejected": -0.00047407473903149366, "step": 10 }, { "epoch": 0.005234231876472127, "grad_norm": 2.234384775161743, "learning_rate": 2.610966057441253e-08, "logits/chosen": -2.861093044281006, "logits/rejected": -2.826277732849121, "logps/chosen": -325.42889404296875, "logps/rejected": -252.72314453125, "loss": 0.6928, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.00027085753390565515, "rewards/margins": 0.0006726925494149327, "rewards/rejected": -0.00040183504461310804, "step": 20 }, { "epoch": 0.007851347814708191, "grad_norm": 2.5200695991516113, "learning_rate": 3.91644908616188e-08, "logits/chosen": -2.8650269508361816, "logits/rejected": -2.839594841003418, "logps/chosen": -269.79888916015625, "logps/rejected": -268.51544189453125, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0004993680049665272, "rewards/margins": 0.0007416309672407806, "rewards/rejected": -0.00024226296227425337, "step": 30 }, { "epoch": 0.010468463752944255, "grad_norm": 1.6392391920089722, "learning_rate": 5.221932114882506e-08, "logits/chosen": -2.8317809104919434, "logits/rejected": -2.8215935230255127, "logps/chosen": -233.3176727294922, "logps/rejected": -238.38671875, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.822711369139142e-05, "rewards/margins": 0.000457162968814373, "rewards/rejected": -0.0004953901516273618, "step": 40 }, { "epoch": 0.01308557969118032, "grad_norm": 1.624583125114441, "learning_rate": 6.527415143603133e-08, "logits/chosen": -2.865053176879883, "logits/rejected": -2.852184295654297, "logps/chosen": -290.0357360839844, "logps/rejected": -253.96719360351562, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00021998901502229273, "rewards/margins": 8.350692223757505e-05, "rewards/rejected": -0.00030349590815603733, "step": 50 }, { "epoch": 0.015702695629416383, "grad_norm": 1.7673835754394531, "learning_rate": 7.83289817232376e-08, "logits/chosen": -2.8233509063720703, "logits/rejected": -2.809717893600464, "logps/chosen": -273.7070617675781, "logps/rejected": -246.9080352783203, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00012425810564309359, "rewards/margins": 6.1127066146582365e-06, "rewards/rejected": -0.00013037076860200614, "step": 60 }, { "epoch": 0.018319811567652448, "grad_norm": 1.7462002038955688, "learning_rate": 9.138381201044386e-08, "logits/chosen": -2.8822834491729736, "logits/rejected": -2.8470146656036377, "logps/chosen": -293.1849060058594, "logps/rejected": -266.12908935546875, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.00014021484821569175, "rewards/margins": 4.102182720089331e-05, "rewards/rejected": -0.00018123674090020359, "step": 70 }, { "epoch": 0.02093692750588851, "grad_norm": 2.281116008758545, "learning_rate": 1.0443864229765012e-07, "logits/chosen": -2.820223331451416, "logits/rejected": -2.797712564468384, "logps/chosen": -279.3045959472656, "logps/rejected": -266.4049072265625, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00035889382706955075, "rewards/margins": -3.7797075492562726e-05, "rewards/rejected": -0.00032109676976688206, "step": 80 }, { "epoch": 0.023554043444124574, "grad_norm": 1.8048748970031738, "learning_rate": 1.174934725848564e-07, "logits/chosen": -2.834364652633667, "logits/rejected": -2.821197032928467, "logps/chosen": -270.66107177734375, "logps/rejected": -251.8137664794922, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -2.5717377866385505e-05, "rewards/margins": 0.00027994689298793674, "rewards/rejected": -0.00030566431814804673, "step": 90 }, { "epoch": 0.02617115938236064, "grad_norm": 1.8376109600067139, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.8485753536224365, "logits/rejected": -2.8414525985717773, "logps/chosen": -267.0416259765625, "logps/rejected": -248.66622924804688, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00016696630336809903, "rewards/margins": 0.0004185012076050043, "rewards/rejected": -0.0005854673800058663, "step": 100 }, { "epoch": 0.02617115938236064, "eval_logits/chosen": -2.8661274909973145, "eval_logits/rejected": -2.8388071060180664, "eval_logps/chosen": -282.74957275390625, "eval_logps/rejected": -261.47882080078125, "eval_loss": 0.693004846572876, "eval_rewards/accuracies": 0.5249999761581421, "eval_rewards/chosen": -0.00011926326260436326, "eval_rewards/margins": 0.0002895805810112506, "eval_rewards/rejected": -0.00040884382906369865, "eval_runtime": 692.2735, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 100 }, { "epoch": 0.028788275320596704, "grad_norm": 2.015868663787842, "learning_rate": 1.4360313315926893e-07, "logits/chosen": -2.856309652328491, "logits/rejected": -2.823089361190796, "logps/chosen": -307.3843994140625, "logps/rejected": -257.291015625, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0002652711991686374, "rewards/margins": -0.00011966088641202077, "rewards/rejected": -0.00014561018906533718, "step": 110 }, { "epoch": 0.031405391258832765, "grad_norm": 1.7159242630004883, "learning_rate": 1.566579634464752e-07, "logits/chosen": -2.869659423828125, "logits/rejected": -2.8464877605438232, "logps/chosen": -310.60089111328125, "logps/rejected": -287.7904357910156, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0001522126840427518, "rewards/margins": 0.0004031356074847281, "rewards/rejected": -0.00025092283613048494, "step": 120 }, { "epoch": 0.03402250719706883, "grad_norm": 2.0958242416381836, "learning_rate": 1.6971279373368143e-07, "logits/chosen": -2.850337266921997, "logits/rejected": -2.8188374042510986, "logps/chosen": -271.6417236328125, "logps/rejected": -269.60174560546875, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00013120910443831235, "rewards/margins": 0.0006835443200543523, "rewards/rejected": -0.0005523352883756161, "step": 130 }, { "epoch": 0.036639623135304895, "grad_norm": 1.8925613164901733, "learning_rate": 1.8276762402088773e-07, "logits/chosen": -2.8673295974731445, "logits/rejected": -2.8122167587280273, "logps/chosen": -291.46307373046875, "logps/rejected": -247.7669677734375, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00036148293293081224, "rewards/margins": 0.0009279497899115086, "rewards/rejected": -0.0005664670607075095, "step": 140 }, { "epoch": 0.03925673907354096, "grad_norm": 1.9597433805465698, "learning_rate": 1.95822454308094e-07, "logits/chosen": -2.8569109439849854, "logits/rejected": -2.837003707885742, "logps/chosen": -298.9459228515625, "logps/rejected": -256.0478515625, "loss": 0.6927, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0006078753503970802, "rewards/margins": 0.0009616016177460551, "rewards/rejected": -0.00035372626734897494, "step": 150 }, { "epoch": 0.04187385501177702, "grad_norm": 1.913694977760315, "learning_rate": 2.0887728459530023e-07, "logits/chosen": -2.864971876144409, "logits/rejected": -2.8458945751190186, "logps/chosen": -275.124755859375, "logps/rejected": -275.0151062011719, "loss": 0.6926, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00012734555639326572, "rewards/margins": 0.001163811655715108, "rewards/rejected": -0.0010364660993218422, "step": 160 }, { "epoch": 0.04449097095001309, "grad_norm": 2.1846537590026855, "learning_rate": 2.2193211488250652e-07, "logits/chosen": -2.822680950164795, "logits/rejected": -2.8042876720428467, "logps/chosen": -236.7074432373047, "logps/rejected": -238.3466339111328, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0003129563410766423, "rewards/margins": 0.0008108107140287757, "rewards/rejected": -0.0011237671133130789, "step": 170 }, { "epoch": 0.04710808688824915, "grad_norm": 1.6035895347595215, "learning_rate": 2.349869451697128e-07, "logits/chosen": -2.850816249847412, "logits/rejected": -2.823718309402466, "logps/chosen": -276.2500915527344, "logps/rejected": -259.9451904296875, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0001872165739769116, "rewards/margins": 0.0008747532265260816, "rewards/rejected": -0.001061969785951078, "step": 180 }, { "epoch": 0.04972520282648522, "grad_norm": 3.182461738586426, "learning_rate": 2.4804177545691903e-07, "logits/chosen": -2.8869190216064453, "logits/rejected": -2.8687491416931152, "logps/chosen": -290.9490661621094, "logps/rejected": -257.3797302246094, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0006612293072976172, "rewards/margins": 0.000992011046037078, "rewards/rejected": -0.00033078185515478253, "step": 190 }, { "epoch": 0.05234231876472128, "grad_norm": 1.8618322610855103, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.837772846221924, "logits/rejected": -2.8276214599609375, "logps/chosen": -267.96173095703125, "logps/rejected": -225.5831756591797, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0006539617897942662, "rewards/margins": 0.0017792375292629004, "rewards/rejected": -0.0011252757394686341, "step": 200 }, { "epoch": 0.05234231876472128, "eval_logits/chosen": -2.8652713298797607, "eval_logits/rejected": -2.837984800338745, "eval_logps/chosen": -282.66241455078125, "eval_logps/rejected": -261.5315856933594, "eval_loss": 0.6923088431358337, "eval_rewards/accuracies": 0.6050000190734863, "eval_rewards/chosen": 0.0007522286614403129, "eval_rewards/margins": 0.001688659773208201, "eval_rewards/rejected": -0.0009364310535602272, "eval_runtime": 693.0899, "eval_samples_per_second": 2.886, "eval_steps_per_second": 0.361, "step": 200 }, { "epoch": 0.05495943470295734, "grad_norm": 1.7776113748550415, "learning_rate": 2.7415143603133156e-07, "logits/chosen": -2.8762500286102295, "logits/rejected": -2.8429489135742188, "logps/chosen": -275.98614501953125, "logps/rejected": -245.2783660888672, "loss": 0.6922, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0009210329735651612, "rewards/margins": 0.0018816586816683412, "rewards/rejected": -0.0009606255334801972, "step": 210 }, { "epoch": 0.05757655064119341, "grad_norm": 1.6921358108520508, "learning_rate": 2.8720626631853785e-07, "logits/chosen": -2.817211627960205, "logits/rejected": -2.811617851257324, "logps/chosen": -274.0498962402344, "logps/rejected": -242.93923950195312, "loss": 0.6919, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0014726849040016532, "rewards/margins": 0.0024847507011145353, "rewards/rejected": -0.0010120656806975603, "step": 220 }, { "epoch": 0.06019366657942947, "grad_norm": 2.0040206909179688, "learning_rate": 3.002610966057441e-07, "logits/chosen": -2.885439157485962, "logits/rejected": -2.86034893989563, "logps/chosen": -322.754150390625, "logps/rejected": -285.758056640625, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0018624020740389824, "rewards/margins": 0.0018660586792975664, "rewards/rejected": -3.6565586469805567e-06, "step": 230 }, { "epoch": 0.06281078251766553, "grad_norm": 1.809605360031128, "learning_rate": 3.133159268929504e-07, "logits/chosen": -2.8532462120056152, "logits/rejected": -2.8391811847686768, "logps/chosen": -312.47088623046875, "logps/rejected": -297.48907470703125, "loss": 0.6921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0017323732608929276, "rewards/margins": 0.0021942437160760164, "rewards/rejected": -0.00046187033876776695, "step": 240 }, { "epoch": 0.06542789845590159, "grad_norm": 1.6686596870422363, "learning_rate": 3.263707571801567e-07, "logits/chosen": -2.814990282058716, "logits/rejected": -2.81905198097229, "logps/chosen": -277.08941650390625, "logps/rejected": -249.03414916992188, "loss": 0.6915, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.001977517269551754, "rewards/margins": 0.003367725061252713, "rewards/rejected": -0.0013902074424549937, "step": 250 }, { "epoch": 0.06804501439413765, "grad_norm": 1.5935229063034058, "learning_rate": 3.3942558746736286e-07, "logits/chosen": -2.8718338012695312, "logits/rejected": -2.8251404762268066, "logps/chosen": -297.3100280761719, "logps/rejected": -277.9830017089844, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0025989424902945757, "rewards/margins": 0.0032064050901681185, "rewards/rejected": -0.00060746242525056, "step": 260 }, { "epoch": 0.07066213033237373, "grad_norm": 1.4248483180999756, "learning_rate": 3.5248041775456916e-07, "logits/chosen": -2.8370590209960938, "logits/rejected": -2.8248658180236816, "logps/chosen": -281.2889709472656, "logps/rejected": -245.48855590820312, "loss": 0.6901, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.003083507064729929, "rewards/margins": 0.006150919944047928, "rewards/rejected": -0.003067413344979286, "step": 270 }, { "epoch": 0.07327924627060979, "grad_norm": 1.725456714630127, "learning_rate": 3.6553524804177545e-07, "logits/chosen": -2.8781139850616455, "logits/rejected": -2.8350632190704346, "logps/chosen": -276.51568603515625, "logps/rejected": -253.5542755126953, "loss": 0.6906, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0024674157612025738, "rewards/margins": 0.005187267437577248, "rewards/rejected": -0.00271985144354403, "step": 280 }, { "epoch": 0.07589636220884585, "grad_norm": 1.9681357145309448, "learning_rate": 3.785900783289817e-07, "logits/chosen": -2.849203586578369, "logits/rejected": -2.838613986968994, "logps/chosen": -304.06463623046875, "logps/rejected": -279.3326721191406, "loss": 0.6901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0034332734066993, "rewards/margins": 0.0062034172005951405, "rewards/rejected": -0.0027701437938958406, "step": 290 }, { "epoch": 0.07851347814708191, "grad_norm": 2.0513315200805664, "learning_rate": 3.91644908616188e-07, "logits/chosen": -2.8060500621795654, "logits/rejected": -2.76236629486084, "logps/chosen": -266.20794677734375, "logps/rejected": -248.80886840820312, "loss": 0.6898, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.0026042419485747814, "rewards/margins": 0.006667142268270254, "rewards/rejected": -0.004062901251018047, "step": 300 }, { "epoch": 0.07851347814708191, "eval_logits/chosen": -2.8622689247131348, "eval_logits/rejected": -2.834963321685791, "eval_logps/chosen": -282.39178466796875, "eval_logps/rejected": -261.6759948730469, "eval_loss": 0.6902644038200378, "eval_rewards/accuracies": 0.6639999747276306, "eval_rewards/chosen": 0.0034584649838507175, "eval_rewards/margins": 0.0058389026671648026, "eval_rewards/rejected": -0.0023804374504834414, "eval_runtime": 692.5367, "eval_samples_per_second": 2.888, "eval_steps_per_second": 0.361, "step": 300 }, { "epoch": 0.08113059408531798, "grad_norm": 2.1205692291259766, "learning_rate": 4.046997389033943e-07, "logits/chosen": -2.893097400665283, "logits/rejected": -2.87463641166687, "logps/chosen": -306.21636962890625, "logps/rejected": -250.2729949951172, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": 0.004871034994721413, "rewards/margins": 0.008721152320504189, "rewards/rejected": -0.003850117791444063, "step": 310 }, { "epoch": 0.08374771002355404, "grad_norm": 1.7468680143356323, "learning_rate": 4.1775456919060046e-07, "logits/chosen": -2.873706817626953, "logits/rejected": -2.8421998023986816, "logps/chosen": -272.94659423828125, "logps/rejected": -255.0898895263672, "loss": 0.6904, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00492675369605422, "rewards/margins": 0.005588999018073082, "rewards/rejected": -0.000662245147395879, "step": 320 }, { "epoch": 0.08636482596179011, "grad_norm": 1.7784926891326904, "learning_rate": 4.3080939947780675e-07, "logits/chosen": -2.8389968872070312, "logits/rejected": -2.8390631675720215, "logps/chosen": -277.24652099609375, "logps/rejected": -250.9720458984375, "loss": 0.6892, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007157427724450827, "rewards/margins": 0.00795576348900795, "rewards/rejected": -0.0007983351242728531, "step": 330 }, { "epoch": 0.08898194190002617, "grad_norm": 2.0122432708740234, "learning_rate": 4.4386422976501305e-07, "logits/chosen": -2.868762254714966, "logits/rejected": -2.8562684059143066, "logps/chosen": -306.8142395019531, "logps/rejected": -284.90679931640625, "loss": 0.6886, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00881933607161045, "rewards/margins": 0.009326713159680367, "rewards/rejected": -0.000507376913446933, "step": 340 }, { "epoch": 0.09159905783826224, "grad_norm": 1.7484519481658936, "learning_rate": 4.569190600522193e-07, "logits/chosen": -2.824993848800659, "logits/rejected": -2.797851085662842, "logps/chosen": -309.11224365234375, "logps/rejected": -296.3442687988281, "loss": 0.6894, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009017640724778175, "rewards/margins": 0.00765979802235961, "rewards/rejected": 0.0013578429352492094, "step": 350 }, { "epoch": 0.0942161737764983, "grad_norm": 1.2647193670272827, "learning_rate": 4.699738903394256e-07, "logits/chosen": -2.8344480991363525, "logits/rejected": -2.816068649291992, "logps/chosen": -256.1959533691406, "logps/rejected": -236.88818359375, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": 0.007074951194226742, "rewards/margins": 0.009867229498922825, "rewards/rejected": -0.0027922778390347958, "step": 360 }, { "epoch": 0.09683328971473436, "grad_norm": 2.0885772705078125, "learning_rate": 4.830287206266319e-07, "logits/chosen": -2.8475875854492188, "logits/rejected": -2.8186795711517334, "logps/chosen": -295.1861572265625, "logps/rejected": -251.5151824951172, "loss": 0.6856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.010460047982633114, "rewards/margins": 0.015231410041451454, "rewards/rejected": -0.004771359730511904, "step": 370 }, { "epoch": 0.09945040565297043, "grad_norm": 1.8870456218719482, "learning_rate": 4.960835509138381e-07, "logits/chosen": -2.8488352298736572, "logits/rejected": -2.7997212409973145, "logps/chosen": -315.6346740722656, "logps/rejected": -279.5706481933594, "loss": 0.6871, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.011897383257746696, "rewards/margins": 0.012441580183804035, "rewards/rejected": -0.0005441965768113732, "step": 380 }, { "epoch": 0.1020675215912065, "grad_norm": 2.3549890518188477, "learning_rate": 4.999948856244767e-07, "logits/chosen": -2.8280773162841797, "logits/rejected": -2.8224241733551025, "logps/chosen": -297.057373046875, "logps/rejected": -278.00421142578125, "loss": 0.6836, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.01873602904379368, "rewards/margins": 0.01945691928267479, "rewards/rejected": -0.0007208908209577203, "step": 390 }, { "epoch": 0.10468463752944256, "grad_norm": 1.818867802619934, "learning_rate": 4.999698361256577e-07, "logits/chosen": -2.851010799407959, "logits/rejected": -2.8151259422302246, "logps/chosen": -279.1597900390625, "logps/rejected": -237.5978546142578, "loss": 0.6872, "rewards/accuracies": 0.625, "rewards/chosen": 0.016593072563409805, "rewards/margins": 0.012265140190720558, "rewards/rejected": 0.004327933304011822, "step": 400 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.857703685760498, "eval_logits/rejected": -2.830756425857544, "eval_logps/chosen": -281.0899963378906, "eval_logps/rejected": -261.22564697265625, "eval_loss": 0.6861628293991089, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.01647624559700489, "eval_rewards/margins": 0.014353430829942226, "eval_rewards/rejected": 0.002122814767062664, "eval_runtime": 692.2781, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 400 }, { "epoch": 0.10730175346767862, "grad_norm": 1.9545940160751343, "learning_rate": 4.99923914217458e-07, "logits/chosen": -2.818399667739868, "logits/rejected": -2.802830457687378, "logps/chosen": -256.24957275390625, "logps/rejected": -256.09527587890625, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.013771469704806805, "rewards/margins": 0.008097216486930847, "rewards/rejected": 0.005674251355230808, "step": 410 }, { "epoch": 0.10991886940591468, "grad_norm": 4.077869415283203, "learning_rate": 4.99857123734344e-07, "logits/chosen": -2.8153655529022217, "logits/rejected": -2.769317865371704, "logps/chosen": -244.53890991210938, "logps/rejected": -238.0004119873047, "loss": 0.6855, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.015213017351925373, "rewards/margins": 0.015682024881243706, "rewards/rejected": -0.00046900735469534993, "step": 420 }, { "epoch": 0.11253598534415074, "grad_norm": 2.243114471435547, "learning_rate": 4.997694702533016e-07, "logits/chosen": -2.837740182876587, "logits/rejected": -2.806856870651245, "logps/chosen": -293.7519836425781, "logps/rejected": -272.25494384765625, "loss": 0.6835, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.022876007482409477, "rewards/margins": 0.019848225638270378, "rewards/rejected": 0.0030277802143245935, "step": 430 }, { "epoch": 0.11515310128238682, "grad_norm": 1.829640507698059, "learning_rate": 4.996609610933712e-07, "logits/chosen": -2.875370740890503, "logits/rejected": -2.8540024757385254, "logps/chosen": -285.1123962402344, "logps/rejected": -256.6170654296875, "loss": 0.6833, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.02266586944460869, "rewards/margins": 0.020275097340345383, "rewards/rejected": 0.0023907723370939493, "step": 440 }, { "epoch": 0.11777021722062288, "grad_norm": 1.756147861480713, "learning_rate": 4.995316053150366e-07, "logits/chosen": -2.806842088699341, "logits/rejected": -2.8101210594177246, "logps/chosen": -288.1036376953125, "logps/rejected": -259.46014404296875, "loss": 0.6824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.029574494808912277, "rewards/margins": 0.022273657843470573, "rewards/rejected": 0.007300837431102991, "step": 450 }, { "epoch": 0.12038733315885894, "grad_norm": 3.1120874881744385, "learning_rate": 4.99381413719468e-07, "logits/chosen": -2.825704574584961, "logits/rejected": -2.81204891204834, "logps/chosen": -279.86334228515625, "logps/rejected": -268.80755615234375, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": 0.029285842552781105, "rewards/margins": 0.027944009751081467, "rewards/rejected": 0.0013418343150988221, "step": 460 }, { "epoch": 0.123004449097095, "grad_norm": 1.9212427139282227, "learning_rate": 4.992103988476205e-07, "logits/chosen": -2.83656644821167, "logits/rejected": -2.810007333755493, "logps/chosen": -257.7132873535156, "logps/rejected": -245.3390655517578, "loss": 0.6831, "rewards/accuracies": 0.6875, "rewards/chosen": 0.024322878569364548, "rewards/margins": 0.020839061588048935, "rewards/rejected": 0.003483818843960762, "step": 470 }, { "epoch": 0.12562156503533106, "grad_norm": 2.0051708221435547, "learning_rate": 4.990185749791864e-07, "logits/chosen": -2.868682622909546, "logits/rejected": -2.836199998855591, "logps/chosen": -271.63922119140625, "logps/rejected": -274.00189208984375, "loss": 0.68, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.027854889631271362, "rewards/margins": 0.0271223783493042, "rewards/rejected": 0.0007325109909288585, "step": 480 }, { "epoch": 0.12823868097356714, "grad_norm": 2.0355913639068604, "learning_rate": 4.988059581314039e-07, "logits/chosen": -2.8479950428009033, "logits/rejected": -2.8285024166107178, "logps/chosen": -305.7145690917969, "logps/rejected": -269.5832214355469, "loss": 0.6789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02704049088060856, "rewards/margins": 0.029708972200751305, "rewards/rejected": -0.002668480621650815, "step": 490 }, { "epoch": 0.13085579691180318, "grad_norm": 1.996235966682434, "learning_rate": 4.985725660577184e-07, "logits/chosen": -2.8617165088653564, "logits/rejected": -2.843017101287842, "logps/chosen": -288.36846923828125, "logps/rejected": -249.8210906982422, "loss": 0.6783, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.023136448115110397, "rewards/margins": 0.031000768765807152, "rewards/rejected": -0.007864321582019329, "step": 500 }, { "epoch": 0.13085579691180318, "eval_logits/chosen": -2.848633289337158, "eval_logits/rejected": -2.8214972019195557, "eval_logps/chosen": -280.6480712890625, "eval_logps/rejected": -262.0230407714844, "eval_loss": 0.6803756356239319, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": 0.020895304158329964, "eval_rewards/margins": 0.026746317744255066, "eval_rewards/rejected": -0.005851015914231539, "eval_runtime": 691.0122, "eval_samples_per_second": 2.894, "eval_steps_per_second": 0.362, "step": 500 }, { "epoch": 0.13347291285003926, "grad_norm": 2.2953689098358154, "learning_rate": 4.983184182463008e-07, "logits/chosen": -2.83900785446167, "logits/rejected": -2.8163068294525146, "logps/chosen": -292.3056335449219, "logps/rejected": -256.3818359375, "loss": 0.6779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0237285066395998, "rewards/margins": 0.03204946964979172, "rewards/rejected": -0.008320963010191917, "step": 510 }, { "epoch": 0.1360900287882753, "grad_norm": 2.152860164642334, "learning_rate": 4.980435359184203e-07, "logits/chosen": -2.8620104789733887, "logits/rejected": -2.8637924194335938, "logps/chosen": -285.1622314453125, "logps/rejected": -270.9977722167969, "loss": 0.6791, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.026320820674300194, "rewards/margins": 0.029663830995559692, "rewards/rejected": -0.0033430135808885098, "step": 520 }, { "epoch": 0.13870714472651138, "grad_norm": 2.3760368824005127, "learning_rate": 4.977479420266723e-07, "logits/chosen": -2.8074328899383545, "logits/rejected": -2.8127429485321045, "logps/chosen": -278.2021484375, "logps/rejected": -288.5596618652344, "loss": 0.6792, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02414657548069954, "rewards/margins": 0.02932720258831978, "rewards/rejected": -0.005180628038942814, "step": 530 }, { "epoch": 0.14132426066474746, "grad_norm": 1.8068273067474365, "learning_rate": 4.974316612530614e-07, "logits/chosen": -2.799464464187622, "logits/rejected": -2.781719446182251, "logps/chosen": -296.43017578125, "logps/rejected": -260.1778869628906, "loss": 0.6685, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.03263556957244873, "rewards/margins": 0.05155158042907715, "rewards/rejected": -0.018916018307209015, "step": 540 }, { "epoch": 0.1439413766029835, "grad_norm": 2.295518636703491, "learning_rate": 4.970947200069415e-07, "logits/chosen": -2.8136024475097656, "logits/rejected": -2.8002548217773438, "logps/chosen": -296.8650817871094, "logps/rejected": -277.0992431640625, "loss": 0.6793, "rewards/accuracies": 0.65625, "rewards/chosen": 0.026846662163734436, "rewards/margins": 0.029769038781523705, "rewards/rejected": -0.0029223733581602573, "step": 550 }, { "epoch": 0.14655849254121958, "grad_norm": 1.8040831089019775, "learning_rate": 4.967371464228095e-07, "logits/chosen": -2.8747551441192627, "logits/rejected": -2.8538835048675537, "logps/chosen": -269.18994140625, "logps/rejected": -272.37799072265625, "loss": 0.6782, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.026889195665717125, "rewards/margins": 0.03184649348258972, "rewards/rejected": -0.004957299679517746, "step": 560 }, { "epoch": 0.14917560847945563, "grad_norm": 2.131438970565796, "learning_rate": 4.963589703579569e-07, "logits/chosen": -2.899491310119629, "logits/rejected": -2.8730692863464355, "logps/chosen": -313.0187072753906, "logps/rejected": -280.3568420410156, "loss": 0.6752, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.028542449697852135, "rewards/margins": 0.03851853683590889, "rewards/rejected": -0.009976087138056755, "step": 570 }, { "epoch": 0.1517927244176917, "grad_norm": 1.8194427490234375, "learning_rate": 4.959602233899761e-07, "logits/chosen": -2.892979621887207, "logits/rejected": -2.8543694019317627, "logps/chosen": -311.68353271484375, "logps/rejected": -272.5694580078125, "loss": 0.673, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.035731758922338486, "rewards/margins": 0.04327362775802612, "rewards/rejected": -0.007541867904365063, "step": 580 }, { "epoch": 0.15440984035592778, "grad_norm": 2.1900675296783447, "learning_rate": 4.955409388141243e-07, "logits/chosen": -2.8265955448150635, "logits/rejected": -2.8132894039154053, "logps/chosen": -273.9072265625, "logps/rejected": -251.5390167236328, "loss": 0.6752, "rewards/accuracies": 0.71875, "rewards/chosen": 0.016455931589007378, "rewards/margins": 0.03850039094686508, "rewards/rejected": -0.022044459357857704, "step": 590 }, { "epoch": 0.15702695629416383, "grad_norm": 1.8198952674865723, "learning_rate": 4.951011516405429e-07, "logits/chosen": -2.84102201461792, "logits/rejected": -2.84004807472229, "logps/chosen": -265.394775390625, "logps/rejected": -252.8574676513672, "loss": 0.6729, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.021321838721632957, "rewards/margins": 0.04377777501940727, "rewards/rejected": -0.022455941885709763, "step": 600 }, { "epoch": 0.15702695629416383, "eval_logits/chosen": -2.8409736156463623, "eval_logits/rejected": -2.813835382461548, "eval_logps/chosen": -281.19580078125, "eval_logps/rejected": -264.16082763671875, "eval_loss": 0.6732848882675171, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.015417821705341339, "eval_rewards/margins": 0.04264672100543976, "eval_rewards/rejected": -0.02722889743745327, "eval_runtime": 691.9111, "eval_samples_per_second": 2.891, "eval_steps_per_second": 0.361, "step": 600 }, { "epoch": 0.1596440722323999, "grad_norm": 2.117947578430176, "learning_rate": 4.946408985913344e-07, "logits/chosen": -2.834245204925537, "logits/rejected": -2.8125996589660645, "logps/chosen": -262.54144287109375, "logps/rejected": -246.34860229492188, "loss": 0.6734, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.020137300714850426, "rewards/margins": 0.04266170784831047, "rewards/rejected": -0.022524405270814896, "step": 610 }, { "epoch": 0.16226118817063595, "grad_norm": 2.218667507171631, "learning_rate": 4.941602180974958e-07, "logits/chosen": -2.8357930183410645, "logits/rejected": -2.7973721027374268, "logps/chosen": -303.65606689453125, "logps/rejected": -245.33108520507812, "loss": 0.6696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.019601870328187943, "rewards/margins": 0.049685824662446976, "rewards/rejected": -0.030083950608968735, "step": 620 }, { "epoch": 0.16487830410887203, "grad_norm": 1.9840420484542847, "learning_rate": 4.936591502957101e-07, "logits/chosen": -2.8378233909606934, "logits/rejected": -2.8140475749969482, "logps/chosen": -261.1944580078125, "logps/rejected": -257.957763671875, "loss": 0.6647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.024741780012845993, "rewards/margins": 0.06145521253347397, "rewards/rejected": -0.036713436245918274, "step": 630 }, { "epoch": 0.16749542004710807, "grad_norm": 2.034658432006836, "learning_rate": 4.931377370249945e-07, "logits/chosen": -2.845576763153076, "logits/rejected": -2.78796124458313, "logps/chosen": -281.0826110839844, "logps/rejected": -263.23370361328125, "loss": 0.6673, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -6.524250056827441e-05, "rewards/margins": 0.05555204302072525, "rewards/rejected": -0.055617284029722214, "step": 640 }, { "epoch": 0.17011253598534415, "grad_norm": 2.102283239364624, "learning_rate": 4.925960218232072e-07, "logits/chosen": -2.8266994953155518, "logits/rejected": -2.8046762943267822, "logps/chosen": -269.2861633300781, "logps/rejected": -264.4281005859375, "loss": 0.6646, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.008663799613714218, "rewards/margins": 0.06168809533119202, "rewards/rejected": -0.0530242919921875, "step": 650 }, { "epoch": 0.17272965192358022, "grad_norm": 3.1403772830963135, "learning_rate": 4.920340499234116e-07, "logits/chosen": -2.796461343765259, "logits/rejected": -2.757336139678955, "logps/chosen": -285.25445556640625, "logps/rejected": -251.8562469482422, "loss": 0.6684, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.010964155197143555, "rewards/margins": 0.05367765575647354, "rewards/rejected": -0.04271350055932999, "step": 660 }, { "epoch": 0.17534676786181627, "grad_norm": 1.932573914527893, "learning_rate": 4.914518682500995e-07, "logits/chosen": -2.870535373687744, "logits/rejected": -2.840186595916748, "logps/chosen": -297.72967529296875, "logps/rejected": -261.30780029296875, "loss": 0.661, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0239148810505867, "rewards/margins": 0.07002829760313034, "rewards/rejected": -0.04611341655254364, "step": 670 }, { "epoch": 0.17796388380005235, "grad_norm": 2.7643067836761475, "learning_rate": 4.90849525415273e-07, "logits/chosen": -2.830029249191284, "logits/rejected": -2.8078887462615967, "logps/chosen": -288.3429260253906, "logps/rejected": -245.07369995117188, "loss": 0.6589, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02092517912387848, "rewards/margins": 0.07483113557100296, "rewards/rejected": -0.05390595644712448, "step": 680 }, { "epoch": 0.1805809997382884, "grad_norm": 2.184591054916382, "learning_rate": 4.902270717143858e-07, "logits/chosen": -2.837787628173828, "logits/rejected": -2.8210721015930176, "logps/chosen": -255.417724609375, "logps/rejected": -272.31591796875, "loss": 0.6509, "rewards/accuracies": 0.78125, "rewards/chosen": 0.005492637865245342, "rewards/margins": 0.09071613848209381, "rewards/rejected": -0.0852234959602356, "step": 690 }, { "epoch": 0.18319811567652447, "grad_norm": 2.2565648555755615, "learning_rate": 4.895845591221426e-07, "logits/chosen": -2.833556652069092, "logits/rejected": -2.836822032928467, "logps/chosen": -269.5510559082031, "logps/rejected": -269.97686767578125, "loss": 0.6665, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.003929516766220331, "rewards/margins": 0.058883119374513626, "rewards/rejected": -0.06281263381242752, "step": 700 }, { "epoch": 0.18319811567652447, "eval_logits/chosen": -2.8327224254608154, "eval_logits/rejected": -2.8060340881347656, "eval_logps/chosen": -283.0862731933594, "eval_logps/rejected": -268.32659912109375, "eval_loss": 0.6637989282608032, "eval_rewards/accuracies": 0.6754999756813049, "eval_rewards/chosen": -0.0034864526242017746, "eval_rewards/margins": 0.06540023535490036, "eval_rewards/rejected": -0.06888668984174728, "eval_runtime": 691.7822, "eval_samples_per_second": 2.891, "eval_steps_per_second": 0.361, "step": 700 }, { "epoch": 0.18581523161476055, "grad_norm": 2.449979782104492, "learning_rate": 4.8892204128816e-07, "logits/chosen": -2.865187644958496, "logits/rejected": -2.8416965007781982, "logps/chosen": -281.83489990234375, "logps/rejected": -273.02984619140625, "loss": 0.6666, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0031673975754529238, "rewards/margins": 0.059172265231609344, "rewards/rejected": -0.062339670956134796, "step": 710 }, { "epoch": 0.1884323475529966, "grad_norm": 2.0199317932128906, "learning_rate": 4.882395735324863e-07, "logits/chosen": -2.840233325958252, "logits/rejected": -2.7969911098480225, "logps/chosen": -281.1783447265625, "logps/rejected": -274.934326171875, "loss": 0.6572, "rewards/accuracies": 0.75, "rewards/chosen": 0.0017295643920078874, "rewards/margins": 0.08150311559438705, "rewards/rejected": -0.07977355271577835, "step": 720 }, { "epoch": 0.19104946349123267, "grad_norm": 2.187190294265747, "learning_rate": 4.875372128409829e-07, "logits/chosen": -2.815016269683838, "logits/rejected": -2.7854647636413574, "logps/chosen": -285.82489013671875, "logps/rejected": -259.6023254394531, "loss": 0.6616, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02074645273387432, "rewards/margins": 0.07150407880544662, "rewards/rejected": -0.0922505259513855, "step": 730 }, { "epoch": 0.19366657942946872, "grad_norm": 2.0459957122802734, "learning_rate": 4.868150178605653e-07, "logits/chosen": -2.812069892883301, "logits/rejected": -2.7864902019500732, "logps/chosen": -246.3455352783203, "logps/rejected": -221.7488250732422, "loss": 0.6527, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.03750302642583847, "rewards/margins": 0.08891085535287857, "rewards/rejected": -0.12641388177871704, "step": 740 }, { "epoch": 0.1962836953677048, "grad_norm": 2.3921523094177246, "learning_rate": 4.860730488943068e-07, "logits/chosen": -2.7749264240264893, "logits/rejected": -2.7638156414031982, "logps/chosen": -253.1526641845703, "logps/rejected": -256.56072998046875, "loss": 0.657, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.013170385733246803, "rewards/margins": 0.08013583719730377, "rewards/rejected": -0.09330622851848602, "step": 750 }, { "epoch": 0.19890081130594087, "grad_norm": 2.7103869915008545, "learning_rate": 4.853113678964021e-07, "logits/chosen": -2.7963593006134033, "logits/rejected": -2.786759376525879, "logps/chosen": -295.2373962402344, "logps/rejected": -288.03070068359375, "loss": 0.6532, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0016003316268324852, "rewards/margins": 0.09059783071279526, "rewards/rejected": -0.09219817072153091, "step": 760 }, { "epoch": 0.20151792724417691, "grad_norm": 2.149914026260376, "learning_rate": 4.845300384669957e-07, "logits/chosen": -2.81345534324646, "logits/rejected": -2.783003807067871, "logps/chosen": -270.67730712890625, "logps/rejected": -254.6434326171875, "loss": 0.6605, "rewards/accuracies": 0.6875, "rewards/chosen": -0.006530989892780781, "rewards/margins": 0.07502902299165726, "rewards/rejected": -0.08156001567840576, "step": 770 }, { "epoch": 0.204135043182413, "grad_norm": 2.4296960830688477, "learning_rate": 4.8372912584687e-07, "logits/chosen": -2.8353335857391357, "logits/rejected": -2.801575183868408, "logps/chosen": -300.9684143066406, "logps/rejected": -283.5567626953125, "loss": 0.6587, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0004579909145832062, "rewards/margins": 0.0798453614115715, "rewards/rejected": -0.079387366771698, "step": 780 }, { "epoch": 0.20675215912064904, "grad_norm": 3.0373857021331787, "learning_rate": 4.829086969119983e-07, "logits/chosen": -2.8006482124328613, "logits/rejected": -2.8082146644592285, "logps/chosen": -276.4783020019531, "logps/rejected": -276.69720458984375, "loss": 0.6671, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024218443781137466, "rewards/margins": 0.06143224984407425, "rewards/rejected": -0.08565069735050201, "step": 790 }, { "epoch": 0.2093692750588851, "grad_norm": 2.1895201206207275, "learning_rate": 4.820688201679605e-07, "logits/chosen": -2.8546204566955566, "logits/rejected": -2.809619426727295, "logps/chosen": -277.23187255859375, "logps/rejected": -223.0809783935547, "loss": 0.6427, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0034676387440413237, "rewards/margins": 0.11413818597793579, "rewards/rejected": -0.1106705442070961, "step": 800 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -2.828324317932129, "eval_logits/rejected": -2.8020219802856445, "eval_logps/chosen": -284.8824768066406, "eval_logps/rejected": -272.4747314453125, "eval_loss": 0.6546491980552673, "eval_rewards/accuracies": 0.6815000176429749, "eval_rewards/chosen": -0.02144855633378029, "eval_rewards/margins": 0.08891918510198593, "eval_rewards/rejected": -0.11036773025989532, "eval_runtime": 691.3571, "eval_samples_per_second": 2.893, "eval_steps_per_second": 0.362, "step": 800 }, { "epoch": 0.21198639099712116, "grad_norm": 2.411094903945923, "learning_rate": 4.812095657442231e-07, "logits/chosen": -2.8379623889923096, "logits/rejected": -2.8474135398864746, "logps/chosen": -292.9294128417969, "logps/rejected": -291.79937744140625, "loss": 0.6657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03384638577699661, "rewards/margins": 0.06732925027608871, "rewards/rejected": -0.10117564350366592, "step": 810 }, { "epoch": 0.21460350693535724, "grad_norm": 2.2789130210876465, "learning_rate": 4.803310053882831e-07, "logits/chosen": -2.820188522338867, "logits/rejected": -2.8341267108917236, "logps/chosen": -253.18002319335938, "logps/rejected": -271.46209716796875, "loss": 0.6585, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.04002877324819565, "rewards/margins": 0.08040440827608109, "rewards/rejected": -0.12043318897485733, "step": 820 }, { "epoch": 0.2172206228735933, "grad_norm": 2.6294658184051514, "learning_rate": 4.794332124596775e-07, "logits/chosen": -2.8491604328155518, "logits/rejected": -2.8390445709228516, "logps/chosen": -288.0977478027344, "logps/rejected": -289.91839599609375, "loss": 0.6617, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.025598719716072083, "rewards/margins": 0.0781911239027977, "rewards/rejected": -0.10378985106945038, "step": 830 }, { "epoch": 0.21983773881182936, "grad_norm": 2.718003273010254, "learning_rate": 4.785162619238574e-07, "logits/chosen": -2.7903778553009033, "logits/rejected": -2.750192880630493, "logps/chosen": -271.6007995605469, "logps/rejected": -255.642822265625, "loss": 0.6434, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.013516816310584545, "rewards/margins": 0.11254201829433441, "rewards/rejected": -0.12605881690979004, "step": 840 }, { "epoch": 0.22245485475006543, "grad_norm": 2.693995714187622, "learning_rate": 4.775802303459287e-07, "logits/chosen": -2.7961440086364746, "logits/rejected": -2.782381534576416, "logps/chosen": -266.48406982421875, "logps/rejected": -271.54876708984375, "loss": 0.6543, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.025890201330184937, "rewards/margins": 0.09162938594818115, "rewards/rejected": -0.11751959472894669, "step": 850 }, { "epoch": 0.22507197068830148, "grad_norm": 3.3223588466644287, "learning_rate": 4.766251958842589e-07, "logits/chosen": -2.770634174346924, "logits/rejected": -2.7624752521514893, "logps/chosen": -295.11322021484375, "logps/rejected": -291.52655029296875, "loss": 0.6493, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03162473067641258, "rewards/margins": 0.10102814435958862, "rewards/rejected": -0.1326528638601303, "step": 860 }, { "epoch": 0.22768908662653756, "grad_norm": 2.2951784133911133, "learning_rate": 4.756512382839506e-07, "logits/chosen": -2.792806625366211, "logits/rejected": -2.7687854766845703, "logps/chosen": -276.4913024902344, "logps/rejected": -288.6650390625, "loss": 0.6455, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.06362788379192352, "rewards/margins": 0.11521414667367935, "rewards/rejected": -0.17884202301502228, "step": 870 }, { "epoch": 0.23030620256477363, "grad_norm": 2.3468611240386963, "learning_rate": 4.746584388701831e-07, "logits/chosen": -2.804765224456787, "logits/rejected": -2.8049676418304443, "logps/chosen": -284.9786071777344, "logps/rejected": -280.96392822265625, "loss": 0.6438, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05107206106185913, "rewards/margins": 0.11629124730825424, "rewards/rejected": -0.16736331582069397, "step": 880 }, { "epoch": 0.23292331850300968, "grad_norm": 3.075714588165283, "learning_rate": 4.736468805414218e-07, "logits/chosen": -2.77662992477417, "logits/rejected": -2.7775301933288574, "logps/chosen": -271.46368408203125, "logps/rejected": -293.26531982421875, "loss": 0.6421, "rewards/accuracies": 0.6875, "rewards/chosen": -0.033290714025497437, "rewards/margins": 0.12201287597417831, "rewards/rejected": -0.15530358254909515, "step": 890 }, { "epoch": 0.23554043444124576, "grad_norm": 2.879183769226074, "learning_rate": 4.7261664776249595e-07, "logits/chosen": -2.7510781288146973, "logits/rejected": -2.7387068271636963, "logps/chosen": -250.3533477783203, "logps/rejected": -251.46630859375, "loss": 0.6428, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.04132508859038353, "rewards/margins": 0.12072241306304932, "rewards/rejected": -0.16204750537872314, "step": 900 }, { "epoch": 0.23554043444124576, "eval_logits/chosen": -2.819901704788208, "eval_logits/rejected": -2.794234275817871, "eval_logps/chosen": -285.2049865722656, "eval_logps/rejected": -275.2684631347656, "eval_loss": 0.6458239555358887, "eval_rewards/accuracies": 0.6769999861717224, "eval_rewards/chosen": -0.024673735722899437, "eval_rewards/margins": 0.1136314645409584, "eval_rewards/rejected": -0.138305202126503, "eval_runtime": 690.9829, "eval_samples_per_second": 2.894, "eval_steps_per_second": 0.362, "step": 900 }, { "epoch": 0.2381575503794818, "grad_norm": 2.7687416076660156, "learning_rate": 4.7156782655754624e-07, "logits/chosen": -2.8114147186279297, "logits/rejected": -2.772068977355957, "logps/chosen": -300.78826904296875, "logps/rejected": -255.8038330078125, "loss": 0.6426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002123198937624693, "rewards/margins": 0.1206832155585289, "rewards/rejected": -0.12280640751123428, "step": 910 }, { "epoch": 0.24077466631771788, "grad_norm": 2.5618391036987305, "learning_rate": 4.705005045028414e-07, "logits/chosen": -2.765242338180542, "logits/rejected": -2.737863063812256, "logps/chosen": -287.15667724609375, "logps/rejected": -278.50726318359375, "loss": 0.6459, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.05967919901013374, "rewards/margins": 0.11529602855443954, "rewards/rejected": -0.1749752312898636, "step": 920 }, { "epoch": 0.24339178225595393, "grad_norm": 2.9336323738098145, "learning_rate": 4.694147707194659e-07, "logits/chosen": -2.832733631134033, "logits/rejected": -2.8244283199310303, "logps/chosen": -294.346923828125, "logps/rejected": -287.9342346191406, "loss": 0.6366, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.06329428404569626, "rewards/margins": 0.1405760794878006, "rewards/rejected": -0.20387034118175507, "step": 930 }, { "epoch": 0.24600889819419, "grad_norm": 3.908505439758301, "learning_rate": 4.683107158658781e-07, "logits/chosen": -2.7808585166931152, "logits/rejected": -2.763042688369751, "logps/chosen": -314.3782653808594, "logps/rejected": -299.661865234375, "loss": 0.6227, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.05701801925897598, "rewards/margins": 0.16755308210849762, "rewards/rejected": -0.2245711088180542, "step": 940 }, { "epoch": 0.24862601413242608, "grad_norm": 3.2749459743499756, "learning_rate": 4.6718843213034066e-07, "logits/chosen": -2.7944037914276123, "logits/rejected": -2.77887225151062, "logps/chosen": -272.23724365234375, "logps/rejected": -273.14776611328125, "loss": 0.633, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.09230604767799377, "rewards/margins": 0.14217710494995117, "rewards/rejected": -0.23448316752910614, "step": 950 }, { "epoch": 0.2512431300706621, "grad_norm": 3.0224010944366455, "learning_rate": 4.660480132232224e-07, "logits/chosen": -2.805572986602783, "logits/rejected": -2.80751371383667, "logps/chosen": -293.3813171386719, "logps/rejected": -280.83465576171875, "loss": 0.6507, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0641000047326088, "rewards/margins": 0.10990612208843231, "rewards/rejected": -0.1740061342716217, "step": 960 }, { "epoch": 0.25386024600889817, "grad_norm": 3.5039138793945312, "learning_rate": 4.64889554369174e-07, "logits/chosen": -2.805609941482544, "logits/rejected": -2.771754741668701, "logps/chosen": -298.55157470703125, "logps/rejected": -267.65087890625, "loss": 0.6166, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0059810527600348, "rewards/margins": 0.18814215064048767, "rewards/rejected": -0.1821610927581787, "step": 970 }, { "epoch": 0.2564773619471343, "grad_norm": 2.8160240650177, "learning_rate": 4.637131522991764e-07, "logits/chosen": -2.7994441986083984, "logits/rejected": -2.7969179153442383, "logps/chosen": -309.35089111328125, "logps/rejected": -296.6192321777344, "loss": 0.6321, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.029499268159270287, "rewards/margins": 0.14880326390266418, "rewards/rejected": -0.17830254137516022, "step": 980 }, { "epoch": 0.2590944778853703, "grad_norm": 3.782945156097412, "learning_rate": 4.6251890524246375e-07, "logits/chosen": -2.8050458431243896, "logits/rejected": -2.786475658416748, "logps/chosen": -262.4518737792969, "logps/rejected": -256.80792236328125, "loss": 0.6166, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0701083093881607, "rewards/margins": 0.18339978158473969, "rewards/rejected": -0.253508061170578, "step": 990 }, { "epoch": 0.26171159382360637, "grad_norm": 3.791015148162842, "learning_rate": 4.613069129183218e-07, "logits/chosen": -2.8377981185913086, "logits/rejected": -2.799161911010742, "logps/chosen": -328.35491943359375, "logps/rejected": -301.65679931640625, "loss": 0.6381, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0686495453119278, "rewards/margins": 0.13748301565647125, "rewards/rejected": -0.20613256096839905, "step": 1000 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -2.813830852508545, "eval_logits/rejected": -2.7887284755706787, "eval_logps/chosen": -289.12060546875, "eval_logps/rejected": -282.1760559082031, "eval_loss": 0.635771632194519, "eval_rewards/accuracies": 0.6784999966621399, "eval_rewards/chosen": -0.06382979452610016, "eval_rewards/margins": 0.14355140924453735, "eval_rewards/rejected": -0.2073812186717987, "eval_runtime": 691.4427, "eval_samples_per_second": 2.893, "eval_steps_per_second": 0.362, "step": 1000 }, { "epoch": 0.2643287097618425, "grad_norm": 4.366467475891113, "learning_rate": 4.6007727652776065e-07, "logits/chosen": -2.7737021446228027, "logits/rejected": -2.7608792781829834, "logps/chosen": -254.6834259033203, "logps/rejected": -263.98565673828125, "loss": 0.6304, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.036558397114276886, "rewards/margins": 0.1544768214225769, "rewards/rejected": -0.1910352259874344, "step": 1010 }, { "epoch": 0.2669458257000785, "grad_norm": 3.2850377559661865, "learning_rate": 4.588300987450652e-07, "logits/chosen": -2.82348895072937, "logits/rejected": -2.7995572090148926, "logps/chosen": -271.41241455078125, "logps/rejected": -254.01864624023438, "loss": 0.6293, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.008820459246635437, "rewards/margins": 0.1594310700893402, "rewards/rejected": -0.16825154423713684, "step": 1020 }, { "epoch": 0.26956294163831457, "grad_norm": 3.3716328144073486, "learning_rate": 4.5756548370922134e-07, "logits/chosen": -2.781808853149414, "logits/rejected": -2.7637503147125244, "logps/chosen": -258.62860107421875, "logps/rejected": -260.2466125488281, "loss": 0.6508, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.024007773026823997, "rewards/margins": 0.11937548965215683, "rewards/rejected": -0.14338326454162598, "step": 1030 }, { "epoch": 0.2721800575765506, "grad_norm": 3.529965400695801, "learning_rate": 4.5628353701522047e-07, "logits/chosen": -2.815080404281616, "logits/rejected": -2.7873313426971436, "logps/chosen": -321.65435791015625, "logps/rejected": -310.28497314453125, "loss": 0.6072, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.017561940476298332, "rewards/margins": 0.2143036425113678, "rewards/rejected": -0.2318655550479889, "step": 1040 }, { "epoch": 0.2747971735147867, "grad_norm": 2.87839412689209, "learning_rate": 4.549843657052429e-07, "logits/chosen": -2.834746837615967, "logits/rejected": -2.808051347732544, "logps/chosen": -287.9942321777344, "logps/rejected": -302.9963684082031, "loss": 0.6048, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.03615923970937729, "rewards/margins": 0.21066415309906006, "rewards/rejected": -0.24682338535785675, "step": 1050 }, { "epoch": 0.27741428945302277, "grad_norm": 3.860949993133545, "learning_rate": 4.5366807825971907e-07, "logits/chosen": -2.780369758605957, "logits/rejected": -2.7750542163848877, "logps/chosen": -262.59075927734375, "logps/rejected": -269.21051025390625, "loss": 0.6437, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08822160959243774, "rewards/margins": 0.14002035558223724, "rewards/rejected": -0.228241965174675, "step": 1060 }, { "epoch": 0.2800314053912588, "grad_norm": 6.0348801612854, "learning_rate": 4.5233478458827176e-07, "logits/chosen": -2.8092315196990967, "logits/rejected": -2.785090446472168, "logps/chosen": -316.466064453125, "logps/rejected": -282.1798400878906, "loss": 0.6104, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.08112485706806183, "rewards/margins": 0.2059168517589569, "rewards/rejected": -0.2870417535305023, "step": 1070 }, { "epoch": 0.2826485213294949, "grad_norm": 4.09010124206543, "learning_rate": 4.509845960205389e-07, "logits/chosen": -2.749141216278076, "logits/rejected": -2.753202438354492, "logps/chosen": -304.83111572265625, "logps/rejected": -288.3349304199219, "loss": 0.626, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.07901586592197418, "rewards/margins": 0.17285946011543274, "rewards/rejected": -0.2518753409385681, "step": 1080 }, { "epoch": 0.28526563726773096, "grad_norm": 4.772919654846191, "learning_rate": 4.4961762529687736e-07, "logits/chosen": -2.8033485412597656, "logits/rejected": -2.7844488620758057, "logps/chosen": -288.91998291015625, "logps/rejected": -284.6497802734375, "loss": 0.6324, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09251121431589127, "rewards/margins": 0.15693159401416779, "rewards/rejected": -0.24944277107715607, "step": 1090 }, { "epoch": 0.287882753205967, "grad_norm": 4.188416957855225, "learning_rate": 4.482339865589492e-07, "logits/chosen": -2.8103842735290527, "logits/rejected": -2.768054962158203, "logps/chosen": -299.87091064453125, "logps/rejected": -267.5564880371094, "loss": 0.6488, "rewards/accuracies": 0.625, "rewards/chosen": -0.16943010687828064, "rewards/margins": 0.12456401437520981, "rewards/rejected": -0.29399409890174866, "step": 1100 }, { "epoch": 0.287882753205967, "eval_logits/chosen": -2.8070549964904785, "eval_logits/rejected": -2.782604694366455, "eval_logps/chosen": -296.5137634277344, "eval_logps/rejected": -291.989013671875, "eval_loss": 0.6283535361289978, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.13776110112667084, "eval_rewards/margins": 0.16774973273277283, "eval_rewards/rejected": -0.30551087856292725, "eval_runtime": 691.0066, "eval_samples_per_second": 2.894, "eval_steps_per_second": 0.362, "step": 1100 }, { "epoch": 0.2904998691442031, "grad_norm": 4.440745830535889, "learning_rate": 4.4683379534019076e-07, "logits/chosen": -2.803920269012451, "logits/rejected": -2.8017265796661377, "logps/chosen": -300.3214111328125, "logps/rejected": -309.1615905761719, "loss": 0.6336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1395951509475708, "rewards/margins": 0.1519310027360916, "rewards/rejected": -0.2915261387825012, "step": 1110 }, { "epoch": 0.29311698508243916, "grad_norm": 3.8111138343811035, "learning_rate": 4.4541716855616593e-07, "logits/chosen": -2.7794926166534424, "logits/rejected": -2.7597875595092773, "logps/chosen": -264.9614562988281, "logps/rejected": -282.9358825683594, "loss": 0.6252, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07038460671901703, "rewards/margins": 0.17066633701324463, "rewards/rejected": -0.24105095863342285, "step": 1120 }, { "epoch": 0.2957341010206752, "grad_norm": 5.494072914123535, "learning_rate": 4.4398422449480357e-07, "logits/chosen": -2.774218797683716, "logits/rejected": -2.725161075592041, "logps/chosen": -294.66448974609375, "logps/rejected": -311.0096740722656, "loss": 0.6402, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1438552290201187, "rewards/margins": 0.14675047993659973, "rewards/rejected": -0.29060572385787964, "step": 1130 }, { "epoch": 0.29835121695891126, "grad_norm": 4.3281474113464355, "learning_rate": 4.4253508280652036e-07, "logits/chosen": -2.7951579093933105, "logits/rejected": -2.7520532608032227, "logps/chosen": -317.461181640625, "logps/rejected": -285.7931213378906, "loss": 0.6139, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.13621816039085388, "rewards/margins": 0.19606857001781464, "rewards/rejected": -0.3322867453098297, "step": 1140 }, { "epoch": 0.30096833289714736, "grad_norm": 6.221525192260742, "learning_rate": 4.410698644942302e-07, "logits/chosen": -2.8402047157287598, "logits/rejected": -2.816387176513672, "logps/chosen": -297.50286865234375, "logps/rejected": -292.28436279296875, "loss": 0.6183, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10158324241638184, "rewards/margins": 0.19611066579818726, "rewards/rejected": -0.2976939082145691, "step": 1150 }, { "epoch": 0.3035854488353834, "grad_norm": 4.492012023925781, "learning_rate": 4.3958869190324057e-07, "logits/chosen": -2.76503586769104, "logits/rejected": -2.7254602909088135, "logps/chosen": -291.94873046875, "logps/rejected": -282.52880859375, "loss": 0.6221, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12198346853256226, "rewards/margins": 0.18694952130317688, "rewards/rejected": -0.30893296003341675, "step": 1160 }, { "epoch": 0.30620256477361946, "grad_norm": 3.562570810317993, "learning_rate": 4.380916887110365e-07, "logits/chosen": -2.829111099243164, "logits/rejected": -2.800809383392334, "logps/chosen": -290.05316162109375, "logps/rejected": -266.3580017089844, "loss": 0.6199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14898671209812164, "rewards/margins": 0.19158688187599182, "rewards/rejected": -0.34057360887527466, "step": 1170 }, { "epoch": 0.30881968071185556, "grad_norm": 5.379666805267334, "learning_rate": 4.3657897991695394e-07, "logits/chosen": -2.7369437217712402, "logits/rejected": -2.7774927616119385, "logps/chosen": -281.9171142578125, "logps/rejected": -300.78912353515625, "loss": 0.6192, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11646691709756851, "rewards/margins": 0.19827672839164734, "rewards/rejected": -0.31474363803863525, "step": 1180 }, { "epoch": 0.3114367966500916, "grad_norm": 4.079792499542236, "learning_rate": 4.350506918317416e-07, "logits/chosen": -2.8184256553649902, "logits/rejected": -2.788510799407959, "logps/chosen": -274.4839172363281, "logps/rejected": -287.8948669433594, "loss": 0.6194, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.12529827654361725, "rewards/margins": 0.19565680623054504, "rewards/rejected": -0.3209550976753235, "step": 1190 }, { "epoch": 0.31405391258832765, "grad_norm": 4.406829833984375, "learning_rate": 4.335069520670149e-07, "logits/chosen": -2.7956674098968506, "logits/rejected": -2.7690110206604004, "logps/chosen": -252.70156860351562, "logps/rejected": -279.14111328125, "loss": 0.6427, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09379851073026657, "rewards/margins": 0.14501607418060303, "rewards/rejected": -0.2388145923614502, "step": 1200 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -2.8165299892425537, "eval_logits/rejected": -2.793107032775879, "eval_logps/chosen": -293.77850341796875, "eval_logps/rejected": -291.3028259277344, "eval_loss": 0.622346818447113, "eval_rewards/accuracies": 0.6834999918937683, "eval_rewards/chosen": -0.11040891706943512, "eval_rewards/margins": 0.18824002146720886, "eval_rewards/rejected": -0.2986489236354828, "eval_runtime": 690.8187, "eval_samples_per_second": 2.895, "eval_steps_per_second": 0.362, "step": 1200 }, { "epoch": 0.3166710285265637, "grad_norm": 4.730831146240234, "learning_rate": 4.319478895245999e-07, "logits/chosen": -2.8096089363098145, "logits/rejected": -2.781852960586548, "logps/chosen": -277.19305419921875, "logps/rejected": -268.88653564453125, "loss": 0.6189, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12051185220479965, "rewards/margins": 0.19446460902690887, "rewards/rejected": -0.3149764835834503, "step": 1210 }, { "epoch": 0.3192881444647998, "grad_norm": 4.179198741912842, "learning_rate": 4.3037363438577036e-07, "logits/chosen": -2.8334312438964844, "logits/rejected": -2.796905517578125, "logps/chosen": -275.5434875488281, "logps/rejected": -309.56561279296875, "loss": 0.6074, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.03255675360560417, "rewards/margins": 0.21560052037239075, "rewards/rejected": -0.24815726280212402, "step": 1220 }, { "epoch": 0.32190526040303585, "grad_norm": 3.7570934295654297, "learning_rate": 4.2878431810037716e-07, "logits/chosen": -2.8290486335754395, "logits/rejected": -2.821361780166626, "logps/chosen": -317.92926025390625, "logps/rejected": -291.9640197753906, "loss": 0.6102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06272344291210175, "rewards/margins": 0.21540877223014832, "rewards/rejected": -0.27813225984573364, "step": 1230 }, { "epoch": 0.3245223763412719, "grad_norm": 5.973113536834717, "learning_rate": 4.271800733758729e-07, "logits/chosen": -2.801720380783081, "logits/rejected": -2.804701566696167, "logps/chosen": -308.4283142089844, "logps/rejected": -294.974609375, "loss": 0.6055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04135540500283241, "rewards/margins": 0.23178556561470032, "rewards/rejected": -0.27314096689224243, "step": 1240 }, { "epoch": 0.327139492279508, "grad_norm": 5.047220706939697, "learning_rate": 4.255610341662304e-07, "logits/chosen": -2.8307595252990723, "logits/rejected": -2.779573440551758, "logps/chosen": -282.5008239746094, "logps/rejected": -278.0930480957031, "loss": 0.6297, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07180126756429672, "rewards/margins": 0.17990802228450775, "rewards/rejected": -0.2517092823982239, "step": 1250 }, { "epoch": 0.32975660821774405, "grad_norm": 4.12667179107666, "learning_rate": 4.2392733566075757e-07, "logits/chosen": -2.8080954551696777, "logits/rejected": -2.7833712100982666, "logps/chosen": -279.9812927246094, "logps/rejected": -274.603271484375, "loss": 0.6437, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06445430964231491, "rewards/margins": 0.1353849321603775, "rewards/rejected": -0.19983923435211182, "step": 1260 }, { "epoch": 0.3323737241559801, "grad_norm": 3.241464138031006, "learning_rate": 4.2227911427280973e-07, "logits/chosen": -2.7715563774108887, "logits/rejected": -2.7483251094818115, "logps/chosen": -269.14215087890625, "logps/rejected": -254.9038543701172, "loss": 0.6275, "rewards/accuracies": 0.625, "rewards/chosen": -0.029628584161400795, "rewards/margins": 0.1794588267803192, "rewards/rejected": -0.20908741652965546, "step": 1270 }, { "epoch": 0.33499084009421615, "grad_norm": 6.028203010559082, "learning_rate": 4.206165076283982e-07, "logits/chosen": -2.8015265464782715, "logits/rejected": -2.7831873893737793, "logps/chosen": -270.62139892578125, "logps/rejected": -273.0738830566406, "loss": 0.6107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09085245430469513, "rewards/margins": 0.2116876095533371, "rewards/rejected": -0.30254003405570984, "step": 1280 }, { "epoch": 0.33760795603245225, "grad_norm": 5.242630958557129, "learning_rate": 4.1893965455469946e-07, "logits/chosen": -2.8173327445983887, "logits/rejected": -2.7973732948303223, "logps/chosen": -279.14031982421875, "logps/rejected": -275.79638671875, "loss": 0.6269, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14117182791233063, "rewards/margins": 0.18503603339195251, "rewards/rejected": -0.32620781660079956, "step": 1290 }, { "epoch": 0.3402250719706883, "grad_norm": 5.775106430053711, "learning_rate": 4.172486950684626e-07, "logits/chosen": -2.821103096008301, "logits/rejected": -2.814502477645874, "logps/chosen": -279.78289794921875, "logps/rejected": -298.9765930175781, "loss": 0.6131, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11109775304794312, "rewards/margins": 0.21843478083610535, "rewards/rejected": -0.32953253388404846, "step": 1300 }, { "epoch": 0.3402250719706883, "eval_logits/chosen": -2.818049430847168, "eval_logits/rejected": -2.7951488494873047, "eval_logps/chosen": -297.3945007324219, "eval_logps/rejected": -296.5805969238281, "eval_loss": 0.6172210574150085, "eval_rewards/accuracies": 0.6865000128746033, "eval_rewards/chosen": -0.14656904339790344, "eval_rewards/margins": 0.2048574537038803, "eval_rewards/rejected": -0.35142648220062256, "eval_runtime": 691.9861, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 1300 }, { "epoch": 0.34284218790892435, "grad_norm": 8.304680824279785, "learning_rate": 4.155437703643181e-07, "logits/chosen": -2.841334581375122, "logits/rejected": -2.806217670440674, "logps/chosen": -272.61444091796875, "logps/rejected": -267.8605041503906, "loss": 0.6005, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11493051052093506, "rewards/margins": 0.24178418517112732, "rewards/rejected": -0.35671466588974, "step": 1310 }, { "epoch": 0.34545930384716045, "grad_norm": 6.887094497680664, "learning_rate": 4.138250228029881e-07, "logits/chosen": -2.811464786529541, "logits/rejected": -2.797884941101074, "logps/chosen": -295.8591613769531, "logps/rejected": -319.4233703613281, "loss": 0.6383, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2382466346025467, "rewards/margins": 0.16607843339443207, "rewards/rejected": -0.40432506799697876, "step": 1320 }, { "epoch": 0.3480764197853965, "grad_norm": 4.52334451675415, "learning_rate": 4.1209259589939935e-07, "logits/chosen": -2.8012988567352295, "logits/rejected": -2.8001253604888916, "logps/chosen": -262.8810119628906, "logps/rejected": -272.76788330078125, "loss": 0.6321, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13144102692604065, "rewards/margins": 0.17341327667236328, "rewards/rejected": -0.30485430359840393, "step": 1330 }, { "epoch": 0.35069353572363254, "grad_norm": 3.246675729751587, "learning_rate": 4.103466343106998e-07, "logits/chosen": -2.8291964530944824, "logits/rejected": -2.824831247329712, "logps/chosen": -302.6276550292969, "logps/rejected": -286.753662109375, "loss": 0.6334, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1298406422138214, "rewards/margins": 0.16963128745555878, "rewards/rejected": -0.2994719445705414, "step": 1340 }, { "epoch": 0.35331065166186865, "grad_norm": 4.933244705200195, "learning_rate": 4.085872838241796e-07, "logits/chosen": -2.767702102661133, "logits/rejected": -2.730109691619873, "logps/chosen": -311.7983703613281, "logps/rejected": -294.95294189453125, "loss": 0.6356, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.16958799958229065, "rewards/margins": 0.17119386792182922, "rewards/rejected": -0.3407818675041199, "step": 1350 }, { "epoch": 0.3559277676001047, "grad_norm": 6.131802082061768, "learning_rate": 4.06814691345098e-07, "logits/chosen": -2.7470338344573975, "logits/rejected": -2.722545862197876, "logps/chosen": -288.4170837402344, "logps/rejected": -289.61102294921875, "loss": 0.602, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.1348382532596588, "rewards/margins": 0.2365628182888031, "rewards/rejected": -0.3714010715484619, "step": 1360 }, { "epoch": 0.35854488353834074, "grad_norm": 4.9708638191223145, "learning_rate": 4.0502900488441707e-07, "logits/chosen": -2.7989072799682617, "logits/rejected": -2.789274215698242, "logps/chosen": -306.6829528808594, "logps/rejected": -320.0224304199219, "loss": 0.6285, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20997491478919983, "rewards/margins": 0.1841730773448944, "rewards/rejected": -0.39414799213409424, "step": 1370 }, { "epoch": 0.3611619994765768, "grad_norm": 6.784174919128418, "learning_rate": 4.032303735464422e-07, "logits/chosen": -2.880401134490967, "logits/rejected": -2.835643768310547, "logps/chosen": -310.90679931640625, "logps/rejected": -308.8883361816406, "loss": 0.6053, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.20711734890937805, "rewards/margins": 0.24106808006763458, "rewards/rejected": -0.44818538427352905, "step": 1380 }, { "epoch": 0.3637791154148129, "grad_norm": 5.785353183746338, "learning_rate": 4.014189475163726e-07, "logits/chosen": -2.794342517852783, "logits/rejected": -2.7849628925323486, "logps/chosen": -297.41961669921875, "logps/rejected": -308.3134765625, "loss": 0.6053, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.24589386582374573, "rewards/margins": 0.2298090010881424, "rewards/rejected": -0.47570285201072693, "step": 1390 }, { "epoch": 0.36639623135304894, "grad_norm": 6.076969146728516, "learning_rate": 3.995948780477605e-07, "logits/chosen": -2.8259429931640625, "logits/rejected": -2.795186996459961, "logps/chosen": -306.1077880859375, "logps/rejected": -299.7892150878906, "loss": 0.6326, "rewards/accuracies": 0.625, "rewards/chosen": -0.21178540587425232, "rewards/margins": 0.17982172966003418, "rewards/rejected": -0.3916071355342865, "step": 1400 }, { "epoch": 0.36639623135304894, "eval_logits/chosen": -2.814655065536499, "eval_logits/rejected": -2.7920358180999756, "eval_logps/chosen": -300.2596740722656, "eval_logps/rejected": -300.3965759277344, "eval_loss": 0.6155202388763428, "eval_rewards/accuracies": 0.6859999895095825, "eval_rewards/chosen": -0.175220787525177, "eval_rewards/margins": 0.2143653929233551, "eval_rewards/rejected": -0.3895862102508545, "eval_runtime": 692.0291, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 1400 }, { "epoch": 0.369013347291285, "grad_norm": 6.421947479248047, "learning_rate": 3.977583174498816e-07, "logits/chosen": -2.816697359085083, "logits/rejected": -2.8030014038085938, "logps/chosen": -300.00640869140625, "logps/rejected": -303.1688232421875, "loss": 0.5882, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.14107367396354675, "rewards/margins": 0.27628999948501587, "rewards/rejected": -0.41736364364624023, "step": 1410 }, { "epoch": 0.3716304632295211, "grad_norm": 4.980222225189209, "learning_rate": 3.9590941907501717e-07, "logits/chosen": -2.8284125328063965, "logits/rejected": -2.812608242034912, "logps/chosen": -307.8800354003906, "logps/rejected": -303.53021240234375, "loss": 0.6005, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.056650467216968536, "rewards/margins": 0.2519657611846924, "rewards/rejected": -0.3086162507534027, "step": 1420 }, { "epoch": 0.37424757916775714, "grad_norm": 5.049463272094727, "learning_rate": 3.9404833730564974e-07, "logits/chosen": -2.735870838165283, "logits/rejected": -2.722884178161621, "logps/chosen": -285.8304443359375, "logps/rejected": -297.43341064453125, "loss": 0.6055, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12556666135787964, "rewards/margins": 0.2363204061985016, "rewards/rejected": -0.3618870973587036, "step": 1430 }, { "epoch": 0.3768646951059932, "grad_norm": 6.007881164550781, "learning_rate": 3.9217522754157117e-07, "logits/chosen": -2.8069920539855957, "logits/rejected": -2.80522084236145, "logps/chosen": -284.0002136230469, "logps/rejected": -286.4706115722656, "loss": 0.5941, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15235498547554016, "rewards/margins": 0.26103848218917847, "rewards/rejected": -0.41339343786239624, "step": 1440 }, { "epoch": 0.37948181104422923, "grad_norm": 4.487087726593018, "learning_rate": 3.9029024618690785e-07, "logits/chosen": -2.8235816955566406, "logits/rejected": -2.7990283966064453, "logps/chosen": -266.3917541503906, "logps/rejected": -270.59381103515625, "loss": 0.6161, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.11356230825185776, "rewards/margins": 0.21844033896923065, "rewards/rejected": -0.3320026695728302, "step": 1450 }, { "epoch": 0.38209892698246534, "grad_norm": 3.7364535331726074, "learning_rate": 3.883935506370605e-07, "logits/chosen": -2.7793936729431152, "logits/rejected": -2.770378589630127, "logps/chosen": -278.8677062988281, "logps/rejected": -271.43145751953125, "loss": 0.6076, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08725923299789429, "rewards/margins": 0.2380957156419754, "rewards/rejected": -0.3253549635410309, "step": 1460 }, { "epoch": 0.3847160429207014, "grad_norm": 4.045937538146973, "learning_rate": 3.864852992655616e-07, "logits/chosen": -2.7860310077667236, "logits/rejected": -2.7741951942443848, "logps/chosen": -279.3297119140625, "logps/rejected": -292.84356689453125, "loss": 0.5813, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.10104944556951523, "rewards/margins": 0.2876027524471283, "rewards/rejected": -0.38865217566490173, "step": 1470 }, { "epoch": 0.38733315885893743, "grad_norm": 5.180766582489014, "learning_rate": 3.845656514108515e-07, "logits/chosen": -2.8035526275634766, "logits/rejected": -2.784550189971924, "logps/chosen": -299.1927490234375, "logps/rejected": -258.96661376953125, "loss": 0.6143, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.18017061054706573, "rewards/margins": 0.21995961666107178, "rewards/rejected": -0.40013018250465393, "step": 1480 }, { "epoch": 0.38995027479717354, "grad_norm": 3.420503616333008, "learning_rate": 3.8263476736297375e-07, "logits/chosen": -2.8004748821258545, "logits/rejected": -2.755922794342041, "logps/chosen": -280.3719177246094, "logps/rejected": -276.71051025390625, "loss": 0.6096, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.11658191680908203, "rewards/margins": 0.22706842422485352, "rewards/rejected": -0.34365034103393555, "step": 1490 }, { "epoch": 0.3925673907354096, "grad_norm": 6.24570369720459, "learning_rate": 3.8069280835019055e-07, "logits/chosen": -2.7886569499969482, "logits/rejected": -2.757636070251465, "logps/chosen": -291.5840759277344, "logps/rejected": -290.7030334472656, "loss": 0.6128, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.07107678055763245, "rewards/margins": 0.2125014066696167, "rewards/rejected": -0.28357818722724915, "step": 1500 }, { "epoch": 0.3925673907354096, "eval_logits/chosen": -2.819805145263672, "eval_logits/rejected": -2.798032283782959, "eval_logps/chosen": -289.036865234375, "eval_logps/rejected": -288.3089904785156, "eval_loss": 0.6180471777915955, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": -0.06299243867397308, "eval_rewards/margins": 0.20571817457675934, "eval_rewards/rejected": -0.2687106430530548, "eval_runtime": 691.9992, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 1500 }, { "epoch": 0.39518450667364563, "grad_norm": 7.418298721313477, "learning_rate": 3.7873993652552073e-07, "logits/chosen": -2.7985031604766846, "logits/rejected": -2.7847418785095215, "logps/chosen": -256.2576904296875, "logps/rejected": -263.3230895996094, "loss": 0.646, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07267605513334274, "rewards/margins": 0.14168903231620789, "rewards/rejected": -0.21436509490013123, "step": 1510 }, { "epoch": 0.39780162261188173, "grad_norm": 3.0412213802337646, "learning_rate": 3.767763149531995e-07, "logits/chosen": -2.8065857887268066, "logits/rejected": -2.792532205581665, "logps/chosen": -282.3772888183594, "logps/rejected": -286.32757568359375, "loss": 0.6036, "rewards/accuracies": 0.75, "rewards/chosen": -0.029223937541246414, "rewards/margins": 0.23573264479637146, "rewards/rejected": -0.26495662331581116, "step": 1520 }, { "epoch": 0.4004187385501178, "grad_norm": 6.914887428283691, "learning_rate": 3.7480210759506326e-07, "logits/chosen": -2.771960973739624, "logits/rejected": -2.769230365753174, "logps/chosen": -301.027099609375, "logps/rejected": -306.0934143066406, "loss": 0.6321, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.05497425049543381, "rewards/margins": 0.1824551671743393, "rewards/rejected": -0.2374294102191925, "step": 1530 }, { "epoch": 0.40303585448835383, "grad_norm": 5.229218006134033, "learning_rate": 3.728174792968582e-07, "logits/chosen": -2.7818996906280518, "logits/rejected": -2.753554582595825, "logps/chosen": -264.9828186035156, "logps/rejected": -266.6888122558594, "loss": 0.6304, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.10081575810909271, "rewards/margins": 0.1800784170627594, "rewards/rejected": -0.2808941900730133, "step": 1540 }, { "epoch": 0.4056529704265899, "grad_norm": 3.8269035816192627, "learning_rate": 3.70822595774476e-07, "logits/chosen": -2.8083198070526123, "logits/rejected": -2.7798688411712646, "logps/chosen": -294.8878479003906, "logps/rejected": -306.19659423828125, "loss": 0.5877, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06873732060194016, "rewards/margins": 0.28800445795059204, "rewards/rejected": -0.3567417860031128, "step": 1550 }, { "epoch": 0.408270086364826, "grad_norm": 6.544018268585205, "learning_rate": 3.688176236001168e-07, "logits/chosen": -2.7987208366394043, "logits/rejected": -2.7670371532440186, "logps/chosen": -304.5577392578125, "logps/rejected": -289.78729248046875, "loss": 0.611, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0676363930106163, "rewards/margins": 0.23785026371479034, "rewards/rejected": -0.30548661947250366, "step": 1560 }, { "epoch": 0.410887202303062, "grad_norm": 9.901212692260742, "learning_rate": 3.6680273018838016e-07, "logits/chosen": -2.8177802562713623, "logits/rejected": -2.806378126144409, "logps/chosen": -281.0837707519531, "logps/rejected": -286.8470153808594, "loss": 0.6035, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.11407822370529175, "rewards/margins": 0.25138336420059204, "rewards/rejected": -0.3654615879058838, "step": 1570 }, { "epoch": 0.4135043182412981, "grad_norm": 7.281955718994141, "learning_rate": 3.6477808378228596e-07, "logits/chosen": -2.787090539932251, "logits/rejected": -2.7860255241394043, "logps/chosen": -283.32928466796875, "logps/rejected": -338.25714111328125, "loss": 0.6043, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12236142158508301, "rewards/margins": 0.2562143802642822, "rewards/rejected": -0.37857580184936523, "step": 1580 }, { "epoch": 0.4161214341795342, "grad_norm": 8.57088565826416, "learning_rate": 3.6274385343922674e-07, "logits/chosen": -2.8543007373809814, "logits/rejected": -2.8531434535980225, "logps/chosen": -267.55767822265625, "logps/rejected": -295.7901306152344, "loss": 0.6187, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15387986600399017, "rewards/margins": 0.21341195702552795, "rewards/rejected": -0.36729180812835693, "step": 1590 }, { "epoch": 0.4187385501177702, "grad_norm": 5.7539849281311035, "learning_rate": 3.6070020901685057e-07, "logits/chosen": -2.7576816082000732, "logits/rejected": -2.769594669342041, "logps/chosen": -300.43572998046875, "logps/rejected": -298.788818359375, "loss": 0.6223, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18216048181056976, "rewards/margins": 0.21212442219257355, "rewards/rejected": -0.3942849040031433, "step": 1600 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -2.8147764205932617, "eval_logits/rejected": -2.792606830596924, "eval_logps/chosen": -299.62200927734375, "eval_logps/rejected": -302.40740966796875, "eval_loss": 0.6088424324989319, "eval_rewards/accuracies": 0.6945000290870667, "eval_rewards/chosen": -0.16884401440620422, "eval_rewards/margins": 0.2408505380153656, "eval_rewards/rejected": -0.4096945822238922, "eval_runtime": 691.674, "eval_samples_per_second": 2.892, "eval_steps_per_second": 0.361, "step": 1600 }, { "epoch": 0.4213556660560063, "grad_norm": 6.157792568206787, "learning_rate": 3.5864732115887863e-07, "logits/chosen": -2.81066632270813, "logits/rejected": -2.802830219268799, "logps/chosen": -273.0591735839844, "logps/rejected": -307.04254150390625, "loss": 0.5896, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12094251811504364, "rewards/margins": 0.2827422022819519, "rewards/rejected": -0.40368470549583435, "step": 1610 }, { "epoch": 0.4239727819942423, "grad_norm": 6.331284999847412, "learning_rate": 3.565853612808562e-07, "logits/chosen": -2.823272466659546, "logits/rejected": -2.794790744781494, "logps/chosen": -303.06683349609375, "logps/rejected": -291.0, "loss": 0.639, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.23127253353595734, "rewards/margins": 0.17943724989891052, "rewards/rejected": -0.41070979833602905, "step": 1620 }, { "epoch": 0.4265898979324784, "grad_norm": 9.121101379394531, "learning_rate": 3.5451450155583984e-07, "logits/chosen": -2.733624219894409, "logits/rejected": -2.7721478939056396, "logps/chosen": -277.8062744140625, "logps/rejected": -282.9922790527344, "loss": 0.623, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.28953424096107483, "rewards/margins": 0.21646256744861603, "rewards/rejected": -0.5059967041015625, "step": 1630 }, { "epoch": 0.42920701387071447, "grad_norm": 4.436567306518555, "learning_rate": 3.5243491490002055e-07, "logits/chosen": -2.817996025085449, "logits/rejected": -2.8122916221618652, "logps/chosen": -305.4420471191406, "logps/rejected": -318.54742431640625, "loss": 0.6265, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.32780542969703674, "rewards/margins": 0.21562886238098145, "rewards/rejected": -0.5434342622756958, "step": 1640 }, { "epoch": 0.4318241298089505, "grad_norm": 7.695457935333252, "learning_rate": 3.503467749582857e-07, "logits/chosen": -2.790708303451538, "logits/rejected": -2.7539708614349365, "logps/chosen": -298.7849426269531, "logps/rejected": -281.51995849609375, "loss": 0.6324, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.2754608690738678, "rewards/margins": 0.19722957909107208, "rewards/rejected": -0.47269049286842346, "step": 1650 }, { "epoch": 0.4344412457471866, "grad_norm": 8.035721778869629, "learning_rate": 3.482502560897194e-07, "logits/chosen": -2.7719411849975586, "logits/rejected": -2.762267589569092, "logps/chosen": -256.39263916015625, "logps/rejected": -276.6297607421875, "loss": 0.6336, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19001971185207367, "rewards/margins": 0.172675222158432, "rewards/rejected": -0.3626949191093445, "step": 1660 }, { "epoch": 0.43705836168542267, "grad_norm": 4.791623115539551, "learning_rate": 3.4614553335304403e-07, "logits/chosen": -2.8094491958618164, "logits/rejected": -2.7578389644622803, "logps/chosen": -303.371337890625, "logps/rejected": -291.80615234375, "loss": 0.5957, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.12800468504428864, "rewards/margins": 0.26551762223243713, "rewards/rejected": -0.39352232217788696, "step": 1670 }, { "epoch": 0.4396754776236587, "grad_norm": 7.589243412017822, "learning_rate": 3.440327824920022e-07, "logits/chosen": -2.7957282066345215, "logits/rejected": -2.775707483291626, "logps/chosen": -309.8748474121094, "logps/rejected": -299.0494384765625, "loss": 0.5742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08200428634881973, "rewards/margins": 0.3152574598789215, "rewards/rejected": -0.39726167917251587, "step": 1680 }, { "epoch": 0.44229259356189476, "grad_norm": 6.186291694641113, "learning_rate": 3.4191217992068287e-07, "logits/chosen": -2.8362536430358887, "logits/rejected": -2.8137047290802, "logps/chosen": -306.2242431640625, "logps/rejected": -284.80548095703125, "loss": 0.6043, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.115182064473629, "rewards/margins": 0.25850868225097656, "rewards/rejected": -0.37369078397750854, "step": 1690 }, { "epoch": 0.44490970950013087, "grad_norm": 12.576449394226074, "learning_rate": 3.3978390270879056e-07, "logits/chosen": -2.7859883308410645, "logits/rejected": -2.7761070728302, "logps/chosen": -251.69168090820312, "logps/rejected": -273.64825439453125, "loss": 0.6338, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23590262234210968, "rewards/margins": 0.1843734234571457, "rewards/rejected": -0.42027607560157776, "step": 1700 }, { "epoch": 0.44490970950013087, "eval_logits/chosen": -2.818115234375, "eval_logits/rejected": -2.7960946559906006, "eval_logps/chosen": -304.2535095214844, "eval_logps/rejected": -308.0869140625, "eval_loss": 0.6060847043991089, "eval_rewards/accuracies": 0.6924999952316284, "eval_rewards/chosen": -0.21515871584415436, "eval_rewards/margins": 0.2513309419155121, "eval_rewards/rejected": -0.46648964285850525, "eval_runtime": 691.2139, "eval_samples_per_second": 2.893, "eval_steps_per_second": 0.362, "step": 1700 }, { "epoch": 0.4475268254383669, "grad_norm": 8.074392318725586, "learning_rate": 3.376481285668599e-07, "logits/chosen": -2.8055875301361084, "logits/rejected": -2.8101181983947754, "logps/chosen": -259.6014404296875, "logps/rejected": -299.0648193359375, "loss": 0.6022, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.21092364192008972, "rewards/margins": 0.25584885478019714, "rewards/rejected": -0.4667724668979645, "step": 1710 }, { "epoch": 0.45014394137660296, "grad_norm": 9.234480857849121, "learning_rate": 3.355050358314172e-07, "logits/chosen": -2.838655948638916, "logits/rejected": -2.825796604156494, "logps/chosen": -299.0382995605469, "logps/rejected": -306.70733642578125, "loss": 0.5981, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14200787246227264, "rewards/margins": 0.2596356272697449, "rewards/rejected": -0.40164345502853394, "step": 1720 }, { "epoch": 0.45276105731483907, "grad_norm": 6.1853437423706055, "learning_rate": 3.33354803450089e-07, "logits/chosen": -2.745539426803589, "logits/rejected": -2.7465980052948, "logps/chosen": -298.8321533203125, "logps/rejected": -300.1834411621094, "loss": 0.6179, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.14898642897605896, "rewards/margins": 0.23417282104492188, "rewards/rejected": -0.38315925002098083, "step": 1730 }, { "epoch": 0.4553781732530751, "grad_norm": 3.701824426651001, "learning_rate": 3.311976109666605e-07, "logits/chosen": -2.762765407562256, "logits/rejected": -2.745163917541504, "logps/chosen": -306.2688293457031, "logps/rejected": -297.1578369140625, "loss": 0.6142, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.11404751241207123, "rewards/margins": 0.22985681891441345, "rewards/rejected": -0.3439043462276459, "step": 1740 }, { "epoch": 0.45799528919131116, "grad_norm": 5.698086738586426, "learning_rate": 3.2903363850608317e-07, "logits/chosen": -2.8657941818237305, "logits/rejected": -2.8256325721740723, "logps/chosen": -286.952392578125, "logps/rejected": -288.02484130859375, "loss": 0.609, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.22551126778125763, "rewards/margins": 0.23173291981220245, "rewards/rejected": -0.45724421739578247, "step": 1750 }, { "epoch": 0.46061240512954726, "grad_norm": 7.6980085372924805, "learning_rate": 3.2686306675943477e-07, "logits/chosen": -2.792118549346924, "logits/rejected": -2.8060059547424316, "logps/chosen": -294.06951904296875, "logps/rejected": -291.16302490234375, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20452764630317688, "rewards/margins": 0.24044232070446014, "rewards/rejected": -0.44496995210647583, "step": 1760 }, { "epoch": 0.4632295210677833, "grad_norm": 4.300843238830566, "learning_rate": 3.2468607696883145e-07, "logits/chosen": -2.7653212547302246, "logits/rejected": -2.756118059158325, "logps/chosen": -298.01544189453125, "logps/rejected": -333.34234619140625, "loss": 0.5883, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2969765067100525, "rewards/margins": 0.29465410113334656, "rewards/rejected": -0.5916305780410767, "step": 1770 }, { "epoch": 0.46584663700601936, "grad_norm": 9.618111610412598, "learning_rate": 3.2250285091229435e-07, "logits/chosen": -2.825916290283203, "logits/rejected": -2.8047428131103516, "logps/chosen": -277.54571533203125, "logps/rejected": -286.90704345703125, "loss": 0.6269, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2800549864768982, "rewards/margins": 0.20103518664836884, "rewards/rejected": -0.4810902178287506, "step": 1780 }, { "epoch": 0.4684637529442554, "grad_norm": 15.666852951049805, "learning_rate": 3.2031357088857083e-07, "logits/chosen": -2.8130288124084473, "logits/rejected": -2.8077621459960938, "logps/chosen": -317.0379333496094, "logps/rejected": -347.8671569824219, "loss": 0.6115, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24338212609291077, "rewards/margins": 0.24569590389728546, "rewards/rejected": -0.4890781044960022, "step": 1790 }, { "epoch": 0.4710808688824915, "grad_norm": 6.9462571144104, "learning_rate": 3.1811841970191267e-07, "logits/chosen": -2.736687183380127, "logits/rejected": -2.714433193206787, "logps/chosen": -264.3397521972656, "logps/rejected": -324.6456604003906, "loss": 0.585, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.18001236021518707, "rewards/margins": 0.31897181272506714, "rewards/rejected": -0.4989841878414154, "step": 1800 }, { "epoch": 0.4710808688824915, "eval_logits/chosen": -2.8173904418945312, "eval_logits/rejected": -2.7949471473693848, "eval_logps/chosen": -296.00537109375, "eval_logps/rejected": -299.93682861328125, "eval_loss": 0.6049584746360779, "eval_rewards/accuracies": 0.6915000081062317, "eval_rewards/chosen": -0.1326776146888733, "eval_rewards/margins": 0.25231143832206726, "eval_rewards/rejected": -0.38498908281326294, "eval_runtime": 691.5153, "eval_samples_per_second": 2.892, "eval_steps_per_second": 0.362, "step": 1800 }, { "epoch": 0.47369798482072756, "grad_norm": 4.673962116241455, "learning_rate": 3.1591758064681257e-07, "logits/chosen": -2.7477469444274902, "logits/rejected": -2.7178540229797363, "logps/chosen": -282.83074951171875, "logps/rejected": -272.26715087890625, "loss": 0.5961, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.11454129219055176, "rewards/margins": 0.27904239296913147, "rewards/rejected": -0.3935837149620056, "step": 1810 }, { "epoch": 0.4763151007589636, "grad_norm": 7.684245586395264, "learning_rate": 3.13711237492698e-07, "logits/chosen": -2.7976129055023193, "logits/rejected": -2.7869057655334473, "logps/chosen": -313.35540771484375, "logps/rejected": -318.04559326171875, "loss": 0.6319, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1507539302110672, "rewards/margins": 0.1945343315601349, "rewards/rejected": -0.3452882170677185, "step": 1820 }, { "epoch": 0.4789322166971997, "grad_norm": 4.426579475402832, "learning_rate": 3.1149957446858767e-07, "logits/chosen": -2.791010618209839, "logits/rejected": -2.807931423187256, "logps/chosen": -277.4505310058594, "logps/rejected": -279.3646240234375, "loss": 0.6403, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12662403285503387, "rewards/margins": 0.16396556794643402, "rewards/rejected": -0.2905896306037903, "step": 1830 }, { "epoch": 0.48154933263543576, "grad_norm": 5.900054931640625, "learning_rate": 3.0928277624770736e-07, "logits/chosen": -2.843986988067627, "logits/rejected": -2.823529005050659, "logps/chosen": -312.50799560546875, "logps/rejected": -315.56402587890625, "loss": 0.5825, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.0948447436094284, "rewards/margins": 0.32227185368537903, "rewards/rejected": -0.41711658239364624, "step": 1840 }, { "epoch": 0.4841664485736718, "grad_norm": 4.000248908996582, "learning_rate": 3.0706102793207073e-07, "logits/chosen": -2.8290603160858154, "logits/rejected": -2.8024706840515137, "logps/chosen": -316.80023193359375, "logps/rejected": -323.507080078125, "loss": 0.5882, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1260642558336258, "rewards/margins": 0.2963547706604004, "rewards/rejected": -0.422419011592865, "step": 1850 }, { "epoch": 0.48678356451190785, "grad_norm": 7.178162574768066, "learning_rate": 3.048345150370226e-07, "logits/chosen": -2.8230552673339844, "logits/rejected": -2.817823886871338, "logps/chosen": -320.08123779296875, "logps/rejected": -328.2519836425781, "loss": 0.6011, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1783401370048523, "rewards/margins": 0.27760833501815796, "rewards/rejected": -0.45594844222068787, "step": 1860 }, { "epoch": 0.48940068045014395, "grad_norm": 5.042900562286377, "learning_rate": 3.0260342347574913e-07, "logits/chosen": -2.809600353240967, "logits/rejected": -2.78784441947937, "logps/chosen": -304.2792053222656, "logps/rejected": -314.709716796875, "loss": 0.5808, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1620454490184784, "rewards/margins": 0.3016073703765869, "rewards/rejected": -0.4636527895927429, "step": 1870 }, { "epoch": 0.49201779638838, "grad_norm": 6.708124160766602, "learning_rate": 3.0036793954375357e-07, "logits/chosen": -2.840010643005371, "logits/rejected": -2.820410966873169, "logps/chosen": -301.98583984375, "logps/rejected": -291.33465576171875, "loss": 0.5776, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15946264564990997, "rewards/margins": 0.32609638571739197, "rewards/rejected": -0.48555904626846313, "step": 1880 }, { "epoch": 0.49463491232661605, "grad_norm": 4.842483043670654, "learning_rate": 2.9812824990330085e-07, "logits/chosen": -2.8116726875305176, "logits/rejected": -2.8013501167297363, "logps/chosen": -312.96807861328125, "logps/rejected": -315.23675537109375, "loss": 0.5975, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.20859690010547638, "rewards/margins": 0.28837090730667114, "rewards/rejected": -0.4969678521156311, "step": 1890 }, { "epoch": 0.49725202826485215, "grad_norm": 11.47492790222168, "learning_rate": 2.958845415678316e-07, "logits/chosen": -2.8100364208221436, "logits/rejected": -2.7813189029693604, "logps/chosen": -317.1954650878906, "logps/rejected": -327.9840087890625, "loss": 0.577, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21498079597949982, "rewards/margins": 0.32284659147262573, "rewards/rejected": -0.5378273725509644, "step": 1900 }, { "epoch": 0.49725202826485215, "eval_logits/chosen": -2.8176026344299316, "eval_logits/rejected": -2.7953593730926514, "eval_logps/chosen": -304.433349609375, "eval_logps/rejected": -310.2669677734375, "eval_loss": 0.6012681722640991, "eval_rewards/accuracies": 0.6965000033378601, "eval_rewards/chosen": -0.2169574648141861, "eval_rewards/margins": 0.27133309841156006, "eval_rewards/rejected": -0.4882905185222626, "eval_runtime": 691.3293, "eval_samples_per_second": 2.893, "eval_steps_per_second": 0.362, "step": 1900 }, { "epoch": 0.4998691442030882, "grad_norm": 8.036276817321777, "learning_rate": 2.936370018863459e-07, "logits/chosen": -2.833437442779541, "logits/rejected": -2.8240761756896973, "logps/chosen": -301.29473876953125, "logps/rejected": -287.30487060546875, "loss": 0.6058, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2125242054462433, "rewards/margins": 0.2442711889743805, "rewards/rejected": -0.4567953944206238, "step": 1910 }, { "epoch": 0.5024862601413242, "grad_norm": 6.088084697723389, "learning_rate": 2.913858185277605e-07, "logits/chosen": -2.793074131011963, "logits/rejected": -2.7879836559295654, "logps/chosen": -291.63409423828125, "logps/rejected": -303.8699035644531, "loss": 0.5963, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14563243091106415, "rewards/margins": 0.27376314997673035, "rewards/rejected": -0.4193955361843109, "step": 1920 }, { "epoch": 0.5051033760795604, "grad_norm": 6.633253574371338, "learning_rate": 2.89131179465238e-07, "logits/chosen": -2.763582706451416, "logits/rejected": -2.7273335456848145, "logps/chosen": -300.27764892578125, "logps/rejected": -291.0055236816406, "loss": 0.5841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.12304127216339111, "rewards/margins": 0.3036150336265564, "rewards/rejected": -0.4266563355922699, "step": 1930 }, { "epoch": 0.5077204920177963, "grad_norm": 4.170144557952881, "learning_rate": 2.8687327296049125e-07, "logits/chosen": -2.803448438644409, "logits/rejected": -2.7855215072631836, "logps/chosen": -287.71673583984375, "logps/rejected": -312.64544677734375, "loss": 0.6077, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.14048686623573303, "rewards/margins": 0.2633481025695801, "rewards/rejected": -0.4038349688053131, "step": 1940 }, { "epoch": 0.5103376079560324, "grad_norm": 4.711779594421387, "learning_rate": 2.846122875480637e-07, "logits/chosen": -2.823185682296753, "logits/rejected": -2.7931466102600098, "logps/chosen": -301.4597473144531, "logps/rejected": -299.9159851074219, "loss": 0.6066, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10608525574207306, "rewards/margins": 0.25272199511528015, "rewards/rejected": -0.3588072657585144, "step": 1950 }, { "epoch": 0.5129547238942685, "grad_norm": 5.881545543670654, "learning_rate": 2.8234841201958647e-07, "logits/chosen": -2.8165388107299805, "logits/rejected": -2.784043550491333, "logps/chosen": -311.29217529296875, "logps/rejected": -301.19964599609375, "loss": 0.5839, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1093025654554367, "rewards/margins": 0.299915611743927, "rewards/rejected": -0.4092181622982025, "step": 1960 }, { "epoch": 0.5155718398325045, "grad_norm": 10.640946388244629, "learning_rate": 2.800818354080148e-07, "logits/chosen": -2.7974326610565186, "logits/rejected": -2.7710323333740234, "logps/chosen": -303.19610595703125, "logps/rejected": -281.1106872558594, "loss": 0.6138, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13868093490600586, "rewards/margins": 0.2444918155670166, "rewards/rejected": -0.38317275047302246, "step": 1970 }, { "epoch": 0.5181889557707406, "grad_norm": 5.855273246765137, "learning_rate": 2.778127469718435e-07, "logits/chosen": -2.751603364944458, "logits/rejected": -2.7628543376922607, "logps/chosen": -261.6673278808594, "logps/rejected": -309.0796813964844, "loss": 0.5864, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1476416289806366, "rewards/margins": 0.2927255630493164, "rewards/rejected": -0.4403671622276306, "step": 1980 }, { "epoch": 0.5208060717089767, "grad_norm": 5.992628574371338, "learning_rate": 2.755413361793039e-07, "logits/chosen": -2.7673847675323486, "logits/rejected": -2.7404510974884033, "logps/chosen": -280.890869140625, "logps/rejected": -294.01092529296875, "loss": 0.6048, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15447109937667847, "rewards/margins": 0.2593531310558319, "rewards/rejected": -0.4138242304325104, "step": 1990 }, { "epoch": 0.5234231876472127, "grad_norm": 6.741150379180908, "learning_rate": 2.7326779269254356e-07, "logits/chosen": -2.826737880706787, "logits/rejected": -2.811283588409424, "logps/chosen": -320.9913024902344, "logps/rejected": -290.5726318359375, "loss": 0.5945, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14564435184001923, "rewards/margins": 0.29357942938804626, "rewards/rejected": -0.4392237663269043, "step": 2000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -2.812201976776123, "eval_logits/rejected": -2.7902560234069824, "eval_logps/chosen": -303.8027648925781, "eval_logps/rejected": -310.42926025390625, "eval_loss": 0.5991718173027039, "eval_rewards/accuracies": 0.6995000243186951, "eval_rewards/chosen": -0.21065115928649902, "eval_rewards/margins": 0.27926215529441833, "eval_rewards/rejected": -0.48991334438323975, "eval_runtime": 691.9553, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 2000 }, { "epoch": 0.5260403035854488, "grad_norm": 5.159753322601318, "learning_rate": 2.709923063517895e-07, "logits/chosen": -2.770754337310791, "logits/rejected": -2.7877042293548584, "logps/chosen": -297.4669494628906, "logps/rejected": -326.15008544921875, "loss": 0.5803, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.18324916064739227, "rewards/margins": 0.3264145255088806, "rewards/rejected": -0.5096637010574341, "step": 2010 }, { "epoch": 0.528657419523685, "grad_norm": 9.780900001525879, "learning_rate": 2.68715067159496e-07, "logits/chosen": -2.804417133331299, "logits/rejected": -2.7843241691589355, "logps/chosen": -287.03619384765625, "logps/rejected": -296.3020324707031, "loss": 0.5831, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.18021352589130402, "rewards/margins": 0.30431440472602844, "rewards/rejected": -0.4845278859138489, "step": 2020 }, { "epoch": 0.5312745354619209, "grad_norm": 7.88455867767334, "learning_rate": 2.664362652644806e-07, "logits/chosen": -2.820744514465332, "logits/rejected": -2.8191521167755127, "logps/chosen": -334.691650390625, "logps/rejected": -322.51885986328125, "loss": 0.5813, "rewards/accuracies": 0.71875, "rewards/chosen": -0.22317573428153992, "rewards/margins": 0.33498162031173706, "rewards/rejected": -0.5581573247909546, "step": 2030 }, { "epoch": 0.533891651400157, "grad_norm": 6.620345115661621, "learning_rate": 2.6415609094604555e-07, "logits/chosen": -2.802522659301758, "logits/rejected": -2.8061249256134033, "logps/chosen": -310.2366638183594, "logps/rejected": -317.20941162109375, "loss": 0.6023, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.22533388435840607, "rewards/margins": 0.28193774819374084, "rewards/rejected": -0.5072715878486633, "step": 2040 }, { "epoch": 0.5365087673383931, "grad_norm": 8.580389022827148, "learning_rate": 2.618747345980904e-07, "logits/chosen": -2.8094029426574707, "logits/rejected": -2.768106460571289, "logps/chosen": -293.4418029785156, "logps/rejected": -266.50897216796875, "loss": 0.6014, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.28857478499412537, "rewards/margins": 0.2732298970222473, "rewards/rejected": -0.5618046522140503, "step": 2050 }, { "epoch": 0.5391258832766291, "grad_norm": 11.197132110595703, "learning_rate": 2.595923867132136e-07, "logits/chosen": -2.8401012420654297, "logits/rejected": -2.835894823074341, "logps/chosen": -327.6039733886719, "logps/rejected": -335.93634033203125, "loss": 0.5892, "rewards/accuracies": 0.71875, "rewards/chosen": -0.29747676849365234, "rewards/margins": 0.3320815861225128, "rewards/rejected": -0.6295583844184875, "step": 2060 }, { "epoch": 0.5417429992148652, "grad_norm": 7.386964797973633, "learning_rate": 2.5730923786680667e-07, "logits/chosen": -2.820725917816162, "logits/rejected": -2.821699619293213, "logps/chosen": -294.2755432128906, "logps/rejected": -329.28900146484375, "loss": 0.6084, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.28539037704467773, "rewards/margins": 0.27198493480682373, "rewards/rejected": -0.5573753714561462, "step": 2070 }, { "epoch": 0.5443601151531012, "grad_norm": 10.91450023651123, "learning_rate": 2.5502547870114135e-07, "logits/chosen": -2.798468589782715, "logits/rejected": -2.764756441116333, "logps/chosen": -296.8208923339844, "logps/rejected": -290.93609619140625, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.25504210591316223, "rewards/margins": 0.26738548278808594, "rewards/rejected": -0.5224276185035706, "step": 2080 }, { "epoch": 0.5469772310913373, "grad_norm": 9.419450759887695, "learning_rate": 2.527412999094506e-07, "logits/chosen": -2.7591891288757324, "logits/rejected": -2.7384586334228516, "logps/chosen": -340.7040100097656, "logps/rejected": -353.3229064941406, "loss": 0.5947, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2267749011516571, "rewards/margins": 0.2946481704711914, "rewards/rejected": -0.5214229822158813, "step": 2090 }, { "epoch": 0.5495943470295734, "grad_norm": 9.121070861816406, "learning_rate": 2.5045689222000636e-07, "logits/chosen": -2.748777151107788, "logits/rejected": -2.737816333770752, "logps/chosen": -279.33941650390625, "logps/rejected": -290.88262939453125, "loss": 0.5913, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.20830078423023224, "rewards/margins": 0.28861740231513977, "rewards/rejected": -0.4969182014465332, "step": 2100 }, { "epoch": 0.5495943470295734, "eval_logits/chosen": -2.8085484504699707, "eval_logits/rejected": -2.786346673965454, "eval_logps/chosen": -306.4640808105469, "eval_logps/rejected": -313.952880859375, "eval_loss": 0.5981019139289856, "eval_rewards/accuracies": 0.7024999856948853, "eval_rewards/chosen": -0.23726463317871094, "eval_rewards/margins": 0.2878848612308502, "eval_rewards/rejected": -0.5251494646072388, "eval_runtime": 690.4278, "eval_samples_per_second": 2.897, "eval_steps_per_second": 0.362, "step": 2100 }, { "epoch": 0.5522114629678094, "grad_norm": 7.360952854156494, "learning_rate": 2.481724463801933e-07, "logits/chosen": -2.7974154949188232, "logits/rejected": -2.7778165340423584, "logps/chosen": -320.70465087890625, "logps/rejected": -308.23455810546875, "loss": 0.5916, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.25429460406303406, "rewards/margins": 0.29730120301246643, "rewards/rejected": -0.5515958070755005, "step": 2110 }, { "epoch": 0.5548285789060455, "grad_norm": 9.077162742614746, "learning_rate": 2.4588815314058154e-07, "logits/chosen": -2.7863690853118896, "logits/rejected": -2.787247896194458, "logps/chosen": -283.7870788574219, "logps/rejected": -277.558837890625, "loss": 0.5976, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24108314514160156, "rewards/margins": 0.27977603673934937, "rewards/rejected": -0.5208591818809509, "step": 2120 }, { "epoch": 0.5574456948442816, "grad_norm": 6.194889545440674, "learning_rate": 2.4360420323899917e-07, "logits/chosen": -2.7870755195617676, "logits/rejected": -2.779362916946411, "logps/chosen": -321.5159606933594, "logps/rejected": -313.3367614746094, "loss": 0.6106, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.25045931339263916, "rewards/margins": 0.27981314063072205, "rewards/rejected": -0.5302724242210388, "step": 2130 }, { "epoch": 0.5600628107825176, "grad_norm": 9.01162338256836, "learning_rate": 2.4132078738460583e-07, "logits/chosen": -2.821700096130371, "logits/rejected": -2.7977004051208496, "logps/chosen": -299.77734375, "logps/rejected": -288.15472412109375, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2075999230146408, "rewards/margins": 0.2872273027896881, "rewards/rejected": -0.49482718110084534, "step": 2140 }, { "epoch": 0.5626799267207537, "grad_norm": 8.978148460388184, "learning_rate": 2.390380962419682e-07, "logits/chosen": -2.7910008430480957, "logits/rejected": -2.7853500843048096, "logps/chosen": -271.1761474609375, "logps/rejected": -258.0618896484375, "loss": 0.6279, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2159349024295807, "rewards/margins": 0.2157304286956787, "rewards/rejected": -0.4316653609275818, "step": 2150 }, { "epoch": 0.5652970426589898, "grad_norm": 10.330108642578125, "learning_rate": 2.3675632041513977e-07, "logits/chosen": -2.8272249698638916, "logits/rejected": -2.781740427017212, "logps/chosen": -321.1408996582031, "logps/rejected": -290.31451416015625, "loss": 0.566, "rewards/accuracies": 0.75, "rewards/chosen": -0.1839137077331543, "rewards/margins": 0.36078041791915894, "rewards/rejected": -0.5446941256523132, "step": 2160 }, { "epoch": 0.5679141585972258, "grad_norm": 4.827859401702881, "learning_rate": 2.344756504317453e-07, "logits/chosen": -2.7731990814208984, "logits/rejected": -2.739841938018799, "logps/chosen": -311.63385009765625, "logps/rejected": -300.05657958984375, "loss": 0.6069, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.37105852365493774, "rewards/margins": 0.2651851773262024, "rewards/rejected": -0.6362437009811401, "step": 2170 }, { "epoch": 0.5705312745354619, "grad_norm": 7.324320316314697, "learning_rate": 2.3219627672707237e-07, "logits/chosen": -2.7636940479278564, "logits/rejected": -2.7629504203796387, "logps/chosen": -312.3614196777344, "logps/rejected": -291.49920654296875, "loss": 0.6201, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.40163812041282654, "rewards/margins": 0.2354915589094162, "rewards/rejected": -0.6371296644210815, "step": 2180 }, { "epoch": 0.573148390473698, "grad_norm": 9.793487548828125, "learning_rate": 2.2991838962816918e-07, "logits/chosen": -2.760166645050049, "logits/rejected": -2.7421138286590576, "logps/chosen": -309.69378662109375, "logps/rejected": -330.1057434082031, "loss": 0.6189, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.391974538564682, "rewards/margins": 0.23559853434562683, "rewards/rejected": -0.6275731325149536, "step": 2190 }, { "epoch": 0.575765506411934, "grad_norm": 4.884433746337891, "learning_rate": 2.2764217933795297e-07, "logits/chosen": -2.7735462188720703, "logits/rejected": -2.7576115131378174, "logps/chosen": -306.01983642578125, "logps/rejected": -319.36273193359375, "loss": 0.5816, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.28672754764556885, "rewards/margins": 0.3387922942638397, "rewards/rejected": -0.625519871711731, "step": 2200 }, { "epoch": 0.575765506411934, "eval_logits/chosen": -2.806988000869751, "eval_logits/rejected": -2.7848920822143555, "eval_logps/chosen": -309.6146240234375, "eval_logps/rejected": -317.14105224609375, "eval_loss": 0.5989395976066589, "eval_rewards/accuracies": 0.6970000267028809, "eval_rewards/chosen": -0.26876989006996155, "eval_rewards/margins": 0.28826138377189636, "eval_rewards/rejected": -0.5570313334465027, "eval_runtime": 692.0182, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 2200 }, { "epoch": 0.5783826223501701, "grad_norm": 5.080691337585449, "learning_rate": 2.253678359193278e-07, "logits/chosen": -2.8626627922058105, "logits/rejected": -2.8227312564849854, "logps/chosen": -323.10284423828125, "logps/rejected": -324.9154968261719, "loss": 0.6192, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.28973332047462463, "rewards/margins": 0.24134087562561035, "rewards/rejected": -0.5310741662979126, "step": 2210 }, { "epoch": 0.5809997382884062, "grad_norm": 8.136847496032715, "learning_rate": 2.230955492793149e-07, "logits/chosen": -2.7363781929016113, "logits/rejected": -2.747398853302002, "logps/chosen": -315.01092529296875, "logps/rejected": -321.312744140625, "loss": 0.6301, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2270394265651703, "rewards/margins": 0.22412936389446259, "rewards/rejected": -0.4511687755584717, "step": 2220 }, { "epoch": 0.5836168542266422, "grad_norm": 3.2636797428131104, "learning_rate": 2.2082550915319468e-07, "logits/chosen": -2.746173858642578, "logits/rejected": -2.7479488849639893, "logps/chosen": -311.60443115234375, "logps/rejected": -304.00933837890625, "loss": 0.5897, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.16526171565055847, "rewards/margins": 0.31148332357406616, "rewards/rejected": -0.47674503922462463, "step": 2230 }, { "epoch": 0.5862339701648783, "grad_norm": 7.513117790222168, "learning_rate": 2.1855790508866433e-07, "logits/chosen": -2.7626214027404785, "logits/rejected": -2.766356945037842, "logps/chosen": -345.93560791015625, "logps/rejected": -345.16632080078125, "loss": 0.6017, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19639845192432404, "rewards/margins": 0.2772556245326996, "rewards/rejected": -0.473654180765152, "step": 2240 }, { "epoch": 0.5888510861031143, "grad_norm": 4.226502418518066, "learning_rate": 2.162929264300107e-07, "logits/chosen": -2.7443809509277344, "logits/rejected": -2.740731716156006, "logps/chosen": -298.61883544921875, "logps/rejected": -312.0686950683594, "loss": 0.5729, "rewards/accuracies": 0.75, "rewards/chosen": -0.14046551287174225, "rewards/margins": 0.34373658895492554, "rewards/rejected": -0.4842020869255066, "step": 2250 }, { "epoch": 0.5914682020413504, "grad_norm": 5.33687162399292, "learning_rate": 2.1403076230230005e-07, "logits/chosen": -2.767137289047241, "logits/rejected": -2.7396111488342285, "logps/chosen": -312.28643798828125, "logps/rejected": -306.20172119140625, "loss": 0.616, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.19273105263710022, "rewards/margins": 0.26331207156181335, "rewards/rejected": -0.45604315400123596, "step": 2260 }, { "epoch": 0.5940853179795865, "grad_norm": 9.639008522033691, "learning_rate": 2.1177160159558596e-07, "logits/chosen": -2.7518250942230225, "logits/rejected": -2.7383649349212646, "logps/chosen": -321.7221374511719, "logps/rejected": -297.3667297363281, "loss": 0.6038, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21679162979125977, "rewards/margins": 0.29109686613082886, "rewards/rejected": -0.5078884959220886, "step": 2270 }, { "epoch": 0.5967024339178225, "grad_norm": 6.384767055511475, "learning_rate": 2.0951563294913734e-07, "logits/chosen": -2.760425090789795, "logits/rejected": -2.7438526153564453, "logps/chosen": -299.39373779296875, "logps/rejected": -302.9912109375, "loss": 0.5717, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.20336699485778809, "rewards/margins": 0.3353096842765808, "rewards/rejected": -0.5386766791343689, "step": 2280 }, { "epoch": 0.5993195498560586, "grad_norm": 6.036366939544678, "learning_rate": 2.072630447356869e-07, "logits/chosen": -2.7959117889404297, "logits/rejected": -2.7956790924072266, "logps/chosen": -300.03179931640625, "logps/rejected": -291.49481201171875, "loss": 0.6001, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23898771405220032, "rewards/margins": 0.26846712827682495, "rewards/rejected": -0.5074548125267029, "step": 2290 }, { "epoch": 0.6019366657942947, "grad_norm": 7.8020195960998535, "learning_rate": 2.0501402504570232e-07, "logits/chosen": -2.829082727432251, "logits/rejected": -2.772502899169922, "logps/chosen": -318.4316711425781, "logps/rejected": -315.959716796875, "loss": 0.5824, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22740764915943146, "rewards/margins": 0.3216533958911896, "rewards/rejected": -0.5490610003471375, "step": 2300 }, { "epoch": 0.6019366657942947, "eval_logits/chosen": -2.80366849899292, "eval_logits/rejected": -2.7820827960968018, "eval_logps/chosen": -305.00982666015625, "eval_logps/rejected": -313.32330322265625, "eval_loss": 0.5960872769355774, "eval_rewards/accuracies": 0.6955000162124634, "eval_rewards/chosen": -0.2227218896150589, "eval_rewards/margins": 0.2961318790912628, "eval_rewards/rejected": -0.5188537836074829, "eval_runtime": 691.9375, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 2300 }, { "epoch": 0.6045537817325307, "grad_norm": 12.083107948303223, "learning_rate": 2.027687616716804e-07, "logits/chosen": -2.72344970703125, "logits/rejected": -2.7168376445770264, "logps/chosen": -268.31243896484375, "logps/rejected": -255.6737518310547, "loss": 0.6189, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2189827412366867, "rewards/margins": 0.24416430294513702, "rewards/rejected": -0.46314701437950134, "step": 2310 }, { "epoch": 0.6071708976707668, "grad_norm": 8.845372200012207, "learning_rate": 2.005274420924668e-07, "logits/chosen": -2.790346145629883, "logits/rejected": -2.778743267059326, "logps/chosen": -295.9941711425781, "logps/rejected": -287.6865234375, "loss": 0.6086, "rewards/accuracies": 0.65625, "rewards/chosen": -0.25174736976623535, "rewards/margins": 0.2748829126358032, "rewards/rejected": -0.5266302824020386, "step": 2320 }, { "epoch": 0.6097880136090029, "grad_norm": 7.964311599731445, "learning_rate": 1.9829025345760121e-07, "logits/chosen": -2.7749578952789307, "logits/rejected": -2.7802319526672363, "logps/chosen": -315.29290771484375, "logps/rejected": -332.8951721191406, "loss": 0.6062, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.17806461453437805, "rewards/margins": 0.2752231955528259, "rewards/rejected": -0.4532877802848816, "step": 2330 }, { "epoch": 0.6124051295472389, "grad_norm": 8.214485168457031, "learning_rate": 1.960573825716911e-07, "logits/chosen": -2.743821620941162, "logits/rejected": -2.7305188179016113, "logps/chosen": -275.1949768066406, "logps/rejected": -297.45172119140625, "loss": 0.6016, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.23889228701591492, "rewards/margins": 0.29088443517684937, "rewards/rejected": -0.5297766923904419, "step": 2340 }, { "epoch": 0.615022245485475, "grad_norm": 7.783448696136475, "learning_rate": 1.9382901587881273e-07, "logits/chosen": -2.8195502758026123, "logits/rejected": -2.8172898292541504, "logps/chosen": -291.1629333496094, "logps/rejected": -292.11553955078125, "loss": 0.5555, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.15334704518318176, "rewards/margins": 0.37875789403915405, "rewards/rejected": -0.5321049094200134, "step": 2350 }, { "epoch": 0.6176393614237111, "grad_norm": 7.713850498199463, "learning_rate": 1.9160533944694364e-07, "logits/chosen": -2.802713394165039, "logits/rejected": -2.763248920440674, "logps/chosen": -297.48541259765625, "logps/rejected": -321.0580139160156, "loss": 0.5661, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1875167191028595, "rewards/margins": 0.3671106696128845, "rewards/rejected": -0.5546274185180664, "step": 2360 }, { "epoch": 0.6202564773619471, "grad_norm": 7.275653839111328, "learning_rate": 1.8938653895242602e-07, "logits/chosen": -2.805842161178589, "logits/rejected": -2.7778079509735107, "logps/chosen": -301.32257080078125, "logps/rejected": -307.5292663574219, "loss": 0.569, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.22137200832366943, "rewards/margins": 0.3620893061161041, "rewards/rejected": -0.583461344242096, "step": 2370 }, { "epoch": 0.6228735933001832, "grad_norm": 7.8891282081604, "learning_rate": 1.8717279966446264e-07, "logits/chosen": -2.702014684677124, "logits/rejected": -2.6890392303466797, "logps/chosen": -299.67095947265625, "logps/rejected": -315.53125, "loss": 0.6047, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3092700242996216, "rewards/margins": 0.2915950417518616, "rewards/rejected": -0.6008650660514832, "step": 2380 }, { "epoch": 0.6254907092384192, "grad_norm": 9.103086471557617, "learning_rate": 1.8496430642964694e-07, "logits/chosen": -2.7693662643432617, "logits/rejected": -2.749218702316284, "logps/chosen": -320.30596923828125, "logps/rejected": -322.6269226074219, "loss": 0.6135, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2899993658065796, "rewards/margins": 0.2783369719982147, "rewards/rejected": -0.5683363676071167, "step": 2390 }, { "epoch": 0.6281078251766553, "grad_norm": 8.552151679992676, "learning_rate": 1.8276124365652855e-07, "logits/chosen": -2.796008586883545, "logits/rejected": -2.750042200088501, "logps/chosen": -308.24066162109375, "logps/rejected": -318.9580993652344, "loss": 0.602, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.283893346786499, "rewards/margins": 0.2797131836414337, "rewards/rejected": -0.5636065602302551, "step": 2400 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -2.796116352081299, "eval_logits/rejected": -2.774383783340454, "eval_logps/chosen": -309.5652160644531, "eval_logps/rejected": -318.12506103515625, "eval_loss": 0.5968618392944336, "eval_rewards/accuracies": 0.6990000009536743, "eval_rewards/chosen": -0.2682757079601288, "eval_rewards/margins": 0.2985955774784088, "eval_rewards/rejected": -0.5668712258338928, "eval_runtime": 690.9152, "eval_samples_per_second": 2.895, "eval_steps_per_second": 0.362, "step": 2400 }, { "epoch": 0.6307249411148914, "grad_norm": 10.884597778320312, "learning_rate": 1.805637953002149e-07, "logits/chosen": -2.806243658065796, "logits/rejected": -2.804234266281128, "logps/chosen": -287.49090576171875, "logps/rejected": -287.6014404296875, "loss": 0.6169, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.27734607458114624, "rewards/margins": 0.24837279319763184, "rewards/rejected": -0.5257189273834229, "step": 2410 }, { "epoch": 0.6333420570531274, "grad_norm": 9.013958930969238, "learning_rate": 1.7837214484701153e-07, "logits/chosen": -2.7953040599823, "logits/rejected": -2.7851452827453613, "logps/chosen": -289.382568359375, "logps/rejected": -297.02679443359375, "loss": 0.5733, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2011108100414276, "rewards/margins": 0.34568914771080017, "rewards/rejected": -0.5468000173568726, "step": 2420 }, { "epoch": 0.6359591729913635, "grad_norm": 14.238588333129883, "learning_rate": 1.761864752991004e-07, "logits/chosen": -2.778735399246216, "logits/rejected": -2.759908437728882, "logps/chosen": -295.66241455078125, "logps/rejected": -312.7738952636719, "loss": 0.5791, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.21105961501598358, "rewards/margins": 0.3268287181854248, "rewards/rejected": -0.5378884077072144, "step": 2430 }, { "epoch": 0.6385762889295996, "grad_norm": 5.6600518226623535, "learning_rate": 1.7400696915925995e-07, "logits/chosen": -2.7974464893341064, "logits/rejected": -2.7732651233673096, "logps/chosen": -312.24798583984375, "logps/rejected": -279.251708984375, "loss": 0.5943, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23233290016651154, "rewards/margins": 0.3078458309173584, "rewards/rejected": -0.5401787161827087, "step": 2440 }, { "epoch": 0.6411934048678356, "grad_norm": 11.058223724365234, "learning_rate": 1.718338084156254e-07, "logits/chosen": -2.7382242679595947, "logits/rejected": -2.727843761444092, "logps/chosen": -323.4954528808594, "logps/rejected": -317.99456787109375, "loss": 0.57, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1701376736164093, "rewards/margins": 0.3507465720176697, "rewards/rejected": -0.5208842754364014, "step": 2450 }, { "epoch": 0.6438105208060717, "grad_norm": 14.676642417907715, "learning_rate": 1.696671745264937e-07, "logits/chosen": -2.799201488494873, "logits/rejected": -2.8146328926086426, "logps/chosen": -313.3539733886719, "logps/rejected": -290.71197509765625, "loss": 0.5616, "rewards/accuracies": 0.78125, "rewards/chosen": -0.154522106051445, "rewards/margins": 0.36096832156181335, "rewards/rejected": -0.5154904127120972, "step": 2460 }, { "epoch": 0.6464276367443078, "grad_norm": 7.134603500366211, "learning_rate": 1.67507248405171e-07, "logits/chosen": -2.786536693572998, "logits/rejected": -2.7716171741485596, "logps/chosen": -290.3885192871094, "logps/rejected": -317.96453857421875, "loss": 0.6052, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.17861530184745789, "rewards/margins": 0.2776513695716858, "rewards/rejected": -0.4562666416168213, "step": 2470 }, { "epoch": 0.6490447526825438, "grad_norm": 9.284005165100098, "learning_rate": 1.6535421040486683e-07, "logits/chosen": -2.695885181427002, "logits/rejected": -2.683889150619507, "logps/chosen": -292.3827209472656, "logps/rejected": -295.35003662109375, "loss": 0.5708, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20486466586589813, "rewards/margins": 0.3616489768028259, "rewards/rejected": -0.5665136575698853, "step": 2480 }, { "epoch": 0.6516618686207799, "grad_norm": 11.596046447753906, "learning_rate": 1.6320824030363456e-07, "logits/chosen": -2.7673633098602295, "logits/rejected": -2.7697348594665527, "logps/chosen": -269.5127868652344, "logps/rejected": -284.500732421875, "loss": 0.5804, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.19040192663669586, "rewards/margins": 0.32062506675720215, "rewards/rejected": -0.5110269784927368, "step": 2490 }, { "epoch": 0.654278984559016, "grad_norm": 8.306464195251465, "learning_rate": 1.6106951728936024e-07, "logits/chosen": -2.8287737369537354, "logits/rejected": -2.785698413848877, "logps/chosen": -290.69586181640625, "logps/rejected": -315.9652404785156, "loss": 0.5792, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17289450764656067, "rewards/margins": 0.32756882905960083, "rewards/rejected": -0.5004633069038391, "step": 2500 }, { "epoch": 0.654278984559016, "eval_logits/chosen": -2.7979679107666016, "eval_logits/rejected": -2.776271104812622, "eval_logps/chosen": -303.76153564453125, "eval_logps/rejected": -311.8429260253906, "eval_loss": 0.5962891578674316, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": -0.2102394998073578, "eval_rewards/margins": 0.2938106954097748, "eval_rewards/rejected": -0.5040501952171326, "eval_runtime": 692.3854, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 2500 }, { "epoch": 0.656896100497252, "grad_norm": 6.3364176750183105, "learning_rate": 1.5893821994479994e-07, "logits/chosen": -2.8073089122772217, "logits/rejected": -2.7984962463378906, "logps/chosen": -307.6702880859375, "logps/rejected": -299.78192138671875, "loss": 0.583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15238206088542938, "rewards/margins": 0.3230430781841278, "rewards/rejected": -0.4754251539707184, "step": 2510 }, { "epoch": 0.6595132164354881, "grad_norm": 7.475069999694824, "learning_rate": 1.5681452623266867e-07, "logits/chosen": -2.788701057434082, "logits/rejected": -2.7505264282226562, "logps/chosen": -323.1575012207031, "logps/rejected": -304.9902038574219, "loss": 0.5469, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1838502436876297, "rewards/margins": 0.4115122854709625, "rewards/rejected": -0.5953624844551086, "step": 2520 }, { "epoch": 0.6621303323737242, "grad_norm": 9.084112167358398, "learning_rate": 1.546986134807801e-07, "logits/chosen": -2.8091278076171875, "logits/rejected": -2.780764102935791, "logps/chosen": -293.3882751464844, "logps/rejected": -309.5545349121094, "loss": 0.5931, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28720271587371826, "rewards/margins": 0.30004793405532837, "rewards/rejected": -0.5872506499290466, "step": 2530 }, { "epoch": 0.6647474483119602, "grad_norm": 7.817606449127197, "learning_rate": 1.5259065836724034e-07, "logits/chosen": -2.7307331562042236, "logits/rejected": -2.7140753269195557, "logps/chosen": -290.29443359375, "logps/rejected": -307.90399169921875, "loss": 0.5968, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2654728889465332, "rewards/margins": 0.2819042205810547, "rewards/rejected": -0.5473771095275879, "step": 2540 }, { "epoch": 0.6673645642501963, "grad_norm": 8.136064529418945, "learning_rate": 1.5049083690569454e-07, "logits/chosen": -2.7462635040283203, "logits/rejected": -2.731522798538208, "logps/chosen": -279.6645812988281, "logps/rejected": -303.47857666015625, "loss": 0.6011, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2667672336101532, "rewards/margins": 0.28752660751342773, "rewards/rejected": -0.5542938113212585, "step": 2550 }, { "epoch": 0.6699816801884323, "grad_norm": 5.6162896156311035, "learning_rate": 1.4839932443063056e-07, "logits/chosen": -2.7818315029144287, "logits/rejected": -2.754776954650879, "logps/chosen": -331.192626953125, "logps/rejected": -306.44342041015625, "loss": 0.5807, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23324036598205566, "rewards/margins": 0.33265605568885803, "rewards/rejected": -0.5658964514732361, "step": 2560 }, { "epoch": 0.6725987961266684, "grad_norm": 15.203133583068848, "learning_rate": 1.46316295582738e-07, "logits/chosen": -2.755795955657959, "logits/rejected": -2.745166301727295, "logps/chosen": -288.94012451171875, "logps/rejected": -295.92974853515625, "loss": 0.63, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.30726075172424316, "rewards/margins": 0.21980533003807068, "rewards/rejected": -0.5270661115646362, "step": 2570 }, { "epoch": 0.6752159120649045, "grad_norm": 23.822792053222656, "learning_rate": 1.4424192429432655e-07, "logits/chosen": -2.783210515975952, "logits/rejected": -2.766979694366455, "logps/chosen": -291.4307556152344, "logps/rejected": -328.7579040527344, "loss": 0.5738, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.18577826023101807, "rewards/margins": 0.34509676694869995, "rewards/rejected": -0.5308750867843628, "step": 2580 }, { "epoch": 0.6778330280031405, "grad_norm": 9.544054985046387, "learning_rate": 1.4217638377480158e-07, "logits/chosen": -2.7744319438934326, "logits/rejected": -2.7644972801208496, "logps/chosen": -299.30975341796875, "logps/rejected": -312.57220458984375, "loss": 0.598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23222167789936066, "rewards/margins": 0.28205937147140503, "rewards/rejected": -0.5142810344696045, "step": 2590 }, { "epoch": 0.6804501439413766, "grad_norm": 7.35859489440918, "learning_rate": 1.401198464962021e-07, "logits/chosen": -2.7667133808135986, "logits/rejected": -2.7541134357452393, "logps/chosen": -305.63446044921875, "logps/rejected": -288.49676513671875, "loss": 0.6028, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2002829611301422, "rewards/margins": 0.26447853446006775, "rewards/rejected": -0.4647614359855652, "step": 2600 }, { "epoch": 0.6804501439413766, "eval_logits/chosen": -2.793254852294922, "eval_logits/rejected": -2.771672010421753, "eval_logps/chosen": -301.69635009765625, "eval_logps/rejected": -309.3417053222656, "eval_loss": 0.5973595976829529, "eval_rewards/accuracies": 0.6919999718666077, "eval_rewards/chosen": -0.18958736956119537, "eval_rewards/margins": 0.289450466632843, "eval_rewards/rejected": -0.4790377914905548, "eval_runtime": 692.1987, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 2600 }, { "epoch": 0.6830672598796127, "grad_norm": 6.412085056304932, "learning_rate": 1.3807248417879894e-07, "logits/chosen": -2.799522638320923, "logits/rejected": -2.801234483718872, "logps/chosen": -304.61505126953125, "logps/rejected": -318.75360107421875, "loss": 0.5742, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1567406803369522, "rewards/margins": 0.35466814041137695, "rewards/rejected": -0.511408805847168, "step": 2610 }, { "epoch": 0.6856843758178487, "grad_norm": 6.595985412597656, "learning_rate": 1.3603446777675665e-07, "logits/chosen": -2.7163891792297363, "logits/rejected": -2.6980533599853516, "logps/chosen": -301.43170166015625, "logps/rejected": -309.5948486328125, "loss": 0.5767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.18890248239040375, "rewards/margins": 0.33902615308761597, "rewards/rejected": -0.5279285907745361, "step": 2620 }, { "epoch": 0.6883014917560848, "grad_norm": 5.626343250274658, "learning_rate": 1.3400596746385814e-07, "logits/chosen": -2.785409450531006, "logits/rejected": -2.7549426555633545, "logps/chosen": -305.23779296875, "logps/rejected": -306.29864501953125, "loss": 0.5866, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.17120136320590973, "rewards/margins": 0.3220587372779846, "rewards/rejected": -0.49326008558273315, "step": 2630 }, { "epoch": 0.6909186076943209, "grad_norm": 7.084354400634766, "learning_rate": 1.3198715261929586e-07, "logits/chosen": -2.8111932277679443, "logits/rejected": -2.7792601585388184, "logps/chosen": -269.24957275390625, "logps/rejected": -297.8160400390625, "loss": 0.5557, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.19386166334152222, "rewards/margins": 0.37062662839889526, "rewards/rejected": -0.5644882917404175, "step": 2640 }, { "epoch": 0.6935357236325569, "grad_norm": 6.301397800445557, "learning_rate": 1.299781918135282e-07, "logits/chosen": -2.780548095703125, "logits/rejected": -2.7463881969451904, "logps/chosen": -331.93035888671875, "logps/rejected": -346.24005126953125, "loss": 0.5488, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12747205793857574, "rewards/margins": 0.4090425372123718, "rewards/rejected": -0.5365146398544312, "step": 2650 }, { "epoch": 0.696152839570793, "grad_norm": 4.976480007171631, "learning_rate": 1.279792527942045e-07, "logits/chosen": -2.7965517044067383, "logits/rejected": -2.7541985511779785, "logps/chosen": -308.75946044921875, "logps/rejected": -333.583251953125, "loss": 0.573, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2170572280883789, "rewards/margins": 0.3559117913246155, "rewards/rejected": -0.5729690194129944, "step": 2660 }, { "epoch": 0.6987699555090291, "grad_norm": 7.420611381530762, "learning_rate": 1.259905024721576e-07, "logits/chosen": -2.7755208015441895, "logits/rejected": -2.7653794288635254, "logps/chosen": -297.36810302734375, "logps/rejected": -308.62139892578125, "loss": 0.574, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.21521492302417755, "rewards/margins": 0.3440507650375366, "rewards/rejected": -0.5592657327651978, "step": 2670 }, { "epoch": 0.7013870714472651, "grad_norm": 9.432327270507812, "learning_rate": 1.2401210690746703e-07, "logits/chosen": -2.7644107341766357, "logits/rejected": -2.7474875450134277, "logps/chosen": -305.26129150390625, "logps/rejected": -300.5979309082031, "loss": 0.5966, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.19491124153137207, "rewards/margins": 0.2979043126106262, "rewards/rejected": -0.4928155541419983, "step": 2680 }, { "epoch": 0.7040041873855012, "grad_norm": 13.687203407287598, "learning_rate": 1.2204423129559305e-07, "logits/chosen": -2.803926467895508, "logits/rejected": -2.8096935749053955, "logps/chosen": -304.5517272949219, "logps/rejected": -332.74627685546875, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21299275755882263, "rewards/margins": 0.32770127058029175, "rewards/rejected": -0.540693998336792, "step": 2690 }, { "epoch": 0.7066213033237373, "grad_norm": 9.307769775390625, "learning_rate": 1.2008703995358299e-07, "logits/chosen": -2.7696948051452637, "logits/rejected": -2.7626984119415283, "logps/chosen": -305.66973876953125, "logps/rejected": -309.4637756347656, "loss": 0.5854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23966650664806366, "rewards/margins": 0.3371264636516571, "rewards/rejected": -0.5767929553985596, "step": 2700 }, { "epoch": 0.7066213033237373, "eval_logits/chosen": -2.7892041206359863, "eval_logits/rejected": -2.7675600051879883, "eval_logps/chosen": -307.9026794433594, "eval_logps/rejected": -317.58642578125, "eval_loss": 0.5930463671684265, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": -0.2516505718231201, "eval_rewards/margins": 0.309834361076355, "eval_rewards/rejected": -0.5614849925041199, "eval_runtime": 692.1934, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 2700 }, { "epoch": 0.7092384192619733, "grad_norm": 7.60300874710083, "learning_rate": 1.1814069630635068e-07, "logits/chosen": -2.7490410804748535, "logits/rejected": -2.7561395168304443, "logps/chosen": -311.02667236328125, "logps/rejected": -334.8045349121094, "loss": 0.5936, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2228337824344635, "rewards/margins": 0.31492942571640015, "rewards/rejected": -0.5377631783485413, "step": 2710 }, { "epoch": 0.7118555352002094, "grad_norm": 5.55739164352417, "learning_rate": 1.1620536287303051e-07, "logits/chosen": -2.7841482162475586, "logits/rejected": -2.7707200050354004, "logps/chosen": -330.66802978515625, "logps/rejected": -324.71453857421875, "loss": 0.6076, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.21253342926502228, "rewards/margins": 0.2718030512332916, "rewards/rejected": -0.4843364655971527, "step": 2720 }, { "epoch": 0.7144726511384454, "grad_norm": 4.946017742156982, "learning_rate": 1.1428120125340716e-07, "logits/chosen": -2.771012783050537, "logits/rejected": -2.751859188079834, "logps/chosen": -299.06195068359375, "logps/rejected": -291.7746276855469, "loss": 0.5414, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18322893977165222, "rewards/margins": 0.4256429076194763, "rewards/rejected": -0.6088718175888062, "step": 2730 }, { "epoch": 0.7170897670766815, "grad_norm": 8.510547637939453, "learning_rate": 1.123683721144223e-07, "logits/chosen": -2.773465871810913, "logits/rejected": -2.750523328781128, "logps/chosen": -322.75030517578125, "logps/rejected": -322.23541259765625, "loss": 0.5924, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2108650654554367, "rewards/margins": 0.3147924840450287, "rewards/rejected": -0.5256575345993042, "step": 2740 }, { "epoch": 0.7197068830149176, "grad_norm": 6.666440010070801, "learning_rate": 1.1046703517675845e-07, "logits/chosen": -2.792327642440796, "logits/rejected": -2.780276298522949, "logps/chosen": -292.0575256347656, "logps/rejected": -331.8373718261719, "loss": 0.5803, "rewards/accuracies": 0.75, "rewards/chosen": -0.20287561416625977, "rewards/margins": 0.3353033661842346, "rewards/rejected": -0.5381789803504944, "step": 2750 }, { "epoch": 0.7223239989531536, "grad_norm": 3.823488712310791, "learning_rate": 1.085773492015028e-07, "logits/chosen": -2.7709414958953857, "logits/rejected": -2.7493114471435547, "logps/chosen": -284.67193603515625, "logps/rejected": -288.34991455078125, "loss": 0.5487, "rewards/accuracies": 0.75, "rewards/chosen": -0.2005012482404709, "rewards/margins": 0.4104704260826111, "rewards/rejected": -0.6109716892242432, "step": 2760 }, { "epoch": 0.7249411148913897, "grad_norm": 10.498513221740723, "learning_rate": 1.0669947197689033e-07, "logits/chosen": -2.7609269618988037, "logits/rejected": -2.723078489303589, "logps/chosen": -316.71929931640625, "logps/rejected": -321.02239990234375, "loss": 0.5936, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2569184899330139, "rewards/margins": 0.3084716498851776, "rewards/rejected": -0.5653902292251587, "step": 2770 }, { "epoch": 0.7275582308296258, "grad_norm": 9.501131057739258, "learning_rate": 1.048335603051291e-07, "logits/chosen": -2.7370448112487793, "logits/rejected": -2.730591058731079, "logps/chosen": -329.8760986328125, "logps/rejected": -340.55413818359375, "loss": 0.5523, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2385425865650177, "rewards/margins": 0.41302841901779175, "rewards/rejected": -0.6515710353851318, "step": 2780 }, { "epoch": 0.7301753467678618, "grad_norm": 9.440362930297852, "learning_rate": 1.0297976998930663e-07, "logits/chosen": -2.787727117538452, "logits/rejected": -2.7839837074279785, "logps/chosen": -315.8175048828125, "logps/rejected": -321.4845275878906, "loss": 0.5551, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.23409982025623322, "rewards/margins": 0.4074832797050476, "rewards/rejected": -0.6415830850601196, "step": 2790 }, { "epoch": 0.7327924627060979, "grad_norm": 9.004974365234375, "learning_rate": 1.0113825582038077e-07, "logits/chosen": -2.7806646823883057, "logits/rejected": -2.770219326019287, "logps/chosen": -309.5851135253906, "logps/rejected": -321.6380310058594, "loss": 0.5994, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2802024185657501, "rewards/margins": 0.2918320596218109, "rewards/rejected": -0.572034478187561, "step": 2800 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -2.785149335861206, "eval_logits/rejected": -2.7636430263519287, "eval_logps/chosen": -308.8106689453125, "eval_logps/rejected": -319.18377685546875, "eval_loss": 0.5920370221138, "eval_rewards/accuracies": 0.7045000195503235, "eval_rewards/chosen": -0.2607303559780121, "eval_rewards/margins": 0.31672805547714233, "eval_rewards/rejected": -0.577458381652832, "eval_runtime": 691.5482, "eval_samples_per_second": 2.892, "eval_steps_per_second": 0.362, "step": 2800 }, { "epoch": 0.735409578644334, "grad_norm": 5.153034687042236, "learning_rate": 9.930917156425475e-08, "logits/chosen": -2.7953689098358154, "logits/rejected": -2.7769198417663574, "logps/chosen": -307.6942443847656, "logps/rejected": -336.81036376953125, "loss": 0.5828, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2690412104129791, "rewards/margins": 0.3371729254722595, "rewards/rejected": -0.6062140464782715, "step": 2810 }, { "epoch": 0.73802669458257, "grad_norm": 10.421857833862305, "learning_rate": 9.749266994893754e-08, "logits/chosen": -2.7286500930786133, "logits/rejected": -2.696841239929199, "logps/chosen": -283.78277587890625, "logps/rejected": -293.64666748046875, "loss": 0.6332, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2909180223941803, "rewards/margins": 0.21305350959300995, "rewards/rejected": -0.5039715766906738, "step": 2820 }, { "epoch": 0.7406438105208061, "grad_norm": 14.213560104370117, "learning_rate": 9.568890265179128e-08, "logits/chosen": -2.7485554218292236, "logits/rejected": -2.7543232440948486, "logps/chosen": -308.8101806640625, "logps/rejected": -305.62347412109375, "loss": 0.609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2877466678619385, "rewards/margins": 0.28105878829956055, "rewards/rejected": -0.568805456161499, "step": 2830 }, { "epoch": 0.7432609264590422, "grad_norm": 5.577268600463867, "learning_rate": 9.389802028686616e-08, "logits/chosen": -2.7711002826690674, "logits/rejected": -2.7511260509490967, "logps/chosen": -308.267822265625, "logps/rejected": -295.8204650878906, "loss": 0.6301, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.292976438999176, "rewards/margins": 0.21805603802204132, "rewards/rejected": -0.5110324621200562, "step": 2840 }, { "epoch": 0.7458780423972782, "grad_norm": 5.392404556274414, "learning_rate": 9.212017239232426e-08, "logits/chosen": -2.7617223262786865, "logits/rejected": -2.7573046684265137, "logps/chosen": -312.38421630859375, "logps/rejected": -330.9461975097656, "loss": 0.5444, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.22561874985694885, "rewards/margins": 0.4286450445652008, "rewards/rejected": -0.6542637348175049, "step": 2850 }, { "epoch": 0.7484951583355143, "grad_norm": 6.394357681274414, "learning_rate": 9.035550741795328e-08, "logits/chosen": -2.7431981563568115, "logits/rejected": -2.7521939277648926, "logps/chosen": -295.7667541503906, "logps/rejected": -334.49688720703125, "loss": 0.5794, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.21194259822368622, "rewards/margins": 0.35274478793144226, "rewards/rejected": -0.5646874308586121, "step": 2860 }, { "epoch": 0.7511122742737504, "grad_norm": 9.479743003845215, "learning_rate": 8.860417271277065e-08, "logits/chosen": -2.819362163543701, "logits/rejected": -2.8213016986846924, "logps/chosen": -308.4556884765625, "logps/rejected": -324.0565490722656, "loss": 0.6036, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.23003943264484406, "rewards/margins": 0.26295268535614014, "rewards/rejected": -0.492992103099823, "step": 2870 }, { "epoch": 0.7537293902119864, "grad_norm": 9.29710865020752, "learning_rate": 8.686631451272029e-08, "logits/chosen": -2.7966079711914062, "logits/rejected": -2.7735276222229004, "logps/chosen": -297.5863952636719, "logps/rejected": -300.37908935546875, "loss": 0.6135, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2746056914329529, "rewards/margins": 0.2642548680305481, "rewards/rejected": -0.5388606190681458, "step": 2880 }, { "epoch": 0.7563465061502225, "grad_norm": 9.630151748657227, "learning_rate": 8.514207792846168e-08, "logits/chosen": -2.7753801345825195, "logits/rejected": -2.775832414627075, "logps/chosen": -292.93609619140625, "logps/rejected": -292.79754638671875, "loss": 0.5907, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2579854130744934, "rewards/margins": 0.3091490864753723, "rewards/rejected": -0.5671344995498657, "step": 2890 }, { "epoch": 0.7589636220884585, "grad_norm": 7.0608439445495605, "learning_rate": 8.343160693325355e-08, "logits/chosen": -2.7492966651916504, "logits/rejected": -2.7410671710968018, "logps/chosen": -293.8484802246094, "logps/rejected": -324.77001953125, "loss": 0.5837, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22494366765022278, "rewards/margins": 0.3548448979854584, "rewards/rejected": -0.5797885656356812, "step": 2900 }, { "epoch": 0.7589636220884585, "eval_logits/chosen": -2.783421277999878, "eval_logits/rejected": -2.7619221210479736, "eval_logps/chosen": -308.137939453125, "eval_logps/rejected": -318.6510925292969, "eval_loss": 0.5913165211677551, "eval_rewards/accuracies": 0.7055000066757202, "eval_rewards/chosen": -0.2540031671524048, "eval_rewards/margins": 0.3181284964084625, "eval_rewards/rejected": -0.5721316933631897, "eval_runtime": 692.0731, "eval_samples_per_second": 2.89, "eval_steps_per_second": 0.361, "step": 2900 }, { "epoch": 0.7615807380266946, "grad_norm": 7.802112579345703, "learning_rate": 8.173504435093173e-08, "logits/chosen": -2.7537245750427246, "logits/rejected": -2.726355791091919, "logps/chosen": -290.5617980957031, "logps/rejected": -287.50799560546875, "loss": 0.5806, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2531769275665283, "rewards/margins": 0.35345658659935, "rewards/rejected": -0.6066334843635559, "step": 2910 }, { "epoch": 0.7641978539649307, "grad_norm": 9.018595695495605, "learning_rate": 8.005253184398359e-08, "logits/chosen": -2.7553019523620605, "logits/rejected": -2.745943546295166, "logps/chosen": -320.03265380859375, "logps/rejected": -340.8626403808594, "loss": 0.6027, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.24576649069786072, "rewards/margins": 0.28911441564559937, "rewards/rejected": -0.5348808765411377, "step": 2920 }, { "epoch": 0.7668149699031667, "grad_norm": 6.111194133758545, "learning_rate": 7.838420990171926e-08, "logits/chosen": -2.789515972137451, "logits/rejected": -2.7570273876190186, "logps/chosen": -310.61224365234375, "logps/rejected": -312.87152099609375, "loss": 0.5865, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22255787253379822, "rewards/margins": 0.31383711099624634, "rewards/rejected": -0.5363950133323669, "step": 2930 }, { "epoch": 0.7694320858414028, "grad_norm": 5.815800666809082, "learning_rate": 7.673021782854083e-08, "logits/chosen": -2.69783091545105, "logits/rejected": -2.6870310306549072, "logps/chosen": -311.68963623046875, "logps/rejected": -288.39215087890625, "loss": 0.5979, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2525468170642853, "rewards/margins": 0.31668832898139954, "rewards/rejected": -0.5692351460456848, "step": 2940 }, { "epoch": 0.7720492017796389, "grad_norm": 10.589014053344727, "learning_rate": 7.509069373231039e-08, "logits/chosen": -2.742522716522217, "logits/rejected": -2.7218940258026123, "logps/chosen": -293.1689453125, "logps/rejected": -302.7828369140625, "loss": 0.6006, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24362894892692566, "rewards/margins": 0.29250627756118774, "rewards/rejected": -0.536135196685791, "step": 2950 }, { "epoch": 0.7746663177178749, "grad_norm": 8.408040046691895, "learning_rate": 7.346577451281821e-08, "logits/chosen": -2.7488350868225098, "logits/rejected": -2.7583699226379395, "logps/chosen": -308.5254821777344, "logps/rejected": -321.6301574707031, "loss": 0.578, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.26960092782974243, "rewards/margins": 0.3470562696456909, "rewards/rejected": -0.6166571378707886, "step": 2960 }, { "epoch": 0.777283433656111, "grad_norm": 7.626022815704346, "learning_rate": 7.185559585035136e-08, "logits/chosen": -2.7650535106658936, "logits/rejected": -2.736623764038086, "logps/chosen": -327.43792724609375, "logps/rejected": -349.74005126953125, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28370755910873413, "rewards/margins": 0.38453495502471924, "rewards/rejected": -0.6682425737380981, "step": 2970 }, { "epoch": 0.7799005495943471, "grad_norm": 8.664432525634766, "learning_rate": 7.026029219436502e-08, "logits/chosen": -2.7403178215026855, "logits/rejected": -2.726973533630371, "logps/chosen": -296.88629150390625, "logps/rejected": -320.1584167480469, "loss": 0.5807, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2508087158203125, "rewards/margins": 0.3491096496582031, "rewards/rejected": -0.5999183058738708, "step": 2980 }, { "epoch": 0.7825176655325831, "grad_norm": 7.381548881530762, "learning_rate": 6.867999675225522e-08, "logits/chosen": -2.7898964881896973, "logits/rejected": -2.765493392944336, "logps/chosen": -269.5013427734375, "logps/rejected": -287.95318603515625, "loss": 0.577, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.22013553977012634, "rewards/margins": 0.3466190695762634, "rewards/rejected": -0.5667546391487122, "step": 2990 }, { "epoch": 0.7851347814708192, "grad_norm": 8.886544227600098, "learning_rate": 6.711484147823662e-08, "logits/chosen": -2.7362468242645264, "logits/rejected": -2.7374088764190674, "logps/chosen": -273.03204345703125, "logps/rejected": -309.46832275390625, "loss": 0.5858, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22477373480796814, "rewards/margins": 0.3105041980743408, "rewards/rejected": -0.5352779626846313, "step": 3000 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": -2.781898021697998, "eval_logits/rejected": -2.7604949474334717, "eval_logps/chosen": -308.9897766113281, "eval_logps/rejected": -319.7853088378906, "eval_loss": 0.5910181999206543, "eval_rewards/accuracies": 0.7055000066757202, "eval_rewards/chosen": -0.2625214755535126, "eval_rewards/margins": 0.32095208764076233, "eval_rewards/rejected": -0.5834735035896301, "eval_runtime": 691.7146, "eval_samples_per_second": 2.891, "eval_steps_per_second": 0.361, "step": 3000 }, { "epoch": 0.7877518974090553, "grad_norm": 12.21480655670166, "learning_rate": 6.556495706232412e-08, "logits/chosen": -2.7469980716705322, "logits/rejected": -2.7527496814727783, "logps/chosen": -316.41766357421875, "logps/rejected": -328.52532958984375, "loss": 0.5886, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.291492760181427, "rewards/margins": 0.32380086183547974, "rewards/rejected": -0.6152936816215515, "step": 3010 }, { "epoch": 0.7903690133472913, "grad_norm": 8.182783126831055, "learning_rate": 6.403047291942057e-08, "logits/chosen": -2.722087860107422, "logits/rejected": -2.6903903484344482, "logps/chosen": -275.5090637207031, "logps/rejected": -277.62420654296875, "loss": 0.5972, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3089084327220917, "rewards/margins": 0.29682403802871704, "rewards/rejected": -0.6057325005531311, "step": 3020 }, { "epoch": 0.7929861292855274, "grad_norm": 8.147031784057617, "learning_rate": 6.251151717851021e-08, "logits/chosen": -2.743332624435425, "logits/rejected": -2.7332491874694824, "logps/chosen": -280.6979064941406, "logps/rejected": -292.1900329589844, "loss": 0.6154, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.29725611209869385, "rewards/margins": 0.27910858392715454, "rewards/rejected": -0.5763646960258484, "step": 3030 }, { "epoch": 0.7956032452237635, "grad_norm": 10.667434692382812, "learning_rate": 6.100821667196041e-08, "logits/chosen": -2.8258140087127686, "logits/rejected": -2.772840976715088, "logps/chosen": -316.3697204589844, "logps/rejected": -283.46575927734375, "loss": 0.5777, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2557021975517273, "rewards/margins": 0.3566380739212036, "rewards/rejected": -0.6123403310775757, "step": 3040 }, { "epoch": 0.7982203611619995, "grad_norm": 11.156988143920898, "learning_rate": 5.952069692493061e-08, "logits/chosen": -2.7050204277038574, "logits/rejected": -2.7095789909362793, "logps/chosen": -266.9496154785156, "logps/rejected": -308.8603515625, "loss": 0.5668, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21333126723766327, "rewards/margins": 0.3779350519180298, "rewards/rejected": -0.5912663340568542, "step": 3050 }, { "epoch": 0.8008374771002356, "grad_norm": 17.065628051757812, "learning_rate": 5.8049082144891794e-08, "logits/chosen": -2.702791452407837, "logits/rejected": -2.6872074604034424, "logps/chosen": -304.93463134765625, "logps/rejected": -380.0108642578125, "loss": 0.5933, "rewards/accuracies": 0.75, "rewards/chosen": -0.24746175110340118, "rewards/margins": 0.32271090149879456, "rewards/rejected": -0.5701726675033569, "step": 3060 }, { "epoch": 0.8034545930384716, "grad_norm": 5.375977516174316, "learning_rate": 5.659349521125459e-08, "logits/chosen": -2.828627109527588, "logits/rejected": -2.8292970657348633, "logps/chosen": -323.8910827636719, "logps/rejected": -331.82403564453125, "loss": 0.5963, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.25231170654296875, "rewards/margins": 0.3079237937927246, "rewards/rejected": -0.5602355003356934, "step": 3070 }, { "epoch": 0.8060717089767077, "grad_norm": 10.280311584472656, "learning_rate": 5.5154057665109e-08, "logits/chosen": -2.772552490234375, "logits/rejected": -2.7637112140655518, "logps/chosen": -304.2619934082031, "logps/rejected": -313.9085998535156, "loss": 0.5688, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28161460161209106, "rewards/margins": 0.3681698441505432, "rewards/rejected": -0.6497844457626343, "step": 3080 }, { "epoch": 0.8086888249149438, "grad_norm": 5.905206203460693, "learning_rate": 5.3730889699075853e-08, "logits/chosen": -2.790621280670166, "logits/rejected": -2.764768123626709, "logps/chosen": -320.5517272949219, "logps/rejected": -295.2154541015625, "loss": 0.5839, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23616118729114532, "rewards/margins": 0.32327955961227417, "rewards/rejected": -0.5594406723976135, "step": 3090 }, { "epoch": 0.8113059408531798, "grad_norm": 5.722733974456787, "learning_rate": 5.2324110147270893e-08, "logits/chosen": -2.766014814376831, "logits/rejected": -2.758927583694458, "logps/chosen": -317.6996154785156, "logps/rejected": -342.97039794921875, "loss": 0.5685, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17896804213523865, "rewards/margins": 0.3621399402618408, "rewards/rejected": -0.5411080121994019, "step": 3100 }, { "epoch": 0.8113059408531798, "eval_logits/chosen": -2.7776589393615723, "eval_logits/rejected": -2.7558252811431885, "eval_logps/chosen": -306.57073974609375, "eval_logps/rejected": -317.1507263183594, "eval_loss": 0.5914422869682312, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": -0.23833158612251282, "eval_rewards/margins": 0.3187963366508484, "eval_rewards/rejected": -0.5571279525756836, "eval_runtime": 692.3976, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 3100 }, { "epoch": 0.8139230567914159, "grad_norm": 5.692158222198486, "learning_rate": 5.0933836475381795e-08, "logits/chosen": -2.773538827896118, "logits/rejected": -2.743774175643921, "logps/chosen": -323.03564453125, "logps/rejected": -339.22576904296875, "loss": 0.5839, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.20304706692695618, "rewards/margins": 0.33373111486434937, "rewards/rejected": -0.5367781519889832, "step": 3110 }, { "epoch": 0.816540172729652, "grad_norm": 6.522732734680176, "learning_rate": 4.956018477086005e-08, "logits/chosen": -2.7541415691375732, "logits/rejected": -2.7304270267486572, "logps/chosen": -312.82550048828125, "logps/rejected": -319.4942626953125, "loss": 0.5787, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2153932750225067, "rewards/margins": 0.3583284020423889, "rewards/rejected": -0.5737215876579285, "step": 3120 }, { "epoch": 0.819157288667888, "grad_norm": 12.873359680175781, "learning_rate": 4.820326973322763e-08, "logits/chosen": -2.7611987590789795, "logits/rejected": -2.7416489124298096, "logps/chosen": -294.5945129394531, "logps/rejected": -322.9219055175781, "loss": 0.5902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.26755794882774353, "rewards/margins": 0.30830827355384827, "rewards/rejected": -0.5758662223815918, "step": 3130 }, { "epoch": 0.821774404606124, "grad_norm": 6.0704731941223145, "learning_rate": 4.686320466449981e-08, "logits/chosen": -2.765129566192627, "logits/rejected": -2.712188482284546, "logps/chosen": -279.4689025878906, "logps/rejected": -308.8946533203125, "loss": 0.5878, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21677632629871368, "rewards/margins": 0.3269808888435364, "rewards/rejected": -0.5437572598457336, "step": 3140 }, { "epoch": 0.8243915205443602, "grad_norm": 9.32778549194336, "learning_rate": 4.554010145972417e-08, "logits/chosen": -2.8120663166046143, "logits/rejected": -2.7678775787353516, "logps/chosen": -308.05975341796875, "logps/rejected": -326.4994812011719, "loss": 0.6037, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28121477365493774, "rewards/margins": 0.3111681342124939, "rewards/rejected": -0.5923829078674316, "step": 3150 }, { "epoch": 0.8270086364825961, "grad_norm": 6.878976345062256, "learning_rate": 4.423407059763745e-08, "logits/chosen": -2.769566535949707, "logits/rejected": -2.754739999771118, "logps/chosen": -313.4940490722656, "logps/rejected": -338.7357482910156, "loss": 0.5795, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22392907738685608, "rewards/margins": 0.3541107773780823, "rewards/rejected": -0.578039824962616, "step": 3160 }, { "epoch": 0.8296257524208323, "grad_norm": 8.941882133483887, "learning_rate": 4.294522113144078e-08, "logits/chosen": -2.7120773792266846, "logits/rejected": -2.676596164703369, "logps/chosen": -310.96600341796875, "logps/rejected": -309.7723083496094, "loss": 0.5784, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.23985318839550018, "rewards/margins": 0.3455398380756378, "rewards/rejected": -0.5853930115699768, "step": 3170 }, { "epoch": 0.8322428683590684, "grad_norm": 11.861396789550781, "learning_rate": 4.1673660679693804e-08, "logits/chosen": -2.759885311126709, "logits/rejected": -2.7518694400787354, "logps/chosen": -264.2064514160156, "logps/rejected": -315.90380859375, "loss": 0.6069, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.26855209469795227, "rewards/margins": 0.2709905505180359, "rewards/rejected": -0.539542555809021, "step": 3180 }, { "epoch": 0.8348599842973043, "grad_norm": 3.688720941543579, "learning_rate": 4.041949541732825e-08, "logits/chosen": -2.7698843479156494, "logits/rejected": -2.773341655731201, "logps/chosen": -306.61480712890625, "logps/rejected": -325.04541015625, "loss": 0.5851, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2628583610057831, "rewards/margins": 0.3378385603427887, "rewards/rejected": -0.6006969213485718, "step": 3190 }, { "epoch": 0.8374771002355405, "grad_norm": 3.2142703533172607, "learning_rate": 3.9182830066782605e-08, "logits/chosen": -2.7356200218200684, "logits/rejected": -2.740725040435791, "logps/chosen": -303.8326721191406, "logps/rejected": -351.736083984375, "loss": 0.5753, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28992363810539246, "rewards/margins": 0.3618486821651459, "rewards/rejected": -0.6517723798751831, "step": 3200 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -2.778296709060669, "eval_logits/rejected": -2.7567243576049805, "eval_logps/chosen": -308.9666442871094, "eval_logps/rejected": -320.12237548828125, "eval_loss": 0.5903262495994568, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": -0.26229044795036316, "eval_rewards/margins": 0.3245540261268616, "eval_rewards/rejected": -0.5868445038795471, "eval_runtime": 691.7572, "eval_samples_per_second": 2.891, "eval_steps_per_second": 0.361, "step": 3200 }, { "epoch": 0.8400942161737766, "grad_norm": 5.404438018798828, "learning_rate": 3.79637678892577e-08, "logits/chosen": -2.737617015838623, "logits/rejected": -2.7435827255249023, "logps/chosen": -313.7263488769531, "logps/rejected": -326.2721862792969, "loss": 0.5958, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.21620874106884003, "rewards/margins": 0.29549044370651245, "rewards/rejected": -0.5116991996765137, "step": 3210 }, { "epoch": 0.8427113321120125, "grad_norm": 8.482666015625, "learning_rate": 3.6762410676094645e-08, "logits/chosen": -2.7493488788604736, "logits/rejected": -2.751436233520508, "logps/chosen": -342.2435302734375, "logps/rejected": -334.9501953125, "loss": 0.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22917640209197998, "rewards/margins": 0.40211135149002075, "rewards/rejected": -0.631287693977356, "step": 3220 }, { "epoch": 0.8453284480502486, "grad_norm": 21.451396942138672, "learning_rate": 3.557885874027497e-08, "logits/chosen": -2.7467381954193115, "logits/rejected": -2.7420356273651123, "logps/chosen": -307.3967590332031, "logps/rejected": -319.23785400390625, "loss": 0.626, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.2908255457878113, "rewards/margins": 0.24548819661140442, "rewards/rejected": -0.5363136529922485, "step": 3230 }, { "epoch": 0.8479455639884846, "grad_norm": 9.142580032348633, "learning_rate": 3.441321090804469e-08, "logits/chosen": -2.805671215057373, "logits/rejected": -2.7749440670013428, "logps/chosen": -311.969482421875, "logps/rejected": -301.92559814453125, "loss": 0.5872, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28862375020980835, "rewards/margins": 0.3031871020793915, "rewards/rejected": -0.5918108820915222, "step": 3240 }, { "epoch": 0.8505626799267207, "grad_norm": 6.999141216278076, "learning_rate": 3.326556451066234e-08, "logits/chosen": -2.8003592491149902, "logits/rejected": -2.7750496864318848, "logps/chosen": -333.262451171875, "logps/rejected": -342.88970947265625, "loss": 0.5676, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.21939554810523987, "rewards/margins": 0.38298407196998596, "rewards/rejected": -0.602379560470581, "step": 3250 }, { "epoch": 0.8531797958649568, "grad_norm": 8.473172187805176, "learning_rate": 3.2136015376271946e-08, "logits/chosen": -2.7543041706085205, "logits/rejected": -2.7237446308135986, "logps/chosen": -310.47503662109375, "logps/rejected": -316.1898498535156, "loss": 0.6202, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3483801782131195, "rewards/margins": 0.25630325078964233, "rewards/rejected": -0.6046834588050842, "step": 3260 }, { "epoch": 0.8557969118031928, "grad_norm": 6.828322887420654, "learning_rate": 3.102465782190106e-08, "logits/chosen": -2.765094041824341, "logits/rejected": -2.7622992992401123, "logps/chosen": -292.77264404296875, "logps/rejected": -306.03790283203125, "loss": 0.6049, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.26343613862991333, "rewards/margins": 0.2959148585796356, "rewards/rejected": -0.5593509674072266, "step": 3270 }, { "epoch": 0.8584140277414289, "grad_norm": 7.230039119720459, "learning_rate": 2.993158464558565e-08, "logits/chosen": -2.752277135848999, "logits/rejected": -2.7456305027008057, "logps/chosen": -313.83514404296875, "logps/rejected": -343.77923583984375, "loss": 0.6083, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2221953421831131, "rewards/margins": 0.2806113660335541, "rewards/rejected": -0.5028067231178284, "step": 3280 }, { "epoch": 0.861031143679665, "grad_norm": 3.2468912601470947, "learning_rate": 2.8856887118621358e-08, "logits/chosen": -2.7951433658599854, "logits/rejected": -2.8030707836151123, "logps/chosen": -308.23077392578125, "logps/rejected": -336.6316223144531, "loss": 0.6066, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3232649266719818, "rewards/margins": 0.30740997195243835, "rewards/rejected": -0.6306749582290649, "step": 3290 }, { "epoch": 0.863648259617901, "grad_norm": 6.59912109375, "learning_rate": 2.7800654977942482e-08, "logits/chosen": -2.7431418895721436, "logits/rejected": -2.7131383419036865, "logps/chosen": -301.9719543457031, "logps/rejected": -354.3257751464844, "loss": 0.5769, "rewards/accuracies": 0.75, "rewards/chosen": -0.2676336467266083, "rewards/margins": 0.3562368154525757, "rewards/rejected": -0.6238704919815063, "step": 3300 }, { "epoch": 0.863648259617901, "eval_logits/chosen": -2.7770590782165527, "eval_logits/rejected": -2.755500555038452, "eval_logps/chosen": -309.4716491699219, "eval_logps/rejected": -320.77569580078125, "eval_loss": 0.5899637341499329, "eval_rewards/accuracies": 0.703000009059906, "eval_rewards/chosen": -0.2673403322696686, "eval_rewards/margins": 0.3260369896888733, "eval_rewards/rejected": -0.5933773517608643, "eval_runtime": 692.4414, "eval_samples_per_second": 2.888, "eval_steps_per_second": 0.361, "step": 3300 }, { "epoch": 0.8662653755561371, "grad_norm": 7.842947959899902, "learning_rate": 2.676297641862879e-08, "logits/chosen": -2.76792049407959, "logits/rejected": -2.7621943950653076, "logps/chosen": -265.380859375, "logps/rejected": -254.47140502929688, "loss": 0.5895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22911493480205536, "rewards/margins": 0.32636719942092896, "rewards/rejected": -0.5554821491241455, "step": 3310 }, { "epoch": 0.8688824914943732, "grad_norm": 13.967310905456543, "learning_rate": 2.5743938086541352e-08, "logits/chosen": -2.7548770904541016, "logits/rejected": -2.729977607727051, "logps/chosen": -309.2705383300781, "logps/rejected": -313.9998779296875, "loss": 0.603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.29073604941368103, "rewards/margins": 0.31791952252388, "rewards/rejected": -0.6086556315422058, "step": 3320 }, { "epoch": 0.8714996074326092, "grad_norm": 11.057051658630371, "learning_rate": 2.474362507108757e-08, "logits/chosen": -2.814598560333252, "logits/rejected": -2.7810606956481934, "logps/chosen": -317.7953186035156, "logps/rejected": -332.5885314941406, "loss": 0.5725, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.25249534845352173, "rewards/margins": 0.38154152035713196, "rewards/rejected": -0.6340368390083313, "step": 3330 }, { "epoch": 0.8741167233708453, "grad_norm": 10.906637191772461, "learning_rate": 2.3762120898116495e-08, "logits/chosen": -2.774956226348877, "logits/rejected": -2.764927625656128, "logps/chosen": -322.2221984863281, "logps/rejected": -341.53216552734375, "loss": 0.6079, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3415859639644623, "rewards/margins": 0.2912564277648926, "rewards/rejected": -0.6328424215316772, "step": 3340 }, { "epoch": 0.8767338393090814, "grad_norm": 6.918145656585693, "learning_rate": 2.2799507522944044e-08, "logits/chosen": -2.689883232116699, "logits/rejected": -2.6739673614501953, "logps/chosen": -313.18524169921875, "logps/rejected": -340.9402770996094, "loss": 0.5669, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.26544609665870667, "rewards/margins": 0.35022976994514465, "rewards/rejected": -0.6156758069992065, "step": 3350 }, { "epoch": 0.8793509552473174, "grad_norm": 10.59185791015625, "learning_rate": 2.1855865323510054e-08, "logits/chosen": -2.7279655933380127, "logits/rejected": -2.6860973834991455, "logps/chosen": -320.9715576171875, "logps/rejected": -355.20880126953125, "loss": 0.5657, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2595919072628021, "rewards/margins": 0.40920543670654297, "rewards/rejected": -0.6687973141670227, "step": 3360 }, { "epoch": 0.8819680711855535, "grad_norm": 6.1795830726623535, "learning_rate": 2.0931273093666573e-08, "logits/chosen": -2.728386878967285, "logits/rejected": -2.7089622020721436, "logps/chosen": -283.88409423828125, "logps/rejected": -303.3033142089844, "loss": 0.5462, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.2573816776275635, "rewards/margins": 0.4072348475456238, "rewards/rejected": -0.6646164655685425, "step": 3370 }, { "epoch": 0.8845851871237895, "grad_norm": 6.445786476135254, "learning_rate": 2.002580803659873e-08, "logits/chosen": -2.747699022293091, "logits/rejected": -2.7049365043640137, "logps/chosen": -303.89813232421875, "logps/rejected": -318.79693603515625, "loss": 0.617, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.3385586738586426, "rewards/margins": 0.2693432867527008, "rewards/rejected": -0.607901930809021, "step": 3380 }, { "epoch": 0.8872023030620256, "grad_norm": 9.493855476379395, "learning_rate": 1.9139545758378256e-08, "logits/chosen": -2.770669460296631, "logits/rejected": -2.722433090209961, "logps/chosen": -311.3063659667969, "logps/rejected": -296.7181701660156, "loss": 0.5721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2407282292842865, "rewards/margins": 0.3538174629211426, "rewards/rejected": -0.5945457220077515, "step": 3390 }, { "epoch": 0.8898194190002617, "grad_norm": 8.795994758605957, "learning_rate": 1.8272560261650277e-08, "logits/chosen": -2.782130479812622, "logits/rejected": -2.757819652557373, "logps/chosen": -354.10919189453125, "logps/rejected": -333.00250244140625, "loss": 0.5608, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.22043180465698242, "rewards/margins": 0.3896182179450989, "rewards/rejected": -0.6100499629974365, "step": 3400 }, { "epoch": 0.8898194190002617, "eval_logits/chosen": -2.773853302001953, "eval_logits/rejected": -2.7519986629486084, "eval_logps/chosen": -309.8929748535156, "eval_logps/rejected": -321.31964111328125, "eval_loss": 0.5895980000495911, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": -0.27155351638793945, "eval_rewards/margins": 0.3272639214992523, "eval_rewards/rejected": -0.5988174676895142, "eval_runtime": 692.3174, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 3400 }, { "epoch": 0.8924365349384977, "grad_norm": 7.369442462921143, "learning_rate": 1.742492393945427e-08, "logits/chosen": -2.7513797283172607, "logits/rejected": -2.710066318511963, "logps/chosen": -323.8204650878906, "logps/rejected": -317.6787109375, "loss": 0.568, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2688294053077698, "rewards/margins": 0.36988669633865356, "rewards/rejected": -0.6387161016464233, "step": 3410 }, { "epoch": 0.8950536508767338, "grad_norm": 7.45905876159668, "learning_rate": 1.6596707569179302e-08, "logits/chosen": -2.791177749633789, "logits/rejected": -2.7743191719055176, "logps/chosen": -325.4018249511719, "logps/rejected": -326.23291015625, "loss": 0.5784, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.28556251525878906, "rewards/margins": 0.3457964062690735, "rewards/rejected": -0.6313589215278625, "step": 3420 }, { "epoch": 0.8976707668149699, "grad_norm": 6.628225326538086, "learning_rate": 1.5787980306653848e-08, "logits/chosen": -2.75858736038208, "logits/rejected": -2.7154600620269775, "logps/chosen": -316.15985107421875, "logps/rejected": -336.3743896484375, "loss": 0.5708, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.24860498309135437, "rewards/margins": 0.3617299795150757, "rewards/rejected": -0.6103349924087524, "step": 3430 }, { "epoch": 0.9002878827532059, "grad_norm": 10.542095184326172, "learning_rate": 1.499880968037165e-08, "logits/chosen": -2.752002477645874, "logits/rejected": -2.733220100402832, "logps/chosen": -292.7621765136719, "logps/rejected": -285.80218505859375, "loss": 0.5813, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.22942595183849335, "rewards/margins": 0.32194358110427856, "rewards/rejected": -0.5513694882392883, "step": 3440 }, { "epoch": 0.902904998691442, "grad_norm": 5.9859395027160645, "learning_rate": 1.4229261585852803e-08, "logits/chosen": -2.77447772026062, "logits/rejected": -2.7663679122924805, "logps/chosen": -305.6563415527344, "logps/rejected": -314.01043701171875, "loss": 0.5806, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.22854971885681152, "rewards/margins": 0.3463636040687561, "rewards/rejected": -0.5749133825302124, "step": 3450 }, { "epoch": 0.9055221146296781, "grad_norm": 9.172728538513184, "learning_rate": 1.3479400280141883e-08, "logits/chosen": -2.74762225151062, "logits/rejected": -2.7340774536132812, "logps/chosen": -290.8319396972656, "logps/rejected": -326.6239929199219, "loss": 0.5852, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2622153162956238, "rewards/margins": 0.35056665539741516, "rewards/rejected": -0.6127818822860718, "step": 3460 }, { "epoch": 0.9081392305679141, "grad_norm": 8.79883098602295, "learning_rate": 1.2749288376442042e-08, "logits/chosen": -2.7586569786071777, "logits/rejected": -2.730827569961548, "logps/chosen": -337.0930480957031, "logps/rejected": -317.09912109375, "loss": 0.5455, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19493858516216278, "rewards/margins": 0.4252621531486511, "rewards/rejected": -0.6202007532119751, "step": 3470 }, { "epoch": 0.9107563465061502, "grad_norm": 11.71596622467041, "learning_rate": 1.2038986838887127e-08, "logits/chosen": -2.792734384536743, "logits/rejected": -2.77490234375, "logps/chosen": -288.8994445800781, "logps/rejected": -313.22430419921875, "loss": 0.6242, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.30519285798072815, "rewards/margins": 0.2674819231033325, "rewards/rejected": -0.5726747512817383, "step": 3480 }, { "epoch": 0.9133734624443863, "grad_norm": 6.5518951416015625, "learning_rate": 1.1348554977451131e-08, "logits/chosen": -2.805830478668213, "logits/rejected": -2.7894272804260254, "logps/chosen": -327.4478759765625, "logps/rejected": -324.9560546875, "loss": 0.582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2583698630332947, "rewards/margins": 0.3414859175682068, "rewards/rejected": -0.5998557806015015, "step": 3490 }, { "epoch": 0.9159905783826223, "grad_norm": 5.205156326293945, "learning_rate": 1.06780504429958e-08, "logits/chosen": -2.7797505855560303, "logits/rejected": -2.7590694427490234, "logps/chosen": -325.8748779296875, "logps/rejected": -310.8509521484375, "loss": 0.6008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27413299679756165, "rewards/margins": 0.3151172399520874, "rewards/rejected": -0.5892502665519714, "step": 3500 }, { "epoch": 0.9159905783826223, "eval_logits/chosen": -2.775543689727783, "eval_logits/rejected": -2.7539024353027344, "eval_logps/chosen": -309.8999938964844, "eval_logps/rejected": -321.37445068359375, "eval_loss": 0.5894958972930908, "eval_rewards/accuracies": 0.703499972820282, "eval_rewards/chosen": -0.2716234028339386, "eval_rewards/margins": 0.3277418315410614, "eval_rewards/rejected": -0.599365234375, "eval_runtime": 692.3998, "eval_samples_per_second": 2.889, "eval_steps_per_second": 0.361, "step": 3500 }, { "epoch": 0.9186076943208584, "grad_norm": 6.786498069763184, "learning_rate": 1.0027529222456754e-08, "logits/chosen": -2.7301533222198486, "logits/rejected": -2.702810764312744, "logps/chosen": -296.23834228515625, "logps/rejected": -315.268310546875, "loss": 0.5539, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2563706934452057, "rewards/margins": 0.3883191645145416, "rewards/rejected": -0.6446898579597473, "step": 3510 }, { "epoch": 0.9212248102590945, "grad_norm": 9.892511367797852, "learning_rate": 9.397045634168766e-09, "logits/chosen": -2.8002243041992188, "logits/rejected": -2.7856602668762207, "logps/chosen": -308.3498229980469, "logps/rejected": -351.95831298828125, "loss": 0.57, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.21786804497241974, "rewards/margins": 0.41681188344955444, "rewards/rejected": -0.634679913520813, "step": 3520 }, { "epoch": 0.9238419261973305, "grad_norm": 12.571949005126953, "learning_rate": 8.78665232332998e-09, "logits/chosen": -2.724975347518921, "logits/rejected": -2.708922863006592, "logps/chosen": -277.4271545410156, "logps/rejected": -300.2417297363281, "loss": 0.6055, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3028232455253601, "rewards/margins": 0.2615777850151062, "rewards/rejected": -0.5644010305404663, "step": 3530 }, { "epoch": 0.9264590421355666, "grad_norm": 7.908664703369141, "learning_rate": 8.196400257606206e-09, "logits/chosen": -2.772461414337158, "logits/rejected": -2.7343640327453613, "logps/chosen": -328.0716247558594, "logps/rejected": -358.15655517578125, "loss": 0.577, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2738083302974701, "rewards/margins": 0.3519710600376129, "rewards/rejected": -0.625779390335083, "step": 3540 }, { "epoch": 0.9290761580738026, "grad_norm": 5.722252368927002, "learning_rate": 7.626338722875075e-09, "logits/chosen": -2.7591617107391357, "logits/rejected": -2.780594825744629, "logps/chosen": -298.6004943847656, "logps/rejected": -326.13287353515625, "loss": 0.5986, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2562271058559418, "rewards/margins": 0.3074356019496918, "rewards/rejected": -0.5636627078056335, "step": 3550 }, { "epoch": 0.9316932740120387, "grad_norm": 8.03117847442627, "learning_rate": 7.0765153191106875e-09, "logits/chosen": -2.781140089035034, "logits/rejected": -2.7692975997924805, "logps/chosen": -295.3600158691406, "logps/rejected": -291.2763366699219, "loss": 0.5659, "rewards/accuracies": 0.75, "rewards/chosen": -0.2436678409576416, "rewards/margins": 0.39614516496658325, "rewards/rejected": -0.6398130655288696, "step": 3560 }, { "epoch": 0.9343103899502748, "grad_norm": 7.668455600738525, "learning_rate": 6.54697595640899e-09, "logits/chosen": -2.7558670043945312, "logits/rejected": -2.7410783767700195, "logps/chosen": -333.0140075683594, "logps/rejected": -347.9772033691406, "loss": 0.5718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23105120658874512, "rewards/margins": 0.38840624690055847, "rewards/rejected": -0.6194573640823364, "step": 3570 }, { "epoch": 0.9369275058885108, "grad_norm": 7.808078765869141, "learning_rate": 6.037764851154425e-09, "logits/chosen": -2.7314181327819824, "logits/rejected": -2.7231030464172363, "logps/chosen": -305.7143249511719, "logps/rejected": -345.88983154296875, "loss": 0.5699, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.22997505962848663, "rewards/margins": 0.37085598707199097, "rewards/rejected": -0.6008309721946716, "step": 3580 }, { "epoch": 0.9395446218267469, "grad_norm": 9.760852813720703, "learning_rate": 5.548924522327747e-09, "logits/chosen": -2.7540392875671387, "logits/rejected": -2.7462592124938965, "logps/chosen": -308.9768981933594, "logps/rejected": -327.16802978515625, "loss": 0.5826, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.299643337726593, "rewards/margins": 0.3448326587677002, "rewards/rejected": -0.6444759368896484, "step": 3590 }, { "epoch": 0.942161737764983, "grad_norm": 11.123191833496094, "learning_rate": 5.080495787955691e-09, "logits/chosen": -2.734261989593506, "logits/rejected": -2.717097043991089, "logps/chosen": -269.73223876953125, "logps/rejected": -300.8177490234375, "loss": 0.585, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.254092276096344, "rewards/margins": 0.306030809879303, "rewards/rejected": -0.560123085975647, "step": 3600 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -2.776420831680298, "eval_logits/rejected": -2.7549078464508057, "eval_logps/chosen": -309.95306396484375, "eval_logps/rejected": -321.4418029785156, "eval_loss": 0.5895029306411743, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": -0.27215421199798584, "eval_rewards/margins": 0.3278846740722656, "eval_rewards/rejected": -0.6000389456748962, "eval_runtime": 692.4927, "eval_samples_per_second": 2.888, "eval_steps_per_second": 0.361, "step": 3600 }, { "epoch": 0.944778853703219, "grad_norm": 7.403170585632324, "learning_rate": 4.632517761702814e-09, "logits/chosen": -2.7008776664733887, "logits/rejected": -2.6773476600646973, "logps/chosen": -289.5223083496094, "logps/rejected": -309.5367431640625, "loss": 0.5795, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.308106005191803, "rewards/margins": 0.33852237462997437, "rewards/rejected": -0.6466284394264221, "step": 3610 }, { "epoch": 0.9473959696414551, "grad_norm": 9.613285064697266, "learning_rate": 4.205027849605358e-09, "logits/chosen": -2.738858699798584, "logits/rejected": -2.726569414138794, "logps/chosen": -294.84014892578125, "logps/rejected": -290.58770751953125, "loss": 0.5959, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.292969286441803, "rewards/margins": 0.3049730956554413, "rewards/rejected": -0.5979424715042114, "step": 3620 }, { "epoch": 0.9500130855796912, "grad_norm": 4.820310115814209, "learning_rate": 3.798061746947995e-09, "logits/chosen": -2.785492420196533, "logits/rejected": -2.767252206802368, "logps/chosen": -311.9582214355469, "logps/rejected": -305.7359924316406, "loss": 0.5893, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.31197255849838257, "rewards/margins": 0.3228316307067871, "rewards/rejected": -0.6348041296005249, "step": 3630 }, { "epoch": 0.9526302015179272, "grad_norm": 5.795242786407471, "learning_rate": 3.411653435283157e-09, "logits/chosen": -2.7570109367370605, "logits/rejected": -2.7326931953430176, "logps/chosen": -313.0288391113281, "logps/rejected": -286.85894775390625, "loss": 0.5868, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.23662319779396057, "rewards/margins": 0.32369619607925415, "rewards/rejected": -0.5603194236755371, "step": 3640 }, { "epoch": 0.9552473174561633, "grad_norm": 8.141414642333984, "learning_rate": 3.0458351795936698e-09, "logits/chosen": -2.800523281097412, "logits/rejected": -2.7803540229797363, "logps/chosen": -287.27178955078125, "logps/rejected": -296.94482421875, "loss": 0.5557, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.20535226166248322, "rewards/margins": 0.4138811230659485, "rewards/rejected": -0.6192333102226257, "step": 3650 }, { "epoch": 0.9578644333943994, "grad_norm": 10.963499069213867, "learning_rate": 2.700637525598598e-09, "logits/chosen": -2.7325665950775146, "logits/rejected": -2.742112636566162, "logps/chosen": -318.7773742675781, "logps/rejected": -340.5607604980469, "loss": 0.6213, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2901912331581116, "rewards/margins": 0.23596885800361633, "rewards/rejected": -0.5261600613594055, "step": 3660 }, { "epoch": 0.9604815493326354, "grad_norm": 5.604915618896484, "learning_rate": 2.3760892972027324e-09, "logits/chosen": -2.8125240802764893, "logits/rejected": -2.794743061065674, "logps/chosen": -320.9376525878906, "logps/rejected": -314.6265869140625, "loss": 0.6086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3326115012168884, "rewards/margins": 0.2905888855457306, "rewards/rejected": -0.6232004165649414, "step": 3670 }, { "epoch": 0.9630986652708715, "grad_norm": 8.076900482177734, "learning_rate": 2.0722175940897645e-09, "logits/chosen": -2.730006694793701, "logits/rejected": -2.7527151107788086, "logps/chosen": -304.4130554199219, "logps/rejected": -333.45281982421875, "loss": 0.5561, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2793710231781006, "rewards/margins": 0.40125495195388794, "rewards/rejected": -0.6806259751319885, "step": 3680 }, { "epoch": 0.9657157812091076, "grad_norm": 5.261369705200195, "learning_rate": 1.7890477894593748e-09, "logits/chosen": -2.7596428394317627, "logits/rejected": -2.73931622505188, "logps/chosen": -363.08984375, "logps/rejected": -348.8448486328125, "loss": 0.5621, "rewards/accuracies": 0.75, "rewards/chosen": -0.23692288994789124, "rewards/margins": 0.40229707956314087, "rewards/rejected": -0.6392199993133545, "step": 3690 }, { "epoch": 0.9683328971473436, "grad_norm": 7.541417598724365, "learning_rate": 1.5266035279088708e-09, "logits/chosen": -2.6856465339660645, "logits/rejected": -2.6826679706573486, "logps/chosen": -347.5863952636719, "logps/rejected": -356.30120849609375, "loss": 0.567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2766670286655426, "rewards/margins": 0.367573082447052, "rewards/rejected": -0.6442400813102722, "step": 3700 }, { "epoch": 0.9683328971473436, "eval_logits/chosen": -2.7754881381988525, "eval_logits/rejected": -2.753868341445923, "eval_logps/chosen": -310.11712646484375, "eval_logps/rejected": -321.65545654296875, "eval_loss": 0.5893409252166748, "eval_rewards/accuracies": 0.7014999985694885, "eval_rewards/chosen": -0.2737952172756195, "eval_rewards/margins": 0.32838013768196106, "eval_rewards/rejected": -0.6021752953529358, "eval_runtime": 692.7848, "eval_samples_per_second": 2.887, "eval_steps_per_second": 0.361, "step": 3700 }, { "epoch": 0.9709500130855797, "grad_norm": 11.719736099243164, "learning_rate": 1.2849067234584621e-09, "logits/chosen": -2.714137315750122, "logits/rejected": -2.7111401557922363, "logps/chosen": -280.48919677734375, "logps/rejected": -300.55706787109375, "loss": 0.607, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2787315249443054, "rewards/margins": 0.30049681663513184, "rewards/rejected": -0.5792283415794373, "step": 3710 }, { "epoch": 0.9735671290238157, "grad_norm": 12.492560386657715, "learning_rate": 1.0639775577218625e-09, "logits/chosen": -2.719714403152466, "logits/rejected": -2.667534589767456, "logps/chosen": -295.1371765136719, "logps/rejected": -294.61932373046875, "loss": 0.5762, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.27527686953544617, "rewards/margins": 0.36209625005722046, "rewards/rejected": -0.637373149394989, "step": 3720 }, { "epoch": 0.9761842449620518, "grad_norm": 7.440390110015869, "learning_rate": 8.638344782207485e-10, "logits/chosen": -2.725163459777832, "logits/rejected": -2.7303969860076904, "logps/chosen": -296.50689697265625, "logps/rejected": -305.67706298828125, "loss": 0.5767, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22756004333496094, "rewards/margins": 0.3577590882778168, "rewards/rejected": -0.5853191018104553, "step": 3730 }, { "epoch": 0.9788013609002879, "grad_norm": 10.965612411499023, "learning_rate": 6.844941968447149e-10, "logits/chosen": -2.7626724243164062, "logits/rejected": -2.7460460662841797, "logps/chosen": -316.35015869140625, "logps/rejected": -349.7431945800781, "loss": 0.5453, "rewards/accuracies": 0.75, "rewards/chosen": -0.2542489767074585, "rewards/margins": 0.45952582359313965, "rewards/rejected": -0.7137748003005981, "step": 3740 }, { "epoch": 0.9814184768385239, "grad_norm": 5.883279323577881, "learning_rate": 5.25971688455612e-10, "logits/chosen": -2.7904438972473145, "logits/rejected": -2.775864362716675, "logps/chosen": -316.23297119140625, "logps/rejected": -347.6502685546875, "loss": 0.5698, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.2520793080329895, "rewards/margins": 0.36716121435165405, "rewards/rejected": -0.6192405223846436, "step": 3750 }, { "epoch": 0.98403559277676, "grad_norm": 4.377948760986328, "learning_rate": 3.882801896372967e-10, "logits/chosen": -2.785407543182373, "logits/rejected": -2.785416841506958, "logps/chosen": -311.1086120605469, "logps/rejected": -308.876220703125, "loss": 0.6124, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2892715036869049, "rewards/margins": 0.29838478565216064, "rewards/rejected": -0.5876562595367432, "step": 3760 }, { "epoch": 0.9866527087149961, "grad_norm": 8.081770896911621, "learning_rate": 2.714311975902661e-10, "logits/chosen": -2.7383980751037598, "logits/rejected": -2.710829257965088, "logps/chosen": -330.71771240234375, "logps/rejected": -337.7955627441406, "loss": 0.5649, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.25471562147140503, "rewards/margins": 0.3609776496887207, "rewards/rejected": -0.6156932711601257, "step": 3770 }, { "epoch": 0.9892698246532321, "grad_norm": 7.887190818786621, "learning_rate": 1.754344691717591e-10, "logits/chosen": -2.761021852493286, "logits/rejected": -2.7340810298919678, "logps/chosen": -295.04718017578125, "logps/rejected": -336.95147705078125, "loss": 0.6306, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.27056482434272766, "rewards/margins": 0.2132827490568161, "rewards/rejected": -0.4838475286960602, "step": 3780 }, { "epoch": 0.9918869405914682, "grad_norm": 7.817293643951416, "learning_rate": 1.0029802008096333e-10, "logits/chosen": -2.7683863639831543, "logits/rejected": -2.7289211750030518, "logps/chosen": -316.55340576171875, "logps/rejected": -334.72845458984375, "loss": 0.5639, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25747808814048767, "rewards/margins": 0.4023471474647522, "rewards/rejected": -0.6598252654075623, "step": 3790 }, { "epoch": 0.9945040565297043, "grad_norm": 6.426971435546875, "learning_rate": 4.602812418974533e-11, "logits/chosen": -2.791513442993164, "logits/rejected": -2.7664811611175537, "logps/chosen": -328.2163391113281, "logps/rejected": -337.54974365234375, "loss": 0.5834, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.2448674440383911, "rewards/margins": 0.3448673486709595, "rewards/rejected": -0.5897347927093506, "step": 3800 }, { "epoch": 0.9945040565297043, "eval_logits/chosen": -2.7742366790771484, "eval_logits/rejected": -2.7524607181549072, "eval_logps/chosen": -310.13330078125, "eval_logps/rejected": -321.6666259765625, "eval_loss": 0.5893096923828125, "eval_rewards/accuracies": 0.7024999856948853, "eval_rewards/chosen": -0.2739570438861847, "eval_rewards/margins": 0.32832974195480347, "eval_rewards/rejected": -0.6022867560386658, "eval_runtime": 692.7928, "eval_samples_per_second": 2.887, "eval_steps_per_second": 0.361, "step": 3800 }, { "epoch": 0.9971211724679403, "grad_norm": 7.6028289794921875, "learning_rate": 1.2629313018819309e-11, "logits/chosen": -2.7530319690704346, "logits/rejected": -2.7311320304870605, "logps/chosen": -300.90142822265625, "logps/rejected": -311.88006591796875, "loss": 0.5936, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.2676599621772766, "rewards/margins": 0.3127606511116028, "rewards/rejected": -0.5804205536842346, "step": 3810 }, { "epoch": 0.9997382884061764, "grad_norm": 10.209754943847656, "learning_rate": 1.0437535929996855e-13, "logits/chosen": -2.765655279159546, "logits/rejected": -2.7465381622314453, "logps/chosen": -334.4398498535156, "logps/rejected": -327.4457702636719, "loss": 0.5626, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2764059007167816, "rewards/margins": 0.397102028131485, "rewards/rejected": -0.6735079288482666, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6164219083351729, "train_runtime": 73481.1174, "train_samples_per_second": 0.832, "train_steps_per_second": 0.052 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }