diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026171159382360636, + "grad_norm": 1.999703049659729, + "learning_rate": 1.3054830287206266e-09, + "logits/chosen": -2.9875593185424805, + "logits/rejected": -2.936753749847412, + "logps/chosen": -307.4898681640625, + "logps/rejected": -392.088623046875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0026171159382360636, + "grad_norm": 1.9285504817962646, + "learning_rate": 1.3054830287206264e-08, + "logits/chosen": -2.8448944091796875, + "logits/rejected": -2.83210825920105, + "logps/chosen": -299.1453857421875, + "logps/rejected": -260.9873352050781, + "loss": 0.693, + "rewards/accuracies": 0.4930555522441864, + "rewards/chosen": -0.00014580304559785873, + "rewards/margins": 0.0003282717370893806, + "rewards/rejected": -0.00047407473903149366, + "step": 10 + }, + { + "epoch": 0.005234231876472127, + "grad_norm": 2.234384775161743, + "learning_rate": 2.610966057441253e-08, + "logits/chosen": -2.861093044281006, + "logits/rejected": -2.826277732849121, + "logps/chosen": -325.42889404296875, + "logps/rejected": -252.72314453125, + "loss": 0.6928, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.00027085753390565515, + "rewards/margins": 0.0006726925494149327, + "rewards/rejected": -0.00040183504461310804, + "step": 20 + }, + { + "epoch": 0.007851347814708191, + "grad_norm": 2.5200695991516113, + "learning_rate": 3.91644908616188e-08, + "logits/chosen": -2.8650269508361816, + "logits/rejected": -2.839594841003418, + "logps/chosen": -269.79888916015625, + "logps/rejected": -268.51544189453125, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0004993680049665272, + "rewards/margins": 0.0007416309672407806, + "rewards/rejected": -0.00024226296227425337, + "step": 30 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 1.6392391920089722, + "learning_rate": 5.221932114882506e-08, + "logits/chosen": -2.8317809104919434, + "logits/rejected": -2.8215935230255127, + "logps/chosen": -233.3176727294922, + "logps/rejected": -238.38671875, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.822711369139142e-05, + "rewards/margins": 0.000457162968814373, + "rewards/rejected": -0.0004953901516273618, + "step": 40 + }, + { + "epoch": 0.01308557969118032, + "grad_norm": 1.624583125114441, + "learning_rate": 6.527415143603133e-08, + "logits/chosen": -2.865053176879883, + "logits/rejected": -2.852184295654297, + "logps/chosen": -290.0357360839844, + "logps/rejected": -253.96719360351562, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00021998901502229273, + "rewards/margins": 8.350692223757505e-05, + "rewards/rejected": -0.00030349590815603733, + "step": 50 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 1.7673835754394531, + "learning_rate": 7.83289817232376e-08, + "logits/chosen": -2.8233509063720703, + "logits/rejected": -2.809717893600464, + "logps/chosen": -273.7070617675781, + "logps/rejected": -246.9080352783203, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00012425810564309359, + "rewards/margins": 6.1127066146582365e-06, + "rewards/rejected": -0.00013037076860200614, + "step": 60 + }, + { + "epoch": 0.018319811567652448, + "grad_norm": 1.7462002038955688, + "learning_rate": 9.138381201044386e-08, + "logits/chosen": -2.8822834491729736, + "logits/rejected": -2.8470146656036377, + "logps/chosen": -293.1849060058594, + "logps/rejected": -266.12908935546875, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00014021484821569175, + "rewards/margins": 4.102182720089331e-05, + "rewards/rejected": -0.00018123674090020359, + "step": 70 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 2.281116008758545, + "learning_rate": 1.0443864229765012e-07, + "logits/chosen": -2.820223331451416, + "logits/rejected": -2.797712564468384, + "logps/chosen": -279.3045959472656, + "logps/rejected": -266.4049072265625, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00035889382706955075, + "rewards/margins": -3.7797075492562726e-05, + "rewards/rejected": -0.00032109676976688206, + "step": 80 + }, + { + "epoch": 0.023554043444124574, + "grad_norm": 1.8048748970031738, + "learning_rate": 1.174934725848564e-07, + "logits/chosen": -2.834364652633667, + "logits/rejected": -2.821197032928467, + "logps/chosen": -270.66107177734375, + "logps/rejected": -251.8137664794922, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5717377866385505e-05, + "rewards/margins": 0.00027994689298793674, + "rewards/rejected": -0.00030566431814804673, + "step": 90 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 1.8376109600067139, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.8485753536224365, + "logits/rejected": -2.8414525985717773, + "logps/chosen": -267.0416259765625, + "logps/rejected": -248.66622924804688, + "loss": 0.6929, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00016696630336809903, + "rewards/margins": 0.0004185012076050043, + "rewards/rejected": -0.0005854673800058663, + "step": 100 + }, + { + "epoch": 0.02617115938236064, + "eval_logits/chosen": -2.8661274909973145, + "eval_logits/rejected": -2.8388071060180664, + "eval_logps/chosen": -282.74957275390625, + "eval_logps/rejected": -261.47882080078125, + "eval_loss": 0.693004846572876, + "eval_rewards/accuracies": 0.5249999761581421, + "eval_rewards/chosen": -0.00011926326260436326, + "eval_rewards/margins": 0.0002895805810112506, + "eval_rewards/rejected": -0.00040884382906369865, + "eval_runtime": 692.2735, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 100 + }, + { + "epoch": 0.028788275320596704, + "grad_norm": 2.015868663787842, + "learning_rate": 1.4360313315926893e-07, + "logits/chosen": -2.856309652328491, + "logits/rejected": -2.823089361190796, + "logps/chosen": -307.3843994140625, + "logps/rejected": -257.291015625, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0002652711991686374, + "rewards/margins": -0.00011966088641202077, + "rewards/rejected": -0.00014561018906533718, + "step": 110 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 1.7159242630004883, + "learning_rate": 1.566579634464752e-07, + "logits/chosen": -2.869659423828125, + "logits/rejected": -2.8464877605438232, + "logps/chosen": -310.60089111328125, + "logps/rejected": -287.7904357910156, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0001522126840427518, + "rewards/margins": 0.0004031356074847281, + "rewards/rejected": -0.00025092283613048494, + "step": 120 + }, + { + "epoch": 0.03402250719706883, + "grad_norm": 2.0958242416381836, + "learning_rate": 1.6971279373368143e-07, + "logits/chosen": -2.850337266921997, + "logits/rejected": -2.8188374042510986, + "logps/chosen": -271.6417236328125, + "logps/rejected": -269.60174560546875, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.00013120910443831235, + "rewards/margins": 0.0006835443200543523, + "rewards/rejected": -0.0005523352883756161, + "step": 130 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 1.8925613164901733, + "learning_rate": 1.8276762402088773e-07, + "logits/chosen": -2.8673295974731445, + "logits/rejected": -2.8122167587280273, + "logps/chosen": -291.46307373046875, + "logps/rejected": -247.7669677734375, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00036148293293081224, + "rewards/margins": 0.0009279497899115086, + "rewards/rejected": -0.0005664670607075095, + "step": 140 + }, + { + "epoch": 0.03925673907354096, + "grad_norm": 1.9597433805465698, + "learning_rate": 1.95822454308094e-07, + "logits/chosen": -2.8569109439849854, + "logits/rejected": -2.837003707885742, + "logps/chosen": -298.9459228515625, + "logps/rejected": -256.0478515625, + "loss": 0.6927, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0006078753503970802, + "rewards/margins": 0.0009616016177460551, + "rewards/rejected": -0.00035372626734897494, + "step": 150 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 1.913694977760315, + "learning_rate": 2.0887728459530023e-07, + "logits/chosen": -2.864971876144409, + "logits/rejected": -2.8458945751190186, + "logps/chosen": -275.124755859375, + "logps/rejected": -275.0151062011719, + "loss": 0.6926, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.00012734555639326572, + "rewards/margins": 0.001163811655715108, + "rewards/rejected": -0.0010364660993218422, + "step": 160 + }, + { + "epoch": 0.04449097095001309, + "grad_norm": 2.1846537590026855, + "learning_rate": 2.2193211488250652e-07, + "logits/chosen": -2.822680950164795, + "logits/rejected": -2.8042876720428467, + "logps/chosen": -236.7074432373047, + "logps/rejected": -238.3466339111328, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0003129563410766423, + "rewards/margins": 0.0008108107140287757, + "rewards/rejected": -0.0011237671133130789, + "step": 170 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 1.6035895347595215, + "learning_rate": 2.349869451697128e-07, + "logits/chosen": -2.850816249847412, + "logits/rejected": -2.823718309402466, + "logps/chosen": -276.2500915527344, + "logps/rejected": -259.9451904296875, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0001872165739769116, + "rewards/margins": 0.0008747532265260816, + "rewards/rejected": -0.001061969785951078, + "step": 180 + }, + { + "epoch": 0.04972520282648522, + "grad_norm": 3.182461738586426, + "learning_rate": 2.4804177545691903e-07, + "logits/chosen": -2.8869190216064453, + "logits/rejected": -2.8687491416931152, + "logps/chosen": -290.9490661621094, + "logps/rejected": -257.3797302246094, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0006612293072976172, + "rewards/margins": 0.000992011046037078, + "rewards/rejected": -0.00033078185515478253, + "step": 190 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 1.8618322610855103, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.837772846221924, + "logits/rejected": -2.8276214599609375, + "logps/chosen": -267.96173095703125, + "logps/rejected": -225.5831756591797, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0006539617897942662, + "rewards/margins": 0.0017792375292629004, + "rewards/rejected": -0.0011252757394686341, + "step": 200 + }, + { + "epoch": 0.05234231876472128, + "eval_logits/chosen": -2.8652713298797607, + "eval_logits/rejected": -2.837984800338745, + "eval_logps/chosen": -282.66241455078125, + "eval_logps/rejected": -261.5315856933594, + "eval_loss": 0.6923088431358337, + "eval_rewards/accuracies": 0.6050000190734863, + "eval_rewards/chosen": 0.0007522286614403129, + "eval_rewards/margins": 0.001688659773208201, + "eval_rewards/rejected": -0.0009364310535602272, + "eval_runtime": 693.0899, + "eval_samples_per_second": 2.886, + "eval_steps_per_second": 0.361, + "step": 200 + }, + { + "epoch": 0.05495943470295734, + "grad_norm": 1.7776113748550415, + "learning_rate": 2.7415143603133156e-07, + "logits/chosen": -2.8762500286102295, + "logits/rejected": -2.8429489135742188, + "logps/chosen": -275.98614501953125, + "logps/rejected": -245.2783660888672, + "loss": 0.6922, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0009210329735651612, + "rewards/margins": 0.0018816586816683412, + "rewards/rejected": -0.0009606255334801972, + "step": 210 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 1.6921358108520508, + "learning_rate": 2.8720626631853785e-07, + "logits/chosen": -2.817211627960205, + "logits/rejected": -2.811617851257324, + "logps/chosen": -274.0498962402344, + "logps/rejected": -242.93923950195312, + "loss": 0.6919, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0014726849040016532, + "rewards/margins": 0.0024847507011145353, + "rewards/rejected": -0.0010120656806975603, + "step": 220 + }, + { + "epoch": 0.06019366657942947, + "grad_norm": 2.0040206909179688, + "learning_rate": 3.002610966057441e-07, + "logits/chosen": -2.885439157485962, + "logits/rejected": -2.86034893989563, + "logps/chosen": -322.754150390625, + "logps/rejected": -285.758056640625, + "loss": 0.6922, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0018624020740389824, + "rewards/margins": 0.0018660586792975664, + "rewards/rejected": -3.6565586469805567e-06, + "step": 230 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 1.809605360031128, + "learning_rate": 3.133159268929504e-07, + "logits/chosen": -2.8532462120056152, + "logits/rejected": -2.8391811847686768, + "logps/chosen": -312.47088623046875, + "logps/rejected": -297.48907470703125, + "loss": 0.6921, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0017323732608929276, + "rewards/margins": 0.0021942437160760164, + "rewards/rejected": -0.00046187033876776695, + "step": 240 + }, + { + "epoch": 0.06542789845590159, + "grad_norm": 1.6686596870422363, + "learning_rate": 3.263707571801567e-07, + "logits/chosen": -2.814990282058716, + "logits/rejected": -2.81905198097229, + "logps/chosen": -277.08941650390625, + "logps/rejected": -249.03414916992188, + "loss": 0.6915, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.001977517269551754, + "rewards/margins": 0.003367725061252713, + "rewards/rejected": -0.0013902074424549937, + "step": 250 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 1.5935229063034058, + "learning_rate": 3.3942558746736286e-07, + "logits/chosen": -2.8718338012695312, + "logits/rejected": -2.8251404762268066, + "logps/chosen": -297.3100280761719, + "logps/rejected": -277.9830017089844, + "loss": 0.6916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0025989424902945757, + "rewards/margins": 0.0032064050901681185, + "rewards/rejected": -0.00060746242525056, + "step": 260 + }, + { + "epoch": 0.07066213033237373, + "grad_norm": 1.4248483180999756, + "learning_rate": 3.5248041775456916e-07, + "logits/chosen": -2.8370590209960938, + "logits/rejected": -2.8248658180236816, + "logps/chosen": -281.2889709472656, + "logps/rejected": -245.48855590820312, + "loss": 0.6901, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.003083507064729929, + "rewards/margins": 0.006150919944047928, + "rewards/rejected": -0.003067413344979286, + "step": 270 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 1.725456714630127, + "learning_rate": 3.6553524804177545e-07, + "logits/chosen": -2.8781139850616455, + "logits/rejected": -2.8350632190704346, + "logps/chosen": -276.51568603515625, + "logps/rejected": -253.5542755126953, + "loss": 0.6906, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0024674157612025738, + "rewards/margins": 0.005187267437577248, + "rewards/rejected": -0.00271985144354403, + "step": 280 + }, + { + "epoch": 0.07589636220884585, + "grad_norm": 1.9681357145309448, + "learning_rate": 3.785900783289817e-07, + "logits/chosen": -2.849203586578369, + "logits/rejected": -2.838613986968994, + "logps/chosen": -304.06463623046875, + "logps/rejected": -279.3326721191406, + "loss": 0.6901, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0034332734066993, + "rewards/margins": 0.0062034172005951405, + "rewards/rejected": -0.0027701437938958406, + "step": 290 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 2.0513315200805664, + "learning_rate": 3.91644908616188e-07, + "logits/chosen": -2.8060500621795654, + "logits/rejected": -2.76236629486084, + "logps/chosen": -266.20794677734375, + "logps/rejected": -248.80886840820312, + "loss": 0.6898, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.0026042419485747814, + "rewards/margins": 0.006667142268270254, + "rewards/rejected": -0.004062901251018047, + "step": 300 + }, + { + "epoch": 0.07851347814708191, + "eval_logits/chosen": -2.8622689247131348, + "eval_logits/rejected": -2.834963321685791, + "eval_logps/chosen": -282.39178466796875, + "eval_logps/rejected": -261.6759948730469, + "eval_loss": 0.6902644038200378, + "eval_rewards/accuracies": 0.6639999747276306, + "eval_rewards/chosen": 0.0034584649838507175, + "eval_rewards/margins": 0.0058389026671648026, + "eval_rewards/rejected": -0.0023804374504834414, + "eval_runtime": 692.5367, + "eval_samples_per_second": 2.888, + "eval_steps_per_second": 0.361, + "step": 300 + }, + { + "epoch": 0.08113059408531798, + "grad_norm": 2.1205692291259766, + "learning_rate": 4.046997389033943e-07, + "logits/chosen": -2.893097400665283, + "logits/rejected": -2.87463641166687, + "logps/chosen": -306.21636962890625, + "logps/rejected": -250.2729949951172, + "loss": 0.6888, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004871034994721413, + "rewards/margins": 0.008721152320504189, + "rewards/rejected": -0.003850117791444063, + "step": 310 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 1.7468680143356323, + "learning_rate": 4.1775456919060046e-07, + "logits/chosen": -2.873706817626953, + "logits/rejected": -2.8421998023986816, + "logps/chosen": -272.94659423828125, + "logps/rejected": -255.0898895263672, + "loss": 0.6904, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.00492675369605422, + "rewards/margins": 0.005588999018073082, + "rewards/rejected": -0.000662245147395879, + "step": 320 + }, + { + "epoch": 0.08636482596179011, + "grad_norm": 1.7784926891326904, + "learning_rate": 4.3080939947780675e-07, + "logits/chosen": -2.8389968872070312, + "logits/rejected": -2.8390631675720215, + "logps/chosen": -277.24652099609375, + "logps/rejected": -250.9720458984375, + "loss": 0.6892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007157427724450827, + "rewards/margins": 0.00795576348900795, + "rewards/rejected": -0.0007983351242728531, + "step": 330 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 2.0122432708740234, + "learning_rate": 4.4386422976501305e-07, + "logits/chosen": -2.868762254714966, + "logits/rejected": -2.8562684059143066, + "logps/chosen": -306.8142395019531, + "logps/rejected": -284.90679931640625, + "loss": 0.6886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00881933607161045, + "rewards/margins": 0.009326713159680367, + "rewards/rejected": -0.000507376913446933, + "step": 340 + }, + { + "epoch": 0.09159905783826224, + "grad_norm": 1.7484519481658936, + "learning_rate": 4.569190600522193e-07, + "logits/chosen": -2.824993848800659, + "logits/rejected": -2.797851085662842, + "logps/chosen": -309.11224365234375, + "logps/rejected": -296.3442687988281, + "loss": 0.6894, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.009017640724778175, + "rewards/margins": 0.00765979802235961, + "rewards/rejected": 0.0013578429352492094, + "step": 350 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 1.2647193670272827, + "learning_rate": 4.699738903394256e-07, + "logits/chosen": -2.8344480991363525, + "logits/rejected": -2.816068649291992, + "logps/chosen": -256.1959533691406, + "logps/rejected": -236.88818359375, + "loss": 0.6883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007074951194226742, + "rewards/margins": 0.009867229498922825, + "rewards/rejected": -0.0027922778390347958, + "step": 360 + }, + { + "epoch": 0.09683328971473436, + "grad_norm": 2.0885772705078125, + "learning_rate": 4.830287206266319e-07, + "logits/chosen": -2.8475875854492188, + "logits/rejected": -2.8186795711517334, + "logps/chosen": -295.1861572265625, + "logps/rejected": -251.5151824951172, + "loss": 0.6856, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010460047982633114, + "rewards/margins": 0.015231410041451454, + "rewards/rejected": -0.004771359730511904, + "step": 370 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 1.8870456218719482, + "learning_rate": 4.960835509138381e-07, + "logits/chosen": -2.8488352298736572, + "logits/rejected": -2.7997212409973145, + "logps/chosen": -315.6346740722656, + "logps/rejected": -279.5706481933594, + "loss": 0.6871, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.011897383257746696, + "rewards/margins": 0.012441580183804035, + "rewards/rejected": -0.0005441965768113732, + "step": 380 + }, + { + "epoch": 0.1020675215912065, + "grad_norm": 2.3549890518188477, + "learning_rate": 4.999948856244767e-07, + "logits/chosen": -2.8280773162841797, + "logits/rejected": -2.8224241733551025, + "logps/chosen": -297.057373046875, + "logps/rejected": -278.00421142578125, + "loss": 0.6836, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.01873602904379368, + "rewards/margins": 0.01945691928267479, + "rewards/rejected": -0.0007208908209577203, + "step": 390 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 1.818867802619934, + "learning_rate": 4.999698361256577e-07, + "logits/chosen": -2.851010799407959, + "logits/rejected": -2.8151259422302246, + "logps/chosen": -279.1597900390625, + "logps/rejected": -237.5978546142578, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016593072563409805, + "rewards/margins": 0.012265140190720558, + "rewards/rejected": 0.004327933304011822, + "step": 400 + }, + { + "epoch": 0.10468463752944256, + "eval_logits/chosen": -2.857703685760498, + "eval_logits/rejected": -2.830756425857544, + "eval_logps/chosen": -281.0899963378906, + "eval_logps/rejected": -261.22564697265625, + "eval_loss": 0.6861628293991089, + "eval_rewards/accuracies": 0.6669999957084656, + "eval_rewards/chosen": 0.01647624559700489, + "eval_rewards/margins": 0.014353430829942226, + "eval_rewards/rejected": 0.002122814767062664, + "eval_runtime": 692.2781, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 400 + }, + { + "epoch": 0.10730175346767862, + "grad_norm": 1.9545940160751343, + "learning_rate": 4.99923914217458e-07, + "logits/chosen": -2.818399667739868, + "logits/rejected": -2.802830457687378, + "logps/chosen": -256.24957275390625, + "logps/rejected": -256.09527587890625, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.013771469704806805, + "rewards/margins": 0.008097216486930847, + "rewards/rejected": 0.005674251355230808, + "step": 410 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 4.077869415283203, + "learning_rate": 4.99857123734344e-07, + "logits/chosen": -2.8153655529022217, + "logits/rejected": -2.769317865371704, + "logps/chosen": -244.53890991210938, + "logps/rejected": -238.0004119873047, + "loss": 0.6855, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.015213017351925373, + "rewards/margins": 0.015682024881243706, + "rewards/rejected": -0.00046900735469534993, + "step": 420 + }, + { + "epoch": 0.11253598534415074, + "grad_norm": 2.243114471435547, + "learning_rate": 4.997694702533016e-07, + "logits/chosen": -2.837740182876587, + "logits/rejected": -2.806856870651245, + "logps/chosen": -293.7519836425781, + "logps/rejected": -272.25494384765625, + "loss": 0.6835, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.022876007482409477, + "rewards/margins": 0.019848225638270378, + "rewards/rejected": 0.0030277802143245935, + "step": 430 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 1.829640507698059, + "learning_rate": 4.996609610933712e-07, + "logits/chosen": -2.875370740890503, + "logits/rejected": -2.8540024757385254, + "logps/chosen": -285.1123962402344, + "logps/rejected": -256.6170654296875, + "loss": 0.6833, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.02266586944460869, + "rewards/margins": 0.020275097340345383, + "rewards/rejected": 0.0023907723370939493, + "step": 440 + }, + { + "epoch": 0.11777021722062288, + "grad_norm": 1.756147861480713, + "learning_rate": 4.995316053150366e-07, + "logits/chosen": -2.806842088699341, + "logits/rejected": -2.8101210594177246, + "logps/chosen": -288.1036376953125, + "logps/rejected": -259.46014404296875, + "loss": 0.6824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.029574494808912277, + "rewards/margins": 0.022273657843470573, + "rewards/rejected": 0.007300837431102991, + "step": 450 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 3.1120874881744385, + "learning_rate": 4.99381413719468e-07, + "logits/chosen": -2.825704574584961, + "logits/rejected": -2.81204891204834, + "logps/chosen": -279.86334228515625, + "logps/rejected": -268.80755615234375, + "loss": 0.6796, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.029285842552781105, + "rewards/margins": 0.027944009751081467, + "rewards/rejected": 0.0013418343150988221, + "step": 460 + }, + { + "epoch": 0.123004449097095, + "grad_norm": 1.9212427139282227, + "learning_rate": 4.992103988476205e-07, + "logits/chosen": -2.83656644821167, + "logits/rejected": -2.810007333755493, + "logps/chosen": -257.7132873535156, + "logps/rejected": -245.3390655517578, + "loss": 0.6831, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.024322878569364548, + "rewards/margins": 0.020839061588048935, + "rewards/rejected": 0.003483818843960762, + "step": 470 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 2.0051708221435547, + "learning_rate": 4.990185749791864e-07, + "logits/chosen": -2.868682622909546, + "logits/rejected": -2.836199998855591, + "logps/chosen": -271.63922119140625, + "logps/rejected": -274.00189208984375, + "loss": 0.68, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.027854889631271362, + "rewards/margins": 0.0271223783493042, + "rewards/rejected": 0.0007325109909288585, + "step": 480 + }, + { + "epoch": 0.12823868097356714, + "grad_norm": 2.0355913639068604, + "learning_rate": 4.988059581314039e-07, + "logits/chosen": -2.8479950428009033, + "logits/rejected": -2.8285024166107178, + "logps/chosen": -305.7145690917969, + "logps/rejected": -269.5832214355469, + "loss": 0.6789, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02704049088060856, + "rewards/margins": 0.029708972200751305, + "rewards/rejected": -0.002668480621650815, + "step": 490 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 1.996235966682434, + "learning_rate": 4.985725660577184e-07, + "logits/chosen": -2.8617165088653564, + "logits/rejected": -2.843017101287842, + "logps/chosen": -288.36846923828125, + "logps/rejected": -249.8210906982422, + "loss": 0.6783, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.023136448115110397, + "rewards/margins": 0.031000768765807152, + "rewards/rejected": -0.007864321582019329, + "step": 500 + }, + { + "epoch": 0.13085579691180318, + "eval_logits/chosen": -2.848633289337158, + "eval_logits/rejected": -2.8214972019195557, + "eval_logps/chosen": -280.6480712890625, + "eval_logps/rejected": -262.0230407714844, + "eval_loss": 0.6803756356239319, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": 0.020895304158329964, + "eval_rewards/margins": 0.026746317744255066, + "eval_rewards/rejected": -0.005851015914231539, + "eval_runtime": 691.0122, + "eval_samples_per_second": 2.894, + "eval_steps_per_second": 0.362, + "step": 500 + }, + { + "epoch": 0.13347291285003926, + "grad_norm": 2.2953689098358154, + "learning_rate": 4.983184182463008e-07, + "logits/chosen": -2.83900785446167, + "logits/rejected": -2.8163068294525146, + "logps/chosen": -292.3056335449219, + "logps/rejected": -256.3818359375, + "loss": 0.6779, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0237285066395998, + "rewards/margins": 0.03204946964979172, + "rewards/rejected": -0.008320963010191917, + "step": 510 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 2.152860164642334, + "learning_rate": 4.980435359184203e-07, + "logits/chosen": -2.8620104789733887, + "logits/rejected": -2.8637924194335938, + "logps/chosen": -285.1622314453125, + "logps/rejected": -270.9977722167969, + "loss": 0.6791, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.026320820674300194, + "rewards/margins": 0.029663830995559692, + "rewards/rejected": -0.0033430135808885098, + "step": 520 + }, + { + "epoch": 0.13870714472651138, + "grad_norm": 2.3760368824005127, + "learning_rate": 4.977479420266723e-07, + "logits/chosen": -2.8074328899383545, + "logits/rejected": -2.8127429485321045, + "logps/chosen": -278.2021484375, + "logps/rejected": -288.5596618652344, + "loss": 0.6792, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02414657548069954, + "rewards/margins": 0.02932720258831978, + "rewards/rejected": -0.005180628038942814, + "step": 530 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 1.8068273067474365, + "learning_rate": 4.974316612530614e-07, + "logits/chosen": -2.799464464187622, + "logits/rejected": -2.781719446182251, + "logps/chosen": -296.43017578125, + "logps/rejected": -260.1778869628906, + "loss": 0.6685, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.03263556957244873, + "rewards/margins": 0.05155158042907715, + "rewards/rejected": -0.018916018307209015, + "step": 540 + }, + { + "epoch": 0.1439413766029835, + "grad_norm": 2.295518636703491, + "learning_rate": 4.970947200069415e-07, + "logits/chosen": -2.8136024475097656, + "logits/rejected": -2.8002548217773438, + "logps/chosen": -296.8650817871094, + "logps/rejected": -277.0992431640625, + "loss": 0.6793, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.026846662163734436, + "rewards/margins": 0.029769038781523705, + "rewards/rejected": -0.0029223733581602573, + "step": 550 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 1.8040831089019775, + "learning_rate": 4.967371464228095e-07, + "logits/chosen": -2.8747551441192627, + "logits/rejected": -2.8538835048675537, + "logps/chosen": -269.18994140625, + "logps/rejected": -272.37799072265625, + "loss": 0.6782, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.026889195665717125, + "rewards/margins": 0.03184649348258972, + "rewards/rejected": -0.004957299679517746, + "step": 560 + }, + { + "epoch": 0.14917560847945563, + "grad_norm": 2.131438970565796, + "learning_rate": 4.963589703579569e-07, + "logits/chosen": -2.899491310119629, + "logits/rejected": -2.8730692863464355, + "logps/chosen": -313.0187072753906, + "logps/rejected": -280.3568420410156, + "loss": 0.6752, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.028542449697852135, + "rewards/margins": 0.03851853683590889, + "rewards/rejected": -0.009976087138056755, + "step": 570 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 1.8194427490234375, + "learning_rate": 4.959602233899761e-07, + "logits/chosen": -2.892979621887207, + "logits/rejected": -2.8543694019317627, + "logps/chosen": -311.68353271484375, + "logps/rejected": -272.5694580078125, + "loss": 0.673, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.035731758922338486, + "rewards/margins": 0.04327362775802612, + "rewards/rejected": -0.007541867904365063, + "step": 580 + }, + { + "epoch": 0.15440984035592778, + "grad_norm": 2.1900675296783447, + "learning_rate": 4.955409388141243e-07, + "logits/chosen": -2.8265955448150635, + "logits/rejected": -2.8132894039154053, + "logps/chosen": -273.9072265625, + "logps/rejected": -251.5390167236328, + "loss": 0.6752, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.016455931589007378, + "rewards/margins": 0.03850039094686508, + "rewards/rejected": -0.022044459357857704, + "step": 590 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 1.8198952674865723, + "learning_rate": 4.951011516405429e-07, + "logits/chosen": -2.84102201461792, + "logits/rejected": -2.84004807472229, + "logps/chosen": -265.394775390625, + "logps/rejected": -252.8574676513672, + "loss": 0.6729, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.021321838721632957, + "rewards/margins": 0.04377777501940727, + "rewards/rejected": -0.022455941885709763, + "step": 600 + }, + { + "epoch": 0.15702695629416383, + "eval_logits/chosen": -2.8409736156463623, + "eval_logits/rejected": -2.813835382461548, + "eval_logps/chosen": -281.19580078125, + "eval_logps/rejected": -264.16082763671875, + "eval_loss": 0.6732848882675171, + "eval_rewards/accuracies": 0.6840000152587891, + "eval_rewards/chosen": 0.015417821705341339, + "eval_rewards/margins": 0.04264672100543976, + "eval_rewards/rejected": -0.02722889743745327, + "eval_runtime": 691.9111, + "eval_samples_per_second": 2.891, + "eval_steps_per_second": 0.361, + "step": 600 + }, + { + "epoch": 0.1596440722323999, + "grad_norm": 2.117947578430176, + "learning_rate": 4.946408985913344e-07, + "logits/chosen": -2.834245204925537, + "logits/rejected": -2.8125996589660645, + "logps/chosen": -262.54144287109375, + "logps/rejected": -246.34860229492188, + "loss": 0.6734, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.020137300714850426, + "rewards/margins": 0.04266170784831047, + "rewards/rejected": -0.022524405270814896, + "step": 610 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 2.218667507171631, + "learning_rate": 4.941602180974958e-07, + "logits/chosen": -2.8357930183410645, + "logits/rejected": -2.7973721027374268, + "logps/chosen": -303.65606689453125, + "logps/rejected": -245.33108520507812, + "loss": 0.6696, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.019601870328187943, + "rewards/margins": 0.049685824662446976, + "rewards/rejected": -0.030083950608968735, + "step": 620 + }, + { + "epoch": 0.16487830410887203, + "grad_norm": 1.9840420484542847, + "learning_rate": 4.936591502957101e-07, + "logits/chosen": -2.8378233909606934, + "logits/rejected": -2.8140475749969482, + "logps/chosen": -261.1944580078125, + "logps/rejected": -257.957763671875, + "loss": 0.6647, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.024741780012845993, + "rewards/margins": 0.06145521253347397, + "rewards/rejected": -0.036713436245918274, + "step": 630 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 2.034658432006836, + "learning_rate": 4.931377370249945e-07, + "logits/chosen": -2.845576763153076, + "logits/rejected": -2.78796124458313, + "logps/chosen": -281.0826110839844, + "logps/rejected": -263.23370361328125, + "loss": 0.6673, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.524250056827441e-05, + "rewards/margins": 0.05555204302072525, + "rewards/rejected": -0.055617284029722214, + "step": 640 + }, + { + "epoch": 0.17011253598534415, + "grad_norm": 2.102283239364624, + "learning_rate": 4.925960218232072e-07, + "logits/chosen": -2.8266994953155518, + "logits/rejected": -2.8046762943267822, + "logps/chosen": -269.2861633300781, + "logps/rejected": -264.4281005859375, + "loss": 0.6646, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.008663799613714218, + "rewards/margins": 0.06168809533119202, + "rewards/rejected": -0.0530242919921875, + "step": 650 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 3.1403772830963135, + "learning_rate": 4.920340499234116e-07, + "logits/chosen": -2.796461343765259, + "logits/rejected": -2.757336139678955, + "logps/chosen": -285.25445556640625, + "logps/rejected": -251.8562469482422, + "loss": 0.6684, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.010964155197143555, + "rewards/margins": 0.05367765575647354, + "rewards/rejected": -0.04271350055932999, + "step": 660 + }, + { + "epoch": 0.17534676786181627, + "grad_norm": 1.932573914527893, + "learning_rate": 4.914518682500995e-07, + "logits/chosen": -2.870535373687744, + "logits/rejected": -2.840186595916748, + "logps/chosen": -297.72967529296875, + "logps/rejected": -261.30780029296875, + "loss": 0.661, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0239148810505867, + "rewards/margins": 0.07002829760313034, + "rewards/rejected": -0.04611341655254364, + "step": 670 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 2.7643067836761475, + "learning_rate": 4.90849525415273e-07, + "logits/chosen": -2.830029249191284, + "logits/rejected": -2.8078887462615967, + "logps/chosen": -288.3429260253906, + "logps/rejected": -245.07369995117188, + "loss": 0.6589, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02092517912387848, + "rewards/margins": 0.07483113557100296, + "rewards/rejected": -0.05390595644712448, + "step": 680 + }, + { + "epoch": 0.1805809997382884, + "grad_norm": 2.184591054916382, + "learning_rate": 4.902270717143858e-07, + "logits/chosen": -2.837787628173828, + "logits/rejected": -2.8210721015930176, + "logps/chosen": -255.417724609375, + "logps/rejected": -272.31591796875, + "loss": 0.6509, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.005492637865245342, + "rewards/margins": 0.09071613848209381, + "rewards/rejected": -0.0852234959602356, + "step": 690 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 2.2565648555755615, + "learning_rate": 4.895845591221426e-07, + "logits/chosen": -2.833556652069092, + "logits/rejected": -2.836822032928467, + "logps/chosen": -269.5510559082031, + "logps/rejected": -269.97686767578125, + "loss": 0.6665, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.003929516766220331, + "rewards/margins": 0.058883119374513626, + "rewards/rejected": -0.06281263381242752, + "step": 700 + }, + { + "epoch": 0.18319811567652447, + "eval_logits/chosen": -2.8327224254608154, + "eval_logits/rejected": -2.8060340881347656, + "eval_logps/chosen": -283.0862731933594, + "eval_logps/rejected": -268.32659912109375, + "eval_loss": 0.6637989282608032, + "eval_rewards/accuracies": 0.6754999756813049, + "eval_rewards/chosen": -0.0034864526242017746, + "eval_rewards/margins": 0.06540023535490036, + "eval_rewards/rejected": -0.06888668984174728, + "eval_runtime": 691.7822, + "eval_samples_per_second": 2.891, + "eval_steps_per_second": 0.361, + "step": 700 + }, + { + "epoch": 0.18581523161476055, + "grad_norm": 2.449979782104492, + "learning_rate": 4.8892204128816e-07, + "logits/chosen": -2.865187644958496, + "logits/rejected": -2.8416965007781982, + "logps/chosen": -281.83489990234375, + "logps/rejected": -273.02984619140625, + "loss": 0.6666, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0031673975754529238, + "rewards/margins": 0.059172265231609344, + "rewards/rejected": -0.062339670956134796, + "step": 710 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 2.0199317932128906, + "learning_rate": 4.882395735324863e-07, + "logits/chosen": -2.840233325958252, + "logits/rejected": -2.7969911098480225, + "logps/chosen": -281.1783447265625, + "logps/rejected": -274.934326171875, + "loss": 0.6572, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0017295643920078874, + "rewards/margins": 0.08150311559438705, + "rewards/rejected": -0.07977355271577835, + "step": 720 + }, + { + "epoch": 0.19104946349123267, + "grad_norm": 2.187190294265747, + "learning_rate": 4.875372128409829e-07, + "logits/chosen": -2.815016269683838, + "logits/rejected": -2.7854647636413574, + "logps/chosen": -285.82489013671875, + "logps/rejected": -259.6023254394531, + "loss": 0.6616, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.02074645273387432, + "rewards/margins": 0.07150407880544662, + "rewards/rejected": -0.0922505259513855, + "step": 730 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 2.0459957122802734, + "learning_rate": 4.868150178605653e-07, + "logits/chosen": -2.812069892883301, + "logits/rejected": -2.7864902019500732, + "logps/chosen": -246.3455352783203, + "logps/rejected": -221.7488250732422, + "loss": 0.6527, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.03750302642583847, + "rewards/margins": 0.08891085535287857, + "rewards/rejected": -0.12641388177871704, + "step": 740 + }, + { + "epoch": 0.1962836953677048, + "grad_norm": 2.3921523094177246, + "learning_rate": 4.860730488943068e-07, + "logits/chosen": -2.7749264240264893, + "logits/rejected": -2.7638156414031982, + "logps/chosen": -253.1526641845703, + "logps/rejected": -256.56072998046875, + "loss": 0.657, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.013170385733246803, + "rewards/margins": 0.08013583719730377, + "rewards/rejected": -0.09330622851848602, + "step": 750 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 2.7103869915008545, + "learning_rate": 4.853113678964021e-07, + "logits/chosen": -2.7963593006134033, + "logits/rejected": -2.786759376525879, + "logps/chosen": -295.2373962402344, + "logps/rejected": -288.03070068359375, + "loss": 0.6532, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0016003316268324852, + "rewards/margins": 0.09059783071279526, + "rewards/rejected": -0.09219817072153091, + "step": 760 + }, + { + "epoch": 0.20151792724417691, + "grad_norm": 2.149914026260376, + "learning_rate": 4.845300384669957e-07, + "logits/chosen": -2.81345534324646, + "logits/rejected": -2.783003807067871, + "logps/chosen": -270.67730712890625, + "logps/rejected": -254.6434326171875, + "loss": 0.6605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.006530989892780781, + "rewards/margins": 0.07502902299165726, + "rewards/rejected": -0.08156001567840576, + "step": 770 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 2.4296960830688477, + "learning_rate": 4.8372912584687e-07, + "logits/chosen": -2.8353335857391357, + "logits/rejected": -2.801575183868408, + "logps/chosen": -300.9684143066406, + "logps/rejected": -283.5567626953125, + "loss": 0.6587, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0004579909145832062, + "rewards/margins": 0.0798453614115715, + "rewards/rejected": -0.079387366771698, + "step": 780 + }, + { + "epoch": 0.20675215912064904, + "grad_norm": 3.0373857021331787, + "learning_rate": 4.829086969119983e-07, + "logits/chosen": -2.8006482124328613, + "logits/rejected": -2.8082146644592285, + "logps/chosen": -276.4783020019531, + "logps/rejected": -276.69720458984375, + "loss": 0.6671, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.024218443781137466, + "rewards/margins": 0.06143224984407425, + "rewards/rejected": -0.08565069735050201, + "step": 790 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 2.1895201206207275, + "learning_rate": 4.820688201679605e-07, + "logits/chosen": -2.8546204566955566, + "logits/rejected": -2.809619426727295, + "logps/chosen": -277.23187255859375, + "logps/rejected": -223.0809783935547, + "loss": 0.6427, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.0034676387440413237, + "rewards/margins": 0.11413818597793579, + "rewards/rejected": -0.1106705442070961, + "step": 800 + }, + { + "epoch": 0.2093692750588851, + "eval_logits/chosen": -2.828324317932129, + "eval_logits/rejected": -2.8020219802856445, + "eval_logps/chosen": -284.8824768066406, + "eval_logps/rejected": -272.4747314453125, + "eval_loss": 0.6546491980552673, + "eval_rewards/accuracies": 0.6815000176429749, + "eval_rewards/chosen": -0.02144855633378029, + "eval_rewards/margins": 0.08891918510198593, + "eval_rewards/rejected": -0.11036773025989532, + "eval_runtime": 691.3571, + "eval_samples_per_second": 2.893, + "eval_steps_per_second": 0.362, + "step": 800 + }, + { + "epoch": 0.21198639099712116, + "grad_norm": 2.411094903945923, + "learning_rate": 4.812095657442231e-07, + "logits/chosen": -2.8379623889923096, + "logits/rejected": -2.8474135398864746, + "logps/chosen": -292.9294128417969, + "logps/rejected": -291.79937744140625, + "loss": 0.6657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03384638577699661, + "rewards/margins": 0.06732925027608871, + "rewards/rejected": -0.10117564350366592, + "step": 810 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 2.2789130210876465, + "learning_rate": 4.803310053882831e-07, + "logits/chosen": -2.820188522338867, + "logits/rejected": -2.8341267108917236, + "logps/chosen": -253.18002319335938, + "logps/rejected": -271.46209716796875, + "loss": 0.6585, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.04002877324819565, + "rewards/margins": 0.08040440827608109, + "rewards/rejected": -0.12043318897485733, + "step": 820 + }, + { + "epoch": 0.2172206228735933, + "grad_norm": 2.6294658184051514, + "learning_rate": 4.794332124596775e-07, + "logits/chosen": -2.8491604328155518, + "logits/rejected": -2.8390445709228516, + "logps/chosen": -288.0977478027344, + "logps/rejected": -289.91839599609375, + "loss": 0.6617, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025598719716072083, + "rewards/margins": 0.0781911239027977, + "rewards/rejected": -0.10378985106945038, + "step": 830 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 2.718003273010254, + "learning_rate": 4.785162619238574e-07, + "logits/chosen": -2.7903778553009033, + "logits/rejected": -2.750192880630493, + "logps/chosen": -271.6007995605469, + "logps/rejected": -255.642822265625, + "loss": 0.6434, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.013516816310584545, + "rewards/margins": 0.11254201829433441, + "rewards/rejected": -0.12605881690979004, + "step": 840 + }, + { + "epoch": 0.22245485475006543, + "grad_norm": 2.693995714187622, + "learning_rate": 4.775802303459287e-07, + "logits/chosen": -2.7961440086364746, + "logits/rejected": -2.782381534576416, + "logps/chosen": -266.48406982421875, + "logps/rejected": -271.54876708984375, + "loss": 0.6543, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.025890201330184937, + "rewards/margins": 0.09162938594818115, + "rewards/rejected": -0.11751959472894669, + "step": 850 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 3.3223588466644287, + "learning_rate": 4.766251958842589e-07, + "logits/chosen": -2.770634174346924, + "logits/rejected": -2.7624752521514893, + "logps/chosen": -295.11322021484375, + "logps/rejected": -291.52655029296875, + "loss": 0.6493, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.03162473067641258, + "rewards/margins": 0.10102814435958862, + "rewards/rejected": -0.1326528638601303, + "step": 860 + }, + { + "epoch": 0.22768908662653756, + "grad_norm": 2.2951784133911133, + "learning_rate": 4.756512382839506e-07, + "logits/chosen": -2.792806625366211, + "logits/rejected": -2.7687854766845703, + "logps/chosen": -276.4913024902344, + "logps/rejected": -288.6650390625, + "loss": 0.6455, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.06362788379192352, + "rewards/margins": 0.11521414667367935, + "rewards/rejected": -0.17884202301502228, + "step": 870 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 2.3468611240386963, + "learning_rate": 4.746584388701831e-07, + "logits/chosen": -2.804765224456787, + "logits/rejected": -2.8049676418304443, + "logps/chosen": -284.9786071777344, + "logps/rejected": -280.96392822265625, + "loss": 0.6438, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.05107206106185913, + "rewards/margins": 0.11629124730825424, + "rewards/rejected": -0.16736331582069397, + "step": 880 + }, + { + "epoch": 0.23292331850300968, + "grad_norm": 3.075714588165283, + "learning_rate": 4.736468805414218e-07, + "logits/chosen": -2.77662992477417, + "logits/rejected": -2.7775301933288574, + "logps/chosen": -271.46368408203125, + "logps/rejected": -293.26531982421875, + "loss": 0.6421, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.033290714025497437, + "rewards/margins": 0.12201287597417831, + "rewards/rejected": -0.15530358254909515, + "step": 890 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 2.879183769226074, + "learning_rate": 4.7261664776249595e-07, + "logits/chosen": -2.7510781288146973, + "logits/rejected": -2.7387068271636963, + "logps/chosen": -250.3533477783203, + "logps/rejected": -251.46630859375, + "loss": 0.6428, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.04132508859038353, + "rewards/margins": 0.12072241306304932, + "rewards/rejected": -0.16204750537872314, + "step": 900 + }, + { + "epoch": 0.23554043444124576, + "eval_logits/chosen": -2.819901704788208, + "eval_logits/rejected": -2.794234275817871, + "eval_logps/chosen": -285.2049865722656, + "eval_logps/rejected": -275.2684631347656, + "eval_loss": 0.6458239555358887, + "eval_rewards/accuracies": 0.6769999861717224, + "eval_rewards/chosen": -0.024673735722899437, + "eval_rewards/margins": 0.1136314645409584, + "eval_rewards/rejected": -0.138305202126503, + "eval_runtime": 690.9829, + "eval_samples_per_second": 2.894, + "eval_steps_per_second": 0.362, + "step": 900 + }, + { + "epoch": 0.2381575503794818, + "grad_norm": 2.7687416076660156, + "learning_rate": 4.7156782655754624e-07, + "logits/chosen": -2.8114147186279297, + "logits/rejected": -2.772068977355957, + "logps/chosen": -300.78826904296875, + "logps/rejected": -255.8038330078125, + "loss": 0.6426, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.002123198937624693, + "rewards/margins": 0.1206832155585289, + "rewards/rejected": -0.12280640751123428, + "step": 910 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 2.5618391036987305, + "learning_rate": 4.705005045028414e-07, + "logits/chosen": -2.765242338180542, + "logits/rejected": -2.737863063812256, + "logps/chosen": -287.15667724609375, + "logps/rejected": -278.50726318359375, + "loss": 0.6459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.05967919901013374, + "rewards/margins": 0.11529602855443954, + "rewards/rejected": -0.1749752312898636, + "step": 920 + }, + { + "epoch": 0.24339178225595393, + "grad_norm": 2.9336323738098145, + "learning_rate": 4.694147707194659e-07, + "logits/chosen": -2.832733631134033, + "logits/rejected": -2.8244283199310303, + "logps/chosen": -294.346923828125, + "logps/rejected": -287.9342346191406, + "loss": 0.6366, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.06329428404569626, + "rewards/margins": 0.1405760794878006, + "rewards/rejected": -0.20387034118175507, + "step": 930 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 3.908505439758301, + "learning_rate": 4.683107158658781e-07, + "logits/chosen": -2.7808585166931152, + "logits/rejected": -2.763042688369751, + "logps/chosen": -314.3782653808594, + "logps/rejected": -299.661865234375, + "loss": 0.6227, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.05701801925897598, + "rewards/margins": 0.16755308210849762, + "rewards/rejected": -0.2245711088180542, + "step": 940 + }, + { + "epoch": 0.24862601413242608, + "grad_norm": 3.2749459743499756, + "learning_rate": 4.6718843213034066e-07, + "logits/chosen": -2.7944037914276123, + "logits/rejected": -2.77887225151062, + "logps/chosen": -272.23724365234375, + "logps/rejected": -273.14776611328125, + "loss": 0.633, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.09230604767799377, + "rewards/margins": 0.14217710494995117, + "rewards/rejected": -0.23448316752910614, + "step": 950 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 3.0224010944366455, + "learning_rate": 4.660480132232224e-07, + "logits/chosen": -2.805572986602783, + "logits/rejected": -2.80751371383667, + "logps/chosen": -293.3813171386719, + "logps/rejected": -280.83465576171875, + "loss": 0.6507, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0641000047326088, + "rewards/margins": 0.10990612208843231, + "rewards/rejected": -0.1740061342716217, + "step": 960 + }, + { + "epoch": 0.25386024600889817, + "grad_norm": 3.5039138793945312, + "learning_rate": 4.64889554369174e-07, + "logits/chosen": -2.805609941482544, + "logits/rejected": -2.771754741668701, + "logps/chosen": -298.55157470703125, + "logps/rejected": -267.65087890625, + "loss": 0.6166, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.0059810527600348, + "rewards/margins": 0.18814215064048767, + "rewards/rejected": -0.1821610927581787, + "step": 970 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 2.8160240650177, + "learning_rate": 4.637131522991764e-07, + "logits/chosen": -2.7994441986083984, + "logits/rejected": -2.7969179153442383, + "logps/chosen": -309.35089111328125, + "logps/rejected": -296.6192321777344, + "loss": 0.6321, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.029499268159270287, + "rewards/margins": 0.14880326390266418, + "rewards/rejected": -0.17830254137516022, + "step": 980 + }, + { + "epoch": 0.2590944778853703, + "grad_norm": 3.782945156097412, + "learning_rate": 4.6251890524246375e-07, + "logits/chosen": -2.8050458431243896, + "logits/rejected": -2.786475658416748, + "logps/chosen": -262.4518737792969, + "logps/rejected": -256.80792236328125, + "loss": 0.6166, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0701083093881607, + "rewards/margins": 0.18339978158473969, + "rewards/rejected": -0.253508061170578, + "step": 990 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 3.791015148162842, + "learning_rate": 4.613069129183218e-07, + "logits/chosen": -2.8377981185913086, + "logits/rejected": -2.799161911010742, + "logps/chosen": -328.35491943359375, + "logps/rejected": -301.65679931640625, + "loss": 0.6381, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.0686495453119278, + "rewards/margins": 0.13748301565647125, + "rewards/rejected": -0.20613256096839905, + "step": 1000 + }, + { + "epoch": 0.26171159382360637, + "eval_logits/chosen": -2.813830852508545, + "eval_logits/rejected": -2.7887284755706787, + "eval_logps/chosen": -289.12060546875, + "eval_logps/rejected": -282.1760559082031, + "eval_loss": 0.635771632194519, + "eval_rewards/accuracies": 0.6784999966621399, + "eval_rewards/chosen": -0.06382979452610016, + "eval_rewards/margins": 0.14355140924453735, + "eval_rewards/rejected": -0.2073812186717987, + "eval_runtime": 691.4427, + "eval_samples_per_second": 2.893, + "eval_steps_per_second": 0.362, + "step": 1000 + }, + { + "epoch": 0.2643287097618425, + "grad_norm": 4.366467475891113, + "learning_rate": 4.6007727652776065e-07, + "logits/chosen": -2.7737021446228027, + "logits/rejected": -2.7608792781829834, + "logps/chosen": -254.6834259033203, + "logps/rejected": -263.98565673828125, + "loss": 0.6304, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.036558397114276886, + "rewards/margins": 0.1544768214225769, + "rewards/rejected": -0.1910352259874344, + "step": 1010 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 3.2850377559661865, + "learning_rate": 4.588300987450652e-07, + "logits/chosen": -2.82348895072937, + "logits/rejected": -2.7995572090148926, + "logps/chosen": -271.41241455078125, + "logps/rejected": -254.01864624023438, + "loss": 0.6293, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.008820459246635437, + "rewards/margins": 0.1594310700893402, + "rewards/rejected": -0.16825154423713684, + "step": 1020 + }, + { + "epoch": 0.26956294163831457, + "grad_norm": 3.3716328144073486, + "learning_rate": 4.5756548370922134e-07, + "logits/chosen": -2.781808853149414, + "logits/rejected": -2.7637503147125244, + "logps/chosen": -258.62860107421875, + "logps/rejected": -260.2466125488281, + "loss": 0.6508, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.024007773026823997, + "rewards/margins": 0.11937548965215683, + "rewards/rejected": -0.14338326454162598, + "step": 1030 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 3.529965400695801, + "learning_rate": 4.5628353701522047e-07, + "logits/chosen": -2.815080404281616, + "logits/rejected": -2.7873313426971436, + "logps/chosen": -321.65435791015625, + "logps/rejected": -310.28497314453125, + "loss": 0.6072, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.017561940476298332, + "rewards/margins": 0.2143036425113678, + "rewards/rejected": -0.2318655550479889, + "step": 1040 + }, + { + "epoch": 0.2747971735147867, + "grad_norm": 2.87839412689209, + "learning_rate": 4.549843657052429e-07, + "logits/chosen": -2.834746837615967, + "logits/rejected": -2.808051347732544, + "logps/chosen": -287.9942321777344, + "logps/rejected": -302.9963684082031, + "loss": 0.6048, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.03615923970937729, + "rewards/margins": 0.21066415309906006, + "rewards/rejected": -0.24682338535785675, + "step": 1050 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 3.860949993133545, + "learning_rate": 4.5366807825971907e-07, + "logits/chosen": -2.780369758605957, + "logits/rejected": -2.7750542163848877, + "logps/chosen": -262.59075927734375, + "logps/rejected": -269.21051025390625, + "loss": 0.6437, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08822160959243774, + "rewards/margins": 0.14002035558223724, + "rewards/rejected": -0.228241965174675, + "step": 1060 + }, + { + "epoch": 0.2800314053912588, + "grad_norm": 6.0348801612854, + "learning_rate": 4.5233478458827176e-07, + "logits/chosen": -2.8092315196990967, + "logits/rejected": -2.785090446472168, + "logps/chosen": -316.466064453125, + "logps/rejected": -282.1798400878906, + "loss": 0.6104, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.08112485706806183, + "rewards/margins": 0.2059168517589569, + "rewards/rejected": -0.2870417535305023, + "step": 1070 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 4.09010124206543, + "learning_rate": 4.509845960205389e-07, + "logits/chosen": -2.749141216278076, + "logits/rejected": -2.753202438354492, + "logps/chosen": -304.83111572265625, + "logps/rejected": -288.3349304199219, + "loss": 0.626, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.07901586592197418, + "rewards/margins": 0.17285946011543274, + "rewards/rejected": -0.2518753409385681, + "step": 1080 + }, + { + "epoch": 0.28526563726773096, + "grad_norm": 4.772919654846191, + "learning_rate": 4.4961762529687736e-07, + "logits/chosen": -2.8033485412597656, + "logits/rejected": -2.7844488620758057, + "logps/chosen": -288.91998291015625, + "logps/rejected": -284.6497802734375, + "loss": 0.6324, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09251121431589127, + "rewards/margins": 0.15693159401416779, + "rewards/rejected": -0.24944277107715607, + "step": 1090 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 4.188416957855225, + "learning_rate": 4.482339865589492e-07, + "logits/chosen": -2.8103842735290527, + "logits/rejected": -2.768054962158203, + "logps/chosen": -299.87091064453125, + "logps/rejected": -267.5564880371094, + "loss": 0.6488, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16943010687828064, + "rewards/margins": 0.12456401437520981, + "rewards/rejected": -0.29399409890174866, + "step": 1100 + }, + { + "epoch": 0.287882753205967, + "eval_logits/chosen": -2.8070549964904785, + "eval_logits/rejected": -2.782604694366455, + "eval_logps/chosen": -296.5137634277344, + "eval_logps/rejected": -291.989013671875, + "eval_loss": 0.6283535361289978, + "eval_rewards/accuracies": 0.6790000200271606, + "eval_rewards/chosen": -0.13776110112667084, + "eval_rewards/margins": 0.16774973273277283, + "eval_rewards/rejected": -0.30551087856292725, + "eval_runtime": 691.0066, + "eval_samples_per_second": 2.894, + "eval_steps_per_second": 0.362, + "step": 1100 + }, + { + "epoch": 0.2904998691442031, + "grad_norm": 4.440745830535889, + "learning_rate": 4.4683379534019076e-07, + "logits/chosen": -2.803920269012451, + "logits/rejected": -2.8017265796661377, + "logps/chosen": -300.3214111328125, + "logps/rejected": -309.1615905761719, + "loss": 0.6336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1395951509475708, + "rewards/margins": 0.1519310027360916, + "rewards/rejected": -0.2915261387825012, + "step": 1110 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 3.8111138343811035, + "learning_rate": 4.4541716855616593e-07, + "logits/chosen": -2.7794926166534424, + "logits/rejected": -2.7597875595092773, + "logps/chosen": -264.9614562988281, + "logps/rejected": -282.9358825683594, + "loss": 0.6252, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07038460671901703, + "rewards/margins": 0.17066633701324463, + "rewards/rejected": -0.24105095863342285, + "step": 1120 + }, + { + "epoch": 0.2957341010206752, + "grad_norm": 5.494072914123535, + "learning_rate": 4.4398422449480357e-07, + "logits/chosen": -2.774218797683716, + "logits/rejected": -2.725161075592041, + "logps/chosen": -294.66448974609375, + "logps/rejected": -311.0096740722656, + "loss": 0.6402, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1438552290201187, + "rewards/margins": 0.14675047993659973, + "rewards/rejected": -0.29060572385787964, + "step": 1130 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 4.3281474113464355, + "learning_rate": 4.4253508280652036e-07, + "logits/chosen": -2.7951579093933105, + "logits/rejected": -2.7520532608032227, + "logps/chosen": -317.461181640625, + "logps/rejected": -285.7931213378906, + "loss": 0.6139, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.13621816039085388, + "rewards/margins": 0.19606857001781464, + "rewards/rejected": -0.3322867453098297, + "step": 1140 + }, + { + "epoch": 0.30096833289714736, + "grad_norm": 6.221525192260742, + "learning_rate": 4.410698644942302e-07, + "logits/chosen": -2.8402047157287598, + "logits/rejected": -2.816387176513672, + "logps/chosen": -297.50286865234375, + "logps/rejected": -292.28436279296875, + "loss": 0.6183, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.10158324241638184, + "rewards/margins": 0.19611066579818726, + "rewards/rejected": -0.2976939082145691, + "step": 1150 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 4.492012023925781, + "learning_rate": 4.3958869190324057e-07, + "logits/chosen": -2.76503586769104, + "logits/rejected": -2.7254602909088135, + "logps/chosen": -291.94873046875, + "logps/rejected": -282.52880859375, + "loss": 0.6221, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.12198346853256226, + "rewards/margins": 0.18694952130317688, + "rewards/rejected": -0.30893296003341675, + "step": 1160 + }, + { + "epoch": 0.30620256477361946, + "grad_norm": 3.562570810317993, + "learning_rate": 4.380916887110365e-07, + "logits/chosen": -2.829111099243164, + "logits/rejected": -2.800809383392334, + "logps/chosen": -290.05316162109375, + "logps/rejected": -266.3580017089844, + "loss": 0.6199, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14898671209812164, + "rewards/margins": 0.19158688187599182, + "rewards/rejected": -0.34057360887527466, + "step": 1170 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 5.379666805267334, + "learning_rate": 4.3657897991695394e-07, + "logits/chosen": -2.7369437217712402, + "logits/rejected": -2.7774927616119385, + "logps/chosen": -281.9171142578125, + "logps/rejected": -300.78912353515625, + "loss": 0.6192, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11646691709756851, + "rewards/margins": 0.19827672839164734, + "rewards/rejected": -0.31474363803863525, + "step": 1180 + }, + { + "epoch": 0.3114367966500916, + "grad_norm": 4.079792499542236, + "learning_rate": 4.350506918317416e-07, + "logits/chosen": -2.8184256553649902, + "logits/rejected": -2.788510799407959, + "logps/chosen": -274.4839172363281, + "logps/rejected": -287.8948669433594, + "loss": 0.6194, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.12529827654361725, + "rewards/margins": 0.19565680623054504, + "rewards/rejected": -0.3209550976753235, + "step": 1190 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 4.406829833984375, + "learning_rate": 4.335069520670149e-07, + "logits/chosen": -2.7956674098968506, + "logits/rejected": -2.7690110206604004, + "logps/chosen": -252.70156860351562, + "logps/rejected": -279.14111328125, + "loss": 0.6427, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09379851073026657, + "rewards/margins": 0.14501607418060303, + "rewards/rejected": -0.2388145923614502, + "step": 1200 + }, + { + "epoch": 0.31405391258832765, + "eval_logits/chosen": -2.8165299892425537, + "eval_logits/rejected": -2.793107032775879, + "eval_logps/chosen": -293.77850341796875, + "eval_logps/rejected": -291.3028259277344, + "eval_loss": 0.622346818447113, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": -0.11040891706943512, + "eval_rewards/margins": 0.18824002146720886, + "eval_rewards/rejected": -0.2986489236354828, + "eval_runtime": 690.8187, + "eval_samples_per_second": 2.895, + "eval_steps_per_second": 0.362, + "step": 1200 + }, + { + "epoch": 0.3166710285265637, + "grad_norm": 4.730831146240234, + "learning_rate": 4.319478895245999e-07, + "logits/chosen": -2.8096089363098145, + "logits/rejected": -2.781852960586548, + "logps/chosen": -277.19305419921875, + "logps/rejected": -268.88653564453125, + "loss": 0.6189, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12051185220479965, + "rewards/margins": 0.19446460902690887, + "rewards/rejected": -0.3149764835834503, + "step": 1210 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 4.179198741912842, + "learning_rate": 4.3037363438577036e-07, + "logits/chosen": -2.8334312438964844, + "logits/rejected": -2.796905517578125, + "logps/chosen": -275.5434875488281, + "logps/rejected": -309.56561279296875, + "loss": 0.6074, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.03255675360560417, + "rewards/margins": 0.21560052037239075, + "rewards/rejected": -0.24815726280212402, + "step": 1220 + }, + { + "epoch": 0.32190526040303585, + "grad_norm": 3.7570934295654297, + "learning_rate": 4.2878431810037716e-07, + "logits/chosen": -2.8290486335754395, + "logits/rejected": -2.821361780166626, + "logps/chosen": -317.92926025390625, + "logps/rejected": -291.9640197753906, + "loss": 0.6102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.06272344291210175, + "rewards/margins": 0.21540877223014832, + "rewards/rejected": -0.27813225984573364, + "step": 1230 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 5.973113536834717, + "learning_rate": 4.271800733758729e-07, + "logits/chosen": -2.801720380783081, + "logits/rejected": -2.804701566696167, + "logps/chosen": -308.4283142089844, + "logps/rejected": -294.974609375, + "loss": 0.6055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.04135540500283241, + "rewards/margins": 0.23178556561470032, + "rewards/rejected": -0.27314096689224243, + "step": 1240 + }, + { + "epoch": 0.327139492279508, + "grad_norm": 5.047220706939697, + "learning_rate": 4.255610341662304e-07, + "logits/chosen": -2.8307595252990723, + "logits/rejected": -2.779573440551758, + "logps/chosen": -282.5008239746094, + "logps/rejected": -278.0930480957031, + "loss": 0.6297, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.07180126756429672, + "rewards/margins": 0.17990802228450775, + "rewards/rejected": -0.2517092823982239, + "step": 1250 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 4.12667179107666, + "learning_rate": 4.2392733566075757e-07, + "logits/chosen": -2.8080954551696777, + "logits/rejected": -2.7833712100982666, + "logps/chosen": -279.9812927246094, + "logps/rejected": -274.603271484375, + "loss": 0.6437, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.06445430964231491, + "rewards/margins": 0.1353849321603775, + "rewards/rejected": -0.19983923435211182, + "step": 1260 + }, + { + "epoch": 0.3323737241559801, + "grad_norm": 3.241464138031006, + "learning_rate": 4.2227911427280973e-07, + "logits/chosen": -2.7715563774108887, + "logits/rejected": -2.7483251094818115, + "logps/chosen": -269.14215087890625, + "logps/rejected": -254.9038543701172, + "loss": 0.6275, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029628584161400795, + "rewards/margins": 0.1794588267803192, + "rewards/rejected": -0.20908741652965546, + "step": 1270 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 6.028203010559082, + "learning_rate": 4.206165076283982e-07, + "logits/chosen": -2.8015265464782715, + "logits/rejected": -2.7831873893737793, + "logps/chosen": -270.62139892578125, + "logps/rejected": -273.0738830566406, + "loss": 0.6107, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09085245430469513, + "rewards/margins": 0.2116876095533371, + "rewards/rejected": -0.30254003405570984, + "step": 1280 + }, + { + "epoch": 0.33760795603245225, + "grad_norm": 5.242630958557129, + "learning_rate": 4.1893965455469946e-07, + "logits/chosen": -2.8173327445983887, + "logits/rejected": -2.7973732948303223, + "logps/chosen": -279.14031982421875, + "logps/rejected": -275.79638671875, + "loss": 0.6269, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14117182791233063, + "rewards/margins": 0.18503603339195251, + "rewards/rejected": -0.32620781660079956, + "step": 1290 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 5.775106430053711, + "learning_rate": 4.172486950684626e-07, + "logits/chosen": -2.821103096008301, + "logits/rejected": -2.814502477645874, + "logps/chosen": -279.78289794921875, + "logps/rejected": -298.9765930175781, + "loss": 0.6131, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11109775304794312, + "rewards/margins": 0.21843478083610535, + "rewards/rejected": -0.32953253388404846, + "step": 1300 + }, + { + "epoch": 0.3402250719706883, + "eval_logits/chosen": -2.818049430847168, + "eval_logits/rejected": -2.7951488494873047, + "eval_logps/chosen": -297.3945007324219, + "eval_logps/rejected": -296.5805969238281, + "eval_loss": 0.6172210574150085, + "eval_rewards/accuracies": 0.6865000128746033, + "eval_rewards/chosen": -0.14656904339790344, + "eval_rewards/margins": 0.2048574537038803, + "eval_rewards/rejected": -0.35142648220062256, + "eval_runtime": 691.9861, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 1300 + }, + { + "epoch": 0.34284218790892435, + "grad_norm": 8.304680824279785, + "learning_rate": 4.155437703643181e-07, + "logits/chosen": -2.841334581375122, + "logits/rejected": -2.806217670440674, + "logps/chosen": -272.61444091796875, + "logps/rejected": -267.8605041503906, + "loss": 0.6005, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11493051052093506, + "rewards/margins": 0.24178418517112732, + "rewards/rejected": -0.35671466588974, + "step": 1310 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 6.887094497680664, + "learning_rate": 4.138250228029881e-07, + "logits/chosen": -2.811464786529541, + "logits/rejected": -2.797884941101074, + "logps/chosen": -295.8591613769531, + "logps/rejected": -319.4233703613281, + "loss": 0.6383, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2382466346025467, + "rewards/margins": 0.16607843339443207, + "rewards/rejected": -0.40432506799697876, + "step": 1320 + }, + { + "epoch": 0.3480764197853965, + "grad_norm": 4.52334451675415, + "learning_rate": 4.1209259589939935e-07, + "logits/chosen": -2.8012988567352295, + "logits/rejected": -2.8001253604888916, + "logps/chosen": -262.8810119628906, + "logps/rejected": -272.76788330078125, + "loss": 0.6321, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.13144102692604065, + "rewards/margins": 0.17341327667236328, + "rewards/rejected": -0.30485430359840393, + "step": 1330 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 3.246675729751587, + "learning_rate": 4.103466343106998e-07, + "logits/chosen": -2.8291964530944824, + "logits/rejected": -2.824831247329712, + "logps/chosen": -302.6276550292969, + "logps/rejected": -286.753662109375, + "loss": 0.6334, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1298406422138214, + "rewards/margins": 0.16963128745555878, + "rewards/rejected": -0.2994719445705414, + "step": 1340 + }, + { + "epoch": 0.35331065166186865, + "grad_norm": 4.933244705200195, + "learning_rate": 4.085872838241796e-07, + "logits/chosen": -2.767702102661133, + "logits/rejected": -2.730109691619873, + "logps/chosen": -311.7983703613281, + "logps/rejected": -294.95294189453125, + "loss": 0.6356, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.16958799958229065, + "rewards/margins": 0.17119386792182922, + "rewards/rejected": -0.3407818675041199, + "step": 1350 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 6.131802082061768, + "learning_rate": 4.06814691345098e-07, + "logits/chosen": -2.7470338344573975, + "logits/rejected": -2.722545862197876, + "logps/chosen": -288.4170837402344, + "logps/rejected": -289.61102294921875, + "loss": 0.602, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1348382532596588, + "rewards/margins": 0.2365628182888031, + "rewards/rejected": -0.3714010715484619, + "step": 1360 + }, + { + "epoch": 0.35854488353834074, + "grad_norm": 4.9708638191223145, + "learning_rate": 4.0502900488441707e-07, + "logits/chosen": -2.7989072799682617, + "logits/rejected": -2.789274215698242, + "logps/chosen": -306.6829528808594, + "logps/rejected": -320.0224304199219, + "loss": 0.6285, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20997491478919983, + "rewards/margins": 0.1841730773448944, + "rewards/rejected": -0.39414799213409424, + "step": 1370 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 6.784174919128418, + "learning_rate": 4.032303735464422e-07, + "logits/chosen": -2.880401134490967, + "logits/rejected": -2.835643768310547, + "logps/chosen": -310.90679931640625, + "logps/rejected": -308.8883361816406, + "loss": 0.6053, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.20711734890937805, + "rewards/margins": 0.24106808006763458, + "rewards/rejected": -0.44818538427352905, + "step": 1380 + }, + { + "epoch": 0.3637791154148129, + "grad_norm": 5.785353183746338, + "learning_rate": 4.014189475163726e-07, + "logits/chosen": -2.794342517852783, + "logits/rejected": -2.7849628925323486, + "logps/chosen": -297.41961669921875, + "logps/rejected": -308.3134765625, + "loss": 0.6053, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.24589386582374573, + "rewards/margins": 0.2298090010881424, + "rewards/rejected": -0.47570285201072693, + "step": 1390 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 6.076969146728516, + "learning_rate": 3.995948780477605e-07, + "logits/chosen": -2.8259429931640625, + "logits/rejected": -2.795186996459961, + "logps/chosen": -306.1077880859375, + "logps/rejected": -299.7892150878906, + "loss": 0.6326, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21178540587425232, + "rewards/margins": 0.17982172966003418, + "rewards/rejected": -0.3916071355342865, + "step": 1400 + }, + { + "epoch": 0.36639623135304894, + "eval_logits/chosen": -2.814655065536499, + "eval_logits/rejected": -2.7920358180999756, + "eval_logps/chosen": -300.2596740722656, + "eval_logps/rejected": -300.3965759277344, + "eval_loss": 0.6155202388763428, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": -0.175220787525177, + "eval_rewards/margins": 0.2143653929233551, + "eval_rewards/rejected": -0.3895862102508545, + "eval_runtime": 692.0291, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 1400 + }, + { + "epoch": 0.369013347291285, + "grad_norm": 6.421947479248047, + "learning_rate": 3.977583174498816e-07, + "logits/chosen": -2.816697359085083, + "logits/rejected": -2.8030014038085938, + "logps/chosen": -300.00640869140625, + "logps/rejected": -303.1688232421875, + "loss": 0.5882, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.14107367396354675, + "rewards/margins": 0.27628999948501587, + "rewards/rejected": -0.41736364364624023, + "step": 1410 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 4.980222225189209, + "learning_rate": 3.9590941907501717e-07, + "logits/chosen": -2.8284125328063965, + "logits/rejected": -2.812608242034912, + "logps/chosen": -307.8800354003906, + "logps/rejected": -303.53021240234375, + "loss": 0.6005, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.056650467216968536, + "rewards/margins": 0.2519657611846924, + "rewards/rejected": -0.3086162507534027, + "step": 1420 + }, + { + "epoch": 0.37424757916775714, + "grad_norm": 5.049463272094727, + "learning_rate": 3.9404833730564974e-07, + "logits/chosen": -2.735870838165283, + "logits/rejected": -2.722884178161621, + "logps/chosen": -285.8304443359375, + "logps/rejected": -297.43341064453125, + "loss": 0.6055, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.12556666135787964, + "rewards/margins": 0.2363204061985016, + "rewards/rejected": -0.3618870973587036, + "step": 1430 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 6.007881164550781, + "learning_rate": 3.9217522754157117e-07, + "logits/chosen": -2.8069920539855957, + "logits/rejected": -2.80522084236145, + "logps/chosen": -284.0002136230469, + "logps/rejected": -286.4706115722656, + "loss": 0.5941, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.15235498547554016, + "rewards/margins": 0.26103848218917847, + "rewards/rejected": -0.41339343786239624, + "step": 1440 + }, + { + "epoch": 0.37948181104422923, + "grad_norm": 4.487087726593018, + "learning_rate": 3.9029024618690785e-07, + "logits/chosen": -2.8235816955566406, + "logits/rejected": -2.7990283966064453, + "logps/chosen": -266.3917541503906, + "logps/rejected": -270.59381103515625, + "loss": 0.6161, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11356230825185776, + "rewards/margins": 0.21844033896923065, + "rewards/rejected": -0.3320026695728302, + "step": 1450 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 3.7364535331726074, + "learning_rate": 3.883935506370605e-07, + "logits/chosen": -2.7793936729431152, + "logits/rejected": -2.770378589630127, + "logps/chosen": -278.8677062988281, + "logps/rejected": -271.43145751953125, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08725923299789429, + "rewards/margins": 0.2380957156419754, + "rewards/rejected": -0.3253549635410309, + "step": 1460 + }, + { + "epoch": 0.3847160429207014, + "grad_norm": 4.045937538146973, + "learning_rate": 3.864852992655616e-07, + "logits/chosen": -2.7860310077667236, + "logits/rejected": -2.7741951942443848, + "logps/chosen": -279.3297119140625, + "logps/rejected": -292.84356689453125, + "loss": 0.5813, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.10104944556951523, + "rewards/margins": 0.2876027524471283, + "rewards/rejected": -0.38865217566490173, + "step": 1470 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 5.180766582489014, + "learning_rate": 3.845656514108515e-07, + "logits/chosen": -2.8035526275634766, + "logits/rejected": -2.784550189971924, + "logps/chosen": -299.1927490234375, + "logps/rejected": -258.96661376953125, + "loss": 0.6143, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.18017061054706573, + "rewards/margins": 0.21995961666107178, + "rewards/rejected": -0.40013018250465393, + "step": 1480 + }, + { + "epoch": 0.38995027479717354, + "grad_norm": 3.420503616333008, + "learning_rate": 3.8263476736297375e-07, + "logits/chosen": -2.8004748821258545, + "logits/rejected": -2.755922794342041, + "logps/chosen": -280.3719177246094, + "logps/rejected": -276.71051025390625, + "loss": 0.6096, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.11658191680908203, + "rewards/margins": 0.22706842422485352, + "rewards/rejected": -0.34365034103393555, + "step": 1490 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 6.24570369720459, + "learning_rate": 3.8069280835019055e-07, + "logits/chosen": -2.7886569499969482, + "logits/rejected": -2.757636070251465, + "logps/chosen": -291.5840759277344, + "logps/rejected": -290.7030334472656, + "loss": 0.6128, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.07107678055763245, + "rewards/margins": 0.2125014066696167, + "rewards/rejected": -0.28357818722724915, + "step": 1500 + }, + { + "epoch": 0.3925673907354096, + "eval_logits/chosen": -2.819805145263672, + "eval_logits/rejected": -2.798032283782959, + "eval_logps/chosen": -289.036865234375, + "eval_logps/rejected": -288.3089904785156, + "eval_loss": 0.6180471777915955, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": -0.06299243867397308, + "eval_rewards/margins": 0.20571817457675934, + "eval_rewards/rejected": -0.2687106430530548, + "eval_runtime": 691.9992, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 1500 + }, + { + "epoch": 0.39518450667364563, + "grad_norm": 7.418298721313477, + "learning_rate": 3.7873993652552073e-07, + "logits/chosen": -2.7985031604766846, + "logits/rejected": -2.7847418785095215, + "logps/chosen": -256.2576904296875, + "logps/rejected": -263.3230895996094, + "loss": 0.646, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.07267605513334274, + "rewards/margins": 0.14168903231620789, + "rewards/rejected": -0.21436509490013123, + "step": 1510 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 3.0412213802337646, + "learning_rate": 3.767763149531995e-07, + "logits/chosen": -2.8065857887268066, + "logits/rejected": -2.792532205581665, + "logps/chosen": -282.3772888183594, + "logps/rejected": -286.32757568359375, + "loss": 0.6036, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029223937541246414, + "rewards/margins": 0.23573264479637146, + "rewards/rejected": -0.26495662331581116, + "step": 1520 + }, + { + "epoch": 0.4004187385501178, + "grad_norm": 6.914887428283691, + "learning_rate": 3.7480210759506326e-07, + "logits/chosen": -2.771960973739624, + "logits/rejected": -2.769230365753174, + "logps/chosen": -301.027099609375, + "logps/rejected": -306.0934143066406, + "loss": 0.6321, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.05497425049543381, + "rewards/margins": 0.1824551671743393, + "rewards/rejected": -0.2374294102191925, + "step": 1530 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 5.229218006134033, + "learning_rate": 3.728174792968582e-07, + "logits/chosen": -2.7818996906280518, + "logits/rejected": -2.753554582595825, + "logps/chosen": -264.9828186035156, + "logps/rejected": -266.6888122558594, + "loss": 0.6304, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10081575810909271, + "rewards/margins": 0.1800784170627594, + "rewards/rejected": -0.2808941900730133, + "step": 1540 + }, + { + "epoch": 0.4056529704265899, + "grad_norm": 3.8269035816192627, + "learning_rate": 3.70822595774476e-07, + "logits/chosen": -2.8083198070526123, + "logits/rejected": -2.7798688411712646, + "logps/chosen": -294.8878479003906, + "logps/rejected": -306.19659423828125, + "loss": 0.5877, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.06873732060194016, + "rewards/margins": 0.28800445795059204, + "rewards/rejected": -0.3567417860031128, + "step": 1550 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 6.544018268585205, + "learning_rate": 3.688176236001168e-07, + "logits/chosen": -2.7987208366394043, + "logits/rejected": -2.7670371532440186, + "logps/chosen": -304.5577392578125, + "logps/rejected": -289.78729248046875, + "loss": 0.611, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0676363930106163, + "rewards/margins": 0.23785026371479034, + "rewards/rejected": -0.30548661947250366, + "step": 1560 + }, + { + "epoch": 0.410887202303062, + "grad_norm": 9.901212692260742, + "learning_rate": 3.6680273018838016e-07, + "logits/chosen": -2.8177802562713623, + "logits/rejected": -2.806378126144409, + "logps/chosen": -281.0837707519531, + "logps/rejected": -286.8470153808594, + "loss": 0.6035, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.11407822370529175, + "rewards/margins": 0.25138336420059204, + "rewards/rejected": -0.3654615879058838, + "step": 1570 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 7.281955718994141, + "learning_rate": 3.6477808378228596e-07, + "logits/chosen": -2.787090539932251, + "logits/rejected": -2.7860255241394043, + "logps/chosen": -283.32928466796875, + "logps/rejected": -338.25714111328125, + "loss": 0.6043, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.12236142158508301, + "rewards/margins": 0.2562143802642822, + "rewards/rejected": -0.37857580184936523, + "step": 1580 + }, + { + "epoch": 0.4161214341795342, + "grad_norm": 8.57088565826416, + "learning_rate": 3.6274385343922674e-07, + "logits/chosen": -2.8543007373809814, + "logits/rejected": -2.8531434535980225, + "logps/chosen": -267.55767822265625, + "logps/rejected": -295.7901306152344, + "loss": 0.6187, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.15387986600399017, + "rewards/margins": 0.21341195702552795, + "rewards/rejected": -0.36729180812835693, + "step": 1590 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 5.7539849281311035, + "learning_rate": 3.6070020901685057e-07, + "logits/chosen": -2.7576816082000732, + "logits/rejected": -2.769594669342041, + "logps/chosen": -300.43572998046875, + "logps/rejected": -298.788818359375, + "loss": 0.6223, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.18216048181056976, + "rewards/margins": 0.21212442219257355, + "rewards/rejected": -0.3942849040031433, + "step": 1600 + }, + { + "epoch": 0.4187385501177702, + "eval_logits/chosen": -2.8147764205932617, + "eval_logits/rejected": -2.792606830596924, + "eval_logps/chosen": -299.62200927734375, + "eval_logps/rejected": -302.40740966796875, + "eval_loss": 0.6088424324989319, + "eval_rewards/accuracies": 0.6945000290870667, + "eval_rewards/chosen": -0.16884401440620422, + "eval_rewards/margins": 0.2408505380153656, + "eval_rewards/rejected": -0.4096945822238922, + "eval_runtime": 691.674, + "eval_samples_per_second": 2.892, + "eval_steps_per_second": 0.361, + "step": 1600 + }, + { + "epoch": 0.4213556660560063, + "grad_norm": 6.157792568206787, + "learning_rate": 3.5864732115887863e-07, + "logits/chosen": -2.81066632270813, + "logits/rejected": -2.802830219268799, + "logps/chosen": -273.0591735839844, + "logps/rejected": -307.04254150390625, + "loss": 0.5896, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12094251811504364, + "rewards/margins": 0.2827422022819519, + "rewards/rejected": -0.40368470549583435, + "step": 1610 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 6.331284999847412, + "learning_rate": 3.565853612808562e-07, + "logits/chosen": -2.823272466659546, + "logits/rejected": -2.794790744781494, + "logps/chosen": -303.06683349609375, + "logps/rejected": -291.0, + "loss": 0.639, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.23127253353595734, + "rewards/margins": 0.17943724989891052, + "rewards/rejected": -0.41070979833602905, + "step": 1620 + }, + { + "epoch": 0.4265898979324784, + "grad_norm": 9.121101379394531, + "learning_rate": 3.5451450155583984e-07, + "logits/chosen": -2.733624219894409, + "logits/rejected": -2.7721478939056396, + "logps/chosen": -277.8062744140625, + "logps/rejected": -282.9922790527344, + "loss": 0.623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.28953424096107483, + "rewards/margins": 0.21646256744861603, + "rewards/rejected": -0.5059967041015625, + "step": 1630 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 4.436567306518555, + "learning_rate": 3.5243491490002055e-07, + "logits/chosen": -2.817996025085449, + "logits/rejected": -2.8122916221618652, + "logps/chosen": -305.4420471191406, + "logps/rejected": -318.54742431640625, + "loss": 0.6265, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.32780542969703674, + "rewards/margins": 0.21562886238098145, + "rewards/rejected": -0.5434342622756958, + "step": 1640 + }, + { + "epoch": 0.4318241298089505, + "grad_norm": 7.695457935333252, + "learning_rate": 3.503467749582857e-07, + "logits/chosen": -2.790708303451538, + "logits/rejected": -2.7539708614349365, + "logps/chosen": -298.7849426269531, + "logps/rejected": -281.51995849609375, + "loss": 0.6324, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2754608690738678, + "rewards/margins": 0.19722957909107208, + "rewards/rejected": -0.47269049286842346, + "step": 1650 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 8.035721778869629, + "learning_rate": 3.482502560897194e-07, + "logits/chosen": -2.7719411849975586, + "logits/rejected": -2.762267589569092, + "logps/chosen": -256.39263916015625, + "logps/rejected": -276.6297607421875, + "loss": 0.6336, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19001971185207367, + "rewards/margins": 0.172675222158432, + "rewards/rejected": -0.3626949191093445, + "step": 1660 + }, + { + "epoch": 0.43705836168542267, + "grad_norm": 4.791623115539551, + "learning_rate": 3.4614553335304403e-07, + "logits/chosen": -2.8094491958618164, + "logits/rejected": -2.7578389644622803, + "logps/chosen": -303.371337890625, + "logps/rejected": -291.80615234375, + "loss": 0.5957, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.12800468504428864, + "rewards/margins": 0.26551762223243713, + "rewards/rejected": -0.39352232217788696, + "step": 1670 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 7.589243412017822, + "learning_rate": 3.440327824920022e-07, + "logits/chosen": -2.7957282066345215, + "logits/rejected": -2.775707483291626, + "logps/chosen": -309.8748474121094, + "logps/rejected": -299.0494384765625, + "loss": 0.5742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08200428634881973, + "rewards/margins": 0.3152574598789215, + "rewards/rejected": -0.39726167917251587, + "step": 1680 + }, + { + "epoch": 0.44229259356189476, + "grad_norm": 6.186291694641113, + "learning_rate": 3.4191217992068287e-07, + "logits/chosen": -2.8362536430358887, + "logits/rejected": -2.8137047290802, + "logps/chosen": -306.2242431640625, + "logps/rejected": -284.80548095703125, + "loss": 0.6043, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.115182064473629, + "rewards/margins": 0.25850868225097656, + "rewards/rejected": -0.37369078397750854, + "step": 1690 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 12.576449394226074, + "learning_rate": 3.3978390270879056e-07, + "logits/chosen": -2.7859883308410645, + "logits/rejected": -2.7761070728302, + "logps/chosen": -251.69168090820312, + "logps/rejected": -273.64825439453125, + "loss": 0.6338, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23590262234210968, + "rewards/margins": 0.1843734234571457, + "rewards/rejected": -0.42027607560157776, + "step": 1700 + }, + { + "epoch": 0.44490970950013087, + "eval_logits/chosen": -2.818115234375, + "eval_logits/rejected": -2.7960946559906006, + "eval_logps/chosen": -304.2535095214844, + "eval_logps/rejected": -308.0869140625, + "eval_loss": 0.6060847043991089, + "eval_rewards/accuracies": 0.6924999952316284, + "eval_rewards/chosen": -0.21515871584415436, + "eval_rewards/margins": 0.2513309419155121, + "eval_rewards/rejected": -0.46648964285850525, + "eval_runtime": 691.2139, + "eval_samples_per_second": 2.893, + "eval_steps_per_second": 0.362, + "step": 1700 + }, + { + "epoch": 0.4475268254383669, + "grad_norm": 8.074392318725586, + "learning_rate": 3.376481285668599e-07, + "logits/chosen": -2.8055875301361084, + "logits/rejected": -2.8101181983947754, + "logps/chosen": -259.6014404296875, + "logps/rejected": -299.0648193359375, + "loss": 0.6022, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.21092364192008972, + "rewards/margins": 0.25584885478019714, + "rewards/rejected": -0.4667724668979645, + "step": 1710 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 9.234480857849121, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -2.838655948638916, + "logits/rejected": -2.825796604156494, + "logps/chosen": -299.0382995605469, + "logps/rejected": -306.70733642578125, + "loss": 0.5981, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.14200787246227264, + "rewards/margins": 0.2596356272697449, + "rewards/rejected": -0.40164345502853394, + "step": 1720 + }, + { + "epoch": 0.45276105731483907, + "grad_norm": 6.1853437423706055, + "learning_rate": 3.33354803450089e-07, + "logits/chosen": -2.745539426803589, + "logits/rejected": -2.7465980052948, + "logps/chosen": -298.8321533203125, + "logps/rejected": -300.1834411621094, + "loss": 0.6179, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.14898642897605896, + "rewards/margins": 0.23417282104492188, + "rewards/rejected": -0.38315925002098083, + "step": 1730 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 3.701824426651001, + "learning_rate": 3.311976109666605e-07, + "logits/chosen": -2.762765407562256, + "logits/rejected": -2.745163917541504, + "logps/chosen": -306.2688293457031, + "logps/rejected": -297.1578369140625, + "loss": 0.6142, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.11404751241207123, + "rewards/margins": 0.22985681891441345, + "rewards/rejected": -0.3439043462276459, + "step": 1740 + }, + { + "epoch": 0.45799528919131116, + "grad_norm": 5.698086738586426, + "learning_rate": 3.2903363850608317e-07, + "logits/chosen": -2.8657941818237305, + "logits/rejected": -2.8256325721740723, + "logps/chosen": -286.952392578125, + "logps/rejected": -288.02484130859375, + "loss": 0.609, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.22551126778125763, + "rewards/margins": 0.23173291981220245, + "rewards/rejected": -0.45724421739578247, + "step": 1750 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 7.6980085372924805, + "learning_rate": 3.2686306675943477e-07, + "logits/chosen": -2.792118549346924, + "logits/rejected": -2.8060059547424316, + "logps/chosen": -294.06951904296875, + "logps/rejected": -291.16302490234375, + "loss": 0.6134, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20452764630317688, + "rewards/margins": 0.24044232070446014, + "rewards/rejected": -0.44496995210647583, + "step": 1760 + }, + { + "epoch": 0.4632295210677833, + "grad_norm": 4.300843238830566, + "learning_rate": 3.2468607696883145e-07, + "logits/chosen": -2.7653212547302246, + "logits/rejected": -2.756118059158325, + "logps/chosen": -298.01544189453125, + "logps/rejected": -333.34234619140625, + "loss": 0.5883, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2969765067100525, + "rewards/margins": 0.29465410113334656, + "rewards/rejected": -0.5916305780410767, + "step": 1770 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 9.618111610412598, + "learning_rate": 3.2250285091229435e-07, + "logits/chosen": -2.825916290283203, + "logits/rejected": -2.8047428131103516, + "logps/chosen": -277.54571533203125, + "logps/rejected": -286.90704345703125, + "loss": 0.6269, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2800549864768982, + "rewards/margins": 0.20103518664836884, + "rewards/rejected": -0.4810902178287506, + "step": 1780 + }, + { + "epoch": 0.4684637529442554, + "grad_norm": 15.666852951049805, + "learning_rate": 3.2031357088857083e-07, + "logits/chosen": -2.8130288124084473, + "logits/rejected": -2.8077621459960938, + "logps/chosen": -317.0379333496094, + "logps/rejected": -347.8671569824219, + "loss": 0.6115, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.24338212609291077, + "rewards/margins": 0.24569590389728546, + "rewards/rejected": -0.4890781044960022, + "step": 1790 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 6.9462571144104, + "learning_rate": 3.1811841970191267e-07, + "logits/chosen": -2.736687183380127, + "logits/rejected": -2.714433193206787, + "logps/chosen": -264.3397521972656, + "logps/rejected": -324.6456604003906, + "loss": 0.585, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.18001236021518707, + "rewards/margins": 0.31897181272506714, + "rewards/rejected": -0.4989841878414154, + "step": 1800 + }, + { + "epoch": 0.4710808688824915, + "eval_logits/chosen": -2.8173904418945312, + "eval_logits/rejected": -2.7949471473693848, + "eval_logps/chosen": -296.00537109375, + "eval_logps/rejected": -299.93682861328125, + "eval_loss": 0.6049584746360779, + "eval_rewards/accuracies": 0.6915000081062317, + "eval_rewards/chosen": -0.1326776146888733, + "eval_rewards/margins": 0.25231143832206726, + "eval_rewards/rejected": -0.38498908281326294, + "eval_runtime": 691.5153, + "eval_samples_per_second": 2.892, + "eval_steps_per_second": 0.362, + "step": 1800 + }, + { + "epoch": 0.47369798482072756, + "grad_norm": 4.673962116241455, + "learning_rate": 3.1591758064681257e-07, + "logits/chosen": -2.7477469444274902, + "logits/rejected": -2.7178540229797363, + "logps/chosen": -282.83074951171875, + "logps/rejected": -272.26715087890625, + "loss": 0.5961, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.11454129219055176, + "rewards/margins": 0.27904239296913147, + "rewards/rejected": -0.3935837149620056, + "step": 1810 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 7.684245586395264, + "learning_rate": 3.13711237492698e-07, + "logits/chosen": -2.7976129055023193, + "logits/rejected": -2.7869057655334473, + "logps/chosen": -313.35540771484375, + "logps/rejected": -318.04559326171875, + "loss": 0.6319, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1507539302110672, + "rewards/margins": 0.1945343315601349, + "rewards/rejected": -0.3452882170677185, + "step": 1820 + }, + { + "epoch": 0.4789322166971997, + "grad_norm": 4.426579475402832, + "learning_rate": 3.1149957446858767e-07, + "logits/chosen": -2.791010618209839, + "logits/rejected": -2.807931423187256, + "logps/chosen": -277.4505310058594, + "logps/rejected": -279.3646240234375, + "loss": 0.6403, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.12662403285503387, + "rewards/margins": 0.16396556794643402, + "rewards/rejected": -0.2905896306037903, + "step": 1830 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 5.900054931640625, + "learning_rate": 3.0928277624770736e-07, + "logits/chosen": -2.843986988067627, + "logits/rejected": -2.823529005050659, + "logps/chosen": -312.50799560546875, + "logps/rejected": -315.56402587890625, + "loss": 0.5825, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.0948447436094284, + "rewards/margins": 0.32227185368537903, + "rewards/rejected": -0.41711658239364624, + "step": 1840 + }, + { + "epoch": 0.4841664485736718, + "grad_norm": 4.000248908996582, + "learning_rate": 3.0706102793207073e-07, + "logits/chosen": -2.8290603160858154, + "logits/rejected": -2.8024706840515137, + "logps/chosen": -316.80023193359375, + "logps/rejected": -323.507080078125, + "loss": 0.5882, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1260642558336258, + "rewards/margins": 0.2963547706604004, + "rewards/rejected": -0.422419011592865, + "step": 1850 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 7.178162574768066, + "learning_rate": 3.048345150370226e-07, + "logits/chosen": -2.8230552673339844, + "logits/rejected": -2.817823886871338, + "logps/chosen": -320.08123779296875, + "logps/rejected": -328.2519836425781, + "loss": 0.6011, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1783401370048523, + "rewards/margins": 0.27760833501815796, + "rewards/rejected": -0.45594844222068787, + "step": 1860 + }, + { + "epoch": 0.48940068045014395, + "grad_norm": 5.042900562286377, + "learning_rate": 3.0260342347574913e-07, + "logits/chosen": -2.809600353240967, + "logits/rejected": -2.78784441947937, + "logps/chosen": -304.2792053222656, + "logps/rejected": -314.709716796875, + "loss": 0.5808, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1620454490184784, + "rewards/margins": 0.3016073703765869, + "rewards/rejected": -0.4636527895927429, + "step": 1870 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 6.708124160766602, + "learning_rate": 3.0036793954375357e-07, + "logits/chosen": -2.840010643005371, + "logits/rejected": -2.820410966873169, + "logps/chosen": -301.98583984375, + "logps/rejected": -291.33465576171875, + "loss": 0.5776, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.15946264564990997, + "rewards/margins": 0.32609638571739197, + "rewards/rejected": -0.48555904626846313, + "step": 1880 + }, + { + "epoch": 0.49463491232661605, + "grad_norm": 4.842483043670654, + "learning_rate": 2.9812824990330085e-07, + "logits/chosen": -2.8116726875305176, + "logits/rejected": -2.8013501167297363, + "logps/chosen": -312.96807861328125, + "logps/rejected": -315.23675537109375, + "loss": 0.5975, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.20859690010547638, + "rewards/margins": 0.28837090730667114, + "rewards/rejected": -0.4969678521156311, + "step": 1890 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 11.47492790222168, + "learning_rate": 2.958845415678316e-07, + "logits/chosen": -2.8100364208221436, + "logits/rejected": -2.7813189029693604, + "logps/chosen": -317.1954650878906, + "logps/rejected": -327.9840087890625, + "loss": 0.577, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.21498079597949982, + "rewards/margins": 0.32284659147262573, + "rewards/rejected": -0.5378273725509644, + "step": 1900 + }, + { + "epoch": 0.49725202826485215, + "eval_logits/chosen": -2.8176026344299316, + "eval_logits/rejected": -2.7953593730926514, + "eval_logps/chosen": -304.433349609375, + "eval_logps/rejected": -310.2669677734375, + "eval_loss": 0.6012681722640991, + "eval_rewards/accuracies": 0.6965000033378601, + "eval_rewards/chosen": -0.2169574648141861, + "eval_rewards/margins": 0.27133309841156006, + "eval_rewards/rejected": -0.4882905185222626, + "eval_runtime": 691.3293, + "eval_samples_per_second": 2.893, + "eval_steps_per_second": 0.362, + "step": 1900 + }, + { + "epoch": 0.4998691442030882, + "grad_norm": 8.036276817321777, + "learning_rate": 2.936370018863459e-07, + "logits/chosen": -2.833437442779541, + "logits/rejected": -2.8240761756896973, + "logps/chosen": -301.29473876953125, + "logps/rejected": -287.30487060546875, + "loss": 0.6058, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2125242054462433, + "rewards/margins": 0.2442711889743805, + "rewards/rejected": -0.4567953944206238, + "step": 1910 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 6.088084697723389, + "learning_rate": 2.913858185277605e-07, + "logits/chosen": -2.793074131011963, + "logits/rejected": -2.7879836559295654, + "logps/chosen": -291.63409423828125, + "logps/rejected": -303.8699035644531, + "loss": 0.5963, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.14563243091106415, + "rewards/margins": 0.27376314997673035, + "rewards/rejected": -0.4193955361843109, + "step": 1920 + }, + { + "epoch": 0.5051033760795604, + "grad_norm": 6.633253574371338, + "learning_rate": 2.89131179465238e-07, + "logits/chosen": -2.763582706451416, + "logits/rejected": -2.7273335456848145, + "logps/chosen": -300.27764892578125, + "logps/rejected": -291.0055236816406, + "loss": 0.5841, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.12304127216339111, + "rewards/margins": 0.3036150336265564, + "rewards/rejected": -0.4266563355922699, + "step": 1930 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 4.170144557952881, + "learning_rate": 2.8687327296049125e-07, + "logits/chosen": -2.803448438644409, + "logits/rejected": -2.7855215072631836, + "logps/chosen": -287.71673583984375, + "logps/rejected": -312.64544677734375, + "loss": 0.6077, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14048686623573303, + "rewards/margins": 0.2633481025695801, + "rewards/rejected": -0.4038349688053131, + "step": 1940 + }, + { + "epoch": 0.5103376079560324, + "grad_norm": 4.711779594421387, + "learning_rate": 2.846122875480637e-07, + "logits/chosen": -2.823185682296753, + "logits/rejected": -2.7931466102600098, + "logps/chosen": -301.4597473144531, + "logps/rejected": -299.9159851074219, + "loss": 0.6066, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.10608525574207306, + "rewards/margins": 0.25272199511528015, + "rewards/rejected": -0.3588072657585144, + "step": 1950 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 5.881545543670654, + "learning_rate": 2.8234841201958647e-07, + "logits/chosen": -2.8165388107299805, + "logits/rejected": -2.784043550491333, + "logps/chosen": -311.29217529296875, + "logps/rejected": -301.19964599609375, + "loss": 0.5839, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1093025654554367, + "rewards/margins": 0.299915611743927, + "rewards/rejected": -0.4092181622982025, + "step": 1960 + }, + { + "epoch": 0.5155718398325045, + "grad_norm": 10.640946388244629, + "learning_rate": 2.800818354080148e-07, + "logits/chosen": -2.7974326610565186, + "logits/rejected": -2.7710323333740234, + "logps/chosen": -303.19610595703125, + "logps/rejected": -281.1106872558594, + "loss": 0.6138, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13868093490600586, + "rewards/margins": 0.2444918155670166, + "rewards/rejected": -0.38317275047302246, + "step": 1970 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 5.855273246765137, + "learning_rate": 2.778127469718435e-07, + "logits/chosen": -2.751603364944458, + "logits/rejected": -2.7628543376922607, + "logps/chosen": -261.6673278808594, + "logps/rejected": -309.0796813964844, + "loss": 0.5864, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1476416289806366, + "rewards/margins": 0.2927255630493164, + "rewards/rejected": -0.4403671622276306, + "step": 1980 + }, + { + "epoch": 0.5208060717089767, + "grad_norm": 5.992628574371338, + "learning_rate": 2.755413361793039e-07, + "logits/chosen": -2.7673847675323486, + "logits/rejected": -2.7404510974884033, + "logps/chosen": -280.890869140625, + "logps/rejected": -294.01092529296875, + "loss": 0.6048, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.15447109937667847, + "rewards/margins": 0.2593531310558319, + "rewards/rejected": -0.4138242304325104, + "step": 1990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 6.741150379180908, + "learning_rate": 2.7326779269254356e-07, + "logits/chosen": -2.826737880706787, + "logits/rejected": -2.811283588409424, + "logps/chosen": -320.9913024902344, + "logps/rejected": -290.5726318359375, + "loss": 0.5945, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.14564435184001923, + "rewards/margins": 0.29357942938804626, + "rewards/rejected": -0.4392237663269043, + "step": 2000 + }, + { + "epoch": 0.5234231876472127, + "eval_logits/chosen": -2.812201976776123, + "eval_logits/rejected": -2.7902560234069824, + "eval_logps/chosen": -303.8027648925781, + "eval_logps/rejected": -310.42926025390625, + "eval_loss": 0.5991718173027039, + "eval_rewards/accuracies": 0.6995000243186951, + "eval_rewards/chosen": -0.21065115928649902, + "eval_rewards/margins": 0.27926215529441833, + "eval_rewards/rejected": -0.48991334438323975, + "eval_runtime": 691.9553, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 2000 + }, + { + "epoch": 0.5260403035854488, + "grad_norm": 5.159753322601318, + "learning_rate": 2.709923063517895e-07, + "logits/chosen": -2.770754337310791, + "logits/rejected": -2.7877042293548584, + "logps/chosen": -297.4669494628906, + "logps/rejected": -326.15008544921875, + "loss": 0.5803, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.18324916064739227, + "rewards/margins": 0.3264145255088806, + "rewards/rejected": -0.5096637010574341, + "step": 2010 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 9.780900001525879, + "learning_rate": 2.68715067159496e-07, + "logits/chosen": -2.804417133331299, + "logits/rejected": -2.7843241691589355, + "logps/chosen": -287.03619384765625, + "logps/rejected": -296.3020324707031, + "loss": 0.5831, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.18021352589130402, + "rewards/margins": 0.30431440472602844, + "rewards/rejected": -0.4845278859138489, + "step": 2020 + }, + { + "epoch": 0.5312745354619209, + "grad_norm": 7.88455867767334, + "learning_rate": 2.664362652644806e-07, + "logits/chosen": -2.820744514465332, + "logits/rejected": -2.8191521167755127, + "logps/chosen": -334.691650390625, + "logps/rejected": -322.51885986328125, + "loss": 0.5813, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.22317573428153992, + "rewards/margins": 0.33498162031173706, + "rewards/rejected": -0.5581573247909546, + "step": 2030 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 6.620345115661621, + "learning_rate": 2.6415609094604555e-07, + "logits/chosen": -2.802522659301758, + "logits/rejected": -2.8061249256134033, + "logps/chosen": -310.2366638183594, + "logps/rejected": -317.20941162109375, + "loss": 0.6023, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.22533388435840607, + "rewards/margins": 0.28193774819374084, + "rewards/rejected": -0.5072715878486633, + "step": 2040 + }, + { + "epoch": 0.5365087673383931, + "grad_norm": 8.580389022827148, + "learning_rate": 2.618747345980904e-07, + "logits/chosen": -2.8094029426574707, + "logits/rejected": -2.768106460571289, + "logps/chosen": -293.4418029785156, + "logps/rejected": -266.50897216796875, + "loss": 0.6014, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.28857478499412537, + "rewards/margins": 0.2732298970222473, + "rewards/rejected": -0.5618046522140503, + "step": 2050 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 11.197132110595703, + "learning_rate": 2.595923867132136e-07, + "logits/chosen": -2.8401012420654297, + "logits/rejected": -2.835894823074341, + "logps/chosen": -327.6039733886719, + "logps/rejected": -335.93634033203125, + "loss": 0.5892, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.29747676849365234, + "rewards/margins": 0.3320815861225128, + "rewards/rejected": -0.6295583844184875, + "step": 2060 + }, + { + "epoch": 0.5417429992148652, + "grad_norm": 7.386964797973633, + "learning_rate": 2.5730923786680667e-07, + "logits/chosen": -2.820725917816162, + "logits/rejected": -2.821699619293213, + "logps/chosen": -294.2755432128906, + "logps/rejected": -329.28900146484375, + "loss": 0.6084, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.28539037704467773, + "rewards/margins": 0.27198493480682373, + "rewards/rejected": -0.5573753714561462, + "step": 2070 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 10.91450023651123, + "learning_rate": 2.5502547870114135e-07, + "logits/chosen": -2.798468589782715, + "logits/rejected": -2.764756441116333, + "logps/chosen": -296.8208923339844, + "logps/rejected": -290.93609619140625, + "loss": 0.6123, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.25504210591316223, + "rewards/margins": 0.26738548278808594, + "rewards/rejected": -0.5224276185035706, + "step": 2080 + }, + { + "epoch": 0.5469772310913373, + "grad_norm": 9.419450759887695, + "learning_rate": 2.527412999094506e-07, + "logits/chosen": -2.7591891288757324, + "logits/rejected": -2.7384586334228516, + "logps/chosen": -340.7040100097656, + "logps/rejected": -353.3229064941406, + "loss": 0.5947, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2267749011516571, + "rewards/margins": 0.2946481704711914, + "rewards/rejected": -0.5214229822158813, + "step": 2090 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 9.121070861816406, + "learning_rate": 2.5045689222000636e-07, + "logits/chosen": -2.748777151107788, + "logits/rejected": -2.737816333770752, + "logps/chosen": -279.33941650390625, + "logps/rejected": -290.88262939453125, + "loss": 0.5913, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.20830078423023224, + "rewards/margins": 0.28861740231513977, + "rewards/rejected": -0.4969182014465332, + "step": 2100 + }, + { + "epoch": 0.5495943470295734, + "eval_logits/chosen": -2.8085484504699707, + "eval_logits/rejected": -2.786346673965454, + "eval_logps/chosen": -306.4640808105469, + "eval_logps/rejected": -313.952880859375, + "eval_loss": 0.5981019139289856, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -0.23726463317871094, + "eval_rewards/margins": 0.2878848612308502, + "eval_rewards/rejected": -0.5251494646072388, + "eval_runtime": 690.4278, + "eval_samples_per_second": 2.897, + "eval_steps_per_second": 0.362, + "step": 2100 + }, + { + "epoch": 0.5522114629678094, + "grad_norm": 7.360952854156494, + "learning_rate": 2.481724463801933e-07, + "logits/chosen": -2.7974154949188232, + "logits/rejected": -2.7778165340423584, + "logps/chosen": -320.70465087890625, + "logps/rejected": -308.23455810546875, + "loss": 0.5916, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.25429460406303406, + "rewards/margins": 0.29730120301246643, + "rewards/rejected": -0.5515958070755005, + "step": 2110 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 9.077162742614746, + "learning_rate": 2.4588815314058154e-07, + "logits/chosen": -2.7863690853118896, + "logits/rejected": -2.787247896194458, + "logps/chosen": -283.7870788574219, + "logps/rejected": -277.558837890625, + "loss": 0.5976, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.24108314514160156, + "rewards/margins": 0.27977603673934937, + "rewards/rejected": -0.5208591818809509, + "step": 2120 + }, + { + "epoch": 0.5574456948442816, + "grad_norm": 6.194889545440674, + "learning_rate": 2.4360420323899917e-07, + "logits/chosen": -2.7870755195617676, + "logits/rejected": -2.779362916946411, + "logps/chosen": -321.5159606933594, + "logps/rejected": -313.3367614746094, + "loss": 0.6106, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.25045931339263916, + "rewards/margins": 0.27981314063072205, + "rewards/rejected": -0.5302724242210388, + "step": 2130 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 9.01162338256836, + "learning_rate": 2.4132078738460583e-07, + "logits/chosen": -2.821700096130371, + "logits/rejected": -2.7977004051208496, + "logps/chosen": -299.77734375, + "logps/rejected": -288.15472412109375, + "loss": 0.5911, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2075999230146408, + "rewards/margins": 0.2872273027896881, + "rewards/rejected": -0.49482718110084534, + "step": 2140 + }, + { + "epoch": 0.5626799267207537, + "grad_norm": 8.978148460388184, + "learning_rate": 2.390380962419682e-07, + "logits/chosen": -2.7910008430480957, + "logits/rejected": -2.7853500843048096, + "logps/chosen": -271.1761474609375, + "logps/rejected": -258.0618896484375, + "loss": 0.6279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2159349024295807, + "rewards/margins": 0.2157304286956787, + "rewards/rejected": -0.4316653609275818, + "step": 2150 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 10.330108642578125, + "learning_rate": 2.3675632041513977e-07, + "logits/chosen": -2.8272249698638916, + "logits/rejected": -2.781740427017212, + "logps/chosen": -321.1408996582031, + "logps/rejected": -290.31451416015625, + "loss": 0.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1839137077331543, + "rewards/margins": 0.36078041791915894, + "rewards/rejected": -0.5446941256523132, + "step": 2160 + }, + { + "epoch": 0.5679141585972258, + "grad_norm": 4.827859401702881, + "learning_rate": 2.344756504317453e-07, + "logits/chosen": -2.7731990814208984, + "logits/rejected": -2.739841938018799, + "logps/chosen": -311.63385009765625, + "logps/rejected": -300.05657958984375, + "loss": 0.6069, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.37105852365493774, + "rewards/margins": 0.2651851773262024, + "rewards/rejected": -0.6362437009811401, + "step": 2170 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 7.324320316314697, + "learning_rate": 2.3219627672707237e-07, + "logits/chosen": -2.7636940479278564, + "logits/rejected": -2.7629504203796387, + "logps/chosen": -312.3614196777344, + "logps/rejected": -291.49920654296875, + "loss": 0.6201, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.40163812041282654, + "rewards/margins": 0.2354915589094162, + "rewards/rejected": -0.6371296644210815, + "step": 2180 + }, + { + "epoch": 0.573148390473698, + "grad_norm": 9.793487548828125, + "learning_rate": 2.2991838962816918e-07, + "logits/chosen": -2.760166645050049, + "logits/rejected": -2.7421138286590576, + "logps/chosen": -309.69378662109375, + "logps/rejected": -330.1057434082031, + "loss": 0.6189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.391974538564682, + "rewards/margins": 0.23559853434562683, + "rewards/rejected": -0.6275731325149536, + "step": 2190 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 4.884433746337891, + "learning_rate": 2.2764217933795297e-07, + "logits/chosen": -2.7735462188720703, + "logits/rejected": -2.7576115131378174, + "logps/chosen": -306.01983642578125, + "logps/rejected": -319.36273193359375, + "loss": 0.5816, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.28672754764556885, + "rewards/margins": 0.3387922942638397, + "rewards/rejected": -0.625519871711731, + "step": 2200 + }, + { + "epoch": 0.575765506411934, + "eval_logits/chosen": -2.806988000869751, + "eval_logits/rejected": -2.7848920822143555, + "eval_logps/chosen": -309.6146240234375, + "eval_logps/rejected": -317.14105224609375, + "eval_loss": 0.5989395976066589, + "eval_rewards/accuracies": 0.6970000267028809, + "eval_rewards/chosen": -0.26876989006996155, + "eval_rewards/margins": 0.28826138377189636, + "eval_rewards/rejected": -0.5570313334465027, + "eval_runtime": 692.0182, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 2200 + }, + { + "epoch": 0.5783826223501701, + "grad_norm": 5.080691337585449, + "learning_rate": 2.253678359193278e-07, + "logits/chosen": -2.8626627922058105, + "logits/rejected": -2.8227312564849854, + "logps/chosen": -323.10284423828125, + "logps/rejected": -324.9154968261719, + "loss": 0.6192, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.28973332047462463, + "rewards/margins": 0.24134087562561035, + "rewards/rejected": -0.5310741662979126, + "step": 2210 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 8.136847496032715, + "learning_rate": 2.230955492793149e-07, + "logits/chosen": -2.7363781929016113, + "logits/rejected": -2.747398853302002, + "logps/chosen": -315.01092529296875, + "logps/rejected": -321.312744140625, + "loss": 0.6301, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2270394265651703, + "rewards/margins": 0.22412936389446259, + "rewards/rejected": -0.4511687755584717, + "step": 2220 + }, + { + "epoch": 0.5836168542266422, + "grad_norm": 3.2636797428131104, + "learning_rate": 2.2082550915319468e-07, + "logits/chosen": -2.746173858642578, + "logits/rejected": -2.7479488849639893, + "logps/chosen": -311.60443115234375, + "logps/rejected": -304.00933837890625, + "loss": 0.5897, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.16526171565055847, + "rewards/margins": 0.31148332357406616, + "rewards/rejected": -0.47674503922462463, + "step": 2230 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 7.513117790222168, + "learning_rate": 2.1855790508866433e-07, + "logits/chosen": -2.7626214027404785, + "logits/rejected": -2.766356945037842, + "logps/chosen": -345.93560791015625, + "logps/rejected": -345.16632080078125, + "loss": 0.6017, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.19639845192432404, + "rewards/margins": 0.2772556245326996, + "rewards/rejected": -0.473654180765152, + "step": 2240 + }, + { + "epoch": 0.5888510861031143, + "grad_norm": 4.226502418518066, + "learning_rate": 2.162929264300107e-07, + "logits/chosen": -2.7443809509277344, + "logits/rejected": -2.740731716156006, + "logps/chosen": -298.61883544921875, + "logps/rejected": -312.0686950683594, + "loss": 0.5729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14046551287174225, + "rewards/margins": 0.34373658895492554, + "rewards/rejected": -0.4842020869255066, + "step": 2250 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 5.33687162399292, + "learning_rate": 2.1403076230230005e-07, + "logits/chosen": -2.767137289047241, + "logits/rejected": -2.7396111488342285, + "logps/chosen": -312.28643798828125, + "logps/rejected": -306.20172119140625, + "loss": 0.616, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.19273105263710022, + "rewards/margins": 0.26331207156181335, + "rewards/rejected": -0.45604315400123596, + "step": 2260 + }, + { + "epoch": 0.5940853179795865, + "grad_norm": 9.639008522033691, + "learning_rate": 2.1177160159558596e-07, + "logits/chosen": -2.7518250942230225, + "logits/rejected": -2.7383649349212646, + "logps/chosen": -321.7221374511719, + "logps/rejected": -297.3667297363281, + "loss": 0.6038, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.21679162979125977, + "rewards/margins": 0.29109686613082886, + "rewards/rejected": -0.5078884959220886, + "step": 2270 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 6.384767055511475, + "learning_rate": 2.0951563294913734e-07, + "logits/chosen": -2.760425090789795, + "logits/rejected": -2.7438526153564453, + "logps/chosen": -299.39373779296875, + "logps/rejected": -302.9912109375, + "loss": 0.5717, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.20336699485778809, + "rewards/margins": 0.3353096842765808, + "rewards/rejected": -0.5386766791343689, + "step": 2280 + }, + { + "epoch": 0.5993195498560586, + "grad_norm": 6.036366939544678, + "learning_rate": 2.072630447356869e-07, + "logits/chosen": -2.7959117889404297, + "logits/rejected": -2.7956790924072266, + "logps/chosen": -300.03179931640625, + "logps/rejected": -291.49481201171875, + "loss": 0.6001, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23898771405220032, + "rewards/margins": 0.26846712827682495, + "rewards/rejected": -0.5074548125267029, + "step": 2290 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 7.8020195960998535, + "learning_rate": 2.0501402504570232e-07, + "logits/chosen": -2.829082727432251, + "logits/rejected": -2.772502899169922, + "logps/chosen": -318.4316711425781, + "logps/rejected": -315.959716796875, + "loss": 0.5824, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.22740764915943146, + "rewards/margins": 0.3216533958911896, + "rewards/rejected": -0.5490610003471375, + "step": 2300 + }, + { + "epoch": 0.6019366657942947, + "eval_logits/chosen": -2.80366849899292, + "eval_logits/rejected": -2.7820827960968018, + "eval_logps/chosen": -305.00982666015625, + "eval_logps/rejected": -313.32330322265625, + "eval_loss": 0.5960872769355774, + "eval_rewards/accuracies": 0.6955000162124634, + "eval_rewards/chosen": -0.2227218896150589, + "eval_rewards/margins": 0.2961318790912628, + "eval_rewards/rejected": -0.5188537836074829, + "eval_runtime": 691.9375, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 2300 + }, + { + "epoch": 0.6045537817325307, + "grad_norm": 12.083107948303223, + "learning_rate": 2.027687616716804e-07, + "logits/chosen": -2.72344970703125, + "logits/rejected": -2.7168376445770264, + "logps/chosen": -268.31243896484375, + "logps/rejected": -255.6737518310547, + "loss": 0.6189, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2189827412366867, + "rewards/margins": 0.24416430294513702, + "rewards/rejected": -0.46314701437950134, + "step": 2310 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 8.845372200012207, + "learning_rate": 2.005274420924668e-07, + "logits/chosen": -2.790346145629883, + "logits/rejected": -2.778743267059326, + "logps/chosen": -295.9941711425781, + "logps/rejected": -287.6865234375, + "loss": 0.6086, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.25174736976623535, + "rewards/margins": 0.2748829126358032, + "rewards/rejected": -0.5266302824020386, + "step": 2320 + }, + { + "epoch": 0.6097880136090029, + "grad_norm": 7.964311599731445, + "learning_rate": 1.9829025345760121e-07, + "logits/chosen": -2.7749578952789307, + "logits/rejected": -2.7802319526672363, + "logps/chosen": -315.29290771484375, + "logps/rejected": -332.8951721191406, + "loss": 0.6062, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.17806461453437805, + "rewards/margins": 0.2752231955528259, + "rewards/rejected": -0.4532877802848816, + "step": 2330 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 8.214485168457031, + "learning_rate": 1.960573825716911e-07, + "logits/chosen": -2.743821620941162, + "logits/rejected": -2.7305188179016113, + "logps/chosen": -275.1949768066406, + "logps/rejected": -297.45172119140625, + "loss": 0.6016, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.23889228701591492, + "rewards/margins": 0.29088443517684937, + "rewards/rejected": -0.5297766923904419, + "step": 2340 + }, + { + "epoch": 0.615022245485475, + "grad_norm": 7.783448696136475, + "learning_rate": 1.9382901587881273e-07, + "logits/chosen": -2.8195502758026123, + "logits/rejected": -2.8172898292541504, + "logps/chosen": -291.1629333496094, + "logps/rejected": -292.11553955078125, + "loss": 0.5555, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15334704518318176, + "rewards/margins": 0.37875789403915405, + "rewards/rejected": -0.5321049094200134, + "step": 2350 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 7.713850498199463, + "learning_rate": 1.9160533944694364e-07, + "logits/chosen": -2.802713394165039, + "logits/rejected": -2.763248920440674, + "logps/chosen": -297.48541259765625, + "logps/rejected": -321.0580139160156, + "loss": 0.5661, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1875167191028595, + "rewards/margins": 0.3671106696128845, + "rewards/rejected": -0.5546274185180664, + "step": 2360 + }, + { + "epoch": 0.6202564773619471, + "grad_norm": 7.275653839111328, + "learning_rate": 1.8938653895242602e-07, + "logits/chosen": -2.805842161178589, + "logits/rejected": -2.7778079509735107, + "logps/chosen": -301.32257080078125, + "logps/rejected": -307.5292663574219, + "loss": 0.569, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.22137200832366943, + "rewards/margins": 0.3620893061161041, + "rewards/rejected": -0.583461344242096, + "step": 2370 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 7.8891282081604, + "learning_rate": 1.8717279966446264e-07, + "logits/chosen": -2.702014684677124, + "logits/rejected": -2.6890392303466797, + "logps/chosen": -299.67095947265625, + "logps/rejected": -315.53125, + "loss": 0.6047, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3092700242996216, + "rewards/margins": 0.2915950417518616, + "rewards/rejected": -0.6008650660514832, + "step": 2380 + }, + { + "epoch": 0.6254907092384192, + "grad_norm": 9.103086471557617, + "learning_rate": 1.8496430642964694e-07, + "logits/chosen": -2.7693662643432617, + "logits/rejected": -2.749218702316284, + "logps/chosen": -320.30596923828125, + "logps/rejected": -322.6269226074219, + "loss": 0.6135, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2899993658065796, + "rewards/margins": 0.2783369719982147, + "rewards/rejected": -0.5683363676071167, + "step": 2390 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 8.552151679992676, + "learning_rate": 1.8276124365652855e-07, + "logits/chosen": -2.796008586883545, + "logits/rejected": -2.750042200088501, + "logps/chosen": -308.24066162109375, + "logps/rejected": -318.9580993652344, + "loss": 0.602, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.283893346786499, + "rewards/margins": 0.2797131836414337, + "rewards/rejected": -0.5636065602302551, + "step": 2400 + }, + { + "epoch": 0.6281078251766553, + "eval_logits/chosen": -2.796116352081299, + "eval_logits/rejected": -2.774383783340454, + "eval_logps/chosen": -309.5652160644531, + "eval_logps/rejected": -318.12506103515625, + "eval_loss": 0.5968618392944336, + "eval_rewards/accuracies": 0.6990000009536743, + "eval_rewards/chosen": -0.2682757079601288, + "eval_rewards/margins": 0.2985955774784088, + "eval_rewards/rejected": -0.5668712258338928, + "eval_runtime": 690.9152, + "eval_samples_per_second": 2.895, + "eval_steps_per_second": 0.362, + "step": 2400 + }, + { + "epoch": 0.6307249411148914, + "grad_norm": 10.884597778320312, + "learning_rate": 1.805637953002149e-07, + "logits/chosen": -2.806243658065796, + "logits/rejected": -2.804234266281128, + "logps/chosen": -287.49090576171875, + "logps/rejected": -287.6014404296875, + "loss": 0.6169, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.27734607458114624, + "rewards/margins": 0.24837279319763184, + "rewards/rejected": -0.5257189273834229, + "step": 2410 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 9.013958930969238, + "learning_rate": 1.7837214484701153e-07, + "logits/chosen": -2.7953040599823, + "logits/rejected": -2.7851452827453613, + "logps/chosen": -289.382568359375, + "logps/rejected": -297.02679443359375, + "loss": 0.5733, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2011108100414276, + "rewards/margins": 0.34568914771080017, + "rewards/rejected": -0.5468000173568726, + "step": 2420 + }, + { + "epoch": 0.6359591729913635, + "grad_norm": 14.238588333129883, + "learning_rate": 1.761864752991004e-07, + "logits/chosen": -2.778735399246216, + "logits/rejected": -2.759908437728882, + "logps/chosen": -295.66241455078125, + "logps/rejected": -312.7738952636719, + "loss": 0.5791, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.21105961501598358, + "rewards/margins": 0.3268287181854248, + "rewards/rejected": -0.5378884077072144, + "step": 2430 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 5.6600518226623535, + "learning_rate": 1.7400696915925995e-07, + "logits/chosen": -2.7974464893341064, + "logits/rejected": -2.7732651233673096, + "logps/chosen": -312.24798583984375, + "logps/rejected": -279.251708984375, + "loss": 0.5943, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23233290016651154, + "rewards/margins": 0.3078458309173584, + "rewards/rejected": -0.5401787161827087, + "step": 2440 + }, + { + "epoch": 0.6411934048678356, + "grad_norm": 11.058223724365234, + "learning_rate": 1.718338084156254e-07, + "logits/chosen": -2.7382242679595947, + "logits/rejected": -2.727843761444092, + "logps/chosen": -323.4954528808594, + "logps/rejected": -317.99456787109375, + "loss": 0.57, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.1701376736164093, + "rewards/margins": 0.3507465720176697, + "rewards/rejected": -0.5208842754364014, + "step": 2450 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 14.676642417907715, + "learning_rate": 1.696671745264937e-07, + "logits/chosen": -2.799201488494873, + "logits/rejected": -2.8146328926086426, + "logps/chosen": -313.3539733886719, + "logps/rejected": -290.71197509765625, + "loss": 0.5616, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.154522106051445, + "rewards/margins": 0.36096832156181335, + "rewards/rejected": -0.5154904127120972, + "step": 2460 + }, + { + "epoch": 0.6464276367443078, + "grad_norm": 7.134603500366211, + "learning_rate": 1.67507248405171e-07, + "logits/chosen": -2.786536693572998, + "logits/rejected": -2.7716171741485596, + "logps/chosen": -290.3885192871094, + "logps/rejected": -317.96453857421875, + "loss": 0.6052, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.17861530184745789, + "rewards/margins": 0.2776513695716858, + "rewards/rejected": -0.4562666416168213, + "step": 2470 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 9.284005165100098, + "learning_rate": 1.6535421040486683e-07, + "logits/chosen": -2.695885181427002, + "logits/rejected": -2.683889150619507, + "logps/chosen": -292.3827209472656, + "logps/rejected": -295.35003662109375, + "loss": 0.5708, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20486466586589813, + "rewards/margins": 0.3616489768028259, + "rewards/rejected": -0.5665136575698853, + "step": 2480 + }, + { + "epoch": 0.6516618686207799, + "grad_norm": 11.596046447753906, + "learning_rate": 1.6320824030363456e-07, + "logits/chosen": -2.7673633098602295, + "logits/rejected": -2.7697348594665527, + "logps/chosen": -269.5127868652344, + "logps/rejected": -284.500732421875, + "loss": 0.5804, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.19040192663669586, + "rewards/margins": 0.32062506675720215, + "rewards/rejected": -0.5110269784927368, + "step": 2490 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 8.306464195251465, + "learning_rate": 1.6106951728936024e-07, + "logits/chosen": -2.8287737369537354, + "logits/rejected": -2.785698413848877, + "logps/chosen": -290.69586181640625, + "logps/rejected": -315.9652404785156, + "loss": 0.5792, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.17289450764656067, + "rewards/margins": 0.32756882905960083, + "rewards/rejected": -0.5004633069038391, + "step": 2500 + }, + { + "epoch": 0.654278984559016, + "eval_logits/chosen": -2.7979679107666016, + "eval_logits/rejected": -2.776271104812622, + "eval_logps/chosen": -303.76153564453125, + "eval_logps/rejected": -311.8429260253906, + "eval_loss": 0.5962891578674316, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -0.2102394998073578, + "eval_rewards/margins": 0.2938106954097748, + "eval_rewards/rejected": -0.5040501952171326, + "eval_runtime": 692.3854, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 2500 + }, + { + "epoch": 0.656896100497252, + "grad_norm": 6.3364176750183105, + "learning_rate": 1.5893821994479994e-07, + "logits/chosen": -2.8073089122772217, + "logits/rejected": -2.7984962463378906, + "logps/chosen": -307.6702880859375, + "logps/rejected": -299.78192138671875, + "loss": 0.583, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.15238206088542938, + "rewards/margins": 0.3230430781841278, + "rewards/rejected": -0.4754251539707184, + "step": 2510 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 7.475069999694824, + "learning_rate": 1.5681452623266867e-07, + "logits/chosen": -2.788701057434082, + "logits/rejected": -2.7505264282226562, + "logps/chosen": -323.1575012207031, + "logps/rejected": -304.9902038574219, + "loss": 0.5469, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.1838502436876297, + "rewards/margins": 0.4115122854709625, + "rewards/rejected": -0.5953624844551086, + "step": 2520 + }, + { + "epoch": 0.6621303323737242, + "grad_norm": 9.084112167358398, + "learning_rate": 1.546986134807801e-07, + "logits/chosen": -2.8091278076171875, + "logits/rejected": -2.780764102935791, + "logps/chosen": -293.3882751464844, + "logps/rejected": -309.5545349121094, + "loss": 0.5931, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.28720271587371826, + "rewards/margins": 0.30004793405532837, + "rewards/rejected": -0.5872506499290466, + "step": 2530 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 7.817606449127197, + "learning_rate": 1.5259065836724034e-07, + "logits/chosen": -2.7307331562042236, + "logits/rejected": -2.7140753269195557, + "logps/chosen": -290.29443359375, + "logps/rejected": -307.90399169921875, + "loss": 0.5968, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2654728889465332, + "rewards/margins": 0.2819042205810547, + "rewards/rejected": -0.5473771095275879, + "step": 2540 + }, + { + "epoch": 0.6673645642501963, + "grad_norm": 8.136064529418945, + "learning_rate": 1.5049083690569454e-07, + "logits/chosen": -2.7462635040283203, + "logits/rejected": -2.731522798538208, + "logps/chosen": -279.6645812988281, + "logps/rejected": -303.47857666015625, + "loss": 0.6011, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2667672336101532, + "rewards/margins": 0.28752660751342773, + "rewards/rejected": -0.5542938113212585, + "step": 2550 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 5.6162896156311035, + "learning_rate": 1.4839932443063056e-07, + "logits/chosen": -2.7818315029144287, + "logits/rejected": -2.754776954650879, + "logps/chosen": -331.192626953125, + "logps/rejected": -306.44342041015625, + "loss": 0.5807, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23324036598205566, + "rewards/margins": 0.33265605568885803, + "rewards/rejected": -0.5658964514732361, + "step": 2560 + }, + { + "epoch": 0.6725987961266684, + "grad_norm": 15.203133583068848, + "learning_rate": 1.46316295582738e-07, + "logits/chosen": -2.755795955657959, + "logits/rejected": -2.745166301727295, + "logps/chosen": -288.94012451171875, + "logps/rejected": -295.92974853515625, + "loss": 0.63, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.30726075172424316, + "rewards/margins": 0.21980533003807068, + "rewards/rejected": -0.5270661115646362, + "step": 2570 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 23.822792053222656, + "learning_rate": 1.4424192429432655e-07, + "logits/chosen": -2.783210515975952, + "logits/rejected": -2.766979694366455, + "logps/chosen": -291.4307556152344, + "logps/rejected": -328.7579040527344, + "loss": 0.5738, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.18577826023101807, + "rewards/margins": 0.34509676694869995, + "rewards/rejected": -0.5308750867843628, + "step": 2580 + }, + { + "epoch": 0.6778330280031405, + "grad_norm": 9.544054985046387, + "learning_rate": 1.4217638377480158e-07, + "logits/chosen": -2.7744319438934326, + "logits/rejected": -2.7644972801208496, + "logps/chosen": -299.30975341796875, + "logps/rejected": -312.57220458984375, + "loss": 0.598, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.23222167789936066, + "rewards/margins": 0.28205937147140503, + "rewards/rejected": -0.5142810344696045, + "step": 2590 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 7.35859489440918, + "learning_rate": 1.401198464962021e-07, + "logits/chosen": -2.7667133808135986, + "logits/rejected": -2.7541134357452393, + "logps/chosen": -305.63446044921875, + "logps/rejected": -288.49676513671875, + "loss": 0.6028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2002829611301422, + "rewards/margins": 0.26447853446006775, + "rewards/rejected": -0.4647614359855652, + "step": 2600 + }, + { + "epoch": 0.6804501439413766, + "eval_logits/chosen": -2.793254852294922, + "eval_logits/rejected": -2.771672010421753, + "eval_logps/chosen": -301.69635009765625, + "eval_logps/rejected": -309.3417053222656, + "eval_loss": 0.5973595976829529, + "eval_rewards/accuracies": 0.6919999718666077, + "eval_rewards/chosen": -0.18958736956119537, + "eval_rewards/margins": 0.289450466632843, + "eval_rewards/rejected": -0.4790377914905548, + "eval_runtime": 692.1987, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 2600 + }, + { + "epoch": 0.6830672598796127, + "grad_norm": 6.412085056304932, + "learning_rate": 1.3807248417879894e-07, + "logits/chosen": -2.799522638320923, + "logits/rejected": -2.801234483718872, + "logps/chosen": -304.61505126953125, + "logps/rejected": -318.75360107421875, + "loss": 0.5742, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.1567406803369522, + "rewards/margins": 0.35466814041137695, + "rewards/rejected": -0.511408805847168, + "step": 2610 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 6.595985412597656, + "learning_rate": 1.3603446777675665e-07, + "logits/chosen": -2.7163891792297363, + "logits/rejected": -2.6980533599853516, + "logps/chosen": -301.43170166015625, + "logps/rejected": -309.5948486328125, + "loss": 0.5767, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.18890248239040375, + "rewards/margins": 0.33902615308761597, + "rewards/rejected": -0.5279285907745361, + "step": 2620 + }, + { + "epoch": 0.6883014917560848, + "grad_norm": 5.626343250274658, + "learning_rate": 1.3400596746385814e-07, + "logits/chosen": -2.785409450531006, + "logits/rejected": -2.7549426555633545, + "logps/chosen": -305.23779296875, + "logps/rejected": -306.29864501953125, + "loss": 0.5866, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.17120136320590973, + "rewards/margins": 0.3220587372779846, + "rewards/rejected": -0.49326008558273315, + "step": 2630 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 7.084354400634766, + "learning_rate": 1.3198715261929586e-07, + "logits/chosen": -2.8111932277679443, + "logits/rejected": -2.7792601585388184, + "logps/chosen": -269.24957275390625, + "logps/rejected": -297.8160400390625, + "loss": 0.5557, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.19386166334152222, + "rewards/margins": 0.37062662839889526, + "rewards/rejected": -0.5644882917404175, + "step": 2640 + }, + { + "epoch": 0.6935357236325569, + "grad_norm": 6.301397800445557, + "learning_rate": 1.299781918135282e-07, + "logits/chosen": -2.780548095703125, + "logits/rejected": -2.7463881969451904, + "logps/chosen": -331.93035888671875, + "logps/rejected": -346.24005126953125, + "loss": 0.5488, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12747205793857574, + "rewards/margins": 0.4090425372123718, + "rewards/rejected": -0.5365146398544312, + "step": 2650 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 4.976480007171631, + "learning_rate": 1.279792527942045e-07, + "logits/chosen": -2.7965517044067383, + "logits/rejected": -2.7541985511779785, + "logps/chosen": -308.75946044921875, + "logps/rejected": -333.583251953125, + "loss": 0.573, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2170572280883789, + "rewards/margins": 0.3559117913246155, + "rewards/rejected": -0.5729690194129944, + "step": 2660 + }, + { + "epoch": 0.6987699555090291, + "grad_norm": 7.420611381530762, + "learning_rate": 1.259905024721576e-07, + "logits/chosen": -2.7755208015441895, + "logits/rejected": -2.7653794288635254, + "logps/chosen": -297.36810302734375, + "logps/rejected": -308.62139892578125, + "loss": 0.574, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.21521492302417755, + "rewards/margins": 0.3440507650375366, + "rewards/rejected": -0.5592657327651978, + "step": 2670 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 9.432327270507812, + "learning_rate": 1.2401210690746703e-07, + "logits/chosen": -2.7644107341766357, + "logits/rejected": -2.7474875450134277, + "logps/chosen": -305.26129150390625, + "logps/rejected": -300.5979309082031, + "loss": 0.5966, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.19491124153137207, + "rewards/margins": 0.2979043126106262, + "rewards/rejected": -0.4928155541419983, + "step": 2680 + }, + { + "epoch": 0.7040041873855012, + "grad_norm": 13.687203407287598, + "learning_rate": 1.2204423129559305e-07, + "logits/chosen": -2.803926467895508, + "logits/rejected": -2.8096935749053955, + "logps/chosen": -304.5517272949219, + "logps/rejected": -332.74627685546875, + "loss": 0.5878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21299275755882263, + "rewards/margins": 0.32770127058029175, + "rewards/rejected": -0.540693998336792, + "step": 2690 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 9.307769775390625, + "learning_rate": 1.2008703995358299e-07, + "logits/chosen": -2.7696948051452637, + "logits/rejected": -2.7626984119415283, + "logps/chosen": -305.66973876953125, + "logps/rejected": -309.4637756347656, + "loss": 0.5854, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.23966650664806366, + "rewards/margins": 0.3371264636516571, + "rewards/rejected": -0.5767929553985596, + "step": 2700 + }, + { + "epoch": 0.7066213033237373, + "eval_logits/chosen": -2.7892041206359863, + "eval_logits/rejected": -2.7675600051879883, + "eval_logps/chosen": -307.9026794433594, + "eval_logps/rejected": -317.58642578125, + "eval_loss": 0.5930463671684265, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -0.2516505718231201, + "eval_rewards/margins": 0.309834361076355, + "eval_rewards/rejected": -0.5614849925041199, + "eval_runtime": 692.1934, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 2700 + }, + { + "epoch": 0.7092384192619733, + "grad_norm": 7.60300874710083, + "learning_rate": 1.1814069630635068e-07, + "logits/chosen": -2.7490410804748535, + "logits/rejected": -2.7561395168304443, + "logps/chosen": -311.02667236328125, + "logps/rejected": -334.8045349121094, + "loss": 0.5936, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2228337824344635, + "rewards/margins": 0.31492942571640015, + "rewards/rejected": -0.5377631783485413, + "step": 2710 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 5.55739164352417, + "learning_rate": 1.1620536287303051e-07, + "logits/chosen": -2.7841482162475586, + "logits/rejected": -2.7707200050354004, + "logps/chosen": -330.66802978515625, + "logps/rejected": -324.71453857421875, + "loss": 0.6076, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.21253342926502228, + "rewards/margins": 0.2718030512332916, + "rewards/rejected": -0.4843364655971527, + "step": 2720 + }, + { + "epoch": 0.7144726511384454, + "grad_norm": 4.946017742156982, + "learning_rate": 1.1428120125340716e-07, + "logits/chosen": -2.771012783050537, + "logits/rejected": -2.751859188079834, + "logps/chosen": -299.06195068359375, + "logps/rejected": -291.7746276855469, + "loss": 0.5414, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.18322893977165222, + "rewards/margins": 0.4256429076194763, + "rewards/rejected": -0.6088718175888062, + "step": 2730 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 8.510547637939453, + "learning_rate": 1.123683721144223e-07, + "logits/chosen": -2.773465871810913, + "logits/rejected": -2.750523328781128, + "logps/chosen": -322.75030517578125, + "logps/rejected": -322.23541259765625, + "loss": 0.5924, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2108650654554367, + "rewards/margins": 0.3147924840450287, + "rewards/rejected": -0.5256575345993042, + "step": 2740 + }, + { + "epoch": 0.7197068830149176, + "grad_norm": 6.666440010070801, + "learning_rate": 1.1046703517675845e-07, + "logits/chosen": -2.792327642440796, + "logits/rejected": -2.780276298522949, + "logps/chosen": -292.0575256347656, + "logps/rejected": -331.8373718261719, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20287561416625977, + "rewards/margins": 0.3353033661842346, + "rewards/rejected": -0.5381789803504944, + "step": 2750 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 3.823488712310791, + "learning_rate": 1.085773492015028e-07, + "logits/chosen": -2.7709414958953857, + "logits/rejected": -2.7493114471435547, + "logps/chosen": -284.67193603515625, + "logps/rejected": -288.34991455078125, + "loss": 0.5487, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2005012482404709, + "rewards/margins": 0.4104704260826111, + "rewards/rejected": -0.6109716892242432, + "step": 2760 + }, + { + "epoch": 0.7249411148913897, + "grad_norm": 10.498513221740723, + "learning_rate": 1.0669947197689033e-07, + "logits/chosen": -2.7609269618988037, + "logits/rejected": -2.723078489303589, + "logps/chosen": -316.71929931640625, + "logps/rejected": -321.02239990234375, + "loss": 0.5936, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.2569184899330139, + "rewards/margins": 0.3084716498851776, + "rewards/rejected": -0.5653902292251587, + "step": 2770 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 9.501131057739258, + "learning_rate": 1.048335603051291e-07, + "logits/chosen": -2.7370448112487793, + "logits/rejected": -2.730591058731079, + "logps/chosen": -329.8760986328125, + "logps/rejected": -340.55413818359375, + "loss": 0.5523, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2385425865650177, + "rewards/margins": 0.41302841901779175, + "rewards/rejected": -0.6515710353851318, + "step": 2780 + }, + { + "epoch": 0.7301753467678618, + "grad_norm": 9.440362930297852, + "learning_rate": 1.0297976998930663e-07, + "logits/chosen": -2.787727117538452, + "logits/rejected": -2.7839837074279785, + "logps/chosen": -315.8175048828125, + "logps/rejected": -321.4845275878906, + "loss": 0.5551, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23409982025623322, + "rewards/margins": 0.4074832797050476, + "rewards/rejected": -0.6415830850601196, + "step": 2790 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 9.004974365234375, + "learning_rate": 1.0113825582038077e-07, + "logits/chosen": -2.7806646823883057, + "logits/rejected": -2.770219326019287, + "logps/chosen": -309.5851135253906, + "logps/rejected": -321.6380310058594, + "loss": 0.5994, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2802024185657501, + "rewards/margins": 0.2918320596218109, + "rewards/rejected": -0.572034478187561, + "step": 2800 + }, + { + "epoch": 0.7327924627060979, + "eval_logits/chosen": -2.785149335861206, + "eval_logits/rejected": -2.7636430263519287, + "eval_logps/chosen": -308.8106689453125, + "eval_logps/rejected": -319.18377685546875, + "eval_loss": 0.5920370221138, + "eval_rewards/accuracies": 0.7045000195503235, + "eval_rewards/chosen": -0.2607303559780121, + "eval_rewards/margins": 0.31672805547714233, + "eval_rewards/rejected": -0.577458381652832, + "eval_runtime": 691.5482, + "eval_samples_per_second": 2.892, + "eval_steps_per_second": 0.362, + "step": 2800 + }, + { + "epoch": 0.735409578644334, + "grad_norm": 5.153034687042236, + "learning_rate": 9.930917156425475e-08, + "logits/chosen": -2.7953689098358154, + "logits/rejected": -2.7769198417663574, + "logps/chosen": -307.6942443847656, + "logps/rejected": -336.81036376953125, + "loss": 0.5828, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2690412104129791, + "rewards/margins": 0.3371729254722595, + "rewards/rejected": -0.6062140464782715, + "step": 2810 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 10.421857833862305, + "learning_rate": 9.749266994893754e-08, + "logits/chosen": -2.7286500930786133, + "logits/rejected": -2.696841239929199, + "logps/chosen": -283.78277587890625, + "logps/rejected": -293.64666748046875, + "loss": 0.6332, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2909180223941803, + "rewards/margins": 0.21305350959300995, + "rewards/rejected": -0.5039715766906738, + "step": 2820 + }, + { + "epoch": 0.7406438105208061, + "grad_norm": 14.213560104370117, + "learning_rate": 9.568890265179128e-08, + "logits/chosen": -2.7485554218292236, + "logits/rejected": -2.7543232440948486, + "logps/chosen": -308.8101806640625, + "logps/rejected": -305.62347412109375, + "loss": 0.609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2877466678619385, + "rewards/margins": 0.28105878829956055, + "rewards/rejected": -0.568805456161499, + "step": 2830 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 5.577268600463867, + "learning_rate": 9.389802028686616e-08, + "logits/chosen": -2.7711002826690674, + "logits/rejected": -2.7511260509490967, + "logps/chosen": -308.267822265625, + "logps/rejected": -295.8204650878906, + "loss": 0.6301, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.292976438999176, + "rewards/margins": 0.21805603802204132, + "rewards/rejected": -0.5110324621200562, + "step": 2840 + }, + { + "epoch": 0.7458780423972782, + "grad_norm": 5.392404556274414, + "learning_rate": 9.212017239232426e-08, + "logits/chosen": -2.7617223262786865, + "logits/rejected": -2.7573046684265137, + "logps/chosen": -312.38421630859375, + "logps/rejected": -330.9461975097656, + "loss": 0.5444, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.22561874985694885, + "rewards/margins": 0.4286450445652008, + "rewards/rejected": -0.6542637348175049, + "step": 2850 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 6.394357681274414, + "learning_rate": 9.035550741795328e-08, + "logits/chosen": -2.7431981563568115, + "logits/rejected": -2.7521939277648926, + "logps/chosen": -295.7667541503906, + "logps/rejected": -334.49688720703125, + "loss": 0.5794, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21194259822368622, + "rewards/margins": 0.35274478793144226, + "rewards/rejected": -0.5646874308586121, + "step": 2860 + }, + { + "epoch": 0.7511122742737504, + "grad_norm": 9.479743003845215, + "learning_rate": 8.860417271277065e-08, + "logits/chosen": -2.819362163543701, + "logits/rejected": -2.8213016986846924, + "logps/chosen": -308.4556884765625, + "logps/rejected": -324.0565490722656, + "loss": 0.6036, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.23003943264484406, + "rewards/margins": 0.26295268535614014, + "rewards/rejected": -0.492992103099823, + "step": 2870 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 9.29710865020752, + "learning_rate": 8.686631451272029e-08, + "logits/chosen": -2.7966079711914062, + "logits/rejected": -2.7735276222229004, + "logps/chosen": -297.5863952636719, + "logps/rejected": -300.37908935546875, + "loss": 0.6135, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2746056914329529, + "rewards/margins": 0.2642548680305481, + "rewards/rejected": -0.5388606190681458, + "step": 2880 + }, + { + "epoch": 0.7563465061502225, + "grad_norm": 9.630151748657227, + "learning_rate": 8.514207792846168e-08, + "logits/chosen": -2.7753801345825195, + "logits/rejected": -2.775832414627075, + "logps/chosen": -292.93609619140625, + "logps/rejected": -292.79754638671875, + "loss": 0.5907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2579854130744934, + "rewards/margins": 0.3091490864753723, + "rewards/rejected": -0.5671344995498657, + "step": 2890 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 7.0608439445495605, + "learning_rate": 8.343160693325355e-08, + "logits/chosen": -2.7492966651916504, + "logits/rejected": -2.7410671710968018, + "logps/chosen": -293.8484802246094, + "logps/rejected": -324.77001953125, + "loss": 0.5837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22494366765022278, + "rewards/margins": 0.3548448979854584, + "rewards/rejected": -0.5797885656356812, + "step": 2900 + }, + { + "epoch": 0.7589636220884585, + "eval_logits/chosen": -2.783421277999878, + "eval_logits/rejected": -2.7619221210479736, + "eval_logps/chosen": -308.137939453125, + "eval_logps/rejected": -318.6510925292969, + "eval_loss": 0.5913165211677551, + "eval_rewards/accuracies": 0.7055000066757202, + "eval_rewards/chosen": -0.2540031671524048, + "eval_rewards/margins": 0.3181284964084625, + "eval_rewards/rejected": -0.5721316933631897, + "eval_runtime": 692.0731, + "eval_samples_per_second": 2.89, + "eval_steps_per_second": 0.361, + "step": 2900 + }, + { + "epoch": 0.7615807380266946, + "grad_norm": 7.802112579345703, + "learning_rate": 8.173504435093173e-08, + "logits/chosen": -2.7537245750427246, + "logits/rejected": -2.726355791091919, + "logps/chosen": -290.5617980957031, + "logps/rejected": -287.50799560546875, + "loss": 0.5806, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2531769275665283, + "rewards/margins": 0.35345658659935, + "rewards/rejected": -0.6066334843635559, + "step": 2910 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 9.018595695495605, + "learning_rate": 8.005253184398359e-08, + "logits/chosen": -2.7553019523620605, + "logits/rejected": -2.745943546295166, + "logps/chosen": -320.03265380859375, + "logps/rejected": -340.8626403808594, + "loss": 0.6027, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.24576649069786072, + "rewards/margins": 0.28911441564559937, + "rewards/rejected": -0.5348808765411377, + "step": 2920 + }, + { + "epoch": 0.7668149699031667, + "grad_norm": 6.111194133758545, + "learning_rate": 7.838420990171926e-08, + "logits/chosen": -2.789515972137451, + "logits/rejected": -2.7570273876190186, + "logps/chosen": -310.61224365234375, + "logps/rejected": -312.87152099609375, + "loss": 0.5865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.22255787253379822, + "rewards/margins": 0.31383711099624634, + "rewards/rejected": -0.5363950133323669, + "step": 2930 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 5.815800666809082, + "learning_rate": 7.673021782854083e-08, + "logits/chosen": -2.69783091545105, + "logits/rejected": -2.6870310306549072, + "logps/chosen": -311.68963623046875, + "logps/rejected": -288.39215087890625, + "loss": 0.5979, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2525468170642853, + "rewards/margins": 0.31668832898139954, + "rewards/rejected": -0.5692351460456848, + "step": 2940 + }, + { + "epoch": 0.7720492017796389, + "grad_norm": 10.589014053344727, + "learning_rate": 7.509069373231039e-08, + "logits/chosen": -2.742522716522217, + "logits/rejected": -2.7218940258026123, + "logps/chosen": -293.1689453125, + "logps/rejected": -302.7828369140625, + "loss": 0.6006, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24362894892692566, + "rewards/margins": 0.29250627756118774, + "rewards/rejected": -0.536135196685791, + "step": 2950 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 8.408040046691895, + "learning_rate": 7.346577451281821e-08, + "logits/chosen": -2.7488350868225098, + "logits/rejected": -2.7583699226379395, + "logps/chosen": -308.5254821777344, + "logps/rejected": -321.6301574707031, + "loss": 0.578, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.26960092782974243, + "rewards/margins": 0.3470562696456909, + "rewards/rejected": -0.6166571378707886, + "step": 2960 + }, + { + "epoch": 0.777283433656111, + "grad_norm": 7.626022815704346, + "learning_rate": 7.185559585035136e-08, + "logits/chosen": -2.7650535106658936, + "logits/rejected": -2.736623764038086, + "logps/chosen": -327.43792724609375, + "logps/rejected": -349.74005126953125, + "loss": 0.5695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28370755910873413, + "rewards/margins": 0.38453495502471924, + "rewards/rejected": -0.6682425737380981, + "step": 2970 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 8.664432525634766, + "learning_rate": 7.026029219436502e-08, + "logits/chosen": -2.7403178215026855, + "logits/rejected": -2.726973533630371, + "logps/chosen": -296.88629150390625, + "logps/rejected": -320.1584167480469, + "loss": 0.5807, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2508087158203125, + "rewards/margins": 0.3491096496582031, + "rewards/rejected": -0.5999183058738708, + "step": 2980 + }, + { + "epoch": 0.7825176655325831, + "grad_norm": 7.381548881530762, + "learning_rate": 6.867999675225522e-08, + "logits/chosen": -2.7898964881896973, + "logits/rejected": -2.765493392944336, + "logps/chosen": -269.5013427734375, + "logps/rejected": -287.95318603515625, + "loss": 0.577, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.22013553977012634, + "rewards/margins": 0.3466190695762634, + "rewards/rejected": -0.5667546391487122, + "step": 2990 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 8.886544227600098, + "learning_rate": 6.711484147823662e-08, + "logits/chosen": -2.7362468242645264, + "logits/rejected": -2.7374088764190674, + "logps/chosen": -273.03204345703125, + "logps/rejected": -309.46832275390625, + "loss": 0.5858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.22477373480796814, + "rewards/margins": 0.3105041980743408, + "rewards/rejected": -0.5352779626846313, + "step": 3000 + }, + { + "epoch": 0.7851347814708192, + "eval_logits/chosen": -2.781898021697998, + "eval_logits/rejected": -2.7604949474334717, + "eval_logps/chosen": -308.9897766113281, + "eval_logps/rejected": -319.7853088378906, + "eval_loss": 0.5910181999206543, + "eval_rewards/accuracies": 0.7055000066757202, + "eval_rewards/chosen": -0.2625214755535126, + "eval_rewards/margins": 0.32095208764076233, + "eval_rewards/rejected": -0.5834735035896301, + "eval_runtime": 691.7146, + "eval_samples_per_second": 2.891, + "eval_steps_per_second": 0.361, + "step": 3000 + }, + { + "epoch": 0.7877518974090553, + "grad_norm": 12.21480655670166, + "learning_rate": 6.556495706232412e-08, + "logits/chosen": -2.7469980716705322, + "logits/rejected": -2.7527496814727783, + "logps/chosen": -316.41766357421875, + "logps/rejected": -328.52532958984375, + "loss": 0.5886, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.291492760181427, + "rewards/margins": 0.32380086183547974, + "rewards/rejected": -0.6152936816215515, + "step": 3010 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 8.182783126831055, + "learning_rate": 6.403047291942057e-08, + "logits/chosen": -2.722087860107422, + "logits/rejected": -2.6903903484344482, + "logps/chosen": -275.5090637207031, + "logps/rejected": -277.62420654296875, + "loss": 0.5972, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3089084327220917, + "rewards/margins": 0.29682403802871704, + "rewards/rejected": -0.6057325005531311, + "step": 3020 + }, + { + "epoch": 0.7929861292855274, + "grad_norm": 8.147031784057617, + "learning_rate": 6.251151717851021e-08, + "logits/chosen": -2.743332624435425, + "logits/rejected": -2.7332491874694824, + "logps/chosen": -280.6979064941406, + "logps/rejected": -292.1900329589844, + "loss": 0.6154, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29725611209869385, + "rewards/margins": 0.27910858392715454, + "rewards/rejected": -0.5763646960258484, + "step": 3030 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 10.667434692382812, + "learning_rate": 6.100821667196041e-08, + "logits/chosen": -2.8258140087127686, + "logits/rejected": -2.772840976715088, + "logps/chosen": -316.3697204589844, + "logps/rejected": -283.46575927734375, + "loss": 0.5777, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2557021975517273, + "rewards/margins": 0.3566380739212036, + "rewards/rejected": -0.6123403310775757, + "step": 3040 + }, + { + "epoch": 0.7982203611619995, + "grad_norm": 11.156988143920898, + "learning_rate": 5.952069692493061e-08, + "logits/chosen": -2.7050204277038574, + "logits/rejected": -2.7095789909362793, + "logps/chosen": -266.9496154785156, + "logps/rejected": -308.8603515625, + "loss": 0.5668, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21333126723766327, + "rewards/margins": 0.3779350519180298, + "rewards/rejected": -0.5912663340568542, + "step": 3050 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 17.065628051757812, + "learning_rate": 5.8049082144891794e-08, + "logits/chosen": -2.702791452407837, + "logits/rejected": -2.6872074604034424, + "logps/chosen": -304.93463134765625, + "logps/rejected": -380.0108642578125, + "loss": 0.5933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24746175110340118, + "rewards/margins": 0.32271090149879456, + "rewards/rejected": -0.5701726675033569, + "step": 3060 + }, + { + "epoch": 0.8034545930384716, + "grad_norm": 5.375977516174316, + "learning_rate": 5.659349521125459e-08, + "logits/chosen": -2.828627109527588, + "logits/rejected": -2.8292970657348633, + "logps/chosen": -323.8910827636719, + "logps/rejected": -331.82403564453125, + "loss": 0.5963, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25231170654296875, + "rewards/margins": 0.3079237937927246, + "rewards/rejected": -0.5602355003356934, + "step": 3070 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 10.280311584472656, + "learning_rate": 5.5154057665109e-08, + "logits/chosen": -2.772552490234375, + "logits/rejected": -2.7637112140655518, + "logps/chosen": -304.2619934082031, + "logps/rejected": -313.9085998535156, + "loss": 0.5688, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.28161460161209106, + "rewards/margins": 0.3681698441505432, + "rewards/rejected": -0.6497844457626343, + "step": 3080 + }, + { + "epoch": 0.8086888249149438, + "grad_norm": 5.905206203460693, + "learning_rate": 5.3730889699075853e-08, + "logits/chosen": -2.790621280670166, + "logits/rejected": -2.764768123626709, + "logps/chosen": -320.5517272949219, + "logps/rejected": -295.2154541015625, + "loss": 0.5839, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23616118729114532, + "rewards/margins": 0.32327955961227417, + "rewards/rejected": -0.5594406723976135, + "step": 3090 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 5.722733974456787, + "learning_rate": 5.2324110147270893e-08, + "logits/chosen": -2.766014814376831, + "logits/rejected": -2.758927583694458, + "logps/chosen": -317.6996154785156, + "logps/rejected": -342.97039794921875, + "loss": 0.5685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17896804213523865, + "rewards/margins": 0.3621399402618408, + "rewards/rejected": -0.5411080121994019, + "step": 3100 + }, + { + "epoch": 0.8113059408531798, + "eval_logits/chosen": -2.7776589393615723, + "eval_logits/rejected": -2.7558252811431885, + "eval_logps/chosen": -306.57073974609375, + "eval_logps/rejected": -317.1507263183594, + "eval_loss": 0.5914422869682312, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.23833158612251282, + "eval_rewards/margins": 0.3187963366508484, + "eval_rewards/rejected": -0.5571279525756836, + "eval_runtime": 692.3976, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 3100 + }, + { + "epoch": 0.8139230567914159, + "grad_norm": 5.692158222198486, + "learning_rate": 5.0933836475381795e-08, + "logits/chosen": -2.773538827896118, + "logits/rejected": -2.743774175643921, + "logps/chosen": -323.03564453125, + "logps/rejected": -339.22576904296875, + "loss": 0.5839, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.20304706692695618, + "rewards/margins": 0.33373111486434937, + "rewards/rejected": -0.5367781519889832, + "step": 3110 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 6.522732734680176, + "learning_rate": 4.956018477086005e-08, + "logits/chosen": -2.7541415691375732, + "logits/rejected": -2.7304270267486572, + "logps/chosen": -312.82550048828125, + "logps/rejected": -319.4942626953125, + "loss": 0.5787, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2153932750225067, + "rewards/margins": 0.3583284020423889, + "rewards/rejected": -0.5737215876579285, + "step": 3120 + }, + { + "epoch": 0.819157288667888, + "grad_norm": 12.873359680175781, + "learning_rate": 4.820326973322763e-08, + "logits/chosen": -2.7611987590789795, + "logits/rejected": -2.7416489124298096, + "logps/chosen": -294.5945129394531, + "logps/rejected": -322.9219055175781, + "loss": 0.5902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.26755794882774353, + "rewards/margins": 0.30830827355384827, + "rewards/rejected": -0.5758662223815918, + "step": 3130 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 6.0704731941223145, + "learning_rate": 4.686320466449981e-08, + "logits/chosen": -2.765129566192627, + "logits/rejected": -2.712188482284546, + "logps/chosen": -279.4689025878906, + "logps/rejected": -308.8946533203125, + "loss": 0.5878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.21677632629871368, + "rewards/margins": 0.3269808888435364, + "rewards/rejected": -0.5437572598457336, + "step": 3140 + }, + { + "epoch": 0.8243915205443602, + "grad_norm": 9.32778549194336, + "learning_rate": 4.554010145972417e-08, + "logits/chosen": -2.8120663166046143, + "logits/rejected": -2.7678775787353516, + "logps/chosen": -308.05975341796875, + "logps/rejected": -326.4994812011719, + "loss": 0.6037, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.28121477365493774, + "rewards/margins": 0.3111681342124939, + "rewards/rejected": -0.5923829078674316, + "step": 3150 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 6.878976345062256, + "learning_rate": 4.423407059763745e-08, + "logits/chosen": -2.769566535949707, + "logits/rejected": -2.754739999771118, + "logps/chosen": -313.4940490722656, + "logps/rejected": -338.7357482910156, + "loss": 0.5795, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.22392907738685608, + "rewards/margins": 0.3541107773780823, + "rewards/rejected": -0.578039824962616, + "step": 3160 + }, + { + "epoch": 0.8296257524208323, + "grad_norm": 8.941882133483887, + "learning_rate": 4.294522113144078e-08, + "logits/chosen": -2.7120773792266846, + "logits/rejected": -2.676596164703369, + "logps/chosen": -310.96600341796875, + "logps/rejected": -309.7723083496094, + "loss": 0.5784, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.23985318839550018, + "rewards/margins": 0.3455398380756378, + "rewards/rejected": -0.5853930115699768, + "step": 3170 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 11.861396789550781, + "learning_rate": 4.1673660679693804e-08, + "logits/chosen": -2.759885311126709, + "logits/rejected": -2.7518694400787354, + "logps/chosen": -264.2064514160156, + "logps/rejected": -315.90380859375, + "loss": 0.6069, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.26855209469795227, + "rewards/margins": 0.2709905505180359, + "rewards/rejected": -0.539542555809021, + "step": 3180 + }, + { + "epoch": 0.8348599842973043, + "grad_norm": 3.688720941543579, + "learning_rate": 4.041949541732825e-08, + "logits/chosen": -2.7698843479156494, + "logits/rejected": -2.773341655731201, + "logps/chosen": -306.61480712890625, + "logps/rejected": -325.04541015625, + "loss": 0.5851, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2628583610057831, + "rewards/margins": 0.3378385603427887, + "rewards/rejected": -0.6006969213485718, + "step": 3190 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 3.2142703533172607, + "learning_rate": 3.9182830066782605e-08, + "logits/chosen": -2.7356200218200684, + "logits/rejected": -2.740725040435791, + "logps/chosen": -303.8326721191406, + "logps/rejected": -351.736083984375, + "loss": 0.5753, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.28992363810539246, + "rewards/margins": 0.3618486821651459, + "rewards/rejected": -0.6517723798751831, + "step": 3200 + }, + { + "epoch": 0.8374771002355405, + "eval_logits/chosen": -2.778296709060669, + "eval_logits/rejected": -2.7567243576049805, + "eval_logps/chosen": -308.9666442871094, + "eval_logps/rejected": -320.12237548828125, + "eval_loss": 0.5903262495994568, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -0.26229044795036316, + "eval_rewards/margins": 0.3245540261268616, + "eval_rewards/rejected": -0.5868445038795471, + "eval_runtime": 691.7572, + "eval_samples_per_second": 2.891, + "eval_steps_per_second": 0.361, + "step": 3200 + }, + { + "epoch": 0.8400942161737766, + "grad_norm": 5.404438018798828, + "learning_rate": 3.79637678892577e-08, + "logits/chosen": -2.737617015838623, + "logits/rejected": -2.7435827255249023, + "logps/chosen": -313.7263488769531, + "logps/rejected": -326.2721862792969, + "loss": 0.5958, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21620874106884003, + "rewards/margins": 0.29549044370651245, + "rewards/rejected": -0.5116991996765137, + "step": 3210 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 8.482666015625, + "learning_rate": 3.6762410676094645e-08, + "logits/chosen": -2.7493488788604736, + "logits/rejected": -2.751436233520508, + "logps/chosen": -342.2435302734375, + "logps/rejected": -334.9501953125, + "loss": 0.5649, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22917640209197998, + "rewards/margins": 0.40211135149002075, + "rewards/rejected": -0.631287693977356, + "step": 3220 + }, + { + "epoch": 0.8453284480502486, + "grad_norm": 21.451396942138672, + "learning_rate": 3.557885874027497e-08, + "logits/chosen": -2.7467381954193115, + "logits/rejected": -2.7420356273651123, + "logps/chosen": -307.3967590332031, + "logps/rejected": -319.23785400390625, + "loss": 0.626, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2908255457878113, + "rewards/margins": 0.24548819661140442, + "rewards/rejected": -0.5363136529922485, + "step": 3230 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 9.142580032348633, + "learning_rate": 3.441321090804469e-08, + "logits/chosen": -2.805671215057373, + "logits/rejected": -2.7749440670013428, + "logps/chosen": -311.969482421875, + "logps/rejected": -301.92559814453125, + "loss": 0.5872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.28862375020980835, + "rewards/margins": 0.3031871020793915, + "rewards/rejected": -0.5918108820915222, + "step": 3240 + }, + { + "epoch": 0.8505626799267207, + "grad_norm": 6.999141216278076, + "learning_rate": 3.326556451066234e-08, + "logits/chosen": -2.8003592491149902, + "logits/rejected": -2.7750496864318848, + "logps/chosen": -333.262451171875, + "logps/rejected": -342.88970947265625, + "loss": 0.5676, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.21939554810523987, + "rewards/margins": 0.38298407196998596, + "rewards/rejected": -0.602379560470581, + "step": 3250 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 8.473172187805176, + "learning_rate": 3.2136015376271946e-08, + "logits/chosen": -2.7543041706085205, + "logits/rejected": -2.7237446308135986, + "logps/chosen": -310.47503662109375, + "logps/rejected": -316.1898498535156, + "loss": 0.6202, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3483801782131195, + "rewards/margins": 0.25630325078964233, + "rewards/rejected": -0.6046834588050842, + "step": 3260 + }, + { + "epoch": 0.8557969118031928, + "grad_norm": 6.828322887420654, + "learning_rate": 3.102465782190106e-08, + "logits/chosen": -2.765094041824341, + "logits/rejected": -2.7622992992401123, + "logps/chosen": -292.77264404296875, + "logps/rejected": -306.03790283203125, + "loss": 0.6049, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26343613862991333, + "rewards/margins": 0.2959148585796356, + "rewards/rejected": -0.5593509674072266, + "step": 3270 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 7.230039119720459, + "learning_rate": 2.993158464558565e-08, + "logits/chosen": -2.752277135848999, + "logits/rejected": -2.7456305027008057, + "logps/chosen": -313.83514404296875, + "logps/rejected": -343.77923583984375, + "loss": 0.6083, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2221953421831131, + "rewards/margins": 0.2806113660335541, + "rewards/rejected": -0.5028067231178284, + "step": 3280 + }, + { + "epoch": 0.861031143679665, + "grad_norm": 3.2468912601470947, + "learning_rate": 2.8856887118621358e-08, + "logits/chosen": -2.7951433658599854, + "logits/rejected": -2.8030707836151123, + "logps/chosen": -308.23077392578125, + "logps/rejected": -336.6316223144531, + "loss": 0.6066, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3232649266719818, + "rewards/margins": 0.30740997195243835, + "rewards/rejected": -0.6306749582290649, + "step": 3290 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 6.59912109375, + "learning_rate": 2.7800654977942482e-08, + "logits/chosen": -2.7431418895721436, + "logits/rejected": -2.7131383419036865, + "logps/chosen": -301.9719543457031, + "logps/rejected": -354.3257751464844, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2676336467266083, + "rewards/margins": 0.3562368154525757, + "rewards/rejected": -0.6238704919815063, + "step": 3300 + }, + { + "epoch": 0.863648259617901, + "eval_logits/chosen": -2.7770590782165527, + "eval_logits/rejected": -2.755500555038452, + "eval_logps/chosen": -309.4716491699219, + "eval_logps/rejected": -320.77569580078125, + "eval_loss": 0.5899637341499329, + "eval_rewards/accuracies": 0.703000009059906, + "eval_rewards/chosen": -0.2673403322696686, + "eval_rewards/margins": 0.3260369896888733, + "eval_rewards/rejected": -0.5933773517608643, + "eval_runtime": 692.4414, + "eval_samples_per_second": 2.888, + "eval_steps_per_second": 0.361, + "step": 3300 + }, + { + "epoch": 0.8662653755561371, + "grad_norm": 7.842947959899902, + "learning_rate": 2.676297641862879e-08, + "logits/chosen": -2.76792049407959, + "logits/rejected": -2.7621943950653076, + "logps/chosen": -265.380859375, + "logps/rejected": -254.47140502929688, + "loss": 0.5895, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22911493480205536, + "rewards/margins": 0.32636719942092896, + "rewards/rejected": -0.5554821491241455, + "step": 3310 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 13.967310905456543, + "learning_rate": 2.5743938086541352e-08, + "logits/chosen": -2.7548770904541016, + "logits/rejected": -2.729977607727051, + "logps/chosen": -309.2705383300781, + "logps/rejected": -313.9998779296875, + "loss": 0.603, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29073604941368103, + "rewards/margins": 0.31791952252388, + "rewards/rejected": -0.6086556315422058, + "step": 3320 + }, + { + "epoch": 0.8714996074326092, + "grad_norm": 11.057051658630371, + "learning_rate": 2.474362507108757e-08, + "logits/chosen": -2.814598560333252, + "logits/rejected": -2.7810606956481934, + "logps/chosen": -317.7953186035156, + "logps/rejected": -332.5885314941406, + "loss": 0.5725, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.25249534845352173, + "rewards/margins": 0.38154152035713196, + "rewards/rejected": -0.6340368390083313, + "step": 3330 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 10.906637191772461, + "learning_rate": 2.3762120898116495e-08, + "logits/chosen": -2.774956226348877, + "logits/rejected": -2.764927625656128, + "logps/chosen": -322.2221984863281, + "logps/rejected": -341.53216552734375, + "loss": 0.6079, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3415859639644623, + "rewards/margins": 0.2912564277648926, + "rewards/rejected": -0.6328424215316772, + "step": 3340 + }, + { + "epoch": 0.8767338393090814, + "grad_norm": 6.918145656585693, + "learning_rate": 2.2799507522944044e-08, + "logits/chosen": -2.689883232116699, + "logits/rejected": -2.6739673614501953, + "logps/chosen": -313.18524169921875, + "logps/rejected": -340.9402770996094, + "loss": 0.5669, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.26544609665870667, + "rewards/margins": 0.35022976994514465, + "rewards/rejected": -0.6156758069992065, + "step": 3350 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 10.59185791015625, + "learning_rate": 2.1855865323510054e-08, + "logits/chosen": -2.7279655933380127, + "logits/rejected": -2.6860973834991455, + "logps/chosen": -320.9715576171875, + "logps/rejected": -355.20880126953125, + "loss": 0.5657, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2595919072628021, + "rewards/margins": 0.40920543670654297, + "rewards/rejected": -0.6687973141670227, + "step": 3360 + }, + { + "epoch": 0.8819680711855535, + "grad_norm": 6.1795830726623535, + "learning_rate": 2.0931273093666573e-08, + "logits/chosen": -2.728386878967285, + "logits/rejected": -2.7089622020721436, + "logps/chosen": -283.88409423828125, + "logps/rejected": -303.3033142089844, + "loss": 0.5462, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.2573816776275635, + "rewards/margins": 0.4072348475456238, + "rewards/rejected": -0.6646164655685425, + "step": 3370 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 6.445786476135254, + "learning_rate": 2.002580803659873e-08, + "logits/chosen": -2.747699022293091, + "logits/rejected": -2.7049365043640137, + "logps/chosen": -303.89813232421875, + "logps/rejected": -318.79693603515625, + "loss": 0.617, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3385586738586426, + "rewards/margins": 0.2693432867527008, + "rewards/rejected": -0.607901930809021, + "step": 3380 + }, + { + "epoch": 0.8872023030620256, + "grad_norm": 9.493855476379395, + "learning_rate": 1.9139545758378256e-08, + "logits/chosen": -2.770669460296631, + "logits/rejected": -2.722433090209961, + "logps/chosen": -311.3063659667969, + "logps/rejected": -296.7181701660156, + "loss": 0.5721, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2407282292842865, + "rewards/margins": 0.3538174629211426, + "rewards/rejected": -0.5945457220077515, + "step": 3390 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 8.795994758605957, + "learning_rate": 1.8272560261650277e-08, + "logits/chosen": -2.782130479812622, + "logits/rejected": -2.757819652557373, + "logps/chosen": -354.10919189453125, + "logps/rejected": -333.00250244140625, + "loss": 0.5608, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.22043180465698242, + "rewards/margins": 0.3896182179450989, + "rewards/rejected": -0.6100499629974365, + "step": 3400 + }, + { + "epoch": 0.8898194190002617, + "eval_logits/chosen": -2.773853302001953, + "eval_logits/rejected": -2.7519986629486084, + "eval_logps/chosen": -309.8929748535156, + "eval_logps/rejected": -321.31964111328125, + "eval_loss": 0.5895980000495911, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -0.27155351638793945, + "eval_rewards/margins": 0.3272639214992523, + "eval_rewards/rejected": -0.5988174676895142, + "eval_runtime": 692.3174, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 3400 + }, + { + "epoch": 0.8924365349384977, + "grad_norm": 7.369442462921143, + "learning_rate": 1.742492393945427e-08, + "logits/chosen": -2.7513797283172607, + "logits/rejected": -2.710066318511963, + "logps/chosen": -323.8204650878906, + "logps/rejected": -317.6787109375, + "loss": 0.568, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.2688294053077698, + "rewards/margins": 0.36988669633865356, + "rewards/rejected": -0.6387161016464233, + "step": 3410 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 7.45905876159668, + "learning_rate": 1.6596707569179302e-08, + "logits/chosen": -2.791177749633789, + "logits/rejected": -2.7743191719055176, + "logps/chosen": -325.4018249511719, + "logps/rejected": -326.23291015625, + "loss": 0.5784, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28556251525878906, + "rewards/margins": 0.3457964062690735, + "rewards/rejected": -0.6313589215278625, + "step": 3420 + }, + { + "epoch": 0.8976707668149699, + "grad_norm": 6.628225326538086, + "learning_rate": 1.5787980306653848e-08, + "logits/chosen": -2.75858736038208, + "logits/rejected": -2.7154600620269775, + "logps/chosen": -316.15985107421875, + "logps/rejected": -336.3743896484375, + "loss": 0.5708, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24860498309135437, + "rewards/margins": 0.3617299795150757, + "rewards/rejected": -0.6103349924087524, + "step": 3430 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 10.542095184326172, + "learning_rate": 1.499880968037165e-08, + "logits/chosen": -2.752002477645874, + "logits/rejected": -2.733220100402832, + "logps/chosen": -292.7621765136719, + "logps/rejected": -285.80218505859375, + "loss": 0.5813, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22942595183849335, + "rewards/margins": 0.32194358110427856, + "rewards/rejected": -0.5513694882392883, + "step": 3440 + }, + { + "epoch": 0.902904998691442, + "grad_norm": 5.9859395027160645, + "learning_rate": 1.4229261585852803e-08, + "logits/chosen": -2.77447772026062, + "logits/rejected": -2.7663679122924805, + "logps/chosen": -305.6563415527344, + "logps/rejected": -314.01043701171875, + "loss": 0.5806, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.22854971885681152, + "rewards/margins": 0.3463636040687561, + "rewards/rejected": -0.5749133825302124, + "step": 3450 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 9.172728538513184, + "learning_rate": 1.3479400280141883e-08, + "logits/chosen": -2.74762225151062, + "logits/rejected": -2.7340774536132812, + "logps/chosen": -290.8319396972656, + "logps/rejected": -326.6239929199219, + "loss": 0.5852, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2622153162956238, + "rewards/margins": 0.35056665539741516, + "rewards/rejected": -0.6127818822860718, + "step": 3460 + }, + { + "epoch": 0.9081392305679141, + "grad_norm": 8.79883098602295, + "learning_rate": 1.2749288376442042e-08, + "logits/chosen": -2.7586569786071777, + "logits/rejected": -2.730827569961548, + "logps/chosen": -337.0930480957031, + "logps/rejected": -317.09912109375, + "loss": 0.5455, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.19493858516216278, + "rewards/margins": 0.4252621531486511, + "rewards/rejected": -0.6202007532119751, + "step": 3470 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 11.71596622467041, + "learning_rate": 1.2038986838887127e-08, + "logits/chosen": -2.792734384536743, + "logits/rejected": -2.77490234375, + "logps/chosen": -288.8994445800781, + "logps/rejected": -313.22430419921875, + "loss": 0.6242, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.30519285798072815, + "rewards/margins": 0.2674819231033325, + "rewards/rejected": -0.5726747512817383, + "step": 3480 + }, + { + "epoch": 0.9133734624443863, + "grad_norm": 6.5518951416015625, + "learning_rate": 1.1348554977451131e-08, + "logits/chosen": -2.805830478668213, + "logits/rejected": -2.7894272804260254, + "logps/chosen": -327.4478759765625, + "logps/rejected": -324.9560546875, + "loss": 0.582, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2583698630332947, + "rewards/margins": 0.3414859175682068, + "rewards/rejected": -0.5998557806015015, + "step": 3490 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 5.205156326293945, + "learning_rate": 1.06780504429958e-08, + "logits/chosen": -2.7797505855560303, + "logits/rejected": -2.7590694427490234, + "logps/chosen": -325.8748779296875, + "logps/rejected": -310.8509521484375, + "loss": 0.6008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27413299679756165, + "rewards/margins": 0.3151172399520874, + "rewards/rejected": -0.5892502665519714, + "step": 3500 + }, + { + "epoch": 0.9159905783826223, + "eval_logits/chosen": -2.775543689727783, + "eval_logits/rejected": -2.7539024353027344, + "eval_logps/chosen": -309.8999938964844, + "eval_logps/rejected": -321.37445068359375, + "eval_loss": 0.5894958972930908, + "eval_rewards/accuracies": 0.703499972820282, + "eval_rewards/chosen": -0.2716234028339386, + "eval_rewards/margins": 0.3277418315410614, + "eval_rewards/rejected": -0.599365234375, + "eval_runtime": 692.3998, + "eval_samples_per_second": 2.889, + "eval_steps_per_second": 0.361, + "step": 3500 + }, + { + "epoch": 0.9186076943208584, + "grad_norm": 6.786498069763184, + "learning_rate": 1.0027529222456754e-08, + "logits/chosen": -2.7301533222198486, + "logits/rejected": -2.702810764312744, + "logps/chosen": -296.23834228515625, + "logps/rejected": -315.268310546875, + "loss": 0.5539, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2563706934452057, + "rewards/margins": 0.3883191645145416, + "rewards/rejected": -0.6446898579597473, + "step": 3510 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 9.892511367797852, + "learning_rate": 9.397045634168766e-09, + "logits/chosen": -2.8002243041992188, + "logits/rejected": -2.7856602668762207, + "logps/chosen": -308.3498229980469, + "logps/rejected": -351.95831298828125, + "loss": 0.57, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.21786804497241974, + "rewards/margins": 0.41681188344955444, + "rewards/rejected": -0.634679913520813, + "step": 3520 + }, + { + "epoch": 0.9238419261973305, + "grad_norm": 12.571949005126953, + "learning_rate": 8.78665232332998e-09, + "logits/chosen": -2.724975347518921, + "logits/rejected": -2.708922863006592, + "logps/chosen": -277.4271545410156, + "logps/rejected": -300.2417297363281, + "loss": 0.6055, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3028232455253601, + "rewards/margins": 0.2615777850151062, + "rewards/rejected": -0.5644010305404663, + "step": 3530 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 7.908664703369141, + "learning_rate": 8.196400257606206e-09, + "logits/chosen": -2.772461414337158, + "logits/rejected": -2.7343640327453613, + "logps/chosen": -328.0716247558594, + "logps/rejected": -358.15655517578125, + "loss": 0.577, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2738083302974701, + "rewards/margins": 0.3519710600376129, + "rewards/rejected": -0.625779390335083, + "step": 3540 + }, + { + "epoch": 0.9290761580738026, + "grad_norm": 5.722252368927002, + "learning_rate": 7.626338722875075e-09, + "logits/chosen": -2.7591617107391357, + "logits/rejected": -2.780594825744629, + "logps/chosen": -298.6004943847656, + "logps/rejected": -326.13287353515625, + "loss": 0.5986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2562271058559418, + "rewards/margins": 0.3074356019496918, + "rewards/rejected": -0.5636627078056335, + "step": 3550 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 8.03117847442627, + "learning_rate": 7.0765153191106875e-09, + "logits/chosen": -2.781140089035034, + "logits/rejected": -2.7692975997924805, + "logps/chosen": -295.3600158691406, + "logps/rejected": -291.2763366699219, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2436678409576416, + "rewards/margins": 0.39614516496658325, + "rewards/rejected": -0.6398130655288696, + "step": 3560 + }, + { + "epoch": 0.9343103899502748, + "grad_norm": 7.668455600738525, + "learning_rate": 6.54697595640899e-09, + "logits/chosen": -2.7558670043945312, + "logits/rejected": -2.7410783767700195, + "logps/chosen": -333.0140075683594, + "logps/rejected": -347.9772033691406, + "loss": 0.5718, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.23105120658874512, + "rewards/margins": 0.38840624690055847, + "rewards/rejected": -0.6194573640823364, + "step": 3570 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 7.808078765869141, + "learning_rate": 6.037764851154425e-09, + "logits/chosen": -2.7314181327819824, + "logits/rejected": -2.7231030464172363, + "logps/chosen": -305.7143249511719, + "logps/rejected": -345.88983154296875, + "loss": 0.5699, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.22997505962848663, + "rewards/margins": 0.37085598707199097, + "rewards/rejected": -0.6008309721946716, + "step": 3580 + }, + { + "epoch": 0.9395446218267469, + "grad_norm": 9.760852813720703, + "learning_rate": 5.548924522327747e-09, + "logits/chosen": -2.7540392875671387, + "logits/rejected": -2.7462592124938965, + "logps/chosen": -308.9768981933594, + "logps/rejected": -327.16802978515625, + "loss": 0.5826, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.299643337726593, + "rewards/margins": 0.3448326587677002, + "rewards/rejected": -0.6444759368896484, + "step": 3590 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 11.123191833496094, + "learning_rate": 5.080495787955691e-09, + "logits/chosen": -2.734261989593506, + "logits/rejected": -2.717097043991089, + "logps/chosen": -269.73223876953125, + "logps/rejected": -300.8177490234375, + "loss": 0.585, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.254092276096344, + "rewards/margins": 0.306030809879303, + "rewards/rejected": -0.560123085975647, + "step": 3600 + }, + { + "epoch": 0.942161737764983, + "eval_logits/chosen": -2.776420831680298, + "eval_logits/rejected": -2.7549078464508057, + "eval_logps/chosen": -309.95306396484375, + "eval_logps/rejected": -321.4418029785156, + "eval_loss": 0.5895029306411743, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": -0.27215421199798584, + "eval_rewards/margins": 0.3278846740722656, + "eval_rewards/rejected": -0.6000389456748962, + "eval_runtime": 692.4927, + "eval_samples_per_second": 2.888, + "eval_steps_per_second": 0.361, + "step": 3600 + }, + { + "epoch": 0.944778853703219, + "grad_norm": 7.403170585632324, + "learning_rate": 4.632517761702814e-09, + "logits/chosen": -2.7008776664733887, + "logits/rejected": -2.6773476600646973, + "logps/chosen": -289.5223083496094, + "logps/rejected": -309.5367431640625, + "loss": 0.5795, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.308106005191803, + "rewards/margins": 0.33852237462997437, + "rewards/rejected": -0.6466284394264221, + "step": 3610 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 9.613285064697266, + "learning_rate": 4.205027849605358e-09, + "logits/chosen": -2.738858699798584, + "logits/rejected": -2.726569414138794, + "logps/chosen": -294.84014892578125, + "logps/rejected": -290.58770751953125, + "loss": 0.5959, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.292969286441803, + "rewards/margins": 0.3049730956554413, + "rewards/rejected": -0.5979424715042114, + "step": 3620 + }, + { + "epoch": 0.9500130855796912, + "grad_norm": 4.820310115814209, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -2.785492420196533, + "logits/rejected": -2.767252206802368, + "logps/chosen": -311.9582214355469, + "logps/rejected": -305.7359924316406, + "loss": 0.5893, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.31197255849838257, + "rewards/margins": 0.3228316307067871, + "rewards/rejected": -0.6348041296005249, + "step": 3630 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 5.795242786407471, + "learning_rate": 3.411653435283157e-09, + "logits/chosen": -2.7570109367370605, + "logits/rejected": -2.7326931953430176, + "logps/chosen": -313.0288391113281, + "logps/rejected": -286.85894775390625, + "loss": 0.5868, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.23662319779396057, + "rewards/margins": 0.32369619607925415, + "rewards/rejected": -0.5603194236755371, + "step": 3640 + }, + { + "epoch": 0.9552473174561633, + "grad_norm": 8.141414642333984, + "learning_rate": 3.0458351795936698e-09, + "logits/chosen": -2.800523281097412, + "logits/rejected": -2.7803540229797363, + "logps/chosen": -287.27178955078125, + "logps/rejected": -296.94482421875, + "loss": 0.5557, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.20535226166248322, + "rewards/margins": 0.4138811230659485, + "rewards/rejected": -0.6192333102226257, + "step": 3650 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 10.963499069213867, + "learning_rate": 2.700637525598598e-09, + "logits/chosen": -2.7325665950775146, + "logits/rejected": -2.742112636566162, + "logps/chosen": -318.7773742675781, + "logps/rejected": -340.5607604980469, + "loss": 0.6213, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2901912331581116, + "rewards/margins": 0.23596885800361633, + "rewards/rejected": -0.5261600613594055, + "step": 3660 + }, + { + "epoch": 0.9604815493326354, + "grad_norm": 5.604915618896484, + "learning_rate": 2.3760892972027324e-09, + "logits/chosen": -2.8125240802764893, + "logits/rejected": -2.794743061065674, + "logps/chosen": -320.9376525878906, + "logps/rejected": -314.6265869140625, + "loss": 0.6086, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3326115012168884, + "rewards/margins": 0.2905888855457306, + "rewards/rejected": -0.6232004165649414, + "step": 3670 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 8.076900482177734, + "learning_rate": 2.0722175940897645e-09, + "logits/chosen": -2.730006694793701, + "logits/rejected": -2.7527151107788086, + "logps/chosen": -304.4130554199219, + "logps/rejected": -333.45281982421875, + "loss": 0.5561, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2793710231781006, + "rewards/margins": 0.40125495195388794, + "rewards/rejected": -0.6806259751319885, + "step": 3680 + }, + { + "epoch": 0.9657157812091076, + "grad_norm": 5.261369705200195, + "learning_rate": 1.7890477894593748e-09, + "logits/chosen": -2.7596428394317627, + "logits/rejected": -2.73931622505188, + "logps/chosen": -363.08984375, + "logps/rejected": -348.8448486328125, + "loss": 0.5621, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23692288994789124, + "rewards/margins": 0.40229707956314087, + "rewards/rejected": -0.6392199993133545, + "step": 3690 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 7.541417598724365, + "learning_rate": 1.5266035279088708e-09, + "logits/chosen": -2.6856465339660645, + "logits/rejected": -2.6826679706573486, + "logps/chosen": -347.5863952636719, + "logps/rejected": -356.30120849609375, + "loss": 0.567, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.2766670286655426, + "rewards/margins": 0.367573082447052, + "rewards/rejected": -0.6442400813102722, + "step": 3700 + }, + { + "epoch": 0.9683328971473436, + "eval_logits/chosen": -2.7754881381988525, + "eval_logits/rejected": -2.753868341445923, + "eval_logps/chosen": -310.11712646484375, + "eval_logps/rejected": -321.65545654296875, + "eval_loss": 0.5893409252166748, + "eval_rewards/accuracies": 0.7014999985694885, + "eval_rewards/chosen": -0.2737952172756195, + "eval_rewards/margins": 0.32838013768196106, + "eval_rewards/rejected": -0.6021752953529358, + "eval_runtime": 692.7848, + "eval_samples_per_second": 2.887, + "eval_steps_per_second": 0.361, + "step": 3700 + }, + { + "epoch": 0.9709500130855797, + "grad_norm": 11.719736099243164, + "learning_rate": 1.2849067234584621e-09, + "logits/chosen": -2.714137315750122, + "logits/rejected": -2.7111401557922363, + "logps/chosen": -280.48919677734375, + "logps/rejected": -300.55706787109375, + "loss": 0.607, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2787315249443054, + "rewards/margins": 0.30049681663513184, + "rewards/rejected": -0.5792283415794373, + "step": 3710 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 12.492560386657715, + "learning_rate": 1.0639775577218625e-09, + "logits/chosen": -2.719714403152466, + "logits/rejected": -2.667534589767456, + "logps/chosen": -295.1371765136719, + "logps/rejected": -294.61932373046875, + "loss": 0.5762, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.27527686953544617, + "rewards/margins": 0.36209625005722046, + "rewards/rejected": -0.637373149394989, + "step": 3720 + }, + { + "epoch": 0.9761842449620518, + "grad_norm": 7.440390110015869, + "learning_rate": 8.638344782207485e-10, + "logits/chosen": -2.725163459777832, + "logits/rejected": -2.7303969860076904, + "logps/chosen": -296.50689697265625, + "logps/rejected": -305.67706298828125, + "loss": 0.5767, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.22756004333496094, + "rewards/margins": 0.3577590882778168, + "rewards/rejected": -0.5853191018104553, + "step": 3730 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 10.965612411499023, + "learning_rate": 6.844941968447149e-10, + "logits/chosen": -2.7626724243164062, + "logits/rejected": -2.7460460662841797, + "logps/chosen": -316.35015869140625, + "logps/rejected": -349.7431945800781, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2542489767074585, + "rewards/margins": 0.45952582359313965, + "rewards/rejected": -0.7137748003005981, + "step": 3740 + }, + { + "epoch": 0.9814184768385239, + "grad_norm": 5.883279323577881, + "learning_rate": 5.25971688455612e-10, + "logits/chosen": -2.7904438972473145, + "logits/rejected": -2.775864362716675, + "logps/chosen": -316.23297119140625, + "logps/rejected": -347.6502685546875, + "loss": 0.5698, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.2520793080329895, + "rewards/margins": 0.36716121435165405, + "rewards/rejected": -0.6192405223846436, + "step": 3750 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 4.377948760986328, + "learning_rate": 3.882801896372967e-10, + "logits/chosen": -2.785407543182373, + "logits/rejected": -2.785416841506958, + "logps/chosen": -311.1086120605469, + "logps/rejected": -308.876220703125, + "loss": 0.6124, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2892715036869049, + "rewards/margins": 0.29838478565216064, + "rewards/rejected": -0.5876562595367432, + "step": 3760 + }, + { + "epoch": 0.9866527087149961, + "grad_norm": 8.081770896911621, + "learning_rate": 2.714311975902661e-10, + "logits/chosen": -2.7383980751037598, + "logits/rejected": -2.710829257965088, + "logps/chosen": -330.71771240234375, + "logps/rejected": -337.7955627441406, + "loss": 0.5649, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.25471562147140503, + "rewards/margins": 0.3609776496887207, + "rewards/rejected": -0.6156932711601257, + "step": 3770 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 7.887190818786621, + "learning_rate": 1.754344691717591e-10, + "logits/chosen": -2.761021852493286, + "logits/rejected": -2.7340810298919678, + "logps/chosen": -295.04718017578125, + "logps/rejected": -336.95147705078125, + "loss": 0.6306, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.27056482434272766, + "rewards/margins": 0.2132827490568161, + "rewards/rejected": -0.4838475286960602, + "step": 3780 + }, + { + "epoch": 0.9918869405914682, + "grad_norm": 7.817293643951416, + "learning_rate": 1.0029802008096333e-10, + "logits/chosen": -2.7683863639831543, + "logits/rejected": -2.7289211750030518, + "logps/chosen": -316.55340576171875, + "logps/rejected": -334.72845458984375, + "loss": 0.5639, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.25747808814048767, + "rewards/margins": 0.4023471474647522, + "rewards/rejected": -0.6598252654075623, + "step": 3790 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 6.426971435546875, + "learning_rate": 4.602812418974533e-11, + "logits/chosen": -2.791513442993164, + "logits/rejected": -2.7664811611175537, + "logps/chosen": -328.2163391113281, + "logps/rejected": -337.54974365234375, + "loss": 0.5834, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2448674440383911, + "rewards/margins": 0.3448673486709595, + "rewards/rejected": -0.5897347927093506, + "step": 3800 + }, + { + "epoch": 0.9945040565297043, + "eval_logits/chosen": -2.7742366790771484, + "eval_logits/rejected": -2.7524607181549072, + "eval_logps/chosen": -310.13330078125, + "eval_logps/rejected": -321.6666259765625, + "eval_loss": 0.5893096923828125, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -0.2739570438861847, + "eval_rewards/margins": 0.32832974195480347, + "eval_rewards/rejected": -0.6022867560386658, + "eval_runtime": 692.7928, + "eval_samples_per_second": 2.887, + "eval_steps_per_second": 0.361, + "step": 3800 + }, + { + "epoch": 0.9971211724679403, + "grad_norm": 7.6028289794921875, + "learning_rate": 1.2629313018819309e-11, + "logits/chosen": -2.7530319690704346, + "logits/rejected": -2.7311320304870605, + "logps/chosen": -300.90142822265625, + "logps/rejected": -311.88006591796875, + "loss": 0.5936, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.2676599621772766, + "rewards/margins": 0.3127606511116028, + "rewards/rejected": -0.5804205536842346, + "step": 3810 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 10.209754943847656, + "learning_rate": 1.0437535929996855e-13, + "logits/chosen": -2.765655279159546, + "logits/rejected": -2.7465381622314453, + "logps/chosen": -334.4398498535156, + "logps/rejected": -327.4457702636719, + "loss": 0.5626, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.2764059007167816, + "rewards/margins": 0.397102028131485, + "rewards/rejected": -0.6735079288482666, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6164219083351729, + "train_runtime": 73481.1174, + "train_samples_per_second": 0.832, + "train_steps_per_second": 0.052 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}