martimfasantos's picture
Model save
c504e52 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9964868029907215,
"eval_steps": 800,
"global_step": 2079,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014413115935501305,
"grad_norm": 15.202939063397405,
"learning_rate": 4.807692307692308e-10,
"logits/chosen": -2.3378124237060547,
"logits/rejected": -2.341672897338867,
"logps/chosen": -1.0059865713119507,
"logps/rejected": -1.105405569076538,
"loss": 1.6556,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.0119731426239014,
"rewards/margins": 0.19883811473846436,
"rewards/rejected": -2.210811138153076,
"step": 1
},
{
"epoch": 0.014413115935501306,
"grad_norm": 18.061978045212722,
"learning_rate": 4.807692307692308e-09,
"logits/chosen": -2.356367826461792,
"logits/rejected": -2.3451521396636963,
"logps/chosen": -1.0228126049041748,
"logps/rejected": -1.1430484056472778,
"loss": 1.6323,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": -2.0456252098083496,
"rewards/margins": 0.24047136306762695,
"rewards/rejected": -2.2860968112945557,
"step": 10
},
{
"epoch": 0.02882623187100261,
"grad_norm": 17.723319596995733,
"learning_rate": 9.615384615384615e-09,
"logits/chosen": -2.3264236450195312,
"logits/rejected": -2.321986198425293,
"logps/chosen": -1.0446507930755615,
"logps/rejected": -1.1442738771438599,
"loss": 1.6729,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.089301586151123,
"rewards/margins": 0.19924603402614594,
"rewards/rejected": -2.2885477542877197,
"step": 20
},
{
"epoch": 0.04323934780650392,
"grad_norm": 17.07010517991476,
"learning_rate": 1.442307692307692e-08,
"logits/chosen": -2.3456313610076904,
"logits/rejected": -2.3424785137176514,
"logps/chosen": -1.0158333778381348,
"logps/rejected": -1.076974630355835,
"loss": 1.7109,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0316667556762695,
"rewards/margins": 0.12228262424468994,
"rewards/rejected": -2.15394926071167,
"step": 30
},
{
"epoch": 0.05765246374200522,
"grad_norm": 19.711953891202494,
"learning_rate": 1.923076923076923e-08,
"logits/chosen": -2.383465528488159,
"logits/rejected": -2.3750338554382324,
"logps/chosen": -1.1377735137939453,
"logps/rejected": -1.221296787261963,
"loss": 1.6828,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2755470275878906,
"rewards/margins": 0.167046457529068,
"rewards/rejected": -2.442593574523926,
"step": 40
},
{
"epoch": 0.07206557967750653,
"grad_norm": 15.368731865288492,
"learning_rate": 2.403846153846154e-08,
"logits/chosen": -2.3631155490875244,
"logits/rejected": -2.362963914871216,
"logps/chosen": -1.0241036415100098,
"logps/rejected": -1.1317743062973022,
"loss": 1.6525,
"rewards/accuracies": 0.546875,
"rewards/chosen": -2.0482072830200195,
"rewards/margins": 0.2153414785861969,
"rewards/rejected": -2.2635486125946045,
"step": 50
},
{
"epoch": 0.08647869561300783,
"grad_norm": 15.486802435760401,
"learning_rate": 2.884615384615384e-08,
"logits/chosen": -2.3361105918884277,
"logits/rejected": -2.327380657196045,
"logps/chosen": -0.9968592524528503,
"logps/rejected": -1.0975861549377441,
"loss": 1.6565,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9937185049057007,
"rewards/margins": 0.2014540731906891,
"rewards/rejected": -2.1951723098754883,
"step": 60
},
{
"epoch": 0.10089181154850914,
"grad_norm": 15.988415966234422,
"learning_rate": 3.365384615384615e-08,
"logits/chosen": -2.3774499893188477,
"logits/rejected": -2.3742191791534424,
"logps/chosen": -1.028954267501831,
"logps/rejected": -1.1373963356018066,
"loss": 1.6496,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.057908535003662,
"rewards/margins": 0.21688416600227356,
"rewards/rejected": -2.2747926712036133,
"step": 70
},
{
"epoch": 0.11530492748401044,
"grad_norm": 13.627900414661896,
"learning_rate": 3.846153846153846e-08,
"logits/chosen": -2.3636672496795654,
"logits/rejected": -2.354912757873535,
"logps/chosen": -0.9835589528083801,
"logps/rejected": -1.1169239282608032,
"loss": 1.6095,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9671179056167603,
"rewards/margins": 0.2667301595211029,
"rewards/rejected": -2.2338478565216064,
"step": 80
},
{
"epoch": 0.12971804341951176,
"grad_norm": 16.99221012864124,
"learning_rate": 4.326923076923077e-08,
"logits/chosen": -2.3509373664855957,
"logits/rejected": -2.3414111137390137,
"logps/chosen": -1.0289192199707031,
"logps/rejected": -1.1351473331451416,
"loss": 1.6614,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0578384399414062,
"rewards/margins": 0.21245631575584412,
"rewards/rejected": -2.270294666290283,
"step": 90
},
{
"epoch": 0.14413115935501306,
"grad_norm": 16.26579840133319,
"learning_rate": 4.807692307692308e-08,
"logits/chosen": -2.4182028770446777,
"logits/rejected": -2.416335105895996,
"logps/chosen": -0.9977607727050781,
"logps/rejected": -1.108969807624817,
"loss": 1.637,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9955215454101562,
"rewards/margins": 0.22241799533367157,
"rewards/rejected": -2.217939615249634,
"step": 100
},
{
"epoch": 0.15854427529051437,
"grad_norm": 14.431674139311319,
"learning_rate": 5.288461538461538e-08,
"logits/chosen": -2.342700719833374,
"logits/rejected": -2.3403000831604004,
"logps/chosen": -1.0405100584030151,
"logps/rejected": -1.1639328002929688,
"loss": 1.6325,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0810201168060303,
"rewards/margins": 0.24684572219848633,
"rewards/rejected": -2.3278656005859375,
"step": 110
},
{
"epoch": 0.17295739122601567,
"grad_norm": 16.881846104086076,
"learning_rate": 5.769230769230768e-08,
"logits/chosen": -2.3760740756988525,
"logits/rejected": -2.373129367828369,
"logps/chosen": -1.0364916324615479,
"logps/rejected": -1.1324373483657837,
"loss": 1.67,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.0729832649230957,
"rewards/margins": 0.19189123809337616,
"rewards/rejected": -2.2648746967315674,
"step": 120
},
{
"epoch": 0.18737050716151699,
"grad_norm": 16.764431844922484,
"learning_rate": 6.25e-08,
"logits/chosen": -2.3209102153778076,
"logits/rejected": -2.3239667415618896,
"logps/chosen": -1.0940515995025635,
"logps/rejected": -1.1949011087417603,
"loss": 1.6633,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.188103199005127,
"rewards/margins": 0.20169904828071594,
"rewards/rejected": -2.3898022174835205,
"step": 130
},
{
"epoch": 0.20178362309701828,
"grad_norm": 17.534779544810593,
"learning_rate": 6.73076923076923e-08,
"logits/chosen": -2.3762125968933105,
"logits/rejected": -2.368044376373291,
"logps/chosen": -1.0029666423797607,
"logps/rejected": -1.1249053478240967,
"loss": 1.6237,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0059332847595215,
"rewards/margins": 0.2438771277666092,
"rewards/rejected": -2.2498106956481934,
"step": 140
},
{
"epoch": 0.2161967390325196,
"grad_norm": 15.578504627710455,
"learning_rate": 7.211538461538461e-08,
"logits/chosen": -2.3589887619018555,
"logits/rejected": -2.3546345233917236,
"logps/chosen": -1.0512168407440186,
"logps/rejected": -1.1491758823394775,
"loss": 1.6633,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.102433681488037,
"rewards/margins": 0.19591817259788513,
"rewards/rejected": -2.298351764678955,
"step": 150
},
{
"epoch": 0.2306098549680209,
"grad_norm": 13.745585175489111,
"learning_rate": 7.692307692307692e-08,
"logits/chosen": -2.338444232940674,
"logits/rejected": -2.332979679107666,
"logps/chosen": -1.0473906993865967,
"logps/rejected": -1.1564788818359375,
"loss": 1.6513,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.0947813987731934,
"rewards/margins": 0.21817633509635925,
"rewards/rejected": -2.312957763671875,
"step": 160
},
{
"epoch": 0.2450229709035222,
"grad_norm": 16.783418396767676,
"learning_rate": 8.173076923076923e-08,
"logits/chosen": -2.3806934356689453,
"logits/rejected": -2.3792760372161865,
"logps/chosen": -1.0662988424301147,
"logps/rejected": -1.1184349060058594,
"loss": 1.7353,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -2.1325976848602295,
"rewards/margins": 0.1042722687125206,
"rewards/rejected": -2.2368698120117188,
"step": 170
},
{
"epoch": 0.2594360868390235,
"grad_norm": 15.562730291374017,
"learning_rate": 8.653846153846154e-08,
"logits/chosen": -2.3370161056518555,
"logits/rejected": -2.3294992446899414,
"logps/chosen": -1.0367413759231567,
"logps/rejected": -1.1586549282073975,
"loss": 1.6251,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.0734827518463135,
"rewards/margins": 0.24382701516151428,
"rewards/rejected": -2.317309856414795,
"step": 180
},
{
"epoch": 0.2738492027745248,
"grad_norm": 14.57246304002355,
"learning_rate": 9.134615384615383e-08,
"logits/chosen": -2.355874538421631,
"logits/rejected": -2.357952833175659,
"logps/chosen": -1.0316553115844727,
"logps/rejected": -1.1332082748413086,
"loss": 1.6605,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0633106231689453,
"rewards/margins": 0.20310597121715546,
"rewards/rejected": -2.266416549682617,
"step": 190
},
{
"epoch": 0.2882623187100261,
"grad_norm": 14.510113595673776,
"learning_rate": 9.615384615384616e-08,
"logits/chosen": -2.3815228939056396,
"logits/rejected": -2.377211332321167,
"logps/chosen": -1.0085281133651733,
"logps/rejected": -1.0985215902328491,
"loss": 1.6684,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0170562267303467,
"rewards/margins": 0.17998693883419037,
"rewards/rejected": -2.1970431804656982,
"step": 200
},
{
"epoch": 0.30267543464552743,
"grad_norm": 14.940800895121608,
"learning_rate": 9.999971806320255e-08,
"logits/chosen": -2.4093306064605713,
"logits/rejected": -2.4097609519958496,
"logps/chosen": -1.0589462518692017,
"logps/rejected": -1.1346651315689087,
"loss": 1.695,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1178925037384033,
"rewards/margins": 0.15143761038780212,
"rewards/rejected": -2.2693302631378174,
"step": 210
},
{
"epoch": 0.31708855058102875,
"grad_norm": 15.468071809971288,
"learning_rate": 9.998985060913876e-08,
"logits/chosen": -2.327671527862549,
"logits/rejected": -2.3280539512634277,
"logps/chosen": -1.0390589237213135,
"logps/rejected": -1.1213579177856445,
"loss": 1.6904,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.078117847442627,
"rewards/margins": 0.1645977944135666,
"rewards/rejected": -2.242715835571289,
"step": 220
},
{
"epoch": 0.33150166651653007,
"grad_norm": 18.885553561709102,
"learning_rate": 9.996588949457546e-08,
"logits/chosen": -2.3791205883026123,
"logits/rejected": -2.3730788230895996,
"logps/chosen": -1.156124472618103,
"logps/rejected": -1.2356293201446533,
"loss": 1.6937,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.312248945236206,
"rewards/margins": 0.15900969505310059,
"rewards/rejected": -2.4712586402893066,
"step": 230
},
{
"epoch": 0.34591478245203133,
"grad_norm": 18.61654233250297,
"learning_rate": 9.992784147488017e-08,
"logits/chosen": -2.4054293632507324,
"logits/rejected": -2.3909668922424316,
"logps/chosen": -1.040718674659729,
"logps/rejected": -1.1538527011871338,
"loss": 1.6368,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.081437349319458,
"rewards/margins": 0.22626809775829315,
"rewards/rejected": -2.3077054023742676,
"step": 240
},
{
"epoch": 0.36032789838753265,
"grad_norm": 15.133106885435941,
"learning_rate": 9.987571727694775e-08,
"logits/chosen": -2.377009630203247,
"logits/rejected": -2.371063232421875,
"logps/chosen": -0.997736930847168,
"logps/rejected": -1.1200191974639893,
"loss": 1.6202,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -1.995473861694336,
"rewards/margins": 0.24456438422203064,
"rewards/rejected": -2.2400383949279785,
"step": 250
},
{
"epoch": 0.37474101432303397,
"grad_norm": 15.527267890358452,
"learning_rate": 9.98095315961762e-08,
"logits/chosen": -2.38106369972229,
"logits/rejected": -2.3770012855529785,
"logps/chosen": -1.073089838027954,
"logps/rejected": -1.1799663305282593,
"loss": 1.6494,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.146179676055908,
"rewards/margins": 0.2137528359889984,
"rewards/rejected": -2.3599326610565186,
"step": 260
},
{
"epoch": 0.3891541302585353,
"grad_norm": 16.699106788545635,
"learning_rate": 9.97293030923235e-08,
"logits/chosen": -2.3734331130981445,
"logits/rejected": -2.36216402053833,
"logps/chosen": -1.0048857927322388,
"logps/rejected": -1.0962402820587158,
"loss": 1.6741,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.0097715854644775,
"rewards/margins": 0.18270887434482574,
"rewards/rejected": -2.1924805641174316,
"step": 270
},
{
"epoch": 0.40356724619403656,
"grad_norm": 14.595168551654872,
"learning_rate": 9.963505438424693e-08,
"logits/chosen": -2.340841293334961,
"logits/rejected": -2.3415005207061768,
"logps/chosen": -1.0379191637039185,
"logps/rejected": -1.1280016899108887,
"loss": 1.6851,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -2.075838327407837,
"rewards/margins": 0.18016524612903595,
"rewards/rejected": -2.2560033798217773,
"step": 280
},
{
"epoch": 0.4179803621295379,
"grad_norm": 14.286732447718073,
"learning_rate": 9.952681204352607e-08,
"logits/chosen": -2.361560821533203,
"logits/rejected": -2.3513660430908203,
"logps/chosen": -1.0380117893218994,
"logps/rejected": -1.1370676755905151,
"loss": 1.6637,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -2.076023578643799,
"rewards/margins": 0.198111891746521,
"rewards/rejected": -2.2741353511810303,
"step": 290
},
{
"epoch": 0.4323934780650392,
"grad_norm": 17.31273729578293,
"learning_rate": 9.94046065869715e-08,
"logits/chosen": -2.377479314804077,
"logits/rejected": -2.375476360321045,
"logps/chosen": -1.0271109342575073,
"logps/rejected": -1.1700676679611206,
"loss": 1.5942,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.0542218685150146,
"rewards/margins": 0.2859136462211609,
"rewards/rejected": -2.340135335922241,
"step": 300
},
{
"epoch": 0.4468065940005405,
"grad_norm": 17.694546366405458,
"learning_rate": 9.926847246802116e-08,
"logits/chosen": -2.3561387062072754,
"logits/rejected": -2.3444766998291016,
"logps/chosen": -1.0410211086273193,
"logps/rejected": -1.1159262657165527,
"loss": 1.6942,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.0820422172546387,
"rewards/margins": 0.14981010556221008,
"rewards/rejected": -2.2318525314331055,
"step": 310
},
{
"epoch": 0.4612197099360418,
"grad_norm": 13.051339803328997,
"learning_rate": 9.911844806702691e-08,
"logits/chosen": -2.3585753440856934,
"logits/rejected": -2.360156297683716,
"logps/chosen": -1.015515923500061,
"logps/rejected": -1.1353641748428345,
"loss": 1.6286,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.031031847000122,
"rewards/margins": 0.23969626426696777,
"rewards/rejected": -2.270728349685669,
"step": 320
},
{
"epoch": 0.4756328258715431,
"grad_norm": 17.002852190341585,
"learning_rate": 9.895457568043387e-08,
"logits/chosen": -2.3824462890625,
"logits/rejected": -2.3757641315460205,
"logps/chosen": -1.059061050415039,
"logps/rejected": -1.14574134349823,
"loss": 1.6835,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.118122100830078,
"rewards/margins": 0.17336080968379974,
"rewards/rejected": -2.29148268699646,
"step": 330
},
{
"epoch": 0.4900459418070444,
"grad_norm": 16.276382330404722,
"learning_rate": 9.877690150885587e-08,
"logits/chosen": -2.324713945388794,
"logits/rejected": -2.314767599105835,
"logps/chosen": -1.0457204580307007,
"logps/rejected": -1.135799527168274,
"loss": 1.6763,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0914409160614014,
"rewards/margins": 0.18015804886817932,
"rewards/rejected": -2.271599054336548,
"step": 340
},
{
"epoch": 0.5044590577425457,
"grad_norm": 14.679321409845278,
"learning_rate": 9.858547564404998e-08,
"logits/chosen": -2.368298292160034,
"logits/rejected": -2.3589999675750732,
"logps/chosen": -1.0575425624847412,
"logps/rejected": -1.1802635192871094,
"loss": 1.6339,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.1150851249694824,
"rewards/margins": 0.24544170498847961,
"rewards/rejected": -2.3605270385742188,
"step": 350
},
{
"epoch": 0.518872173678047,
"grad_norm": 16.288849210972156,
"learning_rate": 9.838035205479418e-08,
"logits/chosen": -2.3341236114501953,
"logits/rejected": -2.328613042831421,
"logps/chosen": -0.9657120704650879,
"logps/rejected": -1.0940418243408203,
"loss": 1.6196,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9314241409301758,
"rewards/margins": 0.25665926933288574,
"rewards/rejected": -2.1880836486816406,
"step": 360
},
{
"epoch": 0.5332852896135484,
"grad_norm": 15.065053010351129,
"learning_rate": 9.816158857167196e-08,
"logits/chosen": -2.3553214073181152,
"logits/rejected": -2.3543648719787598,
"logps/chosen": -1.017580509185791,
"logps/rejected": -1.093390703201294,
"loss": 1.706,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.035161018371582,
"rewards/margins": 0.15162022411823273,
"rewards/rejected": -2.186781406402588,
"step": 370
},
{
"epoch": 0.5476984055490496,
"grad_norm": 15.268674336756646,
"learning_rate": 9.7929246870768e-08,
"logits/chosen": -2.3563642501831055,
"logits/rejected": -2.357172727584839,
"logps/chosen": -1.0474622249603271,
"logps/rejected": -1.1527016162872314,
"loss": 1.6593,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0949244499206543,
"rewards/margins": 0.2104784995317459,
"rewards/rejected": -2.305403232574463,
"step": 380
},
{
"epoch": 0.5621115214845509,
"grad_norm": 19.51913775076441,
"learning_rate": 9.768339245627993e-08,
"logits/chosen": -2.329598903656006,
"logits/rejected": -2.3325648307800293,
"logps/chosen": -1.0032579898834229,
"logps/rejected": -1.1267921924591064,
"loss": 1.6287,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.0065159797668457,
"rewards/margins": 0.24706879258155823,
"rewards/rejected": -2.253584384918213,
"step": 390
},
{
"epoch": 0.5765246374200522,
"grad_norm": 15.08719846804436,
"learning_rate": 9.742409464205059e-08,
"logits/chosen": -2.364119052886963,
"logits/rejected": -2.3581573963165283,
"logps/chosen": -1.054837942123413,
"logps/rejected": -1.1783701181411743,
"loss": 1.6358,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.109675884246826,
"rewards/margins": 0.2470642328262329,
"rewards/rejected": -2.3567402362823486,
"step": 400
},
{
"epoch": 0.5909377533555535,
"grad_norm": 16.155157647324575,
"learning_rate": 9.715142653202644e-08,
"logits/chosen": -2.347181558609009,
"logits/rejected": -2.342615842819214,
"logps/chosen": -1.017263650894165,
"logps/rejected": -1.1102826595306396,
"loss": 1.6768,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.03452730178833,
"rewards/margins": 0.18603798747062683,
"rewards/rejected": -2.2205653190612793,
"step": 410
},
{
"epoch": 0.6053508692910549,
"grad_norm": 14.897089823744135,
"learning_rate": 9.68654649996473e-08,
"logits/chosen": -2.364981174468994,
"logits/rejected": -2.3646998405456543,
"logps/chosen": -1.0181089639663696,
"logps/rejected": -1.1212923526763916,
"loss": 1.6626,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.0362179279327393,
"rewards/margins": 0.206366628408432,
"rewards/rejected": -2.242584705352783,
"step": 420
},
{
"epoch": 0.6197639852265562,
"grad_norm": 15.109629627010106,
"learning_rate": 9.656629066617335e-08,
"logits/chosen": -2.351111650466919,
"logits/rejected": -2.3459696769714355,
"logps/chosen": -1.1007968187332153,
"logps/rejected": -1.1891463994979858,
"loss": 1.6834,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.2015936374664307,
"rewards/margins": 0.17669954895973206,
"rewards/rejected": -2.3782927989959717,
"step": 430
},
{
"epoch": 0.6341771011620575,
"grad_norm": 16.01041357452403,
"learning_rate": 9.62539878779556e-08,
"logits/chosen": -2.3512957096099854,
"logits/rejected": -2.3472342491149902,
"logps/chosen": -1.0058082342147827,
"logps/rejected": -1.1037191152572632,
"loss": 1.6651,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.0116164684295654,
"rewards/margins": 0.19582167267799377,
"rewards/rejected": -2.2074382305145264,
"step": 440
},
{
"epoch": 0.6485902170975588,
"grad_norm": 18.411662730620584,
"learning_rate": 9.592864468265604e-08,
"logits/chosen": -2.3800835609436035,
"logits/rejected": -2.3797011375427246,
"logps/chosen": -1.0755730867385864,
"logps/rejected": -1.1656855344772339,
"loss": 1.6784,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.151146173477173,
"rewards/margins": 0.18022510409355164,
"rewards/rejected": -2.3313710689544678,
"step": 450
},
{
"epoch": 0.6630033330330601,
"grad_norm": 17.51219332799835,
"learning_rate": 9.559035280442441e-08,
"logits/chosen": -2.3352417945861816,
"logits/rejected": -2.3331692218780518,
"logps/chosen": -1.0036710500717163,
"logps/rejected": -1.0872585773468018,
"loss": 1.6865,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.0073421001434326,
"rewards/margins": 0.16717498004436493,
"rewards/rejected": -2.1745171546936035,
"step": 460
},
{
"epoch": 0.6774164489685613,
"grad_norm": 18.31866820732837,
"learning_rate": 9.523920761803823e-08,
"logits/chosen": -2.3979227542877197,
"logits/rejected": -2.399036407470703,
"logps/chosen": -1.0747919082641602,
"logps/rejected": -1.1746306419372559,
"loss": 1.6553,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.1495838165283203,
"rewards/margins": 0.1996772736310959,
"rewards/rejected": -2.3492612838745117,
"step": 470
},
{
"epoch": 0.6918295649040627,
"grad_norm": 16.80271538537987,
"learning_rate": 9.487530812201383e-08,
"logits/chosen": -2.35792875289917,
"logits/rejected": -2.3569393157958984,
"logps/chosen": -1.0264079570770264,
"logps/rejected": -1.1486434936523438,
"loss": 1.6324,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0528159141540527,
"rewards/margins": 0.24447116255760193,
"rewards/rejected": -2.2972869873046875,
"step": 480
},
{
"epoch": 0.706242680839564,
"grad_norm": 16.799352219592777,
"learning_rate": 9.449875691069571e-08,
"logits/chosen": -2.356339931488037,
"logits/rejected": -2.354175567626953,
"logps/chosen": -1.0335304737091064,
"logps/rejected": -1.1673954725265503,
"loss": 1.6051,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.067060947418213,
"rewards/margins": 0.26773008704185486,
"rewards/rejected": -2.3347909450531006,
"step": 490
},
{
"epoch": 0.7206557967750653,
"grad_norm": 15.404244347962265,
"learning_rate": 9.410966014533195e-08,
"logits/chosen": -2.3478922843933105,
"logits/rejected": -2.3435702323913574,
"logps/chosen": -1.053039312362671,
"logps/rejected": -1.1690478324890137,
"loss": 1.6495,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.106078624725342,
"rewards/margins": 0.23201718926429749,
"rewards/rejected": -2.3380956649780273,
"step": 500
},
{
"epoch": 0.7350689127105666,
"grad_norm": 15.81308480269748,
"learning_rate": 9.37081275241442e-08,
"logits/chosen": -2.3459486961364746,
"logits/rejected": -2.339306592941284,
"logps/chosen": -1.0136808156967163,
"logps/rejected": -1.1138548851013184,
"loss": 1.667,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0273616313934326,
"rewards/margins": 0.20034781098365784,
"rewards/rejected": -2.2277097702026367,
"step": 510
},
{
"epoch": 0.7494820286460679,
"grad_norm": 14.877032985004409,
"learning_rate": 9.329427225140042e-08,
"logits/chosen": -2.3370301723480225,
"logits/rejected": -2.3319363594055176,
"logps/chosen": -1.0117393732070923,
"logps/rejected": -1.1295689344406128,
"loss": 1.6432,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0234787464141846,
"rewards/margins": 0.23565927147865295,
"rewards/rejected": -2.2591378688812256,
"step": 520
},
{
"epoch": 0.7638951445815693,
"grad_norm": 16.317618504393014,
"learning_rate": 9.286821100549906e-08,
"logits/chosen": -2.336864471435547,
"logits/rejected": -2.329371929168701,
"logps/chosen": -0.9821737408638,
"logps/rejected": -1.1123030185699463,
"loss": 1.6226,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9643474817276,
"rewards/margins": 0.26025891304016113,
"rewards/rejected": -2.2246060371398926,
"step": 530
},
{
"epoch": 0.7783082605170706,
"grad_norm": 16.918699303271303,
"learning_rate": 9.243006390607402e-08,
"logits/chosen": -2.3681960105895996,
"logits/rejected": -2.3686928749084473,
"logps/chosen": -1.0731232166290283,
"logps/rejected": -1.2046077251434326,
"loss": 1.6286,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.1462464332580566,
"rewards/margins": 0.2629690170288086,
"rewards/rejected": -2.4092154502868652,
"step": 540
},
{
"epoch": 0.7927213764525718,
"grad_norm": 17.901668830250117,
"learning_rate": 9.197995448012912e-08,
"logits/chosen": -2.3749890327453613,
"logits/rejected": -2.368088960647583,
"logps/chosen": -1.0722578763961792,
"logps/rejected": -1.2028658390045166,
"loss": 1.6224,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.1445157527923584,
"rewards/margins": 0.26121601462364197,
"rewards/rejected": -2.405731678009033,
"step": 550
},
{
"epoch": 0.8071344923880731,
"grad_norm": 15.889671449808617,
"learning_rate": 9.151800962721217e-08,
"logits/chosen": -2.320263147354126,
"logits/rejected": -2.3110299110412598,
"logps/chosen": -1.0240787267684937,
"logps/rejected": -1.1282823085784912,
"loss": 1.6579,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.0481574535369873,
"rewards/margins": 0.20840716361999512,
"rewards/rejected": -2.2565646171569824,
"step": 560
},
{
"epoch": 0.8215476083235744,
"grad_norm": 16.160221475349292,
"learning_rate": 9.104435958363807e-08,
"logits/chosen": -2.3726491928100586,
"logits/rejected": -2.3696436882019043,
"logps/chosen": -1.0209132432937622,
"logps/rejected": -1.134126901626587,
"loss": 1.6464,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0418264865875244,
"rewards/margins": 0.22642748057842255,
"rewards/rejected": -2.268253803253174,
"step": 570
},
{
"epoch": 0.8359607242590757,
"grad_norm": 16.09504542028388,
"learning_rate": 9.055913788577128e-08,
"logits/chosen": -2.3402140140533447,
"logits/rejected": -2.334770679473877,
"logps/chosen": -1.0541826486587524,
"logps/rejected": -1.1505852937698364,
"loss": 1.6795,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.108365297317505,
"rewards/margins": 0.19280506670475006,
"rewards/rejected": -2.301170587539673,
"step": 580
},
{
"epoch": 0.8503738401945771,
"grad_norm": 18.45826863343491,
"learning_rate": 9.006248133237782e-08,
"logits/chosen": -2.3699214458465576,
"logits/rejected": -2.361508846282959,
"logps/chosen": -1.037255048751831,
"logps/rejected": -1.155447006225586,
"loss": 1.6428,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.074510097503662,
"rewards/margins": 0.2363840639591217,
"rewards/rejected": -2.310894012451172,
"step": 590
},
{
"epoch": 0.8647869561300784,
"grad_norm": 18.434587269982643,
"learning_rate": 8.955452994605753e-08,
"logits/chosen": -2.3500571250915527,
"logits/rejected": -2.338733196258545,
"logps/chosen": -1.0794237852096558,
"logps/rejected": -1.170361042022705,
"loss": 1.6733,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.1588475704193115,
"rewards/margins": 0.1818745732307434,
"rewards/rejected": -2.34072208404541,
"step": 600
},
{
"epoch": 0.8792000720655797,
"grad_norm": 14.237081246848815,
"learning_rate": 8.903542693376747e-08,
"logits/chosen": -2.3270299434661865,
"logits/rejected": -2.3305177688598633,
"logps/chosen": -0.9713711738586426,
"logps/rejected": -1.1125681400299072,
"loss": 1.6091,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9427423477172852,
"rewards/margins": 0.2823939025402069,
"rewards/rejected": -2.2251362800598145,
"step": 610
},
{
"epoch": 0.893613188001081,
"grad_norm": 17.400582788834974,
"learning_rate": 8.850531864644748e-08,
"logits/chosen": -2.3322553634643555,
"logits/rejected": -2.321770668029785,
"logps/chosen": -0.9585525393486023,
"logps/rejected": -1.0878236293792725,
"loss": 1.6235,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9171050786972046,
"rewards/margins": 0.2585422694683075,
"rewards/rejected": -2.175647258758545,
"step": 620
},
{
"epoch": 0.9080263039365822,
"grad_norm": 18.38773462583586,
"learning_rate": 8.796435453775943e-08,
"logits/chosen": -2.3591456413269043,
"logits/rejected": -2.3641350269317627,
"logps/chosen": -1.0922317504882812,
"logps/rejected": -1.245233416557312,
"loss": 1.5961,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1844635009765625,
"rewards/margins": 0.30600348114967346,
"rewards/rejected": -2.490466833114624,
"step": 630
},
{
"epoch": 0.9224394198720836,
"grad_norm": 17.0793455640924,
"learning_rate": 8.741268712195164e-08,
"logits/chosen": -2.362234115600586,
"logits/rejected": -2.3535900115966797,
"logps/chosen": -0.9950187802314758,
"logps/rejected": -1.1404359340667725,
"loss": 1.5986,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9900375604629517,
"rewards/margins": 0.29083460569381714,
"rewards/rejected": -2.280871868133545,
"step": 640
},
{
"epoch": 0.9368525358075849,
"grad_norm": 18.356266999768685,
"learning_rate": 8.685047193086053e-08,
"logits/chosen": -2.3747830390930176,
"logits/rejected": -2.3743832111358643,
"logps/chosen": -1.0230966806411743,
"logps/rejected": -1.1178253889083862,
"loss": 1.6728,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0461933612823486,
"rewards/margins": 0.18945762515068054,
"rewards/rejected": -2.2356507778167725,
"step": 650
},
{
"epoch": 0.9512656517430862,
"grad_norm": 16.97821645636938,
"learning_rate": 8.627786747006144e-08,
"logits/chosen": -2.3651280403137207,
"logits/rejected": -2.3614325523376465,
"logps/chosen": -1.028911828994751,
"logps/rejected": -1.1648304462432861,
"loss": 1.6105,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.057823657989502,
"rewards/margins": 0.2718368470668793,
"rewards/rejected": -2.3296608924865723,
"step": 660
},
{
"epoch": 0.9656787676785875,
"grad_norm": 18.242383473952547,
"learning_rate": 8.569503517418104e-08,
"logits/chosen": -2.3506455421447754,
"logits/rejected": -2.346644401550293,
"logps/chosen": -1.038861870765686,
"logps/rejected": -1.1740354299545288,
"loss": 1.6204,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.077723741531372,
"rewards/margins": 0.27034711837768555,
"rewards/rejected": -2.3480708599090576,
"step": 670
},
{
"epoch": 0.9800918836140888,
"grad_norm": 15.51076376279893,
"learning_rate": 8.510213936138402e-08,
"logits/chosen": -2.3083348274230957,
"logits/rejected": -2.3014862537384033,
"logps/chosen": -0.9869492650032043,
"logps/rejected": -1.0866016149520874,
"loss": 1.6735,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.9738985300064087,
"rewards/margins": 0.19930467009544373,
"rewards/rejected": -2.173203229904175,
"step": 680
},
{
"epoch": 0.9945049995495902,
"grad_norm": 17.843639653030788,
"learning_rate": 8.449934718704685e-08,
"logits/chosen": -2.3410897254943848,
"logits/rejected": -2.334183692932129,
"logps/chosen": -1.02655827999115,
"logps/rejected": -1.1379454135894775,
"loss": 1.6428,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0531165599823,
"rewards/margins": 0.22277435660362244,
"rewards/rejected": -2.275890827178955,
"step": 690
},
{
"epoch": 1.0089181154850915,
"grad_norm": 18.24062737002371,
"learning_rate": 8.388682859663152e-08,
"logits/chosen": -2.3235275745391846,
"logits/rejected": -2.323727607727051,
"logps/chosen": -1.0423524379730225,
"logps/rejected": -1.1892979145050049,
"loss": 1.6146,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.084704875946045,
"rewards/margins": 0.2938912510871887,
"rewards/rejected": -2.3785958290100098,
"step": 700
},
{
"epoch": 1.0233312314205927,
"grad_norm": 18.30818756183919,
"learning_rate": 8.326475627777277e-08,
"logits/chosen": -2.3337440490722656,
"logits/rejected": -2.3330025672912598,
"logps/chosen": -1.0714682340621948,
"logps/rejected": -1.2082436084747314,
"loss": 1.6339,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1429364681243896,
"rewards/margins": 0.27355074882507324,
"rewards/rejected": -2.416487216949463,
"step": 710
},
{
"epoch": 1.037744347356094,
"grad_norm": 17.423864156378112,
"learning_rate": 8.26333056115922e-08,
"logits/chosen": -2.373300075531006,
"logits/rejected": -2.3668229579925537,
"logps/chosen": -1.0338383913040161,
"logps/rejected": -1.1421548128128052,
"loss": 1.6639,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.0676767826080322,
"rewards/margins": 0.21663276851177216,
"rewards/rejected": -2.2843096256256104,
"step": 720
},
{
"epoch": 1.0521574632915953,
"grad_norm": 16.635043052348962,
"learning_rate": 8.1992654623253e-08,
"logits/chosen": -2.3428361415863037,
"logits/rejected": -2.33913516998291,
"logps/chosen": -1.009476900100708,
"logps/rejected": -1.1869137287139893,
"loss": 1.559,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.018953800201416,
"rewards/margins": 0.35487350821495056,
"rewards/rejected": -2.3738274574279785,
"step": 730
},
{
"epoch": 1.0665705792270967,
"grad_norm": 19.25205105759611,
"learning_rate": 8.134298393176915e-08,
"logits/chosen": -2.301328420639038,
"logits/rejected": -2.2953743934631348,
"logps/chosen": -0.9850282669067383,
"logps/rejected": -1.131919264793396,
"loss": 1.6056,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9700565338134766,
"rewards/margins": 0.2937820851802826,
"rewards/rejected": -2.263838529586792,
"step": 740
},
{
"epoch": 1.080983695162598,
"grad_norm": 16.04856542856117,
"learning_rate": 8.068447669908356e-08,
"logits/chosen": -2.306058168411255,
"logits/rejected": -2.294712781906128,
"logps/chosen": -1.06520676612854,
"logps/rejected": -1.1720651388168335,
"loss": 1.6557,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -2.13041353225708,
"rewards/margins": 0.21371681988239288,
"rewards/rejected": -2.344130277633667,
"step": 750
},
{
"epoch": 1.0953968110980994,
"grad_norm": 14.81697278342191,
"learning_rate": 8.001731857842906e-08,
"logits/chosen": -2.317549705505371,
"logits/rejected": -2.3219799995422363,
"logps/chosen": -1.0585771799087524,
"logps/rejected": -1.1321176290512085,
"loss": 1.7105,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -2.117154359817505,
"rewards/margins": 0.14708088338375092,
"rewards/rejected": -2.264235258102417,
"step": 760
},
{
"epoch": 1.1098099270336006,
"grad_norm": 19.08608533403698,
"learning_rate": 7.934169766198712e-08,
"logits/chosen": -2.347382068634033,
"logits/rejected": -2.3347859382629395,
"logps/chosen": -0.9919846653938293,
"logps/rejected": -1.155458688735962,
"loss": 1.5702,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.9839693307876587,
"rewards/margins": 0.32694780826568604,
"rewards/rejected": -2.310917377471924,
"step": 770
},
{
"epoch": 1.1242230429691018,
"grad_norm": 21.782769163652045,
"learning_rate": 7.86578044278589e-08,
"logits/chosen": -2.3568646907806396,
"logits/rejected": -2.350098133087158,
"logps/chosen": -1.0653258562088013,
"logps/rejected": -1.2129188776016235,
"loss": 1.6052,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1306517124176025,
"rewards/margins": 0.29518604278564453,
"rewards/rejected": -2.425837755203247,
"step": 780
},
{
"epoch": 1.1386361589046032,
"grad_norm": 20.459222597520984,
"learning_rate": 7.796583168636375e-08,
"logits/chosen": -2.3612263202667236,
"logits/rejected": -2.3560619354248047,
"logps/chosen": -1.0090010166168213,
"logps/rejected": -1.1769835948944092,
"loss": 1.5759,
"rewards/accuracies": 0.6343749761581421,
"rewards/chosen": -2.0180020332336426,
"rewards/margins": 0.3359653949737549,
"rewards/rejected": -2.3539671897888184,
"step": 790
},
{
"epoch": 1.1530492748401044,
"grad_norm": 16.695222101185497,
"learning_rate": 7.726597452568007e-08,
"logits/chosen": -2.3381145000457764,
"logits/rejected": -2.3316009044647217,
"logps/chosen": -1.0254031419754028,
"logps/rejected": -1.16634202003479,
"loss": 1.6068,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.0508062839508057,
"rewards/margins": 0.2818780839443207,
"rewards/rejected": -2.33268404006958,
"step": 800
},
{
"epoch": 1.1674623907756059,
"grad_norm": 17.52985696830486,
"learning_rate": 7.655843025684402e-08,
"logits/chosen": -2.3598532676696777,
"logits/rejected": -2.362898349761963,
"logps/chosen": -1.044235348701477,
"logps/rejected": -1.1720434427261353,
"loss": 1.6296,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.088470697402954,
"rewards/margins": 0.25561633706092834,
"rewards/rejected": -2.3440868854522705,
"step": 810
},
{
"epoch": 1.181875506711107,
"grad_norm": 14.910484844275423,
"learning_rate": 7.584339835812151e-08,
"logits/chosen": -2.3223514556884766,
"logits/rejected": -2.323925495147705,
"logps/chosen": -1.0323957204818726,
"logps/rejected": -1.1369130611419678,
"loss": 1.6678,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.064791440963745,
"rewards/margins": 0.20903484523296356,
"rewards/rejected": -2.2738261222839355,
"step": 820
},
{
"epoch": 1.1962886226466085,
"grad_norm": 16.58257930662513,
"learning_rate": 7.512108041876924e-08,
"logits/chosen": -2.2956104278564453,
"logits/rejected": -2.298205852508545,
"logps/chosen": -0.9996700286865234,
"logps/rejected": -1.1152664422988892,
"loss": 1.6512,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.9993400573730469,
"rewards/margins": 0.23119251430034637,
"rewards/rejected": -2.2305328845977783,
"step": 830
},
{
"epoch": 1.2107017385821097,
"grad_norm": 16.103489416598062,
"learning_rate": 7.439168008220056e-08,
"logits/chosen": -2.333143949508667,
"logits/rejected": -2.327017068862915,
"logps/chosen": -1.0302656888961792,
"logps/rejected": -1.1976699829101562,
"loss": 1.5731,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.0605313777923584,
"rewards/margins": 0.3348085880279541,
"rewards/rejected": -2.3953399658203125,
"step": 840
},
{
"epoch": 1.225114854517611,
"grad_norm": 17.820096880219356,
"learning_rate": 7.365540298857215e-08,
"logits/chosen": -2.3323662281036377,
"logits/rejected": -2.3332276344299316,
"logps/chosen": -1.0587284564971924,
"logps/rejected": -1.2181167602539062,
"loss": 1.5796,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1174569129943848,
"rewards/margins": 0.3187769949436188,
"rewards/rejected": -2.4362335205078125,
"step": 850
},
{
"epoch": 1.2395279704531124,
"grad_norm": 18.066090520662634,
"learning_rate": 7.291245671680781e-08,
"logits/chosen": -2.3100619316101074,
"logits/rejected": -2.3028578758239746,
"logps/chosen": -0.9891204833984375,
"logps/rejected": -1.1562236547470093,
"loss": 1.5852,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -1.978240966796875,
"rewards/margins": 0.3342065215110779,
"rewards/rejected": -2.3124473094940186,
"step": 860
},
{
"epoch": 1.2539410863886136,
"grad_norm": 16.723867521490277,
"learning_rate": 7.216305072607568e-08,
"logits/chosen": -2.3490469455718994,
"logits/rejected": -2.351792812347412,
"logps/chosen": -1.0800100564956665,
"logps/rejected": -1.2314789295196533,
"loss": 1.6035,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.160020112991333,
"rewards/margins": 0.30293765664100647,
"rewards/rejected": -2.4629578590393066,
"step": 870
},
{
"epoch": 1.268354202324115,
"grad_norm": 19.696376219540245,
"learning_rate": 7.14073962967353e-08,
"logits/chosen": -2.361971855163574,
"logits/rejected": -2.3552968502044678,
"logps/chosen": -1.1068134307861328,
"logps/rejected": -1.2376269102096558,
"loss": 1.6428,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -2.2136268615722656,
"rewards/margins": 0.2616268992424011,
"rewards/rejected": -2.4752538204193115,
"step": 880
},
{
"epoch": 1.2827673182596162,
"grad_norm": 18.939981579389148,
"learning_rate": 7.064570647077124e-08,
"logits/chosen": -2.34350848197937,
"logits/rejected": -2.335470676422119,
"logps/chosen": -1.1084269285202026,
"logps/rejected": -1.230513095855713,
"loss": 1.6428,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.2168538570404053,
"rewards/margins": 0.24417249858379364,
"rewards/rejected": -2.461026191711426,
"step": 890
},
{
"epoch": 1.2971804341951176,
"grad_norm": 18.47019854160618,
"learning_rate": 6.987819599173006e-08,
"logits/chosen": -2.3356449604034424,
"logits/rejected": -2.331501007080078,
"logps/chosen": -1.0205782651901245,
"logps/rejected": -1.1818567514419556,
"loss": 1.588,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.041156530380249,
"rewards/margins": 0.322556734085083,
"rewards/rejected": -2.363713502883911,
"step": 900
},
{
"epoch": 1.3115935501306188,
"grad_norm": 21.06251591954156,
"learning_rate": 6.910508124417765e-08,
"logits/chosen": -2.3116612434387207,
"logits/rejected": -2.311708927154541,
"logps/chosen": -1.0073387622833252,
"logps/rejected": -1.1689893007278442,
"loss": 1.5949,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0146775245666504,
"rewards/margins": 0.32330113649368286,
"rewards/rejected": -2.3379786014556885,
"step": 910
},
{
"epoch": 1.32600666606612,
"grad_norm": 15.75888959059691,
"learning_rate": 6.832658019269373e-08,
"logits/chosen": -2.2905359268188477,
"logits/rejected": -2.285813808441162,
"logps/chosen": -1.017747402191162,
"logps/rejected": -1.1801689863204956,
"loss": 1.5957,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.035494804382324,
"rewards/margins": 0.3248431086540222,
"rewards/rejected": -2.360337972640991,
"step": 920
},
{
"epoch": 1.3404197820016215,
"grad_norm": 16.36860064354464,
"learning_rate": 6.75429123204211e-08,
"logits/chosen": -2.3322787284851074,
"logits/rejected": -2.325899600982666,
"logps/chosen": -1.0550917387008667,
"logps/rejected": -1.2269432544708252,
"loss": 1.5757,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.1101834774017334,
"rewards/margins": 0.34370261430740356,
"rewards/rejected": -2.4538865089416504,
"step": 930
},
{
"epoch": 1.354832897937123,
"grad_norm": 15.89341720744674,
"learning_rate": 6.675429856718652e-08,
"logits/chosen": -2.302473306655884,
"logits/rejected": -2.292829990386963,
"logps/chosen": -0.9993384480476379,
"logps/rejected": -1.1607972383499146,
"loss": 1.5858,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.9986768960952759,
"rewards/margins": 0.3229173719882965,
"rewards/rejected": -2.321594476699829,
"step": 940
},
{
"epoch": 1.3692460138726241,
"grad_norm": 16.669054151143325,
"learning_rate": 6.596096126721123e-08,
"logits/chosen": -2.273181200027466,
"logits/rejected": -2.2777457237243652,
"logps/chosen": -1.0447285175323486,
"logps/rejected": -1.2103157043457031,
"loss": 1.5821,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0894570350646973,
"rewards/margins": 0.3311743438243866,
"rewards/rejected": -2.4206314086914062,
"step": 950
},
{
"epoch": 1.3836591298081253,
"grad_norm": 15.868141254654335,
"learning_rate": 6.516312408642804e-08,
"logits/chosen": -2.322033405303955,
"logits/rejected": -2.3260583877563477,
"logps/chosen": -1.0269404649734497,
"logps/rejected": -1.217023491859436,
"loss": 1.543,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.0538809299468994,
"rewards/margins": 0.3801659941673279,
"rewards/rejected": -2.434046983718872,
"step": 960
},
{
"epoch": 1.3980722457436268,
"grad_norm": 19.7395273688106,
"learning_rate": 6.436101195942312e-08,
"logits/chosen": -2.3190536499023438,
"logits/rejected": -2.321190357208252,
"logps/chosen": -1.0408755540847778,
"logps/rejected": -1.1574127674102783,
"loss": 1.6495,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.0817511081695557,
"rewards/margins": 0.23307427763938904,
"rewards/rejected": -2.3148255348205566,
"step": 970
},
{
"epoch": 1.412485361679128,
"grad_norm": 17.85424182086385,
"learning_rate": 6.35548510260201e-08,
"logits/chosen": -2.2950663566589355,
"logits/rejected": -2.290828227996826,
"logps/chosen": -1.015590786933899,
"logps/rejected": -1.1845998764038086,
"loss": 1.5815,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.031181573867798,
"rewards/margins": 0.33801814913749695,
"rewards/rejected": -2.369199752807617,
"step": 980
},
{
"epoch": 1.4268984776146292,
"grad_norm": 17.0591983972092,
"learning_rate": 6.274486856752442e-08,
"logits/chosen": -2.3268628120422363,
"logits/rejected": -2.3215243816375732,
"logps/chosen": -1.054785132408142,
"logps/rejected": -1.2332737445831299,
"loss": 1.5786,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.109570264816284,
"rewards/margins": 0.35697704553604126,
"rewards/rejected": -2.4665474891662598,
"step": 990
},
{
"epoch": 1.4413115935501306,
"grad_norm": 15.976591290404047,
"learning_rate": 6.193129294264568e-08,
"logits/chosen": -2.3251538276672363,
"logits/rejected": -2.319453477859497,
"logps/chosen": -1.0316834449768066,
"logps/rejected": -1.2238515615463257,
"loss": 1.549,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0633668899536133,
"rewards/margins": 0.3843366503715515,
"rewards/rejected": -2.4477031230926514,
"step": 1000
},
{
"epoch": 1.455724709485632,
"grad_norm": 17.378099075031535,
"learning_rate": 6.111435352311653e-08,
"logits/chosen": -2.3224568367004395,
"logits/rejected": -2.318516254425049,
"logps/chosen": -1.044806718826294,
"logps/rejected": -1.204319715499878,
"loss": 1.5956,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.089613437652588,
"rewards/margins": 0.3190259337425232,
"rewards/rejected": -2.408639430999756,
"step": 1010
},
{
"epoch": 1.4701378254211332,
"grad_norm": 18.355317239262256,
"learning_rate": 6.02942806290257e-08,
"logits/chosen": -2.337299346923828,
"logits/rejected": -2.334476947784424,
"logps/chosen": -1.0204999446868896,
"logps/rejected": -1.182180404663086,
"loss": 1.5882,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0409998893737793,
"rewards/margins": 0.3233610987663269,
"rewards/rejected": -2.364360809326172,
"step": 1020
},
{
"epoch": 1.4845509413566345,
"grad_norm": 16.458819438737027,
"learning_rate": 5.947130546388376e-08,
"logits/chosen": -2.307170867919922,
"logits/rejected": -2.297262668609619,
"logps/chosen": -1.1198623180389404,
"logps/rejected": -1.2803127765655518,
"loss": 1.6069,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.239724636077881,
"rewards/margins": 0.32090049982070923,
"rewards/rejected": -2.5606255531311035,
"step": 1030
},
{
"epoch": 1.4989640572921359,
"grad_norm": 18.315663658527253,
"learning_rate": 5.864566004943983e-08,
"logits/chosen": -2.3090689182281494,
"logits/rejected": -2.299919605255127,
"logps/chosen": -1.1342939138412476,
"logps/rejected": -1.2915699481964111,
"loss": 1.5918,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.268587827682495,
"rewards/margins": 0.3145517408847809,
"rewards/rejected": -2.5831398963928223,
"step": 1040
},
{
"epoch": 1.513377173227637,
"grad_norm": 18.253777248388865,
"learning_rate": 5.78175771602676e-08,
"logits/chosen": -2.3258557319641113,
"logits/rejected": -2.329089641571045,
"logps/chosen": -1.0340855121612549,
"logps/rejected": -1.1988188028335571,
"loss": 1.5903,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0681710243225098,
"rewards/margins": 0.3294665813446045,
"rewards/rejected": -2.3976376056671143,
"step": 1050
},
{
"epoch": 1.5277902891631383,
"grad_norm": 20.03722300524917,
"learning_rate": 5.6987290258139073e-08,
"logits/chosen": -2.269885301589966,
"logits/rejected": -2.2610838413238525,
"logps/chosen": -1.0655957460403442,
"logps/rejected": -1.2299748659133911,
"loss": 1.5939,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.1311914920806885,
"rewards/margins": 0.3287580609321594,
"rewards/rejected": -2.4599497318267822,
"step": 1060
},
{
"epoch": 1.5422034050986397,
"grad_norm": 19.363745969848598,
"learning_rate": 5.6155033426204615e-08,
"logits/chosen": -2.3013463020324707,
"logits/rejected": -2.30194091796875,
"logps/chosen": -1.1020151376724243,
"logps/rejected": -1.2730225324630737,
"loss": 1.58,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.2040302753448486,
"rewards/margins": 0.34201496839523315,
"rewards/rejected": -2.5460450649261475,
"step": 1070
},
{
"epoch": 1.5566165210341412,
"grad_norm": 21.243971440197193,
"learning_rate": 5.532104130299771e-08,
"logits/chosen": -2.306084632873535,
"logits/rejected": -2.3026065826416016,
"logps/chosen": -1.1136653423309326,
"logps/rejected": -1.253650426864624,
"loss": 1.6339,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -2.2273306846618652,
"rewards/margins": 0.27996987104415894,
"rewards/rejected": -2.507300853729248,
"step": 1080
},
{
"epoch": 1.5710296369696424,
"grad_norm": 18.884950972549078,
"learning_rate": 5.448554901628333e-08,
"logits/chosen": -2.3047351837158203,
"logits/rejected": -2.30297589302063,
"logps/chosen": -1.057666301727295,
"logps/rejected": -1.2256438732147217,
"loss": 1.5844,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.11533260345459,
"rewards/margins": 0.3359553813934326,
"rewards/rejected": -2.4512877464294434,
"step": 1090
},
{
"epoch": 1.5854427529051436,
"grad_norm": 16.26327515212116,
"learning_rate": 5.364879211676816e-08,
"logits/chosen": -2.3229575157165527,
"logits/rejected": -2.322633743286133,
"logps/chosen": -1.0644395351409912,
"logps/rejected": -1.2588599920272827,
"loss": 1.5435,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1288790702819824,
"rewards/margins": 0.3888412117958069,
"rewards/rejected": -2.5177199840545654,
"step": 1100
},
{
"epoch": 1.599855868840645,
"grad_norm": 16.929494402078088,
"learning_rate": 5.281100651169175e-08,
"logits/chosen": -2.3269693851470947,
"logits/rejected": -2.329103946685791,
"logps/chosen": -1.1110026836395264,
"logps/rejected": -1.3049942255020142,
"loss": 1.5754,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.2220053672790527,
"rewards/margins": 0.3879828453063965,
"rewards/rejected": -2.6099884510040283,
"step": 1110
},
{
"epoch": 1.6142689847761464,
"grad_norm": 19.384751167038143,
"learning_rate": 5.197242839831706e-08,
"logits/chosen": -2.2902255058288574,
"logits/rejected": -2.2878143787384033,
"logps/chosen": -1.0505023002624512,
"logps/rejected": -1.2497543096542358,
"loss": 1.5559,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.1010046005249023,
"rewards/margins": 0.39850395917892456,
"rewards/rejected": -2.4995086193084717,
"step": 1120
},
{
"epoch": 1.6286821007116477,
"grad_norm": 21.020671773840373,
"learning_rate": 5.1133294197339274e-08,
"logits/chosen": -2.3327059745788574,
"logits/rejected": -2.3221957683563232,
"logps/chosen": -1.0784157514572144,
"logps/rejected": -1.2418811321258545,
"loss": 1.6035,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1568315029144287,
"rewards/margins": 0.3269307017326355,
"rewards/rejected": -2.483762264251709,
"step": 1130
},
{
"epoch": 1.6430952166471489,
"grad_norm": 21.249031332264607,
"learning_rate": 5.029384048623153e-08,
"logits/chosen": -2.2892603874206543,
"logits/rejected": -2.2838594913482666,
"logps/chosen": -1.1238863468170166,
"logps/rejected": -1.2816271781921387,
"loss": 1.5968,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.247772693634033,
"rewards/margins": 0.3154818117618561,
"rewards/rejected": -2.5632543563842773,
"step": 1140
},
{
"epoch": 1.6575083325826503,
"grad_norm": 18.66086972186176,
"learning_rate": 4.9454303932546675e-08,
"logits/chosen": -2.28279447555542,
"logits/rejected": -2.2724807262420654,
"logps/chosen": -1.0907418727874756,
"logps/rejected": -1.2298866510391235,
"loss": 1.6405,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.181483745574951,
"rewards/margins": 0.2782895267009735,
"rewards/rejected": -2.459773302078247,
"step": 1150
},
{
"epoch": 1.6719214485181515,
"grad_norm": 19.50349240348182,
"learning_rate": 4.861492122719338e-08,
"logits/chosen": -2.319563388824463,
"logits/rejected": -2.3177480697631836,
"logps/chosen": -1.0951299667358398,
"logps/rejected": -1.260750651359558,
"loss": 1.6022,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -2.1902599334716797,
"rewards/margins": 0.3312414586544037,
"rewards/rejected": -2.521501302719116,
"step": 1160
},
{
"epoch": 1.6863345644536527,
"grad_norm": 17.58127266536524,
"learning_rate": 4.777592901770575e-08,
"logits/chosen": -2.327413558959961,
"logits/rejected": -2.3294601440429688,
"logps/chosen": -1.0109418630599976,
"logps/rejected": -1.214444637298584,
"loss": 1.5519,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.021883726119995,
"rewards/margins": 0.4070053994655609,
"rewards/rejected": -2.428889274597168,
"step": 1170
},
{
"epoch": 1.7007476803891541,
"grad_norm": 16.893442050436466,
"learning_rate": 4.693756384152529e-08,
"logits/chosen": -2.290790557861328,
"logits/rejected": -2.2821555137634277,
"logps/chosen": -1.0620388984680176,
"logps/rejected": -1.2741947174072266,
"loss": 1.5403,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.124077796936035,
"rewards/margins": 0.42431193590164185,
"rewards/rejected": -2.548389434814453,
"step": 1180
},
{
"epoch": 1.7151607963246556,
"grad_norm": 16.76150597577845,
"learning_rate": 4.610006205931365e-08,
"logits/chosen": -2.334803342819214,
"logits/rejected": -2.3295693397521973,
"logps/chosen": -1.1866618394851685,
"logps/rejected": -1.3234022855758667,
"loss": 1.6392,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.373323678970337,
"rewards/margins": 0.2734811305999756,
"rewards/rejected": -2.6468045711517334,
"step": 1190
},
{
"epoch": 1.7295739122601568,
"grad_norm": 24.57055189161366,
"learning_rate": 4.526365978831551e-08,
"logits/chosen": -2.3130276203155518,
"logits/rejected": -2.30517578125,
"logps/chosen": -1.1128777265548706,
"logps/rejected": -1.3150999546051025,
"loss": 1.556,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.225755453109741,
"rewards/margins": 0.40444430708885193,
"rewards/rejected": -2.630199909210205,
"step": 1200
},
{
"epoch": 1.743987028195658,
"grad_norm": 19.26814679538138,
"learning_rate": 4.442859283578981e-08,
"logits/chosen": -2.312147617340088,
"logits/rejected": -2.3039205074310303,
"logps/chosen": -1.0945560932159424,
"logps/rejected": -1.2648680210113525,
"loss": 1.6149,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.1891121864318848,
"rewards/margins": 0.3406239151954651,
"rewards/rejected": -2.529736042022705,
"step": 1210
},
{
"epoch": 1.7584001441311594,
"grad_norm": 18.13222142013933,
"learning_rate": 4.359509663252864e-08,
"logits/chosen": -2.289947986602783,
"logits/rejected": -2.2836596965789795,
"logps/chosen": -1.0912672281265259,
"logps/rejected": -1.261278748512268,
"loss": 1.5891,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.1825344562530518,
"rewards/margins": 0.3400228023529053,
"rewards/rejected": -2.522557497024536,
"step": 1220
},
{
"epoch": 1.7728132600666606,
"grad_norm": 19.057526927248425,
"learning_rate": 4.276340616648198e-08,
"logits/chosen": -2.341885566711426,
"logits/rejected": -2.3356499671936035,
"logps/chosen": -1.10612154006958,
"logps/rejected": -1.2711408138275146,
"loss": 1.6144,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.21224308013916,
"rewards/margins": 0.33003857731819153,
"rewards/rejected": -2.5422816276550293,
"step": 1230
},
{
"epoch": 1.7872263760021618,
"grad_norm": 19.76084929838562,
"learning_rate": 4.193375591650758e-08,
"logits/chosen": -2.3344829082489014,
"logits/rejected": -2.3287951946258545,
"logps/chosen": -1.1671698093414307,
"logps/rejected": -1.3440189361572266,
"loss": 1.6093,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.3343396186828613,
"rewards/margins": 0.353698194026947,
"rewards/rejected": -2.688037872314453,
"step": 1240
},
{
"epoch": 1.8016394919376633,
"grad_norm": 19.067146028274564,
"learning_rate": 4.110637978626415e-08,
"logits/chosen": -2.298180341720581,
"logits/rejected": -2.2934188842773438,
"logps/chosen": -1.030287504196167,
"logps/rejected": -1.2465605735778809,
"loss": 1.5146,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.060575008392334,
"rewards/margins": 0.43254607915878296,
"rewards/rejected": -2.4931211471557617,
"step": 1250
},
{
"epoch": 1.8160526078731647,
"grad_norm": 18.276378668755576,
"learning_rate": 4.0281511038266867e-08,
"logits/chosen": -2.234718084335327,
"logits/rejected": -2.2318148612976074,
"logps/chosen": -1.0859392881393433,
"logps/rejected": -1.2924591302871704,
"loss": 1.5609,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.1718785762786865,
"rewards/margins": 0.41303977370262146,
"rewards/rejected": -2.584918260574341,
"step": 1260
},
{
"epoch": 1.830465723808666,
"grad_norm": 17.470784593739236,
"learning_rate": 3.9459382228123475e-08,
"logits/chosen": -2.279468059539795,
"logits/rejected": -2.273711919784546,
"logps/chosen": -1.0365493297576904,
"logps/rejected": -1.2447311878204346,
"loss": 1.5556,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.073098659515381,
"rewards/margins": 0.41636401414871216,
"rewards/rejected": -2.489462375640869,
"step": 1270
},
{
"epoch": 1.844878839744167,
"grad_norm": 21.830692496447263,
"learning_rate": 3.864022513896989e-08,
"logits/chosen": -2.2853286266326904,
"logits/rejected": -2.2701587677001953,
"logps/chosen": -1.0575942993164062,
"logps/rejected": -1.2254334688186646,
"loss": 1.6005,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1151885986328125,
"rewards/margins": 0.3356781005859375,
"rewards/rejected": -2.450866937637329,
"step": 1280
},
{
"epoch": 1.8592919556796685,
"grad_norm": 20.0916366903334,
"learning_rate": 3.782427071612339e-08,
"logits/chosen": -2.3116753101348877,
"logits/rejected": -2.306715488433838,
"logps/chosen": -1.1340314149856567,
"logps/rejected": -1.3019399642944336,
"loss": 1.5867,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.2680628299713135,
"rewards/margins": 0.33581703901290894,
"rewards/rejected": -2.603879928588867,
"step": 1290
},
{
"epoch": 1.87370507161517,
"grad_norm": 22.477485924506297,
"learning_rate": 3.7011749001972174e-08,
"logits/chosen": -2.3057870864868164,
"logits/rejected": -2.3034915924072266,
"logps/chosen": -1.053118348121643,
"logps/rejected": -1.2349039316177368,
"loss": 1.5867,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.106236696243286,
"rewards/margins": 0.36357131600379944,
"rewards/rejected": -2.4698078632354736,
"step": 1300
},
{
"epoch": 1.888118187550671,
"grad_norm": 20.557013864835106,
"learning_rate": 3.620288907111931e-08,
"logits/chosen": -2.277376651763916,
"logits/rejected": -2.272871255874634,
"logps/chosen": -1.096543312072754,
"logps/rejected": -1.3053501844406128,
"loss": 1.5318,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.193086624145508,
"rewards/margins": 0.4176138937473297,
"rewards/rejected": -2.6107003688812256,
"step": 1310
},
{
"epoch": 1.9025313034861724,
"grad_norm": 27.1695631827936,
"learning_rate": 3.539791896579978e-08,
"logits/chosen": -2.317373752593994,
"logits/rejected": -2.318577289581299,
"logps/chosen": -1.2034056186676025,
"logps/rejected": -1.344125747680664,
"loss": 1.6377,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -2.406811237335205,
"rewards/margins": 0.28144046664237976,
"rewards/rejected": -2.688251495361328,
"step": 1320
},
{
"epoch": 1.9169444194216738,
"grad_norm": 25.106064057973505,
"learning_rate": 3.459706563158828e-08,
"logits/chosen": -2.279590129852295,
"logits/rejected": -2.281261682510376,
"logps/chosen": -1.1769063472747803,
"logps/rejected": -1.3924826383590698,
"loss": 1.5341,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.3538126945495605,
"rewards/margins": 0.4311525821685791,
"rewards/rejected": -2.7849652767181396,
"step": 1330
},
{
"epoch": 1.931357535357175,
"grad_norm": 20.192509452290462,
"learning_rate": 3.380055485341644e-08,
"logits/chosen": -2.314013957977295,
"logits/rejected": -2.3160252571105957,
"logps/chosen": -1.1351264715194702,
"logps/rejected": -1.3126869201660156,
"loss": 1.5828,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.2702529430389404,
"rewards/margins": 0.35512077808380127,
"rewards/rejected": -2.6253738403320312,
"step": 1340
},
{
"epoch": 1.9457706512926762,
"grad_norm": 22.1205875163306,
"learning_rate": 3.300861119191718e-08,
"logits/chosen": -2.2895724773406982,
"logits/rejected": -2.283412456512451,
"logps/chosen": -1.179337739944458,
"logps/rejected": -1.3338556289672852,
"loss": 1.6304,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -2.358675479888916,
"rewards/margins": 0.30903515219688416,
"rewards/rejected": -2.6677112579345703,
"step": 1350
},
{
"epoch": 1.9601837672281777,
"grad_norm": 21.26891098809936,
"learning_rate": 3.2221457920114213e-08,
"logits/chosen": -2.307619094848633,
"logits/rejected": -2.3046841621398926,
"logps/chosen": -1.1182931661605835,
"logps/rejected": -1.3411715030670166,
"loss": 1.5205,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.236586332321167,
"rewards/margins": 0.4457565248012543,
"rewards/rejected": -2.682343006134033,
"step": 1360
},
{
"epoch": 1.974596883163679,
"grad_norm": 23.3986392290044,
"learning_rate": 3.143931696047454e-08,
"logits/chosen": -2.302565813064575,
"logits/rejected": -2.298037528991699,
"logps/chosen": -1.0839837789535522,
"logps/rejected": -1.2788712978363037,
"loss": 1.559,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.1679675579071045,
"rewards/margins": 0.38977518677711487,
"rewards/rejected": -2.5577425956726074,
"step": 1370
},
{
"epoch": 1.9890099990991803,
"grad_norm": 19.075694699589782,
"learning_rate": 3.066240882234186e-08,
"logits/chosen": -2.306809663772583,
"logits/rejected": -2.3090083599090576,
"logps/chosen": -1.150748372077942,
"logps/rejected": -1.330487847328186,
"loss": 1.5692,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.301496744155884,
"rewards/margins": 0.35947883129119873,
"rewards/rejected": -2.660975694656372,
"step": 1380
},
{
"epoch": 2.0034231150346815,
"grad_norm": 23.08357458694508,
"learning_rate": 2.989095253976816e-08,
"logits/chosen": -2.2911369800567627,
"logits/rejected": -2.2887818813323975,
"logps/chosen": -1.1655315160751343,
"logps/rejected": -1.3231830596923828,
"loss": 1.6272,
"rewards/accuracies": 0.546875,
"rewards/chosen": -2.3310630321502686,
"rewards/margins": 0.3153030276298523,
"rewards/rejected": -2.6463661193847656,
"step": 1390
},
{
"epoch": 2.017836230970183,
"grad_norm": 21.786843412845027,
"learning_rate": 2.912516560976146e-08,
"logits/chosen": -2.2617886066436768,
"logits/rejected": -2.261368989944458,
"logps/chosen": -1.116999864578247,
"logps/rejected": -1.3585065603256226,
"loss": 1.5173,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.233999729156494,
"rewards/margins": 0.4830136299133301,
"rewards/rejected": -2.717013120651245,
"step": 1400
},
{
"epoch": 2.0322493469056844,
"grad_norm": 19.872912648108493,
"learning_rate": 2.836526393096661e-08,
"logits/chosen": -2.3144338130950928,
"logits/rejected": -2.319342613220215,
"logps/chosen": -1.127329707145691,
"logps/rejected": -1.3289254903793335,
"loss": 1.5402,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.254659414291382,
"rewards/margins": 0.40319204330444336,
"rewards/rejected": -2.657850980758667,
"step": 1410
},
{
"epoch": 2.0466624628411854,
"grad_norm": 22.10407026857419,
"learning_rate": 2.7611461742797165e-08,
"logits/chosen": -2.2922112941741943,
"logits/rejected": -2.2878568172454834,
"logps/chosen": -1.0672378540039062,
"logps/rejected": -1.2899413108825684,
"loss": 1.5126,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.1344757080078125,
"rewards/margins": 0.44540706276893616,
"rewards/rejected": -2.5798826217651367,
"step": 1420
},
{
"epoch": 2.061075578776687,
"grad_norm": 26.207768824418398,
"learning_rate": 2.686397156503445e-08,
"logits/chosen": -2.2948415279388428,
"logits/rejected": -2.28835129737854,
"logps/chosen": -1.1063997745513916,
"logps/rejected": -1.3052228689193726,
"loss": 1.5589,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.212799549102783,
"rewards/margins": 0.39764639735221863,
"rewards/rejected": -2.610445737838745,
"step": 1430
},
{
"epoch": 2.075488694712188,
"grad_norm": 18.99932149970658,
"learning_rate": 2.6123004137912084e-08,
"logits/chosen": -2.2723312377929688,
"logits/rejected": -2.276716709136963,
"logps/chosen": -1.0470964908599854,
"logps/rejected": -1.2561558485031128,
"loss": 1.5356,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0941929817199707,
"rewards/margins": 0.4181187152862549,
"rewards/rejected": -2.5123116970062256,
"step": 1440
},
{
"epoch": 2.089901810647689,
"grad_norm": 19.445114453376085,
"learning_rate": 2.5388768362701585e-08,
"logits/chosen": -2.2706756591796875,
"logits/rejected": -2.269131898880005,
"logps/chosen": -1.1902254819869995,
"logps/rejected": -1.351431131362915,
"loss": 1.6073,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.380450963973999,
"rewards/margins": 0.3224112391471863,
"rewards/rejected": -2.70286226272583,
"step": 1450
},
{
"epoch": 2.1043149265831906,
"grad_norm": 22.70265803179129,
"learning_rate": 2.466147124281703e-08,
"logits/chosen": -2.3346049785614014,
"logits/rejected": -2.3269667625427246,
"logps/chosen": -1.1868515014648438,
"logps/rejected": -1.3827440738677979,
"loss": 1.5644,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.3737030029296875,
"rewards/margins": 0.39178499579429626,
"rewards/rejected": -2.7654881477355957,
"step": 1460
},
{
"epoch": 2.118728042518692,
"grad_norm": 25.431369552773468,
"learning_rate": 2.3941317825454278e-08,
"logits/chosen": -2.287153720855713,
"logits/rejected": -2.274724006652832,
"logps/chosen": -1.1501365900039673,
"logps/rejected": -1.3252675533294678,
"loss": 1.599,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.3002731800079346,
"rewards/margins": 0.35026198625564575,
"rewards/rejected": -2.6505351066589355,
"step": 1470
},
{
"epoch": 2.1331411584541935,
"grad_norm": 38.861924452847305,
"learning_rate": 2.322851114378203e-08,
"logits/chosen": -2.2646145820617676,
"logits/rejected": -2.2705867290496826,
"logps/chosen": -1.2125260829925537,
"logps/rejected": -1.4090855121612549,
"loss": 1.5981,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.4250521659851074,
"rewards/margins": 0.39311888813972473,
"rewards/rejected": -2.8181710243225098,
"step": 1480
},
{
"epoch": 2.1475542743896945,
"grad_norm": 20.599427677239603,
"learning_rate": 2.252325215970059e-08,
"logits/chosen": -2.2515275478363037,
"logits/rejected": -2.24314022064209,
"logps/chosen": -1.1347332000732422,
"logps/rejected": -1.3541853427886963,
"loss": 1.5426,
"rewards/accuracies": 0.609375,
"rewards/chosen": -2.2694664001464844,
"rewards/margins": 0.438904345035553,
"rewards/rejected": -2.7083706855773926,
"step": 1490
},
{
"epoch": 2.161967390325196,
"grad_norm": 20.697243890138434,
"learning_rate": 2.182573970718449e-08,
"logits/chosen": -2.279026746749878,
"logits/rejected": -2.2784788608551025,
"logps/chosen": -1.1145248413085938,
"logps/rejected": -1.3219712972640991,
"loss": 1.5631,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.2290496826171875,
"rewards/margins": 0.4148930013179779,
"rewards/rejected": -2.6439425945281982,
"step": 1500
},
{
"epoch": 2.1763805062606973,
"grad_norm": 20.97814093763114,
"learning_rate": 2.113617043622536e-08,
"logits/chosen": -2.2447619438171387,
"logits/rejected": -2.2397830486297607,
"logps/chosen": -1.108572006225586,
"logps/rejected": -1.312126874923706,
"loss": 1.5638,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.217144012451172,
"rewards/margins": 0.4071098268032074,
"rewards/rejected": -2.624253749847412,
"step": 1510
},
{
"epoch": 2.1907936221961988,
"grad_norm": 19.658252029005208,
"learning_rate": 2.045473875739001e-08,
"logits/chosen": -2.286835193634033,
"logits/rejected": -2.284726619720459,
"logps/chosen": -1.1268645524978638,
"logps/rejected": -1.3589181900024414,
"loss": 1.5125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.2537291049957275,
"rewards/margins": 0.4641071856021881,
"rewards/rejected": -2.717836380004883,
"step": 1520
},
{
"epoch": 2.2052067381316998,
"grad_norm": 19.675863885214547,
"learning_rate": 1.9781636787010503e-08,
"logits/chosen": -2.296203851699829,
"logits/rejected": -2.292480230331421,
"logps/chosen": -1.1581227779388428,
"logps/rejected": -1.3830742835998535,
"loss": 1.5552,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.3162455558776855,
"rewards/margins": 0.4499031603336334,
"rewards/rejected": -2.766148567199707,
"step": 1530
},
{
"epoch": 2.219619854067201,
"grad_norm": 26.028820150112818,
"learning_rate": 1.911705429302038e-08,
"logits/chosen": -2.2454471588134766,
"logits/rejected": -2.2483785152435303,
"logps/chosen": -1.1285746097564697,
"logps/rejected": -1.2919931411743164,
"loss": 1.5857,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.2571492195129395,
"rewards/margins": 0.3268371522426605,
"rewards/rejected": -2.583986282348633,
"step": 1540
},
{
"epoch": 2.2340329700027026,
"grad_norm": 23.71926436834239,
"learning_rate": 1.8461178641453617e-08,
"logits/chosen": -2.2616686820983887,
"logits/rejected": -2.2652456760406494,
"logps/chosen": -1.1020487546920776,
"logps/rejected": -1.310429573059082,
"loss": 1.5808,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2040975093841553,
"rewards/margins": 0.41676193475723267,
"rewards/rejected": -2.620859146118164,
"step": 1550
},
{
"epoch": 2.2484460859382036,
"grad_norm": 18.888058220721906,
"learning_rate": 1.781419474362017e-08,
"logits/chosen": -2.2560315132141113,
"logits/rejected": -2.2563912868499756,
"logps/chosen": -1.120178461074829,
"logps/rejected": -1.3521924018859863,
"loss": 1.5308,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.240356922149658,
"rewards/margins": 0.46402817964553833,
"rewards/rejected": -2.7043848037719727,
"step": 1560
},
{
"epoch": 2.262859201873705,
"grad_norm": 18.99138375232662,
"learning_rate": 1.7176285003974033e-08,
"logits/chosen": -2.2571425437927246,
"logits/rejected": -2.253202199935913,
"logps/chosen": -1.1062372922897339,
"logps/rejected": -1.313024640083313,
"loss": 1.5606,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2124745845794678,
"rewards/margins": 0.413574755191803,
"rewards/rejected": -2.626049280166626,
"step": 1570
},
{
"epoch": 2.2772723178092065,
"grad_norm": 24.162865311479557,
"learning_rate": 1.6547629268687786e-08,
"logits/chosen": -2.2994749546051025,
"logits/rejected": -2.296318531036377,
"logps/chosen": -1.0731937885284424,
"logps/rejected": -1.3190656900405884,
"loss": 1.5119,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -2.1463875770568848,
"rewards/margins": 0.49174371361732483,
"rewards/rejected": -2.6381313800811768,
"step": 1580
},
{
"epoch": 2.291685433744708,
"grad_norm": 18.432849736683174,
"learning_rate": 1.59284047749485e-08,
"logits/chosen": -2.2636983394622803,
"logits/rejected": -2.2557337284088135,
"logps/chosen": -1.0886359214782715,
"logps/rejected": -1.2910807132720947,
"loss": 1.5641,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.177271842956543,
"rewards/margins": 0.4048894941806793,
"rewards/rejected": -2.5821614265441895,
"step": 1590
},
{
"epoch": 2.306098549680209,
"grad_norm": 28.646123727089137,
"learning_rate": 1.5318786100989188e-08,
"logits/chosen": -2.229341506958008,
"logits/rejected": -2.226560115814209,
"logps/chosen": -1.2336177825927734,
"logps/rejected": -1.4262335300445557,
"loss": 1.6148,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.467235565185547,
"rewards/margins": 0.3852314352989197,
"rewards/rejected": -2.8524670600891113,
"step": 1600
},
{
"epoch": 2.3205116656157103,
"grad_norm": 23.756121348250495,
"learning_rate": 1.471894511686988e-08,
"logits/chosen": -2.2284324169158936,
"logits/rejected": -2.2255947589874268,
"logps/chosen": -1.1893842220306396,
"logps/rejected": -1.3409416675567627,
"loss": 1.6367,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.3787684440612793,
"rewards/margins": 0.30311447381973267,
"rewards/rejected": -2.6818833351135254,
"step": 1610
},
{
"epoch": 2.3349247815512117,
"grad_norm": 18.450150129405873,
"learning_rate": 1.4129050936022214e-08,
"logits/chosen": -2.2338924407958984,
"logits/rejected": -2.235215663909912,
"logps/chosen": -1.0769164562225342,
"logps/rejected": -1.2985079288482666,
"loss": 1.5409,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1538329124450684,
"rewards/margins": 0.44318294525146484,
"rewards/rejected": -2.597015857696533,
"step": 1620
},
{
"epoch": 2.3493378974867127,
"grad_norm": 22.180084405255627,
"learning_rate": 1.3549269867571222e-08,
"logits/chosen": -2.2351133823394775,
"logits/rejected": -2.2372500896453857,
"logps/chosen": -1.1330866813659668,
"logps/rejected": -1.2997318506240845,
"loss": 1.6214,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.2661733627319336,
"rewards/margins": 0.33329010009765625,
"rewards/rejected": -2.599463701248169,
"step": 1630
},
{
"epoch": 2.363751013422214,
"grad_norm": 23.08714654459471,
"learning_rate": 1.2979765369447742e-08,
"logits/chosen": -2.304003953933716,
"logits/rejected": -2.2949726581573486,
"logps/chosen": -1.1455012559890747,
"logps/rejected": -1.3875641822814941,
"loss": 1.5371,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.2910025119781494,
"rewards/margins": 0.48412585258483887,
"rewards/rejected": -2.7751283645629883,
"step": 1640
},
{
"epoch": 2.3781641293577156,
"grad_norm": 30.56182243031503,
"learning_rate": 1.2420698002304608e-08,
"logits/chosen": -2.2411041259765625,
"logits/rejected": -2.2343127727508545,
"logps/chosen": -1.0859107971191406,
"logps/rejected": -1.3196165561676025,
"loss": 1.5388,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1718215942382812,
"rewards/margins": 0.4674110412597656,
"rewards/rejected": -2.639233112335205,
"step": 1650
},
{
"epoch": 2.392577245293217,
"grad_norm": 19.77198047003492,
"learning_rate": 1.1872225384249768e-08,
"logits/chosen": -2.268101215362549,
"logits/rejected": -2.2637829780578613,
"logps/chosen": -1.1163004636764526,
"logps/rejected": -1.3505176305770874,
"loss": 1.5169,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.2326009273529053,
"rewards/margins": 0.46843448281288147,
"rewards/rejected": -2.701035261154175,
"step": 1660
},
{
"epoch": 2.406990361228718,
"grad_norm": 26.906205506300168,
"learning_rate": 1.1334502146408881e-08,
"logits/chosen": -2.2429723739624023,
"logits/rejected": -2.249293804168701,
"logps/chosen": -1.1734583377838135,
"logps/rejected": -1.3377552032470703,
"loss": 1.6096,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.346916675567627,
"rewards/margins": 0.3285936415195465,
"rewards/rejected": -2.6755104064941406,
"step": 1670
},
{
"epoch": 2.4214034771642194,
"grad_norm": 21.73816659360824,
"learning_rate": 1.0807679889330163e-08,
"logits/chosen": -2.314985990524292,
"logits/rejected": -2.320690870285034,
"logps/chosen": -1.17433762550354,
"logps/rejected": -1.3522727489471436,
"loss": 1.5944,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.34867525100708,
"rewards/margins": 0.35587045550346375,
"rewards/rejected": -2.704545497894287,
"step": 1680
},
{
"epoch": 2.435816593099721,
"grad_norm": 17.239308701432627,
"learning_rate": 1.0291907140243538e-08,
"logits/chosen": -2.2565197944641113,
"logits/rejected": -2.255737781524658,
"logps/chosen": -1.1245791912078857,
"logps/rejected": -1.4125820398330688,
"loss": 1.4673,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -2.2491583824157715,
"rewards/margins": 0.5760055184364319,
"rewards/rejected": -2.8251640796661377,
"step": 1690
},
{
"epoch": 2.450229709035222,
"grad_norm": 21.369654725894584,
"learning_rate": 9.787329311186249e-09,
"logits/chosen": -2.252303123474121,
"logits/rejected": -2.251774787902832,
"logps/chosen": -1.1287000179290771,
"logps/rejected": -1.3461166620254517,
"loss": 1.5545,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.2574000358581543,
"rewards/margins": 0.43483343720436096,
"rewards/rejected": -2.6922333240509033,
"step": 1700
},
{
"epoch": 2.4646428249707233,
"grad_norm": 25.680472794698755,
"learning_rate": 9.294088658006916e-09,
"logits/chosen": -2.2721753120422363,
"logits/rejected": -2.2618608474731445,
"logps/chosen": -1.1408545970916748,
"logps/rejected": -1.366431474685669,
"loss": 1.5555,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.2817091941833496,
"rewards/margins": 0.45115384459495544,
"rewards/rejected": -2.732862949371338,
"step": 1710
},
{
"epoch": 2.4790559409062247,
"grad_norm": 20.80236487452411,
"learning_rate": 8.812324240259094e-09,
"logits/chosen": -2.2599918842315674,
"logits/rejected": -2.2533061504364014,
"logps/chosen": -1.1435985565185547,
"logps/rejected": -1.3751742839813232,
"loss": 1.5389,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.2871971130371094,
"rewards/margins": 0.4631514549255371,
"rewards/rejected": -2.7503485679626465,
"step": 1720
},
{
"epoch": 2.493469056841726,
"grad_norm": 25.297955693939965,
"learning_rate": 8.342171881996351e-09,
"logits/chosen": -2.269395112991333,
"logits/rejected": -2.267338514328003,
"logps/chosen": -1.1785120964050293,
"logps/rejected": -1.3562462329864502,
"loss": 1.6033,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.3570241928100586,
"rewards/margins": 0.355468213558197,
"rewards/rejected": -2.7124924659729004,
"step": 1730
},
{
"epoch": 2.507882172777227,
"grad_norm": 24.06865322162579,
"learning_rate": 7.883764133479137e-09,
"logits/chosen": -2.260371685028076,
"logits/rejected": -2.2534215450286865,
"logps/chosen": -1.130081295967102,
"logps/rejected": -1.3861533403396606,
"loss": 1.4917,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -2.260162591934204,
"rewards/margins": 0.5121440887451172,
"rewards/rejected": -2.7723066806793213,
"step": 1740
},
{
"epoch": 2.5222952887127286,
"grad_norm": 29.75935812876475,
"learning_rate": 7.43723023380502e-09,
"logits/chosen": -2.2067666053771973,
"logits/rejected": -2.208773136138916,
"logps/chosen": -1.1877186298370361,
"logps/rejected": -1.4029791355133057,
"loss": 1.5553,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.3754372596740723,
"rewards/margins": 0.430520623922348,
"rewards/rejected": -2.8059582710266113,
"step": 1750
},
{
"epoch": 2.53670840464823,
"grad_norm": 24.2432673255774,
"learning_rate": 7.002696074472075e-09,
"logits/chosen": -2.2512130737304688,
"logits/rejected": -2.2531332969665527,
"logps/chosen": -1.2248094081878662,
"logps/rejected": -1.4335352182388306,
"loss": 1.5688,
"rewards/accuracies": 0.578125,
"rewards/chosen": -2.4496188163757324,
"rewards/margins": 0.4174516797065735,
"rewards/rejected": -2.867070436477661,
"step": 1760
},
{
"epoch": 2.551121520583731,
"grad_norm": 32.01658470543389,
"learning_rate": 6.580284163886369e-09,
"logits/chosen": -2.2607645988464355,
"logits/rejected": -2.2610065937042236,
"logps/chosen": -1.1927731037139893,
"logps/rejected": -1.3909296989440918,
"loss": 1.5668,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.3855462074279785,
"rewards/margins": 0.39631372690200806,
"rewards/rejected": -2.7818593978881836,
"step": 1770
},
{
"epoch": 2.5655346365192324,
"grad_norm": 24.419915253157857,
"learning_rate": 6.1701135928230566e-09,
"logits/chosen": -2.217277765274048,
"logits/rejected": -2.209423303604126,
"logps/chosen": -1.2151906490325928,
"logps/rejected": -1.427695870399475,
"loss": 1.5543,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -2.4303812980651855,
"rewards/margins": 0.4250105321407318,
"rewards/rejected": -2.85539174079895,
"step": 1780
},
{
"epoch": 2.579947752454734,
"grad_norm": 22.58314758967658,
"learning_rate": 5.7723000008510655e-09,
"logits/chosen": -2.2694671154022217,
"logits/rejected": -2.2696220874786377,
"logps/chosen": -1.168027639389038,
"logps/rejected": -1.3549962043762207,
"loss": 1.5926,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.336055278778076,
"rewards/margins": 0.3739371597766876,
"rewards/rejected": -2.7099924087524414,
"step": 1790
},
{
"epoch": 2.5943608683902353,
"grad_norm": 22.781044887360306,
"learning_rate": 5.386955543730798e-09,
"logits/chosen": -2.277388334274292,
"logits/rejected": -2.2686190605163574,
"logps/chosen": -1.2046597003936768,
"logps/rejected": -1.4461263418197632,
"loss": 1.5418,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.4093194007873535,
"rewards/margins": 0.4829334318637848,
"rewards/rejected": -2.8922526836395264,
"step": 1800
},
{
"epoch": 2.6087739843257363,
"grad_norm": 22.99449695923957,
"learning_rate": 5.014188861794e-09,
"logits/chosen": -2.2212021350860596,
"logits/rejected": -2.2196457386016846,
"logps/chosen": -1.1851980686187744,
"logps/rejected": -1.4349489212036133,
"loss": 1.5106,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.370396137237549,
"rewards/margins": 0.4995017945766449,
"rewards/rejected": -2.8698978424072266,
"step": 1810
},
{
"epoch": 2.6231871002612377,
"grad_norm": 21.158930640881984,
"learning_rate": 4.654105049314744e-09,
"logits/chosen": -2.2831361293792725,
"logits/rejected": -2.2893922328948975,
"logps/chosen": -1.1905128955841064,
"logps/rejected": -1.392458200454712,
"loss": 1.5859,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.381025791168213,
"rewards/margins": 0.40389055013656616,
"rewards/rejected": -2.784916400909424,
"step": 1820
},
{
"epoch": 2.637600216196739,
"grad_norm": 24.606900180349317,
"learning_rate": 4.3068056248801496e-09,
"logits/chosen": -2.260871410369873,
"logits/rejected": -2.2557454109191895,
"logps/chosen": -1.1808732748031616,
"logps/rejected": -1.4025046825408936,
"loss": 1.5385,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.3617465496063232,
"rewards/margins": 0.44326257705688477,
"rewards/rejected": -2.805009365081787,
"step": 1830
},
{
"epoch": 2.65201333213224,
"grad_norm": 21.13626030836664,
"learning_rate": 3.972388502769225e-09,
"logits/chosen": -2.298476457595825,
"logits/rejected": -2.2920804023742676,
"logps/chosen": -1.2038078308105469,
"logps/rejected": -1.3969953060150146,
"loss": 1.5752,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.4076156616210938,
"rewards/margins": 0.38637492060661316,
"rewards/rejected": -2.7939906120300293,
"step": 1840
},
{
"epoch": 2.6664264480677415,
"grad_norm": 23.476816797872775,
"learning_rate": 3.650947965347817e-09,
"logits/chosen": -2.2797365188598633,
"logits/rejected": -2.2747490406036377,
"logps/chosen": -1.1963701248168945,
"logps/rejected": -1.4419893026351929,
"loss": 1.4998,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.392740249633789,
"rewards/margins": 0.4912383556365967,
"rewards/rejected": -2.8839786052703857,
"step": 1850
},
{
"epoch": 2.680839564003243,
"grad_norm": 28.256187183267656,
"learning_rate": 3.342574636487583e-09,
"logits/chosen": -2.3183302879333496,
"logits/rejected": -2.3189597129821777,
"logps/chosen": -1.2193528413772583,
"logps/rejected": -1.4082263708114624,
"loss": 1.5796,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.4387056827545166,
"rewards/margins": 0.37774714827537537,
"rewards/rejected": -2.816452741622925,
"step": 1860
},
{
"epoch": 2.6952526799387444,
"grad_norm": 19.684709175702448,
"learning_rate": 3.0473554560163207e-09,
"logits/chosen": -2.254714012145996,
"logits/rejected": -2.2444214820861816,
"logps/chosen": -1.1542867422103882,
"logps/rejected": -1.377029538154602,
"loss": 1.5415,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.3085734844207764,
"rewards/margins": 0.44548529386520386,
"rewards/rejected": -2.754059076309204,
"step": 1870
},
{
"epoch": 2.709665795874246,
"grad_norm": 24.08431048004274,
"learning_rate": 2.7653736552070207e-09,
"logits/chosen": -2.2782135009765625,
"logits/rejected": -2.276923179626465,
"logps/chosen": -1.2209516763687134,
"logps/rejected": -1.4482202529907227,
"loss": 1.538,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -2.4419033527374268,
"rewards/margins": 0.45453739166259766,
"rewards/rejected": -2.8964405059814453,
"step": 1880
},
{
"epoch": 2.724078911809747,
"grad_norm": 17.160087536859805,
"learning_rate": 2.496708733312419e-09,
"logits/chosen": -2.250776767730713,
"logits/rejected": -2.253812551498413,
"logps/chosen": -1.1638703346252441,
"logps/rejected": -1.3798881769180298,
"loss": 1.5446,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.3277406692504883,
"rewards/margins": 0.43203577399253845,
"rewards/rejected": -2.7597763538360596,
"step": 1890
},
{
"epoch": 2.7384920277452482,
"grad_norm": 18.244927534685523,
"learning_rate": 2.241436435151717e-09,
"logits/chosen": -2.2549357414245605,
"logits/rejected": -2.247612714767456,
"logps/chosen": -1.1582852602005005,
"logps/rejected": -1.3766599893569946,
"loss": 1.5527,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.316570520401001,
"rewards/margins": 0.4367493987083435,
"rewards/rejected": -2.7533199787139893,
"step": 1900
},
{
"epoch": 2.7529051436807492,
"grad_norm": 16.475242116483138,
"learning_rate": 1.9996287297558866e-09,
"logits/chosen": -2.241720199584961,
"logits/rejected": -2.246184825897217,
"logps/chosen": -1.1753349304199219,
"logps/rejected": -1.398506760597229,
"loss": 1.5477,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.3506698608398438,
"rewards/margins": 0.44634366035461426,
"rewards/rejected": -2.797013521194458,
"step": 1910
},
{
"epoch": 2.7673182596162507,
"grad_norm": 18.756801068057744,
"learning_rate": 1.7713537900772957e-09,
"logits/chosen": -2.2873311042785645,
"logits/rejected": -2.285597562789917,
"logps/chosen": -1.2065943479537964,
"logps/rejected": -1.3886728286743164,
"loss": 1.587,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.4131886959075928,
"rewards/margins": 0.3641572594642639,
"rewards/rejected": -2.777345657348633,
"step": 1920
},
{
"epoch": 2.781731375551752,
"grad_norm": 18.75587536733683,
"learning_rate": 1.5566759737697998e-09,
"logits/chosen": -2.252821922302246,
"logits/rejected": -2.252249240875244,
"logps/chosen": -1.1472349166870117,
"logps/rejected": -1.3485777378082275,
"loss": 1.5582,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.2944698333740234,
"rewards/margins": 0.40268588066101074,
"rewards/rejected": -2.697155475616455,
"step": 1930
},
{
"epoch": 2.7961444914872535,
"grad_norm": 18.83657032008189,
"learning_rate": 1.3556558050442425e-09,
"logits/chosen": -2.27396821975708,
"logits/rejected": -2.266453504562378,
"logps/chosen": -1.153480052947998,
"logps/rejected": -1.3870432376861572,
"loss": 1.5257,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -2.306960105895996,
"rewards/margins": 0.46712619066238403,
"rewards/rejected": -2.7740864753723145,
"step": 1940
},
{
"epoch": 2.810557607422755,
"grad_norm": 19.634521530754597,
"learning_rate": 1.1683499576049583e-09,
"logits/chosen": -2.2552907466888428,
"logits/rejected": -2.255131959915161,
"logps/chosen": -1.1509824991226196,
"logps/rejected": -1.3625354766845703,
"loss": 1.543,
"rewards/accuracies": 0.640625,
"rewards/chosen": -2.3019649982452393,
"rewards/margins": 0.4231061041355133,
"rewards/rejected": -2.7250709533691406,
"step": 1950
},
{
"epoch": 2.824970723358256,
"grad_norm": 22.498945774440706,
"learning_rate": 9.948112386716167e-10,
"logits/chosen": -2.2837812900543213,
"logits/rejected": -2.2745299339294434,
"logps/chosen": -1.220226764678955,
"logps/rejected": -1.4314284324645996,
"loss": 1.569,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -2.44045352935791,
"rewards/margins": 0.4224032759666443,
"rewards/rejected": -2.862856864929199,
"step": 1960
},
{
"epoch": 2.8393838392937574,
"grad_norm": 24.02219360016628,
"learning_rate": 8.350885740913416e-10,
"logits/chosen": -2.224419116973877,
"logits/rejected": -2.2149837017059326,
"logps/chosen": -1.1606011390686035,
"logps/rejected": -1.3387001752853394,
"loss": 1.6133,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -2.321202278137207,
"rewards/margins": 0.3561980724334717,
"rewards/rejected": -2.6774003505706787,
"step": 1970
},
{
"epoch": 2.8537969552292584,
"grad_norm": 19.84204643186706,
"learning_rate": 6.89226994544978e-10,
"logits/chosen": -2.223024845123291,
"logits/rejected": -2.2192695140838623,
"logps/chosen": -1.1890778541564941,
"logps/rejected": -1.3559348583221436,
"loss": 1.6171,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.3781557083129883,
"rewards/margins": 0.33371374011039734,
"rewards/rejected": -2.711869716644287,
"step": 1980
},
{
"epoch": 2.86821007116476,
"grad_norm": 20.65249363397335,
"learning_rate": 5.572676228516038e-10,
"logits/chosen": -2.255366802215576,
"logits/rejected": -2.2476673126220703,
"logps/chosen": -1.1339585781097412,
"logps/rejected": -1.3980591297149658,
"loss": 1.4961,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -2.2679171562194824,
"rewards/margins": 0.5282012224197388,
"rewards/rejected": -2.7961182594299316,
"step": 1990
},
{
"epoch": 2.882623187100261,
"grad_norm": 21.582370970938786,
"learning_rate": 4.3924766237473656e-10,
"logits/chosen": -2.2555174827575684,
"logits/rejected": -2.247621536254883,
"logps/chosen": -1.1424418687820435,
"logps/rejected": -1.3766818046569824,
"loss": 1.531,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -2.284883737564087,
"rewards/margins": 0.4684801697731018,
"rewards/rejected": -2.753363609313965,
"step": 2000
},
{
"epoch": 2.8970363030357626,
"grad_norm": 25.67561175147071,
"learning_rate": 3.35200386533574e-10,
"logits/chosen": -2.2250311374664307,
"logits/rejected": -2.2280611991882324,
"logps/chosen": -1.181894063949585,
"logps/rejected": -1.3828670978546143,
"loss": 1.565,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -2.36378812789917,
"rewards/margins": 0.40194636583328247,
"rewards/rejected": -2.7657341957092285,
"step": 2010
},
{
"epoch": 2.911449418971264,
"grad_norm": 21.660548030642744,
"learning_rate": 2.4515512942220874e-10,
"logits/chosen": -2.27579927444458,
"logits/rejected": -2.2686378955841064,
"logps/chosen": -1.2043073177337646,
"logps/rejected": -1.3992283344268799,
"loss": 1.5841,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -2.4086146354675293,
"rewards/margins": 0.38984209299087524,
"rewards/rejected": -2.7984566688537598,
"step": 2020
},
{
"epoch": 2.925862534906765,
"grad_norm": 24.50163023857697,
"learning_rate": 1.691372775394717e-10,
"logits/chosen": -2.2493457794189453,
"logits/rejected": -2.251462936401367,
"logps/chosen": -1.2009613513946533,
"logps/rejected": -1.3668345212936401,
"loss": 1.6163,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.4019227027893066,
"rewards/margins": 0.33174630999565125,
"rewards/rejected": -2.7336690425872803,
"step": 2030
},
{
"epoch": 2.9402756508422665,
"grad_norm": 23.84173235916362,
"learning_rate": 1.0716826263165724e-10,
"logits/chosen": -2.291029691696167,
"logits/rejected": -2.289228916168213,
"logps/chosen": -1.17218816280365,
"logps/rejected": -1.440246343612671,
"loss": 1.4873,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.3443763256073,
"rewards/margins": 0.5361161828041077,
"rewards/rejected": -2.880492687225342,
"step": 2040
},
{
"epoch": 2.954688766777768,
"grad_norm": 21.3140792744408,
"learning_rate": 5.926555565031743e-11,
"logits/chosen": -2.2876641750335693,
"logits/rejected": -2.289773464202881,
"logps/chosen": -1.216587781906128,
"logps/rejected": -1.4193012714385986,
"loss": 1.5845,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.433175563812256,
"rewards/margins": 0.40542715787887573,
"rewards/rejected": -2.8386025428771973,
"step": 2050
},
{
"epoch": 2.969101882713269,
"grad_norm": 22.106407972159015,
"learning_rate": 2.544266182662458e-11,
"logits/chosen": -2.2547993659973145,
"logits/rejected": -2.2469217777252197,
"logps/chosen": -1.1249706745147705,
"logps/rejected": -1.3703702688217163,
"loss": 1.515,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -2.249941349029541,
"rewards/margins": 0.49079880118370056,
"rewards/rejected": -2.7407405376434326,
"step": 2060
},
{
"epoch": 2.9835149986487703,
"grad_norm": 22.35894660462506,
"learning_rate": 5.709116863872321e-12,
"logits/chosen": -2.2706878185272217,
"logits/rejected": -2.2676730155944824,
"logps/chosen": -1.1365437507629395,
"logps/rejected": -1.3011773824691772,
"loss": 1.6093,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -2.273087501525879,
"rewards/margins": 0.3292676508426666,
"rewards/rejected": -2.6023547649383545,
"step": 2070
},
{
"epoch": 2.9964868029907215,
"step": 2079,
"total_flos": 0.0,
"train_loss": 1.6015657603367983,
"train_runtime": 23310.5572,
"train_samples_per_second": 2.857,
"train_steps_per_second": 0.089
}
],
"logging_steps": 10,
"max_steps": 2079,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}