zephyr-7b-dpo-full / trainer_state.json
ale-bay's picture
Model save
6708ab2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 8.432772549922241,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.5992650985717773,
"logits/rejected": -2.567516326904297,
"logps/chosen": -272.1844482421875,
"logps/rejected": -362.26898193359375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 9.993362324491976,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.4130637645721436,
"logits/rejected": -2.3763909339904785,
"logps/chosen": -268.1092834472656,
"logps/rejected": -252.8538360595703,
"loss": 0.6932,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.00011327523679938167,
"rewards/margins": 0.00018297109636478126,
"rewards/rejected": -6.969591049710289e-05,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 8.212783868686264,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.472649097442627,
"logits/rejected": -2.4103596210479736,
"logps/chosen": -283.3070068359375,
"logps/rejected": -297.09979248046875,
"loss": 0.6928,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.001224780222401023,
"rewards/margins": 0.0005858406075276434,
"rewards/rejected": 0.0006389396148733795,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 7.684799704050697,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.508185386657715,
"logits/rejected": -2.415645122528076,
"logps/chosen": -301.5997619628906,
"logps/rejected": -265.80426025390625,
"loss": 0.6906,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0062199728563427925,
"rewards/margins": 0.0035330094397068024,
"rewards/rejected": 0.002686963649466634,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 7.49113948544429,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.404700756072998,
"logits/rejected": -2.350811243057251,
"logps/chosen": -268.32647705078125,
"logps/rejected": -247.6831817626953,
"loss": 0.6849,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0182647742331028,
"rewards/margins": 0.020485591143369675,
"rewards/rejected": -0.0022208169102668762,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 7.812080735900241,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.3245081901550293,
"logits/rejected": -2.2887587547302246,
"logps/chosen": -280.1948547363281,
"logps/rejected": -293.09405517578125,
"loss": 0.6753,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.022629689425230026,
"rewards/margins": 0.031411103904247284,
"rewards/rejected": -0.00878141075372696,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 8.653367610484782,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.3789138793945312,
"logits/rejected": -2.3128437995910645,
"logps/chosen": -271.4405822753906,
"logps/rejected": -303.5579528808594,
"loss": 0.6602,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.029351558536291122,
"rewards/margins": 0.06681646406650543,
"rewards/rejected": -0.03746490180492401,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 9.793859330498844,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.1905629634857178,
"logits/rejected": -2.1994009017944336,
"logps/chosen": -267.55340576171875,
"logps/rejected": -273.4862976074219,
"loss": 0.6361,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0027915718965232372,
"rewards/margins": 0.1346043348312378,
"rewards/rejected": -0.13739590346813202,
"step": 70
},
{
"epoch": 0.17,
"grad_norm": 13.837088401780129,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.3278985023498535,
"logits/rejected": -2.243424892425537,
"logps/chosen": -337.2379455566406,
"logps/rejected": -292.844970703125,
"loss": 0.6223,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.09643656760454178,
"rewards/margins": 0.17360267043113708,
"rewards/rejected": -0.27003923058509827,
"step": 80
},
{
"epoch": 0.19,
"grad_norm": 17.06156274259609,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.1236023902893066,
"logits/rejected": -2.0597236156463623,
"logps/chosen": -276.4493103027344,
"logps/rejected": -299.7818908691406,
"loss": 0.5987,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.15940961241722107,
"rewards/margins": 0.3067048490047455,
"rewards/rejected": -0.46611452102661133,
"step": 90
},
{
"epoch": 0.21,
"grad_norm": 15.017538794455808,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.0527923107147217,
"logits/rejected": -1.9835008382797241,
"logps/chosen": -294.0035400390625,
"logps/rejected": -321.8211669921875,
"loss": 0.5965,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.27848348021507263,
"rewards/margins": 0.3403889834880829,
"rewards/rejected": -0.6188725233078003,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.1194002628326416,
"eval_logits/rejected": -2.0640361309051514,
"eval_logps/chosen": -308.44342041015625,
"eval_logps/rejected": -344.93780517578125,
"eval_loss": 0.6008175015449524,
"eval_rewards/accuracies": 0.71484375,
"eval_rewards/chosen": -0.43487486243247986,
"eval_rewards/margins": 0.3607807159423828,
"eval_rewards/rejected": -0.7956556081771851,
"eval_runtime": 39.9329,
"eval_samples_per_second": 50.084,
"eval_steps_per_second": 0.801,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 13.512613063149377,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.1186444759368896,
"logits/rejected": -1.960219383239746,
"logps/chosen": -346.79327392578125,
"logps/rejected": -330.68634033203125,
"loss": 0.6016,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.373442143201828,
"rewards/margins": 0.33256274461746216,
"rewards/rejected": -0.7060048580169678,
"step": 110
},
{
"epoch": 0.25,
"grad_norm": 16.771221195438027,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -1.878488302230835,
"logits/rejected": -1.82696533203125,
"logps/chosen": -322.34173583984375,
"logps/rejected": -339.63104248046875,
"loss": 0.5803,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.3693729043006897,
"rewards/margins": 0.3852415680885315,
"rewards/rejected": -0.754614531993866,
"step": 120
},
{
"epoch": 0.27,
"grad_norm": 14.00034800920836,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -1.8512026071548462,
"logits/rejected": -1.7661195993423462,
"logps/chosen": -322.70599365234375,
"logps/rejected": -353.8482360839844,
"loss": 0.5644,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3978124260902405,
"rewards/margins": 0.42583903670310974,
"rewards/rejected": -0.8236514925956726,
"step": 130
},
{
"epoch": 0.29,
"grad_norm": 15.3625036150752,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -1.7981727123260498,
"logits/rejected": -1.7398284673690796,
"logps/chosen": -359.68994140625,
"logps/rejected": -386.440185546875,
"loss": 0.5654,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.40198105573654175,
"rewards/margins": 0.43909168243408203,
"rewards/rejected": -0.841072678565979,
"step": 140
},
{
"epoch": 0.31,
"grad_norm": 20.80601584306436,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -1.7028295993804932,
"logits/rejected": -1.5830708742141724,
"logps/chosen": -345.12286376953125,
"logps/rejected": -376.55859375,
"loss": 0.5775,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5939025282859802,
"rewards/margins": 0.37951546907424927,
"rewards/rejected": -0.9734179377555847,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 18.137407218927724,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -1.6537895202636719,
"logits/rejected": -1.4418971538543701,
"logps/chosen": -318.0480651855469,
"logps/rejected": -349.61431884765625,
"loss": 0.5693,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43722066283226013,
"rewards/margins": 0.5587003827095032,
"rewards/rejected": -0.9959210157394409,
"step": 160
},
{
"epoch": 0.36,
"grad_norm": 16.981581647441832,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -1.5120352506637573,
"logits/rejected": -1.4046074151992798,
"logps/chosen": -306.04840087890625,
"logps/rejected": -330.2176208496094,
"loss": 0.5659,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.538715124130249,
"rewards/margins": 0.42584919929504395,
"rewards/rejected": -0.9645644426345825,
"step": 170
},
{
"epoch": 0.38,
"grad_norm": 14.617848943306955,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -1.3031604290008545,
"logits/rejected": -1.1622366905212402,
"logps/chosen": -317.8174743652344,
"logps/rejected": -331.2264404296875,
"loss": 0.5424,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6144936084747314,
"rewards/margins": 0.4146398603916168,
"rewards/rejected": -1.0291334390640259,
"step": 180
},
{
"epoch": 0.4,
"grad_norm": 18.83954708764831,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -1.3724639415740967,
"logits/rejected": -1.2839093208312988,
"logps/chosen": -320.2606506347656,
"logps/rejected": -353.50677490234375,
"loss": 0.5567,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.47087445855140686,
"rewards/margins": 0.4510224461555481,
"rewards/rejected": -0.9218968152999878,
"step": 190
},
{
"epoch": 0.42,
"grad_norm": 27.897186532435434,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -1.14837646484375,
"logits/rejected": -1.072177767753601,
"logps/chosen": -318.0636291503906,
"logps/rejected": -362.46044921875,
"loss": 0.5688,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5620325207710266,
"rewards/margins": 0.5343230962753296,
"rewards/rejected": -1.096355676651001,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -1.2653636932373047,
"eval_logits/rejected": -1.1455148458480835,
"eval_logps/chosen": -328.60369873046875,
"eval_logps/rejected": -382.0739440917969,
"eval_loss": 0.558937132358551,
"eval_rewards/accuracies": 0.73828125,
"eval_rewards/chosen": -0.6364771723747253,
"eval_rewards/margins": 0.5305400490760803,
"eval_rewards/rejected": -1.1670172214508057,
"eval_runtime": 39.8802,
"eval_samples_per_second": 50.15,
"eval_steps_per_second": 0.802,
"step": 200
},
{
"epoch": 0.44,
"grad_norm": 15.625329725139888,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -1.0910618305206299,
"logits/rejected": -0.9091793298721313,
"logps/chosen": -360.6944885253906,
"logps/rejected": -366.00146484375,
"loss": 0.5724,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6661044359207153,
"rewards/margins": 0.46460071206092834,
"rewards/rejected": -1.1307051181793213,
"step": 210
},
{
"epoch": 0.46,
"grad_norm": 19.24668760178638,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -0.9850679636001587,
"logits/rejected": -0.7914190292358398,
"logps/chosen": -309.50775146484375,
"logps/rejected": -338.098876953125,
"loss": 0.5582,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.44118037819862366,
"rewards/margins": 0.524976372718811,
"rewards/rejected": -0.9661566019058228,
"step": 220
},
{
"epoch": 0.48,
"grad_norm": 19.97223623454459,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -0.4917120039463043,
"logits/rejected": -0.298466295003891,
"logps/chosen": -313.9906005859375,
"logps/rejected": -375.44989013671875,
"loss": 0.5541,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6258713603019714,
"rewards/margins": 0.6441494226455688,
"rewards/rejected": -1.2700207233428955,
"step": 230
},
{
"epoch": 0.5,
"grad_norm": 21.91087703960587,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": 0.09583790600299835,
"logits/rejected": 0.32567495107650757,
"logps/chosen": -339.2015380859375,
"logps/rejected": -384.8148498535156,
"loss": 0.521,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7344726324081421,
"rewards/margins": 0.6358748078346252,
"rewards/rejected": -1.370347499847412,
"step": 240
},
{
"epoch": 0.52,
"grad_norm": 19.954418058737403,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": 0.5116527676582336,
"logits/rejected": 0.8739731907844543,
"logps/chosen": -363.95684814453125,
"logps/rejected": -394.17877197265625,
"loss": 0.5327,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.6146451830863953,
"rewards/margins": 0.7260497808456421,
"rewards/rejected": -1.3406950235366821,
"step": 250
},
{
"epoch": 0.54,
"grad_norm": 29.50918535565258,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": 0.7538167834281921,
"logits/rejected": 1.1193482875823975,
"logps/chosen": -365.6874694824219,
"logps/rejected": -389.31396484375,
"loss": 0.5412,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6788283586502075,
"rewards/margins": 0.6221181154251099,
"rewards/rejected": -1.3009464740753174,
"step": 260
},
{
"epoch": 0.56,
"grad_norm": 24.53746609446516,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": 1.132846474647522,
"logits/rejected": 1.5623472929000854,
"logps/chosen": -345.86700439453125,
"logps/rejected": -378.31719970703125,
"loss": 0.5414,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6558988690376282,
"rewards/margins": 0.5384365320205688,
"rewards/rejected": -1.1943353414535522,
"step": 270
},
{
"epoch": 0.59,
"grad_norm": 27.660029561250692,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": 1.715608835220337,
"logits/rejected": 2.1731343269348145,
"logps/chosen": -352.11346435546875,
"logps/rejected": -406.60772705078125,
"loss": 0.5375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8623917698860168,
"rewards/margins": 0.7531214952468872,
"rewards/rejected": -1.6155132055282593,
"step": 280
},
{
"epoch": 0.61,
"grad_norm": 25.659061335686694,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": 1.6510066986083984,
"logits/rejected": 1.7990186214447021,
"logps/chosen": -337.61016845703125,
"logps/rejected": -396.5470886230469,
"loss": 0.5336,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8643373250961304,
"rewards/margins": 0.6685428023338318,
"rewards/rejected": -1.5328800678253174,
"step": 290
},
{
"epoch": 0.63,
"grad_norm": 22.694419610449454,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": 1.5950249433517456,
"logits/rejected": 2.302058696746826,
"logps/chosen": -376.0797424316406,
"logps/rejected": -401.6100769042969,
"loss": 0.5121,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7354680299758911,
"rewards/margins": 0.6922025680541992,
"rewards/rejected": -1.4276707172393799,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": 1.7225008010864258,
"eval_logits/rejected": 2.1388766765594482,
"eval_logps/chosen": -334.2620849609375,
"eval_logps/rejected": -418.3771667480469,
"eval_loss": 0.5288156270980835,
"eval_rewards/accuracies": 0.76171875,
"eval_rewards/chosen": -0.6930612325668335,
"eval_rewards/margins": 0.8369885683059692,
"eval_rewards/rejected": -1.5300499200820923,
"eval_runtime": 39.9288,
"eval_samples_per_second": 50.089,
"eval_steps_per_second": 0.801,
"step": 300
},
{
"epoch": 0.65,
"grad_norm": 24.37280438119094,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": 2.1118528842926025,
"logits/rejected": 2.5268707275390625,
"logps/chosen": -376.37969970703125,
"logps/rejected": -417.34869384765625,
"loss": 0.507,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.778560996055603,
"rewards/margins": 0.7997097969055176,
"rewards/rejected": -1.5782709121704102,
"step": 310
},
{
"epoch": 0.67,
"grad_norm": 26.080136074985454,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": 2.5625953674316406,
"logits/rejected": 3.1481173038482666,
"logps/chosen": -383.0509338378906,
"logps/rejected": -411.533935546875,
"loss": 0.5013,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8686873316764832,
"rewards/margins": 0.7724698781967163,
"rewards/rejected": -1.6411571502685547,
"step": 320
},
{
"epoch": 0.69,
"grad_norm": 27.228237079063305,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": 2.7803778648376465,
"logits/rejected": 3.247398853302002,
"logps/chosen": -351.1916809082031,
"logps/rejected": -420.370849609375,
"loss": 0.5142,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8757610321044922,
"rewards/margins": 0.7058261632919312,
"rewards/rejected": -1.5815874338150024,
"step": 330
},
{
"epoch": 0.71,
"grad_norm": 28.892127434127993,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": 2.754971504211426,
"logits/rejected": 3.230527400970459,
"logps/chosen": -377.8298645019531,
"logps/rejected": -465.24761962890625,
"loss": 0.5138,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.8463465571403503,
"rewards/margins": 0.987470269203186,
"rewards/rejected": -1.8338168859481812,
"step": 340
},
{
"epoch": 0.73,
"grad_norm": 26.281931375691812,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": 2.723754405975342,
"logits/rejected": 3.361722230911255,
"logps/chosen": -351.22900390625,
"logps/rejected": -415.8351135253906,
"loss": 0.4828,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8969828486442566,
"rewards/margins": 0.857469916343689,
"rewards/rejected": -1.7544529438018799,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 24.286834238524502,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": 2.8787496089935303,
"logits/rejected": 3.6532554626464844,
"logps/chosen": -402.9510192871094,
"logps/rejected": -443.6593322753906,
"loss": 0.5325,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9707611203193665,
"rewards/margins": 0.8318966627120972,
"rewards/rejected": -1.8026577234268188,
"step": 360
},
{
"epoch": 0.77,
"grad_norm": 23.72120672745611,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": 3.308849811553955,
"logits/rejected": 3.967015504837036,
"logps/chosen": -388.5707702636719,
"logps/rejected": -417.2923889160156,
"loss": 0.5145,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0928517580032349,
"rewards/margins": 0.6804816722869873,
"rewards/rejected": -1.7733335494995117,
"step": 370
},
{
"epoch": 0.79,
"grad_norm": 24.194836344161388,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": 3.4659945964813232,
"logits/rejected": 3.6677188873291016,
"logps/chosen": -360.42303466796875,
"logps/rejected": -468.11322021484375,
"loss": 0.5064,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1740145683288574,
"rewards/margins": 0.8953350186347961,
"rewards/rejected": -2.069349765777588,
"step": 380
},
{
"epoch": 0.82,
"grad_norm": 33.960328274537595,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": 3.0402557849884033,
"logits/rejected": 3.3952622413635254,
"logps/chosen": -395.9051208496094,
"logps/rejected": -456.98162841796875,
"loss": 0.5003,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9203785061836243,
"rewards/margins": 0.77605140209198,
"rewards/rejected": -1.6964296102523804,
"step": 390
},
{
"epoch": 0.84,
"grad_norm": 24.96336693295718,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": 3.402864456176758,
"logits/rejected": 3.9089291095733643,
"logps/chosen": -373.3275146484375,
"logps/rejected": -453.81109619140625,
"loss": 0.5208,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0015218257904053,
"rewards/margins": 0.86052405834198,
"rewards/rejected": -1.8620456457138062,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": 2.9372177124023438,
"eval_logits/rejected": 3.4323720932006836,
"eval_logps/chosen": -352.0043029785156,
"eval_logps/rejected": -445.87408447265625,
"eval_loss": 0.5152841210365295,
"eval_rewards/accuracies": 0.7578125,
"eval_rewards/chosen": -0.8704833984375,
"eval_rewards/margins": 0.9345353841781616,
"eval_rewards/rejected": -1.8050185441970825,
"eval_runtime": 39.8891,
"eval_samples_per_second": 50.139,
"eval_steps_per_second": 0.802,
"step": 400
},
{
"epoch": 0.86,
"grad_norm": 26.45544298088616,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": 3.266558885574341,
"logits/rejected": 4.045865535736084,
"logps/chosen": -393.68505859375,
"logps/rejected": -454.821044921875,
"loss": 0.5226,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.9244238138198853,
"rewards/margins": 0.9223299026489258,
"rewards/rejected": -1.8467538356781006,
"step": 410
},
{
"epoch": 0.88,
"grad_norm": 24.753221828065943,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": 3.017789363861084,
"logits/rejected": 3.619795322418213,
"logps/chosen": -396.9893798828125,
"logps/rejected": -455.4769592285156,
"loss": 0.5062,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9954174160957336,
"rewards/margins": 0.8088730573654175,
"rewards/rejected": -1.804290533065796,
"step": 420
},
{
"epoch": 0.9,
"grad_norm": 28.64072501651785,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": 3.4113173484802246,
"logits/rejected": 3.839292526245117,
"logps/chosen": -386.08624267578125,
"logps/rejected": -451.10894775390625,
"loss": 0.4865,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0454809665679932,
"rewards/margins": 0.8698482513427734,
"rewards/rejected": -1.9153292179107666,
"step": 430
},
{
"epoch": 0.92,
"grad_norm": 28.408098716357102,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": 2.9448680877685547,
"logits/rejected": 3.614654541015625,
"logps/chosen": -380.95782470703125,
"logps/rejected": -454.02191162109375,
"loss": 0.5045,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9367244839668274,
"rewards/margins": 0.8637276887893677,
"rewards/rejected": -1.8004519939422607,
"step": 440
},
{
"epoch": 0.94,
"grad_norm": 27.41806430030018,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": 3.149013042449951,
"logits/rejected": 3.4816536903381348,
"logps/chosen": -401.301513671875,
"logps/rejected": -495.34002685546875,
"loss": 0.5059,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.096161127090454,
"rewards/margins": 0.7614067792892456,
"rewards/rejected": -1.8575680255889893,
"step": 450
},
{
"epoch": 0.96,
"grad_norm": 30.97252598966743,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": 3.359788417816162,
"logits/rejected": 3.880640745162964,
"logps/chosen": -376.32879638671875,
"logps/rejected": -425.67010498046875,
"loss": 0.5099,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0073617696762085,
"rewards/margins": 0.8302923440933228,
"rewards/rejected": -1.8376541137695312,
"step": 460
},
{
"epoch": 0.98,
"grad_norm": 23.032101494576157,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": 3.195591688156128,
"logits/rejected": 3.7321903705596924,
"logps/chosen": -378.28631591796875,
"logps/rejected": -458.43096923828125,
"loss": 0.4995,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0239256620407104,
"rewards/margins": 0.7799959778785706,
"rewards/rejected": -1.8039219379425049,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.5581105443723032,
"train_runtime": 5172.5891,
"train_samples_per_second": 11.819,
"train_steps_per_second": 0.092
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}