zephyr-7b-dpo-full / trainer_state.json
Beanpow's picture
Model save
e49290e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994767137624281,
"eval_steps": 100,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010465724751439038,
"grad_norm": 21.102116873134612,
"learning_rate": 5.208333333333333e-09,
"logits/chosen": -2.924262046813965,
"logits/rejected": -2.7925047874450684,
"logps/chosen": -380.8447570800781,
"logps/rejected": -358.51123046875,
"loss": 4.6506,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.010465724751439037,
"grad_norm": 15.822543074567085,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.595761299133301,
"logits/rejected": -2.569227457046509,
"logps/chosen": -256.6064453125,
"logps/rejected": -234.93408203125,
"loss": 4.5621,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00042897689854726195,
"rewards/margins": 0.0009927540086209774,
"rewards/rejected": -0.0005637770518660545,
"step": 10
},
{
"epoch": 0.020931449502878074,
"grad_norm": 18.010820015079055,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.613164186477661,
"logits/rejected": -2.5756287574768066,
"logps/chosen": -283.0158996582031,
"logps/rejected": -282.265869140625,
"loss": 4.4053,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0006733193295076489,
"rewards/margins": 0.0005819452926516533,
"rewards/rejected": 9.137402230408043e-05,
"step": 20
},
{
"epoch": 0.03139717425431711,
"grad_norm": 21.44807572026145,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -2.691143035888672,
"logits/rejected": -2.6666667461395264,
"logps/chosen": -269.9042053222656,
"logps/rejected": -276.4795837402344,
"loss": 5.105,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0037794082891196012,
"rewards/margins": 0.0018267262494191527,
"rewards/rejected": 0.0019526820397004485,
"step": 30
},
{
"epoch": 0.04186289900575615,
"grad_norm": 17.302023991146115,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.6577816009521484,
"logits/rejected": -2.5818943977355957,
"logps/chosen": -288.9285888671875,
"logps/rejected": -280.9770202636719,
"loss": 4.9032,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.020702064037322998,
"rewards/margins": 0.009830506518483162,
"rewards/rejected": 0.01087155845016241,
"step": 40
},
{
"epoch": 0.052328623757195186,
"grad_norm": 22.46337927130885,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -2.6507585048675537,
"logits/rejected": -2.5627222061157227,
"logps/chosen": -263.1905212402344,
"logps/rejected": -234.9305419921875,
"loss": 4.8274,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.044054824858903885,
"rewards/margins": 0.02749818004667759,
"rewards/rejected": 0.016556641086935997,
"step": 50
},
{
"epoch": 0.06279434850863422,
"grad_norm": 18.98737987603255,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.5976526737213135,
"logits/rejected": -2.5587098598480225,
"logps/chosen": -299.9574890136719,
"logps/rejected": -276.1783142089844,
"loss": 4.5279,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.040667824447155,
"rewards/margins": 0.04492232948541641,
"rewards/rejected": -0.004254504106938839,
"step": 60
},
{
"epoch": 0.07326007326007326,
"grad_norm": 20.501382800234886,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -2.469130039215088,
"logits/rejected": -2.452857732772827,
"logps/chosen": -265.96978759765625,
"logps/rejected": -271.6788330078125,
"loss": 4.6703,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0633089542388916,
"rewards/margins": 0.07126398384571075,
"rewards/rejected": -0.13457295298576355,
"step": 70
},
{
"epoch": 0.0837257980115123,
"grad_norm": 25.49997843488533,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.4551777839660645,
"logits/rejected": -2.3624327182769775,
"logps/chosen": -285.5320739746094,
"logps/rejected": -276.4596252441406,
"loss": 4.5605,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09489366412162781,
"rewards/margins": 0.15657536685466766,
"rewards/rejected": -0.2514690160751343,
"step": 80
},
{
"epoch": 0.09419152276295134,
"grad_norm": 30.61647338954573,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -2.3756256103515625,
"logits/rejected": -2.332918882369995,
"logps/chosen": -277.46014404296875,
"logps/rejected": -290.0049743652344,
"loss": 4.1231,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.21862252056598663,
"rewards/margins": 0.033695660531520844,
"rewards/rejected": -0.25231820344924927,
"step": 90
},
{
"epoch": 0.10465724751439037,
"grad_norm": 38.124561793065574,
"learning_rate": 4.999732492681437e-07,
"logits/chosen": -2.332035779953003,
"logits/rejected": -2.2253689765930176,
"logps/chosen": -314.4341125488281,
"logps/rejected": -317.18695068359375,
"loss": 4.5854,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1460995227098465,
"rewards/margins": 0.22573721408843994,
"rewards/rejected": -0.37183672189712524,
"step": 100
},
{
"epoch": 0.10465724751439037,
"eval_logits/chosen": -2.2812609672546387,
"eval_logits/rejected": -2.192293167114258,
"eval_logps/chosen": -309.1551818847656,
"eval_logps/rejected": -310.1242370605469,
"eval_loss": 4.381103515625,
"eval_rewards/accuracies": 0.648809552192688,
"eval_rewards/chosen": -0.2718724012374878,
"eval_rewards/margins": 0.2273014634847641,
"eval_rewards/rejected": -0.4991738498210907,
"eval_runtime": 176.2372,
"eval_samples_per_second": 11.348,
"eval_steps_per_second": 0.357,
"step": 100
},
{
"epoch": 0.1151229722658294,
"grad_norm": 47.336977780094564,
"learning_rate": 4.996723692767926e-07,
"logits/chosen": -2.0436112880706787,
"logits/rejected": -1.9534924030303955,
"logps/chosen": -310.6973571777344,
"logps/rejected": -324.1681823730469,
"loss": 3.758,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.6924275755882263,
"rewards/margins": 0.17653007805347443,
"rewards/rejected": -0.8689576387405396,
"step": 110
},
{
"epoch": 0.12558869701726844,
"grad_norm": 109.43376131471078,
"learning_rate": 4.990375746213598e-07,
"logits/chosen": -0.08515436947345734,
"logits/rejected": 0.34949326515197754,
"logps/chosen": -343.26495361328125,
"logps/rejected": -412.98577880859375,
"loss": 4.0333,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8068662881851196,
"rewards/margins": 0.438527911901474,
"rewards/rejected": -1.2453943490982056,
"step": 120
},
{
"epoch": 0.1360544217687075,
"grad_norm": 95.04671304091885,
"learning_rate": 4.980697142834314e-07,
"logits/chosen": 0.396954745054245,
"logits/rejected": 1.0232269763946533,
"logps/chosen": -406.28521728515625,
"logps/rejected": -430.10760498046875,
"loss": 4.2005,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.2501262426376343,
"rewards/margins": 0.5063079595565796,
"rewards/rejected": -1.7564342021942139,
"step": 130
},
{
"epoch": 0.14652014652014653,
"grad_norm": 144.39035434160894,
"learning_rate": 4.967700826904229e-07,
"logits/chosen": -0.1560676395893097,
"logits/rejected": 0.6105406880378723,
"logps/chosen": -416.2538146972656,
"logps/rejected": -463.2472229003906,
"loss": 3.7876,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2516638040542603,
"rewards/margins": 0.49105915427207947,
"rewards/rejected": -1.7427231073379517,
"step": 140
},
{
"epoch": 0.15698587127158556,
"grad_norm": 125.21681673589694,
"learning_rate": 4.951404179843962e-07,
"logits/chosen": 2.0407581329345703,
"logits/rejected": 2.8481547832489014,
"logps/chosen": -510.521484375,
"logps/rejected": -534.6341552734375,
"loss": 3.898,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.226250648498535,
"rewards/margins": 0.6501102447509766,
"rewards/rejected": -2.876361131668091,
"step": 150
},
{
"epoch": 0.1674515960230246,
"grad_norm": 66.88313091855639,
"learning_rate": 4.931828996974498e-07,
"logits/chosen": 2.163175106048584,
"logits/rejected": 3.5420451164245605,
"logps/chosen": -585.4688720703125,
"logps/rejected": -635.2697143554688,
"loss": 3.9393,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.89656138420105,
"rewards/margins": 0.8964195251464844,
"rewards/rejected": -3.792980909347534,
"step": 160
},
{
"epoch": 0.17791732077446362,
"grad_norm": 188.98325062900707,
"learning_rate": 4.909001458367866e-07,
"logits/chosen": 0.49319368600845337,
"logits/rejected": 1.3766599893569946,
"logps/chosen": -599.5331420898438,
"logps/rejected": -654.1383056640625,
"loss": 3.9922,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -3.210639476776123,
"rewards/margins": 0.922272801399231,
"rewards/rejected": -4.132911682128906,
"step": 170
},
{
"epoch": 0.18838304552590268,
"grad_norm": 320.6202106283321,
"learning_rate": 4.882952093833627e-07,
"logits/chosen": 0.6820823550224304,
"logits/rejected": 1.588409185409546,
"logps/chosen": -1040.5491943359375,
"logps/rejected": -1233.1207275390625,
"loss": 3.36,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -8.006011009216309,
"rewards/margins": 1.8573882579803467,
"rewards/rejected": -9.86340045928955,
"step": 180
},
{
"epoch": 0.1988487702773417,
"grad_norm": 157.79546381015746,
"learning_rate": 4.853715742087946e-07,
"logits/chosen": 3.3087031841278076,
"logits/rejected": 4.11985445022583,
"logps/chosen": -1690.8167724609375,
"logps/rejected": -1890.6634521484375,
"loss": 2.6799,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.548372268676758,
"rewards/margins": 1.8595011234283447,
"rewards/rejected": -16.407875061035156,
"step": 190
},
{
"epoch": 0.20931449502878074,
"grad_norm": 178.97245767319544,
"learning_rate": 4.821331504159906e-07,
"logits/chosen": 0.3337511122226715,
"logits/rejected": 1.9961885213851929,
"logps/chosen": -1578.712158203125,
"logps/rejected": -1801.65625,
"loss": 2.6464,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -12.680551528930664,
"rewards/margins": 2.764849901199341,
"rewards/rejected": -15.445402145385742,
"step": 200
},
{
"epoch": 0.20931449502878074,
"eval_logits/chosen": -0.35622134804725647,
"eval_logits/rejected": 0.6981890797615051,
"eval_logps/chosen": -1244.43603515625,
"eval_logps/rejected": -1423.3580322265625,
"eval_loss": 2.606262683868408,
"eval_rewards/accuracies": 0.625,
"eval_rewards/chosen": -9.624680519104004,
"eval_rewards/margins": 2.0068302154541016,
"eval_rewards/rejected": -11.631510734558105,
"eval_runtime": 177.3795,
"eval_samples_per_second": 11.275,
"eval_steps_per_second": 0.355,
"step": 200
},
{
"epoch": 0.21978021978021978,
"grad_norm": 221.39959720400535,
"learning_rate": 4.785842691097342e-07,
"logits/chosen": 0.43124809861183167,
"logits/rejected": 1.6196168661117554,
"logps/chosen": -1394.329345703125,
"logps/rejected": -1612.8701171875,
"loss": 2.2192,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -11.115188598632812,
"rewards/margins": 2.29093337059021,
"rewards/rejected": -13.406122207641602,
"step": 210
},
{
"epoch": 0.2302459445316588,
"grad_norm": 107.97254065213261,
"learning_rate": 4.7472967660421603e-07,
"logits/chosen": 0.5400440096855164,
"logits/rejected": 1.9760030508041382,
"logps/chosen": -1507.001220703125,
"logps/rejected": -1713.616455078125,
"loss": 2.018,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -12.197932243347168,
"rewards/margins": 2.5764663219451904,
"rewards/rejected": -14.774396896362305,
"step": 220
},
{
"epoch": 0.24071166928309787,
"grad_norm": 217.88193736039008,
"learning_rate": 4.705745280752585e-07,
"logits/chosen": 1.4225207567214966,
"logits/rejected": 2.4756038188934326,
"logps/chosen": -1726.320068359375,
"logps/rejected": -2005.7041015625,
"loss": 1.9719,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -14.375930786132812,
"rewards/margins": 2.995753526687622,
"rewards/rejected": -17.37168312072754,
"step": 230
},
{
"epoch": 0.25117739403453687,
"grad_norm": 109.77258728949327,
"learning_rate": 4.6612438066572555e-07,
"logits/chosen": 2.2113587856292725,
"logits/rejected": 3.125591993331909,
"logps/chosen": -1894.770751953125,
"logps/rejected": -2110.86376953125,
"loss": 1.9847,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.27196502685547,
"rewards/margins": 2.294943332672119,
"rewards/rejected": -18.56690788269043,
"step": 240
},
{
"epoch": 0.2616431187859759,
"grad_norm": 276.53415343052893,
"learning_rate": 4.6138518605333664e-07,
"logits/chosen": 1.203977108001709,
"logits/rejected": 1.9225616455078125,
"logps/chosen": -1561.0047607421875,
"logps/rejected": -1763.075439453125,
"loss": 2.257,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -12.832204818725586,
"rewards/margins": 2.222775936126709,
"rewards/rejected": -15.05497932434082,
"step": 250
},
{
"epoch": 0.272108843537415,
"grad_norm": 159.14963627253198,
"learning_rate": 4.5636328249082514e-07,
"logits/chosen": 1.134037733078003,
"logits/rejected": 2.1568219661712646,
"logps/chosen": -1608.8623046875,
"logps/rejected": -1763.599853515625,
"loss": 2.2606,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -13.198956489562988,
"rewards/margins": 1.6145280599594116,
"rewards/rejected": -14.813486099243164,
"step": 260
},
{
"epoch": 0.282574568288854,
"grad_norm": 199.45417630865836,
"learning_rate": 4.510653863290871e-07,
"logits/chosen": 0.3547247350215912,
"logits/rejected": 1.2751286029815674,
"logps/chosen": -1781.0726318359375,
"logps/rejected": -2089.05615234375,
"loss": 1.7211,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -14.814226150512695,
"rewards/margins": 3.540767192840576,
"rewards/rejected": -18.354991912841797,
"step": 270
},
{
"epoch": 0.29304029304029305,
"grad_norm": 162.5497817330968,
"learning_rate": 4.4549858303465737e-07,
"logits/chosen": 0.21130748093128204,
"logits/rejected": 1.2269564867019653,
"logps/chosen": -1743.0787353515625,
"logps/rejected": -2033.669921875,
"loss": 1.9445,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -14.468345642089844,
"rewards/margins": 3.0355849266052246,
"rewards/rejected": -17.50392723083496,
"step": 280
},
{
"epoch": 0.3035060177917321,
"grad_norm": 307.15808847538113,
"learning_rate": 4.396703177135261e-07,
"logits/chosen": 0.7419403791427612,
"logits/rejected": 1.9202260971069336,
"logps/chosen": -1948.1787109375,
"logps/rejected": -2273.5205078125,
"loss": 1.9864,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -16.753948211669922,
"rewards/margins": 3.6111111640930176,
"rewards/rejected": -20.36505699157715,
"step": 290
},
{
"epoch": 0.3139717425431711,
"grad_norm": 90.00202577382801,
"learning_rate": 4.335883851539693e-07,
"logits/chosen": 0.30849236249923706,
"logits/rejected": 1.1072229146957397,
"logps/chosen": -1431.3275146484375,
"logps/rejected": -1653.4029541015625,
"loss": 1.9069,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -11.623054504394531,
"rewards/margins": 2.2337088584899902,
"rewards/rejected": -13.856762886047363,
"step": 300
},
{
"epoch": 0.3139717425431711,
"eval_logits/chosen": 0.45899611711502075,
"eval_logits/rejected": 1.5569082498550415,
"eval_logps/chosen": -1266.6490478515625,
"eval_logps/rejected": -1452.7674560546875,
"eval_loss": 2.262396812438965,
"eval_rewards/accuracies": 0.6329365372657776,
"eval_rewards/chosen": -9.846811294555664,
"eval_rewards/margins": 2.0787949562072754,
"eval_rewards/rejected": -11.925606727600098,
"eval_runtime": 176.5188,
"eval_samples_per_second": 11.33,
"eval_steps_per_second": 0.357,
"step": 300
},
{
"epoch": 0.32443746729461015,
"grad_norm": 177.8700388917398,
"learning_rate": 4.272609194017105e-07,
"logits/chosen": 0.647371768951416,
"logits/rejected": 2.9104599952697754,
"logps/chosen": -1395.496826171875,
"logps/rejected": -1711.9573974609375,
"loss": 2.3095,
"rewards/accuracies": 0.75,
"rewards/chosen": -11.117349624633789,
"rewards/margins": 3.674748182296753,
"rewards/rejected": -14.792098999023438,
"step": 310
},
{
"epoch": 0.3349031920460492,
"grad_norm": 180.92515200199898,
"learning_rate": 4.2069638288135547e-07,
"logits/chosen": 0.9543863534927368,
"logits/rejected": 1.7447538375854492,
"logps/chosen": -1926.299560546875,
"logps/rejected": -2217.88037109375,
"loss": 2.1724,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -16.73282814025879,
"rewards/margins": 2.939984083175659,
"rewards/rejected": -19.672813415527344,
"step": 320
},
{
"epoch": 0.3453689167974882,
"grad_norm": 145.6894284610869,
"learning_rate": 4.139035550786494e-07,
"logits/chosen": -0.039321091026067734,
"logits/rejected": 0.5018073320388794,
"logps/chosen": -1734.091796875,
"logps/rejected": -1908.339111328125,
"loss": 1.716,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.77747917175293,
"rewards/margins": 1.9848415851593018,
"rewards/rejected": -16.76232147216797,
"step": 330
},
{
"epoch": 0.35583464154892724,
"grad_norm": 183.78050890033984,
"learning_rate": 4.0689152079869306e-07,
"logits/chosen": -0.5724295377731323,
"logits/rejected": 0.023262571543455124,
"logps/chosen": -1660.732177734375,
"logps/rejected": -1876.1025390625,
"loss": 1.8439,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -13.912447929382324,
"rewards/margins": 2.488671064376831,
"rewards/rejected": -16.401119232177734,
"step": 340
},
{
"epoch": 0.3663003663003663,
"grad_norm": 149.28700648360655,
"learning_rate": 3.99669658015821e-07,
"logits/chosen": 0.006322336383163929,
"logits/rejected": 0.6332755088806152,
"logps/chosen": -1966.5765380859375,
"logps/rejected": -2201.843505859375,
"loss": 1.6671,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -16.753414154052734,
"rewards/margins": 2.7041499614715576,
"rewards/rejected": -19.457565307617188,
"step": 350
},
{
"epoch": 0.37676609105180536,
"grad_norm": 237.65668361495474,
"learning_rate": 3.92247625331392e-07,
"logits/chosen": -0.21500203013420105,
"logits/rejected": 0.6255682110786438,
"logps/chosen": -1989.7509765625,
"logps/rejected": -2207.83349609375,
"loss": 1.6927,
"rewards/accuracies": 0.625,
"rewards/chosen": -17.038667678833008,
"rewards/margins": 2.4120330810546875,
"rewards/rejected": -19.450698852539062,
"step": 360
},
{
"epoch": 0.3872318158032444,
"grad_norm": 152.55773033990448,
"learning_rate": 3.846353490562664e-07,
"logits/chosen": -0.39199286699295044,
"logits/rejected": -0.043508779257535934,
"logps/chosen": -1889.5286865234375,
"logps/rejected": -2139.589111328125,
"loss": 1.7098,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.262523651123047,
"rewards/margins": 2.7768733501434326,
"rewards/rejected": -19.039398193359375,
"step": 370
},
{
"epoch": 0.3976975405546834,
"grad_norm": 239.86422108427834,
"learning_rate": 3.768430099352445e-07,
"logits/chosen": -0.5338395833969116,
"logits/rejected": -0.10323655605316162,
"logps/chosen": -1830.7080078125,
"logps/rejected": -2104.773681640625,
"loss": 1.786,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -15.76060962677002,
"rewards/margins": 2.8237688541412354,
"rewards/rejected": -18.58437728881836,
"step": 380
},
{
"epoch": 0.40816326530612246,
"grad_norm": 137.89263121746114,
"learning_rate": 3.6888102953122304e-07,
"logits/chosen": -0.3421451449394226,
"logits/rejected": 0.2877078056335449,
"logps/chosen": -1774.384765625,
"logps/rejected": -2007.7366943359375,
"loss": 1.9274,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.92773151397705,
"rewards/margins": 2.465951442718506,
"rewards/rejected": -17.393680572509766,
"step": 390
},
{
"epoch": 0.4186289900575615,
"grad_norm": 164.86784545063486,
"learning_rate": 3.607600562872785e-07,
"logits/chosen": -0.7335325479507446,
"logits/rejected": -0.33919858932495117,
"logps/chosen": -1733.375244140625,
"logps/rejected": -1963.0279541015625,
"loss": 1.6642,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.63383960723877,
"rewards/margins": 2.319460391998291,
"rewards/rejected": -16.95330047607422,
"step": 400
},
{
"epoch": 0.4186289900575615,
"eval_logits/chosen": -0.7751028537750244,
"eval_logits/rejected": -0.08748837560415268,
"eval_logps/chosen": -1731.152587890625,
"eval_logps/rejected": -2045.1492919921875,
"eval_loss": 1.6421091556549072,
"eval_rewards/accuracies": 0.625,
"eval_rewards/chosen": -14.491846084594727,
"eval_rewards/margins": 3.3575782775878906,
"eval_rewards/rejected": -17.849422454833984,
"eval_runtime": 176.0651,
"eval_samples_per_second": 11.359,
"eval_steps_per_second": 0.358,
"step": 400
},
{
"epoch": 0.4290947148090005,
"grad_norm": 128.91689311765836,
"learning_rate": 3.5249095128531856e-07,
"logits/chosen": -0.10633065551519394,
"logits/rejected": 0.350477933883667,
"logps/chosen": -1862.1099853515625,
"logps/rejected": -2067.15673828125,
"loss": 1.7556,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.903741836547852,
"rewards/margins": 2.2993245124816895,
"rewards/rejected": -18.203065872192383,
"step": 410
},
{
"epoch": 0.43956043956043955,
"grad_norm": 187.2282869549343,
"learning_rate": 3.4408477372034736e-07,
"logits/chosen": -0.2209610939025879,
"logits/rejected": 0.7663095593452454,
"logps/chosen": -1825.959228515625,
"logps/rejected": -2182.580810546875,
"loss": 1.8542,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -15.437875747680664,
"rewards/margins": 3.867755174636841,
"rewards/rejected": -19.30562973022461,
"step": 420
},
{
"epoch": 0.4500261643118786,
"grad_norm": 150.13979068919696,
"learning_rate": 3.3555276610977276e-07,
"logits/chosen": -1.128701090812683,
"logits/rejected": -0.5558885335922241,
"logps/chosen": -1832.6103515625,
"logps/rejected": -2176.197265625,
"loss": 1.5079,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.788568496704102,
"rewards/margins": 3.47161602973938,
"rewards/rejected": -19.26018714904785,
"step": 430
},
{
"epoch": 0.4604918890633176,
"grad_norm": 163.41066719667168,
"learning_rate": 3.269063392575352e-07,
"logits/chosen": -0.6949409246444702,
"logits/rejected": -0.05746125057339668,
"logps/chosen": -1597.5341796875,
"logps/rejected": -1821.0198974609375,
"loss": 1.4868,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -13.023595809936523,
"rewards/margins": 2.632272481918335,
"rewards/rejected": -15.655868530273438,
"step": 440
},
{
"epoch": 0.47095761381475665,
"grad_norm": 133.46596474594617,
"learning_rate": 3.1815705699316964e-07,
"logits/chosen": -0.4808398187160492,
"logits/rejected": 0.3264926075935364,
"logps/chosen": -1599.6370849609375,
"logps/rejected": -1936.6884765625,
"loss": 1.5413,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.172491073608398,
"rewards/margins": 3.4112179279327393,
"rewards/rejected": -16.583707809448242,
"step": 450
},
{
"epoch": 0.48142333856619574,
"grad_norm": 155.84007478164062,
"learning_rate": 3.0931662070620794e-07,
"logits/chosen": -0.719369113445282,
"logits/rejected": -0.06152462959289551,
"logps/chosen": -1643.2447509765625,
"logps/rejected": -1872.9976806640625,
"loss": 1.7906,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -13.85786247253418,
"rewards/margins": 2.4219117164611816,
"rewards/rejected": -16.279773712158203,
"step": 460
},
{
"epoch": 0.49188906331763477,
"grad_norm": 203.3322056694353,
"learning_rate": 3.003968536966078e-07,
"logits/chosen": -0.4609583020210266,
"logits/rejected": -0.09374441206455231,
"logps/chosen": -1654.1614990234375,
"logps/rejected": -1845.5618896484375,
"loss": 1.7718,
"rewards/accuracies": 0.59375,
"rewards/chosen": -13.703729629516602,
"rewards/margins": 2.2525086402893066,
"rewards/rejected": -15.956239700317383,
"step": 470
},
{
"epoch": 0.5023547880690737,
"grad_norm": 156.4799546194198,
"learning_rate": 2.9140968536213693e-07,
"logits/chosen": -0.2353781908750534,
"logits/rejected": 0.5946909785270691,
"logps/chosen": -1859.3265380859375,
"logps/rejected": -2325.88134765625,
"loss": 1.3829,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -15.921140670776367,
"rewards/margins": 4.824706077575684,
"rewards/rejected": -20.745845794677734,
"step": 480
},
{
"epoch": 0.5128205128205128,
"grad_norm": 160.19325879757844,
"learning_rate": 2.823671352438608e-07,
"logits/chosen": -0.9654836654663086,
"logits/rejected": -0.002035105135291815,
"logps/chosen": -1637.873291015625,
"logps/rejected": -2143.010986328125,
"loss": 1.6206,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -13.259417533874512,
"rewards/margins": 5.606515407562256,
"rewards/rejected": -18.86593246459961,
"step": 490
},
{
"epoch": 0.5232862375719518,
"grad_norm": 221.83952267135834,
"learning_rate": 2.73281296951072e-07,
"logits/chosen": -0.6597784161567688,
"logits/rejected": -0.14649493992328644,
"logps/chosen": -1530.5738525390625,
"logps/rejected": -1781.8070068359375,
"loss": 1.6328,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -12.7192964553833,
"rewards/margins": 2.8244967460632324,
"rewards/rejected": -15.543792724609375,
"step": 500
},
{
"epoch": 0.5232862375719518,
"eval_logits/chosen": -0.6590258479118347,
"eval_logits/rejected": -0.091790109872818,
"eval_logps/chosen": -1589.3370361328125,
"eval_logps/rejected": -1890.562255859375,
"eval_loss": 1.5119922161102295,
"eval_rewards/accuracies": 0.6388888955116272,
"eval_rewards/chosen": -13.073691368103027,
"eval_rewards/margins": 3.229863166809082,
"eval_rewards/rejected": -16.303556442260742,
"eval_runtime": 177.8158,
"eval_samples_per_second": 11.248,
"eval_steps_per_second": 0.354,
"step": 500
},
{
"epoch": 0.533751962323391,
"grad_norm": 187.4336485549293,
"learning_rate": 2.641643219871597e-07,
"logits/chosen": -0.5598984360694885,
"logits/rejected": -0.2727218270301819,
"logps/chosen": -1694.568359375,
"logps/rejected": -2086.98193359375,
"loss": 1.4069,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -14.106300354003906,
"rewards/margins": 4.125433921813965,
"rewards/rejected": -18.231733322143555,
"step": 510
},
{
"epoch": 0.54421768707483,
"grad_norm": 180.24950333654212,
"learning_rate": 2.550284034980507e-07,
"logits/chosen": -0.652435302734375,
"logits/rejected": -0.25857192277908325,
"logps/chosen": -1941.6849365234375,
"logps/rejected": -2231.46337890625,
"loss": 1.6022,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -16.69613265991211,
"rewards/margins": 3.1190426349639893,
"rewards/rejected": -19.815174102783203,
"step": 520
},
{
"epoch": 0.554683411826269,
"grad_norm": 147.71519410172087,
"learning_rate": 2.4588575996495794e-07,
"logits/chosen": -0.6198351979255676,
"logits/rejected": -0.19036616384983063,
"logps/chosen": -1601.6470947265625,
"logps/rejected": -1820.4556884765625,
"loss": 1.5136,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -13.329002380371094,
"rewards/margins": 2.3253164291381836,
"rewards/rejected": -15.654316902160645,
"step": 530
},
{
"epoch": 0.565149136577708,
"grad_norm": 146.6770433780799,
"learning_rate": 2.367486188632446e-07,
"logits/chosen": -0.7303057909011841,
"logits/rejected": 0.15564236044883728,
"logps/chosen": -1670.916015625,
"logps/rejected": -2011.5406494140625,
"loss": 1.5458,
"rewards/accuracies": 0.625,
"rewards/chosen": -13.805659294128418,
"rewards/margins": 3.703829288482666,
"rewards/rejected": -17.509489059448242,
"step": 540
},
{
"epoch": 0.5756148613291471,
"grad_norm": 206.94359776232758,
"learning_rate": 2.276292003092593e-07,
"logits/chosen": -0.22513580322265625,
"logits/rejected": 0.4895138740539551,
"logps/chosen": -1914.7532958984375,
"logps/rejected": -2300.30322265625,
"loss": 1.6801,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -16.434810638427734,
"rewards/margins": 4.275403022766113,
"rewards/rejected": -20.71021270751953,
"step": 550
},
{
"epoch": 0.5860805860805861,
"grad_norm": 175.41735239090949,
"learning_rate": 2.185397007170141e-07,
"logits/chosen": -0.1453290730714798,
"logits/rejected": 0.3121495842933655,
"logps/chosen": -1876.300537109375,
"logps/rejected": -2229.38134765625,
"loss": 1.3878,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -15.934832572937012,
"rewards/margins": 3.8099570274353027,
"rewards/rejected": -19.744789123535156,
"step": 560
},
{
"epoch": 0.5965463108320251,
"grad_norm": 142.79294258337345,
"learning_rate": 2.094922764865619e-07,
"logits/chosen": -0.276650995016098,
"logits/rejected": 0.13945253193378448,
"logps/chosen": -1827.0634765625,
"logps/rejected": -2034.280517578125,
"loss": 1.4902,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -15.475687980651855,
"rewards/margins": 2.2050392627716064,
"rewards/rejected": -17.680728912353516,
"step": 570
},
{
"epoch": 0.6070120355834642,
"grad_norm": 245.80968468908674,
"learning_rate": 2.0049902774588797e-07,
"logits/chosen": -0.011815989390015602,
"logits/rejected": 0.42436084151268005,
"logps/chosen": -1794.5543212890625,
"logps/rejected": -2061.93310546875,
"loss": 1.4461,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -15.412150382995605,
"rewards/margins": 3.078895330429077,
"rewards/rejected": -18.491044998168945,
"step": 580
},
{
"epoch": 0.6174777603349032,
"grad_norm": 175.38280547329734,
"learning_rate": 1.9157198216806238e-07,
"logits/chosen": -0.3044319152832031,
"logits/rejected": 0.3406422734260559,
"logps/chosen": -1649.8509521484375,
"logps/rejected": -2006.366455078125,
"loss": 1.5446,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -13.71898365020752,
"rewards/margins": 3.657163143157959,
"rewards/rejected": -17.376148223876953,
"step": 590
},
{
"epoch": 0.6279434850863422,
"grad_norm": 203.04339818262545,
"learning_rate": 1.8272307888529274e-07,
"logits/chosen": 0.16477735340595245,
"logits/rejected": 0.6171606183052063,
"logps/chosen": -1870.41015625,
"logps/rejected": -2165.638427734375,
"loss": 1.6032,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -15.878863334655762,
"rewards/margins": 3.1766743659973145,
"rewards/rejected": -19.055538177490234,
"step": 600
},
{
"epoch": 0.6279434850863422,
"eval_logits/chosen": 0.01903720200061798,
"eval_logits/rejected": 0.6402472853660583,
"eval_logps/chosen": -2015.7071533203125,
"eval_logps/rejected": -2402.58447265625,
"eval_loss": 1.4751698970794678,
"eval_rewards/accuracies": 0.6230158805847168,
"eval_rewards/chosen": -17.33738899230957,
"eval_rewards/margins": 4.086385250091553,
"eval_rewards/rejected": -21.42377471923828,
"eval_runtime": 176.4506,
"eval_samples_per_second": 11.335,
"eval_steps_per_second": 0.357,
"step": 600
},
{
"epoch": 0.6384092098377813,
"grad_norm": 184.64896406440843,
"learning_rate": 1.7396415252139288e-07,
"logits/chosen": 0.0034784465096890926,
"logits/rejected": 0.6044633388519287,
"logps/chosen": -2050.113037109375,
"logps/rejected": -2622.564453125,
"loss": 1.5229,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -17.44542694091797,
"rewards/margins": 6.333140850067139,
"rewards/rejected": -23.778566360473633,
"step": 610
},
{
"epoch": 0.6488749345892203,
"grad_norm": 150.92780625161797,
"learning_rate": 1.6530691736402316e-07,
"logits/chosen": -0.05873150750994682,
"logits/rejected": 0.2572210133075714,
"logps/chosen": -1822.690185546875,
"logps/rejected": -2140.002685546875,
"loss": 1.3047,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -15.516670227050781,
"rewards/margins": 3.519291400909424,
"rewards/rejected": -19.035961151123047,
"step": 620
},
{
"epoch": 0.6593406593406593,
"grad_norm": 158.62413320623054,
"learning_rate": 1.5676295169786864e-07,
"logits/chosen": -0.5535549521446228,
"logits/rejected": -0.16974008083343506,
"logps/chosen": -1799.411376953125,
"logps/rejected": -2184.095458984375,
"loss": 1.4004,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -15.247261047363281,
"rewards/margins": 4.233900547027588,
"rewards/rejected": -19.481159210205078,
"step": 630
},
{
"epoch": 0.6698063840920984,
"grad_norm": 174.63990723873954,
"learning_rate": 1.483436823197092e-07,
"logits/chosen": -0.49727511405944824,
"logits/rejected": -0.09024439752101898,
"logps/chosen": -1910.181396484375,
"logps/rejected": -2272.175537109375,
"loss": 1.2582,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -16.431602478027344,
"rewards/margins": 4.003415107727051,
"rewards/rejected": -20.43501853942871,
"step": 640
},
{
"epoch": 0.6802721088435374,
"grad_norm": 212.30897956956616,
"learning_rate": 1.4006036925609243e-07,
"logits/chosen": -0.5441917777061462,
"logits/rejected": -0.3759006857872009,
"logps/chosen": -1762.1038818359375,
"logps/rejected": -1993.853515625,
"loss": 1.3183,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -14.94421100616455,
"rewards/margins": 2.2772457599639893,
"rewards/rejected": -17.221454620361328,
"step": 650
},
{
"epoch": 0.6907378335949764,
"grad_norm": 122.40725726992933,
"learning_rate": 1.319240907040458e-07,
"logits/chosen": -0.578727126121521,
"logits/rejected": -0.15290720760822296,
"logps/chosen": -1786.3648681640625,
"logps/rejected": -2103.92919921875,
"loss": 1.5482,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -15.042015075683594,
"rewards/margins": 3.422727584838867,
"rewards/rejected": -18.46474266052246,
"step": 660
},
{
"epoch": 0.7012035583464155,
"grad_norm": 273.40146184819037,
"learning_rate": 1.239457282149695e-07,
"logits/chosen": -0.6542818546295166,
"logits/rejected": -0.6405806541442871,
"logps/chosen": -1718.8697509765625,
"logps/rejected": -2025.167236328125,
"loss": 1.1528,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -14.303213119506836,
"rewards/margins": 3.1175124645233154,
"rewards/rejected": -17.420726776123047,
"step": 670
},
{
"epoch": 0.7116692830978545,
"grad_norm": 246.28508875936285,
"learning_rate": 1.1613595214152711e-07,
"logits/chosen": -0.6755629777908325,
"logits/rejected": -0.26193898916244507,
"logps/chosen": -1862.4000244140625,
"logps/rejected": -2191.969482421875,
"loss": 1.3671,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.507433891296387,
"rewards/margins": 3.5333023071289062,
"rewards/rejected": -19.04073715209961,
"step": 680
},
{
"epoch": 0.7221350078492935,
"grad_norm": 216.14843384209277,
"learning_rate": 1.0850520736699362e-07,
"logits/chosen": -0.6002136468887329,
"logits/rejected": 0.03606845811009407,
"logps/chosen": -1838.6982421875,
"logps/rejected": -2214.07470703125,
"loss": 1.3895,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -15.680435180664062,
"rewards/margins": 4.133326053619385,
"rewards/rejected": -19.813762664794922,
"step": 690
},
{
"epoch": 0.7326007326007326,
"grad_norm": 162.01079027631573,
"learning_rate": 1.0106369933615042e-07,
"logits/chosen": -0.7846351861953735,
"logits/rejected": -0.5915166735649109,
"logps/chosen": -1752.784423828125,
"logps/rejected": -2021.7802734375,
"loss": 1.5039,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -14.986889839172363,
"rewards/margins": 2.845428943634033,
"rewards/rejected": -17.832317352294922,
"step": 700
},
{
"epoch": 0.7326007326007326,
"eval_logits/chosen": -0.8898468017578125,
"eval_logits/rejected": -0.4967605769634247,
"eval_logps/chosen": -1694.96240234375,
"eval_logps/rejected": -2016.4490966796875,
"eval_loss": 1.3852962255477905,
"eval_rewards/accuracies": 0.6527777910232544,
"eval_rewards/chosen": -14.129942893981934,
"eval_rewards/margins": 3.432478666305542,
"eval_rewards/rejected": -17.562421798706055,
"eval_runtime": 176.0679,
"eval_samples_per_second": 11.359,
"eval_steps_per_second": 0.358,
"step": 700
},
{
"epoch": 0.7430664573521716,
"grad_norm": 177.45761000957364,
"learning_rate": 9.382138040640714e-08,
"logits/chosen": -1.012629747390747,
"logits/rejected": -0.6268833875656128,
"logps/chosen": -1776.499755859375,
"logps/rejected": -2017.539794921875,
"loss": 1.5264,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -15.180249214172363,
"rewards/margins": 2.565770387649536,
"rewards/rejected": -17.74601936340332,
"step": 710
},
{
"epoch": 0.7535321821036107,
"grad_norm": 140.94359920373847,
"learning_rate": 8.678793653740632e-08,
"logits/chosen": -0.9271895289421082,
"logits/rejected": -0.47789469361305237,
"logps/chosen": -1664.4437255859375,
"logps/rejected": -1977.908447265625,
"loss": 1.3295,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -13.8070650100708,
"rewards/margins": 3.4972636699676514,
"rewards/rejected": -17.3043270111084,
"step": 720
},
{
"epoch": 0.7639979068550498,
"grad_norm": 190.75937551525504,
"learning_rate": 7.997277433690983e-08,
"logits/chosen": -0.8303499221801758,
"logits/rejected": -0.2948521077632904,
"logps/chosen": -1813.2340087890625,
"logps/rejected": -2049.240234375,
"loss": 1.4631,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -15.478715896606445,
"rewards/margins": 2.6438088417053223,
"rewards/rejected": -18.12252426147461,
"step": 730
},
{
"epoch": 0.7744636316064888,
"grad_norm": 164.74206538760382,
"learning_rate": 7.338500848029602e-08,
"logits/chosen": -0.6835179924964905,
"logits/rejected": -0.42263850569725037,
"logps/chosen": -1808.6490478515625,
"logps/rejected": -2096.81396484375,
"loss": 1.2242,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -15.557826042175293,
"rewards/margins": 3.0879101753234863,
"rewards/rejected": -18.645736694335938,
"step": 740
},
{
"epoch": 0.7849293563579278,
"grad_norm": 135.0757551116068,
"learning_rate": 6.70334495204884e-08,
"logits/chosen": -0.5583680868148804,
"logits/rejected": -0.36530551314353943,
"logps/chosen": -1854.912353515625,
"logps/rejected": -2177.262451171875,
"loss": 1.3344,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -16.08046531677246,
"rewards/margins": 3.2556281089782715,
"rewards/rejected": -19.33609390258789,
"step": 750
},
{
"epoch": 0.7953950811093669,
"grad_norm": 158.01784405358154,
"learning_rate": 6.092659210462231e-08,
"logits/chosen": -0.653573215007782,
"logits/rejected": -0.4876467287540436,
"logps/chosen": -1903.880615234375,
"logps/rejected": -2182.48291015625,
"loss": 1.4038,
"rewards/accuracies": 0.5625,
"rewards/chosen": -16.625337600708008,
"rewards/margins": 2.7693800926208496,
"rewards/rejected": -19.394718170166016,
"step": 760
},
{
"epoch": 0.8058608058608059,
"grad_norm": 169.97964049682443,
"learning_rate": 5.507260361320737e-08,
"logits/chosen": -1.0366981029510498,
"logits/rejected": -0.9037246704101562,
"logps/chosen": -1879.755126953125,
"logps/rejected": -2001.697265625,
"loss": 1.286,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -15.764042854309082,
"rewards/margins": 1.033178687095642,
"rewards/rejected": -16.797222137451172,
"step": 770
},
{
"epoch": 0.8163265306122449,
"grad_norm": 162.02338031146334,
"learning_rate": 4.947931323697982e-08,
"logits/chosen": -1.0304605960845947,
"logits/rejected": -0.9400796890258789,
"logps/chosen": -1669.2073974609375,
"logps/rejected": -2004.525390625,
"loss": 1.5927,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -13.502462387084961,
"rewards/margins": 3.8671538829803467,
"rewards/rejected": -17.369617462158203,
"step": 780
},
{
"epoch": 0.826792255363684,
"grad_norm": 140.7368428333841,
"learning_rate": 4.415420150605398e-08,
"logits/chosen": -1.0811887979507446,
"logits/rejected": -0.5253428220748901,
"logps/chosen": -1726.182373046875,
"logps/rejected": -2063.27099609375,
"loss": 1.4648,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -14.476069450378418,
"rewards/margins": 3.7039875984191895,
"rewards/rejected": -18.180057525634766,
"step": 790
},
{
"epoch": 0.837257980115123,
"grad_norm": 202.82775780509928,
"learning_rate": 3.9104390285376374e-08,
"logits/chosen": -0.835501492023468,
"logits/rejected": -0.5900505781173706,
"logps/chosen": -1749.853759765625,
"logps/rejected": -1951.329345703125,
"loss": 1.3527,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -14.83747673034668,
"rewards/margins": 2.0425891876220703,
"rewards/rejected": -16.88006591796875,
"step": 800
},
{
"epoch": 0.837257980115123,
"eval_logits/chosen": -1.0374784469604492,
"eval_logits/rejected": -0.6750361919403076,
"eval_logps/chosen": -1672.130615234375,
"eval_logps/rejected": -1986.035888671875,
"eval_loss": 1.366306185722351,
"eval_rewards/accuracies": 0.6448412537574768,
"eval_rewards/chosen": -13.901623725891113,
"eval_rewards/margins": 3.3566668033599854,
"eval_rewards/rejected": -17.25829315185547,
"eval_runtime": 176.0547,
"eval_samples_per_second": 11.36,
"eval_steps_per_second": 0.358,
"step": 800
},
{
"epoch": 0.847723704866562,
"grad_norm": 218.80895490878117,
"learning_rate": 3.433663324986208e-08,
"logits/chosen": -1.2597501277923584,
"logits/rejected": -0.7243804931640625,
"logps/chosen": -1665.3489990234375,
"logps/rejected": -2045.541259765625,
"loss": 1.4186,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -13.741106986999512,
"rewards/margins": 4.249786853790283,
"rewards/rejected": -17.990894317626953,
"step": 810
},
{
"epoch": 0.858189429618001,
"grad_norm": 175.67069527310957,
"learning_rate": 2.9857306851953897e-08,
"logits/chosen": -1.075448751449585,
"logits/rejected": -0.8459098935127258,
"logps/chosen": -1705.802734375,
"logps/rejected": -1971.207275390625,
"loss": 1.1819,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -14.010282516479492,
"rewards/margins": 2.8707687854766846,
"rewards/rejected": -16.881052017211914,
"step": 820
},
{
"epoch": 0.8686551543694401,
"grad_norm": 150.14969837730865,
"learning_rate": 2.567240179368185e-08,
"logits/chosen": -0.8211779594421387,
"logits/rejected": -0.672277569770813,
"logps/chosen": -1724.1959228515625,
"logps/rejected": -1975.289306640625,
"loss": 1.3771,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -14.66050910949707,
"rewards/margins": 2.5944151878356934,
"rewards/rejected": -17.25492286682129,
"step": 830
},
{
"epoch": 0.8791208791208791,
"grad_norm": 143.51050018041488,
"learning_rate": 2.1787515014630357e-08,
"logits/chosen": -0.9592329263687134,
"logits/rejected": -0.6304475665092468,
"logps/chosen": -1664.050048828125,
"logps/rejected": -2090.85107421875,
"loss": 1.1841,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -13.817761421203613,
"rewards/margins": 4.430028915405273,
"rewards/rejected": -18.247791290283203,
"step": 840
},
{
"epoch": 0.8895866038723181,
"grad_norm": 160.09590738302992,
"learning_rate": 1.820784220652766e-08,
"logits/chosen": -0.8976573944091797,
"logits/rejected": -0.619744598865509,
"logps/chosen": -1732.6185302734375,
"logps/rejected": -2009.6126708984375,
"loss": 1.3946,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -14.51783561706543,
"rewards/margins": 3.2437214851379395,
"rewards/rejected": -17.761554718017578,
"step": 850
},
{
"epoch": 0.9000523286237572,
"grad_norm": 140.45079725700174,
"learning_rate": 1.4938170864468636e-08,
"logits/chosen": -1.2183126211166382,
"logits/rejected": -0.7451462149620056,
"logps/chosen": -1663.8861083984375,
"logps/rejected": -2030.5501708984375,
"loss": 1.403,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -13.755941390991211,
"rewards/margins": 3.9360270500183105,
"rewards/rejected": -17.691970825195312,
"step": 860
},
{
"epoch": 0.9105180533751962,
"grad_norm": 177.87764974909854,
"learning_rate": 1.1982873884064465e-08,
"logits/chosen": -1.142114281654358,
"logits/rejected": -0.8570957183837891,
"logps/chosen": -1702.1165771484375,
"logps/rejected": -2053.07568359375,
"loss": 1.364,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -14.30346393585205,
"rewards/margins": 3.762028932571411,
"rewards/rejected": -18.06549072265625,
"step": 870
},
{
"epoch": 0.9209837781266352,
"grad_norm": 138.3301348415624,
"learning_rate": 9.345903713082304e-09,
"logits/chosen": -1.0760080814361572,
"logits/rejected": -0.866096019744873,
"logps/chosen": -1735.3382568359375,
"logps/rejected": -2023.660888671875,
"loss": 1.355,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.312044143676758,
"rewards/margins": 2.978205680847168,
"rewards/rejected": -17.290246963500977,
"step": 880
},
{
"epoch": 0.9314495028780743,
"grad_norm": 179.16273994251034,
"learning_rate": 7.030787065396865e-09,
"logits/chosen": -1.0234577655792236,
"logits/rejected": -0.9720734357833862,
"logps/chosen": -1736.5269775390625,
"logps/rejected": -2083.37939453125,
"loss": 1.4332,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -14.488537788391113,
"rewards/margins": 3.543290376663208,
"rewards/rejected": -18.031827926635742,
"step": 890
},
{
"epoch": 0.9419152276295133,
"grad_norm": 163.4835379161221,
"learning_rate": 5.04062020432286e-09,
"logits/chosen": -0.8189510107040405,
"logits/rejected": -0.8584410548210144,
"logps/chosen": -1706.8818359375,
"logps/rejected": -1968.8441162109375,
"loss": 1.5137,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -14.264904975891113,
"rewards/margins": 2.609503984451294,
"rewards/rejected": -16.874408721923828,
"step": 900
},
{
"epoch": 0.9419152276295133,
"eval_logits/chosen": -1.001752495765686,
"eval_logits/rejected": -0.673967182636261,
"eval_logps/chosen": -1735.9151611328125,
"eval_logps/rejected": -2073.3388671875,
"eval_loss": 1.3373700380325317,
"eval_rewards/accuracies": 0.6408730149269104,
"eval_rewards/chosen": -14.539473533630371,
"eval_rewards/margins": 3.5918467044830322,
"eval_rewards/rejected": -18.13132095336914,
"eval_runtime": 176.3334,
"eval_samples_per_second": 11.342,
"eval_steps_per_second": 0.357,
"step": 900
},
{
"epoch": 0.9523809523809523,
"grad_norm": 190.32378571700949,
"learning_rate": 3.3780648016376866e-09,
"logits/chosen": -0.9321626424789429,
"logits/rejected": -0.5902298092842102,
"logps/chosen": -1696.779296875,
"logps/rejected": -1922.1607666015625,
"loss": 1.4578,
"rewards/accuracies": 0.625,
"rewards/chosen": -14.7335786819458,
"rewards/margins": 2.3592441082000732,
"rewards/rejected": -17.092823028564453,
"step": 910
},
{
"epoch": 0.9628466771323915,
"grad_norm": 183.98567167006505,
"learning_rate": 2.0453443778310766e-09,
"logits/chosen": -1.0600922107696533,
"logits/rejected": -0.7931039929389954,
"logps/chosen": -1763.392822265625,
"logps/rejected": -2107.805419921875,
"loss": 1.3202,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -14.767558097839355,
"rewards/margins": 3.794466495513916,
"rewards/rejected": -18.562023162841797,
"step": 920
},
{
"epoch": 0.9733124018838305,
"grad_norm": 181.56437725274117,
"learning_rate": 1.0442413283435758e-09,
"logits/chosen": -1.1890182495117188,
"logits/rejected": -0.5295430421829224,
"logps/chosen": -1729.0921630859375,
"logps/rejected": -1985.2783203125,
"loss": 1.5669,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -14.14258098602295,
"rewards/margins": 3.068037748336792,
"rewards/rejected": -17.210617065429688,
"step": 930
},
{
"epoch": 0.9837781266352695,
"grad_norm": 173.28786175289625,
"learning_rate": 3.760945397705828e-10,
"logits/chosen": -0.856045126914978,
"logits/rejected": -0.7398639917373657,
"logps/chosen": -1713.3883056640625,
"logps/rejected": -2039.740966796875,
"loss": 1.266,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -14.502069473266602,
"rewards/margins": 3.2981293201446533,
"rewards/rejected": -17.800199508666992,
"step": 940
},
{
"epoch": 0.9942438513867086,
"grad_norm": 188.65879146663107,
"learning_rate": 4.17975992204056e-11,
"logits/chosen": -1.168084740638733,
"logits/rejected": -0.8855546116828918,
"logps/chosen": -1736.102783203125,
"logps/rejected": -1955.3255615234375,
"loss": 1.4604,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -14.33259391784668,
"rewards/margins": 2.370856761932373,
"rewards/rejected": -16.70345115661621,
"step": 950
},
{
"epoch": 0.9994767137624281,
"step": 955,
"total_flos": 0.0,
"train_loss": 2.1165736393154604,
"train_runtime": 18133.1885,
"train_samples_per_second": 3.371,
"train_steps_per_second": 0.053
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}