zephyr-7b-dpo-lora / trainer_state.json
Jan Majkutewicz
Model save
c097e33 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026171159382360636,
"grad_norm": 1.421875,
"learning_rate": 1.3054830287206266e-09,
"logits/chosen": -2.9892377853393555,
"logits/rejected": -2.938478946685791,
"logps/chosen": -307.68707275390625,
"logps/rejected": -392.1196594238281,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0026171159382360636,
"grad_norm": 1.359375,
"learning_rate": 1.3054830287206264e-08,
"logits/chosen": -2.846788167953491,
"logits/rejected": -2.834296941757202,
"logps/chosen": -299.1590881347656,
"logps/rejected": -260.9870300292969,
"loss": 0.6927,
"rewards/accuracies": 0.4791666567325592,
"rewards/chosen": 0.00017009497969411314,
"rewards/margins": 0.0008415079792030156,
"rewards/rejected": -0.0006714130286127329,
"step": 10
},
{
"epoch": 0.005234231876472127,
"grad_norm": 1.609375,
"learning_rate": 2.610966057441253e-08,
"logits/chosen": -2.8615875244140625,
"logits/rejected": -2.8269271850585938,
"logps/chosen": -325.3974609375,
"logps/rejected": -252.712158203125,
"loss": 0.6931,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0003612989676184952,
"rewards/margins": 2.1457055481732823e-05,
"rewards/rejected": 0.00033984187757596374,
"step": 20
},
{
"epoch": 0.007851347814708191,
"grad_norm": 1.796875,
"learning_rate": 3.91644908616188e-08,
"logits/chosen": -2.8635482788085938,
"logits/rejected": -2.83804988861084,
"logps/chosen": -269.81329345703125,
"logps/rejected": -268.55670166015625,
"loss": 0.6931,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.000123085526865907,
"rewards/margins": 6.434940587496385e-05,
"rewards/rejected": -0.0001874349982244894,
"step": 30
},
{
"epoch": 0.010468463752944255,
"grad_norm": 1.1640625,
"learning_rate": 5.221932114882506e-08,
"logits/chosen": -2.8312931060791016,
"logits/rejected": -2.821013927459717,
"logps/chosen": -233.34909057617188,
"logps/rejected": -238.37490844726562,
"loss": 0.6934,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0005193200195208192,
"rewards/margins": -0.0004341518506407738,
"rewards/rejected": -8.516813250025734e-05,
"step": 40
},
{
"epoch": 0.01308557969118032,
"grad_norm": 1.15625,
"learning_rate": 6.527415143603133e-08,
"logits/chosen": -2.866091251373291,
"logits/rejected": -2.85339093208313,
"logps/chosen": -290.05963134765625,
"logps/rejected": -253.92349243164062,
"loss": 0.693,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.00046620480134151876,
"rewards/margins": 0.0002595257537905127,
"rewards/rejected": 0.0002066790621029213,
"step": 50
},
{
"epoch": 0.015702695629416383,
"grad_norm": 1.25,
"learning_rate": 7.83289817232376e-08,
"logits/chosen": -2.825549364089966,
"logits/rejected": -2.8121423721313477,
"logps/chosen": -273.64691162109375,
"logps/rejected": -246.85317993164062,
"loss": 0.693,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.00024823236162774265,
"rewards/margins": 0.00039805466076359153,
"rewards/rejected": -0.0001498223573435098,
"step": 60
},
{
"epoch": 0.018319811567652448,
"grad_norm": 1.234375,
"learning_rate": 9.138381201044386e-08,
"logits/chosen": -2.8805994987487793,
"logits/rejected": -2.8450770378112793,
"logps/chosen": -293.1197814941406,
"logps/rejected": -266.08135986328125,
"loss": 0.6931,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0004158564261160791,
"rewards/margins": 7.372137770289555e-05,
"rewards/rejected": 0.0003421350847929716,
"step": 70
},
{
"epoch": 0.02093692750588851,
"grad_norm": 1.6328125,
"learning_rate": 1.0443864229765012e-07,
"logits/chosen": -2.820730209350586,
"logits/rejected": -2.7984094619750977,
"logps/chosen": -279.29498291015625,
"logps/rejected": -266.357666015625,
"loss": 0.6933,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 9.654175664763898e-05,
"rewards/margins": -0.00033630471443757415,
"rewards/rejected": 0.0004328465147409588,
"step": 80
},
{
"epoch": 0.023554043444124574,
"grad_norm": 1.28125,
"learning_rate": 1.174934725848564e-07,
"logits/chosen": -2.8342747688293457,
"logits/rejected": -2.8211700916290283,
"logps/chosen": -270.66888427734375,
"logps/rejected": -251.8229522705078,
"loss": 0.6929,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0002475693472661078,
"rewards/margins": 0.0005019751843065023,
"rewards/rejected": -0.000254405866144225,
"step": 90
},
{
"epoch": 0.02617115938236064,
"grad_norm": 1.296875,
"learning_rate": 1.3054830287206266e-07,
"logits/chosen": -2.849017381668091,
"logits/rejected": -2.842028856277466,
"logps/chosen": -267.05035400390625,
"logps/rejected": -248.63992309570312,
"loss": 0.6929,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.00027873244835063815,
"rewards/margins": 0.0004233802610542625,
"rewards/rejected": -0.00014464779815170914,
"step": 100
},
{
"epoch": 0.02617115938236064,
"eval_logits/chosen": -2.865492343902588,
"eval_logits/rejected": -2.838137626647949,
"eval_logps/chosen": -282.7629699707031,
"eval_logps/rejected": -261.4512023925781,
"eval_loss": 0.6930338740348816,
"eval_rewards/accuracies": 0.5134999752044678,
"eval_rewards/chosen": 0.00010537073103478178,
"eval_rewards/margins": 0.00023393578885588795,
"eval_rewards/rejected": -0.00012856510875280946,
"eval_runtime": 623.7252,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 0.401,
"step": 100
},
{
"epoch": 0.028788275320596704,
"grad_norm": 1.4375,
"learning_rate": 1.4360313315926893e-07,
"logits/chosen": -2.855942964553833,
"logits/rejected": -2.822741985321045,
"logps/chosen": -307.44110107421875,
"logps/rejected": -257.2309265136719,
"loss": 0.6929,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.00018880394054576755,
"rewards/margins": 0.0004303649184294045,
"rewards/rejected": -0.0002415610069874674,
"step": 110
},
{
"epoch": 0.031405391258832765,
"grad_norm": 1.21875,
"learning_rate": 1.566579634464752e-07,
"logits/chosen": -2.86763334274292,
"logits/rejected": -2.844106435775757,
"logps/chosen": -310.5987854003906,
"logps/rejected": -287.745361328125,
"loss": 0.693,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.00025308955810032785,
"rewards/margins": 0.00023145000159274787,
"rewards/rejected": 2.1639541955664754e-05,
"step": 120
},
{
"epoch": 0.03402250719706883,
"grad_norm": 1.4921875,
"learning_rate": 1.6971279373368143e-07,
"logits/chosen": -2.847980499267578,
"logits/rejected": -2.8163723945617676,
"logps/chosen": -271.6886291503906,
"logps/rejected": -269.58660888671875,
"loss": 0.6928,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 1.363331080028729e-06,
"rewards/margins": 0.000652765913400799,
"rewards/rejected": -0.0006514025735668838,
"step": 130
},
{
"epoch": 0.036639623135304895,
"grad_norm": 1.3125,
"learning_rate": 1.8276762402088773e-07,
"logits/chosen": -2.8673386573791504,
"logits/rejected": -2.8119819164276123,
"logps/chosen": -291.5235290527344,
"logps/rejected": -247.7689971923828,
"loss": 0.6928,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0003679326910059899,
"rewards/margins": 0.000704582198522985,
"rewards/rejected": -0.00033664953662082553,
"step": 140
},
{
"epoch": 0.03925673907354096,
"grad_norm": 1.3828125,
"learning_rate": 1.95822454308094e-07,
"logits/chosen": -2.8565478324890137,
"logits/rejected": -2.8365466594696045,
"logps/chosen": -299.02996826171875,
"logps/rejected": -255.97604370117188,
"loss": 0.693,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0001471690193284303,
"rewards/margins": 0.0002785170800052583,
"rewards/rejected": -0.00013134813343640417,
"step": 150
},
{
"epoch": 0.04187385501177702,
"grad_norm": 1.359375,
"learning_rate": 2.0887728459530023e-07,
"logits/chosen": -2.8643641471862793,
"logits/rejected": -2.8453617095947266,
"logps/chosen": -275.17669677734375,
"logps/rejected": -274.9828186035156,
"loss": 0.6929,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -8.313215948874131e-05,
"rewards/margins": 0.0004607086593750864,
"rewards/rejected": -0.0005438407533802092,
"step": 160
},
{
"epoch": 0.04449097095001309,
"grad_norm": 1.546875,
"learning_rate": 2.2193211488250652e-07,
"logits/chosen": -2.8222973346710205,
"logits/rejected": -2.803818941116333,
"logps/chosen": -236.69189453125,
"logps/rejected": -238.2162628173828,
"loss": 0.693,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.00023162660363595933,
"rewards/margins": 0.00025372960953973234,
"rewards/rejected": -2.2102987713878974e-05,
"step": 170
},
{
"epoch": 0.04710808688824915,
"grad_norm": 1.125,
"learning_rate": 2.349869451697128e-07,
"logits/chosen": -2.850526809692383,
"logits/rejected": -2.8234286308288574,
"logps/chosen": -276.2384338378906,
"logps/rejected": -259.85089111328125,
"loss": 0.6928,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0005887501174584031,
"rewards/margins": 0.000796462525613606,
"rewards/rejected": -0.000207712480914779,
"step": 180
},
{
"epoch": 0.04972520282648522,
"grad_norm": 2.234375,
"learning_rate": 2.4804177545691903e-07,
"logits/chosen": -2.887956380844116,
"logits/rejected": -2.8700356483459473,
"logps/chosen": -291.0037841796875,
"logps/rejected": -257.3691711425781,
"loss": 0.6929,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 8.575538959121332e-05,
"rewards/margins": 0.00044340407475829124,
"rewards/rejected": -0.00035764873609878123,
"step": 190
},
{
"epoch": 0.05234231876472128,
"grad_norm": 1.3125,
"learning_rate": 2.610966057441253e-07,
"logits/chosen": -2.838761806488037,
"logits/rejected": -2.828749179840088,
"logps/chosen": -268.03924560546875,
"logps/rejected": -225.5205078125,
"loss": 0.693,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -7.57282687118277e-05,
"rewards/margins": 0.0003965885262005031,
"rewards/rejected": -0.0004723168385680765,
"step": 200
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -2.8625597953796387,
"eval_logits/rejected": -2.8349130153656006,
"eval_logps/chosen": -282.7611389160156,
"eval_logps/rejected": -261.49249267578125,
"eval_loss": 0.6928190588951111,
"eval_rewards/accuracies": 0.546999990940094,
"eval_rewards/chosen": 0.00012387627793941647,
"eval_rewards/margins": 0.0006650119903497398,
"eval_rewards/rejected": -0.0005411357851698995,
"eval_runtime": 622.9697,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 0.401,
"step": 200
},
{
"epoch": 0.05495943470295734,
"grad_norm": 1.25,
"learning_rate": 2.7415143603133156e-07,
"logits/chosen": -2.875335931777954,
"logits/rejected": -2.841496229171753,
"logps/chosen": -276.1015930175781,
"logps/rejected": -245.19223022460938,
"loss": 0.6931,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0002998802810907364,
"rewards/margins": 5.954801963525824e-05,
"rewards/rejected": -0.00035942820250056684,
"step": 210
},
{
"epoch": 0.05757655064119341,
"grad_norm": 1.1953125,
"learning_rate": 2.8720626631853785e-07,
"logits/chosen": -2.8162028789520264,
"logits/rejected": -2.810290575027466,
"logps/chosen": -274.1748962402344,
"logps/rejected": -242.88381958007812,
"loss": 0.6926,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00047311707749031484,
"rewards/margins": 0.0010585600975900888,
"rewards/rejected": -0.0005854429909959435,
"step": 220
},
{
"epoch": 0.06019366657942947,
"grad_norm": 1.4140625,
"learning_rate": 3.002610966057441e-07,
"logits/chosen": -2.886976957321167,
"logits/rejected": -2.862199544906616,
"logps/chosen": -322.8957824707031,
"logps/rejected": -285.7581787109375,
"loss": 0.6925,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.0009428686462342739,
"rewards/margins": 0.0013226759620010853,
"rewards/rejected": -0.0003798073739744723,
"step": 230
},
{
"epoch": 0.06281078251766553,
"grad_norm": 1.2890625,
"learning_rate": 3.133159268929504e-07,
"logits/chosen": -2.8522121906280518,
"logits/rejected": -2.838016986846924,
"logps/chosen": -312.5648498535156,
"logps/rejected": -297.47650146484375,
"loss": 0.6924,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0015294912736862898,
"rewards/margins": 0.0015876994002610445,
"rewards/rejected": -5.820817386847921e-05,
"step": 240
},
{
"epoch": 0.06542789845590159,
"grad_norm": 1.1640625,
"learning_rate": 3.263707571801567e-07,
"logits/chosen": -2.815152883529663,
"logits/rejected": -2.8188998699188232,
"logps/chosen": -277.23309326171875,
"logps/rejected": -249.0277862548828,
"loss": 0.6924,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0006144286599010229,
"rewards/margins": 0.0015933450777083635,
"rewards/rejected": -0.0009789163013920188,
"step": 250
},
{
"epoch": 0.06804501439413765,
"grad_norm": 1.1171875,
"learning_rate": 3.3942558746736286e-07,
"logits/chosen": -2.8725204467773438,
"logits/rejected": -2.8254947662353516,
"logps/chosen": -297.4732971191406,
"logps/rejected": -277.87225341796875,
"loss": 0.6927,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.000995874172076583,
"rewards/margins": 0.0009019881254062057,
"rewards/rejected": 9.38860684982501e-05,
"step": 260
},
{
"epoch": 0.07066213033237373,
"grad_norm": 0.99609375,
"learning_rate": 3.5248041775456916e-07,
"logits/chosen": -2.8370730876922607,
"logits/rejected": -2.825009346008301,
"logps/chosen": -281.54547119140625,
"logps/rejected": -245.32528686523438,
"loss": 0.692,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0012482403544709086,
"rewards/margins": 0.002398666925728321,
"rewards/rejected": -0.0011504264548420906,
"step": 270
},
{
"epoch": 0.07327924627060979,
"grad_norm": 1.203125,
"learning_rate": 3.6553524804177545e-07,
"logits/chosen": -2.8796634674072266,
"logits/rejected": -2.836472272872925,
"logps/chosen": -276.66485595703125,
"logps/rejected": -253.39230346679688,
"loss": 0.6922,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0009085072088055313,
"rewards/margins": 0.0018395546358078718,
"rewards/rejected": -0.0009310474852100015,
"step": 280
},
{
"epoch": 0.07589636220884585,
"grad_norm": 1.359375,
"learning_rate": 3.785900783289817e-07,
"logits/chosen": -2.8511414527893066,
"logits/rejected": -2.840785264968872,
"logps/chosen": -304.3522033691406,
"logps/rejected": -279.17950439453125,
"loss": 0.6922,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0009422661969438195,
"rewards/margins": 0.0019500417402014136,
"rewards/rejected": -0.0010077755432575941,
"step": 290
},
{
"epoch": 0.07851347814708191,
"grad_norm": 1.4453125,
"learning_rate": 3.91644908616188e-07,
"logits/chosen": -2.8077988624572754,
"logits/rejected": -2.763946294784546,
"logps/chosen": -266.3786315917969,
"logps/rejected": -248.56350708007812,
"loss": 0.692,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0008328545955009758,
"rewards/margins": 0.0023509101010859013,
"rewards/rejected": -0.0015180552145466208,
"step": 300
},
{
"epoch": 0.07851347814708191,
"eval_logits/chosen": -2.8650434017181396,
"eval_logits/rejected": -2.8377885818481445,
"eval_logps/chosen": -282.674560546875,
"eval_logps/rejected": -261.546142578125,
"eval_loss": 0.692122220993042,
"eval_rewards/accuracies": 0.6050000190734863,
"eval_rewards/chosen": 0.000989287393167615,
"eval_rewards/margins": 0.0020671640522778034,
"eval_rewards/rejected": -0.0010778764262795448,
"eval_runtime": 622.8014,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 0.401,
"step": 300
},
{
"epoch": 0.08113059408531798,
"grad_norm": 1.4921875,
"learning_rate": 4.046997389033943e-07,
"logits/chosen": -2.895244598388672,
"logits/rejected": -2.8767800331115723,
"logps/chosen": -306.62994384765625,
"logps/rejected": -250.0150909423828,
"loss": 0.6919,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0013422651682049036,
"rewards/margins": 0.0025762903969734907,
"rewards/rejected": -0.001234025345183909,
"step": 310
},
{
"epoch": 0.08374771002355404,
"grad_norm": 1.2265625,
"learning_rate": 4.1775456919060046e-07,
"logits/chosen": -2.8745856285095215,
"logits/rejected": -2.8429815769195557,
"logps/chosen": -273.4037170410156,
"logps/rejected": -255.09585571289062,
"loss": 0.6922,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0011208092328161001,
"rewards/margins": 0.001973007107153535,
"rewards/rejected": -0.0008521980489604175,
"step": 320
},
{
"epoch": 0.08636482596179011,
"grad_norm": 1.234375,
"learning_rate": 4.3080939947780675e-07,
"logits/chosen": -2.8409087657928467,
"logits/rejected": -2.8410866260528564,
"logps/chosen": -277.77545166015625,
"logps/rejected": -250.94821166992188,
"loss": 0.6917,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.002120083197951317,
"rewards/margins": 0.0029364789370447397,
"rewards/rejected": -0.0008163956226781011,
"step": 330
},
{
"epoch": 0.08898194190002617,
"grad_norm": 1.3984375,
"learning_rate": 4.4386422976501305e-07,
"logits/chosen": -2.8706066608428955,
"logits/rejected": -2.857938766479492,
"logps/chosen": -307.44732666015625,
"logps/rejected": -284.9738464355469,
"loss": 0.6912,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0030747547280043364,
"rewards/margins": 0.00399099662899971,
"rewards/rejected": -0.0009162420174106956,
"step": 340
},
{
"epoch": 0.09159905783826224,
"grad_norm": 1.234375,
"learning_rate": 4.569190600522193e-07,
"logits/chosen": -2.8302149772644043,
"logits/rejected": -2.803089141845703,
"logps/chosen": -309.71893310546875,
"logps/rejected": -296.48101806640625,
"loss": 0.6917,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.002736264606937766,
"rewards/margins": 0.002880766289308667,
"rewards/rejected": -0.00014450155140366405,
"step": 350
},
{
"epoch": 0.0942161737764983,
"grad_norm": 0.890625,
"learning_rate": 4.699738903394256e-07,
"logits/chosen": -2.8377342224121094,
"logits/rejected": -2.8193764686584473,
"logps/chosen": -256.7732238769531,
"logps/rejected": -236.75698852539062,
"loss": 0.6914,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.0019551387522369623,
"rewards/margins": 0.003464858280494809,
"rewards/rejected": -0.0015097195282578468,
"step": 360
},
{
"epoch": 0.09683328971473436,
"grad_norm": 1.4609375,
"learning_rate": 4.830287206266319e-07,
"logits/chosen": -2.8519506454467773,
"logits/rejected": -2.822915554046631,
"logps/chosen": -295.97418212890625,
"logps/rejected": -251.2534637451172,
"loss": 0.6908,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.002289209049195051,
"rewards/margins": 0.004647364374250174,
"rewards/rejected": -0.002358155557885766,
"step": 370
},
{
"epoch": 0.09945040565297043,
"grad_norm": 1.3046875,
"learning_rate": 4.960835509138381e-07,
"logits/chosen": -2.8553032875061035,
"logits/rejected": -2.8058464527130127,
"logps/chosen": -316.52178955078125,
"logps/rejected": -279.59136962890625,
"loss": 0.6912,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0028482459019869566,
"rewards/margins": 0.0039718905463814735,
"rewards/rejected": -0.0011236447608098388,
"step": 380
},
{
"epoch": 0.1020675215912065,
"grad_norm": 1.6328125,
"learning_rate": 4.999948856244767e-07,
"logits/chosen": -2.8345859050750732,
"logits/rejected": -2.829207420349121,
"logps/chosen": -298.51348876953125,
"logps/rejected": -278.06976318359375,
"loss": 0.6904,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.004357966594398022,
"rewards/margins": 0.00553758442401886,
"rewards/rejected": -0.00117961794603616,
"step": 390
},
{
"epoch": 0.10468463752944256,
"grad_norm": 1.25,
"learning_rate": 4.999698361256577e-07,
"logits/chosen": -2.8570194244384766,
"logits/rejected": -2.820826768875122,
"logps/chosen": -280.4349670410156,
"logps/rejected": -238.0439453125,
"loss": 0.6913,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.003575119422748685,
"rewards/margins": 0.0037900402676314116,
"rewards/rejected": -0.00021492131054401398,
"step": 400
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -2.862203598022461,
"eval_logits/rejected": -2.8348591327667236,
"eval_logps/chosen": -282.4126892089844,
"eval_logps/rejected": -261.5210876464844,
"eval_loss": 0.6909525394439697,
"eval_rewards/accuracies": 0.6395000219345093,
"eval_rewards/chosen": 0.003608107101172209,
"eval_rewards/margins": 0.004435193259268999,
"eval_rewards/rejected": -0.0008270857506431639,
"eval_runtime": 623.9261,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 0.401,
"step": 400
},
{
"epoch": 0.10730175346767862,
"grad_norm": 1.34375,
"learning_rate": 4.99923914217458e-07,
"logits/chosen": -2.8254337310791016,
"logits/rejected": -2.810080051422119,
"logps/chosen": -257.35760498046875,
"logps/rejected": -256.613525390625,
"loss": 0.6921,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.0025605126284062862,
"rewards/margins": 0.0021402649581432343,
"rewards/rejected": 0.000420247990405187,
"step": 410
},
{
"epoch": 0.10991886940591468,
"grad_norm": 2.890625,
"learning_rate": 4.99857123734344e-07,
"logits/chosen": -2.823087215423584,
"logits/rejected": -2.776906967163086,
"logps/chosen": -245.77804565429688,
"logps/rejected": -238.0629119873047,
"loss": 0.6908,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0032352313864976168,
"rewards/margins": 0.004795724991708994,
"rewards/rejected": -0.0015604936052113771,
"step": 420
},
{
"epoch": 0.11253598534415074,
"grad_norm": 1.5703125,
"learning_rate": 4.997694702533016e-07,
"logits/chosen": -2.8463032245635986,
"logits/rejected": -2.815331220626831,
"logps/chosen": -295.5052490234375,
"logps/rejected": -272.6297912597656,
"loss": 0.6903,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.004903838969767094,
"rewards/margins": 0.005789603106677532,
"rewards/rejected": -0.0008857650682330132,
"step": 430
},
{
"epoch": 0.11515310128238682,
"grad_norm": 1.2578125,
"learning_rate": 4.996609610933712e-07,
"logits/chosen": -2.88322114944458,
"logits/rejected": -2.861856460571289,
"logps/chosen": -286.9052734375,
"logps/rejected": -257.013427734375,
"loss": 0.6898,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.005403473507612944,
"rewards/margins": 0.0067200749181210995,
"rewards/rejected": -0.0013166010612621903,
"step": 440
},
{
"epoch": 0.11777021722062288,
"grad_norm": 1.21875,
"learning_rate": 4.995316053150366e-07,
"logits/chosen": -2.814988374710083,
"logits/rejected": -2.8178412914276123,
"logps/chosen": -290.4037170410156,
"logps/rejected": -260.14459228515625,
"loss": 0.6903,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.006219993345439434,
"rewards/margins": 0.005712195299565792,
"rewards/rejected": 0.0005077989189885557,
"step": 450
},
{
"epoch": 0.12038733315885894,
"grad_norm": 2.140625,
"learning_rate": 4.99381413719468e-07,
"logits/chosen": -2.8341031074523926,
"logits/rejected": -2.8203091621398926,
"logps/chosen": -282.1357421875,
"logps/rejected": -269.10565185546875,
"loss": 0.6887,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.006647266447544098,
"rewards/margins": 0.00890478678047657,
"rewards/rejected": -0.0022575196344405413,
"step": 460
},
{
"epoch": 0.123004449097095,
"grad_norm": 1.3046875,
"learning_rate": 4.992103988476205e-07,
"logits/chosen": -2.846776247024536,
"logits/rejected": -2.8195786476135254,
"logps/chosen": -259.63775634765625,
"logps/rejected": -245.67117309570312,
"loss": 0.6905,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.005352762993425131,
"rewards/margins": 0.0052968584932386875,
"rewards/rejected": 5.59051513846498e-05,
"step": 470
},
{
"epoch": 0.12562156503533106,
"grad_norm": 1.359375,
"learning_rate": 4.990185749791864e-07,
"logits/chosen": -2.8792474269866943,
"logits/rejected": -2.8467297554016113,
"logps/chosen": -273.77880859375,
"logps/rejected": -274.2110595703125,
"loss": 0.6895,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005960241891443729,
"rewards/margins": 0.007407790515571833,
"rewards/rejected": -0.001447548856958747,
"step": 480
},
{
"epoch": 0.12823868097356714,
"grad_norm": 1.3828125,
"learning_rate": 4.988059581314039e-07,
"logits/chosen": -2.858649730682373,
"logits/rejected": -2.8390355110168457,
"logps/chosen": -307.80267333984375,
"logps/rejected": -269.5003662109375,
"loss": 0.6891,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.007140771951526403,
"rewards/margins": 0.008223852142691612,
"rewards/rejected": -0.0010830799583345652,
"step": 490
},
{
"epoch": 0.13085579691180318,
"grad_norm": 1.34375,
"learning_rate": 4.985725660577184e-07,
"logits/chosen": -2.8739092350006104,
"logits/rejected": -2.8554844856262207,
"logps/chosen": -290.1658935546875,
"logps/rejected": -249.3054656982422,
"loss": 0.689,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.005505991168320179,
"rewards/margins": 0.008494934067130089,
"rewards/rejected": -0.0029889424331486225,
"step": 500
},
{
"epoch": 0.13085579691180318,
"eval_logits/chosen": -2.8655941486358643,
"eval_logits/rejected": -2.838855028152466,
"eval_logps/chosen": -282.2831115722656,
"eval_logps/rejected": -261.68048095703125,
"eval_loss": 0.6895392537117004,
"eval_rewards/accuracies": 0.6700000166893005,
"eval_rewards/chosen": 0.004903781693428755,
"eval_rewards/margins": 0.007324740756303072,
"eval_rewards/rejected": -0.00242095859721303,
"eval_runtime": 622.8706,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 0.401,
"step": 500
},
{
"epoch": 0.13347291285003926,
"grad_norm": 1.546875,
"learning_rate": 4.983184182463008e-07,
"logits/chosen": -2.8507940769195557,
"logits/rejected": -2.828244686126709,
"logps/chosen": -294.1042785644531,
"logps/rejected": -255.8568572998047,
"loss": 0.6886,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0062326351180672646,
"rewards/margins": 0.009330684319138527,
"rewards/rejected": -0.003098049433901906,
"step": 510
},
{
"epoch": 0.1360900287882753,
"grad_norm": 1.546875,
"learning_rate": 4.980435359184203e-07,
"logits/chosen": -2.8747315406799316,
"logits/rejected": -2.8765642642974854,
"logps/chosen": -287.198486328125,
"logps/rejected": -270.84130859375,
"loss": 0.6888,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.006150488276034594,
"rewards/margins": 0.008788048289716244,
"rewards/rejected": -0.002637560246512294,
"step": 520
},
{
"epoch": 0.13870714472651138,
"grad_norm": 1.6015625,
"learning_rate": 4.977479420266723e-07,
"logits/chosen": -2.8206260204315186,
"logits/rejected": -2.8258962631225586,
"logps/chosen": -280.0619201660156,
"logps/rejected": -288.264892578125,
"loss": 0.6891,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.005794334691017866,
"rewards/margins": 0.008307529613375664,
"rewards/rejected": -0.0025131958536803722,
"step": 530
},
{
"epoch": 0.14132426066474746,
"grad_norm": 1.1953125,
"learning_rate": 4.974316612530614e-07,
"logits/chosen": -2.813945770263672,
"logits/rejected": -2.796184778213501,
"logps/chosen": -298.8610534667969,
"logps/rejected": -258.8526611328125,
"loss": 0.6865,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.0080220652744174,
"rewards/margins": 0.01355043239891529,
"rewards/rejected": -0.00552836898714304,
"step": 540
},
{
"epoch": 0.1439413766029835,
"grad_norm": 1.3359375,
"learning_rate": 4.970947200069415e-07,
"logits/chosen": -2.829272747039795,
"logits/rejected": -2.816063404083252,
"logps/chosen": -298.9976806640625,
"logps/rejected": -276.99444580078125,
"loss": 0.6894,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.005816199816763401,
"rewards/margins": 0.007670801132917404,
"rewards/rejected": -0.0018546013161540031,
"step": 550
},
{
"epoch": 0.14655849254121958,
"grad_norm": 1.1875,
"learning_rate": 4.967371464228095e-07,
"logits/chosen": -2.890547513961792,
"logits/rejected": -2.869276762008667,
"logps/chosen": -271.36053466796875,
"logps/rejected": -272.12469482421875,
"loss": 0.689,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.005742850713431835,
"rewards/margins": 0.00846049003303051,
"rewards/rejected": -0.0027176393195986748,
"step": 560
},
{
"epoch": 0.14917560847945563,
"grad_norm": 1.3984375,
"learning_rate": 4.963589703579569e-07,
"logits/chosen": -2.9156060218811035,
"logits/rejected": -2.888892412185669,
"logps/chosen": -315.22613525390625,
"logps/rejected": -279.62847900390625,
"loss": 0.6881,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.007057487033307552,
"rewards/margins": 0.010187765583395958,
"rewards/rejected": -0.0031302771531045437,
"step": 570
},
{
"epoch": 0.1517927244176917,
"grad_norm": 1.15625,
"learning_rate": 4.959602233899761e-07,
"logits/chosen": -2.9088664054870605,
"logits/rejected": -2.8700101375579834,
"logps/chosen": -314.38787841796875,
"logps/rejected": -272.05511474609375,
"loss": 0.6876,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.009098478592932224,
"rewards/margins": 0.011330665089190006,
"rewards/rejected": -0.002232185797765851,
"step": 580
},
{
"epoch": 0.15440984035592778,
"grad_norm": 1.4609375,
"learning_rate": 4.955409388141243e-07,
"logits/chosen": -2.843714475631714,
"logits/rejected": -2.8304061889648438,
"logps/chosen": -275.0677490234375,
"logps/rejected": -249.8322296142578,
"loss": 0.6884,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005214322358369827,
"rewards/margins": 0.009766822680830956,
"rewards/rejected": -0.004552501253783703,
"step": 590
},
{
"epoch": 0.15702695629416383,
"grad_norm": 1.2109375,
"learning_rate": 4.951011516405429e-07,
"logits/chosen": -2.858010768890381,
"logits/rejected": -2.856701374053955,
"logps/chosen": -266.87335205078125,
"logps/rejected": -251.0322265625,
"loss": 0.6875,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.007665629498660564,
"rewards/margins": 0.011526472866535187,
"rewards/rejected": -0.0038608419708907604,
"step": 600
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -2.860320568084717,
"eval_logits/rejected": -2.8332278728485107,
"eval_logps/chosen": -282.1841125488281,
"eval_logps/rejected": -261.906005859375,
"eval_loss": 0.6879660487174988,
"eval_rewards/accuracies": 0.6690000295639038,
"eval_rewards/chosen": 0.005893694702535868,
"eval_rewards/margins": 0.010570226237177849,
"eval_rewards/rejected": -0.004676531068980694,
"eval_runtime": 622.7218,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 0.401,
"step": 600
},
{
"epoch": 0.1596440722323999,
"grad_norm": 1.421875,
"learning_rate": 4.946408985913344e-07,
"logits/chosen": -2.852583169937134,
"logits/rejected": -2.8311781883239746,
"logps/chosen": -263.86956787109375,
"logps/rejected": -244.29763793945312,
"loss": 0.6885,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.007280237972736359,
"rewards/margins": 0.009582052007317543,
"rewards/rejected": -0.002301814965903759,
"step": 610
},
{
"epoch": 0.16226118817063595,
"grad_norm": 1.40625,
"learning_rate": 4.941602180974958e-07,
"logits/chosen": -2.8539230823516846,
"logits/rejected": -2.8148884773254395,
"logps/chosen": -304.7937316894531,
"logps/rejected": -242.7307891845703,
"loss": 0.6874,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.007980446331202984,
"rewards/margins": 0.011698475107550621,
"rewards/rejected": -0.003718029009178281,
"step": 620
},
{
"epoch": 0.16487830410887203,
"grad_norm": 1.296875,
"learning_rate": 4.936591502957101e-07,
"logits/chosen": -2.857060194015503,
"logits/rejected": -2.83305025100708,
"logps/chosen": -262.7440490722656,
"logps/rejected": -254.7294464111328,
"loss": 0.6863,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.00918588787317276,
"rewards/margins": 0.014022831805050373,
"rewards/rejected": -0.004836943931877613,
"step": 630
},
{
"epoch": 0.16749542004710807,
"grad_norm": 1.2734375,
"learning_rate": 4.931377370249945e-07,
"logits/chosen": -2.8656134605407715,
"logits/rejected": -2.8077850341796875,
"logps/chosen": -280.6070251464844,
"logps/rejected": -258.4964599609375,
"loss": 0.6866,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.005175912287086248,
"rewards/margins": 0.013334142975509167,
"rewards/rejected": -0.008158231154084206,
"step": 640
},
{
"epoch": 0.17011253598534415,
"grad_norm": 1.3125,
"learning_rate": 4.925960218232072e-07,
"logits/chosen": -2.84588885307312,
"logits/rejected": -2.82362699508667,
"logps/chosen": -269.46539306640625,
"logps/rejected": -259.7959899902344,
"loss": 0.6864,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.006764856167137623,
"rewards/margins": 0.013819174841046333,
"rewards/rejected": -0.007054319139569998,
"step": 650
},
{
"epoch": 0.17272965192358022,
"grad_norm": 2.046875,
"learning_rate": 4.920340499234116e-07,
"logits/chosen": -2.81691312789917,
"logits/rejected": -2.7776267528533936,
"logps/chosen": -285.55377197265625,
"logps/rejected": -248.0377960205078,
"loss": 0.6868,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.008050017058849335,
"rewards/margins": 0.012922885827720165,
"rewards/rejected": -0.0048728687688708305,
"step": 660
},
{
"epoch": 0.17534676786181627,
"grad_norm": 1.1875,
"learning_rate": 4.914518682500995e-07,
"logits/chosen": -2.8940651416778564,
"logits/rejected": -2.864454507827759,
"logps/chosen": -299.08636474609375,
"logps/rejected": -257.27215576171875,
"loss": 0.6848,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.011382282711565495,
"rewards/margins": 0.017072781920433044,
"rewards/rejected": -0.005690500605851412,
"step": 670
},
{
"epoch": 0.17796388380005235,
"grad_norm": 1.8515625,
"learning_rate": 4.90849525415273e-07,
"logits/chosen": -2.8536484241485596,
"logits/rejected": -2.832000255584717,
"logps/chosen": -289.3675231933594,
"logps/rejected": -240.21029663085938,
"loss": 0.6854,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.010842313058674335,
"rewards/margins": 0.01590149477124214,
"rewards/rejected": -0.005059181712567806,
"step": 680
},
{
"epoch": 0.1805809997382884,
"grad_norm": 1.3515625,
"learning_rate": 4.902270717143858e-07,
"logits/chosen": -2.862431049346924,
"logits/rejected": -2.8454391956329346,
"logps/chosen": -255.1441650390625,
"logps/rejected": -264.6862487792969,
"loss": 0.685,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.008327770046889782,
"rewards/margins": 0.016681838780641556,
"rewards/rejected": -0.0083540678024292,
"step": 690
},
{
"epoch": 0.18319811567652447,
"grad_norm": 1.65625,
"learning_rate": 4.895845591221426e-07,
"logits/chosen": -2.85901141166687,
"logits/rejected": -2.8616912364959717,
"logps/chosen": -268.5135803222656,
"logps/rejected": -264.209716796875,
"loss": 0.6874,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.00641227001324296,
"rewards/margins": 0.011734376661479473,
"rewards/rejected": -0.005322106648236513,
"step": 700
},
{
"epoch": 0.18319811567652447,
"eval_logits/chosen": -2.861030340194702,
"eval_logits/rejected": -2.8341639041900635,
"eval_logps/chosen": -281.93695068359375,
"eval_logps/rejected": -261.9841613769531,
"eval_loss": 0.686406135559082,
"eval_rewards/accuracies": 0.6784999966621399,
"eval_rewards/chosen": 0.008365440182387829,
"eval_rewards/margins": 0.013823293149471283,
"eval_rewards/rejected": -0.005457851104438305,
"eval_runtime": 622.4947,
"eval_samples_per_second": 3.213,
"eval_steps_per_second": 0.402,
"step": 700
},
{
"epoch": 0.18581523161476055,
"grad_norm": 1.4921875,
"learning_rate": 4.8892204128816e-07,
"logits/chosen": -2.8912875652313232,
"logits/rejected": -2.8667664527893066,
"logps/chosen": -280.8445739746094,
"logps/rejected": -267.3077392578125,
"loss": 0.6874,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.006677532102912664,
"rewards/margins": 0.011775776743888855,
"rewards/rejected": -0.005098243709653616,
"step": 710
},
{
"epoch": 0.1884323475529966,
"grad_norm": 1.1953125,
"learning_rate": 4.882395735324863e-07,
"logits/chosen": -2.8655221462249756,
"logits/rejected": -2.8227005004882812,
"logps/chosen": -280.4420166015625,
"logps/rejected": -267.8427429199219,
"loss": 0.6841,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.00945108663290739,
"rewards/margins": 0.01858513429760933,
"rewards/rejected": -0.009134046733379364,
"step": 720
},
{
"epoch": 0.19104946349123267,
"grad_norm": 1.2578125,
"learning_rate": 4.875372128409829e-07,
"logits/chosen": -2.8432908058166504,
"logits/rejected": -2.813136577606201,
"logps/chosen": -282.8307800292969,
"logps/rejected": -251.0844268798828,
"loss": 0.6851,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.009715097956359386,
"rewards/margins": 0.01650545559823513,
"rewards/rejected": -0.006790356244891882,
"step": 730
},
{
"epoch": 0.19366657942946872,
"grad_norm": 1.0390625,
"learning_rate": 4.868150178605653e-07,
"logits/chosen": -2.8426120281219482,
"logits/rejected": -2.8164243698120117,
"logps/chosen": -242.0150146484375,
"logps/rejected": -210.2700653076172,
"loss": 0.6844,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.0059540546499192715,
"rewards/margins": 0.017786705866456032,
"rewards/rejected": -0.011832650750875473,
"step": 740
},
{
"epoch": 0.1962836953677048,
"grad_norm": 1.5078125,
"learning_rate": 4.860730488943068e-07,
"logits/chosen": -2.8057663440704346,
"logits/rejected": -2.794985294342041,
"logps/chosen": -250.8291778564453,
"logps/rejected": -247.78134155273438,
"loss": 0.6854,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.010436545126140118,
"rewards/margins": 0.0157928504049778,
"rewards/rejected": -0.005356303416192532,
"step": 750
},
{
"epoch": 0.19890081130594087,
"grad_norm": 1.578125,
"learning_rate": 4.853113678964021e-07,
"logits/chosen": -2.8220317363739014,
"logits/rejected": -2.8117756843566895,
"logps/chosen": -293.8874206542969,
"logps/rejected": -279.42962646484375,
"loss": 0.6839,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.012539887800812721,
"rewards/margins": 0.019003767520189285,
"rewards/rejected": -0.006463879253715277,
"step": 760
},
{
"epoch": 0.20151792724417691,
"grad_norm": 1.1484375,
"learning_rate": 4.845300384669957e-07,
"logits/chosen": -2.839818239212036,
"logits/rejected": -2.8078839778900146,
"logps/chosen": -269.07232666015625,
"logps/rejected": -247.03567504882812,
"loss": 0.6861,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.009587548673152924,
"rewards/margins": 0.014613436535000801,
"rewards/rejected": -0.005025886930525303,
"step": 770
},
{
"epoch": 0.204135043182413,
"grad_norm": 1.3671875,
"learning_rate": 4.8372912584687e-07,
"logits/chosen": -2.8615410327911377,
"logits/rejected": -2.827815055847168,
"logps/chosen": -300.0118103027344,
"logps/rejected": -276.13140869140625,
"loss": 0.6856,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.01048839557915926,
"rewards/margins": 0.015544673427939415,
"rewards/rejected": -0.005056279711425304,
"step": 780
},
{
"epoch": 0.20675215912064904,
"grad_norm": 1.9140625,
"learning_rate": 4.829086969119983e-07,
"logits/chosen": -2.827129602432251,
"logits/rejected": -2.8344006538391113,
"logps/chosen": -273.4031066894531,
"logps/rejected": -268.6723937988281,
"loss": 0.6875,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0065932185389101505,
"rewards/margins": 0.011614800430834293,
"rewards/rejected": -0.00502158235758543,
"step": 790
},
{
"epoch": 0.2093692750588851,
"grad_norm": 1.265625,
"learning_rate": 4.820688201679605e-07,
"logits/chosen": -2.8842103481292725,
"logits/rejected": -2.836937189102173,
"logps/chosen": -276.0343933105469,
"logps/rejected": -212.7479705810547,
"loss": 0.682,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.015654196962714195,
"rewards/margins": 0.022758139297366142,
"rewards/rejected": -0.007103943265974522,
"step": 800
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -2.857757568359375,
"eval_logits/rejected": -2.8307132720947266,
"eval_logps/chosen": -281.70330810546875,
"eval_logps/rejected": -262.04193115234375,
"eval_loss": 0.6850252151489258,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": 0.010701690800487995,
"eval_rewards/margins": 0.016737323254346848,
"eval_rewards/rejected": -0.006035633385181427,
"eval_runtime": 622.1863,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 0.402,
"step": 800
},
{
"epoch": 0.21198639099712116,
"grad_norm": 1.2734375,
"learning_rate": 4.812095657442231e-07,
"logits/chosen": -2.8657875061035156,
"logits/rejected": -2.8741297721862793,
"logps/chosen": -288.73614501953125,
"logps/rejected": -281.89031982421875,
"loss": 0.6884,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.008292586542665958,
"rewards/margins": 0.010020687244832516,
"rewards/rejected": -0.001728100934997201,
"step": 810
},
{
"epoch": 0.21460350693535724,
"grad_norm": 1.34375,
"learning_rate": 4.803310053882831e-07,
"logits/chosen": -2.851111650466919,
"logits/rejected": -2.863678455352783,
"logps/chosen": -248.24319458007812,
"logps/rejected": -259.7985534667969,
"loss": 0.6865,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.009849630296230316,
"rewards/margins": 0.013707734644412994,
"rewards/rejected": -0.003858105046674609,
"step": 820
},
{
"epoch": 0.2172206228735933,
"grad_norm": 1.484375,
"learning_rate": 4.794332124596775e-07,
"logits/chosen": -2.881307363510132,
"logits/rejected": -2.8686277866363525,
"logps/chosen": -284.14605712890625,
"logps/rejected": -279.83477783203125,
"loss": 0.6849,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.014169754460453987,
"rewards/margins": 0.017167720943689346,
"rewards/rejected": -0.00299796718172729,
"step": 830
},
{
"epoch": 0.21983773881182936,
"grad_norm": 1.5859375,
"learning_rate": 4.785162619238574e-07,
"logits/chosen": -2.824626922607422,
"logits/rejected": -2.7853519916534424,
"logps/chosen": -269.13934326171875,
"logps/rejected": -243.8739471435547,
"loss": 0.6831,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.011793789453804493,
"rewards/margins": 0.020573901012539864,
"rewards/rejected": -0.008780110627412796,
"step": 840
},
{
"epoch": 0.22245485475006543,
"grad_norm": 1.625,
"learning_rate": 4.775802303459287e-07,
"logits/chosen": -2.8298261165618896,
"logits/rejected": -2.8166985511779785,
"logps/chosen": -262.9281311035156,
"logps/rejected": -260.26129150390625,
"loss": 0.686,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.010038264095783234,
"rewards/margins": 0.014955776743590832,
"rewards/rejected": -0.004917514510452747,
"step": 850
},
{
"epoch": 0.22507197068830148,
"grad_norm": 1.8125,
"learning_rate": 4.766251958842589e-07,
"logits/chosen": -2.800764799118042,
"logits/rejected": -2.7910823822021484,
"logps/chosen": -290.7630615234375,
"logps/rejected": -278.87103271484375,
"loss": 0.6842,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.012506058439612389,
"rewards/margins": 0.018407398834824562,
"rewards/rejected": -0.005901341326534748,
"step": 860
},
{
"epoch": 0.22768908662653756,
"grad_norm": 1.21875,
"learning_rate": 4.756512382839506e-07,
"logits/chosen": -2.82393217086792,
"logits/rejected": -2.7991511821746826,
"logps/chosen": -269.1101989746094,
"logps/rejected": -271.6678161621094,
"loss": 0.6838,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.01023150235414505,
"rewards/margins": 0.01938125677406788,
"rewards/rejected": -0.009149751625955105,
"step": 870
},
{
"epoch": 0.23030620256477363,
"grad_norm": 1.2109375,
"learning_rate": 4.746584388701831e-07,
"logits/chosen": -2.840731620788574,
"logits/rejected": -2.840230941772461,
"logps/chosen": -278.5453186035156,
"logps/rejected": -264.83380126953125,
"loss": 0.6835,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.014049595221877098,
"rewards/margins": 0.019881747663021088,
"rewards/rejected": -0.005832154303789139,
"step": 880
},
{
"epoch": 0.23292331850300968,
"grad_norm": 1.75,
"learning_rate": 4.736468805414218e-07,
"logits/chosen": -2.811013698577881,
"logits/rejected": -2.8108229637145996,
"logps/chosen": -266.8439636230469,
"logps/rejected": -278.64306640625,
"loss": 0.6825,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.013364280574023724,
"rewards/margins": 0.021861828863620758,
"rewards/rejected": -0.008497546426951885,
"step": 890
},
{
"epoch": 0.23554043444124576,
"grad_norm": 1.3359375,
"learning_rate": 4.7261664776249595e-07,
"logits/chosen": -2.783407211303711,
"logits/rejected": -2.7699193954467773,
"logps/chosen": -245.31137084960938,
"logps/rejected": -236.346923828125,
"loss": 0.6837,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.008871756494045258,
"rewards/margins": 0.01949758641421795,
"rewards/rejected": -0.010625829920172691,
"step": 900
},
{
"epoch": 0.23554043444124576,
"eval_logits/chosen": -2.8573083877563477,
"eval_logits/rejected": -2.8303797245025635,
"eval_logps/chosen": -281.4179992675781,
"eval_logps/rejected": -261.9797058105469,
"eval_loss": 0.6839740872383118,
"eval_rewards/accuracies": 0.6840000152587891,
"eval_rewards/chosen": 0.013555029407143593,
"eval_rewards/margins": 0.01896839775145054,
"eval_rewards/rejected": -0.005413366016000509,
"eval_runtime": 621.6447,
"eval_samples_per_second": 3.217,
"eval_steps_per_second": 0.402,
"step": 900
},
{
"epoch": 0.2381575503794818,
"grad_norm": 1.203125,
"learning_rate": 4.7156782655754624e-07,
"logits/chosen": -2.847557544708252,
"logits/rejected": -2.807833194732666,
"logps/chosen": -298.7544250488281,
"logps/rejected": -243.92807006835938,
"loss": 0.6823,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.01836327835917473,
"rewards/margins": 0.0223236046731472,
"rewards/rejected": -0.003960323985666037,
"step": 910
},
{
"epoch": 0.24077466631771788,
"grad_norm": 1.25,
"learning_rate": 4.705005045028414e-07,
"logits/chosen": -2.8043971061706543,
"logits/rejected": -2.775317430496216,
"logps/chosen": -279.80584716796875,
"logps/rejected": -261.52227783203125,
"loss": 0.6838,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.014174017123878002,
"rewards/margins": 0.01934727467596531,
"rewards/rejected": -0.005173257552087307,
"step": 920
},
{
"epoch": 0.24339178225595393,
"grad_norm": 1.3984375,
"learning_rate": 4.694147707194659e-07,
"logits/chosen": -2.871277332305908,
"logits/rejected": -2.861692428588867,
"logps/chosen": -286.360107421875,
"logps/rejected": -268.2128601074219,
"loss": 0.6821,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.016537340357899666,
"rewards/margins": 0.022989634424448013,
"rewards/rejected": -0.00645229360088706,
"step": 930
},
{
"epoch": 0.24600889819419,
"grad_norm": 2.578125,
"learning_rate": 4.683107158658781e-07,
"logits/chosen": -2.818206548690796,
"logits/rejected": -2.799956798553467,
"logps/chosen": -306.58203125,
"logps/rejected": -278.14752197265625,
"loss": 0.678,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.021548133343458176,
"rewards/margins": 0.03120315447449684,
"rewards/rejected": -0.009655019268393517,
"step": 940
},
{
"epoch": 0.24862601413242608,
"grad_norm": 1.4609375,
"learning_rate": 4.6718843213034066e-07,
"logits/chosen": -2.831376314163208,
"logits/rejected": -2.815030097961426,
"logps/chosen": -261.57659912109375,
"logps/rejected": -250.5101318359375,
"loss": 0.6823,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.014541561715304852,
"rewards/margins": 0.022364726290106773,
"rewards/rejected": -0.007823166437447071,
"step": 950
},
{
"epoch": 0.2512431300706621,
"grad_norm": 1.3984375,
"learning_rate": 4.660480132232224e-07,
"logits/chosen": -2.8427586555480957,
"logits/rejected": -2.8429322242736816,
"logps/chosen": -285.56378173828125,
"logps/rejected": -263.73260498046875,
"loss": 0.6849,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.014430510811507702,
"rewards/margins": 0.01709624193608761,
"rewards/rejected": -0.002665730658918619,
"step": 960
},
{
"epoch": 0.25386024600889817,
"grad_norm": 1.1484375,
"learning_rate": 4.64889554369174e-07,
"logits/chosen": -2.8453879356384277,
"logits/rejected": -2.8087105751037598,
"logps/chosen": -296.9171142578125,
"logps/rejected": -250.25,
"loss": 0.6784,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.022362882271409035,
"rewards/margins": 0.030544385313987732,
"rewards/rejected": -0.008181498385965824,
"step": 970
},
{
"epoch": 0.2564773619471343,
"grad_norm": 1.328125,
"learning_rate": 4.637131522991764e-07,
"logits/chosen": -2.8417975902557373,
"logits/rejected": -2.8379454612731934,
"logps/chosen": -304.47784423828125,
"logps/rejected": -279.02911376953125,
"loss": 0.6831,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.01869601011276245,
"rewards/margins": 0.02095440961420536,
"rewards/rejected": -0.002258400898426771,
"step": 980
},
{
"epoch": 0.2590944778853703,
"grad_norm": 1.171875,
"learning_rate": 4.6251890524246375e-07,
"logits/chosen": -2.8454818725585938,
"logits/rejected": -2.8245933055877686,
"logps/chosen": -253.8804931640625,
"logps/rejected": -232.57418823242188,
"loss": 0.6802,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.015828203409910202,
"rewards/margins": 0.02681964635848999,
"rewards/rejected": -0.010991444811224937,
"step": 990
},
{
"epoch": 0.26171159382360637,
"grad_norm": 1.21875,
"learning_rate": 4.613069129183218e-07,
"logits/chosen": -2.8802895545959473,
"logits/rejected": -2.840637683868408,
"logps/chosen": -319.56268310546875,
"logps/rejected": -281.52264404296875,
"loss": 0.6819,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.018990501761436462,
"rewards/margins": 0.023325005546212196,
"rewards/rejected": -0.004334500525146723,
"step": 1000
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -2.853975534439087,
"eval_logits/rejected": -2.8269295692443848,
"eval_logps/chosen": -281.1678466796875,
"eval_logps/rejected": -261.9830017089844,
"eval_loss": 0.6827893257141113,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": 0.016056543216109276,
"eval_rewards/margins": 0.021502956748008728,
"eval_rewards/rejected": -0.005446411669254303,
"eval_runtime": 621.9705,
"eval_samples_per_second": 3.216,
"eval_steps_per_second": 0.402,
"step": 1000
},
{
"epoch": 0.2643287097618425,
"grad_norm": 1.265625,
"learning_rate": 4.6007727652776065e-07,
"logits/chosen": -2.8141977787017822,
"logits/rejected": -2.7999210357666016,
"logps/chosen": -249.3959197998047,
"logps/rejected": -245.47830200195312,
"loss": 0.6824,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.016399413347244263,
"rewards/margins": 0.02237236499786377,
"rewards/rejected": -0.0059729525819420815,
"step": 1010
},
{
"epoch": 0.2669458257000785,
"grad_norm": 1.578125,
"learning_rate": 4.588300987450652e-07,
"logits/chosen": -2.86116099357605,
"logits/rejected": -2.8352913856506348,
"logps/chosen": -268.90374755859375,
"logps/rejected": -237.9334259033203,
"loss": 0.682,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.016041552647948265,
"rewards/margins": 0.02327124960720539,
"rewards/rejected": -0.007229696027934551,
"step": 1020
},
{
"epoch": 0.26956294163831457,
"grad_norm": 1.0859375,
"learning_rate": 4.5756548370922134e-07,
"logits/chosen": -2.8210678100585938,
"logits/rejected": -2.8014864921569824,
"logps/chosen": -254.616455078125,
"logps/rejected": -245.8887939453125,
"loss": 0.6857,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.01615295186638832,
"rewards/margins": 0.015799041837453842,
"rewards/rejected": 0.0003539065073709935,
"step": 1030
},
{
"epoch": 0.2721800575765506,
"grad_norm": 1.1328125,
"learning_rate": 4.5628353701522047e-07,
"logits/chosen": -2.8566808700561523,
"logits/rejected": -2.823983669281006,
"logps/chosen": -317.4922180175781,
"logps/rejected": -287.9608154296875,
"loss": 0.6771,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.024755671620368958,
"rewards/margins": 0.03346611559391022,
"rewards/rejected": -0.00871044397354126,
"step": 1040
},
{
"epoch": 0.2747971735147867,
"grad_norm": 1.6875,
"learning_rate": 4.549843657052429e-07,
"logits/chosen": -2.8731515407562256,
"logits/rejected": -2.845651388168335,
"logps/chosen": -282.2266540527344,
"logps/rejected": -279.3399658203125,
"loss": 0.677,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.02262326516211033,
"rewards/margins": 0.03333864361047745,
"rewards/rejected": -0.010715381242334843,
"step": 1050
},
{
"epoch": 0.27741428945302277,
"grad_norm": 1.140625,
"learning_rate": 4.5366807825971907e-07,
"logits/chosen": -2.8223326206207275,
"logits/rejected": -2.813047409057617,
"logps/chosen": -252.23336791992188,
"logps/rejected": -246.99490356445312,
"loss": 0.6827,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.015223952010273933,
"rewards/margins": 0.02188166417181492,
"rewards/rejected": -0.006657709833234549,
"step": 1060
},
{
"epoch": 0.2800314053912588,
"grad_norm": 1.2734375,
"learning_rate": 4.5233478458827176e-07,
"logits/chosen": -2.856292247772217,
"logits/rejected": -2.8288464546203613,
"logps/chosen": -306.0169372558594,
"logps/rejected": -254.22042846679688,
"loss": 0.6782,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.0232028029859066,
"rewards/margins": 0.03090915083885193,
"rewards/rejected": -0.007706350646913052,
"step": 1070
},
{
"epoch": 0.2826485213294949,
"grad_norm": 1.328125,
"learning_rate": 4.509845960205389e-07,
"logits/chosen": -2.791149854660034,
"logits/rejected": -2.7939980030059814,
"logps/chosen": -294.93231201171875,
"logps/rejected": -263.44830322265625,
"loss": 0.6818,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.020023692399263382,
"rewards/margins": 0.023503463715314865,
"rewards/rejected": -0.0034797731786966324,
"step": 1080
},
{
"epoch": 0.28526563726773096,
"grad_norm": 1.765625,
"learning_rate": 4.4961762529687736e-07,
"logits/chosen": -2.8478240966796875,
"logits/rejected": -2.8274638652801514,
"logps/chosen": -277.98541259765625,
"logps/rejected": -260.0039978027344,
"loss": 0.6833,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.01708926074206829,
"rewards/margins": 0.020394863560795784,
"rewards/rejected": -0.0033055986277759075,
"step": 1090
},
{
"epoch": 0.287882753205967,
"grad_norm": 1.2890625,
"learning_rate": 4.482339865589492e-07,
"logits/chosen": -2.8545358180999756,
"logits/rejected": -2.8111648559570312,
"logps/chosen": -281.62652587890625,
"logps/rejected": -238.7964324951172,
"loss": 0.6836,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.013560217805206776,
"rewards/margins": 0.019967440515756607,
"rewards/rejected": -0.006407222244888544,
"step": 1100
},
{
"epoch": 0.287882753205967,
"eval_logits/chosen": -2.852851152420044,
"eval_logits/rejected": -2.825786590576172,
"eval_logps/chosen": -280.98529052734375,
"eval_logps/rejected": -262.00518798828125,
"eval_loss": 0.6818436980247498,
"eval_rewards/accuracies": 0.6784999966621399,
"eval_rewards/chosen": 0.01788218505680561,
"eval_rewards/margins": 0.023550525307655334,
"eval_rewards/rejected": -0.00566834257915616,
"eval_runtime": 621.9928,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 0.402,
"step": 1100
},
{
"epoch": 0.2904998691442031,
"grad_norm": 1.2890625,
"learning_rate": 4.4683379534019076e-07,
"logits/chosen": -2.8489432334899902,
"logits/rejected": -2.8446106910705566,
"logps/chosen": -284.64337158203125,
"logps/rejected": -280.3296203613281,
"loss": 0.6832,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01724550686776638,
"rewards/margins": 0.020626548677682877,
"rewards/rejected": -0.0033810404129326344,
"step": 1110
},
{
"epoch": 0.29311698508243916,
"grad_norm": 1.28125,
"learning_rate": 4.4541716855616593e-07,
"logits/chosen": -2.822422742843628,
"logits/rejected": -2.8002758026123047,
"logps/chosen": -256.53265380859375,
"logps/rejected": -259.35723876953125,
"loss": 0.684,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.01423877477645874,
"rewards/margins": 0.019155513495206833,
"rewards/rejected": -0.004916741047054529,
"step": 1120
},
{
"epoch": 0.2957341010206752,
"grad_norm": 1.203125,
"learning_rate": 4.4398422449480357e-07,
"logits/chosen": -2.8172736167907715,
"logits/rejected": -2.7678089141845703,
"logps/chosen": -278.84429931640625,
"logps/rejected": -282.52862548828125,
"loss": 0.6835,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.014500883407890797,
"rewards/margins": 0.020229389891028404,
"rewards/rejected": -0.0057285078801214695,
"step": 1130
},
{
"epoch": 0.29835121695891126,
"grad_norm": 1.1953125,
"learning_rate": 4.4253508280652036e-07,
"logits/chosen": -2.838125705718994,
"logits/rejected": -2.791625738143921,
"logps/chosen": -301.6747741699219,
"logps/rejected": -253.36123657226562,
"loss": 0.6789,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.02180757373571396,
"rewards/margins": 0.029516074806451797,
"rewards/rejected": -0.007708498742431402,
"step": 1140
},
{
"epoch": 0.30096833289714736,
"grad_norm": 1.1640625,
"learning_rate": 4.410698644942302e-07,
"logits/chosen": -2.879281997680664,
"logits/rejected": -2.851644277572632,
"logps/chosen": -285.1121520996094,
"logps/rejected": -263.08416748046875,
"loss": 0.6796,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.022570660337805748,
"rewards/margins": 0.02802709862589836,
"rewards/rejected": -0.005456441547721624,
"step": 1150
},
{
"epoch": 0.3035854488353834,
"grad_norm": 1.546875,
"learning_rate": 4.3958869190324057e-07,
"logits/chosen": -2.8084969520568848,
"logits/rejected": -2.7690627574920654,
"logps/chosen": -277.8807067871094,
"logps/rejected": -252.42825317382812,
"loss": 0.6801,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.018787220120429993,
"rewards/margins": 0.027080217376351357,
"rewards/rejected": -0.008292997255921364,
"step": 1160
},
{
"epoch": 0.30620256477361946,
"grad_norm": 1.4140625,
"learning_rate": 4.380916887110365e-07,
"logits/chosen": -2.869062900543213,
"logits/rejected": -2.8376870155334473,
"logps/chosen": -273.46063232421875,
"logps/rejected": -233.0785675048828,
"loss": 0.6807,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.01758132129907608,
"rewards/margins": 0.026008691638708115,
"rewards/rejected": -0.008427368476986885,
"step": 1170
},
{
"epoch": 0.30881968071185556,
"grad_norm": 1.3046875,
"learning_rate": 4.3657897991695394e-07,
"logits/chosen": -2.7800498008728027,
"logits/rejected": -2.8185646533966064,
"logps/chosen": -268.2494201660156,
"logps/rejected": -269.7010498046875,
"loss": 0.6818,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.019908545538783073,
"rewards/margins": 0.023755352944135666,
"rewards/rejected": -0.0038468041457235813,
"step": 1180
},
{
"epoch": 0.3114367966500916,
"grad_norm": 1.2265625,
"learning_rate": 4.350506918317416e-07,
"logits/chosen": -2.856717824935913,
"logits/rejected": -2.824436664581299,
"logps/chosen": -260.1253967285156,
"logps/rejected": -256.15118408203125,
"loss": 0.6829,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.017839016392827034,
"rewards/margins": 0.021472811698913574,
"rewards/rejected": -0.003633796004578471,
"step": 1190
},
{
"epoch": 0.31405391258832765,
"grad_norm": 1.515625,
"learning_rate": 4.335069520670149e-07,
"logits/chosen": -2.8279824256896973,
"logits/rejected": -2.802241325378418,
"logps/chosen": -241.59786987304688,
"logps/rejected": -255.2316131591797,
"loss": 0.685,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.017605960369110107,
"rewards/margins": 0.01734979636967182,
"rewards/rejected": 0.0002561651053838432,
"step": 1200
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -2.8509681224823,
"eval_logits/rejected": -2.8237783908843994,
"eval_logps/chosen": -280.56793212890625,
"eval_logps/rejected": -261.7609558105469,
"eval_loss": 0.6810342073440552,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": 0.022055484354496002,
"eval_rewards/margins": 0.02528143860399723,
"eval_rewards/rejected": -0.003225954482331872,
"eval_runtime": 623.5714,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 0.401,
"step": 1200
},
{
"epoch": 0.3166710285265637,
"grad_norm": 1.2578125,
"learning_rate": 4.319478895245999e-07,
"logits/chosen": -2.846019744873047,
"logits/rejected": -2.8138914108276367,
"logps/chosen": -262.89471435546875,
"logps/rejected": -238.1546173095703,
"loss": 0.6785,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.02249886468052864,
"rewards/margins": 0.030421704053878784,
"rewards/rejected": -0.007922842167317867,
"step": 1210
},
{
"epoch": 0.3192881444647998,
"grad_norm": 1.359375,
"learning_rate": 4.3037363438577036e-07,
"logits/chosen": -2.8656246662139893,
"logits/rejected": -2.828981399536133,
"logps/chosen": -269.7254943847656,
"logps/rejected": -284.9521789550781,
"loss": 0.6799,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.025678550824522972,
"rewards/margins": 0.027557630091905594,
"rewards/rejected": -0.001879077055491507,
"step": 1220
},
{
"epoch": 0.32190526040303585,
"grad_norm": 1.3671875,
"learning_rate": 4.2878431810037716e-07,
"logits/chosen": -2.8651747703552246,
"logits/rejected": -2.851069927215576,
"logps/chosen": -309.12872314453125,
"logps/rejected": -264.8939514160156,
"loss": 0.6775,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.025112558156251907,
"rewards/margins": 0.03247564285993576,
"rewards/rejected": -0.007363081909716129,
"step": 1230
},
{
"epoch": 0.3245223763412719,
"grad_norm": 1.34375,
"learning_rate": 4.271800733758729e-07,
"logits/chosen": -2.838114023208618,
"logits/rejected": -2.8366000652313232,
"logps/chosen": -301.5270690917969,
"logps/rejected": -268.2883605957031,
"loss": 0.6769,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.027721276506781578,
"rewards/margins": 0.03389520198106766,
"rewards/rejected": -0.006173927802592516,
"step": 1240
},
{
"epoch": 0.327139492279508,
"grad_norm": 1.3359375,
"learning_rate": 4.255610341662304e-07,
"logits/chosen": -2.863908529281616,
"logits/rejected": -2.806837797164917,
"logps/chosen": -273.0283203125,
"logps/rejected": -253.21798706054688,
"loss": 0.6806,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.023084726184606552,
"rewards/margins": 0.026232142001390457,
"rewards/rejected": -0.003147417213767767,
"step": 1250
},
{
"epoch": 0.32975660821774405,
"grad_norm": 1.4453125,
"learning_rate": 4.2392733566075757e-07,
"logits/chosen": -2.8411412239074707,
"logits/rejected": -2.812453508377075,
"logps/chosen": -271.3999938964844,
"logps/rejected": -254.3863983154297,
"loss": 0.6838,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.021707288920879364,
"rewards/margins": 0.019517619162797928,
"rewards/rejected": 0.002189669292420149,
"step": 1260
},
{
"epoch": 0.3323737241559801,
"grad_norm": 1.2265625,
"learning_rate": 4.2227911427280973e-07,
"logits/chosen": -2.8064496517181396,
"logits/rejected": -2.777616500854492,
"logps/chosen": -263.7330627441406,
"logps/rejected": -234.1592559814453,
"loss": 0.6807,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0242301132529974,
"rewards/margins": 0.026022180914878845,
"rewards/rejected": -0.0017920676618814468,
"step": 1270
},
{
"epoch": 0.33499084009421615,
"grad_norm": 1.359375,
"learning_rate": 4.206165076283982e-07,
"logits/chosen": -2.8404831886291504,
"logits/rejected": -2.81711483001709,
"logps/chosen": -259.4391174316406,
"logps/rejected": -243.4575653076172,
"loss": 0.6796,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02131321281194687,
"rewards/margins": 0.028222400695085526,
"rewards/rejected": -0.00690918555483222,
"step": 1280
},
{
"epoch": 0.33760795603245225,
"grad_norm": 1.4296875,
"learning_rate": 4.1893965455469946e-07,
"logits/chosen": -2.855498790740967,
"logits/rejected": -2.832735538482666,
"logps/chosen": -263.16021728515625,
"logps/rejected": -243.6949462890625,
"loss": 0.6818,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.01883005164563656,
"rewards/margins": 0.02386125549674034,
"rewards/rejected": -0.005031202454119921,
"step": 1290
},
{
"epoch": 0.3402250719706883,
"grad_norm": 1.1328125,
"learning_rate": 4.172486950684626e-07,
"logits/chosen": -2.851036548614502,
"logits/rejected": -2.842118501663208,
"logps/chosen": -266.0923767089844,
"logps/rejected": -266.4703369140625,
"loss": 0.6785,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02591409906744957,
"rewards/margins": 0.030536871403455734,
"rewards/rejected": -0.004622773267328739,
"step": 1300
},
{
"epoch": 0.3402250719706883,
"eval_logits/chosen": -2.85286283493042,
"eval_logits/rejected": -2.825887441635132,
"eval_logps/chosen": -280.6852111816406,
"eval_logps/rejected": -262.0453186035156,
"eval_loss": 0.6802608966827393,
"eval_rewards/accuracies": 0.6840000152587891,
"eval_rewards/chosen": 0.02088269591331482,
"eval_rewards/margins": 0.026952272281050682,
"eval_rewards/rejected": -0.006069576367735863,
"eval_runtime": 623.414,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 0.401,
"step": 1300
},
{
"epoch": 0.34284218790892435,
"grad_norm": 1.265625,
"learning_rate": 4.155437703643181e-07,
"logits/chosen": -2.877864360809326,
"logits/rejected": -2.8368418216705322,
"logps/chosen": -258.5326843261719,
"logps/rejected": -233.0809326171875,
"loss": 0.6764,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.025921067222952843,
"rewards/margins": 0.034750454127788544,
"rewards/rejected": -0.008829386904835701,
"step": 1310
},
{
"epoch": 0.34545930384716045,
"grad_norm": 1.21875,
"learning_rate": 4.138250228029881e-07,
"logits/chosen": -2.8507418632507324,
"logits/rejected": -2.8341784477233887,
"logps/chosen": -270.1946716308594,
"logps/rejected": -279.32489013671875,
"loss": 0.6829,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.018970279023051262,
"rewards/margins": 0.021766219288110733,
"rewards/rejected": -0.002795940963551402,
"step": 1320
},
{
"epoch": 0.3480764197853965,
"grad_norm": 1.1796875,
"learning_rate": 4.1209259589939935e-07,
"logits/chosen": -2.8348724842071533,
"logits/rejected": -2.8295791149139404,
"logps/chosen": -247.55770874023438,
"logps/rejected": -242.4851837158203,
"loss": 0.6819,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.022017816081643105,
"rewards/margins": 0.02364877425134182,
"rewards/rejected": -0.00163096038158983,
"step": 1330
},
{
"epoch": 0.35069353572363254,
"grad_norm": 1.2578125,
"learning_rate": 4.103466343106998e-07,
"logits/chosen": -2.868483543395996,
"logits/rejected": -2.855750560760498,
"logps/chosen": -287.6753845214844,
"logps/rejected": -257.07257080078125,
"loss": 0.6827,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.01955850049853325,
"rewards/margins": 0.02184746228158474,
"rewards/rejected": -0.002288959687575698,
"step": 1340
},
{
"epoch": 0.35331065166186865,
"grad_norm": 1.1328125,
"learning_rate": 4.085872838241796e-07,
"logits/chosen": -2.8042919635772705,
"logits/rejected": -2.765010356903076,
"logps/chosen": -293.25677490234375,
"logps/rejected": -261.722412109375,
"loss": 0.6818,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.015371786430478096,
"rewards/margins": 0.023923274129629135,
"rewards/rejected": -0.008551487699151039,
"step": 1350
},
{
"epoch": 0.3559277676001047,
"grad_norm": 1.390625,
"learning_rate": 4.06814691345098e-07,
"logits/chosen": -2.7857346534729004,
"logits/rejected": -2.756810426712036,
"logps/chosen": -272.96185302734375,
"logps/rejected": -253.2128448486328,
"loss": 0.6801,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.02000296302139759,
"rewards/margins": 0.027280131354928017,
"rewards/rejected": -0.00727717112749815,
"step": 1360
},
{
"epoch": 0.35854488353834074,
"grad_norm": 1.6328125,
"learning_rate": 4.0502900488441707e-07,
"logits/chosen": -2.8389689922332764,
"logits/rejected": -2.8258564472198486,
"logps/chosen": -283.4964294433594,
"logps/rejected": -280.92559814453125,
"loss": 0.6809,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0220597293227911,
"rewards/margins": 0.025511642917990685,
"rewards/rejected": -0.003451913595199585,
"step": 1370
},
{
"epoch": 0.3611619994765768,
"grad_norm": 1.3046875,
"learning_rate": 4.032303735464422e-07,
"logits/chosen": -2.9172284603118896,
"logits/rejected": -2.868638753890991,
"logps/chosen": -287.5843811035156,
"logps/rejected": -264.72808837890625,
"loss": 0.6775,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02606380544602871,
"rewards/margins": 0.032886773347854614,
"rewards/rejected": -0.006822962313890457,
"step": 1380
},
{
"epoch": 0.3637791154148129,
"grad_norm": 1.4296875,
"learning_rate": 4.014189475163726e-07,
"logits/chosen": -2.8349239826202393,
"logits/rejected": -2.8192358016967773,
"logps/chosen": -270.93731689453125,
"logps/rejected": -261.95611572265625,
"loss": 0.6779,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.019413406029343605,
"rewards/margins": 0.0317561998963356,
"rewards/rejected": -0.012342792935669422,
"step": 1390
},
{
"epoch": 0.36639623135304894,
"grad_norm": 1.59375,
"learning_rate": 3.995948780477605e-07,
"logits/chosen": -2.8566317558288574,
"logits/rejected": -2.821171998977661,
"logps/chosen": -283.40374755859375,
"logps/rejected": -261.3340148925781,
"loss": 0.6828,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.015268507413566113,
"rewards/margins": 0.022029511630535126,
"rewards/rejected": -0.0067610046826303005,
"step": 1400
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -2.850475549697876,
"eval_logits/rejected": -2.8233399391174316,
"eval_logps/chosen": -280.606201171875,
"eval_logps/rejected": -262.1006774902344,
"eval_loss": 0.6796398758888245,
"eval_rewards/accuracies": 0.6865000128746033,
"eval_rewards/chosen": 0.02167338877916336,
"eval_rewards/margins": 0.028296444565057755,
"eval_rewards/rejected": -0.006623056251555681,
"eval_runtime": 622.3734,
"eval_samples_per_second": 3.214,
"eval_steps_per_second": 0.402,
"step": 1400
},
{
"epoch": 0.369013347291285,
"grad_norm": 1.3515625,
"learning_rate": 3.977583174498816e-07,
"logits/chosen": -2.856717824935913,
"logits/rejected": -2.8365931510925293,
"logps/chosen": -283.3439025878906,
"logps/rejected": -262.3489990234375,
"loss": 0.676,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.025825675576925278,
"rewards/margins": 0.03570712357759476,
"rewards/rejected": -0.009881444275379181,
"step": 1410
},
{
"epoch": 0.3716304632295211,
"grad_norm": 1.3984375,
"learning_rate": 3.9590941907501717e-07,
"logits/chosen": -2.86562180519104,
"logits/rejected": -2.84371280670166,
"logps/chosen": -298.9716796875,
"logps/rejected": -272.981689453125,
"loss": 0.676,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.03262133151292801,
"rewards/margins": 0.03562031686306,
"rewards/rejected": -0.002998985815793276,
"step": 1420
},
{
"epoch": 0.37424757916775714,
"grad_norm": 2.0625,
"learning_rate": 3.9404833730564974e-07,
"logits/chosen": -2.766550302505493,
"logits/rejected": -2.7500851154327393,
"logps/chosen": -270.97052001953125,
"logps/rejected": -261.961181640625,
"loss": 0.6788,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.02307405136525631,
"rewards/margins": 0.0301786120980978,
"rewards/rejected": -0.007104557007551193,
"step": 1430
},
{
"epoch": 0.3768646951059932,
"grad_norm": 1.296875,
"learning_rate": 3.9217522754157117e-07,
"logits/chosen": -2.84255051612854,
"logits/rejected": -2.838588237762451,
"logps/chosen": -266.6410827636719,
"logps/rejected": -246.63916015625,
"loss": 0.6756,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.02127646468579769,
"rewards/margins": 0.036509204655885696,
"rewards/rejected": -0.015232739970088005,
"step": 1440
},
{
"epoch": 0.37948181104422923,
"grad_norm": 1.296875,
"learning_rate": 3.9029024618690785e-07,
"logits/chosen": -2.854072093963623,
"logits/rejected": -2.8262181282043457,
"logps/chosen": -253.68600463867188,
"logps/rejected": -238.5004425048828,
"loss": 0.6816,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.01315716840326786,
"rewards/margins": 0.024385813623666763,
"rewards/rejected": -0.011228645220398903,
"step": 1450
},
{
"epoch": 0.38209892698246534,
"grad_norm": 1.4609375,
"learning_rate": 3.883935506370605e-07,
"logits/chosen": -2.816702365875244,
"logits/rejected": -2.8033461570739746,
"logps/chosen": -267.4139404296875,
"logps/rejected": -239.7705535888672,
"loss": 0.6758,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.027285242453217506,
"rewards/margins": 0.036063503473997116,
"rewards/rejected": -0.008778261952102184,
"step": 1460
},
{
"epoch": 0.3847160429207014,
"grad_norm": 1.734375,
"learning_rate": 3.864852992655616e-07,
"logits/chosen": -2.8303184509277344,
"logits/rejected": -2.813981533050537,
"logps/chosen": -266.30218505859375,
"logps/rejected": -254.7268524169922,
"loss": 0.6749,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02994285151362419,
"rewards/margins": 0.03793361037969589,
"rewards/rejected": -0.007990758866071701,
"step": 1470
},
{
"epoch": 0.38733315885893743,
"grad_norm": 1.140625,
"learning_rate": 3.845656514108515e-07,
"logits/chosen": -2.8377737998962402,
"logits/rejected": -2.815836191177368,
"logps/chosen": -279.17205810546875,
"logps/rejected": -219.8304901123047,
"loss": 0.6792,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.02100013568997383,
"rewards/margins": 0.029268179088830948,
"rewards/rejected": -0.008268042467534542,
"step": 1480
},
{
"epoch": 0.38995027479717354,
"grad_norm": 1.4609375,
"learning_rate": 3.8263476736297375e-07,
"logits/chosen": -2.8322536945343018,
"logits/rejected": -2.7860350608825684,
"logps/chosen": -266.8233947753906,
"logps/rejected": -243.4827880859375,
"loss": 0.6787,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.018916089087724686,
"rewards/margins": 0.03032476268708706,
"rewards/rejected": -0.011408672668039799,
"step": 1490
},
{
"epoch": 0.3925673907354096,
"grad_norm": 1.9453125,
"learning_rate": 3.8069280835019055e-07,
"logits/chosen": -2.822990894317627,
"logits/rejected": -2.789095878601074,
"logps/chosen": -282.0010070800781,
"logps/rejected": -262.74383544921875,
"loss": 0.6795,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.024773618206381798,
"rewards/margins": 0.028730124235153198,
"rewards/rejected": -0.0039565060287714005,
"step": 1500
},
{
"epoch": 0.3925673907354096,
"eval_logits/chosen": -2.851984977722168,
"eval_logits/rejected": -2.825019121170044,
"eval_logps/chosen": -280.51751708984375,
"eval_logps/rejected": -262.1142883300781,
"eval_loss": 0.6791806221008301,
"eval_rewards/accuracies": 0.6830000281333923,
"eval_rewards/chosen": 0.022559717297554016,
"eval_rewards/margins": 0.029319126158952713,
"eval_rewards/rejected": -0.0067594097927212715,
"eval_runtime": 621.2203,
"eval_samples_per_second": 3.219,
"eval_steps_per_second": 0.402,
"step": 1500
},
{
"epoch": 0.39518450667364563,
"grad_norm": 1.3125,
"learning_rate": 3.7873993652552073e-07,
"logits/chosen": -2.8283066749572754,
"logits/rejected": -2.811255693435669,
"logps/chosen": -247.559814453125,
"logps/rejected": -242.0292205810547,
"loss": 0.6857,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.015011796727776527,
"rewards/margins": 0.016142752021551132,
"rewards/rejected": -0.0011309570400044322,
"step": 1510
},
{
"epoch": 0.39780162261188173,
"grad_norm": 1.34375,
"learning_rate": 3.767763149531995e-07,
"logits/chosen": -2.8359994888305664,
"logits/rejected": -2.819225311279297,
"logps/chosen": -277.1551208496094,
"logps/rejected": -260.8001708984375,
"loss": 0.6772,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02346893586218357,
"rewards/margins": 0.03324516490101814,
"rewards/rejected": -0.009776233695447445,
"step": 1520
},
{
"epoch": 0.4004187385501178,
"grad_norm": 1.5546875,
"learning_rate": 3.7480210759506326e-07,
"logits/chosen": -2.808189868927002,
"logits/rejected": -2.8010077476501465,
"logps/chosen": -292.74847412109375,
"logps/rejected": -281.989013671875,
"loss": 0.6814,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.02831420861184597,
"rewards/margins": 0.025223467499017715,
"rewards/rejected": 0.003090745070949197,
"step": 1530
},
{
"epoch": 0.40303585448835383,
"grad_norm": 1.234375,
"learning_rate": 3.728174792968582e-07,
"logits/chosen": -2.8126258850097656,
"logits/rejected": -2.782135248184204,
"logps/chosen": -253.31851196289062,
"logps/rejected": -239.48095703125,
"loss": 0.6811,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.016257187351584435,
"rewards/margins": 0.02541721798479557,
"rewards/rejected": -0.00916003156453371,
"step": 1540
},
{
"epoch": 0.4056529704265899,
"grad_norm": 1.3828125,
"learning_rate": 3.70822595774476e-07,
"logits/chosen": -2.8408150672912598,
"logits/rejected": -2.8108041286468506,
"logps/chosen": -285.0970153808594,
"logps/rejected": -271.54425048828125,
"loss": 0.6743,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.029356488958001137,
"rewards/margins": 0.03932540863752365,
"rewards/rejected": -0.009968922473490238,
"step": 1550
},
{
"epoch": 0.408270086364826,
"grad_norm": 1.4609375,
"learning_rate": 3.688176236001168e-07,
"logits/chosen": -2.837639331817627,
"logits/rejected": -2.7985403537750244,
"logps/chosen": -294.97491455078125,
"logps/rejected": -259.5853271484375,
"loss": 0.6779,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.028945360332727432,
"rewards/margins": 0.03233477473258972,
"rewards/rejected": -0.0033894157968461514,
"step": 1560
},
{
"epoch": 0.410887202303062,
"grad_norm": 1.34375,
"learning_rate": 3.6680273018838016e-07,
"logits/chosen": -2.847684144973755,
"logits/rejected": -2.8297903537750244,
"logps/chosen": -267.1182556152344,
"logps/rejected": -251.4414825439453,
"loss": 0.6751,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.0260789655148983,
"rewards/margins": 0.03772038221359253,
"rewards/rejected": -0.011641415767371655,
"step": 1570
},
{
"epoch": 0.4135043182412981,
"grad_norm": 1.265625,
"learning_rate": 3.6477808378228596e-07,
"logits/chosen": -2.8226513862609863,
"logits/rejected": -2.8190042972564697,
"logps/chosen": -268.88055419921875,
"logps/rejected": -301.400146484375,
"loss": 0.678,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.022145092487335205,
"rewards/margins": 0.03182849660515785,
"rewards/rejected": -0.009683402255177498,
"step": 1580
},
{
"epoch": 0.4161214341795342,
"grad_norm": 1.140625,
"learning_rate": 3.6274385343922674e-07,
"logits/chosen": -2.8877930641174316,
"logits/rejected": -2.8846001625061035,
"logps/chosen": -250.56411743164062,
"logps/rejected": -259.74505615234375,
"loss": 0.6826,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.015636231750249863,
"rewards/margins": 0.0221557579934597,
"rewards/rejected": -0.006519525311887264,
"step": 1590
},
{
"epoch": 0.4187385501177702,
"grad_norm": 1.140625,
"learning_rate": 3.6070020901685057e-07,
"logits/chosen": -2.8079447746276855,
"logits/rejected": -2.812084197998047,
"logps/chosen": -280.13079833984375,
"logps/rejected": -259.9898681640625,
"loss": 0.6801,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.02125183865427971,
"rewards/margins": 0.027416234835982323,
"rewards/rejected": -0.00616439338773489,
"step": 1600
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -2.8515522480010986,
"eval_logits/rejected": -2.824509382247925,
"eval_logps/chosen": -280.82861328125,
"eval_logps/rejected": -262.506591796875,
"eval_loss": 0.6788100600242615,
"eval_rewards/accuracies": 0.684499979019165,
"eval_rewards/chosen": 0.01944848708808422,
"eval_rewards/margins": 0.030131228268146515,
"eval_rewards/rejected": -0.010682739317417145,
"eval_runtime": 623.5252,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 0.401,
"step": 1600
},
{
"epoch": 0.4213556660560063,
"grad_norm": 1.3046875,
"learning_rate": 3.5864732115887863e-07,
"logits/chosen": -2.8428633213043213,
"logits/rejected": -2.8304831981658936,
"logps/chosen": -258.84423828125,
"logps/rejected": -267.82415771484375,
"loss": 0.6772,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.021647287532687187,
"rewards/margins": 0.03337367996573448,
"rewards/rejected": -0.011726390570402145,
"step": 1610
},
{
"epoch": 0.4239727819942423,
"grad_norm": 1.515625,
"learning_rate": 3.565853612808562e-07,
"logits/chosen": -2.8622894287109375,
"logits/rejected": -2.8286705017089844,
"logps/chosen": -278.792724609375,
"logps/rejected": -251.3448028564453,
"loss": 0.6813,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.011435525491833687,
"rewards/margins": 0.02519642934203148,
"rewards/rejected": -0.013760904781520367,
"step": 1620
},
{
"epoch": 0.4265898979324784,
"grad_norm": 1.234375,
"learning_rate": 3.5451450155583984e-07,
"logits/chosen": -2.776291608810425,
"logits/rejected": -2.8099074363708496,
"logps/chosen": -247.71432495117188,
"logps/rejected": -233.94900512695312,
"loss": 0.6799,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01177397184073925,
"rewards/margins": 0.027832742780447006,
"rewards/rejected": -0.016058770939707756,
"step": 1630
},
{
"epoch": 0.42920701387071447,
"grad_norm": 1.6171875,
"learning_rate": 3.5243491490002055e-07,
"logits/chosen": -2.8553478717803955,
"logits/rejected": -2.8476006984710693,
"logps/chosen": -271.2582702636719,
"logps/rejected": -265.4175720214844,
"loss": 0.681,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.014447445049881935,
"rewards/margins": 0.026055917143821716,
"rewards/rejected": -0.011608473025262356,
"step": 1640
},
{
"epoch": 0.4318241298089505,
"grad_norm": 1.3515625,
"learning_rate": 3.503467749582857e-07,
"logits/chosen": -2.834573984146118,
"logits/rejected": -2.7920243740081787,
"logps/chosen": -269.9908142089844,
"logps/rejected": -235.1788787841797,
"loss": 0.6829,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.01244247704744339,
"rewards/margins": 0.021913422271609306,
"rewards/rejected": -0.009470945224165916,
"step": 1650
},
{
"epoch": 0.4344412457471866,
"grad_norm": 1.15625,
"learning_rate": 3.482502560897194e-07,
"logits/chosen": -2.8074707984924316,
"logits/rejected": -2.7939584255218506,
"logps/chosen": -236.347412109375,
"logps/rejected": -241.1566925048828,
"loss": 0.684,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.011268834583461285,
"rewards/margins": 0.019097527489066124,
"rewards/rejected": -0.007828695699572563,
"step": 1660
},
{
"epoch": 0.43705836168542267,
"grad_norm": 1.5546875,
"learning_rate": 3.4614553335304403e-07,
"logits/chosen": -2.8423566818237305,
"logits/rejected": -2.7879586219787598,
"logps/chosen": -288.4642639160156,
"logps/rejected": -253.7875213623047,
"loss": 0.6767,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02120782807469368,
"rewards/margins": 0.034634821116924286,
"rewards/rejected": -0.013426998630166054,
"step": 1670
},
{
"epoch": 0.4396754776236587,
"grad_norm": 1.75,
"learning_rate": 3.440327824920022e-07,
"logits/chosen": -2.831993579864502,
"logits/rejected": -2.8056693077087402,
"logps/chosen": -299.0148010253906,
"logps/rejected": -260.6773681640625,
"loss": 0.6741,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.026383286342024803,
"rewards/margins": 0.03987019881606102,
"rewards/rejected": -0.013486906886100769,
"step": 1680
},
{
"epoch": 0.44229259356189476,
"grad_norm": 1.671875,
"learning_rate": 3.4191217992068287e-07,
"logits/chosen": -2.870518207550049,
"logits/rejected": -2.841670513153076,
"logps/chosen": -292.0727844238281,
"logps/rejected": -247.9720916748047,
"loss": 0.678,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.026544999331235886,
"rewards/margins": 0.03184535354375839,
"rewards/rejected": -0.005300348624587059,
"step": 1690
},
{
"epoch": 0.44490970950013087,
"grad_norm": 1.3828125,
"learning_rate": 3.3978390270879056e-07,
"logits/chosen": -2.8237035274505615,
"logits/rejected": -2.810272455215454,
"logps/chosen": -227.2367706298828,
"logps/rejected": -232.7034454345703,
"loss": 0.6839,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.008710166439414024,
"rewards/margins": 0.01971607096493244,
"rewards/rejected": -0.011005903594195843,
"step": 1700
},
{
"epoch": 0.44490970950013087,
"eval_logits/chosen": -2.853027105331421,
"eval_logits/rejected": -2.8261446952819824,
"eval_logps/chosen": -280.7289123535156,
"eval_logps/rejected": -262.4769592285156,
"eval_loss": 0.6784868240356445,
"eval_rewards/accuracies": 0.6855000257492065,
"eval_rewards/chosen": 0.02044598199427128,
"eval_rewards/margins": 0.030832206830382347,
"eval_rewards/rejected": -0.010386227630078793,
"eval_runtime": 622.0841,
"eval_samples_per_second": 3.215,
"eval_steps_per_second": 0.402,
"step": 1700
},
{
"epoch": 0.4475268254383669,
"grad_norm": 1.2734375,
"learning_rate": 3.376481285668599e-07,
"logits/chosen": -2.8446857929229736,
"logits/rejected": -2.8485488891601562,
"logps/chosen": -237.1801300048828,
"logps/rejected": -253.5928192138672,
"loss": 0.6814,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.013800742104649544,
"rewards/margins": 0.024945253506302834,
"rewards/rejected": -0.01114450953900814,
"step": 1710
},
{
"epoch": 0.45014394137660296,
"grad_norm": 1.3046875,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": -2.874572515487671,
"logits/rejected": -2.8544344902038574,
"logps/chosen": -282.2732238769531,
"logps/rejected": -267.3336181640625,
"loss": 0.6765,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.026168251410126686,
"rewards/margins": 0.03479185700416565,
"rewards/rejected": -0.008623604662716389,
"step": 1720
},
{
"epoch": 0.45276105731483907,
"grad_norm": 1.3515625,
"learning_rate": 3.33354803450089e-07,
"logits/chosen": -2.7801265716552734,
"logits/rejected": -2.7793445587158203,
"logps/chosen": -282.25518798828125,
"logps/rejected": -262.77978515625,
"loss": 0.6811,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.016791898757219315,
"rewards/margins": 0.025678789243102074,
"rewards/rejected": -0.008886890485882759,
"step": 1730
},
{
"epoch": 0.4553781732530751,
"grad_norm": 1.1953125,
"learning_rate": 3.311976109666605e-07,
"logits/chosen": -2.8067824840545654,
"logits/rejected": -2.7824745178222656,
"logps/chosen": -292.15130615234375,
"logps/rejected": -263.1966247558594,
"loss": 0.6782,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.027290191501379013,
"rewards/margins": 0.03162240982055664,
"rewards/rejected": -0.004332221113145351,
"step": 1740
},
{
"epoch": 0.45799528919131116,
"grad_norm": 1.3125,
"learning_rate": 3.2903363850608317e-07,
"logits/chosen": -2.899350166320801,
"logits/rejected": -2.855714797973633,
"logps/chosen": -263.13433837890625,
"logps/rejected": -244.21298217773438,
"loss": 0.6777,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.012938638217747211,
"rewards/margins": 0.03214184567332268,
"rewards/rejected": -0.01920320652425289,
"step": 1750
},
{
"epoch": 0.46061240512954726,
"grad_norm": 1.25,
"learning_rate": 3.2686306675943477e-07,
"logits/chosen": -2.8296382427215576,
"logits/rejected": -2.8444466590881348,
"logps/chosen": -271.3722839355469,
"logps/rejected": -247.5081787109375,
"loss": 0.6782,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.02265056222677231,
"rewards/margins": 0.031140562146902084,
"rewards/rejected": -0.008490001782774925,
"step": 1760
},
{
"epoch": 0.4632295210677833,
"grad_norm": 1.5,
"learning_rate": 3.2468607696883145e-07,
"logits/chosen": -2.8013827800750732,
"logits/rejected": -2.793203592300415,
"logps/chosen": -266.6105651855469,
"logps/rejected": -276.02911376953125,
"loss": 0.6764,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.01664569415152073,
"rewards/margins": 0.034995269030332565,
"rewards/rejected": -0.018349576741456985,
"step": 1770
},
{
"epoch": 0.46584663700601936,
"grad_norm": 1.2421875,
"learning_rate": 3.2250285091229435e-07,
"logits/chosen": -2.863778591156006,
"logits/rejected": -2.8398165702819824,
"logps/chosen": -248.42300415039062,
"logps/rejected": -239.9966583251953,
"loss": 0.682,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.011377891525626183,
"rewards/margins": 0.02343577891588211,
"rewards/rejected": -0.012057888321578503,
"step": 1780
},
{
"epoch": 0.4684637529442554,
"grad_norm": 7.71875,
"learning_rate": 3.2031357088857083e-07,
"logits/chosen": -2.851457118988037,
"logits/rejected": -2.8426060676574707,
"logps/chosen": -291.23236083984375,
"logps/rejected": -300.08843994140625,
"loss": 0.6808,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.015203478746116161,
"rewards/margins": 0.02637268602848053,
"rewards/rejected": -0.011169209145009518,
"step": 1790
},
{
"epoch": 0.4710808688824915,
"grad_norm": 1.4375,
"learning_rate": 3.1811841970191267e-07,
"logits/chosen": -2.7736434936523438,
"logits/rejected": -2.7497920989990234,
"logps/chosen": -245.38912963867188,
"logps/rejected": -276.73297119140625,
"loss": 0.6793,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.009697729721665382,
"rewards/margins": 0.029323875904083252,
"rewards/rejected": -0.01962614618241787,
"step": 1800
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -2.851853132247925,
"eval_logits/rejected": -2.8248438835144043,
"eval_logps/chosen": -280.89361572265625,
"eval_logps/rejected": -262.69610595703125,
"eval_loss": 0.6782403588294983,
"eval_rewards/accuracies": 0.6869999766349792,
"eval_rewards/chosen": 0.018798967823386192,
"eval_rewards/margins": 0.03137620911002159,
"eval_rewards/rejected": -0.012577244080603123,
"eval_runtime": 622.7177,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 0.401,
"step": 1800
},
{
"epoch": 0.47369798482072756,
"grad_norm": 1.28125,
"learning_rate": 3.1591758064681257e-07,
"logits/chosen": -2.779759407043457,
"logits/rejected": -2.7464611530303955,
"logps/chosen": -269.7041015625,
"logps/rejected": -234.5542755126953,
"loss": 0.6773,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.017099203541874886,
"rewards/margins": 0.03334728628396988,
"rewards/rejected": -0.01624808833003044,
"step": 1810
},
{
"epoch": 0.4763151007589636,
"grad_norm": 1.484375,
"learning_rate": 3.13711237492698e-07,
"logits/chosen": -2.8304784297943115,
"logits/rejected": -2.8185439109802246,
"logps/chosen": -296.6722717285156,
"logps/rejected": -284.0693664550781,
"loss": 0.6829,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.016499513760209084,
"rewards/margins": 0.02195136621594429,
"rewards/rejected": -0.005451851524412632,
"step": 1820
},
{
"epoch": 0.4789322166971997,
"grad_norm": 1.046875,
"learning_rate": 3.1149957446858767e-07,
"logits/chosen": -2.82464599609375,
"logits/rejected": -2.8389458656311035,
"logps/chosen": -263.2919006347656,
"logps/rejected": -250.74813842773438,
"loss": 0.6841,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0153631791472435,
"rewards/margins": 0.019609825685620308,
"rewards/rejected": -0.004246644675731659,
"step": 1830
},
{
"epoch": 0.48154933263543576,
"grad_norm": 1.4609375,
"learning_rate": 3.0928277624770736e-07,
"logits/chosen": -2.8787219524383545,
"logits/rejected": -2.8530170917510986,
"logps/chosen": -300.767333984375,
"logps/rejected": -275.71856689453125,
"loss": 0.6736,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02286524511873722,
"rewards/margins": 0.041333895176649094,
"rewards/rejected": -0.018468648195266724,
"step": 1840
},
{
"epoch": 0.4841664485736718,
"grad_norm": 1.3203125,
"learning_rate": 3.0706102793207073e-07,
"logits/chosen": -2.8641512393951416,
"logits/rejected": -2.832724094390869,
"logps/chosen": -301.1673278808594,
"logps/rejected": -282.6499938964844,
"loss": 0.6723,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.030217334628105164,
"rewards/margins": 0.043759047985076904,
"rewards/rejected": -0.01354171335697174,
"step": 1850
},
{
"epoch": 0.48678356451190785,
"grad_norm": 1.28125,
"learning_rate": 3.048345150370226e-07,
"logits/chosen": -2.8586244583129883,
"logits/rejected": -2.8518083095550537,
"logps/chosen": -300.2522888183594,
"logps/rejected": -283.9441223144531,
"loss": 0.6775,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.020278874784708023,
"rewards/margins": 0.03307543322443962,
"rewards/rejected": -0.012796561233699322,
"step": 1860
},
{
"epoch": 0.48940068045014395,
"grad_norm": 1.1171875,
"learning_rate": 3.0260342347574913e-07,
"logits/chosen": -2.8430685997009277,
"logits/rejected": -2.8185229301452637,
"logps/chosen": -285.382568359375,
"logps/rejected": -269.14886474609375,
"loss": 0.6767,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.026956235989928246,
"rewards/margins": 0.03441258519887924,
"rewards/rejected": -0.007456351071596146,
"step": 1870
},
{
"epoch": 0.49201779638838,
"grad_norm": 1.21875,
"learning_rate": 3.0036793954375357e-07,
"logits/chosen": -2.8681893348693848,
"logits/rejected": -2.8438541889190674,
"logps/chosen": -283.40582275390625,
"logps/rejected": -243.862060546875,
"loss": 0.6752,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.026450032368302345,
"rewards/margins": 0.037637047469615936,
"rewards/rejected": -0.01118701882660389,
"step": 1880
},
{
"epoch": 0.49463491232661605,
"grad_norm": 1.375,
"learning_rate": 2.9812824990330085e-07,
"logits/chosen": -2.837024211883545,
"logits/rejected": -2.825876235961914,
"logps/chosen": -290.41644287109375,
"logps/rejected": -267.0772705078125,
"loss": 0.6775,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.017299989238381386,
"rewards/margins": 0.03299538046121597,
"rewards/rejected": -0.015695389360189438,
"step": 1890
},
{
"epoch": 0.49725202826485215,
"grad_norm": 1.421875,
"learning_rate": 2.958845415678316e-07,
"logits/chosen": -2.8465914726257324,
"logits/rejected": -2.8124914169311523,
"logps/chosen": -293.70098876953125,
"logps/rejected": -275.6326904296875,
"loss": 0.6766,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.020625559613108635,
"rewards/margins": 0.034684114158153534,
"rewards/rejected": -0.01405855268239975,
"step": 1900
},
{
"epoch": 0.49725202826485215,
"eval_logits/chosen": -2.854793071746826,
"eval_logits/rejected": -2.8281238079071045,
"eval_logps/chosen": -280.89208984375,
"eval_logps/rejected": -262.73114013671875,
"eval_loss": 0.6780784726142883,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": 0.01881374977529049,
"eval_rewards/margins": 0.03174133226275444,
"eval_rewards/rejected": -0.012927580624818802,
"eval_runtime": 623.0648,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 0.401,
"step": 1900
},
{
"epoch": 0.4998691442030882,
"grad_norm": 1.0625,
"learning_rate": 2.936370018863459e-07,
"logits/chosen": -2.86594295501709,
"logits/rejected": -2.852074384689331,
"logps/chosen": -278.3857727050781,
"logps/rejected": -242.82290649414062,
"loss": 0.6795,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.016295427456498146,
"rewards/margins": 0.028570901602506638,
"rewards/rejected": -0.012275472283363342,
"step": 1910
},
{
"epoch": 0.5024862601413242,
"grad_norm": 1.3125,
"learning_rate": 2.913858185277605e-07,
"logits/chosen": -2.8365845680236816,
"logits/rejected": -2.8241991996765137,
"logps/chosen": -274.61334228515625,
"logps/rejected": -262.88934326171875,
"loss": 0.6764,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.025087693706154823,
"rewards/margins": 0.03509819880127907,
"rewards/rejected": -0.010010505095124245,
"step": 1920
},
{
"epoch": 0.5051033760795604,
"grad_norm": 1.3828125,
"learning_rate": 2.89131179465238e-07,
"logits/chosen": -2.802734375,
"logits/rejected": -2.7592692375183105,
"logps/chosen": -286.10986328125,
"logps/rejected": -250.1353759765625,
"loss": 0.6755,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.019024692475795746,
"rewards/margins": 0.03733197599649429,
"rewards/rejected": -0.018307287245988846,
"step": 1930
},
{
"epoch": 0.5077204920177963,
"grad_norm": 1.2265625,
"learning_rate": 2.8687327296049125e-07,
"logits/chosen": -2.841648578643799,
"logits/rejected": -2.817791700363159,
"logps/chosen": -272.06378173828125,
"logps/rejected": -273.47930908203125,
"loss": 0.6797,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0162358395755291,
"rewards/margins": 0.02848033234477043,
"rewards/rejected": -0.012244494631886482,
"step": 1940
},
{
"epoch": 0.5103376079560324,
"grad_norm": 1.21875,
"learning_rate": 2.846122875480637e-07,
"logits/chosen": -2.8606886863708496,
"logits/rejected": -2.822801113128662,
"logps/chosen": -288.4855651855469,
"logps/rejected": -264.97247314453125,
"loss": 0.6769,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.024403361603617668,
"rewards/margins": 0.03406291827559471,
"rewards/rejected": -0.009659556671977043,
"step": 1950
},
{
"epoch": 0.5129547238942685,
"grad_norm": 1.1484375,
"learning_rate": 2.8234841201958647e-07,
"logits/chosen": -2.8555562496185303,
"logits/rejected": -2.819885730743408,
"logps/chosen": -297.9819641113281,
"logps/rejected": -261.5909423828125,
"loss": 0.6754,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.02403775416314602,
"rewards/margins": 0.03708943352103233,
"rewards/rejected": -0.013051679357886314,
"step": 1960
},
{
"epoch": 0.5155718398325045,
"grad_norm": 1.4375,
"learning_rate": 2.800818354080148e-07,
"logits/chosen": -2.83642840385437,
"logits/rejected": -2.805063486099243,
"logps/chosen": -287.24420166015625,
"logps/rejected": -243.9567413330078,
"loss": 0.6775,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.021363835781812668,
"rewards/margins": 0.032995063811540604,
"rewards/rejected": -0.011631224304437637,
"step": 1970
},
{
"epoch": 0.5181889557707406,
"grad_norm": 1.265625,
"learning_rate": 2.778127469718435e-07,
"logits/chosen": -2.7859811782836914,
"logits/rejected": -2.7979307174682617,
"logps/chosen": -245.3804168701172,
"logps/rejected": -266.1874694824219,
"loss": 0.6803,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.015351531095802784,
"rewards/margins": 0.02699609100818634,
"rewards/rejected": -0.011644558981060982,
"step": 1980
},
{
"epoch": 0.5208060717089767,
"grad_norm": 1.3515625,
"learning_rate": 2.755413361793039e-07,
"logits/chosen": -2.8089661598205566,
"logits/rejected": -2.779783010482788,
"logps/chosen": -262.85589599609375,
"logps/rejected": -253.60653686523438,
"loss": 0.6757,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.026462215930223465,
"rewards/margins": 0.03666644170880318,
"rewards/rejected": -0.010204223915934563,
"step": 1990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 1.3125,
"learning_rate": 2.7326779269254356e-07,
"logits/chosen": -2.8754069805145264,
"logits/rejected": -2.8527140617370605,
"logps/chosen": -303.8473205566406,
"logps/rejected": -247.6015625,
"loss": 0.6762,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.026804108172655106,
"rewards/margins": 0.03586486726999283,
"rewards/rejected": -0.009060760028660297,
"step": 2000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -2.853806972503662,
"eval_logits/rejected": -2.8270227909088135,
"eval_logps/chosen": -280.87493896484375,
"eval_logps/rejected": -262.7651062011719,
"eval_loss": 0.6778436303138733,
"eval_rewards/accuracies": 0.6840000152587891,
"eval_rewards/chosen": 0.01898558810353279,
"eval_rewards/margins": 0.03225287050008774,
"eval_rewards/rejected": -0.013267277739942074,
"eval_runtime": 622.6334,
"eval_samples_per_second": 3.212,
"eval_steps_per_second": 0.402,
"step": 2000
},
{
"epoch": 0.5260403035854488,
"grad_norm": 1.4296875,
"learning_rate": 2.709923063517895e-07,
"logits/chosen": -2.8158721923828125,
"logits/rejected": -2.8292183876037598,
"logps/chosen": -277.1296691894531,
"logps/rejected": -276.7500915527344,
"loss": 0.6757,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.020963847637176514,
"rewards/margins": 0.03652495518326759,
"rewards/rejected": -0.015561106614768505,
"step": 2010
},
{
"epoch": 0.528657419523685,
"grad_norm": 1.5390625,
"learning_rate": 2.68715067159496e-07,
"logits/chosen": -2.8547072410583496,
"logits/rejected": -2.8296151161193848,
"logps/chosen": -266.697265625,
"logps/rejected": -248.7971649169922,
"loss": 0.6773,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.023486215621232986,
"rewards/margins": 0.0331868901848793,
"rewards/rejected": -0.009700671769678593,
"step": 2020
},
{
"epoch": 0.5312745354619209,
"grad_norm": 1.5625,
"learning_rate": 2.664362652644806e-07,
"logits/chosen": -2.871127128601074,
"logits/rejected": -2.859767198562622,
"logps/chosen": -309.3524475097656,
"logps/rejected": -267.5863342285156,
"loss": 0.6745,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.03075565956532955,
"rewards/margins": 0.039490751922130585,
"rewards/rejected": -0.008735088631510735,
"step": 2030
},
{
"epoch": 0.533891651400157,
"grad_norm": 1.2421875,
"learning_rate": 2.6415609094604555e-07,
"logits/chosen": -2.8490989208221436,
"logits/rejected": -2.849595308303833,
"logps/chosen": -284.77679443359375,
"logps/rejected": -266.975830078125,
"loss": 0.6771,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.029801562428474426,
"rewards/margins": 0.034142639487981796,
"rewards/rejected": -0.004341077525168657,
"step": 2040
},
{
"epoch": 0.5365087673383931,
"grad_norm": 1.1875,
"learning_rate": 2.618747345980904e-07,
"logits/chosen": -2.85640287399292,
"logits/rejected": -2.8106446266174316,
"logps/chosen": -262.911376953125,
"logps/rejected": -212.4907989501953,
"loss": 0.6744,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0176672525703907,
"rewards/margins": 0.039339274168014526,
"rewards/rejected": -0.021672027185559273,
"step": 2050
},
{
"epoch": 0.5391258832766291,
"grad_norm": 1.421875,
"learning_rate": 2.595923867132136e-07,
"logits/chosen": -2.8832926750183105,
"logits/rejected": -2.872882604598999,
"logps/chosen": -296.1038818359375,
"logps/rejected": -274.6143493652344,
"loss": 0.6772,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01775936782360077,
"rewards/margins": 0.03422468900680542,
"rewards/rejected": -0.0164653230458498,
"step": 2060
},
{
"epoch": 0.5417429992148652,
"grad_norm": 1.3359375,
"learning_rate": 2.5730923786680667e-07,
"logits/chosen": -2.860802173614502,
"logits/rejected": -2.862802028656006,
"logps/chosen": -264.1892395019531,
"logps/rejected": -275.02532958984375,
"loss": 0.6787,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.016188761219382286,
"rewards/margins": 0.030575359240174294,
"rewards/rejected": -0.014386599883437157,
"step": 2070
},
{
"epoch": 0.5443601151531012,
"grad_norm": 1.2890625,
"learning_rate": 2.5502547870114135e-07,
"logits/chosen": -2.8470053672790527,
"logits/rejected": -2.8130552768707275,
"logps/chosen": -269.6037902832031,
"logps/rejected": -240.1824493408203,
"loss": 0.6779,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.01747250370681286,
"rewards/margins": 0.03254149109125137,
"rewards/rejected": -0.015068987384438515,
"step": 2080
},
{
"epoch": 0.5469772310913373,
"grad_norm": 3.09375,
"learning_rate": 2.527412999094506e-07,
"logits/chosen": -2.8123087882995605,
"logits/rejected": -2.7862212657928467,
"logps/chosen": -315.90447998046875,
"logps/rejected": -302.16552734375,
"loss": 0.6781,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.022080183029174805,
"rewards/margins": 0.031787317246198654,
"rewards/rejected": -0.009707136079668999,
"step": 2090
},
{
"epoch": 0.5495943470295734,
"grad_norm": 1.828125,
"learning_rate": 2.5045689222000636e-07,
"logits/chosen": -2.798381805419922,
"logits/rejected": -2.7811503410339355,
"logps/chosen": -256.8103332519531,
"logps/rejected": -242.29971313476562,
"loss": 0.6796,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01742711290717125,
"rewards/margins": 0.02845141850411892,
"rewards/rejected": -0.011024304665625095,
"step": 2100
},
{
"epoch": 0.5495943470295734,
"eval_logits/chosen": -2.856419324874878,
"eval_logits/rejected": -2.8298940658569336,
"eval_logps/chosen": -280.9320983886719,
"eval_logps/rejected": -262.8512878417969,
"eval_loss": 0.6777089834213257,
"eval_rewards/accuracies": 0.6794999837875366,
"eval_rewards/chosen": 0.018413949757814407,
"eval_rewards/margins": 0.03254299610853195,
"eval_rewards/rejected": -0.01412904355674982,
"eval_runtime": 623.6848,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 0.401,
"step": 2100
},
{
"epoch": 0.5522114629678094,
"grad_norm": 1.2734375,
"learning_rate": 2.481724463801933e-07,
"logits/chosen": -2.837977170944214,
"logits/rejected": -2.8143982887268066,
"logps/chosen": -293.23687744140625,
"logps/rejected": -254.9249725341797,
"loss": 0.6749,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.01989181898534298,
"rewards/margins": 0.03839176893234253,
"rewards/rejected": -0.01849994622170925,
"step": 2110
},
{
"epoch": 0.5548285789060455,
"grad_norm": 1.3984375,
"learning_rate": 2.4588815314058154e-07,
"logits/chosen": -2.828207492828369,
"logits/rejected": -2.825892448425293,
"logps/chosen": -257.57012939453125,
"logps/rejected": -226.6698455810547,
"loss": 0.677,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.021512161940336227,
"rewards/margins": 0.03372306749224663,
"rewards/rejected": -0.01221090741455555,
"step": 2120
},
{
"epoch": 0.5574456948442816,
"grad_norm": 1.2265625,
"learning_rate": 2.4360420323899917e-07,
"logits/chosen": -2.8333821296691895,
"logits/rejected": -2.8204522132873535,
"logps/chosen": -294.45135498046875,
"logps/rejected": -261.3866271972656,
"loss": 0.6785,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.021041734144091606,
"rewards/margins": 0.031500063836574554,
"rewards/rejected": -0.01045832596719265,
"step": 2130
},
{
"epoch": 0.5600628107825176,
"grad_norm": 1.2578125,
"learning_rate": 2.4132078738460583e-07,
"logits/chosen": -2.8641979694366455,
"logits/rejected": -2.8372373580932617,
"logps/chosen": -277.0169677734375,
"logps/rejected": -240.1663055419922,
"loss": 0.6765,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.020255176350474358,
"rewards/margins": 0.03485842049121857,
"rewards/rejected": -0.014603245072066784,
"step": 2140
},
{
"epoch": 0.5626799267207537,
"grad_norm": 1.3984375,
"learning_rate": 2.390380962419682e-07,
"logits/chosen": -2.8402717113494873,
"logits/rejected": -2.82948899269104,
"logps/chosen": -248.21908569335938,
"logps/rejected": -215.9970703125,
"loss": 0.6813,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.013856288976967335,
"rewards/margins": 0.02503989078104496,
"rewards/rejected": -0.011183603666722775,
"step": 2150
},
{
"epoch": 0.5652970426589898,
"grad_norm": 1.203125,
"learning_rate": 2.3675632041513977e-07,
"logits/chosen": -2.8817086219787598,
"logits/rejected": -2.830371856689453,
"logps/chosen": -299.80419921875,
"logps/rejected": -237.6551055908203,
"loss": 0.6704,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.029662128537893295,
"rewards/margins": 0.0477459654211998,
"rewards/rejected": -0.018083838745951653,
"step": 2160
},
{
"epoch": 0.5679141585972258,
"grad_norm": 1.1953125,
"learning_rate": 2.344756504317453e-07,
"logits/chosen": -2.8310768604278564,
"logits/rejected": -2.7922732830047607,
"logps/chosen": -273.2166442871094,
"logps/rejected": -238.2533721923828,
"loss": 0.6782,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.013544699177145958,
"rewards/margins": 0.031235402449965477,
"rewards/rejected": -0.01769069954752922,
"step": 2170
},
{
"epoch": 0.5705312745354619,
"grad_norm": 1.3203125,
"learning_rate": 2.3219627672707237e-07,
"logits/chosen": -2.814478874206543,
"logits/rejected": -2.807798147201538,
"logps/chosen": -271.345703125,
"logps/rejected": -229.5119171142578,
"loss": 0.6808,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.008285674266517162,
"rewards/margins": 0.025873666629195213,
"rewards/rejected": -0.017587993294000626,
"step": 2180
},
{
"epoch": 0.573148390473698,
"grad_norm": 1.359375,
"learning_rate": 2.2991838962816918e-07,
"logits/chosen": -2.812224864959717,
"logits/rejected": -2.787701368331909,
"logps/chosen": -268.98724365234375,
"logps/rejected": -268.3625793457031,
"loss": 0.681,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.01591324433684349,
"rewards/margins": 0.025850754231214523,
"rewards/rejected": -0.009937510825693607,
"step": 2190
},
{
"epoch": 0.575765506411934,
"grad_norm": 1.4453125,
"learning_rate": 2.2764217933795297e-07,
"logits/chosen": -2.8286900520324707,
"logits/rejected": -2.8099260330200195,
"logps/chosen": -274.784912109375,
"logps/rejected": -258.30084228515625,
"loss": 0.6736,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02573958970606327,
"rewards/margins": 0.04115080088376999,
"rewards/rejected": -0.015411211177706718,
"step": 2200
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -2.857100486755371,
"eval_logits/rejected": -2.8306167125701904,
"eval_logps/chosen": -280.9635314941406,
"eval_logps/rejected": -262.8892822265625,
"eval_loss": 0.6776819825172424,
"eval_rewards/accuracies": 0.6825000047683716,
"eval_rewards/chosen": 0.018099820241332054,
"eval_rewards/margins": 0.03260912373661995,
"eval_rewards/rejected": -0.014509301632642746,
"eval_runtime": 623.717,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 0.401,
"step": 2200
},
{
"epoch": 0.5783826223501701,
"grad_norm": 1.6171875,
"learning_rate": 2.253678359193278e-07,
"logits/chosen": -2.901681423187256,
"logits/rejected": -2.858135223388672,
"logps/chosen": -292.3751525878906,
"logps/rejected": -273.216064453125,
"loss": 0.6782,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.017452511936426163,
"rewards/margins": 0.0317564532160759,
"rewards/rejected": -0.014303937554359436,
"step": 2210
},
{
"epoch": 0.5809997382884062,
"grad_norm": 1.2421875,
"learning_rate": 2.230955492793149e-07,
"logits/chosen": -2.785632371902466,
"logits/rejected": -2.791489362716675,
"logps/chosen": -290.7291259765625,
"logps/rejected": -277.0824890136719,
"loss": 0.6816,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.015981845557689667,
"rewards/margins": 0.02465016394853592,
"rewards/rejected": -0.008668316528201103,
"step": 2220
},
{
"epoch": 0.5836168542266422,
"grad_norm": 1.5,
"learning_rate": 2.2082550915319468e-07,
"logits/chosen": -2.797928810119629,
"logits/rejected": -2.795860767364502,
"logps/chosen": -292.92291259765625,
"logps/rejected": -257.6455078125,
"loss": 0.6766,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.02159210667014122,
"rewards/margins": 0.03476356342434883,
"rewards/rejected": -0.013171456754207611,
"step": 2230
},
{
"epoch": 0.5862339701648783,
"grad_norm": 1.3203125,
"learning_rate": 2.1855790508866433e-07,
"logits/chosen": -2.8182015419006348,
"logits/rejected": -2.815925121307373,
"logps/chosen": -324.13592529296875,
"logps/rejected": -299.08135986328125,
"loss": 0.6768,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.02183537557721138,
"rewards/margins": 0.034798912703990936,
"rewards/rejected": -0.012963538058102131,
"step": 2240
},
{
"epoch": 0.5888510861031143,
"grad_norm": 1.2421875,
"learning_rate": 2.162929264300107e-07,
"logits/chosen": -2.79923939704895,
"logits/rejected": -2.7914280891418457,
"logps/chosen": -282.1067810058594,
"logps/rejected": -265.30364990234375,
"loss": 0.6733,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.02532428503036499,
"rewards/margins": 0.04165149852633476,
"rewards/rejected": -0.016327213495969772,
"step": 2250
},
{
"epoch": 0.5914682020413504,
"grad_norm": 1.390625,
"learning_rate": 2.1403076230230005e-07,
"logits/chosen": -2.816969394683838,
"logits/rejected": -2.7890102863311768,
"logps/chosen": -290.90618896484375,
"logps/rejected": -261.3204345703125,
"loss": 0.6801,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.02144182100892067,
"rewards/margins": 0.028353065252304077,
"rewards/rejected": -0.006911243312060833,
"step": 2260
},
{
"epoch": 0.5940853179795865,
"grad_norm": 1.703125,
"learning_rate": 2.1177160159558596e-07,
"logits/chosen": -2.8060302734375,
"logits/rejected": -2.788020610809326,
"logps/chosen": -297.41741943359375,
"logps/rejected": -247.78640747070312,
"loss": 0.6745,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.02653617225587368,
"rewards/margins": 0.03934457153081894,
"rewards/rejected": -0.01280839741230011,
"step": 2270
},
{
"epoch": 0.5967024339178225,
"grad_norm": 1.3203125,
"learning_rate": 2.0951563294913734e-07,
"logits/chosen": -2.8132269382476807,
"logits/rejected": -2.7866339683532715,
"logps/chosen": -277.17449951171875,
"logps/rejected": -250.9734344482422,
"loss": 0.6754,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.01868070289492607,
"rewards/margins": 0.03689347952604294,
"rewards/rejected": -0.018212776631116867,
"step": 2280
},
{
"epoch": 0.5993195498560586,
"grad_norm": 1.1875,
"learning_rate": 2.072630447356869e-07,
"logits/chosen": -2.8403782844543457,
"logits/rejected": -2.8337533473968506,
"logps/chosen": -274.53546142578125,
"logps/rejected": -242.531494140625,
"loss": 0.6767,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01636110618710518,
"rewards/margins": 0.03431684896349907,
"rewards/rejected": -0.01795574650168419,
"step": 2290
},
{
"epoch": 0.6019366657942947,
"grad_norm": 1.5625,
"learning_rate": 2.0501402504570232e-07,
"logits/chosen": -2.87614107131958,
"logits/rejected": -2.8186376094818115,
"logps/chosen": -293.62347412109375,
"logps/rejected": -262.20794677734375,
"loss": 0.6779,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.020630482584238052,
"rewards/margins": 0.032361775636672974,
"rewards/rejected": -0.011731292121112347,
"step": 2300
},
{
"epoch": 0.6019366657942947,
"eval_logits/chosen": -2.8548264503479004,
"eval_logits/rejected": -2.8281185626983643,
"eval_logps/chosen": -281.0184326171875,
"eval_logps/rejected": -262.955810546875,
"eval_loss": 0.6776320934295654,
"eval_rewards/accuracies": 0.6875,
"eval_rewards/chosen": 0.017550628632307053,
"eval_rewards/margins": 0.032725006341934204,
"eval_rewards/rejected": -0.015174377709627151,
"eval_runtime": 623.5172,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 0.401,
"step": 2300
},
{
"epoch": 0.6045537817325307,
"grad_norm": 1.3125,
"learning_rate": 2.027687616716804e-07,
"logits/chosen": -2.776857614517212,
"logits/rejected": -2.766300916671753,
"logps/chosen": -245.21249389648438,
"logps/rejected": -210.9021453857422,
"loss": 0.6796,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.012185106053948402,
"rewards/margins": 0.028403136879205704,
"rewards/rejected": -0.016218028962612152,
"step": 2310
},
{
"epoch": 0.6071708976707668,
"grad_norm": 1.5859375,
"learning_rate": 2.005274420924668e-07,
"logits/chosen": -2.842299699783325,
"logits/rejected": -2.8256301879882812,
"logps/chosen": -268.92669677734375,
"logps/rejected": -236.4410858154297,
"loss": 0.6773,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.01880219765007496,
"rewards/margins": 0.03351093456149101,
"rewards/rejected": -0.014708739705383778,
"step": 2320
},
{
"epoch": 0.6097880136090029,
"grad_norm": 1.8203125,
"learning_rate": 1.9829025345760121e-07,
"logits/chosen": -2.8297770023345947,
"logits/rejected": -2.830937147140503,
"logps/chosen": -295.0563049316406,
"logps/rejected": -287.86016845703125,
"loss": 0.6799,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02504206821322441,
"rewards/margins": 0.02800445258617401,
"rewards/rejected": -0.002962383907288313,
"step": 2330
},
{
"epoch": 0.6124051295472389,
"grad_norm": 1.4453125,
"learning_rate": 1.960573825716911e-07,
"logits/chosen": -2.8119211196899414,
"logits/rejected": -2.7910213470458984,
"logps/chosen": -250.70425415039062,
"logps/rejected": -246.34921264648438,
"loss": 0.6817,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.005780251231044531,
"rewards/margins": 0.024395998567342758,
"rewards/rejected": -0.01861574873328209,
"step": 2340
},
{
"epoch": 0.615022245485475,
"grad_norm": 1.4453125,
"learning_rate": 1.9382901587881273e-07,
"logits/chosen": -2.8759961128234863,
"logits/rejected": -2.864570379257202,
"logps/chosen": -273.0665588378906,
"logps/rejected": -240.42294311523438,
"loss": 0.6727,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.027859041467308998,
"rewards/margins": 0.04289738088846207,
"rewards/rejected": -0.015038339421153069,
"step": 2350
},
{
"epoch": 0.6176393614237111,
"grad_norm": 1.53125,
"learning_rate": 1.9160533944694364e-07,
"logits/chosen": -2.870702028274536,
"logits/rejected": -2.8234288692474365,
"logps/chosen": -276.2862854003906,
"logps/rejected": -267.0870666503906,
"loss": 0.6738,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.025392061099410057,
"rewards/margins": 0.04046647623181343,
"rewards/rejected": -0.015074415132403374,
"step": 2360
},
{
"epoch": 0.6202564773619471,
"grad_norm": 1.3671875,
"learning_rate": 1.8938653895242602e-07,
"logits/chosen": -2.861572027206421,
"logits/rejected": -2.8287158012390137,
"logps/chosen": -277.0164489746094,
"logps/rejected": -251.5226593017578,
"loss": 0.6715,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.02161785028874874,
"rewards/margins": 0.045254360884428024,
"rewards/rejected": -0.02363651618361473,
"step": 2370
},
{
"epoch": 0.6228735933001832,
"grad_norm": 1.3515625,
"learning_rate": 1.8717279966446264e-07,
"logits/chosen": -2.7649269104003906,
"logits/rejected": -2.748934268951416,
"logps/chosen": -266.9613037109375,
"logps/rejected": -256.705810546875,
"loss": 0.6789,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.01746024750173092,
"rewards/margins": 0.030255427584052086,
"rewards/rejected": -0.012795181944966316,
"step": 2380
},
{
"epoch": 0.6254907092384192,
"grad_norm": 1.3046875,
"learning_rate": 1.8496430642964694e-07,
"logits/chosen": -2.8276329040527344,
"logits/rejected": -2.8031527996063232,
"logps/chosen": -289.2098083496094,
"logps/rejected": -266.8305969238281,
"loss": 0.6785,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.020906824618577957,
"rewards/margins": 0.031347136944532394,
"rewards/rejected": -0.010440316051244736,
"step": 2390
},
{
"epoch": 0.6281078251766553,
"grad_norm": 1.375,
"learning_rate": 1.8276124365652855e-07,
"logits/chosen": -2.8458991050720215,
"logits/rejected": -2.7970054149627686,
"logps/chosen": -278.1546325683594,
"logps/rejected": -264.06427001953125,
"loss": 0.6782,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.01725105755031109,
"rewards/margins": 0.031516797840595245,
"rewards/rejected": -0.014265733771026134,
"step": 2400
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -2.8540356159210205,
"eval_logits/rejected": -2.8272616863250732,
"eval_logps/chosen": -280.9810485839844,
"eval_logps/rejected": -262.9154968261719,
"eval_loss": 0.6776500344276428,
"eval_rewards/accuracies": 0.6834999918937683,
"eval_rewards/chosen": 0.01792425848543644,
"eval_rewards/margins": 0.032695669680833817,
"eval_rewards/rejected": -0.014771413058042526,
"eval_runtime": 623.7982,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 0.401,
"step": 2400
},
{
"epoch": 0.6307249411148914,
"grad_norm": 1.328125,
"learning_rate": 1.805637953002149e-07,
"logits/chosen": -2.8647313117980957,
"logits/rejected": -2.8551206588745117,
"logps/chosen": -258.0870666503906,
"logps/rejected": -236.39266967773438,
"loss": 0.6787,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.016582269221544266,
"rewards/margins": 0.030466347932815552,
"rewards/rejected": -0.013884077779948711,
"step": 2410
},
{
"epoch": 0.6333420570531274,
"grad_norm": 1.1484375,
"learning_rate": 1.7837214484701153e-07,
"logits/chosen": -2.8555073738098145,
"logits/rejected": -2.837476968765259,
"logps/chosen": -266.9813537597656,
"logps/rejected": -244.21142578125,
"loss": 0.673,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.023186923936009407,
"rewards/margins": 0.04209943115711212,
"rewards/rejected": -0.018912509083747864,
"step": 2420
},
{
"epoch": 0.6359591729913635,
"grad_norm": 1.28125,
"learning_rate": 1.761864752991004e-07,
"logits/chosen": -2.8415451049804688,
"logits/rejected": -2.8176722526550293,
"logps/chosen": -272.5819396972656,
"logps/rejected": -260.46868896484375,
"loss": 0.6765,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.019747793674468994,
"rewards/margins": 0.034796275198459625,
"rewards/rejected": -0.015048478730022907,
"step": 2430
},
{
"epoch": 0.6385762889295996,
"grad_norm": 1.3515625,
"learning_rate": 1.7400696915925995e-07,
"logits/chosen": -2.8523507118225098,
"logits/rejected": -2.8236594200134277,
"logps/chosen": -287.25445556640625,
"logps/rejected": -227.38986206054688,
"loss": 0.6748,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.017432263121008873,
"rewards/margins": 0.03874523937702179,
"rewards/rejected": -0.021312978118658066,
"step": 2440
},
{
"epoch": 0.6411934048678356,
"grad_norm": 1.6875,
"learning_rate": 1.718338084156254e-07,
"logits/chosen": -2.797588586807251,
"logits/rejected": -2.780844211578369,
"logps/chosen": -304.15277099609375,
"logps/rejected": -267.3579406738281,
"loss": 0.6756,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.023083947598934174,
"rewards/margins": 0.036666251718997955,
"rewards/rejected": -0.013582308776676655,
"step": 2450
},
{
"epoch": 0.6438105208060717,
"grad_norm": 1.09375,
"learning_rate": 1.696671745264937e-07,
"logits/chosen": -2.860663652420044,
"logits/rejected": -2.863227128982544,
"logps/chosen": -295.4175720214844,
"logps/rejected": -240.9844207763672,
"loss": 0.6724,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.025130432099103928,
"rewards/margins": 0.04342951625585556,
"rewards/rejected": -0.018299078568816185,
"step": 2460
},
{
"epoch": 0.6464276367443078,
"grad_norm": 1.3359375,
"learning_rate": 1.67507248405171e-07,
"logits/chosen": -2.846250534057617,
"logits/rejected": -2.82800030708313,
"logps/chosen": -270.4811706542969,
"logps/rejected": -273.17401123046875,
"loss": 0.6794,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.020759141072630882,
"rewards/margins": 0.029174262657761574,
"rewards/rejected": -0.00841512344777584,
"step": 2470
},
{
"epoch": 0.6490447526825438,
"grad_norm": 1.2890625,
"learning_rate": 1.6535421040486683e-07,
"logits/chosen": -2.760650396347046,
"logits/rejected": -2.743494987487793,
"logps/chosen": -270.25933837890625,
"logps/rejected": -240.8209228515625,
"loss": 0.6748,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.016713624820113182,
"rewards/margins": 0.03836838901042938,
"rewards/rejected": -0.02165476605296135,
"step": 2480
},
{
"epoch": 0.6516618686207799,
"grad_norm": 1.234375,
"learning_rate": 1.6320824030363456e-07,
"logits/chosen": -2.8263466358184814,
"logits/rejected": -2.828117609024048,
"logps/chosen": -248.79129028320312,
"logps/rejected": -235.21774291992188,
"loss": 0.6765,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.017327528446912766,
"rewards/margins": 0.03489188104867935,
"rewards/rejected": -0.01756434701383114,
"step": 2490
},
{
"epoch": 0.654278984559016,
"grad_norm": 1.359375,
"learning_rate": 1.6106951728936024e-07,
"logits/chosen": -2.8797359466552734,
"logits/rejected": -2.8334743976593018,
"logps/chosen": -271.38677978515625,
"logps/rejected": -267.6949768066406,
"loss": 0.6753,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.020725153386592865,
"rewards/margins": 0.03783208504319191,
"rewards/rejected": -0.017106933519244194,
"step": 2500
},
{
"epoch": 0.654278984559016,
"eval_logits/chosen": -2.852537155151367,
"eval_logits/rejected": -2.8256473541259766,
"eval_logps/chosen": -280.9631042480469,
"eval_logps/rejected": -262.90740966796875,
"eval_loss": 0.6776077151298523,
"eval_rewards/accuracies": 0.6804999709129333,
"eval_rewards/chosen": 0.01810392364859581,
"eval_rewards/margins": 0.0327942781150341,
"eval_rewards/rejected": -0.014690355397760868,
"eval_runtime": 623.8344,
"eval_samples_per_second": 3.206,
"eval_steps_per_second": 0.401,
"step": 2500
},
{
"epoch": 0.656896100497252,
"grad_norm": 1.3125,
"learning_rate": 1.5893821994479994e-07,
"logits/chosen": -2.860830307006836,
"logits/rejected": -2.8481099605560303,
"logps/chosen": -290.0372009277344,
"logps/rejected": -253.4038848876953,
"loss": 0.6757,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.024327334016561508,
"rewards/margins": 0.03673567250370979,
"rewards/rejected": -0.012408342212438583,
"step": 2510
},
{
"epoch": 0.6595132164354881,
"grad_norm": 1.34375,
"learning_rate": 1.5681452623266867e-07,
"logits/chosen": -2.8527517318725586,
"logits/rejected": -2.8060853481292725,
"logps/chosen": -301.49542236328125,
"logps/rejected": -247.3957977294922,
"loss": 0.6683,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.03281102329492569,
"rewards/margins": 0.05217736214399338,
"rewards/rejected": -0.019366348162293434,
"step": 2520
},
{
"epoch": 0.6621303323737242,
"grad_norm": 2.484375,
"learning_rate": 1.546986134807801e-07,
"logits/chosen": -2.8651843070983887,
"logits/rejected": -2.8340165615081787,
"logps/chosen": -263.162109375,
"logps/rejected": -252.37625122070312,
"loss": 0.6783,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.015812261030077934,
"rewards/margins": 0.0313270129263401,
"rewards/rejected": -0.015514750964939594,
"step": 2530
},
{
"epoch": 0.6647474483119602,
"grad_norm": 1.296875,
"learning_rate": 1.5259065836724034e-07,
"logits/chosen": -2.7946388721466064,
"logits/rejected": -2.7776710987091064,
"logps/chosen": -262.2623291015625,
"logps/rejected": -254.7265167236328,
"loss": 0.6787,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01517055369913578,
"rewards/margins": 0.030337844043970108,
"rewards/rejected": -0.015167290344834328,
"step": 2540
},
{
"epoch": 0.6673645642501963,
"grad_norm": 1.25,
"learning_rate": 1.5049083690569454e-07,
"logits/chosen": -2.809542179107666,
"logits/rejected": -2.793139934539795,
"logps/chosen": -251.2034454345703,
"logps/rejected": -249.8028106689453,
"loss": 0.6761,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.018413711339235306,
"rewards/margins": 0.03606470674276352,
"rewards/rejected": -0.017650997266173363,
"step": 2550
},
{
"epoch": 0.6699816801884323,
"grad_norm": 1.2421875,
"learning_rate": 1.4839932443063056e-07,
"logits/chosen": -2.837368965148926,
"logits/rejected": -2.810147523880005,
"logps/chosen": -305.4731750488281,
"logps/rejected": -251.56161499023438,
"loss": 0.6732,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.024250676855444908,
"rewards/margins": 0.041700925678014755,
"rewards/rejected": -0.017450252547860146,
"step": 2560
},
{
"epoch": 0.6725987961266684,
"grad_norm": 1.1171875,
"learning_rate": 1.46316295582738e-07,
"logits/chosen": -2.8204097747802734,
"logits/rejected": -2.8028788566589355,
"logps/chosen": -257.53228759765625,
"logps/rejected": -245.14895629882812,
"loss": 0.6809,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.006483917590230703,
"rewards/margins": 0.025950897485017776,
"rewards/rejected": -0.019466979429125786,
"step": 2570
},
{
"epoch": 0.6752159120649045,
"grad_norm": 1.6171875,
"learning_rate": 1.4424192429432655e-07,
"logits/chosen": -2.848489999771118,
"logits/rejected": -2.8286855220794678,
"logps/chosen": -270.4859619140625,
"logps/rejected": -277.18206787109375,
"loss": 0.6742,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.024375300854444504,
"rewards/margins": 0.03957264870405197,
"rewards/rejected": -0.015197351574897766,
"step": 2580
},
{
"epoch": 0.6778330280031405,
"grad_norm": 2.0625,
"learning_rate": 1.4217638377480158e-07,
"logits/chosen": -2.829794406890869,
"logits/rejected": -2.8167781829833984,
"logps/chosen": -274.6668395996094,
"logps/rejected": -262.61700439453125,
"loss": 0.6789,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.014721376821398735,
"rewards/margins": 0.02995181456208229,
"rewards/rejected": -0.015230434946715832,
"step": 2590
},
{
"epoch": 0.6804501439413766,
"grad_norm": 1.3203125,
"learning_rate": 1.401198464962021e-07,
"logits/chosen": -2.8342947959899902,
"logits/rejected": -2.8125643730163574,
"logps/chosen": -284.03253173828125,
"logps/rejected": -243.7402801513672,
"loss": 0.6776,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.015603385865688324,
"rewards/margins": 0.03253168612718582,
"rewards/rejected": -0.016928300261497498,
"step": 2600
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -2.849823474884033,
"eval_logits/rejected": -2.822629690170288,
"eval_logps/chosen": -280.9640808105469,
"eval_logps/rejected": -262.91668701171875,
"eval_loss": 0.6775689721107483,
"eval_rewards/accuracies": 0.6775000095367432,
"eval_rewards/chosen": 0.018093857914209366,
"eval_rewards/margins": 0.0328776054084301,
"eval_rewards/rejected": -0.014783743768930435,
"eval_runtime": 624.015,
"eval_samples_per_second": 3.205,
"eval_steps_per_second": 0.401,
"step": 2600
},
{
"epoch": 0.6830672598796127,
"grad_norm": 1.15625,
"learning_rate": 1.3807248417879894e-07,
"logits/chosen": -2.866183280944824,
"logits/rejected": -2.85908842086792,
"logps/chosen": -286.482666015625,
"logps/rejected": -269.18145751953125,
"loss": 0.6742,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.024856090545654297,
"rewards/margins": 0.03993413224816322,
"rewards/rejected": -0.015078043565154076,
"step": 2610
},
{
"epoch": 0.6856843758178487,
"grad_norm": 1.40625,
"learning_rate": 1.3603446777675665e-07,
"logits/chosen": -2.7812695503234863,
"logits/rejected": -2.7601253986358643,
"logps/chosen": -280.71826171875,
"logps/rejected": -258.6421813964844,
"loss": 0.6757,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.018343228846788406,
"rewards/margins": 0.036758117377758026,
"rewards/rejected": -0.01841488853096962,
"step": 2620
},
{
"epoch": 0.6883014917560848,
"grad_norm": 1.3359375,
"learning_rate": 1.3400596746385814e-07,
"logits/chosen": -2.844383716583252,
"logits/rejected": -2.8064093589782715,
"logps/chosen": -286.34820556640625,
"logps/rejected": -258.3263244628906,
"loss": 0.6781,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.018320731818675995,
"rewards/margins": 0.03194695711135864,
"rewards/rejected": -0.013626225292682648,
"step": 2630
},
{
"epoch": 0.6909186076943209,
"grad_norm": 1.1484375,
"learning_rate": 1.3198715261929586e-07,
"logits/chosen": -2.8701038360595703,
"logits/rejected": -2.835305690765381,
"logps/chosen": -248.2649688720703,
"logps/rejected": -243.41397094726562,
"loss": 0.6755,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.015776509419083595,
"rewards/margins": 0.03662073612213135,
"rewards/rejected": -0.02084423042833805,
"step": 2640
},
{
"epoch": 0.6935357236325569,
"grad_norm": 1.3359375,
"learning_rate": 1.299781918135282e-07,
"logits/chosen": -2.85074782371521,
"logits/rejected": -2.8091163635253906,
"logps/chosen": -315.61590576171875,
"logps/rejected": -293.9820251464844,
"loss": 0.6697,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.035952307283878326,
"rewards/margins": 0.04937596619129181,
"rewards/rejected": -0.013423657044768333,
"step": 2650
},
{
"epoch": 0.696152839570793,
"grad_norm": 1.3359375,
"learning_rate": 1.279792527942045e-07,
"logits/chosen": -2.8586459159851074,
"logits/rejected": -2.8158531188964844,
"logps/chosen": -284.60882568359375,
"logps/rejected": -277.387451171875,
"loss": 0.6764,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.0246993750333786,
"rewards/margins": 0.035598695278167725,
"rewards/rejected": -0.010899320244789124,
"step": 2660
},
{
"epoch": 0.6987699555090291,
"grad_norm": 1.6796875,
"learning_rate": 1.259905024721576e-07,
"logits/chosen": -2.8398678302764893,
"logits/rejected": -2.823967695236206,
"logps/chosen": -273.98944091796875,
"logps/rejected": -254.6985321044922,
"loss": 0.6742,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.019164234399795532,
"rewards/margins": 0.039530009031295776,
"rewards/rejected": -0.020365772768855095,
"step": 2670
},
{
"epoch": 0.7013870714472651,
"grad_norm": 1.3984375,
"learning_rate": 1.2401210690746703e-07,
"logits/chosen": -2.8383288383483887,
"logits/rejected": -2.8142457008361816,
"logps/chosen": -283.2543029785156,
"logps/rejected": -252.313720703125,
"loss": 0.6763,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.02543952502310276,
"rewards/margins": 0.03542017191648483,
"rewards/rejected": -0.009980651549994946,
"step": 2680
},
{
"epoch": 0.7040041873855012,
"grad_norm": 1.5625,
"learning_rate": 1.2204423129559305e-07,
"logits/chosen": -2.861647844314575,
"logits/rejected": -2.8627748489379883,
"logps/chosen": -281.1255798339844,
"logps/rejected": -280.34027099609375,
"loss": 0.6749,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.021706473082304,
"rewards/margins": 0.03822758048772812,
"rewards/rejected": -0.016521107405424118,
"step": 2690
},
{
"epoch": 0.7066213033237373,
"grad_norm": 2.171875,
"learning_rate": 1.2008703995358299e-07,
"logits/chosen": -2.837878704071045,
"logits/rejected": -2.8239123821258545,
"logps/chosen": -279.67852783203125,
"logps/rejected": -253.0082550048828,
"loss": 0.6774,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.020584728568792343,
"rewards/margins": 0.033195436000823975,
"rewards/rejected": -0.012610706500709057,
"step": 2700
},
{
"epoch": 0.7066213033237373,
"eval_logits/chosen": -2.85300874710083,
"eval_logits/rejected": -2.82612943649292,
"eval_logps/chosen": -280.95526123046875,
"eval_logps/rejected": -262.92626953125,
"eval_loss": 0.6774773001670837,
"eval_rewards/accuracies": 0.6859999895095825,
"eval_rewards/chosen": 0.018182458356022835,
"eval_rewards/margins": 0.03306160494685173,
"eval_rewards/rejected": -0.014879145659506321,
"eval_runtime": 623.2959,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 0.401,
"step": 2700
},
{
"epoch": 0.7092384192619733,
"grad_norm": 1.6171875,
"learning_rate": 1.1814069630635068e-07,
"logits/chosen": -2.8202016353607178,
"logits/rejected": -2.8188061714172363,
"logps/chosen": -286.5379333496094,
"logps/rejected": -281.89678955078125,
"loss": 0.6787,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.02239087037742138,
"rewards/margins": 0.030664747580885887,
"rewards/rejected": -0.008273878134787083,
"step": 2710
},
{
"epoch": 0.7118555352002094,
"grad_norm": 1.7109375,
"learning_rate": 1.1620536287303051e-07,
"logits/chosen": -2.8520309925079346,
"logits/rejected": -2.8322553634643555,
"logps/chosen": -306.9613342285156,
"logps/rejected": -276.6466979980469,
"loss": 0.6798,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.024515343829989433,
"rewards/margins": 0.02836497500538826,
"rewards/rejected": -0.0038496279157698154,
"step": 2720
},
{
"epoch": 0.7144726511384454,
"grad_norm": 1.2265625,
"learning_rate": 1.1428120125340716e-07,
"logits/chosen": -2.8408806324005127,
"logits/rejected": -2.8193554878234863,
"logps/chosen": -278.6138000488281,
"logps/rejected": -233.0282440185547,
"loss": 0.6728,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.020976107567548752,
"rewards/margins": 0.042338818311691284,
"rewards/rejected": -0.021362707018852234,
"step": 2730
},
{
"epoch": 0.7170897670766815,
"grad_norm": 1.3515625,
"learning_rate": 1.123683721144223e-07,
"logits/chosen": -2.8390605449676514,
"logits/rejected": -2.817472457885742,
"logps/chosen": -299.55767822265625,
"logps/rejected": -270.6512145996094,
"loss": 0.6779,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.022130262106657028,
"rewards/margins": 0.03234560787677765,
"rewards/rejected": -0.010215344838798046,
"step": 2740
},
{
"epoch": 0.7197068830149176,
"grad_norm": 1.4296875,
"learning_rate": 1.1046703517675845e-07,
"logits/chosen": -2.8513295650482178,
"logits/rejected": -2.834134578704834,
"logps/chosen": -269.61492919921875,
"logps/rejected": -279.116455078125,
"loss": 0.6781,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.020882591605186462,
"rewards/margins": 0.03162946552038193,
"rewards/rejected": -0.010746878571808338,
"step": 2750
},
{
"epoch": 0.7223239989531536,
"grad_norm": 1.25,
"learning_rate": 1.085773492015028e-07,
"logits/chosen": -2.837613821029663,
"logits/rejected": -2.811807155609131,
"logps/chosen": -262.3995361328125,
"logps/rejected": -229.64999389648438,
"loss": 0.671,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.02249622717499733,
"rewards/margins": 0.04641376808285713,
"rewards/rejected": -0.023917539045214653,
"step": 2760
},
{
"epoch": 0.7249411148913897,
"grad_norm": 1.421875,
"learning_rate": 1.0669947197689033e-07,
"logits/chosen": -2.829591989517212,
"logits/rejected": -2.78979754447937,
"logps/chosen": -289.36322021484375,
"logps/rejected": -266.07537841796875,
"loss": 0.6777,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.016385111957788467,
"rewards/margins": 0.0324719175696373,
"rewards/rejected": -0.01608681119978428,
"step": 2770
},
{
"epoch": 0.7275582308296258,
"grad_norm": 1.53125,
"learning_rate": 1.048335603051291e-07,
"logits/chosen": -2.811000347137451,
"logits/rejected": -2.7975831031799316,
"logps/chosen": -303.7996520996094,
"logps/rejected": -278.1279296875,
"loss": 0.6693,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.022211695089936256,
"rewards/margins": 0.05009465292096138,
"rewards/rejected": -0.027882959693670273,
"step": 2780
},
{
"epoch": 0.7301753467678618,
"grad_norm": 2.515625,
"learning_rate": 1.0297976998930663e-07,
"logits/chosen": -2.853620767593384,
"logits/rejected": -2.842625379562378,
"logps/chosen": -290.62567138671875,
"logps/rejected": -259.65020751953125,
"loss": 0.6734,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.018552960827946663,
"rewards/margins": 0.04151231050491333,
"rewards/rejected": -0.022959351539611816,
"step": 2790
},
{
"epoch": 0.7327924627060979,
"grad_norm": 1.421875,
"learning_rate": 1.0113825582038077e-07,
"logits/chosen": -2.854717493057251,
"logits/rejected": -2.836151123046875,
"logps/chosen": -279.61346435546875,
"logps/rejected": -265.38531494140625,
"loss": 0.679,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.020294269546866417,
"rewards/margins": 0.02987518534064293,
"rewards/rejected": -0.009580916725099087,
"step": 2800
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -2.853910446166992,
"eval_logits/rejected": -2.8271100521087646,
"eval_logps/chosen": -280.93585205078125,
"eval_logps/rejected": -262.9162292480469,
"eval_loss": 0.6774327754974365,
"eval_rewards/accuracies": 0.6850000023841858,
"eval_rewards/chosen": 0.018376635387539864,
"eval_rewards/margins": 0.0331551730632782,
"eval_rewards/rejected": -0.014778541401028633,
"eval_runtime": 623.7205,
"eval_samples_per_second": 3.207,
"eval_steps_per_second": 0.401,
"step": 2800
},
{
"epoch": 0.735409578644334,
"grad_norm": 1.453125,
"learning_rate": 9.930917156425475e-08,
"logits/chosen": -2.86027193069458,
"logits/rejected": -2.840989828109741,
"logps/chosen": -278.8993225097656,
"logps/rejected": -277.65216064453125,
"loss": 0.6774,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.01882421225309372,
"rewards/margins": 0.033416565507650375,
"rewards/rejected": -0.01459235418587923,
"step": 2810
},
{
"epoch": 0.73802669458257,
"grad_norm": 1.484375,
"learning_rate": 9.749266994893754e-08,
"logits/chosen": -2.8003342151641846,
"logits/rejected": -2.7634270191192627,
"logps/chosen": -253.88516235351562,
"logps/rejected": -244.5758056640625,
"loss": 0.6833,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.008167347870767117,
"rewards/margins": 0.02078983187675476,
"rewards/rejected": -0.012622484937310219,
"step": 2820
},
{
"epoch": 0.7406438105208061,
"grad_norm": 1.265625,
"learning_rate": 9.568890265179128e-08,
"logits/chosen": -2.8219611644744873,
"logits/rejected": -2.818441867828369,
"logps/chosen": -277.968994140625,
"logps/rejected": -250.047119140625,
"loss": 0.6772,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.020925423130393028,
"rewards/margins": 0.033755991607904434,
"rewards/rejected": -0.012830562889575958,
"step": 2830
},
{
"epoch": 0.7432609264590422,
"grad_norm": 1.265625,
"learning_rate": 9.389802028686616e-08,
"logits/chosen": -2.8413894176483154,
"logits/rejected": -2.823989152908325,
"logps/chosen": -277.80926513671875,
"logps/rejected": -246.23593139648438,
"loss": 0.6805,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.011136185377836227,
"rewards/margins": 0.02661561407148838,
"rewards/rejected": -0.015479430556297302,
"step": 2840
},
{
"epoch": 0.7458780423972782,
"grad_norm": 1.2734375,
"learning_rate": 9.212017239232426e-08,
"logits/chosen": -2.831408977508545,
"logits/rejected": -2.8223624229431152,
"logps/chosen": -287.7478942871094,
"logps/rejected": -267.5721435546875,
"loss": 0.673,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.021497588604688644,
"rewards/margins": 0.04216768592596054,
"rewards/rejected": -0.020670095458626747,
"step": 2850
},
{
"epoch": 0.7484951583355143,
"grad_norm": 1.609375,
"learning_rate": 9.035550741795328e-08,
"logits/chosen": -2.814898729324341,
"logits/rejected": -2.8209142684936523,
"logps/chosen": -271.9150085449219,
"logps/rejected": -279.5975646972656,
"loss": 0.6727,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.02707098424434662,
"rewards/margins": 0.04284884035587311,
"rewards/rejected": -0.01577785238623619,
"step": 2860
},
{
"epoch": 0.7511122742737504,
"grad_norm": 1.2109375,
"learning_rate": 8.860417271277065e-08,
"logits/chosen": -2.8868813514709473,
"logits/rejected": -2.88276743888855,
"logps/chosen": -283.67864990234375,
"logps/rejected": -275.3718566894531,
"loss": 0.6819,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.018196921795606613,
"rewards/margins": 0.02416098490357399,
"rewards/rejected": -0.005964064504951239,
"step": 2870
},
{
"epoch": 0.7537293902119864,
"grad_norm": 1.3359375,
"learning_rate": 8.686631451272029e-08,
"logits/chosen": -2.869019031524658,
"logits/rejected": -2.84165096282959,
"logps/chosen": -268.4243469238281,
"logps/rejected": -247.5635986328125,
"loss": 0.6799,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0169739481061697,
"rewards/margins": 0.028122667223215103,
"rewards/rejected": -0.011148716323077679,
"step": 2880
},
{
"epoch": 0.7563465061502225,
"grad_norm": 1.4453125,
"learning_rate": 8.514207792846168e-08,
"logits/chosen": -2.8492226600646973,
"logits/rejected": -2.844165086746216,
"logps/chosen": -265.9365539550781,
"logps/rejected": -237.9735107421875,
"loss": 0.6782,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.012092510238289833,
"rewards/margins": 0.031133780255913734,
"rewards/rejected": -0.0190412737429142,
"step": 2890
},
{
"epoch": 0.7589636220884585,
"grad_norm": 1.34375,
"learning_rate": 8.343160693325355e-08,
"logits/chosen": -2.815880060195923,
"logits/rejected": -2.804381847381592,
"logps/chosen": -269.6540832519531,
"logps/rejected": -268.2055358886719,
"loss": 0.6782,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.017512865364551544,
"rewards/margins": 0.03190717473626137,
"rewards/rejected": -0.014394307509064674,
"step": 2900
},
{
"epoch": 0.7589636220884585,
"eval_logits/chosen": -2.852909803390503,
"eval_logits/rejected": -2.826012134552002,
"eval_logps/chosen": -280.9681396484375,
"eval_logps/rejected": -262.93359375,
"eval_loss": 0.6775044202804565,
"eval_rewards/accuracies": 0.684499979019165,
"eval_rewards/chosen": 0.01805364154279232,
"eval_rewards/margins": 0.033006127923727036,
"eval_rewards/rejected": -0.014952489174902439,
"eval_runtime": 622.9143,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 0.401,
"step": 2900
},
{
"epoch": 0.7615807380266946,
"grad_norm": 1.09375,
"learning_rate": 8.173504435093173e-08,
"logits/chosen": -2.832644462585449,
"logits/rejected": -2.7984976768493652,
"logps/chosen": -263.4750061035156,
"logps/rejected": -228.9986114501953,
"loss": 0.674,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.018485140055418015,
"rewards/margins": 0.040302351117134094,
"rewards/rejected": -0.02181720733642578,
"step": 2910
},
{
"epoch": 0.7641978539649307,
"grad_norm": 1.2890625,
"learning_rate": 8.005253184398359e-08,
"logits/chosen": -2.8306822776794434,
"logits/rejected": -2.8159360885620117,
"logps/chosen": -292.702880859375,
"logps/rejected": -288.18695068359375,
"loss": 0.6762,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02734486199915409,
"rewards/margins": 0.03592860698699951,
"rewards/rejected": -0.008583742193877697,
"step": 2920
},
{
"epoch": 0.7668149699031667,
"grad_norm": 1.21875,
"learning_rate": 7.838420990171926e-08,
"logits/chosen": -2.8607592582702637,
"logits/rejected": -2.825814962387085,
"logps/chosen": -286.3602600097656,
"logps/rejected": -260.51641845703125,
"loss": 0.6777,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01990603655576706,
"rewards/margins": 0.03237393498420715,
"rewards/rejected": -0.012467900291085243,
"step": 2930
},
{
"epoch": 0.7694320858414028,
"grad_norm": 1.2578125,
"learning_rate": 7.673021782854083e-08,
"logits/chosen": -2.780974864959717,
"logits/rejected": -2.763579845428467,
"logps/chosen": -284.2770690917969,
"logps/rejected": -233.38668823242188,
"loss": 0.6742,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02143119089305401,
"rewards/margins": 0.04003281518816948,
"rewards/rejected": -0.01860162802040577,
"step": 2940
},
{
"epoch": 0.7720492017796389,
"grad_norm": 1.234375,
"learning_rate": 7.509069373231039e-08,
"logits/chosen": -2.8137221336364746,
"logits/rejected": -2.788684368133545,
"logps/chosen": -266.8929138183594,
"logps/rejected": -250.69326782226562,
"loss": 0.6767,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.018960343673825264,
"rewards/margins": 0.034537579864263535,
"rewards/rejected": -0.015577234327793121,
"step": 2950
},
{
"epoch": 0.7746663177178749,
"grad_norm": 1.3203125,
"learning_rate": 7.346577451281821e-08,
"logits/chosen": -2.823444366455078,
"logits/rejected": -2.826974391937256,
"logps/chosen": -279.7121276855469,
"logps/rejected": -261.50518798828125,
"loss": 0.6768,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.018806222826242447,
"rewards/margins": 0.03476772829890251,
"rewards/rejected": -0.015961505472660065,
"step": 2960
},
{
"epoch": 0.777283433656111,
"grad_norm": 1.59375,
"learning_rate": 7.185559585035136e-08,
"logits/chosen": -2.84146785736084,
"logits/rejected": -2.8069043159484863,
"logps/chosen": -296.74932861328125,
"logps/rejected": -284.635009765625,
"loss": 0.6736,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02409261278808117,
"rewards/margins": 0.041189759969711304,
"rewards/rejected": -0.017097145318984985,
"step": 2970
},
{
"epoch": 0.7799005495943471,
"grad_norm": 1.5703125,
"learning_rate": 7.026029219436502e-08,
"logits/chosen": -2.816230535507202,
"logits/rejected": -2.792483329772949,
"logps/chosen": -270.294189453125,
"logps/rejected": -262.2767333984375,
"loss": 0.6756,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.01601272262632847,
"rewards/margins": 0.03693497180938721,
"rewards/rejected": -0.02092224732041359,
"step": 2980
},
{
"epoch": 0.7825176655325831,
"grad_norm": 1.1875,
"learning_rate": 6.867999675225522e-08,
"logits/chosen": -2.864154815673828,
"logits/rejected": -2.833688259124756,
"logps/chosen": -245.67288208007812,
"logps/rejected": -232.77743530273438,
"loss": 0.6775,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.017845796421170235,
"rewards/margins": 0.03295399993658066,
"rewards/rejected": -0.015108207240700722,
"step": 2990
},
{
"epoch": 0.7851347814708192,
"grad_norm": 1.203125,
"learning_rate": 6.711484147823662e-08,
"logits/chosen": -2.8105015754699707,
"logits/rejected": -2.8049824237823486,
"logps/chosen": -248.8085479736328,
"logps/rejected": -257.3614807128906,
"loss": 0.6784,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.017226997762918472,
"rewards/margins": 0.03089422546327114,
"rewards/rejected": -0.01366722583770752,
"step": 3000
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -2.8550119400024414,
"eval_logits/rejected": -2.8283438682556152,
"eval_logps/chosen": -280.97314453125,
"eval_logps/rejected": -262.9586486816406,
"eval_loss": 0.67740797996521,
"eval_rewards/accuracies": 0.6890000104904175,
"eval_rewards/chosen": 0.018003566190600395,
"eval_rewards/margins": 0.03320648893713951,
"eval_rewards/rejected": -0.015202920883893967,
"eval_runtime": 623.2623,
"eval_samples_per_second": 3.209,
"eval_steps_per_second": 0.401,
"step": 3000
},
{
"epoch": 0.7877518974090553,
"grad_norm": 1.4609375,
"learning_rate": 6.556495706232412e-08,
"logits/chosen": -2.820091724395752,
"logits/rejected": -2.819214105606079,
"logps/chosen": -285.6153869628906,
"logps/rejected": -269.1045227050781,
"loss": 0.6751,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.017025303095579147,
"rewards/margins": 0.03793289139866829,
"rewards/rejected": -0.020907586440443993,
"step": 3010
},
{
"epoch": 0.7903690133472913,
"grad_norm": 1.4375,
"learning_rate": 6.403047291942057e-08,
"logits/chosen": -2.7955071926116943,
"logits/rejected": -2.757645606994629,
"logps/chosen": -243.58535766601562,
"logps/rejected": -218.78341674804688,
"loss": 0.6802,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.010361788794398308,
"rewards/margins": 0.02738323248922825,
"rewards/rejected": -0.017021439969539642,
"step": 3020
},
{
"epoch": 0.7929861292855274,
"grad_norm": 1.6328125,
"learning_rate": 6.251151717851021e-08,
"logits/chosen": -2.819065570831299,
"logits/rejected": -2.8068315982818604,
"logps/chosen": -249.41567993164062,
"logps/rejected": -235.5353546142578,
"loss": 0.681,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.016130346804857254,
"rewards/margins": 0.025897834450006485,
"rewards/rejected": -0.009767485782504082,
"step": 3030
},
{
"epoch": 0.7956032452237635,
"grad_norm": 1.1328125,
"learning_rate": 6.100821667196041e-08,
"logits/chosen": -2.893716812133789,
"logits/rejected": -2.835082530975342,
"logps/chosen": -288.95574951171875,
"logps/rejected": -224.43881225585938,
"loss": 0.6735,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.01886625960469246,
"rewards/margins": 0.04120669886469841,
"rewards/rejected": -0.0223404411226511,
"step": 3040
},
{
"epoch": 0.7982203611619995,
"grad_norm": 1.4140625,
"learning_rate": 5.952069692493061e-08,
"logits/chosen": -2.790799617767334,
"logits/rejected": -2.7859864234924316,
"logps/chosen": -243.0890350341797,
"logps/rejected": -251.2998504638672,
"loss": 0.6737,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.025469347834587097,
"rewards/margins": 0.041153885424137115,
"rewards/rejected": -0.01568453758955002,
"step": 3050
},
{
"epoch": 0.8008374771002356,
"grad_norm": 1.578125,
"learning_rate": 5.8049082144891794e-08,
"logits/chosen": -2.7803027629852295,
"logits/rejected": -2.7686164379119873,
"logps/chosen": -278.40985107421875,
"logps/rejected": -323.7679443359375,
"loss": 0.6815,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.01774556189775467,
"rewards/margins": 0.024822643026709557,
"rewards/rejected": -0.007077082060277462,
"step": 3060
},
{
"epoch": 0.8034545930384716,
"grad_norm": 1.1953125,
"learning_rate": 5.659349521125459e-08,
"logits/chosen": -2.9022018909454346,
"logits/rejected": -2.901132822036743,
"logps/chosen": -296.6135559082031,
"logps/rejected": -276.8060607910156,
"loss": 0.6787,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.020619522780179977,
"rewards/margins": 0.030655449256300926,
"rewards/rejected": -0.010035926476120949,
"step": 3070
},
{
"epoch": 0.8060717089767077,
"grad_norm": 1.2578125,
"learning_rate": 5.5154057665109e-08,
"logits/chosen": -2.8536829948425293,
"logits/rejected": -2.8322811126708984,
"logps/chosen": -274.37506103515625,
"logps/rejected": -250.5011749267578,
"loss": 0.6774,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.017293009907007217,
"rewards/margins": 0.03290316089987755,
"rewards/rejected": -0.015610149130225182,
"step": 3080
},
{
"epoch": 0.8086888249149438,
"grad_norm": 1.34375,
"learning_rate": 5.3730889699075853e-08,
"logits/chosen": -2.864671230316162,
"logits/rejected": -2.833627700805664,
"logps/chosen": -294.9132995605469,
"logps/rejected": -240.95458984375,
"loss": 0.6754,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.020536890253424644,
"rewards/margins": 0.037192363291978836,
"rewards/rejected": -0.016655471175909042,
"step": 3090
},
{
"epoch": 0.8113059408531798,
"grad_norm": 1.34375,
"learning_rate": 5.2324110147270893e-08,
"logits/chosen": -2.837832450866699,
"logits/rejected": -2.827129602432251,
"logps/chosen": -296.73492431640625,
"logps/rejected": -290.35296630859375,
"loss": 0.6713,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.03090183436870575,
"rewards/margins": 0.04617828503251076,
"rewards/rejected": -0.015276448801159859,
"step": 3100
},
{
"epoch": 0.8113059408531798,
"eval_logits/chosen": -2.8546648025512695,
"eval_logits/rejected": -2.827969789505005,
"eval_logps/chosen": -280.9596252441406,
"eval_logps/rejected": -262.92376708984375,
"eval_loss": 0.6775153279304504,
"eval_rewards/accuracies": 0.6825000047683716,
"eval_rewards/chosen": 0.01813914068043232,
"eval_rewards/margins": 0.03299335017800331,
"eval_rewards/rejected": -0.014854210428893566,
"eval_runtime": 623.3449,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 0.401,
"step": 3100
},
{
"epoch": 0.8139230567914159,
"grad_norm": 1.2578125,
"learning_rate": 5.0933836475381795e-08,
"logits/chosen": -2.8541502952575684,
"logits/rejected": -2.819565534591675,
"logps/chosen": -299.7816467285156,
"logps/rejected": -286.51507568359375,
"loss": 0.6744,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02967965044081211,
"rewards/margins": 0.039316095411777496,
"rewards/rejected": -0.00963644403964281,
"step": 3110
},
{
"epoch": 0.816540172729652,
"grad_norm": 1.8203125,
"learning_rate": 4.956018477086005e-08,
"logits/chosen": -2.8387677669525146,
"logits/rejected": -2.8084654808044434,
"logps/chosen": -289.3456115722656,
"logps/rejected": -263.594970703125,
"loss": 0.6773,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.019268421456217766,
"rewards/margins": 0.03356018662452698,
"rewards/rejected": -0.014291766099631786,
"step": 3120
},
{
"epoch": 0.819157288667888,
"grad_norm": 1.203125,
"learning_rate": 4.820326973322763e-08,
"logits/chosen": -2.839634656906128,
"logits/rejected": -2.8185315132141113,
"logps/chosen": -266.79034423828125,
"logps/rejected": -267.4413146972656,
"loss": 0.6779,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.010893099941313267,
"rewards/margins": 0.03200749307870865,
"rewards/rejected": -0.021114394068717957,
"step": 3130
},
{
"epoch": 0.821774404606124,
"grad_norm": 1.2578125,
"learning_rate": 4.686320466449981e-08,
"logits/chosen": -2.8295464515686035,
"logits/rejected": -2.778109312057495,
"logps/chosen": -256.5823059082031,
"logps/rejected": -256.2238464355469,
"loss": 0.6793,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.012616870924830437,
"rewards/margins": 0.029145419597625732,
"rewards/rejected": -0.016528548672795296,
"step": 3140
},
{
"epoch": 0.8243915205443602,
"grad_norm": 1.3359375,
"learning_rate": 4.554010145972417e-08,
"logits/chosen": -2.8855738639831543,
"logits/rejected": -2.836945056915283,
"logps/chosen": -278.258544921875,
"logps/rejected": -268.62969970703125,
"loss": 0.6786,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.016889285296201706,
"rewards/margins": 0.030887436121702194,
"rewards/rejected": -0.013998152688145638,
"step": 3150
},
{
"epoch": 0.8270086364825961,
"grad_norm": 1.25,
"learning_rate": 4.423407059763745e-08,
"logits/chosen": -2.8406424522399902,
"logits/rejected": -2.825413227081299,
"logps/chosen": -289.0135192871094,
"logps/rejected": -282.2171630859375,
"loss": 0.6774,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.020682137459516525,
"rewards/margins": 0.0332643985748291,
"rewards/rejected": -0.012582260183990002,
"step": 3160
},
{
"epoch": 0.8296257524208323,
"grad_norm": 1.15625,
"learning_rate": 4.294522113144078e-08,
"logits/chosen": -2.791628360748291,
"logits/rejected": -2.753540515899658,
"logps/chosen": -284.7926025390625,
"logps/rejected": -252.89364624023438,
"loss": 0.6749,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.02161264792084694,
"rewards/margins": 0.03834725171327591,
"rewards/rejected": -0.01673460379242897,
"step": 3170
},
{
"epoch": 0.8322428683590684,
"grad_norm": 1.140625,
"learning_rate": 4.1673660679693804e-08,
"logits/chosen": -2.8362486362457275,
"logits/rejected": -2.8258681297302246,
"logps/chosen": -235.7146759033203,
"logps/rejected": -263.33929443359375,
"loss": 0.6785,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.01654769480228424,
"rewards/margins": 0.03069342114031315,
"rewards/rejected": -0.014145726338028908,
"step": 3180
},
{
"epoch": 0.8348599842973043,
"grad_norm": 1.4609375,
"learning_rate": 4.041949541732825e-08,
"logits/chosen": -2.846217632293701,
"logits/rejected": -2.845409870147705,
"logps/chosen": -278.4856872558594,
"logps/rejected": -266.3340759277344,
"loss": 0.678,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.018012622371315956,
"rewards/margins": 0.03179007023572922,
"rewards/rejected": -0.01377745158970356,
"step": 3190
},
{
"epoch": 0.8374771002355405,
"grad_norm": 2.15625,
"learning_rate": 3.9182830066782605e-08,
"logits/chosen": -2.809504747390747,
"logits/rejected": -2.8133652210235596,
"logps/chosen": -273.1795349121094,
"logps/rejected": -288.18035888671875,
"loss": 0.6774,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.016945619136095047,
"rewards/margins": 0.03313397616147995,
"rewards/rejected": -0.016188358888030052,
"step": 3200
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -2.854266405105591,
"eval_logits/rejected": -2.8275365829467773,
"eval_logps/chosen": -280.958251953125,
"eval_logps/rejected": -262.9411315917969,
"eval_loss": 0.6774209141731262,
"eval_rewards/accuracies": 0.6830000281333923,
"eval_rewards/chosen": 0.018152602016925812,
"eval_rewards/margins": 0.03318041190505028,
"eval_rewards/rejected": -0.015027807094156742,
"eval_runtime": 624.1731,
"eval_samples_per_second": 3.204,
"eval_steps_per_second": 0.401,
"step": 3200
},
{
"epoch": 0.8400942161737766,
"grad_norm": 1.484375,
"learning_rate": 3.79637678892577e-08,
"logits/chosen": -2.8115928173065186,
"logits/rejected": -2.8144474029541016,
"logps/chosen": -290.44683837890625,
"logps/rejected": -275.8350830078125,
"loss": 0.6819,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.01646428182721138,
"rewards/margins": 0.0238783098757267,
"rewards/rejected": -0.007414024323225021,
"step": 3210
},
{
"epoch": 0.8427113321120125,
"grad_norm": 1.203125,
"learning_rate": 3.6762410676094645e-08,
"logits/chosen": -2.822303533554077,
"logits/rejected": -2.8179895877838135,
"logps/chosen": -316.88623046875,
"logps/rejected": -273.47967529296875,
"loss": 0.6733,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.02506067231297493,
"rewards/margins": 0.04178461804986,
"rewards/rejected": -0.01672394946217537,
"step": 3220
},
{
"epoch": 0.8453284480502486,
"grad_norm": 1.21875,
"learning_rate": 3.557885874027497e-08,
"logits/chosen": -2.8261001110076904,
"logits/rejected": -2.814418077468872,
"logps/chosen": -276.53558349609375,
"logps/rejected": -266.6348571777344,
"loss": 0.6794,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.018592124804854393,
"rewards/margins": 0.029115628451108932,
"rewards/rejected": -0.010523504577577114,
"step": 3230
},
{
"epoch": 0.8479455639884846,
"grad_norm": 1.359375,
"learning_rate": 3.441321090804469e-08,
"logits/chosen": -2.8752944469451904,
"logits/rejected": -2.841357469558716,
"logps/chosen": -281.1075744628906,
"logps/rejected": -243.8683319091797,
"loss": 0.6785,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.01994512416422367,
"rewards/margins": 0.030809426680207253,
"rewards/rejected": -0.010864300653338432,
"step": 3240
},
{
"epoch": 0.8505626799267207,
"grad_norm": 1.25,
"learning_rate": 3.326556451066234e-08,
"logits/chosen": -2.877654552459717,
"logits/rejected": -2.847358465194702,
"logps/chosen": -308.5765380859375,
"logps/rejected": -283.89654541015625,
"loss": 0.6741,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.02807055041193962,
"rewards/margins": 0.04025264084339142,
"rewards/rejected": -0.012182091362774372,
"step": 3250
},
{
"epoch": 0.8531797958649568,
"grad_norm": 1.2578125,
"learning_rate": 3.2136015376271946e-08,
"logits/chosen": -2.8326189517974854,
"logits/rejected": -2.801771640777588,
"logps/chosen": -274.5975646972656,
"logps/rejected": -257.2004699707031,
"loss": 0.6812,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.010432440787553787,
"rewards/margins": 0.02520870603621006,
"rewards/rejected": -0.014776261523365974,
"step": 3260
},
{
"epoch": 0.8557969118031928,
"grad_norm": 1.6640625,
"learning_rate": 3.102465782190106e-08,
"logits/chosen": -2.84391713142395,
"logits/rejected": -2.8385825157165527,
"logps/chosen": -264.6709899902344,
"logps/rejected": -251.43215942382812,
"loss": 0.6785,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.017527025192975998,
"rewards/margins": 0.0312645398080349,
"rewards/rejected": -0.01373751275241375,
"step": 3270
},
{
"epoch": 0.8584140277414289,
"grad_norm": 1.28125,
"learning_rate": 2.993158464558565e-08,
"logits/chosen": -2.8273541927337646,
"logits/rejected": -2.8198060989379883,
"logps/chosen": -289.29791259765625,
"logps/rejected": -293.8225402832031,
"loss": 0.6809,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.02297678217291832,
"rewards/margins": 0.026020046323537827,
"rewards/rejected": -0.0030432622879743576,
"step": 3280
},
{
"epoch": 0.861031143679665,
"grad_norm": 1.359375,
"learning_rate": 2.8856887118621358e-08,
"logits/chosen": -2.870941638946533,
"logits/rejected": -2.8777921199798584,
"logps/chosen": -274.54119873046875,
"logps/rejected": -275.3123474121094,
"loss": 0.6788,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.013969512656331062,
"rewards/margins": 0.030474882572889328,
"rewards/rejected": -0.016505368053913116,
"step": 3290
},
{
"epoch": 0.863648259617901,
"grad_norm": 1.296875,
"learning_rate": 2.7800654977942482e-08,
"logits/chosen": -2.828477382659912,
"logits/rejected": -2.7952821254730225,
"logps/chosen": -273.37109375,
"logps/rejected": -293.2996520996094,
"loss": 0.6781,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.017902836203575134,
"rewards/margins": 0.03182779997587204,
"rewards/rejected": -0.013924960978329182,
"step": 3300
},
{
"epoch": 0.863648259617901,
"eval_logits/chosen": -2.8559255599975586,
"eval_logits/rejected": -2.8293371200561523,
"eval_logps/chosen": -280.9559326171875,
"eval_logps/rejected": -262.91455078125,
"eval_loss": 0.6775384545326233,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": 0.018175845965743065,
"eval_rewards/margins": 0.03293789178133011,
"eval_rewards/rejected": -0.014762048609554768,
"eval_runtime": 623.0311,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 0.401,
"step": 3300
},
{
"epoch": 0.8662653755561371,
"grad_norm": 1.0625,
"learning_rate": 2.676297641862879e-08,
"logits/chosen": -2.8453030586242676,
"logits/rejected": -2.831481456756592,
"logps/chosen": -240.8975830078125,
"logps/rejected": -200.9988250732422,
"loss": 0.6755,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.016030047088861465,
"rewards/margins": 0.03695772588253021,
"rewards/rejected": -0.020927678793668747,
"step": 3310
},
{
"epoch": 0.8688824914943732,
"grad_norm": 1.3125,
"learning_rate": 2.5743938086541352e-08,
"logits/chosen": -2.8288021087646484,
"logits/rejected": -2.802476167678833,
"logps/chosen": -278.8105773925781,
"logps/rejected": -255.1897430419922,
"loss": 0.6772,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.013786676339805126,
"rewards/margins": 0.03420311585068703,
"rewards/rejected": -0.020416438579559326,
"step": 3320
},
{
"epoch": 0.8714996074326092,
"grad_norm": 1.40625,
"learning_rate": 2.474362507108757e-08,
"logits/chosen": -2.8980019092559814,
"logits/rejected": -2.8595707416534424,
"logps/chosen": -289.9441833496094,
"logps/rejected": -271.2764587402344,
"loss": 0.6707,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.026317168027162552,
"rewards/margins": 0.047435760498046875,
"rewards/rejected": -0.02111859992146492,
"step": 3330
},
{
"epoch": 0.8741167233708453,
"grad_norm": 1.40625,
"learning_rate": 2.3762120898116495e-08,
"logits/chosen": -2.849844455718994,
"logits/rejected": -2.834979772567749,
"logps/chosen": -287.1720886230469,
"logps/rejected": -279.83941650390625,
"loss": 0.6816,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.00895099900662899,
"rewards/margins": 0.0245995931327343,
"rewards/rejected": -0.01564859412610531,
"step": 3340
},
{
"epoch": 0.8767338393090814,
"grad_norm": 1.546875,
"learning_rate": 2.2799507522944044e-08,
"logits/chosen": -2.7699649333953857,
"logits/rejected": -2.7505178451538086,
"logps/chosen": -284.6213073730469,
"logps/rejected": -281.0083923339844,
"loss": 0.6758,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02020053006708622,
"rewards/margins": 0.036288876086473465,
"rewards/rejected": -0.016088349744677544,
"step": 3350
},
{
"epoch": 0.8793509552473174,
"grad_norm": 1.4296875,
"learning_rate": 2.1855865323510054e-08,
"logits/chosen": -2.8084092140197754,
"logits/rejected": -2.7662644386291504,
"logps/chosen": -292.76629638671875,
"logps/rejected": -290.1708679199219,
"loss": 0.6738,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.022515032440423965,
"rewards/margins": 0.04099477082490921,
"rewards/rejected": -0.018479738384485245,
"step": 3360
},
{
"epoch": 0.8819680711855535,
"grad_norm": 1.3125,
"learning_rate": 2.0931273093666573e-08,
"logits/chosen": -2.817469358444214,
"logits/rejected": -2.792358875274658,
"logps/chosen": -256.8840637207031,
"logps/rejected": -239.3094024658203,
"loss": 0.6749,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.013278042897582054,
"rewards/margins": 0.03780357167124748,
"rewards/rejected": -0.02452552691102028,
"step": 3370
},
{
"epoch": 0.8845851871237895,
"grad_norm": 1.3671875,
"learning_rate": 2.002580803659873e-08,
"logits/chosen": -2.821927070617676,
"logits/rejected": -2.779125213623047,
"logps/chosen": -268.94305419921875,
"logps/rejected": -259.77130126953125,
"loss": 0.6793,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.011967052705585957,
"rewards/margins": 0.02943551540374756,
"rewards/rejected": -0.017468463629484177,
"step": 3380
},
{
"epoch": 0.8872023030620256,
"grad_norm": 1.171875,
"learning_rate": 1.9139545758378256e-08,
"logits/chosen": -2.8498919010162354,
"logits/rejected": -2.797402858734131,
"logps/chosen": -284.4458923339844,
"logps/rejected": -238.79989624023438,
"loss": 0.6721,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.028188347816467285,
"rewards/margins": 0.04387947544455528,
"rewards/rejected": -0.015691127628087997,
"step": 3390
},
{
"epoch": 0.8898194190002617,
"grad_norm": 1.3671875,
"learning_rate": 1.8272560261650277e-08,
"logits/chosen": -2.8581271171569824,
"logits/rejected": -2.828350305557251,
"logps/chosen": -329.14361572265625,
"logps/rejected": -273.26776123046875,
"loss": 0.6733,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.02928735874593258,
"rewards/margins": 0.04170341044664383,
"rewards/rejected": -0.012416050769388676,
"step": 3400
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -2.850832462310791,
"eval_logits/rejected": -2.823747396469116,
"eval_logps/chosen": -280.9770202636719,
"eval_logps/rejected": -262.9403381347656,
"eval_loss": 0.6775196194648743,
"eval_rewards/accuracies": 0.6825000047683716,
"eval_rewards/chosen": 0.017964746803045273,
"eval_rewards/margins": 0.03298423811793327,
"eval_rewards/rejected": -0.01501949317753315,
"eval_runtime": 623.1437,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 0.401,
"step": 3400
},
{
"epoch": 0.8924365349384977,
"grad_norm": 1.40625,
"learning_rate": 1.742492393945427e-08,
"logits/chosen": -2.830068826675415,
"logits/rejected": -2.787055015563965,
"logps/chosen": -295.30035400390625,
"logps/rejected": -255.3282470703125,
"loss": 0.6776,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.01680312678217888,
"rewards/margins": 0.032698854804039,
"rewards/rejected": -0.015895728021860123,
"step": 3410
},
{
"epoch": 0.8950536508767338,
"grad_norm": 1.46875,
"learning_rate": 1.6596707569179302e-08,
"logits/chosen": -2.8651199340820312,
"logits/rejected": -2.8412108421325684,
"logps/chosen": -294.6651916503906,
"logps/rejected": -264.3734130859375,
"loss": 0.6762,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.022434063255786896,
"rewards/margins": 0.035665739327669144,
"rewards/rejected": -0.013231677003204823,
"step": 3420
},
{
"epoch": 0.8976707668149699,
"grad_norm": 1.265625,
"learning_rate": 1.5787980306653848e-08,
"logits/chosen": -2.8364651203155518,
"logits/rejected": -2.7913882732391357,
"logps/chosen": -288.8866271972656,
"logps/rejected": -276.93865966796875,
"loss": 0.6742,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.024098176509141922,
"rewards/margins": 0.03978481888771057,
"rewards/rejected": -0.01568664237856865,
"step": 3430
},
{
"epoch": 0.9002878827532059,
"grad_norm": 1.203125,
"learning_rate": 1.499880968037165e-08,
"logits/chosen": -2.8334012031555176,
"logits/rejected": -2.8106682300567627,
"logps/chosen": -267.6552734375,
"logps/rejected": -231.9657745361328,
"loss": 0.6767,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.02195141650736332,
"rewards/margins": 0.03444907069206238,
"rewards/rejected": -0.012497651390731335,
"step": 3440
},
{
"epoch": 0.902904998691442,
"grad_norm": 1.2109375,
"learning_rate": 1.4229261585852803e-08,
"logits/chosen": -2.8546454906463623,
"logits/rejected": -2.8432843685150146,
"logps/chosen": -280.5713806152344,
"logps/rejected": -257.9490966796875,
"loss": 0.6755,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.022586923092603683,
"rewards/margins": 0.037085022777318954,
"rewards/rejected": -0.01449810154736042,
"step": 3450
},
{
"epoch": 0.9055221146296781,
"grad_norm": 1.328125,
"learning_rate": 1.3479400280141883e-08,
"logits/chosen": -2.823387384414673,
"logits/rejected": -2.8125298023223877,
"logps/chosen": -262.6939697265625,
"logps/rejected": -266.7622985839844,
"loss": 0.6774,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.018851932138204575,
"rewards/margins": 0.03330928832292557,
"rewards/rejected": -0.014457357116043568,
"step": 3460
},
{
"epoch": 0.9081392305679141,
"grad_norm": 1.203125,
"learning_rate": 1.2749288376442042e-08,
"logits/chosen": -2.839914083480835,
"logits/rejected": -2.8061881065368652,
"logps/chosen": -314.83599853515625,
"logps/rejected": -256.56256103515625,
"loss": 0.6731,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.027168557047843933,
"rewards/margins": 0.041992831975221634,
"rewards/rejected": -0.01482427679002285,
"step": 3470
},
{
"epoch": 0.9107563465061502,
"grad_norm": 1.3828125,
"learning_rate": 1.2038986838887127e-08,
"logits/chosen": -2.8704135417938232,
"logits/rejected": -2.8492181301116943,
"logps/chosen": -257.75701904296875,
"logps/rejected": -257.2181091308594,
"loss": 0.6845,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.006689242087304592,
"rewards/margins": 0.019121108576655388,
"rewards/rejected": -0.01243186742067337,
"step": 3480
},
{
"epoch": 0.9133734624443863,
"grad_norm": 1.25,
"learning_rate": 1.1348554977451131e-08,
"logits/chosen": -2.879945993423462,
"logits/rejected": -2.854792356491089,
"logps/chosen": -299.6977233886719,
"logps/rejected": -266.33148193359375,
"loss": 0.6776,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.01960437372326851,
"rewards/margins": 0.03308872506022453,
"rewards/rejected": -0.013484349474310875,
"step": 3490
},
{
"epoch": 0.9159905783826223,
"grad_norm": 1.2890625,
"learning_rate": 1.06780504429958e-08,
"logits/chosen": -2.860222578048706,
"logits/rejected": -2.8316166400909424,
"logps/chosen": -295.9383850097656,
"logps/rejected": -253.4501495361328,
"loss": 0.6739,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.025537196546792984,
"rewards/margins": 0.040846120566129684,
"rewards/rejected": -0.015308921225368977,
"step": 3500
},
{
"epoch": 0.9159905783826223,
"eval_logits/chosen": -2.8575375080108643,
"eval_logits/rejected": -2.8311452865600586,
"eval_logps/chosen": -280.9686279296875,
"eval_logps/rejected": -262.9413146972656,
"eval_loss": 0.677466869354248,
"eval_rewards/accuracies": 0.6850000023841858,
"eval_rewards/chosen": 0.01804887317121029,
"eval_rewards/margins": 0.033078454434871674,
"eval_rewards/rejected": -0.015029575675725937,
"eval_runtime": 623.0737,
"eval_samples_per_second": 3.21,
"eval_steps_per_second": 0.401,
"step": 3500
},
{
"epoch": 0.9186076943208584,
"grad_norm": 2.078125,
"learning_rate": 1.0027529222456754e-08,
"logits/chosen": -2.80438232421875,
"logits/rejected": -2.773073673248291,
"logps/chosen": -268.601318359375,
"logps/rejected": -252.92831420898438,
"loss": 0.6728,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.02063935436308384,
"rewards/margins": 0.0424073152244091,
"rewards/rejected": -0.021767962723970413,
"step": 3510
},
{
"epoch": 0.9212248102590945,
"grad_norm": 1.2734375,
"learning_rate": 9.397045634168766e-09,
"logits/chosen": -2.8710060119628906,
"logits/rejected": -2.8575310707092285,
"logps/chosen": -283.4559020996094,
"logps/rejected": -289.69219970703125,
"loss": 0.6729,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.031010348349809647,
"rewards/margins": 0.04308422654867172,
"rewards/rejected": -0.012073880061507225,
"step": 3520
},
{
"epoch": 0.9238419261973305,
"grad_norm": 1.3125,
"learning_rate": 8.78665232332998e-09,
"logits/chosen": -2.8076834678649902,
"logits/rejected": -2.7891430854797363,
"logps/chosen": -245.83383178710938,
"logps/rejected": -245.38418579101562,
"loss": 0.6793,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.013311171904206276,
"rewards/margins": 0.028882578015327454,
"rewards/rejected": -0.015571406111121178,
"step": 3530
},
{
"epoch": 0.9264590421355666,
"grad_norm": 1.21875,
"learning_rate": 8.196400257606206e-09,
"logits/chosen": -2.8521595001220703,
"logits/rejected": -2.810176134109497,
"logps/chosen": -298.8065490722656,
"logps/rejected": -297.49627685546875,
"loss": 0.6753,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.018893834203481674,
"rewards/margins": 0.037972934544086456,
"rewards/rejected": -0.019079100340604782,
"step": 3540
},
{
"epoch": 0.9290761580738026,
"grad_norm": 1.421875,
"learning_rate": 7.626338722875075e-09,
"logits/chosen": -2.8442888259887695,
"logits/rejected": -2.8555784225463867,
"logps/chosen": -271.51593017578125,
"logps/rejected": -271.29620361328125,
"loss": 0.679,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.014854473061859608,
"rewards/margins": 0.02983902022242546,
"rewards/rejected": -0.014984548091888428,
"step": 3550
},
{
"epoch": 0.9316932740120387,
"grad_norm": 1.3203125,
"learning_rate": 7.0765153191106875e-09,
"logits/chosen": -2.8553731441497803,
"logits/rejected": -2.8395111560821533,
"logps/chosen": -269.2403564453125,
"logps/rejected": -229.14599609375,
"loss": 0.6757,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.017869068309664726,
"rewards/margins": 0.03671371936798096,
"rewards/rejected": -0.01884464919567108,
"step": 3560
},
{
"epoch": 0.9343103899502748,
"grad_norm": 1.0,
"learning_rate": 6.54697595640899e-09,
"logits/chosen": -2.8470935821533203,
"logits/rejected": -2.827101945877075,
"logps/chosen": -307.4560241699219,
"logps/rejected": -287.3361511230469,
"loss": 0.6755,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.024625394493341446,
"rewards/margins": 0.037567656487226486,
"rewards/rejected": -0.01294226385653019,
"step": 3570
},
{
"epoch": 0.9369275058885108,
"grad_norm": 1.2578125,
"learning_rate": 6.037764851154425e-09,
"logits/chosen": -2.817866802215576,
"logits/rejected": -2.8075547218322754,
"logps/chosen": -280.808837890625,
"logps/rejected": -287.5721740722656,
"loss": 0.6757,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.019267046824097633,
"rewards/margins": 0.03673623502254486,
"rewards/rejected": -0.017469191923737526,
"step": 3580
},
{
"epoch": 0.9395446218267469,
"grad_norm": 1.171875,
"learning_rate": 5.548924522327747e-09,
"logits/chosen": -2.839773178100586,
"logits/rejected": -2.8247604370117188,
"logps/chosen": -277.45343017578125,
"logps/rejected": -264.47119140625,
"loss": 0.6772,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.015755945816636086,
"rewards/margins": 0.03340662270784378,
"rewards/rejected": -0.017650676891207695,
"step": 3590
},
{
"epoch": 0.942161737764983,
"grad_norm": 1.34375,
"learning_rate": 5.080495787955691e-09,
"logits/chosen": -2.8103365898132324,
"logits/rejected": -2.794466257095337,
"logps/chosen": -242.8784637451172,
"logps/rejected": -246.01123046875,
"loss": 0.6807,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.014323743060231209,
"rewards/margins": 0.02595413103699684,
"rewards/rejected": -0.011630385182797909,
"step": 3600
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -2.8526551723480225,
"eval_logits/rejected": -2.825742483139038,
"eval_logps/chosen": -280.9523620605469,
"eval_logps/rejected": -262.9205017089844,
"eval_loss": 0.6774939298629761,
"eval_rewards/accuracies": 0.6855000257492065,
"eval_rewards/chosen": 0.01821131445467472,
"eval_rewards/margins": 0.03303277865052223,
"eval_rewards/rejected": -0.014821460470557213,
"eval_runtime": 622.8509,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 0.401,
"step": 3600
},
{
"epoch": 0.944778853703219,
"grad_norm": 2.9375,
"learning_rate": 4.632517761702814e-09,
"logits/chosen": -2.7846920490264893,
"logits/rejected": -2.756865978240967,
"logps/chosen": -257.6873779296875,
"logps/rejected": -247.35317993164062,
"loss": 0.6763,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.009808182716369629,
"rewards/margins": 0.03539792820811272,
"rewards/rejected": -0.025589745491743088,
"step": 3610
},
{
"epoch": 0.9473959696414551,
"grad_norm": 1.3515625,
"learning_rate": 4.205027849605358e-09,
"logits/chosen": -2.8126158714294434,
"logits/rejected": -2.7998046875,
"logps/chosen": -264.48272705078125,
"logps/rejected": -232.7639617919922,
"loss": 0.6783,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.010889967903494835,
"rewards/margins": 0.031071290373802185,
"rewards/rejected": -0.020181316882371902,
"step": 3620
},
{
"epoch": 0.9500130855796912,
"grad_norm": 1.34375,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": -2.8637187480926514,
"logits/rejected": -2.8383963108062744,
"logps/chosen": -279.2807922363281,
"logps/rejected": -244.65072631835938,
"loss": 0.6745,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01557912863790989,
"rewards/margins": 0.03933199122548103,
"rewards/rejected": -0.023752864450216293,
"step": 3630
},
{
"epoch": 0.9526302015179272,
"grad_norm": 1.296875,
"learning_rate": 3.411653435283157e-09,
"logits/chosen": -2.8357815742492676,
"logits/rejected": -2.8046395778656006,
"logps/chosen": -287.30950927734375,
"logps/rejected": -232.4551239013672,
"loss": 0.6757,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.02017480693757534,
"rewards/margins": 0.03672494366765022,
"rewards/rejected": -0.016550134867429733,
"step": 3640
},
{
"epoch": 0.9552473174561633,
"grad_norm": 1.390625,
"learning_rate": 3.0458351795936698e-09,
"logits/chosen": -2.8704733848571777,
"logits/rejected": -2.8465213775634766,
"logps/chosen": -264.4095764160156,
"logps/rejected": -236.84811401367188,
"loss": 0.6731,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.023878643289208412,
"rewards/margins": 0.04207443445920944,
"rewards/rejected": -0.01819578930735588,
"step": 3650
},
{
"epoch": 0.9578644333943994,
"grad_norm": 1.5078125,
"learning_rate": 2.700637525598598e-09,
"logits/chosen": -2.8182005882263184,
"logits/rejected": -2.823500156402588,
"logps/chosen": -287.9983825683594,
"logps/rejected": -288.73779296875,
"loss": 0.681,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.018164271488785744,
"rewards/margins": 0.02582050859928131,
"rewards/rejected": -0.007656236179172993,
"step": 3660
},
{
"epoch": 0.9604815493326354,
"grad_norm": 1.3515625,
"learning_rate": 2.3760892972027324e-09,
"logits/chosen": -2.886582612991333,
"logits/rejected": -2.8624680042266846,
"logps/chosen": -286.31439208984375,
"logps/rejected": -253.89181518554688,
"loss": 0.6793,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.013353118672966957,
"rewards/margins": 0.029065540060400963,
"rewards/rejected": -0.015712425112724304,
"step": 3670
},
{
"epoch": 0.9630986652708715,
"grad_norm": 1.4765625,
"learning_rate": 2.0722175940897645e-09,
"logits/chosen": -2.8091981410980225,
"logits/rejected": -2.827847719192505,
"logps/chosen": -275.06439208984375,
"logps/rejected": -267.3816833496094,
"loss": 0.6769,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.014090280048549175,
"rewards/margins": 0.03396814316511154,
"rewards/rejected": -0.01987786404788494,
"step": 3680
},
{
"epoch": 0.9657157812091076,
"grad_norm": 2.0,
"learning_rate": 1.7890477894593748e-09,
"logits/chosen": -2.8381717205047607,
"logits/rejected": -2.8093409538269043,
"logps/chosen": -335.9226379394531,
"logps/rejected": -286.562744140625,
"loss": 0.6684,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.035428646951913834,
"rewards/margins": 0.05196043848991394,
"rewards/rejected": -0.016531798988580704,
"step": 3690
},
{
"epoch": 0.9683328971473436,
"grad_norm": 1.2890625,
"learning_rate": 1.5266035279088708e-09,
"logits/chosen": -2.7718958854675293,
"logits/rejected": -2.762516498565674,
"logps/chosen": -317.29156494140625,
"logps/rejected": -293.4043884277344,
"loss": 0.6731,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02600114978849888,
"rewards/margins": 0.042024485766887665,
"rewards/rejected": -0.016023332253098488,
"step": 3700
},
{
"epoch": 0.9683328971473436,
"eval_logits/chosen": -2.850998640060425,
"eval_logits/rejected": -2.8239121437072754,
"eval_logps/chosen": -280.9513854980469,
"eval_logps/rejected": -262.9112854003906,
"eval_loss": 0.6775330901145935,
"eval_rewards/accuracies": 0.6834999918937683,
"eval_rewards/chosen": 0.018221192061901093,
"eval_rewards/margins": 0.03295028209686279,
"eval_rewards/rejected": -0.014729092828929424,
"eval_runtime": 623.4667,
"eval_samples_per_second": 3.208,
"eval_steps_per_second": 0.401,
"step": 3700
},
{
"epoch": 0.9709500130855797,
"grad_norm": 1.5390625,
"learning_rate": 1.2849067234584621e-09,
"logits/chosen": -2.7960381507873535,
"logits/rejected": -2.789794921875,
"logps/chosen": -251.1592559814453,
"logps/rejected": -244.37112426757812,
"loss": 0.6784,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.014368300326168537,
"rewards/margins": 0.031433962285518646,
"rewards/rejected": -0.017065661028027534,
"step": 3710
},
{
"epoch": 0.9735671290238157,
"grad_norm": 1.65625,
"learning_rate": 1.0639775577218625e-09,
"logits/chosen": -2.7958970069885254,
"logits/rejected": -2.7415878772735596,
"logps/chosen": -266.13140869140625,
"logps/rejected": -232.8394775390625,
"loss": 0.677,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.015364277176558971,
"rewards/margins": 0.03415878862142563,
"rewards/rejected": -0.018794508650898933,
"step": 3720
},
{
"epoch": 0.9761842449620518,
"grad_norm": 1.578125,
"learning_rate": 8.638344782207485e-10,
"logits/chosen": -2.802359104156494,
"logits/rejected": -2.7990007400512695,
"logps/chosen": -271.896240234375,
"logps/rejected": -248.9223175048828,
"loss": 0.6758,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.018408995121717453,
"rewards/margins": 0.036677196621894836,
"rewards/rejected": -0.018268201500177383,
"step": 3730
},
{
"epoch": 0.9788013609002879,
"grad_norm": 1.5859375,
"learning_rate": 6.844941968447149e-10,
"logits/chosen": -2.8409347534179688,
"logits/rejected": -2.8161139488220215,
"logps/chosen": -288.1478576660156,
"logps/rejected": -280.65985107421875,
"loss": 0.6694,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.027417827397584915,
"rewards/margins": 0.049938250333070755,
"rewards/rejected": -0.02252042479813099,
"step": 3740
},
{
"epoch": 0.9814184768385239,
"grad_norm": 1.25,
"learning_rate": 5.25971688455612e-10,
"logits/chosen": -2.8641200065612793,
"logits/rejected": -2.8417608737945557,
"logps/chosen": -288.54437255859375,
"logps/rejected": -287.37908935546875,
"loss": 0.6733,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.0248104277998209,
"rewards/margins": 0.04141296073794365,
"rewards/rejected": -0.016602538526058197,
"step": 3750
},
{
"epoch": 0.98403559277676,
"grad_norm": 1.3515625,
"learning_rate": 3.882801896372967e-10,
"logits/chosen": -2.8650496006011963,
"logits/rejected": -2.8601956367492676,
"logps/chosen": -280.47113037109375,
"logps/rejected": -251.7021942138672,
"loss": 0.6775,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.017177898436784744,
"rewards/margins": 0.03319885581731796,
"rewards/rejected": -0.016020962968468666,
"step": 3760
},
{
"epoch": 0.9866527087149961,
"grad_norm": 1.2578125,
"learning_rate": 2.714311975902661e-10,
"logits/chosen": -2.8119492530822754,
"logits/rejected": -2.7749814987182617,
"logps/chosen": -303.2042541503906,
"logps/rejected": -277.75457763671875,
"loss": 0.6761,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.020443648099899292,
"rewards/margins": 0.03567901626229286,
"rewards/rejected": -0.015235371887683868,
"step": 3770
},
{
"epoch": 0.9892698246532321,
"grad_norm": 1.4453125,
"learning_rate": 1.754344691717591e-10,
"logits/chosen": -2.8344626426696777,
"logits/rejected": -2.811007022857666,
"logps/chosen": -266.02685546875,
"logps/rejected": -288.6179504394531,
"loss": 0.6835,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.019811708480119705,
"rewards/margins": 0.020612578839063644,
"rewards/rejected": -0.0008008688455447555,
"step": 3780
},
{
"epoch": 0.9918869405914682,
"grad_norm": 1.4609375,
"learning_rate": 1.0029802008096333e-10,
"logits/chosen": -2.843888521194458,
"logits/rejected": -2.801081418991089,
"logps/chosen": -288.82562255859375,
"logps/rejected": -270.85028076171875,
"loss": 0.6735,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.020014481619000435,
"rewards/margins": 0.04117124527692795,
"rewards/rejected": -0.021156763657927513,
"step": 3790
},
{
"epoch": 0.9945040565297043,
"grad_norm": 1.296875,
"learning_rate": 4.602812418974533e-11,
"logits/chosen": -2.866516351699829,
"logits/rejected": -2.8406145572662354,
"logps/chosen": -301.0005798339844,
"logps/rejected": -279.62908935546875,
"loss": 0.675,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.02747102454304695,
"rewards/margins": 0.03847536817193031,
"rewards/rejected": -0.011004343628883362,
"step": 3800
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -2.850416660308838,
"eval_logits/rejected": -2.8232762813568115,
"eval_logps/chosen": -280.95458984375,
"eval_logps/rejected": -262.90020751953125,
"eval_loss": 0.6776041388511658,
"eval_rewards/accuracies": 0.6855000257492065,
"eval_rewards/chosen": 0.018189024180173874,
"eval_rewards/margins": 0.03280767798423767,
"eval_rewards/rejected": -0.014618655666708946,
"eval_runtime": 622.8726,
"eval_samples_per_second": 3.211,
"eval_steps_per_second": 0.401,
"step": 3800
},
{
"epoch": 0.9971211724679403,
"grad_norm": 1.2890625,
"learning_rate": 1.2629313018819309e-11,
"logits/chosen": -2.8219265937805176,
"logits/rejected": -2.7998881340026855,
"logps/chosen": -272.62823486328125,
"logps/rejected": -255.8968048095703,
"loss": 0.676,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.015002429485321045,
"rewards/margins": 0.03602247312664986,
"rewards/rejected": -0.021020041778683662,
"step": 3810
},
{
"epoch": 0.9997382884061764,
"grad_norm": 3.515625,
"learning_rate": 1.0437535929996855e-13,
"logits/chosen": -2.8465044498443604,
"logits/rejected": -2.825206995010376,
"logps/chosen": -305.00775146484375,
"logps/rejected": -262.2654724121094,
"loss": 0.6739,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.018057797104120255,
"rewards/margins": 0.04028897359967232,
"rewards/rejected": -0.02223118022084236,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.680465580483626,
"train_runtime": 64957.9706,
"train_samples_per_second": 0.941,
"train_steps_per_second": 0.059
}
],
"logging_steps": 10,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}