zephyr-7b-gpo-v6-i1 / trainer_state.json
lole25's picture
Model save
c02105b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998286203941731,
"eval_steps": 500,
"global_step": 2917,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.7123287671232876e-08,
"logits/chosen": -2.21498966217041,
"logits/rejected": -1.5619134902954102,
"logps/chosen": -448.18634033203125,
"logps/rejected": -230.1645965576172,
"loss": 0.1703,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 1.7123287671232878e-07,
"logits/chosen": -1.9158155918121338,
"logits/rejected": -1.947864055633545,
"logps/chosen": -236.8881072998047,
"logps/rejected": -271.3336181640625,
"loss": 0.1345,
"rewards/accuracies": 0.3888888955116272,
"rewards/chosen": 1.0350075172027573e-05,
"rewards/margins": -1.4042092516319826e-05,
"rewards/rejected": 2.4392174964305013e-05,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 3.4246575342465755e-07,
"logits/chosen": -1.9916515350341797,
"logits/rejected": -1.7161877155303955,
"logps/chosen": -181.11163330078125,
"logps/rejected": -196.61138916015625,
"loss": 0.1196,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 4.240129783283919e-06,
"rewards/margins": 0.00010243832366541028,
"rewards/rejected": -9.819817205425352e-05,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 5.136986301369864e-07,
"logits/chosen": -1.9248673915863037,
"logits/rejected": -1.8731294870376587,
"logps/chosen": -227.29898071289062,
"logps/rejected": -272.0036315917969,
"loss": 0.1104,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.00023775253794156015,
"rewards/margins": 0.00018605976947583258,
"rewards/rejected": -0.00042381230741739273,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 6.849315068493151e-07,
"logits/chosen": -1.9346742630004883,
"logits/rejected": -1.6983258724212646,
"logps/chosen": -284.8092346191406,
"logps/rejected": -235.57955932617188,
"loss": 0.103,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00043622878729365766,
"rewards/margins": 0.0012934322003275156,
"rewards/rejected": -0.001729660900309682,
"step": 40
},
{
"epoch": 0.02,
"learning_rate": 8.561643835616439e-07,
"logits/chosen": -1.997926950454712,
"logits/rejected": -1.5733534097671509,
"logps/chosen": -234.2779998779297,
"logps/rejected": -186.01576232910156,
"loss": 0.1645,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.004240738693624735,
"rewards/margins": 0.0029823766089975834,
"rewards/rejected": -0.007223114371299744,
"step": 50
},
{
"epoch": 0.02,
"learning_rate": 1.0273972602739727e-06,
"logits/chosen": -1.8750112056732178,
"logits/rejected": -1.5471408367156982,
"logps/chosen": -226.1807098388672,
"logps/rejected": -231.8741455078125,
"loss": 0.1159,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.026738092303276062,
"rewards/margins": 0.011812428012490273,
"rewards/rejected": -0.038550518453121185,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 1.1986301369863014e-06,
"logits/chosen": -1.8785406351089478,
"logits/rejected": -1.6199915409088135,
"logps/chosen": -309.3338623046875,
"logps/rejected": -310.3027038574219,
"loss": 0.1245,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06837661564350128,
"rewards/margins": 0.023886824026703835,
"rewards/rejected": -0.09226343780755997,
"step": 70
},
{
"epoch": 0.03,
"learning_rate": 1.3698630136986302e-06,
"logits/chosen": -1.8794485330581665,
"logits/rejected": -1.588679552078247,
"logps/chosen": -331.4400939941406,
"logps/rejected": -316.6481018066406,
"loss": 0.1391,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09261558204889297,
"rewards/margins": 0.05595500394701958,
"rewards/rejected": -0.14857056736946106,
"step": 80
},
{
"epoch": 0.03,
"learning_rate": 1.541095890410959e-06,
"logits/chosen": -1.73800790309906,
"logits/rejected": -1.5066587924957275,
"logps/chosen": -349.1307678222656,
"logps/rejected": -445.1373596191406,
"loss": 0.1099,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.1359734833240509,
"rewards/margins": 0.0638502761721611,
"rewards/rejected": -0.1998237520456314,
"step": 90
},
{
"epoch": 0.03,
"learning_rate": 1.7123287671232877e-06,
"logits/chosen": -1.9925429821014404,
"logits/rejected": -1.5489239692687988,
"logps/chosen": -431.8665466308594,
"logps/rejected": -449.634521484375,
"loss": 0.124,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.14928413927555084,
"rewards/margins": 0.0672868937253952,
"rewards/rejected": -0.21657104790210724,
"step": 100
},
{
"epoch": 0.04,
"learning_rate": 1.8835616438356166e-06,
"logits/chosen": -1.938245177268982,
"logits/rejected": -1.5735212564468384,
"logps/chosen": -445.81585693359375,
"logps/rejected": -423.18975830078125,
"loss": 0.1018,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.15499410033226013,
"rewards/margins": 0.051616422832012177,
"rewards/rejected": -0.2066105306148529,
"step": 110
},
{
"epoch": 0.04,
"learning_rate": 2.0547945205479454e-06,
"logits/chosen": -1.8546117544174194,
"logits/rejected": -1.6232588291168213,
"logps/chosen": -401.47039794921875,
"logps/rejected": -448.80926513671875,
"loss": 0.0749,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.13759753108024597,
"rewards/margins": 0.0838770940899849,
"rewards/rejected": -0.22147460281848907,
"step": 120
},
{
"epoch": 0.04,
"learning_rate": 2.2260273972602743e-06,
"logits/chosen": -1.7494646310806274,
"logits/rejected": -1.6120729446411133,
"logps/chosen": -393.23095703125,
"logps/rejected": -414.7491760253906,
"loss": 0.067,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1571560651063919,
"rewards/margins": 0.04211575910449028,
"rewards/rejected": -0.19927182793617249,
"step": 130
},
{
"epoch": 0.05,
"learning_rate": 2.3972602739726027e-06,
"logits/chosen": -1.8314697742462158,
"logits/rejected": -1.4660162925720215,
"logps/chosen": -343.4485778808594,
"logps/rejected": -441.86578369140625,
"loss": 0.1026,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.11567596346139908,
"rewards/margins": 0.09308116137981415,
"rewards/rejected": -0.20875711739063263,
"step": 140
},
{
"epoch": 0.05,
"learning_rate": 2.568493150684932e-06,
"logits/chosen": -1.769061803817749,
"logits/rejected": -1.5675675868988037,
"logps/chosen": -377.51739501953125,
"logps/rejected": -394.83465576171875,
"loss": 0.0902,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.15446873009204865,
"rewards/margins": 0.04394759237766266,
"rewards/rejected": -0.1984163224697113,
"step": 150
},
{
"epoch": 0.05,
"learning_rate": 2.7397260273972604e-06,
"logits/chosen": -1.7472827434539795,
"logits/rejected": -1.5711588859558105,
"logps/chosen": -377.2109069824219,
"logps/rejected": -495.3035583496094,
"loss": 0.0971,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1695915013551712,
"rewards/margins": 0.08974287658929825,
"rewards/rejected": -0.25933438539505005,
"step": 160
},
{
"epoch": 0.06,
"learning_rate": 2.9109589041095893e-06,
"logits/chosen": -1.8495628833770752,
"logits/rejected": -1.5801998376846313,
"logps/chosen": -324.21337890625,
"logps/rejected": -401.52374267578125,
"loss": 0.1013,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.06990089267492294,
"rewards/margins": 0.1042117103934288,
"rewards/rejected": -0.17411258816719055,
"step": 170
},
{
"epoch": 0.06,
"learning_rate": 3.082191780821918e-06,
"logits/chosen": -1.929395079612732,
"logits/rejected": -1.5674490928649902,
"logps/chosen": -332.16339111328125,
"logps/rejected": -346.80450439453125,
"loss": 0.087,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.08584436774253845,
"rewards/margins": 0.055128611624240875,
"rewards/rejected": -0.14097298681735992,
"step": 180
},
{
"epoch": 0.07,
"learning_rate": 3.253424657534247e-06,
"logits/chosen": -2.0753486156463623,
"logits/rejected": -1.6974204778671265,
"logps/chosen": -361.5826721191406,
"logps/rejected": -427.80426025390625,
"loss": 0.0783,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.09311284124851227,
"rewards/margins": 0.08312083035707474,
"rewards/rejected": -0.1762336641550064,
"step": 190
},
{
"epoch": 0.07,
"learning_rate": 3.4246575342465754e-06,
"logits/chosen": -2.206291675567627,
"logits/rejected": -1.9346578121185303,
"logps/chosen": -351.0352783203125,
"logps/rejected": -328.7702941894531,
"loss": 0.0932,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.09371860325336456,
"rewards/margins": 0.030456852167844772,
"rewards/rejected": -0.12417546659708023,
"step": 200
},
{
"epoch": 0.07,
"learning_rate": 3.5958904109589043e-06,
"logits/chosen": -1.9507122039794922,
"logits/rejected": -1.7119086980819702,
"logps/chosen": -292.4835205078125,
"logps/rejected": -361.983154296875,
"loss": 0.0902,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.10735298693180084,
"rewards/margins": 0.070334292948246,
"rewards/rejected": -0.17768728733062744,
"step": 210
},
{
"epoch": 0.08,
"learning_rate": 3.767123287671233e-06,
"logits/chosen": -1.8516258001327515,
"logits/rejected": -1.7747443914413452,
"logps/chosen": -324.3973388671875,
"logps/rejected": -373.698486328125,
"loss": 0.1124,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.1862211674451828,
"rewards/margins": 0.040039997547864914,
"rewards/rejected": -0.226261168718338,
"step": 220
},
{
"epoch": 0.08,
"learning_rate": 3.938356164383562e-06,
"logits/chosen": -2.0984139442443848,
"logits/rejected": -1.8558800220489502,
"logps/chosen": -437.14459228515625,
"logps/rejected": -443.85089111328125,
"loss": 0.0886,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.15925905108451843,
"rewards/margins": 0.06432916969060898,
"rewards/rejected": -0.223588228225708,
"step": 230
},
{
"epoch": 0.08,
"learning_rate": 4.109589041095891e-06,
"logits/chosen": -1.8379871845245361,
"logits/rejected": -1.6939672231674194,
"logps/chosen": -351.00531005859375,
"logps/rejected": -395.9615783691406,
"loss": 0.0919,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1394365429878235,
"rewards/margins": 0.06487082690000534,
"rewards/rejected": -0.20430736243724823,
"step": 240
},
{
"epoch": 0.09,
"learning_rate": 4.28082191780822e-06,
"logits/chosen": -2.0115137100219727,
"logits/rejected": -1.811408281326294,
"logps/chosen": -411.9342346191406,
"logps/rejected": -407.1709289550781,
"loss": 0.085,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.115334153175354,
"rewards/margins": 0.04877592995762825,
"rewards/rejected": -0.16411006450653076,
"step": 250
},
{
"epoch": 0.09,
"learning_rate": 4.4520547945205486e-06,
"logits/chosen": -1.917514443397522,
"logits/rejected": -1.6930125951766968,
"logps/chosen": -409.4349365234375,
"logps/rejected": -449.8536071777344,
"loss": 0.0983,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.15566658973693848,
"rewards/margins": 0.08868524432182312,
"rewards/rejected": -0.24435186386108398,
"step": 260
},
{
"epoch": 0.09,
"learning_rate": 4.6232876712328774e-06,
"logits/chosen": -1.9329277276992798,
"logits/rejected": -1.6485719680786133,
"logps/chosen": -373.9496765136719,
"logps/rejected": -461.1018981933594,
"loss": 0.0747,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.17361843585968018,
"rewards/margins": 0.12690795958042145,
"rewards/rejected": -0.30052638053894043,
"step": 270
},
{
"epoch": 0.1,
"learning_rate": 4.7945205479452054e-06,
"logits/chosen": -2.2051990032196045,
"logits/rejected": -1.7319234609603882,
"logps/chosen": -330.0890197753906,
"logps/rejected": -398.8971252441406,
"loss": 0.1105,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.09975247085094452,
"rewards/margins": 0.12828990817070007,
"rewards/rejected": -0.2280423939228058,
"step": 280
},
{
"epoch": 0.1,
"learning_rate": 4.965753424657534e-06,
"logits/chosen": -2.15606427192688,
"logits/rejected": -2.102695941925049,
"logps/chosen": -421.451171875,
"logps/rejected": -486.7090759277344,
"loss": 0.0951,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.15245939791202545,
"rewards/margins": 0.05438145995140076,
"rewards/rejected": -0.20684084296226501,
"step": 290
},
{
"epoch": 0.1,
"learning_rate": 4.99988541499203e-06,
"logits/chosen": -1.9494987726211548,
"logits/rejected": -1.7490851879119873,
"logps/chosen": -476.6026916503906,
"logps/rejected": -605.9563598632812,
"loss": 0.1238,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.25119680166244507,
"rewards/margins": 0.1370239108800888,
"rewards/rejected": -0.38822072744369507,
"step": 300
},
{
"epoch": 0.11,
"learning_rate": 4.999419931399174e-06,
"logits/chosen": -1.973769187927246,
"logits/rejected": -2.039909839630127,
"logps/chosen": -482.8154296875,
"logps/rejected": -553.8546142578125,
"loss": 0.0801,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.27585893869400024,
"rewards/margins": 0.04923711344599724,
"rewards/rejected": -0.325096070766449,
"step": 310
},
{
"epoch": 0.11,
"learning_rate": 4.998596454278661e-06,
"logits/chosen": -2.062340021133423,
"logits/rejected": -1.94599187374115,
"logps/chosen": -504.60546875,
"logps/rejected": -554.1131591796875,
"loss": 0.0724,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.24708731472492218,
"rewards/margins": 0.06553421169519424,
"rewards/rejected": -0.31262150406837463,
"step": 320
},
{
"epoch": 0.11,
"learning_rate": 4.99741510157765e-06,
"logits/chosen": -1.9648933410644531,
"logits/rejected": -1.780860185623169,
"logps/chosen": -620.1107788085938,
"logps/rejected": -667.7522583007812,
"loss": 0.1007,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.313930869102478,
"rewards/margins": 0.06759389489889145,
"rewards/rejected": -0.3815247416496277,
"step": 330
},
{
"epoch": 0.12,
"learning_rate": 4.995876042502048e-06,
"logits/chosen": -2.1744627952575684,
"logits/rejected": -1.8270336389541626,
"logps/chosen": -484.35101318359375,
"logps/rejected": -553.4263305664062,
"loss": 0.1147,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2436065375804901,
"rewards/margins": 0.08232472836971283,
"rewards/rejected": -0.32593125104904175,
"step": 340
},
{
"epoch": 0.12,
"learning_rate": 4.993979497492282e-06,
"logits/chosen": -1.914698600769043,
"logits/rejected": -1.7681747674942017,
"logps/chosen": -472.70794677734375,
"logps/rejected": -629.3675537109375,
"loss": 0.1157,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2542678713798523,
"rewards/margins": 0.09860087186098099,
"rewards/rejected": -0.3528687059879303,
"step": 350
},
{
"epoch": 0.12,
"learning_rate": 4.9917257381917115e-06,
"logits/chosen": -2.0197696685791016,
"logits/rejected": -1.7743446826934814,
"logps/chosen": -639.1318359375,
"logps/rejected": -686.443603515625,
"loss": 0.1203,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3869578242301941,
"rewards/margins": 0.06751660257577896,
"rewards/rejected": -0.45447444915771484,
"step": 360
},
{
"epoch": 0.13,
"learning_rate": 4.989115087407737e-06,
"logits/chosen": -1.904762864112854,
"logits/rejected": -1.8776410818099976,
"logps/chosen": -539.9891357421875,
"logps/rejected": -601.7970581054688,
"loss": 0.0625,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3085237145423889,
"rewards/margins": 0.08412410318851471,
"rewards/rejected": -0.3926478326320648,
"step": 370
},
{
"epoch": 0.13,
"learning_rate": 4.986147919065551e-06,
"logits/chosen": -1.876813530921936,
"logits/rejected": -1.7952611446380615,
"logps/chosen": -488.16107177734375,
"logps/rejected": -583.83056640625,
"loss": 0.0983,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.25909119844436646,
"rewards/margins": 0.04833118990063667,
"rewards/rejected": -0.30742236971855164,
"step": 380
},
{
"epoch": 0.13,
"learning_rate": 4.982824658154589e-06,
"logits/chosen": -2.1078734397888184,
"logits/rejected": -1.9783456325531006,
"logps/chosen": -396.71630859375,
"logps/rejected": -487.6923828125,
"loss": 0.0596,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.23540160059928894,
"rewards/margins": 0.09165269136428833,
"rewards/rejected": -0.32705432176589966,
"step": 390
},
{
"epoch": 0.14,
"learning_rate": 4.979145780667652e-06,
"logits/chosen": -2.0578982830047607,
"logits/rejected": -1.7134244441986084,
"logps/chosen": -535.1932983398438,
"logps/rejected": -577.6029663085938,
"loss": 0.0977,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.24049289524555206,
"rewards/margins": 0.0864410474896431,
"rewards/rejected": -0.32693392038345337,
"step": 400
},
{
"epoch": 0.14,
"learning_rate": 4.975111813532733e-06,
"logits/chosen": -1.844091773033142,
"logits/rejected": -1.5848596096038818,
"logps/chosen": -569.6875,
"logps/rejected": -624.45849609375,
"loss": 0.0983,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.30232542753219604,
"rewards/margins": 0.08498513698577881,
"rewards/rejected": -0.38731056451797485,
"step": 410
},
{
"epoch": 0.14,
"learning_rate": 4.970723334537547e-06,
"logits/chosen": -1.7672590017318726,
"logits/rejected": -1.5241564512252808,
"logps/chosen": -493.00628662109375,
"logps/rejected": -542.5633544921875,
"loss": 0.0683,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2543202340602875,
"rewards/margins": 0.0733615979552269,
"rewards/rejected": -0.32768186926841736,
"step": 420
},
{
"epoch": 0.15,
"learning_rate": 4.965980972246767e-06,
"logits/chosen": -1.9794318675994873,
"logits/rejected": -1.9301669597625732,
"logps/chosen": -460.9219665527344,
"logps/rejected": -573.4908447265625,
"loss": 0.1151,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.26725929975509644,
"rewards/margins": 0.08439052850008011,
"rewards/rejected": -0.3516498804092407,
"step": 430
},
{
"epoch": 0.15,
"learning_rate": 4.960885405912001e-06,
"logits/chosen": -1.9738355875015259,
"logits/rejected": -1.8872768878936768,
"logps/chosen": -490.4803771972656,
"logps/rejected": -552.944091796875,
"loss": 0.1051,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.22316575050354004,
"rewards/margins": 0.07666633278131485,
"rewards/rejected": -0.2998320460319519,
"step": 440
},
{
"epoch": 0.15,
"learning_rate": 4.955437365374499e-06,
"logits/chosen": -2.042649507522583,
"logits/rejected": -1.833367943763733,
"logps/chosen": -427.7418518066406,
"logps/rejected": -428.240966796875,
"loss": 0.0845,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.15850776433944702,
"rewards/margins": 0.04146740958094597,
"rewards/rejected": -0.1999751627445221,
"step": 450
},
{
"epoch": 0.16,
"learning_rate": 4.949637630960618e-06,
"logits/chosen": -2.2080771923065186,
"logits/rejected": -1.9125267267227173,
"logps/chosen": -363.9242858886719,
"logps/rejected": -413.38140869140625,
"loss": 0.1212,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.151325061917305,
"rewards/margins": 0.08935161679983139,
"rewards/rejected": -0.2406766712665558,
"step": 460
},
{
"epoch": 0.16,
"learning_rate": 4.943487033370056e-06,
"logits/chosen": -2.1318681240081787,
"logits/rejected": -1.7986198663711548,
"logps/chosen": -613.80712890625,
"logps/rejected": -721.6338500976562,
"loss": 0.0967,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3193725049495697,
"rewards/margins": 0.10743044316768646,
"rewards/rejected": -0.42680296301841736,
"step": 470
},
{
"epoch": 0.16,
"learning_rate": 4.936986453556871e-06,
"logits/chosen": -2.0002779960632324,
"logits/rejected": -1.736196756362915,
"logps/chosen": -580.8267211914062,
"logps/rejected": -735.5391845703125,
"loss": 0.0949,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3693394362926483,
"rewards/margins": 0.12292595952749252,
"rewards/rejected": -0.49226540327072144,
"step": 480
},
{
"epoch": 0.17,
"learning_rate": 4.930136822603299e-06,
"logits/chosen": -1.894122838973999,
"logits/rejected": -1.7605243921279907,
"logps/chosen": -661.7432250976562,
"logps/rejected": -752.2081909179688,
"loss": 0.0547,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.40788716077804565,
"rewards/margins": 0.11583086103200912,
"rewards/rejected": -0.5237180590629578,
"step": 490
},
{
"epoch": 0.17,
"learning_rate": 4.922939121586396e-06,
"logits/chosen": -1.8540306091308594,
"logits/rejected": -1.6888984441757202,
"logps/chosen": -703.7432861328125,
"logps/rejected": -763.1231689453125,
"loss": 0.1083,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4497678875923157,
"rewards/margins": 0.08937375247478485,
"rewards/rejected": -0.5391416549682617,
"step": 500
},
{
"epoch": 0.17,
"learning_rate": 4.915394381437517e-06,
"logits/chosen": -2.064244508743286,
"logits/rejected": -1.787021279335022,
"logps/chosen": -432.42877197265625,
"logps/rejected": -491.94720458984375,
"loss": 0.1268,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.30147069692611694,
"rewards/margins": 0.08128456771373749,
"rewards/rejected": -0.3827553391456604,
"step": 510
},
{
"epoch": 0.18,
"learning_rate": 4.907503682794656e-06,
"logits/chosen": -2.1678388118743896,
"logits/rejected": -1.9495502710342407,
"logps/chosen": -532.3627319335938,
"logps/rejected": -569.6402587890625,
"loss": 0.106,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.314094603061676,
"rewards/margins": 0.050387926399707794,
"rewards/rejected": -0.36448249220848083,
"step": 520
},
{
"epoch": 0.18,
"learning_rate": 4.899268155847667e-06,
"logits/chosen": -1.9529002904891968,
"logits/rejected": -1.7975317239761353,
"logps/chosen": -348.3830871582031,
"logps/rejected": -417.27947998046875,
"loss": 0.0803,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.20450131595134735,
"rewards/margins": 0.07301940768957138,
"rewards/rejected": -0.2775207459926605,
"step": 530
},
{
"epoch": 0.19,
"learning_rate": 4.890688980176381e-06,
"logits/chosen": -2.286426067352295,
"logits/rejected": -1.9330307245254517,
"logps/chosen": -511.94647216796875,
"logps/rejected": -562.6546630859375,
"loss": 0.0752,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.24677078425884247,
"rewards/margins": 0.09590072929859161,
"rewards/rejected": -0.34267157316207886,
"step": 540
},
{
"epoch": 0.19,
"learning_rate": 4.881767384581658e-06,
"logits/chosen": -2.3378746509552,
"logits/rejected": -2.0311076641082764,
"logps/chosen": -499.9852600097656,
"logps/rejected": -580.5631103515625,
"loss": 0.0586,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.24438512325286865,
"rewards/margins": 0.08834482729434967,
"rewards/rejected": -0.3327299654483795,
"step": 550
},
{
"epoch": 0.19,
"learning_rate": 4.872504646909387e-06,
"logits/chosen": -2.234276056289673,
"logits/rejected": -1.9043071269989014,
"logps/chosen": -550.0889282226562,
"logps/rejected": -577.0806884765625,
"loss": 0.0994,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.29700541496276855,
"rewards/margins": 0.07648201286792755,
"rewards/rejected": -0.3734873831272125,
"step": 560
},
{
"epoch": 0.2,
"learning_rate": 4.8629020938674536e-06,
"logits/chosen": -2.295952558517456,
"logits/rejected": -1.7963495254516602,
"logps/chosen": -415.9605407714844,
"logps/rejected": -460.894287109375,
"loss": 0.0775,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.14038066565990448,
"rewards/margins": 0.0807698667049408,
"rewards/rejected": -0.22115054726600647,
"step": 570
},
{
"epoch": 0.2,
"learning_rate": 4.852961100835717e-06,
"logits/chosen": -2.271327257156372,
"logits/rejected": -1.7637627124786377,
"logps/chosen": -500.87481689453125,
"logps/rejected": -481.28228759765625,
"loss": 0.1147,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.21432165801525116,
"rewards/margins": 0.062083542346954346,
"rewards/rejected": -0.2764051854610443,
"step": 580
},
{
"epoch": 0.2,
"learning_rate": 4.84268309166902e-06,
"logits/chosen": -1.9748462438583374,
"logits/rejected": -1.959495186805725,
"logps/chosen": -416.51531982421875,
"logps/rejected": -488.2625427246094,
"loss": 0.0904,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.24128298461437225,
"rewards/margins": 0.043503545224666595,
"rewards/rejected": -0.28478652238845825,
"step": 590
},
{
"epoch": 0.21,
"learning_rate": 4.832069538493237e-06,
"logits/chosen": -2.1981587409973145,
"logits/rejected": -1.8468116521835327,
"logps/chosen": -520.8303833007812,
"logps/rejected": -545.0802001953125,
"loss": 0.0926,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.259421169757843,
"rewards/margins": 0.06593780219554901,
"rewards/rejected": -0.3253589868545532,
"step": 600
},
{
"epoch": 0.21,
"learning_rate": 4.821121961494431e-06,
"logits/chosen": -2.008756160736084,
"logits/rejected": -1.8565582036972046,
"logps/chosen": -487.271240234375,
"logps/rejected": -622.4865112304688,
"loss": 0.1233,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.26701903343200684,
"rewards/margins": 0.1124114841222763,
"rewards/rejected": -0.37943053245544434,
"step": 610
},
{
"epoch": 0.21,
"learning_rate": 4.80984192870111e-06,
"logits/chosen": -2.2143161296844482,
"logits/rejected": -2.0251948833465576,
"logps/chosen": -475.0022888183594,
"logps/rejected": -536.0418090820312,
"loss": 0.1049,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.23833520710468292,
"rewards/margins": 0.08123020827770233,
"rewards/rejected": -0.31956541538238525,
"step": 620
},
{
"epoch": 0.22,
"learning_rate": 4.798231055759643e-06,
"logits/chosen": -2.1074166297912598,
"logits/rejected": -1.8376226425170898,
"logps/chosen": -575.6253662109375,
"logps/rejected": -653.4076538085938,
"loss": 0.0738,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.31133222579956055,
"rewards/margins": 0.125440776348114,
"rewards/rejected": -0.43677300214767456,
"step": 630
},
{
"epoch": 0.22,
"learning_rate": 4.786291005702841e-06,
"logits/chosen": -1.996763825416565,
"logits/rejected": -1.8434244394302368,
"logps/chosen": -673.5606689453125,
"logps/rejected": -735.9039306640625,
"loss": 0.0861,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.4302713871002197,
"rewards/margins": 0.07868895679712296,
"rewards/rejected": -0.5089603662490845,
"step": 640
},
{
"epoch": 0.22,
"learning_rate": 4.7740234887117745e-06,
"logits/chosen": -2.1286087036132812,
"logits/rejected": -2.0892319679260254,
"logps/chosen": -660.74267578125,
"logps/rejected": -708.2584838867188,
"loss": 0.0776,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3790927827358246,
"rewards/margins": 0.09251677989959717,
"rewards/rejected": -0.47160959243774414,
"step": 650
},
{
"epoch": 0.23,
"learning_rate": 4.761430261870804e-06,
"logits/chosen": -2.271576404571533,
"logits/rejected": -2.0834946632385254,
"logps/chosen": -636.5614624023438,
"logps/rejected": -729.883544921875,
"loss": 0.0955,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.3770856559276581,
"rewards/margins": 0.0929422676563263,
"rewards/rejected": -0.4700279235839844,
"step": 660
},
{
"epoch": 0.23,
"learning_rate": 4.748513128915928e-06,
"logits/chosen": -2.1836562156677246,
"logits/rejected": -1.9008392095565796,
"logps/chosen": -611.7904663085938,
"logps/rejected": -666.9830932617188,
"loss": 0.0629,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.35665163397789,
"rewards/margins": 0.07555453479290009,
"rewards/rejected": -0.4322062134742737,
"step": 670
},
{
"epoch": 0.23,
"learning_rate": 4.735273939976425e-06,
"logits/chosen": -2.0491878986358643,
"logits/rejected": -1.8870747089385986,
"logps/chosen": -617.8563842773438,
"logps/rejected": -739.245361328125,
"loss": 0.0922,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3806142210960388,
"rewards/margins": 0.11178413778543472,
"rewards/rejected": -0.49239835143089294,
"step": 680
},
{
"epoch": 0.24,
"learning_rate": 4.721714591309859e-06,
"logits/chosen": -2.241105079650879,
"logits/rejected": -1.764789342880249,
"logps/chosen": -493.39361572265625,
"logps/rejected": -587.4572143554688,
"loss": 0.0869,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.25177472829818726,
"rewards/margins": 0.11156761646270752,
"rewards/rejected": -0.3633423447608948,
"step": 690
},
{
"epoch": 0.24,
"learning_rate": 4.707837025030478e-06,
"logits/chosen": -2.0533032417297363,
"logits/rejected": -1.9060271978378296,
"logps/chosen": -480.1116638183594,
"logps/rejected": -561.89599609375,
"loss": 0.0799,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.291046142578125,
"rewards/margins": 0.0875721424818039,
"rewards/rejected": -0.3786182999610901,
"step": 700
},
{
"epoch": 0.24,
"learning_rate": 4.693643228831046e-06,
"logits/chosen": -2.1423745155334473,
"logits/rejected": -1.8586080074310303,
"logps/chosen": -486.69598388671875,
"logps/rejected": -578.1256713867188,
"loss": 0.0801,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2516762316226959,
"rewards/margins": 0.10918694734573364,
"rewards/rejected": -0.3608631491661072,
"step": 710
},
{
"epoch": 0.25,
"learning_rate": 4.67913523569814e-06,
"logits/chosen": -2.124239444732666,
"logits/rejected": -1.8211021423339844,
"logps/chosen": -505.65789794921875,
"logps/rejected": -655.1524658203125,
"loss": 0.0577,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2613288164138794,
"rewards/margins": 0.16436012089252472,
"rewards/rejected": -0.4256889224052429,
"step": 720
},
{
"epoch": 0.25,
"learning_rate": 4.664315123620965e-06,
"logits/chosen": -2.059915542602539,
"logits/rejected": -1.8637025356292725,
"logps/chosen": -627.88232421875,
"logps/rejected": -771.10595703125,
"loss": 0.0728,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.3556235432624817,
"rewards/margins": 0.15885277092456818,
"rewards/rejected": -0.5144763588905334,
"step": 730
},
{
"epoch": 0.25,
"learning_rate": 4.649185015293728e-06,
"logits/chosen": -2.202380895614624,
"logits/rejected": -1.7330690622329712,
"logps/chosen": -586.8130493164062,
"logps/rejected": -704.4319458007812,
"loss": 0.0514,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3131484091281891,
"rewards/margins": 0.13473856449127197,
"rewards/rejected": -0.44788694381713867,
"step": 740
},
{
"epoch": 0.26,
"learning_rate": 4.6337470778115946e-06,
"logits/chosen": -2.2767772674560547,
"logits/rejected": -1.9689744710922241,
"logps/chosen": -588.1864013671875,
"logps/rejected": -624.829345703125,
"loss": 0.0587,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.2952669858932495,
"rewards/margins": 0.10641799122095108,
"rewards/rejected": -0.4016849398612976,
"step": 750
},
{
"epoch": 0.26,
"learning_rate": 4.6180035223603e-06,
"logits/chosen": -2.1548593044281006,
"logits/rejected": -1.7463791370391846,
"logps/chosen": -608.1296997070312,
"logps/rejected": -624.1170043945312,
"loss": 0.0462,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3367452323436737,
"rewards/margins": 0.07480922341346741,
"rewards/rejected": -0.4115544855594635,
"step": 760
},
{
"epoch": 0.26,
"learning_rate": 4.60195660389944e-06,
"logits/chosen": -1.9999465942382812,
"logits/rejected": -1.7375587224960327,
"logps/chosen": -535.9517822265625,
"logps/rejected": -687.1197509765625,
"loss": 0.1019,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3230445384979248,
"rewards/margins": 0.11864666640758514,
"rewards/rejected": -0.44169121980667114,
"step": 770
},
{
"epoch": 0.27,
"learning_rate": 4.585608620839487e-06,
"logits/chosen": -2.0938560962677,
"logits/rejected": -1.678571343421936,
"logps/chosen": -540.8283081054688,
"logps/rejected": -611.6381225585938,
"loss": 0.0954,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3455389142036438,
"rewards/margins": 0.125118687748909,
"rewards/rejected": -0.4706575870513916,
"step": 780
},
{
"epoch": 0.27,
"learning_rate": 4.56896191471259e-06,
"logits/chosen": -2.2690582275390625,
"logits/rejected": -1.8676296472549438,
"logps/chosen": -589.1588745117188,
"logps/rejected": -705.4854736328125,
"loss": 0.0739,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3118017613887787,
"rewards/margins": 0.1364874541759491,
"rewards/rejected": -0.4482892155647278,
"step": 790
},
{
"epoch": 0.27,
"learning_rate": 4.552018869837197e-06,
"logits/chosen": -2.1564137935638428,
"logits/rejected": -1.83013916015625,
"logps/chosen": -609.3465576171875,
"logps/rejected": -699.4984130859375,
"loss": 0.0745,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3426007628440857,
"rewards/margins": 0.13322630524635315,
"rewards/rejected": -0.47582703828811646,
"step": 800
},
{
"epoch": 0.28,
"learning_rate": 4.534781912976546e-06,
"logits/chosen": -2.1592516899108887,
"logits/rejected": -1.8255417346954346,
"logps/chosen": -491.94366455078125,
"logps/rejected": -572.9866943359375,
"loss": 0.0519,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.29055461287498474,
"rewards/margins": 0.10878726094961166,
"rewards/rejected": -0.399341881275177,
"step": 810
},
{
"epoch": 0.28,
"learning_rate": 4.517253512991077e-06,
"logits/chosen": -2.1750411987304688,
"logits/rejected": -1.8149001598358154,
"logps/chosen": -588.0299682617188,
"logps/rejected": -729.2427368164062,
"loss": 0.0431,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3461843430995941,
"rewards/margins": 0.1527387797832489,
"rewards/rejected": -0.498923122882843,
"step": 820
},
{
"epoch": 0.28,
"learning_rate": 4.499436180484816e-06,
"logits/chosen": -2.0803980827331543,
"logits/rejected": -1.8419866561889648,
"logps/chosen": -657.2296142578125,
"logps/rejected": -715.0289916992188,
"loss": 0.0468,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3858215808868408,
"rewards/margins": 0.1072002500295639,
"rewards/rejected": -0.49302178621292114,
"step": 830
},
{
"epoch": 0.29,
"learning_rate": 4.481332467445784e-06,
"logits/chosen": -2.1348459720611572,
"logits/rejected": -1.8123409748077393,
"logps/chosen": -555.2808837890625,
"logps/rejected": -704.3569946289062,
"loss": 0.0561,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.36617225408554077,
"rewards/margins": 0.1446944773197174,
"rewards/rejected": -0.5108667016029358,
"step": 840
},
{
"epoch": 0.29,
"learning_rate": 4.462944966880464e-06,
"logits/chosen": -2.137538433074951,
"logits/rejected": -1.9754664897918701,
"logps/chosen": -614.5938110351562,
"logps/rejected": -646.7432250976562,
"loss": 0.0661,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.3780440390110016,
"rewards/margins": 0.04127226397395134,
"rewards/rejected": -0.41931629180908203,
"step": 850
},
{
"epoch": 0.29,
"learning_rate": 4.444276312442415e-06,
"logits/chosen": -2.0289080142974854,
"logits/rejected": -1.7629003524780273,
"logps/chosen": -541.0709838867188,
"logps/rejected": -660.1776123046875,
"loss": 0.1268,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3162756562232971,
"rewards/margins": 0.12853361666202545,
"rewards/rejected": -0.4448092579841614,
"step": 860
},
{
"epoch": 0.3,
"learning_rate": 4.425329178055044e-06,
"logits/chosen": -2.194471597671509,
"logits/rejected": -2.0296597480773926,
"logps/chosen": -475.4537048339844,
"logps/rejected": -510.2806701660156,
"loss": 0.0885,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2241251915693283,
"rewards/margins": 0.07522304356098175,
"rewards/rejected": -0.29934826493263245,
"step": 870
},
{
"epoch": 0.3,
"learning_rate": 4.40610627752862e-06,
"logits/chosen": -2.2251474857330322,
"logits/rejected": -1.7184536457061768,
"logps/chosen": -499.7225036621094,
"logps/rejected": -632.2643432617188,
"loss": 0.0669,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2541348338127136,
"rewards/margins": 0.1577138453722,
"rewards/rejected": -0.4118487238883972,
"step": 880
},
{
"epoch": 0.31,
"learning_rate": 4.386610364171575e-06,
"logits/chosen": -2.0547173023223877,
"logits/rejected": -1.9380995035171509,
"logps/chosen": -583.1637573242188,
"logps/rejected": -675.6350708007812,
"loss": 0.0608,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.31417426466941833,
"rewards/margins": 0.07938437163829803,
"rewards/rejected": -0.3935586214065552,
"step": 890
},
{
"epoch": 0.31,
"learning_rate": 4.366844230396145e-06,
"logits/chosen": -2.1797802448272705,
"logits/rejected": -1.739689588546753,
"logps/chosen": -619.1736450195312,
"logps/rejected": -779.1910400390625,
"loss": 0.0831,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.354561984539032,
"rewards/margins": 0.16057774424552917,
"rewards/rejected": -0.5151397585868835,
"step": 900
},
{
"epoch": 0.31,
"learning_rate": 4.346810707318409e-06,
"logits/chosen": -2.101902723312378,
"logits/rejected": -1.756699800491333,
"logps/chosen": -550.2091064453125,
"logps/rejected": -652.6905517578125,
"loss": 0.0876,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.34942546486854553,
"rewards/margins": 0.1116580218076706,
"rewards/rejected": -0.4610835015773773,
"step": 910
},
{
"epoch": 0.32,
"learning_rate": 4.326512664352788e-06,
"logits/chosen": -2.1261112689971924,
"logits/rejected": -1.695481538772583,
"logps/chosen": -602.4801635742188,
"logps/rejected": -667.7581176757812,
"loss": 0.0785,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.3986489176750183,
"rewards/margins": 0.07721661776304245,
"rewards/rejected": -0.47586554288864136,
"step": 920
},
{
"epoch": 0.32,
"learning_rate": 4.30595300880106e-06,
"logits/chosen": -1.9621734619140625,
"logits/rejected": -1.8363412618637085,
"logps/chosen": -516.5333862304688,
"logps/rejected": -650.1600952148438,
"loss": 0.0876,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3551289439201355,
"rewards/margins": 0.12143947929143906,
"rewards/rejected": -0.47656846046447754,
"step": 930
},
{
"epoch": 0.32,
"learning_rate": 4.285134685435941e-06,
"logits/chosen": -2.111262321472168,
"logits/rejected": -1.8083570003509521,
"logps/chosen": -619.1475830078125,
"logps/rejected": -671.5328369140625,
"loss": 0.069,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3735567033290863,
"rewards/margins": 0.09376771748065948,
"rewards/rejected": -0.467324435710907,
"step": 940
},
{
"epoch": 0.33,
"learning_rate": 4.264060676079302e-06,
"logits/chosen": -1.897774338722229,
"logits/rejected": -1.758368730545044,
"logps/chosen": -648.8074951171875,
"logps/rejected": -753.0172119140625,
"loss": 0.099,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3972818851470947,
"rewards/margins": 0.11452829837799072,
"rewards/rejected": -0.5118101239204407,
"step": 950
},
{
"epoch": 0.33,
"learning_rate": 4.242733999175087e-06,
"logits/chosen": -2.1442208290100098,
"logits/rejected": -1.8442842960357666,
"logps/chosen": -574.822265625,
"logps/rejected": -674.7249145507812,
"loss": 0.0727,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3319811522960663,
"rewards/margins": 0.10243819653987885,
"rewards/rejected": -0.4344193935394287,
"step": 960
},
{
"epoch": 0.33,
"learning_rate": 4.221157709356973e-06,
"logits/chosen": -2.069833517074585,
"logits/rejected": -2.053417682647705,
"logps/chosen": -473.2567443847656,
"logps/rejected": -551.2691650390625,
"loss": 0.0748,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.28391993045806885,
"rewards/margins": 0.07798723131418228,
"rewards/rejected": -0.3619071841239929,
"step": 970
},
{
"epoch": 0.34,
"learning_rate": 4.199334897010857e-06,
"logits/chosen": -2.428363561630249,
"logits/rejected": -1.9904381036758423,
"logps/chosen": -574.2955932617188,
"logps/rejected": -636.1633911132812,
"loss": 0.0508,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.29294562339782715,
"rewards/margins": 0.11134348064661026,
"rewards/rejected": -0.404289186000824,
"step": 980
},
{
"epoch": 0.34,
"learning_rate": 4.177268687832216e-06,
"logits/chosen": -2.2618508338928223,
"logits/rejected": -1.9453493356704712,
"logps/chosen": -583.1436767578125,
"logps/rejected": -659.47998046875,
"loss": 0.0492,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3153453767299652,
"rewards/margins": 0.0904412493109703,
"rewards/rejected": -0.4057866036891937,
"step": 990
},
{
"epoch": 0.34,
"learning_rate": 4.154962242378413e-06,
"logits/chosen": -2.2178263664245605,
"logits/rejected": -1.5476510524749756,
"logps/chosen": -663.134765625,
"logps/rejected": -688.6644287109375,
"loss": 0.0723,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3838910460472107,
"rewards/margins": 0.10614491999149323,
"rewards/rejected": -0.4900360107421875,
"step": 1000
},
{
"epoch": 0.35,
"learning_rate": 4.132418755616006e-06,
"logits/chosen": -2.1056065559387207,
"logits/rejected": -1.8169822692871094,
"logps/chosen": -689.2779541015625,
"logps/rejected": -805.4830322265625,
"loss": 0.0795,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.42020684480667114,
"rewards/margins": 0.09372207522392273,
"rewards/rejected": -0.5139288902282715,
"step": 1010
},
{
"epoch": 0.35,
"learning_rate": 4.109641456463135e-06,
"logits/chosen": -2.270031452178955,
"logits/rejected": -2.0831856727600098,
"logps/chosen": -583.760009765625,
"logps/rejected": -599.2212524414062,
"loss": 0.1385,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.3357154428958893,
"rewards/margins": 0.05204144865274429,
"rewards/rejected": -0.387756884098053,
"step": 1020
},
{
"epoch": 0.35,
"learning_rate": 4.086633607327036e-06,
"logits/chosen": -1.9891811609268188,
"logits/rejected": -1.892112135887146,
"logps/chosen": -592.84814453125,
"logps/rejected": -705.7501831054688,
"loss": 0.0726,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.40938377380371094,
"rewards/margins": 0.07667826116085052,
"rewards/rejected": -0.4860619902610779,
"step": 1030
},
{
"epoch": 0.36,
"learning_rate": 4.06339850363677e-06,
"logits/chosen": -2.229407548904419,
"logits/rejected": -1.667838454246521,
"logps/chosen": -679.7462158203125,
"logps/rejected": -710.1227416992188,
"loss": 0.0926,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.380540132522583,
"rewards/margins": 0.09510533511638641,
"rewards/rejected": -0.4756454527378082,
"step": 1040
},
{
"epoch": 0.36,
"learning_rate": 4.039939473371213e-06,
"logits/chosen": -2.238617420196533,
"logits/rejected": -1.8673069477081299,
"logps/chosen": -528.8187255859375,
"logps/rejected": -692.1638793945312,
"loss": 0.0882,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.32173532247543335,
"rewards/margins": 0.16088572144508362,
"rewards/rejected": -0.48262104392051697,
"step": 1050
},
{
"epoch": 0.36,
"learning_rate": 4.01625987658239e-06,
"logits/chosen": -2.209980010986328,
"logits/rejected": -1.9042339324951172,
"logps/chosen": -692.3306884765625,
"logps/rejected": -672.4944458007812,
"loss": 0.0636,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3526589572429657,
"rewards/margins": 0.04882120341062546,
"rewards/rejected": -0.40148013830184937,
"step": 1060
},
{
"epoch": 0.37,
"learning_rate": 3.992363104914211e-06,
"logits/chosen": -2.256624698638916,
"logits/rejected": -1.9974693059921265,
"logps/chosen": -560.591796875,
"logps/rejected": -578.005859375,
"loss": 0.0446,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.27544528245925903,
"rewards/margins": 0.07316794246435165,
"rewards/rejected": -0.3486132025718689,
"step": 1070
},
{
"epoch": 0.37,
"learning_rate": 3.9682525811166835e-06,
"logits/chosen": -2.007150173187256,
"logits/rejected": -1.7732871770858765,
"logps/chosen": -560.3460693359375,
"logps/rejected": -629.5783081054688,
"loss": 0.0836,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.37917017936706543,
"rewards/margins": 0.04371767118573189,
"rewards/rejected": -0.4228878617286682,
"step": 1080
},
{
"epoch": 0.37,
"learning_rate": 3.943931758555669e-06,
"logits/chosen": -2.0913567543029785,
"logits/rejected": -1.7354761362075806,
"logps/chosen": -608.615478515625,
"logps/rejected": -674.1383666992188,
"loss": 0.0904,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3448850214481354,
"rewards/margins": 0.10200424492359161,
"rewards/rejected": -0.4468892514705658,
"step": 1090
},
{
"epoch": 0.38,
"learning_rate": 3.91940412071826e-06,
"logits/chosen": -2.225922107696533,
"logits/rejected": -1.8326537609100342,
"logps/chosen": -651.896728515625,
"logps/rejected": -687.8331298828125,
"loss": 0.0807,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3168756365776062,
"rewards/margins": 0.10071317106485367,
"rewards/rejected": -0.4175888001918793,
"step": 1100
},
{
"epoch": 0.38,
"learning_rate": 3.894673180713829e-06,
"logits/chosen": -2.0696487426757812,
"logits/rejected": -1.9062414169311523,
"logps/chosen": -542.2778930664062,
"logps/rejected": -659.351318359375,
"loss": 0.1094,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.29152894020080566,
"rewards/margins": 0.09752384573221207,
"rewards/rejected": -0.38905277848243713,
"step": 1110
},
{
"epoch": 0.38,
"learning_rate": 3.869742480770855e-06,
"logits/chosen": -2.371598958969116,
"logits/rejected": -2.1790008544921875,
"logps/chosen": -508.5203552246094,
"logps/rejected": -568.6731567382812,
"loss": 0.0912,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2144095003604889,
"rewards/margins": 0.08987125009298325,
"rewards/rejected": -0.30428069829940796,
"step": 1120
},
{
"epoch": 0.39,
"learning_rate": 3.844615591729558e-06,
"logits/chosen": -2.045975685119629,
"logits/rejected": -1.9603124856948853,
"logps/chosen": -437.69635009765625,
"logps/rejected": -593.5106201171875,
"loss": 0.0675,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.2701892852783203,
"rewards/margins": 0.0922120064496994,
"rewards/rejected": -0.3624013364315033,
"step": 1130
},
{
"epoch": 0.39,
"learning_rate": 3.819296112530448e-06,
"logits/chosen": -1.885154128074646,
"logits/rejected": -1.9763774871826172,
"logps/chosen": -534.5982055664062,
"logps/rejected": -644.5667114257812,
"loss": 0.103,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3009013533592224,
"rewards/margins": 0.07459478080272675,
"rewards/rejected": -0.37549614906311035,
"step": 1140
},
{
"epoch": 0.39,
"learning_rate": 3.7937876696988505e-06,
"logits/chosen": -2.228935718536377,
"logits/rejected": -1.992790937423706,
"logps/chosen": -595.5947265625,
"logps/rejected": -676.4202880859375,
"loss": 0.0764,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.33396145701408386,
"rewards/margins": 0.09032727777957916,
"rewards/rejected": -0.4242887496948242,
"step": 1150
},
{
"epoch": 0.4,
"learning_rate": 3.7680939168254733e-06,
"logits/chosen": -2.148974657058716,
"logits/rejected": -1.8327066898345947,
"logps/chosen": -651.0219116210938,
"logps/rejected": -701.9970092773438,
"loss": 0.0987,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.37643447518348694,
"rewards/margins": 0.08819916099309921,
"rewards/rejected": -0.46463364362716675,
"step": 1160
},
{
"epoch": 0.4,
"learning_rate": 3.7422185340430983e-06,
"logits/chosen": -2.2028675079345703,
"logits/rejected": -2.0429883003234863,
"logps/chosen": -584.5667114257812,
"logps/rejected": -641.1693115234375,
"loss": 0.0937,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.34211236238479614,
"rewards/margins": 0.09406879544258118,
"rewards/rejected": -0.4361811578273773,
"step": 1170
},
{
"epoch": 0.4,
"learning_rate": 3.71616522749948e-06,
"logits/chosen": -2.465292453765869,
"logits/rejected": -2.052821159362793,
"logps/chosen": -604.7883911132812,
"logps/rejected": -693.3052978515625,
"loss": 0.0774,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.33726420998573303,
"rewards/margins": 0.13398997485637665,
"rewards/rejected": -0.4712541103363037,
"step": 1180
},
{
"epoch": 0.41,
"learning_rate": 3.6899377288265043e-06,
"logits/chosen": -2.0992684364318848,
"logits/rejected": -1.9688608646392822,
"logps/chosen": -596.2180786132812,
"logps/rejected": -685.1240844726562,
"loss": 0.0809,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3490615487098694,
"rewards/margins": 0.11136557906866074,
"rewards/rejected": -0.46042710542678833,
"step": 1190
},
{
"epoch": 0.41,
"learning_rate": 3.6635397946057114e-06,
"logits/chosen": -2.260376453399658,
"logits/rejected": -1.8176358938217163,
"logps/chosen": -605.2567138671875,
"logps/rejected": -649.65673828125,
"loss": 0.0525,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3633373975753784,
"rewards/margins": 0.11985437572002411,
"rewards/rejected": -0.4831917881965637,
"step": 1200
},
{
"epoch": 0.41,
"learning_rate": 3.6369752058302327e-06,
"logits/chosen": -2.275251865386963,
"logits/rejected": -1.9965429306030273,
"logps/chosen": -539.2314453125,
"logps/rejected": -636.8073120117188,
"loss": 0.0995,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3215656876564026,
"rewards/margins": 0.13287541270256042,
"rewards/rejected": -0.4544410705566406,
"step": 1210
},
{
"epoch": 0.42,
"learning_rate": 3.610247767363239e-06,
"logits/chosen": -1.9407621622085571,
"logits/rejected": -1.8752870559692383,
"logps/chosen": -587.7548828125,
"logps/rejected": -671.2868041992188,
"loss": 0.0756,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.3427480161190033,
"rewards/margins": 0.06522272527217865,
"rewards/rejected": -0.40797075629234314,
"step": 1220
},
{
"epoch": 0.42,
"learning_rate": 3.5833613073929684e-06,
"logits/chosen": -2.1943631172180176,
"logits/rejected": -1.859442114830017,
"logps/chosen": -558.7838134765625,
"logps/rejected": -717.7572021484375,
"loss": 0.0603,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3341369032859802,
"rewards/margins": 0.13402590155601501,
"rewards/rejected": -0.46816277503967285,
"step": 1230
},
{
"epoch": 0.43,
"learning_rate": 3.55631967688441e-06,
"logits/chosen": -2.1805663108825684,
"logits/rejected": -1.8173482418060303,
"logps/chosen": -807.4880981445312,
"logps/rejected": -837.0833740234375,
"loss": 0.0518,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.45633822679519653,
"rewards/margins": 0.08246854692697525,
"rewards/rejected": -0.5388067364692688,
"step": 1240
},
{
"epoch": 0.43,
"learning_rate": 3.5291267490277316e-06,
"logits/chosen": -1.8296005725860596,
"logits/rejected": -1.6457901000976562,
"logps/chosen": -621.5354614257812,
"logps/rejected": -701.6083374023438,
"loss": 0.1176,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3704153597354889,
"rewards/margins": 0.11859778314828873,
"rewards/rejected": -0.48901304602622986,
"step": 1250
},
{
"epoch": 0.43,
"learning_rate": 3.501786418683515e-06,
"logits/chosen": -2.0369582176208496,
"logits/rejected": -1.9080692529678345,
"logps/chosen": -765.410400390625,
"logps/rejected": -825.2081298828125,
"loss": 0.0802,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5283640623092651,
"rewards/margins": 0.08725164830684662,
"rewards/rejected": -0.615615725517273,
"step": 1260
},
{
"epoch": 0.44,
"learning_rate": 3.474302601824896e-06,
"logits/chosen": -2.2294676303863525,
"logits/rejected": -1.7623846530914307,
"logps/chosen": -824.7463989257812,
"logps/rejected": -865.5594482421875,
"loss": 0.104,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5527879595756531,
"rewards/margins": 0.07611383497714996,
"rewards/rejected": -0.6289017796516418,
"step": 1270
},
{
"epoch": 0.44,
"learning_rate": 3.4466792349766767e-06,
"logits/chosen": -2.3877675533294678,
"logits/rejected": -2.136277198791504,
"logps/chosen": -631.8321533203125,
"logps/rejected": -653.0026245117188,
"loss": 0.0999,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3877051770687103,
"rewards/margins": 0.06024375557899475,
"rewards/rejected": -0.4479489326477051,
"step": 1280
},
{
"epoch": 0.44,
"learning_rate": 3.4189202746514938e-06,
"logits/chosen": -2.127175807952881,
"logits/rejected": -1.8073135614395142,
"logps/chosen": -673.6165771484375,
"logps/rejected": -750.7117309570312,
"loss": 0.0823,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4296782910823822,
"rewards/margins": 0.1071913093328476,
"rewards/rejected": -0.5368696451187134,
"step": 1290
},
{
"epoch": 0.45,
"learning_rate": 3.391029696783127e-06,
"logits/chosen": -1.9842946529388428,
"logits/rejected": -1.5577259063720703,
"logps/chosen": -650.3092041015625,
"logps/rejected": -740.8306274414062,
"loss": 0.0833,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.4263342022895813,
"rewards/margins": 0.12290897220373154,
"rewards/rejected": -0.549243152141571,
"step": 1300
},
{
"epoch": 0.45,
"learning_rate": 3.3630114961570187e-06,
"logits/chosen": -2.326686382293701,
"logits/rejected": -1.8404957056045532,
"logps/chosen": -725.4781494140625,
"logps/rejected": -822.6609497070312,
"loss": 0.0923,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4563368260860443,
"rewards/margins": 0.14041298627853394,
"rewards/rejected": -0.5967497825622559,
"step": 1310
},
{
"epoch": 0.45,
"learning_rate": 3.3348696858381023e-06,
"logits/chosen": -2.081413984298706,
"logits/rejected": -1.8651702404022217,
"logps/chosen": -677.7269897460938,
"logps/rejected": -756.968505859375,
"loss": 0.0801,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4635470509529114,
"rewards/margins": 0.09350712597370148,
"rewards/rejected": -0.5570541024208069,
"step": 1320
},
{
"epoch": 0.46,
"learning_rate": 3.3066082965960082e-06,
"logits/chosen": -2.1301093101501465,
"logits/rejected": -2.035060405731201,
"logps/chosen": -712.52685546875,
"logps/rejected": -769.6886596679688,
"loss": 0.0568,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4348185062408447,
"rewards/margins": 0.09993582218885422,
"rewards/rejected": -0.5347543954849243,
"step": 1330
},
{
"epoch": 0.46,
"learning_rate": 3.278231376327731e-06,
"logits/chosen": -2.1865429878234863,
"logits/rejected": -1.7728700637817383,
"logps/chosen": -692.3992919921875,
"logps/rejected": -818.3483276367188,
"loss": 0.0822,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4515460431575775,
"rewards/margins": 0.13385489583015442,
"rewards/rejected": -0.5854009985923767,
"step": 1340
},
{
"epoch": 0.46,
"learning_rate": 3.249742989477851e-06,
"logits/chosen": -2.199068546295166,
"logits/rejected": -1.866813063621521,
"logps/chosen": -817.2093505859375,
"logps/rejected": -920.2403564453125,
"loss": 0.0511,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5562337040901184,
"rewards/margins": 0.11897413432598114,
"rewards/rejected": -0.6752079129219055,
"step": 1350
},
{
"epoch": 0.47,
"learning_rate": 3.2211472164563756e-06,
"logits/chosen": -2.109049081802368,
"logits/rejected": -1.7997972965240479,
"logps/chosen": -705.8816528320312,
"logps/rejected": -768.1865234375,
"loss": 0.095,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4462059438228607,
"rewards/margins": 0.09233604371547699,
"rewards/rejected": -0.5385419130325317,
"step": 1360
},
{
"epoch": 0.47,
"learning_rate": 3.192448153054306e-06,
"logits/chosen": -2.2047770023345947,
"logits/rejected": -1.8513100147247314,
"logps/chosen": -771.5567626953125,
"logps/rejected": -856.609375,
"loss": 0.0693,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5159443616867065,
"rewards/margins": 0.11039619147777557,
"rewards/rejected": -0.6263405680656433,
"step": 1370
},
{
"epoch": 0.47,
"learning_rate": 3.16364990985699e-06,
"logits/chosen": -2.357393980026245,
"logits/rejected": -1.782231330871582,
"logps/chosen": -714.89404296875,
"logps/rejected": -804.54150390625,
"loss": 0.0874,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.446491539478302,
"rewards/margins": 0.16014492511749268,
"rewards/rejected": -0.6066364645957947,
"step": 1380
},
{
"epoch": 0.48,
"learning_rate": 3.134756611655362e-06,
"logits/chosen": -2.3336434364318848,
"logits/rejected": -2.126812219619751,
"logps/chosen": -558.0953369140625,
"logps/rejected": -690.7258911132812,
"loss": 0.0954,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.36971205472946167,
"rewards/margins": 0.10154370963573456,
"rewards/rejected": -0.4712557792663574,
"step": 1390
},
{
"epoch": 0.48,
"learning_rate": 3.1057723968551427e-06,
"logits/chosen": -2.052511215209961,
"logits/rejected": -1.6268789768218994,
"logps/chosen": -709.0806884765625,
"logps/rejected": -791.6183471679688,
"loss": 0.0784,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.45772653818130493,
"rewards/margins": 0.14719423651695251,
"rewards/rejected": -0.6049207448959351,
"step": 1400
},
{
"epoch": 0.48,
"learning_rate": 3.0767014168841e-06,
"logits/chosen": -2.0302186012268066,
"logits/rejected": -1.9973528385162354,
"logps/chosen": -628.5819091796875,
"logps/rejected": -709.375,
"loss": 0.0716,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.34966421127319336,
"rewards/margins": 0.07439250499010086,
"rewards/rejected": -0.4240567088127136,
"step": 1410
},
{
"epoch": 0.49,
"learning_rate": 3.047547835597432e-06,
"logits/chosen": -1.909949541091919,
"logits/rejected": -1.8792356252670288,
"logps/chosen": -582.7291259765625,
"logps/rejected": -683.0811767578125,
"loss": 0.0874,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3829612135887146,
"rewards/margins": 0.07136234641075134,
"rewards/rejected": -0.45432358980178833,
"step": 1420
},
{
"epoch": 0.49,
"learning_rate": 3.0183158286813755e-06,
"logits/chosen": -2.278263568878174,
"logits/rejected": -1.8273910284042358,
"logps/chosen": -655.0009765625,
"logps/rejected": -684.890625,
"loss": 0.075,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3779233396053314,
"rewards/margins": 0.10793854296207428,
"rewards/rejected": -0.4858619272708893,
"step": 1430
},
{
"epoch": 0.49,
"learning_rate": 2.989009583055121e-06,
"logits/chosen": -2.129441022872925,
"logits/rejected": -1.9885050058364868,
"logps/chosen": -765.9420166015625,
"logps/rejected": -878.65576171875,
"loss": 0.0642,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.47016677260398865,
"rewards/margins": 0.11287762969732285,
"rewards/rejected": -0.5830444097518921,
"step": 1440
},
{
"epoch": 0.5,
"learning_rate": 2.959633296271117e-06,
"logits/chosen": -2.113435983657837,
"logits/rejected": -1.8716232776641846,
"logps/chosen": -611.4641723632812,
"logps/rejected": -689.0220336914062,
"loss": 0.0852,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.4303819537162781,
"rewards/margins": 0.09832239896059036,
"rewards/rejected": -0.5287044048309326,
"step": 1450
},
{
"epoch": 0.5,
"learning_rate": 2.9301911759138535e-06,
"logits/chosen": -2.1812188625335693,
"logits/rejected": -1.9601389169692993,
"logps/chosen": -618.2063598632812,
"logps/rejected": -735.6884155273438,
"loss": 0.0752,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4212234914302826,
"rewards/margins": 0.11810547113418579,
"rewards/rejected": -0.539328932762146,
"step": 1460
},
{
"epoch": 0.5,
"learning_rate": 2.900687438997205e-06,
"logits/chosen": -2.0657143592834473,
"logits/rejected": -1.8039214611053467,
"logps/chosen": -632.2223510742188,
"logps/rejected": -755.59228515625,
"loss": 0.0795,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4010123312473297,
"rewards/margins": 0.09653668105602264,
"rewards/rejected": -0.49754899740219116,
"step": 1470
},
{
"epoch": 0.51,
"learning_rate": 2.871126311360424e-06,
"logits/chosen": -2.4564061164855957,
"logits/rejected": -1.8799717426300049,
"logps/chosen": -642.9153442382812,
"logps/rejected": -700.4526977539062,
"loss": 0.08,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.341757595539093,
"rewards/margins": 0.12287576496601105,
"rewards/rejected": -0.46463337540626526,
"step": 1480
},
{
"epoch": 0.51,
"learning_rate": 2.8415120270628756e-06,
"logits/chosen": -2.2899577617645264,
"logits/rejected": -1.8913564682006836,
"logps/chosen": -625.5499877929688,
"logps/rejected": -751.3653564453125,
"loss": 0.059,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.34753698110580444,
"rewards/margins": 0.17222611606121063,
"rewards/rejected": -0.5197631120681763,
"step": 1490
},
{
"epoch": 0.51,
"learning_rate": 2.8118488277775852e-06,
"logits/chosen": -2.1799449920654297,
"logits/rejected": -2.2029454708099365,
"logps/chosen": -541.7261352539062,
"logps/rejected": -652.3367919921875,
"loss": 0.0522,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3303873538970947,
"rewards/margins": 0.08029700815677643,
"rewards/rejected": -0.4106842875480652,
"step": 1500
},
{
"epoch": 0.52,
"learning_rate": 2.7821409621837042e-06,
"logits/chosen": -2.434246063232422,
"logits/rejected": -1.9443886280059814,
"logps/chosen": -614.01220703125,
"logps/rejected": -698.8470458984375,
"loss": 0.048,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.36497288942337036,
"rewards/margins": 0.10476745665073395,
"rewards/rejected": -0.4697403311729431,
"step": 1510
},
{
"epoch": 0.52,
"learning_rate": 2.7523926853579702e-06,
"logits/chosen": -2.0561671257019043,
"logits/rejected": -1.7254194021224976,
"logps/chosen": -624.192626953125,
"logps/rejected": -757.9207763671875,
"loss": 0.0956,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4151741564273834,
"rewards/margins": 0.1256508082151413,
"rewards/rejected": -0.5408250093460083,
"step": 1520
},
{
"epoch": 0.52,
"learning_rate": 2.722608258165244e-06,
"logits/chosen": -2.2351367473602295,
"logits/rejected": -1.6919574737548828,
"logps/chosen": -727.2100830078125,
"logps/rejected": -801.911865234375,
"loss": 0.094,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.43864089250564575,
"rewards/margins": 0.1414739489555359,
"rewards/rejected": -0.5801147818565369,
"step": 1530
},
{
"epoch": 0.53,
"learning_rate": 2.6927919466482293e-06,
"logits/chosen": -2.0343658924102783,
"logits/rejected": -1.7758781909942627,
"logps/chosen": -717.9749755859375,
"logps/rejected": -791.5612182617188,
"loss": 0.0753,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.501410186290741,
"rewards/margins": 0.08459311723709106,
"rewards/rejected": -0.5860033631324768,
"step": 1540
},
{
"epoch": 0.53,
"learning_rate": 2.662948021416441e-06,
"logits/chosen": -2.232266426086426,
"logits/rejected": -2.0454351902008057,
"logps/chosen": -727.7106323242188,
"logps/rejected": -814.9327392578125,
"loss": 0.0557,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.46008116006851196,
"rewards/margins": 0.10495243221521378,
"rewards/rejected": -0.5650335550308228,
"step": 1550
},
{
"epoch": 0.53,
"learning_rate": 2.6330807570345253e-06,
"logits/chosen": -2.1601340770721436,
"logits/rejected": -1.9538816213607788,
"logps/chosen": -685.18896484375,
"logps/rejected": -795.1231689453125,
"loss": 0.0791,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4878101348876953,
"rewards/margins": 0.1159614771604538,
"rewards/rejected": -0.6037715673446655,
"step": 1560
},
{
"epoch": 0.54,
"learning_rate": 2.6031944314100077e-06,
"logits/chosen": -2.442682981491089,
"logits/rejected": -2.2220892906188965,
"logps/chosen": -678.3485717773438,
"logps/rejected": -837.5185546875,
"loss": 0.0906,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4468511641025543,
"rewards/margins": 0.12742993235588074,
"rewards/rejected": -0.5742811560630798,
"step": 1570
},
{
"epoch": 0.54,
"learning_rate": 2.5732933251805716e-06,
"logits/chosen": -2.238412380218506,
"logits/rejected": -2.0841736793518066,
"logps/chosen": -718.62060546875,
"logps/rejected": -867.1959228515625,
"loss": 0.0541,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4921676218509674,
"rewards/margins": 0.10790624469518661,
"rewards/rejected": -0.6000738143920898,
"step": 1580
},
{
"epoch": 0.54,
"learning_rate": 2.543381721100931e-06,
"logits/chosen": -2.335407257080078,
"logits/rejected": -2.028334140777588,
"logps/chosen": -614.9449462890625,
"logps/rejected": -788.1812133789062,
"loss": 0.0811,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.41624245047569275,
"rewards/margins": 0.17170578241348267,
"rewards/rejected": -0.587948203086853,
"step": 1590
},
{
"epoch": 0.55,
"learning_rate": 2.513463903429418e-06,
"logits/chosen": -2.4693076610565186,
"logits/rejected": -2.0430703163146973,
"logps/chosen": -725.3846435546875,
"logps/rejected": -755.0938720703125,
"loss": 0.1021,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.4062575697898865,
"rewards/margins": 0.10169617831707001,
"rewards/rejected": -0.5079537630081177,
"step": 1600
},
{
"epoch": 0.55,
"learning_rate": 2.483544157314338e-06,
"logits/chosen": -2.293912410736084,
"logits/rejected": -1.9852508306503296,
"logps/chosen": -639.7050170898438,
"logps/rejected": -763.0711059570312,
"loss": 0.0737,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4078396260738373,
"rewards/margins": 0.1239791288971901,
"rewards/rejected": -0.5318187475204468,
"step": 1610
},
{
"epoch": 0.56,
"learning_rate": 2.453626768180214e-06,
"logits/chosen": -2.122490167617798,
"logits/rejected": -1.8506181240081787,
"logps/chosen": -720.9524536132812,
"logps/rejected": -728.7947998046875,
"loss": 0.1032,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4711574912071228,
"rewards/margins": 0.0486263632774353,
"rewards/rejected": -0.5197838544845581,
"step": 1620
},
{
"epoch": 0.56,
"learning_rate": 2.4237160211139697e-06,
"logits/chosen": -2.0483882427215576,
"logits/rejected": -1.869739294052124,
"logps/chosen": -622.5816650390625,
"logps/rejected": -701.8380126953125,
"loss": 0.0577,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.41618186235427856,
"rewards/margins": 0.08069188892841339,
"rewards/rejected": -0.49687376618385315,
"step": 1630
},
{
"epoch": 0.56,
"learning_rate": 2.393816200251187e-06,
"logits/chosen": -2.0031468868255615,
"logits/rejected": -1.54337739944458,
"logps/chosen": -700.4393310546875,
"logps/rejected": -752.5362548828125,
"loss": 0.0727,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4837239384651184,
"rewards/margins": 0.10101475566625595,
"rewards/rejected": -0.584738552570343,
"step": 1640
},
{
"epoch": 0.57,
"learning_rate": 2.3639315881624776e-06,
"logits/chosen": -2.3299944400787354,
"logits/rejected": -1.946599006652832,
"logps/chosen": -644.5682373046875,
"logps/rejected": -742.0753173828125,
"loss": 0.0999,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.44712233543395996,
"rewards/margins": 0.09644552320241928,
"rewards/rejected": -0.5435678362846375,
"step": 1650
},
{
"epoch": 0.57,
"learning_rate": 2.334066465240093e-06,
"logits/chosen": -2.055642604827881,
"logits/rejected": -1.5556762218475342,
"logps/chosen": -778.634033203125,
"logps/rejected": -831.9388427734375,
"loss": 0.0767,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4928356111049652,
"rewards/margins": 0.1221829205751419,
"rewards/rejected": -0.6150184869766235,
"step": 1660
},
{
"epoch": 0.57,
"learning_rate": 2.3042251090848357e-06,
"logits/chosen": -2.259159564971924,
"logits/rejected": -1.7662973403930664,
"logps/chosen": -643.4910888671875,
"logps/rejected": -777.8291015625,
"loss": 0.0825,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4092964231967926,
"rewards/margins": 0.1640159636735916,
"rewards/rejected": -0.5733123421669006,
"step": 1670
},
{
"epoch": 0.58,
"learning_rate": 2.2744117938933814e-06,
"logits/chosen": -2.2976372241973877,
"logits/rejected": -1.9439456462860107,
"logps/chosen": -806.0792236328125,
"logps/rejected": -835.6940307617188,
"loss": 0.0729,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.49456721544265747,
"rewards/margins": 0.07213100790977478,
"rewards/rejected": -0.5666981935501099,
"step": 1680
},
{
"epoch": 0.58,
"learning_rate": 2.2446307898460807e-06,
"logits/chosen": -2.1391043663024902,
"logits/rejected": -1.7574889659881592,
"logps/chosen": -778.1689453125,
"logps/rejected": -875.1658935546875,
"loss": 0.0758,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5087161064147949,
"rewards/margins": 0.12866072356700897,
"rewards/rejected": -0.6373767852783203,
"step": 1690
},
{
"epoch": 0.58,
"learning_rate": 2.2148863624953364e-06,
"logits/chosen": -2.1262030601501465,
"logits/rejected": -1.8642327785491943,
"logps/chosen": -721.8367919921875,
"logps/rejected": -874.4264526367188,
"loss": 0.0491,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.48331218957901,
"rewards/margins": 0.12461258471012115,
"rewards/rejected": -0.6079246997833252,
"step": 1700
},
{
"epoch": 0.59,
"learning_rate": 2.1851827721546483e-06,
"logits/chosen": -2.0042014122009277,
"logits/rejected": -1.7912979125976562,
"logps/chosen": -779.1425170898438,
"logps/rejected": -934.787109375,
"loss": 0.046,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5522741079330444,
"rewards/margins": 0.15596961975097656,
"rewards/rejected": -0.7082436680793762,
"step": 1710
},
{
"epoch": 0.59,
"learning_rate": 2.155524273288405e-06,
"logits/chosen": -2.2937986850738525,
"logits/rejected": -1.780461072921753,
"logps/chosen": -812.2808837890625,
"logps/rejected": -918.3689575195312,
"loss": 0.0805,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4778391420841217,
"rewards/margins": 0.1493079662322998,
"rewards/rejected": -0.6271471381187439,
"step": 1720
},
{
"epoch": 0.59,
"learning_rate": 2.125915113902514e-06,
"logits/chosen": -2.125365734100342,
"logits/rejected": -1.9615271091461182,
"logps/chosen": -657.5159912109375,
"logps/rejected": -758.6436767578125,
"loss": 0.0946,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.42783278226852417,
"rewards/margins": 0.09773501008749008,
"rewards/rejected": -0.5255678296089172,
"step": 1730
},
{
"epoch": 0.6,
"learning_rate": 2.096359534935958e-06,
"logits/chosen": -1.965488076210022,
"logits/rejected": -1.7874730825424194,
"logps/chosen": -719.5172119140625,
"logps/rejected": -838.3024291992188,
"loss": 0.0825,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.520493745803833,
"rewards/margins": 0.09665954858064651,
"rewards/rejected": -0.6171532869338989,
"step": 1740
},
{
"epoch": 0.6,
"learning_rate": 2.0668617696533603e-06,
"logits/chosen": -2.1595165729522705,
"logits/rejected": -1.795940637588501,
"logps/chosen": -750.8433227539062,
"logps/rejected": -791.1815795898438,
"loss": 0.0898,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5119301080703735,
"rewards/margins": 0.08175458759069443,
"rewards/rejected": -0.5936846137046814,
"step": 1750
},
{
"epoch": 0.6,
"learning_rate": 2.0374260430386542e-06,
"logits/chosen": -2.072263717651367,
"logits/rejected": -1.8800386190414429,
"logps/chosen": -735.1260986328125,
"logps/rejected": -801.7227172851562,
"loss": 0.0447,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.49476614594459534,
"rewards/margins": 0.09266269207000732,
"rewards/rejected": -0.5874288082122803,
"step": 1760
},
{
"epoch": 0.61,
"learning_rate": 2.0080565711899327e-06,
"logits/chosen": -1.9065930843353271,
"logits/rejected": -1.6727092266082764,
"logps/chosen": -684.2578125,
"logps/rejected": -754.9581298828125,
"loss": 0.0742,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.47697681188583374,
"rewards/margins": 0.08060398697853088,
"rewards/rejected": -0.5575807690620422,
"step": 1770
},
{
"epoch": 0.61,
"learning_rate": 1.978757560715579e-06,
"logits/chosen": -2.1670429706573486,
"logits/rejected": -1.99936842918396,
"logps/chosen": -735.7562866210938,
"logps/rejected": -859.4212036132812,
"loss": 0.0952,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4949742257595062,
"rewards/margins": 0.13264210522174835,
"rewards/rejected": -0.627616286277771,
"step": 1780
},
{
"epoch": 0.61,
"learning_rate": 1.9495332081317466e-06,
"logits/chosen": -2.0642776489257812,
"logits/rejected": -1.8938987255096436,
"logps/chosen": -819.5340576171875,
"logps/rejected": -883.53564453125,
"loss": 0.0872,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5653868317604065,
"rewards/margins": 0.07868463546037674,
"rewards/rejected": -0.6440714597702026,
"step": 1790
},
{
"epoch": 0.62,
"learning_rate": 1.9203876992612904e-06,
"logits/chosen": -2.0933823585510254,
"logits/rejected": -1.7218735218048096,
"logps/chosen": -661.1719360351562,
"logps/rejected": -781.9850463867188,
"loss": 0.1006,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.43831753730773926,
"rewards/margins": 0.16069361567497253,
"rewards/rejected": -0.5990111231803894,
"step": 1800
},
{
"epoch": 0.62,
"learning_rate": 1.891325208634231e-06,
"logits/chosen": -2.287635087966919,
"logits/rejected": -1.9859931468963623,
"logps/chosen": -612.9700927734375,
"logps/rejected": -735.92919921875,
"loss": 0.0608,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4461449682712555,
"rewards/margins": 0.13415846228599548,
"rewards/rejected": -0.580303430557251,
"step": 1810
},
{
"epoch": 0.62,
"learning_rate": 1.8623498988898309e-06,
"logits/chosen": -2.2971930503845215,
"logits/rejected": -1.8358027935028076,
"logps/chosen": -716.4727172851562,
"logps/rejected": -849.65869140625,
"loss": 0.0872,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.47384414076805115,
"rewards/margins": 0.13949860632419586,
"rewards/rejected": -0.6133427023887634,
"step": 1820
},
{
"epoch": 0.63,
"learning_rate": 1.83346592018038e-06,
"logits/chosen": -2.231283664703369,
"logits/rejected": -1.9385484457015991,
"logps/chosen": -650.612060546875,
"logps/rejected": -773.4234008789062,
"loss": 0.1013,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4305783212184906,
"rewards/margins": 0.13019639253616333,
"rewards/rejected": -0.5607747435569763,
"step": 1830
},
{
"epoch": 0.63,
"learning_rate": 1.8046774095767652e-06,
"logits/chosen": -2.3146252632141113,
"logits/rejected": -2.038005828857422,
"logps/chosen": -650.5593872070312,
"logps/rejected": -713.4359130859375,
"loss": 0.0792,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4199472963809967,
"rewards/margins": 0.11954204738140106,
"rewards/rejected": -0.5394893884658813,
"step": 1840
},
{
"epoch": 0.63,
"learning_rate": 1.775988490475914e-06,
"logits/chosen": -1.9583438634872437,
"logits/rejected": -1.9360641241073608,
"logps/chosen": -607.3677978515625,
"logps/rejected": -768.2495727539062,
"loss": 0.0828,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4246648848056793,
"rewards/margins": 0.1216825619339943,
"rewards/rejected": -0.546347439289093,
"step": 1850
},
{
"epoch": 0.64,
"learning_rate": 1.7474032720101991e-06,
"logits/chosen": -2.302241802215576,
"logits/rejected": -2.053952932357788,
"logps/chosen": -578.2738647460938,
"logps/rejected": -709.3789672851562,
"loss": 0.0998,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3617783188819885,
"rewards/margins": 0.12743337452411652,
"rewards/rejected": -0.48921164870262146,
"step": 1860
},
{
"epoch": 0.64,
"learning_rate": 1.7189258484588853e-06,
"logits/chosen": -2.295841932296753,
"logits/rejected": -1.8487634658813477,
"logps/chosen": -805.4288330078125,
"logps/rejected": -849.3688354492188,
"loss": 0.0616,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5005868673324585,
"rewards/margins": 0.10474538803100586,
"rewards/rejected": -0.6053322553634644,
"step": 1870
},
{
"epoch": 0.64,
"learning_rate": 1.6905602986617006e-06,
"logits/chosen": -2.2122347354888916,
"logits/rejected": -1.8240172863006592,
"logps/chosen": -648.9810791015625,
"logps/rejected": -798.175048828125,
"loss": 0.0807,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4233244061470032,
"rewards/margins": 0.15655961632728577,
"rewards/rejected": -0.5798839926719666,
"step": 1880
},
{
"epoch": 0.65,
"learning_rate": 1.662310685434625e-06,
"logits/chosen": -2.3458707332611084,
"logits/rejected": -2.2734622955322266,
"logps/chosen": -647.6605224609375,
"logps/rejected": -782.6307373046875,
"loss": 0.073,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3885071873664856,
"rewards/margins": 0.09876324236392975,
"rewards/rejected": -0.48727044463157654,
"step": 1890
},
{
"epoch": 0.65,
"learning_rate": 1.6341810549879666e-06,
"logits/chosen": -2.344203233718872,
"logits/rejected": -2.0581679344177246,
"logps/chosen": -571.4796752929688,
"logps/rejected": -596.5723876953125,
"loss": 0.0721,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.35855913162231445,
"rewards/margins": 0.06313261389732361,
"rewards/rejected": -0.4216917157173157,
"step": 1900
},
{
"epoch": 0.65,
"learning_rate": 1.6061754363468255e-06,
"logits/chosen": -2.259507894515991,
"logits/rejected": -2.1293258666992188,
"logps/chosen": -650.2342529296875,
"logps/rejected": -721.8099365234375,
"loss": 0.0816,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3676004707813263,
"rewards/margins": 0.07684332877397537,
"rewards/rejected": -0.44444379210472107,
"step": 1910
},
{
"epoch": 0.66,
"learning_rate": 1.5782978407740087e-06,
"logits/chosen": -2.028473138809204,
"logits/rejected": -2.022217273712158,
"logps/chosen": -670.6177368164062,
"logps/rejected": -752.2406616210938,
"loss": 0.0834,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.44136539101600647,
"rewards/margins": 0.08413775265216827,
"rewards/rejected": -0.5255030393600464,
"step": 1920
},
{
"epoch": 0.66,
"learning_rate": 1.5505522611954977e-06,
"logits/chosen": -2.2423367500305176,
"logits/rejected": -1.7249571084976196,
"logps/chosen": -652.0220947265625,
"logps/rejected": -791.6546020507812,
"loss": 0.1056,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.42029309272766113,
"rewards/margins": 0.17177040874958038,
"rewards/rejected": -0.5920634865760803,
"step": 1930
},
{
"epoch": 0.66,
"learning_rate": 1.522942671628537e-06,
"logits/chosen": -2.2815046310424805,
"logits/rejected": -2.0123918056488037,
"logps/chosen": -581.7095336914062,
"logps/rejected": -690.0474853515625,
"loss": 0.0764,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.37874311208724976,
"rewards/margins": 0.09943266212940216,
"rewards/rejected": -0.4781757891178131,
"step": 1940
},
{
"epoch": 0.67,
"learning_rate": 1.495473026612435e-06,
"logits/chosen": -2.2578682899475098,
"logits/rejected": -1.8968786001205444,
"logps/chosen": -684.0667724609375,
"logps/rejected": -732.0203247070312,
"loss": 0.0857,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.43941551446914673,
"rewards/margins": 0.09977763891220093,
"rewards/rejected": -0.5391931533813477,
"step": 1950
},
{
"epoch": 0.67,
"learning_rate": 1.4681472606421512e-06,
"logits/chosen": -2.2577121257781982,
"logits/rejected": -1.9029508829116821,
"logps/chosen": -682.9171142578125,
"logps/rejected": -752.029296875,
"loss": 0.0774,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.46012359857559204,
"rewards/margins": 0.10639479011297226,
"rewards/rejected": -0.5665184259414673,
"step": 1960
},
{
"epoch": 0.68,
"learning_rate": 1.4409692876047582e-06,
"logits/chosen": -2.3715949058532715,
"logits/rejected": -1.9981542825698853,
"logps/chosen": -686.21630859375,
"logps/rejected": -765.2901611328125,
"loss": 0.0823,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.44159477949142456,
"rewards/margins": 0.10385797917842865,
"rewards/rejected": -0.5454527735710144,
"step": 1970
},
{
"epoch": 0.68,
"learning_rate": 1.4139430002188486e-06,
"logits/chosen": -2.1407675743103027,
"logits/rejected": -2.0084660053253174,
"logps/chosen": -565.1947631835938,
"logps/rejected": -623.761474609375,
"loss": 0.0826,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.380687952041626,
"rewards/margins": 0.06562694162130356,
"rewards/rejected": -0.44631490111351013,
"step": 1980
},
{
"epoch": 0.68,
"learning_rate": 1.3870722694769858e-06,
"logits/chosen": -2.301060199737549,
"logits/rejected": -2.099889039993286,
"logps/chosen": -667.0670166015625,
"logps/rejected": -778.9105834960938,
"loss": 0.0774,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.43501314520835876,
"rewards/margins": 0.11666470766067505,
"rewards/rejected": -0.5516778230667114,
"step": 1990
},
{
"epoch": 0.69,
"learning_rate": 1.3603609440912508e-06,
"logits/chosen": -2.1625216007232666,
"logits/rejected": -2.026824951171875,
"logps/chosen": -466.3900451660156,
"logps/rejected": -600.93896484375,
"loss": 0.1211,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3264867663383484,
"rewards/margins": 0.1292310655117035,
"rewards/rejected": -0.4557178020477295,
"step": 2000
},
{
"epoch": 0.69,
"learning_rate": 1.3338128499419925e-06,
"logits/chosen": -2.277644634246826,
"logits/rejected": -1.7413593530654907,
"logps/chosen": -672.1463623046875,
"logps/rejected": -718.1644287109375,
"loss": 0.0756,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.4492368698120117,
"rewards/margins": 0.10236699879169464,
"rewards/rejected": -0.5516039133071899,
"step": 2010
},
{
"epoch": 0.69,
"learning_rate": 1.3074317895298492e-06,
"logits/chosen": -2.2224433422088623,
"logits/rejected": -1.9981178045272827,
"logps/chosen": -816.6317138671875,
"logps/rejected": -843.4306640625,
"loss": 0.0589,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.539585292339325,
"rewards/margins": 0.07050606608390808,
"rewards/rejected": -0.6100913882255554,
"step": 2020
},
{
"epoch": 0.7,
"learning_rate": 1.2812215414311036e-06,
"logits/chosen": -2.049561023712158,
"logits/rejected": -1.8878101110458374,
"logps/chosen": -747.5322265625,
"logps/rejected": -847.0015869140625,
"loss": 0.0788,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5520612597465515,
"rewards/margins": 0.08974529802799225,
"rewards/rejected": -0.6418064832687378,
"step": 2030
},
{
"epoch": 0.7,
"learning_rate": 1.2551858597564859e-06,
"logits/chosen": -2.118635654449463,
"logits/rejected": -1.993412971496582,
"logps/chosen": -758.1939697265625,
"logps/rejected": -849.9517822265625,
"loss": 0.0765,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5349586009979248,
"rewards/margins": 0.06833256781101227,
"rewards/rejected": -0.6032911539077759,
"step": 2040
},
{
"epoch": 0.7,
"learning_rate": 1.2293284736134605e-06,
"logits/chosen": -2.226203441619873,
"logits/rejected": -2.0049796104431152,
"logps/chosen": -663.0206298828125,
"logps/rejected": -744.8146362304688,
"loss": 0.0652,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4578720033168793,
"rewards/margins": 0.0849481076002121,
"rewards/rejected": -0.5428200960159302,
"step": 2050
},
{
"epoch": 0.71,
"learning_rate": 1.2036530865721115e-06,
"logits/chosen": -2.1977055072784424,
"logits/rejected": -1.923651099205017,
"logps/chosen": -752.8314208984375,
"logps/rejected": -866.5847778320312,
"loss": 0.0559,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5004793405532837,
"rewards/margins": 0.11119532585144043,
"rewards/rejected": -0.6116746664047241,
"step": 2060
},
{
"epoch": 0.71,
"learning_rate": 1.178163376134671e-06,
"logits/chosen": -2.203216552734375,
"logits/rejected": -2.1631100177764893,
"logps/chosen": -730.8253784179688,
"logps/rejected": -790.9908447265625,
"loss": 0.086,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.49312084913253784,
"rewards/margins": 0.06093892455101013,
"rewards/rejected": -0.5540598034858704,
"step": 2070
},
{
"epoch": 0.71,
"learning_rate": 1.152862993208794e-06,
"logits/chosen": -2.1542184352874756,
"logits/rejected": -1.721289038658142,
"logps/chosen": -694.8465576171875,
"logps/rejected": -716.5363159179688,
"loss": 0.0579,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4218994081020355,
"rewards/margins": 0.10558446496725082,
"rewards/rejected": -0.5274838805198669,
"step": 2080
},
{
"epoch": 0.72,
"learning_rate": 1.1277555615846339e-06,
"logits/chosen": -2.0153145790100098,
"logits/rejected": -1.8428608179092407,
"logps/chosen": -670.2786865234375,
"logps/rejected": -854.0362548828125,
"loss": 0.0693,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43655434250831604,
"rewards/margins": 0.15288135409355164,
"rewards/rejected": -0.5894356966018677,
"step": 2090
},
{
"epoch": 0.72,
"learning_rate": 1.1028446774158021e-06,
"logits/chosen": -2.2273738384246826,
"logits/rejected": -2.0033624172210693,
"logps/chosen": -650.3299560546875,
"logps/rejected": -771.5960693359375,
"loss": 0.0669,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.44391244649887085,
"rewards/margins": 0.1265546977519989,
"rewards/rejected": -0.5704671740531921,
"step": 2100
},
{
"epoch": 0.72,
"learning_rate": 1.0781339087042955e-06,
"logits/chosen": -2.233987808227539,
"logits/rejected": -1.9722740650177002,
"logps/chosen": -668.952392578125,
"logps/rejected": -746.8803100585938,
"loss": 0.092,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4446966052055359,
"rewards/margins": 0.10449746996164322,
"rewards/rejected": -0.5491940975189209,
"step": 2110
},
{
"epoch": 0.73,
"learning_rate": 1.053626794789441e-06,
"logits/chosen": -2.167900562286377,
"logits/rejected": -2.1640636920928955,
"logps/chosen": -722.8687744140625,
"logps/rejected": -834.3153076171875,
"loss": 0.0647,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5201176404953003,
"rewards/margins": 0.06028919294476509,
"rewards/rejected": -0.5804067850112915,
"step": 2120
},
{
"epoch": 0.73,
"learning_rate": 1.029326845840961e-06,
"logits/chosen": -2.3251118659973145,
"logits/rejected": -1.9303410053253174,
"logps/chosen": -679.392822265625,
"logps/rejected": -753.8246459960938,
"loss": 0.056,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43752750754356384,
"rewards/margins": 0.13090887665748596,
"rewards/rejected": -0.5684363842010498,
"step": 2130
},
{
"epoch": 0.73,
"learning_rate": 1.0052375423562038e-06,
"logits/chosen": -2.242652416229248,
"logits/rejected": -2.0578553676605225,
"logps/chosen": -684.14013671875,
"logps/rejected": -813.5594482421875,
"loss": 0.0601,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4639926850795746,
"rewards/margins": 0.08700541406869888,
"rewards/rejected": -0.5509980916976929,
"step": 2140
},
{
"epoch": 0.74,
"learning_rate": 9.813623346616325e-07,
"logits/chosen": -1.9593162536621094,
"logits/rejected": -1.3148808479309082,
"logps/chosen": -727.7343139648438,
"logps/rejected": -799.2838134765625,
"loss": 0.0755,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43542367219924927,
"rewards/margins": 0.15335293114185333,
"rewards/rejected": -0.5887765884399414,
"step": 2150
},
{
"epoch": 0.74,
"learning_rate": 9.577046424186336e-07,
"logits/chosen": -2.250488758087158,
"logits/rejected": -2.152696132659912,
"logps/chosen": -743.4170532226562,
"logps/rejected": -749.8002319335938,
"loss": 0.0858,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.45544886589050293,
"rewards/margins": 0.04140906408429146,
"rewards/rejected": -0.4968579411506653,
"step": 2160
},
{
"epoch": 0.74,
"learning_rate": 9.342678541337155e-07,
"logits/chosen": -2.17391037940979,
"logits/rejected": -1.7850377559661865,
"logps/chosen": -670.3488159179688,
"logps/rejected": -719.6155395507812,
"loss": 0.1111,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.47339048981666565,
"rewards/margins": 0.07163342088460922,
"rewards/rejected": -0.5450239181518555,
"step": 2170
},
{
"epoch": 0.75,
"learning_rate": 9.110553266731676e-07,
"logits/chosen": -1.9487330913543701,
"logits/rejected": -1.920854926109314,
"logps/chosen": -684.4945068359375,
"logps/rejected": -799.9359130859375,
"loss": 0.0455,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.47916141152381897,
"rewards/margins": 0.1000744104385376,
"rewards/rejected": -0.5792357921600342,
"step": 2180
},
{
"epoch": 0.75,
"learning_rate": 8.880703847822603e-07,
"logits/chosen": -2.07055401802063,
"logits/rejected": -1.928727149963379,
"logps/chosen": -658.1580200195312,
"logps/rejected": -768.7928466796875,
"loss": 0.0684,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4428200125694275,
"rewards/margins": 0.12602929770946503,
"rewards/rejected": -0.5688492655754089,
"step": 2190
},
{
"epoch": 0.75,
"learning_rate": 8.653163206090326e-07,
"logits/chosen": -2.4357573986053467,
"logits/rejected": -1.9457374811172485,
"logps/chosen": -623.8139038085938,
"logps/rejected": -643.5352783203125,
"loss": 0.0653,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.37632665038108826,
"rewards/margins": 0.08694492280483246,
"rewards/rejected": -0.4632716178894043,
"step": 2200
},
{
"epoch": 0.76,
"learning_rate": 8.427963932327621e-07,
"logits/chosen": -2.18113112449646,
"logits/rejected": -2.0196382999420166,
"logps/chosen": -602.2532958984375,
"logps/rejected": -781.3187255859375,
"loss": 0.0677,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4085915982723236,
"rewards/margins": 0.16642124950885773,
"rewards/rejected": -0.5750128030776978,
"step": 2210
},
{
"epoch": 0.76,
"learning_rate": 8.205138281971617e-07,
"logits/chosen": -2.0964841842651367,
"logits/rejected": -1.816980004310608,
"logps/chosen": -691.4669189453125,
"logps/rejected": -711.4818725585938,
"loss": 0.0626,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4612464904785156,
"rewards/margins": 0.0856751948595047,
"rewards/rejected": -0.5469216704368591,
"step": 2220
},
{
"epoch": 0.76,
"learning_rate": 7.984718170483813e-07,
"logits/chosen": -2.1438546180725098,
"logits/rejected": -1.992221474647522,
"logps/chosen": -613.9456787109375,
"logps/rejected": -792.9467163085938,
"loss": 0.1208,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.41083821654319763,
"rewards/margins": 0.1366868019104004,
"rewards/rejected": -0.5475250482559204,
"step": 2230
},
{
"epoch": 0.77,
"learning_rate": 7.766735168778853e-07,
"logits/chosen": -2.3303608894348145,
"logits/rejected": -1.964838981628418,
"logps/chosen": -724.9031982421875,
"logps/rejected": -794.0396728515625,
"loss": 0.065,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4382435381412506,
"rewards/margins": 0.1007271558046341,
"rewards/rejected": -0.5389707088470459,
"step": 2240
},
{
"epoch": 0.77,
"learning_rate": 7.551220498702547e-07,
"logits/chosen": -2.219709873199463,
"logits/rejected": -1.8147242069244385,
"logps/chosen": -690.845458984375,
"logps/rejected": -772.4855346679688,
"loss": 0.0714,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.4500810205936432,
"rewards/margins": 0.10746470838785172,
"rewards/rejected": -0.5575457215309143,
"step": 2250
},
{
"epoch": 0.77,
"learning_rate": 7.338205028560003e-07,
"logits/chosen": -2.296119213104248,
"logits/rejected": -1.979353904724121,
"logps/chosen": -654.2385864257812,
"logps/rejected": -730.311767578125,
"loss": 0.0731,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4028875231742859,
"rewards/margins": 0.1070215255022049,
"rewards/rejected": -0.5099090337753296,
"step": 2260
},
{
"epoch": 0.78,
"learning_rate": 7.127719268694294e-07,
"logits/chosen": -2.161729097366333,
"logits/rejected": -1.8845264911651611,
"logps/chosen": -690.8424682617188,
"logps/rejected": -765.8018798828125,
"loss": 0.0763,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.45226621627807617,
"rewards/margins": 0.1182960644364357,
"rewards/rejected": -0.5705623626708984,
"step": 2270
},
{
"epoch": 0.78,
"learning_rate": 6.919793367116453e-07,
"logits/chosen": -2.2758870124816895,
"logits/rejected": -2.1513137817382812,
"logps/chosen": -645.7116088867188,
"logps/rejected": -750.9886474609375,
"loss": 0.0744,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.39909255504608154,
"rewards/margins": 0.11400028318166733,
"rewards/rejected": -0.5130928158760071,
"step": 2280
},
{
"epoch": 0.78,
"learning_rate": 6.714457105187383e-07,
"logits/chosen": -2.358992338180542,
"logits/rejected": -1.8777456283569336,
"logps/chosen": -718.3897094726562,
"logps/rejected": -846.4613037109375,
"loss": 0.069,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4483721852302551,
"rewards/margins": 0.1424846649169922,
"rewards/rejected": -0.5908567905426025,
"step": 2290
},
{
"epoch": 0.79,
"learning_rate": 6.511739893352226e-07,
"logits/chosen": -2.1870875358581543,
"logits/rejected": -2.080655336380005,
"logps/chosen": -697.439208984375,
"logps/rejected": -725.9364013671875,
"loss": 0.0663,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.4647183418273926,
"rewards/margins": 0.0482785627245903,
"rewards/rejected": -0.5129969120025635,
"step": 2300
},
{
"epoch": 0.79,
"learning_rate": 6.311670766927869e-07,
"logits/chosen": -1.9962774515151978,
"logits/rejected": -1.9848453998565674,
"logps/chosen": -651.7489013671875,
"logps/rejected": -729.2185668945312,
"loss": 0.1025,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.46112656593322754,
"rewards/margins": 0.07899859547615051,
"rewards/rejected": -0.5401251912117004,
"step": 2310
},
{
"epoch": 0.8,
"learning_rate": 6.114278381944253e-07,
"logits/chosen": -2.1914873123168945,
"logits/rejected": -2.372349500656128,
"logps/chosen": -576.5204467773438,
"logps/rejected": -652.2879638671875,
"loss": 0.0874,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.38175448775291443,
"rewards/margins": 0.045350007712841034,
"rewards/rejected": -0.42710447311401367,
"step": 2320
},
{
"epoch": 0.8,
"learning_rate": 5.91959101103988e-07,
"logits/chosen": -2.443941593170166,
"logits/rejected": -2.3585076332092285,
"logps/chosen": -623.3038940429688,
"logps/rejected": -749.5606689453125,
"loss": 0.0805,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.40153616666793823,
"rewards/margins": 0.12084267288446426,
"rewards/rejected": -0.5223788619041443,
"step": 2330
},
{
"epoch": 0.8,
"learning_rate": 5.727636539412368e-07,
"logits/chosen": -2.2379660606384277,
"logits/rejected": -1.8004734516143799,
"logps/chosen": -637.1851806640625,
"logps/rejected": -713.89697265625,
"loss": 0.0514,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3866319954395294,
"rewards/margins": 0.12998130917549133,
"rewards/rejected": -0.5166133642196655,
"step": 2340
},
{
"epoch": 0.81,
"learning_rate": 5.538442460824417e-07,
"logits/chosen": -2.189680576324463,
"logits/rejected": -1.9954487085342407,
"logps/chosen": -629.7688598632812,
"logps/rejected": -715.0001831054688,
"loss": 0.0985,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4157083034515381,
"rewards/margins": 0.10353025048971176,
"rewards/rejected": -0.5192385911941528,
"step": 2350
},
{
"epoch": 0.81,
"learning_rate": 5.352035873665817e-07,
"logits/chosen": -2.3851559162139893,
"logits/rejected": -2.140918016433716,
"logps/chosen": -579.01123046875,
"logps/rejected": -623.9471435546875,
"loss": 0.0898,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.35611358284950256,
"rewards/margins": 0.07924910634756088,
"rewards/rejected": -0.43536263704299927,
"step": 2360
},
{
"epoch": 0.81,
"learning_rate": 5.168443477072207e-07,
"logits/chosen": -2.320765256881714,
"logits/rejected": -1.9919421672821045,
"logps/chosen": -578.265625,
"logps/rejected": -717.7510375976562,
"loss": 0.065,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.3810497522354126,
"rewards/margins": 0.13789904117584229,
"rewards/rejected": -0.5189487338066101,
"step": 2370
},
{
"epoch": 0.82,
"learning_rate": 4.987691567100866e-07,
"logits/chosen": -2.205519199371338,
"logits/rejected": -1.9962717294692993,
"logps/chosen": -694.5599365234375,
"logps/rejected": -781.102783203125,
"loss": 0.0873,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4531136155128479,
"rewards/margins": 0.08138440549373627,
"rewards/rejected": -0.5344979763031006,
"step": 2380
},
{
"epoch": 0.82,
"learning_rate": 4.809806032964351e-07,
"logits/chosen": -2.2204413414001465,
"logits/rejected": -2.0109105110168457,
"logps/chosen": -601.0872802734375,
"logps/rejected": -644.5216064453125,
"loss": 0.1044,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.3811626136302948,
"rewards/margins": 0.06587550789117813,
"rewards/rejected": -0.44703811407089233,
"step": 2390
},
{
"epoch": 0.82,
"learning_rate": 4.634812353322371e-07,
"logits/chosen": -2.3256497383117676,
"logits/rejected": -1.8949310779571533,
"logps/chosen": -661.0736083984375,
"logps/rejected": -761.2631225585938,
"loss": 0.0912,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4190675616264343,
"rewards/margins": 0.11600930988788605,
"rewards/rejected": -0.535076916217804,
"step": 2400
},
{
"epoch": 0.83,
"learning_rate": 4.462735592632439e-07,
"logits/chosen": -2.003680944442749,
"logits/rejected": -1.8513685464859009,
"logps/chosen": -706.0535278320312,
"logps/rejected": -843.9226684570312,
"loss": 0.0716,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.45064783096313477,
"rewards/margins": 0.1290721446275711,
"rewards/rejected": -0.5797199606895447,
"step": 2410
},
{
"epoch": 0.83,
"learning_rate": 4.293600397559897e-07,
"logits/chosen": -2.1723484992980957,
"logits/rejected": -1.9190336465835571,
"logps/chosen": -624.4666137695312,
"logps/rejected": -677.8482055664062,
"loss": 0.0984,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.41719359159469604,
"rewards/margins": 0.05301886796951294,
"rewards/rejected": -0.470212459564209,
"step": 2420
},
{
"epoch": 0.83,
"learning_rate": 4.1274309934477454e-07,
"logits/chosen": -2.106175422668457,
"logits/rejected": -1.8884546756744385,
"logps/chosen": -612.33544921875,
"logps/rejected": -652.5370483398438,
"loss": 0.0818,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.39127975702285767,
"rewards/margins": 0.054196156561374664,
"rewards/rejected": -0.44547590613365173,
"step": 2430
},
{
"epoch": 0.84,
"learning_rate": 3.964251180846826e-07,
"logits/chosen": -2.3211159706115723,
"logits/rejected": -1.9752849340438843,
"logps/chosen": -544.6336669921875,
"logps/rejected": -604.1102294921875,
"loss": 0.0891,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.34942182898521423,
"rewards/margins": 0.07262709736824036,
"rewards/rejected": -0.422048956155777,
"step": 2440
},
{
"epoch": 0.84,
"learning_rate": 3.8040843321068746e-07,
"logits/chosen": -2.1404693126678467,
"logits/rejected": -1.9147093296051025,
"logps/chosen": -709.58154296875,
"logps/rejected": -804.4060668945312,
"loss": 0.0657,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.48798832297325134,
"rewards/margins": 0.10476765781641006,
"rewards/rejected": -0.5927559733390808,
"step": 2450
},
{
"epoch": 0.84,
"learning_rate": 3.646953388028854e-07,
"logits/chosen": -2.04292631149292,
"logits/rejected": -1.5764219760894775,
"logps/chosen": -687.10546875,
"logps/rejected": -760.7067260742188,
"loss": 0.0816,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4793581962585449,
"rewards/margins": 0.11864249408245087,
"rewards/rejected": -0.598000705242157,
"step": 2460
},
{
"epoch": 0.85,
"learning_rate": 3.4928808545791614e-07,
"logits/chosen": -2.073615550994873,
"logits/rejected": -2.1788148880004883,
"logps/chosen": -638.221435546875,
"logps/rejected": -761.6395263671875,
"loss": 0.0891,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.44060444831848145,
"rewards/margins": 0.09731185436248779,
"rewards/rejected": -0.5379163026809692,
"step": 2470
},
{
"epoch": 0.85,
"learning_rate": 3.341888799666068e-07,
"logits/chosen": -2.262629747390747,
"logits/rejected": -1.9284954071044922,
"logps/chosen": -686.7388916015625,
"logps/rejected": -743.3775634765625,
"loss": 0.0855,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4059697091579437,
"rewards/margins": 0.09988965839147568,
"rewards/rejected": -0.505859375,
"step": 2480
},
{
"epoch": 0.85,
"learning_rate": 3.1939988499789075e-07,
"logits/chosen": -2.0978739261627197,
"logits/rejected": -1.8978351354599,
"logps/chosen": -708.4168701171875,
"logps/rejected": -820.39404296875,
"loss": 0.0743,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.49596619606018066,
"rewards/margins": 0.1125420555472374,
"rewards/rejected": -0.6085082292556763,
"step": 2490
},
{
"epoch": 0.86,
"learning_rate": 3.0492321878904913e-07,
"logits/chosen": -2.175656795501709,
"logits/rejected": -1.7882719039916992,
"logps/chosen": -786.6962280273438,
"logps/rejected": -890.03466796875,
"loss": 0.1061,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5101443529129028,
"rewards/margins": 0.11035291850566864,
"rewards/rejected": -0.6204972267150879,
"step": 2500
},
{
"epoch": 0.86,
"learning_rate": 2.907609548423135e-07,
"logits/chosen": -2.3059334754943848,
"logits/rejected": -2.0130183696746826,
"logps/chosen": -542.1705322265625,
"logps/rejected": -715.5118408203125,
"loss": 0.0811,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.36655205488204956,
"rewards/margins": 0.14779023826122284,
"rewards/rejected": -0.5143422484397888,
"step": 2510
},
{
"epoch": 0.86,
"learning_rate": 2.7691512162787567e-07,
"logits/chosen": -2.0405287742614746,
"logits/rejected": -2.0915091037750244,
"logps/chosen": -702.5558471679688,
"logps/rejected": -788.6998291015625,
"loss": 0.0696,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.4405028223991394,
"rewards/margins": 0.09354646503925323,
"rewards/rejected": -0.5340492129325867,
"step": 2520
},
{
"epoch": 0.87,
"learning_rate": 2.6338770229335176e-07,
"logits/chosen": -2.1023449897766113,
"logits/rejected": -1.8752168416976929,
"logps/chosen": -760.4970092773438,
"logps/rejected": -833.1339721679688,
"loss": 0.0622,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.46654874086380005,
"rewards/margins": 0.10432098805904388,
"rewards/rejected": -0.5708697438240051,
"step": 2530
},
{
"epoch": 0.87,
"learning_rate": 2.501806343797303e-07,
"logits/chosen": -2.3457446098327637,
"logits/rejected": -2.3163278102874756,
"logps/chosen": -633.0409545898438,
"logps/rejected": -750.288818359375,
"loss": 0.093,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.43354344367980957,
"rewards/margins": 0.08344617486000061,
"rewards/rejected": -0.516989529132843,
"step": 2540
},
{
"epoch": 0.87,
"learning_rate": 2.3729580954386183e-07,
"logits/chosen": -2.204981803894043,
"logits/rejected": -1.9701652526855469,
"logps/chosen": -688.1578979492188,
"logps/rejected": -808.5591430664062,
"loss": 0.061,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.44313064217567444,
"rewards/margins": 0.14626091718673706,
"rewards/rejected": -0.5893915891647339,
"step": 2550
},
{
"epoch": 0.88,
"learning_rate": 2.2473507328751086e-07,
"logits/chosen": -1.9686768054962158,
"logits/rejected": -1.7817811965942383,
"logps/chosen": -637.0069580078125,
"logps/rejected": -730.8543090820312,
"loss": 0.0792,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3913404643535614,
"rewards/margins": 0.11009415239095688,
"rewards/rejected": -0.5014346241950989,
"step": 2560
},
{
"epoch": 0.88,
"learning_rate": 2.1250022469302745e-07,
"logits/chosen": -2.3968021869659424,
"logits/rejected": -1.9710814952850342,
"logps/chosen": -783.5001220703125,
"logps/rejected": -783.8637084960938,
"loss": 0.0559,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4583209156990051,
"rewards/margins": 0.07906268537044525,
"rewards/rejected": -0.537383496761322,
"step": 2570
},
{
"epoch": 0.88,
"learning_rate": 2.0059301616566107e-07,
"logits/chosen": -1.9925310611724854,
"logits/rejected": -2.1229825019836426,
"logps/chosen": -628.9017944335938,
"logps/rejected": -814.9282836914062,
"loss": 0.0987,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.48036131262779236,
"rewards/margins": 0.1412503570318222,
"rewards/rejected": -0.6216117143630981,
"step": 2580
},
{
"epoch": 0.89,
"learning_rate": 1.8901515318256318e-07,
"logits/chosen": -2.3602724075317383,
"logits/rejected": -1.8974645137786865,
"logps/chosen": -728.95947265625,
"logps/rejected": -866.9889526367188,
"loss": 0.053,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4281562268733978,
"rewards/margins": 0.11615494638681412,
"rewards/rejected": -0.5443111658096313,
"step": 2590
},
{
"epoch": 0.89,
"learning_rate": 1.7776829404851092e-07,
"logits/chosen": -2.2596402168273926,
"logits/rejected": -1.877781867980957,
"logps/chosen": -777.1273193359375,
"logps/rejected": -868.2491455078125,
"loss": 0.0797,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4920421242713928,
"rewards/margins": 0.14914286136627197,
"rewards/rejected": -0.64118492603302,
"step": 2600
},
{
"epoch": 0.89,
"learning_rate": 1.6685404965838647e-07,
"logits/chosen": -2.3345799446105957,
"logits/rejected": -2.0257980823516846,
"logps/chosen": -681.239990234375,
"logps/rejected": -695.14501953125,
"loss": 0.0789,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4106011986732483,
"rewards/margins": 0.07871082425117493,
"rewards/rejected": -0.48931199312210083,
"step": 2610
},
{
"epoch": 0.9,
"learning_rate": 1.5627398326644811e-07,
"logits/chosen": -2.2587788105010986,
"logits/rejected": -2.0362513065338135,
"logps/chosen": -705.2887573242188,
"logps/rejected": -740.9500122070312,
"loss": 0.0835,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.45956987142562866,
"rewards/margins": 0.06559441983699799,
"rewards/rejected": -0.5251643061637878,
"step": 2620
},
{
"epoch": 0.9,
"learning_rate": 1.460296102624248e-07,
"logits/chosen": -2.258861541748047,
"logits/rejected": -2.2292912006378174,
"logps/chosen": -662.3797607421875,
"logps/rejected": -802.5979614257812,
"loss": 0.0791,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.43469151854515076,
"rewards/margins": 0.11645804345607758,
"rewards/rejected": -0.5511494874954224,
"step": 2630
},
{
"epoch": 0.9,
"learning_rate": 1.3612239795446348e-07,
"logits/chosen": -2.229052782058716,
"logits/rejected": -1.8262239694595337,
"logps/chosen": -585.1064453125,
"logps/rejected": -665.6681518554688,
"loss": 0.0482,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3851460814476013,
"rewards/margins": 0.09434196352958679,
"rewards/rejected": -0.4794880449771881,
"step": 2640
},
{
"epoch": 0.91,
"learning_rate": 1.2655376535896852e-07,
"logits/chosen": -2.274597644805908,
"logits/rejected": -1.9262031316757202,
"logps/chosen": -641.3375244140625,
"logps/rejected": -756.7232666015625,
"loss": 0.0618,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3911328911781311,
"rewards/margins": 0.13705289363861084,
"rewards/rejected": -0.5281857252120972,
"step": 2650
},
{
"epoch": 0.91,
"learning_rate": 1.1732508299735379e-07,
"logits/chosen": -2.2501373291015625,
"logits/rejected": -1.8707962036132812,
"logps/chosen": -603.0183715820312,
"logps/rejected": -690.1578369140625,
"loss": 0.0705,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.39655619859695435,
"rewards/margins": 0.1162206381559372,
"rewards/rejected": -0.5127768516540527,
"step": 2660
},
{
"epoch": 0.92,
"learning_rate": 1.0843767269974131e-07,
"logits/chosen": -2.2717010974884033,
"logits/rejected": -2.084096908569336,
"logps/chosen": -690.6661987304688,
"logps/rejected": -738.0612182617188,
"loss": 0.0704,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.44601020216941833,
"rewards/margins": 0.0691831111907959,
"rewards/rejected": -0.5151932239532471,
"step": 2670
},
{
"epoch": 0.92,
"learning_rate": 9.989280741563689e-08,
"logits/chosen": -2.2434730529785156,
"logits/rejected": -1.928205132484436,
"logps/chosen": -684.7159423828125,
"logps/rejected": -753.5374755859375,
"loss": 0.0737,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.42067623138427734,
"rewards/margins": 0.11073021590709686,
"rewards/rejected": -0.5314064621925354,
"step": 2680
},
{
"epoch": 0.92,
"learning_rate": 9.169171103160123e-08,
"logits/chosen": -2.2055764198303223,
"logits/rejected": -1.9301769733428955,
"logps/chosen": -670.9398803710938,
"logps/rejected": -797.8900756835938,
"loss": 0.0828,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4384661316871643,
"rewards/margins": 0.12283768504858017,
"rewards/rejected": -0.5613037347793579,
"step": 2690
},
{
"epoch": 0.93,
"learning_rate": 8.383555819595601e-08,
"logits/chosen": -2.2271904945373535,
"logits/rejected": -2.005030393600464,
"logps/chosen": -687.7010498046875,
"logps/rejected": -819.2864379882812,
"loss": 0.0767,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4591870903968811,
"rewards/margins": 0.14114715158939362,
"rewards/rejected": -0.6003342270851135,
"step": 2700
},
{
"epoch": 0.93,
"learning_rate": 7.632547415053482e-08,
"logits/chosen": -2.4016735553741455,
"logits/rejected": -2.1341352462768555,
"logps/chosen": -751.8325805664062,
"logps/rejected": -827.7970581054688,
"loss": 0.1146,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.48245877027511597,
"rewards/margins": 0.08054832369089127,
"rewards/rejected": -0.5630070567131042,
"step": 2710
},
{
"epoch": 0.93,
"learning_rate": 6.916253456951572e-08,
"logits/chosen": -2.2689526081085205,
"logits/rejected": -2.0502147674560547,
"logps/chosen": -725.8679809570312,
"logps/rejected": -830.0611572265625,
"loss": 0.042,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.46661239862442017,
"rewards/margins": 0.1116294115781784,
"rewards/rejected": -0.578241765499115,
"step": 2720
},
{
"epoch": 0.94,
"learning_rate": 6.23477654053517e-08,
"logits/chosen": -2.389768600463867,
"logits/rejected": -1.788630723953247,
"logps/chosen": -633.7202758789062,
"logps/rejected": -682.6315307617188,
"loss": 0.078,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.37593162059783936,
"rewards/margins": 0.1151694804430008,
"rewards/rejected": -0.49110108613967896,
"step": 2730
},
{
"epoch": 0.94,
"learning_rate": 5.588214274182158e-08,
"logits/chosen": -2.29005765914917,
"logits/rejected": -1.8733975887298584,
"logps/chosen": -736.537841796875,
"logps/rejected": -748.2416381835938,
"loss": 0.1075,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.42465394735336304,
"rewards/margins": 0.09247386455535889,
"rewards/rejected": -0.5171278119087219,
"step": 2740
},
{
"epoch": 0.94,
"learning_rate": 4.9766592654227344e-08,
"logits/chosen": -2.290553092956543,
"logits/rejected": -1.8525292873382568,
"logps/chosen": -776.4290161132812,
"logps/rejected": -816.3228149414062,
"loss": 0.0501,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.49286454916000366,
"rewards/margins": 0.10129784047603607,
"rewards/rejected": -0.5941623449325562,
"step": 2750
},
{
"epoch": 0.95,
"learning_rate": 4.400199107674946e-08,
"logits/chosen": -2.1630892753601074,
"logits/rejected": -1.940159559249878,
"logps/chosen": -665.2659912109375,
"logps/rejected": -710.6292724609375,
"loss": 0.0637,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4292607307434082,
"rewards/margins": 0.062006641179323196,
"rewards/rejected": -0.4912673532962799,
"step": 2760
},
{
"epoch": 0.95,
"learning_rate": 3.8589163676986674e-08,
"logits/chosen": -2.389782190322876,
"logits/rejected": -2.072308301925659,
"logps/chosen": -739.9847412109375,
"logps/rejected": -799.0582885742188,
"loss": 0.0508,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.42423492670059204,
"rewards/margins": 0.11466383934020996,
"rewards/rejected": -0.538898766040802,
"step": 2770
},
{
"epoch": 0.95,
"learning_rate": 3.3528885737696136e-08,
"logits/chosen": -2.229954957962036,
"logits/rejected": -2.001962661743164,
"logps/chosen": -763.1449584960938,
"logps/rejected": -890.884765625,
"loss": 0.0776,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4881008267402649,
"rewards/margins": 0.1101212278008461,
"rewards/rejected": -0.598222017288208,
"step": 2780
},
{
"epoch": 0.96,
"learning_rate": 2.8821882045748928e-08,
"logits/chosen": -2.1927006244659424,
"logits/rejected": -1.9519379138946533,
"logps/chosen": -609.7821044921875,
"logps/rejected": -670.4013671875,
"loss": 0.1037,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.43954816460609436,
"rewards/margins": 0.08661254495382309,
"rewards/rejected": -0.526160717010498,
"step": 2790
},
{
"epoch": 0.96,
"learning_rate": 2.4468826788316967e-08,
"logits/chosen": -2.2692456245422363,
"logits/rejected": -1.8533859252929688,
"logps/chosen": -694.7283935546875,
"logps/rejected": -838.3411865234375,
"loss": 0.0369,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.41949623823165894,
"rewards/margins": 0.1585858017206192,
"rewards/rejected": -0.5780820846557617,
"step": 2800
},
{
"epoch": 0.96,
"learning_rate": 2.0470343456310827e-08,
"logits/chosen": -2.1465401649475098,
"logits/rejected": -1.92649245262146,
"logps/chosen": -801.5432739257812,
"logps/rejected": -876.83203125,
"loss": 0.0925,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.5086441040039062,
"rewards/margins": 0.1165408119559288,
"rewards/rejected": -0.6251848936080933,
"step": 2810
},
{
"epoch": 0.97,
"learning_rate": 1.682700475507476e-08,
"logits/chosen": -2.319272518157959,
"logits/rejected": -2.125800609588623,
"logps/chosen": -730.833740234375,
"logps/rejected": -750.211669921875,
"loss": 0.08,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.44253936409950256,
"rewards/margins": 0.06718467175960541,
"rewards/rejected": -0.5097240209579468,
"step": 2820
},
{
"epoch": 0.97,
"learning_rate": 1.3539332522359282e-08,
"logits/chosen": -2.2746429443359375,
"logits/rejected": -1.9740593433380127,
"logps/chosen": -795.7684936523438,
"logps/rejected": -879.1627197265625,
"loss": 0.0623,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.519045352935791,
"rewards/margins": 0.11925999075174332,
"rewards/rejected": -0.6383053660392761,
"step": 2830
},
{
"epoch": 0.97,
"learning_rate": 1.0607797653577333e-08,
"logits/chosen": -2.160733938217163,
"logits/rejected": -1.9417308568954468,
"logps/chosen": -561.7853393554688,
"logps/rejected": -670.7401733398438,
"loss": 0.0739,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3653794825077057,
"rewards/margins": 0.09239096939563751,
"rewards/rejected": -0.457770437002182,
"step": 2840
},
{
"epoch": 0.98,
"learning_rate": 8.032820034357126e-09,
"logits/chosen": -2.2955965995788574,
"logits/rejected": -2.136737585067749,
"logps/chosen": -670.2855224609375,
"logps/rejected": -771.9925537109375,
"loss": 0.0742,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4561973214149475,
"rewards/margins": 0.10345951467752457,
"rewards/rejected": -0.5596567988395691,
"step": 2850
},
{
"epoch": 0.98,
"learning_rate": 5.814768480403021e-09,
"logits/chosen": -2.0290589332580566,
"logits/rejected": -1.981529951095581,
"logps/chosen": -576.2341918945312,
"logps/rejected": -762.9359130859375,
"loss": 0.0692,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4308190941810608,
"rewards/margins": 0.13548357784748077,
"rewards/rejected": -0.5663026571273804,
"step": 2860
},
{
"epoch": 0.98,
"learning_rate": 3.953960684668634e-09,
"logits/chosen": -2.110337018966675,
"logits/rejected": -2.0485644340515137,
"logps/chosen": -660.661865234375,
"logps/rejected": -738.81884765625,
"loss": 0.0598,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.44688519835472107,
"rewards/margins": 0.08198593556880951,
"rewards/rejected": -0.5288710594177246,
"step": 2870
},
{
"epoch": 0.99,
"learning_rate": 2.4506631718534956e-09,
"logits/chosen": -2.2245066165924072,
"logits/rejected": -1.9781780242919922,
"logps/chosen": -732.562255859375,
"logps/rejected": -833.1080932617188,
"loss": 0.0685,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.48917946219444275,
"rewards/margins": 0.0765211433172226,
"rewards/rejected": -0.5657006502151489,
"step": 2880
},
{
"epoch": 0.99,
"learning_rate": 1.3050912602297071e-09,
"logits/chosen": -2.0510215759277344,
"logits/rejected": -1.8791606426239014,
"logps/chosen": -708.2174072265625,
"logps/rejected": -814.8040161132812,
"loss": 0.079,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.46390801668167114,
"rewards/margins": 0.11161540448665619,
"rewards/rejected": -0.5755234360694885,
"step": 2890
},
{
"epoch": 0.99,
"learning_rate": 5.1740903080022e-10,
"logits/chosen": -2.4098868370056152,
"logits/rejected": -1.9928724765777588,
"logps/chosen": -596.1256713867188,
"logps/rejected": -630.7376708984375,
"loss": 0.0822,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3659174144268036,
"rewards/margins": 0.11456866562366486,
"rewards/rejected": -0.48048609495162964,
"step": 2900
},
{
"epoch": 1.0,
"learning_rate": 8.772930379846723e-11,
"logits/chosen": -2.240025043487549,
"logits/rejected": -2.1385819911956787,
"logps/chosen": -616.0072021484375,
"logps/rejected": -749.4381103515625,
"loss": 0.0707,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4194551110267639,
"rewards/margins": 0.09669093787670135,
"rewards/rejected": -0.5161460041999817,
"step": 2910
},
{
"epoch": 1.0,
"step": 2917,
"total_flos": 0.0,
"train_loss": 0.0819665433958372,
"train_runtime": 16805.1619,
"train_samples_per_second": 1.389,
"train_steps_per_second": 0.174
}
],
"logging_steps": 10,
"max_steps": 2917,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}