BraylonDash's picture
Model save
0621f73 verified
raw
history blame
60.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 4e-08,
"logits/chosen": -0.09526942670345306,
"logits/rejected": -0.23948004841804504,
"logps/chosen": -3969.244140625,
"logps/rejected": -2912.11376953125,
"loss": 0.1112,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 4.0000000000000003e-07,
"logits/chosen": -0.11336694657802582,
"logits/rejected": -0.21862205862998962,
"logps/chosen": -3883.32763671875,
"logps/rejected": -3105.751708984375,
"loss": 0.1476,
"rewards/accuracies": 0.3541666567325592,
"rewards/chosen": -6.194857178343227e-06,
"rewards/margins": -7.76553206378594e-05,
"rewards/rejected": 7.146046118577942e-05,
"step": 10
},
{
"epoch": 0.02,
"learning_rate": 8.000000000000001e-07,
"logits/chosen": -0.11791403591632843,
"logits/rejected": -0.19368262588977814,
"logps/chosen": -3725.202392578125,
"logps/rejected": -3018.818603515625,
"loss": 0.1581,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.00026606960454955697,
"rewards/margins": -6.655660399701446e-05,
"rewards/rejected": 0.00033262622309848666,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 1.2000000000000002e-06,
"logits/chosen": -0.10180139541625977,
"logits/rejected": -0.1368643343448639,
"logps/chosen": -3734.001953125,
"logps/rejected": -3405.164794921875,
"loss": 0.1348,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.001434823265299201,
"rewards/margins": 0.0006993044517003,
"rewards/rejected": 0.000735518871806562,
"step": 30
},
{
"epoch": 0.03,
"learning_rate": 1.6000000000000001e-06,
"logits/chosen": -0.11290457099676132,
"logits/rejected": -0.15302149951457977,
"logps/chosen": -3876.04052734375,
"logps/rejected": -3476.99365234375,
"loss": 0.1472,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.00464777322486043,
"rewards/margins": 0.0033736887853592634,
"rewards/rejected": 0.0012740844395011663,
"step": 40
},
{
"epoch": 0.04,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -0.10660350322723389,
"logits/rejected": -0.16240839660167694,
"logps/chosen": -3837.369873046875,
"logps/rejected": -3373.931640625,
"loss": 0.1426,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.013103676959872246,
"rewards/margins": 0.008479808457195759,
"rewards/rejected": 0.004623868502676487,
"step": 50
},
{
"epoch": 0.05,
"learning_rate": 2.4000000000000003e-06,
"logits/chosen": -0.08258971571922302,
"logits/rejected": -0.15586063265800476,
"logps/chosen": -3829.67041015625,
"logps/rejected": -3369.47119140625,
"loss": 0.1363,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.029313404113054276,
"rewards/margins": 0.022997483611106873,
"rewards/rejected": 0.006315918173640966,
"step": 60
},
{
"epoch": 0.06,
"learning_rate": 2.8000000000000003e-06,
"logits/chosen": -0.10445211082696915,
"logits/rejected": -0.180179625749588,
"logps/chosen": -3644.919921875,
"logps/rejected": -3038.00732421875,
"loss": 0.1343,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.05821167677640915,
"rewards/margins": 0.030616426840424538,
"rewards/rejected": 0.02759525738656521,
"step": 70
},
{
"epoch": 0.06,
"learning_rate": 3.2000000000000003e-06,
"logits/chosen": 0.0064078932628035545,
"logits/rejected": -0.10336550325155258,
"logps/chosen": -3655.2109375,
"logps/rejected": -3017.391357421875,
"loss": 0.1205,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.0925077348947525,
"rewards/margins": 0.05041419342160225,
"rewards/rejected": 0.04209354892373085,
"step": 80
},
{
"epoch": 0.07,
"learning_rate": 3.6000000000000003e-06,
"logits/chosen": 0.034665923565626144,
"logits/rejected": -0.03560350090265274,
"logps/chosen": -3621.365234375,
"logps/rejected": -3200.21240234375,
"loss": 0.1068,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.1416129767894745,
"rewards/margins": 0.08034636825323105,
"rewards/rejected": 0.06126661226153374,
"step": 90
},
{
"epoch": 0.08,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": 0.04580535367131233,
"logits/rejected": -0.005379015114158392,
"logps/chosen": -3695.321533203125,
"logps/rejected": -3426.08447265625,
"loss": 0.1051,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.17054443061351776,
"rewards/margins": 0.08689786493778229,
"rewards/rejected": 0.08364654332399368,
"step": 100
},
{
"epoch": 0.09,
"learning_rate": 4.4e-06,
"logits/chosen": 0.05780696123838425,
"logits/rejected": -0.00175203918479383,
"logps/chosen": -3636.29638671875,
"logps/rejected": -3395.66748046875,
"loss": 0.1036,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.2138124257326126,
"rewards/margins": 0.0803312435746193,
"rewards/rejected": 0.1334811896085739,
"step": 110
},
{
"epoch": 0.1,
"learning_rate": 4.800000000000001e-06,
"logits/chosen": 0.06642362475395203,
"logits/rejected": -0.037016235291957855,
"logps/chosen": -3482.66015625,
"logps/rejected": -2920.569580078125,
"loss": 0.1081,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.2094995528459549,
"rewards/margins": 0.10118832439184189,
"rewards/rejected": 0.1083112508058548,
"step": 120
},
{
"epoch": 0.1,
"learning_rate": 4.999756310023261e-06,
"logits/chosen": 0.13063621520996094,
"logits/rejected": 0.06078845262527466,
"logps/chosen": -3470.27197265625,
"logps/rejected": -3143.759033203125,
"loss": 0.0841,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.2582111954689026,
"rewards/margins": 0.10591275990009308,
"rewards/rejected": 0.1522984504699707,
"step": 130
},
{
"epoch": 0.11,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": 0.16255763173103333,
"logits/rejected": 0.07761454582214355,
"logps/chosen": -3592.858642578125,
"logps/rejected": -3239.969482421875,
"loss": 0.1071,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.2606235444545746,
"rewards/margins": 0.11447002738714218,
"rewards/rejected": 0.1461535096168518,
"step": 140
},
{
"epoch": 0.12,
"learning_rate": 4.993910125649561e-06,
"logits/chosen": 0.16677160561084747,
"logits/rejected": 0.08901594579219818,
"logps/chosen": -3506.703857421875,
"logps/rejected": -3104.87255859375,
"loss": 0.0945,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.2724655568599701,
"rewards/margins": 0.12591706216335297,
"rewards/rejected": 0.14654847979545593,
"step": 150
},
{
"epoch": 0.13,
"learning_rate": 4.988068499954578e-06,
"logits/chosen": 0.22131207585334778,
"logits/rejected": 0.13661739230155945,
"logps/chosen": -3316.39208984375,
"logps/rejected": -2960.86083984375,
"loss": 0.0905,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.2880935072898865,
"rewards/margins": 0.1261982023715973,
"rewards/rejected": 0.16189530491828918,
"step": 160
},
{
"epoch": 0.14,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": 0.26340895891189575,
"logits/rejected": 0.17107249796390533,
"logps/chosen": -3592.39306640625,
"logps/rejected": -3124.1767578125,
"loss": 0.0955,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.30015623569488525,
"rewards/margins": 0.14308349788188934,
"rewards/rejected": 0.15707270801067352,
"step": 170
},
{
"epoch": 0.14,
"learning_rate": 4.970570953616383e-06,
"logits/chosen": 0.2763732671737671,
"logits/rejected": 0.1752476692199707,
"logps/chosen": -3527.703125,
"logps/rejected": -3171.340576171875,
"loss": 0.0918,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3266971707344055,
"rewards/margins": 0.15975165367126465,
"rewards/rejected": 0.16694548726081848,
"step": 180
},
{
"epoch": 0.15,
"learning_rate": 4.958928677033465e-06,
"logits/chosen": 0.2631959021091461,
"logits/rejected": 0.21199622750282288,
"logps/chosen": -3533.75341796875,
"logps/rejected": -3337.073486328125,
"loss": 0.0921,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.3471175730228424,
"rewards/margins": 0.11177588999271393,
"rewards/rejected": 0.23534169793128967,
"step": 190
},
{
"epoch": 0.16,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": 0.2420426905155182,
"logits/rejected": 0.1836567521095276,
"logps/chosen": -3306.966064453125,
"logps/rejected": -2964.663818359375,
"loss": 0.0895,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.32517337799072266,
"rewards/margins": 0.12965548038482666,
"rewards/rejected": 0.19551792740821838,
"step": 200
},
{
"epoch": 0.17,
"learning_rate": 4.9299025014463665e-06,
"logits/chosen": 0.23070940375328064,
"logits/rejected": 0.17138567566871643,
"logps/chosen": -3292.091796875,
"logps/rejected": -2975.43115234375,
"loss": 0.1009,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.328936368227005,
"rewards/margins": 0.11976213753223419,
"rewards/rejected": 0.20917420089244843,
"step": 210
},
{
"epoch": 0.18,
"learning_rate": 4.912541236180779e-06,
"logits/chosen": 0.15947876870632172,
"logits/rejected": 0.08568959683179855,
"logps/chosen": -3484.258544921875,
"logps/rejected": -3174.15087890625,
"loss": 0.0956,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.34722018241882324,
"rewards/margins": 0.155741885304451,
"rewards/rejected": 0.19147828221321106,
"step": 220
},
{
"epoch": 0.18,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": 0.22774775326251984,
"logits/rejected": 0.13324826955795288,
"logps/chosen": -3539.89453125,
"logps/rejected": -3201.11279296875,
"loss": 0.0968,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.38138940930366516,
"rewards/margins": 0.15053245425224304,
"rewards/rejected": 0.23085694015026093,
"step": 230
},
{
"epoch": 0.19,
"learning_rate": 4.8721900291112415e-06,
"logits/chosen": 0.24971647560596466,
"logits/rejected": 0.18798983097076416,
"logps/chosen": -3449.693359375,
"logps/rejected": -3182.7216796875,
"loss": 0.1007,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.4053890109062195,
"rewards/margins": 0.12963128089904785,
"rewards/rejected": 0.27575770020484924,
"step": 240
},
{
"epoch": 0.2,
"learning_rate": 4.849231551964771e-06,
"logits/chosen": 0.290091335773468,
"logits/rejected": 0.2263306826353073,
"logps/chosen": -3255.100830078125,
"logps/rejected": -2931.810546875,
"loss": 0.0973,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.38671380281448364,
"rewards/margins": 0.1347990781068802,
"rewards/rejected": 0.2519146800041199,
"step": 250
},
{
"epoch": 0.21,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": 0.35076624155044556,
"logits/rejected": 0.26289868354797363,
"logps/chosen": -3281.548828125,
"logps/rejected": -2823.29052734375,
"loss": 0.0937,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.3694974184036255,
"rewards/margins": 0.1352803260087967,
"rewards/rejected": 0.2342170774936676,
"step": 260
},
{
"epoch": 0.22,
"learning_rate": 4.7978383481380865e-06,
"logits/chosen": 0.3338952660560608,
"logits/rejected": 0.30002114176750183,
"logps/chosen": -3367.43115234375,
"logps/rejected": -3190.338623046875,
"loss": 0.0961,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.38234108686447144,
"rewards/margins": 0.14520631730556488,
"rewards/rejected": 0.23713478446006775,
"step": 270
},
{
"epoch": 0.22,
"learning_rate": 4.769443696332272e-06,
"logits/chosen": 0.4176582396030426,
"logits/rejected": 0.34488004446029663,
"logps/chosen": -3451.33447265625,
"logps/rejected": -3161.616943359375,
"loss": 0.091,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3714825510978699,
"rewards/margins": 0.14414021372795105,
"rewards/rejected": 0.22734233736991882,
"step": 280
},
{
"epoch": 0.23,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": 0.39374208450317383,
"logits/rejected": 0.36565738916397095,
"logps/chosen": -3225.15380859375,
"logps/rejected": -3087.46728515625,
"loss": 0.1046,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.3453277051448822,
"rewards/margins": 0.16379894316196442,
"rewards/rejected": 0.18152877688407898,
"step": 290
},
{
"epoch": 0.24,
"learning_rate": 4.707368982147318e-06,
"logits/chosen": 0.39281272888183594,
"logits/rejected": 0.36582762002944946,
"logps/chosen": -3398.608154296875,
"logps/rejected": -3263.87939453125,
"loss": 0.0904,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.3241587281227112,
"rewards/margins": 0.14963512122631073,
"rewards/rejected": 0.17452362179756165,
"step": 300
},
{
"epoch": 0.25,
"learning_rate": 4.673737323763048e-06,
"logits/chosen": 0.30504000186920166,
"logits/rejected": 0.22714261710643768,
"logps/chosen": -3418.66748046875,
"logps/rejected": -3106.970947265625,
"loss": 0.0872,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3434060215950012,
"rewards/margins": 0.173831969499588,
"rewards/rejected": 0.16957402229309082,
"step": 310
},
{
"epoch": 0.26,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": 0.34125423431396484,
"logits/rejected": 0.2718544602394104,
"logps/chosen": -3394.98193359375,
"logps/rejected": -3080.618408203125,
"loss": 0.0786,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.32423415780067444,
"rewards/margins": 0.1511869728565216,
"rewards/rejected": 0.17304711043834686,
"step": 320
},
{
"epoch": 0.26,
"learning_rate": 4.601416508739211e-06,
"logits/chosen": 0.36246171593666077,
"logits/rejected": 0.3016008734703064,
"logps/chosen": -3435.645263671875,
"logps/rejected": -3126.3828125,
"loss": 0.0868,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.31689783930778503,
"rewards/margins": 0.16403648257255554,
"rewards/rejected": 0.1528613567352295,
"step": 330
},
{
"epoch": 0.27,
"learning_rate": 4.562783745695738e-06,
"logits/chosen": 0.3510586619377136,
"logits/rejected": 0.254965603351593,
"logps/chosen": -3339.10986328125,
"logps/rejected": -2880.090576171875,
"loss": 0.0873,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.3381843566894531,
"rewards/margins": 0.14746293425559998,
"rewards/rejected": 0.19072142243385315,
"step": 340
},
{
"epoch": 0.28,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": 0.3423737585544586,
"logits/rejected": 0.2638542056083679,
"logps/chosen": -3313.405517578125,
"logps/rejected": -2987.49609375,
"loss": 0.0887,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.35876303911209106,
"rewards/margins": 0.15431997179985046,
"rewards/rejected": 0.204443097114563,
"step": 350
},
{
"epoch": 0.29,
"learning_rate": 4.4807241083879774e-06,
"logits/chosen": 0.35648784041404724,
"logits/rejected": 0.276287704706192,
"logps/chosen": -3448.21142578125,
"logps/rejected": -3130.1142578125,
"loss": 0.0875,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.3623107969760895,
"rewards/margins": 0.15894022583961487,
"rewards/rejected": 0.2033705711364746,
"step": 360
},
{
"epoch": 0.3,
"learning_rate": 4.437361221760449e-06,
"logits/chosen": 0.42664772272109985,
"logits/rejected": 0.379373162984848,
"logps/chosen": -3359.43603515625,
"logps/rejected": -3039.10693359375,
"loss": 0.1004,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.34946465492248535,
"rewards/margins": 0.14916878938674927,
"rewards/rejected": 0.20029588043689728,
"step": 370
},
{
"epoch": 0.3,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": 0.467120498418808,
"logits/rejected": 0.40080124139785767,
"logps/chosen": -3476.703125,
"logps/rejected": -3288.689453125,
"loss": 0.0805,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.3640100955963135,
"rewards/margins": 0.17574051022529602,
"rewards/rejected": 0.18826961517333984,
"step": 380
},
{
"epoch": 0.31,
"learning_rate": 4.346138351564711e-06,
"logits/chosen": 0.42980876564979553,
"logits/rejected": 0.3273950517177582,
"logps/chosen": -3256.771484375,
"logps/rejected": -2806.90625,
"loss": 0.0925,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.3438069224357605,
"rewards/margins": 0.14692214131355286,
"rewards/rejected": 0.19688478112220764,
"step": 390
},
{
"epoch": 0.32,
"learning_rate": 4.2983495008466285e-06,
"logits/chosen": 0.4755614697933197,
"logits/rejected": 0.4100918769836426,
"logps/chosen": -3387.567626953125,
"logps/rejected": -3176.111572265625,
"loss": 0.1011,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3908953368663788,
"rewards/margins": 0.14914441108703613,
"rewards/rejected": 0.24175091087818146,
"step": 400
},
{
"epoch": 0.33,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": 0.43400949239730835,
"logits/rejected": 0.35584893822669983,
"logps/chosen": -3405.94384765625,
"logps/rejected": -3099.514404296875,
"loss": 0.0885,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.3543582856655121,
"rewards/margins": 0.15985527634620667,
"rewards/rejected": 0.1945030391216278,
"step": 410
},
{
"epoch": 0.34,
"learning_rate": 4.198603260653792e-06,
"logits/chosen": 0.4672483801841736,
"logits/rejected": 0.38711977005004883,
"logps/chosen": -3460.95947265625,
"logps/rejected": -3150.413330078125,
"loss": 0.0982,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.38315701484680176,
"rewards/margins": 0.16369104385375977,
"rewards/rejected": 0.219465970993042,
"step": 420
},
{
"epoch": 0.34,
"learning_rate": 4.146723650296701e-06,
"logits/chosen": 0.483567476272583,
"logits/rejected": 0.4229060113430023,
"logps/chosen": -3454.10107421875,
"logps/rejected": -3221.44091796875,
"loss": 0.0865,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.38951292634010315,
"rewards/margins": 0.1551971733570099,
"rewards/rejected": 0.23431572318077087,
"step": 430
},
{
"epoch": 0.35,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": 0.5139747858047485,
"logits/rejected": 0.42474398016929626,
"logps/chosen": -3357.93896484375,
"logps/rejected": -3006.518310546875,
"loss": 0.0832,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.3771596848964691,
"rewards/margins": 0.16284802556037903,
"rewards/rejected": 0.21431168913841248,
"step": 440
},
{
"epoch": 0.36,
"learning_rate": 4.039153688314146e-06,
"logits/chosen": 0.5247809886932373,
"logits/rejected": 0.4323784410953522,
"logps/chosen": -3264.754638671875,
"logps/rejected": -2934.151123046875,
"loss": 0.0968,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.3777909576892853,
"rewards/margins": 0.1437477171421051,
"rewards/rejected": 0.2340432107448578,
"step": 450
},
{
"epoch": 0.37,
"learning_rate": 3.983547216509254e-06,
"logits/chosen": 0.5001921653747559,
"logits/rejected": 0.40190553665161133,
"logps/chosen": -3468.16015625,
"logps/rejected": -3026.26025390625,
"loss": 0.09,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.38785520195961,
"rewards/margins": 0.17032787203788757,
"rewards/rejected": 0.2175273448228836,
"step": 460
},
{
"epoch": 0.38,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": 0.5006144642829895,
"logits/rejected": 0.415066659450531,
"logps/chosen": -3334.10009765625,
"logps/rejected": -2893.51123046875,
"loss": 0.0972,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.3449471592903137,
"rewards/margins": 0.1539374142885208,
"rewards/rejected": 0.1910097301006317,
"step": 470
},
{
"epoch": 0.38,
"learning_rate": 3.868908058731376e-06,
"logits/chosen": 0.5540028810501099,
"logits/rejected": 0.48031479120254517,
"logps/chosen": -3356.93212890625,
"logps/rejected": -2993.399169921875,
"loss": 0.0985,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3838370144367218,
"rewards/margins": 0.14018234610557556,
"rewards/rejected": 0.24365463852882385,
"step": 480
},
{
"epoch": 0.39,
"learning_rate": 3.8099647649251984e-06,
"logits/chosen": 0.5899518728256226,
"logits/rejected": 0.4895103871822357,
"logps/chosen": -3501.290283203125,
"logps/rejected": -3142.33544921875,
"loss": 0.0869,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.386918306350708,
"rewards/margins": 0.17316767573356628,
"rewards/rejected": 0.21375060081481934,
"step": 490
},
{
"epoch": 0.4,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": 0.533571720123291,
"logits/rejected": 0.45087581872940063,
"logps/chosen": -3382.43115234375,
"logps/rejected": -3039.810791015625,
"loss": 0.0938,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.35097357630729675,
"rewards/margins": 0.16260935366153717,
"rewards/rejected": 0.18836425244808197,
"step": 500
},
{
"epoch": 0.41,
"learning_rate": 3.689060522675689e-06,
"logits/chosen": 0.5135028958320618,
"logits/rejected": 0.38714414834976196,
"logps/chosen": -3217.434814453125,
"logps/rejected": -2690.976806640625,
"loss": 0.0962,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.3489993214607239,
"rewards/margins": 0.14681319892406464,
"rewards/rejected": 0.20218610763549805,
"step": 510
},
{
"epoch": 0.42,
"learning_rate": 3.627193851723577e-06,
"logits/chosen": 0.49498695135116577,
"logits/rejected": 0.4188925325870514,
"logps/chosen": -3364.91552734375,
"logps/rejected": -3090.07275390625,
"loss": 0.0802,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.37084800004959106,
"rewards/margins": 0.15501223504543304,
"rewards/rejected": 0.2158357799053192,
"step": 520
},
{
"epoch": 0.42,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": 0.4635513722896576,
"logits/rejected": 0.36989638209342957,
"logps/chosen": -3387.86474609375,
"logps/rejected": -3016.7216796875,
"loss": 0.0943,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.3274918794631958,
"rewards/margins": 0.15053078532218933,
"rewards/rejected": 0.17696109414100647,
"step": 530
},
{
"epoch": 0.43,
"learning_rate": 3.5008725813922383e-06,
"logits/chosen": 0.49751853942871094,
"logits/rejected": 0.3913223147392273,
"logps/chosen": -3440.885986328125,
"logps/rejected": -2968.994873046875,
"loss": 0.0981,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.34751924872398376,
"rewards/margins": 0.14958377182483673,
"rewards/rejected": 0.19793547689914703,
"step": 540
},
{
"epoch": 0.44,
"learning_rate": 3.436516483539781e-06,
"logits/chosen": 0.5272361040115356,
"logits/rejected": 0.40506014227867126,
"logps/chosen": -3340.62060546875,
"logps/rejected": -2864.970947265625,
"loss": 0.0903,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.3856109082698822,
"rewards/margins": 0.15098164975643158,
"rewards/rejected": 0.23462922871112823,
"step": 550
},
{
"epoch": 0.45,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": 0.4976809024810791,
"logits/rejected": 0.4073059558868408,
"logps/chosen": -3362.13134765625,
"logps/rejected": -2928.28515625,
"loss": 0.1007,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.4102960526943207,
"rewards/margins": 0.14597077667713165,
"rewards/rejected": 0.2643252909183502,
"step": 560
},
{
"epoch": 0.46,
"learning_rate": 3.3056642380762783e-06,
"logits/chosen": 0.5032496452331543,
"logits/rejected": 0.40577760338783264,
"logps/chosen": -3327.85888671875,
"logps/rejected": -2854.94970703125,
"loss": 0.1083,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.3718569278717041,
"rewards/margins": 0.14686135947704315,
"rewards/rejected": 0.22499553859233856,
"step": 570
},
{
"epoch": 0.46,
"learning_rate": 3.2392701251101172e-06,
"logits/chosen": 0.5247712731361389,
"logits/rejected": 0.4414879381656647,
"logps/chosen": -3346.58740234375,
"logps/rejected": -3087.098876953125,
"loss": 0.0987,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.38225480914115906,
"rewards/margins": 0.13378790020942688,
"rewards/rejected": 0.24846693873405457,
"step": 580
},
{
"epoch": 0.47,
"learning_rate": 3.1722995515381644e-06,
"logits/chosen": 0.514184296131134,
"logits/rejected": 0.40285858511924744,
"logps/chosen": -3443.8828125,
"logps/rejected": -2874.86181640625,
"loss": 0.0832,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.3588571548461914,
"rewards/margins": 0.15165671706199646,
"rewards/rejected": 0.20720043778419495,
"step": 590
},
{
"epoch": 0.48,
"learning_rate": 3.1048047389991693e-06,
"logits/chosen": 0.5682590007781982,
"logits/rejected": 0.5120213627815247,
"logps/chosen": -3459.860595703125,
"logps/rejected": -3225.1884765625,
"loss": 0.0838,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.35979193449020386,
"rewards/margins": 0.15927888453006744,
"rewards/rejected": 0.2005130797624588,
"step": 600
},
{
"epoch": 0.49,
"learning_rate": 3.0368383179176584e-06,
"logits/chosen": 0.5174692869186401,
"logits/rejected": 0.41469916701316833,
"logps/chosen": -3251.609619140625,
"logps/rejected": -2831.67431640625,
"loss": 0.111,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.3455356955528259,
"rewards/margins": 0.15155228972434998,
"rewards/rejected": 0.19398342072963715,
"step": 610
},
{
"epoch": 0.5,
"learning_rate": 2.9684532864643123e-06,
"logits/chosen": 0.5736369490623474,
"logits/rejected": 0.42854124307632446,
"logps/chosen": -3492.6796875,
"logps/rejected": -2873.034912109375,
"loss": 0.0857,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.3865596354007721,
"rewards/margins": 0.1634531319141388,
"rewards/rejected": 0.2231064736843109,
"step": 620
},
{
"epoch": 0.5,
"learning_rate": 2.8997029692295875e-06,
"logits/chosen": 0.5368185043334961,
"logits/rejected": 0.4608641266822815,
"logps/chosen": -3310.38623046875,
"logps/rejected": -3082.637939453125,
"loss": 0.0929,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.3720545768737793,
"rewards/margins": 0.14468377828598022,
"rewards/rejected": 0.22737076878547668,
"step": 630
},
{
"epoch": 0.51,
"learning_rate": 2.8306409756428067e-06,
"logits/chosen": 0.5856886506080627,
"logits/rejected": 0.4743286669254303,
"logps/chosen": -3404.43310546875,
"logps/rejected": -3081.451904296875,
"loss": 0.0699,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.37632375955581665,
"rewards/margins": 0.16852709650993347,
"rewards/rejected": 0.20779672265052795,
"step": 640
},
{
"epoch": 0.52,
"learning_rate": 2.761321158169134e-06,
"logits/chosen": 0.6226561665534973,
"logits/rejected": 0.5113543272018433,
"logps/chosen": -3556.053955078125,
"logps/rejected": -3128.98486328125,
"loss": 0.0791,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.3752782642841339,
"rewards/margins": 0.17961682379245758,
"rewards/rejected": 0.19566142559051514,
"step": 650
},
{
"epoch": 0.53,
"learning_rate": 2.6917975703170466e-06,
"logits/chosen": 0.6197515726089478,
"logits/rejected": 0.5554805994033813,
"logps/chosen": -3466.33251953125,
"logps/rejected": -3276.38671875,
"loss": 0.0789,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.41308966279029846,
"rewards/margins": 0.15082643926143646,
"rewards/rejected": 0.2622632086277008,
"step": 660
},
{
"epoch": 0.54,
"learning_rate": 2.6221244244890336e-06,
"logits/chosen": 0.6192020773887634,
"logits/rejected": 0.5320231914520264,
"logps/chosen": -3314.12548828125,
"logps/rejected": -3007.730224609375,
"loss": 0.1062,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.4171117842197418,
"rewards/margins": 0.1608533412218094,
"rewards/rejected": 0.25625842809677124,
"step": 670
},
{
"epoch": 0.54,
"learning_rate": 2.5523560497083927e-06,
"logits/chosen": 0.5730468034744263,
"logits/rejected": 0.5238832235336304,
"logps/chosen": -3282.682373046875,
"logps/rejected": -3070.96875,
"loss": 0.1052,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.347822368144989,
"rewards/margins": 0.12122434377670288,
"rewards/rejected": 0.22659802436828613,
"step": 680
},
{
"epoch": 0.55,
"learning_rate": 2.482546849255096e-06,
"logits/chosen": 0.5806129574775696,
"logits/rejected": 0.4810718595981598,
"logps/chosen": -3249.9609375,
"logps/rejected": -2905.1328125,
"loss": 0.0932,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.36439889669418335,
"rewards/margins": 0.15161724388599396,
"rewards/rejected": 0.21278166770935059,
"step": 690
},
{
"epoch": 0.56,
"learning_rate": 2.4127512582437486e-06,
"logits/chosen": 0.5876488089561462,
"logits/rejected": 0.47464412450790405,
"logps/chosen": -3425.721923828125,
"logps/rejected": -3029.7919921875,
"loss": 0.08,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.43426513671875,
"rewards/margins": 0.17609842121601105,
"rewards/rejected": 0.2581667900085449,
"step": 700
},
{
"epoch": 0.57,
"learning_rate": 2.3430237011767166e-06,
"logits/chosen": 0.5383955836296082,
"logits/rejected": 0.4219956398010254,
"logps/chosen": -3329.063720703125,
"logps/rejected": -2868.416015625,
"loss": 0.0784,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.4002605974674225,
"rewards/margins": 0.15942516922950745,
"rewards/rejected": 0.24083542823791504,
"step": 710
},
{
"epoch": 0.58,
"learning_rate": 2.2734185495055503e-06,
"logits/chosen": 0.5360678434371948,
"logits/rejected": 0.4522096514701843,
"logps/chosen": -3392.73388671875,
"logps/rejected": -3113.76708984375,
"loss": 0.1041,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.42230457067489624,
"rewards/margins": 0.14559249579906464,
"rewards/rejected": 0.2767120599746704,
"step": 720
},
{
"epoch": 0.58,
"learning_rate": 2.2039900792337477e-06,
"logits/chosen": 0.5705752968788147,
"logits/rejected": 0.5024815201759338,
"logps/chosen": -3402.189453125,
"logps/rejected": -3169.858154296875,
"loss": 0.0749,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.4054113030433655,
"rewards/margins": 0.16554218530654907,
"rewards/rejected": 0.2398691177368164,
"step": 730
},
{
"epoch": 0.59,
"learning_rate": 2.134792428593971e-06,
"logits/chosen": 0.5779368281364441,
"logits/rejected": 0.4750920832157135,
"logps/chosen": -3396.721923828125,
"logps/rejected": -3118.03076171875,
"loss": 0.0873,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.3966867923736572,
"rewards/margins": 0.15320774912834167,
"rewards/rejected": 0.24347904324531555,
"step": 740
},
{
"epoch": 0.6,
"learning_rate": 2.0658795558326745e-06,
"logits/chosen": 0.5595996379852295,
"logits/rejected": 0.49162426590919495,
"logps/chosen": -3377.83984375,
"logps/rejected": -3203.514404296875,
"loss": 0.0759,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.41549405455589294,
"rewards/margins": 0.15610775351524353,
"rewards/rejected": 0.2593863010406494,
"step": 750
},
{
"epoch": 0.61,
"learning_rate": 1.997305197135089e-06,
"logits/chosen": 0.5741230249404907,
"logits/rejected": 0.499004602432251,
"logps/chosen": -3418.917236328125,
"logps/rejected": -3171.630859375,
"loss": 0.0854,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.38339635729789734,
"rewards/margins": 0.15282298624515533,
"rewards/rejected": 0.2305733859539032,
"step": 760
},
{
"epoch": 0.62,
"learning_rate": 1.9291228247233607e-06,
"logits/chosen": 0.5983024835586548,
"logits/rejected": 0.5140877962112427,
"logps/chosen": -3385.004638671875,
"logps/rejected": -3130.511474609375,
"loss": 0.076,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.41556110978126526,
"rewards/margins": 0.18630118668079376,
"rewards/rejected": 0.2292599231004715,
"step": 770
},
{
"epoch": 0.62,
"learning_rate": 1.8613856051605242e-06,
"logits/chosen": 0.5828371644020081,
"logits/rejected": 0.509624719619751,
"logps/chosen": -3329.71923828125,
"logps/rejected": -3106.307861328125,
"loss": 0.0886,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.3785203993320465,
"rewards/margins": 0.17451588809490204,
"rewards/rejected": 0.20400448143482208,
"step": 780
},
{
"epoch": 0.63,
"learning_rate": 1.7941463578928088e-06,
"logits/chosen": 0.5620870590209961,
"logits/rejected": 0.4992052912712097,
"logps/chosen": -3354.768310546875,
"logps/rejected": -3018.64599609375,
"loss": 0.0991,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.3805944621562958,
"rewards/margins": 0.16154679656028748,
"rewards/rejected": 0.2190476357936859,
"step": 790
},
{
"epoch": 0.64,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": 0.6222743391990662,
"logits/rejected": 0.5590968132019043,
"logps/chosen": -3424.283935546875,
"logps/rejected": -3156.167724609375,
"loss": 0.0838,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.39403584599494934,
"rewards/margins": 0.16475993394851685,
"rewards/rejected": 0.2292759120464325,
"step": 800
},
{
"epoch": 0.65,
"learning_rate": 1.661371075624363e-06,
"logits/chosen": 0.6057112216949463,
"logits/rejected": 0.5170978903770447,
"logps/chosen": -3288.67333984375,
"logps/rejected": -2878.7490234375,
"loss": 0.0902,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.3696768581867218,
"rewards/margins": 0.1375175267457962,
"rewards/rejected": 0.2321593314409256,
"step": 810
},
{
"epoch": 0.66,
"learning_rate": 1.5959385747947697e-06,
"logits/chosen": 0.6024297475814819,
"logits/rejected": 0.46327948570251465,
"logps/chosen": -3431.43798828125,
"logps/rejected": -2972.927734375,
"loss": 0.0918,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.4163932204246521,
"rewards/margins": 0.17428722977638245,
"rewards/rejected": 0.24210599064826965,
"step": 820
},
{
"epoch": 0.66,
"learning_rate": 1.5312110338697427e-06,
"logits/chosen": 0.6199443340301514,
"logits/rejected": 0.5287885069847107,
"logps/chosen": -3343.03955078125,
"logps/rejected": -3022.42236328125,
"loss": 0.0814,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.399059921503067,
"rewards/margins": 0.15364623069763184,
"rewards/rejected": 0.2454136610031128,
"step": 830
},
{
"epoch": 0.67,
"learning_rate": 1.467238925438646e-06,
"logits/chosen": 0.6143923401832581,
"logits/rejected": 0.5960370898246765,
"logps/chosen": -3441.96875,
"logps/rejected": -3306.02587890625,
"loss": 0.0976,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.40943044424057007,
"rewards/margins": 0.15342941880226135,
"rewards/rejected": 0.2560010552406311,
"step": 840
},
{
"epoch": 0.68,
"learning_rate": 1.4040721330273063e-06,
"logits/chosen": 0.5770421624183655,
"logits/rejected": 0.4791272282600403,
"logps/chosen": -3493.46533203125,
"logps/rejected": -3112.23486328125,
"loss": 0.078,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.4063073992729187,
"rewards/margins": 0.16731533408164978,
"rewards/rejected": 0.23899206519126892,
"step": 850
},
{
"epoch": 0.69,
"learning_rate": 1.3417599122003464e-06,
"logits/chosen": 0.6255184412002563,
"logits/rejected": 0.5643167495727539,
"logps/chosen": -3481.497314453125,
"logps/rejected": -3120.768310546875,
"loss": 0.0768,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.38735657930374146,
"rewards/margins": 0.1667819321155548,
"rewards/rejected": 0.22057469189167023,
"step": 860
},
{
"epoch": 0.7,
"learning_rate": 1.280350852153168e-06,
"logits/chosen": 0.5782762765884399,
"logits/rejected": 0.49263420701026917,
"logps/chosen": -3499.00146484375,
"logps/rejected": -3000.46728515625,
"loss": 0.0836,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3818155527114868,
"rewards/margins": 0.158293217420578,
"rewards/rejected": 0.2235223352909088,
"step": 870
},
{
"epoch": 0.7,
"learning_rate": 1.2198928378235717e-06,
"logits/chosen": 0.5858504176139832,
"logits/rejected": 0.4866867661476135,
"logps/chosen": -3311.93115234375,
"logps/rejected": -2876.67041015625,
"loss": 0.0807,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.3901521563529968,
"rewards/margins": 0.17163410782814026,
"rewards/rejected": 0.21851806342601776,
"step": 880
},
{
"epoch": 0.71,
"learning_rate": 1.160433012552508e-06,
"logits/chosen": 0.6068114638328552,
"logits/rejected": 0.5134158730506897,
"logps/chosen": -3114.994140625,
"logps/rejected": -2820.002685546875,
"loss": 0.0843,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.37761688232421875,
"rewards/margins": 0.1500733643770218,
"rewards/rejected": 0.22754351794719696,
"step": 890
},
{
"epoch": 0.72,
"learning_rate": 1.1020177413231334e-06,
"logits/chosen": 0.6057130098342896,
"logits/rejected": 0.4943726062774658,
"logps/chosen": -3233.512939453125,
"logps/rejected": -2793.17724609375,
"loss": 0.1014,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.37504011392593384,
"rewards/margins": 0.15143273770809174,
"rewards/rejected": 0.2236073762178421,
"step": 900
},
{
"epoch": 0.73,
"learning_rate": 1.0446925746067768e-06,
"logits/chosen": 0.6174426078796387,
"logits/rejected": 0.5331483483314514,
"logps/chosen": -3416.95556640625,
"logps/rejected": -3206.383056640625,
"loss": 0.0898,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.3918195962905884,
"rewards/margins": 0.1552480012178421,
"rewards/rejected": 0.23657159507274628,
"step": 910
},
{
"epoch": 0.74,
"learning_rate": 9.88502212844063e-07,
"logits/chosen": 0.6079164743423462,
"logits/rejected": 0.5762253999710083,
"logps/chosen": -3149.41357421875,
"logps/rejected": -2938.776123046875,
"loss": 0.0841,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.3338172435760498,
"rewards/margins": 0.11891861259937286,
"rewards/rejected": 0.21489866077899933,
"step": 920
},
{
"epoch": 0.74,
"learning_rate": 9.334904715888496e-07,
"logits/chosen": 0.6562485694885254,
"logits/rejected": 0.5610599517822266,
"logps/chosen": -3430.844482421875,
"logps/rejected": -3026.52587890625,
"loss": 0.0942,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.3751566410064697,
"rewards/margins": 0.17377465963363647,
"rewards/rejected": 0.20138195157051086,
"step": 930
},
{
"epoch": 0.75,
"learning_rate": 8.797002473421729e-07,
"logits/chosen": 0.6029684543609619,
"logits/rejected": 0.5253230333328247,
"logps/chosen": -3235.121826171875,
"logps/rejected": -2938.1640625,
"loss": 0.0747,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.35139986872673035,
"rewards/margins": 0.16471409797668457,
"rewards/rejected": 0.18668580055236816,
"step": 940
},
{
"epoch": 0.76,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": 0.6538732051849365,
"logits/rejected": 0.5982731580734253,
"logps/chosen": -3381.48388671875,
"logps/rejected": -3193.509033203125,
"loss": 0.0925,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.3694911003112793,
"rewards/margins": 0.15476444363594055,
"rewards/rejected": 0.21472665667533875,
"step": 950
},
{
"epoch": 0.77,
"learning_rate": 7.759511406608255e-07,
"logits/chosen": 0.6155306100845337,
"logits/rejected": 0.5355127453804016,
"logps/chosen": -3603.495361328125,
"logps/rejected": -3289.190673828125,
"loss": 0.0996,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.3712473511695862,
"rewards/margins": 0.14055751264095306,
"rewards/rejected": 0.23068983852863312,
"step": 960
},
{
"epoch": 0.78,
"learning_rate": 7.260731586586983e-07,
"logits/chosen": 0.6570934057235718,
"logits/rejected": 0.5331934094429016,
"logps/chosen": -3460.41552734375,
"logps/rejected": -3020.72216796875,
"loss": 0.094,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.36751216650009155,
"rewards/margins": 0.16225677728652954,
"rewards/rejected": 0.2052554190158844,
"step": 970
},
{
"epoch": 0.78,
"learning_rate": 6.775784314464717e-07,
"logits/chosen": 0.6543987393379211,
"logits/rejected": 0.5447486639022827,
"logps/chosen": -3222.0947265625,
"logps/rejected": -2851.6064453125,
"loss": 0.0877,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.3648824095726013,
"rewards/margins": 0.15842023491859436,
"rewards/rejected": 0.20646218955516815,
"step": 980
},
{
"epoch": 0.79,
"learning_rate": 6.305047737536707e-07,
"logits/chosen": 0.6519988179206848,
"logits/rejected": 0.5568141341209412,
"logps/chosen": -3336.877685546875,
"logps/rejected": -3047.09326171875,
"loss": 0.0764,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.40213853120803833,
"rewards/margins": 0.1877773106098175,
"rewards/rejected": 0.21436119079589844,
"step": 990
},
{
"epoch": 0.8,
"learning_rate": 5.848888922025553e-07,
"logits/chosen": 0.6370071172714233,
"logits/rejected": 0.5316422581672668,
"logps/chosen": -3368.276611328125,
"logps/rejected": -2950.913330078125,
"loss": 0.0863,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.41091403365135193,
"rewards/margins": 0.1616295725107193,
"rewards/rejected": 0.24928446114063263,
"step": 1000
},
{
"epoch": 0.81,
"learning_rate": 5.407663566854008e-07,
"logits/chosen": 0.6158221960067749,
"logits/rejected": 0.5360954403877258,
"logps/chosen": -3394.571533203125,
"logps/rejected": -3010.104248046875,
"loss": 0.0845,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.4172899127006531,
"rewards/margins": 0.18432098627090454,
"rewards/rejected": 0.23296895623207092,
"step": 1010
},
{
"epoch": 0.82,
"learning_rate": 4.981715726281666e-07,
"logits/chosen": 0.6194905042648315,
"logits/rejected": 0.5224823355674744,
"logps/chosen": -3173.560546875,
"logps/rejected": -2845.927734375,
"loss": 0.0886,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.3923606276512146,
"rewards/margins": 0.1794053614139557,
"rewards/rejected": 0.2129552811384201,
"step": 1020
},
{
"epoch": 0.82,
"learning_rate": 4.5713775416217884e-07,
"logits/chosen": 0.630397617816925,
"logits/rejected": 0.5148654580116272,
"logps/chosen": -3482.11181640625,
"logps/rejected": -3051.90673828125,
"loss": 0.0877,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.41265344619750977,
"rewards/margins": 0.1835349202156067,
"rewards/rejected": 0.22911854088306427,
"step": 1030
},
{
"epoch": 0.83,
"learning_rate": 4.1769689822475147e-07,
"logits/chosen": 0.6434907913208008,
"logits/rejected": 0.548559308052063,
"logps/chosen": -3349.734375,
"logps/rejected": -3031.77978515625,
"loss": 0.0941,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3825233578681946,
"rewards/margins": 0.14600971341133118,
"rewards/rejected": 0.2365136444568634,
"step": 1040
},
{
"epoch": 0.84,
"learning_rate": 3.798797596089351e-07,
"logits/chosen": 0.6427907943725586,
"logits/rejected": 0.5501333475112915,
"logps/chosen": -3478.50537109375,
"logps/rejected": -3245.615234375,
"loss": 0.0937,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.41868337988853455,
"rewards/margins": 0.1864897906780243,
"rewards/rejected": 0.23219358921051025,
"step": 1050
},
{
"epoch": 0.85,
"learning_rate": 3.4371582698185636e-07,
"logits/chosen": 0.6372936964035034,
"logits/rejected": 0.5432217717170715,
"logps/chosen": -3423.430908203125,
"logps/rejected": -3087.765380859375,
"loss": 0.092,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.4114980101585388,
"rewards/margins": 0.17217496037483215,
"rewards/rejected": 0.23932309448719025,
"step": 1060
},
{
"epoch": 0.86,
"learning_rate": 3.092332998903416e-07,
"logits/chosen": 0.5996044874191284,
"logits/rejected": 0.4986226558685303,
"logps/chosen": -3295.41455078125,
"logps/rejected": -2947.134765625,
"loss": 0.0865,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.39540520310401917,
"rewards/margins": 0.1856795847415924,
"rewards/rejected": 0.20972561836242676,
"step": 1070
},
{
"epoch": 0.86,
"learning_rate": 2.764590667717562e-07,
"logits/chosen": 0.6375841498374939,
"logits/rejected": 0.544662356376648,
"logps/chosen": -3654.934326171875,
"logps/rejected": -3230.848876953125,
"loss": 0.067,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3808661103248596,
"rewards/margins": 0.15394194424152374,
"rewards/rejected": 0.22692415118217468,
"step": 1080
},
{
"epoch": 0.87,
"learning_rate": 2.454186839872158e-07,
"logits/chosen": 0.6398609280586243,
"logits/rejected": 0.5464919805526733,
"logps/chosen": -3495.825439453125,
"logps/rejected": -3124.951416015625,
"loss": 0.081,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.39376845955848694,
"rewards/margins": 0.16370807588100433,
"rewards/rejected": 0.23006033897399902,
"step": 1090
},
{
"epoch": 0.88,
"learning_rate": 2.1613635589349756e-07,
"logits/chosen": 0.6471344232559204,
"logits/rejected": 0.5833622813224792,
"logps/chosen": -3451.12890625,
"logps/rejected": -3257.12353515625,
"loss": 0.0869,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.3840171694755554,
"rewards/margins": 0.14787636697292328,
"rewards/rejected": 0.23614077270030975,
"step": 1100
},
{
"epoch": 0.89,
"learning_rate": 1.8863491596921745e-07,
"logits/chosen": 0.6442585587501526,
"logits/rejected": 0.5597777962684631,
"logps/chosen": -3400.688720703125,
"logps/rejected": -3029.727294921875,
"loss": 0.0815,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.3747108280658722,
"rewards/margins": 0.14767669141292572,
"rewards/rejected": 0.22703413665294647,
"step": 1110
},
{
"epoch": 0.9,
"learning_rate": 1.629358090099639e-07,
"logits/chosen": 0.6222900152206421,
"logits/rejected": 0.5375300645828247,
"logps/chosen": -3375.23583984375,
"logps/rejected": -3024.98876953125,
"loss": 0.088,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.37642619013786316,
"rewards/margins": 0.1609790325164795,
"rewards/rejected": 0.21544715762138367,
"step": 1120
},
{
"epoch": 0.9,
"learning_rate": 1.3905907440629752e-07,
"logits/chosen": 0.6418689489364624,
"logits/rejected": 0.5823384523391724,
"logps/chosen": -3317.762451171875,
"logps/rejected": -3089.084716796875,
"loss": 0.0788,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.3975849449634552,
"rewards/margins": 0.17512866854667664,
"rewards/rejected": 0.22245629131793976,
"step": 1130
},
{
"epoch": 0.91,
"learning_rate": 1.1702333051763271e-07,
"logits/chosen": 0.5996700525283813,
"logits/rejected": 0.5386776924133301,
"logps/chosen": -3202.969970703125,
"logps/rejected": -2970.28173828125,
"loss": 0.1025,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.343997061252594,
"rewards/margins": 0.16654863953590393,
"rewards/rejected": 0.17744839191436768,
"step": 1140
},
{
"epoch": 0.92,
"learning_rate": 9.684576015420277e-08,
"logits/chosen": 0.6135612726211548,
"logits/rejected": 0.5466006398200989,
"logps/chosen": -3316.67578125,
"logps/rejected": -3048.313720703125,
"loss": 0.0874,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3800659775733948,
"rewards/margins": 0.17580585181713104,
"rewards/rejected": 0.20426008105278015,
"step": 1150
},
{
"epoch": 0.93,
"learning_rate": 7.854209717842231e-08,
"logits/chosen": 0.6101081967353821,
"logits/rejected": 0.5615028738975525,
"logps/chosen": -3356.20849609375,
"logps/rejected": -3210.05224609375,
"loss": 0.0957,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.39068323373794556,
"rewards/margins": 0.14721594750881195,
"rewards/rejected": 0.2434672862291336,
"step": 1160
},
{
"epoch": 0.94,
"learning_rate": 6.212661423609184e-08,
"logits/chosen": 0.6116551160812378,
"logits/rejected": 0.47189703583717346,
"logps/chosen": -3391.901611328125,
"logps/rejected": -2864.9482421875,
"loss": 0.0721,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.3698135316371918,
"rewards/margins": 0.1599336564540863,
"rewards/rejected": 0.20987987518310547,
"step": 1170
},
{
"epoch": 0.94,
"learning_rate": 4.761211162702117e-08,
"logits/chosen": 0.6695979833602905,
"logits/rejected": 0.5461623668670654,
"logps/chosen": -3409.35302734375,
"logps/rejected": -3067.67431640625,
"loss": 0.0853,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.3886914849281311,
"rewards/margins": 0.18535657227039337,
"rewards/rejected": 0.20333492755889893,
"step": 1180
},
{
"epoch": 0.95,
"learning_rate": 3.5009907323737826e-08,
"logits/chosen": 0.6564788818359375,
"logits/rejected": 0.5006071925163269,
"logps/chosen": -3463.62255859375,
"logps/rejected": -2909.28857421875,
"loss": 0.0859,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": 0.37883251905441284,
"rewards/margins": 0.15553273260593414,
"rewards/rejected": 0.2232998162508011,
"step": 1190
},
{
"epoch": 0.96,
"learning_rate": 2.4329828146074096e-08,
"logits/chosen": 0.6350787878036499,
"logits/rejected": 0.5539794564247131,
"logps/chosen": -3336.46044921875,
"logps/rejected": -3126.2548828125,
"loss": 0.0896,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.4052211344242096,
"rewards/margins": 0.1808997541666031,
"rewards/rejected": 0.2243214100599289,
"step": 1200
},
{
"epoch": 0.97,
"learning_rate": 1.5580202098509078e-08,
"logits/chosen": 0.6311666965484619,
"logits/rejected": 0.599091649055481,
"logps/chosen": -3485.178955078125,
"logps/rejected": -3325.38134765625,
"loss": 0.0873,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.39726322889328003,
"rewards/margins": 0.1585725098848343,
"rewards/rejected": 0.23869077861309052,
"step": 1210
},
{
"epoch": 0.98,
"learning_rate": 8.767851876239075e-09,
"logits/chosen": 0.6433550119400024,
"logits/rejected": 0.5538659691810608,
"logps/chosen": -3310.471923828125,
"logps/rejected": -2977.670166015625,
"loss": 0.0878,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.384525865316391,
"rewards/margins": 0.16290965676307678,
"rewards/rejected": 0.2216162383556366,
"step": 1220
},
{
"epoch": 0.98,
"learning_rate": 3.8980895450474455e-09,
"logits/chosen": 0.6242701411247253,
"logits/rejected": 0.5688928365707397,
"logps/chosen": -3227.052490234375,
"logps/rejected": -2975.194091796875,
"loss": 0.0877,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.3587878346443176,
"rewards/margins": 0.16212721168994904,
"rewards/rejected": 0.19666056334972382,
"step": 1230
},
{
"epoch": 0.99,
"learning_rate": 9.747123991141193e-10,
"logits/chosen": 0.655865490436554,
"logits/rejected": 0.518544614315033,
"logps/chosen": -3414.157470703125,
"logps/rejected": -2845.206298828125,
"loss": 0.0955,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.37420162558555603,
"rewards/margins": 0.18059025704860687,
"rewards/rejected": 0.19361138343811035,
"step": 1240
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": 0.6276296973228455,
"logits/rejected": 0.4904142916202545,
"logps/chosen": -3474.27197265625,
"logps/rejected": -2985.078857421875,
"loss": 0.0931,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.36780911684036255,
"rewards/margins": 0.17106689512729645,
"rewards/rejected": 0.1967422217130661,
"step": 1250
},
{
"epoch": 1.0,
"step": 1250,
"total_flos": 0.0,
"train_loss": 0.09343003117442131,
"train_runtime": 11089.5357,
"train_samples_per_second": 1.804,
"train_steps_per_second": 0.113
}
],
"logging_steps": 10,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}