{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4e-08, "logits/chosen": -0.09526942670345306, "logits/rejected": -0.23948004841804504, "logps/chosen": -3969.244140625, "logps/rejected": -2912.11376953125, "loss": 0.1112, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -0.11336694657802582, "logits/rejected": -0.21862205862998962, "logps/chosen": -3883.32763671875, "logps/rejected": -3105.751708984375, "loss": 0.1476, "rewards/accuracies": 0.3541666567325592, "rewards/chosen": -6.194857178343227e-06, "rewards/margins": -7.76553206378594e-05, "rewards/rejected": 7.146046118577942e-05, "step": 10 }, { "epoch": 0.02, "learning_rate": 8.000000000000001e-07, "logits/chosen": -0.11791403591632843, "logits/rejected": -0.19368262588977814, "logps/chosen": -3725.202392578125, "logps/rejected": -3018.818603515625, "loss": 0.1581, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00026606960454955697, "rewards/margins": -6.655660399701446e-05, "rewards/rejected": 0.00033262622309848666, "step": 20 }, { "epoch": 0.02, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -0.10180139541625977, "logits/rejected": -0.1368643343448639, "logps/chosen": -3734.001953125, "logps/rejected": -3405.164794921875, "loss": 0.1348, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.001434823265299201, "rewards/margins": 0.0006993044517003, "rewards/rejected": 0.000735518871806562, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -0.11290457099676132, "logits/rejected": -0.15302149951457977, "logps/chosen": -3876.04052734375, "logps/rejected": -3476.99365234375, "loss": 0.1472, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.00464777322486043, "rewards/margins": 0.0033736887853592634, "rewards/rejected": 0.0012740844395011663, "step": 40 }, { "epoch": 0.04, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -0.10660350322723389, "logits/rejected": -0.16240839660167694, "logps/chosen": -3837.369873046875, "logps/rejected": -3373.931640625, "loss": 0.1426, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.013103676959872246, "rewards/margins": 0.008479808457195759, "rewards/rejected": 0.004623868502676487, "step": 50 }, { "epoch": 0.05, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -0.08258971571922302, "logits/rejected": -0.15586063265800476, "logps/chosen": -3829.67041015625, "logps/rejected": -3369.47119140625, "loss": 0.1363, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.029313404113054276, "rewards/margins": 0.022997483611106873, "rewards/rejected": 0.006315918173640966, "step": 60 }, { "epoch": 0.06, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -0.10445211082696915, "logits/rejected": -0.180179625749588, "logps/chosen": -3644.919921875, "logps/rejected": -3038.00732421875, "loss": 0.1343, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05821167677640915, "rewards/margins": 0.030616426840424538, "rewards/rejected": 0.02759525738656521, "step": 70 }, { "epoch": 0.06, "learning_rate": 3.2000000000000003e-06, "logits/chosen": 0.0064078932628035545, "logits/rejected": -0.10336550325155258, "logps/chosen": -3655.2109375, "logps/rejected": -3017.391357421875, "loss": 0.1205, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0925077348947525, "rewards/margins": 0.05041419342160225, "rewards/rejected": 0.04209354892373085, "step": 80 }, { "epoch": 0.07, "learning_rate": 3.6000000000000003e-06, "logits/chosen": 0.034665923565626144, "logits/rejected": -0.03560350090265274, "logps/chosen": -3621.365234375, "logps/rejected": -3200.21240234375, "loss": 0.1068, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1416129767894745, "rewards/margins": 0.08034636825323105, "rewards/rejected": 0.06126661226153374, "step": 90 }, { "epoch": 0.08, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.04580535367131233, "logits/rejected": -0.005379015114158392, "logps/chosen": -3695.321533203125, "logps/rejected": -3426.08447265625, "loss": 0.1051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.17054443061351776, "rewards/margins": 0.08689786493778229, "rewards/rejected": 0.08364654332399368, "step": 100 }, { "epoch": 0.09, "learning_rate": 4.4e-06, "logits/chosen": 0.05780696123838425, "logits/rejected": -0.00175203918479383, "logps/chosen": -3636.29638671875, "logps/rejected": -3395.66748046875, "loss": 0.1036, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2138124257326126, "rewards/margins": 0.0803312435746193, "rewards/rejected": 0.1334811896085739, "step": 110 }, { "epoch": 0.1, "learning_rate": 4.800000000000001e-06, "logits/chosen": 0.06642362475395203, "logits/rejected": -0.037016235291957855, "logps/chosen": -3482.66015625, "logps/rejected": -2920.569580078125, "loss": 0.1081, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2094995528459549, "rewards/margins": 0.10118832439184189, "rewards/rejected": 0.1083112508058548, "step": 120 }, { "epoch": 0.1, "learning_rate": 4.999756310023261e-06, "logits/chosen": 0.13063621520996094, "logits/rejected": 0.06078845262527466, "logps/chosen": -3470.27197265625, "logps/rejected": -3143.759033203125, "loss": 0.0841, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2582111954689026, "rewards/margins": 0.10591275990009308, "rewards/rejected": 0.1522984504699707, "step": 130 }, { "epoch": 0.11, "learning_rate": 4.997807075247147e-06, "logits/chosen": 0.16255763173103333, "logits/rejected": 0.07761454582214355, "logps/chosen": -3592.858642578125, "logps/rejected": -3239.969482421875, "loss": 0.1071, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2606235444545746, "rewards/margins": 0.11447002738714218, "rewards/rejected": 0.1461535096168518, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.993910125649561e-06, "logits/chosen": 0.16677160561084747, "logits/rejected": 0.08901594579219818, "logps/chosen": -3506.703857421875, "logps/rejected": -3104.87255859375, "loss": 0.0945, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.2724655568599701, "rewards/margins": 0.12591706216335297, "rewards/rejected": 0.14654847979545593, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.988068499954578e-06, "logits/chosen": 0.22131207585334778, "logits/rejected": 0.13661739230155945, "logps/chosen": -3316.39208984375, "logps/rejected": -2960.86083984375, "loss": 0.0905, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2880935072898865, "rewards/margins": 0.1261982023715973, "rewards/rejected": 0.16189530491828918, "step": 160 }, { "epoch": 0.14, "learning_rate": 4.980286753286196e-06, "logits/chosen": 0.26340895891189575, "logits/rejected": 0.17107249796390533, "logps/chosen": -3592.39306640625, "logps/rejected": -3124.1767578125, "loss": 0.0955, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.30015623569488525, "rewards/margins": 0.14308349788188934, "rewards/rejected": 0.15707270801067352, "step": 170 }, { "epoch": 0.14, "learning_rate": 4.970570953616383e-06, "logits/chosen": 0.2763732671737671, "logits/rejected": 0.1752476692199707, "logps/chosen": -3527.703125, "logps/rejected": -3171.340576171875, "loss": 0.0918, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3266971707344055, "rewards/margins": 0.15975165367126465, "rewards/rejected": 0.16694548726081848, "step": 180 }, { "epoch": 0.15, "learning_rate": 4.958928677033465e-06, "logits/chosen": 0.2631959021091461, "logits/rejected": 0.21199622750282288, "logps/chosen": -3533.75341796875, "logps/rejected": -3337.073486328125, "loss": 0.0921, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.3471175730228424, "rewards/margins": 0.11177588999271393, "rewards/rejected": 0.23534169793128967, "step": 190 }, { "epoch": 0.16, "learning_rate": 4.9453690018345144e-06, "logits/chosen": 0.2420426905155182, "logits/rejected": 0.1836567521095276, "logps/chosen": -3306.966064453125, "logps/rejected": -2964.663818359375, "loss": 0.0895, "rewards/accuracies": 0.78125, "rewards/chosen": 0.32517337799072266, "rewards/margins": 0.12965548038482666, "rewards/rejected": 0.19551792740821838, "step": 200 }, { "epoch": 0.17, "learning_rate": 4.9299025014463665e-06, "logits/chosen": 0.23070940375328064, "logits/rejected": 0.17138567566871643, "logps/chosen": -3292.091796875, "logps/rejected": -2975.43115234375, "loss": 0.1009, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.328936368227005, "rewards/margins": 0.11976213753223419, "rewards/rejected": 0.20917420089244843, "step": 210 }, { "epoch": 0.18, "learning_rate": 4.912541236180779e-06, "logits/chosen": 0.15947876870632172, "logits/rejected": 0.08568959683179855, "logps/chosen": -3484.258544921875, "logps/rejected": -3174.15087890625, "loss": 0.0956, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.34722018241882324, "rewards/margins": 0.155741885304451, "rewards/rejected": 0.19147828221321106, "step": 220 }, { "epoch": 0.18, "learning_rate": 4.893298743830168e-06, "logits/chosen": 0.22774775326251984, "logits/rejected": 0.13324826955795288, "logps/chosen": -3539.89453125, "logps/rejected": -3201.11279296875, "loss": 0.0968, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.38138940930366516, "rewards/margins": 0.15053245425224304, "rewards/rejected": 0.23085694015026093, "step": 230 }, { "epoch": 0.19, "learning_rate": 4.8721900291112415e-06, "logits/chosen": 0.24971647560596466, "logits/rejected": 0.18798983097076416, "logps/chosen": -3449.693359375, "logps/rejected": -3182.7216796875, "loss": 0.1007, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.4053890109062195, "rewards/margins": 0.12963128089904785, "rewards/rejected": 0.27575770020484924, "step": 240 }, { "epoch": 0.2, "learning_rate": 4.849231551964771e-06, "logits/chosen": 0.290091335773468, "logits/rejected": 0.2263306826353073, "logps/chosen": -3255.100830078125, "logps/rejected": -2931.810546875, "loss": 0.0973, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.38671380281448364, "rewards/margins": 0.1347990781068802, "rewards/rejected": 0.2519146800041199, "step": 250 }, { "epoch": 0.21, "learning_rate": 4.824441214720629e-06, "logits/chosen": 0.35076624155044556, "logits/rejected": 0.26289868354797363, "logps/chosen": -3281.548828125, "logps/rejected": -2823.29052734375, "loss": 0.0937, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3694974184036255, "rewards/margins": 0.1352803260087967, "rewards/rejected": 0.2342170774936676, "step": 260 }, { "epoch": 0.22, "learning_rate": 4.7978383481380865e-06, "logits/chosen": 0.3338952660560608, "logits/rejected": 0.30002114176750183, "logps/chosen": -3367.43115234375, "logps/rejected": -3190.338623046875, "loss": 0.0961, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.38234108686447144, "rewards/margins": 0.14520631730556488, "rewards/rejected": 0.23713478446006775, "step": 270 }, { "epoch": 0.22, "learning_rate": 4.769443696332272e-06, "logits/chosen": 0.4176582396030426, "logits/rejected": 0.34488004446029663, "logps/chosen": -3451.33447265625, "logps/rejected": -3161.616943359375, "loss": 0.091, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3714825510978699, "rewards/margins": 0.14414021372795105, "rewards/rejected": 0.22734233736991882, "step": 280 }, { "epoch": 0.23, "learning_rate": 4.7392794005985324e-06, "logits/chosen": 0.39374208450317383, "logits/rejected": 0.36565738916397095, "logps/chosen": -3225.15380859375, "logps/rejected": -3087.46728515625, "loss": 0.1046, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3453277051448822, "rewards/margins": 0.16379894316196442, "rewards/rejected": 0.18152877688407898, "step": 290 }, { "epoch": 0.24, "learning_rate": 4.707368982147318e-06, "logits/chosen": 0.39281272888183594, "logits/rejected": 0.36582762002944946, "logps/chosen": -3398.608154296875, "logps/rejected": -3263.87939453125, "loss": 0.0904, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3241587281227112, "rewards/margins": 0.14963512122631073, "rewards/rejected": 0.17452362179756165, "step": 300 }, { "epoch": 0.25, "learning_rate": 4.673737323763048e-06, "logits/chosen": 0.30504000186920166, "logits/rejected": 0.22714261710643768, "logps/chosen": -3418.66748046875, "logps/rejected": -3106.970947265625, "loss": 0.0872, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3434060215950012, "rewards/margins": 0.173831969499588, "rewards/rejected": 0.16957402229309082, "step": 310 }, { "epoch": 0.26, "learning_rate": 4.638410650401267e-06, "logits/chosen": 0.34125423431396484, "logits/rejected": 0.2718544602394104, "logps/chosen": -3394.98193359375, "logps/rejected": -3080.618408203125, "loss": 0.0786, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.32423415780067444, "rewards/margins": 0.1511869728565216, "rewards/rejected": 0.17304711043834686, "step": 320 }, { "epoch": 0.26, "learning_rate": 4.601416508739211e-06, "logits/chosen": 0.36246171593666077, "logits/rejected": 0.3016008734703064, "logps/chosen": -3435.645263671875, "logps/rejected": -3126.3828125, "loss": 0.0868, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.31689783930778503, "rewards/margins": 0.16403648257255554, "rewards/rejected": 0.1528613567352295, "step": 330 }, { "epoch": 0.27, "learning_rate": 4.562783745695738e-06, "logits/chosen": 0.3510586619377136, "logits/rejected": 0.254965603351593, "logps/chosen": -3339.10986328125, "logps/rejected": -2880.090576171875, "loss": 0.0873, "rewards/accuracies": 0.78125, "rewards/chosen": 0.3381843566894531, "rewards/margins": 0.14746293425559998, "rewards/rejected": 0.19072142243385315, "step": 340 }, { "epoch": 0.28, "learning_rate": 4.522542485937369e-06, "logits/chosen": 0.3423737585544586, "logits/rejected": 0.2638542056083679, "logps/chosen": -3313.405517578125, "logps/rejected": -2987.49609375, "loss": 0.0887, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.35876303911209106, "rewards/margins": 0.15431997179985046, "rewards/rejected": 0.204443097114563, "step": 350 }, { "epoch": 0.29, "learning_rate": 4.4807241083879774e-06, "logits/chosen": 0.35648784041404724, "logits/rejected": 0.276287704706192, "logps/chosen": -3448.21142578125, "logps/rejected": -3130.1142578125, "loss": 0.0875, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.3623107969760895, "rewards/margins": 0.15894022583961487, "rewards/rejected": 0.2033705711364746, "step": 360 }, { "epoch": 0.3, "learning_rate": 4.437361221760449e-06, "logits/chosen": 0.42664772272109985, "logits/rejected": 0.379373162984848, "logps/chosen": -3359.43603515625, "logps/rejected": -3039.10693359375, "loss": 0.1004, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.34946465492248535, "rewards/margins": 0.14916878938674927, "rewards/rejected": 0.20029588043689728, "step": 370 }, { "epoch": 0.3, "learning_rate": 4.3924876391293915e-06, "logits/chosen": 0.467120498418808, "logits/rejected": 0.40080124139785767, "logps/chosen": -3476.703125, "logps/rejected": -3288.689453125, "loss": 0.0805, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.3640100955963135, "rewards/margins": 0.17574051022529602, "rewards/rejected": 0.18826961517333984, "step": 380 }, { "epoch": 0.31, "learning_rate": 4.346138351564711e-06, "logits/chosen": 0.42980876564979553, "logits/rejected": 0.3273950517177582, "logps/chosen": -3256.771484375, "logps/rejected": -2806.90625, "loss": 0.0925, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3438069224357605, "rewards/margins": 0.14692214131355286, "rewards/rejected": 0.19688478112220764, "step": 390 }, { "epoch": 0.32, "learning_rate": 4.2983495008466285e-06, "logits/chosen": 0.4755614697933197, "logits/rejected": 0.4100918769836426, "logps/chosen": -3387.567626953125, "logps/rejected": -3176.111572265625, "loss": 0.1011, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3908953368663788, "rewards/margins": 0.14914441108703613, "rewards/rejected": 0.24175091087818146, "step": 400 }, { "epoch": 0.33, "learning_rate": 4.249158351283414e-06, "logits/chosen": 0.43400949239730835, "logits/rejected": 0.35584893822669983, "logps/chosen": -3405.94384765625, "logps/rejected": -3099.514404296875, "loss": 0.0885, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.3543582856655121, "rewards/margins": 0.15985527634620667, "rewards/rejected": 0.1945030391216278, "step": 410 }, { "epoch": 0.34, "learning_rate": 4.198603260653792e-06, "logits/chosen": 0.4672483801841736, "logits/rejected": 0.38711977005004883, "logps/chosen": -3460.95947265625, "logps/rejected": -3150.413330078125, "loss": 0.0982, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.38315701484680176, "rewards/margins": 0.16369104385375977, "rewards/rejected": 0.219465970993042, "step": 420 }, { "epoch": 0.34, "learning_rate": 4.146723650296701e-06, "logits/chosen": 0.483567476272583, "logits/rejected": 0.4229060113430023, "logps/chosen": -3454.10107421875, "logps/rejected": -3221.44091796875, "loss": 0.0865, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.38951292634010315, "rewards/margins": 0.1551971733570099, "rewards/rejected": 0.23431572318077087, "step": 430 }, { "epoch": 0.35, "learning_rate": 4.093559974371725e-06, "logits/chosen": 0.5139747858047485, "logits/rejected": 0.42474398016929626, "logps/chosen": -3357.93896484375, "logps/rejected": -3006.518310546875, "loss": 0.0832, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.3771596848964691, "rewards/margins": 0.16284802556037903, "rewards/rejected": 0.21431168913841248, "step": 440 }, { "epoch": 0.36, "learning_rate": 4.039153688314146e-06, "logits/chosen": 0.5247809886932373, "logits/rejected": 0.4323784410953522, "logps/chosen": -3264.754638671875, "logps/rejected": -2934.151123046875, "loss": 0.0968, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3777909576892853, "rewards/margins": 0.1437477171421051, "rewards/rejected": 0.2340432107448578, "step": 450 }, { "epoch": 0.37, "learning_rate": 3.983547216509254e-06, "logits/chosen": 0.5001921653747559, "logits/rejected": 0.40190553665161133, "logps/chosen": -3468.16015625, "logps/rejected": -3026.26025390625, "loss": 0.09, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.38785520195961, "rewards/margins": 0.17032787203788757, "rewards/rejected": 0.2175273448228836, "step": 460 }, { "epoch": 0.38, "learning_rate": 3.92678391921108e-06, "logits/chosen": 0.5006144642829895, "logits/rejected": 0.415066659450531, "logps/chosen": -3334.10009765625, "logps/rejected": -2893.51123046875, "loss": 0.0972, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3449471592903137, "rewards/margins": 0.1539374142885208, "rewards/rejected": 0.1910097301006317, "step": 470 }, { "epoch": 0.38, "learning_rate": 3.868908058731376e-06, "logits/chosen": 0.5540028810501099, "logits/rejected": 0.48031479120254517, "logps/chosen": -3356.93212890625, "logps/rejected": -2993.399169921875, "loss": 0.0985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3838370144367218, "rewards/margins": 0.14018234610557556, "rewards/rejected": 0.24365463852882385, "step": 480 }, { "epoch": 0.39, "learning_rate": 3.8099647649251984e-06, "logits/chosen": 0.5899518728256226, "logits/rejected": 0.4895103871822357, "logps/chosen": -3501.290283203125, "logps/rejected": -3142.33544921875, "loss": 0.0869, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.386918306350708, "rewards/margins": 0.17316767573356628, "rewards/rejected": 0.21375060081481934, "step": 490 }, { "epoch": 0.4, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 0.533571720123291, "logits/rejected": 0.45087581872940063, "logps/chosen": -3382.43115234375, "logps/rejected": -3039.810791015625, "loss": 0.0938, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.35097357630729675, "rewards/margins": 0.16260935366153717, "rewards/rejected": 0.18836425244808197, "step": 500 }, { "epoch": 0.41, "learning_rate": 3.689060522675689e-06, "logits/chosen": 0.5135028958320618, "logits/rejected": 0.38714414834976196, "logps/chosen": -3217.434814453125, "logps/rejected": -2690.976806640625, "loss": 0.0962, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3489993214607239, "rewards/margins": 0.14681319892406464, "rewards/rejected": 0.20218610763549805, "step": 510 }, { "epoch": 0.42, "learning_rate": 3.627193851723577e-06, "logits/chosen": 0.49498695135116577, "logits/rejected": 0.4188925325870514, "logps/chosen": -3364.91552734375, "logps/rejected": -3090.07275390625, "loss": 0.0802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.37084800004959106, "rewards/margins": 0.15501223504543304, "rewards/rejected": 0.2158357799053192, "step": 520 }, { "epoch": 0.42, "learning_rate": 3.564448228912682e-06, "logits/chosen": 0.4635513722896576, "logits/rejected": 0.36989638209342957, "logps/chosen": -3387.86474609375, "logps/rejected": -3016.7216796875, "loss": 0.0943, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3274918794631958, "rewards/margins": 0.15053078532218933, "rewards/rejected": 0.17696109414100647, "step": 530 }, { "epoch": 0.43, "learning_rate": 3.5008725813922383e-06, "logits/chosen": 0.49751853942871094, "logits/rejected": 0.3913223147392273, "logps/chosen": -3440.885986328125, "logps/rejected": -2968.994873046875, "loss": 0.0981, "rewards/accuracies": 0.78125, "rewards/chosen": 0.34751924872398376, "rewards/margins": 0.14958377182483673, "rewards/rejected": 0.19793547689914703, "step": 540 }, { "epoch": 0.44, "learning_rate": 3.436516483539781e-06, "logits/chosen": 0.5272361040115356, "logits/rejected": 0.40506014227867126, "logps/chosen": -3340.62060546875, "logps/rejected": -2864.970947265625, "loss": 0.0903, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3856109082698822, "rewards/margins": 0.15098164975643158, "rewards/rejected": 0.23462922871112823, "step": 550 }, { "epoch": 0.45, "learning_rate": 3.3714301183045382e-06, "logits/chosen": 0.4976809024810791, "logits/rejected": 0.4073059558868408, "logps/chosen": -3362.13134765625, "logps/rejected": -2928.28515625, "loss": 0.1007, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.4102960526943207, "rewards/margins": 0.14597077667713165, "rewards/rejected": 0.2643252909183502, "step": 560 }, { "epoch": 0.46, "learning_rate": 3.3056642380762783e-06, "logits/chosen": 0.5032496452331543, "logits/rejected": 0.40577760338783264, "logps/chosen": -3327.85888671875, "logps/rejected": -2854.94970703125, "loss": 0.1083, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3718569278717041, "rewards/margins": 0.14686135947704315, "rewards/rejected": 0.22499553859233856, "step": 570 }, { "epoch": 0.46, "learning_rate": 3.2392701251101172e-06, "logits/chosen": 0.5247712731361389, "logits/rejected": 0.4414879381656647, "logps/chosen": -3346.58740234375, "logps/rejected": -3087.098876953125, "loss": 0.0987, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.38225480914115906, "rewards/margins": 0.13378790020942688, "rewards/rejected": 0.24846693873405457, "step": 580 }, { "epoch": 0.47, "learning_rate": 3.1722995515381644e-06, "logits/chosen": 0.514184296131134, "logits/rejected": 0.40285858511924744, "logps/chosen": -3443.8828125, "logps/rejected": -2874.86181640625, "loss": 0.0832, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3588571548461914, "rewards/margins": 0.15165671706199646, "rewards/rejected": 0.20720043778419495, "step": 590 }, { "epoch": 0.48, "learning_rate": 3.1048047389991693e-06, "logits/chosen": 0.5682590007781982, "logits/rejected": 0.5120213627815247, "logps/chosen": -3459.860595703125, "logps/rejected": -3225.1884765625, "loss": 0.0838, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.35979193449020386, "rewards/margins": 0.15927888453006744, "rewards/rejected": 0.2005130797624588, "step": 600 }, { "epoch": 0.49, "learning_rate": 3.0368383179176584e-06, "logits/chosen": 0.5174692869186401, "logits/rejected": 0.41469916701316833, "logps/chosen": -3251.609619140625, "logps/rejected": -2831.67431640625, "loss": 0.111, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3455356955528259, "rewards/margins": 0.15155228972434998, "rewards/rejected": 0.19398342072963715, "step": 610 }, { "epoch": 0.5, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 0.5736369490623474, "logits/rejected": 0.42854124307632446, "logps/chosen": -3492.6796875, "logps/rejected": -2873.034912109375, "loss": 0.0857, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3865596354007721, "rewards/margins": 0.1634531319141388, "rewards/rejected": 0.2231064736843109, "step": 620 }, { "epoch": 0.5, "learning_rate": 2.8997029692295875e-06, "logits/chosen": 0.5368185043334961, "logits/rejected": 0.4608641266822815, "logps/chosen": -3310.38623046875, "logps/rejected": -3082.637939453125, "loss": 0.0929, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3720545768737793, "rewards/margins": 0.14468377828598022, "rewards/rejected": 0.22737076878547668, "step": 630 }, { "epoch": 0.51, "learning_rate": 2.8306409756428067e-06, "logits/chosen": 0.5856886506080627, "logits/rejected": 0.4743286669254303, "logps/chosen": -3404.43310546875, "logps/rejected": -3081.451904296875, "loss": 0.0699, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.37632375955581665, "rewards/margins": 0.16852709650993347, "rewards/rejected": 0.20779672265052795, "step": 640 }, { "epoch": 0.52, "learning_rate": 2.761321158169134e-06, "logits/chosen": 0.6226561665534973, "logits/rejected": 0.5113543272018433, "logps/chosen": -3556.053955078125, "logps/rejected": -3128.98486328125, "loss": 0.0791, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.3752782642841339, "rewards/margins": 0.17961682379245758, "rewards/rejected": 0.19566142559051514, "step": 650 }, { "epoch": 0.53, "learning_rate": 2.6917975703170466e-06, "logits/chosen": 0.6197515726089478, "logits/rejected": 0.5554805994033813, "logps/chosen": -3466.33251953125, "logps/rejected": -3276.38671875, "loss": 0.0789, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.41308966279029846, "rewards/margins": 0.15082643926143646, "rewards/rejected": 0.2622632086277008, "step": 660 }, { "epoch": 0.54, "learning_rate": 2.6221244244890336e-06, "logits/chosen": 0.6192020773887634, "logits/rejected": 0.5320231914520264, "logps/chosen": -3314.12548828125, "logps/rejected": -3007.730224609375, "loss": 0.1062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4171117842197418, "rewards/margins": 0.1608533412218094, "rewards/rejected": 0.25625842809677124, "step": 670 }, { "epoch": 0.54, "learning_rate": 2.5523560497083927e-06, "logits/chosen": 0.5730468034744263, "logits/rejected": 0.5238832235336304, "logps/chosen": -3282.682373046875, "logps/rejected": -3070.96875, "loss": 0.1052, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.347822368144989, "rewards/margins": 0.12122434377670288, "rewards/rejected": 0.22659802436828613, "step": 680 }, { "epoch": 0.55, "learning_rate": 2.482546849255096e-06, "logits/chosen": 0.5806129574775696, "logits/rejected": 0.4810718595981598, "logps/chosen": -3249.9609375, "logps/rejected": -2905.1328125, "loss": 0.0932, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.36439889669418335, "rewards/margins": 0.15161724388599396, "rewards/rejected": 0.21278166770935059, "step": 690 }, { "epoch": 0.56, "learning_rate": 2.4127512582437486e-06, "logits/chosen": 0.5876488089561462, "logits/rejected": 0.47464412450790405, "logps/chosen": -3425.721923828125, "logps/rejected": -3029.7919921875, "loss": 0.08, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.43426513671875, "rewards/margins": 0.17609842121601105, "rewards/rejected": 0.2581667900085449, "step": 700 }, { "epoch": 0.57, "learning_rate": 2.3430237011767166e-06, "logits/chosen": 0.5383955836296082, "logits/rejected": 0.4219956398010254, "logps/chosen": -3329.063720703125, "logps/rejected": -2868.416015625, "loss": 0.0784, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4002605974674225, "rewards/margins": 0.15942516922950745, "rewards/rejected": 0.24083542823791504, "step": 710 }, { "epoch": 0.58, "learning_rate": 2.2734185495055503e-06, "logits/chosen": 0.5360678434371948, "logits/rejected": 0.4522096514701843, "logps/chosen": -3392.73388671875, "logps/rejected": -3113.76708984375, "loss": 0.1041, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.42230457067489624, "rewards/margins": 0.14559249579906464, "rewards/rejected": 0.2767120599746704, "step": 720 }, { "epoch": 0.58, "learning_rate": 2.2039900792337477e-06, "logits/chosen": 0.5705752968788147, "logits/rejected": 0.5024815201759338, "logps/chosen": -3402.189453125, "logps/rejected": -3169.858154296875, "loss": 0.0749, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.4054113030433655, "rewards/margins": 0.16554218530654907, "rewards/rejected": 0.2398691177368164, "step": 730 }, { "epoch": 0.59, "learning_rate": 2.134792428593971e-06, "logits/chosen": 0.5779368281364441, "logits/rejected": 0.4750920832157135, "logps/chosen": -3396.721923828125, "logps/rejected": -3118.03076171875, "loss": 0.0873, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.3966867923736572, "rewards/margins": 0.15320774912834167, "rewards/rejected": 0.24347904324531555, "step": 740 }, { "epoch": 0.6, "learning_rate": 2.0658795558326745e-06, "logits/chosen": 0.5595996379852295, "logits/rejected": 0.49162426590919495, "logps/chosen": -3377.83984375, "logps/rejected": -3203.514404296875, "loss": 0.0759, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.41549405455589294, "rewards/margins": 0.15610775351524353, "rewards/rejected": 0.2593863010406494, "step": 750 }, { "epoch": 0.61, "learning_rate": 1.997305197135089e-06, "logits/chosen": 0.5741230249404907, "logits/rejected": 0.499004602432251, "logps/chosen": -3418.917236328125, "logps/rejected": -3171.630859375, "loss": 0.0854, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.38339635729789734, "rewards/margins": 0.15282298624515533, "rewards/rejected": 0.2305733859539032, "step": 760 }, { "epoch": 0.62, "learning_rate": 1.9291228247233607e-06, "logits/chosen": 0.5983024835586548, "logits/rejected": 0.5140877962112427, "logps/chosen": -3385.004638671875, "logps/rejected": -3130.511474609375, "loss": 0.076, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.41556110978126526, "rewards/margins": 0.18630118668079376, "rewards/rejected": 0.2292599231004715, "step": 770 }, { "epoch": 0.62, "learning_rate": 1.8613856051605242e-06, "logits/chosen": 0.5828371644020081, "logits/rejected": 0.509624719619751, "logps/chosen": -3329.71923828125, "logps/rejected": -3106.307861328125, "loss": 0.0886, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3785203993320465, "rewards/margins": 0.17451588809490204, "rewards/rejected": 0.20400448143482208, "step": 780 }, { "epoch": 0.63, "learning_rate": 1.7941463578928088e-06, "logits/chosen": 0.5620870590209961, "logits/rejected": 0.4992052912712097, "logps/chosen": -3354.768310546875, "logps/rejected": -3018.64599609375, "loss": 0.0991, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3805944621562958, "rewards/margins": 0.16154679656028748, "rewards/rejected": 0.2190476357936859, "step": 790 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": 0.6222743391990662, "logits/rejected": 0.5590968132019043, "logps/chosen": -3424.283935546875, "logps/rejected": -3156.167724609375, "loss": 0.0838, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.39403584599494934, "rewards/margins": 0.16475993394851685, "rewards/rejected": 0.2292759120464325, "step": 800 }, { "epoch": 0.65, "learning_rate": 1.661371075624363e-06, "logits/chosen": 0.6057112216949463, "logits/rejected": 0.5170978903770447, "logps/chosen": -3288.67333984375, "logps/rejected": -2878.7490234375, "loss": 0.0902, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3696768581867218, "rewards/margins": 0.1375175267457962, "rewards/rejected": 0.2321593314409256, "step": 810 }, { "epoch": 0.66, "learning_rate": 1.5959385747947697e-06, "logits/chosen": 0.6024297475814819, "logits/rejected": 0.46327948570251465, "logps/chosen": -3431.43798828125, "logps/rejected": -2972.927734375, "loss": 0.0918, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.4163932204246521, "rewards/margins": 0.17428722977638245, "rewards/rejected": 0.24210599064826965, "step": 820 }, { "epoch": 0.66, "learning_rate": 1.5312110338697427e-06, "logits/chosen": 0.6199443340301514, "logits/rejected": 0.5287885069847107, "logps/chosen": -3343.03955078125, "logps/rejected": -3022.42236328125, "loss": 0.0814, "rewards/accuracies": 0.78125, "rewards/chosen": 0.399059921503067, "rewards/margins": 0.15364623069763184, "rewards/rejected": 0.2454136610031128, "step": 830 }, { "epoch": 0.67, "learning_rate": 1.467238925438646e-06, "logits/chosen": 0.6143923401832581, "logits/rejected": 0.5960370898246765, "logps/chosen": -3441.96875, "logps/rejected": -3306.02587890625, "loss": 0.0976, "rewards/accuracies": 0.84375, "rewards/chosen": 0.40943044424057007, "rewards/margins": 0.15342941880226135, "rewards/rejected": 0.2560010552406311, "step": 840 }, { "epoch": 0.68, "learning_rate": 1.4040721330273063e-06, "logits/chosen": 0.5770421624183655, "logits/rejected": 0.4791272282600403, "logps/chosen": -3493.46533203125, "logps/rejected": -3112.23486328125, "loss": 0.078, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.4063073992729187, "rewards/margins": 0.16731533408164978, "rewards/rejected": 0.23899206519126892, "step": 850 }, { "epoch": 0.69, "learning_rate": 1.3417599122003464e-06, "logits/chosen": 0.6255184412002563, "logits/rejected": 0.5643167495727539, "logps/chosen": -3481.497314453125, "logps/rejected": -3120.768310546875, "loss": 0.0768, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.38735657930374146, "rewards/margins": 0.1667819321155548, "rewards/rejected": 0.22057469189167023, "step": 860 }, { "epoch": 0.7, "learning_rate": 1.280350852153168e-06, "logits/chosen": 0.5782762765884399, "logits/rejected": 0.49263420701026917, "logps/chosen": -3499.00146484375, "logps/rejected": -3000.46728515625, "loss": 0.0836, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3818155527114868, "rewards/margins": 0.158293217420578, "rewards/rejected": 0.2235223352909088, "step": 870 }, { "epoch": 0.7, "learning_rate": 1.2198928378235717e-06, "logits/chosen": 0.5858504176139832, "logits/rejected": 0.4866867661476135, "logps/chosen": -3311.93115234375, "logps/rejected": -2876.67041015625, "loss": 0.0807, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3901521563529968, "rewards/margins": 0.17163410782814026, "rewards/rejected": 0.21851806342601776, "step": 880 }, { "epoch": 0.71, "learning_rate": 1.160433012552508e-06, "logits/chosen": 0.6068114638328552, "logits/rejected": 0.5134158730506897, "logps/chosen": -3114.994140625, "logps/rejected": -2820.002685546875, "loss": 0.0843, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.37761688232421875, "rewards/margins": 0.1500733643770218, "rewards/rejected": 0.22754351794719696, "step": 890 }, { "epoch": 0.72, "learning_rate": 1.1020177413231334e-06, "logits/chosen": 0.6057130098342896, "logits/rejected": 0.4943726062774658, "logps/chosen": -3233.512939453125, "logps/rejected": -2793.17724609375, "loss": 0.1014, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.37504011392593384, "rewards/margins": 0.15143273770809174, "rewards/rejected": 0.2236073762178421, "step": 900 }, { "epoch": 0.73, "learning_rate": 1.0446925746067768e-06, "logits/chosen": 0.6174426078796387, "logits/rejected": 0.5331483483314514, "logps/chosen": -3416.95556640625, "logps/rejected": -3206.383056640625, "loss": 0.0898, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.3918195962905884, "rewards/margins": 0.1552480012178421, "rewards/rejected": 0.23657159507274628, "step": 910 }, { "epoch": 0.74, "learning_rate": 9.88502212844063e-07, "logits/chosen": 0.6079164743423462, "logits/rejected": 0.5762253999710083, "logps/chosen": -3149.41357421875, "logps/rejected": -2938.776123046875, "loss": 0.0841, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3338172435760498, "rewards/margins": 0.11891861259937286, "rewards/rejected": 0.21489866077899933, "step": 920 }, { "epoch": 0.74, "learning_rate": 9.334904715888496e-07, "logits/chosen": 0.6562485694885254, "logits/rejected": 0.5610599517822266, "logps/chosen": -3430.844482421875, "logps/rejected": -3026.52587890625, "loss": 0.0942, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3751566410064697, "rewards/margins": 0.17377465963363647, "rewards/rejected": 0.20138195157051086, "step": 930 }, { "epoch": 0.75, "learning_rate": 8.797002473421729e-07, "logits/chosen": 0.6029684543609619, "logits/rejected": 0.5253230333328247, "logps/chosen": -3235.121826171875, "logps/rejected": -2938.1640625, "loss": 0.0747, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.35139986872673035, "rewards/margins": 0.16471409797668457, "rewards/rejected": 0.18668580055236816, "step": 940 }, { "epoch": 0.76, "learning_rate": 8.271734841028553e-07, "logits/chosen": 0.6538732051849365, "logits/rejected": 0.5982731580734253, "logps/chosen": -3381.48388671875, "logps/rejected": -3193.509033203125, "loss": 0.0925, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3694911003112793, "rewards/margins": 0.15476444363594055, "rewards/rejected": 0.21472665667533875, "step": 950 }, { "epoch": 0.77, "learning_rate": 7.759511406608255e-07, "logits/chosen": 0.6155306100845337, "logits/rejected": 0.5355127453804016, "logps/chosen": -3603.495361328125, "logps/rejected": -3289.190673828125, "loss": 0.0996, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.3712473511695862, "rewards/margins": 0.14055751264095306, "rewards/rejected": 0.23068983852863312, "step": 960 }, { "epoch": 0.78, "learning_rate": 7.260731586586983e-07, "logits/chosen": 0.6570934057235718, "logits/rejected": 0.5331934094429016, "logps/chosen": -3460.41552734375, "logps/rejected": -3020.72216796875, "loss": 0.094, "rewards/accuracies": 0.8125, "rewards/chosen": 0.36751216650009155, "rewards/margins": 0.16225677728652954, "rewards/rejected": 0.2052554190158844, "step": 970 }, { "epoch": 0.78, "learning_rate": 6.775784314464717e-07, "logits/chosen": 0.6543987393379211, "logits/rejected": 0.5447486639022827, "logps/chosen": -3222.0947265625, "logps/rejected": -2851.6064453125, "loss": 0.0877, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.3648824095726013, "rewards/margins": 0.15842023491859436, "rewards/rejected": 0.20646218955516815, "step": 980 }, { "epoch": 0.79, "learning_rate": 6.305047737536707e-07, "logits/chosen": 0.6519988179206848, "logits/rejected": 0.5568141341209412, "logps/chosen": -3336.877685546875, "logps/rejected": -3047.09326171875, "loss": 0.0764, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.40213853120803833, "rewards/margins": 0.1877773106098175, "rewards/rejected": 0.21436119079589844, "step": 990 }, { "epoch": 0.8, "learning_rate": 5.848888922025553e-07, "logits/chosen": 0.6370071172714233, "logits/rejected": 0.5316422581672668, "logps/chosen": -3368.276611328125, "logps/rejected": -2950.913330078125, "loss": 0.0863, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41091403365135193, "rewards/margins": 0.1616295725107193, "rewards/rejected": 0.24928446114063263, "step": 1000 }, { "epoch": 0.81, "learning_rate": 5.407663566854008e-07, "logits/chosen": 0.6158221960067749, "logits/rejected": 0.5360954403877258, "logps/chosen": -3394.571533203125, "logps/rejected": -3010.104248046875, "loss": 0.0845, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.4172899127006531, "rewards/margins": 0.18432098627090454, "rewards/rejected": 0.23296895623207092, "step": 1010 }, { "epoch": 0.82, "learning_rate": 4.981715726281666e-07, "logits/chosen": 0.6194905042648315, "logits/rejected": 0.5224823355674744, "logps/chosen": -3173.560546875, "logps/rejected": -2845.927734375, "loss": 0.0886, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.3923606276512146, "rewards/margins": 0.1794053614139557, "rewards/rejected": 0.2129552811384201, "step": 1020 }, { "epoch": 0.82, "learning_rate": 4.5713775416217884e-07, "logits/chosen": 0.630397617816925, "logits/rejected": 0.5148654580116272, "logps/chosen": -3482.11181640625, "logps/rejected": -3051.90673828125, "loss": 0.0877, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.41265344619750977, "rewards/margins": 0.1835349202156067, "rewards/rejected": 0.22911854088306427, "step": 1030 }, { "epoch": 0.83, "learning_rate": 4.1769689822475147e-07, "logits/chosen": 0.6434907913208008, "logits/rejected": 0.548559308052063, "logps/chosen": -3349.734375, "logps/rejected": -3031.77978515625, "loss": 0.0941, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3825233578681946, "rewards/margins": 0.14600971341133118, "rewards/rejected": 0.2365136444568634, "step": 1040 }, { "epoch": 0.84, "learning_rate": 3.798797596089351e-07, "logits/chosen": 0.6427907943725586, "logits/rejected": 0.5501333475112915, "logps/chosen": -3478.50537109375, "logps/rejected": -3245.615234375, "loss": 0.0937, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.41868337988853455, "rewards/margins": 0.1864897906780243, "rewards/rejected": 0.23219358921051025, "step": 1050 }, { "epoch": 0.85, "learning_rate": 3.4371582698185636e-07, "logits/chosen": 0.6372936964035034, "logits/rejected": 0.5432217717170715, "logps/chosen": -3423.430908203125, "logps/rejected": -3087.765380859375, "loss": 0.092, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.4114980101585388, "rewards/margins": 0.17217496037483215, "rewards/rejected": 0.23932309448719025, "step": 1060 }, { "epoch": 0.86, "learning_rate": 3.092332998903416e-07, "logits/chosen": 0.5996044874191284, "logits/rejected": 0.4986226558685303, "logps/chosen": -3295.41455078125, "logps/rejected": -2947.134765625, "loss": 0.0865, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.39540520310401917, "rewards/margins": 0.1856795847415924, "rewards/rejected": 0.20972561836242676, "step": 1070 }, { "epoch": 0.86, "learning_rate": 2.764590667717562e-07, "logits/chosen": 0.6375841498374939, "logits/rejected": 0.544662356376648, "logps/chosen": -3654.934326171875, "logps/rejected": -3230.848876953125, "loss": 0.067, "rewards/accuracies": 0.875, "rewards/chosen": 0.3808661103248596, "rewards/margins": 0.15394194424152374, "rewards/rejected": 0.22692415118217468, "step": 1080 }, { "epoch": 0.87, "learning_rate": 2.454186839872158e-07, "logits/chosen": 0.6398609280586243, "logits/rejected": 0.5464919805526733, "logps/chosen": -3495.825439453125, "logps/rejected": -3124.951416015625, "loss": 0.081, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.39376845955848694, "rewards/margins": 0.16370807588100433, "rewards/rejected": 0.23006033897399902, "step": 1090 }, { "epoch": 0.88, "learning_rate": 2.1613635589349756e-07, "logits/chosen": 0.6471344232559204, "logits/rejected": 0.5833622813224792, "logps/chosen": -3451.12890625, "logps/rejected": -3257.12353515625, "loss": 0.0869, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3840171694755554, "rewards/margins": 0.14787636697292328, "rewards/rejected": 0.23614077270030975, "step": 1100 }, { "epoch": 0.89, "learning_rate": 1.8863491596921745e-07, "logits/chosen": 0.6442585587501526, "logits/rejected": 0.5597777962684631, "logps/chosen": -3400.688720703125, "logps/rejected": -3029.727294921875, "loss": 0.0815, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.3747108280658722, "rewards/margins": 0.14767669141292572, "rewards/rejected": 0.22703413665294647, "step": 1110 }, { "epoch": 0.9, "learning_rate": 1.629358090099639e-07, "logits/chosen": 0.6222900152206421, "logits/rejected": 0.5375300645828247, "logps/chosen": -3375.23583984375, "logps/rejected": -3024.98876953125, "loss": 0.088, "rewards/accuracies": 0.8125, "rewards/chosen": 0.37642619013786316, "rewards/margins": 0.1609790325164795, "rewards/rejected": 0.21544715762138367, "step": 1120 }, { "epoch": 0.9, "learning_rate": 1.3905907440629752e-07, "logits/chosen": 0.6418689489364624, "logits/rejected": 0.5823384523391724, "logps/chosen": -3317.762451171875, "logps/rejected": -3089.084716796875, "loss": 0.0788, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3975849449634552, "rewards/margins": 0.17512866854667664, "rewards/rejected": 0.22245629131793976, "step": 1130 }, { "epoch": 0.91, "learning_rate": 1.1702333051763271e-07, "logits/chosen": 0.5996700525283813, "logits/rejected": 0.5386776924133301, "logps/chosen": -3202.969970703125, "logps/rejected": -2970.28173828125, "loss": 0.1025, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.343997061252594, "rewards/margins": 0.16654863953590393, "rewards/rejected": 0.17744839191436768, "step": 1140 }, { "epoch": 0.92, "learning_rate": 9.684576015420277e-08, "logits/chosen": 0.6135612726211548, "logits/rejected": 0.5466006398200989, "logps/chosen": -3316.67578125, "logps/rejected": -3048.313720703125, "loss": 0.0874, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3800659775733948, "rewards/margins": 0.17580585181713104, "rewards/rejected": 0.20426008105278015, "step": 1150 }, { "epoch": 0.93, "learning_rate": 7.854209717842231e-08, "logits/chosen": 0.6101081967353821, "logits/rejected": 0.5615028738975525, "logps/chosen": -3356.20849609375, "logps/rejected": -3210.05224609375, "loss": 0.0957, "rewards/accuracies": 0.78125, "rewards/chosen": 0.39068323373794556, "rewards/margins": 0.14721594750881195, "rewards/rejected": 0.2434672862291336, "step": 1160 }, { "epoch": 0.94, "learning_rate": 6.212661423609184e-08, "logits/chosen": 0.6116551160812378, "logits/rejected": 0.47189703583717346, "logps/chosen": -3391.901611328125, "logps/rejected": -2864.9482421875, "loss": 0.0721, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3698135316371918, "rewards/margins": 0.1599336564540863, "rewards/rejected": 0.20987987518310547, "step": 1170 }, { "epoch": 0.94, "learning_rate": 4.761211162702117e-08, "logits/chosen": 0.6695979833602905, "logits/rejected": 0.5461623668670654, "logps/chosen": -3409.35302734375, "logps/rejected": -3067.67431640625, "loss": 0.0853, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3886914849281311, "rewards/margins": 0.18535657227039337, "rewards/rejected": 0.20333492755889893, "step": 1180 }, { "epoch": 0.95, "learning_rate": 3.5009907323737826e-08, "logits/chosen": 0.6564788818359375, "logits/rejected": 0.5006071925163269, "logps/chosen": -3463.62255859375, "logps/rejected": -2909.28857421875, "loss": 0.0859, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.37883251905441284, "rewards/margins": 0.15553273260593414, "rewards/rejected": 0.2232998162508011, "step": 1190 }, { "epoch": 0.96, "learning_rate": 2.4329828146074096e-08, "logits/chosen": 0.6350787878036499, "logits/rejected": 0.5539794564247131, "logps/chosen": -3336.46044921875, "logps/rejected": -3126.2548828125, "loss": 0.0896, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.4052211344242096, "rewards/margins": 0.1808997541666031, "rewards/rejected": 0.2243214100599289, "step": 1200 }, { "epoch": 0.97, "learning_rate": 1.5580202098509078e-08, "logits/chosen": 0.6311666965484619, "logits/rejected": 0.599091649055481, "logps/chosen": -3485.178955078125, "logps/rejected": -3325.38134765625, "loss": 0.0873, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.39726322889328003, "rewards/margins": 0.1585725098848343, "rewards/rejected": 0.23869077861309052, "step": 1210 }, { "epoch": 0.98, "learning_rate": 8.767851876239075e-09, "logits/chosen": 0.6433550119400024, "logits/rejected": 0.5538659691810608, "logps/chosen": -3310.471923828125, "logps/rejected": -2977.670166015625, "loss": 0.0878, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.384525865316391, "rewards/margins": 0.16290965676307678, "rewards/rejected": 0.2216162383556366, "step": 1220 }, { "epoch": 0.98, "learning_rate": 3.8980895450474455e-09, "logits/chosen": 0.6242701411247253, "logits/rejected": 0.5688928365707397, "logps/chosen": -3227.052490234375, "logps/rejected": -2975.194091796875, "loss": 0.0877, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.3587878346443176, "rewards/margins": 0.16212721168994904, "rewards/rejected": 0.19666056334972382, "step": 1230 }, { "epoch": 0.99, "learning_rate": 9.747123991141193e-10, "logits/chosen": 0.655865490436554, "logits/rejected": 0.518544614315033, "logps/chosen": -3414.157470703125, "logps/rejected": -2845.206298828125, "loss": 0.0955, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.37420162558555603, "rewards/margins": 0.18059025704860687, "rewards/rejected": 0.19361138343811035, "step": 1240 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": 0.6276296973228455, "logits/rejected": 0.4904142916202545, "logps/chosen": -3474.27197265625, "logps/rejected": -2985.078857421875, "loss": 0.0931, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.36780911684036255, "rewards/margins": 0.17106689512729645, "rewards/rejected": 0.1967422217130661, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.09343003117442131, "train_runtime": 11089.5357, "train_samples_per_second": 1.804, "train_steps_per_second": 0.113 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }