{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9996397045577372, "eval_steps": 100, "global_step": 5550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003602954422626554, "grad_norm": 3.3397774696350098, "learning_rate": 9.009009009009008e-10, "logits/chosen": -1.9418047666549683, "logits/rejected": -1.931673288345337, "logps/chosen": -29.41129493713379, "logps/rejected": -34.63249206542969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0036029544226265538, "grad_norm": 3.7182133197784424, "learning_rate": 9.009009009009009e-09, "logits/chosen": -1.7383267879486084, "logits/rejected": -1.7110590934753418, "logps/chosen": -43.03986740112305, "logps/rejected": -43.70203399658203, "loss": 0.6932, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.0002388593857176602, "rewards/margins": -0.00014030587044544518, "rewards/rejected": -9.855348616838455e-05, "step": 10 }, { "epoch": 0.0072059088452531075, "grad_norm": 3.187004804611206, "learning_rate": 1.8018018018018017e-08, "logits/chosen": -1.7016605138778687, "logits/rejected": -1.684544324874878, "logps/chosen": -41.77806854248047, "logps/rejected": -44.841636657714844, "loss": 0.6932, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -5.4537718824576586e-05, "rewards/margins": -8.961025741882622e-05, "rewards/rejected": 3.50725713360589e-05, "step": 20 }, { "epoch": 0.010808863267879661, "grad_norm": 4.540226459503174, "learning_rate": 2.7027027027027028e-08, "logits/chosen": -1.6793386936187744, "logits/rejected": -1.6608819961547852, "logps/chosen": -45.18395233154297, "logps/rejected": -48.683021545410156, "loss": 0.6932, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -5.281760422803927e-06, "rewards/margins": -0.00016755127580836415, "rewards/rejected": 0.0001622694981051609, "step": 30 }, { "epoch": 0.014411817690506215, "grad_norm": 3.5439584255218506, "learning_rate": 3.6036036036036035e-08, "logits/chosen": -1.7683446407318115, "logits/rejected": -1.7544472217559814, "logps/chosen": -42.96169662475586, "logps/rejected": -44.480674743652344, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.9183791664545424e-05, "rewards/margins": 0.00010197125084232539, "rewards/rejected": -0.00012115505523979664, "step": 40 }, { "epoch": 0.01801477211313277, "grad_norm": 4.501108646392822, "learning_rate": 4.504504504504504e-08, "logits/chosen": -1.7577524185180664, "logits/rejected": -1.741100549697876, "logps/chosen": -50.425445556640625, "logps/rejected": -52.546630859375, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00023509531456511468, "rewards/margins": 0.00022509522386826575, "rewards/rejected": 1.0000122529163491e-05, "step": 50 }, { "epoch": 0.021617726535759323, "grad_norm": 4.594651222229004, "learning_rate": 5.4054054054054056e-08, "logits/chosen": -1.5931284427642822, "logits/rejected": -1.5910460948944092, "logps/chosen": -45.28499984741211, "logps/rejected": -49.063377380371094, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00016079492343124002, "rewards/margins": -1.9398279619053937e-05, "rewards/rejected": -0.00014139662380330265, "step": 60 }, { "epoch": 0.025220680958385876, "grad_norm": 3.4112484455108643, "learning_rate": 6.306306306306305e-08, "logits/chosen": -1.7921539545059204, "logits/rejected": -1.7922885417938232, "logps/chosen": -46.665245056152344, "logps/rejected": -48.13717269897461, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -4.898941915598698e-05, "rewards/margins": -0.0002294534060638398, "rewards/rejected": 0.00018046401964966208, "step": 70 }, { "epoch": 0.02882363538101243, "grad_norm": 3.629711627960205, "learning_rate": 7.207207207207207e-08, "logits/chosen": -1.7722715139389038, "logits/rejected": -1.7587053775787354, "logps/chosen": -39.46010208129883, "logps/rejected": -42.94160461425781, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.910423401976004e-05, "rewards/margins": -1.4359999113366939e-05, "rewards/rejected": -2.4744216716499068e-05, "step": 80 }, { "epoch": 0.03242658980363899, "grad_norm": 3.2938477993011475, "learning_rate": 8.108108108108108e-08, "logits/chosen": -1.6195322275161743, "logits/rejected": -1.6349570751190186, "logps/chosen": -43.25347900390625, "logps/rejected": -45.54538345336914, "loss": 0.6933, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00023719388991594315, "rewards/margins": -0.000220087036723271, "rewards/rejected": 0.00045728092663921416, "step": 90 }, { "epoch": 0.03602954422626554, "grad_norm": 4.304085731506348, "learning_rate": 9.009009009009008e-08, "logits/chosen": -1.690190076828003, "logits/rejected": -1.668461799621582, "logps/chosen": -42.72962188720703, "logps/rejected": -44.08342361450195, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -3.6253164580557495e-05, "rewards/margins": -4.533391984296031e-05, "rewards/rejected": 9.080766176339239e-06, "step": 100 }, { "epoch": 0.039632498648892095, "grad_norm": 4.63545560836792, "learning_rate": 9.909909909909909e-08, "logits/chosen": -1.753379225730896, "logits/rejected": -1.748039960861206, "logps/chosen": -49.486961364746094, "logps/rejected": -52.599761962890625, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.084323442541063e-06, "rewards/margins": -0.0001257166441064328, "rewards/rejected": 0.00011863231338793412, "step": 110 }, { "epoch": 0.043235453071518645, "grad_norm": 5.669524669647217, "learning_rate": 1.0810810810810811e-07, "logits/chosen": -1.7147136926651, "logits/rejected": -1.702016830444336, "logps/chosen": -51.8984489440918, "logps/rejected": -51.4279899597168, "loss": 0.6931, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 9.030712681123987e-05, "rewards/margins": 5.703966598957777e-05, "rewards/rejected": 3.326743535581045e-05, "step": 120 }, { "epoch": 0.0468384074941452, "grad_norm": 3.564293146133423, "learning_rate": 1.171171171171171e-07, "logits/chosen": -1.665785789489746, "logits/rejected": -1.6387412548065186, "logps/chosen": -46.71335983276367, "logps/rejected": -49.116981506347656, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00015762264956720173, "rewards/margins": 9.968892118195072e-05, "rewards/rejected": 5.7933742937166244e-05, "step": 130 }, { "epoch": 0.05044136191677175, "grad_norm": 3.2253429889678955, "learning_rate": 1.261261261261261e-07, "logits/chosen": -1.7900612354278564, "logits/rejected": -1.8017972707748413, "logps/chosen": -53.443931579589844, "logps/rejected": -56.4853515625, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.9897346394136548e-05, "rewards/margins": 7.243625441333279e-05, "rewards/rejected": -0.00010233358625555411, "step": 140 }, { "epoch": 0.05404431633939831, "grad_norm": 3.723487377166748, "learning_rate": 1.3513513513513515e-07, "logits/chosen": -1.722447156906128, "logits/rejected": -1.7019611597061157, "logps/chosen": -53.44658279418945, "logps/rejected": -54.85149383544922, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": -5.108219193061814e-05, "rewards/margins": 0.0001519794896012172, "rewards/rejected": -0.0002030616597039625, "step": 150 }, { "epoch": 0.05764727076202486, "grad_norm": 5.437617301940918, "learning_rate": 1.4414414414414414e-07, "logits/chosen": -1.7410888671875, "logits/rejected": -1.7434587478637695, "logps/chosen": -48.747833251953125, "logps/rejected": -50.5507698059082, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0003015986585523933, "rewards/margins": 0.00039068650221452117, "rewards/rejected": -8.908782911021262e-05, "step": 160 }, { "epoch": 0.06125022518465142, "grad_norm": 5.302024841308594, "learning_rate": 1.5315315315315313e-07, "logits/chosen": -1.5808699131011963, "logits/rejected": -1.5805413722991943, "logps/chosen": -50.6431770324707, "logps/rejected": -52.430580139160156, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0003786823363043368, "rewards/margins": 0.00028472780832089484, "rewards/rejected": 9.395449887961149e-05, "step": 170 }, { "epoch": 0.06485317960727797, "grad_norm": 3.489992141723633, "learning_rate": 1.6216216216216215e-07, "logits/chosen": -1.7394211292266846, "logits/rejected": -1.7349265813827515, "logps/chosen": -47.33000946044922, "logps/rejected": -49.8638801574707, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00022297371469903737, "rewards/margins": -5.568843334913254e-05, "rewards/rejected": 0.0002786621334962547, "step": 180 }, { "epoch": 0.06845613402990453, "grad_norm": 3.072382688522339, "learning_rate": 1.7117117117117117e-07, "logits/chosen": -1.8074477910995483, "logits/rejected": -1.7950681447982788, "logps/chosen": -40.70524978637695, "logps/rejected": -44.75992202758789, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00018566983635537326, "rewards/margins": -0.00016857947048265487, "rewards/rejected": 0.0003542492922861129, "step": 190 }, { "epoch": 0.07205908845253108, "grad_norm": 4.253015041351318, "learning_rate": 1.8018018018018017e-07, "logits/chosen": -1.7659227848052979, "logits/rejected": -1.762160062789917, "logps/chosen": -50.597862243652344, "logps/rejected": -52.700660705566406, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0002902823907788843, "rewards/margins": 0.00015576460282318294, "rewards/rejected": 0.0001345178607152775, "step": 200 }, { "epoch": 0.07566204287515763, "grad_norm": 4.544305324554443, "learning_rate": 1.891891891891892e-07, "logits/chosen": -1.6628167629241943, "logits/rejected": -1.645263433456421, "logps/chosen": -50.96441650390625, "logps/rejected": -52.852989196777344, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00016514495655428618, "rewards/margins": 0.0003304726560600102, "rewards/rejected": -0.00016532771405763924, "step": 210 }, { "epoch": 0.07926499729778419, "grad_norm": 2.668222427368164, "learning_rate": 1.9819819819819818e-07, "logits/chosen": -1.717664122581482, "logits/rejected": -1.686693787574768, "logps/chosen": -46.837650299072266, "logps/rejected": -49.449607849121094, "loss": 0.693, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00034715054789558053, "rewards/margins": 0.0003410983190406114, "rewards/rejected": 6.052269782230724e-06, "step": 220 }, { "epoch": 0.08286795172041074, "grad_norm": 3.3485231399536133, "learning_rate": 2.072072072072072e-07, "logits/chosen": -1.6586008071899414, "logits/rejected": -1.6540342569351196, "logps/chosen": -36.831581115722656, "logps/rejected": -39.075965881347656, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0005781602812930942, "rewards/margins": 0.0003644340031314641, "rewards/rejected": 0.00021372627816163003, "step": 230 }, { "epoch": 0.08647090614303729, "grad_norm": 3.4034311771392822, "learning_rate": 2.1621621621621622e-07, "logits/chosen": -1.6835705041885376, "logits/rejected": -1.6781299114227295, "logps/chosen": -51.54401779174805, "logps/rejected": -54.11664581298828, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0009162005735561252, "rewards/margins": 0.0011217787396162748, "rewards/rejected": -0.00020557809330057353, "step": 240 }, { "epoch": 0.09007386056566384, "grad_norm": 3.7628700733184814, "learning_rate": 2.2522522522522522e-07, "logits/chosen": -1.756882905960083, "logits/rejected": -1.7513837814331055, "logps/chosen": -55.46052932739258, "logps/rejected": -57.28558349609375, "loss": 0.6928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0012927096104249358, "rewards/margins": 0.0006762434495612979, "rewards/rejected": 0.0006164660444483161, "step": 250 }, { "epoch": 0.0936768149882904, "grad_norm": 4.458310604095459, "learning_rate": 2.342342342342342e-07, "logits/chosen": -1.7510229349136353, "logits/rejected": -1.7402935028076172, "logps/chosen": -43.141502380371094, "logps/rejected": -45.22883987426758, "loss": 0.6928, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0007634961511939764, "rewards/margins": 0.0006153647555038333, "rewards/rejected": 0.00014813135203439742, "step": 260 }, { "epoch": 0.09727976941091696, "grad_norm": 3.138791799545288, "learning_rate": 2.4324324324324326e-07, "logits/chosen": -1.6900430917739868, "logits/rejected": -1.6775468587875366, "logps/chosen": -48.70859909057617, "logps/rejected": -53.02415084838867, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0015983255580067635, "rewards/margins": 0.0010901021305471659, "rewards/rejected": 0.0005082233110442758, "step": 270 }, { "epoch": 0.1008827238335435, "grad_norm": 3.4879350662231445, "learning_rate": 2.522522522522522e-07, "logits/chosen": -1.659205675125122, "logits/rejected": -1.650123953819275, "logps/chosen": -42.67085647583008, "logps/rejected": -47.37016677856445, "loss": 0.6926, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0017988558392971754, "rewards/margins": 0.001047928468324244, "rewards/rejected": 0.0007509274873882532, "step": 280 }, { "epoch": 0.10448567825617006, "grad_norm": 5.909618377685547, "learning_rate": 2.6126126126126124e-07, "logits/chosen": -1.6975767612457275, "logits/rejected": -1.6904857158660889, "logps/chosen": -49.704307556152344, "logps/rejected": -53.21978759765625, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001446915091946721, "rewards/margins": 0.001141748740337789, "rewards/rejected": 0.0003051663807127625, "step": 290 }, { "epoch": 0.10808863267879662, "grad_norm": 6.1678853034973145, "learning_rate": 2.702702702702703e-07, "logits/chosen": -1.7316814661026, "logits/rejected": -1.7211496829986572, "logps/chosen": -46.54294967651367, "logps/rejected": -49.60760498046875, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002290563890710473, "rewards/margins": 0.0022531836293637753, "rewards/rejected": 3.7380214052973315e-05, "step": 300 }, { "epoch": 0.11169158710142317, "grad_norm": 3.0397493839263916, "learning_rate": 2.7927927927927923e-07, "logits/chosen": -1.7990278005599976, "logits/rejected": -1.7732198238372803, "logps/chosen": -38.29768753051758, "logps/rejected": -41.79767990112305, "loss": 0.6921, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0024967927020043135, "rewards/margins": 0.0021534208208322525, "rewards/rejected": 0.0003433715901337564, "step": 310 }, { "epoch": 0.11529454152404972, "grad_norm": 3.145111560821533, "learning_rate": 2.882882882882883e-07, "logits/chosen": -1.5970180034637451, "logits/rejected": -1.5767868757247925, "logps/chosen": -43.7572135925293, "logps/rejected": -47.30242919921875, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0019000060856342316, "rewards/margins": 0.0017187722260132432, "rewards/rejected": 0.00018123406334780157, "step": 320 }, { "epoch": 0.11889749594667627, "grad_norm": 3.4514622688293457, "learning_rate": 2.972972972972973e-07, "logits/chosen": -1.7697391510009766, "logits/rejected": -1.725419282913208, "logps/chosen": -46.631221771240234, "logps/rejected": -48.73198699951172, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0027934780810028315, "rewards/margins": 0.002238652901723981, "rewards/rejected": 0.0005548248300328851, "step": 330 }, { "epoch": 0.12250045036930284, "grad_norm": 4.041067123413086, "learning_rate": 3.0630630630630627e-07, "logits/chosen": -1.674444556236267, "logits/rejected": -1.674968957901001, "logps/chosen": -42.48195266723633, "logps/rejected": -44.92963409423828, "loss": 0.6918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0031822682358324528, "rewards/margins": 0.0027774940244853497, "rewards/rejected": 0.0004047740367241204, "step": 340 }, { "epoch": 0.12610340479192939, "grad_norm": 5.933683395385742, "learning_rate": 3.153153153153153e-07, "logits/chosen": -1.6941207647323608, "logits/rejected": -1.681911826133728, "logps/chosen": -49.30591583251953, "logps/rejected": -51.21431350708008, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0034376985859125853, "rewards/margins": 0.002514602616429329, "rewards/rejected": 0.0009230962023139, "step": 350 }, { "epoch": 0.12970635921455595, "grad_norm": 4.507534027099609, "learning_rate": 3.243243243243243e-07, "logits/chosen": -1.7088005542755127, "logits/rejected": -1.693945288658142, "logps/chosen": -40.02022171020508, "logps/rejected": -42.422332763671875, "loss": 0.6902, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0054672518745064735, "rewards/margins": 0.005961936432868242, "rewards/rejected": -0.0004946846747770905, "step": 360 }, { "epoch": 0.13330931363718249, "grad_norm": 2.685657501220703, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.703002691268921, "logits/rejected": -1.7116321325302124, "logps/chosen": -41.613525390625, "logps/rejected": -46.64238739013672, "loss": 0.6911, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004459428135305643, "rewards/margins": 0.004147970117628574, "rewards/rejected": 0.00031145798857323825, "step": 370 }, { "epoch": 0.13691226805980905, "grad_norm": 3.4917635917663574, "learning_rate": 3.4234234234234235e-07, "logits/chosen": -1.8192132711410522, "logits/rejected": -1.80001699924469, "logps/chosen": -41.8767204284668, "logps/rejected": -43.953147888183594, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005829638335853815, "rewards/margins": 0.004118152894079685, "rewards/rejected": 0.001711485325358808, "step": 380 }, { "epoch": 0.1405152224824356, "grad_norm": 3.218789577484131, "learning_rate": 3.5135135135135134e-07, "logits/chosen": -1.7629024982452393, "logits/rejected": -1.7467416524887085, "logps/chosen": -42.2618293762207, "logps/rejected": -44.198760986328125, "loss": 0.6912, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.004580915905535221, "rewards/margins": 0.003979508765041828, "rewards/rejected": 0.000601407082285732, "step": 390 }, { "epoch": 0.14411817690506215, "grad_norm": 6.135593414306641, "learning_rate": 3.6036036036036033e-07, "logits/chosen": -1.6290171146392822, "logits/rejected": -1.6323350667953491, "logps/chosen": -43.67991256713867, "logps/rejected": -49.84873580932617, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005232072435319424, "rewards/margins": 0.004124890547245741, "rewards/rejected": 0.0011071818880736828, "step": 400 }, { "epoch": 0.14772113132768872, "grad_norm": 3.853224039077759, "learning_rate": 3.6936936936936933e-07, "logits/chosen": -1.7248551845550537, "logits/rejected": -1.7100718021392822, "logps/chosen": -38.718650817871094, "logps/rejected": -40.440452575683594, "loss": 0.6895, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004796669818460941, "rewards/margins": 0.007348486687988043, "rewards/rejected": -0.002551817102357745, "step": 410 }, { "epoch": 0.15132408575031525, "grad_norm": 3.83880877494812, "learning_rate": 3.783783783783784e-07, "logits/chosen": -1.6671268939971924, "logits/rejected": -1.6588226556777954, "logps/chosen": -46.447330474853516, "logps/rejected": -46.89680862426758, "loss": 0.6897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004480901174247265, "rewards/margins": 0.00695028668269515, "rewards/rejected": -0.002469385042786598, "step": 420 }, { "epoch": 0.15492704017294182, "grad_norm": 4.403101921081543, "learning_rate": 3.8738738738738737e-07, "logits/chosen": -1.7254947423934937, "logits/rejected": -1.7298320531845093, "logps/chosen": -47.78804016113281, "logps/rejected": -55.29218292236328, "loss": 0.6878, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003573561552911997, "rewards/margins": 0.010887719690799713, "rewards/rejected": -0.007314157672226429, "step": 430 }, { "epoch": 0.15852999459556838, "grad_norm": 4.868645191192627, "learning_rate": 3.9639639639639636e-07, "logits/chosen": -1.688886284828186, "logits/rejected": -1.670697569847107, "logps/chosen": -46.06879806518555, "logps/rejected": -51.101715087890625, "loss": 0.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0051663839258253574, "rewards/margins": 0.01150798611342907, "rewards/rejected": -0.006341601721942425, "step": 440 }, { "epoch": 0.16213294901819492, "grad_norm": 4.604848384857178, "learning_rate": 4.054054054054054e-07, "logits/chosen": -1.6967153549194336, "logits/rejected": -1.6993358135223389, "logps/chosen": -44.147483825683594, "logps/rejected": -50.80498504638672, "loss": 0.6879, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.006432811263948679, "rewards/margins": 0.010621501132845879, "rewards/rejected": -0.0041886912658810616, "step": 450 }, { "epoch": 0.16573590344082148, "grad_norm": 4.497652053833008, "learning_rate": 4.144144144144144e-07, "logits/chosen": -1.764259696006775, "logits/rejected": -1.7514150142669678, "logps/chosen": -43.84403610229492, "logps/rejected": -46.98237991333008, "loss": 0.6873, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0029778308235108852, "rewards/margins": 0.011888700537383556, "rewards/rejected": -0.008910869248211384, "step": 460 }, { "epoch": 0.16933885786344802, "grad_norm": 4.147437572479248, "learning_rate": 4.234234234234234e-07, "logits/chosen": -1.8267230987548828, "logits/rejected": -1.8229129314422607, "logps/chosen": -42.10190963745117, "logps/rejected": -46.67747497558594, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.0027582012116909027, "rewards/margins": 0.006137028336524963, "rewards/rejected": -0.0033788266591727734, "step": 470 }, { "epoch": 0.17294181228607458, "grad_norm": 4.121939659118652, "learning_rate": 4.3243243243243244e-07, "logits/chosen": -1.7018091678619385, "logits/rejected": -1.681560754776001, "logps/chosen": -55.833404541015625, "logps/rejected": -57.60951614379883, "loss": 0.6854, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0008912247722037137, "rewards/margins": 0.016034971922636032, "rewards/rejected": -0.015143746510148048, "step": 480 }, { "epoch": 0.17654476670870115, "grad_norm": 3.8007025718688965, "learning_rate": 4.414414414414414e-07, "logits/chosen": -1.6423200368881226, "logits/rejected": -1.6475427150726318, "logps/chosen": -49.587913513183594, "logps/rejected": -54.42341232299805, "loss": 0.6843, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0029760824982076883, "rewards/margins": 0.018089579418301582, "rewards/rejected": -0.015113498084247112, "step": 490 }, { "epoch": 0.18014772113132768, "grad_norm": 6.992798328399658, "learning_rate": 4.5045045045045043e-07, "logits/chosen": -1.6029027700424194, "logits/rejected": -1.576078176498413, "logps/chosen": -49.34995651245117, "logps/rejected": -51.18292999267578, "loss": 0.6797, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0019840318709611893, "rewards/margins": 0.027620479464530945, "rewards/rejected": -0.025636449456214905, "step": 500 }, { "epoch": 0.18375067555395425, "grad_norm": 6.653567790985107, "learning_rate": 4.594594594594595e-07, "logits/chosen": -1.6146643161773682, "logits/rejected": -1.6065555810928345, "logps/chosen": -45.69719314575195, "logps/rejected": -49.471824645996094, "loss": 0.6827, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 3.5562366974772885e-05, "rewards/margins": 0.0216769240796566, "rewards/rejected": -0.021641362458467484, "step": 510 }, { "epoch": 0.1873536299765808, "grad_norm": 5.723504066467285, "learning_rate": 4.684684684684684e-07, "logits/chosen": -1.687718391418457, "logits/rejected": -1.6782705783843994, "logps/chosen": -55.974578857421875, "logps/rejected": -58.01393508911133, "loss": 0.69, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.016683388501405716, "rewards/margins": 0.007352002896368504, "rewards/rejected": -0.024035390466451645, "step": 520 }, { "epoch": 0.19095658439920735, "grad_norm": 4.197725772857666, "learning_rate": 4.774774774774775e-07, "logits/chosen": -1.7161273956298828, "logits/rejected": -1.7011467218399048, "logps/chosen": -44.741886138916016, "logps/rejected": -50.11282730102539, "loss": 0.6836, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.013202029280364513, "rewards/margins": 0.02028127945959568, "rewards/rejected": -0.03348330780863762, "step": 530 }, { "epoch": 0.1945595388218339, "grad_norm": 4.084933280944824, "learning_rate": 4.864864864864865e-07, "logits/chosen": -1.6937358379364014, "logits/rejected": -1.670254111289978, "logps/chosen": -50.84514236450195, "logps/rejected": -55.533592224121094, "loss": 0.6794, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007875502109527588, "rewards/margins": 0.02891036495566368, "rewards/rejected": -0.03678586333990097, "step": 540 }, { "epoch": 0.19816249324446045, "grad_norm": 6.4256911277771, "learning_rate": 4.954954954954955e-07, "logits/chosen": -1.5930603742599487, "logits/rejected": -1.5785481929779053, "logps/chosen": -49.83226776123047, "logps/rejected": -51.831695556640625, "loss": 0.6835, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00859083142131567, "rewards/margins": 0.02024068497121334, "rewards/rejected": -0.028831515461206436, "step": 550 }, { "epoch": 0.201765447667087, "grad_norm": 5.257549285888672, "learning_rate": 4.999987638293614e-07, "logits/chosen": -1.688340187072754, "logits/rejected": -1.6726744174957275, "logps/chosen": -37.71453094482422, "logps/rejected": -44.47661590576172, "loss": 0.6733, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.017620814964175224, "rewards/margins": 0.04227476567029953, "rewards/rejected": -0.0598955862224102, "step": 560 }, { "epoch": 0.20536840208971358, "grad_norm": 6.3576884269714355, "learning_rate": 4.999888745376028e-07, "logits/chosen": -1.5950968265533447, "logits/rejected": -1.5918595790863037, "logps/chosen": -48.646385192871094, "logps/rejected": -54.294288635253906, "loss": 0.6794, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03091433085501194, "rewards/margins": 0.02918051742017269, "rewards/rejected": -0.06009485572576523, "step": 570 }, { "epoch": 0.2089713565123401, "grad_norm": 6.035130500793457, "learning_rate": 4.999690963452795e-07, "logits/chosen": -1.7354274988174438, "logits/rejected": -1.7171862125396729, "logps/chosen": -49.645389556884766, "logps/rejected": -51.204010009765625, "loss": 0.6857, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0618695430457592, "rewards/margins": 0.016820868477225304, "rewards/rejected": -0.07869041711091995, "step": 580 }, { "epoch": 0.21257431093496668, "grad_norm": 7.045923233032227, "learning_rate": 4.999394300347652e-07, "logits/chosen": -1.5585837364196777, "logits/rejected": -1.5490951538085938, "logps/chosen": -58.975677490234375, "logps/rejected": -63.8331413269043, "loss": 0.6795, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.058443933725357056, "rewards/margins": 0.0303555428981781, "rewards/rejected": -0.08879947662353516, "step": 590 }, { "epoch": 0.21617726535759324, "grad_norm": 5.241565704345703, "learning_rate": 4.998998767795804e-07, "logits/chosen": -1.551060676574707, "logits/rejected": -1.5270111560821533, "logps/chosen": -52.183326721191406, "logps/rejected": -61.41755294799805, "loss": 0.6659, "rewards/accuracies": 0.625, "rewards/chosen": -0.08854193985462189, "rewards/margins": 0.0613928847014904, "rewards/rejected": -0.1499348133802414, "step": 600 }, { "epoch": 0.21978021978021978, "grad_norm": 8.864941596984863, "learning_rate": 4.998504381443478e-07, "logits/chosen": -1.402044653892517, "logits/rejected": -1.4134305715560913, "logps/chosen": -62.51812744140625, "logps/rejected": -69.44048309326172, "loss": 0.6708, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12022509425878525, "rewards/margins": 0.0526595413684845, "rewards/rejected": -0.17288464307785034, "step": 610 }, { "epoch": 0.22338317420284634, "grad_norm": 12.818338394165039, "learning_rate": 4.997911160847295e-07, "logits/chosen": -1.4413927793502808, "logits/rejected": -1.4285277128219604, "logps/chosen": -59.63031005859375, "logps/rejected": -68.47859191894531, "loss": 0.6644, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1399981677532196, "rewards/margins": 0.06519350409507751, "rewards/rejected": -0.2051916867494583, "step": 620 }, { "epoch": 0.22698612862547288, "grad_norm": 13.486601829528809, "learning_rate": 4.997219129473494e-07, "logits/chosen": -1.3681890964508057, "logits/rejected": -1.3525458574295044, "logps/chosen": -73.00723266601562, "logps/rejected": -84.07337951660156, "loss": 0.666, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.26885923743247986, "rewards/margins": 0.06815784424543381, "rewards/rejected": -0.3370170593261719, "step": 630 }, { "epoch": 0.23058908304809944, "grad_norm": 11.271373748779297, "learning_rate": 4.996428314697015e-07, "logits/chosen": -1.2895841598510742, "logits/rejected": -1.2766430377960205, "logps/chosen": -70.99652862548828, "logps/rejected": -81.04373931884766, "loss": 0.6722, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2290995866060257, "rewards/margins": 0.05413205549120903, "rewards/rejected": -0.283231645822525, "step": 640 }, { "epoch": 0.234192037470726, "grad_norm": 8.742398262023926, "learning_rate": 4.995538747800402e-07, "logits/chosen": -1.2628368139266968, "logits/rejected": -1.2450822591781616, "logps/chosen": -70.45149993896484, "logps/rejected": -74.6968765258789, "loss": 0.6719, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1835000365972519, "rewards/margins": 0.05531931668519974, "rewards/rejected": -0.23881936073303223, "step": 650 }, { "epoch": 0.23779499189335254, "grad_norm": 8.972411155700684, "learning_rate": 4.994550463972576e-07, "logits/chosen": -1.288618803024292, "logits/rejected": -1.2918486595153809, "logps/chosen": -69.60689544677734, "logps/rejected": -72.79714965820312, "loss": 0.68, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.20229852199554443, "rewards/margins": 0.04259002208709717, "rewards/rejected": -0.2448885142803192, "step": 660 }, { "epoch": 0.2413979463159791, "grad_norm": 10.873099327087402, "learning_rate": 4.99346350230744e-07, "logits/chosen": -1.3272793292999268, "logits/rejected": -1.3133655786514282, "logps/chosen": -82.43290710449219, "logps/rejected": -95.42029571533203, "loss": 0.6403, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.23164968192577362, "rewards/margins": 0.12773051857948303, "rewards/rejected": -0.35938018560409546, "step": 670 }, { "epoch": 0.24500090073860567, "grad_norm": 12.110127449035645, "learning_rate": 4.992277905802331e-07, "logits/chosen": -1.0249826908111572, "logits/rejected": -1.024247407913208, "logps/chosen": -71.12178802490234, "logps/rejected": -87.04933166503906, "loss": 0.6347, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2818991243839264, "rewards/margins": 0.14090515673160553, "rewards/rejected": -0.4228042662143707, "step": 680 }, { "epoch": 0.2486038551612322, "grad_norm": 18.234243392944336, "learning_rate": 4.990993721356315e-07, "logits/chosen": -0.8412225842475891, "logits/rejected": -0.8490460515022278, "logps/chosen": -85.99247741699219, "logps/rejected": -94.976806640625, "loss": 0.6776, "rewards/accuracies": 0.5625, "rewards/chosen": -0.40948066115379333, "rewards/margins": 0.04921000450849533, "rewards/rejected": -0.45869070291519165, "step": 690 }, { "epoch": 0.25220680958385877, "grad_norm": 13.038808822631836, "learning_rate": 4.989610999768348e-07, "logits/chosen": -0.7974184155464172, "logits/rejected": -0.7845913767814636, "logps/chosen": -83.24944305419922, "logps/rejected": -98.57124328613281, "loss": 0.6403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3702521026134491, "rewards/margins": 0.13462284207344055, "rewards/rejected": -0.5048748850822449, "step": 700 }, { "epoch": 0.2558097640064853, "grad_norm": 11.371808052062988, "learning_rate": 4.988129795735248e-07, "logits/chosen": -0.8534584045410156, "logits/rejected": -0.8415006399154663, "logps/chosen": -89.6837158203125, "logps/rejected": -102.817626953125, "loss": 0.645, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.32941603660583496, "rewards/margins": 0.11651470512151718, "rewards/rejected": -0.44593071937561035, "step": 710 }, { "epoch": 0.2594127184291119, "grad_norm": 14.147513389587402, "learning_rate": 4.986550167849537e-07, "logits/chosen": -0.7822316884994507, "logits/rejected": -0.7774645090103149, "logps/chosen": -92.55155944824219, "logps/rejected": -106.33164978027344, "loss": 0.6527, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.41462668776512146, "rewards/margins": 0.09965832531452179, "rewards/rejected": -0.5142850279808044, "step": 720 }, { "epoch": 0.26301567285173844, "grad_norm": 16.63142204284668, "learning_rate": 4.98487217859713e-07, "logits/chosen": -0.6910918354988098, "logits/rejected": -0.700725793838501, "logps/chosen": -93.5431900024414, "logps/rejected": -109.87126159667969, "loss": 0.6497, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.44943666458129883, "rewards/margins": 0.1114300936460495, "rewards/rejected": -0.5608667135238647, "step": 730 }, { "epoch": 0.26661862727436497, "grad_norm": 19.47760772705078, "learning_rate": 4.983095894354857e-07, "logits/chosen": -0.6674941778182983, "logits/rejected": -0.6683529019355774, "logps/chosen": -103.37028503417969, "logps/rejected": -119.79246520996094, "loss": 0.6494, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5528812408447266, "rewards/margins": 0.1224491149187088, "rewards/rejected": -0.6753303408622742, "step": 740 }, { "epoch": 0.2702215816969915, "grad_norm": 20.79136085510254, "learning_rate": 4.981221385387837e-07, "logits/chosen": -0.8482168316841125, "logits/rejected": -0.8399343490600586, "logps/chosen": -95.40355682373047, "logps/rejected": -100.78687286376953, "loss": 0.6708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4453433156013489, "rewards/margins": 0.06763346493244171, "rewards/rejected": -0.5129767656326294, "step": 750 }, { "epoch": 0.2738245361196181, "grad_norm": 11.2230863571167, "learning_rate": 4.979248725846701e-07, "logits/chosen": -1.2313950061798096, "logits/rejected": -1.2393567562103271, "logps/chosen": -69.79106140136719, "logps/rejected": -82.90087890625, "loss": 0.6528, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.2664026618003845, "rewards/margins": 0.09638935327529907, "rewards/rejected": -0.3627920150756836, "step": 760 }, { "epoch": 0.27742749054224464, "grad_norm": 14.57046127319336, "learning_rate": 4.977177993764659e-07, "logits/chosen": -1.3615708351135254, "logits/rejected": -1.343949556350708, "logps/chosen": -77.51803588867188, "logps/rejected": -86.98101806640625, "loss": 0.6578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2723795175552368, "rewards/margins": 0.08878382295370102, "rewards/rejected": -0.36116331815719604, "step": 770 }, { "epoch": 0.2810304449648712, "grad_norm": 12.90167236328125, "learning_rate": 4.975009271054409e-07, "logits/chosen": -1.2985954284667969, "logits/rejected": -1.3072916269302368, "logps/chosen": -72.69700622558594, "logps/rejected": -90.4751968383789, "loss": 0.645, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.26569822430610657, "rewards/margins": 0.11753193289041519, "rewards/rejected": -0.38323014974594116, "step": 780 }, { "epoch": 0.28463339938749777, "grad_norm": 13.231833457946777, "learning_rate": 4.972742643504904e-07, "logits/chosen": -1.2923033237457275, "logits/rejected": -1.2837045192718506, "logps/chosen": -73.27848052978516, "logps/rejected": -74.68667602539062, "loss": 0.6953, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.27220702171325684, "rewards/margins": 0.008732852526009083, "rewards/rejected": -0.28093987703323364, "step": 790 }, { "epoch": 0.2882363538101243, "grad_norm": 12.5423002243042, "learning_rate": 4.970378200777948e-07, "logits/chosen": -1.3273861408233643, "logits/rejected": -1.3253200054168701, "logps/chosen": -69.21495056152344, "logps/rejected": -80.0616683959961, "loss": 0.6575, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.2545456290245056, "rewards/margins": 0.08964885026216507, "rewards/rejected": -0.3441944718360901, "step": 800 }, { "epoch": 0.29183930823275084, "grad_norm": 27.842836380004883, "learning_rate": 4.967916036404664e-07, "logits/chosen": -1.2187349796295166, "logits/rejected": -1.2240660190582275, "logps/chosen": -74.37635803222656, "logps/rejected": -85.68818664550781, "loss": 0.673, "rewards/accuracies": 0.5, "rewards/chosen": -0.3206605315208435, "rewards/margins": 0.07676301896572113, "rewards/rejected": -0.3974235951900482, "step": 810 }, { "epoch": 0.29544226265537743, "grad_norm": 16.682466506958008, "learning_rate": 4.965356247781778e-07, "logits/chosen": -1.3194482326507568, "logits/rejected": -1.3158743381500244, "logps/chosen": -88.14268493652344, "logps/rejected": -96.72323608398438, "loss": 0.6606, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3540545701980591, "rewards/margins": 0.08011126518249512, "rewards/rejected": -0.4341658651828766, "step": 820 }, { "epoch": 0.29904521707800397, "grad_norm": 11.893294334411621, "learning_rate": 4.962698936167778e-07, "logits/chosen": -1.3116481304168701, "logits/rejected": -1.3138093948364258, "logps/chosen": -78.41636657714844, "logps/rejected": -91.9908447265625, "loss": 0.6513, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.32214006781578064, "rewards/margins": 0.10692934691905975, "rewards/rejected": -0.4290694296360016, "step": 830 }, { "epoch": 0.3026481715006305, "grad_norm": 16.89285659790039, "learning_rate": 4.959944206678903e-07, "logits/chosen": -1.1496102809906006, "logits/rejected": -1.1655839681625366, "logps/chosen": -85.49686431884766, "logps/rejected": -94.6566390991211, "loss": 0.6752, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.40377917885780334, "rewards/margins": 0.054061077535152435, "rewards/rejected": -0.4578402638435364, "step": 840 }, { "epoch": 0.3062511259232571, "grad_norm": 15.432175636291504, "learning_rate": 4.957092168284986e-07, "logits/chosen": -1.125409483909607, "logits/rejected": -1.1296653747558594, "logps/chosen": -84.02685546875, "logps/rejected": -89.34877014160156, "loss": 0.6686, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.35803866386413574, "rewards/margins": 0.059441715478897095, "rewards/rejected": -0.41748037934303284, "step": 850 }, { "epoch": 0.30985408034588363, "grad_norm": 14.408708572387695, "learning_rate": 4.954142933805145e-07, "logits/chosen": -0.9184282422065735, "logits/rejected": -0.9147431254386902, "logps/chosen": -91.42207336425781, "logps/rejected": -102.40242004394531, "loss": 0.6539, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.47459083795547485, "rewards/margins": 0.10091207176446915, "rewards/rejected": -0.5755028128623962, "step": 860 }, { "epoch": 0.31345703476851017, "grad_norm": 12.480093955993652, "learning_rate": 4.951096619903317e-07, "logits/chosen": -0.9125533103942871, "logits/rejected": -0.913791835308075, "logps/chosen": -91.58851623535156, "logps/rejected": -107.85359191894531, "loss": 0.6569, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46451687812805176, "rewards/margins": 0.10352380573749542, "rewards/rejected": -0.568040668964386, "step": 870 }, { "epoch": 0.31705998919113676, "grad_norm": 12.55040168762207, "learning_rate": 4.947953347083645e-07, "logits/chosen": -1.222707986831665, "logits/rejected": -1.2129634618759155, "logps/chosen": -86.14817810058594, "logps/rejected": -98.69571685791016, "loss": 0.6519, "rewards/accuracies": 0.5625, "rewards/chosen": -0.39243194460868835, "rewards/margins": 0.11647919565439224, "rewards/rejected": -0.5089111328125, "step": 880 }, { "epoch": 0.3206629436137633, "grad_norm": 14.883810997009277, "learning_rate": 4.944713239685713e-07, "logits/chosen": -1.4141329526901245, "logits/rejected": -1.3852875232696533, "logps/chosen": -95.36436462402344, "logps/rejected": -96.96843719482422, "loss": 0.6809, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3810012936592102, "rewards/margins": 0.0413370355963707, "rewards/rejected": -0.4223383069038391, "step": 890 }, { "epoch": 0.32426589803638983, "grad_norm": 10.3993501663208, "learning_rate": 4.941376425879623e-07, "logits/chosen": -1.3478530645370483, "logits/rejected": -1.336745023727417, "logps/chosen": -88.2947998046875, "logps/rejected": -102.8171615600586, "loss": 0.6299, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3908626437187195, "rewards/margins": 0.15889796614646912, "rewards/rejected": -0.549760639667511, "step": 900 }, { "epoch": 0.32786885245901637, "grad_norm": 14.404094696044922, "learning_rate": 4.93794303766093e-07, "logits/chosen": -1.1831821203231812, "logits/rejected": -1.1733192205429077, "logps/chosen": -92.18428802490234, "logps/rejected": -102.34025573730469, "loss": 0.6588, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4617028832435608, "rewards/margins": 0.08420713245868683, "rewards/rejected": -0.5459100008010864, "step": 910 }, { "epoch": 0.33147180688164296, "grad_norm": 13.073732376098633, "learning_rate": 4.934413210845417e-07, "logits/chosen": -1.1828866004943848, "logits/rejected": -1.1633810997009277, "logps/chosen": -102.12568664550781, "logps/rejected": -116.79570007324219, "loss": 0.6401, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4046645164489746, "rewards/margins": 0.1273595094680786, "rewards/rejected": -0.5320240259170532, "step": 920 }, { "epoch": 0.3350747613042695, "grad_norm": 16.678611755371094, "learning_rate": 4.930787085063722e-07, "logits/chosen": -1.1912957429885864, "logits/rejected": -1.1653717756271362, "logps/chosen": -96.25555419921875, "logps/rejected": -109.46162414550781, "loss": 0.6369, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4989704489707947, "rewards/margins": 0.13963612914085388, "rewards/rejected": -0.638606607913971, "step": 930 }, { "epoch": 0.33867771572689603, "grad_norm": 17.838438034057617, "learning_rate": 4.927064803755819e-07, "logits/chosen": -1.061514973640442, "logits/rejected": -1.0441539287567139, "logps/chosen": -104.01774597167969, "logps/rejected": -120.51075744628906, "loss": 0.6323, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.45036545395851135, "rewards/margins": 0.14668992161750793, "rewards/rejected": -0.5970553159713745, "step": 940 }, { "epoch": 0.3422806701495226, "grad_norm": 13.769434928894043, "learning_rate": 4.923246514165338e-07, "logits/chosen": -0.9698660969734192, "logits/rejected": -0.949456512928009, "logps/chosen": -89.14216613769531, "logps/rejected": -103.09718322753906, "loss": 0.6471, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46100878715515137, "rewards/margins": 0.11799683421850204, "rewards/rejected": -0.5790055990219116, "step": 950 }, { "epoch": 0.34588362457214916, "grad_norm": 13.711132049560547, "learning_rate": 4.919332367333748e-07, "logits/chosen": -0.7421332597732544, "logits/rejected": -0.737074077129364, "logps/chosen": -93.01615905761719, "logps/rejected": -108.7341537475586, "loss": 0.6456, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5092485547065735, "rewards/margins": 0.12116242945194244, "rewards/rejected": -0.6304109692573547, "step": 960 }, { "epoch": 0.3494865789947757, "grad_norm": 13.020487785339355, "learning_rate": 4.915322518094369e-07, "logits/chosen": -0.666183590888977, "logits/rejected": -0.6703653931617737, "logps/chosen": -102.07322692871094, "logps/rejected": -115.79020690917969, "loss": 0.6512, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.5666999816894531, "rewards/margins": 0.11544916778802872, "rewards/rejected": -0.6821491122245789, "step": 970 }, { "epoch": 0.3530895334174023, "grad_norm": 15.92390251159668, "learning_rate": 4.911217125066267e-07, "logits/chosen": -0.7013139724731445, "logits/rejected": -0.6965673565864563, "logps/chosen": -99.9740219116211, "logps/rejected": -125.32454681396484, "loss": 0.6062, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5242640376091003, "rewards/margins": 0.21401219069957733, "rewards/rejected": -0.7382762432098389, "step": 980 }, { "epoch": 0.3566924878400288, "grad_norm": 22.443201065063477, "learning_rate": 4.90701635064796e-07, "logits/chosen": -0.6783886551856995, "logits/rejected": -0.6558721661567688, "logps/chosen": -104.22843933105469, "logps/rejected": -125.3359375, "loss": 0.6275, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5865387320518494, "rewards/margins": 0.16658911108970642, "rewards/rejected": -0.7531278729438782, "step": 990 }, { "epoch": 0.36029544226265536, "grad_norm": 16.392536163330078, "learning_rate": 4.902720361011007e-07, "logits/chosen": -0.5944384336471558, "logits/rejected": -0.5874532461166382, "logps/chosen": -97.95899963378906, "logps/rejected": -116.11844635009766, "loss": 0.6374, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5813181400299072, "rewards/margins": 0.16180863976478577, "rewards/rejected": -0.7431267499923706, "step": 1000 }, { "epoch": 0.36389839668528196, "grad_norm": 15.579933166503906, "learning_rate": 4.898329326093426e-07, "logits/chosen": -0.7319918870925903, "logits/rejected": -0.7233752012252808, "logps/chosen": -96.44636535644531, "logps/rejected": -106.9745864868164, "loss": 0.6682, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5093886256217957, "rewards/margins": 0.08220504969358444, "rewards/rejected": -0.5915936827659607, "step": 1010 }, { "epoch": 0.3675013511079085, "grad_norm": 15.561565399169922, "learning_rate": 4.893843419592977e-07, "logits/chosen": -0.7630731463432312, "logits/rejected": -0.7514520287513733, "logps/chosen": -105.5160903930664, "logps/rejected": -120.8371810913086, "loss": 0.64, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5894420742988586, "rewards/margins": 0.1372137814760208, "rewards/rejected": -0.7266558408737183, "step": 1020 }, { "epoch": 0.37110430553053503, "grad_norm": 18.01420021057129, "learning_rate": 4.889262818960293e-07, "logits/chosen": -0.817065417766571, "logits/rejected": -0.8111998438835144, "logps/chosen": -103.73628997802734, "logps/rejected": -110.2214584350586, "loss": 0.6694, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.510665774345398, "rewards/margins": 0.0809985101222992, "rewards/rejected": -0.5916643738746643, "step": 1030 }, { "epoch": 0.3747072599531616, "grad_norm": 10.637860298156738, "learning_rate": 4.884587705391851e-07, "logits/chosen": -0.9392485618591309, "logits/rejected": -0.9319330453872681, "logps/chosen": -100.20902252197266, "logps/rejected": -122.7046127319336, "loss": 0.638, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4494263529777527, "rewards/margins": 0.16700775921344757, "rewards/rejected": -0.6164341568946838, "step": 1040 }, { "epoch": 0.37831021437578816, "grad_norm": 13.383898735046387, "learning_rate": 4.879818263822816e-07, "logits/chosen": -0.7321975231170654, "logits/rejected": -0.7081071138381958, "logps/chosen": -94.94624328613281, "logps/rejected": -108.52220153808594, "loss": 0.6477, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.48819559812545776, "rewards/margins": 0.1281278431415558, "rewards/rejected": -0.6163234114646912, "step": 1050 }, { "epoch": 0.3819131687984147, "grad_norm": 13.881481170654297, "learning_rate": 4.874954682919718e-07, "logits/chosen": -0.7877952456474304, "logits/rejected": -0.7841506004333496, "logps/chosen": -96.11666107177734, "logps/rejected": -111.68910217285156, "loss": 0.6458, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4922049045562744, "rewards/margins": 0.14194722473621368, "rewards/rejected": -0.6341521739959717, "step": 1060 }, { "epoch": 0.38551612322104123, "grad_norm": 9.143404006958008, "learning_rate": 4.869997155072988e-07, "logits/chosen": -0.9350810050964355, "logits/rejected": -0.9262323379516602, "logps/chosen": -84.93910217285156, "logps/rejected": -100.52149963378906, "loss": 0.6481, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.43665748834609985, "rewards/margins": 0.1451665163040161, "rewards/rejected": -0.5818240642547607, "step": 1070 }, { "epoch": 0.3891190776436678, "grad_norm": 21.315027236938477, "learning_rate": 4.864945876389356e-07, "logits/chosen": -1.0451407432556152, "logits/rejected": -1.0247318744659424, "logps/chosen": -100.16545104980469, "logps/rejected": -116.10115814208984, "loss": 0.6461, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4950154423713684, "rewards/margins": 0.13210979104042053, "rewards/rejected": -0.6271252036094666, "step": 1080 }, { "epoch": 0.39272203206629436, "grad_norm": 11.744331359863281, "learning_rate": 4.859801046684082e-07, "logits/chosen": -1.0961428880691528, "logits/rejected": -1.0880721807479858, "logps/chosen": -86.44023895263672, "logps/rejected": -95.47077178955078, "loss": 0.6537, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3389251232147217, "rewards/margins": 0.10952061414718628, "rewards/rejected": -0.44844573736190796, "step": 1090 }, { "epoch": 0.3963249864889209, "grad_norm": 16.391756057739258, "learning_rate": 4.854562869473063e-07, "logits/chosen": -0.8601453900337219, "logits/rejected": -0.8658612370491028, "logps/chosen": -91.15989685058594, "logps/rejected": -108.34666442871094, "loss": 0.6385, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4680752754211426, "rewards/margins": 0.15381982922554016, "rewards/rejected": -0.6218951344490051, "step": 1100 }, { "epoch": 0.3999279409115475, "grad_norm": 21.71736717224121, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.6439577341079712, "logits/rejected": -0.6424895524978638, "logps/chosen": -99.82119750976562, "logps/rejected": -113.4087905883789, "loss": 0.6658, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5158894062042236, "rewards/margins": 0.10277016460895538, "rewards/rejected": -0.6186595559120178, "step": 1110 }, { "epoch": 0.403530895334174, "grad_norm": 19.188968658447266, "learning_rate": 4.843807305052068e-07, "logits/chosen": -0.5007362365722656, "logits/rejected": -0.5037881135940552, "logps/chosen": -101.67707824707031, "logps/rejected": -116.08494567871094, "loss": 0.6444, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5278171300888062, "rewards/margins": 0.14936643838882446, "rewards/rejected": -0.6771835088729858, "step": 1120 }, { "epoch": 0.40713384975680056, "grad_norm": 13.658629417419434, "learning_rate": 4.838290343303857e-07, "logits/chosen": -0.6048570871353149, "logits/rejected": -0.5855607986450195, "logps/chosen": -109.92044830322266, "logps/rejected": -127.4346923828125, "loss": 0.6403, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5725590586662292, "rewards/margins": 0.16029593348503113, "rewards/rejected": -0.732854962348938, "step": 1130 }, { "epoch": 0.41073680417942715, "grad_norm": 22.05428123474121, "learning_rate": 4.832680884956593e-07, "logits/chosen": -0.4781038761138916, "logits/rejected": -0.4626520574092865, "logps/chosen": -124.91099548339844, "logps/rejected": -144.09007263183594, "loss": 0.6293, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7242249846458435, "rewards/margins": 0.16776317358016968, "rewards/rejected": -0.8919881582260132, "step": 1140 }, { "epoch": 0.4143397586020537, "grad_norm": 18.160388946533203, "learning_rate": 4.826979151905655e-07, "logits/chosen": -0.478691965341568, "logits/rejected": -0.47778424620628357, "logps/chosen": -108.81644439697266, "logps/rejected": -129.92697143554688, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6904457807540894, "rewards/margins": 0.18578965961933136, "rewards/rejected": -0.8762354850769043, "step": 1150 }, { "epoch": 0.4179427130246802, "grad_norm": 18.417818069458008, "learning_rate": 4.821185369696564e-07, "logits/chosen": -0.35122150182724, "logits/rejected": -0.3426826000213623, "logps/chosen": -127.5208511352539, "logps/rejected": -142.33328247070312, "loss": 0.6416, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8188759088516235, "rewards/margins": 0.14513902366161346, "rewards/rejected": -0.9640148878097534, "step": 1160 }, { "epoch": 0.4215456674473068, "grad_norm": 22.334169387817383, "learning_rate": 4.815299767516065e-07, "logits/chosen": -0.45871931314468384, "logits/rejected": -0.4563066065311432, "logps/chosen": -123.07237243652344, "logps/rejected": -150.54190063476562, "loss": 0.627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7688714265823364, "rewards/margins": 0.20027267932891846, "rewards/rejected": -0.9691441655158997, "step": 1170 }, { "epoch": 0.42514862186993335, "grad_norm": 26.37111473083496, "learning_rate": 4.809322578183055e-07, "logits/chosen": -0.5161224007606506, "logits/rejected": -0.5265077352523804, "logps/chosen": -116.867431640625, "logps/rejected": -133.74142456054688, "loss": 0.6467, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.6781013607978821, "rewards/margins": 0.13253547251224518, "rewards/rejected": -0.8106366991996765, "step": 1180 }, { "epoch": 0.4287515762925599, "grad_norm": 13.813657760620117, "learning_rate": 4.803254038139385e-07, "logits/chosen": -0.6843366026878357, "logits/rejected": -0.6704198122024536, "logps/chosen": -122.2076187133789, "logps/rejected": -136.3664093017578, "loss": 0.6609, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.7395728826522827, "rewards/margins": 0.11493418365716934, "rewards/rejected": -0.8545069694519043, "step": 1190 }, { "epoch": 0.4323545307151865, "grad_norm": 22.220705032348633, "learning_rate": 4.79709438744049e-07, "logits/chosen": -0.7626785635948181, "logits/rejected": -0.7570369839668274, "logps/chosen": -108.4599609375, "logps/rejected": -138.76187133789062, "loss": 0.5998, "rewards/accuracies": 0.625, "rewards/chosen": -0.6571046113967896, "rewards/margins": 0.2574872076511383, "rewards/rejected": -0.9145916700363159, "step": 1200 }, { "epoch": 0.435957485137813, "grad_norm": 18.7367000579834, "learning_rate": 4.790843869745907e-07, "logits/chosen": -0.6571752429008484, "logits/rejected": -0.646159291267395, "logps/chosen": -111.79692077636719, "logps/rejected": -133.7711181640625, "loss": 0.6151, "rewards/accuracies": 0.625, "rewards/chosen": -0.6365227699279785, "rewards/margins": 0.21131476759910583, "rewards/rejected": -0.8478374481201172, "step": 1210 }, { "epoch": 0.43956043956043955, "grad_norm": 16.725400924682617, "learning_rate": 4.784502732309633e-07, "logits/chosen": -0.8071640133857727, "logits/rejected": -0.7955895662307739, "logps/chosen": -108.87413024902344, "logps/rejected": -112.9708480834961, "loss": 0.6868, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.5596252679824829, "rewards/margins": 0.037677209824323654, "rewards/rejected": -0.5973024964332581, "step": 1220 }, { "epoch": 0.4431633939830661, "grad_norm": 21.371368408203125, "learning_rate": 4.778071225970339e-07, "logits/chosen": -0.6365097165107727, "logits/rejected": -0.6329114437103271, "logps/chosen": -108.97966003417969, "logps/rejected": -123.94303131103516, "loss": 0.649, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6522558331489563, "rewards/margins": 0.14867492020130157, "rewards/rejected": -0.8009306788444519, "step": 1230 }, { "epoch": 0.4467663484056927, "grad_norm": 18.487884521484375, "learning_rate": 4.771549605141455e-07, "logits/chosen": -0.7710438966751099, "logits/rejected": -0.7733847498893738, "logps/chosen": -102.35829162597656, "logps/rejected": -111.59107971191406, "loss": 0.6823, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6114463806152344, "rewards/margins": 0.05293966457247734, "rewards/rejected": -0.6643860340118408, "step": 1240 }, { "epoch": 0.4503693028283192, "grad_norm": 23.91623878479004, "learning_rate": 4.764938127801099e-07, "logits/chosen": -0.8593417406082153, "logits/rejected": -0.8609575033187866, "logps/chosen": -111.25724792480469, "logps/rejected": -130.29165649414062, "loss": 0.637, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6470273733139038, "rewards/margins": 0.15380927920341492, "rewards/rejected": -0.8008366823196411, "step": 1250 }, { "epoch": 0.45397225725094575, "grad_norm": 13.975992202758789, "learning_rate": 4.7582370554818805e-07, "logits/chosen": -0.8937684893608093, "logits/rejected": -0.8855475187301636, "logps/chosen": -93.56584167480469, "logps/rejected": -113.898681640625, "loss": 0.6287, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5378883481025696, "rewards/margins": 0.17181679606437683, "rewards/rejected": -0.709705114364624, "step": 1260 }, { "epoch": 0.45757521167357235, "grad_norm": 17.672836303710938, "learning_rate": 4.7514466532605457e-07, "logits/chosen": -0.8986455798149109, "logits/rejected": -0.9085055589675903, "logps/chosen": -97.85987854003906, "logps/rejected": -112.6082534790039, "loss": 0.6393, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.49343031644821167, "rewards/margins": 0.1343407779932022, "rewards/rejected": -0.6277710795402527, "step": 1270 }, { "epoch": 0.4611781660961989, "grad_norm": 13.817961692810059, "learning_rate": 4.744567189747498e-07, "logits/chosen": -0.9279476404190063, "logits/rejected": -0.9124320149421692, "logps/chosen": -99.10932159423828, "logps/rejected": -119.8096694946289, "loss": 0.641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5664225816726685, "rewards/margins": 0.1693536341190338, "rewards/rejected": -0.7357761859893799, "step": 1280 }, { "epoch": 0.4647811205188254, "grad_norm": 10.271222114562988, "learning_rate": 4.7375989370761695e-07, "logits/chosen": -1.136845350265503, "logits/rejected": -1.1129014492034912, "logps/chosen": -105.83380126953125, "logps/rejected": -125.69932556152344, "loss": 0.6223, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5844363570213318, "rewards/margins": 0.20007582008838654, "rewards/rejected": -0.7845122218132019, "step": 1290 }, { "epoch": 0.468384074941452, "grad_norm": 14.582992553710938, "learning_rate": 4.7305421708922594e-07, "logits/chosen": -1.0308201313018799, "logits/rejected": -1.0145576000213623, "logps/chosen": -100.9151611328125, "logps/rejected": -112.91084289550781, "loss": 0.6483, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5713368654251099, "rewards/margins": 0.121244415640831, "rewards/rejected": -0.692581295967102, "step": 1300 }, { "epoch": 0.47198702936407855, "grad_norm": 22.442279815673828, "learning_rate": 4.7233971703428253e-07, "logits/chosen": -0.9299166798591614, "logits/rejected": -0.9262005090713501, "logps/chosen": -99.68379211425781, "logps/rejected": -122.30403900146484, "loss": 0.6232, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5639885663986206, "rewards/margins": 0.20361557602882385, "rewards/rejected": -0.7676041126251221, "step": 1310 }, { "epoch": 0.4755899837867051, "grad_norm": 24.155487060546875, "learning_rate": 4.7161642180652463e-07, "logits/chosen": -0.7510775327682495, "logits/rejected": -0.7490900754928589, "logps/chosen": -122.33967590332031, "logps/rejected": -141.98947143554688, "loss": 0.6294, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7110903859138489, "rewards/margins": 0.16365990042686462, "rewards/rejected": -0.8747503161430359, "step": 1320 }, { "epoch": 0.4791929382093317, "grad_norm": 21.514019012451172, "learning_rate": 4.708843600176038e-07, "logits/chosen": -0.6898788213729858, "logits/rejected": -0.6778839230537415, "logps/chosen": -122.47212219238281, "logps/rejected": -131.47409057617188, "loss": 0.663, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7074660062789917, "rewards/margins": 0.08889790624380112, "rewards/rejected": -0.796363890171051, "step": 1330 }, { "epoch": 0.4827958926319582, "grad_norm": 17.670543670654297, "learning_rate": 4.7014356062595364e-07, "logits/chosen": -0.7467285990715027, "logits/rejected": -0.7351511716842651, "logps/chosen": -120.872802734375, "logps/rejected": -142.69851684570312, "loss": 0.6231, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7269989252090454, "rewards/margins": 0.19409465789794922, "rewards/rejected": -0.9210936427116394, "step": 1340 }, { "epoch": 0.48639884705458475, "grad_norm": 21.745296478271484, "learning_rate": 4.693940529356444e-07, "logits/chosen": -0.7076036334037781, "logits/rejected": -0.6931304931640625, "logps/chosen": -115.3056411743164, "logps/rejected": -131.36289978027344, "loss": 0.6321, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6344283819198608, "rewards/margins": 0.1647411286830902, "rewards/rejected": -0.7991694211959839, "step": 1350 }, { "epoch": 0.49000180147721134, "grad_norm": 13.729718208312988, "learning_rate": 4.6863586659522353e-07, "logits/chosen": -0.7514731287956238, "logits/rejected": -0.7414983510971069, "logps/chosen": -115.65132141113281, "logps/rejected": -134.80738830566406, "loss": 0.623, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6774997711181641, "rewards/margins": 0.19184240698814392, "rewards/rejected": -0.8693421483039856, "step": 1360 }, { "epoch": 0.4936047558998379, "grad_norm": 16.541501998901367, "learning_rate": 4.678690315965431e-07, "logits/chosen": -0.9065626263618469, "logits/rejected": -0.889665424823761, "logps/chosen": -106.21175384521484, "logps/rejected": -117.76409912109375, "loss": 0.6473, "rewards/accuracies": 0.625, "rewards/chosen": -0.616832971572876, "rewards/margins": 0.12367036193609238, "rewards/rejected": -0.7405033111572266, "step": 1370 }, { "epoch": 0.4972077103224644, "grad_norm": 38.636070251464844, "learning_rate": 4.6709357827357316e-07, "logits/chosen": -0.9566957354545593, "logits/rejected": -0.9666906595230103, "logps/chosen": -125.11567687988281, "logps/rejected": -149.4060516357422, "loss": 0.6209, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7375609874725342, "rewards/margins": 0.18065717816352844, "rewards/rejected": -0.9182182550430298, "step": 1380 }, { "epoch": 0.500810664745091, "grad_norm": 15.660926818847656, "learning_rate": 4.66309537301202e-07, "logits/chosen": -1.0064380168914795, "logits/rejected": -1.0011049509048462, "logps/chosen": -116.38114929199219, "logps/rejected": -130.26541137695312, "loss": 0.6501, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6288421154022217, "rewards/margins": 0.11721490323543549, "rewards/rejected": -0.7460570335388184, "step": 1390 }, { "epoch": 0.5044136191677175, "grad_norm": 12.651188850402832, "learning_rate": 4.655169396940228e-07, "logits/chosen": -0.9192444086074829, "logits/rejected": -0.908920407295227, "logps/chosen": -97.5096206665039, "logps/rejected": -124.25872802734375, "loss": 0.6087, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5784972906112671, "rewards/margins": 0.23468546569347382, "rewards/rejected": -0.8131827116012573, "step": 1400 }, { "epoch": 0.5080165735903441, "grad_norm": 13.578185081481934, "learning_rate": 4.647158168051065e-07, "logits/chosen": -0.7523177266120911, "logits/rejected": -0.7385509610176086, "logps/chosen": -96.3977279663086, "logps/rejected": -120.1695785522461, "loss": 0.6198, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5647341012954712, "rewards/margins": 0.2241365611553192, "rewards/rejected": -0.7888704538345337, "step": 1410 }, { "epoch": 0.5116195280129706, "grad_norm": 9.682286262512207, "learning_rate": 4.6390620032476165e-07, "logits/chosen": -0.8279479742050171, "logits/rejected": -0.8157111406326294, "logps/chosen": -111.77999114990234, "logps/rejected": -131.52413940429688, "loss": 0.636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6873849034309387, "rewards/margins": 0.17221364378929138, "rewards/rejected": -0.8595984578132629, "step": 1420 }, { "epoch": 0.5152224824355972, "grad_norm": 18.07782554626465, "learning_rate": 4.6308812227928097e-07, "logits/chosen": -0.7680369019508362, "logits/rejected": -0.7670931816101074, "logps/chosen": -117.39216613769531, "logps/rejected": -135.0698699951172, "loss": 0.6481, "rewards/accuracies": 0.5, "rewards/chosen": -0.627860426902771, "rewards/margins": 0.14340507984161377, "rewards/rejected": -0.7712655663490295, "step": 1430 }, { "epoch": 0.5188254368582238, "grad_norm": 11.54117488861084, "learning_rate": 4.622616150296744e-07, "logits/chosen": -0.8040092587471008, "logits/rejected": -0.785892128944397, "logps/chosen": -109.7896957397461, "logps/rejected": -128.6796417236328, "loss": 0.6359, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6554213762283325, "rewards/margins": 0.16084139049053192, "rewards/rejected": -0.8162628412246704, "step": 1440 }, { "epoch": 0.5224283912808503, "grad_norm": 17.470726013183594, "learning_rate": 4.61426711270389e-07, "logits/chosen": -0.7985628247261047, "logits/rejected": -0.7914355397224426, "logps/chosen": -122.31645202636719, "logps/rejected": -141.3916473388672, "loss": 0.6334, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6971213221549988, "rewards/margins": 0.17073138058185577, "rewards/rejected": -0.8678528070449829, "step": 1450 }, { "epoch": 0.5260313457034769, "grad_norm": 18.514780044555664, "learning_rate": 4.605834440280154e-07, "logits/chosen": -0.5825433135032654, "logits/rejected": -0.571456789970398, "logps/chosen": -104.1174545288086, "logps/rejected": -122.6390380859375, "loss": 0.6368, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.585911214351654, "rewards/margins": 0.16434451937675476, "rewards/rejected": -0.7502557039260864, "step": 1460 }, { "epoch": 0.5296343001261034, "grad_norm": 15.687395095825195, "learning_rate": 4.5973184665998184e-07, "logits/chosen": -0.6224468350410461, "logits/rejected": -0.6154752969741821, "logps/chosen": -105.5159683227539, "logps/rejected": -120.9432601928711, "loss": 0.6397, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5748180747032166, "rewards/margins": 0.14824509620666504, "rewards/rejected": -0.7230631709098816, "step": 1470 }, { "epoch": 0.5332372545487299, "grad_norm": 27.54644203186035, "learning_rate": 4.588719528532341e-07, "logits/chosen": -0.49367189407348633, "logits/rejected": -0.5038386583328247, "logps/chosen": -114.81755065917969, "logps/rejected": -122.81019592285156, "loss": 0.6636, "rewards/accuracies": 0.5625, "rewards/chosen": -0.691633939743042, "rewards/margins": 0.0962417870759964, "rewards/rejected": -0.7878756523132324, "step": 1480 }, { "epoch": 0.5368402089713565, "grad_norm": 24.379594802856445, "learning_rate": 4.580037966229033e-07, "logits/chosen": -0.5394241213798523, "logits/rejected": -0.5318597555160522, "logps/chosen": -112.4027099609375, "logps/rejected": -135.67672729492188, "loss": 0.6245, "rewards/accuracies": 0.625, "rewards/chosen": -0.6120216250419617, "rewards/margins": 0.21039803326129913, "rewards/rejected": -0.8224196434020996, "step": 1490 }, { "epoch": 0.540443163393983, "grad_norm": 16.60763168334961, "learning_rate": 4.571274123109605e-07, "logits/chosen": -0.2344832718372345, "logits/rejected": -0.22624747455120087, "logps/chosen": -114.34037780761719, "logps/rejected": -133.9674530029297, "loss": 0.6215, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6741780042648315, "rewards/margins": 0.19762232899665833, "rewards/rejected": -0.871800422668457, "step": 1500 }, { "epoch": 0.5440461178166096, "grad_norm": 21.019861221313477, "learning_rate": 4.5624283458485753e-07, "logits/chosen": -0.23028871417045593, "logits/rejected": -0.23043613135814667, "logps/chosen": -99.55493927001953, "logps/rejected": -118.29109954833984, "loss": 0.6355, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5799737572669983, "rewards/margins": 0.1384848803281784, "rewards/rejected": -0.7184587121009827, "step": 1510 }, { "epoch": 0.5476490722392362, "grad_norm": 17.716548919677734, "learning_rate": 4.553500984361563e-07, "logits/chosen": -0.0724945068359375, "logits/rejected": -0.08050285279750824, "logps/chosen": -112.50837707519531, "logps/rejected": -126.670166015625, "loss": 0.6481, "rewards/accuracies": 0.625, "rewards/chosen": -0.6275107264518738, "rewards/margins": 0.1303718388080597, "rewards/rejected": -0.7578826546669006, "step": 1520 }, { "epoch": 0.5512520266618627, "grad_norm": 10.604673385620117, "learning_rate": 4.5444923917914444e-07, "logits/chosen": -0.1800159364938736, "logits/rejected": -0.17468824982643127, "logps/chosen": -110.12767028808594, "logps/rejected": -129.58322143554688, "loss": 0.6307, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.7002070546150208, "rewards/margins": 0.1697019636631012, "rewards/rejected": -0.8699091076850891, "step": 1530 }, { "epoch": 0.5548549810844893, "grad_norm": 11.269343376159668, "learning_rate": 4.5354029244943814e-07, "logits/chosen": -0.3396390676498413, "logits/rejected": -0.33701950311660767, "logps/chosen": -108.12324523925781, "logps/rejected": -131.24703979492188, "loss": 0.6193, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6141228675842285, "rewards/margins": 0.20558467507362366, "rewards/rejected": -0.8197075724601746, "step": 1540 }, { "epoch": 0.5584579355071159, "grad_norm": 22.327653884887695, "learning_rate": 4.5262329420257293e-07, "logits/chosen": -0.43441152572631836, "logits/rejected": -0.4208933413028717, "logps/chosen": -111.65888977050781, "logps/rejected": -131.4414825439453, "loss": 0.622, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5926408171653748, "rewards/margins": 0.18177416920661926, "rewards/rejected": -0.7744150161743164, "step": 1550 }, { "epoch": 0.5620608899297423, "grad_norm": 21.105283737182617, "learning_rate": 4.516982807125811e-07, "logits/chosen": -0.07212761789560318, "logits/rejected": -0.07756079733371735, "logps/chosen": -109.25408935546875, "logps/rejected": -126.6335220336914, "loss": 0.643, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.644785463809967, "rewards/margins": 0.1567734330892563, "rewards/rejected": -0.8015588521957397, "step": 1560 }, { "epoch": 0.5656638443523689, "grad_norm": 19.152124404907227, "learning_rate": 4.507652885705564e-07, "logits/chosen": -0.08397520333528519, "logits/rejected": -0.07510174810886383, "logps/chosen": -124.69132232666016, "logps/rejected": -134.48948669433594, "loss": 0.6805, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7654344439506531, "rewards/margins": 0.07131467014551163, "rewards/rejected": -0.8367490768432617, "step": 1570 }, { "epoch": 0.5692667987749955, "grad_norm": 18.12226104736328, "learning_rate": 4.4982435468320757e-07, "logits/chosen": -0.06124221533536911, "logits/rejected": -0.04790102690458298, "logps/chosen": -123.93861389160156, "logps/rejected": -152.51882934570312, "loss": 0.6041, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7189621925354004, "rewards/margins": 0.24458980560302734, "rewards/rejected": -0.9635521173477173, "step": 1580 }, { "epoch": 0.572869753197622, "grad_norm": 25.4002742767334, "learning_rate": 4.488755162713975e-07, "logits/chosen": -0.2089006006717682, "logits/rejected": -0.20480160415172577, "logps/chosen": -109.4067611694336, "logps/rejected": -129.50677490234375, "loss": 0.6268, "rewards/accuracies": 0.625, "rewards/chosen": -0.6752637028694153, "rewards/margins": 0.1842677891254425, "rewards/rejected": -0.8595314025878906, "step": 1590 }, { "epoch": 0.5764727076202486, "grad_norm": 21.045480728149414, "learning_rate": 4.4791881086867133e-07, "logits/chosen": -0.20409497618675232, "logits/rejected": -0.2089627981185913, "logps/chosen": -113.8995132446289, "logps/rejected": -125.38606262207031, "loss": 0.6564, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6167466640472412, "rewards/margins": 0.1140073910355568, "rewards/rejected": -0.730754017829895, "step": 1600 }, { "epoch": 0.5800756620428752, "grad_norm": 33.92525100708008, "learning_rate": 4.469542763197717e-07, "logits/chosen": -0.2622045874595642, "logits/rejected": -0.2459571808576584, "logps/chosen": -110.90557861328125, "logps/rejected": -135.90432739257812, "loss": 0.6245, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6229537129402161, "rewards/margins": 0.2116650640964508, "rewards/rejected": -0.8346187472343445, "step": 1610 }, { "epoch": 0.5836786164655017, "grad_norm": 12.207719802856445, "learning_rate": 4.459819507791414e-07, "logits/chosen": -0.2609815299510956, "logits/rejected": -0.27185386419296265, "logps/chosen": -104.3377914428711, "logps/rejected": -121.11332702636719, "loss": 0.6477, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6319000124931335, "rewards/margins": 0.14542250335216522, "rewards/rejected": -0.7773224711418152, "step": 1620 }, { "epoch": 0.5872815708881283, "grad_norm": 27.201845169067383, "learning_rate": 4.450018727094146e-07, "logits/chosen": -0.27219557762145996, "logits/rejected": -0.2478889524936676, "logps/chosen": -123.58785247802734, "logps/rejected": -139.5199432373047, "loss": 0.6442, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7428973913192749, "rewards/margins": 0.1621369570493698, "rewards/rejected": -0.9050344228744507, "step": 1630 }, { "epoch": 0.5908845253107549, "grad_norm": 21.65987205505371, "learning_rate": 4.4401408087989475e-07, "logits/chosen": -0.18356722593307495, "logits/rejected": -0.1742212474346161, "logps/chosen": -106.0003662109375, "logps/rejected": -124.266357421875, "loss": 0.6408, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6102654337882996, "rewards/margins": 0.1706978976726532, "rewards/rejected": -0.7809633612632751, "step": 1640 }, { "epoch": 0.5944874797333813, "grad_norm": 19.172645568847656, "learning_rate": 4.4301861436502155e-07, "logits/chosen": -0.09111490100622177, "logits/rejected": -0.08929113298654556, "logps/chosen": -104.4176025390625, "logps/rejected": -128.53921508789062, "loss": 0.6126, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6372945308685303, "rewards/margins": 0.2133907973766327, "rewards/rejected": -0.8506854176521301, "step": 1650 }, { "epoch": 0.5980904341560079, "grad_norm": 23.5217342376709, "learning_rate": 4.420155125428249e-07, "logits/chosen": -0.23100826144218445, "logits/rejected": -0.22501571476459503, "logps/chosen": -116.77998352050781, "logps/rejected": -138.1328582763672, "loss": 0.6313, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7341734766960144, "rewards/margins": 0.18791931867599487, "rewards/rejected": -0.9220927357673645, "step": 1660 }, { "epoch": 0.6016933885786345, "grad_norm": 28.95935821533203, "learning_rate": 4.4100481509336727e-07, "logits/chosen": -0.33691197633743286, "logits/rejected": -0.3338681161403656, "logps/chosen": -124.83697509765625, "logps/rejected": -145.29898071289062, "loss": 0.6312, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7636333703994751, "rewards/margins": 0.19949015974998474, "rewards/rejected": -0.9631235003471375, "step": 1670 }, { "epoch": 0.605296343001261, "grad_norm": 13.79696273803711, "learning_rate": 4.3998656199717433e-07, "logits/chosen": -0.16279760003089905, "logits/rejected": -0.15554110705852509, "logps/chosen": -123.12381744384766, "logps/rejected": -146.53384399414062, "loss": 0.6095, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7670550346374512, "rewards/margins": 0.23701255023479462, "rewards/rejected": -1.0040675401687622, "step": 1680 }, { "epoch": 0.6088992974238876, "grad_norm": 32.479736328125, "learning_rate": 4.38960793533653e-07, "logits/chosen": -0.1557508260011673, "logits/rejected": -0.1466209590435028, "logps/chosen": -115.62281799316406, "logps/rejected": -147.75576782226562, "loss": 0.5807, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7074651718139648, "rewards/margins": 0.3161167502403259, "rewards/rejected": -1.023581862449646, "step": 1690 }, { "epoch": 0.6125022518465142, "grad_norm": 13.241331100463867, "learning_rate": 4.379275502794983e-07, "logits/chosen": -0.31330347061157227, "logits/rejected": -0.30750396847724915, "logps/chosen": -119.7376937866211, "logps/rejected": -142.54885864257812, "loss": 0.6352, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6978413462638855, "rewards/margins": 0.1720629334449768, "rewards/rejected": -0.8699043393135071, "step": 1700 }, { "epoch": 0.6161052062691407, "grad_norm": 18.612834930419922, "learning_rate": 4.368868731070884e-07, "logits/chosen": -0.3954562842845917, "logits/rejected": -0.3834627866744995, "logps/chosen": -130.12315368652344, "logps/rejected": -147.21743774414062, "loss": 0.6209, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6318448185920715, "rewards/margins": 0.1988515406847, "rewards/rejected": -0.8306962847709656, "step": 1710 }, { "epoch": 0.6197081606917673, "grad_norm": 23.820240020751953, "learning_rate": 4.358388031828675e-07, "logits/chosen": -0.4271976053714752, "logits/rejected": -0.41857513785362244, "logps/chosen": -111.3486099243164, "logps/rejected": -128.419189453125, "loss": 0.6427, "rewards/accuracies": 0.625, "rewards/chosen": -0.6416288614273071, "rewards/margins": 0.1477826088666916, "rewards/rejected": -0.7894114851951599, "step": 1720 }, { "epoch": 0.6233111151143939, "grad_norm": 19.6106014251709, "learning_rate": 4.3478338196571774e-07, "logits/chosen": -0.592642605304718, "logits/rejected": -0.5813112258911133, "logps/chosen": -105.65118408203125, "logps/rejected": -126.93350982666016, "loss": 0.6237, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6063104867935181, "rewards/margins": 0.18938353657722473, "rewards/rejected": -0.7956939935684204, "step": 1730 }, { "epoch": 0.6269140695370203, "grad_norm": 26.405860900878906, "learning_rate": 4.3372065120531896e-07, "logits/chosen": -0.6595714688301086, "logits/rejected": -0.6470447778701782, "logps/chosen": -116.03114318847656, "logps/rejected": -144.71456909179688, "loss": 0.6025, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6541780829429626, "rewards/margins": 0.2541576027870178, "rewards/rejected": -0.9083356857299805, "step": 1740 }, { "epoch": 0.6305170239596469, "grad_norm": 21.304424285888672, "learning_rate": 4.326506529404972e-07, "logits/chosen": -0.4566799998283386, "logits/rejected": -0.4474863111972809, "logps/chosen": -122.94720458984375, "logps/rejected": -147.1781463623047, "loss": 0.6345, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7978330850601196, "rewards/margins": 0.20655818283557892, "rewards/rejected": -1.0043913125991821, "step": 1750 }, { "epoch": 0.6341199783822735, "grad_norm": 27.25872230529785, "learning_rate": 4.3157342949756176e-07, "logits/chosen": -0.5053147077560425, "logits/rejected": -0.5010774731636047, "logps/chosen": -123.2105712890625, "logps/rejected": -130.94049072265625, "loss": 0.6711, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.730686604976654, "rewards/margins": 0.0933484211564064, "rewards/rejected": -0.8240349888801575, "step": 1760 }, { "epoch": 0.6377229328049, "grad_norm": 13.439959526062012, "learning_rate": 4.3048902348863106e-07, "logits/chosen": -0.5893678665161133, "logits/rejected": -0.5871925354003906, "logps/chosen": -92.13633728027344, "logps/rejected": -117.0339584350586, "loss": 0.617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5311223268508911, "rewards/margins": 0.20927007496356964, "rewards/rejected": -0.7403924465179443, "step": 1770 }, { "epoch": 0.6413258872275266, "grad_norm": 14.695592880249023, "learning_rate": 4.2939747780994696e-07, "logits/chosen": -0.7266982197761536, "logits/rejected": -0.7190583944320679, "logps/chosen": -110.9961929321289, "logps/rejected": -130.0135498046875, "loss": 0.6243, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6118022203445435, "rewards/margins": 0.2004629671573639, "rewards/rejected": -0.8122652173042297, "step": 1780 }, { "epoch": 0.6449288416501531, "grad_norm": 17.60588836669922, "learning_rate": 4.2829883564017755e-07, "logits/chosen": -0.6400290727615356, "logits/rejected": -0.6384719014167786, "logps/chosen": -111.03657531738281, "logps/rejected": -124.96858978271484, "loss": 0.6603, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.649237871170044, "rewards/margins": 0.11901859939098358, "rewards/rejected": -0.7682564854621887, "step": 1790 }, { "epoch": 0.6485317960727797, "grad_norm": 37.39412307739258, "learning_rate": 4.2719314043870956e-07, "logits/chosen": -0.6858614683151245, "logits/rejected": -0.6850418448448181, "logps/chosen": -115.84126281738281, "logps/rejected": -136.5898895263672, "loss": 0.6471, "rewards/accuracies": 0.5, "rewards/chosen": -0.5579274892807007, "rewards/margins": 0.1683448851108551, "rewards/rejected": -0.726272463798523, "step": 1800 }, { "epoch": 0.6521347504954063, "grad_norm": 32.6982536315918, "learning_rate": 4.260804359439291e-07, "logits/chosen": -0.4608843922615051, "logits/rejected": -0.4496752619743347, "logps/chosen": -121.73396301269531, "logps/rejected": -141.05642700195312, "loss": 0.6373, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6710211634635925, "rewards/margins": 0.17933639883995056, "rewards/rejected": -0.8503575325012207, "step": 1810 }, { "epoch": 0.6557377049180327, "grad_norm": 15.985424041748047, "learning_rate": 4.2496076617149134e-07, "logits/chosen": -0.22309105098247528, "logits/rejected": -0.22793006896972656, "logps/chosen": -118.05805969238281, "logps/rejected": -141.6474609375, "loss": 0.615, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7154508233070374, "rewards/margins": 0.22133982181549072, "rewards/rejected": -0.9367905855178833, "step": 1820 }, { "epoch": 0.6593406593406593, "grad_norm": 23.936935424804688, "learning_rate": 4.238341754125795e-07, "logits/chosen": -0.20337358117103577, "logits/rejected": -0.1934824436903, "logps/chosen": -133.47775268554688, "logps/rejected": -165.22193908691406, "loss": 0.6023, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.967487633228302, "rewards/margins": 0.28225192427635193, "rewards/rejected": -1.249739646911621, "step": 1830 }, { "epoch": 0.6629436137632859, "grad_norm": 34.532161712646484, "learning_rate": 4.2270070823215275e-07, "logits/chosen": -0.20488937199115753, "logits/rejected": -0.2023644894361496, "logps/chosen": -129.74929809570312, "logps/rejected": -150.7936553955078, "loss": 0.6268, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8183504939079285, "rewards/margins": 0.21211810410022736, "rewards/rejected": -1.0304687023162842, "step": 1840 }, { "epoch": 0.6665465681859124, "grad_norm": 22.62578773498535, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -0.37808769941329956, "logits/rejected": -0.3699187636375427, "logps/chosen": -115.9354248046875, "logps/rejected": -139.9909210205078, "loss": 0.6198, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7293509244918823, "rewards/margins": 0.1937699019908905, "rewards/rejected": -0.9231207966804504, "step": 1850 }, { "epoch": 0.670149522608539, "grad_norm": 25.69864845275879, "learning_rate": 4.204133242248832e-07, "logits/chosen": -0.40861696004867554, "logits/rejected": -0.40402859449386597, "logps/chosen": -118.96281433105469, "logps/rejected": -133.44906616210938, "loss": 0.6493, "rewards/accuracies": 0.625, "rewards/chosen": -0.6875437498092651, "rewards/margins": 0.14636746048927307, "rewards/rejected": -0.8339112401008606, "step": 1860 }, { "epoch": 0.6737524770311656, "grad_norm": 15.054122924804688, "learning_rate": 4.1925949788091907e-07, "logits/chosen": -0.4089645743370056, "logits/rejected": -0.40489277243614197, "logps/chosen": -111.43312072753906, "logps/rejected": -127.93333435058594, "loss": 0.6398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6141126751899719, "rewards/margins": 0.15593598783016205, "rewards/rejected": -0.7700486779212952, "step": 1870 }, { "epoch": 0.6773554314537921, "grad_norm": 36.226905822753906, "learning_rate": 4.1809897607761814e-07, "logits/chosen": -0.4524051547050476, "logits/rejected": -0.4494766294956207, "logps/chosen": -135.3714141845703, "logps/rejected": -156.458984375, "loss": 0.6346, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7505577802658081, "rewards/margins": 0.20968285202980042, "rewards/rejected": -0.9602406620979309, "step": 1880 }, { "epoch": 0.6809583858764187, "grad_norm": 38.99123001098633, "learning_rate": 4.169318047221621e-07, "logits/chosen": -0.25047487020492554, "logits/rejected": -0.24576766788959503, "logps/chosen": -122.78657531738281, "logps/rejected": -141.0259552001953, "loss": 0.6485, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.722198486328125, "rewards/margins": 0.13076011836528778, "rewards/rejected": -0.8529586791992188, "step": 1890 }, { "epoch": 0.6845613402990453, "grad_norm": 23.391263961791992, "learning_rate": 4.157580299847717e-07, "logits/chosen": -0.11810042709112167, "logits/rejected": -0.10987571626901627, "logps/chosen": -133.02330017089844, "logps/rejected": -162.12051391601562, "loss": 0.6082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9049941301345825, "rewards/margins": 0.2531413733959198, "rewards/rejected": -1.1581355333328247, "step": 1900 }, { "epoch": 0.6881642947216717, "grad_norm": 26.655284881591797, "learning_rate": 4.145776982968797e-07, "logits/chosen": -0.06702479720115662, "logits/rejected": -0.06362093985080719, "logps/chosen": -139.19175720214844, "logps/rejected": -151.38427734375, "loss": 0.6659, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.8984958529472351, "rewards/margins": 0.10784594714641571, "rewards/rejected": -1.006341814994812, "step": 1910 }, { "epoch": 0.6917672491442983, "grad_norm": 22.961259841918945, "learning_rate": 4.1339085634929485e-07, "logits/chosen": -0.07665994018316269, "logits/rejected": -0.07137580215930939, "logps/chosen": -140.45701599121094, "logps/rejected": -164.6908416748047, "loss": 0.6227, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9155426025390625, "rewards/margins": 0.21152743697166443, "rewards/rejected": -1.1270700693130493, "step": 1920 }, { "epoch": 0.6953702035669249, "grad_norm": 24.024768829345703, "learning_rate": 4.1219755109035423e-07, "logits/chosen": -0.04297425225377083, "logits/rejected": -0.029266545549035072, "logps/chosen": -125.0589599609375, "logps/rejected": -154.24522399902344, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": -0.8275787234306335, "rewards/margins": 0.25777915120124817, "rewards/rejected": -1.085357904434204, "step": 1930 }, { "epoch": 0.6989731579895514, "grad_norm": 26.384260177612305, "learning_rate": 4.1099782972406703e-07, "logits/chosen": -0.04297472909092903, "logits/rejected": -0.02821093238890171, "logps/chosen": -108.7773666381836, "logps/rejected": -138.58921813964844, "loss": 0.6024, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.615945041179657, "rewards/margins": 0.2714507281780243, "rewards/rejected": -0.8873957395553589, "step": 1940 }, { "epoch": 0.702576112412178, "grad_norm": 19.373769760131836, "learning_rate": 4.097917397082462e-07, "logits/chosen": -0.1805184781551361, "logits/rejected": -0.17657090723514557, "logps/chosen": -118.64015197753906, "logps/rejected": -136.58822631835938, "loss": 0.6442, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6089118123054504, "rewards/margins": 0.13456028699874878, "rewards/rejected": -0.7434720993041992, "step": 1950 }, { "epoch": 0.7061790668348046, "grad_norm": 15.632219314575195, "learning_rate": 4.085793287526319e-07, "logits/chosen": -0.018003929406404495, "logits/rejected": -0.0015068978536874056, "logps/chosen": -116.03665924072266, "logps/rejected": -139.82785034179688, "loss": 0.6434, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.719316303730011, "rewards/margins": 0.18363362550735474, "rewards/rejected": -0.9029499292373657, "step": 1960 }, { "epoch": 0.7097820212574311, "grad_norm": 26.803855895996094, "learning_rate": 4.0736064481700396e-07, "logits/chosen": 0.03305846452713013, "logits/rejected": 0.03875232860445976, "logps/chosen": -110.02363586425781, "logps/rejected": -128.6298828125, "loss": 0.6317, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6139898896217346, "rewards/margins": 0.1760130375623703, "rewards/rejected": -0.7900028228759766, "step": 1970 }, { "epoch": 0.7133849756800577, "grad_norm": 13.6882963180542, "learning_rate": 4.0613573610928477e-07, "logits/chosen": 0.2014857977628708, "logits/rejected": 0.2058900147676468, "logps/chosen": -119.346435546875, "logps/rejected": -143.44493103027344, "loss": 0.6174, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6651163101196289, "rewards/margins": 0.2247789204120636, "rewards/rejected": -0.8898951411247253, "step": 1980 }, { "epoch": 0.7169879301026842, "grad_norm": 54.356929779052734, "learning_rate": 4.0490465108363213e-07, "logits/chosen": 0.4385454058647156, "logits/rejected": 0.4460521340370178, "logps/chosen": -135.75320434570312, "logps/rejected": -148.539794921875, "loss": 0.6576, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8634354472160339, "rewards/margins": 0.12716665863990784, "rewards/rejected": -0.9906021952629089, "step": 1990 }, { "epoch": 0.7205908845253107, "grad_norm": 35.139137268066406, "learning_rate": 4.036674384385231e-07, "logits/chosen": 0.5260182619094849, "logits/rejected": 0.5229222178459167, "logps/chosen": -133.07302856445312, "logps/rejected": -154.39791870117188, "loss": 0.6436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8775787353515625, "rewards/margins": 0.17777515947818756, "rewards/rejected": -1.0553538799285889, "step": 2000 }, { "epoch": 0.7241938389479373, "grad_norm": 24.67560577392578, "learning_rate": 4.0242414711482673e-07, "logits/chosen": 0.4192674160003662, "logits/rejected": 0.42251092195510864, "logps/chosen": -135.64370727539062, "logps/rejected": -154.87327575683594, "loss": 0.6399, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9285451769828796, "rewards/margins": 0.16990457475185394, "rewards/rejected": -1.0984498262405396, "step": 2010 }, { "epoch": 0.7277967933705639, "grad_norm": 21.97782325744629, "learning_rate": 4.0117482629386884e-07, "logits/chosen": 0.5319436192512512, "logits/rejected": 0.5361469984054565, "logps/chosen": -129.13584899902344, "logps/rejected": -156.07022094726562, "loss": 0.6011, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8532201647758484, "rewards/margins": 0.26026397943496704, "rewards/rejected": -1.1134841442108154, "step": 2020 }, { "epoch": 0.7313997477931904, "grad_norm": 37.31975173950195, "learning_rate": 3.9991952539548616e-07, "logits/chosen": 0.46142640709877014, "logits/rejected": 0.47692495584487915, "logps/chosen": -147.18496704101562, "logps/rejected": -165.7284698486328, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": -1.0022342205047607, "rewards/margins": 0.16152818500995636, "rewards/rejected": -1.1637624502182007, "step": 2030 }, { "epoch": 0.735002702215817, "grad_norm": 22.876155853271484, "learning_rate": 3.9865829407607166e-07, "logits/chosen": 0.4777229428291321, "logits/rejected": 0.4776241183280945, "logps/chosen": -138.6183319091797, "logps/rejected": -151.57363891601562, "loss": 0.6566, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.9196840524673462, "rewards/margins": 0.11917316913604736, "rewards/rejected": -1.0388572216033936, "step": 2040 }, { "epoch": 0.7386056566384436, "grad_norm": 24.542051315307617, "learning_rate": 3.9739118222660983e-07, "logits/chosen": 0.40690964460372925, "logits/rejected": 0.41948142647743225, "logps/chosen": -150.92098999023438, "logps/rejected": -181.51345825195312, "loss": 0.6036, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9969013929367065, "rewards/margins": 0.2801375389099121, "rewards/rejected": -1.2770389318466187, "step": 2050 }, { "epoch": 0.7422086110610701, "grad_norm": 15.832708358764648, "learning_rate": 3.961182399707037e-07, "logits/chosen": 0.4410591721534729, "logits/rejected": 0.4448552131652832, "logps/chosen": -142.0412139892578, "logps/rejected": -171.34796142578125, "loss": 0.6038, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0556821823120117, "rewards/margins": 0.25854361057281494, "rewards/rejected": -1.3142259120941162, "step": 2060 }, { "epoch": 0.7458115654836966, "grad_norm": 30.200489044189453, "learning_rate": 3.9483951766259174e-07, "logits/chosen": 0.19090792536735535, "logits/rejected": 0.1829378306865692, "logps/chosen": -144.0277557373047, "logps/rejected": -155.68121337890625, "loss": 0.6687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9700512886047363, "rewards/margins": 0.10892186313867569, "rewards/rejected": -1.0789731740951538, "step": 2070 }, { "epoch": 0.7494145199063232, "grad_norm": 16.804433822631836, "learning_rate": 3.9355506588515587e-07, "logits/chosen": 0.358919233083725, "logits/rejected": 0.3698652684688568, "logps/chosen": -143.83709716796875, "logps/rejected": -174.1533966064453, "loss": 0.594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9783726930618286, "rewards/margins": 0.2765392065048218, "rewards/rejected": -1.2549118995666504, "step": 2080 }, { "epoch": 0.7530174743289497, "grad_norm": 30.392404556274414, "learning_rate": 3.922649354479209e-07, "logits/chosen": 0.514492392539978, "logits/rejected": 0.5247939825057983, "logps/chosen": -149.9234161376953, "logps/rejected": -183.11795043945312, "loss": 0.5925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0595362186431885, "rewards/margins": 0.30505871772766113, "rewards/rejected": -1.3645950555801392, "step": 2090 }, { "epoch": 0.7566204287515763, "grad_norm": 27.8906192779541, "learning_rate": 3.9096917738504444e-07, "logits/chosen": 0.5503655672073364, "logits/rejected": 0.5556210875511169, "logps/chosen": -157.77444458007812, "logps/rejected": -168.21670532226562, "loss": 0.697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.185608148574829, "rewards/margins": 0.07584402710199356, "rewards/rejected": -1.261452078819275, "step": 2100 }, { "epoch": 0.7602233831742028, "grad_norm": 16.32037925720215, "learning_rate": 3.89667842953298e-07, "logits/chosen": 0.3441751003265381, "logits/rejected": 0.34723028540611267, "logps/chosen": -156.60671997070312, "logps/rejected": -182.07041931152344, "loss": 0.6121, "rewards/accuracies": 0.6875, "rewards/chosen": -1.087058663368225, "rewards/margins": 0.24376948177814484, "rewards/rejected": -1.330828070640564, "step": 2110 }, { "epoch": 0.7638263375968294, "grad_norm": 21.35797119140625, "learning_rate": 3.8836098363003966e-07, "logits/chosen": 0.24712154269218445, "logits/rejected": 0.2610171139240265, "logps/chosen": -148.96975708007812, "logps/rejected": -174.50173950195312, "loss": 0.6262, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0182521343231201, "rewards/margins": 0.24236364662647247, "rewards/rejected": -1.2606159448623657, "step": 2120 }, { "epoch": 0.767429292019456, "grad_norm": 17.814006805419922, "learning_rate": 3.8704865111117746e-07, "logits/chosen": 0.08137331157922745, "logits/rejected": 0.08755414187908173, "logps/chosen": -154.65521240234375, "logps/rejected": -178.48974609375, "loss": 0.6247, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9661283493041992, "rewards/margins": 0.23251216113567352, "rewards/rejected": -1.1986405849456787, "step": 2130 }, { "epoch": 0.7710322464420825, "grad_norm": 47.21306610107422, "learning_rate": 3.8573089730912486e-07, "logits/chosen": 0.10043950378894806, "logits/rejected": 0.09298092871904373, "logps/chosen": -155.009033203125, "logps/rejected": -180.91134643554688, "loss": 0.6346, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0485827922821045, "rewards/margins": 0.19028575718402863, "rewards/rejected": -1.2388685941696167, "step": 2140 }, { "epoch": 0.774635200864709, "grad_norm": 27.711566925048828, "learning_rate": 3.8440777435074677e-07, "logits/chosen": 0.06114129349589348, "logits/rejected": 0.07233893126249313, "logps/chosen": -147.94363403320312, "logps/rejected": -179.6943359375, "loss": 0.5764, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0121772289276123, "rewards/margins": 0.3053652346134186, "rewards/rejected": -1.3175424337387085, "step": 2150 }, { "epoch": 0.7782381552873356, "grad_norm": 46.265296936035156, "learning_rate": 3.8307933457529803e-07, "logits/chosen": -0.10898490250110626, "logits/rejected": -0.10529482364654541, "logps/chosen": -149.0132293701172, "logps/rejected": -181.5883026123047, "loss": 0.5966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.987149715423584, "rewards/margins": 0.28507328033447266, "rewards/rejected": -1.2722227573394775, "step": 2160 }, { "epoch": 0.7818411097099621, "grad_norm": 33.7895393371582, "learning_rate": 3.8174563053235244e-07, "logits/chosen": 0.034177035093307495, "logits/rejected": 0.03491184115409851, "logps/chosen": -132.97091674804688, "logps/rejected": -162.5496826171875, "loss": 0.5929, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8792848587036133, "rewards/margins": 0.26454678177833557, "rewards/rejected": -1.1438316106796265, "step": 2170 }, { "epoch": 0.7854440641325887, "grad_norm": 26.706649780273438, "learning_rate": 3.804067149797244e-07, "logits/chosen": 0.025342971086502075, "logits/rejected": 0.042313508689403534, "logps/chosen": -153.59963989257812, "logps/rejected": -181.06259155273438, "loss": 0.6178, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.9308815002441406, "rewards/margins": 0.2412043809890747, "rewards/rejected": -1.1720860004425049, "step": 2180 }, { "epoch": 0.7890470185552153, "grad_norm": 22.888463973999023, "learning_rate": 3.790626408813822e-07, "logits/chosen": 0.09760870039463043, "logits/rejected": 0.11327888816595078, "logps/chosen": -149.55943298339844, "logps/rejected": -168.8435516357422, "loss": 0.6433, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0106549263000488, "rewards/margins": 0.19457527995109558, "rewards/rejected": -1.2052302360534668, "step": 2190 }, { "epoch": 0.7926499729778418, "grad_norm": 16.771135330200195, "learning_rate": 3.7771346140535214e-07, "logits/chosen": 0.08035653084516525, "logits/rejected": 0.10142220556735992, "logps/chosen": -150.521484375, "logps/rejected": -182.3574981689453, "loss": 0.6012, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0371589660644531, "rewards/margins": 0.2926921248435974, "rewards/rejected": -1.3298509120941162, "step": 2200 }, { "epoch": 0.7962529274004684, "grad_norm": 21.914215087890625, "learning_rate": 3.763592299216161e-07, "logits/chosen": 0.14479905366897583, "logits/rejected": 0.16052904725074768, "logps/chosen": -140.46218872070312, "logps/rejected": -165.92160034179688, "loss": 0.6112, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9338615536689758, "rewards/margins": 0.2532302141189575, "rewards/rejected": -1.1870917081832886, "step": 2210 }, { "epoch": 0.799855881823095, "grad_norm": 20.929664611816406, "learning_rate": 3.75e-07, "logits/chosen": 0.1540573537349701, "logits/rejected": 0.15413573384284973, "logps/chosen": -127.8952865600586, "logps/rejected": -146.11402893066406, "loss": 0.6447, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8141363263130188, "rewards/margins": 0.145987868309021, "rewards/rejected": -0.9601241946220398, "step": 2220 }, { "epoch": 0.8034588362457215, "grad_norm": 14.718154907226562, "learning_rate": 3.7363582540805473e-07, "logits/chosen": 0.19897550344467163, "logits/rejected": 0.20991668105125427, "logps/chosen": -136.5651397705078, "logps/rejected": -160.22506713867188, "loss": 0.6231, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8688238859176636, "rewards/margins": 0.23662717640399933, "rewards/rejected": -1.105450987815857, "step": 2230 }, { "epoch": 0.807061790668348, "grad_norm": 25.30946159362793, "learning_rate": 3.722667601089292e-07, "logits/chosen": 0.3127673864364624, "logits/rejected": 0.3147187829017639, "logps/chosen": -128.371337890625, "logps/rejected": -145.5037078857422, "loss": 0.6542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.8618924021720886, "rewards/margins": 0.15794196724891663, "rewards/rejected": -1.019834280014038, "step": 2240 }, { "epoch": 0.8106647450909746, "grad_norm": 23.615474700927734, "learning_rate": 3.7089285825923613e-07, "logits/chosen": 0.2622937560081482, "logits/rejected": 0.27894237637519836, "logps/chosen": -140.64035034179688, "logps/rejected": -158.76181030273438, "loss": 0.655, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9477087259292603, "rewards/margins": 0.16048268973827362, "rewards/rejected": -1.1081912517547607, "step": 2250 }, { "epoch": 0.8142676995136011, "grad_norm": 18.702375411987305, "learning_rate": 3.69514174206909e-07, "logits/chosen": 0.2706855833530426, "logits/rejected": 0.27766889333724976, "logps/chosen": -120.02188873291016, "logps/rejected": -126.6515121459961, "loss": 0.6858, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7856532335281372, "rewards/margins": 0.05522824451327324, "rewards/rejected": -0.8408814668655396, "step": 2260 }, { "epoch": 0.8178706539362277, "grad_norm": 26.36187744140625, "learning_rate": 3.6813076248905296e-07, "logits/chosen": 0.08216744661331177, "logits/rejected": 0.08492139726877213, "logps/chosen": -157.15606689453125, "logps/rejected": -181.71217346191406, "loss": 0.6458, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.082599401473999, "rewards/margins": 0.22989189624786377, "rewards/rejected": -1.3124912977218628, "step": 2270 }, { "epoch": 0.8214736083588543, "grad_norm": 13.744657516479492, "learning_rate": 3.66742677829787e-07, "logits/chosen": 0.02621442638337612, "logits/rejected": 0.043502770364284515, "logps/chosen": -130.1800537109375, "logps/rejected": -156.7100372314453, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": -0.8457847833633423, "rewards/margins": 0.24271400272846222, "rewards/rejected": -1.088498830795288, "step": 2280 }, { "epoch": 0.8250765627814808, "grad_norm": 27.42162322998047, "learning_rate": 3.6534997513807933e-07, "logits/chosen": 0.24865540862083435, "logits/rejected": 0.25898540019989014, "logps/chosen": -127.29747009277344, "logps/rejected": -148.6510009765625, "loss": 0.6276, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.7310788035392761, "rewards/margins": 0.2048756331205368, "rewards/rejected": -0.9359544515609741, "step": 2290 }, { "epoch": 0.8286795172041074, "grad_norm": 18.887805938720703, "learning_rate": 3.639527095055753e-07, "logits/chosen": 0.11071660369634628, "logits/rejected": 0.12018336355686188, "logps/chosen": -124.1941909790039, "logps/rejected": -148.11679077148438, "loss": 0.6363, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8029730916023254, "rewards/margins": 0.21025899052619934, "rewards/rejected": -1.0132321119308472, "step": 2300 }, { "epoch": 0.832282471626734, "grad_norm": 18.347463607788086, "learning_rate": 3.625509362044183e-07, "logits/chosen": 0.2225940227508545, "logits/rejected": 0.22224624454975128, "logps/chosen": -137.62863159179688, "logps/rejected": -155.7116241455078, "loss": 0.655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8755800127983093, "rewards/margins": 0.14910510182380676, "rewards/rejected": -1.024685025215149, "step": 2310 }, { "epoch": 0.8358854260493604, "grad_norm": 21.459951400756836, "learning_rate": 3.6114471068506315e-07, "logits/chosen": 0.22385597229003906, "logits/rejected": 0.24280044436454773, "logps/chosen": -137.79945373535156, "logps/rejected": -149.93775939941406, "loss": 0.6715, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.859727680683136, "rewards/margins": 0.14403629302978516, "rewards/rejected": -1.0037639141082764, "step": 2320 }, { "epoch": 0.839488380471987, "grad_norm": 13.235859870910645, "learning_rate": 3.5973408857408263e-07, "logits/chosen": 0.2293127030134201, "logits/rejected": 0.2375296801328659, "logps/chosen": -114.64483642578125, "logps/rejected": -139.06341552734375, "loss": 0.6137, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7316290140151978, "rewards/margins": 0.23739755153656006, "rewards/rejected": -0.9690265655517578, "step": 2330 }, { "epoch": 0.8430913348946136, "grad_norm": 24.24904441833496, "learning_rate": 3.5831912567196717e-07, "logits/chosen": 0.15491041541099548, "logits/rejected": 0.16376709938049316, "logps/chosen": -143.97264099121094, "logps/rejected": -166.48516845703125, "loss": 0.6167, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9123767614364624, "rewards/margins": 0.236104816198349, "rewards/rejected": -1.1484816074371338, "step": 2340 }, { "epoch": 0.8466942893172401, "grad_norm": 20.42092514038086, "learning_rate": 3.568998779509173e-07, "logits/chosen": 0.20104598999023438, "logits/rejected": 0.2010325938463211, "logps/chosen": -132.96450805664062, "logps/rejected": -158.20303344726562, "loss": 0.6197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8694693446159363, "rewards/margins": 0.22495004534721375, "rewards/rejected": -1.0944193601608276, "step": 2350 }, { "epoch": 0.8502972437398667, "grad_norm": 34.526084899902344, "learning_rate": 3.5547640155262984e-07, "logits/chosen": 0.10567860305309296, "logits/rejected": 0.11500003188848495, "logps/chosen": -153.9732208251953, "logps/rejected": -160.86959838867188, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -1.0203911066055298, "rewards/margins": 0.08179245889186859, "rewards/rejected": -1.1021835803985596, "step": 2360 }, { "epoch": 0.8539001981624933, "grad_norm": 14.429137229919434, "learning_rate": 3.5404875278607685e-07, "logits/chosen": -0.07767397910356522, "logits/rejected": -0.05828147381544113, "logps/chosen": -143.14736938476562, "logps/rejected": -166.0102081298828, "loss": 0.6237, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9092574119567871, "rewards/margins": 0.21947212517261505, "rewards/rejected": -1.1287293434143066, "step": 2370 }, { "epoch": 0.8575031525851198, "grad_norm": 18.862186431884766, "learning_rate": 3.5261698812527847e-07, "logits/chosen": -0.05472679063677788, "logits/rejected": -0.04479784518480301, "logps/chosen": -126.60133361816406, "logps/rejected": -149.07211303710938, "loss": 0.608, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8011096119880676, "rewards/margins": 0.21337732672691345, "rewards/rejected": -1.0144869089126587, "step": 2380 }, { "epoch": 0.8611061070077464, "grad_norm": 25.388795852661133, "learning_rate": 3.511811642070684e-07, "logits/chosen": -0.014821426942944527, "logits/rejected": 0.0019395619165152311, "logps/chosen": -143.45809936523438, "logps/rejected": -159.7408447265625, "loss": 0.6523, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.913117527961731, "rewards/margins": 0.1669197529554367, "rewards/rejected": -1.080037236213684, "step": 2390 }, { "epoch": 0.864709061430373, "grad_norm": 16.353431701660156, "learning_rate": 3.4974133782885407e-07, "logits/chosen": 0.17188173532485962, "logits/rejected": 0.17839708924293518, "logps/chosen": -126.90568542480469, "logps/rejected": -164.10240173339844, "loss": 0.5844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7514697313308716, "rewards/margins": 0.33532294631004333, "rewards/rejected": -1.0867927074432373, "step": 2400 }, { "epoch": 0.8683120158529994, "grad_norm": 28.33049964904785, "learning_rate": 3.482975659463697e-07, "logits/chosen": 0.23143163323402405, "logits/rejected": 0.23064498603343964, "logps/chosen": -144.11611938476562, "logps/rejected": -165.2440643310547, "loss": 0.6236, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9507433176040649, "rewards/margins": 0.20142099261283875, "rewards/rejected": -1.1521642208099365, "step": 2410 }, { "epoch": 0.871914970275626, "grad_norm": 19.122085571289062, "learning_rate": 3.4684990567142326e-07, "logits/chosen": 0.2861153483390808, "logits/rejected": 0.3029120862483978, "logps/chosen": -135.65419006347656, "logps/rejected": -157.39295959472656, "loss": 0.6447, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9725810885429382, "rewards/margins": 0.17101779580116272, "rewards/rejected": -1.1435989141464233, "step": 2420 }, { "epoch": 0.8755179246982525, "grad_norm": 25.838401794433594, "learning_rate": 3.4539841426963714e-07, "logits/chosen": 0.27975279092788696, "logits/rejected": 0.2869497239589691, "logps/chosen": -138.52810668945312, "logps/rejected": -165.72518920898438, "loss": 0.6253, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8694036602973938, "rewards/margins": 0.24049177765846252, "rewards/rejected": -1.1098954677581787, "step": 2430 }, { "epoch": 0.8791208791208791, "grad_norm": 21.262428283691406, "learning_rate": 3.43943149158183e-07, "logits/chosen": 0.41645437479019165, "logits/rejected": 0.41389793157577515, "logps/chosen": -127.73988342285156, "logps/rejected": -157.58078002929688, "loss": 0.6083, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.8539397120475769, "rewards/margins": 0.26630404591560364, "rewards/rejected": -1.120243787765503, "step": 2440 }, { "epoch": 0.8827238335435057, "grad_norm": 14.900445938110352, "learning_rate": 3.4248416790351084e-07, "logits/chosen": 0.21880879998207092, "logits/rejected": 0.2357928305864334, "logps/chosen": -148.3535614013672, "logps/rejected": -178.9004669189453, "loss": 0.6129, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.9314918518066406, "rewards/margins": 0.276610791683197, "rewards/rejected": -1.2081027030944824, "step": 2450 }, { "epoch": 0.8863267879661322, "grad_norm": 15.81990909576416, "learning_rate": 3.4102152821907094e-07, "logits/chosen": 0.2893638610839844, "logits/rejected": 0.2972751259803772, "logps/chosen": -119.5505142211914, "logps/rejected": -146.17257690429688, "loss": 0.6135, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7644268870353699, "rewards/margins": 0.26205748319625854, "rewards/rejected": -1.0264842510223389, "step": 2460 }, { "epoch": 0.8899297423887588, "grad_norm": 18.757734298706055, "learning_rate": 3.395552879630318e-07, "logits/chosen": 0.45842042565345764, "logits/rejected": 0.44962722063064575, "logps/chosen": -130.2977752685547, "logps/rejected": -158.32522583007812, "loss": 0.5978, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.893252968788147, "rewards/margins": 0.27786940336227417, "rewards/rejected": -1.171122431755066, "step": 2470 }, { "epoch": 0.8935326968113854, "grad_norm": 31.11212730407715, "learning_rate": 3.380855051359911e-07, "logits/chosen": 0.47310179471969604, "logits/rejected": 0.4903450608253479, "logps/chosen": -135.8131866455078, "logps/rejected": -170.66873168945312, "loss": 0.5922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9029496312141418, "rewards/margins": 0.297838032245636, "rewards/rejected": -1.2007876634597778, "step": 2480 }, { "epoch": 0.8971356512340118, "grad_norm": 40.82261276245117, "learning_rate": 3.366122378786809e-07, "logits/chosen": 0.5346935987472534, "logits/rejected": 0.5293043255805969, "logps/chosen": -159.31536865234375, "logps/rejected": -183.24658203125, "loss": 0.6357, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0518207550048828, "rewards/margins": 0.2071865350008011, "rewards/rejected": -1.259007215499878, "step": 2490 }, { "epoch": 0.9007386056566384, "grad_norm": 31.10675621032715, "learning_rate": 3.351355444696684e-07, "logits/chosen": 0.4077302813529968, "logits/rejected": 0.40855541825294495, "logps/chosen": -150.89895629882812, "logps/rejected": -178.08035278320312, "loss": 0.6147, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0311760902404785, "rewards/margins": 0.24248750507831573, "rewards/rejected": -1.2736636400222778, "step": 2500 }, { "epoch": 0.904341560079265, "grad_norm": 20.22747802734375, "learning_rate": 3.336554833230504e-07, "logits/chosen": 0.32154136896133423, "logits/rejected": 0.31612735986709595, "logps/chosen": -149.76686096191406, "logps/rejected": -174.6000213623047, "loss": 0.6399, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9929599761962891, "rewards/margins": 0.18228769302368164, "rewards/rejected": -1.1752477884292603, "step": 2510 }, { "epoch": 0.9079445145018915, "grad_norm": 21.268247604370117, "learning_rate": 3.3217211298614225e-07, "logits/chosen": 0.4436109662055969, "logits/rejected": 0.43255481123924255, "logps/chosen": -143.47824096679688, "logps/rejected": -167.93153381347656, "loss": 0.6506, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0040336847305298, "rewards/margins": 0.17765824496746063, "rewards/rejected": -1.1816918849945068, "step": 2520 }, { "epoch": 0.9115474689245181, "grad_norm": 29.47437858581543, "learning_rate": 3.306854921371623e-07, "logits/chosen": 0.32976576685905457, "logits/rejected": 0.33072155714035034, "logps/chosen": -149.66160583496094, "logps/rejected": -180.8374481201172, "loss": 0.6016, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0591691732406616, "rewards/margins": 0.2985631823539734, "rewards/rejected": -1.3577325344085693, "step": 2530 }, { "epoch": 0.9151504233471447, "grad_norm": 15.695672035217285, "learning_rate": 3.291956795829107e-07, "logits/chosen": 0.21910643577575684, "logits/rejected": 0.21998150646686554, "logps/chosen": -149.00985717773438, "logps/rejected": -182.81680297851562, "loss": 0.5892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0104659795761108, "rewards/margins": 0.3092425763607025, "rewards/rejected": -1.3197085857391357, "step": 2540 }, { "epoch": 0.9187533777697712, "grad_norm": 36.10321044921875, "learning_rate": 3.277027342564428e-07, "logits/chosen": 0.2809387147426605, "logits/rejected": 0.29367387294769287, "logps/chosen": -140.8743438720703, "logps/rejected": -170.73983764648438, "loss": 0.613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.974753737449646, "rewards/margins": 0.25796306133270264, "rewards/rejected": -1.2327167987823486, "step": 2550 }, { "epoch": 0.9223563321923978, "grad_norm": 50.42454528808594, "learning_rate": 3.262067152147383e-07, "logits/chosen": 0.2883756756782532, "logits/rejected": 0.2923746705055237, "logps/chosen": -167.38450622558594, "logps/rejected": -207.9312286376953, "loss": 0.5722, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1744787693023682, "rewards/margins": 0.38665971159935, "rewards/rejected": -1.5611385107040405, "step": 2560 }, { "epoch": 0.9259592866150244, "grad_norm": 15.738551139831543, "learning_rate": 3.247076816363649e-07, "logits/chosen": 0.5385319590568542, "logits/rejected": 0.550905704498291, "logps/chosen": -145.3175506591797, "logps/rejected": -181.95762634277344, "loss": 0.5829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9679282903671265, "rewards/margins": 0.34122833609580994, "rewards/rejected": -1.3091566562652588, "step": 2570 }, { "epoch": 0.9295622410376508, "grad_norm": 34.513187408447266, "learning_rate": 3.2320569281913754e-07, "logits/chosen": 0.5941283106803894, "logits/rejected": 0.600369930267334, "logps/chosen": -156.93695068359375, "logps/rejected": -183.19618225097656, "loss": 0.6259, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1337244510650635, "rewards/margins": 0.22543902695178986, "rewards/rejected": -1.3591634035110474, "step": 2580 }, { "epoch": 0.9331651954602774, "grad_norm": 59.64002227783203, "learning_rate": 3.2170080817777257e-07, "logits/chosen": 0.6295598745346069, "logits/rejected": 0.6174293756484985, "logps/chosen": -164.24746704101562, "logps/rejected": -174.8365936279297, "loss": 0.6714, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.202728509902954, "rewards/margins": 0.09689317643642426, "rewards/rejected": -1.29962158203125, "step": 2590 }, { "epoch": 0.936768149882904, "grad_norm": 14.286884307861328, "learning_rate": 3.2019308724153736e-07, "logits/chosen": 0.48095375299453735, "logits/rejected": 0.47925907373428345, "logps/chosen": -157.25457763671875, "logps/rejected": -174.47314453125, "loss": 0.66, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.0621507167816162, "rewards/margins": 0.1528588831424713, "rewards/rejected": -1.2150094509124756, "step": 2600 }, { "epoch": 0.9403711043055305, "grad_norm": 22.402284622192383, "learning_rate": 3.186825896518958e-07, "logits/chosen": 0.3799865245819092, "logits/rejected": 0.3996959328651428, "logps/chosen": -141.32208251953125, "logps/rejected": -182.70462036132812, "loss": 0.5751, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0077296495437622, "rewards/margins": 0.37155282497406006, "rewards/rejected": -1.3792823553085327, "step": 2610 }, { "epoch": 0.9439740587281571, "grad_norm": 42.79711151123047, "learning_rate": 3.171693751601486e-07, "logits/chosen": 0.4481154978275299, "logits/rejected": 0.453838974237442, "logps/chosen": -152.2375030517578, "logps/rejected": -168.63235473632812, "loss": 0.6532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0837265253067017, "rewards/margins": 0.14884427189826965, "rewards/rejected": -1.2325708866119385, "step": 2620 }, { "epoch": 0.9475770131507837, "grad_norm": 25.737060546875, "learning_rate": 3.156535036250705e-07, "logits/chosen": 0.44727545976638794, "logits/rejected": 0.4579724669456482, "logps/chosen": -133.33920288085938, "logps/rejected": -167.18508911132812, "loss": 0.6, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8165891766548157, "rewards/margins": 0.2997525632381439, "rewards/rejected": -1.1163415908813477, "step": 2630 }, { "epoch": 0.9511799675734102, "grad_norm": 18.539308547973633, "learning_rate": 3.141350350105413e-07, "logits/chosen": 0.5598092675209045, "logits/rejected": 0.5695661306381226, "logps/chosen": -134.55953979492188, "logps/rejected": -171.2039031982422, "loss": 0.573, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9224777221679688, "rewards/margins": 0.3509232997894287, "rewards/rejected": -1.273400902748108, "step": 2640 }, { "epoch": 0.9547829219960368, "grad_norm": 20.745208740234375, "learning_rate": 3.126140293831746e-07, "logits/chosen": 0.5451745986938477, "logits/rejected": 0.5272158980369568, "logps/chosen": -166.62710571289062, "logps/rejected": -186.47088623046875, "loss": 0.6488, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1236568689346313, "rewards/margins": 0.18489673733711243, "rewards/rejected": -1.3085535764694214, "step": 2650 }, { "epoch": 0.9583858764186634, "grad_norm": 30.563156127929688, "learning_rate": 3.1109054690994175e-07, "logits/chosen": 0.7699509859085083, "logits/rejected": 0.7913106679916382, "logps/chosen": -189.5458221435547, "logps/rejected": -222.76651000976562, "loss": 0.6159, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4004586935043335, "rewards/margins": 0.3266792893409729, "rewards/rejected": -1.7271379232406616, "step": 2660 }, { "epoch": 0.9619888308412898, "grad_norm": 18.110149383544922, "learning_rate": 3.095646478557912e-07, "logits/chosen": 0.760633111000061, "logits/rejected": 0.7680120468139648, "logps/chosen": -154.74481201171875, "logps/rejected": -184.4192352294922, "loss": 0.63, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.055311918258667, "rewards/margins": 0.264528751373291, "rewards/rejected": -1.3198407888412476, "step": 2670 }, { "epoch": 0.9655917852639164, "grad_norm": 32.540931701660156, "learning_rate": 3.0803639258126533e-07, "logits/chosen": 0.7593638300895691, "logits/rejected": 0.7703729271888733, "logps/chosen": -142.2159423828125, "logps/rejected": -161.55844116210938, "loss": 0.6595, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9925827980041504, "rewards/margins": 0.16774709522724152, "rewards/rejected": -1.160329818725586, "step": 2680 }, { "epoch": 0.969194739686543, "grad_norm": 29.07330894470215, "learning_rate": 3.0650584154011226e-07, "logits/chosen": 0.6375503540039062, "logits/rejected": 0.6571865081787109, "logps/chosen": -135.99136352539062, "logps/rejected": -157.39926147460938, "loss": 0.6373, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8374320864677429, "rewards/margins": 0.20494922995567322, "rewards/rejected": -1.0423814058303833, "step": 2690 }, { "epoch": 0.9727976941091695, "grad_norm": 12.809856414794922, "learning_rate": 3.049730552768944e-07, "logits/chosen": 0.5168333649635315, "logits/rejected": 0.5086256265640259, "logps/chosen": -136.57504272460938, "logps/rejected": -155.1215057373047, "loss": 0.6371, "rewards/accuracies": 0.625, "rewards/chosen": -0.8350626826286316, "rewards/margins": 0.18451206386089325, "rewards/rejected": -1.019574761390686, "step": 2700 }, { "epoch": 0.9764006485317961, "grad_norm": 29.495820999145508, "learning_rate": 3.034380944245939e-07, "logits/chosen": 0.5153013467788696, "logits/rejected": 0.5032767057418823, "logps/chosen": -120.64376068115234, "logps/rejected": -144.13934326171875, "loss": 0.6334, "rewards/accuracies": 0.625, "rewards/chosen": -0.7888122200965881, "rewards/margins": 0.2317841351032257, "rewards/rejected": -1.0205962657928467, "step": 2710 }, { "epoch": 0.9800036029544227, "grad_norm": 16.63632583618164, "learning_rate": 3.0190101970221383e-07, "logits/chosen": 0.5460031628608704, "logits/rejected": 0.5601615905761719, "logps/chosen": -151.58692932128906, "logps/rejected": -186.6927490234375, "loss": 0.5919, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0471632480621338, "rewards/margins": 0.3273688852787018, "rewards/rejected": -1.3745321035385132, "step": 2720 }, { "epoch": 0.9836065573770492, "grad_norm": 19.62331199645996, "learning_rate": 3.0036189191237625e-07, "logits/chosen": 0.5685732960700989, "logits/rejected": 0.5920356512069702, "logps/chosen": -140.84339904785156, "logps/rejected": -171.97987365722656, "loss": 0.6112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9551582336425781, "rewards/margins": 0.2565491199493408, "rewards/rejected": -1.211707353591919, "step": 2730 }, { "epoch": 0.9872095117996758, "grad_norm": 28.855976104736328, "learning_rate": 2.9882077193891746e-07, "logits/chosen": 0.6157333254814148, "logits/rejected": 0.6215740442276001, "logps/chosen": -139.96463012695312, "logps/rejected": -158.04739379882812, "loss": 0.6528, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9794723391532898, "rewards/margins": 0.1677200049161911, "rewards/rejected": -1.147192358970642, "step": 2740 }, { "epoch": 0.9908124662223022, "grad_norm": 13.104145050048828, "learning_rate": 2.972777207444791e-07, "logits/chosen": 0.955846905708313, "logits/rejected": 0.9601390957832336, "logps/chosen": -132.74032592773438, "logps/rejected": -150.71829223632812, "loss": 0.6473, "rewards/accuracies": 0.625, "rewards/chosen": -0.8894306421279907, "rewards/margins": 0.15613070130348206, "rewards/rejected": -1.0455615520477295, "step": 2750 }, { "epoch": 0.9944154206449288, "grad_norm": 26.207012176513672, "learning_rate": 2.9573279936809665e-07, "logits/chosen": 1.0859369039535522, "logits/rejected": 1.0927445888519287, "logps/chosen": -141.00888061523438, "logps/rejected": -158.0570068359375, "loss": 0.6478, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9025720357894897, "rewards/margins": 0.15658636391162872, "rewards/rejected": -1.0591583251953125, "step": 2760 }, { "epoch": 0.9980183750675554, "grad_norm": 27.988555908203125, "learning_rate": 2.941860689227854e-07, "logits/chosen": 1.4826513528823853, "logits/rejected": 1.5037438869476318, "logps/chosen": -163.47537231445312, "logps/rejected": -182.8646240234375, "loss": 0.6467, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.173262119293213, "rewards/margins": 0.19181916117668152, "rewards/rejected": -1.3650814294815063, "step": 2770 }, { "epoch": 1.001621329490182, "grad_norm": 16.333715438842773, "learning_rate": 2.9263759059312243e-07, "logits/chosen": 1.7516014575958252, "logits/rejected": 1.7535407543182373, "logps/chosen": -157.38601684570312, "logps/rejected": -180.1123504638672, "loss": 0.6446, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1189873218536377, "rewards/margins": 0.19123554229736328, "rewards/rejected": -1.3102229833602905, "step": 2780 }, { "epoch": 1.0052242839128085, "grad_norm": 18.72308349609375, "learning_rate": 2.910874256328265e-07, "logits/chosen": 1.6995985507965088, "logits/rejected": 1.6985797882080078, "logps/chosen": -158.37786865234375, "logps/rejected": -183.59176635742188, "loss": 0.6184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1016285419464111, "rewards/margins": 0.22949960827827454, "rewards/rejected": -1.3311281204223633, "step": 2790 }, { "epoch": 1.008827238335435, "grad_norm": 26.921524047851562, "learning_rate": 2.895356353623352e-07, "logits/chosen": 1.3525912761688232, "logits/rejected": 1.3631973266601562, "logps/chosen": -141.93482971191406, "logps/rejected": -191.4202423095703, "loss": 0.5353, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9720309972763062, "rewards/margins": 0.47538915276527405, "rewards/rejected": -1.447420358657837, "step": 2800 }, { "epoch": 1.0124301927580617, "grad_norm": 17.440500259399414, "learning_rate": 2.8798228116637895e-07, "logits/chosen": 1.2329285144805908, "logits/rejected": 1.2448673248291016, "logps/chosen": -143.15548706054688, "logps/rejected": -182.87033081054688, "loss": 0.5705, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9814394116401672, "rewards/margins": 0.378670334815979, "rewards/rejected": -1.360109806060791, "step": 2810 }, { "epoch": 1.0160331471806883, "grad_norm": 21.634838104248047, "learning_rate": 2.8642742449155284e-07, "logits/chosen": 1.0341891050338745, "logits/rejected": 1.0338951349258423, "logps/chosen": -134.21636962890625, "logps/rejected": -170.69606018066406, "loss": 0.5857, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8977562785148621, "rewards/margins": 0.31924647092819214, "rewards/rejected": -1.2170026302337646, "step": 2820 }, { "epoch": 1.0196361016033146, "grad_norm": 30.021751403808594, "learning_rate": 2.8487112684388637e-07, "logits/chosen": 1.0514543056488037, "logits/rejected": 1.0618269443511963, "logps/chosen": -148.8506317138672, "logps/rejected": -187.91366577148438, "loss": 0.5703, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9830052256584167, "rewards/margins": 0.3607930541038513, "rewards/rejected": -1.3437983989715576, "step": 2830 }, { "epoch": 1.0232390560259412, "grad_norm": 19.15184783935547, "learning_rate": 2.8331344978640993e-07, "logits/chosen": 1.1980868577957153, "logits/rejected": 1.2043404579162598, "logps/chosen": -135.40493774414062, "logps/rejected": -177.10235595703125, "loss": 0.5614, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8613203763961792, "rewards/margins": 0.4097590446472168, "rewards/rejected": -1.271079421043396, "step": 2840 }, { "epoch": 1.0268420104485678, "grad_norm": 37.32911682128906, "learning_rate": 2.8175445493671966e-07, "logits/chosen": 1.3482367992401123, "logits/rejected": 1.3609994649887085, "logps/chosen": -138.9056396484375, "logps/rejected": -172.74681091308594, "loss": 0.6008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0595502853393555, "rewards/margins": 0.3141409754753113, "rewards/rejected": -1.373691439628601, "step": 2850 }, { "epoch": 1.0304449648711944, "grad_norm": 47.66082763671875, "learning_rate": 2.801942039645403e-07, "logits/chosen": 1.2990331649780273, "logits/rejected": 1.3113834857940674, "logps/chosen": -154.54800415039062, "logps/rejected": -191.79144287109375, "loss": 0.5699, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1076867580413818, "rewards/margins": 0.37979164719581604, "rewards/rejected": -1.487478494644165, "step": 2860 }, { "epoch": 1.034047919293821, "grad_norm": 26.355426788330078, "learning_rate": 2.7863275858928527e-07, "logits/chosen": 1.304172396659851, "logits/rejected": 1.31345534324646, "logps/chosen": -170.28895568847656, "logps/rejected": -190.68533325195312, "loss": 0.6352, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1143049001693726, "rewards/margins": 0.21352222561836243, "rewards/rejected": -1.3278272151947021, "step": 2870 }, { "epoch": 1.0376508737164474, "grad_norm": 25.115501403808594, "learning_rate": 2.7707018057761543e-07, "logits/chosen": 1.344402551651001, "logits/rejected": 1.3745404481887817, "logps/chosen": -170.15597534179688, "logps/rejected": -207.5067138671875, "loss": 0.5719, "rewards/accuracies": 0.75, "rewards/chosen": -1.150154948234558, "rewards/margins": 0.36334314942359924, "rewards/rejected": -1.513498067855835, "step": 2880 }, { "epoch": 1.041253828139074, "grad_norm": 24.93830680847168, "learning_rate": 2.7550653174099604e-07, "logits/chosen": 1.3183590173721313, "logits/rejected": 1.3358465433120728, "logps/chosen": -182.6671600341797, "logps/rejected": -229.6891326904297, "loss": 0.5773, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.382752537727356, "rewards/margins": 0.40944772958755493, "rewards/rejected": -1.7922000885009766, "step": 2890 }, { "epoch": 1.0448567825617006, "grad_norm": 32.1740608215332, "learning_rate": 2.73941873933251e-07, "logits/chosen": 1.2893548011779785, "logits/rejected": 1.303473711013794, "logps/chosen": -163.84793090820312, "logps/rejected": -199.80706787109375, "loss": 0.5972, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2317514419555664, "rewards/margins": 0.30811601877212524, "rewards/rejected": -1.5398674011230469, "step": 2900 }, { "epoch": 1.0484597369843272, "grad_norm": 47.1985969543457, "learning_rate": 2.723762690481167e-07, "logits/chosen": 1.1547422409057617, "logits/rejected": 1.1732709407806396, "logps/chosen": -161.27716064453125, "logps/rejected": -200.41769409179688, "loss": 0.5814, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1469686031341553, "rewards/margins": 0.3593052923679352, "rewards/rejected": -1.5062739849090576, "step": 2910 }, { "epoch": 1.0520626914069537, "grad_norm": 19.92612075805664, "learning_rate": 2.708097790167932e-07, "logits/chosen": 1.213963508605957, "logits/rejected": 1.225260615348816, "logps/chosen": -160.59677124023438, "logps/rejected": -218.94448852539062, "loss": 0.5272, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1272809505462646, "rewards/margins": 0.5488572120666504, "rewards/rejected": -1.676138162612915, "step": 2920 }, { "epoch": 1.0556656458295803, "grad_norm": 27.217994689941406, "learning_rate": 2.692424658054948e-07, "logits/chosen": 1.2021996974945068, "logits/rejected": 1.2243382930755615, "logps/chosen": -150.98605346679688, "logps/rejected": -191.73977661132812, "loss": 0.5703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0560792684555054, "rewards/margins": 0.38107022643089294, "rewards/rejected": -1.4371494054794312, "step": 2930 }, { "epoch": 1.0592686002522067, "grad_norm": 23.613012313842773, "learning_rate": 2.676743914129986e-07, "logits/chosen": 1.4036846160888672, "logits/rejected": 1.398654818534851, "logps/chosen": -151.34022521972656, "logps/rejected": -191.13412475585938, "loss": 0.568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0999627113342285, "rewards/margins": 0.3745970129966736, "rewards/rejected": -1.4745595455169678, "step": 2940 }, { "epoch": 1.0628715546748333, "grad_norm": 20.741044998168945, "learning_rate": 2.66105617868192e-07, "logits/chosen": 1.2105770111083984, "logits/rejected": 1.2305647134780884, "logps/chosen": -150.4530487060547, "logps/rejected": -188.18063354492188, "loss": 0.5884, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0379226207733154, "rewards/margins": 0.35215944051742554, "rewards/rejected": -1.3900820016860962, "step": 2950 }, { "epoch": 1.0664745090974599, "grad_norm": 24.45860481262207, "learning_rate": 2.6453620722761895e-07, "logits/chosen": 1.2178564071655273, "logits/rejected": 1.2193264961242676, "logps/chosen": -141.83470153808594, "logps/rejected": -169.4464569091797, "loss": 0.604, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9304254651069641, "rewards/margins": 0.2524339556694031, "rewards/rejected": -1.1828593015670776, "step": 2960 }, { "epoch": 1.0700774635200865, "grad_norm": 16.757301330566406, "learning_rate": 2.629662215730253e-07, "logits/chosen": 1.0107862949371338, "logits/rejected": 1.0186793804168701, "logps/chosen": -154.68051147460938, "logps/rejected": -184.76063537597656, "loss": 0.5993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.013504981994629, "rewards/margins": 0.2779073417186737, "rewards/rejected": -1.291412353515625, "step": 2970 }, { "epoch": 1.073680417942713, "grad_norm": 18.887826919555664, "learning_rate": 2.6139572300890284e-07, "logits/chosen": 0.9930673837661743, "logits/rejected": 1.0053017139434814, "logps/chosen": -149.2520751953125, "logps/rejected": -189.97689819335938, "loss": 0.5617, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9238582849502563, "rewards/margins": 0.4110753536224365, "rewards/rejected": -1.3349335193634033, "step": 2980 }, { "epoch": 1.0772833723653397, "grad_norm": 45.729061126708984, "learning_rate": 2.598247736600328e-07, "logits/chosen": 1.266531229019165, "logits/rejected": 1.2912251949310303, "logps/chosen": -156.42117309570312, "logps/rejected": -194.59445190429688, "loss": 0.5698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1178277730941772, "rewards/margins": 0.35697516798973083, "rewards/rejected": -1.47480309009552, "step": 2990 }, { "epoch": 1.080886326787966, "grad_norm": 33.51045608520508, "learning_rate": 2.5825343566902833e-07, "logits/chosen": 1.4589413404464722, "logits/rejected": 1.462807536125183, "logps/chosen": -165.98814392089844, "logps/rejected": -203.2148895263672, "loss": 0.5743, "rewards/accuracies": 0.75, "rewards/chosen": -1.2189103364944458, "rewards/margins": 0.35195571184158325, "rewards/rejected": -1.5708658695220947, "step": 3000 }, { "epoch": 1.0844892812105926, "grad_norm": 25.705270767211914, "learning_rate": 2.5668177119387617e-07, "logits/chosen": 1.634394884109497, "logits/rejected": 1.6530296802520752, "logps/chosen": -167.57522583007812, "logps/rejected": -193.9906005859375, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": -1.272362470626831, "rewards/margins": 0.2553406059741974, "rewards/rejected": -1.5277031660079956, "step": 3010 }, { "epoch": 1.0880922356332192, "grad_norm": 34.7142219543457, "learning_rate": 2.5510984240547787e-07, "logits/chosen": 1.3826122283935547, "logits/rejected": 1.4002482891082764, "logps/chosen": -175.1686553955078, "logps/rejected": -225.0598907470703, "loss": 0.5373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.293283224105835, "rewards/margins": 0.46684423089027405, "rewards/rejected": -1.7601274251937866, "step": 3020 }, { "epoch": 1.0916951900558458, "grad_norm": 17.66757583618164, "learning_rate": 2.535377114851905e-07, "logits/chosen": 1.4605615139007568, "logits/rejected": 1.4629911184310913, "logps/chosen": -181.341796875, "logps/rejected": -200.5535888671875, "loss": 0.634, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2361996173858643, "rewards/margins": 0.20834848284721375, "rewards/rejected": -1.4445480108261108, "step": 3030 }, { "epoch": 1.0952981444784724, "grad_norm": 18.307947158813477, "learning_rate": 2.5196544062236707e-07, "logits/chosen": 1.6120392084121704, "logits/rejected": 1.6377861499786377, "logps/chosen": -172.27444458007812, "logps/rejected": -202.8404083251953, "loss": 0.5851, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2176176309585571, "rewards/margins": 0.3250825107097626, "rewards/rejected": -1.542700171470642, "step": 3040 }, { "epoch": 1.098901098901099, "grad_norm": 19.542686462402344, "learning_rate": 2.503930920118961e-07, "logits/chosen": 1.651121735572815, "logits/rejected": 1.6678673028945923, "logps/chosen": -187.13082885742188, "logps/rejected": -230.998291015625, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": -1.3362236022949219, "rewards/margins": 0.4135264754295349, "rewards/rejected": -1.7497501373291016, "step": 3050 }, { "epoch": 1.1025040533237254, "grad_norm": 27.288055419921875, "learning_rate": 2.4882072785174194e-07, "logits/chosen": 1.6910793781280518, "logits/rejected": 1.7082237005233765, "logps/chosen": -180.62667846679688, "logps/rejected": -218.1063995361328, "loss": 0.5924, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3642722368240356, "rewards/margins": 0.32754629850387573, "rewards/rejected": -1.6918185949325562, "step": 3060 }, { "epoch": 1.106107007746352, "grad_norm": 23.36140251159668, "learning_rate": 2.472484103404839e-07, "logits/chosen": 1.8909047842025757, "logits/rejected": 1.9143555164337158, "logps/chosen": -177.67005920410156, "logps/rejected": -226.27023315429688, "loss": 0.5493, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3711577653884888, "rewards/margins": 0.4601329267024994, "rewards/rejected": -1.831290602684021, "step": 3070 }, { "epoch": 1.1097099621689785, "grad_norm": 23.70610809326172, "learning_rate": 2.456762016748556e-07, "logits/chosen": 1.6471796035766602, "logits/rejected": 1.6680177450180054, "logps/chosen": -182.0051727294922, "logps/rejected": -212.8391876220703, "loss": 0.6011, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.307997226715088, "rewards/margins": 0.28122976422309875, "rewards/rejected": -1.5892269611358643, "step": 3080 }, { "epoch": 1.1133129165916051, "grad_norm": 29.763378143310547, "learning_rate": 2.441041640472858e-07, "logits/chosen": 1.7746632099151611, "logits/rejected": 1.7879133224487305, "logps/chosen": -177.87359619140625, "logps/rejected": -214.80038452148438, "loss": 0.5802, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3018782138824463, "rewards/margins": 0.3463166654109955, "rewards/rejected": -1.6481950283050537, "step": 3090 }, { "epoch": 1.1169158710142317, "grad_norm": 18.889694213867188, "learning_rate": 2.4253235964343674e-07, "logits/chosen": 1.5324766635894775, "logits/rejected": 1.5299230813980103, "logps/chosen": -167.12747192382812, "logps/rejected": -210.18603515625, "loss": 0.5655, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2045719623565674, "rewards/margins": 0.4194146990776062, "rewards/rejected": -1.623986840248108, "step": 3100 }, { "epoch": 1.1205188254368583, "grad_norm": 25.045333862304688, "learning_rate": 2.409608506397452e-07, "logits/chosen": 1.4209635257720947, "logits/rejected": 1.4399478435516357, "logps/chosen": -172.46139526367188, "logps/rejected": -226.7716064453125, "loss": 0.5278, "rewards/accuracies": 0.75, "rewards/chosen": -1.2924270629882812, "rewards/margins": 0.5143944621086121, "rewards/rejected": -1.8068214654922485, "step": 3110 }, { "epoch": 1.1241217798594847, "grad_norm": 39.801109313964844, "learning_rate": 2.3938969920096296e-07, "logits/chosen": 1.3100801706314087, "logits/rejected": 1.3263208866119385, "logps/chosen": -182.75791931152344, "logps/rejected": -214.8870086669922, "loss": 0.5971, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2580931186676025, "rewards/margins": 0.2964404225349426, "rewards/rejected": -1.55453360080719, "step": 3120 }, { "epoch": 1.1277247342821113, "grad_norm": 39.32272720336914, "learning_rate": 2.3781896747769694e-07, "logits/chosen": 1.4089020490646362, "logits/rejected": 1.3984390497207642, "logps/chosen": -169.61061096191406, "logps/rejected": -199.11709594726562, "loss": 0.6259, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2158353328704834, "rewards/margins": 0.2781444191932678, "rewards/rejected": -1.4939799308776855, "step": 3130 }, { "epoch": 1.1313276887047379, "grad_norm": 28.20737648010254, "learning_rate": 2.3624871760395174e-07, "logits/chosen": 1.1554807424545288, "logits/rejected": 1.1675853729248047, "logps/chosen": -168.42044067382812, "logps/rejected": -200.41329956054688, "loss": 0.6016, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2163656949996948, "rewards/margins": 0.28965193033218384, "rewards/rejected": -1.5060179233551025, "step": 3140 }, { "epoch": 1.1349306431273645, "grad_norm": 20.062538146972656, "learning_rate": 2.3467901169467096e-07, "logits/chosen": 0.9233118295669556, "logits/rejected": 0.9547305107116699, "logps/chosen": -155.99371337890625, "logps/rejected": -204.6820068359375, "loss": 0.5564, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0997803211212158, "rewards/margins": 0.4566062092781067, "rewards/rejected": -1.5563864707946777, "step": 3150 }, { "epoch": 1.138533597549991, "grad_norm": 28.119457244873047, "learning_rate": 2.331099118432804e-07, "logits/chosen": 0.9703793525695801, "logits/rejected": 0.9870656728744507, "logps/chosen": -161.81423950195312, "logps/rejected": -210.0536346435547, "loss": 0.5446, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0635930299758911, "rewards/margins": 0.4630206227302551, "rewards/rejected": -1.5266135931015015, "step": 3160 }, { "epoch": 1.1421365519726177, "grad_norm": 25.566730499267578, "learning_rate": 2.3154148011923205e-07, "logits/chosen": 1.0091289281845093, "logits/rejected": 1.018500804901123, "logps/chosen": -149.37063598632812, "logps/rejected": -185.6090545654297, "loss": 0.5847, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0620778799057007, "rewards/margins": 0.33241063356399536, "rewards/rejected": -1.3944883346557617, "step": 3170 }, { "epoch": 1.145739506395244, "grad_norm": 19.381229400634766, "learning_rate": 2.299737785655482e-07, "logits/chosen": 0.9534355401992798, "logits/rejected": 0.9737855195999146, "logps/chosen": -136.06202697753906, "logps/rejected": -177.0410614013672, "loss": 0.5696, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8820406198501587, "rewards/margins": 0.3690762519836426, "rewards/rejected": -1.2511168718338013, "step": 3180 }, { "epoch": 1.1493424608178706, "grad_norm": 22.582590103149414, "learning_rate": 2.284068691963679e-07, "logits/chosen": 0.9719399213790894, "logits/rejected": 0.9772558212280273, "logps/chosen": -156.23695373535156, "logps/rejected": -189.47659301757812, "loss": 0.6057, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9947269558906555, "rewards/margins": 0.333953320980072, "rewards/rejected": -1.3286802768707275, "step": 3190 }, { "epoch": 1.1529454152404972, "grad_norm": 36.40176773071289, "learning_rate": 2.2684081399449323e-07, "logits/chosen": 1.103335976600647, "logits/rejected": 1.128285527229309, "logps/chosen": -155.7313995361328, "logps/rejected": -195.7241973876953, "loss": 0.5751, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1343395709991455, "rewards/margins": 0.3672861158847809, "rewards/rejected": -1.5016257762908936, "step": 3200 }, { "epoch": 1.1565483696631238, "grad_norm": 21.671106338500977, "learning_rate": 2.2527567490893755e-07, "logits/chosen": 1.2460180521011353, "logits/rejected": 1.2542035579681396, "logps/chosen": -171.56942749023438, "logps/rejected": -207.84957885742188, "loss": 0.5866, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2231214046478271, "rewards/margins": 0.34527188539505005, "rewards/rejected": -1.5683931112289429, "step": 3210 }, { "epoch": 1.1601513240857504, "grad_norm": 43.77824020385742, "learning_rate": 2.2371151385247544e-07, "logits/chosen": 1.3986154794692993, "logits/rejected": 1.405017375946045, "logps/chosen": -173.19020080566406, "logps/rejected": -201.94764709472656, "loss": 0.6082, "rewards/accuracies": 0.625, "rewards/chosen": -1.268537163734436, "rewards/margins": 0.2790360450744629, "rewards/rejected": -1.547573208808899, "step": 3220 }, { "epoch": 1.1637542785083768, "grad_norm": 37.33060073852539, "learning_rate": 2.2214839269919288e-07, "logits/chosen": 1.2926814556121826, "logits/rejected": 1.3005971908569336, "logps/chosen": -177.24127197265625, "logps/rejected": -223.33432006835938, "loss": 0.5431, "rewards/accuracies": 0.8125, "rewards/chosen": -1.298537015914917, "rewards/margins": 0.4248967170715332, "rewards/rejected": -1.7234338521957397, "step": 3230 }, { "epoch": 1.1673572329310034, "grad_norm": 18.869747161865234, "learning_rate": 2.205863732820404e-07, "logits/chosen": 1.297871708869934, "logits/rejected": 1.309443473815918, "logps/chosen": -170.31053161621094, "logps/rejected": -194.85159301757812, "loss": 0.6298, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1644783020019531, "rewards/margins": 0.21012084186077118, "rewards/rejected": -1.3745992183685303, "step": 3240 }, { "epoch": 1.17096018735363, "grad_norm": 43.628421783447266, "learning_rate": 2.1902551739038622e-07, "logits/chosen": 1.1812689304351807, "logits/rejected": 1.2138346433639526, "logps/chosen": -169.75918579101562, "logps/rejected": -204.258056640625, "loss": 0.5745, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1383635997772217, "rewards/margins": 0.37113919854164124, "rewards/rejected": -1.50950288772583, "step": 3250 }, { "epoch": 1.1745631417762565, "grad_norm": 33.38032913208008, "learning_rate": 2.1746588676757308e-07, "logits/chosen": 1.4970940351486206, "logits/rejected": 1.4973194599151611, "logps/chosen": -152.75123596191406, "logps/rejected": -183.58334350585938, "loss": 0.5906, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0627615451812744, "rewards/margins": 0.29873785376548767, "rewards/rejected": -1.361499547958374, "step": 3260 }, { "epoch": 1.1781660961988831, "grad_norm": 38.73574447631836, "learning_rate": 2.1590754310847508e-07, "logits/chosen": 1.5429904460906982, "logits/rejected": 1.5525925159454346, "logps/chosen": -171.74215698242188, "logps/rejected": -208.65048217773438, "loss": 0.5845, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2257620096206665, "rewards/margins": 0.37997758388519287, "rewards/rejected": -1.6057395935058594, "step": 3270 }, { "epoch": 1.1817690506215097, "grad_norm": 26.747926712036133, "learning_rate": 2.143505480570573e-07, "logits/chosen": 1.7026646137237549, "logits/rejected": 1.706345796585083, "logps/chosen": -172.7478485107422, "logps/rejected": -202.12100219726562, "loss": 0.6002, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.265164852142334, "rewards/margins": 0.2747200131416321, "rewards/rejected": -1.5398849248886108, "step": 3280 }, { "epoch": 1.1853720050441363, "grad_norm": 18.19614601135254, "learning_rate": 2.1279496320393779e-07, "logits/chosen": 1.7919038534164429, "logits/rejected": 1.8115230798721313, "logps/chosen": -169.95596313476562, "logps/rejected": -199.9071807861328, "loss": 0.5977, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2951703071594238, "rewards/margins": 0.2859644293785095, "rewards/rejected": -1.5811350345611572, "step": 3290 }, { "epoch": 1.1889749594667627, "grad_norm": 34.03652572631836, "learning_rate": 2.112408500839505e-07, "logits/chosen": 1.735272765159607, "logits/rejected": 1.7567336559295654, "logps/chosen": -174.78927612304688, "logps/rejected": -207.58273315429688, "loss": 0.5929, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.353682279586792, "rewards/margins": 0.32816195487976074, "rewards/rejected": -1.6818439960479736, "step": 3300 }, { "epoch": 1.1925779138893893, "grad_norm": 24.930068969726562, "learning_rate": 2.0968827017371192e-07, "logits/chosen": 1.8864787817001343, "logits/rejected": 1.8942277431488037, "logps/chosen": -179.2598876953125, "logps/rejected": -224.4481964111328, "loss": 0.5468, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3099626302719116, "rewards/margins": 0.4144628047943115, "rewards/rejected": -1.7244255542755127, "step": 3310 }, { "epoch": 1.1961808683120159, "grad_norm": 29.145889282226562, "learning_rate": 2.0813728488918848e-07, "logits/chosen": 1.9467144012451172, "logits/rejected": 1.9601352214813232, "logps/chosen": -193.031494140625, "logps/rejected": -219.81509399414062, "loss": 0.6417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4769078493118286, "rewards/margins": 0.26800230145454407, "rewards/rejected": -1.7449100017547607, "step": 3320 }, { "epoch": 1.1997838227346425, "grad_norm": 42.68914031982422, "learning_rate": 2.065879555832674e-07, "logits/chosen": 1.8014549016952515, "logits/rejected": 1.8138593435287476, "logps/chosen": -180.00479125976562, "logps/rejected": -225.0125274658203, "loss": 0.5526, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3541065454483032, "rewards/margins": 0.42764395475387573, "rewards/rejected": -1.7817504405975342, "step": 3330 }, { "epoch": 1.203386777157269, "grad_norm": 34.69449996948242, "learning_rate": 2.0504034354333004e-07, "logits/chosen": 1.8517045974731445, "logits/rejected": 1.8728597164154053, "logps/chosen": -181.0088348388672, "logps/rejected": -217.8776092529297, "loss": 0.5963, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3582488298416138, "rewards/margins": 0.3246374726295471, "rewards/rejected": -1.6828863620758057, "step": 3340 }, { "epoch": 1.2069897315798954, "grad_norm": 32.25773239135742, "learning_rate": 2.0349450998882698e-07, "logits/chosen": 1.6548973321914673, "logits/rejected": 1.6580450534820557, "logps/chosen": -173.4766082763672, "logps/rejected": -219.5607452392578, "loss": 0.546, "rewards/accuracies": 0.75, "rewards/chosen": -1.3156660795211792, "rewards/margins": 0.4414525032043457, "rewards/rejected": -1.757118582725525, "step": 3350 }, { "epoch": 1.210592686002522, "grad_norm": 23.346769332885742, "learning_rate": 2.0195051606885681e-07, "logits/chosen": 1.716970443725586, "logits/rejected": 1.7288544178009033, "logps/chosen": -179.84616088867188, "logps/rejected": -232.7053985595703, "loss": 0.5229, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2913398742675781, "rewards/margins": 0.4949992597103119, "rewards/rejected": -1.7863390445709229, "step": 3360 }, { "epoch": 1.2141956404251486, "grad_norm": 41.44517517089844, "learning_rate": 2.0040842285974683e-07, "logits/chosen": 1.8047672510147095, "logits/rejected": 1.8259674310684204, "logps/chosen": -167.1689453125, "logps/rejected": -208.96798706054688, "loss": 0.572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2015494108200073, "rewards/margins": 0.40487393736839294, "rewards/rejected": -1.6064231395721436, "step": 3370 }, { "epoch": 1.2177985948477752, "grad_norm": 30.084402084350586, "learning_rate": 1.9886829136263728e-07, "logits/chosen": 1.648328185081482, "logits/rejected": 1.664375901222229, "logps/chosen": -181.54995727539062, "logps/rejected": -223.4890594482422, "loss": 0.5653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2826671600341797, "rewards/margins": 0.3996722102165222, "rewards/rejected": -1.6823394298553467, "step": 3380 }, { "epoch": 1.2214015492704018, "grad_norm": 36.883480072021484, "learning_rate": 1.973301825010685e-07, "logits/chosen": 1.5796527862548828, "logits/rejected": 1.6008110046386719, "logps/chosen": -179.32435607910156, "logps/rejected": -232.6096649169922, "loss": 0.536, "rewards/accuracies": 0.75, "rewards/chosen": -1.2281485795974731, "rewards/margins": 0.506437361240387, "rewards/rejected": -1.7345860004425049, "step": 3390 }, { "epoch": 1.2250045036930284, "grad_norm": 24.782224655151367, "learning_rate": 1.9579415711857016e-07, "logits/chosen": 1.7208601236343384, "logits/rejected": 1.7337154150009155, "logps/chosen": -171.23948669433594, "logps/rejected": -222.1886749267578, "loss": 0.5463, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2556393146514893, "rewards/margins": 0.48701682686805725, "rewards/rejected": -1.7426563501358032, "step": 3400 }, { "epoch": 1.2286074581156547, "grad_norm": 24.2948055267334, "learning_rate": 1.9426027597625572e-07, "logits/chosen": 1.6595081090927124, "logits/rejected": 1.6720672845840454, "logps/chosen": -154.27455139160156, "logps/rejected": -200.77243041992188, "loss": 0.5543, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1199270486831665, "rewards/margins": 0.42178797721862793, "rewards/rejected": -1.5417152643203735, "step": 3410 }, { "epoch": 1.2322104125382813, "grad_norm": 42.27910614013672, "learning_rate": 1.9272859975041752e-07, "logits/chosen": 1.641770362854004, "logits/rejected": 1.6533361673355103, "logps/chosen": -157.47933959960938, "logps/rejected": -201.40994262695312, "loss": 0.5535, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.150720477104187, "rewards/margins": 0.40662074089050293, "rewards/rejected": -1.5573410987854004, "step": 3420 }, { "epoch": 1.235813366960908, "grad_norm": 16.248056411743164, "learning_rate": 1.911991890301275e-07, "logits/chosen": 1.603314995765686, "logits/rejected": 1.5976498126983643, "logps/chosen": -158.0511474609375, "logps/rejected": -201.87158203125, "loss": 0.5674, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0927891731262207, "rewards/margins": 0.4202328622341156, "rewards/rejected": -1.5130221843719482, "step": 3430 }, { "epoch": 1.2394163213835345, "grad_norm": 29.743022918701172, "learning_rate": 1.896721043148402e-07, "logits/chosen": 1.6673412322998047, "logits/rejected": 1.6792166233062744, "logps/chosen": -160.25917053222656, "logps/rejected": -200.7294921875, "loss": 0.5779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2035908699035645, "rewards/margins": 0.39745160937309265, "rewards/rejected": -1.6010425090789795, "step": 3440 }, { "epoch": 1.2430192758061611, "grad_norm": 27.176912307739258, "learning_rate": 1.881474060119994e-07, "logits/chosen": 1.6526479721069336, "logits/rejected": 1.6897213459014893, "logps/chosen": -177.30697631835938, "logps/rejected": -229.65560913085938, "loss": 0.5329, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3172175884246826, "rewards/margins": 0.48564568161964417, "rewards/rejected": -1.8028631210327148, "step": 3450 }, { "epoch": 1.2466222302287875, "grad_norm": 36.72285079956055, "learning_rate": 1.866251544346488e-07, "logits/chosen": 1.756397008895874, "logits/rejected": 1.7641305923461914, "logps/chosen": -195.5851287841797, "logps/rejected": -234.1574249267578, "loss": 0.6014, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4670528173446655, "rewards/margins": 0.36591073870658875, "rewards/rejected": -1.8329633474349976, "step": 3460 }, { "epoch": 1.250225184651414, "grad_norm": 22.320993423461914, "learning_rate": 1.8510540979904617e-07, "logits/chosen": 1.8513895273208618, "logits/rejected": 1.8651978969573975, "logps/chosen": -179.46591186523438, "logps/rejected": -221.010986328125, "loss": 0.5725, "rewards/accuracies": 0.75, "rewards/chosen": -1.3116358518600464, "rewards/margins": 0.3873592019081116, "rewards/rejected": -1.6989952325820923, "step": 3470 }, { "epoch": 1.2538281390740407, "grad_norm": 23.453344345092773, "learning_rate": 1.8358823222228096e-07, "logits/chosen": 1.7122529745101929, "logits/rejected": 1.7139049768447876, "logps/chosen": -181.01031494140625, "logps/rejected": -227.14730834960938, "loss": 0.5398, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3143945932388306, "rewards/margins": 0.46285495162010193, "rewards/rejected": -1.7772495746612549, "step": 3480 }, { "epoch": 1.2574310934966673, "grad_norm": 19.974374771118164, "learning_rate": 1.820736817198969e-07, "logits/chosen": 1.8285239934921265, "logits/rejected": 1.845882773399353, "logps/chosen": -195.367431640625, "logps/rejected": -251.32760620117188, "loss": 0.5016, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4235846996307373, "rewards/margins": 0.6087952256202698, "rewards/rejected": -2.0323803424835205, "step": 3490 }, { "epoch": 1.2610340479192939, "grad_norm": 31.72352409362793, "learning_rate": 1.8056181820351735e-07, "logits/chosen": 1.8216755390167236, "logits/rejected": 1.8578245639801025, "logps/chosen": -192.22169494628906, "logps/rejected": -249.3321990966797, "loss": 0.5411, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.42820143699646, "rewards/margins": 0.5573378801345825, "rewards/rejected": -1.985539197921753, "step": 3500 }, { "epoch": 1.2646370023419204, "grad_norm": 26.99710464477539, "learning_rate": 1.790527014784758e-07, "logits/chosen": 1.7859792709350586, "logits/rejected": 1.8002761602401733, "logps/chosen": -193.13339233398438, "logps/rejected": -230.9226531982422, "loss": 0.5891, "rewards/accuracies": 0.625, "rewards/chosen": -1.4562654495239258, "rewards/margins": 0.3704061210155487, "rewards/rejected": -1.8266716003417969, "step": 3510 }, { "epoch": 1.268239956764547, "grad_norm": 24.12320899963379, "learning_rate": 1.7754639124144977e-07, "logits/chosen": 1.3900184631347656, "logits/rejected": 1.4246901273727417, "logps/chosen": -177.86846923828125, "logps/rejected": -229.09835815429688, "loss": 0.5582, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2260959148406982, "rewards/margins": 0.4854293763637543, "rewards/rejected": -1.711525321006775, "step": 3520 }, { "epoch": 1.2718429111871734, "grad_norm": 22.157583236694336, "learning_rate": 1.760429470780994e-07, "logits/chosen": 1.6068884134292603, "logits/rejected": 1.6049220561981201, "logps/chosen": -195.89663696289062, "logps/rejected": -228.62124633789062, "loss": 0.602, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4787858724594116, "rewards/margins": 0.31004488468170166, "rewards/rejected": -1.7888309955596924, "step": 3530 }, { "epoch": 1.2754458656098, "grad_norm": 28.59613609313965, "learning_rate": 1.7454242846071082e-07, "logits/chosen": 1.819340467453003, "logits/rejected": 1.8372827768325806, "logps/chosen": -177.56573486328125, "logps/rejected": -217.68765258789062, "loss": 0.581, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3727514743804932, "rewards/margins": 0.3638874888420105, "rewards/rejected": -1.7366390228271484, "step": 3540 }, { "epoch": 1.2790488200324266, "grad_norm": 13.975077629089355, "learning_rate": 1.7304489474584304e-07, "logits/chosen": 1.8392823934555054, "logits/rejected": 1.8703858852386475, "logps/chosen": -188.02796936035156, "logps/rejected": -227.65371704101562, "loss": 0.5815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.389987826347351, "rewards/margins": 0.3787449598312378, "rewards/rejected": -1.7687326669692993, "step": 3550 }, { "epoch": 1.2826517744550532, "grad_norm": 30.64284324645996, "learning_rate": 1.715504051719804e-07, "logits/chosen": 1.9738810062408447, "logits/rejected": 2.007951498031616, "logps/chosen": -186.0692138671875, "logps/rejected": -219.5050811767578, "loss": 0.5717, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2764512300491333, "rewards/margins": 0.3545674681663513, "rewards/rejected": -1.6310186386108398, "step": 3560 }, { "epoch": 1.2862547288776798, "grad_norm": 42.76008605957031, "learning_rate": 1.7005901885718867e-07, "logits/chosen": 2.18925142288208, "logits/rejected": 2.197112560272217, "logps/chosen": -211.40823364257812, "logps/rejected": -231.84521484375, "loss": 0.6725, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6586414575576782, "rewards/margins": 0.19599632918834686, "rewards/rejected": -1.854637861251831, "step": 3570 }, { "epoch": 1.2898576833003061, "grad_norm": 27.870746612548828, "learning_rate": 1.6857079479677737e-07, "logits/chosen": 2.225771427154541, "logits/rejected": 2.2617292404174805, "logps/chosen": -181.1365966796875, "logps/rejected": -224.7388916015625, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": -1.3274881839752197, "rewards/margins": 0.4119059443473816, "rewards/rejected": -1.739394187927246, "step": 3580 }, { "epoch": 1.2934606377229327, "grad_norm": 48.422306060791016, "learning_rate": 1.670857918609653e-07, "logits/chosen": 2.1853294372558594, "logits/rejected": 2.2130117416381836, "logps/chosen": -202.51756286621094, "logps/rejected": -249.8184051513672, "loss": 0.5853, "rewards/accuracies": 0.75, "rewards/chosen": -1.5900542736053467, "rewards/margins": 0.42613738775253296, "rewards/rejected": -2.0161914825439453, "step": 3590 }, { "epoch": 1.2970635921455593, "grad_norm": 37.77787780761719, "learning_rate": 1.656040687925519e-07, "logits/chosen": 1.9480301141738892, "logits/rejected": 1.9540107250213623, "logps/chosen": -200.03726196289062, "logps/rejected": -242.9154510498047, "loss": 0.591, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5215644836425781, "rewards/margins": 0.3655272424221039, "rewards/rejected": -1.8870916366577148, "step": 3600 }, { "epoch": 1.300666546568186, "grad_norm": 25.00056266784668, "learning_rate": 1.641256842045942e-07, "logits/chosen": 2.2472565174102783, "logits/rejected": 2.2626872062683105, "logps/chosen": -182.2674102783203, "logps/rejected": -223.0990447998047, "loss": 0.541, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3268883228302002, "rewards/margins": 0.4107748866081238, "rewards/rejected": -1.7376632690429688, "step": 3610 }, { "epoch": 1.3042695009908125, "grad_norm": 25.82343292236328, "learning_rate": 1.6265069657808728e-07, "logits/chosen": 2.3760194778442383, "logits/rejected": 2.397968053817749, "logps/chosen": -194.54000854492188, "logps/rejected": -240.69888305664062, "loss": 0.5759, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5028674602508545, "rewards/margins": 0.40088844299316406, "rewards/rejected": -1.903755784034729, "step": 3620 }, { "epoch": 1.307872455413439, "grad_norm": 19.218997955322266, "learning_rate": 1.6117916425965157e-07, "logits/chosen": 2.402834892272949, "logits/rejected": 2.4300312995910645, "logps/chosen": -207.3802490234375, "logps/rejected": -253.8350830078125, "loss": 0.5656, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6533803939819336, "rewards/margins": 0.43287163972854614, "rewards/rejected": -2.086251735687256, "step": 3630 }, { "epoch": 1.3114754098360657, "grad_norm": 37.13401794433594, "learning_rate": 1.5971114545922475e-07, "logits/chosen": 2.393101453781128, "logits/rejected": 2.4327123165130615, "logps/chosen": -183.045166015625, "logps/rejected": -239.2936553955078, "loss": 0.5543, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4144227504730225, "rewards/margins": 0.5289689898490906, "rewards/rejected": -1.943392038345337, "step": 3640 }, { "epoch": 1.315078364258692, "grad_norm": 21.490629196166992, "learning_rate": 1.5824669824775866e-07, "logits/chosen": 2.260507822036743, "logits/rejected": 2.273637056350708, "logps/chosen": -193.7061004638672, "logps/rejected": -234.71377563476562, "loss": 0.5882, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4560885429382324, "rewards/margins": 0.4170869290828705, "rewards/rejected": -1.8731753826141357, "step": 3650 }, { "epoch": 1.3186813186813187, "grad_norm": 23.327041625976562, "learning_rate": 1.5678588055492286e-07, "logits/chosen": 2.1389849185943604, "logits/rejected": 2.1763854026794434, "logps/chosen": -172.3188018798828, "logps/rejected": -217.666259765625, "loss": 0.5715, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2966365814208984, "rewards/margins": 0.4119698405265808, "rewards/rejected": -1.708606481552124, "step": 3660 }, { "epoch": 1.3222842731039453, "grad_norm": 33.65090560913086, "learning_rate": 1.5532875016681247e-07, "logits/chosen": 2.0638270378112793, "logits/rejected": 2.0853710174560547, "logps/chosen": -195.6360321044922, "logps/rejected": -230.1496124267578, "loss": 0.5884, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.422572374343872, "rewards/margins": 0.35662734508514404, "rewards/rejected": -1.7791998386383057, "step": 3670 }, { "epoch": 1.3258872275265718, "grad_norm": 27.418167114257812, "learning_rate": 1.5387536472366275e-07, "logits/chosen": 1.9518663883209229, "logits/rejected": 1.9620939493179321, "logps/chosen": -174.151123046875, "logps/rejected": -207.51699829101562, "loss": 0.6117, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3327616453170776, "rewards/margins": 0.31053417921066284, "rewards/rejected": -1.6432958841323853, "step": 3680 }, { "epoch": 1.3294901819491982, "grad_norm": 24.672998428344727, "learning_rate": 1.5242578171756864e-07, "logits/chosen": 1.8104326725006104, "logits/rejected": 1.8332993984222412, "logps/chosen": -172.15753173828125, "logps/rejected": -208.52212524414062, "loss": 0.5907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3121302127838135, "rewards/margins": 0.3558919131755829, "rewards/rejected": -1.6680221557617188, "step": 3690 }, { "epoch": 1.3330931363718248, "grad_norm": 26.359895706176758, "learning_rate": 1.5098005849021078e-07, "logits/chosen": 1.8233158588409424, "logits/rejected": 1.8709800243377686, "logps/chosen": -188.22592163085938, "logps/rejected": -236.8144073486328, "loss": 0.5635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4036380052566528, "rewards/margins": 0.44843512773513794, "rewards/rejected": -1.852073073387146, "step": 3700 }, { "epoch": 1.3366960907944514, "grad_norm": 29.816444396972656, "learning_rate": 1.495382522305872e-07, "logits/chosen": 1.8397347927093506, "logits/rejected": 1.8576186895370483, "logps/chosen": -191.6383819580078, "logps/rejected": -232.81326293945312, "loss": 0.5587, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.405597448348999, "rewards/margins": 0.3850526213645935, "rewards/rejected": -1.7906501293182373, "step": 3710 }, { "epoch": 1.340299045217078, "grad_norm": 23.615833282470703, "learning_rate": 1.4810041997275092e-07, "logits/chosen": 1.8601493835449219, "logits/rejected": 1.8741000890731812, "logps/chosen": -184.04672241210938, "logps/rejected": -241.630126953125, "loss": 0.5135, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3591305017471313, "rewards/margins": 0.5281540155410767, "rewards/rejected": -1.887284517288208, "step": 3720 }, { "epoch": 1.3439019996397046, "grad_norm": 20.799488067626953, "learning_rate": 1.4666661859355404e-07, "logits/chosen": 2.1675801277160645, "logits/rejected": 2.1676344871520996, "logps/chosen": -183.47018432617188, "logps/rejected": -213.0686492919922, "loss": 0.6005, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4248483180999756, "rewards/margins": 0.26402515172958374, "rewards/rejected": -1.688873529434204, "step": 3730 }, { "epoch": 1.3475049540623312, "grad_norm": 28.859333038330078, "learning_rate": 1.452369048103976e-07, "logits/chosen": 1.9734678268432617, "logits/rejected": 1.9968979358673096, "logps/chosen": -192.2145538330078, "logps/rejected": -248.0688018798828, "loss": 0.5195, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4926023483276367, "rewards/margins": 0.5251239538192749, "rewards/rejected": -2.017726421356201, "step": 3740 }, { "epoch": 1.3511079084849578, "grad_norm": 18.08772087097168, "learning_rate": 1.4381133517898803e-07, "logits/chosen": 2.115337610244751, "logits/rejected": 2.1246225833892822, "logps/chosen": -196.42152404785156, "logps/rejected": -234.7654266357422, "loss": 0.5767, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4249114990234375, "rewards/margins": 0.39205366373062134, "rewards/rejected": -1.816965103149414, "step": 3750 }, { "epoch": 1.3547108629075844, "grad_norm": 33.767269134521484, "learning_rate": 1.423899660911005e-07, "logits/chosen": 1.9451555013656616, "logits/rejected": 1.9652153253555298, "logps/chosen": -189.8253631591797, "logps/rejected": -238.01486206054688, "loss": 0.5512, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4437839984893799, "rewards/margins": 0.4582470953464508, "rewards/rejected": -1.9020313024520874, "step": 3760 }, { "epoch": 1.3583138173302107, "grad_norm": 21.92230796813965, "learning_rate": 1.4097285377234724e-07, "logits/chosen": 1.7935683727264404, "logits/rejected": 1.811952829360962, "logps/chosen": -189.38926696777344, "logps/rejected": -227.97006225585938, "loss": 0.5781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.397295355796814, "rewards/margins": 0.3482345938682556, "rewards/rejected": -1.7455298900604248, "step": 3770 }, { "epoch": 1.3619167717528373, "grad_norm": 27.76797103881836, "learning_rate": 1.395600542799542e-07, "logits/chosen": 2.068068742752075, "logits/rejected": 2.0708858966827393, "logps/chosen": -191.26097106933594, "logps/rejected": -233.2442169189453, "loss": 0.5504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4574092626571655, "rewards/margins": 0.39677366614341736, "rewards/rejected": -1.8541829586029053, "step": 3780 }, { "epoch": 1.365519726175464, "grad_norm": 27.542654037475586, "learning_rate": 1.381516235005433e-07, "logits/chosen": 2.1321825981140137, "logits/rejected": 2.1544833183288574, "logps/chosen": -203.79705810546875, "logps/rejected": -246.9839324951172, "loss": 0.5738, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5344202518463135, "rewards/margins": 0.42250028252601624, "rewards/rejected": -1.9569206237792969, "step": 3790 }, { "epoch": 1.3691226805980905, "grad_norm": 24.526832580566406, "learning_rate": 1.367476171479215e-07, "logits/chosen": 2.1942076683044434, "logits/rejected": 2.220576524734497, "logps/chosen": -180.60507202148438, "logps/rejected": -225.52011108398438, "loss": 0.5746, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4094579219818115, "rewards/margins": 0.4005855917930603, "rewards/rejected": -1.8100435733795166, "step": 3800 }, { "epoch": 1.3727256350207169, "grad_norm": 44.125572204589844, "learning_rate": 1.3534809076087732e-07, "logits/chosen": 1.9495807886123657, "logits/rejected": 1.999224066734314, "logps/chosen": -195.178466796875, "logps/rejected": -241.162109375, "loss": 0.5579, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4358845949172974, "rewards/margins": 0.4407084584236145, "rewards/rejected": -1.876592993736267, "step": 3810 }, { "epoch": 1.3763285894433435, "grad_norm": 24.335865020751953, "learning_rate": 1.3395309970098342e-07, "logits/chosen": 2.3719897270202637, "logits/rejected": 2.4065325260162354, "logps/chosen": -202.8979034423828, "logps/rejected": -255.8293914794922, "loss": 0.5364, "rewards/accuracies": 0.6875, "rewards/chosen": -1.622004747390747, "rewards/margins": 0.4795509874820709, "rewards/rejected": -2.101555347442627, "step": 3820 }, { "epoch": 1.37993154386597, "grad_norm": 18.380468368530273, "learning_rate": 1.3256269915040736e-07, "logits/chosen": 2.3298912048339844, "logits/rejected": 2.3397748470306396, "logps/chosen": -200.35983276367188, "logps/rejected": -234.7159881591797, "loss": 0.5998, "rewards/accuracies": 0.625, "rewards/chosen": -1.5939356088638306, "rewards/margins": 0.3117136061191559, "rewards/rejected": -1.905648946762085, "step": 3830 }, { "epoch": 1.3835344982885966, "grad_norm": 27.52941131591797, "learning_rate": 1.3117694410972747e-07, "logits/chosen": 2.283259868621826, "logits/rejected": 2.3234059810638428, "logps/chosen": -183.7685546875, "logps/rejected": -246.74740600585938, "loss": 0.529, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4562674760818481, "rewards/margins": 0.5746394991874695, "rewards/rejected": -2.030907154083252, "step": 3840 }, { "epoch": 1.3871374527112232, "grad_norm": 31.34552001953125, "learning_rate": 1.2979588939575878e-07, "logits/chosen": 2.1807518005371094, "logits/rejected": 2.2002217769622803, "logps/chosen": -208.7446746826172, "logps/rejected": -253.20889282226562, "loss": 0.584, "rewards/accuracies": 0.75, "rewards/chosen": -1.6211105585098267, "rewards/margins": 0.4426586627960205, "rewards/rejected": -2.0637693405151367, "step": 3850 }, { "epoch": 1.3907404071338498, "grad_norm": 49.54113006591797, "learning_rate": 1.2841958963938338e-07, "logits/chosen": 2.1136391162872314, "logits/rejected": 2.0968828201293945, "logps/chosen": -206.9363250732422, "logps/rejected": -228.25143432617188, "loss": 0.6461, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4934473037719727, "rewards/margins": 0.2033785581588745, "rewards/rejected": -1.6968257427215576, "step": 3860 }, { "epoch": 1.3943433615564764, "grad_norm": 37.7128791809082, "learning_rate": 1.2704809928338957e-07, "logits/chosen": 2.224104404449463, "logits/rejected": 2.2668509483337402, "logps/chosen": -191.69552612304688, "logps/rejected": -232.6199951171875, "loss": 0.5649, "rewards/accuracies": 0.75, "rewards/chosen": -1.503103494644165, "rewards/margins": 0.4015834331512451, "rewards/rejected": -1.9046871662139893, "step": 3870 }, { "epoch": 1.3979463159791028, "grad_norm": 25.900611877441406, "learning_rate": 1.2568147258031897e-07, "logits/chosen": 1.8822847604751587, "logits/rejected": 1.9027903079986572, "logps/chosen": -187.94454956054688, "logps/rejected": -226.4380340576172, "loss": 0.5925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.40830659866333, "rewards/margins": 0.36093783378601074, "rewards/rejected": -1.7692444324493408, "step": 3880 }, { "epoch": 1.4015492704017294, "grad_norm": 26.54643440246582, "learning_rate": 1.2431976359031955e-07, "logits/chosen": 1.8965444564819336, "logits/rejected": 1.9167616367340088, "logps/chosen": -186.21804809570312, "logps/rejected": -238.50595092773438, "loss": 0.5416, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4133446216583252, "rewards/margins": 0.4906320571899414, "rewards/rejected": -1.9039767980575562, "step": 3890 }, { "epoch": 1.405152224824356, "grad_norm": 20.78183937072754, "learning_rate": 1.2296302617900768e-07, "logits/chosen": 2.143287181854248, "logits/rejected": 2.1438541412353516, "logps/chosen": -198.9349365234375, "logps/rejected": -233.54049682617188, "loss": 0.6188, "rewards/accuracies": 0.6875, "rewards/chosen": -1.544627070426941, "rewards/margins": 0.3243458569049835, "rewards/rejected": -1.8689727783203125, "step": 3900 }, { "epoch": 1.4087551792469826, "grad_norm": 25.700965881347656, "learning_rate": 1.216113140153371e-07, "logits/chosen": 2.0946247577667236, "logits/rejected": 2.117295742034912, "logps/chosen": -196.2801513671875, "logps/rejected": -242.2544403076172, "loss": 0.5527, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5123510360717773, "rewards/margins": 0.4487608075141907, "rewards/rejected": -1.9611120223999023, "step": 3910 }, { "epoch": 1.4123581336696092, "grad_norm": 33.97287368774414, "learning_rate": 1.2026468056947606e-07, "logits/chosen": 2.178783655166626, "logits/rejected": 2.2043185234069824, "logps/chosen": -201.91873168945312, "logps/rejected": -244.51504516601562, "loss": 0.6021, "rewards/accuracies": 0.625, "rewards/chosen": -1.573336124420166, "rewards/margins": 0.3733164072036743, "rewards/rejected": -1.9466526508331299, "step": 3920 }, { "epoch": 1.4159610880922355, "grad_norm": 45.7540397644043, "learning_rate": 1.189231791106921e-07, "logits/chosen": 2.130858898162842, "logits/rejected": 2.180258274078369, "logps/chosen": -201.24057006835938, "logps/rejected": -257.926025390625, "loss": 0.5199, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6287132501602173, "rewards/margins": 0.5473740696907043, "rewards/rejected": -2.1760871410369873, "step": 3930 }, { "epoch": 1.4195640425148621, "grad_norm": 27.437519073486328, "learning_rate": 1.1758686270524482e-07, "logits/chosen": 2.3916800022125244, "logits/rejected": 2.4437804222106934, "logps/chosen": -217.48770141601562, "logps/rejected": -262.7460632324219, "loss": 0.5762, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7124818563461304, "rewards/margins": 0.4045422077178955, "rewards/rejected": -2.1170241832733154, "step": 3940 }, { "epoch": 1.4231669969374887, "grad_norm": 28.69942855834961, "learning_rate": 1.1625578421428714e-07, "logits/chosen": 2.472696304321289, "logits/rejected": 2.467665433883667, "logps/chosen": -226.68984985351562, "logps/rejected": -254.80654907226562, "loss": 0.6573, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8046505451202393, "rewards/margins": 0.23238249123096466, "rewards/rejected": -2.0370326042175293, "step": 3950 }, { "epoch": 1.4267699513601153, "grad_norm": 30.90325355529785, "learning_rate": 1.149299962917733e-07, "logits/chosen": 2.5207648277282715, "logits/rejected": 2.524153470993042, "logps/chosen": -208.6819610595703, "logps/rejected": -243.366943359375, "loss": 0.6122, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.645262360572815, "rewards/margins": 0.32572299242019653, "rewards/rejected": -1.9709851741790771, "step": 3960 }, { "epoch": 1.430372905782742, "grad_norm": 28.970237731933594, "learning_rate": 1.1360955138237699e-07, "logits/chosen": 2.6872897148132324, "logits/rejected": 2.7066454887390137, "logps/chosen": -217.373046875, "logps/rejected": -254.2002716064453, "loss": 0.5834, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.753053069114685, "rewards/margins": 0.342947393655777, "rewards/rejected": -2.0960006713867188, "step": 3970 }, { "epoch": 1.4339758602053685, "grad_norm": 40.99421310424805, "learning_rate": 1.1229450171941657e-07, "logits/chosen": 2.7422609329223633, "logits/rejected": 2.7955024242401123, "logps/chosen": -219.7352294921875, "logps/rejected": -280.3626708984375, "loss": 0.5177, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7397339344024658, "rewards/margins": 0.6051377058029175, "rewards/rejected": -2.3448715209960938, "step": 3980 }, { "epoch": 1.437578814627995, "grad_norm": 30.1541748046875, "learning_rate": 1.109848993227881e-07, "logits/chosen": 2.5172581672668457, "logits/rejected": 2.5636343955993652, "logps/chosen": -218.80911254882812, "logps/rejected": -269.53973388671875, "loss": 0.5514, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7507011890411377, "rewards/margins": 0.4547833800315857, "rewards/rejected": -2.205484390258789, "step": 3990 }, { "epoch": 1.4411817690506215, "grad_norm": 34.5133056640625, "learning_rate": 1.0968079599690872e-07, "logits/chosen": 2.8047642707824707, "logits/rejected": 2.835019826889038, "logps/chosen": -232.9544219970703, "logps/rejected": -268.1980285644531, "loss": 0.6157, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9134280681610107, "rewards/margins": 0.30228549242019653, "rewards/rejected": -2.2157135009765625, "step": 4000 }, { "epoch": 1.444784723473248, "grad_norm": 37.54106140136719, "learning_rate": 1.083822433286666e-07, "logits/chosen": 2.6304919719696045, "logits/rejected": 2.652879476547241, "logps/chosen": -206.3196258544922, "logps/rejected": -249.50363159179688, "loss": 0.5796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6504005193710327, "rewards/margins": 0.40803390741348267, "rewards/rejected": -2.058434247970581, "step": 4010 }, { "epoch": 1.4483876778958746, "grad_norm": 19.438796997070312, "learning_rate": 1.0708929268538034e-07, "logits/chosen": 2.2060623168945312, "logits/rejected": 2.231776714324951, "logps/chosen": -215.81393432617188, "logps/rejected": -249.65823364257812, "loss": 0.6158, "rewards/accuracies": 0.625, "rewards/chosen": -1.6890411376953125, "rewards/margins": 0.3048761487007141, "rewards/rejected": -1.9939172267913818, "step": 4020 }, { "epoch": 1.4519906323185012, "grad_norm": 30.540685653686523, "learning_rate": 1.0580199521276759e-07, "logits/chosen": 2.5210537910461426, "logits/rejected": 2.549612045288086, "logps/chosen": -206.7338409423828, "logps/rejected": -244.7657012939453, "loss": 0.5878, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6120946407318115, "rewards/margins": 0.36665600538253784, "rewards/rejected": -1.9787505865097046, "step": 4030 }, { "epoch": 1.4555935867411276, "grad_norm": 23.23627471923828, "learning_rate": 1.0452040183292124e-07, "logits/chosen": 2.3309028148651123, "logits/rejected": 2.3343138694763184, "logps/chosen": -208.15908813476562, "logps/rejected": -247.7962188720703, "loss": 0.5666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.594066858291626, "rewards/margins": 0.38210588693618774, "rewards/rejected": -1.9761728048324585, "step": 4040 }, { "epoch": 1.4591965411637542, "grad_norm": 21.690418243408203, "learning_rate": 1.0324456324229536e-07, "logits/chosen": 2.2826833724975586, "logits/rejected": 2.3174784183502197, "logps/chosen": -195.55740356445312, "logps/rejected": -230.21023559570312, "loss": 0.6036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4546035528182983, "rewards/margins": 0.3235613703727722, "rewards/rejected": -1.7781648635864258, "step": 4050 }, { "epoch": 1.4627994955863808, "grad_norm": 24.094581604003906, "learning_rate": 1.0197452990969976e-07, "logits/chosen": 2.2778327465057373, "logits/rejected": 2.343728542327881, "logps/chosen": -198.56021118164062, "logps/rejected": -256.511962890625, "loss": 0.5293, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5641119480133057, "rewards/margins": 0.533170223236084, "rewards/rejected": -2.0972819328308105, "step": 4060 }, { "epoch": 1.4664024500090074, "grad_norm": 23.411500930786133, "learning_rate": 1.007103520743035e-07, "logits/chosen": 2.2860405445098877, "logits/rejected": 2.3017578125, "logps/chosen": -208.67245483398438, "logps/rejected": -249.4423370361328, "loss": 0.596, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6604540348052979, "rewards/margins": 0.3682071268558502, "rewards/rejected": -2.0286612510681152, "step": 4070 }, { "epoch": 1.470005404431634, "grad_norm": 29.012025833129883, "learning_rate": 9.945207974364767e-08, "logits/chosen": 2.3106510639190674, "logits/rejected": 2.344446897506714, "logps/chosen": -243.06356811523438, "logps/rejected": -285.7310791015625, "loss": 0.6411, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.8531125783920288, "rewards/margins": 0.3922838568687439, "rewards/rejected": -2.245396614074707, "step": 4080 }, { "epoch": 1.4736083588542606, "grad_norm": 31.737157821655273, "learning_rate": 9.819976269166704e-08, "logits/chosen": 2.3555893898010254, "logits/rejected": 2.340897798538208, "logps/chosen": -219.1803436279297, "logps/rejected": -248.2200927734375, "loss": 0.603, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6209685802459717, "rewards/margins": 0.2904869616031647, "rewards/rejected": -1.911455512046814, "step": 4090 }, { "epoch": 1.4772113132768872, "grad_norm": 37.465389251708984, "learning_rate": 9.695345045672165e-08, "logits/chosen": 2.235996961593628, "logits/rejected": 2.2625551223754883, "logps/chosen": -198.73208618164062, "logps/rejected": -233.3943328857422, "loss": 0.5865, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5233110189437866, "rewards/margins": 0.3367288410663605, "rewards/rejected": -1.8600399494171143, "step": 4100 }, { "epoch": 1.4808142676995135, "grad_norm": 28.036596298217773, "learning_rate": 9.571319233963626e-08, "logits/chosen": 2.2205023765563965, "logits/rejected": 2.242121934890747, "logps/chosen": -189.00392150878906, "logps/rejected": -233.1159210205078, "loss": 0.5559, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4463272094726562, "rewards/margins": 0.44084176421165466, "rewards/rejected": -1.8871688842773438, "step": 4110 }, { "epoch": 1.4844172221221401, "grad_norm": 36.313133239746094, "learning_rate": 9.447903740175098e-08, "logits/chosen": 2.470679759979248, "logits/rejected": 2.494894504547119, "logps/chosen": -187.43682861328125, "logps/rejected": -241.8269805908203, "loss": 0.5278, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4297406673431396, "rewards/margins": 0.5476259589195251, "rewards/rejected": -1.9773668050765991, "step": 4120 }, { "epoch": 1.4880201765447667, "grad_norm": 32.384525299072266, "learning_rate": 9.325103446298038e-08, "logits/chosen": 2.352541208267212, "logits/rejected": 2.365428924560547, "logps/chosen": -214.4003448486328, "logps/rejected": -244.2249298095703, "loss": 0.633, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6720874309539795, "rewards/margins": 0.27814793586730957, "rewards/rejected": -1.9502356052398682, "step": 4130 }, { "epoch": 1.4916231309673933, "grad_norm": 36.610294342041016, "learning_rate": 9.202923209988197e-08, "logits/chosen": 2.142672538757324, "logits/rejected": 2.1541152000427246, "logps/chosen": -198.27978515625, "logps/rejected": -231.7624053955078, "loss": 0.5759, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.429612398147583, "rewards/margins": 0.342256098985672, "rewards/rejected": -1.7718684673309326, "step": 4140 }, { "epoch": 1.4952260853900199, "grad_norm": 20.335296630859375, "learning_rate": 9.081367864373488e-08, "logits/chosen": 2.2970311641693115, "logits/rejected": 2.3052754402160645, "logps/chosen": -192.21348571777344, "logps/rejected": -225.3434295654297, "loss": 0.6121, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3674556016921997, "rewards/margins": 0.31879502534866333, "rewards/rejected": -1.6862504482269287, "step": 4150 }, { "epoch": 1.4988290398126463, "grad_norm": 25.644750595092773, "learning_rate": 8.960442217862795e-08, "logits/chosen": 2.054680347442627, "logits/rejected": 2.0955567359924316, "logps/chosen": -194.7618408203125, "logps/rejected": -247.1341094970703, "loss": 0.5412, "rewards/accuracies": 0.75, "rewards/chosen": -1.515158772468567, "rewards/margins": 0.46768832206726074, "rewards/rejected": -1.9828474521636963, "step": 4160 }, { "epoch": 1.5024319942352728, "grad_norm": 38.148651123046875, "learning_rate": 8.840151053955772e-08, "logits/chosen": 2.0653910636901855, "logits/rejected": 2.0864741802215576, "logps/chosen": -205.56201171875, "logps/rejected": -238.72677612304688, "loss": 0.6198, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6179357767105103, "rewards/margins": 0.32318201661109924, "rewards/rejected": -1.9411180019378662, "step": 4170 }, { "epoch": 1.5060349486578994, "grad_norm": 28.420183181762695, "learning_rate": 8.720499131053611e-08, "logits/chosen": 2.002610921859741, "logits/rejected": 2.0557868480682373, "logps/chosen": -190.8556671142578, "logps/rejected": -230.8711395263672, "loss": 0.589, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4824676513671875, "rewards/margins": 0.3686225414276123, "rewards/rejected": -1.8510901927947998, "step": 4180 }, { "epoch": 1.509637903080526, "grad_norm": 23.488773345947266, "learning_rate": 8.601491182270812e-08, "logits/chosen": 2.133180618286133, "logits/rejected": 2.159903049468994, "logps/chosen": -210.1831512451172, "logps/rejected": -258.10443115234375, "loss": 0.5667, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6593990325927734, "rewards/margins": 0.46231454610824585, "rewards/rejected": -2.121713638305664, "step": 4190 }, { "epoch": 1.5132408575031526, "grad_norm": 25.544527053833008, "learning_rate": 8.483131915247967e-08, "logits/chosen": 2.097330093383789, "logits/rejected": 2.1371865272521973, "logps/chosen": -188.57901000976562, "logps/rejected": -234.85470581054688, "loss": 0.5559, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4768812656402588, "rewards/margins": 0.4339800775051117, "rewards/rejected": -1.910861611366272, "step": 4200 }, { "epoch": 1.5168438119257792, "grad_norm": 19.057374954223633, "learning_rate": 8.365426011965512e-08, "logits/chosen": 2.0573668479919434, "logits/rejected": 2.072871685028076, "logps/chosen": -207.1567840576172, "logps/rejected": -246.02230834960938, "loss": 0.6044, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5855404138565063, "rewards/margins": 0.3767652213573456, "rewards/rejected": -1.9623054265975952, "step": 4210 }, { "epoch": 1.5204467663484058, "grad_norm": 27.889890670776367, "learning_rate": 8.248378128558564e-08, "logits/chosen": 2.236063003540039, "logits/rejected": 2.2808563709259033, "logps/chosen": -213.96591186523438, "logps/rejected": -264.50653076171875, "loss": 0.5562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.730071783065796, "rewards/margins": 0.4675087332725525, "rewards/rejected": -2.1975808143615723, "step": 4220 }, { "epoch": 1.5240497207710324, "grad_norm": 20.943641662597656, "learning_rate": 8.131992895132692e-08, "logits/chosen": 1.8917369842529297, "logits/rejected": 1.8827037811279297, "logps/chosen": -192.46511840820312, "logps/rejected": -225.7458953857422, "loss": 0.5849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3797410726547241, "rewards/margins": 0.3336414694786072, "rewards/rejected": -1.7133827209472656, "step": 4230 }, { "epoch": 1.5276526751936588, "grad_norm": 26.538253784179688, "learning_rate": 8.016274915580753e-08, "logits/chosen": 2.1079792976379395, "logits/rejected": 2.124335289001465, "logps/chosen": -211.16259765625, "logps/rejected": -263.89190673828125, "loss": 0.5333, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5972684621810913, "rewards/margins": 0.5083234906196594, "rewards/rejected": -2.1055920124053955, "step": 4240 }, { "epoch": 1.5312556296162854, "grad_norm": 32.481163024902344, "learning_rate": 7.901228767400858e-08, "logits/chosen": 2.065208911895752, "logits/rejected": 2.0380001068115234, "logps/chosen": -196.34857177734375, "logps/rejected": -219.82095336914062, "loss": 0.6259, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4603230953216553, "rewards/margins": 0.22774550318717957, "rewards/rejected": -1.6880686283111572, "step": 4250 }, { "epoch": 1.534858584038912, "grad_norm": 21.735862731933594, "learning_rate": 7.786859001515195e-08, "logits/chosen": 1.9773706197738647, "logits/rejected": 2.003389835357666, "logps/chosen": -187.12619018554688, "logps/rejected": -237.146484375, "loss": 0.5525, "rewards/accuracies": 0.75, "rewards/chosen": -1.378788709640503, "rewards/margins": 0.4737245440483093, "rewards/rejected": -1.852513313293457, "step": 4260 }, { "epoch": 1.5384615384615383, "grad_norm": 26.040084838867188, "learning_rate": 7.673170142090075e-08, "logits/chosen": 1.9840046167373657, "logits/rejected": 2.0193214416503906, "logps/chosen": -206.76101684570312, "logps/rejected": -261.0201110839844, "loss": 0.5485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6018749475479126, "rewards/margins": 0.5224472284317017, "rewards/rejected": -2.1243224143981934, "step": 4270 }, { "epoch": 1.542064492884165, "grad_norm": 31.891942977905273, "learning_rate": 7.560166686356928e-08, "logits/chosen": 1.8930866718292236, "logits/rejected": 1.939308762550354, "logps/chosen": -208.3704833984375, "logps/rejected": -253.8648223876953, "loss": 0.5499, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4735658168792725, "rewards/margins": 0.44235682487487793, "rewards/rejected": -1.9159224033355713, "step": 4280 }, { "epoch": 1.5456674473067915, "grad_norm": 26.410696029663086, "learning_rate": 7.447853104434438e-08, "logits/chosen": 1.9173721075057983, "logits/rejected": 1.9585317373275757, "logps/chosen": -216.7724151611328, "logps/rejected": -256.5429992675781, "loss": 0.5841, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6213337182998657, "rewards/margins": 0.3730444312095642, "rewards/rejected": -1.9943780899047852, "step": 4290 }, { "epoch": 1.549270401729418, "grad_norm": 27.23445701599121, "learning_rate": 7.336233839151692e-08, "logits/chosen": 2.2593255043029785, "logits/rejected": 2.3057758808135986, "logps/chosen": -213.1826934814453, "logps/rejected": -269.57177734375, "loss": 0.5471, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6645898818969727, "rewards/margins": 0.5269854664802551, "rewards/rejected": -2.191575527191162, "step": 4300 }, { "epoch": 1.5528733561520447, "grad_norm": 25.672992706298828, "learning_rate": 7.225313305872438e-08, "logits/chosen": 2.176313877105713, "logits/rejected": 2.2026896476745605, "logps/chosen": -217.30252075195312, "logps/rejected": -263.0103759765625, "loss": 0.556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6082637310028076, "rewards/margins": 0.454536110162735, "rewards/rejected": -2.0627999305725098, "step": 4310 }, { "epoch": 1.5564763105746713, "grad_norm": 26.70905876159668, "learning_rate": 7.115095892320455e-08, "logits/chosen": 2.4024858474731445, "logits/rejected": 2.4109010696411133, "logps/chosen": -224.5513458251953, "logps/rejected": -261.7074279785156, "loss": 0.5765, "rewards/accuracies": 0.625, "rewards/chosen": -1.7521905899047852, "rewards/margins": 0.3836340308189392, "rewards/rejected": -2.135824680328369, "step": 4320 }, { "epoch": 1.5600792649972979, "grad_norm": 22.603137969970703, "learning_rate": 7.005585958405916e-08, "logits/chosen": 2.3563637733459473, "logits/rejected": 2.3754074573516846, "logps/chosen": -221.39016723632812, "logps/rejected": -266.6629638671875, "loss": 0.5719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7421430349349976, "rewards/margins": 0.42091941833496094, "rewards/rejected": -2.163062572479248, "step": 4330 }, { "epoch": 1.5636822194199245, "grad_norm": 47.77539825439453, "learning_rate": 6.896787836052992e-08, "logits/chosen": 2.3739821910858154, "logits/rejected": 2.3638415336608887, "logps/chosen": -212.63833618164062, "logps/rejected": -248.03408813476562, "loss": 0.6251, "rewards/accuracies": 0.625, "rewards/chosen": -1.7163009643554688, "rewards/margins": 0.32989609241485596, "rewards/rejected": -2.0461974143981934, "step": 4340 }, { "epoch": 1.5672851738425508, "grad_norm": 22.239276885986328, "learning_rate": 6.788705829028482e-08, "logits/chosen": 2.0460848808288574, "logits/rejected": 2.074023485183716, "logps/chosen": -205.8352508544922, "logps/rejected": -244.93984985351562, "loss": 0.5871, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4822351932525635, "rewards/margins": 0.38295218348503113, "rewards/rejected": -1.8651872873306274, "step": 4350 }, { "epoch": 1.5708881282651774, "grad_norm": 28.040822982788086, "learning_rate": 6.681344212771506e-08, "logits/chosen": 2.334414005279541, "logits/rejected": 2.349910020828247, "logps/chosen": -198.33810424804688, "logps/rejected": -238.9496307373047, "loss": 0.6032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5019086599349976, "rewards/margins": 0.3424648642539978, "rewards/rejected": -1.8443737030029297, "step": 4360 }, { "epoch": 1.574491082687804, "grad_norm": 32.481414794921875, "learning_rate": 6.574707234224466e-08, "logits/chosen": 2.114077568054199, "logits/rejected": 2.119075298309326, "logps/chosen": -201.87498474121094, "logps/rejected": -227.68698120117188, "loss": 0.6725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5666872262954712, "rewards/margins": 0.22483928501605988, "rewards/rejected": -1.791526436805725, "step": 4370 }, { "epoch": 1.5780940371104304, "grad_norm": 22.595666885375977, "learning_rate": 6.468799111665003e-08, "logits/chosen": 2.0177810192108154, "logits/rejected": 2.061736583709717, "logps/chosen": -200.3575439453125, "logps/rejected": -249.4969940185547, "loss": 0.5581, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4850465059280396, "rewards/margins": 0.43845659494400024, "rewards/rejected": -1.9235031604766846, "step": 4380 }, { "epoch": 1.581696991533057, "grad_norm": 33.4980354309082, "learning_rate": 6.363624034539097e-08, "logits/chosen": 2.0352072715759277, "logits/rejected": 2.0526909828186035, "logps/chosen": -211.1829833984375, "logps/rejected": -253.79928588867188, "loss": 0.5572, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5847043991088867, "rewards/margins": 0.398946613073349, "rewards/rejected": -1.9836509227752686, "step": 4390 }, { "epoch": 1.5852999459556836, "grad_norm": 18.39958953857422, "learning_rate": 6.259186163295438e-08, "logits/chosen": 2.5003159046173096, "logits/rejected": 2.5333242416381836, "logps/chosen": -197.83883666992188, "logps/rejected": -253.9361572265625, "loss": 0.5233, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5224320888519287, "rewards/margins": 0.5308869481086731, "rewards/rejected": -2.053318977355957, "step": 4400 }, { "epoch": 1.5889029003783102, "grad_norm": 30.835023880004883, "learning_rate": 6.155489629220764e-08, "logits/chosen": 2.2001023292541504, "logits/rejected": 2.235380172729492, "logps/chosen": -195.66595458984375, "logps/rejected": -237.34512329101562, "loss": 0.5547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4750816822052002, "rewards/margins": 0.4272097051143646, "rewards/rejected": -1.9022915363311768, "step": 4410 }, { "epoch": 1.5925058548009368, "grad_norm": 28.24887466430664, "learning_rate": 6.052538534276477e-08, "logits/chosen": 2.1940560340881348, "logits/rejected": 2.260021686553955, "logps/chosen": -194.35580444335938, "logps/rejected": -256.4817810058594, "loss": 0.5122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4525573253631592, "rewards/margins": 0.603963315486908, "rewards/rejected": -2.056520700454712, "step": 4420 }, { "epoch": 1.5961088092235634, "grad_norm": 28.577600479125977, "learning_rate": 5.9503369509363774e-08, "logits/chosen": 2.009065866470337, "logits/rejected": 2.0373148918151855, "logps/chosen": -209.2212677001953, "logps/rejected": -250.5079803466797, "loss": 0.5754, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.497430443763733, "rewards/margins": 0.37567323446273804, "rewards/rejected": -1.8731034994125366, "step": 4430 }, { "epoch": 1.59971176364619, "grad_norm": 43.857566833496094, "learning_rate": 5.848888922025552e-08, "logits/chosen": 2.3759193420410156, "logits/rejected": 2.396794319152832, "logps/chosen": -204.43545532226562, "logps/rejected": -258.35150146484375, "loss": 0.5784, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5501829385757446, "rewards/margins": 0.4931480288505554, "rewards/rejected": -2.043330669403076, "step": 4440 }, { "epoch": 1.6033147180688165, "grad_norm": 34.31081008911133, "learning_rate": 5.748198460560475e-08, "logits/chosen": 2.1870875358581543, "logits/rejected": 2.210813522338867, "logps/chosen": -226.90933227539062, "logps/rejected": -270.5358581542969, "loss": 0.6091, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7758804559707642, "rewards/margins": 0.4144059121608734, "rewards/rejected": -2.19028639793396, "step": 4450 }, { "epoch": 1.6069176724914431, "grad_norm": 42.93614959716797, "learning_rate": 5.648269549590232e-08, "logits/chosen": 2.306990146636963, "logits/rejected": 2.348219156265259, "logps/chosen": -213.69424438476562, "logps/rejected": -251.96255493164062, "loss": 0.6047, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6652253866195679, "rewards/margins": 0.3560051918029785, "rewards/rejected": -2.021230697631836, "step": 4460 }, { "epoch": 1.6105206269140695, "grad_norm": 28.934322357177734, "learning_rate": 5.5491061420390174e-08, "logits/chosen": 2.3856258392333984, "logits/rejected": 2.384432315826416, "logps/chosen": -220.46621704101562, "logps/rejected": -264.6500549316406, "loss": 0.5652, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7540113925933838, "rewards/margins": 0.39775171875953674, "rewards/rejected": -2.1517632007598877, "step": 4470 }, { "epoch": 1.614123581336696, "grad_norm": 27.708112716674805, "learning_rate": 5.4507121605496726e-08, "logits/chosen": 2.4679114818573, "logits/rejected": 2.505674362182617, "logps/chosen": -178.9739227294922, "logps/rejected": -243.4705810546875, "loss": 0.5198, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.408857822418213, "rewards/margins": 0.6008615493774414, "rewards/rejected": -2.0097193717956543, "step": 4480 }, { "epoch": 1.6177265357593227, "grad_norm": 43.39290237426758, "learning_rate": 5.353091497328627e-08, "logits/chosen": 2.1443047523498535, "logits/rejected": 2.1748502254486084, "logps/chosen": -196.84353637695312, "logps/rejected": -246.3999481201172, "loss": 0.5834, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5358701944351196, "rewards/margins": 0.4669255316257477, "rewards/rejected": -2.002795696258545, "step": 4490 }, { "epoch": 1.621329490181949, "grad_norm": 35.661983489990234, "learning_rate": 5.256248013991857e-08, "logits/chosen": 2.0213568210601807, "logits/rejected": 2.0241341590881348, "logps/chosen": -203.96005249023438, "logps/rejected": -242.9303741455078, "loss": 0.6014, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4897011518478394, "rewards/margins": 0.3591596186161041, "rewards/rejected": -1.8488609790802002, "step": 4500 }, { "epoch": 1.6249324446045756, "grad_norm": 21.74781608581543, "learning_rate": 5.1601855414121295e-08, "logits/chosen": 2.185598373413086, "logits/rejected": 2.2083213329315186, "logps/chosen": -185.43930053710938, "logps/rejected": -227.32546997070312, "loss": 0.5827, "rewards/accuracies": 0.6875, "rewards/chosen": -1.39814293384552, "rewards/margins": 0.3859577178955078, "rewards/rejected": -1.7841007709503174, "step": 4510 }, { "epoch": 1.6285353990272022, "grad_norm": 21.03277587890625, "learning_rate": 5.064907879567526e-08, "logits/chosen": 2.1720356941223145, "logits/rejected": 2.1843976974487305, "logps/chosen": -183.34188842773438, "logps/rejected": -229.08407592773438, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3856041431427002, "rewards/margins": 0.4459781050682068, "rewards/rejected": -1.8315823078155518, "step": 4520 }, { "epoch": 1.6321383534498288, "grad_norm": 29.443302154541016, "learning_rate": 4.9704187973910624e-08, "logits/chosen": 2.2658417224884033, "logits/rejected": 2.2824199199676514, "logps/chosen": -208.009033203125, "logps/rejected": -248.68954467773438, "loss": 0.5848, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6338236331939697, "rewards/margins": 0.3810350000858307, "rewards/rejected": -2.0148587226867676, "step": 4530 }, { "epoch": 1.6357413078724554, "grad_norm": 32.813499450683594, "learning_rate": 4.87672203262163e-08, "logits/chosen": 2.2142345905303955, "logits/rejected": 2.2316298484802246, "logps/chosen": -198.617431640625, "logps/rejected": -236.8822784423828, "loss": 0.5863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5111747980117798, "rewards/margins": 0.37488654255867004, "rewards/rejected": -1.8860610723495483, "step": 4540 }, { "epoch": 1.639344262295082, "grad_norm": 70.49163818359375, "learning_rate": 4.7838212916561285e-08, "logits/chosen": 2.2292842864990234, "logits/rejected": 2.255504608154297, "logps/chosen": -216.76748657226562, "logps/rejected": -242.6904296875, "loss": 0.6585, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.6865644454956055, "rewards/margins": 0.22057318687438965, "rewards/rejected": -1.9071376323699951, "step": 4550 }, { "epoch": 1.6429472167177086, "grad_norm": 26.967416763305664, "learning_rate": 4.691720249402856e-08, "logits/chosen": 1.9996013641357422, "logits/rejected": 2.0293033123016357, "logps/chosen": -189.83139038085938, "logps/rejected": -233.23388671875, "loss": 0.5648, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3463690280914307, "rewards/margins": 0.4225892126560211, "rewards/rejected": -1.768958330154419, "step": 4560 }, { "epoch": 1.6465501711403352, "grad_norm": 19.056949615478516, "learning_rate": 4.600422549136137e-08, "logits/chosen": 2.1994102001190186, "logits/rejected": 2.231595039367676, "logps/chosen": -199.77468872070312, "logps/rejected": -232.41598510742188, "loss": 0.5955, "rewards/accuracies": 0.625, "rewards/chosen": -1.4999521970748901, "rewards/margins": 0.3468267619609833, "rewards/rejected": -1.8467791080474854, "step": 4570 }, { "epoch": 1.6501531255629618, "grad_norm": 28.43031883239746, "learning_rate": 4.50993180235221e-08, "logits/chosen": 2.098292350769043, "logits/rejected": 2.1165525913238525, "logps/chosen": -199.96417236328125, "logps/rejected": -235.9426727294922, "loss": 0.5796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.478283166885376, "rewards/margins": 0.3656356930732727, "rewards/rejected": -1.843918800354004, "step": 4580 }, { "epoch": 1.6537560799855882, "grad_norm": 58.72772979736328, "learning_rate": 4.4202515886263725e-08, "logits/chosen": 2.0434346199035645, "logits/rejected": 2.074063301086426, "logps/chosen": -190.58383178710938, "logps/rejected": -238.36990356445312, "loss": 0.5493, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4896175861358643, "rewards/margins": 0.4576866626739502, "rewards/rejected": -1.947304368019104, "step": 4590 }, { "epoch": 1.6573590344082147, "grad_norm": 36.73121643066406, "learning_rate": 4.331385455471345e-08, "logits/chosen": 2.244147539138794, "logits/rejected": 2.270183801651001, "logps/chosen": -186.93829345703125, "logps/rejected": -219.9781951904297, "loss": 0.6086, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.395939588546753, "rewards/margins": 0.3456757068634033, "rewards/rejected": -1.7416152954101562, "step": 4600 }, { "epoch": 1.6609619888308413, "grad_norm": 30.627487182617188, "learning_rate": 4.24333691819698e-08, "logits/chosen": 2.131470203399658, "logits/rejected": 2.138305187225342, "logps/chosen": -183.6460418701172, "logps/rejected": -226.09512329101562, "loss": 0.557, "rewards/accuracies": 0.75, "rewards/chosen": -1.2923738956451416, "rewards/margins": 0.4197824001312256, "rewards/rejected": -1.7121562957763672, "step": 4610 }, { "epoch": 1.6645649432534677, "grad_norm": 30.62818145751953, "learning_rate": 4.156109459771215e-08, "logits/chosen": 2.113499879837036, "logits/rejected": 2.1298279762268066, "logps/chosen": -193.1403045654297, "logps/rejected": -234.14138793945312, "loss": 0.5866, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3887965679168701, "rewards/margins": 0.40222710371017456, "rewards/rejected": -1.7910236120224, "step": 4620 }, { "epoch": 1.6681678976760943, "grad_norm": 31.726381301879883, "learning_rate": 4.069706530682232e-08, "logits/chosen": 1.8225421905517578, "logits/rejected": 1.8487510681152344, "logps/chosen": -185.66934204101562, "logps/rejected": -235.89761352539062, "loss": 0.5964, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3873560428619385, "rewards/margins": 0.4824633002281189, "rewards/rejected": -1.8698192834854126, "step": 4630 }, { "epoch": 1.671770852098721, "grad_norm": 30.31300163269043, "learning_rate": 3.984131548802047e-08, "logits/chosen": 2.002676248550415, "logits/rejected": 2.043703556060791, "logps/chosen": -178.9305877685547, "logps/rejected": -227.54110717773438, "loss": 0.5445, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3820874691009521, "rewards/margins": 0.4529193937778473, "rewards/rejected": -1.8350070714950562, "step": 4640 }, { "epoch": 1.6753738065213475, "grad_norm": 42.443992614746094, "learning_rate": 3.899387899251241e-08, "logits/chosen": 2.156654119491577, "logits/rejected": 2.172597646713257, "logps/chosen": -218.76962280273438, "logps/rejected": -264.9418640136719, "loss": 0.5746, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7126739025115967, "rewards/margins": 0.4262749254703522, "rewards/rejected": -2.138948917388916, "step": 4650 }, { "epoch": 1.678976760943974, "grad_norm": 19.931983947753906, "learning_rate": 3.8154789342650955e-08, "logits/chosen": 2.096226692199707, "logits/rejected": 2.1523149013519287, "logps/chosen": -206.48068237304688, "logps/rejected": -258.7948913574219, "loss": 0.5776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7011810541152954, "rewards/margins": 0.5019563436508179, "rewards/rejected": -2.2031373977661133, "step": 4660 }, { "epoch": 1.6825797153666007, "grad_norm": 25.14623260498047, "learning_rate": 3.732407973060964e-08, "logits/chosen": 2.0920228958129883, "logits/rejected": 2.109365940093994, "logps/chosen": -191.2772674560547, "logps/rejected": -234.9695281982422, "loss": 0.5615, "rewards/accuracies": 0.75, "rewards/chosen": -1.5086921453475952, "rewards/margins": 0.4210229516029358, "rewards/rejected": -1.9297151565551758, "step": 4670 }, { "epoch": 1.6861826697892273, "grad_norm": 24.882339477539062, "learning_rate": 3.6501783017069823e-08, "logits/chosen": 2.0588390827178955, "logits/rejected": 2.108018159866333, "logps/chosen": -195.41616821289062, "logps/rejected": -265.99078369140625, "loss": 0.4962, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5025618076324463, "rewards/margins": 0.675110936164856, "rewards/rejected": -2.1776726245880127, "step": 4680 }, { "epoch": 1.6897856242118539, "grad_norm": 42.46844482421875, "learning_rate": 3.5687931729920825e-08, "logits/chosen": 2.229187488555908, "logits/rejected": 2.2838330268859863, "logps/chosen": -199.95668029785156, "logps/rejected": -246.3704376220703, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4860082864761353, "rewards/margins": 0.44565218687057495, "rewards/rejected": -1.9316606521606445, "step": 4690 }, { "epoch": 1.6933885786344802, "grad_norm": 31.575082778930664, "learning_rate": 3.488255806297311e-08, "logits/chosen": 2.2462501525878906, "logits/rejected": 2.277681589126587, "logps/chosen": -220.80087280273438, "logps/rejected": -274.53033447265625, "loss": 0.5585, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6551021337509155, "rewards/margins": 0.5508934855461121, "rewards/rejected": -2.205995559692383, "step": 4700 }, { "epoch": 1.6969915330571068, "grad_norm": 37.62460708618164, "learning_rate": 3.408569387468488e-08, "logits/chosen": 2.237877130508423, "logits/rejected": 2.329503059387207, "logps/chosen": -186.99349975585938, "logps/rejected": -279.9960632324219, "loss": 0.4353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4569637775421143, "rewards/margins": 0.8750921487808228, "rewards/rejected": -2.3320560455322266, "step": 4710 }, { "epoch": 1.7005944874797334, "grad_norm": 34.484046936035156, "learning_rate": 3.3297370686901834e-08, "logits/chosen": 2.1620168685913086, "logits/rejected": 2.177347421646118, "logps/chosen": -207.050048828125, "logps/rejected": -247.9182891845703, "loss": 0.5654, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5944541692733765, "rewards/margins": 0.38728809356689453, "rewards/rejected": -1.9817421436309814, "step": 4720 }, { "epoch": 1.7041974419023598, "grad_norm": 30.381084442138672, "learning_rate": 3.2517619683610084e-08, "logits/chosen": 2.3808302879333496, "logits/rejected": 2.377898693084717, "logps/chosen": -217.5401611328125, "logps/rejected": -243.4801788330078, "loss": 0.6114, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6837667226791382, "rewards/margins": 0.27860647439956665, "rewards/rejected": -1.96237313747406, "step": 4730 }, { "epoch": 1.7078003963249864, "grad_norm": 34.90840148925781, "learning_rate": 3.174647170970296e-08, "logits/chosen": 2.1793081760406494, "logits/rejected": 2.217268466949463, "logps/chosen": -209.4663848876953, "logps/rejected": -256.37432861328125, "loss": 0.5456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.565969705581665, "rewards/margins": 0.471188485622406, "rewards/rejected": -2.037158250808716, "step": 4740 }, { "epoch": 1.711403350747613, "grad_norm": 27.377452850341797, "learning_rate": 3.0983957269760496e-08, "logits/chosen": 2.2203779220581055, "logits/rejected": 2.252307415008545, "logps/chosen": -224.4615478515625, "logps/rejected": -269.70343017578125, "loss": 0.6179, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7553997039794922, "rewards/margins": 0.43910399079322815, "rewards/rejected": -2.1945037841796875, "step": 4750 }, { "epoch": 1.7150063051702396, "grad_norm": 25.391387939453125, "learning_rate": 3.023010652684277e-08, "logits/chosen": 2.140564441680908, "logits/rejected": 2.171532154083252, "logps/chosen": -222.165771484375, "logps/rejected": -273.32904052734375, "loss": 0.5767, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7117284536361694, "rewards/margins": 0.4794246554374695, "rewards/rejected": -2.191153049468994, "step": 4760 }, { "epoch": 1.7186092595928661, "grad_norm": 33.57196044921875, "learning_rate": 2.9484949301297163e-08, "logits/chosen": 2.471834897994995, "logits/rejected": 2.542451858520508, "logps/chosen": -205.19735717773438, "logps/rejected": -266.74700927734375, "loss": 0.5333, "rewards/accuracies": 0.6875, "rewards/chosen": -1.626854658126831, "rewards/margins": 0.5961812734603882, "rewards/rejected": -2.223036050796509, "step": 4770 }, { "epoch": 1.7222122140154927, "grad_norm": 35.74752426147461, "learning_rate": 2.874851506957815e-08, "logits/chosen": 2.269533157348633, "logits/rejected": 2.283372402191162, "logps/chosen": -209.5435333251953, "logps/rejected": -235.3410186767578, "loss": 0.6232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.60595703125, "rewards/margins": 0.23182055354118347, "rewards/rejected": -1.8377774953842163, "step": 4780 }, { "epoch": 1.7258151684381193, "grad_norm": 34.54887771606445, "learning_rate": 2.8020832963081774e-08, "logits/chosen": 2.1971676349639893, "logits/rejected": 2.2423644065856934, "logps/chosen": -210.47946166992188, "logps/rejected": -253.7831573486328, "loss": 0.5701, "rewards/accuracies": 0.625, "rewards/chosen": -1.5512707233428955, "rewards/margins": 0.4195014536380768, "rewards/rejected": -1.97077214717865, "step": 4790 }, { "epoch": 1.729418122860746, "grad_norm": 36.821964263916016, "learning_rate": 2.7301931766992913e-08, "logits/chosen": 2.4112257957458496, "logits/rejected": 2.466730833053589, "logps/chosen": -228.7645721435547, "logps/rejected": -291.2528076171875, "loss": 0.5122, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.857338547706604, "rewards/margins": 0.5832494497299194, "rewards/rejected": -2.4405877590179443, "step": 4800 }, { "epoch": 1.7330210772833725, "grad_norm": 34.31934356689453, "learning_rate": 2.659183991914696e-08, "logits/chosen": 2.199960470199585, "logits/rejected": 2.2429251670837402, "logps/chosen": -218.9283905029297, "logps/rejected": -259.98016357421875, "loss": 0.5876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.725171685218811, "rewards/margins": 0.38315922021865845, "rewards/rejected": -2.108330726623535, "step": 4810 }, { "epoch": 1.7366240317059989, "grad_norm": 21.790250778198242, "learning_rate": 2.5890585508904578e-08, "logits/chosen": 2.387810230255127, "logits/rejected": 2.4175963401794434, "logps/chosen": -209.4477996826172, "logps/rejected": -250.9538116455078, "loss": 0.5913, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6332486867904663, "rewards/margins": 0.3776470124721527, "rewards/rejected": -2.0108959674835205, "step": 4820 }, { "epoch": 1.7402269861286255, "grad_norm": 25.149259567260742, "learning_rate": 2.519819627604078e-08, "logits/chosen": 2.5020625591278076, "logits/rejected": 2.5205273628234863, "logps/chosen": -220.3013458251953, "logps/rejected": -256.93267822265625, "loss": 0.6108, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.761733055114746, "rewards/margins": 0.36151832342147827, "rewards/rejected": -2.123251438140869, "step": 4830 }, { "epoch": 1.743829940551252, "grad_norm": 22.088537216186523, "learning_rate": 2.4514699609647637e-08, "logits/chosen": 2.372863292694092, "logits/rejected": 2.4213290214538574, "logps/chosen": -214.58407592773438, "logps/rejected": -275.7231750488281, "loss": 0.525, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.712545394897461, "rewards/margins": 0.5982009172439575, "rewards/rejected": -2.310746431350708, "step": 4840 }, { "epoch": 1.7474328949738784, "grad_norm": 40.956947326660156, "learning_rate": 2.3840122547050478e-08, "logits/chosen": 2.3160908222198486, "logits/rejected": 2.3820090293884277, "logps/chosen": -228.3021697998047, "logps/rejected": -283.82330322265625, "loss": 0.5659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8226938247680664, "rewards/margins": 0.5088420510292053, "rewards/rejected": -2.331535816192627, "step": 4850 }, { "epoch": 1.751035849396505, "grad_norm": 25.419803619384766, "learning_rate": 2.317449177273889e-08, "logits/chosen": 2.342552661895752, "logits/rejected": 2.3440239429473877, "logps/chosen": -203.36520385742188, "logps/rejected": -241.7134552001953, "loss": 0.6071, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.56822669506073, "rewards/margins": 0.3357328176498413, "rewards/rejected": -1.9039596319198608, "step": 4860 }, { "epoch": 1.7546388038191316, "grad_norm": 58.29960250854492, "learning_rate": 2.2517833617310855e-08, "logits/chosen": 2.3843963146209717, "logits/rejected": 2.4130828380584717, "logps/chosen": -209.44369506835938, "logps/rejected": -253.1976776123047, "loss": 0.5787, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.635347604751587, "rewards/margins": 0.4348020553588867, "rewards/rejected": -2.0701496601104736, "step": 4870 }, { "epoch": 1.7582417582417582, "grad_norm": 30.26983070373535, "learning_rate": 2.1870174056430962e-08, "logits/chosen": 2.278834819793701, "logits/rejected": 2.3181471824645996, "logps/chosen": -221.67153930664062, "logps/rejected": -263.6853942871094, "loss": 0.5966, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7323360443115234, "rewards/margins": 0.3889417052268982, "rewards/rejected": -2.1212778091430664, "step": 4880 }, { "epoch": 1.7618447126643848, "grad_norm": 30.307025909423828, "learning_rate": 2.1231538709803488e-08, "logits/chosen": 2.2428359985351562, "logits/rejected": 2.2310616970062256, "logps/chosen": -211.75650024414062, "logps/rejected": -231.0408935546875, "loss": 0.6407, "rewards/accuracies": 0.625, "rewards/chosen": -1.603784203529358, "rewards/margins": 0.200571209192276, "rewards/rejected": -1.8043553829193115, "step": 4890 }, { "epoch": 1.7654476670870114, "grad_norm": 33.48064422607422, "learning_rate": 2.0601952840158364e-08, "logits/chosen": 2.2225899696350098, "logits/rejected": 2.257615566253662, "logps/chosen": -206.1726531982422, "logps/rejected": -253.6875, "loss": 0.5507, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.616708517074585, "rewards/margins": 0.46390867233276367, "rewards/rejected": -2.0806171894073486, "step": 4900 }, { "epoch": 1.769050621509638, "grad_norm": 34.95090866088867, "learning_rate": 1.9981441352252187e-08, "logits/chosen": 2.3539671897888184, "logits/rejected": 2.405588150024414, "logps/chosen": -202.20733642578125, "logps/rejected": -255.64749145507812, "loss": 0.5593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6130344867706299, "rewards/margins": 0.5116347670555115, "rewards/rejected": -2.124669313430786, "step": 4910 }, { "epoch": 1.7726535759322646, "grad_norm": 64.15029907226562, "learning_rate": 1.9370028791882847e-08, "logits/chosen": 2.24306583404541, "logits/rejected": 2.266782283782959, "logps/chosen": -221.64199829101562, "logps/rejected": -263.9620056152344, "loss": 0.574, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7340548038482666, "rewards/margins": 0.4460780620574951, "rewards/rejected": -2.1801326274871826, "step": 4920 }, { "epoch": 1.7762565303548912, "grad_norm": 40.552127838134766, "learning_rate": 1.8767739344918737e-08, "logits/chosen": 2.121140956878662, "logits/rejected": 2.1537022590637207, "logps/chosen": -209.78128051757812, "logps/rejected": -248.19454956054688, "loss": 0.5906, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5313321352005005, "rewards/margins": 0.3886438012123108, "rewards/rejected": -1.9199758768081665, "step": 4930 }, { "epoch": 1.7798594847775175, "grad_norm": 39.017791748046875, "learning_rate": 1.8174596836341928e-08, "logits/chosen": 2.2085771560668945, "logits/rejected": 2.248157024383545, "logps/chosen": -212.8624725341797, "logps/rejected": -260.98992919921875, "loss": 0.581, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.622436761856079, "rewards/margins": 0.4501896798610687, "rewards/rejected": -2.0726265907287598, "step": 4940 }, { "epoch": 1.7834624392001441, "grad_norm": 35.74030303955078, "learning_rate": 1.75906247293057e-08, "logits/chosen": 2.171400547027588, "logits/rejected": 2.202230215072632, "logps/chosen": -209.862060546875, "logps/rejected": -261.1230773925781, "loss": 0.5805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5814688205718994, "rewards/margins": 0.5131391286849976, "rewards/rejected": -2.0946078300476074, "step": 4950 }, { "epoch": 1.7870653936227707, "grad_norm": 18.447336196899414, "learning_rate": 1.7015846124206535e-08, "logits/chosen": 2.213371753692627, "logits/rejected": 2.2472574710845947, "logps/chosen": -217.60348510742188, "logps/rejected": -272.07012939453125, "loss": 0.5278, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7204748392105103, "rewards/margins": 0.5277159810066223, "rewards/rejected": -2.2481908798217773, "step": 4960 }, { "epoch": 1.790668348045397, "grad_norm": 20.83319091796875, "learning_rate": 1.6450283757770077e-08, "logits/chosen": 2.417060375213623, "logits/rejected": 2.4168012142181396, "logps/chosen": -191.89654541015625, "logps/rejected": -241.69448852539062, "loss": 0.5303, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4415607452392578, "rewards/margins": 0.49853330850601196, "rewards/rejected": -1.940093994140625, "step": 4970 }, { "epoch": 1.7942713024680237, "grad_norm": 41.57540512084961, "learning_rate": 1.58939600021519e-08, "logits/chosen": 2.1984307765960693, "logits/rejected": 2.2478766441345215, "logps/chosen": -198.91310119628906, "logps/rejected": -249.40933227539062, "loss": 0.5479, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5529348850250244, "rewards/margins": 0.47851094603538513, "rewards/rejected": -2.0314459800720215, "step": 4980 }, { "epoch": 1.7978742568906503, "grad_norm": 28.625106811523438, "learning_rate": 1.5346896864052716e-08, "logits/chosen": 2.232046127319336, "logits/rejected": 2.2994542121887207, "logps/chosen": -210.97714233398438, "logps/rejected": -279.3887023925781, "loss": 0.5097, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6779661178588867, "rewards/margins": 0.6225734949111938, "rewards/rejected": -2.300539493560791, "step": 4990 }, { "epoch": 1.8014772113132769, "grad_norm": 26.74551010131836, "learning_rate": 1.4809115983847265e-08, "logits/chosen": 2.254472255706787, "logits/rejected": 2.2978768348693848, "logps/chosen": -209.3938751220703, "logps/rejected": -251.89694213867188, "loss": 0.5982, "rewards/accuracies": 0.75, "rewards/chosen": -1.6444065570831299, "rewards/margins": 0.393837034702301, "rewards/rejected": -2.038243532180786, "step": 5000 }, { "epoch": 1.8050801657359035, "grad_norm": 25.60332489013672, "learning_rate": 1.4280638634728948e-08, "logits/chosen": 2.3615641593933105, "logits/rejected": 2.374450206756592, "logps/chosen": -217.30313110351562, "logps/rejected": -250.3795623779297, "loss": 0.6175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7042795419692993, "rewards/margins": 0.3368242084980011, "rewards/rejected": -2.0411038398742676, "step": 5010 }, { "epoch": 1.80868312015853, "grad_norm": 58.32925796508789, "learning_rate": 1.3761485721867971e-08, "logits/chosen": 2.223480463027954, "logits/rejected": 2.2535018920898438, "logps/chosen": -213.029541015625, "logps/rejected": -260.0022277832031, "loss": 0.5707, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.604480504989624, "rewards/margins": 0.4418935179710388, "rewards/rejected": -2.0463738441467285, "step": 5020 }, { "epoch": 1.8122860745811566, "grad_norm": 28.643110275268555, "learning_rate": 1.3251677781584175e-08, "logits/chosen": 2.3958420753479004, "logits/rejected": 2.4055347442626953, "logps/chosen": -219.23513793945312, "logps/rejected": -259.5078125, "loss": 0.5868, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7029615640640259, "rewards/margins": 0.3803695738315582, "rewards/rejected": -2.0833308696746826, "step": 5030 }, { "epoch": 1.8158890290037832, "grad_norm": 23.900922775268555, "learning_rate": 1.2751234980535318e-08, "logits/chosen": 2.322385311126709, "logits/rejected": 2.367161512374878, "logps/chosen": -218.500244140625, "logps/rejected": -268.5885314941406, "loss": 0.5338, "rewards/accuracies": 0.75, "rewards/chosen": -1.6854898929595947, "rewards/margins": 0.4811466336250305, "rewards/rejected": -2.1666367053985596, "step": 5040 }, { "epoch": 1.8194919834264096, "grad_norm": 21.290176391601562, "learning_rate": 1.2260177114918668e-08, "logits/chosen": 2.3270578384399414, "logits/rejected": 2.421325445175171, "logps/chosen": -185.88186645507812, "logps/rejected": -258.69708251953125, "loss": 0.4842, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4692976474761963, "rewards/margins": 0.7032708525657654, "rewards/rejected": -2.1725687980651855, "step": 5050 }, { "epoch": 1.8230949378490362, "grad_norm": 31.082626342773438, "learning_rate": 1.1778523609688313e-08, "logits/chosen": 2.322352647781372, "logits/rejected": 2.3377273082733154, "logps/chosen": -198.61251831054688, "logps/rejected": -242.9839630126953, "loss": 0.5889, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.474416971206665, "rewards/margins": 0.43179792165756226, "rewards/rejected": -1.9062144756317139, "step": 5060 }, { "epoch": 1.8266978922716628, "grad_norm": 35.41000747680664, "learning_rate": 1.1306293517786613e-08, "logits/chosen": 2.469616413116455, "logits/rejected": 2.534945487976074, "logps/chosen": -222.2001953125, "logps/rejected": -296.61187744140625, "loss": 0.5151, "rewards/accuracies": 0.75, "rewards/chosen": -1.8318326473236084, "rewards/margins": 0.7023247480392456, "rewards/rejected": -2.5341572761535645, "step": 5070 }, { "epoch": 1.8303008466942892, "grad_norm": 17.69406509399414, "learning_rate": 1.0843505519390588e-08, "logits/chosen": 2.348323345184326, "logits/rejected": 2.386146306991577, "logps/chosen": -209.5731658935547, "logps/rejected": -267.9241943359375, "loss": 0.572, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6910864114761353, "rewards/margins": 0.5442907214164734, "rewards/rejected": -2.2353768348693848, "step": 5080 }, { "epoch": 1.8339038011169158, "grad_norm": 56.48704528808594, "learning_rate": 1.039017792117286e-08, "logits/chosen": 2.2373058795928955, "logits/rejected": 2.278470993041992, "logps/chosen": -213.70553588867188, "logps/rejected": -262.6347961425781, "loss": 0.5841, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7332690954208374, "rewards/margins": 0.4511147439479828, "rewards/rejected": -2.1843838691711426, "step": 5090 }, { "epoch": 1.8375067555395423, "grad_norm": 25.610595703125, "learning_rate": 9.946328655577624e-09, "logits/chosen": 2.4761364459991455, "logits/rejected": 2.5152907371520996, "logps/chosen": -200.73094177246094, "logps/rejected": -257.61474609375, "loss": 0.5157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5974736213684082, "rewards/margins": 0.5294596552848816, "rewards/rejected": -2.1269335746765137, "step": 5100 }, { "epoch": 1.841109709962169, "grad_norm": 32.335365295410156, "learning_rate": 9.511975280111329e-09, "logits/chosen": 2.2728469371795654, "logits/rejected": 2.3433430194854736, "logps/chosen": -205.5606231689453, "logps/rejected": -245.2855224609375, "loss": 0.5799, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5625890493392944, "rewards/margins": 0.39551809430122375, "rewards/rejected": -1.9581069946289062, "step": 5110 }, { "epoch": 1.8447126643847955, "grad_norm": 25.807748794555664, "learning_rate": 9.087134976647815e-09, "logits/chosen": 2.431424379348755, "logits/rejected": 2.4303934574127197, "logps/chosen": -213.15231323242188, "logps/rejected": -237.0194854736328, "loss": 0.6707, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.705770492553711, "rewards/margins": 0.2457417994737625, "rewards/rejected": -1.951512336730957, "step": 5120 }, { "epoch": 1.8483156188074221, "grad_norm": 28.861064910888672, "learning_rate": 8.671824550749164e-09, "logits/chosen": 2.2334659099578857, "logits/rejected": 2.2756643295288086, "logps/chosen": -218.04867553710938, "logps/rejected": -268.18658447265625, "loss": 0.542, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6735773086547852, "rewards/margins": 0.47903457283973694, "rewards/rejected": -2.15261173248291, "step": 5130 }, { "epoch": 1.8519185732300487, "grad_norm": 40.80805969238281, "learning_rate": 8.266060431000448e-09, "logits/chosen": 2.0608906745910645, "logits/rejected": 2.1089887619018555, "logps/chosen": -228.1378936767578, "logps/rejected": -280.6064147949219, "loss": 0.5458, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8084704875946045, "rewards/margins": 0.4998833239078522, "rewards/rejected": -2.308354139328003, "step": 5140 }, { "epoch": 1.8555215276526753, "grad_norm": 19.78631591796875, "learning_rate": 7.86985866836004e-09, "logits/chosen": 2.2697739601135254, "logits/rejected": 2.2894599437713623, "logps/chosen": -208.9029998779297, "logps/rejected": -248.81808471679688, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5741163492202759, "rewards/margins": 0.4058263301849365, "rewards/rejected": -1.9799423217773438, "step": 5150 }, { "epoch": 1.859124482075302, "grad_norm": 23.30661964416504, "learning_rate": 7.483234935524802e-09, "logits/chosen": 2.202749729156494, "logits/rejected": 2.246129274368286, "logps/chosen": -207.0511016845703, "logps/rejected": -258.9788818359375, "loss": 0.5434, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6408188343048096, "rewards/margins": 0.5060772895812988, "rewards/rejected": -2.1468958854675293, "step": 5160 }, { "epoch": 1.8627274364979283, "grad_norm": 23.1809024810791, "learning_rate": 7.106204526309944e-09, "logits/chosen": 2.3511722087860107, "logits/rejected": 2.3740787506103516, "logps/chosen": -203.56642150878906, "logps/rejected": -248.69482421875, "loss": 0.5664, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5973598957061768, "rewards/margins": 0.43096810579299927, "rewards/rejected": -2.0283281803131104, "step": 5170 }, { "epoch": 1.8663303909205549, "grad_norm": 25.39525032043457, "learning_rate": 6.738782355044048e-09, "logits/chosen": 2.307525157928467, "logits/rejected": 2.318244695663452, "logps/chosen": -210.04916381835938, "logps/rejected": -256.23248291015625, "loss": 0.561, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.671134352684021, "rewards/margins": 0.44288238883018494, "rewards/rejected": -2.1140167713165283, "step": 5180 }, { "epoch": 1.8699333453431815, "grad_norm": 34.74847412109375, "learning_rate": 6.380982955979192e-09, "logits/chosen": 2.2530720233917236, "logits/rejected": 2.2938835620880127, "logps/chosen": -206.26876831054688, "logps/rejected": -263.15106201171875, "loss": 0.549, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.588684320449829, "rewards/margins": 0.5294872522354126, "rewards/rejected": -2.118171453475952, "step": 5190 }, { "epoch": 1.8735362997658078, "grad_norm": 41.298126220703125, "learning_rate": 6.032820482716e-09, "logits/chosen": 2.365787982940674, "logits/rejected": 2.3662896156311035, "logps/chosen": -184.18885803222656, "logps/rejected": -222.0736541748047, "loss": 0.5642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4184465408325195, "rewards/margins": 0.386150062084198, "rewards/rejected": -1.8045965433120728, "step": 5200 }, { "epoch": 1.8771392541884344, "grad_norm": 18.131206512451172, "learning_rate": 5.694308707643619e-09, "logits/chosen": 2.307661294937134, "logits/rejected": 2.3559043407440186, "logps/chosen": -202.79563903808594, "logps/rejected": -245.22146606445312, "loss": 0.563, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5787583589553833, "rewards/margins": 0.4313790202140808, "rewards/rejected": -2.0101375579833984, "step": 5210 }, { "epoch": 1.880742208611061, "grad_norm": 27.24417495727539, "learning_rate": 5.365461021395095e-09, "logits/chosen": 2.4612069129943848, "logits/rejected": 2.503018379211426, "logps/chosen": -223.376708984375, "logps/rejected": -276.1731872558594, "loss": 0.5456, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7949035167694092, "rewards/margins": 0.49945777654647827, "rewards/rejected": -2.2943613529205322, "step": 5220 }, { "epoch": 1.8843451630336876, "grad_norm": 25.800756454467773, "learning_rate": 5.046290432317663e-09, "logits/chosen": 2.2512588500976562, "logits/rejected": 2.3021240234375, "logps/chosen": -230.3580322265625, "logps/rejected": -296.2758483886719, "loss": 0.5168, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.814835786819458, "rewards/margins": 0.640101969242096, "rewards/rejected": -2.454937696456909, "step": 5230 }, { "epoch": 1.8879481174563142, "grad_norm": 30.53504753112793, "learning_rate": 4.736809565958011e-09, "logits/chosen": 2.399794340133667, "logits/rejected": 2.405836582183838, "logps/chosen": -200.7362060546875, "logps/rejected": -243.2240753173828, "loss": 0.6108, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5796430110931396, "rewards/margins": 0.4164234697818756, "rewards/rejected": -1.996066689491272, "step": 5240 }, { "epoch": 1.8915510718789408, "grad_norm": 39.20377731323242, "learning_rate": 4.437030664562968e-09, "logits/chosen": 2.1366195678710938, "logits/rejected": 2.1596555709838867, "logps/chosen": -233.42648315429688, "logps/rejected": -277.30462646484375, "loss": 0.5978, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8780971765518188, "rewards/margins": 0.41063255071640015, "rewards/rejected": -2.288729667663574, "step": 5250 }, { "epoch": 1.8951540263015674, "grad_norm": 39.66998291015625, "learning_rate": 4.14696558659533e-09, "logits/chosen": 2.115039110183716, "logits/rejected": 2.146724224090576, "logps/chosen": -214.2760009765625, "logps/rejected": -255.70059204101562, "loss": 0.5649, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5169379711151123, "rewards/margins": 0.4280829429626465, "rewards/rejected": -1.9450209140777588, "step": 5260 }, { "epoch": 1.898756980724194, "grad_norm": 23.76442527770996, "learning_rate": 3.8666258062645116e-09, "logits/chosen": 2.3175015449523926, "logits/rejected": 2.3244643211364746, "logps/chosen": -207.5288848876953, "logps/rejected": -244.6178741455078, "loss": 0.5804, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5870716571807861, "rewards/margins": 0.3724648356437683, "rewards/rejected": -1.9595365524291992, "step": 5270 }, { "epoch": 1.9023599351468206, "grad_norm": 27.673847198486328, "learning_rate": 3.5960224130728858e-09, "logits/chosen": 2.3952476978302, "logits/rejected": 2.428849458694458, "logps/chosen": -227.6666259765625, "logps/rejected": -280.7477722167969, "loss": 0.5494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7906230688095093, "rewards/margins": 0.5157279372215271, "rewards/rejected": -2.3063511848449707, "step": 5280 }, { "epoch": 1.905962889569447, "grad_norm": 34.72603988647461, "learning_rate": 3.3351661113769914e-09, "logits/chosen": 2.1342246532440186, "logits/rejected": 2.1574902534484863, "logps/chosen": -207.9198455810547, "logps/rejected": -265.78875732421875, "loss": 0.531, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4568283557891846, "rewards/margins": 0.5453433990478516, "rewards/rejected": -2.002171754837036, "step": 5290 }, { "epoch": 1.9095658439920735, "grad_norm": 22.57170867919922, "learning_rate": 3.0840672199641815e-09, "logits/chosen": 2.3103671073913574, "logits/rejected": 2.350656509399414, "logps/chosen": -206.5009765625, "logps/rejected": -256.1815185546875, "loss": 0.5614, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5754340887069702, "rewards/margins": 0.4603274464607239, "rewards/rejected": -2.035761594772339, "step": 5300 }, { "epoch": 1.9131687984147, "grad_norm": 24.769725799560547, "learning_rate": 2.842735671644336e-09, "logits/chosen": 2.4594054222106934, "logits/rejected": 2.4655635356903076, "logps/chosen": -198.40814208984375, "logps/rejected": -242.697021484375, "loss": 0.5656, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4905879497528076, "rewards/margins": 0.4295724928379059, "rewards/rejected": -1.9201602935791016, "step": 5310 }, { "epoch": 1.9167717528373265, "grad_norm": 40.51364517211914, "learning_rate": 2.6111810128570386e-09, "logits/chosen": 2.204953908920288, "logits/rejected": 2.233264207839966, "logps/chosen": -214.0225830078125, "logps/rejected": -269.80133056640625, "loss": 0.5394, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.673193335533142, "rewards/margins": 0.5253087282180786, "rewards/rejected": -2.1985020637512207, "step": 5320 }, { "epoch": 1.920374707259953, "grad_norm": 25.24776840209961, "learning_rate": 2.38941240329385e-09, "logits/chosen": 2.290570020675659, "logits/rejected": 2.324573040008545, "logps/chosen": -220.32772827148438, "logps/rejected": -272.10137939453125, "loss": 0.5699, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7210849523544312, "rewards/margins": 0.48146161437034607, "rewards/rejected": -2.2025465965270996, "step": 5330 }, { "epoch": 1.9239776616825797, "grad_norm": 31.25950813293457, "learning_rate": 2.1774386155361537e-09, "logits/chosen": 2.2316412925720215, "logits/rejected": 2.26287841796875, "logps/chosen": -199.40396118164062, "logps/rejected": -251.3515625, "loss": 0.5369, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.561570405960083, "rewards/margins": 0.49641767144203186, "rewards/rejected": -2.057988166809082, "step": 5340 }, { "epoch": 1.9275806161052063, "grad_norm": 44.36783981323242, "learning_rate": 1.9752680347078774e-09, "logits/chosen": 2.3788223266601562, "logits/rejected": 2.3906877040863037, "logps/chosen": -230.75460815429688, "logps/rejected": -258.0755310058594, "loss": 0.6261, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8339112997055054, "rewards/margins": 0.23464472591876984, "rewards/rejected": -2.0685558319091797, "step": 5350 }, { "epoch": 1.9311835705278328, "grad_norm": 28.646312713623047, "learning_rate": 1.7829086581440667e-09, "logits/chosen": 2.3322763442993164, "logits/rejected": 2.3718745708465576, "logps/chosen": -203.08871459960938, "logps/rejected": -251.14956665039062, "loss": 0.5678, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5246843099594116, "rewards/margins": 0.44258731603622437, "rewards/rejected": -1.9672715663909912, "step": 5360 }, { "epoch": 1.9347865249504594, "grad_norm": 26.19532012939453, "learning_rate": 1.6003680950742726e-09, "logits/chosen": 2.258373260498047, "logits/rejected": 2.3165993690490723, "logps/chosen": -203.53799438476562, "logps/rejected": -256.0785827636719, "loss": 0.5388, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5047380924224854, "rewards/margins": 0.5257495641708374, "rewards/rejected": -2.030487537384033, "step": 5370 }, { "epoch": 1.938389479373086, "grad_norm": 26.616071701049805, "learning_rate": 1.4276535663217682e-09, "logits/chosen": 2.248194456100464, "logits/rejected": 2.282585620880127, "logps/chosen": -198.4256591796875, "logps/rejected": -242.58901977539062, "loss": 0.5664, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4669153690338135, "rewards/margins": 0.4206790328025818, "rewards/rejected": -1.8875945806503296, "step": 5380 }, { "epoch": 1.9419924337957126, "grad_norm": 31.801549911499023, "learning_rate": 1.264771904017803e-09, "logits/chosen": 2.246044635772705, "logits/rejected": 2.3059990406036377, "logps/chosen": -221.56613159179688, "logps/rejected": -281.8612365722656, "loss": 0.5373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7499691247940063, "rewards/margins": 0.5678530931472778, "rewards/rejected": -2.317822217941284, "step": 5390 }, { "epoch": 1.945595388218339, "grad_norm": 43.92122268676758, "learning_rate": 1.1117295513313473e-09, "logits/chosen": 2.4421546459198, "logits/rejected": 2.4628496170043945, "logps/chosen": -227.49951171875, "logps/rejected": -257.37713623046875, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": -1.7916418313980103, "rewards/margins": 0.3117103576660156, "rewards/rejected": -2.1033523082733154, "step": 5400 }, { "epoch": 1.9491983426409656, "grad_norm": 31.931947708129883, "learning_rate": 9.685325622142692e-10, "logits/chosen": 2.235248327255249, "logits/rejected": 2.2608213424682617, "logps/chosen": -201.73275756835938, "logps/rejected": -240.254150390625, "loss": 0.5679, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.563132405281067, "rewards/margins": 0.3913213610649109, "rewards/rejected": -1.9544538259506226, "step": 5410 }, { "epoch": 1.9528012970635922, "grad_norm": 55.04080581665039, "learning_rate": 8.351866011617748e-10, "logits/chosen": 2.243534564971924, "logits/rejected": 2.2653114795684814, "logps/chosen": -208.999755859375, "logps/rejected": -250.69638061523438, "loss": 0.5978, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5816562175750732, "rewards/margins": 0.3955465853214264, "rewards/rejected": -1.9772027730941772, "step": 5420 }, { "epoch": 1.9564042514862185, "grad_norm": 26.250028610229492, "learning_rate": 7.116969429883934e-10, "logits/chosen": 2.1618504524230957, "logits/rejected": 2.240119457244873, "logps/chosen": -201.18685913085938, "logps/rejected": -257.50616455078125, "loss": 0.5461, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.609142541885376, "rewards/margins": 0.511391818523407, "rewards/rejected": -2.1205344200134277, "step": 5430 }, { "epoch": 1.9600072059088451, "grad_norm": 30.596418380737305, "learning_rate": 5.980684726193397e-10, "logits/chosen": 2.4325242042541504, "logits/rejected": 2.4933700561523438, "logps/chosen": -200.2987823486328, "logps/rejected": -264.40252685546875, "loss": 0.5397, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5802122354507446, "rewards/margins": 0.6055213212966919, "rewards/rejected": -2.1857335567474365, "step": 5440 }, { "epoch": 1.9636101603314717, "grad_norm": 24.75813865661621, "learning_rate": 4.943056848972226e-10, "logits/chosen": 2.3395333290100098, "logits/rejected": 2.367810010910034, "logps/chosen": -192.31149291992188, "logps/rejected": -250.6825408935547, "loss": 0.5491, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5191051959991455, "rewards/margins": 0.5737230777740479, "rewards/rejected": -2.0928282737731934, "step": 5450 }, { "epoch": 1.9672131147540983, "grad_norm": 38.39195251464844, "learning_rate": 4.0041268440424434e-10, "logits/chosen": 2.3370611667633057, "logits/rejected": 2.3738551139831543, "logps/chosen": -219.6514892578125, "logps/rejected": -255.0663299560547, "loss": 0.6021, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6766411066055298, "rewards/margins": 0.3405107855796814, "rewards/rejected": -2.0171518325805664, "step": 5460 }, { "epoch": 1.970816069176725, "grad_norm": 33.038150787353516, "learning_rate": 3.163931852998569e-10, "logits/chosen": 2.4940686225891113, "logits/rejected": 2.526848316192627, "logps/chosen": -223.1959228515625, "logps/rejected": -278.0130920410156, "loss": 0.5404, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7955191135406494, "rewards/margins": 0.5243655443191528, "rewards/rejected": -2.319884777069092, "step": 5470 }, { "epoch": 1.9744190235993515, "grad_norm": 56.82768630981445, "learning_rate": 2.4225051117390817e-10, "logits/chosen": 2.2985916137695312, "logits/rejected": 2.349461317062378, "logps/chosen": -239.0338134765625, "logps/rejected": -290.59857177734375, "loss": 0.5749, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9805402755737305, "rewards/margins": 0.48385435342788696, "rewards/rejected": -2.4643945693969727, "step": 5480 }, { "epoch": 1.978021978021978, "grad_norm": 32.097084045410156, "learning_rate": 1.779875949149967e-10, "logits/chosen": 2.401785373687744, "logits/rejected": 2.4758224487304688, "logps/chosen": -204.2624969482422, "logps/rejected": -258.7240905761719, "loss": 0.5555, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6019967794418335, "rewards/margins": 0.47049999237060547, "rewards/rejected": -2.0724966526031494, "step": 5490 }, { "epoch": 1.9816249324446047, "grad_norm": 29.596235275268555, "learning_rate": 1.2360697859462033e-10, "logits/chosen": 2.2443838119506836, "logits/rejected": 2.2928102016448975, "logps/chosen": -213.0929412841797, "logps/rejected": -258.2647705078125, "loss": 0.5449, "rewards/accuracies": 0.75, "rewards/chosen": -1.6028426885604858, "rewards/margins": 0.44829025864601135, "rewards/rejected": -2.051133155822754, "step": 5500 }, { "epoch": 1.9852278868672313, "grad_norm": 29.45462989807129, "learning_rate": 7.911081336656189e-11, "logits/chosen": 2.351747512817383, "logits/rejected": 2.353079319000244, "logps/chosen": -203.29656982421875, "logps/rejected": -239.415771484375, "loss": 0.6046, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.591109037399292, "rewards/margins": 0.346746027469635, "rewards/rejected": -1.9378551244735718, "step": 5510 }, { "epoch": 1.9888308412898577, "grad_norm": 50.27937316894531, "learning_rate": 4.4500859381707553e-11, "logits/chosen": 2.1259727478027344, "logits/rejected": 2.1634602546691895, "logps/chosen": -221.07583618164062, "logps/rejected": -272.86737060546875, "loss": 0.5746, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6605325937271118, "rewards/margins": 0.4804542660713196, "rewards/rejected": -2.140986919403076, "step": 5520 }, { "epoch": 1.9924337957124842, "grad_norm": 29.29747772216797, "learning_rate": 1.9778485718630056e-11, "logits/chosen": 2.165703058242798, "logits/rejected": 2.1946606636047363, "logps/chosen": -227.1041259765625, "logps/rejected": -273.04376220703125, "loss": 0.5728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7531017065048218, "rewards/margins": 0.45691174268722534, "rewards/rejected": -2.2100133895874023, "step": 5530 }, { "epoch": 1.9960367501351108, "grad_norm": 27.07058334350586, "learning_rate": 4.944670329187772e-12, "logits/chosen": 2.43207049369812, "logits/rejected": 2.4914469718933105, "logps/chosen": -199.6597137451172, "logps/rejected": -240.5431671142578, "loss": 0.5931, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5773383378982544, "rewards/margins": 0.39172571897506714, "rewards/rejected": -1.9690639972686768, "step": 5540 }, { "epoch": 1.9996397045577372, "grad_norm": 23.37407112121582, "learning_rate": 0.0, "logits/chosen": 2.2600417137145996, "logits/rejected": 2.296538829803467, "logps/chosen": -213.81613159179688, "logps/rejected": -258.1305847167969, "loss": 0.5881, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6341050863265991, "rewards/margins": 0.4021049439907074, "rewards/rejected": -2.036210060119629, "step": 5550 }, { "epoch": 1.9996397045577372, "step": 5550, "total_flos": 0.0, "train_loss": 0.6103729385513443, "train_runtime": 10082.6993, "train_samples_per_second": 4.404, "train_steps_per_second": 0.55 } ], "logging_steps": 10, "max_steps": 5550, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }