{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9964868029907215, "eval_steps": 800, "global_step": 2079, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014413115935501305, "grad_norm": 15.202939063397405, "learning_rate": 4.807692307692308e-10, "logits/chosen": -2.3378124237060547, "logits/rejected": -2.341672897338867, "logps/chosen": -1.0059865713119507, "logps/rejected": -1.105405569076538, "loss": 1.6556, "rewards/accuracies": 0.5, "rewards/chosen": -2.0119731426239014, "rewards/margins": 0.19883811473846436, "rewards/rejected": -2.210811138153076, "step": 1 }, { "epoch": 0.014413115935501306, "grad_norm": 18.061978045212722, "learning_rate": 4.807692307692308e-09, "logits/chosen": -2.356367826461792, "logits/rejected": -2.3451521396636963, "logps/chosen": -1.0228126049041748, "logps/rejected": -1.1430484056472778, "loss": 1.6323, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": -2.0456252098083496, "rewards/margins": 0.24047136306762695, "rewards/rejected": -2.2860968112945557, "step": 10 }, { "epoch": 0.02882623187100261, "grad_norm": 17.723319596995733, "learning_rate": 9.615384615384615e-09, "logits/chosen": -2.3264236450195312, "logits/rejected": -2.321986198425293, "logps/chosen": -1.0446507930755615, "logps/rejected": -1.1442738771438599, "loss": 1.6729, "rewards/accuracies": 0.59375, "rewards/chosen": -2.089301586151123, "rewards/margins": 0.19924603402614594, "rewards/rejected": -2.2885477542877197, "step": 20 }, { "epoch": 0.04323934780650392, "grad_norm": 17.07010517991476, "learning_rate": 1.442307692307692e-08, "logits/chosen": -2.3456313610076904, "logits/rejected": -2.3424785137176514, "logps/chosen": -1.0158333778381348, "logps/rejected": -1.076974630355835, "loss": 1.7109, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0316667556762695, "rewards/margins": 0.12228262424468994, "rewards/rejected": -2.15394926071167, "step": 30 }, { "epoch": 0.05765246374200522, "grad_norm": 19.711953891202494, "learning_rate": 1.923076923076923e-08, "logits/chosen": -2.383465528488159, "logits/rejected": -2.3750338554382324, "logps/chosen": -1.1377735137939453, "logps/rejected": -1.221296787261963, "loss": 1.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2755470275878906, "rewards/margins": 0.167046457529068, "rewards/rejected": -2.442593574523926, "step": 40 }, { "epoch": 0.07206557967750653, "grad_norm": 15.368731865288492, "learning_rate": 2.403846153846154e-08, "logits/chosen": -2.3631155490875244, "logits/rejected": -2.362963914871216, "logps/chosen": -1.0241036415100098, "logps/rejected": -1.1317743062973022, "loss": 1.6525, "rewards/accuracies": 0.546875, "rewards/chosen": -2.0482072830200195, "rewards/margins": 0.2153414785861969, "rewards/rejected": -2.2635486125946045, "step": 50 }, { "epoch": 0.08647869561300783, "grad_norm": 15.486802435760401, "learning_rate": 2.884615384615384e-08, "logits/chosen": -2.3361105918884277, "logits/rejected": -2.327380657196045, "logps/chosen": -0.9968592524528503, "logps/rejected": -1.0975861549377441, "loss": 1.6565, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9937185049057007, "rewards/margins": 0.2014540731906891, "rewards/rejected": -2.1951723098754883, "step": 60 }, { "epoch": 0.10089181154850914, "grad_norm": 15.988415966234422, "learning_rate": 3.365384615384615e-08, "logits/chosen": -2.3774499893188477, "logits/rejected": -2.3742191791534424, "logps/chosen": -1.028954267501831, "logps/rejected": -1.1373963356018066, "loss": 1.6496, "rewards/accuracies": 0.5625, "rewards/chosen": -2.057908535003662, "rewards/margins": 0.21688416600227356, "rewards/rejected": -2.2747926712036133, "step": 70 }, { "epoch": 0.11530492748401044, "grad_norm": 13.627900414661896, "learning_rate": 3.846153846153846e-08, "logits/chosen": -2.3636672496795654, "logits/rejected": -2.354912757873535, "logps/chosen": -0.9835589528083801, "logps/rejected": -1.1169239282608032, "loss": 1.6095, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9671179056167603, "rewards/margins": 0.2667301595211029, "rewards/rejected": -2.2338478565216064, "step": 80 }, { "epoch": 0.12971804341951176, "grad_norm": 16.99221012864124, "learning_rate": 4.326923076923077e-08, "logits/chosen": -2.3509373664855957, "logits/rejected": -2.3414111137390137, "logps/chosen": -1.0289192199707031, "logps/rejected": -1.1351473331451416, "loss": 1.6614, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0578384399414062, "rewards/margins": 0.21245631575584412, "rewards/rejected": -2.270294666290283, "step": 90 }, { "epoch": 0.14413115935501306, "grad_norm": 16.26579840133319, "learning_rate": 4.807692307692308e-08, "logits/chosen": -2.4182028770446777, "logits/rejected": -2.416335105895996, "logps/chosen": -0.9977607727050781, "logps/rejected": -1.108969807624817, "loss": 1.637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9955215454101562, "rewards/margins": 0.22241799533367157, "rewards/rejected": -2.217939615249634, "step": 100 }, { "epoch": 0.15854427529051437, "grad_norm": 14.431674139311319, "learning_rate": 5.288461538461538e-08, "logits/chosen": -2.342700719833374, "logits/rejected": -2.3403000831604004, "logps/chosen": -1.0405100584030151, "logps/rejected": -1.1639328002929688, "loss": 1.6325, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0810201168060303, "rewards/margins": 0.24684572219848633, "rewards/rejected": -2.3278656005859375, "step": 110 }, { "epoch": 0.17295739122601567, "grad_norm": 16.881846104086076, "learning_rate": 5.769230769230768e-08, "logits/chosen": -2.3760740756988525, "logits/rejected": -2.373129367828369, "logps/chosen": -1.0364916324615479, "logps/rejected": -1.1324373483657837, "loss": 1.67, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.0729832649230957, "rewards/margins": 0.19189123809337616, "rewards/rejected": -2.2648746967315674, "step": 120 }, { "epoch": 0.18737050716151699, "grad_norm": 16.764431844922484, "learning_rate": 6.25e-08, "logits/chosen": -2.3209102153778076, "logits/rejected": -2.3239667415618896, "logps/chosen": -1.0940515995025635, "logps/rejected": -1.1949011087417603, "loss": 1.6633, "rewards/accuracies": 0.5625, "rewards/chosen": -2.188103199005127, "rewards/margins": 0.20169904828071594, "rewards/rejected": -2.3898022174835205, "step": 130 }, { "epoch": 0.20178362309701828, "grad_norm": 17.534779544810593, "learning_rate": 6.73076923076923e-08, "logits/chosen": -2.3762125968933105, "logits/rejected": -2.368044376373291, "logps/chosen": -1.0029666423797607, "logps/rejected": -1.1249053478240967, "loss": 1.6237, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0059332847595215, "rewards/margins": 0.2438771277666092, "rewards/rejected": -2.2498106956481934, "step": 140 }, { "epoch": 0.2161967390325196, "grad_norm": 15.578504627710455, "learning_rate": 7.211538461538461e-08, "logits/chosen": -2.3589887619018555, "logits/rejected": -2.3546345233917236, "logps/chosen": -1.0512168407440186, "logps/rejected": -1.1491758823394775, "loss": 1.6633, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.102433681488037, "rewards/margins": 0.19591817259788513, "rewards/rejected": -2.298351764678955, "step": 150 }, { "epoch": 0.2306098549680209, "grad_norm": 13.745585175489111, "learning_rate": 7.692307692307692e-08, "logits/chosen": -2.338444232940674, "logits/rejected": -2.332979679107666, "logps/chosen": -1.0473906993865967, "logps/rejected": -1.1564788818359375, "loss": 1.6513, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.0947813987731934, "rewards/margins": 0.21817633509635925, "rewards/rejected": -2.312957763671875, "step": 160 }, { "epoch": 0.2450229709035222, "grad_norm": 16.783418396767676, "learning_rate": 8.173076923076923e-08, "logits/chosen": -2.3806934356689453, "logits/rejected": -2.3792760372161865, "logps/chosen": -1.0662988424301147, "logps/rejected": -1.1184349060058594, "loss": 1.7353, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -2.1325976848602295, "rewards/margins": 0.1042722687125206, "rewards/rejected": -2.2368698120117188, "step": 170 }, { "epoch": 0.2594360868390235, "grad_norm": 15.562730291374017, "learning_rate": 8.653846153846154e-08, "logits/chosen": -2.3370161056518555, "logits/rejected": -2.3294992446899414, "logps/chosen": -1.0367413759231567, "logps/rejected": -1.1586549282073975, "loss": 1.6251, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.0734827518463135, "rewards/margins": 0.24382701516151428, "rewards/rejected": -2.317309856414795, "step": 180 }, { "epoch": 0.2738492027745248, "grad_norm": 14.57246304002355, "learning_rate": 9.134615384615383e-08, "logits/chosen": -2.355874538421631, "logits/rejected": -2.357952833175659, "logps/chosen": -1.0316553115844727, "logps/rejected": -1.1332082748413086, "loss": 1.6605, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0633106231689453, "rewards/margins": 0.20310597121715546, "rewards/rejected": -2.266416549682617, "step": 190 }, { "epoch": 0.2882623187100261, "grad_norm": 14.510113595673776, "learning_rate": 9.615384615384616e-08, "logits/chosen": -2.3815228939056396, "logits/rejected": -2.377211332321167, "logps/chosen": -1.0085281133651733, "logps/rejected": -1.0985215902328491, "loss": 1.6684, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0170562267303467, "rewards/margins": 0.17998693883419037, "rewards/rejected": -2.1970431804656982, "step": 200 }, { "epoch": 0.30267543464552743, "grad_norm": 14.940800895121608, "learning_rate": 9.999971806320255e-08, "logits/chosen": -2.4093306064605713, "logits/rejected": -2.4097609519958496, "logps/chosen": -1.0589462518692017, "logps/rejected": -1.1346651315689087, "loss": 1.695, "rewards/accuracies": 0.53125, "rewards/chosen": -2.1178925037384033, "rewards/margins": 0.15143761038780212, "rewards/rejected": -2.2693302631378174, "step": 210 }, { "epoch": 0.31708855058102875, "grad_norm": 15.468071809971288, "learning_rate": 9.998985060913876e-08, "logits/chosen": -2.327671527862549, "logits/rejected": -2.3280539512634277, "logps/chosen": -1.0390589237213135, "logps/rejected": -1.1213579177856445, "loss": 1.6904, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.078117847442627, "rewards/margins": 0.1645977944135666, "rewards/rejected": -2.242715835571289, "step": 220 }, { "epoch": 0.33150166651653007, "grad_norm": 18.885553561709102, "learning_rate": 9.996588949457546e-08, "logits/chosen": -2.3791205883026123, "logits/rejected": -2.3730788230895996, "logps/chosen": -1.156124472618103, "logps/rejected": -1.2356293201446533, "loss": 1.6937, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.312248945236206, "rewards/margins": 0.15900969505310059, "rewards/rejected": -2.4712586402893066, "step": 230 }, { "epoch": 0.34591478245203133, "grad_norm": 18.61654233250297, "learning_rate": 9.992784147488017e-08, "logits/chosen": -2.4054293632507324, "logits/rejected": -2.3909668922424316, "logps/chosen": -1.040718674659729, "logps/rejected": -1.1538527011871338, "loss": 1.6368, "rewards/accuracies": 0.5625, "rewards/chosen": -2.081437349319458, "rewards/margins": 0.22626809775829315, "rewards/rejected": -2.3077054023742676, "step": 240 }, { "epoch": 0.36032789838753265, "grad_norm": 15.133106885435941, "learning_rate": 9.987571727694775e-08, "logits/chosen": -2.377009630203247, "logits/rejected": -2.371063232421875, "logps/chosen": -0.997736930847168, "logps/rejected": -1.1200191974639893, "loss": 1.6202, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -1.995473861694336, "rewards/margins": 0.24456438422203064, "rewards/rejected": -2.2400383949279785, "step": 250 }, { "epoch": 0.37474101432303397, "grad_norm": 15.527267890358452, "learning_rate": 9.98095315961762e-08, "logits/chosen": -2.38106369972229, "logits/rejected": -2.3770012855529785, "logps/chosen": -1.073089838027954, "logps/rejected": -1.1799663305282593, "loss": 1.6494, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.146179676055908, "rewards/margins": 0.2137528359889984, "rewards/rejected": -2.3599326610565186, "step": 260 }, { "epoch": 0.3891541302585353, "grad_norm": 16.699106788545635, "learning_rate": 9.97293030923235e-08, "logits/chosen": -2.3734331130981445, "logits/rejected": -2.36216402053833, "logps/chosen": -1.0048857927322388, "logps/rejected": -1.0962402820587158, "loss": 1.6741, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.0097715854644775, "rewards/margins": 0.18270887434482574, "rewards/rejected": -2.1924805641174316, "step": 270 }, { "epoch": 0.40356724619403656, "grad_norm": 14.595168551654872, "learning_rate": 9.963505438424693e-08, "logits/chosen": -2.340841293334961, "logits/rejected": -2.3415005207061768, "logps/chosen": -1.0379191637039185, "logps/rejected": -1.1280016899108887, "loss": 1.6851, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -2.075838327407837, "rewards/margins": 0.18016524612903595, "rewards/rejected": -2.2560033798217773, "step": 280 }, { "epoch": 0.4179803621295379, "grad_norm": 14.286732447718073, "learning_rate": 9.952681204352607e-08, "logits/chosen": -2.361560821533203, "logits/rejected": -2.3513660430908203, "logps/chosen": -1.0380117893218994, "logps/rejected": -1.1370676755905151, "loss": 1.6637, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -2.076023578643799, "rewards/margins": 0.198111891746521, "rewards/rejected": -2.2741353511810303, "step": 290 }, { "epoch": 0.4323934780650392, "grad_norm": 17.31273729578293, "learning_rate": 9.94046065869715e-08, "logits/chosen": -2.377479314804077, "logits/rejected": -2.375476360321045, "logps/chosen": -1.0271109342575073, "logps/rejected": -1.1700676679611206, "loss": 1.5942, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.0542218685150146, "rewards/margins": 0.2859136462211609, "rewards/rejected": -2.340135335922241, "step": 300 }, { "epoch": 0.4468065940005405, "grad_norm": 17.694546366405458, "learning_rate": 9.926847246802116e-08, "logits/chosen": -2.3561387062072754, "logits/rejected": -2.3444766998291016, "logps/chosen": -1.0410211086273193, "logps/rejected": -1.1159262657165527, "loss": 1.6942, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0820422172546387, "rewards/margins": 0.14981010556221008, "rewards/rejected": -2.2318525314331055, "step": 310 }, { "epoch": 0.4612197099360418, "grad_norm": 13.051339803328997, "learning_rate": 9.911844806702691e-08, "logits/chosen": -2.3585753440856934, "logits/rejected": -2.360156297683716, "logps/chosen": -1.015515923500061, "logps/rejected": -1.1353641748428345, "loss": 1.6286, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.031031847000122, "rewards/margins": 0.23969626426696777, "rewards/rejected": -2.270728349685669, "step": 320 }, { "epoch": 0.4756328258715431, "grad_norm": 17.002852190341585, "learning_rate": 9.895457568043387e-08, "logits/chosen": -2.3824462890625, "logits/rejected": -2.3757641315460205, "logps/chosen": -1.059061050415039, "logps/rejected": -1.14574134349823, "loss": 1.6835, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.118122100830078, "rewards/margins": 0.17336080968379974, "rewards/rejected": -2.29148268699646, "step": 330 }, { "epoch": 0.4900459418070444, "grad_norm": 16.276382330404722, "learning_rate": 9.877690150885587e-08, "logits/chosen": -2.324713945388794, "logits/rejected": -2.314767599105835, "logps/chosen": -1.0457204580307007, "logps/rejected": -1.135799527168274, "loss": 1.6763, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0914409160614014, "rewards/margins": 0.18015804886817932, "rewards/rejected": -2.271599054336548, "step": 340 }, { "epoch": 0.5044590577425457, "grad_norm": 14.679321409845278, "learning_rate": 9.858547564404998e-08, "logits/chosen": -2.368298292160034, "logits/rejected": -2.3589999675750732, "logps/chosen": -1.0575425624847412, "logps/rejected": -1.1802635192871094, "loss": 1.6339, "rewards/accuracies": 0.578125, "rewards/chosen": -2.1150851249694824, "rewards/margins": 0.24544170498847961, "rewards/rejected": -2.3605270385742188, "step": 350 }, { "epoch": 0.518872173678047, "grad_norm": 16.288849210972156, "learning_rate": 9.838035205479418e-08, "logits/chosen": -2.3341236114501953, "logits/rejected": -2.328613042831421, "logps/chosen": -0.9657120704650879, "logps/rejected": -1.0940418243408203, "loss": 1.6196, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9314241409301758, "rewards/margins": 0.25665926933288574, "rewards/rejected": -2.1880836486816406, "step": 360 }, { "epoch": 0.5332852896135484, "grad_norm": 15.065053010351129, "learning_rate": 9.816158857167196e-08, "logits/chosen": -2.3553214073181152, "logits/rejected": -2.3543648719787598, "logps/chosen": -1.017580509185791, "logps/rejected": -1.093390703201294, "loss": 1.706, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.035161018371582, "rewards/margins": 0.15162022411823273, "rewards/rejected": -2.186781406402588, "step": 370 }, { "epoch": 0.5476984055490496, "grad_norm": 15.268674336756646, "learning_rate": 9.7929246870768e-08, "logits/chosen": -2.3563642501831055, "logits/rejected": -2.357172727584839, "logps/chosen": -1.0474622249603271, "logps/rejected": -1.1527016162872314, "loss": 1.6593, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0949244499206543, "rewards/margins": 0.2104784995317459, "rewards/rejected": -2.305403232574463, "step": 380 }, { "epoch": 0.5621115214845509, "grad_norm": 19.51913775076441, "learning_rate": 9.768339245627993e-08, "logits/chosen": -2.329598903656006, "logits/rejected": -2.3325648307800293, "logps/chosen": -1.0032579898834229, "logps/rejected": -1.1267921924591064, "loss": 1.6287, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.0065159797668457, "rewards/margins": 0.24706879258155823, "rewards/rejected": -2.253584384918213, "step": 390 }, { "epoch": 0.5765246374200522, "grad_norm": 15.08719846804436, "learning_rate": 9.742409464205059e-08, "logits/chosen": -2.364119052886963, "logits/rejected": -2.3581573963165283, "logps/chosen": -1.054837942123413, "logps/rejected": -1.1783701181411743, "loss": 1.6358, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.109675884246826, "rewards/margins": 0.2470642328262329, "rewards/rejected": -2.3567402362823486, "step": 400 }, { "epoch": 0.5909377533555535, "grad_norm": 16.155157647324575, "learning_rate": 9.715142653202644e-08, "logits/chosen": -2.347181558609009, "logits/rejected": -2.342615842819214, "logps/chosen": -1.017263650894165, "logps/rejected": -1.1102826595306396, "loss": 1.6768, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.03452730178833, "rewards/margins": 0.18603798747062683, "rewards/rejected": -2.2205653190612793, "step": 410 }, { "epoch": 0.6053508692910549, "grad_norm": 14.897089823744135, "learning_rate": 9.68654649996473e-08, "logits/chosen": -2.364981174468994, "logits/rejected": -2.3646998405456543, "logps/chosen": -1.0181089639663696, "logps/rejected": -1.1212923526763916, "loss": 1.6626, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.0362179279327393, "rewards/margins": 0.206366628408432, "rewards/rejected": -2.242584705352783, "step": 420 }, { "epoch": 0.6197639852265562, "grad_norm": 15.109629627010106, "learning_rate": 9.656629066617335e-08, "logits/chosen": -2.351111650466919, "logits/rejected": -2.3459696769714355, "logps/chosen": -1.1007968187332153, "logps/rejected": -1.1891463994979858, "loss": 1.6834, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.2015936374664307, "rewards/margins": 0.17669954895973206, "rewards/rejected": -2.3782927989959717, "step": 430 }, { "epoch": 0.6341771011620575, "grad_norm": 16.01041357452403, "learning_rate": 9.62539878779556e-08, "logits/chosen": -2.3512957096099854, "logits/rejected": -2.3472342491149902, "logps/chosen": -1.0058082342147827, "logps/rejected": -1.1037191152572632, "loss": 1.6651, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.0116164684295654, "rewards/margins": 0.19582167267799377, "rewards/rejected": -2.2074382305145264, "step": 440 }, { "epoch": 0.6485902170975588, "grad_norm": 18.411662730620584, "learning_rate": 9.592864468265604e-08, "logits/chosen": -2.3800835609436035, "logits/rejected": -2.3797011375427246, "logps/chosen": -1.0755730867385864, "logps/rejected": -1.1656855344772339, "loss": 1.6784, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.151146173477173, "rewards/margins": 0.18022510409355164, "rewards/rejected": -2.3313710689544678, "step": 450 }, { "epoch": 0.6630033330330601, "grad_norm": 17.51219332799835, "learning_rate": 9.559035280442441e-08, "logits/chosen": -2.3352417945861816, "logits/rejected": -2.3331692218780518, "logps/chosen": -1.0036710500717163, "logps/rejected": -1.0872585773468018, "loss": 1.6865, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.0073421001434326, "rewards/margins": 0.16717498004436493, "rewards/rejected": -2.1745171546936035, "step": 460 }, { "epoch": 0.6774164489685613, "grad_norm": 18.31866820732837, "learning_rate": 9.523920761803823e-08, "logits/chosen": -2.3979227542877197, "logits/rejected": -2.399036407470703, "logps/chosen": -1.0747919082641602, "logps/rejected": -1.1746306419372559, "loss": 1.6553, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.1495838165283203, "rewards/margins": 0.1996772736310959, "rewards/rejected": -2.3492612838745117, "step": 470 }, { "epoch": 0.6918295649040627, "grad_norm": 16.80271538537987, "learning_rate": 9.487530812201383e-08, "logits/chosen": -2.35792875289917, "logits/rejected": -2.3569393157958984, "logps/chosen": -1.0264079570770264, "logps/rejected": -1.1486434936523438, "loss": 1.6324, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0528159141540527, "rewards/margins": 0.24447116255760193, "rewards/rejected": -2.2972869873046875, "step": 480 }, { "epoch": 0.706242680839564, "grad_norm": 16.799352219592777, "learning_rate": 9.449875691069571e-08, "logits/chosen": -2.356339931488037, "logits/rejected": -2.354175567626953, "logps/chosen": -1.0335304737091064, "logps/rejected": -1.1673954725265503, "loss": 1.6051, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.067060947418213, "rewards/margins": 0.26773008704185486, "rewards/rejected": -2.3347909450531006, "step": 490 }, { "epoch": 0.7206557967750653, "grad_norm": 15.404244347962265, "learning_rate": 9.410966014533195e-08, "logits/chosen": -2.3478922843933105, "logits/rejected": -2.3435702323913574, "logps/chosen": -1.053039312362671, "logps/rejected": -1.1690478324890137, "loss": 1.6495, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.106078624725342, "rewards/margins": 0.23201718926429749, "rewards/rejected": -2.3380956649780273, "step": 500 }, { "epoch": 0.7350689127105666, "grad_norm": 15.81308480269748, "learning_rate": 9.37081275241442e-08, "logits/chosen": -2.3459486961364746, "logits/rejected": -2.339306592941284, "logps/chosen": -1.0136808156967163, "logps/rejected": -1.1138548851013184, "loss": 1.667, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0273616313934326, "rewards/margins": 0.20034781098365784, "rewards/rejected": -2.2277097702026367, "step": 510 }, { "epoch": 0.7494820286460679, "grad_norm": 14.877032985004409, "learning_rate": 9.329427225140042e-08, "logits/chosen": -2.3370301723480225, "logits/rejected": -2.3319363594055176, "logps/chosen": -1.0117393732070923, "logps/rejected": -1.1295689344406128, "loss": 1.6432, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0234787464141846, "rewards/margins": 0.23565927147865295, "rewards/rejected": -2.2591378688812256, "step": 520 }, { "epoch": 0.7638951445815693, "grad_norm": 16.317618504393014, "learning_rate": 9.286821100549906e-08, "logits/chosen": -2.336864471435547, "logits/rejected": -2.329371929168701, "logps/chosen": -0.9821737408638, "logps/rejected": -1.1123030185699463, "loss": 1.6226, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9643474817276, "rewards/margins": 0.26025891304016113, "rewards/rejected": -2.2246060371398926, "step": 530 }, { "epoch": 0.7783082605170706, "grad_norm": 16.918699303271303, "learning_rate": 9.243006390607402e-08, "logits/chosen": -2.3681960105895996, "logits/rejected": -2.3686928749084473, "logps/chosen": -1.0731232166290283, "logps/rejected": -1.2046077251434326, "loss": 1.6286, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1462464332580566, "rewards/margins": 0.2629690170288086, "rewards/rejected": -2.4092154502868652, "step": 540 }, { "epoch": 0.7927213764525718, "grad_norm": 17.901668830250117, "learning_rate": 9.197995448012912e-08, "logits/chosen": -2.3749890327453613, "logits/rejected": -2.368088960647583, "logps/chosen": -1.0722578763961792, "logps/rejected": -1.2028658390045166, "loss": 1.6224, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.1445157527923584, "rewards/margins": 0.26121601462364197, "rewards/rejected": -2.405731678009033, "step": 550 }, { "epoch": 0.8071344923880731, "grad_norm": 15.889671449808617, "learning_rate": 9.151800962721217e-08, "logits/chosen": -2.320263147354126, "logits/rejected": -2.3110299110412598, "logps/chosen": -1.0240787267684937, "logps/rejected": -1.1282823085784912, "loss": 1.6579, "rewards/accuracies": 0.578125, "rewards/chosen": -2.0481574535369873, "rewards/margins": 0.20840716361999512, "rewards/rejected": -2.2565646171569824, "step": 560 }, { "epoch": 0.8215476083235744, "grad_norm": 16.160221475349292, "learning_rate": 9.104435958363807e-08, "logits/chosen": -2.3726491928100586, "logits/rejected": -2.3696436882019043, "logps/chosen": -1.0209132432937622, "logps/rejected": -1.134126901626587, "loss": 1.6464, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0418264865875244, "rewards/margins": 0.22642748057842255, "rewards/rejected": -2.268253803253174, "step": 570 }, { "epoch": 0.8359607242590757, "grad_norm": 16.09504542028388, "learning_rate": 9.055913788577128e-08, "logits/chosen": -2.3402140140533447, "logits/rejected": -2.334770679473877, "logps/chosen": -1.0541826486587524, "logps/rejected": -1.1505852937698364, "loss": 1.6795, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.108365297317505, "rewards/margins": 0.19280506670475006, "rewards/rejected": -2.301170587539673, "step": 580 }, { "epoch": 0.8503738401945771, "grad_norm": 18.45826863343491, "learning_rate": 9.006248133237782e-08, "logits/chosen": -2.3699214458465576, "logits/rejected": -2.361508846282959, "logps/chosen": -1.037255048751831, "logps/rejected": -1.155447006225586, "loss": 1.6428, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.074510097503662, "rewards/margins": 0.2363840639591217, "rewards/rejected": -2.310894012451172, "step": 590 }, { "epoch": 0.8647869561300784, "grad_norm": 18.434587269982643, "learning_rate": 8.955452994605753e-08, "logits/chosen": -2.3500571250915527, "logits/rejected": -2.338733196258545, "logps/chosen": -1.0794237852096558, "logps/rejected": -1.170361042022705, "loss": 1.6733, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.1588475704193115, "rewards/margins": 0.1818745732307434, "rewards/rejected": -2.34072208404541, "step": 600 }, { "epoch": 0.8792000720655797, "grad_norm": 14.237081246848815, "learning_rate": 8.903542693376747e-08, "logits/chosen": -2.3270299434661865, "logits/rejected": -2.3305177688598633, "logps/chosen": -0.9713711738586426, "logps/rejected": -1.1125681400299072, "loss": 1.6091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9427423477172852, "rewards/margins": 0.2823939025402069, "rewards/rejected": -2.2251362800598145, "step": 610 }, { "epoch": 0.893613188001081, "grad_norm": 17.400582788834974, "learning_rate": 8.850531864644748e-08, "logits/chosen": -2.3322553634643555, "logits/rejected": -2.321770668029785, "logps/chosen": -0.9585525393486023, "logps/rejected": -1.0878236293792725, "loss": 1.6235, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9171050786972046, "rewards/margins": 0.2585422694683075, "rewards/rejected": -2.175647258758545, "step": 620 }, { "epoch": 0.9080263039365822, "grad_norm": 18.38773462583586, "learning_rate": 8.796435453775943e-08, "logits/chosen": -2.3591456413269043, "logits/rejected": -2.3641350269317627, "logps/chosen": -1.0922317504882812, "logps/rejected": -1.245233416557312, "loss": 1.5961, "rewards/accuracies": 0.59375, "rewards/chosen": -2.1844635009765625, "rewards/margins": 0.30600348114967346, "rewards/rejected": -2.490466833114624, "step": 630 }, { "epoch": 0.9224394198720836, "grad_norm": 17.0793455640924, "learning_rate": 8.741268712195164e-08, "logits/chosen": -2.362234115600586, "logits/rejected": -2.3535900115966797, "logps/chosen": -0.9950187802314758, "logps/rejected": -1.1404359340667725, "loss": 1.5986, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.9900375604629517, "rewards/margins": 0.29083460569381714, "rewards/rejected": -2.280871868133545, "step": 640 }, { "epoch": 0.9368525358075849, "grad_norm": 18.356266999768685, "learning_rate": 8.685047193086053e-08, "logits/chosen": -2.3747830390930176, "logits/rejected": -2.3743832111358643, "logps/chosen": -1.0230966806411743, "logps/rejected": -1.1178253889083862, "loss": 1.6728, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0461933612823486, "rewards/margins": 0.18945762515068054, "rewards/rejected": -2.2356507778167725, "step": 650 }, { "epoch": 0.9512656517430862, "grad_norm": 16.97821645636938, "learning_rate": 8.627786747006144e-08, "logits/chosen": -2.3651280403137207, "logits/rejected": -2.3614325523376465, "logps/chosen": -1.028911828994751, "logps/rejected": -1.1648304462432861, "loss": 1.6105, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.057823657989502, "rewards/margins": 0.2718368470668793, "rewards/rejected": -2.3296608924865723, "step": 660 }, { "epoch": 0.9656787676785875, "grad_norm": 18.242383473952547, "learning_rate": 8.569503517418104e-08, "logits/chosen": -2.3506455421447754, "logits/rejected": -2.346644401550293, "logps/chosen": -1.038861870765686, "logps/rejected": -1.1740354299545288, "loss": 1.6204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.077723741531372, "rewards/margins": 0.27034711837768555, "rewards/rejected": -2.3480708599090576, "step": 670 }, { "epoch": 0.9800918836140888, "grad_norm": 15.51076376279893, "learning_rate": 8.510213936138402e-08, "logits/chosen": -2.3083348274230957, "logits/rejected": -2.3014862537384033, "logps/chosen": -0.9869492650032043, "logps/rejected": -1.0866016149520874, "loss": 1.6735, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9738985300064087, "rewards/margins": 0.19930467009544373, "rewards/rejected": -2.173203229904175, "step": 680 }, { "epoch": 0.9945049995495902, "grad_norm": 17.843639653030788, "learning_rate": 8.449934718704685e-08, "logits/chosen": -2.3410897254943848, "logits/rejected": -2.334183692932129, "logps/chosen": -1.02655827999115, "logps/rejected": -1.1379454135894775, "loss": 1.6428, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0531165599823, "rewards/margins": 0.22277435660362244, "rewards/rejected": -2.275890827178955, "step": 690 }, { "epoch": 1.0089181154850915, "grad_norm": 18.24062737002371, "learning_rate": 8.388682859663152e-08, "logits/chosen": -2.3235275745391846, "logits/rejected": -2.323727607727051, "logps/chosen": -1.0423524379730225, "logps/rejected": -1.1892979145050049, "loss": 1.6146, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.084704875946045, "rewards/margins": 0.2938912510871887, "rewards/rejected": -2.3785958290100098, "step": 700 }, { "epoch": 1.0233312314205927, "grad_norm": 18.30818756183919, "learning_rate": 8.326475627777277e-08, "logits/chosen": -2.3337440490722656, "logits/rejected": -2.3330025672912598, "logps/chosen": -1.0714682340621948, "logps/rejected": -1.2082436084747314, "loss": 1.6339, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1429364681243896, "rewards/margins": 0.27355074882507324, "rewards/rejected": -2.416487216949463, "step": 710 }, { "epoch": 1.037744347356094, "grad_norm": 17.423864156378112, "learning_rate": 8.26333056115922e-08, "logits/chosen": -2.373300075531006, "logits/rejected": -2.3668229579925537, "logps/chosen": -1.0338383913040161, "logps/rejected": -1.1421548128128052, "loss": 1.6639, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.0676767826080322, "rewards/margins": 0.21663276851177216, "rewards/rejected": -2.2843096256256104, "step": 720 }, { "epoch": 1.0521574632915953, "grad_norm": 16.635043052348962, "learning_rate": 8.1992654623253e-08, "logits/chosen": -2.3428361415863037, "logits/rejected": -2.33913516998291, "logps/chosen": -1.009476900100708, "logps/rejected": -1.1869137287139893, "loss": 1.559, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.018953800201416, "rewards/margins": 0.35487350821495056, "rewards/rejected": -2.3738274574279785, "step": 730 }, { "epoch": 1.0665705792270967, "grad_norm": 19.25205105759611, "learning_rate": 8.134298393176915e-08, "logits/chosen": -2.301328420639038, "logits/rejected": -2.2953743934631348, "logps/chosen": -0.9850282669067383, "logps/rejected": -1.131919264793396, "loss": 1.6056, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9700565338134766, "rewards/margins": 0.2937820851802826, "rewards/rejected": -2.263838529586792, "step": 740 }, { "epoch": 1.080983695162598, "grad_norm": 16.04856542856117, "learning_rate": 8.068447669908356e-08, "logits/chosen": -2.306058168411255, "logits/rejected": -2.294712781906128, "logps/chosen": -1.06520676612854, "logps/rejected": -1.1720651388168335, "loss": 1.6557, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -2.13041353225708, "rewards/margins": 0.21371681988239288, "rewards/rejected": -2.344130277633667, "step": 750 }, { "epoch": 1.0953968110980994, "grad_norm": 14.81697278342191, "learning_rate": 8.001731857842906e-08, "logits/chosen": -2.317549705505371, "logits/rejected": -2.3219799995422363, "logps/chosen": -1.0585771799087524, "logps/rejected": -1.1321176290512085, "loss": 1.7105, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -2.117154359817505, "rewards/margins": 0.14708088338375092, "rewards/rejected": -2.264235258102417, "step": 760 }, { "epoch": 1.1098099270336006, "grad_norm": 19.08608533403698, "learning_rate": 7.934169766198712e-08, "logits/chosen": -2.347382068634033, "logits/rejected": -2.3347859382629395, "logps/chosen": -0.9919846653938293, "logps/rejected": -1.155458688735962, "loss": 1.5702, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9839693307876587, "rewards/margins": 0.32694780826568604, "rewards/rejected": -2.310917377471924, "step": 770 }, { "epoch": 1.1242230429691018, "grad_norm": 21.782769163652045, "learning_rate": 7.86578044278589e-08, "logits/chosen": -2.3568646907806396, "logits/rejected": -2.350098133087158, "logps/chosen": -1.0653258562088013, "logps/rejected": -1.2129188776016235, "loss": 1.6052, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1306517124176025, "rewards/margins": 0.29518604278564453, "rewards/rejected": -2.425837755203247, "step": 780 }, { "epoch": 1.1386361589046032, "grad_norm": 20.459222597520984, "learning_rate": 7.796583168636375e-08, "logits/chosen": -2.3612263202667236, "logits/rejected": -2.3560619354248047, "logps/chosen": -1.0090010166168213, "logps/rejected": -1.1769835948944092, "loss": 1.5759, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -2.0180020332336426, "rewards/margins": 0.3359653949737549, "rewards/rejected": -2.3539671897888184, "step": 790 }, { "epoch": 1.1530492748401044, "grad_norm": 16.695222101185497, "learning_rate": 7.726597452568007e-08, "logits/chosen": -2.3381145000457764, "logits/rejected": -2.3316009044647217, "logps/chosen": -1.0254031419754028, "logps/rejected": -1.16634202003479, "loss": 1.6068, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.0508062839508057, "rewards/margins": 0.2818780839443207, "rewards/rejected": -2.33268404006958, "step": 800 }, { "epoch": 1.1674623907756059, "grad_norm": 17.52985696830486, "learning_rate": 7.655843025684402e-08, "logits/chosen": -2.3598532676696777, "logits/rejected": -2.362898349761963, "logps/chosen": -1.044235348701477, "logps/rejected": -1.1720434427261353, "loss": 1.6296, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.088470697402954, "rewards/margins": 0.25561633706092834, "rewards/rejected": -2.3440868854522705, "step": 810 }, { "epoch": 1.181875506711107, "grad_norm": 14.910484844275423, "learning_rate": 7.584339835812151e-08, "logits/chosen": -2.3223514556884766, "logits/rejected": -2.323925495147705, "logps/chosen": -1.0323957204818726, "logps/rejected": -1.1369130611419678, "loss": 1.6678, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.064791440963745, "rewards/margins": 0.20903484523296356, "rewards/rejected": -2.2738261222839355, "step": 820 }, { "epoch": 1.1962886226466085, "grad_norm": 16.58257930662513, "learning_rate": 7.512108041876924e-08, "logits/chosen": -2.2956104278564453, "logits/rejected": -2.298205852508545, "logps/chosen": -0.9996700286865234, "logps/rejected": -1.1152664422988892, "loss": 1.6512, "rewards/accuracies": 0.578125, "rewards/chosen": -1.9993400573730469, "rewards/margins": 0.23119251430034637, "rewards/rejected": -2.2305328845977783, "step": 830 }, { "epoch": 1.2107017385821097, "grad_norm": 16.103489416598062, "learning_rate": 7.439168008220056e-08, "logits/chosen": -2.333143949508667, "logits/rejected": -2.327017068862915, "logps/chosen": -1.0302656888961792, "logps/rejected": -1.1976699829101562, "loss": 1.5731, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.0605313777923584, "rewards/margins": 0.3348085880279541, "rewards/rejected": -2.3953399658203125, "step": 840 }, { "epoch": 1.225114854517611, "grad_norm": 17.820096880219356, "learning_rate": 7.365540298857215e-08, "logits/chosen": -2.3323662281036377, "logits/rejected": -2.3332276344299316, "logps/chosen": -1.0587284564971924, "logps/rejected": -1.2181167602539062, "loss": 1.5796, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1174569129943848, "rewards/margins": 0.3187769949436188, "rewards/rejected": -2.4362335205078125, "step": 850 }, { "epoch": 1.2395279704531124, "grad_norm": 18.066090520662634, "learning_rate": 7.291245671680781e-08, "logits/chosen": -2.3100619316101074, "logits/rejected": -2.3028578758239746, "logps/chosen": -0.9891204833984375, "logps/rejected": -1.1562236547470093, "loss": 1.5852, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -1.978240966796875, "rewards/margins": 0.3342065215110779, "rewards/rejected": -2.3124473094940186, "step": 860 }, { "epoch": 1.2539410863886136, "grad_norm": 16.723867521490277, "learning_rate": 7.216305072607568e-08, "logits/chosen": -2.3490469455718994, "logits/rejected": -2.351792812347412, "logps/chosen": -1.0800100564956665, "logps/rejected": -1.2314789295196533, "loss": 1.6035, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.160020112991333, "rewards/margins": 0.30293765664100647, "rewards/rejected": -2.4629578590393066, "step": 870 }, { "epoch": 1.268354202324115, "grad_norm": 19.696376219540245, "learning_rate": 7.14073962967353e-08, "logits/chosen": -2.361971855163574, "logits/rejected": -2.3552968502044678, "logps/chosen": -1.1068134307861328, "logps/rejected": -1.2376269102096558, "loss": 1.6428, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -2.2136268615722656, "rewards/margins": 0.2616268992424011, "rewards/rejected": -2.4752538204193115, "step": 880 }, { "epoch": 1.2827673182596162, "grad_norm": 18.939981579389148, "learning_rate": 7.064570647077124e-08, "logits/chosen": -2.34350848197937, "logits/rejected": -2.335470676422119, "logps/chosen": -1.1084269285202026, "logps/rejected": -1.230513095855713, "loss": 1.6428, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.2168538570404053, "rewards/margins": 0.24417249858379364, "rewards/rejected": -2.461026191711426, "step": 890 }, { "epoch": 1.2971804341951176, "grad_norm": 18.47019854160618, "learning_rate": 6.987819599173006e-08, "logits/chosen": -2.3356449604034424, "logits/rejected": -2.331501007080078, "logps/chosen": -1.0205782651901245, "logps/rejected": -1.1818567514419556, "loss": 1.588, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.041156530380249, "rewards/margins": 0.322556734085083, "rewards/rejected": -2.363713502883911, "step": 900 }, { "epoch": 1.3115935501306188, "grad_norm": 21.06251591954156, "learning_rate": 6.910508124417765e-08, "logits/chosen": -2.3116612434387207, "logits/rejected": -2.311708927154541, "logps/chosen": -1.0073387622833252, "logps/rejected": -1.1689893007278442, "loss": 1.5949, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0146775245666504, "rewards/margins": 0.32330113649368286, "rewards/rejected": -2.3379786014556885, "step": 910 }, { "epoch": 1.32600666606612, "grad_norm": 15.75888959059691, "learning_rate": 6.832658019269373e-08, "logits/chosen": -2.2905359268188477, "logits/rejected": -2.285813808441162, "logps/chosen": -1.017747402191162, "logps/rejected": -1.1801689863204956, "loss": 1.5957, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.035494804382324, "rewards/margins": 0.3248431086540222, "rewards/rejected": -2.360337972640991, "step": 920 }, { "epoch": 1.3404197820016215, "grad_norm": 16.36860064354464, "learning_rate": 6.75429123204211e-08, "logits/chosen": -2.3322787284851074, "logits/rejected": -2.325899600982666, "logps/chosen": -1.0550917387008667, "logps/rejected": -1.2269432544708252, "loss": 1.5757, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1101834774017334, "rewards/margins": 0.34370261430740356, "rewards/rejected": -2.4538865089416504, "step": 930 }, { "epoch": 1.354832897937123, "grad_norm": 15.89341720744674, "learning_rate": 6.675429856718652e-08, "logits/chosen": -2.302473306655884, "logits/rejected": -2.292829990386963, "logps/chosen": -0.9993384480476379, "logps/rejected": -1.1607972383499146, "loss": 1.5858, "rewards/accuracies": 0.609375, "rewards/chosen": -1.9986768960952759, "rewards/margins": 0.3229173719882965, "rewards/rejected": -2.321594476699829, "step": 940 }, { "epoch": 1.3692460138726241, "grad_norm": 16.669054151143325, "learning_rate": 6.596096126721123e-08, "logits/chosen": -2.273181200027466, "logits/rejected": -2.2777457237243652, "logps/chosen": -1.0447285175323486, "logps/rejected": -1.2103157043457031, "loss": 1.5821, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0894570350646973, "rewards/margins": 0.3311743438243866, "rewards/rejected": -2.4206314086914062, "step": 950 }, { "epoch": 1.3836591298081253, "grad_norm": 15.868141254654335, "learning_rate": 6.516312408642804e-08, "logits/chosen": -2.322033405303955, "logits/rejected": -2.3260583877563477, "logps/chosen": -1.0269404649734497, "logps/rejected": -1.217023491859436, "loss": 1.543, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.0538809299468994, "rewards/margins": 0.3801659941673279, "rewards/rejected": -2.434046983718872, "step": 960 }, { "epoch": 1.3980722457436268, "grad_norm": 19.7395273688106, "learning_rate": 6.436101195942312e-08, "logits/chosen": -2.3190536499023438, "logits/rejected": -2.321190357208252, "logps/chosen": -1.0408755540847778, "logps/rejected": -1.1574127674102783, "loss": 1.6495, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.0817511081695557, "rewards/margins": 0.23307427763938904, "rewards/rejected": -2.3148255348205566, "step": 970 }, { "epoch": 1.412485361679128, "grad_norm": 17.85424182086385, "learning_rate": 6.35548510260201e-08, "logits/chosen": -2.2950663566589355, "logits/rejected": -2.290828227996826, "logps/chosen": -1.015590786933899, "logps/rejected": -1.1845998764038086, "loss": 1.5815, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.031181573867798, "rewards/margins": 0.33801814913749695, "rewards/rejected": -2.369199752807617, "step": 980 }, { "epoch": 1.4268984776146292, "grad_norm": 17.0591983972092, "learning_rate": 6.274486856752442e-08, "logits/chosen": -2.3268628120422363, "logits/rejected": -2.3215243816375732, "logps/chosen": -1.054785132408142, "logps/rejected": -1.2332737445831299, "loss": 1.5786, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.109570264816284, "rewards/margins": 0.35697704553604126, "rewards/rejected": -2.4665474891662598, "step": 990 }, { "epoch": 1.4413115935501306, "grad_norm": 15.976591290404047, "learning_rate": 6.193129294264568e-08, "logits/chosen": -2.3251538276672363, "logits/rejected": -2.319453477859497, "logps/chosen": -1.0316834449768066, "logps/rejected": -1.2238515615463257, "loss": 1.549, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0633668899536133, "rewards/margins": 0.3843366503715515, "rewards/rejected": -2.4477031230926514, "step": 1000 }, { "epoch": 1.455724709485632, "grad_norm": 17.378099075031535, "learning_rate": 6.111435352311653e-08, "logits/chosen": -2.3224568367004395, "logits/rejected": -2.318516254425049, "logps/chosen": -1.044806718826294, "logps/rejected": -1.204319715499878, "loss": 1.5956, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.089613437652588, "rewards/margins": 0.3190259337425232, "rewards/rejected": -2.408639430999756, "step": 1010 }, { "epoch": 1.4701378254211332, "grad_norm": 18.355317239262256, "learning_rate": 6.02942806290257e-08, "logits/chosen": -2.337299346923828, "logits/rejected": -2.334476947784424, "logps/chosen": -1.0204999446868896, "logps/rejected": -1.182180404663086, "loss": 1.5882, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0409998893737793, "rewards/margins": 0.3233610987663269, "rewards/rejected": -2.364360809326172, "step": 1020 }, { "epoch": 1.4845509413566345, "grad_norm": 16.458819438737027, "learning_rate": 5.947130546388376e-08, "logits/chosen": -2.307170867919922, "logits/rejected": -2.297262668609619, "logps/chosen": -1.1198623180389404, "logps/rejected": -1.2803127765655518, "loss": 1.6069, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.239724636077881, "rewards/margins": 0.32090049982070923, "rewards/rejected": -2.5606255531311035, "step": 1030 }, { "epoch": 1.4989640572921359, "grad_norm": 18.315663658527253, "learning_rate": 5.864566004943983e-08, "logits/chosen": -2.3090689182281494, "logits/rejected": -2.299919605255127, "logps/chosen": -1.1342939138412476, "logps/rejected": -1.2915699481964111, "loss": 1.5918, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.268587827682495, "rewards/margins": 0.3145517408847809, "rewards/rejected": -2.5831398963928223, "step": 1040 }, { "epoch": 1.513377173227637, "grad_norm": 18.253777248388865, "learning_rate": 5.78175771602676e-08, "logits/chosen": -2.3258557319641113, "logits/rejected": -2.329089641571045, "logps/chosen": -1.0340855121612549, "logps/rejected": -1.1988188028335571, "loss": 1.5903, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0681710243225098, "rewards/margins": 0.3294665813446045, "rewards/rejected": -2.3976376056671143, "step": 1050 }, { "epoch": 1.5277902891631383, "grad_norm": 20.03722300524917, "learning_rate": 5.6987290258139073e-08, "logits/chosen": -2.269885301589966, "logits/rejected": -2.2610838413238525, "logps/chosen": -1.0655957460403442, "logps/rejected": -1.2299748659133911, "loss": 1.5939, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.1311914920806885, "rewards/margins": 0.3287580609321594, "rewards/rejected": -2.4599497318267822, "step": 1060 }, { "epoch": 1.5422034050986397, "grad_norm": 19.363745969848598, "learning_rate": 5.6155033426204615e-08, "logits/chosen": -2.3013463020324707, "logits/rejected": -2.30194091796875, "logps/chosen": -1.1020151376724243, "logps/rejected": -1.2730225324630737, "loss": 1.58, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.2040302753448486, "rewards/margins": 0.34201496839523315, "rewards/rejected": -2.5460450649261475, "step": 1070 }, { "epoch": 1.5566165210341412, "grad_norm": 21.243971440197193, "learning_rate": 5.532104130299771e-08, "logits/chosen": -2.306084632873535, "logits/rejected": -2.3026065826416016, "logps/chosen": -1.1136653423309326, "logps/rejected": -1.253650426864624, "loss": 1.6339, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -2.2273306846618652, "rewards/margins": 0.27996987104415894, "rewards/rejected": -2.507300853729248, "step": 1080 }, { "epoch": 1.5710296369696424, "grad_norm": 18.884950972549078, "learning_rate": 5.448554901628333e-08, "logits/chosen": -2.3047351837158203, "logits/rejected": -2.30297589302063, "logps/chosen": -1.057666301727295, "logps/rejected": -1.2256438732147217, "loss": 1.5844, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.11533260345459, "rewards/margins": 0.3359553813934326, "rewards/rejected": -2.4512877464294434, "step": 1090 }, { "epoch": 1.5854427529051436, "grad_norm": 16.26327515212116, "learning_rate": 5.364879211676816e-08, "logits/chosen": -2.3229575157165527, "logits/rejected": -2.322633743286133, "logps/chosen": -1.0644395351409912, "logps/rejected": -1.2588599920272827, "loss": 1.5435, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1288790702819824, "rewards/margins": 0.3888412117958069, "rewards/rejected": -2.5177199840545654, "step": 1100 }, { "epoch": 1.599855868840645, "grad_norm": 16.929494402078088, "learning_rate": 5.281100651169175e-08, "logits/chosen": -2.3269693851470947, "logits/rejected": -2.329103946685791, "logps/chosen": -1.1110026836395264, "logps/rejected": -1.3049942255020142, "loss": 1.5754, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.2220053672790527, "rewards/margins": 0.3879828453063965, "rewards/rejected": -2.6099884510040283, "step": 1110 }, { "epoch": 1.6142689847761464, "grad_norm": 19.384751167038143, "learning_rate": 5.197242839831706e-08, "logits/chosen": -2.2902255058288574, "logits/rejected": -2.2878143787384033, "logps/chosen": -1.0505023002624512, "logps/rejected": -1.2497543096542358, "loss": 1.5559, "rewards/accuracies": 0.640625, "rewards/chosen": -2.1010046005249023, "rewards/margins": 0.39850395917892456, "rewards/rejected": -2.4995086193084717, "step": 1120 }, { "epoch": 1.6286821007116477, "grad_norm": 21.020671773840373, "learning_rate": 5.1133294197339274e-08, "logits/chosen": -2.3327059745788574, "logits/rejected": -2.3221957683563232, "logps/chosen": -1.0784157514572144, "logps/rejected": -1.2418811321258545, "loss": 1.6035, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1568315029144287, "rewards/margins": 0.3269307017326355, "rewards/rejected": -2.483762264251709, "step": 1130 }, { "epoch": 1.6430952166471489, "grad_norm": 21.249031332264607, "learning_rate": 5.029384048623153e-08, "logits/chosen": -2.2892603874206543, "logits/rejected": -2.2838594913482666, "logps/chosen": -1.1238863468170166, "logps/rejected": -1.2816271781921387, "loss": 1.5968, "rewards/accuracies": 0.59375, "rewards/chosen": -2.247772693634033, "rewards/margins": 0.3154818117618561, "rewards/rejected": -2.5632543563842773, "step": 1140 }, { "epoch": 1.6575083325826503, "grad_norm": 18.66086972186176, "learning_rate": 4.9454303932546675e-08, "logits/chosen": -2.28279447555542, "logits/rejected": -2.2724807262420654, "logps/chosen": -1.0907418727874756, "logps/rejected": -1.2298866510391235, "loss": 1.6405, "rewards/accuracies": 0.578125, "rewards/chosen": -2.181483745574951, "rewards/margins": 0.2782895267009735, "rewards/rejected": -2.459773302078247, "step": 1150 }, { "epoch": 1.6719214485181515, "grad_norm": 19.50349240348182, "learning_rate": 4.861492122719338e-08, "logits/chosen": -2.319563388824463, "logits/rejected": -2.3177480697631836, "logps/chosen": -1.0951299667358398, "logps/rejected": -1.260750651359558, "loss": 1.6022, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -2.1902599334716797, "rewards/margins": 0.3312414586544037, "rewards/rejected": -2.521501302719116, "step": 1160 }, { "epoch": 1.6863345644536527, "grad_norm": 17.58127266536524, "learning_rate": 4.777592901770575e-08, "logits/chosen": -2.327413558959961, "logits/rejected": -2.3294601440429688, "logps/chosen": -1.0109418630599976, "logps/rejected": -1.214444637298584, "loss": 1.5519, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.021883726119995, "rewards/margins": 0.4070053994655609, "rewards/rejected": -2.428889274597168, "step": 1170 }, { "epoch": 1.7007476803891541, "grad_norm": 16.893442050436466, "learning_rate": 4.693756384152529e-08, "logits/chosen": -2.290790557861328, "logits/rejected": -2.2821555137634277, "logps/chosen": -1.0620388984680176, "logps/rejected": -1.2741947174072266, "loss": 1.5403, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.124077796936035, "rewards/margins": 0.42431193590164185, "rewards/rejected": -2.548389434814453, "step": 1180 }, { "epoch": 1.7151607963246556, "grad_norm": 16.76150597577845, "learning_rate": 4.610006205931365e-08, "logits/chosen": -2.334803342819214, "logits/rejected": -2.3295693397521973, "logps/chosen": -1.1866618394851685, "logps/rejected": -1.3234022855758667, "loss": 1.6392, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.373323678970337, "rewards/margins": 0.2734811305999756, "rewards/rejected": -2.6468045711517334, "step": 1190 }, { "epoch": 1.7295739122601568, "grad_norm": 24.57055189161366, "learning_rate": 4.526365978831551e-08, "logits/chosen": -2.3130276203155518, "logits/rejected": -2.30517578125, "logps/chosen": -1.1128777265548706, "logps/rejected": -1.3150999546051025, "loss": 1.556, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.225755453109741, "rewards/margins": 0.40444430708885193, "rewards/rejected": -2.630199909210205, "step": 1200 }, { "epoch": 1.743987028195658, "grad_norm": 19.26814679538138, "learning_rate": 4.442859283578981e-08, "logits/chosen": -2.312147617340088, "logits/rejected": -2.3039205074310303, "logps/chosen": -1.0945560932159424, "logps/rejected": -1.2648680210113525, "loss": 1.6149, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.1891121864318848, "rewards/margins": 0.3406239151954651, "rewards/rejected": -2.529736042022705, "step": 1210 }, { "epoch": 1.7584001441311594, "grad_norm": 18.13222142013933, "learning_rate": 4.359509663252864e-08, "logits/chosen": -2.289947986602783, "logits/rejected": -2.2836596965789795, "logps/chosen": -1.0912672281265259, "logps/rejected": -1.261278748512268, "loss": 1.5891, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.1825344562530518, "rewards/margins": 0.3400228023529053, "rewards/rejected": -2.522557497024536, "step": 1220 }, { "epoch": 1.7728132600666606, "grad_norm": 19.057526927248425, "learning_rate": 4.276340616648198e-08, "logits/chosen": -2.341885566711426, "logits/rejected": -2.3356499671936035, "logps/chosen": -1.10612154006958, "logps/rejected": -1.2711408138275146, "loss": 1.6144, "rewards/accuracies": 0.5625, "rewards/chosen": -2.21224308013916, "rewards/margins": 0.33003857731819153, "rewards/rejected": -2.5422816276550293, "step": 1230 }, { "epoch": 1.7872263760021618, "grad_norm": 19.76084929838562, "learning_rate": 4.193375591650758e-08, "logits/chosen": -2.3344829082489014, "logits/rejected": -2.3287951946258545, "logps/chosen": -1.1671698093414307, "logps/rejected": -1.3440189361572266, "loss": 1.6093, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.3343396186828613, "rewards/margins": 0.353698194026947, "rewards/rejected": -2.688037872314453, "step": 1240 }, { "epoch": 1.8016394919376633, "grad_norm": 19.067146028274564, "learning_rate": 4.110637978626415e-08, "logits/chosen": -2.298180341720581, "logits/rejected": -2.2934188842773438, "logps/chosen": -1.030287504196167, "logps/rejected": -1.2465605735778809, "loss": 1.5146, "rewards/accuracies": 0.65625, "rewards/chosen": -2.060575008392334, "rewards/margins": 0.43254607915878296, "rewards/rejected": -2.4931211471557617, "step": 1250 }, { "epoch": 1.8160526078731647, "grad_norm": 18.276378668755576, "learning_rate": 4.0281511038266867e-08, "logits/chosen": -2.234718084335327, "logits/rejected": -2.2318148612976074, "logps/chosen": -1.0859392881393433, "logps/rejected": -1.2924591302871704, "loss": 1.5609, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.1718785762786865, "rewards/margins": 0.41303977370262146, "rewards/rejected": -2.584918260574341, "step": 1260 }, { "epoch": 1.830465723808666, "grad_norm": 17.470784593739236, "learning_rate": 3.9459382228123475e-08, "logits/chosen": -2.279468059539795, "logits/rejected": -2.273711919784546, "logps/chosen": -1.0365493297576904, "logps/rejected": -1.2447311878204346, "loss": 1.5556, "rewards/accuracies": 0.59375, "rewards/chosen": -2.073098659515381, "rewards/margins": 0.41636401414871216, "rewards/rejected": -2.489462375640869, "step": 1270 }, { "epoch": 1.844878839744167, "grad_norm": 21.830692496447263, "learning_rate": 3.864022513896989e-08, "logits/chosen": -2.2853286266326904, "logits/rejected": -2.2701587677001953, "logps/chosen": -1.0575942993164062, "logps/rejected": -1.2254334688186646, "loss": 1.6005, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1151885986328125, "rewards/margins": 0.3356781005859375, "rewards/rejected": -2.450866937637329, "step": 1280 }, { "epoch": 1.8592919556796685, "grad_norm": 20.0916366903334, "learning_rate": 3.782427071612339e-08, "logits/chosen": -2.3116753101348877, "logits/rejected": -2.306715488433838, "logps/chosen": -1.1340314149856567, "logps/rejected": -1.3019399642944336, "loss": 1.5867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2680628299713135, "rewards/margins": 0.33581703901290894, "rewards/rejected": -2.603879928588867, "step": 1290 }, { "epoch": 1.87370507161517, "grad_norm": 22.477485924506297, "learning_rate": 3.7011749001972174e-08, "logits/chosen": -2.3057870864868164, "logits/rejected": -2.3034915924072266, "logps/chosen": -1.053118348121643, "logps/rejected": -1.2349039316177368, "loss": 1.5867, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.106236696243286, "rewards/margins": 0.36357131600379944, "rewards/rejected": -2.4698078632354736, "step": 1300 }, { "epoch": 1.888118187550671, "grad_norm": 20.557013864835106, "learning_rate": 3.620288907111931e-08, "logits/chosen": -2.277376651763916, "logits/rejected": -2.272871255874634, "logps/chosen": -1.096543312072754, "logps/rejected": -1.3053501844406128, "loss": 1.5318, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.193086624145508, "rewards/margins": 0.4176138937473297, "rewards/rejected": -2.6107003688812256, "step": 1310 }, { "epoch": 1.9025313034861724, "grad_norm": 27.1695631827936, "learning_rate": 3.539791896579978e-08, "logits/chosen": -2.317373752593994, "logits/rejected": -2.318577289581299, "logps/chosen": -1.2034056186676025, "logps/rejected": -1.344125747680664, "loss": 1.6377, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -2.406811237335205, "rewards/margins": 0.28144046664237976, "rewards/rejected": -2.688251495361328, "step": 1320 }, { "epoch": 1.9169444194216738, "grad_norm": 25.106064057973505, "learning_rate": 3.459706563158828e-08, "logits/chosen": -2.279590129852295, "logits/rejected": -2.281261682510376, "logps/chosen": -1.1769063472747803, "logps/rejected": -1.3924826383590698, "loss": 1.5341, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3538126945495605, "rewards/margins": 0.4311525821685791, "rewards/rejected": -2.7849652767181396, "step": 1330 }, { "epoch": 1.931357535357175, "grad_norm": 20.192509452290462, "learning_rate": 3.380055485341644e-08, "logits/chosen": -2.314013957977295, "logits/rejected": -2.3160252571105957, "logps/chosen": -1.1351264715194702, "logps/rejected": -1.3126869201660156, "loss": 1.5828, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.2702529430389404, "rewards/margins": 0.35512077808380127, "rewards/rejected": -2.6253738403320312, "step": 1340 }, { "epoch": 1.9457706512926762, "grad_norm": 22.1205875163306, "learning_rate": 3.300861119191718e-08, "logits/chosen": -2.2895724773406982, "logits/rejected": -2.283412456512451, "logps/chosen": -1.179337739944458, "logps/rejected": -1.3338556289672852, "loss": 1.6304, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -2.358675479888916, "rewards/margins": 0.30903515219688416, "rewards/rejected": -2.6677112579345703, "step": 1350 }, { "epoch": 1.9601837672281777, "grad_norm": 21.26891098809936, "learning_rate": 3.2221457920114213e-08, "logits/chosen": -2.307619094848633, "logits/rejected": -2.3046841621398926, "logps/chosen": -1.1182931661605835, "logps/rejected": -1.3411715030670166, "loss": 1.5205, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.236586332321167, "rewards/margins": 0.4457565248012543, "rewards/rejected": -2.682343006134033, "step": 1360 }, { "epoch": 1.974596883163679, "grad_norm": 23.3986392290044, "learning_rate": 3.143931696047454e-08, "logits/chosen": -2.302565813064575, "logits/rejected": -2.298037528991699, "logps/chosen": -1.0839837789535522, "logps/rejected": -1.2788712978363037, "loss": 1.559, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.1679675579071045, "rewards/margins": 0.38977518677711487, "rewards/rejected": -2.5577425956726074, "step": 1370 }, { "epoch": 1.9890099990991803, "grad_norm": 19.075694699589782, "learning_rate": 3.066240882234186e-08, "logits/chosen": -2.306809663772583, "logits/rejected": -2.3090083599090576, "logps/chosen": -1.150748372077942, "logps/rejected": -1.330487847328186, "loss": 1.5692, "rewards/accuracies": 0.625, "rewards/chosen": -2.301496744155884, "rewards/margins": 0.35947883129119873, "rewards/rejected": -2.660975694656372, "step": 1380 }, { "epoch": 2.0034231150346815, "grad_norm": 23.08357458694508, "learning_rate": 2.989095253976816e-08, "logits/chosen": -2.2911369800567627, "logits/rejected": -2.2887818813323975, "logps/chosen": -1.1655315160751343, "logps/rejected": -1.3231830596923828, "loss": 1.6272, "rewards/accuracies": 0.546875, "rewards/chosen": -2.3310630321502686, "rewards/margins": 0.3153030276298523, "rewards/rejected": -2.6463661193847656, "step": 1390 }, { "epoch": 2.017836230970183, "grad_norm": 21.786843412845027, "learning_rate": 2.912516560976146e-08, "logits/chosen": -2.2617886066436768, "logits/rejected": -2.261368989944458, "logps/chosen": -1.116999864578247, "logps/rejected": -1.3585065603256226, "loss": 1.5173, "rewards/accuracies": 0.625, "rewards/chosen": -2.233999729156494, "rewards/margins": 0.4830136299133301, "rewards/rejected": -2.717013120651245, "step": 1400 }, { "epoch": 2.0322493469056844, "grad_norm": 19.872912648108493, "learning_rate": 2.836526393096661e-08, "logits/chosen": -2.3144338130950928, "logits/rejected": -2.319342613220215, "logps/chosen": -1.127329707145691, "logps/rejected": -1.3289254903793335, "loss": 1.5402, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.254659414291382, "rewards/margins": 0.40319204330444336, "rewards/rejected": -2.657850980758667, "step": 1410 }, { "epoch": 2.0466624628411854, "grad_norm": 22.10407026857419, "learning_rate": 2.7611461742797165e-08, "logits/chosen": -2.2922112941741943, "logits/rejected": -2.2878568172454834, "logps/chosen": -1.0672378540039062, "logps/rejected": -1.2899413108825684, "loss": 1.5126, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.1344757080078125, "rewards/margins": 0.44540706276893616, "rewards/rejected": -2.5798826217651367, "step": 1420 }, { "epoch": 2.061075578776687, "grad_norm": 26.207768824418398, "learning_rate": 2.686397156503445e-08, "logits/chosen": -2.2948415279388428, "logits/rejected": -2.28835129737854, "logps/chosen": -1.1063997745513916, "logps/rejected": -1.3052228689193726, "loss": 1.5589, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.212799549102783, "rewards/margins": 0.39764639735221863, "rewards/rejected": -2.610445737838745, "step": 1430 }, { "epoch": 2.075488694712188, "grad_norm": 18.99932149970658, "learning_rate": 2.6123004137912084e-08, "logits/chosen": -2.2723312377929688, "logits/rejected": -2.276716709136963, "logps/chosen": -1.0470964908599854, "logps/rejected": -1.2561558485031128, "loss": 1.5356, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0941929817199707, "rewards/margins": 0.4181187152862549, "rewards/rejected": -2.5123116970062256, "step": 1440 }, { "epoch": 2.089901810647689, "grad_norm": 19.445114453376085, "learning_rate": 2.5388768362701585e-08, "logits/chosen": -2.2706756591796875, "logits/rejected": -2.269131898880005, "logps/chosen": -1.1902254819869995, "logps/rejected": -1.351431131362915, "loss": 1.6073, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.380450963973999, "rewards/margins": 0.3224112391471863, "rewards/rejected": -2.70286226272583, "step": 1450 }, { "epoch": 2.1043149265831906, "grad_norm": 22.70265803179129, "learning_rate": 2.466147124281703e-08, "logits/chosen": -2.3346049785614014, "logits/rejected": -2.3269667625427246, "logps/chosen": -1.1868515014648438, "logps/rejected": -1.3827440738677979, "loss": 1.5644, "rewards/accuracies": 0.625, "rewards/chosen": -2.3737030029296875, "rewards/margins": 0.39178499579429626, "rewards/rejected": -2.7654881477355957, "step": 1460 }, { "epoch": 2.118728042518692, "grad_norm": 25.431369552773468, "learning_rate": 2.3941317825454278e-08, "logits/chosen": -2.287153720855713, "logits/rejected": -2.274724006652832, "logps/chosen": -1.1501365900039673, "logps/rejected": -1.3252675533294678, "loss": 1.599, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.3002731800079346, "rewards/margins": 0.35026198625564575, "rewards/rejected": -2.6505351066589355, "step": 1470 }, { "epoch": 2.1331411584541935, "grad_norm": 38.861924452847305, "learning_rate": 2.322851114378203e-08, "logits/chosen": -2.2646145820617676, "logits/rejected": -2.2705867290496826, "logps/chosen": -1.2125260829925537, "logps/rejected": -1.4090855121612549, "loss": 1.5981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.4250521659851074, "rewards/margins": 0.39311888813972473, "rewards/rejected": -2.8181710243225098, "step": 1480 }, { "epoch": 2.1475542743896945, "grad_norm": 20.599427677239603, "learning_rate": 2.252325215970059e-08, "logits/chosen": -2.2515275478363037, "logits/rejected": -2.24314022064209, "logps/chosen": -1.1347332000732422, "logps/rejected": -1.3541853427886963, "loss": 1.5426, "rewards/accuracies": 0.609375, "rewards/chosen": -2.2694664001464844, "rewards/margins": 0.438904345035553, "rewards/rejected": -2.7083706855773926, "step": 1490 }, { "epoch": 2.161967390325196, "grad_norm": 20.697243890138434, "learning_rate": 2.182573970718449e-08, "logits/chosen": -2.279026746749878, "logits/rejected": -2.2784788608551025, "logps/chosen": -1.1145248413085938, "logps/rejected": -1.3219712972640991, "loss": 1.5631, "rewards/accuracies": 0.578125, "rewards/chosen": -2.2290496826171875, "rewards/margins": 0.4148930013179779, "rewards/rejected": -2.6439425945281982, "step": 1500 }, { "epoch": 2.1763805062606973, "grad_norm": 20.97814093763114, "learning_rate": 2.113617043622536e-08, "logits/chosen": -2.2447619438171387, "logits/rejected": -2.2397830486297607, "logps/chosen": -1.108572006225586, "logps/rejected": -1.312126874923706, "loss": 1.5638, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.217144012451172, "rewards/margins": 0.4071098268032074, "rewards/rejected": -2.624253749847412, "step": 1510 }, { "epoch": 2.1907936221961988, "grad_norm": 19.658252029005208, "learning_rate": 2.045473875739001e-08, "logits/chosen": -2.286835193634033, "logits/rejected": -2.284726619720459, "logps/chosen": -1.1268645524978638, "logps/rejected": -1.3589181900024414, "loss": 1.5125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2537291049957275, "rewards/margins": 0.4641071856021881, "rewards/rejected": -2.717836380004883, "step": 1520 }, { "epoch": 2.2052067381316998, "grad_norm": 19.675863885214547, "learning_rate": 1.9781636787010503e-08, "logits/chosen": -2.296203851699829, "logits/rejected": -2.292480230331421, "logps/chosen": -1.1581227779388428, "logps/rejected": -1.3830742835998535, "loss": 1.5552, "rewards/accuracies": 0.5625, "rewards/chosen": -2.3162455558776855, "rewards/margins": 0.4499031603336334, "rewards/rejected": -2.766148567199707, "step": 1530 }, { "epoch": 2.219619854067201, "grad_norm": 26.028820150112818, "learning_rate": 1.911705429302038e-08, "logits/chosen": -2.2454471588134766, "logits/rejected": -2.2483785152435303, "logps/chosen": -1.1285746097564697, "logps/rejected": -1.2919931411743164, "loss": 1.5857, "rewards/accuracies": 0.625, "rewards/chosen": -2.2571492195129395, "rewards/margins": 0.3268371522426605, "rewards/rejected": -2.583986282348633, "step": 1540 }, { "epoch": 2.2340329700027026, "grad_norm": 23.71926436834239, "learning_rate": 1.8461178641453617e-08, "logits/chosen": -2.2616686820983887, "logits/rejected": -2.2652456760406494, "logps/chosen": -1.1020487546920776, "logps/rejected": -1.310429573059082, "loss": 1.5808, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2040975093841553, "rewards/margins": 0.41676193475723267, "rewards/rejected": -2.620859146118164, "step": 1550 }, { "epoch": 2.2484460859382036, "grad_norm": 18.888058220721906, "learning_rate": 1.781419474362017e-08, "logits/chosen": -2.2560315132141113, "logits/rejected": -2.2563912868499756, "logps/chosen": -1.120178461074829, "logps/rejected": -1.3521924018859863, "loss": 1.5308, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.240356922149658, "rewards/margins": 0.46402817964553833, "rewards/rejected": -2.7043848037719727, "step": 1560 }, { "epoch": 2.262859201873705, "grad_norm": 18.99138375232662, "learning_rate": 1.7176285003974033e-08, "logits/chosen": -2.2571425437927246, "logits/rejected": -2.253202199935913, "logps/chosen": -1.1062372922897339, "logps/rejected": -1.313024640083313, "loss": 1.5606, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2124745845794678, "rewards/margins": 0.413574755191803, "rewards/rejected": -2.626049280166626, "step": 1570 }, { "epoch": 2.2772723178092065, "grad_norm": 24.162865311479557, "learning_rate": 1.6547629268687786e-08, "logits/chosen": -2.2994749546051025, "logits/rejected": -2.296318531036377, "logps/chosen": -1.0731937885284424, "logps/rejected": -1.3190656900405884, "loss": 1.5119, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2.1463875770568848, "rewards/margins": 0.49174371361732483, "rewards/rejected": -2.6381313800811768, "step": 1580 }, { "epoch": 2.291685433744708, "grad_norm": 18.432849736683174, "learning_rate": 1.59284047749485e-08, "logits/chosen": -2.2636983394622803, "logits/rejected": -2.2557337284088135, "logps/chosen": -1.0886359214782715, "logps/rejected": -1.2910807132720947, "loss": 1.5641, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.177271842956543, "rewards/margins": 0.4048894941806793, "rewards/rejected": -2.5821614265441895, "step": 1590 }, { "epoch": 2.306098549680209, "grad_norm": 28.646123727089137, "learning_rate": 1.5318786100989188e-08, "logits/chosen": -2.229341506958008, "logits/rejected": -2.226560115814209, "logps/chosen": -1.2336177825927734, "logps/rejected": -1.4262335300445557, "loss": 1.6148, "rewards/accuracies": 0.578125, "rewards/chosen": -2.467235565185547, "rewards/margins": 0.3852314352989197, "rewards/rejected": -2.8524670600891113, "step": 1600 }, { "epoch": 2.3205116656157103, "grad_norm": 23.756121348250495, "learning_rate": 1.471894511686988e-08, "logits/chosen": -2.2284324169158936, "logits/rejected": -2.2255947589874268, "logps/chosen": -1.1893842220306396, "logps/rejected": -1.3409416675567627, "loss": 1.6367, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.3787684440612793, "rewards/margins": 0.30311447381973267, "rewards/rejected": -2.6818833351135254, "step": 1610 }, { "epoch": 2.3349247815512117, "grad_norm": 18.450150129405873, "learning_rate": 1.4129050936022214e-08, "logits/chosen": -2.2338924407958984, "logits/rejected": -2.235215663909912, "logps/chosen": -1.0769164562225342, "logps/rejected": -1.2985079288482666, "loss": 1.5409, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1538329124450684, "rewards/margins": 0.44318294525146484, "rewards/rejected": -2.597015857696533, "step": 1620 }, { "epoch": 2.3493378974867127, "grad_norm": 22.180084405255627, "learning_rate": 1.3549269867571222e-08, "logits/chosen": -2.2351133823394775, "logits/rejected": -2.2372500896453857, "logps/chosen": -1.1330866813659668, "logps/rejected": -1.2997318506240845, "loss": 1.6214, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.2661733627319336, "rewards/margins": 0.33329010009765625, "rewards/rejected": -2.599463701248169, "step": 1630 }, { "epoch": 2.363751013422214, "grad_norm": 23.08714654459471, "learning_rate": 1.2979765369447742e-08, "logits/chosen": -2.304003953933716, "logits/rejected": -2.2949726581573486, "logps/chosen": -1.1455012559890747, "logps/rejected": -1.3875641822814941, "loss": 1.5371, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.2910025119781494, "rewards/margins": 0.48412585258483887, "rewards/rejected": -2.7751283645629883, "step": 1640 }, { "epoch": 2.3781641293577156, "grad_norm": 30.56182243031503, "learning_rate": 1.2420698002304608e-08, "logits/chosen": -2.2411041259765625, "logits/rejected": -2.2343127727508545, "logps/chosen": -1.0859107971191406, "logps/rejected": -1.3196165561676025, "loss": 1.5388, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1718215942382812, "rewards/margins": 0.4674110412597656, "rewards/rejected": -2.639233112335205, "step": 1650 }, { "epoch": 2.392577245293217, "grad_norm": 19.77198047003492, "learning_rate": 1.1872225384249768e-08, "logits/chosen": -2.268101215362549, "logits/rejected": -2.2637829780578613, "logps/chosen": -1.1163004636764526, "logps/rejected": -1.3505176305770874, "loss": 1.5169, "rewards/accuracies": 0.625, "rewards/chosen": -2.2326009273529053, "rewards/margins": 0.46843448281288147, "rewards/rejected": -2.701035261154175, "step": 1660 }, { "epoch": 2.406990361228718, "grad_norm": 26.906205506300168, "learning_rate": 1.1334502146408881e-08, "logits/chosen": -2.2429723739624023, "logits/rejected": -2.249293804168701, "logps/chosen": -1.1734583377838135, "logps/rejected": -1.3377552032470703, "loss": 1.6096, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.346916675567627, "rewards/margins": 0.3285936415195465, "rewards/rejected": -2.6755104064941406, "step": 1670 }, { "epoch": 2.4214034771642194, "grad_norm": 21.73816659360824, "learning_rate": 1.0807679889330163e-08, "logits/chosen": -2.314985990524292, "logits/rejected": -2.320690870285034, "logps/chosen": -1.17433762550354, "logps/rejected": -1.3522727489471436, "loss": 1.5944, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.34867525100708, "rewards/margins": 0.35587045550346375, "rewards/rejected": -2.704545497894287, "step": 1680 }, { "epoch": 2.435816593099721, "grad_norm": 17.239308701432627, "learning_rate": 1.0291907140243538e-08, "logits/chosen": -2.2565197944641113, "logits/rejected": -2.255737781524658, "logps/chosen": -1.1245791912078857, "logps/rejected": -1.4125820398330688, "loss": 1.4673, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -2.2491583824157715, "rewards/margins": 0.5760055184364319, "rewards/rejected": -2.8251640796661377, "step": 1690 }, { "epoch": 2.450229709035222, "grad_norm": 21.369654725894584, "learning_rate": 9.787329311186249e-09, "logits/chosen": -2.252303123474121, "logits/rejected": -2.251774787902832, "logps/chosen": -1.1287000179290771, "logps/rejected": -1.3461166620254517, "loss": 1.5545, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.2574000358581543, "rewards/margins": 0.43483343720436096, "rewards/rejected": -2.6922333240509033, "step": 1700 }, { "epoch": 2.4646428249707233, "grad_norm": 25.680472794698755, "learning_rate": 9.294088658006916e-09, "logits/chosen": -2.2721753120422363, "logits/rejected": -2.2618608474731445, "logps/chosen": -1.1408545970916748, "logps/rejected": -1.366431474685669, "loss": 1.5555, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.2817091941833496, "rewards/margins": 0.45115384459495544, "rewards/rejected": -2.732862949371338, "step": 1710 }, { "epoch": 2.4790559409062247, "grad_norm": 20.80236487452411, "learning_rate": 8.812324240259094e-09, "logits/chosen": -2.2599918842315674, "logits/rejected": -2.2533061504364014, "logps/chosen": -1.1435985565185547, "logps/rejected": -1.3751742839813232, "loss": 1.5389, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2871971130371094, "rewards/margins": 0.4631514549255371, "rewards/rejected": -2.7503485679626465, "step": 1720 }, { "epoch": 2.493469056841726, "grad_norm": 25.297955693939965, "learning_rate": 8.342171881996351e-09, "logits/chosen": -2.269395112991333, "logits/rejected": -2.267338514328003, "logps/chosen": -1.1785120964050293, "logps/rejected": -1.3562462329864502, "loss": 1.6033, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.3570241928100586, "rewards/margins": 0.355468213558197, "rewards/rejected": -2.7124924659729004, "step": 1730 }, { "epoch": 2.507882172777227, "grad_norm": 24.06865322162579, "learning_rate": 7.883764133479137e-09, "logits/chosen": -2.260371685028076, "logits/rejected": -2.2534215450286865, "logps/chosen": -1.130081295967102, "logps/rejected": -1.3861533403396606, "loss": 1.4917, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -2.260162591934204, "rewards/margins": 0.5121440887451172, "rewards/rejected": -2.7723066806793213, "step": 1740 }, { "epoch": 2.5222952887127286, "grad_norm": 29.75935812876475, "learning_rate": 7.43723023380502e-09, "logits/chosen": -2.2067666053771973, "logits/rejected": -2.208773136138916, "logps/chosen": -1.1877186298370361, "logps/rejected": -1.4029791355133057, "loss": 1.5553, "rewards/accuracies": 0.59375, "rewards/chosen": -2.3754372596740723, "rewards/margins": 0.430520623922348, "rewards/rejected": -2.8059582710266113, "step": 1750 }, { "epoch": 2.53670840464823, "grad_norm": 24.2432673255774, "learning_rate": 7.002696074472075e-09, "logits/chosen": -2.2512130737304688, "logits/rejected": -2.2531332969665527, "logps/chosen": -1.2248094081878662, "logps/rejected": -1.4335352182388306, "loss": 1.5688, "rewards/accuracies": 0.578125, "rewards/chosen": -2.4496188163757324, "rewards/margins": 0.4174516797065735, "rewards/rejected": -2.867070436477661, "step": 1760 }, { "epoch": 2.551121520583731, "grad_norm": 32.01658470543389, "learning_rate": 6.580284163886369e-09, "logits/chosen": -2.2607645988464355, "logits/rejected": -2.2610065937042236, "logps/chosen": -1.1927731037139893, "logps/rejected": -1.3909296989440918, "loss": 1.5668, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.3855462074279785, "rewards/margins": 0.39631372690200806, "rewards/rejected": -2.7818593978881836, "step": 1770 }, { "epoch": 2.5655346365192324, "grad_norm": 24.419915253157857, "learning_rate": 6.1701135928230566e-09, "logits/chosen": -2.217277765274048, "logits/rejected": -2.209423303604126, "logps/chosen": -1.2151906490325928, "logps/rejected": -1.427695870399475, "loss": 1.5543, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -2.4303812980651855, "rewards/margins": 0.4250105321407318, "rewards/rejected": -2.85539174079895, "step": 1780 }, { "epoch": 2.579947752454734, "grad_norm": 22.58314758967658, "learning_rate": 5.7723000008510655e-09, "logits/chosen": -2.2694671154022217, "logits/rejected": -2.2696220874786377, "logps/chosen": -1.168027639389038, "logps/rejected": -1.3549962043762207, "loss": 1.5926, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.336055278778076, "rewards/margins": 0.3739371597766876, "rewards/rejected": -2.7099924087524414, "step": 1790 }, { "epoch": 2.5943608683902353, "grad_norm": 22.781044887360306, "learning_rate": 5.386955543730798e-09, "logits/chosen": -2.277388334274292, "logits/rejected": -2.2686190605163574, "logps/chosen": -1.2046597003936768, "logps/rejected": -1.4461263418197632, "loss": 1.5418, "rewards/accuracies": 0.59375, "rewards/chosen": -2.4093194007873535, "rewards/margins": 0.4829334318637848, "rewards/rejected": -2.8922526836395264, "step": 1800 }, { "epoch": 2.6087739843257363, "grad_norm": 22.99449695923957, "learning_rate": 5.014188861794e-09, "logits/chosen": -2.2212021350860596, "logits/rejected": -2.2196457386016846, "logps/chosen": -1.1851980686187744, "logps/rejected": -1.4349489212036133, "loss": 1.5106, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.370396137237549, "rewards/margins": 0.4995017945766449, "rewards/rejected": -2.8698978424072266, "step": 1810 }, { "epoch": 2.6231871002612377, "grad_norm": 21.158930640881984, "learning_rate": 4.654105049314744e-09, "logits/chosen": -2.2831361293792725, "logits/rejected": -2.2893922328948975, "logps/chosen": -1.1905128955841064, "logps/rejected": -1.392458200454712, "loss": 1.5859, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.381025791168213, "rewards/margins": 0.40389055013656616, "rewards/rejected": -2.784916400909424, "step": 1820 }, { "epoch": 2.637600216196739, "grad_norm": 24.606900180349317, "learning_rate": 4.3068056248801496e-09, "logits/chosen": -2.260871410369873, "logits/rejected": -2.2557454109191895, "logps/chosen": -1.1808732748031616, "logps/rejected": -1.4025046825408936, "loss": 1.5385, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3617465496063232, "rewards/margins": 0.44326257705688477, "rewards/rejected": -2.805009365081787, "step": 1830 }, { "epoch": 2.65201333213224, "grad_norm": 21.13626030836664, "learning_rate": 3.972388502769225e-09, "logits/chosen": -2.298476457595825, "logits/rejected": -2.2920804023742676, "logps/chosen": -1.2038078308105469, "logps/rejected": -1.3969953060150146, "loss": 1.5752, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.4076156616210938, "rewards/margins": 0.38637492060661316, "rewards/rejected": -2.7939906120300293, "step": 1840 }, { "epoch": 2.6664264480677415, "grad_norm": 23.476816797872775, "learning_rate": 3.650947965347817e-09, "logits/chosen": -2.2797365188598633, "logits/rejected": -2.2747490406036377, "logps/chosen": -1.1963701248168945, "logps/rejected": -1.4419893026351929, "loss": 1.4998, "rewards/accuracies": 0.6875, "rewards/chosen": -2.392740249633789, "rewards/margins": 0.4912383556365967, "rewards/rejected": -2.8839786052703857, "step": 1850 }, { "epoch": 2.680839564003243, "grad_norm": 28.256187183267656, "learning_rate": 3.342574636487583e-09, "logits/chosen": -2.3183302879333496, "logits/rejected": -2.3189597129821777, "logps/chosen": -1.2193528413772583, "logps/rejected": -1.4082263708114624, "loss": 1.5796, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.4387056827545166, "rewards/margins": 0.37774714827537537, "rewards/rejected": -2.816452741622925, "step": 1860 }, { "epoch": 2.6952526799387444, "grad_norm": 19.684709175702448, "learning_rate": 3.0473554560163207e-09, "logits/chosen": -2.254714012145996, "logits/rejected": -2.2444214820861816, "logps/chosen": -1.1542867422103882, "logps/rejected": -1.377029538154602, "loss": 1.5415, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.3085734844207764, "rewards/margins": 0.44548529386520386, "rewards/rejected": -2.754059076309204, "step": 1870 }, { "epoch": 2.709665795874246, "grad_norm": 24.08431048004274, "learning_rate": 2.7653736552070207e-09, "logits/chosen": -2.2782135009765625, "logits/rejected": -2.276923179626465, "logps/chosen": -1.2209516763687134, "logps/rejected": -1.4482202529907227, "loss": 1.538, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -2.4419033527374268, "rewards/margins": 0.45453739166259766, "rewards/rejected": -2.8964405059814453, "step": 1880 }, { "epoch": 2.724078911809747, "grad_norm": 17.160087536859805, "learning_rate": 2.496708733312419e-09, "logits/chosen": -2.250776767730713, "logits/rejected": -2.253812551498413, "logps/chosen": -1.1638703346252441, "logps/rejected": -1.3798881769180298, "loss": 1.5446, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.3277406692504883, "rewards/margins": 0.43203577399253845, "rewards/rejected": -2.7597763538360596, "step": 1890 }, { "epoch": 2.7384920277452482, "grad_norm": 18.244927534685523, "learning_rate": 2.241436435151717e-09, "logits/chosen": -2.2549357414245605, "logits/rejected": -2.247612714767456, "logps/chosen": -1.1582852602005005, "logps/rejected": -1.3766599893569946, "loss": 1.5527, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.316570520401001, "rewards/margins": 0.4367493987083435, "rewards/rejected": -2.7533199787139893, "step": 1900 }, { "epoch": 2.7529051436807492, "grad_norm": 16.475242116483138, "learning_rate": 1.9996287297558866e-09, "logits/chosen": -2.241720199584961, "logits/rejected": -2.246184825897217, "logps/chosen": -1.1753349304199219, "logps/rejected": -1.398506760597229, "loss": 1.5477, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.3506698608398438, "rewards/margins": 0.44634366035461426, "rewards/rejected": -2.797013521194458, "step": 1910 }, { "epoch": 2.7673182596162507, "grad_norm": 18.756801068057744, "learning_rate": 1.7713537900772957e-09, "logits/chosen": -2.2873311042785645, "logits/rejected": -2.285597562789917, "logps/chosen": -1.2065943479537964, "logps/rejected": -1.3886728286743164, "loss": 1.587, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.4131886959075928, "rewards/margins": 0.3641572594642639, "rewards/rejected": -2.777345657348633, "step": 1920 }, { "epoch": 2.781731375551752, "grad_norm": 18.75587536733683, "learning_rate": 1.5566759737697998e-09, "logits/chosen": -2.252821922302246, "logits/rejected": -2.252249240875244, "logps/chosen": -1.1472349166870117, "logps/rejected": -1.3485777378082275, "loss": 1.5582, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.2944698333740234, "rewards/margins": 0.40268588066101074, "rewards/rejected": -2.697155475616455, "step": 1930 }, { "epoch": 2.7961444914872535, "grad_norm": 18.83657032008189, "learning_rate": 1.3556558050442425e-09, "logits/chosen": -2.27396821975708, "logits/rejected": -2.266453504562378, "logps/chosen": -1.153480052947998, "logps/rejected": -1.3870432376861572, "loss": 1.5257, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -2.306960105895996, "rewards/margins": 0.46712619066238403, "rewards/rejected": -2.7740864753723145, "step": 1940 }, { "epoch": 2.810557607422755, "grad_norm": 19.634521530754597, "learning_rate": 1.1683499576049583e-09, "logits/chosen": -2.2552907466888428, "logits/rejected": -2.255131959915161, "logps/chosen": -1.1509824991226196, "logps/rejected": -1.3625354766845703, "loss": 1.543, "rewards/accuracies": 0.640625, "rewards/chosen": -2.3019649982452393, "rewards/margins": 0.4231061041355133, "rewards/rejected": -2.7250709533691406, "step": 1950 }, { "epoch": 2.824970723358256, "grad_norm": 22.498945774440706, "learning_rate": 9.948112386716167e-10, "logits/chosen": -2.2837812900543213, "logits/rejected": -2.2745299339294434, "logps/chosen": -1.220226764678955, "logps/rejected": -1.4314284324645996, "loss": 1.569, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -2.44045352935791, "rewards/margins": 0.4224032759666443, "rewards/rejected": -2.862856864929199, "step": 1960 }, { "epoch": 2.8393838392937574, "grad_norm": 24.02219360016628, "learning_rate": 8.350885740913416e-10, "logits/chosen": -2.224419116973877, "logits/rejected": -2.2149837017059326, "logps/chosen": -1.1606011390686035, "logps/rejected": -1.3387001752853394, "loss": 1.6133, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -2.321202278137207, "rewards/margins": 0.3561980724334717, "rewards/rejected": -2.6774003505706787, "step": 1970 }, { "epoch": 2.8537969552292584, "grad_norm": 19.84204643186706, "learning_rate": 6.89226994544978e-10, "logits/chosen": -2.223024845123291, "logits/rejected": -2.2192695140838623, "logps/chosen": -1.1890778541564941, "logps/rejected": -1.3559348583221436, "loss": 1.6171, "rewards/accuracies": 0.5625, "rewards/chosen": -2.3781557083129883, "rewards/margins": 0.33371374011039734, "rewards/rejected": -2.711869716644287, "step": 1980 }, { "epoch": 2.86821007116476, "grad_norm": 20.65249363397335, "learning_rate": 5.572676228516038e-10, "logits/chosen": -2.255366802215576, "logits/rejected": -2.2476673126220703, "logps/chosen": -1.1339585781097412, "logps/rejected": -1.3980591297149658, "loss": 1.4961, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.2679171562194824, "rewards/margins": 0.5282012224197388, "rewards/rejected": -2.7961182594299316, "step": 1990 }, { "epoch": 2.882623187100261, "grad_norm": 21.582370970938786, "learning_rate": 4.3924766237473656e-10, "logits/chosen": -2.2555174827575684, "logits/rejected": -2.247621536254883, "logps/chosen": -1.1424418687820435, "logps/rejected": -1.3766818046569824, "loss": 1.531, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2.284883737564087, "rewards/margins": 0.4684801697731018, "rewards/rejected": -2.753363609313965, "step": 2000 }, { "epoch": 2.8970363030357626, "grad_norm": 25.67561175147071, "learning_rate": 3.35200386533574e-10, "logits/chosen": -2.2250311374664307, "logits/rejected": -2.2280611991882324, "logps/chosen": -1.181894063949585, "logps/rejected": -1.3828670978546143, "loss": 1.565, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2.36378812789917, "rewards/margins": 0.40194636583328247, "rewards/rejected": -2.7657341957092285, "step": 2010 }, { "epoch": 2.911449418971264, "grad_norm": 21.660548030642744, "learning_rate": 2.4515512942220874e-10, "logits/chosen": -2.27579927444458, "logits/rejected": -2.2686378955841064, "logps/chosen": -1.2043073177337646, "logps/rejected": -1.3992283344268799, "loss": 1.5841, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -2.4086146354675293, "rewards/margins": 0.38984209299087524, "rewards/rejected": -2.7984566688537598, "step": 2020 }, { "epoch": 2.925862534906765, "grad_norm": 24.50163023857697, "learning_rate": 1.691372775394717e-10, "logits/chosen": -2.2493457794189453, "logits/rejected": -2.251462936401367, "logps/chosen": -1.2009613513946533, "logps/rejected": -1.3668345212936401, "loss": 1.6163, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.4019227027893066, "rewards/margins": 0.33174630999565125, "rewards/rejected": -2.7336690425872803, "step": 2030 }, { "epoch": 2.9402756508422665, "grad_norm": 23.84173235916362, "learning_rate": 1.0716826263165724e-10, "logits/chosen": -2.291029691696167, "logits/rejected": -2.289228916168213, "logps/chosen": -1.17218816280365, "logps/rejected": -1.440246343612671, "loss": 1.4873, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3443763256073, "rewards/margins": 0.5361161828041077, "rewards/rejected": -2.880492687225342, "step": 2040 }, { "epoch": 2.954688766777768, "grad_norm": 21.3140792744408, "learning_rate": 5.926555565031743e-11, "logits/chosen": -2.2876641750335693, "logits/rejected": -2.289773464202881, "logps/chosen": -1.216587781906128, "logps/rejected": -1.4193012714385986, "loss": 1.5845, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.433175563812256, "rewards/margins": 0.40542715787887573, "rewards/rejected": -2.8386025428771973, "step": 2050 }, { "epoch": 2.969101882713269, "grad_norm": 22.106407972159015, "learning_rate": 2.544266182662458e-11, "logits/chosen": -2.2547993659973145, "logits/rejected": -2.2469217777252197, "logps/chosen": -1.1249706745147705, "logps/rejected": -1.3703702688217163, "loss": 1.515, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.249941349029541, "rewards/margins": 0.49079880118370056, "rewards/rejected": -2.7407405376434326, "step": 2060 }, { "epoch": 2.9835149986487703, "grad_norm": 22.35894660462506, "learning_rate": 5.709116863872321e-12, "logits/chosen": -2.2706878185272217, "logits/rejected": -2.2676730155944824, "logps/chosen": -1.1365437507629395, "logps/rejected": -1.3011773824691772, "loss": 1.6093, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -2.273087501525879, "rewards/margins": 0.3292676508426666, "rewards/rejected": -2.6023547649383545, "step": 2070 }, { "epoch": 2.9964868029907215, "step": 2079, "total_flos": 0.0, "train_loss": 1.6015657603367983, "train_runtime": 23310.5572, "train_samples_per_second": 2.857, "train_steps_per_second": 0.089 } ], "logging_steps": 10, "max_steps": 2079, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }