{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 13.878619431257832, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.07001040130853653, "logits/rejected": 0.13609513640403748, "logps/chosen": -1.7158489227294922, "logps/rejected": -1.8893795013427734, "loss": 1.7158, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7158489227294922, "rewards/margins": 0.17353056371212006, "rewards/rejected": -1.8893795013427734, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 27.21486316622042, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.009137087501585484, "logits/rejected": 0.1292596012353897, "logps/chosen": -1.8023427724838257, "logps/rejected": -1.8460710048675537, "loss": 1.8023, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8023427724838257, "rewards/margins": 0.043728068470954895, "rewards/rejected": -1.8460710048675537, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 24.84790962654373, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.022596338763833046, "logits/rejected": 0.0756080150604248, "logps/chosen": -1.6345233917236328, "logps/rejected": -1.7651408910751343, "loss": 1.6345, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6345233917236328, "rewards/margins": 0.13061748445034027, "rewards/rejected": -1.7651408910751343, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 15.773808874151177, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.02712991274893284, "logits/rejected": 0.05864949896931648, "logps/chosen": -1.724056601524353, "logps/rejected": -1.804459810256958, "loss": 1.7241, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.724056601524353, "rewards/margins": 0.08040325343608856, "rewards/rejected": -1.804459810256958, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 33.704453308454546, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.054397739470005035, "logits/rejected": 0.029579197987914085, "logps/chosen": -1.8679015636444092, "logps/rejected": -1.7782576084136963, "loss": 1.8679, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -1.8679015636444092, "rewards/margins": -0.08964408189058304, "rewards/rejected": -1.7782576084136963, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 29.531424569278162, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09195728600025177, "logits/rejected": -0.0005215287092141807, "logps/chosen": -1.9071285724639893, "logps/rejected": -1.8317142724990845, "loss": 1.9071, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9071285724639893, "rewards/margins": -0.07541424036026001, "rewards/rejected": -1.8317142724990845, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 22.763321978219764, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.0495879203081131, "logits/rejected": 0.11133874952793121, "logps/chosen": -1.846724510192871, "logps/rejected": -1.9960410594940186, "loss": 1.8467, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.846724510192871, "rewards/margins": 0.1493167132139206, "rewards/rejected": -1.9960410594940186, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 28.61236808198426, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04060114175081253, "logits/rejected": 0.21388216316699982, "logps/chosen": -1.8800036907196045, "logps/rejected": -1.7416588068008423, "loss": 1.88, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8800036907196045, "rewards/margins": -0.13834482431411743, "rewards/rejected": -1.7416588068008423, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 26.44962322686568, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.013068842701613903, "logits/rejected": 0.20770780742168427, "logps/chosen": -1.8352298736572266, "logps/rejected": -1.8686631917953491, "loss": 1.8352, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8352298736572266, "rewards/margins": 0.03343340754508972, "rewards/rejected": -1.8686631917953491, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 30.92549405893149, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.051671575754880905, "logits/rejected": 0.09738025069236755, "logps/chosen": -1.891575574874878, "logps/rejected": -1.7738415002822876, "loss": 1.8916, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.891575574874878, "rewards/margins": -0.11773432791233063, "rewards/rejected": -1.7738415002822876, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 24.85417189744758, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.09872293472290039, "logits/rejected": 0.12284208834171295, "logps/chosen": -1.8228514194488525, "logps/rejected": -1.8571574687957764, "loss": 1.8229, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.8228514194488525, "rewards/margins": 0.03430619835853577, "rewards/rejected": -1.8571574687957764, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 28.378252495511468, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.07762618362903595, "logits/rejected": 0.11175026744604111, "logps/chosen": -1.7786842584609985, "logps/rejected": -1.8833481073379517, "loss": 1.7787, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7786842584609985, "rewards/margins": 0.1046638935804367, "rewards/rejected": -1.8833481073379517, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 27.257424431426816, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.017085248604416847, "logits/rejected": 0.13110998272895813, "logps/chosen": -1.627202033996582, "logps/rejected": -1.755152940750122, "loss": 1.6272, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.627202033996582, "rewards/margins": 0.1279507726430893, "rewards/rejected": -1.755152940750122, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 29.339017873258342, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06677906960248947, "logits/rejected": 0.08591251075267792, "logps/chosen": -1.7506344318389893, "logps/rejected": -1.7944780588150024, "loss": 1.7506, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7506344318389893, "rewards/margins": 0.0438438318669796, "rewards/rejected": -1.7944780588150024, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 22.83918074349354, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.0466216616332531, "logits/rejected": 0.14096233248710632, "logps/chosen": -1.7427504062652588, "logps/rejected": -1.9963117837905884, "loss": 1.7428, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7427504062652588, "rewards/margins": 0.2535615563392639, "rewards/rejected": -1.9963117837905884, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 21.624624293226333, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.023247603327035904, "logits/rejected": 0.07852843403816223, "logps/chosen": -1.6767879724502563, "logps/rejected": -1.711572289466858, "loss": 1.6768, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6767879724502563, "rewards/margins": 0.03478424251079559, "rewards/rejected": -1.711572289466858, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 16.975795545348795, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.1615150421857834, "logits/rejected": 0.07920673489570618, "logps/chosen": -1.7423622608184814, "logps/rejected": -1.9086110591888428, "loss": 1.7424, "rewards/accuracies": 0.5, "rewards/chosen": -1.7423622608184814, "rewards/margins": 0.16624853014945984, "rewards/rejected": -1.9086110591888428, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 27.889940428608586, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.08420085161924362, "logits/rejected": 0.043289147317409515, "logps/chosen": -1.6908982992172241, "logps/rejected": -1.7311828136444092, "loss": 1.6909, "rewards/accuracies": 0.46875, "rewards/chosen": -1.6908982992172241, "rewards/margins": 0.040284596383571625, "rewards/rejected": -1.7311828136444092, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 33.404609531784246, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.08638016134500504, "logits/rejected": 0.05936416983604431, "logps/chosen": -1.7255756855010986, "logps/rejected": -1.8513917922973633, "loss": 1.7256, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7255756855010986, "rewards/margins": 0.12581615149974823, "rewards/rejected": -1.8513917922973633, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 16.32745074601894, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.04187163710594177, "logits/rejected": 0.020019959658384323, "logps/chosen": -1.5966705083847046, "logps/rejected": -1.7010024785995483, "loss": 1.5967, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5966705083847046, "rewards/margins": 0.10433206707239151, "rewards/rejected": -1.7010024785995483, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 22.170297221162198, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.0537613108754158, "logits/rejected": 0.07994085550308228, "logps/chosen": -1.5204919576644897, "logps/rejected": -1.6888647079467773, "loss": 1.5205, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5204919576644897, "rewards/margins": 0.16837282478809357, "rewards/rejected": -1.6888647079467773, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 22.96507487207059, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.002391559537500143, "logits/rejected": 0.09733996540307999, "logps/chosen": -1.5090798139572144, "logps/rejected": -1.5708321332931519, "loss": 1.5091, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5090798139572144, "rewards/margins": 0.06175212189555168, "rewards/rejected": -1.5708321332931519, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 24.149468650328476, "learning_rate": 2.049910873440285e-07, "logits/chosen": -0.0042781708762049675, "logits/rejected": 0.1957433521747589, "logps/chosen": -1.5111892223358154, "logps/rejected": -1.7451274394989014, "loss": 1.5112, "rewards/accuracies": 0.625, "rewards/chosen": -1.5111892223358154, "rewards/margins": 0.23393850028514862, "rewards/rejected": -1.7451274394989014, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 26.907148028069674, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.09109218418598175, "logits/rejected": 0.0767279714345932, "logps/chosen": -1.5413243770599365, "logps/rejected": -1.6428210735321045, "loss": 1.5413, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5413243770599365, "rewards/margins": 0.10149665176868439, "rewards/rejected": -1.6428210735321045, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 14.100959118196057, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.09779004007577896, "logits/rejected": 0.030154284089803696, "logps/chosen": -1.4948018789291382, "logps/rejected": -1.470332384109497, "loss": 1.4948, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.4948018789291382, "rewards/margins": -0.0244695246219635, "rewards/rejected": -1.470332384109497, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 32.61977982475718, "learning_rate": 2.31729055258467e-07, "logits/chosen": -0.0008839584770612419, "logits/rejected": 0.12865932285785675, "logps/chosen": -1.5060017108917236, "logps/rejected": -1.6161201000213623, "loss": 1.506, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5060017108917236, "rewards/margins": 0.11011849343776703, "rewards/rejected": -1.6161201000213623, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 17.752136176876174, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.09724407643079758, "logits/rejected": 0.012994533404707909, "logps/chosen": -1.4998667240142822, "logps/rejected": -1.551896333694458, "loss": 1.4999, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4998667240142822, "rewards/margins": 0.052029628306627274, "rewards/rejected": -1.551896333694458, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 10.232288856346967, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.07435207068920135, "logits/rejected": 0.07958526909351349, "logps/chosen": -1.45034658908844, "logps/rejected": -1.5068769454956055, "loss": 1.4503, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.45034658908844, "rewards/margins": 0.05653046816587448, "rewards/rejected": -1.5068769454956055, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 14.48204237469257, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.09124326705932617, "logits/rejected": 0.048250533640384674, "logps/chosen": -1.35175621509552, "logps/rejected": -1.4458637237548828, "loss": 1.3518, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.35175621509552, "rewards/margins": 0.09410758316516876, "rewards/rejected": -1.4458637237548828, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 9.003692801675687, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.13457007706165314, "logits/rejected": 0.008795162662863731, "logps/chosen": -1.2863819599151611, "logps/rejected": -1.297554612159729, "loss": 1.2864, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2863819599151611, "rewards/margins": 0.011172810569405556, "rewards/rejected": -1.297554612159729, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 10.344764671747717, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.13015718758106232, "logits/rejected": -0.08562298119068146, "logps/chosen": -1.3001400232315063, "logps/rejected": -1.4157555103302002, "loss": 1.3001, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3001400232315063, "rewards/margins": 0.11561550945043564, "rewards/rejected": -1.4157555103302002, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 7.580098376272968, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.20770728588104248, "logits/rejected": -0.08208228647708893, "logps/chosen": -1.3863835334777832, "logps/rejected": -1.3667562007904053, "loss": 1.3864, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3863835334777832, "rewards/margins": -0.019627157598733902, "rewards/rejected": -1.3667562007904053, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.586724600835102, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.0787295401096344, "logits/rejected": 0.08614591509103775, "logps/chosen": -1.3054903745651245, "logps/rejected": -1.3758901357650757, "loss": 1.3055, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3054903745651245, "rewards/margins": 0.07039965689182281, "rewards/rejected": -1.3758901357650757, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 9.755375757906338, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.12472772598266602, "logits/rejected": -0.07689608633518219, "logps/chosen": -1.4194749593734741, "logps/rejected": -1.4888038635253906, "loss": 1.4195, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.4194749593734741, "rewards/margins": 0.06932888180017471, "rewards/rejected": -1.4888038635253906, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 9.234151372703979, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.00599729735404253, "logits/rejected": 0.0023250863887369633, "logps/chosen": -1.3066940307617188, "logps/rejected": -1.39181649684906, "loss": 1.3067, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3066940307617188, "rewards/margins": 0.08512246608734131, "rewards/rejected": -1.39181649684906, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 7.4389304235931055, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.02657126449048519, "logits/rejected": -0.02947419509291649, "logps/chosen": -1.3254764080047607, "logps/rejected": -1.5280386209487915, "loss": 1.3255, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3254764080047607, "rewards/margins": 0.20256224274635315, "rewards/rejected": -1.5280386209487915, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 7.842372011316667, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.18316921591758728, "logits/rejected": -0.10460956394672394, "logps/chosen": -1.3219475746154785, "logps/rejected": -1.367650032043457, "loss": 1.3219, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3219475746154785, "rewards/margins": 0.045702509582042694, "rewards/rejected": -1.367650032043457, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 8.127078705252696, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.09931747615337372, "logits/rejected": 0.007574987597763538, "logps/chosen": -1.239583134651184, "logps/rejected": -1.371794581413269, "loss": 1.2396, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.239583134651184, "rewards/margins": 0.13221141695976257, "rewards/rejected": -1.371794581413269, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 8.515283864369795, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.0029420270584523678, "logits/rejected": 0.14303915202617645, "logps/chosen": -1.2321486473083496, "logps/rejected": -1.3921966552734375, "loss": 1.2321, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2321486473083496, "rewards/margins": 0.16004805266857147, "rewards/rejected": -1.3921966552734375, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 20.08790277571572, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.08527259528636932, "logits/rejected": 0.04497109353542328, "logps/chosen": -1.3461731672286987, "logps/rejected": -1.3923439979553223, "loss": 1.3462, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3461731672286987, "rewards/margins": 0.04617086052894592, "rewards/rejected": -1.3923439979553223, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 12.395488962135373, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.07998348772525787, "logits/rejected": 0.05175967141985893, "logps/chosen": -1.2560703754425049, "logps/rejected": -1.3336066007614136, "loss": 1.2561, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2560703754425049, "rewards/margins": 0.07753607630729675, "rewards/rejected": -1.3336066007614136, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 9.35596962699534, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.14810451865196228, "logits/rejected": 0.028008803725242615, "logps/chosen": -1.3391255140304565, "logps/rejected": -1.4426857233047485, "loss": 1.3391, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3391255140304565, "rewards/margins": 0.1035601869225502, "rewards/rejected": -1.4426857233047485, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 7.199530938588731, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.18892619013786316, "logits/rejected": 0.04510430246591568, "logps/chosen": -1.3652472496032715, "logps/rejected": -1.4181931018829346, "loss": 1.3652, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3652472496032715, "rewards/margins": 0.05294584482908249, "rewards/rejected": -1.4181931018829346, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 16.837624893531355, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.05079995468258858, "logits/rejected": 0.14220979809761047, "logps/chosen": -1.283282995223999, "logps/rejected": -1.4212286472320557, "loss": 1.2833, "rewards/accuracies": 0.5625, "rewards/chosen": -1.283282995223999, "rewards/margins": 0.13794557750225067, "rewards/rejected": -1.4212286472320557, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.80716178125282, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.11658213287591934, "logits/rejected": 0.034660451114177704, "logps/chosen": -1.2966222763061523, "logps/rejected": -1.4203050136566162, "loss": 1.2966, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2966222763061523, "rewards/margins": 0.12368263304233551, "rewards/rejected": -1.4203050136566162, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 7.0168902683925864, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.0188574381172657, "logits/rejected": 0.051725804805755615, "logps/chosen": -1.291809320449829, "logps/rejected": -1.4451279640197754, "loss": 1.2918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.291809320449829, "rewards/margins": 0.15331871807575226, "rewards/rejected": -1.4451279640197754, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 8.038348148822202, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.009397977963089943, "logits/rejected": 0.11019904911518097, "logps/chosen": -1.266922116279602, "logps/rejected": -1.42803955078125, "loss": 1.2669, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.266922116279602, "rewards/margins": 0.1611175239086151, "rewards/rejected": -1.42803955078125, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 5.86334839205076, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.019173940643668175, "logits/rejected": 0.09700779616832733, "logps/chosen": -1.2828456163406372, "logps/rejected": -1.4581503868103027, "loss": 1.2828, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2828456163406372, "rewards/margins": 0.17530474066734314, "rewards/rejected": -1.4581503868103027, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 7.8406985544795225, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.032234370708465576, "logits/rejected": 0.14184141159057617, "logps/chosen": -1.3996970653533936, "logps/rejected": -1.421585202217102, "loss": 1.3997, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3996970653533936, "rewards/margins": 0.021888162940740585, "rewards/rejected": -1.421585202217102, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 9.47571655963089, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.06074149161577225, "logits/rejected": 0.09066110849380493, "logps/chosen": -1.2760133743286133, "logps/rejected": -1.3334109783172607, "loss": 1.276, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2760133743286133, "rewards/margins": 0.05739762261509895, "rewards/rejected": -1.3334109783172607, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 7.135975707990223, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.03215126320719719, "logits/rejected": 0.09706703573465347, "logps/chosen": -1.2501457929611206, "logps/rejected": -1.3493174314498901, "loss": 1.2501, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2501457929611206, "rewards/margins": 0.09917166084051132, "rewards/rejected": -1.3493174314498901, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.6165488210644385, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.21242889761924744, "logits/rejected": -0.11244054138660431, "logps/chosen": -1.3337657451629639, "logps/rejected": -1.4932091236114502, "loss": 1.3338, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3337657451629639, "rewards/margins": 0.1594432145357132, "rewards/rejected": -1.4932091236114502, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 9.607347513293794, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.10055886209011078, "logits/rejected": -0.02364879474043846, "logps/chosen": -1.3102421760559082, "logps/rejected": -1.4819821119308472, "loss": 1.3102, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3102421760559082, "rewards/margins": 0.17173996567726135, "rewards/rejected": -1.4819821119308472, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 7.730504018156653, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.07616910338401794, "logits/rejected": 0.043918270617723465, "logps/chosen": -1.3017504215240479, "logps/rejected": -1.3915989398956299, "loss": 1.3018, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3017504215240479, "rewards/margins": 0.08984844386577606, "rewards/rejected": -1.3915989398956299, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 8.07300248093735, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.04150385037064552, "logits/rejected": 0.04815072566270828, "logps/chosen": -1.2487050294876099, "logps/rejected": -1.3970403671264648, "loss": 1.2487, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2487050294876099, "rewards/margins": 0.14833518862724304, "rewards/rejected": -1.3970403671264648, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 9.56774273370295, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.09960125386714935, "logits/rejected": 0.041294507682323456, "logps/chosen": -1.292418122291565, "logps/rejected": -1.3907355070114136, "loss": 1.2924, "rewards/accuracies": 0.59375, "rewards/chosen": -1.292418122291565, "rewards/margins": 0.09831748902797699, "rewards/rejected": -1.3907355070114136, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.997323351610601, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.08608181774616241, "logits/rejected": 0.038638703525066376, "logps/chosen": -1.3334317207336426, "logps/rejected": -1.406348705291748, "loss": 1.3334, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3334317207336426, "rewards/margins": 0.07291682809591293, "rewards/rejected": -1.406348705291748, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.672538501402967, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.11767790466547012, "logits/rejected": 0.1593274623155594, "logps/chosen": -1.3495060205459595, "logps/rejected": -1.4568264484405518, "loss": 1.3495, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3495060205459595, "rewards/margins": 0.10732054710388184, "rewards/rejected": -1.4568264484405518, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 9.885539782287614, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.08746533840894699, "logits/rejected": -0.03501410037279129, "logps/chosen": -1.2485096454620361, "logps/rejected": -1.3823903799057007, "loss": 1.2485, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2485096454620361, "rewards/margins": 0.13388076424598694, "rewards/rejected": -1.3823903799057007, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 7.785675572783253, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.08877870440483093, "logits/rejected": 0.060669220983982086, "logps/chosen": -1.2816146612167358, "logps/rejected": -1.3557677268981934, "loss": 1.2816, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2816146612167358, "rewards/margins": 0.07415294647216797, "rewards/rejected": -1.3557677268981934, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 5.286552266548133, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.0314701572060585, "logits/rejected": 0.03506339713931084, "logps/chosen": -1.3867058753967285, "logps/rejected": -1.3950142860412598, "loss": 1.3867, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3867058753967285, "rewards/margins": 0.008308363147079945, "rewards/rejected": -1.3950142860412598, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.818760333727034, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.2135169953107834, "logits/rejected": -0.13312526047229767, "logps/chosen": -1.35116446018219, "logps/rejected": -1.427929162979126, "loss": 1.3512, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.35116446018219, "rewards/margins": 0.07676468044519424, "rewards/rejected": -1.427929162979126, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 9.038451087787795, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.027707751840353012, "logits/rejected": 0.11673235893249512, "logps/chosen": -1.3438409566879272, "logps/rejected": -1.4826419353485107, "loss": 1.3438, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3438409566879272, "rewards/margins": 0.13880087435245514, "rewards/rejected": -1.4826419353485107, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 5.1703343570133296, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.07698366791009903, "logits/rejected": 0.04437372833490372, "logps/chosen": -1.3046934604644775, "logps/rejected": -1.3523242473602295, "loss": 1.3047, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3046934604644775, "rewards/margins": 0.04763079434633255, "rewards/rejected": -1.3523242473602295, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 8.171510244877455, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.12285922467708588, "logits/rejected": -0.01704871840775013, "logps/chosen": -1.296194076538086, "logps/rejected": -1.5274325609207153, "loss": 1.2962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.296194076538086, "rewards/margins": 0.231238454580307, "rewards/rejected": -1.5274325609207153, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 5.750141675781793, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.036026787012815475, "logits/rejected": 0.10127715021371841, "logps/chosen": -1.3180134296417236, "logps/rejected": -1.4764127731323242, "loss": 1.318, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3180134296417236, "rewards/margins": 0.15839937329292297, "rewards/rejected": -1.4764127731323242, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 10.044519322959712, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.011292120441794395, "logits/rejected": 0.10359932482242584, "logps/chosen": -1.3237205743789673, "logps/rejected": -1.3530371189117432, "loss": 1.3237, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3237205743789673, "rewards/margins": 0.029316654428839684, "rewards/rejected": -1.3530371189117432, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 8.97365035698119, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.04798301309347153, "logits/rejected": 0.08005028963088989, "logps/chosen": -1.379374623298645, "logps/rejected": -1.4578994512557983, "loss": 1.3794, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.379374623298645, "rewards/margins": 0.07852499186992645, "rewards/rejected": -1.4578994512557983, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 7.4591238826839295, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.03981132060289383, "logits/rejected": 0.06436873227357864, "logps/chosen": -1.2834408283233643, "logps/rejected": -1.4170081615447998, "loss": 1.2834, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2834408283233643, "rewards/margins": 0.13356713950634003, "rewards/rejected": -1.4170081615447998, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 8.472102564452868, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.009375160560011864, "logits/rejected": 0.06827092170715332, "logps/chosen": -1.2641198635101318, "logps/rejected": -1.3896639347076416, "loss": 1.2641, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2641198635101318, "rewards/margins": 0.1255439817905426, "rewards/rejected": -1.3896639347076416, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 7.751332211265485, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.06718407571315765, "logits/rejected": 0.1494477391242981, "logps/chosen": -1.3696873188018799, "logps/rejected": -1.396683931350708, "loss": 1.3697, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3696873188018799, "rewards/margins": 0.026996690779924393, "rewards/rejected": -1.396683931350708, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 8.922202868307702, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.04958086460828781, "logits/rejected": 0.023599188774824142, "logps/chosen": -1.29703688621521, "logps/rejected": -1.4158934354782104, "loss": 1.297, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.29703688621521, "rewards/margins": 0.11885659396648407, "rewards/rejected": -1.4158934354782104, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 8.099099436304067, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.010249042883515358, "logits/rejected": 0.08297108113765717, "logps/chosen": -1.2771332263946533, "logps/rejected": -1.3815836906433105, "loss": 1.2771, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2771332263946533, "rewards/margins": 0.1044505387544632, "rewards/rejected": -1.3815836906433105, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 5.424396477605704, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.023401916027069092, "logits/rejected": 0.055131226778030396, "logps/chosen": -1.2734225988388062, "logps/rejected": -1.3122045993804932, "loss": 1.2734, "rewards/accuracies": 0.5, "rewards/chosen": -1.2734225988388062, "rewards/margins": 0.03878200799226761, "rewards/rejected": -1.3122045993804932, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 5.907088923266604, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.046198517084121704, "logits/rejected": 0.10271338373422623, "logps/chosen": -1.2541393041610718, "logps/rejected": -1.3884332180023193, "loss": 1.2541, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2541393041610718, "rewards/margins": 0.13429374992847443, "rewards/rejected": -1.3884332180023193, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 7.220333632566288, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.019535917788743973, "logits/rejected": 0.05394468456506729, "logps/chosen": -1.27167809009552, "logps/rejected": -1.4252536296844482, "loss": 1.2717, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.27167809009552, "rewards/margins": 0.1535756140947342, "rewards/rejected": -1.4252536296844482, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.7636781884345645, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.017425742000341415, "logits/rejected": 0.09035812318325043, "logps/chosen": -1.3767086267471313, "logps/rejected": -1.3592731952667236, "loss": 1.3767, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3767086267471313, "rewards/margins": -0.017435405403375626, "rewards/rejected": -1.3592731952667236, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.978702188513296, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.049974922090768814, "logits/rejected": 0.20324969291687012, "logps/chosen": -1.3569238185882568, "logps/rejected": -1.4154407978057861, "loss": 1.3569, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3569238185882568, "rewards/margins": 0.05851711705327034, "rewards/rejected": -1.4154407978057861, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.3354938369870055, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.05190610885620117, "logits/rejected": 0.09368561208248138, "logps/chosen": -1.307032585144043, "logps/rejected": -1.313922643661499, "loss": 1.307, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.307032585144043, "rewards/margins": 0.006890204735100269, "rewards/rejected": -1.313922643661499, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 5.53785484456203, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.07213907688856125, "logits/rejected": 0.15816062688827515, "logps/chosen": -1.2903053760528564, "logps/rejected": -1.3748008012771606, "loss": 1.2903, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2903053760528564, "rewards/margins": 0.08449523150920868, "rewards/rejected": -1.3748008012771606, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2656719982624054, "eval_logits/rejected": 0.34780392050743103, "eval_logps/chosen": -1.323078989982605, "eval_logps/rejected": -1.4418046474456787, "eval_loss": 1.3233901262283325, "eval_rewards/accuracies": 0.5556379556655884, "eval_rewards/chosen": -1.323078989982605, "eval_rewards/margins": 0.11872559040784836, "eval_rewards/rejected": -1.4418046474456787, "eval_runtime": 41.3795, "eval_samples_per_second": 32.504, "eval_steps_per_second": 8.144, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 7.869404472594379, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.020160216838121414, "logits/rejected": 0.10925710201263428, "logps/chosen": -1.2873663902282715, "logps/rejected": -1.35056471824646, "loss": 1.2874, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2873663902282715, "rewards/margins": 0.06319825351238251, "rewards/rejected": -1.35056471824646, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 6.292974826585783, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.03363238647580147, "logits/rejected": 0.15326282382011414, "logps/chosen": -1.2625261545181274, "logps/rejected": -1.3448641300201416, "loss": 1.2625, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2625261545181274, "rewards/margins": 0.08233799040317535, "rewards/rejected": -1.3448641300201416, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.325313526292669, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.0031976611353456974, "logits/rejected": 0.03445184975862503, "logps/chosen": -1.262878179550171, "logps/rejected": -1.41534423828125, "loss": 1.2629, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.262878179550171, "rewards/margins": 0.15246590971946716, "rewards/rejected": -1.41534423828125, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.69396293063995, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.005808958318084478, "logits/rejected": 0.17047980427742004, "logps/chosen": -1.2496130466461182, "logps/rejected": -1.3354417085647583, "loss": 1.2496, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2496130466461182, "rewards/margins": 0.08582862466573715, "rewards/rejected": -1.3354417085647583, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 5.43449925274313, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.0500650517642498, "logits/rejected": 0.13778111338615417, "logps/chosen": -1.2890360355377197, "logps/rejected": -1.435603380203247, "loss": 1.289, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2890360355377197, "rewards/margins": 0.14656727015972137, "rewards/rejected": -1.435603380203247, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 6.418209628990663, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.03733297437429428, "logits/rejected": 0.15489131212234497, "logps/chosen": -1.3268593549728394, "logps/rejected": -1.4346611499786377, "loss": 1.3269, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3268593549728394, "rewards/margins": 0.10780169814825058, "rewards/rejected": -1.4346611499786377, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 8.420984298948184, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.017317188903689384, "logits/rejected": 0.10118599236011505, "logps/chosen": -1.1891433000564575, "logps/rejected": -1.3179880380630493, "loss": 1.1891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1891433000564575, "rewards/margins": 0.12884488701820374, "rewards/rejected": -1.3179880380630493, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.70183067340302, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.02464236132800579, "logits/rejected": 0.10591566562652588, "logps/chosen": -1.269000768661499, "logps/rejected": -1.350835919380188, "loss": 1.269, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.269000768661499, "rewards/margins": 0.08183509856462479, "rewards/rejected": -1.350835919380188, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 6.424715352044819, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.031163115054368973, "logits/rejected": 0.07137882709503174, "logps/chosen": -1.295470952987671, "logps/rejected": -1.4149221181869507, "loss": 1.2955, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.295470952987671, "rewards/margins": 0.11945096403360367, "rewards/rejected": -1.4149221181869507, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 9.249090597724761, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.022834043949842453, "logits/rejected": 0.1422138512134552, "logps/chosen": -1.2759243249893188, "logps/rejected": -1.410207986831665, "loss": 1.2759, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2759243249893188, "rewards/margins": 0.134283646941185, "rewards/rejected": -1.410207986831665, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 7.439347098108585, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.03014320135116577, "logits/rejected": 0.11329454183578491, "logps/chosen": -1.2261297702789307, "logps/rejected": -1.3936500549316406, "loss": 1.2261, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2261297702789307, "rewards/margins": 0.16752028465270996, "rewards/rejected": -1.3936500549316406, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 7.239274432329561, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.10470881313085556, "logits/rejected": 0.010493827052414417, "logps/chosen": -1.3513978719711304, "logps/rejected": -1.39378023147583, "loss": 1.3514, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3513978719711304, "rewards/margins": 0.04238252341747284, "rewards/rejected": -1.39378023147583, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 7.270081842041361, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.16636350750923157, "logits/rejected": 0.17583902180194855, "logps/chosen": -1.2512786388397217, "logps/rejected": -1.403834581375122, "loss": 1.2513, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2512786388397217, "rewards/margins": 0.15255601704120636, "rewards/rejected": -1.403834581375122, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 7.6729675784168725, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.18932822346687317, "logits/rejected": 0.13803109526634216, "logps/chosen": -1.2077934741973877, "logps/rejected": -1.3848892450332642, "loss": 1.2078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2077934741973877, "rewards/margins": 0.17709577083587646, "rewards/rejected": -1.3848892450332642, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 5.3341166200045445, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.041425496339797974, "logits/rejected": 0.08869761228561401, "logps/chosen": -1.283899188041687, "logps/rejected": -1.4534528255462646, "loss": 1.2839, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.283899188041687, "rewards/margins": 0.16955341398715973, "rewards/rejected": -1.4534528255462646, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 7.45348737020828, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.031778477132320404, "logits/rejected": 0.15576709806919098, "logps/chosen": -1.2473366260528564, "logps/rejected": -1.306390404701233, "loss": 1.2473, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2473366260528564, "rewards/margins": 0.05905361846089363, "rewards/rejected": -1.306390404701233, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 8.810162055243485, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.018950283527374268, "logits/rejected": 0.0502476692199707, "logps/chosen": -1.340250015258789, "logps/rejected": -1.408266305923462, "loss": 1.3403, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.340250015258789, "rewards/margins": 0.06801621615886688, "rewards/rejected": -1.408266305923462, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 7.527273347608956, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.012976981699466705, "logits/rejected": 0.07755061239004135, "logps/chosen": -1.2819068431854248, "logps/rejected": -1.3406496047973633, "loss": 1.2819, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2819068431854248, "rewards/margins": 0.058742742985486984, "rewards/rejected": -1.3406496047973633, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.05498999711416, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.023644695058465004, "logits/rejected": -0.0061363400891423225, "logps/chosen": -1.287520170211792, "logps/rejected": -1.3920199871063232, "loss": 1.2875, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.287520170211792, "rewards/margins": 0.10449989140033722, "rewards/rejected": -1.3920199871063232, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 6.402865145330051, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.027590449899435043, "logits/rejected": 0.06777093559503555, "logps/chosen": -1.2010070085525513, "logps/rejected": -1.3468819856643677, "loss": 1.201, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2010070085525513, "rewards/margins": 0.1458749771118164, "rewards/rejected": -1.3468819856643677, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 7.127800851575206, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.05198904126882553, "logits/rejected": 0.0747031718492508, "logps/chosen": -1.3191442489624023, "logps/rejected": -1.3558399677276611, "loss": 1.3191, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3191442489624023, "rewards/margins": 0.03669555112719536, "rewards/rejected": -1.3558399677276611, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 7.207754171446192, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.09425203502178192, "logits/rejected": 0.14759503304958344, "logps/chosen": -1.2784950733184814, "logps/rejected": -1.4302576780319214, "loss": 1.2785, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2784950733184814, "rewards/margins": 0.15176253020763397, "rewards/rejected": -1.4302576780319214, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 6.920656317786105, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.044771708548069, "logits/rejected": 0.1312025636434555, "logps/chosen": -1.2204049825668335, "logps/rejected": -1.3617534637451172, "loss": 1.2204, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2204049825668335, "rewards/margins": 0.14134854078292847, "rewards/rejected": -1.3617534637451172, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 5.6470411953580175, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.07736536860466003, "logits/rejected": 0.04998119920492172, "logps/chosen": -1.2639336585998535, "logps/rejected": -1.3488229513168335, "loss": 1.2639, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2639336585998535, "rewards/margins": 0.0848892480134964, "rewards/rejected": -1.3488229513168335, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 10.470102188165205, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.09583423286676407, "logits/rejected": 0.1529451608657837, "logps/chosen": -1.2355114221572876, "logps/rejected": -1.38844895362854, "loss": 1.2355, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2355114221572876, "rewards/margins": 0.15293750166893005, "rewards/rejected": -1.38844895362854, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 4.958975514492293, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.0513826385140419, "logits/rejected": 0.12926051020622253, "logps/chosen": -1.2146055698394775, "logps/rejected": -1.3041155338287354, "loss": 1.2146, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2146055698394775, "rewards/margins": 0.08951012045145035, "rewards/rejected": -1.3041155338287354, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.99394306075575, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.09171239286661148, "logits/rejected": 0.14905306696891785, "logps/chosen": -1.2355177402496338, "logps/rejected": -1.2842830419540405, "loss": 1.2355, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2355177402496338, "rewards/margins": 0.048765480518341064, "rewards/rejected": -1.2842830419540405, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 7.371833883726386, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.02602645754814148, "logits/rejected": 0.0875629335641861, "logps/chosen": -1.3601830005645752, "logps/rejected": -1.403725028038025, "loss": 1.3602, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3601830005645752, "rewards/margins": 0.04354212433099747, "rewards/rejected": -1.403725028038025, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.315279026695322, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.08252374827861786, "logits/rejected": 0.1062747985124588, "logps/chosen": -1.2578928470611572, "logps/rejected": -1.3557822704315186, "loss": 1.2579, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2578928470611572, "rewards/margins": 0.09788928925991058, "rewards/rejected": -1.3557822704315186, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 6.912006448796063, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.03324595466256142, "logits/rejected": 0.09162168204784393, "logps/chosen": -1.260923981666565, "logps/rejected": -1.3847630023956299, "loss": 1.2609, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.260923981666565, "rewards/margins": 0.12383897602558136, "rewards/rejected": -1.3847630023956299, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 7.93295027568227, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.07907158881425858, "logits/rejected": 0.0375320203602314, "logps/chosen": -1.3334572315216064, "logps/rejected": -1.3852072954177856, "loss": 1.3335, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3334572315216064, "rewards/margins": 0.05175017565488815, "rewards/rejected": -1.3852072954177856, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 5.47174825389886, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.040932320058345795, "logits/rejected": 0.04730647802352905, "logps/chosen": -1.1879616975784302, "logps/rejected": -1.3087254762649536, "loss": 1.188, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1879616975784302, "rewards/margins": 0.12076383829116821, "rewards/rejected": -1.3087254762649536, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 5.948275526831955, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.017047982662916183, "logits/rejected": 0.05929508060216904, "logps/chosen": -1.3045212030410767, "logps/rejected": -1.4134423732757568, "loss": 1.3045, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3045212030410767, "rewards/margins": 0.10892124474048615, "rewards/rejected": -1.4134423732757568, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 7.408433986773282, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.044436030089855194, "logits/rejected": 0.1585444211959839, "logps/chosen": -1.2839395999908447, "logps/rejected": -1.3479276895523071, "loss": 1.2839, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2839395999908447, "rewards/margins": 0.06398816406726837, "rewards/rejected": -1.3479276895523071, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.388066519771792, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.03708745911717415, "logits/rejected": 0.01412633340805769, "logps/chosen": -1.216328740119934, "logps/rejected": -1.3118376731872559, "loss": 1.2163, "rewards/accuracies": 0.5625, "rewards/chosen": -1.216328740119934, "rewards/margins": 0.09550894051790237, "rewards/rejected": -1.3118376731872559, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 6.522388890201525, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.03318170830607414, "logits/rejected": 0.10411947965621948, "logps/chosen": -1.2099888324737549, "logps/rejected": -1.3638687133789062, "loss": 1.21, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2099888324737549, "rewards/margins": 0.15387986600399017, "rewards/rejected": -1.3638687133789062, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 6.748949249192442, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.07577715814113617, "logits/rejected": 0.008648047223687172, "logps/chosen": -1.345796823501587, "logps/rejected": -1.3773592710494995, "loss": 1.3458, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.345796823501587, "rewards/margins": 0.03156254068017006, "rewards/rejected": -1.3773592710494995, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 7.388739524830364, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.028050964698195457, "logits/rejected": 0.08810295164585114, "logps/chosen": -1.3278844356536865, "logps/rejected": -1.346457839012146, "loss": 1.3279, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3278844356536865, "rewards/margins": 0.018573403358459473, "rewards/rejected": -1.346457839012146, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.193594213206373, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.011766708455979824, "logits/rejected": 0.12563146650791168, "logps/chosen": -1.258927822113037, "logps/rejected": -1.2966742515563965, "loss": 1.2589, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.258927822113037, "rewards/margins": 0.03774638846516609, "rewards/rejected": -1.2966742515563965, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.224951723724826, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.05751261115074158, "logits/rejected": 0.07072456926107407, "logps/chosen": -1.251206636428833, "logps/rejected": -1.2542259693145752, "loss": 1.2512, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.251206636428833, "rewards/margins": 0.003019398543983698, "rewards/rejected": -1.2542259693145752, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 6.481833870518892, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.08060504496097565, "logits/rejected": -0.00114238855894655, "logps/chosen": -1.3060376644134521, "logps/rejected": -1.4348504543304443, "loss": 1.306, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3060376644134521, "rewards/margins": 0.12881268560886383, "rewards/rejected": -1.4348504543304443, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 9.872807084624137, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.03434712439775467, "logits/rejected": 0.15481127798557281, "logps/chosen": -1.2807388305664062, "logps/rejected": -1.327695608139038, "loss": 1.2807, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2807388305664062, "rewards/margins": 0.046956755220890045, "rewards/rejected": -1.327695608139038, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.307162759516255, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.07419002056121826, "logits/rejected": 0.01720314659178257, "logps/chosen": -1.287549614906311, "logps/rejected": -1.4018094539642334, "loss": 1.2875, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.287549614906311, "rewards/margins": 0.11425991356372833, "rewards/rejected": -1.4018094539642334, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.9066868657029925, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.021942783147096634, "logits/rejected": 0.07414169609546661, "logps/chosen": -1.3426769971847534, "logps/rejected": -1.4397274255752563, "loss": 1.3427, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3426769971847534, "rewards/margins": 0.09705035388469696, "rewards/rejected": -1.4397274255752563, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 6.686070837077601, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.051896728575229645, "logits/rejected": 0.15200376510620117, "logps/chosen": -1.2638510465621948, "logps/rejected": -1.366613745689392, "loss": 1.2639, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2638510465621948, "rewards/margins": 0.10276269912719727, "rewards/rejected": -1.366613745689392, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 6.753559650517872, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.028342988342046738, "logits/rejected": 0.0642150491476059, "logps/chosen": -1.2974637746810913, "logps/rejected": -1.4206868410110474, "loss": 1.2975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2974637746810913, "rewards/margins": 0.12322302907705307, "rewards/rejected": -1.4206868410110474, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 5.26817687172792, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.00920817255973816, "logits/rejected": 0.21023285388946533, "logps/chosen": -1.3638373613357544, "logps/rejected": -1.3820571899414062, "loss": 1.3638, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3638373613357544, "rewards/margins": 0.01821986399590969, "rewards/rejected": -1.3820571899414062, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 7.334960157143995, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.09801676869392395, "logits/rejected": 0.08195401728153229, "logps/chosen": -1.29988431930542, "logps/rejected": -1.3777023553848267, "loss": 1.2999, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.29988431930542, "rewards/margins": 0.0778181403875351, "rewards/rejected": -1.3777023553848267, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 6.705367450974403, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.08576785027980804, "logits/rejected": -0.007372510619461536, "logps/chosen": -1.2055470943450928, "logps/rejected": -1.3563463687896729, "loss": 1.2055, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2055470943450928, "rewards/margins": 0.15079930424690247, "rewards/rejected": -1.3563463687896729, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 8.877115629082377, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.0531020350754261, "logits/rejected": 0.07066147029399872, "logps/chosen": -1.3573925495147705, "logps/rejected": -1.48432195186615, "loss": 1.3574, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3573925495147705, "rewards/margins": 0.12692946195602417, "rewards/rejected": -1.48432195186615, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 10.505665198724062, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.053388338536024094, "logits/rejected": 0.05659248307347298, "logps/chosen": -1.285811424255371, "logps/rejected": -1.4620311260223389, "loss": 1.2858, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.285811424255371, "rewards/margins": 0.17621970176696777, "rewards/rejected": -1.4620311260223389, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.260718034747792, "learning_rate": 9.99049407143074e-07, "logits/chosen": -0.00468588899821043, "logits/rejected": 0.10836657136678696, "logps/chosen": -1.2416707277297974, "logps/rejected": -1.2611629962921143, "loss": 1.2417, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2416707277297974, "rewards/margins": 0.019492343068122864, "rewards/rejected": -1.2611629962921143, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 5.727096890656115, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.019173573702573776, "logits/rejected": 0.10896708816289902, "logps/chosen": -1.2160828113555908, "logps/rejected": -1.3306069374084473, "loss": 1.2161, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2160828113555908, "rewards/margins": 0.11452404409646988, "rewards/rejected": -1.3306069374084473, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 5.3047337096782075, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.04327046498656273, "logits/rejected": 0.1426888108253479, "logps/chosen": -1.2479137182235718, "logps/rejected": -1.2799546718597412, "loss": 1.2479, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2479137182235718, "rewards/margins": 0.03204081952571869, "rewards/rejected": -1.2799546718597412, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 6.948677622422866, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.06921616941690445, "logits/rejected": -0.0007533133029937744, "logps/chosen": -1.246091365814209, "logps/rejected": -1.4493107795715332, "loss": 1.2461, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.246091365814209, "rewards/margins": 0.2032192051410675, "rewards/rejected": -1.4493107795715332, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 4.737057663765057, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.023737525567412376, "logits/rejected": 0.17157700657844543, "logps/chosen": -1.287827491760254, "logps/rejected": -1.3270927667617798, "loss": 1.2878, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.287827491760254, "rewards/margins": 0.03926524519920349, "rewards/rejected": -1.3270927667617798, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 8.255734691478782, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.012912644073367119, "logits/rejected": 0.1255933940410614, "logps/chosen": -1.271427869796753, "logps/rejected": -1.31276535987854, "loss": 1.2714, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.271427869796753, "rewards/margins": 0.04133733734488487, "rewards/rejected": -1.31276535987854, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 5.865734183794819, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.029357489198446274, "logits/rejected": 0.05100846290588379, "logps/chosen": -1.2746349573135376, "logps/rejected": -1.3741590976715088, "loss": 1.2746, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2746349573135376, "rewards/margins": 0.09952421486377716, "rewards/rejected": -1.3741590976715088, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 6.716248435090254, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.04156893491744995, "logits/rejected": 0.041723210364580154, "logps/chosen": -1.1536036729812622, "logps/rejected": -1.3146851062774658, "loss": 1.1536, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1536036729812622, "rewards/margins": 0.1610814929008484, "rewards/rejected": -1.3146851062774658, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 4.922761540889644, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.11370277404785156, "logits/rejected": 0.007519005332142115, "logps/chosen": -1.335659384727478, "logps/rejected": -1.3933441638946533, "loss": 1.3357, "rewards/accuracies": 0.5, "rewards/chosen": -1.335659384727478, "rewards/margins": 0.057684559375047684, "rewards/rejected": -1.3933441638946533, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 5.519646529332496, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.07878941297531128, "logits/rejected": 0.08350330591201782, "logps/chosen": -1.2837638854980469, "logps/rejected": -1.4200987815856934, "loss": 1.2838, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2837638854980469, "rewards/margins": 0.13633503019809723, "rewards/rejected": -1.4200987815856934, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 6.8713156370399835, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.041225384920835495, "logits/rejected": 0.12712953984737396, "logps/chosen": -1.2251508235931396, "logps/rejected": -1.3399101495742798, "loss": 1.2252, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2251508235931396, "rewards/margins": 0.11475928872823715, "rewards/rejected": -1.3399101495742798, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 7.746789222460644, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.023527052253484726, "logits/rejected": 0.12307412922382355, "logps/chosen": -1.292814016342163, "logps/rejected": -1.4054189920425415, "loss": 1.2928, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.292814016342163, "rewards/margins": 0.11260499805212021, "rewards/rejected": -1.4054189920425415, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 5.9598809311141885, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.022564511746168137, "logits/rejected": 0.19213271141052246, "logps/chosen": -1.3398385047912598, "logps/rejected": -1.3848450183868408, "loss": 1.3398, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3398385047912598, "rewards/margins": 0.04500637948513031, "rewards/rejected": -1.3848450183868408, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 6.761857070002109, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.0846472829580307, "logits/rejected": 0.036122508347034454, "logps/chosen": -1.178292989730835, "logps/rejected": -1.4267852306365967, "loss": 1.1783, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.178292989730835, "rewards/margins": 0.24849215149879456, "rewards/rejected": -1.4267852306365967, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.671361465350212, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.12963029742240906, "logits/rejected": 0.02173738181591034, "logps/chosen": -1.291252851486206, "logps/rejected": -1.3559983968734741, "loss": 1.2913, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.291252851486206, "rewards/margins": 0.0647454485297203, "rewards/rejected": -1.3559983968734741, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.240073595198557, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.08771209418773651, "logits/rejected": 0.002774059772491455, "logps/chosen": -1.3078842163085938, "logps/rejected": -1.350559949874878, "loss": 1.3079, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3078842163085938, "rewards/margins": 0.04267581179738045, "rewards/rejected": -1.350559949874878, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 8.721437924925166, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.04999725520610809, "logits/rejected": 0.10628406703472137, "logps/chosen": -1.3544535636901855, "logps/rejected": -1.377699613571167, "loss": 1.3545, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3544535636901855, "rewards/margins": 0.023246100172400475, "rewards/rejected": -1.377699613571167, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 5.806053211425104, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.07237926870584488, "logits/rejected": 0.1335526704788208, "logps/chosen": -1.3352415561676025, "logps/rejected": -1.5106303691864014, "loss": 1.3352, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3352415561676025, "rewards/margins": 0.17538879811763763, "rewards/rejected": -1.5106303691864014, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 5.4841091270570805, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.011777307838201523, "logits/rejected": 0.14173081517219543, "logps/chosen": -1.27110755443573, "logps/rejected": -1.4177724123001099, "loss": 1.2711, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.27110755443573, "rewards/margins": 0.14666485786437988, "rewards/rejected": -1.4177724123001099, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 5.080410689411889, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.027251336723566055, "logits/rejected": 0.1377854347229004, "logps/chosen": -1.306553602218628, "logps/rejected": -1.368943452835083, "loss": 1.3066, "rewards/accuracies": 0.5, "rewards/chosen": -1.306553602218628, "rewards/margins": 0.06238982081413269, "rewards/rejected": -1.368943452835083, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 5.567805118126568, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.0670052319765091, "logits/rejected": -0.005449844989925623, "logps/chosen": -1.3250452280044556, "logps/rejected": -1.4404858350753784, "loss": 1.325, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3250452280044556, "rewards/margins": 0.1154404878616333, "rewards/rejected": -1.4404858350753784, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 11.74300384635903, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.028228968381881714, "logits/rejected": 0.05549084022641182, "logps/chosen": -1.2784067392349243, "logps/rejected": -1.3626810312271118, "loss": 1.2784, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2784067392349243, "rewards/margins": 0.08427433669567108, "rewards/rejected": -1.3626810312271118, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.369944708390375, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.06585574895143509, "logits/rejected": 0.1732962280511856, "logps/chosen": -1.2337896823883057, "logps/rejected": -1.4194914102554321, "loss": 1.2338, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2337896823883057, "rewards/margins": 0.18570168316364288, "rewards/rejected": -1.4194914102554321, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 7.129345641067338, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.049136511981487274, "logits/rejected": 0.1074095219373703, "logps/chosen": -1.3166379928588867, "logps/rejected": -1.4627429246902466, "loss": 1.3166, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3166379928588867, "rewards/margins": 0.14610502123832703, "rewards/rejected": -1.4627429246902466, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.332462343401173, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.12356118112802505, "logits/rejected": 0.012452776543796062, "logps/chosen": -1.2385128736495972, "logps/rejected": -1.3803521394729614, "loss": 1.2385, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2385128736495972, "rewards/margins": 0.14183911681175232, "rewards/rejected": -1.3803521394729614, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 6.009141343259623, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.012352309189736843, "logits/rejected": 0.12193779647350311, "logps/chosen": -1.2113947868347168, "logps/rejected": -1.3994553089141846, "loss": 1.2114, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2113947868347168, "rewards/margins": 0.18806049227714539, "rewards/rejected": -1.3994553089141846, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 5.272497835004643, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.06765518337488174, "logits/rejected": 0.03967911750078201, "logps/chosen": -1.2879887819290161, "logps/rejected": -1.3228000402450562, "loss": 1.288, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2879887819290161, "rewards/margins": 0.03481132537126541, "rewards/rejected": -1.3228000402450562, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 4.892493285201154, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.09788217395544052, "logits/rejected": 0.07154911011457443, "logps/chosen": -1.2591006755828857, "logps/rejected": -1.3582319021224976, "loss": 1.2591, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2591006755828857, "rewards/margins": 0.0991312637925148, "rewards/rejected": -1.3582319021224976, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 7.175582078959205, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.034755513072013855, "logits/rejected": 0.03395189344882965, "logps/chosen": -1.2586027383804321, "logps/rejected": -1.2931143045425415, "loss": 1.2586, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2586027383804321, "rewards/margins": 0.03451157361268997, "rewards/rejected": -1.2931143045425415, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.2390989363193512, "eval_logits/rejected": 0.3139877915382385, "eval_logps/chosen": -1.292383074760437, "eval_logps/rejected": -1.4166818857192993, "eval_loss": 1.2926472425460815, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.292383074760437, "eval_rewards/margins": 0.12429890781641006, "eval_rewards/rejected": -1.4166818857192993, "eval_runtime": 40.3544, "eval_samples_per_second": 33.33, "eval_steps_per_second": 8.351, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 6.254579419961171, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.08818487077951431, "logits/rejected": 0.039764825254678726, "logps/chosen": -1.3074865341186523, "logps/rejected": -1.431410551071167, "loss": 1.3075, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3074865341186523, "rewards/margins": 0.12392395734786987, "rewards/rejected": -1.431410551071167, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 5.964406925427786, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.016309451311826706, "logits/rejected": 0.08206792920827866, "logps/chosen": -1.2093933820724487, "logps/rejected": -1.3380126953125, "loss": 1.2094, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2093933820724487, "rewards/margins": 0.12861934304237366, "rewards/rejected": -1.3380126953125, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 5.9905366689367, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.01043208222836256, "logits/rejected": 0.09940309822559357, "logps/chosen": -1.2020695209503174, "logps/rejected": -1.3923689126968384, "loss": 1.2021, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2020695209503174, "rewards/margins": 0.19029945135116577, "rewards/rejected": -1.3923689126968384, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 6.493505017667349, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.03012007102370262, "logits/rejected": 0.06414420902729034, "logps/chosen": -1.2467553615570068, "logps/rejected": -1.404955267906189, "loss": 1.2468, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2467553615570068, "rewards/margins": 0.1581999957561493, "rewards/rejected": -1.404955267906189, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 5.582430846318556, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.041133131831884384, "logits/rejected": 0.06763674318790436, "logps/chosen": -1.1922818422317505, "logps/rejected": -1.2692766189575195, "loss": 1.1923, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.1922818422317505, "rewards/margins": 0.07699473947286606, "rewards/rejected": -1.2692766189575195, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 6.050213911164321, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.022936535999178886, "logits/rejected": 0.1669151484966278, "logps/chosen": -1.3202248811721802, "logps/rejected": -1.3600516319274902, "loss": 1.3202, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3202248811721802, "rewards/margins": 0.039826650172472, "rewards/rejected": -1.3600516319274902, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 6.859921489563553, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.019439276307821274, "logits/rejected": 0.09653627127408981, "logps/chosen": -1.2219395637512207, "logps/rejected": -1.3350383043289185, "loss": 1.2219, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2219395637512207, "rewards/margins": 0.11309881508350372, "rewards/rejected": -1.3350383043289185, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 4.440007076802051, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.04892671853303909, "logits/rejected": 0.07650452107191086, "logps/chosen": -1.268222451210022, "logps/rejected": -1.4210572242736816, "loss": 1.2682, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.268222451210022, "rewards/margins": 0.15283489227294922, "rewards/rejected": -1.4210572242736816, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 4.244240361369058, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.017016131430864334, "logits/rejected": 0.07407237589359283, "logps/chosen": -1.2650668621063232, "logps/rejected": -1.311122179031372, "loss": 1.2651, "rewards/accuracies": 0.5, "rewards/chosen": -1.2650668621063232, "rewards/margins": 0.04605530947446823, "rewards/rejected": -1.311122179031372, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 7.448073440425479, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.03463117033243179, "logits/rejected": 0.010160955600440502, "logps/chosen": -1.2278153896331787, "logps/rejected": -1.3488781452178955, "loss": 1.2278, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2278153896331787, "rewards/margins": 0.12106279283761978, "rewards/rejected": -1.3488781452178955, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 5.692267328085667, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.06847278028726578, "logits/rejected": 0.041960228234529495, "logps/chosen": -1.2900992631912231, "logps/rejected": -1.4478404521942139, "loss": 1.2901, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2900992631912231, "rewards/margins": 0.15774127840995789, "rewards/rejected": -1.4478404521942139, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 4.876829638400752, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.10054681450128555, "logits/rejected": 0.029201824218034744, "logps/chosen": -1.3251417875289917, "logps/rejected": -1.4854522943496704, "loss": 1.3251, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3251417875289917, "rewards/margins": 0.16031046211719513, "rewards/rejected": -1.4854522943496704, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 10.646993508805867, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.03564943000674248, "logits/rejected": 0.04468190670013428, "logps/chosen": -1.2173900604248047, "logps/rejected": -1.3846566677093506, "loss": 1.2174, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2173900604248047, "rewards/margins": 0.1672665923833847, "rewards/rejected": -1.3846566677093506, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 5.794361029982147, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.11134270578622818, "logits/rejected": 0.10180030018091202, "logps/chosen": -1.274308681488037, "logps/rejected": -1.3464170694351196, "loss": 1.2743, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.274308681488037, "rewards/margins": 0.07210833579301834, "rewards/rejected": -1.3464170694351196, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 6.436068626132306, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.024314895272254944, "logits/rejected": 0.030501240864396095, "logps/chosen": -1.2998055219650269, "logps/rejected": -1.4133801460266113, "loss": 1.2998, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2998055219650269, "rewards/margins": 0.11357448250055313, "rewards/rejected": -1.4133801460266113, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 5.1043258602713015, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.002916154218837619, "logits/rejected": 0.13316118717193604, "logps/chosen": -1.3585437536239624, "logps/rejected": -1.4468364715576172, "loss": 1.3585, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3585437536239624, "rewards/margins": 0.08829256147146225, "rewards/rejected": -1.4468364715576172, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 6.237155520314364, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.05789249390363693, "logits/rejected": 0.0034675695933401585, "logps/chosen": -1.2383142709732056, "logps/rejected": -1.4161425828933716, "loss": 1.2383, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2383142709732056, "rewards/margins": 0.1778283268213272, "rewards/rejected": -1.4161425828933716, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 7.668492910968571, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.15281029045581818, "logits/rejected": -0.05185431241989136, "logps/chosen": -1.2960929870605469, "logps/rejected": -1.4078320264816284, "loss": 1.2961, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2960929870605469, "rewards/margins": 0.11173920333385468, "rewards/rejected": -1.4078320264816284, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 3.996445694188199, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.015809116885066032, "logits/rejected": 0.1107819527387619, "logps/chosen": -1.1920976638793945, "logps/rejected": -1.2812395095825195, "loss": 1.1921, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1920976638793945, "rewards/margins": 0.0891418606042862, "rewards/rejected": -1.2812395095825195, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.699905495779529, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.10467209666967392, "logits/rejected": -0.06115953251719475, "logps/chosen": -1.1700263023376465, "logps/rejected": -1.4119365215301514, "loss": 1.17, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1700263023376465, "rewards/margins": 0.2419101744890213, "rewards/rejected": -1.4119365215301514, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 6.591290025476305, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.0037920798640698195, "logits/rejected": 0.1478206217288971, "logps/chosen": -1.2574379444122314, "logps/rejected": -1.4382083415985107, "loss": 1.2574, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2574379444122314, "rewards/margins": 0.18077044188976288, "rewards/rejected": -1.4382083415985107, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 4.609538818953727, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.039000123739242554, "logits/rejected": 0.059923432767391205, "logps/chosen": -1.3163846731185913, "logps/rejected": -1.37472403049469, "loss": 1.3164, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3163846731185913, "rewards/margins": 0.05833936855196953, "rewards/rejected": -1.37472403049469, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 5.711783497543647, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.030093863606452942, "logits/rejected": 0.11769764125347137, "logps/chosen": -1.2365710735321045, "logps/rejected": -1.3598225116729736, "loss": 1.2366, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2365710735321045, "rewards/margins": 0.12325137853622437, "rewards/rejected": -1.3598225116729736, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 6.935651785848156, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.06522098183631897, "logits/rejected": 0.16547676920890808, "logps/chosen": -1.196593999862671, "logps/rejected": -1.3321568965911865, "loss": 1.1966, "rewards/accuracies": 0.5625, "rewards/chosen": -1.196593999862671, "rewards/margins": 0.135562926530838, "rewards/rejected": -1.3321568965911865, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 5.285612948639042, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.06866519153118134, "logits/rejected": 0.07046620547771454, "logps/chosen": -1.2875876426696777, "logps/rejected": -1.3216297626495361, "loss": 1.2876, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2875876426696777, "rewards/margins": 0.03404215723276138, "rewards/rejected": -1.3216297626495361, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 8.52846504315105, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.07610566914081573, "logits/rejected": 0.02326354756951332, "logps/chosen": -1.2710648775100708, "logps/rejected": -1.4431533813476562, "loss": 1.2711, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2710648775100708, "rewards/margins": 0.172088623046875, "rewards/rejected": -1.4431533813476562, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 6.475521145284912, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.00599165353924036, "logits/rejected": 0.029473695904016495, "logps/chosen": -1.1878167390823364, "logps/rejected": -1.361006736755371, "loss": 1.1878, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1878167390823364, "rewards/margins": 0.1731901615858078, "rewards/rejected": -1.361006736755371, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.926269855494578, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.05422784015536308, "logits/rejected": 0.036566127091646194, "logps/chosen": -1.2551628351211548, "logps/rejected": -1.289687156677246, "loss": 1.2552, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2551628351211548, "rewards/margins": 0.034524377435445786, "rewards/rejected": -1.289687156677246, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 6.95525566911425, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.06869738548994064, "logits/rejected": 0.032166264951229095, "logps/chosen": -1.2346642017364502, "logps/rejected": -1.3727699518203735, "loss": 1.2347, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2346642017364502, "rewards/margins": 0.13810572028160095, "rewards/rejected": -1.3727699518203735, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 4.78299010188956, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.03540452942252159, "logits/rejected": 0.10095776617527008, "logps/chosen": -1.333021879196167, "logps/rejected": -1.3433998823165894, "loss": 1.333, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.333021879196167, "rewards/margins": 0.010378008708357811, "rewards/rejected": -1.3433998823165894, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 4.918582693095492, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.021059144288301468, "logits/rejected": 0.16030056774616241, "logps/chosen": -1.3161485195159912, "logps/rejected": -1.3586479425430298, "loss": 1.3161, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3161485195159912, "rewards/margins": 0.04249925538897514, "rewards/rejected": -1.3586479425430298, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 6.035988437434033, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.06846211850643158, "logits/rejected": 0.05199027806520462, "logps/chosen": -1.273654580116272, "logps/rejected": -1.3454148769378662, "loss": 1.2737, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.273654580116272, "rewards/margins": 0.07176045328378677, "rewards/rejected": -1.3454148769378662, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 6.426818723440502, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.010282131843268871, "logits/rejected": -0.010356178507208824, "logps/chosen": -1.314780592918396, "logps/rejected": -1.46169114112854, "loss": 1.3148, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.314780592918396, "rewards/margins": 0.14691047370433807, "rewards/rejected": -1.46169114112854, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.587597613440013, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.013318764045834541, "logits/rejected": 0.007068639155477285, "logps/chosen": -1.1986637115478516, "logps/rejected": -1.2714143991470337, "loss": 1.1987, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1986637115478516, "rewards/margins": 0.07275055348873138, "rewards/rejected": -1.2714143991470337, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 4.777529546674357, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.14641256630420685, "logits/rejected": -0.011975902132689953, "logps/chosen": -1.272757649421692, "logps/rejected": -1.3485749959945679, "loss": 1.2728, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.272757649421692, "rewards/margins": 0.07581733167171478, "rewards/rejected": -1.3485749959945679, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 6.243190223746109, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.12378814071416855, "logits/rejected": 0.04086925834417343, "logps/chosen": -1.2951219081878662, "logps/rejected": -1.3845832347869873, "loss": 1.2951, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2951219081878662, "rewards/margins": 0.08946139365434647, "rewards/rejected": -1.3845832347869873, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 5.335856698508705, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.03387296944856644, "logits/rejected": 0.11067028343677521, "logps/chosen": -1.2702786922454834, "logps/rejected": -1.387192964553833, "loss": 1.2703, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2702786922454834, "rewards/margins": 0.11691419780254364, "rewards/rejected": -1.387192964553833, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 6.197506473327029, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.003619621740654111, "logits/rejected": 0.03565412014722824, "logps/chosen": -1.153623342514038, "logps/rejected": -1.3545430898666382, "loss": 1.1536, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.153623342514038, "rewards/margins": 0.20091989636421204, "rewards/rejected": -1.3545430898666382, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 5.711439816739544, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.013246385380625725, "logits/rejected": 0.0627821832895279, "logps/chosen": -1.2737104892730713, "logps/rejected": -1.2772290706634521, "loss": 1.2737, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2737104892730713, "rewards/margins": 0.0035184845328330994, "rewards/rejected": -1.2772290706634521, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 6.046761013909382, "learning_rate": 9.8141843096384e-07, "logits/chosen": -0.02210906147956848, "logits/rejected": 0.04886433854699135, "logps/chosen": -1.3029171228408813, "logps/rejected": -1.3967106342315674, "loss": 1.3029, "rewards/accuracies": 0.5, "rewards/chosen": -1.3029171228408813, "rewards/margins": 0.09379346668720245, "rewards/rejected": -1.3967106342315674, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 6.360780046449946, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.03382030874490738, "logits/rejected": 0.10084762424230576, "logps/chosen": -1.2839205265045166, "logps/rejected": -1.3534369468688965, "loss": 1.2839, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2839205265045166, "rewards/margins": 0.06951656937599182, "rewards/rejected": -1.3534369468688965, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 5.398400629947937, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.034357231110334396, "logits/rejected": 0.11352918297052383, "logps/chosen": -1.1517225503921509, "logps/rejected": -1.340057611465454, "loss": 1.1517, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1517225503921509, "rewards/margins": 0.18833516538143158, "rewards/rejected": -1.340057611465454, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 8.753707310176516, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.043360546231269836, "logits/rejected": 0.09672240912914276, "logps/chosen": -1.2249188423156738, "logps/rejected": -1.3727611303329468, "loss": 1.2249, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2249188423156738, "rewards/margins": 0.1478423774242401, "rewards/rejected": -1.3727611303329468, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 6.5100247712782275, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.036462895572185516, "logits/rejected": 0.07088091224431992, "logps/chosen": -1.2738233804702759, "logps/rejected": -1.4192863702774048, "loss": 1.2738, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2738233804702759, "rewards/margins": 0.1454630196094513, "rewards/rejected": -1.4192863702774048, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 5.692396218836089, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.07106520235538483, "logits/rejected": 0.0156484916806221, "logps/chosen": -1.174965262413025, "logps/rejected": -1.3978121280670166, "loss": 1.175, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.174965262413025, "rewards/margins": 0.22284671664237976, "rewards/rejected": -1.3978121280670166, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 5.565610173067295, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.0721411257982254, "logits/rejected": -0.0008665561908856034, "logps/chosen": -1.2512609958648682, "logps/rejected": -1.306958556175232, "loss": 1.2513, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2512609958648682, "rewards/margins": 0.0556974932551384, "rewards/rejected": -1.306958556175232, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 6.608630093586399, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.08097167313098907, "logits/rejected": 0.006701459642499685, "logps/chosen": -1.3452317714691162, "logps/rejected": -1.3631670475006104, "loss": 1.3452, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3452317714691162, "rewards/margins": 0.01793515682220459, "rewards/rejected": -1.3631670475006104, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 6.522233850120741, "learning_rate": 9.779042916953376e-07, "logits/chosen": -0.04059193655848503, "logits/rejected": 0.07133638858795166, "logps/chosen": -1.2659540176391602, "logps/rejected": -1.3634898662567139, "loss": 1.266, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2659540176391602, "rewards/margins": 0.09753577411174774, "rewards/rejected": -1.3634898662567139, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 4.888357854156223, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.11140398681163788, "logits/rejected": -0.010750685818493366, "logps/chosen": -1.2365443706512451, "logps/rejected": -1.3860673904418945, "loss": 1.2365, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2365443706512451, "rewards/margins": 0.14952287077903748, "rewards/rejected": -1.3860673904418945, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 6.192533210054023, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.12246431410312653, "logits/rejected": -0.04861663654446602, "logps/chosen": -1.2815401554107666, "logps/rejected": -1.3792847394943237, "loss": 1.2815, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2815401554107666, "rewards/margins": 0.0977446585893631, "rewards/rejected": -1.3792847394943237, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 7.025308558953763, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.01736634597182274, "logits/rejected": 0.040794551372528076, "logps/chosen": -1.27117919921875, "logps/rejected": -1.393020510673523, "loss": 1.2712, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.27117919921875, "rewards/margins": 0.12184134870767593, "rewards/rejected": -1.393020510673523, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 5.116074756524611, "learning_rate": 9.76035805036924e-07, "logits/chosen": -0.02111620083451271, "logits/rejected": 0.11694250255823135, "logps/chosen": -1.3415400981903076, "logps/rejected": -1.4300519227981567, "loss": 1.3415, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3415400981903076, "rewards/margins": 0.08851160854101181, "rewards/rejected": -1.4300519227981567, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 4.254471934425033, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.007281592581421137, "logits/rejected": 0.10736112296581268, "logps/chosen": -1.1955541372299194, "logps/rejected": -1.382073163986206, "loss": 1.1956, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1955541372299194, "rewards/margins": 0.186519056558609, "rewards/rejected": -1.382073163986206, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 6.385615273610758, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.1415092945098877, "logits/rejected": 0.05616138130426407, "logps/chosen": -1.2617812156677246, "logps/rejected": -1.4284113645553589, "loss": 1.2618, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2617812156677246, "rewards/margins": 0.16663002967834473, "rewards/rejected": -1.4284113645553589, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 5.298057962756652, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.04539132118225098, "logits/rejected": 0.05860024690628052, "logps/chosen": -1.209770917892456, "logps/rejected": -1.392956256866455, "loss": 1.2098, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.209770917892456, "rewards/margins": 0.18318527936935425, "rewards/rejected": -1.392956256866455, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 5.653773983115901, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.12220083177089691, "logits/rejected": -0.03590819612145424, "logps/chosen": -1.3243439197540283, "logps/rejected": -1.3416998386383057, "loss": 1.3243, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3243439197540283, "rewards/margins": 0.017356105148792267, "rewards/rejected": -1.3416998386383057, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 6.027616100650511, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.04352981224656105, "logits/rejected": 0.08578065037727356, "logps/chosen": -1.2825839519500732, "logps/rejected": -1.30778968334198, "loss": 1.2826, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2825839519500732, "rewards/margins": 0.025205904617905617, "rewards/rejected": -1.30778968334198, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 4.100397374731896, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.05452343076467514, "logits/rejected": 0.022144392132759094, "logps/chosen": -1.2682517766952515, "logps/rejected": -1.2765491008758545, "loss": 1.2683, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2682517766952515, "rewards/margins": 0.008297216147184372, "rewards/rejected": -1.2765491008758545, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 6.872879728670727, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.16312849521636963, "logits/rejected": -0.05645165592432022, "logps/chosen": -1.2090058326721191, "logps/rejected": -1.3281110525131226, "loss": 1.209, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2090058326721191, "rewards/margins": 0.11910521984100342, "rewards/rejected": -1.3281110525131226, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 7.51905136434617, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.03948826342821121, "logits/rejected": 0.04299502819776535, "logps/chosen": -1.117437481880188, "logps/rejected": -1.4063055515289307, "loss": 1.1174, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.117437481880188, "rewards/margins": 0.2888680398464203, "rewards/rejected": -1.4063055515289307, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 5.215193878076613, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.06932996213436127, "logits/rejected": -0.01818094775080681, "logps/chosen": -1.2517797946929932, "logps/rejected": -1.3426105976104736, "loss": 1.2518, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2517797946929932, "rewards/margins": 0.09083087742328644, "rewards/rejected": -1.3426105976104736, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 4.95442381952982, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.06093784049153328, "logits/rejected": 0.03500527888536453, "logps/chosen": -1.2274868488311768, "logps/rejected": -1.2515859603881836, "loss": 1.2275, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2274868488311768, "rewards/margins": 0.024099132046103477, "rewards/rejected": -1.2515859603881836, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 5.151741921112019, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.04982369393110275, "logits/rejected": 0.14359310269355774, "logps/chosen": -1.1499334573745728, "logps/rejected": -1.315768837928772, "loss": 1.1499, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1499334573745728, "rewards/margins": 0.16583546996116638, "rewards/rejected": -1.315768837928772, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 7.684512249327671, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.17868992686271667, "logits/rejected": -0.09584345668554306, "logps/chosen": -1.2241437435150146, "logps/rejected": -1.3594564199447632, "loss": 1.2241, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2241437435150146, "rewards/margins": 0.13531282544136047, "rewards/rejected": -1.3594564199447632, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 7.454229057136473, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.15242353081703186, "logits/rejected": -0.0018568634986877441, "logps/chosen": -1.2246073484420776, "logps/rejected": -1.329288125038147, "loss": 1.2246, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2246073484420776, "rewards/margins": 0.1046806201338768, "rewards/rejected": -1.329288125038147, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 5.16061255819568, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.011076502501964569, "logits/rejected": 0.054216302931308746, "logps/chosen": -1.2689493894577026, "logps/rejected": -1.397051453590393, "loss": 1.2689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2689493894577026, "rewards/margins": 0.12810207903385162, "rewards/rejected": -1.397051453590393, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 9.134914348353583, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.06427105516195297, "logits/rejected": 0.06691170483827591, "logps/chosen": -1.2776857614517212, "logps/rejected": -1.4477813243865967, "loss": 1.2777, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2776857614517212, "rewards/margins": 0.17009560763835907, "rewards/rejected": -1.4477813243865967, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 6.382089588734222, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.04775651544332504, "logits/rejected": -0.03034684620797634, "logps/chosen": -1.2444088459014893, "logps/rejected": -1.2599059343338013, "loss": 1.2444, "rewards/accuracies": 0.5, "rewards/chosen": -1.2444088459014893, "rewards/margins": 0.015497100539505482, "rewards/rejected": -1.2599059343338013, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 6.19735966855112, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.16640301048755646, "logits/rejected": -0.08015020936727524, "logps/chosen": -1.284486174583435, "logps/rejected": -1.374629259109497, "loss": 1.2845, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.284486174583435, "rewards/margins": 0.09014324098825455, "rewards/rejected": -1.374629259109497, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 6.338416482516323, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.16387200355529785, "logits/rejected": -0.050604335963726044, "logps/chosen": -1.1729789972305298, "logps/rejected": -1.2999842166900635, "loss": 1.173, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1729789972305298, "rewards/margins": 0.1270054131746292, "rewards/rejected": -1.2999842166900635, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.356638683631875, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.0625876858830452, "logits/rejected": -0.001631209277547896, "logps/chosen": -1.2728203535079956, "logps/rejected": -1.3516300916671753, "loss": 1.2728, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2728203535079956, "rewards/margins": 0.07880964875221252, "rewards/rejected": -1.3516300916671753, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 6.487973967123599, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.010877996683120728, "logits/rejected": 0.03731466084718704, "logps/chosen": -1.2252531051635742, "logps/rejected": -1.317350149154663, "loss": 1.2253, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2252531051635742, "rewards/margins": 0.092097207903862, "rewards/rejected": -1.317350149154663, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 5.248162108964757, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.06950392574071884, "logits/rejected": -0.009685784578323364, "logps/chosen": -1.16654372215271, "logps/rejected": -1.3240737915039062, "loss": 1.1665, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.16654372215271, "rewards/margins": 0.15752997994422913, "rewards/rejected": -1.3240737915039062, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.441582278121723, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.11294262111186981, "logits/rejected": 0.016532057896256447, "logps/chosen": -1.3700830936431885, "logps/rejected": -1.3729360103607178, "loss": 1.3701, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3700830936431885, "rewards/margins": 0.002852770034223795, "rewards/rejected": -1.3729360103607178, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.729669880583923, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.09123164415359497, "logits/rejected": 0.04779767617583275, "logps/chosen": -1.2138688564300537, "logps/rejected": -1.2861607074737549, "loss": 1.2139, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2138688564300537, "rewards/margins": 0.07229190319776535, "rewards/rejected": -1.2861607074737549, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.4390932066291855, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.07311855256557465, "logits/rejected": 0.023478861898183823, "logps/chosen": -1.3175314664840698, "logps/rejected": -1.4571441411972046, "loss": 1.3175, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3175314664840698, "rewards/margins": 0.13961286842823029, "rewards/rejected": -1.4571441411972046, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 6.597441808433359, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.023569175973534584, "logits/rejected": 0.07762341946363449, "logps/chosen": -1.3516827821731567, "logps/rejected": -1.3832666873931885, "loss": 1.3517, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3516827821731567, "rewards/margins": 0.03158385306596756, "rewards/rejected": -1.3832666873931885, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 6.5470699910488745, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.10380878299474716, "logits/rejected": -0.0022453113924711943, "logps/chosen": -1.2997486591339111, "logps/rejected": -1.4331480264663696, "loss": 1.2997, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2997486591339111, "rewards/margins": 0.13339951634407043, "rewards/rejected": -1.4331480264663696, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 5.382058383935546, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.04178851842880249, "logits/rejected": 0.04940354451537132, "logps/chosen": -1.1705482006072998, "logps/rejected": -1.3567442893981934, "loss": 1.1705, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1705482006072998, "rewards/margins": 0.18619616329669952, "rewards/rejected": -1.3567442893981934, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 4.6336058205956805, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.10132310539484024, "logits/rejected": -0.05170116946101189, "logps/chosen": -1.2170246839523315, "logps/rejected": -1.3492891788482666, "loss": 1.217, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2170246839523315, "rewards/margins": 0.13226431608200073, "rewards/rejected": -1.3492891788482666, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.217824324965477, "eval_logits/rejected": 0.29055988788604736, "eval_logps/chosen": -1.2833325862884521, "eval_logps/rejected": -1.4046558141708374, "eval_loss": 1.2836024761199951, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2833325862884521, "eval_rewards/margins": 0.12132324278354645, "eval_rewards/rejected": -1.4046558141708374, "eval_runtime": 40.4095, "eval_samples_per_second": 33.284, "eval_steps_per_second": 8.34, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 6.000685829858074, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.1521497219800949, "logits/rejected": -0.015983399003744125, "logps/chosen": -1.2322502136230469, "logps/rejected": -1.3438935279846191, "loss": 1.2323, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2322502136230469, "rewards/margins": 0.11164339631795883, "rewards/rejected": -1.3438935279846191, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 8.548273647048934, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.06338302046060562, "logits/rejected": 0.09241237491369247, "logps/chosen": -1.270341396331787, "logps/rejected": -1.3546082973480225, "loss": 1.2703, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.270341396331787, "rewards/margins": 0.0842670202255249, "rewards/rejected": -1.3546082973480225, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 5.565750095398654, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.11839057505130768, "logits/rejected": -0.017757216468453407, "logps/chosen": -1.2639378309249878, "logps/rejected": -1.272821068763733, "loss": 1.2639, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2639378309249878, "rewards/margins": 0.008883127942681313, "rewards/rejected": -1.272821068763733, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.130410782883552, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.036540139466524124, "logits/rejected": 0.11790430545806885, "logps/chosen": -1.1785613298416138, "logps/rejected": -1.2992117404937744, "loss": 1.1786, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1785613298416138, "rewards/margins": 0.12065033614635468, "rewards/rejected": -1.2992117404937744, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 4.604353319429889, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.17595402896404266, "logits/rejected": -0.001795476651750505, "logps/chosen": -1.280834436416626, "logps/rejected": -1.3630173206329346, "loss": 1.2808, "rewards/accuracies": 0.53125, "rewards/chosen": -1.280834436416626, "rewards/margins": 0.08218281716108322, "rewards/rejected": -1.3630173206329346, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 4.316014405945429, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.07002115249633789, "logits/rejected": 0.028605511412024498, "logps/chosen": -1.2065143585205078, "logps/rejected": -1.36150062084198, "loss": 1.2065, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2065143585205078, "rewards/margins": 0.1549863964319229, "rewards/rejected": -1.36150062084198, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 5.2557609106292755, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.04135257750749588, "logits/rejected": 0.06116024777293205, "logps/chosen": -1.2667913436889648, "logps/rejected": -1.3378912210464478, "loss": 1.2668, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2667913436889648, "rewards/margins": 0.07109979540109634, "rewards/rejected": -1.3378912210464478, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 7.2556531402448075, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.1533469408750534, "logits/rejected": -0.04445001855492592, "logps/chosen": -1.2218164205551147, "logps/rejected": -1.3334985971450806, "loss": 1.2218, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2218164205551147, "rewards/margins": 0.11168204247951508, "rewards/rejected": -1.3334985971450806, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 6.087094599056863, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.058422207832336426, "logits/rejected": 0.08441738784313202, "logps/chosen": -1.3267234563827515, "logps/rejected": -1.3491876125335693, "loss": 1.3267, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.3267234563827515, "rewards/margins": 0.02246423438191414, "rewards/rejected": -1.3491876125335693, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 5.769230069605667, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.06579374521970749, "logits/rejected": 0.08774848282337189, "logps/chosen": -1.2961641550064087, "logps/rejected": -1.3286808729171753, "loss": 1.2962, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2961641550064087, "rewards/margins": 0.03251679986715317, "rewards/rejected": -1.3286808729171753, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 8.06128266735004, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.09073962271213531, "logits/rejected": 0.0048308358527719975, "logps/chosen": -1.255657434463501, "logps/rejected": -1.339793086051941, "loss": 1.2557, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.255657434463501, "rewards/margins": 0.08413554728031158, "rewards/rejected": -1.339793086051941, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 6.191451784699569, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.12305567413568497, "logits/rejected": -0.019606847316026688, "logps/chosen": -1.1982133388519287, "logps/rejected": -1.3531486988067627, "loss": 1.1982, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1982133388519287, "rewards/margins": 0.1549353301525116, "rewards/rejected": -1.3531486988067627, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 4.921820695927524, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.027819618582725525, "logits/rejected": 0.07450731098651886, "logps/chosen": -1.2468996047973633, "logps/rejected": -1.3264100551605225, "loss": 1.2469, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2468996047973633, "rewards/margins": 0.07951048761606216, "rewards/rejected": -1.3264100551605225, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 6.557221686405777, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.08886425197124481, "logits/rejected": 0.003001108765602112, "logps/chosen": -1.2003432512283325, "logps/rejected": -1.341486930847168, "loss": 1.2003, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2003432512283325, "rewards/margins": 0.1411435902118683, "rewards/rejected": -1.341486930847168, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 4.409535181255283, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.052580296993255615, "logits/rejected": 0.03460296615958214, "logps/chosen": -1.2626965045928955, "logps/rejected": -1.3196136951446533, "loss": 1.2627, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2626965045928955, "rewards/margins": 0.056917231529951096, "rewards/rejected": -1.3196136951446533, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 6.307539026009502, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.1087227612733841, "logits/rejected": 0.0027766525745391846, "logps/chosen": -1.3303842544555664, "logps/rejected": -1.3781440258026123, "loss": 1.3304, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3303842544555664, "rewards/margins": 0.04775996133685112, "rewards/rejected": -1.3781440258026123, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 4.825565332051916, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.07784494012594223, "logits/rejected": 0.029555052518844604, "logps/chosen": -1.2130894660949707, "logps/rejected": -1.301626443862915, "loss": 1.2131, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2130894660949707, "rewards/margins": 0.08853688091039658, "rewards/rejected": -1.301626443862915, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 7.5040727636123234, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.17819096148014069, "logits/rejected": -0.05510940030217171, "logps/chosen": -1.2727141380310059, "logps/rejected": -1.3084208965301514, "loss": 1.2727, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2727141380310059, "rewards/margins": 0.035706691443920135, "rewards/rejected": -1.3084208965301514, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 5.1335101522884905, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.1001095175743103, "logits/rejected": -0.09477894753217697, "logps/chosen": -1.243051290512085, "logps/rejected": -1.3852254152297974, "loss": 1.2431, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.243051290512085, "rewards/margins": 0.1421741247177124, "rewards/rejected": -1.3852254152297974, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 5.286057708825833, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.09017867594957352, "logits/rejected": 0.10676959902048111, "logps/chosen": -1.2467856407165527, "logps/rejected": -1.3615275621414185, "loss": 1.2468, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2467856407165527, "rewards/margins": 0.11474192142486572, "rewards/rejected": -1.3615275621414185, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 4.692048849516903, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.003429621458053589, "logits/rejected": 0.041862696409225464, "logps/chosen": -1.2499125003814697, "logps/rejected": -1.374155044555664, "loss": 1.2499, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2499125003814697, "rewards/margins": 0.1242426186800003, "rewards/rejected": -1.374155044555664, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 5.270030033256585, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.014922015368938446, "logits/rejected": 0.045301780104637146, "logps/chosen": -1.271321177482605, "logps/rejected": -1.3778356313705444, "loss": 1.2713, "rewards/accuracies": 0.53125, "rewards/chosen": -1.271321177482605, "rewards/margins": 0.10651449114084244, "rewards/rejected": -1.3778356313705444, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 5.82889187950136, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.08933697640895844, "logits/rejected": 0.06262365728616714, "logps/chosen": -1.2084307670593262, "logps/rejected": -1.4006166458129883, "loss": 1.2084, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2084307670593262, "rewards/margins": 0.19218605756759644, "rewards/rejected": -1.4006166458129883, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 5.232845752841482, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.1814894825220108, "logits/rejected": -0.00874222069978714, "logps/chosen": -1.1938554048538208, "logps/rejected": -1.332378625869751, "loss": 1.1939, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1938554048538208, "rewards/margins": 0.13852305710315704, "rewards/rejected": -1.332378625869751, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 5.248309609989704, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.058136679232120514, "logits/rejected": 0.023438360542058945, "logps/chosen": -1.266739010810852, "logps/rejected": -1.3324171304702759, "loss": 1.2667, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.266739010810852, "rewards/margins": 0.0656779333949089, "rewards/rejected": -1.3324171304702759, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 5.075587909376229, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.08655783534049988, "logits/rejected": 0.013869370333850384, "logps/chosen": -1.2598979473114014, "logps/rejected": -1.3275713920593262, "loss": 1.2599, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2598979473114014, "rewards/margins": 0.06767337024211884, "rewards/rejected": -1.3275713920593262, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 6.8226424685408835, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.05602610856294632, "logits/rejected": 0.003136430634185672, "logps/chosen": -1.3283345699310303, "logps/rejected": -1.4464716911315918, "loss": 1.3283, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3283345699310303, "rewards/margins": 0.11813720315694809, "rewards/rejected": -1.4464716911315918, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 7.709473567773019, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.05598954111337662, "logits/rejected": -0.04251992702484131, "logps/chosen": -1.3002499341964722, "logps/rejected": -1.3940088748931885, "loss": 1.3002, "rewards/accuracies": 0.5, "rewards/chosen": -1.3002499341964722, "rewards/margins": 0.09375893324613571, "rewards/rejected": -1.3940088748931885, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 4.280685389319865, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.027994494885206223, "logits/rejected": 0.10712435096502304, "logps/chosen": -1.288765788078308, "logps/rejected": -1.345978021621704, "loss": 1.2888, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.288765788078308, "rewards/margins": 0.05721214413642883, "rewards/rejected": -1.345978021621704, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 6.598657009131392, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.029619306325912476, "logits/rejected": 0.007473287172615528, "logps/chosen": -1.2660835981369019, "logps/rejected": -1.284353494644165, "loss": 1.2661, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2660835981369019, "rewards/margins": 0.01827000081539154, "rewards/rejected": -1.284353494644165, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 5.875456029075568, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.06321928650140762, "logits/rejected": 0.17421527206897736, "logps/chosen": -1.3124377727508545, "logps/rejected": -1.355648398399353, "loss": 1.3124, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3124377727508545, "rewards/margins": 0.04321056604385376, "rewards/rejected": -1.355648398399353, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 7.665719361746558, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.1660265475511551, "logits/rejected": -0.04848824068903923, "logps/chosen": -1.3318064212799072, "logps/rejected": -1.4386464357376099, "loss": 1.3318, "rewards/accuracies": 0.5, "rewards/chosen": -1.3318064212799072, "rewards/margins": 0.10684003680944443, "rewards/rejected": -1.4386464357376099, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.837511055092271, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.02699849009513855, "logits/rejected": 0.08308325707912445, "logps/chosen": -1.2567188739776611, "logps/rejected": -1.3732171058654785, "loss": 1.2567, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2567188739776611, "rewards/margins": 0.11649826914072037, "rewards/rejected": -1.3732171058654785, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 7.209874672561748, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.05617652088403702, "logits/rejected": 0.03957556560635567, "logps/chosen": -1.3165287971496582, "logps/rejected": -1.469702959060669, "loss": 1.3165, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3165287971496582, "rewards/margins": 0.1531740128993988, "rewards/rejected": -1.469702959060669, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 5.7199414993838, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.0839785784482956, "logits/rejected": 0.033510372042655945, "logps/chosen": -1.233891487121582, "logps/rejected": -1.4297235012054443, "loss": 1.2339, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.233891487121582, "rewards/margins": 0.19583192467689514, "rewards/rejected": -1.4297235012054443, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.550853269277993, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.12741391360759735, "logits/rejected": -0.04146786034107208, "logps/chosen": -1.262015461921692, "logps/rejected": -1.296488642692566, "loss": 1.262, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.262015461921692, "rewards/margins": 0.034473054111003876, "rewards/rejected": -1.296488642692566, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 5.996864360560427, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.11299796402454376, "logits/rejected": -0.025075841695070267, "logps/chosen": -1.2222387790679932, "logps/rejected": -1.2992204427719116, "loss": 1.2222, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2222387790679932, "rewards/margins": 0.07698164135217667, "rewards/rejected": -1.2992204427719116, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 7.53364674274462, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.01501091755926609, "logits/rejected": 0.06940283626317978, "logps/chosen": -1.264173150062561, "logps/rejected": -1.3698807954788208, "loss": 1.2642, "rewards/accuracies": 0.59375, "rewards/chosen": -1.264173150062561, "rewards/margins": 0.10570766776800156, "rewards/rejected": -1.3698807954788208, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 5.080522101332054, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.02756296470761299, "logits/rejected": 0.050916485488414764, "logps/chosen": -1.2196497917175293, "logps/rejected": -1.292238473892212, "loss": 1.2197, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2196497917175293, "rewards/margins": 0.07258859276771545, "rewards/rejected": -1.292238473892212, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 4.689194590704439, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.21620425581932068, "logits/rejected": -0.12045910209417343, "logps/chosen": -1.3002195358276367, "logps/rejected": -1.3327722549438477, "loss": 1.3002, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3002195358276367, "rewards/margins": 0.03255276381969452, "rewards/rejected": -1.3327722549438477, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 4.8449102026342885, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.09844787418842316, "logits/rejected": 0.044895656406879425, "logps/chosen": -1.15109384059906, "logps/rejected": -1.3417922258377075, "loss": 1.1511, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.15109384059906, "rewards/margins": 0.19069834053516388, "rewards/rejected": -1.3417922258377075, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 4.904280894224136, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.1401177942752838, "logits/rejected": -0.002179872477427125, "logps/chosen": -1.2724847793579102, "logps/rejected": -1.3962496519088745, "loss": 1.2725, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2724847793579102, "rewards/margins": 0.12376473098993301, "rewards/rejected": -1.3962496519088745, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 5.58787385262131, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.04134818911552429, "logits/rejected": 0.06844131648540497, "logps/chosen": -1.3039209842681885, "logps/rejected": -1.2591978311538696, "loss": 1.3039, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3039209842681885, "rewards/margins": -0.04472309350967407, "rewards/rejected": -1.2591978311538696, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 5.6798521044263, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.042066436260938644, "logits/rejected": 0.046875447034835815, "logps/chosen": -1.199002981185913, "logps/rejected": -1.4048508405685425, "loss": 1.199, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.199002981185913, "rewards/margins": 0.20584771037101746, "rewards/rejected": -1.4048508405685425, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 5.716641820136112, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.021493086591362953, "logits/rejected": 0.02113175578415394, "logps/chosen": -1.2145483493804932, "logps/rejected": -1.4494047164916992, "loss": 1.2145, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2145483493804932, "rewards/margins": 0.23485641181468964, "rewards/rejected": -1.4494047164916992, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 5.893820559673207, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.10930220037698746, "logits/rejected": -0.011229626834392548, "logps/chosen": -1.2707898616790771, "logps/rejected": -1.3175081014633179, "loss": 1.2708, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2707898616790771, "rewards/margins": 0.046718332916498184, "rewards/rejected": -1.3175081014633179, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 5.424605550836344, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.007498130202293396, "logits/rejected": 0.022906240075826645, "logps/chosen": -1.2251545190811157, "logps/rejected": -1.3801215887069702, "loss": 1.2252, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2251545190811157, "rewards/margins": 0.15496711432933807, "rewards/rejected": -1.3801215887069702, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 5.08118674548093, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.08239279687404633, "logits/rejected": -0.013707287609577179, "logps/chosen": -1.28090500831604, "logps/rejected": -1.3163259029388428, "loss": 1.2809, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.28090500831604, "rewards/margins": 0.035420972853899, "rewards/rejected": -1.3163259029388428, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 5.949062072184871, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.1501970738172531, "logits/rejected": 0.016708824783563614, "logps/chosen": -1.3148200511932373, "logps/rejected": -1.3959825038909912, "loss": 1.3148, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3148200511932373, "rewards/margins": 0.0811624675989151, "rewards/rejected": -1.3959825038909912, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 6.824647537935466, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.03242180496454239, "logits/rejected": 0.041930533945560455, "logps/chosen": -1.1971319913864136, "logps/rejected": -1.3061772584915161, "loss": 1.1971, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1971319913864136, "rewards/margins": 0.1090451255440712, "rewards/rejected": -1.3061772584915161, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 6.988090804184654, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.06413501501083374, "logits/rejected": 0.05337250232696533, "logps/chosen": -1.2282226085662842, "logps/rejected": -1.4047060012817383, "loss": 1.2282, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2282226085662842, "rewards/margins": 0.17648327350616455, "rewards/rejected": -1.4047060012817383, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 3.910942945066741, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.05352293327450752, "logits/rejected": 0.06099768728017807, "logps/chosen": -1.1895208358764648, "logps/rejected": -1.3550188541412354, "loss": 1.1895, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1895208358764648, "rewards/margins": 0.16549797356128693, "rewards/rejected": -1.3550188541412354, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 5.609411753865515, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.04931207001209259, "logits/rejected": 0.023060161620378494, "logps/chosen": -1.313791275024414, "logps/rejected": -1.412623405456543, "loss": 1.3138, "rewards/accuracies": 0.53125, "rewards/chosen": -1.313791275024414, "rewards/margins": 0.0988321453332901, "rewards/rejected": -1.412623405456543, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 5.553946616792682, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.1349884271621704, "logits/rejected": -0.1107080727815628, "logps/chosen": -1.2724605798721313, "logps/rejected": -1.3187648057937622, "loss": 1.2725, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2724605798721313, "rewards/margins": 0.04630430042743683, "rewards/rejected": -1.3187648057937622, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 4.558776085975967, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.11348026990890503, "logits/rejected": 0.017362769693136215, "logps/chosen": -1.2414212226867676, "logps/rejected": -1.3804320096969604, "loss": 1.2414, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2414212226867676, "rewards/margins": 0.1390109360218048, "rewards/rejected": -1.3804320096969604, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 5.641423745535041, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.04174742102622986, "logits/rejected": 0.042013019323349, "logps/chosen": -1.2246649265289307, "logps/rejected": -1.3954532146453857, "loss": 1.2247, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2246649265289307, "rewards/margins": 0.17078819870948792, "rewards/rejected": -1.3954532146453857, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 6.450252652398651, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.04029151052236557, "logits/rejected": 0.08345986902713776, "logps/chosen": -1.2942759990692139, "logps/rejected": -1.4027588367462158, "loss": 1.2943, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2942759990692139, "rewards/margins": 0.10848300158977509, "rewards/rejected": -1.4027588367462158, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 6.860963399621977, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.08877672255039215, "logits/rejected": -0.011749161407351494, "logps/chosen": -1.265564203262329, "logps/rejected": -1.2752305269241333, "loss": 1.2656, "rewards/accuracies": 0.4375, "rewards/chosen": -1.265564203262329, "rewards/margins": 0.00966625101864338, "rewards/rejected": -1.2752305269241333, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 6.908115744202685, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.10008825361728668, "logits/rejected": -0.009861460886895657, "logps/chosen": -1.3463541269302368, "logps/rejected": -1.3537318706512451, "loss": 1.3464, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3463541269302368, "rewards/margins": 0.007377811707556248, "rewards/rejected": -1.3537318706512451, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 7.722241136596385, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.12104526907205582, "logits/rejected": -0.03852628171443939, "logps/chosen": -1.2310656309127808, "logps/rejected": -1.3607760667800903, "loss": 1.2311, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2310656309127808, "rewards/margins": 0.12971025705337524, "rewards/rejected": -1.3607760667800903, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 6.593758930542835, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.04661126434803009, "logits/rejected": -0.010233288630843163, "logps/chosen": -1.1887028217315674, "logps/rejected": -1.3890939950942993, "loss": 1.1887, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1887028217315674, "rewards/margins": 0.20039117336273193, "rewards/rejected": -1.3890939950942993, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 5.684592280306354, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.1421189308166504, "logits/rejected": 0.059901464730501175, "logps/chosen": -1.292877197265625, "logps/rejected": -1.3848272562026978, "loss": 1.2929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.292877197265625, "rewards/margins": 0.09195006638765335, "rewards/rejected": -1.3848272562026978, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 5.635582815740526, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.05868496745824814, "logits/rejected": -0.012107991613447666, "logps/chosen": -1.2360152006149292, "logps/rejected": -1.344347596168518, "loss": 1.236, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2360152006149292, "rewards/margins": 0.10833243280649185, "rewards/rejected": -1.344347596168518, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 6.101607736916642, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.06639351695775986, "logits/rejected": -0.010837098583579063, "logps/chosen": -1.2725540399551392, "logps/rejected": -1.377894401550293, "loss": 1.2726, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2725540399551392, "rewards/margins": 0.10534045845270157, "rewards/rejected": -1.377894401550293, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 6.377364037631754, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.08042605221271515, "logits/rejected": 0.05226296931505203, "logps/chosen": -1.254617691040039, "logps/rejected": -1.3008910417556763, "loss": 1.2546, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.254617691040039, "rewards/margins": 0.04627348482608795, "rewards/rejected": -1.3008910417556763, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 7.375140170113862, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.10395167768001556, "logits/rejected": -0.028587546199560165, "logps/chosen": -1.2934331893920898, "logps/rejected": -1.3751928806304932, "loss": 1.2934, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2934331893920898, "rewards/margins": 0.0817597359418869, "rewards/rejected": -1.3751928806304932, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 8.129397648474804, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.11710800230503082, "logits/rejected": 0.013529367744922638, "logps/chosen": -1.2018146514892578, "logps/rejected": -1.3654011487960815, "loss": 1.2018, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2018146514892578, "rewards/margins": 0.16358642280101776, "rewards/rejected": -1.3654011487960815, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 8.523370263530879, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.07853835076093674, "logits/rejected": 0.04062449187040329, "logps/chosen": -1.2477967739105225, "logps/rejected": -1.4284135103225708, "loss": 1.2478, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2477967739105225, "rewards/margins": 0.18061673641204834, "rewards/rejected": -1.4284135103225708, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 5.716761053986626, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.04919663816690445, "logits/rejected": 0.08321021497249603, "logps/chosen": -1.2510499954223633, "logps/rejected": -1.3141735792160034, "loss": 1.251, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2510499954223633, "rewards/margins": 0.06312358379364014, "rewards/rejected": -1.3141735792160034, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 6.954332256137424, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.18461468815803528, "logits/rejected": 0.026880990713834763, "logps/chosen": -1.3117130994796753, "logps/rejected": -1.383786916732788, "loss": 1.3117, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3117130994796753, "rewards/margins": 0.07207369059324265, "rewards/rejected": -1.383786916732788, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 4.309932700986605, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.14361463487148285, "logits/rejected": 0.043356459587812424, "logps/chosen": -1.3343501091003418, "logps/rejected": -1.4364620447158813, "loss": 1.3343, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3343501091003418, "rewards/margins": 0.10211189836263657, "rewards/rejected": -1.4364620447158813, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 6.657059518303394, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.05316805839538574, "logits/rejected": -0.001307845115661621, "logps/chosen": -1.2854853868484497, "logps/rejected": -1.4104955196380615, "loss": 1.2855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2854853868484497, "rewards/margins": 0.1250101625919342, "rewards/rejected": -1.4104955196380615, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 5.826388916391279, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.07666482776403427, "logits/rejected": -0.001667500240728259, "logps/chosen": -1.2881529331207275, "logps/rejected": -1.379446268081665, "loss": 1.2882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2881529331207275, "rewards/margins": 0.09129341691732407, "rewards/rejected": -1.379446268081665, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 6.54790392462915, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.022969847545027733, "logits/rejected": -0.01960369199514389, "logps/chosen": -1.1744037866592407, "logps/rejected": -1.2922977209091187, "loss": 1.1744, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1744037866592407, "rewards/margins": 0.11789387464523315, "rewards/rejected": -1.2922977209091187, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 6.3223867132370435, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.055620092898607254, "logits/rejected": 0.024541283026337624, "logps/chosen": -1.2679303884506226, "logps/rejected": -1.3035547733306885, "loss": 1.2679, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2679303884506226, "rewards/margins": 0.0356244221329689, "rewards/rejected": -1.3035547733306885, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 6.032916512134836, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.013345959596335888, "logits/rejected": 0.17455056309700012, "logps/chosen": -1.2534754276275635, "logps/rejected": -1.3733937740325928, "loss": 1.2535, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2534754276275635, "rewards/margins": 0.11991840600967407, "rewards/rejected": -1.3733937740325928, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.082785930124816, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.049613501876592636, "logits/rejected": 0.07048625499010086, "logps/chosen": -1.1938594579696655, "logps/rejected": -1.4128704071044922, "loss": 1.1939, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1938594579696655, "rewards/margins": 0.21901094913482666, "rewards/rejected": -1.4128704071044922, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 6.32812048940542, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.10778329521417618, "logits/rejected": -0.03499821573495865, "logps/chosen": -1.2097148895263672, "logps/rejected": -1.2936917543411255, "loss": 1.2097, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2097148895263672, "rewards/margins": 0.08397690951824188, "rewards/rejected": -1.2936917543411255, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 8.102687214414907, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.05014738440513611, "logits/rejected": 0.007323701865971088, "logps/chosen": -1.3255528211593628, "logps/rejected": -1.292812466621399, "loss": 1.3256, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3255528211593628, "rewards/margins": -0.03274022415280342, "rewards/rejected": -1.292812466621399, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 6.3187462705917685, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.136830136179924, "logits/rejected": 0.006453720387071371, "logps/chosen": -1.299041986465454, "logps/rejected": -1.4341981410980225, "loss": 1.299, "rewards/accuracies": 0.5625, "rewards/chosen": -1.299041986465454, "rewards/margins": 0.13515594601631165, "rewards/rejected": -1.4341981410980225, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.16828212141990662, "eval_logits/rejected": 0.237099751830101, "eval_logps/chosen": -1.2771652936935425, "eval_logps/rejected": -1.3985117673873901, "eval_loss": 1.2774401903152466, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.2771652936935425, "eval_rewards/margins": 0.12134658545255661, "eval_rewards/rejected": -1.3985117673873901, "eval_runtime": 40.4866, "eval_samples_per_second": 33.221, "eval_steps_per_second": 8.324, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 4.706129959583797, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.14844748377799988, "logits/rejected": 0.02692342922091484, "logps/chosen": -1.2316442728042603, "logps/rejected": -1.428823471069336, "loss": 1.2316, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2316442728042603, "rewards/margins": 0.19717933237552643, "rewards/rejected": -1.428823471069336, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 4.684764204554033, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.15728111565113068, "logits/rejected": -0.01533517986536026, "logps/chosen": -1.218471884727478, "logps/rejected": -1.3499605655670166, "loss": 1.2185, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.218471884727478, "rewards/margins": 0.1314888298511505, "rewards/rejected": -1.3499605655670166, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 5.4533499372160295, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.08542884141206741, "logits/rejected": -0.006005966570228338, "logps/chosen": -1.281269907951355, "logps/rejected": -1.4252756834030151, "loss": 1.2813, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.281269907951355, "rewards/margins": 0.14400574564933777, "rewards/rejected": -1.4252756834030151, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.1818072234106936, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.0647321417927742, "logits/rejected": 0.04220649227499962, "logps/chosen": -1.1488441228866577, "logps/rejected": -1.3910646438598633, "loss": 1.1488, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1488441228866577, "rewards/margins": 0.24222052097320557, "rewards/rejected": -1.3910646438598633, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 8.193748855007913, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.014203441329300404, "logits/rejected": 0.05631190538406372, "logps/chosen": -1.3219023942947388, "logps/rejected": -1.4358569383621216, "loss": 1.3219, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3219023942947388, "rewards/margins": 0.11395438760519028, "rewards/rejected": -1.4358569383621216, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 5.908531772837574, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.05319751054048538, "logits/rejected": 0.057577311992645264, "logps/chosen": -1.2869704961776733, "logps/rejected": -1.3215672969818115, "loss": 1.287, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2869704961776733, "rewards/margins": 0.034596916288137436, "rewards/rejected": -1.3215672969818115, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 6.2409922953238794, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.1256805956363678, "logits/rejected": -0.03462417051196098, "logps/chosen": -1.194129228591919, "logps/rejected": -1.329732894897461, "loss": 1.1941, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.194129228591919, "rewards/margins": 0.13560371100902557, "rewards/rejected": -1.329732894897461, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 6.225754924697384, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.10527032613754272, "logits/rejected": -0.027467811480164528, "logps/chosen": -1.1909500360488892, "logps/rejected": -1.309623122215271, "loss": 1.191, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1909500360488892, "rewards/margins": 0.1186731606721878, "rewards/rejected": -1.309623122215271, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 6.251916559014856, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.056687306612730026, "logits/rejected": -0.015938516706228256, "logps/chosen": -1.1889241933822632, "logps/rejected": -1.3275558948516846, "loss": 1.1889, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1889241933822632, "rewards/margins": 0.13863176107406616, "rewards/rejected": -1.3275558948516846, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 7.4091004387765445, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.004517675843089819, "logits/rejected": 0.05838106945157051, "logps/chosen": -1.2191541194915771, "logps/rejected": -1.3890324831008911, "loss": 1.2192, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2191541194915771, "rewards/margins": 0.16987822949886322, "rewards/rejected": -1.3890324831008911, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 6.892566495957107, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.10453947633504868, "logits/rejected": -0.028329085558652878, "logps/chosen": -1.2791041135787964, "logps/rejected": -1.3269407749176025, "loss": 1.2791, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2791041135787964, "rewards/margins": 0.04783656448125839, "rewards/rejected": -1.3269407749176025, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 5.647941330534759, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.01439726073294878, "logits/rejected": -0.02386179193854332, "logps/chosen": -1.2101633548736572, "logps/rejected": -1.3591997623443604, "loss": 1.2102, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2101633548736572, "rewards/margins": 0.14903631806373596, "rewards/rejected": -1.3591997623443604, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 4.286583801465364, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.07431872189044952, "logits/rejected": -0.021642297506332397, "logps/chosen": -1.2794735431671143, "logps/rejected": -1.3915115594863892, "loss": 1.2795, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2794735431671143, "rewards/margins": 0.11203791201114655, "rewards/rejected": -1.3915115594863892, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 4.996139307225309, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.03806663304567337, "logits/rejected": 0.08872498571872711, "logps/chosen": -1.2847729921340942, "logps/rejected": -1.333287000656128, "loss": 1.2848, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2847729921340942, "rewards/margins": 0.04851409047842026, "rewards/rejected": -1.333287000656128, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 4.173911869418022, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.013347308151423931, "logits/rejected": 0.15775558352470398, "logps/chosen": -1.2872469425201416, "logps/rejected": -1.4723048210144043, "loss": 1.2872, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2872469425201416, "rewards/margins": 0.18505795300006866, "rewards/rejected": -1.4723048210144043, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 5.540568690189591, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.11366579681634903, "logits/rejected": -0.02030355855822563, "logps/chosen": -1.3184024095535278, "logps/rejected": -1.4004123210906982, "loss": 1.3184, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3184024095535278, "rewards/margins": 0.08201000094413757, "rewards/rejected": -1.4004123210906982, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 6.546255056205437, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.07983747869729996, "logits/rejected": 0.10967172682285309, "logps/chosen": -1.2601633071899414, "logps/rejected": -1.3461735248565674, "loss": 1.2602, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2601633071899414, "rewards/margins": 0.08601019531488419, "rewards/rejected": -1.3461735248565674, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 5.558125473030802, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.08322546631097794, "logits/rejected": 0.06822848320007324, "logps/chosen": -1.1913286447525024, "logps/rejected": -1.3295924663543701, "loss": 1.1913, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1913286447525024, "rewards/margins": 0.1382637470960617, "rewards/rejected": -1.3295924663543701, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 8.032770037301256, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.19118326902389526, "logits/rejected": -0.07869881391525269, "logps/chosen": -1.296580195426941, "logps/rejected": -1.3727655410766602, "loss": 1.2966, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.296580195426941, "rewards/margins": 0.07618550211191177, "rewards/rejected": -1.3727655410766602, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 6.488893917545801, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.05113423615694046, "logits/rejected": -0.08385536074638367, "logps/chosen": -1.3025518655776978, "logps/rejected": -1.3207619190216064, "loss": 1.3026, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3025518655776978, "rewards/margins": 0.01821029745042324, "rewards/rejected": -1.3207619190216064, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 6.002613642120395, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.03487701341509819, "logits/rejected": 0.08435748517513275, "logps/chosen": -1.273768663406372, "logps/rejected": -1.3340345621109009, "loss": 1.2738, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.273768663406372, "rewards/margins": 0.06026585027575493, "rewards/rejected": -1.3340345621109009, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 6.537475319409445, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.10028685629367828, "logits/rejected": -0.03481400012969971, "logps/chosen": -1.210261344909668, "logps/rejected": -1.3266297578811646, "loss": 1.2103, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.210261344909668, "rewards/margins": 0.1163683533668518, "rewards/rejected": -1.3266297578811646, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 5.1500303100501785, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.07758782804012299, "logits/rejected": 0.09597153961658478, "logps/chosen": -1.3024771213531494, "logps/rejected": -1.3409560918807983, "loss": 1.3025, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3024771213531494, "rewards/margins": 0.038479022681713104, "rewards/rejected": -1.3409560918807983, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 4.485367654860153, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.09555426985025406, "logits/rejected": -0.05298949405550957, "logps/chosen": -1.1396243572235107, "logps/rejected": -1.3529847860336304, "loss": 1.1396, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1396243572235107, "rewards/margins": 0.21336038410663605, "rewards/rejected": -1.3529847860336304, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 5.604939297684912, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.10861408710479736, "logits/rejected": -0.030441652983427048, "logps/chosen": -1.2947056293487549, "logps/rejected": -1.4465768337249756, "loss": 1.2947, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2947056293487549, "rewards/margins": 0.15187107026576996, "rewards/rejected": -1.4465768337249756, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.9333027069449376, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.0011131629580631852, "logits/rejected": 0.03207176551222801, "logps/chosen": -1.2584741115570068, "logps/rejected": -1.3183884620666504, "loss": 1.2585, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2584741115570068, "rewards/margins": 0.05991457775235176, "rewards/rejected": -1.3183884620666504, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 5.227438605136431, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.14465519785881042, "logits/rejected": -0.018878687173128128, "logps/chosen": -1.1944607496261597, "logps/rejected": -1.4893165826797485, "loss": 1.1945, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1944607496261597, "rewards/margins": 0.2948557734489441, "rewards/rejected": -1.4893165826797485, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 5.424701956821222, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.10186507552862167, "logits/rejected": -0.011081508360803127, "logps/chosen": -1.244520902633667, "logps/rejected": -1.3266674280166626, "loss": 1.2445, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.244520902633667, "rewards/margins": 0.08214623481035233, "rewards/rejected": -1.3266674280166626, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 5.449693356091585, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.03703099489212036, "logits/rejected": -0.02959308959543705, "logps/chosen": -1.238405704498291, "logps/rejected": -1.3383169174194336, "loss": 1.2384, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.238405704498291, "rewards/margins": 0.09991107136011124, "rewards/rejected": -1.3383169174194336, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 9.80912532288398, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.1074419617652893, "logits/rejected": -0.05361846089363098, "logps/chosen": -1.2146122455596924, "logps/rejected": -1.4120653867721558, "loss": 1.2146, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2146122455596924, "rewards/margins": 0.19745299220085144, "rewards/rejected": -1.4120653867721558, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 4.864941410776057, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.10069048404693604, "logits/rejected": -0.03998316824436188, "logps/chosen": -1.2204759120941162, "logps/rejected": -1.3013700246810913, "loss": 1.2205, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2204759120941162, "rewards/margins": 0.08089412748813629, "rewards/rejected": -1.3013700246810913, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 5.190275876421117, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.06184813380241394, "logits/rejected": 0.017844293266534805, "logps/chosen": -1.2042120695114136, "logps/rejected": -1.342323660850525, "loss": 1.2042, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2042120695114136, "rewards/margins": 0.1381116807460785, "rewards/rejected": -1.342323660850525, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 5.636618978217605, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.054620254784822464, "logits/rejected": -0.04075001925230026, "logps/chosen": -1.2419077157974243, "logps/rejected": -1.361064076423645, "loss": 1.2419, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2419077157974243, "rewards/margins": 0.11915643513202667, "rewards/rejected": -1.361064076423645, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.480701674496628, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.04914510250091553, "logits/rejected": 0.08171708881855011, "logps/chosen": -1.2658190727233887, "logps/rejected": -1.305246114730835, "loss": 1.2658, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2658190727233887, "rewards/margins": 0.03942699730396271, "rewards/rejected": -1.305246114730835, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 5.818767218484126, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.11155180633068085, "logits/rejected": -0.059415530413389206, "logps/chosen": -1.2084451913833618, "logps/rejected": -1.311943531036377, "loss": 1.2084, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2084451913833618, "rewards/margins": 0.1034984141588211, "rewards/rejected": -1.311943531036377, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 5.563069107829783, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.10048563778400421, "logits/rejected": -0.006252491381019354, "logps/chosen": -1.2368733882904053, "logps/rejected": -1.3720879554748535, "loss": 1.2369, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2368733882904053, "rewards/margins": 0.13521453738212585, "rewards/rejected": -1.3720879554748535, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 5.620243979871199, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.015386748127639294, "logits/rejected": 0.041078388690948486, "logps/chosen": -1.172307014465332, "logps/rejected": -1.2574660778045654, "loss": 1.1723, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.172307014465332, "rewards/margins": 0.08515889942646027, "rewards/rejected": -1.2574660778045654, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 5.751149413640933, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.02715134620666504, "logits/rejected": -0.006797707173973322, "logps/chosen": -1.2590100765228271, "logps/rejected": -1.3531734943389893, "loss": 1.259, "rewards/accuracies": 0.5, "rewards/chosen": -1.2590100765228271, "rewards/margins": 0.09416332095861435, "rewards/rejected": -1.3531734943389893, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 5.890320598104453, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.11601047217845917, "logits/rejected": -0.011490595526993275, "logps/chosen": -1.2377374172210693, "logps/rejected": -1.3495173454284668, "loss": 1.2377, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2377374172210693, "rewards/margins": 0.11178008466959, "rewards/rejected": -1.3495173454284668, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 4.71456435634355, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.039800308644771576, "logits/rejected": 0.008342784829437733, "logps/chosen": -1.216266393661499, "logps/rejected": -1.296705961227417, "loss": 1.2163, "rewards/accuracies": 0.5, "rewards/chosen": -1.216266393661499, "rewards/margins": 0.08043956756591797, "rewards/rejected": -1.296705961227417, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.110034279228774, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.15422633290290833, "logits/rejected": -0.056101106107234955, "logps/chosen": -1.1808069944381714, "logps/rejected": -1.3436143398284912, "loss": 1.1808, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1808069944381714, "rewards/margins": 0.16280731558799744, "rewards/rejected": -1.3436143398284912, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.249568921549893, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.053187668323516846, "logits/rejected": 0.062101416289806366, "logps/chosen": -1.2118345499038696, "logps/rejected": -1.293718695640564, "loss": 1.2118, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2118345499038696, "rewards/margins": 0.08188413083553314, "rewards/rejected": -1.293718695640564, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 4.759606129248082, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.0789082944393158, "logits/rejected": 0.01260935328900814, "logps/chosen": -1.238387107849121, "logps/rejected": -1.304663896560669, "loss": 1.2384, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.238387107849121, "rewards/margins": 0.06627664715051651, "rewards/rejected": -1.304663896560669, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 6.763627000096796, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.06760134547948837, "logits/rejected": -0.01759599708020687, "logps/chosen": -1.2203199863433838, "logps/rejected": -1.3097541332244873, "loss": 1.2203, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2203199863433838, "rewards/margins": 0.08943423628807068, "rewards/rejected": -1.3097541332244873, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 6.531294731436835, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.13651055097579956, "logits/rejected": -0.028382977470755577, "logps/chosen": -1.2116529941558838, "logps/rejected": -1.313040018081665, "loss": 1.2117, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2116529941558838, "rewards/margins": 0.10138698667287827, "rewards/rejected": -1.313040018081665, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 4.223566563876965, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.014116671867668629, "logits/rejected": 0.02796078287065029, "logps/chosen": -1.2938498258590698, "logps/rejected": -1.307074785232544, "loss": 1.2938, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2938498258590698, "rewards/margins": 0.013224741443991661, "rewards/rejected": -1.307074785232544, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.743659849305691, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.1013193354010582, "logits/rejected": -0.06271430104970932, "logps/chosen": -1.2439030408859253, "logps/rejected": -1.2528250217437744, "loss": 1.2439, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2439030408859253, "rewards/margins": 0.008922002278268337, "rewards/rejected": -1.2528250217437744, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 5.685585749367072, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.04116172343492508, "logits/rejected": 0.042606521397829056, "logps/chosen": -1.2456144094467163, "logps/rejected": -1.4011667966842651, "loss": 1.2456, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2456144094467163, "rewards/margins": 0.15555259585380554, "rewards/rejected": -1.4011667966842651, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 7.796310777715372, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.07885689288377762, "logits/rejected": -0.005295447073876858, "logps/chosen": -1.2680327892303467, "logps/rejected": -1.2865180969238281, "loss": 1.268, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2680327892303467, "rewards/margins": 0.01848551258444786, "rewards/rejected": -1.2865180969238281, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 5.863534544265974, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.21118608117103577, "logits/rejected": -0.11683789640665054, "logps/chosen": -1.2787643671035767, "logps/rejected": -1.255531907081604, "loss": 1.2788, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2787643671035767, "rewards/margins": -0.023232419043779373, "rewards/rejected": -1.255531907081604, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 5.953837154802561, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.08769471943378448, "logits/rejected": -0.01987278088927269, "logps/chosen": -1.2067186832427979, "logps/rejected": -1.3098644018173218, "loss": 1.2067, "rewards/accuracies": 0.5, "rewards/chosen": -1.2067186832427979, "rewards/margins": 0.10314575582742691, "rewards/rejected": -1.3098644018173218, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 6.108645301294965, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.05838220193982124, "logits/rejected": -0.004279000218957663, "logps/chosen": -1.27395761013031, "logps/rejected": -1.348012089729309, "loss": 1.274, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.27395761013031, "rewards/margins": 0.07405432313680649, "rewards/rejected": -1.348012089729309, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 5.790470270825337, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.07506807893514633, "logits/rejected": -0.01916552148759365, "logps/chosen": -1.136309266090393, "logps/rejected": -1.351172685623169, "loss": 1.1363, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.136309266090393, "rewards/margins": 0.21486349403858185, "rewards/rejected": -1.351172685623169, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 5.55256667775222, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.13936546444892883, "logits/rejected": -0.017727404832839966, "logps/chosen": -1.3007433414459229, "logps/rejected": -1.312333345413208, "loss": 1.3007, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3007433414459229, "rewards/margins": 0.01158988382667303, "rewards/rejected": -1.312333345413208, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 5.53527874224577, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.16671988368034363, "logits/rejected": -0.06301872432231903, "logps/chosen": -1.2666199207305908, "logps/rejected": -1.3231887817382812, "loss": 1.2666, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2666199207305908, "rewards/margins": 0.05656880885362625, "rewards/rejected": -1.3231887817382812, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 5.78590090648349, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.0716826468706131, "logits/rejected": 0.06403736025094986, "logps/chosen": -1.2127718925476074, "logps/rejected": -1.4450677633285522, "loss": 1.2128, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2127718925476074, "rewards/margins": 0.23229601979255676, "rewards/rejected": -1.4450677633285522, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.970591349996469, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.09602105617523193, "logits/rejected": -0.03710053116083145, "logps/chosen": -1.1678842306137085, "logps/rejected": -1.3162766695022583, "loss": 1.1679, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1678842306137085, "rewards/margins": 0.1483924835920334, "rewards/rejected": -1.3162766695022583, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 4.764827882172242, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.11473502218723297, "logits/rejected": 0.05037958174943924, "logps/chosen": -1.239828109741211, "logps/rejected": -1.3561474084854126, "loss": 1.2398, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.239828109741211, "rewards/margins": 0.11631934344768524, "rewards/rejected": -1.3561474084854126, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 5.362149895508951, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.08011587709188461, "logits/rejected": -0.03522108122706413, "logps/chosen": -1.2102866172790527, "logps/rejected": -1.3315985202789307, "loss": 1.2103, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2102866172790527, "rewards/margins": 0.12131186574697495, "rewards/rejected": -1.3315985202789307, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 5.082764803073307, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1782008707523346, "logits/rejected": 0.01714390330016613, "logps/chosen": -1.2513673305511475, "logps/rejected": -1.3302863836288452, "loss": 1.2514, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2513673305511475, "rewards/margins": 0.07891912758350372, "rewards/rejected": -1.3302863836288452, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 6.004415888211863, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.11069399118423462, "logits/rejected": 0.002429025713354349, "logps/chosen": -1.2668018341064453, "logps/rejected": -1.3524221181869507, "loss": 1.2668, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2668018341064453, "rewards/margins": 0.08562029898166656, "rewards/rejected": -1.3524221181869507, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 5.942784571705081, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.18045949935913086, "logits/rejected": -0.08133818209171295, "logps/chosen": -1.2410888671875, "logps/rejected": -1.3737916946411133, "loss": 1.2411, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2410888671875, "rewards/margins": 0.1327027678489685, "rewards/rejected": -1.3737916946411133, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 4.530737490973604, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.12958183884620667, "logits/rejected": -0.07654155045747757, "logps/chosen": -1.201305866241455, "logps/rejected": -1.2773125171661377, "loss": 1.2013, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.201305866241455, "rewards/margins": 0.07600654661655426, "rewards/rejected": -1.2773125171661377, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 4.968662589128607, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.1511370986700058, "logits/rejected": -0.09386531263589859, "logps/chosen": -1.1283725500106812, "logps/rejected": -1.2839008569717407, "loss": 1.1284, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1283725500106812, "rewards/margins": 0.155528262257576, "rewards/rejected": -1.2839008569717407, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 6.04410159796314, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2452036589384079, "logits/rejected": -0.07627084851264954, "logps/chosen": -1.3105523586273193, "logps/rejected": -1.3598114252090454, "loss": 1.3106, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3105523586273193, "rewards/margins": 0.04925920441746712, "rewards/rejected": -1.3598114252090454, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 5.599404315609211, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.16792604327201843, "logits/rejected": -0.059492744505405426, "logps/chosen": -1.1681954860687256, "logps/rejected": -1.3302329778671265, "loss": 1.1682, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1681954860687256, "rewards/margins": 0.1620374470949173, "rewards/rejected": -1.3302329778671265, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 4.915464289872929, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.19638477265834808, "logits/rejected": -0.05443229526281357, "logps/chosen": -1.2369394302368164, "logps/rejected": -1.3513962030410767, "loss": 1.2369, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2369394302368164, "rewards/margins": 0.11445698887109756, "rewards/rejected": -1.3513962030410767, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 6.00910111314151, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.10789616405963898, "logits/rejected": -0.0299757719039917, "logps/chosen": -1.2394063472747803, "logps/rejected": -1.3684817552566528, "loss": 1.2394, "rewards/accuracies": 0.5, "rewards/chosen": -1.2394063472747803, "rewards/margins": 0.12907540798187256, "rewards/rejected": -1.3684817552566528, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 6.802877890217001, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.20342381298542023, "logits/rejected": -0.07089727371931076, "logps/chosen": -1.226131558418274, "logps/rejected": -1.3154659271240234, "loss": 1.2261, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.226131558418274, "rewards/margins": 0.0893343836069107, "rewards/rejected": -1.3154659271240234, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 5.537511618082795, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.12597547471523285, "logits/rejected": 0.041483283042907715, "logps/chosen": -1.272071123123169, "logps/rejected": -1.3488727807998657, "loss": 1.2721, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.272071123123169, "rewards/margins": 0.07680176943540573, "rewards/rejected": -1.3488727807998657, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 4.833152176079654, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.10111737251281738, "logits/rejected": -0.05218609422445297, "logps/chosen": -1.1657768487930298, "logps/rejected": -1.3429656028747559, "loss": 1.1658, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1657768487930298, "rewards/margins": 0.17718879878520966, "rewards/rejected": -1.3429656028747559, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 5.053968023104399, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.17936724424362183, "logits/rejected": -0.1632976084947586, "logps/chosen": -1.2209458351135254, "logps/rejected": -1.3213059902191162, "loss": 1.2209, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2209458351135254, "rewards/margins": 0.10036007314920425, "rewards/rejected": -1.3213059902191162, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 5.765422559782699, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.08840032666921616, "logits/rejected": -0.03862199932336807, "logps/chosen": -1.169777750968933, "logps/rejected": -1.406432867050171, "loss": 1.1698, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.169777750968933, "rewards/margins": 0.2366550862789154, "rewards/rejected": -1.406432867050171, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 6.512676705139522, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.121294304728508, "logits/rejected": -0.014312471263110638, "logps/chosen": -1.2345852851867676, "logps/rejected": -1.372929573059082, "loss": 1.2346, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2345852851867676, "rewards/margins": 0.13834428787231445, "rewards/rejected": -1.372929573059082, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 6.718892091740899, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.062336765229701996, "logits/rejected": 0.010673092678189278, "logps/chosen": -1.1685123443603516, "logps/rejected": -1.3177303075790405, "loss": 1.1685, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1685123443603516, "rewards/margins": 0.14921803772449493, "rewards/rejected": -1.3177303075790405, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 6.269028387534846, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.03442586958408356, "logits/rejected": 0.0655202567577362, "logps/chosen": -1.2380412817001343, "logps/rejected": -1.3594632148742676, "loss": 1.238, "rewards/accuracies": 0.5, "rewards/chosen": -1.2380412817001343, "rewards/margins": 0.12142185121774673, "rewards/rejected": -1.3594632148742676, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 7.083986919426561, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.10518360137939453, "logits/rejected": 0.019456366077065468, "logps/chosen": -1.2587789297103882, "logps/rejected": -1.3071810007095337, "loss": 1.2588, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2587789297103882, "rewards/margins": 0.04840191826224327, "rewards/rejected": -1.3071810007095337, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 6.359174455491181, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.12381356954574585, "logits/rejected": -0.04899066314101219, "logps/chosen": -1.2622495889663696, "logps/rejected": -1.3800541162490845, "loss": 1.2622, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2622495889663696, "rewards/margins": 0.11780460178852081, "rewards/rejected": -1.3800541162490845, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 7.031474260723367, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.011473476886749268, "logits/rejected": 0.10578162968158722, "logps/chosen": -1.1833925247192383, "logps/rejected": -1.39220130443573, "loss": 1.1834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1833925247192383, "rewards/margins": 0.20880897343158722, "rewards/rejected": -1.39220130443573, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 6.437384769255352, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.13591434061527252, "logits/rejected": -0.013417154550552368, "logps/chosen": -1.26173996925354, "logps/rejected": -1.2660226821899414, "loss": 1.2617, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.26173996925354, "rewards/margins": 0.004282678477466106, "rewards/rejected": -1.2660226821899414, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.11803299188613892, "eval_logits/rejected": 0.18416385352611542, "eval_logps/chosen": -1.2723654508590698, "eval_logps/rejected": -1.3957616090774536, "eval_loss": 1.272648572921753, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.2723654508590698, "eval_rewards/margins": 0.12339626252651215, "eval_rewards/rejected": -1.3957616090774536, "eval_runtime": 40.4759, "eval_samples_per_second": 33.23, "eval_steps_per_second": 8.326, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 6.646707601263178, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.1225653663277626, "logits/rejected": -0.06654112040996552, "logps/chosen": -1.2722960710525513, "logps/rejected": -1.3049709796905518, "loss": 1.2723, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2722960710525513, "rewards/margins": 0.032674990594387054, "rewards/rejected": -1.3049709796905518, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 6.292968364555303, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.047016628086566925, "logits/rejected": 0.030281126499176025, "logps/chosen": -1.2021185159683228, "logps/rejected": -1.3069348335266113, "loss": 1.2021, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2021185159683228, "rewards/margins": 0.10481645911931992, "rewards/rejected": -1.3069348335266113, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.275413396543524, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.10963277518749237, "logits/rejected": 0.020370814949274063, "logps/chosen": -1.1683590412139893, "logps/rejected": -1.3022961616516113, "loss": 1.1684, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1683590412139893, "rewards/margins": 0.1339370310306549, "rewards/rejected": -1.3022961616516113, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 4.8680623916634955, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.12415452301502228, "logits/rejected": -0.003602514509111643, "logps/chosen": -1.282753586769104, "logps/rejected": -1.4027743339538574, "loss": 1.2828, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.282753586769104, "rewards/margins": 0.12002084404230118, "rewards/rejected": -1.4027743339538574, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 5.592836907360289, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.04008990526199341, "logits/rejected": 0.0548185296356678, "logps/chosen": -1.2346470355987549, "logps/rejected": -1.3540364503860474, "loss": 1.2346, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2346470355987549, "rewards/margins": 0.11938939243555069, "rewards/rejected": -1.3540364503860474, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 4.3612813881412995, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.14926376938819885, "logits/rejected": -0.13337209820747375, "logps/chosen": -1.178349256515503, "logps/rejected": -1.3213645219802856, "loss": 1.1783, "rewards/accuracies": 0.5625, "rewards/chosen": -1.178349256515503, "rewards/margins": 0.14301522076129913, "rewards/rejected": -1.3213645219802856, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 6.265758681699471, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.136104553937912, "logits/rejected": 4.01169054384809e-05, "logps/chosen": -1.1802616119384766, "logps/rejected": -1.2947032451629639, "loss": 1.1803, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1802616119384766, "rewards/margins": 0.1144416555762291, "rewards/rejected": -1.2947032451629639, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 7.612345844853744, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.19015324115753174, "logits/rejected": -0.14277532696723938, "logps/chosen": -1.2053598165512085, "logps/rejected": -1.284320592880249, "loss": 1.2054, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2053598165512085, "rewards/margins": 0.07896091043949127, "rewards/rejected": -1.284320592880249, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 6.638676820040484, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.08868779242038727, "logits/rejected": -0.059639714658260345, "logps/chosen": -1.211193323135376, "logps/rejected": -1.3373340368270874, "loss": 1.2112, "rewards/accuracies": 0.5625, "rewards/chosen": -1.211193323135376, "rewards/margins": 0.1261407434940338, "rewards/rejected": -1.3373340368270874, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 4.620101967373841, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.17162132263183594, "logits/rejected": -0.08911529928445816, "logps/chosen": -1.2484534978866577, "logps/rejected": -1.2800065279006958, "loss": 1.2485, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2484534978866577, "rewards/margins": 0.03155306354165077, "rewards/rejected": -1.2800065279006958, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 5.526195981232156, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.07313922792673111, "logits/rejected": 0.02104576677083969, "logps/chosen": -1.2223886251449585, "logps/rejected": -1.2981535196304321, "loss": 1.2224, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2223886251449585, "rewards/margins": 0.07576508820056915, "rewards/rejected": -1.2981535196304321, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 5.80782846960832, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.0603206641972065, "logits/rejected": -0.029817987233400345, "logps/chosen": -1.2448575496673584, "logps/rejected": -1.3804365396499634, "loss": 1.2449, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2448575496673584, "rewards/margins": 0.13557906448841095, "rewards/rejected": -1.3804365396499634, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 4.29981677489146, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.09495701640844345, "logits/rejected": -0.002689081709831953, "logps/chosen": -1.229345440864563, "logps/rejected": -1.2979934215545654, "loss": 1.2293, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.229345440864563, "rewards/margins": 0.06864795088768005, "rewards/rejected": -1.2979934215545654, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 7.310264004235706, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.10165844857692719, "logits/rejected": 0.02135707437992096, "logps/chosen": -1.1787116527557373, "logps/rejected": -1.3846535682678223, "loss": 1.1787, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1787116527557373, "rewards/margins": 0.20594200491905212, "rewards/rejected": -1.3846535682678223, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 5.813177305591112, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.1119382381439209, "logits/rejected": -0.02842354215681553, "logps/chosen": -1.207793951034546, "logps/rejected": -1.3851245641708374, "loss": 1.2078, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.207793951034546, "rewards/margins": 0.1773306429386139, "rewards/rejected": -1.3851245641708374, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.434751053185642, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.17671611905097961, "logits/rejected": -0.04778124764561653, "logps/chosen": -1.147013783454895, "logps/rejected": -1.3792873620986938, "loss": 1.147, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.147013783454895, "rewards/margins": 0.2322736233472824, "rewards/rejected": -1.3792873620986938, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 6.538424625835844, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.1453331708908081, "logits/rejected": -0.05616896599531174, "logps/chosen": -1.2039200067520142, "logps/rejected": -1.2993948459625244, "loss": 1.2039, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2039200067520142, "rewards/margins": 0.09547483921051025, "rewards/rejected": -1.2993948459625244, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 6.47081386538401, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.18423119187355042, "logits/rejected": -0.18029651045799255, "logps/chosen": -1.1753861904144287, "logps/rejected": -1.2847355604171753, "loss": 1.1754, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1753861904144287, "rewards/margins": 0.10934926569461823, "rewards/rejected": -1.2847355604171753, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 4.102415823322389, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.11657415330410004, "logits/rejected": -0.009666027501225471, "logps/chosen": -1.3390775918960571, "logps/rejected": -1.4131731986999512, "loss": 1.3391, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3390775918960571, "rewards/margins": 0.07409565895795822, "rewards/rejected": -1.4131731986999512, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 7.173426557839658, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.05759633705019951, "logits/rejected": -0.035628657788038254, "logps/chosen": -1.3439884185791016, "logps/rejected": -1.3715219497680664, "loss": 1.344, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3439884185791016, "rewards/margins": 0.027533572167158127, "rewards/rejected": -1.3715219497680664, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 5.135477546192973, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.05608576536178589, "logits/rejected": 0.030679399147629738, "logps/chosen": -1.2668836116790771, "logps/rejected": -1.3313047885894775, "loss": 1.2669, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2668836116790771, "rewards/margins": 0.06442122161388397, "rewards/rejected": -1.3313047885894775, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 6.605840647087909, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.15740814805030823, "logits/rejected": -0.025245826691389084, "logps/chosen": -1.2042474746704102, "logps/rejected": -1.4813419580459595, "loss": 1.2042, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2042474746704102, "rewards/margins": 0.27709439396858215, "rewards/rejected": -1.4813419580459595, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 6.128690480006359, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.04347889497876167, "logits/rejected": -0.004486176185309887, "logps/chosen": -1.2062757015228271, "logps/rejected": -1.3040331602096558, "loss": 1.2063, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2062757015228271, "rewards/margins": 0.09775739908218384, "rewards/rejected": -1.3040331602096558, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 6.081292085223822, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.1146886944770813, "logits/rejected": -0.099217489361763, "logps/chosen": -1.2564787864685059, "logps/rejected": -1.350205659866333, "loss": 1.2565, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2564787864685059, "rewards/margins": 0.09372693300247192, "rewards/rejected": -1.350205659866333, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 5.567513576354181, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.09144270420074463, "logits/rejected": -0.04936783015727997, "logps/chosen": -1.1730101108551025, "logps/rejected": -1.3505765199661255, "loss": 1.173, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1730101108551025, "rewards/margins": 0.17756634950637817, "rewards/rejected": -1.3505765199661255, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 5.256224937376001, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.06911294907331467, "logits/rejected": 0.11118035018444061, "logps/chosen": -1.2518221139907837, "logps/rejected": -1.368566870689392, "loss": 1.2518, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2518221139907837, "rewards/margins": 0.11674489825963974, "rewards/rejected": -1.368566870689392, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 4.429382963201047, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.08276676386594772, "logits/rejected": 0.03229274973273277, "logps/chosen": -1.2602951526641846, "logps/rejected": -1.3564172983169556, "loss": 1.2603, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2602951526641846, "rewards/margins": 0.09612230211496353, "rewards/rejected": -1.3564172983169556, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 6.317787594272561, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.13067980110645294, "logits/rejected": -0.031168658286333084, "logps/chosen": -1.2303802967071533, "logps/rejected": -1.3224575519561768, "loss": 1.2304, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2303802967071533, "rewards/margins": 0.09207727760076523, "rewards/rejected": -1.3224575519561768, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 5.279923851008018, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.09545958042144775, "logits/rejected": -0.04043347388505936, "logps/chosen": -1.1182196140289307, "logps/rejected": -1.3192049264907837, "loss": 1.1182, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1182196140289307, "rewards/margins": 0.2009853571653366, "rewards/rejected": -1.3192049264907837, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 4.554423698810828, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.11814825236797333, "logits/rejected": -0.07862936705350876, "logps/chosen": -1.2877062559127808, "logps/rejected": -1.34181809425354, "loss": 1.2877, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2877062559127808, "rewards/margins": 0.054111890494823456, "rewards/rejected": -1.34181809425354, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 5.94653081505584, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.08451677113771439, "logits/rejected": -0.0366448312997818, "logps/chosen": -1.2627698183059692, "logps/rejected": -1.3826115131378174, "loss": 1.2628, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2627698183059692, "rewards/margins": 0.11984151601791382, "rewards/rejected": -1.3826115131378174, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 6.761829415886502, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.15608379244804382, "logits/rejected": -0.07101576030254364, "logps/chosen": -1.1800332069396973, "logps/rejected": -1.3448644876480103, "loss": 1.18, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1800332069396973, "rewards/margins": 0.1648310422897339, "rewards/rejected": -1.3448644876480103, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 6.211343262753404, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.13989779353141785, "logits/rejected": 0.001315080327913165, "logps/chosen": -1.2382445335388184, "logps/rejected": -1.40060293674469, "loss": 1.2382, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2382445335388184, "rewards/margins": 0.16235843300819397, "rewards/rejected": -1.40060293674469, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 4.85777188823862, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.11182068288326263, "logits/rejected": 0.015594090335071087, "logps/chosen": -1.197331190109253, "logps/rejected": -1.3707481622695923, "loss": 1.1973, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.197331190109253, "rewards/margins": 0.17341677844524384, "rewards/rejected": -1.3707481622695923, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 5.3855378197946875, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.15428002178668976, "logits/rejected": -0.03136328607797623, "logps/chosen": -1.267195701599121, "logps/rejected": -1.3045258522033691, "loss": 1.2672, "rewards/accuracies": 0.53125, "rewards/chosen": -1.267195701599121, "rewards/margins": 0.03733006864786148, "rewards/rejected": -1.3045258522033691, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 6.317388118479611, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.12922653555870056, "logits/rejected": -0.08303678035736084, "logps/chosen": -1.2043240070343018, "logps/rejected": -1.3332353830337524, "loss": 1.2043, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2043240070343018, "rewards/margins": 0.12891142070293427, "rewards/rejected": -1.3332353830337524, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 6.431218369393454, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.03538278117775917, "logits/rejected": 0.07423512637615204, "logps/chosen": -1.2341632843017578, "logps/rejected": -1.3104605674743652, "loss": 1.2342, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2341632843017578, "rewards/margins": 0.07629726082086563, "rewards/rejected": -1.3104605674743652, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 5.602234214022139, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.0021831125486642122, "logits/rejected": 0.028201397508382797, "logps/chosen": -1.214002013206482, "logps/rejected": -1.3034735918045044, "loss": 1.214, "rewards/accuracies": 0.5, "rewards/chosen": -1.214002013206482, "rewards/margins": 0.08947134017944336, "rewards/rejected": -1.3034735918045044, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 7.33641702429075, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.048849206417798996, "logits/rejected": 0.02845265530049801, "logps/chosen": -1.1938470602035522, "logps/rejected": -1.4144498109817505, "loss": 1.1938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1938470602035522, "rewards/margins": 0.2206028401851654, "rewards/rejected": -1.4144498109817505, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 6.685958641747072, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.16279780864715576, "logits/rejected": -0.1725049763917923, "logps/chosen": -1.222999930381775, "logps/rejected": -1.3382937908172607, "loss": 1.223, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.222999930381775, "rewards/margins": 0.11529377847909927, "rewards/rejected": -1.3382937908172607, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.846606501475467, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.16077622771263123, "logits/rejected": -0.021885309368371964, "logps/chosen": -1.1638164520263672, "logps/rejected": -1.302443265914917, "loss": 1.1638, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1638164520263672, "rewards/margins": 0.13862690329551697, "rewards/rejected": -1.302443265914917, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 5.532913780230108, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.20432424545288086, "logits/rejected": -0.11666438728570938, "logps/chosen": -1.1132609844207764, "logps/rejected": -1.3089935779571533, "loss": 1.1133, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1132609844207764, "rewards/margins": 0.19573244452476501, "rewards/rejected": -1.3089935779571533, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 5.862701400520488, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.16771462559700012, "logits/rejected": -0.11557847261428833, "logps/chosen": -1.3321723937988281, "logps/rejected": -1.4550265073776245, "loss": 1.3322, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3321723937988281, "rewards/margins": 0.12285423278808594, "rewards/rejected": -1.4550265073776245, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 5.157307972388469, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.20522825419902802, "logits/rejected": -0.10974754393100739, "logps/chosen": -1.170841932296753, "logps/rejected": -1.261368989944458, "loss": 1.1708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.170841932296753, "rewards/margins": 0.09052698314189911, "rewards/rejected": -1.261368989944458, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 6.075940450317287, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.17551535367965698, "logits/rejected": -0.025912348181009293, "logps/chosen": -1.1862270832061768, "logps/rejected": -1.3653333187103271, "loss": 1.1862, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1862270832061768, "rewards/margins": 0.17910635471343994, "rewards/rejected": -1.3653333187103271, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 6.821516928972686, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.20919263362884521, "logits/rejected": -0.11018610000610352, "logps/chosen": -1.2411514520645142, "logps/rejected": -1.4534623622894287, "loss": 1.2412, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2411514520645142, "rewards/margins": 0.21231091022491455, "rewards/rejected": -1.4534623622894287, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 7.2267329048156626, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.163868710398674, "logits/rejected": -0.026418786495923996, "logps/chosen": -1.189070701599121, "logps/rejected": -1.3908929824829102, "loss": 1.1891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.189070701599121, "rewards/margins": 0.2018224447965622, "rewards/rejected": -1.3908929824829102, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 7.042794370703734, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.15852251648902893, "logits/rejected": -0.04955214634537697, "logps/chosen": -1.2072851657867432, "logps/rejected": -1.3626426458358765, "loss": 1.2073, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2072851657867432, "rewards/margins": 0.15535743534564972, "rewards/rejected": -1.3626426458358765, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 6.707229730797827, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.11855274438858032, "logits/rejected": -0.08584754168987274, "logps/chosen": -1.1920125484466553, "logps/rejected": -1.3509137630462646, "loss": 1.192, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1920125484466553, "rewards/margins": 0.158901184797287, "rewards/rejected": -1.3509137630462646, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 5.485436245441151, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.072904571890831, "logits/rejected": -0.06373757869005203, "logps/chosen": -1.1484746932983398, "logps/rejected": -1.373477816581726, "loss": 1.1485, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1484746932983398, "rewards/margins": 0.2250029593706131, "rewards/rejected": -1.373477816581726, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 5.133784217675373, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.16867655515670776, "logits/rejected": -0.08775879442691803, "logps/chosen": -1.2841382026672363, "logps/rejected": -1.291448712348938, "loss": 1.2841, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2841382026672363, "rewards/margins": 0.007310442626476288, "rewards/rejected": -1.291448712348938, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 6.139681496388769, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.10172417014837265, "logits/rejected": -0.12270478904247284, "logps/chosen": -1.234037160873413, "logps/rejected": -1.316004753112793, "loss": 1.234, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.234037160873413, "rewards/margins": 0.08196769654750824, "rewards/rejected": -1.316004753112793, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 8.06322836640158, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.18928216397762299, "logits/rejected": -0.11542143672704697, "logps/chosen": -1.264397382736206, "logps/rejected": -1.3504045009613037, "loss": 1.2644, "rewards/accuracies": 0.5, "rewards/chosen": -1.264397382736206, "rewards/margins": 0.08600713312625885, "rewards/rejected": -1.3504045009613037, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 5.971794480385296, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.24165204167366028, "logits/rejected": -0.11858092248439789, "logps/chosen": -1.2038180828094482, "logps/rejected": -1.3582863807678223, "loss": 1.2038, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2038180828094482, "rewards/margins": 0.15446817874908447, "rewards/rejected": -1.3582863807678223, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 6.157250673434528, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.14158464968204498, "logits/rejected": -0.06734490394592285, "logps/chosen": -1.1862826347351074, "logps/rejected": -1.378783106803894, "loss": 1.1863, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1862826347351074, "rewards/margins": 0.19250056147575378, "rewards/rejected": -1.378783106803894, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 6.695541682886889, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.1869097799062729, "logits/rejected": -0.07291697710752487, "logps/chosen": -1.1259138584136963, "logps/rejected": -1.2565505504608154, "loss": 1.1259, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1259138584136963, "rewards/margins": 0.13063670694828033, "rewards/rejected": -1.2565505504608154, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 5.497313435254243, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.1310300976037979, "logits/rejected": 0.021949628368020058, "logps/chosen": -1.2602825164794922, "logps/rejected": -1.345012903213501, "loss": 1.2603, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2602825164794922, "rewards/margins": 0.08473043888807297, "rewards/rejected": -1.345012903213501, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 4.083742042294166, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.19800719618797302, "logits/rejected": -0.07768188416957855, "logps/chosen": -1.1212493181228638, "logps/rejected": -1.2288910150527954, "loss": 1.1212, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1212493181228638, "rewards/margins": 0.10764173418283463, "rewards/rejected": -1.2288910150527954, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 5.862877842593878, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.18976297974586487, "logits/rejected": -0.12165684998035431, "logps/chosen": -1.1999415159225464, "logps/rejected": -1.2850992679595947, "loss": 1.1999, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1999415159225464, "rewards/margins": 0.08515767753124237, "rewards/rejected": -1.2850992679595947, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 5.513505925338819, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.13513775169849396, "logits/rejected": -0.04171319305896759, "logps/chosen": -1.205573320388794, "logps/rejected": -1.279417634010315, "loss": 1.2056, "rewards/accuracies": 0.46875, "rewards/chosen": -1.205573320388794, "rewards/margins": 0.07384411245584488, "rewards/rejected": -1.279417634010315, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 5.906486208603665, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.1726270318031311, "logits/rejected": -0.06444640457630157, "logps/chosen": -1.3323463201522827, "logps/rejected": -1.3852739334106445, "loss": 1.3323, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3323463201522827, "rewards/margins": 0.052927445620298386, "rewards/rejected": -1.3852739334106445, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 5.557988407283486, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.1374932825565338, "logits/rejected": 0.06605522334575653, "logps/chosen": -1.2610841989517212, "logps/rejected": -1.3871711492538452, "loss": 1.2611, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2610841989517212, "rewards/margins": 0.12608718872070312, "rewards/rejected": -1.3871711492538452, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 6.157130238083805, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.1305674910545349, "logits/rejected": -0.005376957822591066, "logps/chosen": -1.22694730758667, "logps/rejected": -1.3395136594772339, "loss": 1.2269, "rewards/accuracies": 0.59375, "rewards/chosen": -1.22694730758667, "rewards/margins": 0.11256654560565948, "rewards/rejected": -1.3395136594772339, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 6.018672538056714, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.0636836364865303, "logits/rejected": -0.061504483222961426, "logps/chosen": -1.2489614486694336, "logps/rejected": -1.4065755605697632, "loss": 1.249, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2489614486694336, "rewards/margins": 0.15761418640613556, "rewards/rejected": -1.4065755605697632, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 8.033945868693223, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.03180120512843132, "logits/rejected": 0.01661248877644539, "logps/chosen": -1.2607100009918213, "logps/rejected": -1.4537031650543213, "loss": 1.2607, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2607100009918213, "rewards/margins": 0.19299302995204926, "rewards/rejected": -1.4537031650543213, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 5.616544069876255, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.1952660083770752, "logits/rejected": -0.1158178448677063, "logps/chosen": -1.212640404701233, "logps/rejected": -1.3577678203582764, "loss": 1.2126, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.212640404701233, "rewards/margins": 0.145127534866333, "rewards/rejected": -1.3577678203582764, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 4.405698192582436, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.07263140380382538, "logits/rejected": -0.03916376084089279, "logps/chosen": -1.2003884315490723, "logps/rejected": -1.3935892581939697, "loss": 1.2004, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2003884315490723, "rewards/margins": 0.19320061802864075, "rewards/rejected": -1.3935892581939697, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 4.758035039374324, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.23078343272209167, "logits/rejected": -0.09720359742641449, "logps/chosen": -1.232654333114624, "logps/rejected": -1.338641881942749, "loss": 1.2327, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.232654333114624, "rewards/margins": 0.10598746687173843, "rewards/rejected": -1.338641881942749, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 7.090537831505227, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.13938835263252258, "logits/rejected": -0.018946953117847443, "logps/chosen": -1.2228682041168213, "logps/rejected": -1.3263497352600098, "loss": 1.2229, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2228682041168213, "rewards/margins": 0.1034814715385437, "rewards/rejected": -1.3263497352600098, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 5.860121716928358, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.16663803160190582, "logits/rejected": -0.07169262319803238, "logps/chosen": -1.2706328630447388, "logps/rejected": -1.331061601638794, "loss": 1.2706, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2706328630447388, "rewards/margins": 0.06042858958244324, "rewards/rejected": -1.331061601638794, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 6.058008735213792, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.08751989901065826, "logits/rejected": 0.01646042801439762, "logps/chosen": -1.146209716796875, "logps/rejected": -1.3127912282943726, "loss": 1.1462, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.146209716796875, "rewards/margins": 0.16658176481723785, "rewards/rejected": -1.3127912282943726, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 5.164842370788782, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.09641105681657791, "logits/rejected": -0.010035079903900623, "logps/chosen": -1.1928147077560425, "logps/rejected": -1.4774106740951538, "loss": 1.1928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1928147077560425, "rewards/margins": 0.28459596633911133, "rewards/rejected": -1.4774106740951538, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 4.352884462228753, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.08036161959171295, "logits/rejected": 0.0021308318246155977, "logps/chosen": -1.2220855951309204, "logps/rejected": -1.3417062759399414, "loss": 1.2221, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2220855951309204, "rewards/margins": 0.11962060630321503, "rewards/rejected": -1.3417062759399414, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 6.015266334203248, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.08045981824398041, "logits/rejected": -0.002406784100458026, "logps/chosen": -1.1720759868621826, "logps/rejected": -1.3281047344207764, "loss": 1.1721, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1720759868621826, "rewards/margins": 0.156028613448143, "rewards/rejected": -1.3281047344207764, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 6.9848559134913755, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.22132258117198944, "logits/rejected": -0.10166176408529282, "logps/chosen": -1.1386476755142212, "logps/rejected": -1.1912927627563477, "loss": 1.1386, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1386476755142212, "rewards/margins": 0.052645307034254074, "rewards/rejected": -1.1912927627563477, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 5.74223075953877, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.21080493927001953, "logits/rejected": -0.039296723902225494, "logps/chosen": -1.2530438899993896, "logps/rejected": -1.3619322776794434, "loss": 1.253, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2530438899993896, "rewards/margins": 0.10888823121786118, "rewards/rejected": -1.3619322776794434, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 4.981549525115716, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.10131505876779556, "logits/rejected": -0.023238830268383026, "logps/chosen": -1.2234230041503906, "logps/rejected": -1.3181798458099365, "loss": 1.2234, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2234230041503906, "rewards/margins": 0.09475687146186829, "rewards/rejected": -1.3181798458099365, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 5.989309431681977, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.12014023214578629, "logits/rejected": -0.07773660123348236, "logps/chosen": -1.2022826671600342, "logps/rejected": -1.345832347869873, "loss": 1.2023, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2022826671600342, "rewards/margins": 0.1435496062040329, "rewards/rejected": -1.345832347869873, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 4.667722065279638, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.2519562840461731, "logits/rejected": -0.16533006727695465, "logps/chosen": -1.1316670179367065, "logps/rejected": -1.3023967742919922, "loss": 1.1317, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1316670179367065, "rewards/margins": 0.17072978615760803, "rewards/rejected": -1.3023967742919922, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 7.352796600822539, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.12674446403980255, "logits/rejected": -0.048222292214632034, "logps/chosen": -1.1894147396087646, "logps/rejected": -1.3358221054077148, "loss": 1.1894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1894147396087646, "rewards/margins": 0.14640747010707855, "rewards/rejected": -1.3358221054077148, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.058554064482450485, "eval_logits/rejected": 0.12115480750799179, "eval_logps/chosen": -1.2686524391174316, "eval_logps/rejected": -1.3924317359924316, "eval_loss": 1.2689176797866821, "eval_rewards/accuracies": 0.5459940433502197, "eval_rewards/chosen": -1.2686524391174316, "eval_rewards/margins": 0.12377943098545074, "eval_rewards/rejected": -1.3924317359924316, "eval_runtime": 40.6055, "eval_samples_per_second": 33.124, "eval_steps_per_second": 8.299, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 5.756584148118409, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.14685367047786713, "logits/rejected": -0.025353770703077316, "logps/chosen": -1.2213528156280518, "logps/rejected": -1.3637504577636719, "loss": 1.2214, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2213528156280518, "rewards/margins": 0.14239779114723206, "rewards/rejected": -1.3637504577636719, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 5.026092925210539, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.2729204595088959, "logits/rejected": -0.12199461460113525, "logps/chosen": -1.193528175354004, "logps/rejected": -1.2582372426986694, "loss": 1.1935, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.193528175354004, "rewards/margins": 0.06470900774002075, "rewards/rejected": -1.2582372426986694, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 4.894382542792849, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.09305700659751892, "logits/rejected": -0.05088656023144722, "logps/chosen": -1.1329728364944458, "logps/rejected": -1.3588135242462158, "loss": 1.133, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1329728364944458, "rewards/margins": 0.22584085166454315, "rewards/rejected": -1.3588135242462158, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 5.773258242509515, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.23501893877983093, "logits/rejected": -0.13734331727027893, "logps/chosen": -1.2450530529022217, "logps/rejected": -1.4533851146697998, "loss": 1.2451, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2450530529022217, "rewards/margins": 0.20833218097686768, "rewards/rejected": -1.4533851146697998, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 4.917156345654786, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.09801820665597916, "logits/rejected": -0.037927351891994476, "logps/chosen": -1.1822845935821533, "logps/rejected": -1.339621663093567, "loss": 1.1823, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1822845935821533, "rewards/margins": 0.15733689069747925, "rewards/rejected": -1.339621663093567, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 7.006274223550623, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.10454027354717255, "logits/rejected": -0.08601929247379303, "logps/chosen": -1.3191375732421875, "logps/rejected": -1.4358090162277222, "loss": 1.3191, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3191375732421875, "rewards/margins": 0.11667150259017944, "rewards/rejected": -1.4358090162277222, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 4.1076584987422615, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.06775353848934174, "logits/rejected": 0.025494346395134926, "logps/chosen": -1.1895873546600342, "logps/rejected": -1.3102784156799316, "loss": 1.1896, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1895873546600342, "rewards/margins": 0.1206909641623497, "rewards/rejected": -1.3102784156799316, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 7.079979764307052, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.20635966956615448, "logits/rejected": -0.11962350457906723, "logps/chosen": -1.1720603704452515, "logps/rejected": -1.3039454221725464, "loss": 1.1721, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1720603704452515, "rewards/margins": 0.13188493251800537, "rewards/rejected": -1.3039454221725464, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 7.354598891807868, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.16484594345092773, "logits/rejected": -0.051461853086948395, "logps/chosen": -1.2133208513259888, "logps/rejected": -1.3139936923980713, "loss": 1.2133, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2133208513259888, "rewards/margins": 0.10067275911569595, "rewards/rejected": -1.3139936923980713, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 5.026820416856582, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.17562806606292725, "logits/rejected": -0.06000471115112305, "logps/chosen": -1.175545573234558, "logps/rejected": -1.3789474964141846, "loss": 1.1755, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.175545573234558, "rewards/margins": 0.2034018486738205, "rewards/rejected": -1.3789474964141846, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 4.339605884454231, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.14863739907741547, "logits/rejected": 0.020506704226136208, "logps/chosen": -1.2667148113250732, "logps/rejected": -1.314391851425171, "loss": 1.2667, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2667148113250732, "rewards/margins": 0.04767703264951706, "rewards/rejected": -1.314391851425171, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 7.906371517016296, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.13347241282463074, "logits/rejected": -0.06190046668052673, "logps/chosen": -1.2485685348510742, "logps/rejected": -1.298797845840454, "loss": 1.2486, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2485685348510742, "rewards/margins": 0.05022914335131645, "rewards/rejected": -1.298797845840454, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 5.311298087652864, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.11935408413410187, "logits/rejected": -0.07656504213809967, "logps/chosen": -1.2215687036514282, "logps/rejected": -1.3551779985427856, "loss": 1.2216, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2215687036514282, "rewards/margins": 0.13360922038555145, "rewards/rejected": -1.3551779985427856, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 6.785630020467464, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.2443225383758545, "logits/rejected": -0.14668330550193787, "logps/chosen": -1.2012853622436523, "logps/rejected": -1.234836459159851, "loss": 1.2013, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2012853622436523, "rewards/margins": 0.03355119749903679, "rewards/rejected": -1.234836459159851, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 4.626457305304969, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.18069566786289215, "logits/rejected": -0.13281312584877014, "logps/chosen": -1.2213214635849, "logps/rejected": -1.3903062343597412, "loss": 1.2213, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2213214635849, "rewards/margins": 0.16898460686206818, "rewards/rejected": -1.3903062343597412, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 4.741379458396845, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.2691146731376648, "logits/rejected": -0.14679577946662903, "logps/chosen": -1.2331621646881104, "logps/rejected": -1.3896121978759766, "loss": 1.2332, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2331621646881104, "rewards/margins": 0.1564498245716095, "rewards/rejected": -1.3896121978759766, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 6.754751634599678, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.1335296332836151, "logits/rejected": 0.014525236561894417, "logps/chosen": -1.2567392587661743, "logps/rejected": -1.3742547035217285, "loss": 1.2567, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2567392587661743, "rewards/margins": 0.11751551926136017, "rewards/rejected": -1.3742547035217285, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 5.469638119523615, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.20307962596416473, "logits/rejected": -0.17068053781986237, "logps/chosen": -1.238528847694397, "logps/rejected": -1.4418916702270508, "loss": 1.2385, "rewards/accuracies": 0.625, "rewards/chosen": -1.238528847694397, "rewards/margins": 0.20336279273033142, "rewards/rejected": -1.4418916702270508, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 5.2078858549704625, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.11917122453451157, "logits/rejected": -0.1278408318758011, "logps/chosen": -1.2217190265655518, "logps/rejected": -1.3455348014831543, "loss": 1.2217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2217190265655518, "rewards/margins": 0.12381571531295776, "rewards/rejected": -1.3455348014831543, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 6.423730036327471, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.11328774690628052, "logits/rejected": 0.005904257297515869, "logps/chosen": -1.1672289371490479, "logps/rejected": -1.308455228805542, "loss": 1.1672, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1672289371490479, "rewards/margins": 0.14122620224952698, "rewards/rejected": -1.308455228805542, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.239205052611046, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.1269468516111374, "logits/rejected": 0.0018040344584733248, "logps/chosen": -1.1713682413101196, "logps/rejected": -1.291046142578125, "loss": 1.1714, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1713682413101196, "rewards/margins": 0.11967790126800537, "rewards/rejected": -1.291046142578125, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 6.386059660195411, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.19629287719726562, "logits/rejected": -0.1059475988149643, "logps/chosen": -1.1748524904251099, "logps/rejected": -1.3185479640960693, "loss": 1.1749, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1748524904251099, "rewards/margins": 0.1436956375837326, "rewards/rejected": -1.3185479640960693, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 5.26402311698016, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.12727400660514832, "logits/rejected": -0.05396232008934021, "logps/chosen": -1.2496747970581055, "logps/rejected": -1.2964622974395752, "loss": 1.2497, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2496747970581055, "rewards/margins": 0.046787556260824203, "rewards/rejected": -1.2964622974395752, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 5.058353028030334, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.22062739729881287, "logits/rejected": -0.16730932891368866, "logps/chosen": -1.2206525802612305, "logps/rejected": -1.3314005136489868, "loss": 1.2207, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2206525802612305, "rewards/margins": 0.11074776947498322, "rewards/rejected": -1.3314005136489868, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 7.203848694323688, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.17842712998390198, "logits/rejected": -0.1318783164024353, "logps/chosen": -1.2502659559249878, "logps/rejected": -1.3312464952468872, "loss": 1.2503, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2502659559249878, "rewards/margins": 0.08098044991493225, "rewards/rejected": -1.3312464952468872, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 5.272613143138339, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.1952860951423645, "logits/rejected": -0.16981364786624908, "logps/chosen": -1.1920936107635498, "logps/rejected": -1.3265459537506104, "loss": 1.1921, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1920936107635498, "rewards/margins": 0.13445214927196503, "rewards/rejected": -1.3265459537506104, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 4.758692858915953, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.2370375096797943, "logits/rejected": -0.14196130633354187, "logps/chosen": -1.2176101207733154, "logps/rejected": -1.3650233745574951, "loss": 1.2176, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2176101207733154, "rewards/margins": 0.14741307497024536, "rewards/rejected": -1.3650233745574951, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 6.14286994877141, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.16377051174640656, "logits/rejected": -0.055842675268650055, "logps/chosen": -1.3004534244537354, "logps/rejected": -1.355592131614685, "loss": 1.3005, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3004534244537354, "rewards/margins": 0.05513881519436836, "rewards/rejected": -1.355592131614685, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 5.193493803676441, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.1452886462211609, "logits/rejected": -0.041010379791259766, "logps/chosen": -1.217101812362671, "logps/rejected": -1.365670084953308, "loss": 1.2171, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.217101812362671, "rewards/margins": 0.14856821298599243, "rewards/rejected": -1.365670084953308, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 5.227448769492306, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.16134123504161835, "logits/rejected": -0.049857042729854584, "logps/chosen": -1.1966516971588135, "logps/rejected": -1.2474998235702515, "loss": 1.1967, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1966516971588135, "rewards/margins": 0.05084793642163277, "rewards/rejected": -1.2474998235702515, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 5.613832442223492, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.12802474200725555, "logits/rejected": -0.07941067218780518, "logps/chosen": -1.1453697681427002, "logps/rejected": -1.3869714736938477, "loss": 1.1454, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1453697681427002, "rewards/margins": 0.24160175025463104, "rewards/rejected": -1.3869714736938477, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 6.22388734448733, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.11219368129968643, "logits/rejected": -0.023617519065737724, "logps/chosen": -1.1915019750595093, "logps/rejected": -1.3306970596313477, "loss": 1.1915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1915019750595093, "rewards/margins": 0.13919493556022644, "rewards/rejected": -1.3306970596313477, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 5.599786101487679, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.13050976395606995, "logits/rejected": -0.04704320430755615, "logps/chosen": -1.1789895296096802, "logps/rejected": -1.327288269996643, "loss": 1.179, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1789895296096802, "rewards/margins": 0.1482987254858017, "rewards/rejected": -1.327288269996643, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 4.682465309427694, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.06602467596530914, "logits/rejected": -0.026584768667817116, "logps/chosen": -1.1741793155670166, "logps/rejected": -1.3506289720535278, "loss": 1.1742, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1741793155670166, "rewards/margins": 0.17644961178302765, "rewards/rejected": -1.3506289720535278, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 5.643582290041965, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.21986059844493866, "logits/rejected": -0.11279948800802231, "logps/chosen": -1.1720339059829712, "logps/rejected": -1.3566397428512573, "loss": 1.172, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1720339059829712, "rewards/margins": 0.1846059411764145, "rewards/rejected": -1.3566397428512573, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 5.841951297563531, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.14384274184703827, "logits/rejected": -0.09286098182201385, "logps/chosen": -1.2822753190994263, "logps/rejected": -1.3806320428848267, "loss": 1.2823, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2822753190994263, "rewards/margins": 0.0983566865324974, "rewards/rejected": -1.3806320428848267, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 5.573879879493909, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.07458021491765976, "logits/rejected": 0.049686919897794724, "logps/chosen": -1.110569953918457, "logps/rejected": -1.3284456729888916, "loss": 1.1106, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.110569953918457, "rewards/margins": 0.2178758829832077, "rewards/rejected": -1.3284456729888916, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 6.868348381224617, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.14729298651218414, "logits/rejected": -0.03605302423238754, "logps/chosen": -1.1648091077804565, "logps/rejected": -1.3291151523590088, "loss": 1.1648, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1648091077804565, "rewards/margins": 0.16430610418319702, "rewards/rejected": -1.3291151523590088, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 3.994491390214091, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.1050354391336441, "logits/rejected": -0.08410971611738205, "logps/chosen": -1.1836488246917725, "logps/rejected": -1.342529058456421, "loss": 1.1836, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1836488246917725, "rewards/margins": 0.15888020396232605, "rewards/rejected": -1.342529058456421, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 6.795826778020931, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.04259900003671646, "logits/rejected": 0.05288093537092209, "logps/chosen": -1.210244059562683, "logps/rejected": -1.309981107711792, "loss": 1.2102, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.210244059562683, "rewards/margins": 0.09973704814910889, "rewards/rejected": -1.309981107711792, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 6.812230226578826, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.07310917973518372, "logits/rejected": -0.05591007322072983, "logps/chosen": -1.206110954284668, "logps/rejected": -1.3168456554412842, "loss": 1.2061, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.206110954284668, "rewards/margins": 0.11073482036590576, "rewards/rejected": -1.3168456554412842, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 5.883681895747266, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.04472678527235985, "logits/rejected": 0.014445364475250244, "logps/chosen": -1.248909592628479, "logps/rejected": -1.3164310455322266, "loss": 1.2489, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.248909592628479, "rewards/margins": 0.06752137839794159, "rewards/rejected": -1.3164310455322266, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 7.824456756650632, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.1301814764738083, "logits/rejected": -0.022432129830121994, "logps/chosen": -1.163451075553894, "logps/rejected": -1.380448818206787, "loss": 1.1635, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.163451075553894, "rewards/margins": 0.21699769794940948, "rewards/rejected": -1.380448818206787, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 6.107004885101311, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.144990473985672, "logits/rejected": -0.16245444118976593, "logps/chosen": -1.2933988571166992, "logps/rejected": -1.3192591667175293, "loss": 1.2934, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2933988571166992, "rewards/margins": 0.025860270485281944, "rewards/rejected": -1.3192591667175293, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 6.932801873792605, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.09054754674434662, "logits/rejected": 0.023020312190055847, "logps/chosen": -1.2118852138519287, "logps/rejected": -1.3441507816314697, "loss": 1.2119, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2118852138519287, "rewards/margins": 0.13226546347141266, "rewards/rejected": -1.3441507816314697, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 6.443386684682351, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.23647303879261017, "logits/rejected": -0.07447756826877594, "logps/chosen": -1.2787659168243408, "logps/rejected": -1.2976607084274292, "loss": 1.2788, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2787659168243408, "rewards/margins": 0.018894772976636887, "rewards/rejected": -1.2976607084274292, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 7.17860477094521, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.13635018467903137, "logits/rejected": -0.08494489639997482, "logps/chosen": -1.199947714805603, "logps/rejected": -1.3176281452178955, "loss": 1.1999, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.199947714805603, "rewards/margins": 0.1176803857088089, "rewards/rejected": -1.3176281452178955, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 4.620354828276002, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.09817350655794144, "logits/rejected": -0.05924362689256668, "logps/chosen": -1.2546790838241577, "logps/rejected": -1.3791718482971191, "loss": 1.2547, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2546790838241577, "rewards/margins": 0.12449277937412262, "rewards/rejected": -1.3791718482971191, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 5.526347820796103, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.14334005117416382, "logits/rejected": -0.0887620821595192, "logps/chosen": -1.2077385187149048, "logps/rejected": -1.410943627357483, "loss": 1.2077, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2077385187149048, "rewards/margins": 0.20320507884025574, "rewards/rejected": -1.410943627357483, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 4.780706094885445, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.19839034974575043, "logits/rejected": -0.08489783108234406, "logps/chosen": -1.2020041942596436, "logps/rejected": -1.4801499843597412, "loss": 1.202, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2020041942596436, "rewards/margins": 0.2781456410884857, "rewards/rejected": -1.4801499843597412, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 6.48364368122389, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.15076784789562225, "logits/rejected": -0.03991442173719406, "logps/chosen": -1.1006791591644287, "logps/rejected": -1.2479472160339355, "loss": 1.1007, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1006791591644287, "rewards/margins": 0.1472681164741516, "rewards/rejected": -1.2479472160339355, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 6.2094127246419735, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.06837820261716843, "logits/rejected": -0.054329387843608856, "logps/chosen": -1.0782195329666138, "logps/rejected": -1.2633756399154663, "loss": 1.0782, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0782195329666138, "rewards/margins": 0.18515625596046448, "rewards/rejected": -1.2633756399154663, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 5.534198442171728, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.1987202763557434, "logits/rejected": -0.12174384295940399, "logps/chosen": -1.2363477945327759, "logps/rejected": -1.3699829578399658, "loss": 1.2363, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2363477945327759, "rewards/margins": 0.13363517820835114, "rewards/rejected": -1.3699829578399658, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 6.638380706518804, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.12321349233388901, "logits/rejected": -0.07261453568935394, "logps/chosen": -1.336830735206604, "logps/rejected": -1.4034461975097656, "loss": 1.3368, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.336830735206604, "rewards/margins": 0.06661558896303177, "rewards/rejected": -1.4034461975097656, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 6.678158751287154, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.14408624172210693, "logits/rejected": -0.02955031953752041, "logps/chosen": -1.2429993152618408, "logps/rejected": -1.331028938293457, "loss": 1.243, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2429993152618408, "rewards/margins": 0.08802968263626099, "rewards/rejected": -1.331028938293457, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 7.052112594941047, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.18170282244682312, "logits/rejected": -0.05639047548174858, "logps/chosen": -1.2288182973861694, "logps/rejected": -1.3074945211410522, "loss": 1.2288, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2288182973861694, "rewards/margins": 0.07867632806301117, "rewards/rejected": -1.3074945211410522, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 6.470867853272974, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.17187932133674622, "logits/rejected": -0.02031542919576168, "logps/chosen": -1.2373626232147217, "logps/rejected": -1.3011116981506348, "loss": 1.2374, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2373626232147217, "rewards/margins": 0.06374889612197876, "rewards/rejected": -1.3011116981506348, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 8.703600033276134, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.26832717657089233, "logits/rejected": -0.09373263269662857, "logps/chosen": -1.214740514755249, "logps/rejected": -1.4110368490219116, "loss": 1.2147, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.214740514755249, "rewards/margins": 0.19629648327827454, "rewards/rejected": -1.4110368490219116, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 5.259613096364513, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.161390021443367, "logits/rejected": -0.1444428563117981, "logps/chosen": -1.2049241065979004, "logps/rejected": -1.3633605241775513, "loss": 1.2049, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2049241065979004, "rewards/margins": 0.15843632817268372, "rewards/rejected": -1.3633605241775513, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 5.857884558532689, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.17481596767902374, "logits/rejected": -0.026687592267990112, "logps/chosen": -1.2484960556030273, "logps/rejected": -1.3777812719345093, "loss": 1.2485, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2484960556030273, "rewards/margins": 0.12928541004657745, "rewards/rejected": -1.3777812719345093, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 4.846582781251134, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.19445741176605225, "logits/rejected": -0.041595134884119034, "logps/chosen": -1.1381123065948486, "logps/rejected": -1.3485126495361328, "loss": 1.1381, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1381123065948486, "rewards/margins": 0.21040037274360657, "rewards/rejected": -1.3485126495361328, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 6.929032201185672, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.10069747269153595, "logits/rejected": -0.08221422880887985, "logps/chosen": -1.221793293952942, "logps/rejected": -1.3817564249038696, "loss": 1.2218, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.221793293952942, "rewards/margins": 0.15996313095092773, "rewards/rejected": -1.3817564249038696, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 5.52601370795345, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.22993668913841248, "logits/rejected": -0.16399869322776794, "logps/chosen": -1.1534656286239624, "logps/rejected": -1.2989625930786133, "loss": 1.1535, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1534656286239624, "rewards/margins": 0.1454968899488449, "rewards/rejected": -1.2989625930786133, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 5.974982576622821, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.18779391050338745, "logits/rejected": -0.09995482861995697, "logps/chosen": -1.1864216327667236, "logps/rejected": -1.2550859451293945, "loss": 1.1864, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1864216327667236, "rewards/margins": 0.06866439431905746, "rewards/rejected": -1.2550859451293945, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 7.212861993638156, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.15530435740947723, "logits/rejected": -0.07445988804101944, "logps/chosen": -1.1945669651031494, "logps/rejected": -1.352791666984558, "loss": 1.1946, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1945669651031494, "rewards/margins": 0.15822476148605347, "rewards/rejected": -1.352791666984558, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 5.953249224380666, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.20737583935260773, "logits/rejected": -0.0935809463262558, "logps/chosen": -1.2033823728561401, "logps/rejected": -1.3652089834213257, "loss": 1.2034, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2033823728561401, "rewards/margins": 0.16182665526866913, "rewards/rejected": -1.3652089834213257, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 5.422878182190185, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.15864305198192596, "logits/rejected": -0.0090013537555933, "logps/chosen": -1.1173312664031982, "logps/rejected": -1.3917604684829712, "loss": 1.1173, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1173312664031982, "rewards/margins": 0.2744291424751282, "rewards/rejected": -1.3917604684829712, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 7.1848323456782, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.18428537249565125, "logits/rejected": -0.09304045140743256, "logps/chosen": -1.1353394985198975, "logps/rejected": -1.3285410404205322, "loss": 1.1353, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1353394985198975, "rewards/margins": 0.19320157170295715, "rewards/rejected": -1.3285410404205322, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 5.944422888670585, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.16221825778484344, "logits/rejected": -0.09073010832071304, "logps/chosen": -1.1713159084320068, "logps/rejected": -1.2528680562973022, "loss": 1.1713, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1713159084320068, "rewards/margins": 0.08155205100774765, "rewards/rejected": -1.2528680562973022, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 6.4093922309703215, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.18188731372356415, "logits/rejected": -0.11332595348358154, "logps/chosen": -1.2454031705856323, "logps/rejected": -1.4344801902770996, "loss": 1.2454, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2454031705856323, "rewards/margins": 0.18907706439495087, "rewards/rejected": -1.4344801902770996, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 6.451819965156351, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.09715066850185394, "logits/rejected": 0.019139278680086136, "logps/chosen": -1.1873729228973389, "logps/rejected": -1.280526876449585, "loss": 1.1874, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1873729228973389, "rewards/margins": 0.09315387159585953, "rewards/rejected": -1.280526876449585, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 6.723615453819073, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.21195130050182343, "logits/rejected": -0.07306645065546036, "logps/chosen": -1.236307144165039, "logps/rejected": -1.424189805984497, "loss": 1.2363, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.236307144165039, "rewards/margins": 0.18788263201713562, "rewards/rejected": -1.424189805984497, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 6.37471771811797, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.1285286843776703, "logits/rejected": -0.02625107206404209, "logps/chosen": -1.1433587074279785, "logps/rejected": -1.3016374111175537, "loss": 1.1434, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1433587074279785, "rewards/margins": 0.15827877819538116, "rewards/rejected": -1.3016374111175537, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 5.089152269904686, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.16768258810043335, "logits/rejected": -0.1134176030755043, "logps/chosen": -1.1885204315185547, "logps/rejected": -1.2140470743179321, "loss": 1.1885, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1885204315185547, "rewards/margins": 0.025526558980345726, "rewards/rejected": -1.2140470743179321, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 4.464448240066363, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.14922046661376953, "logits/rejected": -0.1408439576625824, "logps/chosen": -1.1378624439239502, "logps/rejected": -1.2946563959121704, "loss": 1.1379, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1378624439239502, "rewards/margins": 0.15679410099983215, "rewards/rejected": -1.2946563959121704, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 4.842885293635465, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.153904989361763, "logits/rejected": -0.0755535140633583, "logps/chosen": -1.3155802488327026, "logps/rejected": -1.4153671264648438, "loss": 1.3156, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3155802488327026, "rewards/margins": 0.09978682547807693, "rewards/rejected": -1.4153671264648438, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 6.667975453060582, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.19937977194786072, "logits/rejected": -0.11063963174819946, "logps/chosen": -1.2699676752090454, "logps/rejected": -1.3248577117919922, "loss": 1.27, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2699676752090454, "rewards/margins": 0.05488995462656021, "rewards/rejected": -1.3248577117919922, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 8.631055962250791, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.13237924873828888, "logits/rejected": -0.1464015245437622, "logps/chosen": -1.1503732204437256, "logps/rejected": -1.3199584484100342, "loss": 1.1504, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1503732204437256, "rewards/margins": 0.1695854663848877, "rewards/rejected": -1.3199584484100342, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 5.81030717196785, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.012448148801922798, "logits/rejected": 0.0006254732725210488, "logps/chosen": -1.127169132232666, "logps/rejected": -1.2137677669525146, "loss": 1.1272, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.127169132232666, "rewards/margins": 0.08659853041172028, "rewards/rejected": -1.2137677669525146, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 6.190639319964759, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.2165904939174652, "logits/rejected": -0.14909926056861877, "logps/chosen": -1.2779306173324585, "logps/rejected": -1.4667243957519531, "loss": 1.2779, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2779306173324585, "rewards/margins": 0.18879374861717224, "rewards/rejected": -1.4667243957519531, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.05726103484630585, "eval_logits/rejected": 0.11986809968948364, "eval_logps/chosen": -1.2658957242965698, "eval_logps/rejected": -1.3879890441894531, "eval_loss": 1.2661501169204712, "eval_rewards/accuracies": 0.5452522039413452, "eval_rewards/chosen": -1.2658957242965698, "eval_rewards/margins": 0.12209320068359375, "eval_rewards/rejected": -1.3879890441894531, "eval_runtime": 40.4517, "eval_samples_per_second": 33.25, "eval_steps_per_second": 8.331, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 4.479659321649989, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.261008083820343, "logits/rejected": -0.14623317122459412, "logps/chosen": -1.1214784383773804, "logps/rejected": -1.3182529211044312, "loss": 1.1215, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1214784383773804, "rewards/margins": 0.19677463173866272, "rewards/rejected": -1.3182529211044312, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 7.1352009746743175, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.15393862128257751, "logits/rejected": -0.1452220231294632, "logps/chosen": -1.2495368719100952, "logps/rejected": -1.401196837425232, "loss": 1.2495, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2495368719100952, "rewards/margins": 0.15165987610816956, "rewards/rejected": -1.401196837425232, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 6.456163244344596, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.09792011231184006, "logits/rejected": 0.0020709186792373657, "logps/chosen": -1.240867257118225, "logps/rejected": -1.3552534580230713, "loss": 1.2409, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.240867257118225, "rewards/margins": 0.11438626050949097, "rewards/rejected": -1.3552534580230713, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 4.727896506987427, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.11546013504266739, "logits/rejected": 0.002158570336177945, "logps/chosen": -1.2198235988616943, "logps/rejected": -1.2009966373443604, "loss": 1.2198, "rewards/accuracies": 0.5, "rewards/chosen": -1.2198235988616943, "rewards/margins": -0.018826976418495178, "rewards/rejected": -1.2009966373443604, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 6.014619411701661, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.17083027958869934, "logits/rejected": -0.049108508974313736, "logps/chosen": -1.275117039680481, "logps/rejected": -1.4545819759368896, "loss": 1.2751, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.275117039680481, "rewards/margins": 0.17946502566337585, "rewards/rejected": -1.4545819759368896, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 5.073861379691354, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.1516026258468628, "logits/rejected": -0.03846261650323868, "logps/chosen": -1.266571283340454, "logps/rejected": -1.2489300966262817, "loss": 1.2666, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.266571283340454, "rewards/margins": -0.017641117796301842, "rewards/rejected": -1.2489300966262817, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 7.7705412791442185, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.1688036471605301, "logits/rejected": -0.03711722418665886, "logps/chosen": -1.1882447004318237, "logps/rejected": -1.271554708480835, "loss": 1.1882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1882447004318237, "rewards/margins": 0.08330999314785004, "rewards/rejected": -1.271554708480835, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 6.662981010043394, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.13184712827205658, "logits/rejected": -0.04486880451440811, "logps/chosen": -1.2201470136642456, "logps/rejected": -1.3267196416854858, "loss": 1.2201, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2201470136642456, "rewards/margins": 0.106572724878788, "rewards/rejected": -1.3267196416854858, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 4.585818747175368, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.13083884119987488, "logits/rejected": -0.07527592778205872, "logps/chosen": -1.1595942974090576, "logps/rejected": -1.3344767093658447, "loss": 1.1596, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1595942974090576, "rewards/margins": 0.17488253116607666, "rewards/rejected": -1.3344767093658447, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 6.349857199074927, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.1945854127407074, "logits/rejected": -0.1434975564479828, "logps/chosen": -1.2510110139846802, "logps/rejected": -1.3266648054122925, "loss": 1.251, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2510110139846802, "rewards/margins": 0.07565389573574066, "rewards/rejected": -1.3266648054122925, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 5.5792235392729275, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.2043582648038864, "logits/rejected": -0.08947475999593735, "logps/chosen": -1.2953853607177734, "logps/rejected": -1.4398466348648071, "loss": 1.2954, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2953853607177734, "rewards/margins": 0.1444612592458725, "rewards/rejected": -1.4398466348648071, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 7.1802021851585405, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.23055963218212128, "logits/rejected": -0.10597260296344757, "logps/chosen": -1.17118239402771, "logps/rejected": -1.2536753416061401, "loss": 1.1712, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.17118239402771, "rewards/margins": 0.08249302208423615, "rewards/rejected": -1.2536753416061401, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 4.981640373447976, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.2899226248264313, "logits/rejected": -0.19321629405021667, "logps/chosen": -1.1763699054718018, "logps/rejected": -1.3188179731369019, "loss": 1.1764, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1763699054718018, "rewards/margins": 0.14244814217090607, "rewards/rejected": -1.3188179731369019, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 5.091798835694989, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.19666478037834167, "logits/rejected": -0.11957750469446182, "logps/chosen": -1.2415506839752197, "logps/rejected": -1.2366654872894287, "loss": 1.2416, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.2415506839752197, "rewards/margins": -0.004885273985564709, "rewards/rejected": -1.2366654872894287, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 5.95986909335114, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.16451522707939148, "logits/rejected": -0.11168237775564194, "logps/chosen": -1.2363080978393555, "logps/rejected": -1.3107874393463135, "loss": 1.2363, "rewards/accuracies": 0.5, "rewards/chosen": -1.2363080978393555, "rewards/margins": 0.07447931915521622, "rewards/rejected": -1.3107874393463135, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 6.171604012207393, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.10785210132598877, "logits/rejected": -0.02229759842157364, "logps/chosen": -1.1848126649856567, "logps/rejected": -1.3453139066696167, "loss": 1.1848, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1848126649856567, "rewards/margins": 0.1605014055967331, "rewards/rejected": -1.3453139066696167, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 5.595122929084545, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.13782432675361633, "logits/rejected": -0.043941013514995575, "logps/chosen": -1.2002809047698975, "logps/rejected": -1.3143161535263062, "loss": 1.2003, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2002809047698975, "rewards/margins": 0.11403530836105347, "rewards/rejected": -1.3143161535263062, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 4.796516904426326, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.2321656048297882, "logits/rejected": -0.08728455007076263, "logps/chosen": -1.2076293230056763, "logps/rejected": -1.3068143129348755, "loss": 1.2076, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2076293230056763, "rewards/margins": 0.09918492287397385, "rewards/rejected": -1.3068143129348755, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 4.936721474832882, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.1969253122806549, "logits/rejected": -0.09813370555639267, "logps/chosen": -1.1297506093978882, "logps/rejected": -1.2140153646469116, "loss": 1.1298, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1297506093978882, "rewards/margins": 0.08426471799612045, "rewards/rejected": -1.2140153646469116, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 4.819665994000895, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.1384320855140686, "logits/rejected": -0.06764237582683563, "logps/chosen": -1.2284023761749268, "logps/rejected": -1.313789963722229, "loss": 1.2284, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2284023761749268, "rewards/margins": 0.08538760244846344, "rewards/rejected": -1.313789963722229, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 5.099362809348147, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.0555495023727417, "logits/rejected": -0.03724722936749458, "logps/chosen": -1.1786777973175049, "logps/rejected": -1.3275716304779053, "loss": 1.1787, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1786777973175049, "rewards/margins": 0.14889390766620636, "rewards/rejected": -1.3275716304779053, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 5.956839082390559, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.12872107326984406, "logits/rejected": -0.08220528066158295, "logps/chosen": -1.1643571853637695, "logps/rejected": -1.2850828170776367, "loss": 1.1644, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1643571853637695, "rewards/margins": 0.12072565406560898, "rewards/rejected": -1.2850828170776367, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 7.335124218828, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.11989059299230576, "logits/rejected": 0.0031231895554810762, "logps/chosen": -1.2568973302841187, "logps/rejected": -1.3375461101531982, "loss": 1.2569, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2568973302841187, "rewards/margins": 0.08064886182546616, "rewards/rejected": -1.3375461101531982, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 4.5633781077221025, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.09641304612159729, "logits/rejected": -0.010061273351311684, "logps/chosen": -1.2269933223724365, "logps/rejected": -1.3735332489013672, "loss": 1.227, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2269933223724365, "rewards/margins": 0.14653989672660828, "rewards/rejected": -1.3735332489013672, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 5.8551479863198095, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.08407769352197647, "logits/rejected": -0.05253022909164429, "logps/chosen": -1.136770486831665, "logps/rejected": -1.3794844150543213, "loss": 1.1368, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.136770486831665, "rewards/margins": 0.2427138388156891, "rewards/rejected": -1.3794844150543213, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 6.890557379089573, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.23065908253192902, "logits/rejected": -0.12214188277721405, "logps/chosen": -1.1636499166488647, "logps/rejected": -1.356605052947998, "loss": 1.1636, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1636499166488647, "rewards/margins": 0.1929551362991333, "rewards/rejected": -1.356605052947998, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 6.080523755891488, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.18603457510471344, "logits/rejected": -0.07275068014860153, "logps/chosen": -1.2147294282913208, "logps/rejected": -1.3608736991882324, "loss": 1.2147, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2147294282913208, "rewards/margins": 0.1461443454027176, "rewards/rejected": -1.3608736991882324, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 5.458206135329564, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.16585800051689148, "logits/rejected": -0.03128070384263992, "logps/chosen": -1.2168605327606201, "logps/rejected": -1.3623857498168945, "loss": 1.2169, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2168605327606201, "rewards/margins": 0.14552509784698486, "rewards/rejected": -1.3623857498168945, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 5.731367929482277, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.21636685729026794, "logits/rejected": -0.10849936306476593, "logps/chosen": -1.1857397556304932, "logps/rejected": -1.267014980316162, "loss": 1.1857, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1857397556304932, "rewards/margins": 0.08127506077289581, "rewards/rejected": -1.267014980316162, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 5.960850036347019, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.08730033040046692, "logits/rejected": 0.10454677045345306, "logps/chosen": -1.1557281017303467, "logps/rejected": -1.3083546161651611, "loss": 1.1557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1557281017303467, "rewards/margins": 0.15262636542320251, "rewards/rejected": -1.3083546161651611, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 6.1078932710207114, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.0962892472743988, "logits/rejected": -0.022025322541594505, "logps/chosen": -1.1600878238677979, "logps/rejected": -1.344645619392395, "loss": 1.1601, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1600878238677979, "rewards/margins": 0.1845577359199524, "rewards/rejected": -1.344645619392395, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 6.43486469279706, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.1737672984600067, "logits/rejected": -0.08544833958148956, "logps/chosen": -1.1920510530471802, "logps/rejected": -1.3143234252929688, "loss": 1.1921, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1920510530471802, "rewards/margins": 0.12227238714694977, "rewards/rejected": -1.3143234252929688, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 7.689088204692449, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.1421339362859726, "logits/rejected": -0.08030688017606735, "logps/chosen": -1.1587399244308472, "logps/rejected": -1.2618356943130493, "loss": 1.1587, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1587399244308472, "rewards/margins": 0.10309580713510513, "rewards/rejected": -1.2618356943130493, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 6.114436932209362, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.16020333766937256, "logits/rejected": -0.03476119786500931, "logps/chosen": -1.2653779983520508, "logps/rejected": -1.3029614686965942, "loss": 1.2654, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2653779983520508, "rewards/margins": 0.03758341819047928, "rewards/rejected": -1.3029614686965942, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 7.306027203150628, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.0941961407661438, "logits/rejected": 0.007696406450122595, "logps/chosen": -1.222201943397522, "logps/rejected": -1.4244227409362793, "loss": 1.2222, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.222201943397522, "rewards/margins": 0.2022208273410797, "rewards/rejected": -1.4244227409362793, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 6.190211185354726, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.21916553378105164, "logits/rejected": -0.11115749180316925, "logps/chosen": -1.2591838836669922, "logps/rejected": -1.3842235803604126, "loss": 1.2592, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2591838836669922, "rewards/margins": 0.12503968179225922, "rewards/rejected": -1.3842235803604126, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 4.506846783942511, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.19603393971920013, "logits/rejected": -0.06851018965244293, "logps/chosen": -1.2823940515518188, "logps/rejected": -1.3131239414215088, "loss": 1.2824, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2823940515518188, "rewards/margins": 0.030729811638593674, "rewards/rejected": -1.3131239414215088, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 6.9046144948527175, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.12862034142017365, "logits/rejected": -0.0005768537521362305, "logps/chosen": -1.1648547649383545, "logps/rejected": -1.3478373289108276, "loss": 1.1649, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1648547649383545, "rewards/margins": 0.1829826831817627, "rewards/rejected": -1.3478373289108276, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 6.87847075161143, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.1611863374710083, "logits/rejected": -0.10314633697271347, "logps/chosen": -1.2622981071472168, "logps/rejected": -1.4206950664520264, "loss": 1.2623, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2622981071472168, "rewards/margins": 0.15839703381061554, "rewards/rejected": -1.4206950664520264, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 5.545616502962526, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.1712319552898407, "logits/rejected": -0.07471869885921478, "logps/chosen": -1.2532267570495605, "logps/rejected": -1.353662133216858, "loss": 1.2532, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2532267570495605, "rewards/margins": 0.1004352793097496, "rewards/rejected": -1.353662133216858, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 6.016747176613913, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.21355941891670227, "logits/rejected": -0.10049339383840561, "logps/chosen": -1.1431035995483398, "logps/rejected": -1.295676589012146, "loss": 1.1431, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1431035995483398, "rewards/margins": 0.15257291495800018, "rewards/rejected": -1.295676589012146, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 6.55220249245126, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.19102191925048828, "logits/rejected": -0.10036192089319229, "logps/chosen": -1.210695743560791, "logps/rejected": -1.307587742805481, "loss": 1.2107, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.210695743560791, "rewards/margins": 0.09689190238714218, "rewards/rejected": -1.307587742805481, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 8.138877806873422, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.059145741164684296, "logits/rejected": 0.0035891220904886723, "logps/chosen": -1.1858341693878174, "logps/rejected": -1.3740955591201782, "loss": 1.1858, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1858341693878174, "rewards/margins": 0.1882612407207489, "rewards/rejected": -1.3740955591201782, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 4.4375315914954365, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.19371357560157776, "logits/rejected": -0.06836151331663132, "logps/chosen": -1.196567177772522, "logps/rejected": -1.2559071779251099, "loss": 1.1966, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.196567177772522, "rewards/margins": 0.0593399703502655, "rewards/rejected": -1.2559071779251099, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 5.391675822293023, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.14210323989391327, "logits/rejected": -0.03532449156045914, "logps/chosen": -1.1689352989196777, "logps/rejected": -1.2794636487960815, "loss": 1.1689, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1689352989196777, "rewards/margins": 0.1105283722281456, "rewards/rejected": -1.2794636487960815, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 3.881088482451402, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.1379883587360382, "logits/rejected": -0.03161315247416496, "logps/chosen": -1.2549463510513306, "logps/rejected": -1.2928833961486816, "loss": 1.2549, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2549463510513306, "rewards/margins": 0.037937164306640625, "rewards/rejected": -1.2928833961486816, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 6.534711814779602, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.01311348658055067, "logits/rejected": 0.06445236504077911, "logps/chosen": -1.2364697456359863, "logps/rejected": -1.2650384902954102, "loss": 1.2365, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2364697456359863, "rewards/margins": 0.028568726032972336, "rewards/rejected": -1.2650384902954102, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.422104425503937, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.12690068781375885, "logits/rejected": -0.08555368334054947, "logps/chosen": -1.1478677988052368, "logps/rejected": -1.3291311264038086, "loss": 1.1479, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1478677988052368, "rewards/margins": 0.1812632828950882, "rewards/rejected": -1.3291311264038086, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 5.468720626805716, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.1247648224234581, "logits/rejected": -0.045296717435121536, "logps/chosen": -1.1686718463897705, "logps/rejected": -1.2428748607635498, "loss": 1.1687, "rewards/accuracies": 0.5, "rewards/chosen": -1.1686718463897705, "rewards/margins": 0.07420298457145691, "rewards/rejected": -1.2428748607635498, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 6.925183036114916, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.17674121260643005, "logits/rejected": -0.04949633404612541, "logps/chosen": -1.2331922054290771, "logps/rejected": -1.3919477462768555, "loss": 1.2332, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2331922054290771, "rewards/margins": 0.15875545144081116, "rewards/rejected": -1.3919477462768555, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 4.546286151563044, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.18583431839942932, "logits/rejected": -0.0917961448431015, "logps/chosen": -1.1717839241027832, "logps/rejected": -1.3405070304870605, "loss": 1.1718, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1717839241027832, "rewards/margins": 0.16872315108776093, "rewards/rejected": -1.3405070304870605, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 5.231043854026084, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.17444220185279846, "logits/rejected": -0.03620634227991104, "logps/chosen": -1.2162582874298096, "logps/rejected": -1.3674461841583252, "loss": 1.2163, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2162582874298096, "rewards/margins": 0.15118782222270966, "rewards/rejected": -1.3674461841583252, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 5.230189865419865, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.07980786263942719, "logits/rejected": -0.06494639813899994, "logps/chosen": -1.2122255563735962, "logps/rejected": -1.430403470993042, "loss": 1.2122, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2122255563735962, "rewards/margins": 0.2181778848171234, "rewards/rejected": -1.430403470993042, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 5.3087206300017336, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.2090882509946823, "logits/rejected": -0.12040390074253082, "logps/chosen": -1.2107727527618408, "logps/rejected": -1.3413320779800415, "loss": 1.2108, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2107727527618408, "rewards/margins": 0.13055939972400665, "rewards/rejected": -1.3413320779800415, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 4.56323115429994, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.22392483055591583, "logits/rejected": -0.18386492133140564, "logps/chosen": -1.1420236825942993, "logps/rejected": -1.288199543952942, "loss": 1.142, "rewards/accuracies": 0.625, "rewards/chosen": -1.1420236825942993, "rewards/margins": 0.146175816655159, "rewards/rejected": -1.288199543952942, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 4.797377751057085, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.23340587317943573, "logits/rejected": -0.07962710410356522, "logps/chosen": -1.1386802196502686, "logps/rejected": -1.2777098417282104, "loss": 1.1387, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1386802196502686, "rewards/margins": 0.1390295922756195, "rewards/rejected": -1.2777098417282104, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 3.962948921638121, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.08564749360084534, "logits/rejected": -0.0895770937204361, "logps/chosen": -1.1416981220245361, "logps/rejected": -1.2751343250274658, "loss": 1.1417, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1416981220245361, "rewards/margins": 0.13343602418899536, "rewards/rejected": -1.2751343250274658, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 4.873070998022385, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.06333450227975845, "logits/rejected": -0.011814338155090809, "logps/chosen": -1.184178113937378, "logps/rejected": -1.2813241481781006, "loss": 1.1842, "rewards/accuracies": 0.5625, "rewards/chosen": -1.184178113937378, "rewards/margins": 0.09714607894420624, "rewards/rejected": -1.2813241481781006, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 8.339405086283804, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.12514016032218933, "logits/rejected": -0.02139069139957428, "logps/chosen": -1.1930770874023438, "logps/rejected": -1.3140230178833008, "loss": 1.1931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1930770874023438, "rewards/margins": 0.12094597518444061, "rewards/rejected": -1.3140230178833008, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 6.767365561074155, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.2904069423675537, "logits/rejected": -0.17771203815937042, "logps/chosen": -1.238992691040039, "logps/rejected": -1.3265302181243896, "loss": 1.239, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.238992691040039, "rewards/margins": 0.08753766864538193, "rewards/rejected": -1.3265302181243896, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 6.296296764495293, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.18992933630943298, "logits/rejected": -0.05459104850888252, "logps/chosen": -1.243783950805664, "logps/rejected": -1.3361676931381226, "loss": 1.2438, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.243783950805664, "rewards/margins": 0.09238380938768387, "rewards/rejected": -1.3361676931381226, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 6.213570176465758, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.13693954050540924, "logits/rejected": -0.05563772842288017, "logps/chosen": -1.2134144306182861, "logps/rejected": -1.37760329246521, "loss": 1.2134, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2134144306182861, "rewards/margins": 0.16418881714344025, "rewards/rejected": -1.37760329246521, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 4.68234189026691, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.07137470692396164, "logits/rejected": -0.02052970603108406, "logps/chosen": -1.2224620580673218, "logps/rejected": -1.4149975776672363, "loss": 1.2225, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2224620580673218, "rewards/margins": 0.192535400390625, "rewards/rejected": -1.4149975776672363, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 4.419657317856542, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.20940569043159485, "logits/rejected": -0.10313689708709717, "logps/chosen": -1.2816054821014404, "logps/rejected": -1.379805564880371, "loss": 1.2816, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2816054821014404, "rewards/margins": 0.09820015728473663, "rewards/rejected": -1.379805564880371, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 7.031890641879548, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.11144807189702988, "logits/rejected": -0.04012024402618408, "logps/chosen": -1.2527128458023071, "logps/rejected": -1.4043657779693604, "loss": 1.2527, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2527128458023071, "rewards/margins": 0.15165293216705322, "rewards/rejected": -1.4043657779693604, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 6.05708418092523, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.10461977869272232, "logits/rejected": -0.011426175944507122, "logps/chosen": -1.2805492877960205, "logps/rejected": -1.394938588142395, "loss": 1.2805, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2805492877960205, "rewards/margins": 0.11438924074172974, "rewards/rejected": -1.394938588142395, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 5.531192814015001, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.06033150106668472, "logits/rejected": -0.024981459602713585, "logps/chosen": -1.2751853466033936, "logps/rejected": -1.368325114250183, "loss": 1.2752, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2751853466033936, "rewards/margins": 0.09313978254795074, "rewards/rejected": -1.368325114250183, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 4.094085709793087, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.3086581826210022, "logits/rejected": -0.18255789577960968, "logps/chosen": -1.1603564023971558, "logps/rejected": -1.2195515632629395, "loss": 1.1604, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1603564023971558, "rewards/margins": 0.05919510871171951, "rewards/rejected": -1.2195515632629395, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 5.83694568375271, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.11593760550022125, "logits/rejected": -0.013399800285696983, "logps/chosen": -1.212411642074585, "logps/rejected": -1.2302130460739136, "loss": 1.2124, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.212411642074585, "rewards/margins": 0.017801394686102867, "rewards/rejected": -1.2302130460739136, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 4.593888284893408, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.2355726659297943, "logits/rejected": -0.13702292740345, "logps/chosen": -1.2809557914733887, "logps/rejected": -1.4026927947998047, "loss": 1.281, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2809557914733887, "rewards/margins": 0.12173694372177124, "rewards/rejected": -1.4026927947998047, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 5.831278632341498, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.23681864142417908, "logits/rejected": -0.14341332018375397, "logps/chosen": -1.1507720947265625, "logps/rejected": -1.3192284107208252, "loss": 1.1508, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1507720947265625, "rewards/margins": 0.16845622658729553, "rewards/rejected": -1.3192284107208252, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 6.537273220802109, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.19160769879817963, "logits/rejected": -0.17329370975494385, "logps/chosen": -1.1832079887390137, "logps/rejected": -1.377816915512085, "loss": 1.1832, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1832079887390137, "rewards/margins": 0.19460900127887726, "rewards/rejected": -1.377816915512085, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 5.280097673057097, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.22843167185783386, "logits/rejected": -0.16405096650123596, "logps/chosen": -1.24576997756958, "logps/rejected": -1.3414485454559326, "loss": 1.2458, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.24576997756958, "rewards/margins": 0.09567873924970627, "rewards/rejected": -1.3414485454559326, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 5.075317930690382, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.19082888960838318, "logits/rejected": -0.1158471331000328, "logps/chosen": -1.2228113412857056, "logps/rejected": -1.3165194988250732, "loss": 1.2228, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2228113412857056, "rewards/margins": 0.09370823204517365, "rewards/rejected": -1.3165194988250732, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 5.750955160062604, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.20801416039466858, "logits/rejected": -0.14244520664215088, "logps/chosen": -1.1669471263885498, "logps/rejected": -1.3100188970565796, "loss": 1.1669, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1669471263885498, "rewards/margins": 0.14307188987731934, "rewards/rejected": -1.3100188970565796, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 4.479486505653589, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.08172741532325745, "logits/rejected": -0.03730004280805588, "logps/chosen": -1.2378443479537964, "logps/rejected": -1.3858705759048462, "loss": 1.2378, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2378443479537964, "rewards/margins": 0.14802619814872742, "rewards/rejected": -1.3858705759048462, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 5.722990233608055, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.11722283065319061, "logits/rejected": -0.10586036741733551, "logps/chosen": -1.1295064687728882, "logps/rejected": -1.2671973705291748, "loss": 1.1295, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1295064687728882, "rewards/margins": 0.13769085705280304, "rewards/rejected": -1.2671973705291748, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 6.693777978152059, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.21483203768730164, "logits/rejected": -0.11267878860235214, "logps/chosen": -1.186269760131836, "logps/rejected": -1.3704369068145752, "loss": 1.1863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.186269760131836, "rewards/margins": 0.18416717648506165, "rewards/rejected": -1.3704369068145752, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 4.057933768796398, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.23722457885742188, "logits/rejected": -0.15248191356658936, "logps/chosen": -1.1868510246276855, "logps/rejected": -1.3909766674041748, "loss": 1.1869, "rewards/accuracies": 0.625, "rewards/chosen": -1.1868510246276855, "rewards/margins": 0.20412561297416687, "rewards/rejected": -1.3909766674041748, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 5.840027007980028, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.14179784059524536, "logits/rejected": -0.013167542405426502, "logps/chosen": -1.2250038385391235, "logps/rejected": -1.4614222049713135, "loss": 1.225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2250038385391235, "rewards/margins": 0.23641857504844666, "rewards/rejected": -1.4614222049713135, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.11708347499370575, "eval_logits/rejected": 0.18544019758701324, "eval_logps/chosen": -1.2646814584732056, "eval_logps/rejected": -1.3871690034866333, "eval_loss": 1.2649515867233276, "eval_rewards/accuracies": 0.5489614009857178, "eval_rewards/chosen": -1.2646814584732056, "eval_rewards/margins": 0.122487373650074, "eval_rewards/rejected": -1.3871690034866333, "eval_runtime": 40.4756, "eval_samples_per_second": 33.23, "eval_steps_per_second": 8.326, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 4.491143415088099, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.24364885687828064, "logits/rejected": -0.15260745584964752, "logps/chosen": -1.246119737625122, "logps/rejected": -1.3832776546478271, "loss": 1.2461, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.246119737625122, "rewards/margins": 0.13715805113315582, "rewards/rejected": -1.3832776546478271, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 6.592038993231015, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.2387682944536209, "logits/rejected": -0.1664121448993683, "logps/chosen": -1.2156226634979248, "logps/rejected": -1.3444721698760986, "loss": 1.2156, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2156226634979248, "rewards/margins": 0.1288493275642395, "rewards/rejected": -1.3444721698760986, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 6.604896159543427, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.1943768709897995, "logits/rejected": -0.0930970162153244, "logps/chosen": -1.1851587295532227, "logps/rejected": -1.2827646732330322, "loss": 1.1852, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1851587295532227, "rewards/margins": 0.09760592877864838, "rewards/rejected": -1.2827646732330322, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 4.976502729613055, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.19443079829216003, "logits/rejected": -0.03945695236325264, "logps/chosen": -1.268071174621582, "logps/rejected": -1.3649638891220093, "loss": 1.2681, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.268071174621582, "rewards/margins": 0.09689255058765411, "rewards/rejected": -1.3649638891220093, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 6.345469172209627, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.20616427063941956, "logits/rejected": -0.09597954899072647, "logps/chosen": -1.1389878988265991, "logps/rejected": -1.3047047853469849, "loss": 1.139, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1389878988265991, "rewards/margins": 0.1657167673110962, "rewards/rejected": -1.3047047853469849, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 6.642285553091727, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.19332972168922424, "logits/rejected": -0.0818483829498291, "logps/chosen": -1.184088945388794, "logps/rejected": -1.3410418033599854, "loss": 1.1841, "rewards/accuracies": 0.5625, "rewards/chosen": -1.184088945388794, "rewards/margins": 0.15695300698280334, "rewards/rejected": -1.3410418033599854, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 5.970443436073913, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.17095784842967987, "logits/rejected": -0.13807789981365204, "logps/chosen": -1.2112200260162354, "logps/rejected": -1.351171612739563, "loss": 1.2112, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2112200260162354, "rewards/margins": 0.1399514526128769, "rewards/rejected": -1.351171612739563, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 5.40898411523477, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.10683652013540268, "logits/rejected": -0.04279155656695366, "logps/chosen": -1.1562834978103638, "logps/rejected": -1.3564612865447998, "loss": 1.1563, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1562834978103638, "rewards/margins": 0.20017783343791962, "rewards/rejected": -1.3564612865447998, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 5.9085242672387475, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.27026474475860596, "logits/rejected": -0.16488447785377502, "logps/chosen": -1.157166600227356, "logps/rejected": -1.2942912578582764, "loss": 1.1572, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.157166600227356, "rewards/margins": 0.13712462782859802, "rewards/rejected": -1.2942912578582764, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 4.390284870793255, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.05907364934682846, "logits/rejected": 0.03399975970387459, "logps/chosen": -1.2158186435699463, "logps/rejected": -1.3945707082748413, "loss": 1.2158, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2158186435699463, "rewards/margins": 0.17875202000141144, "rewards/rejected": -1.3945707082748413, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 6.62066688842394, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.11249778419733047, "logits/rejected": -0.055951375514268875, "logps/chosen": -1.3006430864334106, "logps/rejected": -1.3099706172943115, "loss": 1.3006, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3006430864334106, "rewards/margins": 0.00932753924280405, "rewards/rejected": -1.3099706172943115, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 6.412003087421061, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.23819422721862793, "logits/rejected": -0.135118767619133, "logps/chosen": -1.2622320652008057, "logps/rejected": -1.3087371587753296, "loss": 1.2622, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2622320652008057, "rewards/margins": 0.04650520533323288, "rewards/rejected": -1.3087371587753296, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 7.369535882694934, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.15261247754096985, "logits/rejected": -0.0733204111456871, "logps/chosen": -1.1499505043029785, "logps/rejected": -1.316245198249817, "loss": 1.15, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1499505043029785, "rewards/margins": 0.1662949174642563, "rewards/rejected": -1.316245198249817, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 5.631215179548243, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.1376570761203766, "logits/rejected": -0.0021315242629498243, "logps/chosen": -1.114463210105896, "logps/rejected": -1.3278591632843018, "loss": 1.1145, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.114463210105896, "rewards/margins": 0.2133958786725998, "rewards/rejected": -1.3278591632843018, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 4.1551117777513955, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.16986694931983948, "logits/rejected": 0.06211011856794357, "logps/chosen": -1.2395738363265991, "logps/rejected": -1.2730238437652588, "loss": 1.2396, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2395738363265991, "rewards/margins": 0.03345007449388504, "rewards/rejected": -1.2730238437652588, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 6.180402164151366, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.13733018934726715, "logits/rejected": -0.06993383914232254, "logps/chosen": -1.1744128465652466, "logps/rejected": -1.2897934913635254, "loss": 1.1744, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1744128465652466, "rewards/margins": 0.11538054049015045, "rewards/rejected": -1.2897934913635254, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 5.537702414863006, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.1513996124267578, "logits/rejected": 0.020641013979911804, "logps/chosen": -1.1744192838668823, "logps/rejected": -1.3554773330688477, "loss": 1.1744, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1744192838668823, "rewards/margins": 0.18105804920196533, "rewards/rejected": -1.3554773330688477, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 6.769207947157023, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.13918177783489227, "logits/rejected": -0.05292380973696709, "logps/chosen": -1.1867765188217163, "logps/rejected": -1.2245490550994873, "loss": 1.1868, "rewards/accuracies": 0.46875, "rewards/chosen": -1.1867765188217163, "rewards/margins": 0.03777247667312622, "rewards/rejected": -1.2245490550994873, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 5.893400174783782, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.17094473540782928, "logits/rejected": -0.13498765230178833, "logps/chosen": -1.2740566730499268, "logps/rejected": -1.308266282081604, "loss": 1.2741, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2740566730499268, "rewards/margins": 0.034209586679935455, "rewards/rejected": -1.308266282081604, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 5.874706150592809, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.23745815455913544, "logits/rejected": -0.13989071547985077, "logps/chosen": -1.2650535106658936, "logps/rejected": -1.3424084186553955, "loss": 1.2651, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2650535106658936, "rewards/margins": 0.07735483348369598, "rewards/rejected": -1.3424084186553955, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 6.505184260815923, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.23714828491210938, "logits/rejected": -0.14114250242710114, "logps/chosen": -1.2352421283721924, "logps/rejected": -1.3073794841766357, "loss": 1.2352, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2352421283721924, "rewards/margins": 0.07213716208934784, "rewards/rejected": -1.3073794841766357, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 5.536692361619693, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.07549972832202911, "logits/rejected": -0.04711247235536575, "logps/chosen": -1.2018697261810303, "logps/rejected": -1.3245131969451904, "loss": 1.2019, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2018697261810303, "rewards/margins": 0.12264355272054672, "rewards/rejected": -1.3245131969451904, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 5.732939712968736, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.26538223028182983, "logits/rejected": -0.16800419986248016, "logps/chosen": -1.170317530632019, "logps/rejected": -1.3036450147628784, "loss": 1.1703, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.170317530632019, "rewards/margins": 0.13332757353782654, "rewards/rejected": -1.3036450147628784, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 4.884783634921944, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.1729748249053955, "logits/rejected": -0.039932895451784134, "logps/chosen": -1.1981786489486694, "logps/rejected": -1.3289659023284912, "loss": 1.1982, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1981786489486694, "rewards/margins": 0.13078723847866058, "rewards/rejected": -1.3289659023284912, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 5.671564847678702, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.2597041726112366, "logits/rejected": -0.1184685230255127, "logps/chosen": -1.2164766788482666, "logps/rejected": -1.3551827669143677, "loss": 1.2165, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2164766788482666, "rewards/margins": 0.13870613276958466, "rewards/rejected": -1.3551827669143677, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 5.71284460510115, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.1498754918575287, "logits/rejected": -0.06742434203624725, "logps/chosen": -1.1207746267318726, "logps/rejected": -1.3057342767715454, "loss": 1.1208, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1207746267318726, "rewards/margins": 0.1849595308303833, "rewards/rejected": -1.3057342767715454, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 7.862874635970484, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.19322915375232697, "logits/rejected": -0.06475642323493958, "logps/chosen": -1.2373689413070679, "logps/rejected": -1.3508301973342896, "loss": 1.2374, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2373689413070679, "rewards/margins": 0.11346123367547989, "rewards/rejected": -1.3508301973342896, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 4.886953232998979, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.13845553994178772, "logits/rejected": -0.03690153360366821, "logps/chosen": -1.1856496334075928, "logps/rejected": -1.3245713710784912, "loss": 1.1856, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1856496334075928, "rewards/margins": 0.13892170786857605, "rewards/rejected": -1.3245713710784912, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 5.482612684563567, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.15019726753234863, "logits/rejected": -0.030324911698698997, "logps/chosen": -1.1158322095870972, "logps/rejected": -1.256764531135559, "loss": 1.1158, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1158322095870972, "rewards/margins": 0.14093232154846191, "rewards/rejected": -1.256764531135559, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 6.596831515489607, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.26834890246391296, "logits/rejected": -0.16133897006511688, "logps/chosen": -1.2864171266555786, "logps/rejected": -1.3465033769607544, "loss": 1.2864, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2864171266555786, "rewards/margins": 0.060086190700531006, "rewards/rejected": -1.3465033769607544, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 5.0256155650603205, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.024779897183179855, "logits/rejected": 0.0008195772534236312, "logps/chosen": -1.2576894760131836, "logps/rejected": -1.2955785989761353, "loss": 1.2577, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2576894760131836, "rewards/margins": 0.03788911551237106, "rewards/rejected": -1.2955785989761353, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 6.660317912671412, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.15529896318912506, "logits/rejected": 0.0053543210960924625, "logps/chosen": -1.1639297008514404, "logps/rejected": -1.2652324438095093, "loss": 1.1639, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1639297008514404, "rewards/margins": 0.10130278766155243, "rewards/rejected": -1.2652324438095093, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 4.322341922743799, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.18019823729991913, "logits/rejected": -0.05854920297861099, "logps/chosen": -1.2010098695755005, "logps/rejected": -1.3294098377227783, "loss": 1.201, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2010098695755005, "rewards/margins": 0.12839984893798828, "rewards/rejected": -1.3294098377227783, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 4.216819973111727, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.16001906991004944, "logits/rejected": -0.04291045665740967, "logps/chosen": -1.18995201587677, "logps/rejected": -1.292647123336792, "loss": 1.19, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.18995201587677, "rewards/margins": 0.10269506275653839, "rewards/rejected": -1.292647123336792, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 6.231025986873424, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.03491463139653206, "logits/rejected": -0.004274624399840832, "logps/chosen": -1.224326252937317, "logps/rejected": -1.245927095413208, "loss": 1.2243, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.224326252937317, "rewards/margins": 0.02160102315247059, "rewards/rejected": -1.245927095413208, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 5.74500826828264, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.22203752398490906, "logits/rejected": -0.11057785898447037, "logps/chosen": -1.1985299587249756, "logps/rejected": -1.3632029294967651, "loss": 1.1985, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1985299587249756, "rewards/margins": 0.1646731048822403, "rewards/rejected": -1.3632029294967651, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 4.712667983032441, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.1958836019039154, "logits/rejected": -0.17341820895671844, "logps/chosen": -1.2585468292236328, "logps/rejected": -1.3672200441360474, "loss": 1.2585, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2585468292236328, "rewards/margins": 0.1086731106042862, "rewards/rejected": -1.3672200441360474, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 7.080314887261167, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.1302884817123413, "logits/rejected": -0.04806392639875412, "logps/chosen": -1.2633554935455322, "logps/rejected": -1.2895514965057373, "loss": 1.2634, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2633554935455322, "rewards/margins": 0.026196081191301346, "rewards/rejected": -1.2895514965057373, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 6.3374804985361735, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.18071335554122925, "logits/rejected": -0.11925394833087921, "logps/chosen": -1.152075171470642, "logps/rejected": -1.327843189239502, "loss": 1.1521, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.152075171470642, "rewards/margins": 0.1757679283618927, "rewards/rejected": -1.327843189239502, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 5.424007403628901, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.1527930498123169, "logits/rejected": -0.06213752552866936, "logps/chosen": -1.2161506414413452, "logps/rejected": -1.279205560684204, "loss": 1.2162, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2161506414413452, "rewards/margins": 0.06305477023124695, "rewards/rejected": -1.279205560684204, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 7.101442056566429, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.15120866894721985, "logits/rejected": -0.03092968836426735, "logps/chosen": -1.2071486711502075, "logps/rejected": -1.3534046411514282, "loss": 1.2071, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2071486711502075, "rewards/margins": 0.14625594019889832, "rewards/rejected": -1.3534046411514282, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 6.097350645340119, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.1596592664718628, "logits/rejected": 0.004979190416634083, "logps/chosen": -1.3462331295013428, "logps/rejected": -1.40768301486969, "loss": 1.3462, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3462331295013428, "rewards/margins": 0.06144970655441284, "rewards/rejected": -1.40768301486969, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 5.40407359343812, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.15269267559051514, "logits/rejected": -0.07802381366491318, "logps/chosen": -1.189989686012268, "logps/rejected": -1.3407924175262451, "loss": 1.19, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.189989686012268, "rewards/margins": 0.150802880525589, "rewards/rejected": -1.3407924175262451, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 7.421355604656843, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.10775983333587646, "logits/rejected": -0.017748307436704636, "logps/chosen": -1.212646245956421, "logps/rejected": -1.4152066707611084, "loss": 1.2126, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.212646245956421, "rewards/margins": 0.20256038010120392, "rewards/rejected": -1.4152066707611084, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 4.637839343457749, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.20826590061187744, "logits/rejected": -0.18438085913658142, "logps/chosen": -1.1948773860931396, "logps/rejected": -1.3124290704727173, "loss": 1.1949, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1948773860931396, "rewards/margins": 0.11755168437957764, "rewards/rejected": -1.3124290704727173, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 6.387286485815509, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.17450019717216492, "logits/rejected": -0.08261345326900482, "logps/chosen": -1.2644113302230835, "logps/rejected": -1.3569786548614502, "loss": 1.2644, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2644113302230835, "rewards/margins": 0.09256730228662491, "rewards/rejected": -1.3569786548614502, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 6.74207694800528, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.18797466158866882, "logits/rejected": -0.13442282378673553, "logps/chosen": -1.2212904691696167, "logps/rejected": -1.394654631614685, "loss": 1.2213, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2212904691696167, "rewards/margins": 0.1733640879392624, "rewards/rejected": -1.394654631614685, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 7.462161622212914, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.20969156920909882, "logits/rejected": -0.16908106207847595, "logps/chosen": -1.1366201639175415, "logps/rejected": -1.2815049886703491, "loss": 1.1366, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1366201639175415, "rewards/margins": 0.14488498866558075, "rewards/rejected": -1.2815049886703491, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 5.1668016704448725, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.21124522387981415, "logits/rejected": -0.02330569550395012, "logps/chosen": -1.1812747716903687, "logps/rejected": -1.298424243927002, "loss": 1.1813, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1812747716903687, "rewards/margins": 0.11714942753314972, "rewards/rejected": -1.298424243927002, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 6.88003101952472, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.2904958128929138, "logits/rejected": -0.145149365067482, "logps/chosen": -1.2083895206451416, "logps/rejected": -1.3909224271774292, "loss": 1.2084, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2083895206451416, "rewards/margins": 0.18253293633460999, "rewards/rejected": -1.3909224271774292, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 5.581302669486257, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.17596125602722168, "logits/rejected": -0.05999898165464401, "logps/chosen": -1.3756617307662964, "logps/rejected": -1.498306393623352, "loss": 1.3757, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3756617307662964, "rewards/margins": 0.1226445883512497, "rewards/rejected": -1.498306393623352, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 5.653556831593056, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.074404276907444, "logits/rejected": 0.02249768190085888, "logps/chosen": -1.182502031326294, "logps/rejected": -1.3287572860717773, "loss": 1.1825, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.182502031326294, "rewards/margins": 0.14625521004199982, "rewards/rejected": -1.3287572860717773, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 5.309142232895626, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.22328679263591766, "logits/rejected": -0.1177106499671936, "logps/chosen": -1.166651964187622, "logps/rejected": -1.3004415035247803, "loss": 1.1667, "rewards/accuracies": 0.59375, "rewards/chosen": -1.166651964187622, "rewards/margins": 0.13378958404064178, "rewards/rejected": -1.3004415035247803, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 6.342644839459581, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.2096303403377533, "logits/rejected": -0.13312794268131256, "logps/chosen": -1.2879570722579956, "logps/rejected": -1.3684704303741455, "loss": 1.288, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2879570722579956, "rewards/margins": 0.08051331341266632, "rewards/rejected": -1.3684704303741455, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 5.129321142725844, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.26560333371162415, "logits/rejected": -0.2151840478181839, "logps/chosen": -1.22012197971344, "logps/rejected": -1.350990891456604, "loss": 1.2201, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.22012197971344, "rewards/margins": 0.13086895644664764, "rewards/rejected": -1.350990891456604, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 5.91480656834002, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.14576849341392517, "logits/rejected": -0.12268824875354767, "logps/chosen": -1.240882158279419, "logps/rejected": -1.4257888793945312, "loss": 1.2409, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.240882158279419, "rewards/margins": 0.18490657210350037, "rewards/rejected": -1.4257888793945312, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 4.4797775190231786, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.1751851737499237, "logits/rejected": -0.09498211741447449, "logps/chosen": -1.1975809335708618, "logps/rejected": -1.3192856311798096, "loss": 1.1976, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1975809335708618, "rewards/margins": 0.12170474231243134, "rewards/rejected": -1.3192856311798096, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 5.521089084351841, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.14405176043510437, "logits/rejected": -0.04991941899061203, "logps/chosen": -1.279273271560669, "logps/rejected": -1.3103182315826416, "loss": 1.2793, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.279273271560669, "rewards/margins": 0.03104497864842415, "rewards/rejected": -1.3103182315826416, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 6.348840320109821, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.23052886128425598, "logits/rejected": -0.09342405945062637, "logps/chosen": -1.2103219032287598, "logps/rejected": -1.344957947731018, "loss": 1.2103, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2103219032287598, "rewards/margins": 0.13463595509529114, "rewards/rejected": -1.344957947731018, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 6.330810814246969, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.16381020843982697, "logits/rejected": -0.03726597875356674, "logps/chosen": -1.2482383251190186, "logps/rejected": -1.418566346168518, "loss": 1.2482, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2482383251190186, "rewards/margins": 0.17032794654369354, "rewards/rejected": -1.418566346168518, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 6.33407091188608, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.15312156081199646, "logits/rejected": -0.06654296070337296, "logps/chosen": -1.2607905864715576, "logps/rejected": -1.3785853385925293, "loss": 1.2608, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2607905864715576, "rewards/margins": 0.11779467761516571, "rewards/rejected": -1.3785853385925293, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 7.055203131034464, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.1701403111219406, "logits/rejected": -0.12244448810815811, "logps/chosen": -1.1860713958740234, "logps/rejected": -1.3112951517105103, "loss": 1.1861, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1860713958740234, "rewards/margins": 0.12522374093532562, "rewards/rejected": -1.3112951517105103, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 5.374803004174461, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.1807953119277954, "logits/rejected": -0.02879893220961094, "logps/chosen": -1.2467310428619385, "logps/rejected": -1.401903510093689, "loss": 1.2467, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2467310428619385, "rewards/margins": 0.155172660946846, "rewards/rejected": -1.401903510093689, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 5.255947793653571, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.2264213263988495, "logits/rejected": -0.12559451162815094, "logps/chosen": -1.245815634727478, "logps/rejected": -1.3761717081069946, "loss": 1.2458, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.245815634727478, "rewards/margins": 0.1303558647632599, "rewards/rejected": -1.3761717081069946, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 3.4000494903437644, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.14280949532985687, "logits/rejected": 0.013829996809363365, "logps/chosen": -1.2039129734039307, "logps/rejected": -1.3415906429290771, "loss": 1.2039, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2039129734039307, "rewards/margins": 0.1376776248216629, "rewards/rejected": -1.3415906429290771, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 6.101192693827946, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.14295132458209991, "logits/rejected": -0.08529139310121536, "logps/chosen": -1.1935421228408813, "logps/rejected": -1.3750500679016113, "loss": 1.1935, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1935421228408813, "rewards/margins": 0.1815079301595688, "rewards/rejected": -1.3750500679016113, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 4.630819936488241, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.24677710235118866, "logits/rejected": -0.1432533860206604, "logps/chosen": -1.1514554023742676, "logps/rejected": -1.2224502563476562, "loss": 1.1515, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.1514554023742676, "rewards/margins": 0.07099483162164688, "rewards/rejected": -1.2224502563476562, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 6.864898263564319, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.1677287220954895, "logits/rejected": 0.0277693010866642, "logps/chosen": -1.1910731792449951, "logps/rejected": -1.3603273630142212, "loss": 1.1911, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1910731792449951, "rewards/margins": 0.16925430297851562, "rewards/rejected": -1.3603273630142212, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 6.244896260036876, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.30210956931114197, "logits/rejected": -0.0909012109041214, "logps/chosen": -1.2039133310317993, "logps/rejected": -1.254995584487915, "loss": 1.2039, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2039133310317993, "rewards/margins": 0.05108245089650154, "rewards/rejected": -1.254995584487915, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 5.3439496042202865, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.15561816096305847, "logits/rejected": -0.08493496477603912, "logps/chosen": -1.1905758380889893, "logps/rejected": -1.2918370962142944, "loss": 1.1906, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1905758380889893, "rewards/margins": 0.1012611836194992, "rewards/rejected": -1.2918370962142944, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 4.995176596441677, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.23992979526519775, "logits/rejected": -0.12526682019233704, "logps/chosen": -1.14003586769104, "logps/rejected": -1.2965269088745117, "loss": 1.14, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.14003586769104, "rewards/margins": 0.15649127960205078, "rewards/rejected": -1.2965269088745117, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 8.867620318650669, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.13622109591960907, "logits/rejected": -0.07917843759059906, "logps/chosen": -1.2405372858047485, "logps/rejected": -1.3727761507034302, "loss": 1.2405, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2405372858047485, "rewards/margins": 0.13223889470100403, "rewards/rejected": -1.3727761507034302, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 4.912569190999404, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.14780853688716888, "logits/rejected": -0.10573873668909073, "logps/chosen": -1.1588609218597412, "logps/rejected": -1.310710072517395, "loss": 1.1589, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1588609218597412, "rewards/margins": 0.15184913575649261, "rewards/rejected": -1.310710072517395, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 6.288496826967252, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.2145172655582428, "logits/rejected": -0.11133424192667007, "logps/chosen": -1.208738088607788, "logps/rejected": -1.3540771007537842, "loss": 1.2087, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.208738088607788, "rewards/margins": 0.14533916115760803, "rewards/rejected": -1.3540771007537842, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 5.969068718770917, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.1158086284995079, "logits/rejected": -0.019630106166005135, "logps/chosen": -1.1551603078842163, "logps/rejected": -1.3086531162261963, "loss": 1.1552, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1551603078842163, "rewards/margins": 0.15349291265010834, "rewards/rejected": -1.3086531162261963, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 4.870017147954073, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.1602131426334381, "logits/rejected": -0.1274903267621994, "logps/chosen": -1.1788512468338013, "logps/rejected": -1.3578119277954102, "loss": 1.1789, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1788512468338013, "rewards/margins": 0.17896074056625366, "rewards/rejected": -1.3578119277954102, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 7.042310518402355, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.11130311340093613, "logits/rejected": -0.07458984851837158, "logps/chosen": -1.2768038511276245, "logps/rejected": -1.379234790802002, "loss": 1.2768, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2768038511276245, "rewards/margins": 0.10243091732263565, "rewards/rejected": -1.379234790802002, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 6.902274989001335, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.13669414818286896, "logits/rejected": -0.06946416199207306, "logps/chosen": -1.2367932796478271, "logps/rejected": -1.3339389562606812, "loss": 1.2368, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2367932796478271, "rewards/margins": 0.0971456840634346, "rewards/rejected": -1.3339389562606812, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 5.377293693756102, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.2711583077907562, "logits/rejected": -0.10449016094207764, "logps/chosen": -1.1803597211837769, "logps/rejected": -1.3339115381240845, "loss": 1.1804, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1803597211837769, "rewards/margins": 0.15355174243450165, "rewards/rejected": -1.3339115381240845, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 8.359215803707318, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.11398275196552277, "logits/rejected": 0.03984471783041954, "logps/chosen": -1.1620866060256958, "logps/rejected": -1.3196134567260742, "loss": 1.1621, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1620866060256958, "rewards/margins": 0.1575268805027008, "rewards/rejected": -1.3196134567260742, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.08800444006919861, "eval_logits/rejected": 0.15513069927692413, "eval_logps/chosen": -1.2633721828460693, "eval_logps/rejected": -1.3853180408477783, "eval_loss": 1.2636343240737915, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2633721828460693, "eval_rewards/margins": 0.121945820748806, "eval_rewards/rejected": -1.3853180408477783, "eval_runtime": 40.469, "eval_samples_per_second": 33.235, "eval_steps_per_second": 8.327, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 5.7507063272777525, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.2414935827255249, "logits/rejected": -0.21685874462127686, "logps/chosen": -1.224172592163086, "logps/rejected": -1.2859339714050293, "loss": 1.2242, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.224172592163086, "rewards/margins": 0.06176149100065231, "rewards/rejected": -1.2859339714050293, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 4.796075974939055, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.16162554919719696, "logits/rejected": -0.07857223600149155, "logps/chosen": -1.1895110607147217, "logps/rejected": -1.333155870437622, "loss": 1.1895, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1895110607147217, "rewards/margins": 0.14364464581012726, "rewards/rejected": -1.333155870437622, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 4.784622926027967, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.0993657186627388, "logits/rejected": -0.013783931732177734, "logps/chosen": -1.2933303117752075, "logps/rejected": -1.3533706665039062, "loss": 1.2933, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2933303117752075, "rewards/margins": 0.060040462762117386, "rewards/rejected": -1.3533706665039062, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 8.291063036255967, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1670067459344864, "logits/rejected": -0.11004354804754257, "logps/chosen": -1.2242882251739502, "logps/rejected": -1.2834529876708984, "loss": 1.2243, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2242882251739502, "rewards/margins": 0.05916465446352959, "rewards/rejected": -1.2834529876708984, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 7.417705294515874, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.1235552579164505, "logits/rejected": -0.10693810135126114, "logps/chosen": -1.207606554031372, "logps/rejected": -1.4045366048812866, "loss": 1.2076, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.207606554031372, "rewards/margins": 0.19693021476268768, "rewards/rejected": -1.4045366048812866, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 5.6482290828359565, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.19800932705402374, "logits/rejected": -0.15896430611610413, "logps/chosen": -1.22525954246521, "logps/rejected": -1.3354448080062866, "loss": 1.2253, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.22525954246521, "rewards/margins": 0.11018528044223785, "rewards/rejected": -1.3354448080062866, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 6.90176834879275, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.2255699336528778, "logits/rejected": -0.16435322165489197, "logps/chosen": -1.2477341890335083, "logps/rejected": -1.362105131149292, "loss": 1.2477, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2477341890335083, "rewards/margins": 0.11437108367681503, "rewards/rejected": -1.362105131149292, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 5.475055097670966, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.1667758971452713, "logits/rejected": -0.06685288995504379, "logps/chosen": -1.3326314687728882, "logps/rejected": -1.3735696077346802, "loss": 1.3326, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3326314687728882, "rewards/margins": 0.04093821719288826, "rewards/rejected": -1.3735696077346802, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 4.407338276016611, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.20298020541667938, "logits/rejected": -0.1015811562538147, "logps/chosen": -1.3213902711868286, "logps/rejected": -1.4126460552215576, "loss": 1.3214, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3213902711868286, "rewards/margins": 0.09125564992427826, "rewards/rejected": -1.4126460552215576, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 4.470447408629519, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.19641488790512085, "logits/rejected": -0.11250320822000504, "logps/chosen": -1.1997987031936646, "logps/rejected": -1.302686095237732, "loss": 1.1998, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1997987031936646, "rewards/margins": 0.1028873473405838, "rewards/rejected": -1.302686095237732, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 6.523986952910164, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.189068004488945, "logits/rejected": -0.07391491532325745, "logps/chosen": -1.258028268814087, "logps/rejected": -1.3367688655853271, "loss": 1.258, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.258028268814087, "rewards/margins": 0.07874061167240143, "rewards/rejected": -1.3367688655853271, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 7.907398475584665, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.2146320790052414, "logits/rejected": -0.11358728259801865, "logps/chosen": -1.1542870998382568, "logps/rejected": -1.2416369915008545, "loss": 1.1543, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1542870998382568, "rewards/margins": 0.08735005557537079, "rewards/rejected": -1.2416369915008545, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 6.218650018578372, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.1602194607257843, "logits/rejected": -0.06215648725628853, "logps/chosen": -1.1817227602005005, "logps/rejected": -1.2858664989471436, "loss": 1.1817, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1817227602005005, "rewards/margins": 0.10414387285709381, "rewards/rejected": -1.2858664989471436, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 5.826507980459703, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.22180171310901642, "logits/rejected": -0.11028953641653061, "logps/chosen": -1.226493239402771, "logps/rejected": -1.3643046617507935, "loss": 1.2265, "rewards/accuracies": 0.5625, "rewards/chosen": -1.226493239402771, "rewards/margins": 0.13781121373176575, "rewards/rejected": -1.3643046617507935, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 6.263611217173581, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.16760757565498352, "logits/rejected": -0.057001225650310516, "logps/chosen": -1.2451612949371338, "logps/rejected": -1.4573750495910645, "loss": 1.2452, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2451612949371338, "rewards/margins": 0.21221396327018738, "rewards/rejected": -1.4573750495910645, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 5.4757955363694535, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.2003244161605835, "logits/rejected": -0.11299177259206772, "logps/chosen": -1.093846321105957, "logps/rejected": -1.2613178491592407, "loss": 1.0938, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.093846321105957, "rewards/margins": 0.16747145354747772, "rewards/rejected": -1.2613178491592407, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 5.5804491936959035, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.20646099746227264, "logits/rejected": -0.14668315649032593, "logps/chosen": -1.2012248039245605, "logps/rejected": -1.3625638484954834, "loss": 1.2012, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2012248039245605, "rewards/margins": 0.16133908927440643, "rewards/rejected": -1.3625638484954834, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 8.014925625122705, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.1609608232975006, "logits/rejected": -0.045977894216775894, "logps/chosen": -1.2281086444854736, "logps/rejected": -1.3579498529434204, "loss": 1.2281, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2281086444854736, "rewards/margins": 0.12984125316143036, "rewards/rejected": -1.3579498529434204, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 5.199348551612257, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.12000872194766998, "logits/rejected": -0.10509549081325531, "logps/chosen": -1.275071144104004, "logps/rejected": -1.441075086593628, "loss": 1.2751, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.275071144104004, "rewards/margins": 0.16600406169891357, "rewards/rejected": -1.441075086593628, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 4.743239593620239, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.09375707060098648, "logits/rejected": -0.00935634970664978, "logps/chosen": -1.1974576711654663, "logps/rejected": -1.3464930057525635, "loss": 1.1975, "rewards/accuracies": 0.625, "rewards/chosen": -1.1974576711654663, "rewards/margins": 0.14903512597084045, "rewards/rejected": -1.3464930057525635, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 6.121244473904194, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.15068784356117249, "logits/rejected": -0.09690304100513458, "logps/chosen": -1.259336233139038, "logps/rejected": -1.3884046077728271, "loss": 1.2593, "rewards/accuracies": 0.53125, "rewards/chosen": -1.259336233139038, "rewards/margins": 0.12906843423843384, "rewards/rejected": -1.3884046077728271, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 5.575522033261266, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.04775070399045944, "logits/rejected": -0.009759527631103992, "logps/chosen": -1.2604209184646606, "logps/rejected": -1.3536851406097412, "loss": 1.2604, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2604209184646606, "rewards/margins": 0.09326429665088654, "rewards/rejected": -1.3536851406097412, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 4.493223972125323, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.1066608875989914, "logits/rejected": -0.043001849204301834, "logps/chosen": -1.2421224117279053, "logps/rejected": -1.3679310083389282, "loss": 1.2421, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2421224117279053, "rewards/margins": 0.1258087456226349, "rewards/rejected": -1.3679310083389282, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 5.287856717175514, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.16439351439476013, "logits/rejected": -0.11067134141921997, "logps/chosen": -1.198706030845642, "logps/rejected": -1.336551547050476, "loss": 1.1987, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.198706030845642, "rewards/margins": 0.1378454864025116, "rewards/rejected": -1.336551547050476, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 5.526018161771605, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.22060759365558624, "logits/rejected": -0.14614121615886688, "logps/chosen": -1.2030059099197388, "logps/rejected": -1.294613242149353, "loss": 1.203, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2030059099197388, "rewards/margins": 0.09160729497671127, "rewards/rejected": -1.294613242149353, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 5.69837590642455, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.10062392801046371, "logits/rejected": -0.005271920468658209, "logps/chosen": -1.1692296266555786, "logps/rejected": -1.346396803855896, "loss": 1.1692, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1692296266555786, "rewards/margins": 0.17716726660728455, "rewards/rejected": -1.346396803855896, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 6.3130293805529405, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.08427585661411285, "logits/rejected": -0.02949909307062626, "logps/chosen": -1.2999669313430786, "logps/rejected": -1.347758173942566, "loss": 1.3, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2999669313430786, "rewards/margins": 0.04779127240180969, "rewards/rejected": -1.347758173942566, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 6.019106921111662, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.06799745559692383, "logits/rejected": -0.04424173757433891, "logps/chosen": -1.2789679765701294, "logps/rejected": -1.342089295387268, "loss": 1.279, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2789679765701294, "rewards/margins": 0.06312130391597748, "rewards/rejected": -1.342089295387268, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 4.730528925341569, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.13752660155296326, "logits/rejected": -0.05609750747680664, "logps/chosen": -1.2309396266937256, "logps/rejected": -1.3445690870285034, "loss": 1.2309, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2309396266937256, "rewards/margins": 0.11362955719232559, "rewards/rejected": -1.3445690870285034, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 5.647824034340581, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.18176816403865814, "logits/rejected": -0.11537300050258636, "logps/chosen": -1.1377322673797607, "logps/rejected": -1.3447933197021484, "loss": 1.1377, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1377322673797607, "rewards/margins": 0.20706084370613098, "rewards/rejected": -1.3447933197021484, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 4.889661140545181, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.17274732887744904, "logits/rejected": -0.09844546020030975, "logps/chosen": -1.1538443565368652, "logps/rejected": -1.3929589986801147, "loss": 1.1538, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1538443565368652, "rewards/margins": 0.23911479115486145, "rewards/rejected": -1.3929589986801147, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 6.222592698614956, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.26015645265579224, "logits/rejected": -0.2516189217567444, "logps/chosen": -1.208571195602417, "logps/rejected": -1.343924880027771, "loss": 1.2086, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.208571195602417, "rewards/margins": 0.13535381853580475, "rewards/rejected": -1.343924880027771, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 5.533731541828389, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.11267416179180145, "logits/rejected": 0.022044114768505096, "logps/chosen": -1.176780104637146, "logps/rejected": -1.4030821323394775, "loss": 1.1768, "rewards/accuracies": 0.625, "rewards/chosen": -1.176780104637146, "rewards/margins": 0.22630195319652557, "rewards/rejected": -1.4030821323394775, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 6.227479364303112, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.2551042437553406, "logits/rejected": -0.13823463022708893, "logps/chosen": -1.1309531927108765, "logps/rejected": -1.470923900604248, "loss": 1.131, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1309531927108765, "rewards/margins": 0.33997052907943726, "rewards/rejected": -1.470923900604248, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 4.45473048921191, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.20629315078258514, "logits/rejected": -0.16825221478939056, "logps/chosen": -1.1793571710586548, "logps/rejected": -1.2881890535354614, "loss": 1.1794, "rewards/accuracies": 0.46875, "rewards/chosen": -1.1793571710586548, "rewards/margins": 0.10883182287216187, "rewards/rejected": -1.2881890535354614, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 5.856827315950549, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.1255567967891693, "logits/rejected": -0.10168837010860443, "logps/chosen": -1.2171685695648193, "logps/rejected": -1.366981863975525, "loss": 1.2172, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2171685695648193, "rewards/margins": 0.14981338381767273, "rewards/rejected": -1.366981863975525, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 5.620163421577928, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.16912151873111725, "logits/rejected": -0.0803477093577385, "logps/chosen": -1.2393953800201416, "logps/rejected": -1.467238187789917, "loss": 1.2394, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2393953800201416, "rewards/margins": 0.2278430014848709, "rewards/rejected": -1.467238187789917, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 4.810809786242335, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.1999644786119461, "logits/rejected": -0.06039692088961601, "logps/chosen": -1.1889235973358154, "logps/rejected": -1.3286960124969482, "loss": 1.1889, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1889235973358154, "rewards/margins": 0.1397724449634552, "rewards/rejected": -1.3286960124969482, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 5.363457140266715, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.15548428893089294, "logits/rejected": -0.13658146560192108, "logps/chosen": -1.166237473487854, "logps/rejected": -1.301145315170288, "loss": 1.1662, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.166237473487854, "rewards/margins": 0.1349080353975296, "rewards/rejected": -1.301145315170288, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 5.962863750222311, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.1751735508441925, "logits/rejected": -0.08863409608602524, "logps/chosen": -1.2479246854782104, "logps/rejected": -1.4305475950241089, "loss": 1.2479, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2479246854782104, "rewards/margins": 0.18262311816215515, "rewards/rejected": -1.4305475950241089, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 5.585304012697536, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.13798804581165314, "logits/rejected": -0.05599381774663925, "logps/chosen": -1.1547309160232544, "logps/rejected": -1.3481967449188232, "loss": 1.1547, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1547309160232544, "rewards/margins": 0.19346573948860168, "rewards/rejected": -1.3481967449188232, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 5.993058848329555, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.06462104618549347, "logits/rejected": 0.00697299325838685, "logps/chosen": -1.1821935176849365, "logps/rejected": -1.3417952060699463, "loss": 1.1822, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1821935176849365, "rewards/margins": 0.15960147976875305, "rewards/rejected": -1.3417952060699463, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 5.7172963813191915, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.2697875499725342, "logits/rejected": -0.16855919361114502, "logps/chosen": -1.212554693222046, "logps/rejected": -1.3031188249588013, "loss": 1.2126, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.212554693222046, "rewards/margins": 0.09056393802165985, "rewards/rejected": -1.3031188249588013, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 8.413727170085771, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.2294515073299408, "logits/rejected": -0.06772720813751221, "logps/chosen": -1.269126057624817, "logps/rejected": -1.2370564937591553, "loss": 1.2691, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.269126057624817, "rewards/margins": -0.032069362699985504, "rewards/rejected": -1.2370564937591553, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 5.7556807111287585, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.1930791139602661, "logits/rejected": -0.10869000852108002, "logps/chosen": -1.1823856830596924, "logps/rejected": -1.2781797647476196, "loss": 1.1824, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1823856830596924, "rewards/margins": 0.09579413384199142, "rewards/rejected": -1.2781797647476196, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 6.0172624618205495, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.23403461277484894, "logits/rejected": -0.12139058113098145, "logps/chosen": -1.2531311511993408, "logps/rejected": -1.423150658607483, "loss": 1.2531, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2531311511993408, "rewards/margins": 0.17001953721046448, "rewards/rejected": -1.423150658607483, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 5.453750338371526, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.17381764948368073, "logits/rejected": -0.07925789058208466, "logps/chosen": -1.2436667680740356, "logps/rejected": -1.3779237270355225, "loss": 1.2437, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2436667680740356, "rewards/margins": 0.13425692915916443, "rewards/rejected": -1.3779237270355225, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 6.1471637747845485, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.13815638422966003, "logits/rejected": -0.06826861947774887, "logps/chosen": -1.123497486114502, "logps/rejected": -1.3922805786132812, "loss": 1.1235, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.123497486114502, "rewards/margins": 0.2687830924987793, "rewards/rejected": -1.3922805786132812, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 5.517834717217908, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.2933315634727478, "logits/rejected": -0.10774131864309311, "logps/chosen": -1.26057767868042, "logps/rejected": -1.4122060537338257, "loss": 1.2606, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.26057767868042, "rewards/margins": 0.1516282856464386, "rewards/rejected": -1.4122060537338257, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 5.287679437373956, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.18297800421714783, "logits/rejected": -0.057443369179964066, "logps/chosen": -1.2547472715377808, "logps/rejected": -1.3650306463241577, "loss": 1.2547, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2547472715377808, "rewards/margins": 0.11028333753347397, "rewards/rejected": -1.3650306463241577, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 6.016327774442574, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.15936723351478577, "logits/rejected": -0.01795923337340355, "logps/chosen": -1.191485047340393, "logps/rejected": -1.4031237363815308, "loss": 1.1915, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.191485047340393, "rewards/margins": 0.21163871884346008, "rewards/rejected": -1.4031237363815308, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 4.840057684225594, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.1768343597650528, "logits/rejected": -0.050511498004198074, "logps/chosen": -1.2050405740737915, "logps/rejected": -1.2851245403289795, "loss": 1.205, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2050405740737915, "rewards/margins": 0.0800839439034462, "rewards/rejected": -1.2851245403289795, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 7.472350712940947, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2845076322555542, "logits/rejected": -0.14164681732654572, "logps/chosen": -1.2480602264404297, "logps/rejected": -1.3078038692474365, "loss": 1.2481, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2480602264404297, "rewards/margins": 0.05974351242184639, "rewards/rejected": -1.3078038692474365, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 6.200933701234593, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.18069681525230408, "logits/rejected": -0.16745315492153168, "logps/chosen": -1.1093075275421143, "logps/rejected": -1.219215989112854, "loss": 1.1093, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1093075275421143, "rewards/margins": 0.1099083423614502, "rewards/rejected": -1.219215989112854, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 6.537320071227244, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.2183917760848999, "logits/rejected": -0.11956784874200821, "logps/chosen": -1.1798574924468994, "logps/rejected": -1.3506397008895874, "loss": 1.1799, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1798574924468994, "rewards/margins": 0.17078223824501038, "rewards/rejected": -1.3506397008895874, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 5.80385308484847, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.19175554811954498, "logits/rejected": -0.0773371234536171, "logps/chosen": -1.1103461980819702, "logps/rejected": -1.2619879245758057, "loss": 1.1103, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1103461980819702, "rewards/margins": 0.15164175629615784, "rewards/rejected": -1.2619879245758057, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 5.549714582917568, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.228239968419075, "logits/rejected": -0.11980322748422623, "logps/chosen": -1.1617170572280884, "logps/rejected": -1.3236258029937744, "loss": 1.1617, "rewards/accuracies": 0.5, "rewards/chosen": -1.1617170572280884, "rewards/margins": 0.16190890967845917, "rewards/rejected": -1.3236258029937744, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 5.590124888668569, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.14101819694042206, "logits/rejected": -0.07082847505807877, "logps/chosen": -1.1958640813827515, "logps/rejected": -1.2869757413864136, "loss": 1.1959, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1958640813827515, "rewards/margins": 0.09111160784959793, "rewards/rejected": -1.2869757413864136, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 4.982828115293815, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.20357391238212585, "logits/rejected": -0.01986112631857395, "logps/chosen": -1.1584810018539429, "logps/rejected": -1.2918038368225098, "loss": 1.1585, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1584810018539429, "rewards/margins": 0.13332276046276093, "rewards/rejected": -1.2918038368225098, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 6.127811030460591, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.05941629409790039, "logits/rejected": -0.07080470025539398, "logps/chosen": -1.1782751083374023, "logps/rejected": -1.3218450546264648, "loss": 1.1783, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1782751083374023, "rewards/margins": 0.1435699462890625, "rewards/rejected": -1.3218450546264648, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 5.4374911961036485, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.16850116848945618, "logits/rejected": -0.0896964892745018, "logps/chosen": -1.1930769681930542, "logps/rejected": -1.289036512374878, "loss": 1.1931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1930769681930542, "rewards/margins": 0.09595947712659836, "rewards/rejected": -1.289036512374878, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 6.015298615293008, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.08211959898471832, "logits/rejected": 0.0013775527477264404, "logps/chosen": -1.1046782732009888, "logps/rejected": -1.2789340019226074, "loss": 1.1047, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1046782732009888, "rewards/margins": 0.17425575852394104, "rewards/rejected": -1.2789340019226074, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 4.637842499080636, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.17147886753082275, "logits/rejected": -0.046903591603040695, "logps/chosen": -1.2051151990890503, "logps/rejected": -1.4114007949829102, "loss": 1.2051, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2051151990890503, "rewards/margins": 0.2062855213880539, "rewards/rejected": -1.4114007949829102, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 5.2640283646939094, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.20057567954063416, "logits/rejected": -0.07562495768070221, "logps/chosen": -1.2186086177825928, "logps/rejected": -1.3974798917770386, "loss": 1.2186, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2186086177825928, "rewards/margins": 0.17887134850025177, "rewards/rejected": -1.3974798917770386, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 4.181878874884371, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.17249837517738342, "logits/rejected": -0.07560286670923233, "logps/chosen": -1.1493269205093384, "logps/rejected": -1.314410924911499, "loss": 1.1493, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1493269205093384, "rewards/margins": 0.16508421301841736, "rewards/rejected": -1.314410924911499, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 4.128799724690311, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.2524082064628601, "logits/rejected": -0.15686941146850586, "logps/chosen": -1.242990255355835, "logps/rejected": -1.3105370998382568, "loss": 1.243, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.242990255355835, "rewards/margins": 0.06754690408706665, "rewards/rejected": -1.3105370998382568, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 6.003088774098173, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.12376214563846588, "logits/rejected": -0.08375285565853119, "logps/chosen": -1.1806390285491943, "logps/rejected": -1.3657233715057373, "loss": 1.1806, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1806390285491943, "rewards/margins": 0.185084268450737, "rewards/rejected": -1.3657233715057373, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 6.183488591719491, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.1445099115371704, "logits/rejected": -0.11886302381753922, "logps/chosen": -1.0970757007598877, "logps/rejected": -1.406111240386963, "loss": 1.0971, "rewards/accuracies": 0.625, "rewards/chosen": -1.0970757007598877, "rewards/margins": 0.30903545022010803, "rewards/rejected": -1.406111240386963, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 6.291848926842666, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.18734495341777802, "logits/rejected": -0.05332733318209648, "logps/chosen": -1.231751799583435, "logps/rejected": -1.3194470405578613, "loss": 1.2318, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.231751799583435, "rewards/margins": 0.08769532293081284, "rewards/rejected": -1.3194470405578613, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 4.9571819519215925, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.09875687211751938, "logits/rejected": 0.024384763091802597, "logps/chosen": -1.1812176704406738, "logps/rejected": -1.2558825016021729, "loss": 1.1812, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1812176704406738, "rewards/margins": 0.0746648833155632, "rewards/rejected": -1.2558825016021729, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 5.421660152611321, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.2282371073961258, "logits/rejected": -0.1652667075395584, "logps/chosen": -1.1318384408950806, "logps/rejected": -1.2785104513168335, "loss": 1.1318, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1318384408950806, "rewards/margins": 0.1466720998287201, "rewards/rejected": -1.2785104513168335, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 7.606659837184377, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.12924665212631226, "logits/rejected": -0.036776043474674225, "logps/chosen": -1.218353033065796, "logps/rejected": -1.4183902740478516, "loss": 1.2184, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.218353033065796, "rewards/margins": 0.20003733038902283, "rewards/rejected": -1.4183902740478516, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 5.747353066552233, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.11502889543771744, "logits/rejected": -0.03595975041389465, "logps/chosen": -1.13552725315094, "logps/rejected": -1.3473083972930908, "loss": 1.1355, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.13552725315094, "rewards/margins": 0.2117810696363449, "rewards/rejected": -1.3473083972930908, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 4.970435932664597, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.16472622752189636, "logits/rejected": -0.0940287858247757, "logps/chosen": -1.1654224395751953, "logps/rejected": -1.3639450073242188, "loss": 1.1654, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1654224395751953, "rewards/margins": 0.19852247834205627, "rewards/rejected": -1.3639450073242188, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 4.3713270049885065, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.08634035289287567, "logits/rejected": -0.02884233370423317, "logps/chosen": -1.0747014284133911, "logps/rejected": -1.290486454963684, "loss": 1.0747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0747014284133911, "rewards/margins": 0.2157849818468094, "rewards/rejected": -1.290486454963684, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 5.445045856243949, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.2231409251689911, "logits/rejected": -0.17726781964302063, "logps/chosen": -1.079582929611206, "logps/rejected": -1.3526421785354614, "loss": 1.0796, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.079582929611206, "rewards/margins": 0.2730591893196106, "rewards/rejected": -1.3526421785354614, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 7.900878000303733, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.2267974615097046, "logits/rejected": -0.1466803103685379, "logps/chosen": -1.1649091243743896, "logps/rejected": -1.3364784717559814, "loss": 1.1649, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1649091243743896, "rewards/margins": 0.17156915366649628, "rewards/rejected": -1.3364784717559814, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 4.72574473498025, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.23662757873535156, "logits/rejected": -0.13621188700199127, "logps/chosen": -1.1582509279251099, "logps/rejected": -1.2768819332122803, "loss": 1.1583, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1582509279251099, "rewards/margins": 0.11863088607788086, "rewards/rejected": -1.2768819332122803, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 5.325732289340437, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.30555909872055054, "logits/rejected": -0.11120004951953888, "logps/chosen": -1.182663917541504, "logps/rejected": -1.4078179597854614, "loss": 1.1827, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.182663917541504, "rewards/margins": 0.22515395283699036, "rewards/rejected": -1.4078179597854614, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 5.96752840877602, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.1824541687965393, "logits/rejected": -0.09727810323238373, "logps/chosen": -1.1564702987670898, "logps/rejected": -1.3188977241516113, "loss": 1.1565, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1564702987670898, "rewards/margins": 0.16242745518684387, "rewards/rejected": -1.3188977241516113, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.03245145455002785, "eval_logits/rejected": 0.09523612260818481, "eval_logps/chosen": -1.263055682182312, "eval_logps/rejected": -1.388040542602539, "eval_loss": 1.2633153200149536, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.263055682182312, "eval_rewards/margins": 0.12498495727777481, "eval_rewards/rejected": -1.388040542602539, "eval_runtime": 40.45, "eval_samples_per_second": 33.251, "eval_steps_per_second": 8.331, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 5.917871018988046, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.20923061668872833, "logits/rejected": -0.03972760587930679, "logps/chosen": -1.315399408340454, "logps/rejected": -1.4226207733154297, "loss": 1.3154, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.315399408340454, "rewards/margins": 0.10722143948078156, "rewards/rejected": -1.4226207733154297, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 4.671711582587529, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.2011118233203888, "logits/rejected": -0.13230469822883606, "logps/chosen": -1.230077862739563, "logps/rejected": -1.3223201036453247, "loss": 1.2301, "rewards/accuracies": 0.5625, "rewards/chosen": -1.230077862739563, "rewards/margins": 0.09224215149879456, "rewards/rejected": -1.3223201036453247, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 5.501376549922748, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.2410993129014969, "logits/rejected": -0.05041719228029251, "logps/chosen": -1.2081477642059326, "logps/rejected": -1.3148329257965088, "loss": 1.2081, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2081477642059326, "rewards/margins": 0.10668500512838364, "rewards/rejected": -1.3148329257965088, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 4.44321924349562, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.2821759581565857, "logits/rejected": -0.17517781257629395, "logps/chosen": -1.2414942979812622, "logps/rejected": -1.3300151824951172, "loss": 1.2415, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2414942979812622, "rewards/margins": 0.08852101862430573, "rewards/rejected": -1.3300151824951172, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 5.944268441876082, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.190938800573349, "logits/rejected": -0.1164470911026001, "logps/chosen": -1.1265842914581299, "logps/rejected": -1.2535488605499268, "loss": 1.1266, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1265842914581299, "rewards/margins": 0.1269644796848297, "rewards/rejected": -1.2535488605499268, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 8.61224991706566, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.23914062976837158, "logits/rejected": -0.18171271681785583, "logps/chosen": -1.138273000717163, "logps/rejected": -1.3496696949005127, "loss": 1.1383, "rewards/accuracies": 0.5625, "rewards/chosen": -1.138273000717163, "rewards/margins": 0.2113966941833496, "rewards/rejected": -1.3496696949005127, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 6.41043437229576, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.21312227845191956, "logits/rejected": -0.18235036730766296, "logps/chosen": -1.1680612564086914, "logps/rejected": -1.2263541221618652, "loss": 1.1681, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1680612564086914, "rewards/margins": 0.058292876929044724, "rewards/rejected": -1.2263541221618652, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 4.956756764135901, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.2107177972793579, "logits/rejected": -0.18253345787525177, "logps/chosen": -1.1320914030075073, "logps/rejected": -1.2208181619644165, "loss": 1.1321, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1320914030075073, "rewards/margins": 0.08872680366039276, "rewards/rejected": -1.2208181619644165, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 6.772029070979287, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.15235461294651031, "logits/rejected": -0.05562242120504379, "logps/chosen": -1.231483817100525, "logps/rejected": -1.3283480405807495, "loss": 1.2315, "rewards/accuracies": 0.53125, "rewards/chosen": -1.231483817100525, "rewards/margins": 0.09686414897441864, "rewards/rejected": -1.3283480405807495, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 4.144746571054171, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.23558945953845978, "logits/rejected": -0.1623959094285965, "logps/chosen": -1.212146520614624, "logps/rejected": -1.288278579711914, "loss": 1.2121, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.212146520614624, "rewards/margins": 0.07613208144903183, "rewards/rejected": -1.288278579711914, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 5.396859548537507, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.14681723713874817, "logits/rejected": -0.10661210119724274, "logps/chosen": -1.1922398805618286, "logps/rejected": -1.337632417678833, "loss": 1.1922, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1922398805618286, "rewards/margins": 0.14539244771003723, "rewards/rejected": -1.337632417678833, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 6.441986524867409, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.06384416669607162, "logits/rejected": -5.4477528465213254e-05, "logps/chosen": -1.1192125082015991, "logps/rejected": -1.181994080543518, "loss": 1.1192, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1192125082015991, "rewards/margins": 0.06278140842914581, "rewards/rejected": -1.181994080543518, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 6.41128959452751, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.2593405544757843, "logits/rejected": -0.19250085949897766, "logps/chosen": -1.1923651695251465, "logps/rejected": -1.371565580368042, "loss": 1.1924, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1923651695251465, "rewards/margins": 0.17920047044754028, "rewards/rejected": -1.371565580368042, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 6.359691522394878, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.12252690643072128, "logits/rejected": 0.039988771080970764, "logps/chosen": -1.144848346710205, "logps/rejected": -1.300666093826294, "loss": 1.1448, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.144848346710205, "rewards/margins": 0.1558176577091217, "rewards/rejected": -1.300666093826294, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 5.688359005855353, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.20154765248298645, "logits/rejected": -0.06380189955234528, "logps/chosen": -1.236673355102539, "logps/rejected": -1.3484394550323486, "loss": 1.2367, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.236673355102539, "rewards/margins": 0.11176598072052002, "rewards/rejected": -1.3484394550323486, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 4.766007014594416, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.19487784802913666, "logits/rejected": -0.09445010870695114, "logps/chosen": -1.2235043048858643, "logps/rejected": -1.342054843902588, "loss": 1.2235, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2235043048858643, "rewards/margins": 0.1185503751039505, "rewards/rejected": -1.342054843902588, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 4.565977882369312, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.13624389469623566, "logits/rejected": -0.05315814167261124, "logps/chosen": -1.2009947299957275, "logps/rejected": -1.2778642177581787, "loss": 1.201, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2009947299957275, "rewards/margins": 0.07686949521303177, "rewards/rejected": -1.2778642177581787, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 4.17017179791477, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.21069972217082977, "logits/rejected": -0.06705079227685928, "logps/chosen": -1.1799371242523193, "logps/rejected": -1.2886154651641846, "loss": 1.1799, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1799371242523193, "rewards/margins": 0.10867851972579956, "rewards/rejected": -1.2886154651641846, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 5.166713567407958, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.1977318823337555, "logits/rejected": -0.13568253815174103, "logps/chosen": -1.263792872428894, "logps/rejected": -1.3097572326660156, "loss": 1.2638, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.263792872428894, "rewards/margins": 0.045964501798152924, "rewards/rejected": -1.3097572326660156, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 5.7448021282127595, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.1364963799715042, "logits/rejected": -0.04255690425634384, "logps/chosen": -1.1069259643554688, "logps/rejected": -1.2695804834365845, "loss": 1.1069, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1069259643554688, "rewards/margins": 0.1626545637845993, "rewards/rejected": -1.2695804834365845, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 5.088773880128279, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.1848808377981186, "logits/rejected": -0.02593042515218258, "logps/chosen": -1.2358434200286865, "logps/rejected": -1.3136072158813477, "loss": 1.2358, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2358434200286865, "rewards/margins": 0.07776384055614471, "rewards/rejected": -1.3136072158813477, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 6.283876304828875, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.12379492819309235, "logits/rejected": 0.008387814275920391, "logps/chosen": -1.158935785293579, "logps/rejected": -1.269468069076538, "loss": 1.1589, "rewards/accuracies": 0.53125, "rewards/chosen": -1.158935785293579, "rewards/margins": 0.11053229868412018, "rewards/rejected": -1.269468069076538, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 4.563121564016602, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.11741801351308823, "logits/rejected": -0.09790245443582535, "logps/chosen": -1.1780049800872803, "logps/rejected": -1.354170560836792, "loss": 1.178, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1780049800872803, "rewards/margins": 0.17616555094718933, "rewards/rejected": -1.354170560836792, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 3.860873527140288, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.15481719374656677, "logits/rejected": -0.11571405082941055, "logps/chosen": -1.1316372156143188, "logps/rejected": -1.28236722946167, "loss": 1.1316, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1316372156143188, "rewards/margins": 0.15073008835315704, "rewards/rejected": -1.28236722946167, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 3.7757516588823377, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.1941637545824051, "logits/rejected": -0.03832678869366646, "logps/chosen": -1.159583568572998, "logps/rejected": -1.2501963376998901, "loss": 1.1596, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.159583568572998, "rewards/margins": 0.09061276912689209, "rewards/rejected": -1.2501963376998901, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 5.802584353800864, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.16576412320137024, "logits/rejected": -0.1054341197013855, "logps/chosen": -1.1660542488098145, "logps/rejected": -1.3407056331634521, "loss": 1.1661, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1660542488098145, "rewards/margins": 0.17465126514434814, "rewards/rejected": -1.3407056331634521, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.334710363395882, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.17607860267162323, "logits/rejected": -0.18582521378993988, "logps/chosen": -1.2012430429458618, "logps/rejected": -1.3361905813217163, "loss": 1.2012, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2012430429458618, "rewards/margins": 0.13494762778282166, "rewards/rejected": -1.3361905813217163, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 5.025162889950114, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.2650391161441803, "logits/rejected": -0.20038311183452606, "logps/chosen": -1.1358247995376587, "logps/rejected": -1.3236615657806396, "loss": 1.1358, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1358247995376587, "rewards/margins": 0.1878366321325302, "rewards/rejected": -1.3236615657806396, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 5.4754308707512465, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.1314409077167511, "logits/rejected": -0.024513447657227516, "logps/chosen": -1.224051833152771, "logps/rejected": -1.4068472385406494, "loss": 1.2241, "rewards/accuracies": 0.5625, "rewards/chosen": -1.224051833152771, "rewards/margins": 0.18279533088207245, "rewards/rejected": -1.4068472385406494, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 4.869676287640066, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.13617876172065735, "logits/rejected": -0.07552225887775421, "logps/chosen": -1.1191397905349731, "logps/rejected": -1.2858220338821411, "loss": 1.1191, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1191397905349731, "rewards/margins": 0.16668212413787842, "rewards/rejected": -1.2858220338821411, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 6.64323458803904, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.10072729736566544, "logits/rejected": -0.13225862383842468, "logps/chosen": -1.1682339906692505, "logps/rejected": -1.3027307987213135, "loss": 1.1682, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1682339906692505, "rewards/margins": 0.13449694216251373, "rewards/rejected": -1.3027307987213135, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 6.056821268638129, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.20030978322029114, "logits/rejected": -0.08339033275842667, "logps/chosen": -1.2147990465164185, "logps/rejected": -1.3408174514770508, "loss": 1.2148, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2147990465164185, "rewards/margins": 0.12601836025714874, "rewards/rejected": -1.3408174514770508, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 5.088174738992541, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.19586379826068878, "logits/rejected": -0.1310482919216156, "logps/chosen": -1.2152433395385742, "logps/rejected": -1.35951828956604, "loss": 1.2152, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2152433395385742, "rewards/margins": 0.14427514374256134, "rewards/rejected": -1.35951828956604, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 6.018313240346942, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.10551221668720245, "logits/rejected": -0.07057984918355942, "logps/chosen": -1.1957372426986694, "logps/rejected": -1.3226388692855835, "loss": 1.1957, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1957372426986694, "rewards/margins": 0.12690161168575287, "rewards/rejected": -1.3226388692855835, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 4.857309497911004, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.1991201639175415, "logits/rejected": -0.11777134239673615, "logps/chosen": -1.1767120361328125, "logps/rejected": -1.2918537855148315, "loss": 1.1767, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1767120361328125, "rewards/margins": 0.11514176428318024, "rewards/rejected": -1.2918537855148315, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 5.029139737205963, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.20967650413513184, "logits/rejected": -0.033971358090639114, "logps/chosen": -1.1271189451217651, "logps/rejected": -1.3383524417877197, "loss": 1.1271, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1271189451217651, "rewards/margins": 0.211233451962471, "rewards/rejected": -1.3383524417877197, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 5.42303220784513, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.2150374948978424, "logits/rejected": -0.2195064276456833, "logps/chosen": -1.0948699712753296, "logps/rejected": -1.289721131324768, "loss": 1.0949, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0948699712753296, "rewards/margins": 0.1948511004447937, "rewards/rejected": -1.289721131324768, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 6.725403071329438, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.12371613830327988, "logits/rejected": -0.03627396374940872, "logps/chosen": -1.1774771213531494, "logps/rejected": -1.2602083683013916, "loss": 1.1775, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1774771213531494, "rewards/margins": 0.08273126184940338, "rewards/rejected": -1.2602083683013916, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 4.519000355764333, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.29426559805870056, "logits/rejected": -0.1722303032875061, "logps/chosen": -1.2122632265090942, "logps/rejected": -1.2808400392532349, "loss": 1.2123, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2122632265090942, "rewards/margins": 0.06857667863368988, "rewards/rejected": -1.2808400392532349, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 6.518170025255868, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.057030148804187775, "logits/rejected": -0.0801350325345993, "logps/chosen": -1.2187111377716064, "logps/rejected": -1.3684864044189453, "loss": 1.2187, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2187111377716064, "rewards/margins": 0.14977531135082245, "rewards/rejected": -1.3684864044189453, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 6.209205069048698, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.26415205001831055, "logits/rejected": -0.20717386901378632, "logps/chosen": -1.1587183475494385, "logps/rejected": -1.3629096746444702, "loss": 1.1587, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1587183475494385, "rewards/margins": 0.20419135689735413, "rewards/rejected": -1.3629096746444702, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 5.087960555423488, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.21760039031505585, "logits/rejected": -0.15472963452339172, "logps/chosen": -1.1644470691680908, "logps/rejected": -1.3668749332427979, "loss": 1.1644, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1644470691680908, "rewards/margins": 0.20242801308631897, "rewards/rejected": -1.3668749332427979, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 5.147371002868736, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.21174247562885284, "logits/rejected": -0.12361390888690948, "logps/chosen": -1.1994844675064087, "logps/rejected": -1.2843894958496094, "loss": 1.1995, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1994844675064087, "rewards/margins": 0.08490513265132904, "rewards/rejected": -1.2843894958496094, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 5.583780723098238, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.22364814579486847, "logits/rejected": -0.06947867572307587, "logps/chosen": -1.193168044090271, "logps/rejected": -1.2863361835479736, "loss": 1.1932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.193168044090271, "rewards/margins": 0.09316817671060562, "rewards/rejected": -1.2863361835479736, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 4.581723843600314, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.23708435893058777, "logits/rejected": -0.11690767854452133, "logps/chosen": -1.2185699939727783, "logps/rejected": -1.3994337320327759, "loss": 1.2186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2185699939727783, "rewards/margins": 0.18086391687393188, "rewards/rejected": -1.3994337320327759, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 5.306162029800453, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.24164751172065735, "logits/rejected": -0.19264449179172516, "logps/chosen": -1.223801612854004, "logps/rejected": -1.4605543613433838, "loss": 1.2238, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.223801612854004, "rewards/margins": 0.2367527186870575, "rewards/rejected": -1.4605543613433838, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 4.432003919030074, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.1994072049856186, "logits/rejected": -0.09726767241954803, "logps/chosen": -1.2524727582931519, "logps/rejected": -1.258910059928894, "loss": 1.2525, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2524727582931519, "rewards/margins": 0.006437242031097412, "rewards/rejected": -1.258910059928894, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 5.408652015840796, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.19136342406272888, "logits/rejected": -0.10556334257125854, "logps/chosen": -1.2034322023391724, "logps/rejected": -1.2472929954528809, "loss": 1.2034, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2034322023391724, "rewards/margins": 0.0438607856631279, "rewards/rejected": -1.2472929954528809, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 5.746583304786529, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.25075429677963257, "logits/rejected": -0.16390371322631836, "logps/chosen": -1.2042067050933838, "logps/rejected": -1.2306492328643799, "loss": 1.2042, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2042067050933838, "rewards/margins": 0.026442628353834152, "rewards/rejected": -1.2306492328643799, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 4.25518246668224, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.1625361144542694, "logits/rejected": -0.0362481027841568, "logps/chosen": -1.1860342025756836, "logps/rejected": -1.2699036598205566, "loss": 1.186, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1860342025756836, "rewards/margins": 0.08386919647455215, "rewards/rejected": -1.2699036598205566, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 6.835976615270388, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.22955667972564697, "logits/rejected": -0.05028006434440613, "logps/chosen": -1.1997634172439575, "logps/rejected": -1.3339018821716309, "loss": 1.1998, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1997634172439575, "rewards/margins": 0.13413837552070618, "rewards/rejected": -1.3339018821716309, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 5.996764864103618, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.18680524826049805, "logits/rejected": -0.08410540968179703, "logps/chosen": -1.1264101266860962, "logps/rejected": -1.3488953113555908, "loss": 1.1264, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1264101266860962, "rewards/margins": 0.2224852293729782, "rewards/rejected": -1.3488953113555908, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 5.205292219417718, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.2690671980381012, "logits/rejected": -0.14474911987781525, "logps/chosen": -1.1883963346481323, "logps/rejected": -1.3061914443969727, "loss": 1.1884, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1883963346481323, "rewards/margins": 0.11779503524303436, "rewards/rejected": -1.3061914443969727, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 4.594325184330231, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.10278084129095078, "logits/rejected": 0.09227720648050308, "logps/chosen": -1.1948535442352295, "logps/rejected": -1.4687716960906982, "loss": 1.1949, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1948535442352295, "rewards/margins": 0.27391815185546875, "rewards/rejected": -1.4687716960906982, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 7.256001456063663, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.16385479271411896, "logits/rejected": -0.10938102006912231, "logps/chosen": -1.244080662727356, "logps/rejected": -1.3825181722640991, "loss": 1.2441, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.244080662727356, "rewards/margins": 0.13843746483325958, "rewards/rejected": -1.3825181722640991, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 5.20160358445378, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.17534297704696655, "logits/rejected": -0.12265920639038086, "logps/chosen": -1.1491941213607788, "logps/rejected": -1.2473890781402588, "loss": 1.1492, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1491941213607788, "rewards/margins": 0.09819485247135162, "rewards/rejected": -1.2473890781402588, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 8.659736577980274, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.10385161638259888, "logits/rejected": -0.1160934567451477, "logps/chosen": -1.1208994388580322, "logps/rejected": -1.286213994026184, "loss": 1.1209, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1208994388580322, "rewards/margins": 0.16531459987163544, "rewards/rejected": -1.286213994026184, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 5.686713706526671, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.2318524420261383, "logits/rejected": -0.09164854884147644, "logps/chosen": -1.2217899560928345, "logps/rejected": -1.27787184715271, "loss": 1.2218, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2217899560928345, "rewards/margins": 0.056081850081682205, "rewards/rejected": -1.27787184715271, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 8.256090207704085, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.1780955195426941, "logits/rejected": -0.15421970188617706, "logps/chosen": -1.1737844944000244, "logps/rejected": -1.3340444564819336, "loss": 1.1738, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1737844944000244, "rewards/margins": 0.16025999188423157, "rewards/rejected": -1.3340444564819336, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 6.5427003215129815, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.1916915476322174, "logits/rejected": -0.11251250654459, "logps/chosen": -1.1834797859191895, "logps/rejected": -1.247879147529602, "loss": 1.1835, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1834797859191895, "rewards/margins": 0.06439925730228424, "rewards/rejected": -1.247879147529602, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 5.60248286454687, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.24063661694526672, "logits/rejected": -0.14305540919303894, "logps/chosen": -1.150460958480835, "logps/rejected": -1.3252336978912354, "loss": 1.1505, "rewards/accuracies": 0.5625, "rewards/chosen": -1.150460958480835, "rewards/margins": 0.17477254569530487, "rewards/rejected": -1.3252336978912354, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 5.558118904291693, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.20420944690704346, "logits/rejected": -0.05714661628007889, "logps/chosen": -1.1815992593765259, "logps/rejected": -1.3225786685943604, "loss": 1.1816, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1815992593765259, "rewards/margins": 0.14097949862480164, "rewards/rejected": -1.3225786685943604, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 4.118527702095651, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.19265493750572205, "logits/rejected": -0.08362994343042374, "logps/chosen": -1.1631594896316528, "logps/rejected": -1.2751071453094482, "loss": 1.1632, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1631594896316528, "rewards/margins": 0.11194761842489243, "rewards/rejected": -1.2751071453094482, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 6.012671143985419, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.20080161094665527, "logits/rejected": -0.11649712175130844, "logps/chosen": -1.2434377670288086, "logps/rejected": -1.256551742553711, "loss": 1.2434, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2434377670288086, "rewards/margins": 0.013113932684063911, "rewards/rejected": -1.256551742553711, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 7.298088160666142, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.2159946858882904, "logits/rejected": -0.09613845497369766, "logps/chosen": -1.1608588695526123, "logps/rejected": -1.3301284313201904, "loss": 1.1609, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1608588695526123, "rewards/margins": 0.16926947236061096, "rewards/rejected": -1.3301284313201904, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 4.5412054452994735, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.16238698363304138, "logits/rejected": -0.08616816997528076, "logps/chosen": -1.2593778371810913, "logps/rejected": -1.2940012216567993, "loss": 1.2594, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2593778371810913, "rewards/margins": 0.034623418003320694, "rewards/rejected": -1.2940012216567993, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 4.417348524834909, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.15392228960990906, "logits/rejected": -0.11155478656291962, "logps/chosen": -1.2196530103683472, "logps/rejected": -1.3328216075897217, "loss": 1.2197, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2196530103683472, "rewards/margins": 0.1131686344742775, "rewards/rejected": -1.3328216075897217, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 5.673892041870174, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.18548209965229034, "logits/rejected": -0.05614793300628662, "logps/chosen": -1.0950952768325806, "logps/rejected": -1.3670198917388916, "loss": 1.0951, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0950952768325806, "rewards/margins": 0.2719246745109558, "rewards/rejected": -1.3670198917388916, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 6.893526138109756, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.22514739632606506, "logits/rejected": -0.15702608227729797, "logps/chosen": -1.2859506607055664, "logps/rejected": -1.2575256824493408, "loss": 1.286, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2859506607055664, "rewards/margins": -0.028425127267837524, "rewards/rejected": -1.2575256824493408, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 5.2389914504174, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.14314690232276917, "logits/rejected": -0.09649176895618439, "logps/chosen": -1.17621910572052, "logps/rejected": -1.3426140546798706, "loss": 1.1762, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.17621910572052, "rewards/margins": 0.16639479994773865, "rewards/rejected": -1.3426140546798706, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 4.901724155915384, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.18957838416099548, "logits/rejected": -0.12357129156589508, "logps/chosen": -1.2257616519927979, "logps/rejected": -1.3584731817245483, "loss": 1.2258, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2257616519927979, "rewards/margins": 0.1327117532491684, "rewards/rejected": -1.3584731817245483, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 6.128172156475164, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.25931692123413086, "logits/rejected": -0.09256480634212494, "logps/chosen": -1.2789193391799927, "logps/rejected": -1.32516610622406, "loss": 1.2789, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2789193391799927, "rewards/margins": 0.0462469756603241, "rewards/rejected": -1.32516610622406, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 5.721297770234401, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.11772127449512482, "logits/rejected": -0.12845325469970703, "logps/chosen": -1.1400355100631714, "logps/rejected": -1.3242669105529785, "loss": 1.14, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1400355100631714, "rewards/margins": 0.18423128128051758, "rewards/rejected": -1.3242669105529785, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 7.385272795792124, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.15591342747211456, "logits/rejected": -0.048410721123218536, "logps/chosen": -1.0946487188339233, "logps/rejected": -1.3952341079711914, "loss": 1.0946, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0946487188339233, "rewards/margins": 0.30058538913726807, "rewards/rejected": -1.3952341079711914, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 4.198642775178046, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.28586989641189575, "logits/rejected": -0.1489974558353424, "logps/chosen": -1.2139241695404053, "logps/rejected": -1.3466516733169556, "loss": 1.2139, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2139241695404053, "rewards/margins": 0.13272757828235626, "rewards/rejected": -1.3466516733169556, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 5.0653702555425415, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.1976245790719986, "logits/rejected": -0.06773992627859116, "logps/chosen": -1.2631803750991821, "logps/rejected": -1.332170009613037, "loss": 1.2632, "rewards/accuracies": 0.5, "rewards/chosen": -1.2631803750991821, "rewards/margins": 0.0689895898103714, "rewards/rejected": -1.332170009613037, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 6.989565701770442, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2593960165977478, "logits/rejected": -0.14256739616394043, "logps/chosen": -1.168222427368164, "logps/rejected": -1.3941218852996826, "loss": 1.1682, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.168222427368164, "rewards/margins": 0.22589938342571259, "rewards/rejected": -1.3941218852996826, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 5.589744784340527, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.19785338640213013, "logits/rejected": -0.09946274012327194, "logps/chosen": -1.1713603734970093, "logps/rejected": -1.2832601070404053, "loss": 1.1714, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1713603734970093, "rewards/margins": 0.11189959198236465, "rewards/rejected": -1.2832601070404053, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 4.525650371473416, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.15938793122768402, "logits/rejected": -0.047026894986629486, "logps/chosen": -1.1929266452789307, "logps/rejected": -1.3973157405853271, "loss": 1.1929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1929266452789307, "rewards/margins": 0.20438921451568604, "rewards/rejected": -1.3973157405853271, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 4.976103193754043, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.269040584564209, "logits/rejected": -0.14696404337882996, "logps/chosen": -1.1515470743179321, "logps/rejected": -1.3573720455169678, "loss": 1.1515, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1515470743179321, "rewards/margins": 0.2058248519897461, "rewards/rejected": -1.3573720455169678, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.025052379816770554, "eval_logits/rejected": 0.08800262957811356, "eval_logps/chosen": -1.2626087665557861, "eval_logps/rejected": -1.386799931526184, "eval_loss": 1.2628657817840576, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.2626087665557861, "eval_rewards/margins": 0.12419126182794571, "eval_rewards/rejected": -1.386799931526184, "eval_runtime": 40.4523, "eval_samples_per_second": 33.249, "eval_steps_per_second": 8.331, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 5.545055389853662, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2709338366985321, "logits/rejected": -0.2356809377670288, "logps/chosen": -1.167567491531372, "logps/rejected": -1.2585251331329346, "loss": 1.1676, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.167567491531372, "rewards/margins": 0.09095760434865952, "rewards/rejected": -1.2585251331329346, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 5.55550743344495, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.19681081175804138, "logits/rejected": -0.17477372288703918, "logps/chosen": -1.2580926418304443, "logps/rejected": -1.4473406076431274, "loss": 1.2581, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2580926418304443, "rewards/margins": 0.1892479509115219, "rewards/rejected": -1.4473406076431274, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 5.7198190573908745, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.15117819607257843, "logits/rejected": -0.028177103027701378, "logps/chosen": -1.196979284286499, "logps/rejected": -1.3209823369979858, "loss": 1.197, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.196979284286499, "rewards/margins": 0.12400289624929428, "rewards/rejected": -1.3209823369979858, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 4.9871348561556355, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.1862148940563202, "logits/rejected": -0.12906624376773834, "logps/chosen": -1.2312976121902466, "logps/rejected": -1.2678049802780151, "loss": 1.2313, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2312976121902466, "rewards/margins": 0.03650742024183273, "rewards/rejected": -1.2678049802780151, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 4.987483271107021, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.2000734508037567, "logits/rejected": -0.10463760048151016, "logps/chosen": -1.1580575704574585, "logps/rejected": -1.3520044088363647, "loss": 1.1581, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1580575704574585, "rewards/margins": 0.19394680857658386, "rewards/rejected": -1.3520044088363647, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 4.514794580409299, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.19761376082897186, "logits/rejected": -0.0754212960600853, "logps/chosen": -1.2426931858062744, "logps/rejected": -1.3538594245910645, "loss": 1.2427, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2426931858062744, "rewards/margins": 0.11116622388362885, "rewards/rejected": -1.3538594245910645, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 5.017376025800371, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2842445969581604, "logits/rejected": -0.1112985759973526, "logps/chosen": -1.1290119886398315, "logps/rejected": -1.3068126440048218, "loss": 1.129, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1290119886398315, "rewards/margins": 0.17780065536499023, "rewards/rejected": -1.3068126440048218, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 5.282790183986984, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.2671864628791809, "logits/rejected": -0.11369846016168594, "logps/chosen": -1.2349573373794556, "logps/rejected": -1.2931219339370728, "loss": 1.235, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2349573373794556, "rewards/margins": 0.05816468596458435, "rewards/rejected": -1.2931219339370728, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 5.442146699447444, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.23671261966228485, "logits/rejected": -0.10051093250513077, "logps/chosen": -1.1816486120224, "logps/rejected": -1.2978382110595703, "loss": 1.1816, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1816486120224, "rewards/margins": 0.11618933826684952, "rewards/rejected": -1.2978382110595703, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 7.000582511478185, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.16093024611473083, "logits/rejected": -0.06658019870519638, "logps/chosen": -1.2254868745803833, "logps/rejected": -1.3245351314544678, "loss": 1.2255, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2254868745803833, "rewards/margins": 0.0990481898188591, "rewards/rejected": -1.3245351314544678, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 5.8977868803152935, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2522749900817871, "logits/rejected": -0.18404506146907806, "logps/chosen": -1.2199594974517822, "logps/rejected": -1.3153274059295654, "loss": 1.22, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2199594974517822, "rewards/margins": 0.0953679233789444, "rewards/rejected": -1.3153274059295654, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 5.724510851682513, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.27013611793518066, "logits/rejected": -0.09293849766254425, "logps/chosen": -1.1449134349822998, "logps/rejected": -1.3478654623031616, "loss": 1.1449, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1449134349822998, "rewards/margins": 0.2029520720243454, "rewards/rejected": -1.3478654623031616, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 5.375343586642596, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.23647236824035645, "logits/rejected": -0.09246110916137695, "logps/chosen": -1.2394686937332153, "logps/rejected": -1.3365486860275269, "loss": 1.2395, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2394686937332153, "rewards/margins": 0.09708007425069809, "rewards/rejected": -1.3365486860275269, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 6.676580449432978, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.11881140619516373, "logits/rejected": -0.06621021032333374, "logps/chosen": -1.093145489692688, "logps/rejected": -1.2244962453842163, "loss": 1.0931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.093145489692688, "rewards/margins": 0.13135093450546265, "rewards/rejected": -1.2244962453842163, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 4.457376290205982, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.22149264812469482, "logits/rejected": -0.04207824543118477, "logps/chosen": -1.1407787799835205, "logps/rejected": -1.2985563278198242, "loss": 1.1408, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1407787799835205, "rewards/margins": 0.15777751803398132, "rewards/rejected": -1.2985563278198242, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 5.321210929300246, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.17002855241298676, "logits/rejected": -0.12662367522716522, "logps/chosen": -1.0983747243881226, "logps/rejected": -1.2042315006256104, "loss": 1.0984, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.0983747243881226, "rewards/margins": 0.10585677623748779, "rewards/rejected": -1.2042315006256104, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 5.187967372446003, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.20294001698493958, "logits/rejected": -0.11380920559167862, "logps/chosen": -1.2138562202453613, "logps/rejected": -1.362051248550415, "loss": 1.2139, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2138562202453613, "rewards/margins": 0.14819493889808655, "rewards/rejected": -1.362051248550415, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 5.162970507202348, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.23560830950737, "logits/rejected": -0.060730885714292526, "logps/chosen": -1.1752207279205322, "logps/rejected": -1.369462013244629, "loss": 1.1752, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1752207279205322, "rewards/margins": 0.1942412555217743, "rewards/rejected": -1.369462013244629, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 5.486699871617572, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.2745433747768402, "logits/rejected": -0.10672204196453094, "logps/chosen": -1.1594557762145996, "logps/rejected": -1.3019659519195557, "loss": 1.1595, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1594557762145996, "rewards/margins": 0.1425103098154068, "rewards/rejected": -1.3019659519195557, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 5.925029187196413, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.08818145096302032, "logits/rejected": -0.10068623721599579, "logps/chosen": -1.1927253007888794, "logps/rejected": -1.3079639673233032, "loss": 1.1927, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1927253007888794, "rewards/margins": 0.11523852497339249, "rewards/rejected": -1.3079639673233032, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 5.652946726462329, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.23610098659992218, "logits/rejected": -0.12320003658533096, "logps/chosen": -1.177761435508728, "logps/rejected": -1.4053454399108887, "loss": 1.1778, "rewards/accuracies": 0.59375, "rewards/chosen": -1.177761435508728, "rewards/margins": 0.22758395969867706, "rewards/rejected": -1.4053454399108887, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 6.875483769557765, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.25140270590782166, "logits/rejected": -0.14477795362472534, "logps/chosen": -1.117880940437317, "logps/rejected": -1.2741929292678833, "loss": 1.1179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.117880940437317, "rewards/margins": 0.15631206333637238, "rewards/rejected": -1.2741929292678833, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 4.885564503347514, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.22498326003551483, "logits/rejected": -0.17471103370189667, "logps/chosen": -1.199968695640564, "logps/rejected": -1.3614121675491333, "loss": 1.2, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.199968695640564, "rewards/margins": 0.16144347190856934, "rewards/rejected": -1.3614121675491333, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 5.968711097075049, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.1749815195798874, "logits/rejected": -0.11154153198003769, "logps/chosen": -1.160445213317871, "logps/rejected": -1.3862186670303345, "loss": 1.1604, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.160445213317871, "rewards/margins": 0.22577354311943054, "rewards/rejected": -1.3862186670303345, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 5.853072163561056, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.27086374163627625, "logits/rejected": -0.12352031469345093, "logps/chosen": -1.286721110343933, "logps/rejected": -1.4352104663848877, "loss": 1.2867, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.286721110343933, "rewards/margins": 0.1484893262386322, "rewards/rejected": -1.4352104663848877, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 4.861314656069504, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.2293623685836792, "logits/rejected": -0.16067025065422058, "logps/chosen": -1.218178391456604, "logps/rejected": -1.3252449035644531, "loss": 1.2182, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.218178391456604, "rewards/margins": 0.10706678777933121, "rewards/rejected": -1.3252449035644531, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 4.643449543881948, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.07495566457509995, "logits/rejected": 0.005963814444839954, "logps/chosen": -1.1440422534942627, "logps/rejected": -1.2799142599105835, "loss": 1.144, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1440422534942627, "rewards/margins": 0.13587205111980438, "rewards/rejected": -1.2799142599105835, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 5.772127080156402, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.0928221121430397, "logits/rejected": -0.050201691687107086, "logps/chosen": -1.1250402927398682, "logps/rejected": -1.260023832321167, "loss": 1.125, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1250402927398682, "rewards/margins": 0.1349835842847824, "rewards/rejected": -1.260023832321167, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 5.712049544323037, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.168679878115654, "logits/rejected": -0.055943358689546585, "logps/chosen": -1.1316163539886475, "logps/rejected": -1.268754005432129, "loss": 1.1316, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1316163539886475, "rewards/margins": 0.1371377855539322, "rewards/rejected": -1.268754005432129, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 6.205112961914227, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.2172313630580902, "logits/rejected": -0.13186350464820862, "logps/chosen": -1.1196324825286865, "logps/rejected": -1.320062279701233, "loss": 1.1196, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1196324825286865, "rewards/margins": 0.2004297971725464, "rewards/rejected": -1.320062279701233, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 6.0016502106171545, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.17630760371685028, "logits/rejected": -0.12117314338684082, "logps/chosen": -1.2467310428619385, "logps/rejected": -1.3134002685546875, "loss": 1.2467, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2467310428619385, "rewards/margins": 0.06666944921016693, "rewards/rejected": -1.3134002685546875, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 4.9914159920752885, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.20659470558166504, "logits/rejected": -0.06587819010019302, "logps/chosen": -1.1441843509674072, "logps/rejected": -1.259303092956543, "loss": 1.1442, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1441843509674072, "rewards/margins": 0.11511871963739395, "rewards/rejected": -1.259303092956543, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 5.047101660593324, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.16385190188884735, "logits/rejected": -0.11542239040136337, "logps/chosen": -1.235789179801941, "logps/rejected": -1.3302574157714844, "loss": 1.2358, "rewards/accuracies": 0.5625, "rewards/chosen": -1.235789179801941, "rewards/margins": 0.09446805715560913, "rewards/rejected": -1.3302574157714844, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 4.854398013259162, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.1311967670917511, "logits/rejected": -0.010543346405029297, "logps/chosen": -1.1886603832244873, "logps/rejected": -1.3756067752838135, "loss": 1.1887, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1886603832244873, "rewards/margins": 0.18694642186164856, "rewards/rejected": -1.3756067752838135, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 4.428041477202952, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.11873660236597061, "logits/rejected": -0.08495937287807465, "logps/chosen": -1.1800026893615723, "logps/rejected": -1.298760175704956, "loss": 1.18, "rewards/accuracies": 0.5, "rewards/chosen": -1.1800026893615723, "rewards/margins": 0.11875753104686737, "rewards/rejected": -1.298760175704956, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 6.248332744103985, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.21544376015663147, "logits/rejected": -0.1786438226699829, "logps/chosen": -1.1049554347991943, "logps/rejected": -1.3358160257339478, "loss": 1.105, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1049554347991943, "rewards/margins": 0.2308606207370758, "rewards/rejected": -1.3358160257339478, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 4.754013752858126, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.15220312774181366, "logits/rejected": -0.04685419797897339, "logps/chosen": -1.2529369592666626, "logps/rejected": -1.3102192878723145, "loss": 1.2529, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2529369592666626, "rewards/margins": 0.05728255584836006, "rewards/rejected": -1.3102192878723145, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 5.299788318853109, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.2627236247062683, "logits/rejected": -0.11068715155124664, "logps/chosen": -1.134094476699829, "logps/rejected": -1.3241512775421143, "loss": 1.1341, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.134094476699829, "rewards/margins": 0.19005689024925232, "rewards/rejected": -1.3241512775421143, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 5.600472775067502, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.22342009842395782, "logits/rejected": -0.1211988776922226, "logps/chosen": -1.1955523490905762, "logps/rejected": -1.352161169052124, "loss": 1.1956, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1955523490905762, "rewards/margins": 0.1566087305545807, "rewards/rejected": -1.352161169052124, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 5.113668607511297, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.20970675349235535, "logits/rejected": 0.006368355359882116, "logps/chosen": -1.2403113842010498, "logps/rejected": -1.4371951818466187, "loss": 1.2403, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2403113842010498, "rewards/margins": 0.19688387215137482, "rewards/rejected": -1.4371951818466187, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 6.1200123212335775, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.1338377147912979, "logits/rejected": -0.031677234917879105, "logps/chosen": -1.2085363864898682, "logps/rejected": -1.370495080947876, "loss": 1.2085, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2085363864898682, "rewards/margins": 0.16195864975452423, "rewards/rejected": -1.370495080947876, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 5.674948598418707, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.21462781727313995, "logits/rejected": -0.08784286677837372, "logps/chosen": -1.1954164505004883, "logps/rejected": -1.3704946041107178, "loss": 1.1954, "rewards/accuracies": 0.625, "rewards/chosen": -1.1954164505004883, "rewards/margins": 0.1750781238079071, "rewards/rejected": -1.3704946041107178, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 4.893475274498356, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.14490801095962524, "logits/rejected": -0.033690571784973145, "logps/chosen": -1.2176101207733154, "logps/rejected": -1.39728581905365, "loss": 1.2176, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2176101207733154, "rewards/margins": 0.17967572808265686, "rewards/rejected": -1.39728581905365, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 6.199042549295952, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.16276778280735016, "logits/rejected": -0.009637385606765747, "logps/chosen": -1.1129305362701416, "logps/rejected": -1.3166258335113525, "loss": 1.1129, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1129305362701416, "rewards/margins": 0.203695148229599, "rewards/rejected": -1.3166258335113525, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 5.066527389725325, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.27972927689552307, "logits/rejected": -0.24004431068897247, "logps/chosen": -1.2379182577133179, "logps/rejected": -1.3642317056655884, "loss": 1.2379, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2379182577133179, "rewards/margins": 0.12631358206272125, "rewards/rejected": -1.3642317056655884, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 5.873912999870249, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.16283734142780304, "logits/rejected": -0.09513553231954575, "logps/chosen": -1.1396925449371338, "logps/rejected": -1.3468849658966064, "loss": 1.1397, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1396925449371338, "rewards/margins": 0.20719249546527863, "rewards/rejected": -1.3468849658966064, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 5.961183604618729, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.0829085260629654, "logits/rejected": 0.03310883790254593, "logps/chosen": -1.1434223651885986, "logps/rejected": -1.3173128366470337, "loss": 1.1434, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1434223651885986, "rewards/margins": 0.17389050126075745, "rewards/rejected": -1.3173128366470337, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 6.063898388816428, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.12186668813228607, "logits/rejected": -0.03663311526179314, "logps/chosen": -1.1867051124572754, "logps/rejected": -1.3539354801177979, "loss": 1.1867, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1867051124572754, "rewards/margins": 0.16723033785820007, "rewards/rejected": -1.3539354801177979, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 5.547928945423828, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.155680850148201, "logits/rejected": -0.04386782646179199, "logps/chosen": -1.164412260055542, "logps/rejected": -1.3528974056243896, "loss": 1.1644, "rewards/accuracies": 0.5625, "rewards/chosen": -1.164412260055542, "rewards/margins": 0.1884850561618805, "rewards/rejected": -1.3528974056243896, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 5.674434915209517, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.19186769425868988, "logits/rejected": 0.009135546162724495, "logps/chosen": -1.1980564594268799, "logps/rejected": -1.4211132526397705, "loss": 1.1981, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1980564594268799, "rewards/margins": 0.22305670380592346, "rewards/rejected": -1.4211132526397705, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 6.391720964626229, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.22106154263019562, "logits/rejected": -0.10317359119653702, "logps/chosen": -1.1705561876296997, "logps/rejected": -1.2742478847503662, "loss": 1.1706, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1705561876296997, "rewards/margins": 0.1036917194724083, "rewards/rejected": -1.2742478847503662, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 5.180641798248046, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.20603862404823303, "logits/rejected": -0.11786928027868271, "logps/chosen": -1.2352402210235596, "logps/rejected": -1.3249728679656982, "loss": 1.2352, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2352402210235596, "rewards/margins": 0.08973237127065659, "rewards/rejected": -1.3249728679656982, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 6.225378632701211, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.1827421635389328, "logits/rejected": -0.12877950072288513, "logps/chosen": -1.258191466331482, "logps/rejected": -1.4135892391204834, "loss": 1.2582, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.258191466331482, "rewards/margins": 0.15539772808551788, "rewards/rejected": -1.4135892391204834, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 5.865960427600806, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.17367592453956604, "logits/rejected": -0.07382993400096893, "logps/chosen": -1.231673002243042, "logps/rejected": -1.3367455005645752, "loss": 1.2317, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.231673002243042, "rewards/margins": 0.10507242381572723, "rewards/rejected": -1.3367455005645752, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 5.199556512410179, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.14071939885616302, "logits/rejected": -0.030953969806432724, "logps/chosen": -1.1400396823883057, "logps/rejected": -1.2974660396575928, "loss": 1.14, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1400396823883057, "rewards/margins": 0.15742626786231995, "rewards/rejected": -1.2974660396575928, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 6.8989390717214985, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.14170949161052704, "logits/rejected": -0.054387759417295456, "logps/chosen": -1.1868473291397095, "logps/rejected": -1.2875926494598389, "loss": 1.1868, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1868473291397095, "rewards/margins": 0.10074535757303238, "rewards/rejected": -1.2875926494598389, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 4.417326290569136, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.25347021222114563, "logits/rejected": -0.1125224381685257, "logps/chosen": -1.206116795539856, "logps/rejected": -1.3193378448486328, "loss": 1.2061, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.206116795539856, "rewards/margins": 0.11322104930877686, "rewards/rejected": -1.3193378448486328, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 6.610158041259104, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.14728622138500214, "logits/rejected": -0.11436394602060318, "logps/chosen": -1.2254151105880737, "logps/rejected": -1.3278472423553467, "loss": 1.2254, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2254151105880737, "rewards/margins": 0.10243208706378937, "rewards/rejected": -1.3278472423553467, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 5.673364536070491, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.14808592200279236, "logits/rejected": 0.007278826087713242, "logps/chosen": -1.2079174518585205, "logps/rejected": -1.320471167564392, "loss": 1.2079, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2079174518585205, "rewards/margins": 0.11255370080471039, "rewards/rejected": -1.320471167564392, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 5.343588220382247, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.23151977360248566, "logits/rejected": -0.14920443296432495, "logps/chosen": -1.1683440208435059, "logps/rejected": -1.2787268161773682, "loss": 1.1683, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1683440208435059, "rewards/margins": 0.11038286983966827, "rewards/rejected": -1.2787268161773682, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 7.581885856780892, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.10537122189998627, "logits/rejected": -0.09281863272190094, "logps/chosen": -1.2414765357971191, "logps/rejected": -1.3409807682037354, "loss": 1.2415, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2414765357971191, "rewards/margins": 0.09950443357229233, "rewards/rejected": -1.3409807682037354, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 7.463354674587174, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.1943662315607071, "logits/rejected": -0.08687295764684677, "logps/chosen": -1.2108862400054932, "logps/rejected": -1.3484553098678589, "loss": 1.2109, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2108862400054932, "rewards/margins": 0.13756921887397766, "rewards/rejected": -1.3484553098678589, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 6.044996116844061, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.1980164647102356, "logits/rejected": -0.1880231499671936, "logps/chosen": -1.2795995473861694, "logps/rejected": -1.402883768081665, "loss": 1.2796, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2795995473861694, "rewards/margins": 0.12328419834375381, "rewards/rejected": -1.402883768081665, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 6.1643034997435695, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.15246151387691498, "logits/rejected": -0.09133397042751312, "logps/chosen": -1.1350094079971313, "logps/rejected": -1.3217235803604126, "loss": 1.135, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1350094079971313, "rewards/margins": 0.18671421706676483, "rewards/rejected": -1.3217235803604126, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 5.402299913442335, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.21504196524620056, "logits/rejected": -0.16612941026687622, "logps/chosen": -1.204830527305603, "logps/rejected": -1.3071666955947876, "loss": 1.2048, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.204830527305603, "rewards/margins": 0.10233616828918457, "rewards/rejected": -1.3071666955947876, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 5.393630888759507, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.1725747287273407, "logits/rejected": -0.08291719108819962, "logps/chosen": -1.1129395961761475, "logps/rejected": -1.2570351362228394, "loss": 1.1129, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1129395961761475, "rewards/margins": 0.14409562945365906, "rewards/rejected": -1.2570351362228394, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 4.578863420931409, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.17215880751609802, "logits/rejected": -0.23126590251922607, "logps/chosen": -1.2427552938461304, "logps/rejected": -1.3658427000045776, "loss": 1.2428, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2427552938461304, "rewards/margins": 0.12308740615844727, "rewards/rejected": -1.3658427000045776, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 5.026205161003124, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.23052461445331573, "logits/rejected": -0.12125305831432343, "logps/chosen": -1.2288029193878174, "logps/rejected": -1.4149913787841797, "loss": 1.2288, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2288029193878174, "rewards/margins": 0.1861884742975235, "rewards/rejected": -1.4149913787841797, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 6.5686263359881165, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.1646455079317093, "logits/rejected": -0.1453578770160675, "logps/chosen": -1.2299884557724, "logps/rejected": -1.392346978187561, "loss": 1.23, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2299884557724, "rewards/margins": 0.16235852241516113, "rewards/rejected": -1.392346978187561, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 4.404387694087082, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.14497092366218567, "logits/rejected": -0.011191338300704956, "logps/chosen": -1.1730215549468994, "logps/rejected": -1.4702101945877075, "loss": 1.173, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1730215549468994, "rewards/margins": 0.29718858003616333, "rewards/rejected": -1.4702101945877075, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 5.324731046194308, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.2018851935863495, "logits/rejected": -0.07457415014505386, "logps/chosen": -1.1145573854446411, "logps/rejected": -1.3626707792282104, "loss": 1.1146, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1145573854446411, "rewards/margins": 0.2481134831905365, "rewards/rejected": -1.3626707792282104, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 4.771341755993434, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.21315467357635498, "logits/rejected": -0.10161477327346802, "logps/chosen": -1.1191325187683105, "logps/rejected": -1.2662551403045654, "loss": 1.1191, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1191325187683105, "rewards/margins": 0.14712262153625488, "rewards/rejected": -1.2662551403045654, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 6.334139129094602, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.22464995086193085, "logits/rejected": -0.0918690413236618, "logps/chosen": -1.1990457773208618, "logps/rejected": -1.2617486715316772, "loss": 1.199, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1990457773208618, "rewards/margins": 0.0627029612660408, "rewards/rejected": -1.2617486715316772, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 5.862197766629301, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.1815083622932434, "logits/rejected": -0.10871344804763794, "logps/chosen": -1.2135746479034424, "logps/rejected": -1.3697984218597412, "loss": 1.2136, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2135746479034424, "rewards/margins": 0.15622378885746002, "rewards/rejected": -1.3697984218597412, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 4.242771087401228, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.07523973286151886, "logits/rejected": 0.015504973940551281, "logps/chosen": -1.1409213542938232, "logps/rejected": -1.2855995893478394, "loss": 1.1409, "rewards/accuracies": 0.625, "rewards/chosen": -1.1409213542938232, "rewards/margins": 0.1446782350540161, "rewards/rejected": -1.2855995893478394, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 4.330226524275462, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.17349833250045776, "logits/rejected": -0.11513632535934448, "logps/chosen": -1.1944080591201782, "logps/rejected": -1.3314402103424072, "loss": 1.1944, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1944080591201782, "rewards/margins": 0.13703198730945587, "rewards/rejected": -1.3314402103424072, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 6.267762199384361, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.1437850296497345, "logits/rejected": -0.10049740970134735, "logps/chosen": -1.2094330787658691, "logps/rejected": -1.305383324623108, "loss": 1.2094, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2094330787658691, "rewards/margins": 0.0959501713514328, "rewards/rejected": -1.305383324623108, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 5.262577095903868, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.23821115493774414, "logits/rejected": -0.13145801424980164, "logps/chosen": -1.222237229347229, "logps/rejected": -1.345015287399292, "loss": 1.2222, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.222237229347229, "rewards/margins": 0.12277805805206299, "rewards/rejected": -1.345015287399292, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 7.303053540105646, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.10938284546136856, "logits/rejected": -0.06194578483700752, "logps/chosen": -1.2230128049850464, "logps/rejected": -1.3521678447723389, "loss": 1.223, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2230128049850464, "rewards/margins": 0.1291550248861313, "rewards/rejected": -1.3521678447723389, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 6.14102725174474, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.16496331989765167, "logits/rejected": -0.08642788231372833, "logps/chosen": -1.136447787284851, "logps/rejected": -1.312910556793213, "loss": 1.1364, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.136447787284851, "rewards/margins": 0.17646275460720062, "rewards/rejected": -1.312910556793213, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.06303287297487259, "eval_logits/rejected": 0.129154235124588, "eval_logps/chosen": -1.2622878551483154, "eval_logps/rejected": -1.386473298072815, "eval_loss": 1.2625449895858765, "eval_rewards/accuracies": 0.5467358827590942, "eval_rewards/chosen": -1.2622878551483154, "eval_rewards/margins": 0.12418550252914429, "eval_rewards/rejected": -1.386473298072815, "eval_runtime": 40.4453, "eval_samples_per_second": 33.255, "eval_steps_per_second": 8.332, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 7.468621364603002, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.22965911030769348, "logits/rejected": -0.13691923022270203, "logps/chosen": -1.2507753372192383, "logps/rejected": -1.447633981704712, "loss": 1.2508, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2507753372192383, "rewards/margins": 0.19685861468315125, "rewards/rejected": -1.447633981704712, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 4.871124292845325, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.13078241050243378, "logits/rejected": -0.022624928504228592, "logps/chosen": -1.2588207721710205, "logps/rejected": -1.3913819789886475, "loss": 1.2588, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2588207721710205, "rewards/margins": 0.13256122171878815, "rewards/rejected": -1.3913819789886475, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 5.3901630737675506, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.13408245146274567, "logits/rejected": -0.04561351239681244, "logps/chosen": -1.087417721748352, "logps/rejected": -1.4007751941680908, "loss": 1.0874, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.087417721748352, "rewards/margins": 0.3133576214313507, "rewards/rejected": -1.4007751941680908, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 6.771486116164208, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.1479436159133911, "logits/rejected": -0.08002855628728867, "logps/chosen": -1.2573951482772827, "logps/rejected": -1.360419511795044, "loss": 1.2574, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2573951482772827, "rewards/margins": 0.10302440077066422, "rewards/rejected": -1.360419511795044, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 5.220373121272143, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.13649320602416992, "logits/rejected": -0.05891447141766548, "logps/chosen": -1.1580803394317627, "logps/rejected": -1.2749049663543701, "loss": 1.1581, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1580803394317627, "rewards/margins": 0.11682454496622086, "rewards/rejected": -1.2749049663543701, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 4.141220027265293, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.19586724042892456, "logits/rejected": -0.08037696033716202, "logps/chosen": -1.2483406066894531, "logps/rejected": -1.3194774389266968, "loss": 1.2483, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2483406066894531, "rewards/margins": 0.07113666832447052, "rewards/rejected": -1.3194774389266968, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 5.486773217478368, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.2740897536277771, "logits/rejected": -0.13261668384075165, "logps/chosen": -1.1758434772491455, "logps/rejected": -1.277549147605896, "loss": 1.1758, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1758434772491455, "rewards/margins": 0.10170559585094452, "rewards/rejected": -1.277549147605896, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 6.768539936326079, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.22754459083080292, "logits/rejected": -0.09078878164291382, "logps/chosen": -1.1958142518997192, "logps/rejected": -1.3033338785171509, "loss": 1.1958, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1958142518997192, "rewards/margins": 0.10751968622207642, "rewards/rejected": -1.3033338785171509, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 5.519768340340793, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.2226492464542389, "logits/rejected": -0.11902530491352081, "logps/chosen": -1.1538795232772827, "logps/rejected": -1.3459739685058594, "loss": 1.1539, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1538795232772827, "rewards/margins": 0.19209453463554382, "rewards/rejected": -1.3459739685058594, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 4.9260653847253195, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3194291889667511, "logits/rejected": -0.19218876957893372, "logps/chosen": -1.1435991525650024, "logps/rejected": -1.2826666831970215, "loss": 1.1436, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1435991525650024, "rewards/margins": 0.13906744122505188, "rewards/rejected": -1.2826666831970215, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 5.011695501039968, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.1578296422958374, "logits/rejected": -0.03121192194521427, "logps/chosen": -1.140295147895813, "logps/rejected": -1.2503840923309326, "loss": 1.1403, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.140295147895813, "rewards/margins": 0.11008895933628082, "rewards/rejected": -1.2503840923309326, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 7.282392467706973, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.21210706233978271, "logits/rejected": -0.171635240316391, "logps/chosen": -1.106022834777832, "logps/rejected": -1.3897649049758911, "loss": 1.106, "rewards/accuracies": 0.625, "rewards/chosen": -1.106022834777832, "rewards/margins": 0.28374195098876953, "rewards/rejected": -1.3897649049758911, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 5.346097784424451, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.2942012548446655, "logits/rejected": -0.23602068424224854, "logps/chosen": -1.1790319681167603, "logps/rejected": -1.3857446908950806, "loss": 1.179, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1790319681167603, "rewards/margins": 0.2067127227783203, "rewards/rejected": -1.3857446908950806, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 7.1197368137818025, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.15675674378871918, "logits/rejected": -0.05829506367444992, "logps/chosen": -1.1807358264923096, "logps/rejected": -1.272729516029358, "loss": 1.1807, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1807358264923096, "rewards/margins": 0.09199382364749908, "rewards/rejected": -1.272729516029358, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 6.311890608366594, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.05423650145530701, "logits/rejected": 0.0007905826205387712, "logps/chosen": -1.1505919694900513, "logps/rejected": -1.4207453727722168, "loss": 1.1506, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1505919694900513, "rewards/margins": 0.2701531946659088, "rewards/rejected": -1.4207453727722168, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 4.640523314686869, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.09583868086338043, "logits/rejected": -0.012130677700042725, "logps/chosen": -1.1557140350341797, "logps/rejected": -1.32638680934906, "loss": 1.1557, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1557140350341797, "rewards/margins": 0.17067280411720276, "rewards/rejected": -1.32638680934906, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 5.513553287696556, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.1150188073515892, "logits/rejected": -0.05268536135554314, "logps/chosen": -1.2571394443511963, "logps/rejected": -1.442989706993103, "loss": 1.2571, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2571394443511963, "rewards/margins": 0.18585015833377838, "rewards/rejected": -1.442989706993103, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 4.3777401756208985, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.24362199008464813, "logits/rejected": -0.1229153722524643, "logps/chosen": -1.1839576959609985, "logps/rejected": -1.3520158529281616, "loss": 1.184, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1839576959609985, "rewards/margins": 0.1680581420660019, "rewards/rejected": -1.3520158529281616, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 5.766312572850582, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.19103355705738068, "logits/rejected": -0.1145477443933487, "logps/chosen": -1.2287296056747437, "logps/rejected": -1.3050682544708252, "loss": 1.2287, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2287296056747437, "rewards/margins": 0.07633884251117706, "rewards/rejected": -1.3050682544708252, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 4.91552569531932, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.14290598034858704, "logits/rejected": -0.040977831929922104, "logps/chosen": -1.174247145652771, "logps/rejected": -1.3416694402694702, "loss": 1.1742, "rewards/accuracies": 0.59375, "rewards/chosen": -1.174247145652771, "rewards/margins": 0.16742220520973206, "rewards/rejected": -1.3416694402694702, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 4.715070933826352, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.20242615044116974, "logits/rejected": -0.07929400354623795, "logps/chosen": -1.1672890186309814, "logps/rejected": -1.336466670036316, "loss": 1.1673, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1672890186309814, "rewards/margins": 0.16917765140533447, "rewards/rejected": -1.336466670036316, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 5.171559981673705, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.21385245025157928, "logits/rejected": -0.2443496435880661, "logps/chosen": -1.1842563152313232, "logps/rejected": -1.391503930091858, "loss": 1.1843, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1842563152313232, "rewards/margins": 0.20724749565124512, "rewards/rejected": -1.391503930091858, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 6.028238427896048, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.14635112881660461, "logits/rejected": -0.07556402683258057, "logps/chosen": -1.1056934595108032, "logps/rejected": -1.3315129280090332, "loss": 1.1057, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1056934595108032, "rewards/margins": 0.22581949830055237, "rewards/rejected": -1.3315129280090332, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 7.82422080645486, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.11479593813419342, "logits/rejected": -0.06142418459057808, "logps/chosen": -1.1922252178192139, "logps/rejected": -1.26539146900177, "loss": 1.1922, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1922252178192139, "rewards/margins": 0.07316620647907257, "rewards/rejected": -1.26539146900177, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 5.723841330808825, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.3224028944969177, "logits/rejected": -0.12011559307575226, "logps/chosen": -1.1594828367233276, "logps/rejected": -1.3313437700271606, "loss": 1.1595, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1594828367233276, "rewards/margins": 0.17186105251312256, "rewards/rejected": -1.3313437700271606, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 5.4201646321704295, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.10645916312932968, "logits/rejected": 0.004933017306029797, "logps/chosen": -1.2200148105621338, "logps/rejected": -1.435920000076294, "loss": 1.22, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2200148105621338, "rewards/margins": 0.21590527892112732, "rewards/rejected": -1.435920000076294, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 5.12306893425985, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.2289653718471527, "logits/rejected": -0.10973107814788818, "logps/chosen": -1.1822679042816162, "logps/rejected": -1.2937430143356323, "loss": 1.1823, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1822679042816162, "rewards/margins": 0.11147511005401611, "rewards/rejected": -1.2937430143356323, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 4.791567840460217, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.16858312487602234, "logits/rejected": -0.11572971194982529, "logps/chosen": -1.1895477771759033, "logps/rejected": -1.2643003463745117, "loss": 1.1895, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1895477771759033, "rewards/margins": 0.07475268840789795, "rewards/rejected": -1.2643003463745117, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 5.352404537214937, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.1806947886943817, "logits/rejected": -0.023491745814681053, "logps/chosen": -1.1795223951339722, "logps/rejected": -1.3868026733398438, "loss": 1.1795, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1795223951339722, "rewards/margins": 0.207280233502388, "rewards/rejected": -1.3868026733398438, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 5.233599795859156, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.20476415753364563, "logits/rejected": -0.11273413896560669, "logps/chosen": -1.1422139406204224, "logps/rejected": -1.3395774364471436, "loss": 1.1422, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1422139406204224, "rewards/margins": 0.19736367464065552, "rewards/rejected": -1.3395774364471436, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 5.988863385979121, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.163321852684021, "logits/rejected": -0.18224772810935974, "logps/chosen": -1.1676267385482788, "logps/rejected": -1.3793046474456787, "loss": 1.1676, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1676267385482788, "rewards/margins": 0.21167802810668945, "rewards/rejected": -1.3793046474456787, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 6.160434840934573, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.08542106300592422, "logits/rejected": -0.009877888485789299, "logps/chosen": -1.2123284339904785, "logps/rejected": -1.3220148086547852, "loss": 1.2123, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2123284339904785, "rewards/margins": 0.10968631505966187, "rewards/rejected": -1.3220148086547852, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 5.664455898524179, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.2070048749446869, "logits/rejected": -0.10656273365020752, "logps/chosen": -1.1004118919372559, "logps/rejected": -1.261467695236206, "loss": 1.1004, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1004118919372559, "rewards/margins": 0.16105574369430542, "rewards/rejected": -1.261467695236206, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 7.270962442588856, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.10519279539585114, "logits/rejected": -0.057579588145017624, "logps/chosen": -1.1422033309936523, "logps/rejected": -1.3803659677505493, "loss": 1.1422, "rewards/accuracies": 0.625, "rewards/chosen": -1.1422033309936523, "rewards/margins": 0.23816260695457458, "rewards/rejected": -1.3803659677505493, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 6.017046285926171, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.1555704027414322, "logits/rejected": -0.12431790679693222, "logps/chosen": -1.2025130987167358, "logps/rejected": -1.3258905410766602, "loss": 1.2025, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2025130987167358, "rewards/margins": 0.12337744235992432, "rewards/rejected": -1.3258905410766602, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 5.640298097551106, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.062471628189086914, "logits/rejected": -0.018146909773349762, "logps/chosen": -1.3088743686676025, "logps/rejected": -1.3850795030593872, "loss": 1.3089, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3088743686676025, "rewards/margins": 0.0762050524353981, "rewards/rejected": -1.3850795030593872, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 6.33456542929799, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.12803785502910614, "logits/rejected": -0.0008679248276166618, "logps/chosen": -1.1366140842437744, "logps/rejected": -1.3044718503952026, "loss": 1.1366, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1366140842437744, "rewards/margins": 0.16785791516304016, "rewards/rejected": -1.3044718503952026, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 4.752349211180841, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.22925138473510742, "logits/rejected": -0.04846381023526192, "logps/chosen": -1.1378283500671387, "logps/rejected": -1.3292986154556274, "loss": 1.1378, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1378283500671387, "rewards/margins": 0.191470205783844, "rewards/rejected": -1.3292986154556274, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 6.043708897839066, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.21532341837882996, "logits/rejected": -0.21204447746276855, "logps/chosen": -1.1928383111953735, "logps/rejected": -1.3230355978012085, "loss": 1.1928, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1928383111953735, "rewards/margins": 0.13019739091396332, "rewards/rejected": -1.3230355978012085, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.170415029717473, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.18021997809410095, "logits/rejected": -0.1763593852519989, "logps/chosen": -1.175318956375122, "logps/rejected": -1.4058458805084229, "loss": 1.1753, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.175318956375122, "rewards/margins": 0.2305268943309784, "rewards/rejected": -1.4058458805084229, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 6.632022532495689, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.12952257692813873, "logits/rejected": -0.10332361608743668, "logps/chosen": -1.1716121435165405, "logps/rejected": -1.3109662532806396, "loss": 1.1716, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1716121435165405, "rewards/margins": 0.1393541395664215, "rewards/rejected": -1.3109662532806396, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 5.873027098955236, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.1187458410859108, "logits/rejected": -0.050290901213884354, "logps/chosen": -1.3047231435775757, "logps/rejected": -1.433432698249817, "loss": 1.3047, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3047231435775757, "rewards/margins": 0.12870953977108002, "rewards/rejected": -1.433432698249817, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 7.146536672956862, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.21975842118263245, "logits/rejected": -0.17972728610038757, "logps/chosen": -1.1732470989227295, "logps/rejected": -1.3646670579910278, "loss": 1.1732, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1732470989227295, "rewards/margins": 0.1914198249578476, "rewards/rejected": -1.3646670579910278, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 6.380797659126942, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.23286859691143036, "logits/rejected": -0.0906810536980629, "logps/chosen": -1.1530263423919678, "logps/rejected": -1.2618200778961182, "loss": 1.153, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1530263423919678, "rewards/margins": 0.1087937206029892, "rewards/rejected": -1.2618200778961182, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 6.110365261413846, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.2338874787092209, "logits/rejected": -0.1278344690799713, "logps/chosen": -1.2184090614318848, "logps/rejected": -1.308386206626892, "loss": 1.2184, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2184090614318848, "rewards/margins": 0.08997714519500732, "rewards/rejected": -1.308386206626892, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 4.667144550951669, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.16278322041034698, "logits/rejected": -0.12372411787509918, "logps/chosen": -1.1888654232025146, "logps/rejected": -1.2626479864120483, "loss": 1.1889, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.1888654232025146, "rewards/margins": 0.07378266006708145, "rewards/rejected": -1.2626479864120483, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 7.405491885078588, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.20261946320533752, "logits/rejected": -0.1019575223326683, "logps/chosen": -1.2162169218063354, "logps/rejected": -1.3538146018981934, "loss": 1.2162, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2162169218063354, "rewards/margins": 0.13759776949882507, "rewards/rejected": -1.3538146018981934, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 7.6103463200739, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.007795420940965414, "logits/rejected": -0.06115374714136124, "logps/chosen": -1.1218430995941162, "logps/rejected": -1.3557971715927124, "loss": 1.1218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1218430995941162, "rewards/margins": 0.23395399749279022, "rewards/rejected": -1.3557971715927124, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 5.48113185814878, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.11308795213699341, "logits/rejected": -0.09083414077758789, "logps/chosen": -1.1362667083740234, "logps/rejected": -1.26784086227417, "loss": 1.1363, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1362667083740234, "rewards/margins": 0.13157403469085693, "rewards/rejected": -1.26784086227417, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 6.74753210966817, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.17321442067623138, "logits/rejected": -0.1360080987215042, "logps/chosen": -1.060376763343811, "logps/rejected": -1.3138148784637451, "loss": 1.0604, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.060376763343811, "rewards/margins": 0.25343817472457886, "rewards/rejected": -1.3138148784637451, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 5.243502510758272, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.24645903706550598, "logits/rejected": -0.18570443987846375, "logps/chosen": -1.1109473705291748, "logps/rejected": -1.2819993495941162, "loss": 1.1109, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1109473705291748, "rewards/margins": 0.1710520088672638, "rewards/rejected": -1.2819993495941162, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 5.5298808954278105, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.17022362351417542, "logits/rejected": -0.1365174651145935, "logps/chosen": -1.1704604625701904, "logps/rejected": -1.3565353155136108, "loss": 1.1705, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1704604625701904, "rewards/margins": 0.1860748678445816, "rewards/rejected": -1.3565353155136108, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 6.1145788317707455, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.1884942650794983, "logits/rejected": -0.008563208393752575, "logps/chosen": -1.2451403141021729, "logps/rejected": -1.448102355003357, "loss": 1.2451, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2451403141021729, "rewards/margins": 0.20296220481395721, "rewards/rejected": -1.448102355003357, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 5.902907338713225, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.08610056340694427, "logits/rejected": 0.035958804190158844, "logps/chosen": -1.1193740367889404, "logps/rejected": -1.2643730640411377, "loss": 1.1194, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1193740367889404, "rewards/margins": 0.14499911665916443, "rewards/rejected": -1.2643730640411377, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 5.23939334169454, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.16717438399791718, "logits/rejected": -0.11444715410470963, "logps/chosen": -1.1581038236618042, "logps/rejected": -1.2268712520599365, "loss": 1.1581, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1581038236618042, "rewards/margins": 0.06876734644174576, "rewards/rejected": -1.2268712520599365, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 5.551577280972258, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.19519641995429993, "logits/rejected": -0.0901976227760315, "logps/chosen": -1.2374465465545654, "logps/rejected": -1.3566503524780273, "loss": 1.2374, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2374465465545654, "rewards/margins": 0.11920376867055893, "rewards/rejected": -1.3566503524780273, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 5.058576156028866, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.2601271867752075, "logits/rejected": -0.17479009926319122, "logps/chosen": -1.0890238285064697, "logps/rejected": -1.402693748474121, "loss": 1.089, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0890238285064697, "rewards/margins": 0.313669890165329, "rewards/rejected": -1.402693748474121, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 6.12689648880712, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.2594127058982849, "logits/rejected": -0.1402347981929779, "logps/chosen": -1.244004487991333, "logps/rejected": -1.3978241682052612, "loss": 1.244, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.244004487991333, "rewards/margins": 0.15381969511508942, "rewards/rejected": -1.3978241682052612, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 4.474237843740755, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.2886318266391754, "logits/rejected": -0.13344204425811768, "logps/chosen": -1.1787292957305908, "logps/rejected": -1.383608102798462, "loss": 1.1787, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1787292957305908, "rewards/margins": 0.20487871766090393, "rewards/rejected": -1.383608102798462, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 6.1040285921425115, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.09103774279356003, "logits/rejected": 0.005494228098541498, "logps/chosen": -1.1232235431671143, "logps/rejected": -1.34049654006958, "loss": 1.1232, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1232235431671143, "rewards/margins": 0.21727316081523895, "rewards/rejected": -1.34049654006958, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 6.384620092905336, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.18263684213161469, "logits/rejected": -0.0527019277215004, "logps/chosen": -1.1900808811187744, "logps/rejected": -1.2870020866394043, "loss": 1.1901, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1900808811187744, "rewards/margins": 0.09692118316888809, "rewards/rejected": -1.2870020866394043, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 5.181611762980002, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.2411923110485077, "logits/rejected": -0.11386223882436752, "logps/chosen": -1.1505014896392822, "logps/rejected": -1.3027490377426147, "loss": 1.1505, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1505014896392822, "rewards/margins": 0.1522475630044937, "rewards/rejected": -1.3027490377426147, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 6.479961587872057, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.25723299384117126, "logits/rejected": -0.0930902510881424, "logps/chosen": -1.1651147603988647, "logps/rejected": -1.3712198734283447, "loss": 1.1651, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1651147603988647, "rewards/margins": 0.20610518753528595, "rewards/rejected": -1.3712198734283447, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 5.7448808767167785, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.1944722682237625, "logits/rejected": -0.16558077931404114, "logps/chosen": -1.2815355062484741, "logps/rejected": -1.4436309337615967, "loss": 1.2815, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2815355062484741, "rewards/margins": 0.16209545731544495, "rewards/rejected": -1.4436309337615967, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 6.586185937360755, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.2861994802951813, "logits/rejected": -0.10621882975101471, "logps/chosen": -1.1317287683486938, "logps/rejected": -1.3479727506637573, "loss": 1.1317, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1317287683486938, "rewards/margins": 0.21624401211738586, "rewards/rejected": -1.3479727506637573, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 4.632641586093074, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.32163387537002563, "logits/rejected": -0.15905043482780457, "logps/chosen": -1.1643282175064087, "logps/rejected": -1.3537582159042358, "loss": 1.1643, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1643282175064087, "rewards/margins": 0.18943007290363312, "rewards/rejected": -1.3537582159042358, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 7.0290254149455285, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.12631934881210327, "logits/rejected": -0.07926620543003082, "logps/chosen": -1.1601388454437256, "logps/rejected": -1.3314080238342285, "loss": 1.1601, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1601388454437256, "rewards/margins": 0.17126919329166412, "rewards/rejected": -1.3314080238342285, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 4.707308512195557, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.13060444593429565, "logits/rejected": -0.0370921865105629, "logps/chosen": -1.2538602352142334, "logps/rejected": -1.3854581117630005, "loss": 1.2539, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2538602352142334, "rewards/margins": 0.13159780204296112, "rewards/rejected": -1.3854581117630005, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 6.8389227324638515, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.2216591089963913, "logits/rejected": -0.07326874136924744, "logps/chosen": -1.1536259651184082, "logps/rejected": -1.2769925594329834, "loss": 1.1536, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1536259651184082, "rewards/margins": 0.12336651980876923, "rewards/rejected": -1.2769925594329834, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 4.495541437791241, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.23918823897838593, "logits/rejected": -0.14619360864162445, "logps/chosen": -1.160880446434021, "logps/rejected": -1.3401274681091309, "loss": 1.1609, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.160880446434021, "rewards/margins": 0.17924697697162628, "rewards/rejected": -1.3401274681091309, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 5.636986387187004, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.26029670238494873, "logits/rejected": -0.17812728881835938, "logps/chosen": -1.2226206064224243, "logps/rejected": -1.4014743566513062, "loss": 1.2226, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2226206064224243, "rewards/margins": 0.17885367572307587, "rewards/rejected": -1.4014743566513062, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 4.582377634416099, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.218178391456604, "logits/rejected": -0.16898824274539948, "logps/chosen": -1.1493439674377441, "logps/rejected": -1.44627845287323, "loss": 1.1493, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1493439674377441, "rewards/margins": 0.2969346046447754, "rewards/rejected": -1.44627845287323, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 5.484535382939086, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.20489108562469482, "logits/rejected": -0.05637436360120773, "logps/chosen": -1.14693284034729, "logps/rejected": -1.3229676485061646, "loss": 1.1469, "rewards/accuracies": 0.625, "rewards/chosen": -1.14693284034729, "rewards/margins": 0.17603489756584167, "rewards/rejected": -1.3229676485061646, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 6.026163471382562, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.17486700415611267, "logits/rejected": -0.07841528952121735, "logps/chosen": -1.2324053049087524, "logps/rejected": -1.3451995849609375, "loss": 1.2324, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2324053049087524, "rewards/margins": 0.11279428005218506, "rewards/rejected": -1.3451995849609375, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 5.214944339442023, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2350362092256546, "logits/rejected": -0.15251624584197998, "logps/chosen": -1.2390943765640259, "logps/rejected": -1.2982499599456787, "loss": 1.2391, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2390943765640259, "rewards/margins": 0.05915568023920059, "rewards/rejected": -1.2982499599456787, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 5.533191078109718, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.163193941116333, "logits/rejected": -0.13357996940612793, "logps/chosen": -1.2446210384368896, "logps/rejected": -1.466085433959961, "loss": 1.2446, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2446210384368896, "rewards/margins": 0.22146444022655487, "rewards/rejected": -1.466085433959961, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 4.891408313160939, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.19039131700992584, "logits/rejected": -0.10839636623859406, "logps/chosen": -1.0894994735717773, "logps/rejected": -1.2624043226242065, "loss": 1.0895, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.0894994735717773, "rewards/margins": 0.17290489375591278, "rewards/rejected": -1.2624043226242065, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 6.2613093609088954, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.28828439116477966, "logits/rejected": -0.17061297595500946, "logps/chosen": -1.1830190420150757, "logps/rejected": -1.2861298322677612, "loss": 1.183, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1830190420150757, "rewards/margins": 0.10311077535152435, "rewards/rejected": -1.2861298322677612, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 5.490468060312781, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.19379422068595886, "logits/rejected": -0.08788307011127472, "logps/chosen": -1.1758761405944824, "logps/rejected": -1.3165559768676758, "loss": 1.1759, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1758761405944824, "rewards/margins": 0.14068010449409485, "rewards/rejected": -1.3165559768676758, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 7.140975279939328, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.2439693659543991, "logits/rejected": -0.13807633519172668, "logps/chosen": -1.125562310218811, "logps/rejected": -1.2848769426345825, "loss": 1.1256, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.125562310218811, "rewards/margins": 0.15931473672389984, "rewards/rejected": -1.2848769426345825, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.055337511003017426, "eval_logits/rejected": 0.1207786351442337, "eval_logps/chosen": -1.2622963190078735, "eval_logps/rejected": -1.3864026069641113, "eval_loss": 1.2625575065612793, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2622963190078735, "eval_rewards/margins": 0.12410631030797958, "eval_rewards/rejected": -1.3864026069641113, "eval_runtime": 40.5086, "eval_samples_per_second": 33.203, "eval_steps_per_second": 8.319, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 4.804379469071292, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.1329023391008377, "logits/rejected": -0.13220670819282532, "logps/chosen": -1.1834813356399536, "logps/rejected": -1.2912867069244385, "loss": 1.1835, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -1.1834813356399536, "rewards/margins": 0.10780532658100128, "rewards/rejected": -1.2912867069244385, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 6.333116934721875, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.18129980564117432, "logits/rejected": -0.12634477019309998, "logps/chosen": -1.1660865545272827, "logps/rejected": -1.2772358655929565, "loss": 1.1661, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1660865545272827, "rewards/margins": 0.11114933341741562, "rewards/rejected": -1.2772358655929565, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 5.014898924824847, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.20106740295886993, "logits/rejected": -0.1417173445224762, "logps/chosen": -1.170931100845337, "logps/rejected": -1.36701238155365, "loss": 1.1709, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.170931100845337, "rewards/margins": 0.1960812360048294, "rewards/rejected": -1.36701238155365, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 5.038050245459607, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.20431199669837952, "logits/rejected": -0.16130927205085754, "logps/chosen": -1.148289442062378, "logps/rejected": -1.3172369003295898, "loss": 1.1483, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.148289442062378, "rewards/margins": 0.16894733905792236, "rewards/rejected": -1.3172369003295898, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 5.483351653116132, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.1438860446214676, "logits/rejected": -0.0478404276072979, "logps/chosen": -1.2330503463745117, "logps/rejected": -1.4011837244033813, "loss": 1.2331, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2330503463745117, "rewards/margins": 0.16813337802886963, "rewards/rejected": -1.4011837244033813, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 8.56979544202609, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.2617391347885132, "logits/rejected": -0.15851733088493347, "logps/chosen": -1.1873245239257812, "logps/rejected": -1.3135095834732056, "loss": 1.1873, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1873245239257812, "rewards/margins": 0.12618501484394073, "rewards/rejected": -1.3135095834732056, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 5.4791319116226695, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.21470221877098083, "logits/rejected": -0.15428467094898224, "logps/chosen": -1.210153579711914, "logps/rejected": -1.3099989891052246, "loss": 1.2102, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.210153579711914, "rewards/margins": 0.09984546899795532, "rewards/rejected": -1.3099989891052246, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 5.158810406518573, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.20074479281902313, "logits/rejected": -0.03765871003270149, "logps/chosen": -1.1919381618499756, "logps/rejected": -1.3118988275527954, "loss": 1.1919, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1919381618499756, "rewards/margins": 0.11996038258075714, "rewards/rejected": -1.3118988275527954, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 5.368942583828455, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.1372217833995819, "logits/rejected": -0.10174715518951416, "logps/chosen": -1.1460888385772705, "logps/rejected": -1.2734966278076172, "loss": 1.1461, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1460888385772705, "rewards/margins": 0.1274077594280243, "rewards/rejected": -1.2734966278076172, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 5.587492324970319, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.21874681115150452, "logits/rejected": -0.11924680322408676, "logps/chosen": -1.2174309492111206, "logps/rejected": -1.3298754692077637, "loss": 1.2174, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2174309492111206, "rewards/margins": 0.11244448274374008, "rewards/rejected": -1.3298754692077637, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 5.303130785687265, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.18482065200805664, "logits/rejected": -0.02526954934000969, "logps/chosen": -1.206457257270813, "logps/rejected": -1.2598731517791748, "loss": 1.2065, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.206457257270813, "rewards/margins": 0.053415995091199875, "rewards/rejected": -1.2598731517791748, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 6.906269212675276, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.18782971799373627, "logits/rejected": -0.13301093876361847, "logps/chosen": -1.1876240968704224, "logps/rejected": -1.3366366624832153, "loss": 1.1876, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1876240968704224, "rewards/margins": 0.1490124762058258, "rewards/rejected": -1.3366366624832153, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 5.348646689804216, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.30024540424346924, "logits/rejected": -0.14680776000022888, "logps/chosen": -1.3634843826293945, "logps/rejected": -1.3892428874969482, "loss": 1.3635, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3634843826293945, "rewards/margins": 0.025758570060133934, "rewards/rejected": -1.3892428874969482, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 5.12888593199934, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.18852542340755463, "logits/rejected": -0.08001218736171722, "logps/chosen": -1.1309869289398193, "logps/rejected": -1.3732306957244873, "loss": 1.131, "rewards/accuracies": 0.625, "rewards/chosen": -1.1309869289398193, "rewards/margins": 0.24224376678466797, "rewards/rejected": -1.3732306957244873, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 6.285922404748876, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.16616547107696533, "logits/rejected": -0.04030843451619148, "logps/chosen": -1.213853120803833, "logps/rejected": -1.3756954669952393, "loss": 1.2139, "rewards/accuracies": 0.53125, "rewards/chosen": -1.213853120803833, "rewards/margins": 0.16184237599372864, "rewards/rejected": -1.3756954669952393, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 5.4490629826002746, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.17463694512844086, "logits/rejected": -0.0539524145424366, "logps/chosen": -1.2343729734420776, "logps/rejected": -1.3306089639663696, "loss": 1.2344, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2343729734420776, "rewards/margins": 0.09623589366674423, "rewards/rejected": -1.3306089639663696, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 6.328783795480937, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.21892789006233215, "logits/rejected": -0.1467473804950714, "logps/chosen": -1.1612582206726074, "logps/rejected": -1.2796560525894165, "loss": 1.1613, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1612582206726074, "rewards/margins": 0.11839772760868073, "rewards/rejected": -1.2796560525894165, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 4.439799358191107, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.18623670935630798, "logits/rejected": -0.08082831650972366, "logps/chosen": -1.2355719804763794, "logps/rejected": -1.3440674543380737, "loss": 1.2356, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2355719804763794, "rewards/margins": 0.10849557816982269, "rewards/rejected": -1.3440674543380737, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 4.014623558509195, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.22943833470344543, "logits/rejected": -0.078731469810009, "logps/chosen": -1.2756917476654053, "logps/rejected": -1.4685487747192383, "loss": 1.2757, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2756917476654053, "rewards/margins": 0.19285698235034943, "rewards/rejected": -1.4685487747192383, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 6.030686883061023, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.20672044157981873, "logits/rejected": -0.18478873372077942, "logps/chosen": -1.180480718612671, "logps/rejected": -1.4278126955032349, "loss": 1.1805, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.180480718612671, "rewards/margins": 0.24733197689056396, "rewards/rejected": -1.4278126955032349, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 5.18656070937529, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.21785672008991241, "logits/rejected": -0.10921114683151245, "logps/chosen": -1.2446626424789429, "logps/rejected": -1.4101511240005493, "loss": 1.2447, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2446626424789429, "rewards/margins": 0.16548852622509003, "rewards/rejected": -1.4101511240005493, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 3.65921413240022, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.30747875571250916, "logits/rejected": -0.1127927303314209, "logps/chosen": -1.1281988620758057, "logps/rejected": -1.3548561334609985, "loss": 1.1282, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1281988620758057, "rewards/margins": 0.22665715217590332, "rewards/rejected": -1.3548561334609985, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 5.729310900202872, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.10579930245876312, "logits/rejected": -0.03876299783587456, "logps/chosen": -1.1664822101593018, "logps/rejected": -1.2470623254776, "loss": 1.1665, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1664822101593018, "rewards/margins": 0.08058010786771774, "rewards/rejected": -1.2470623254776, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 4.750805678701557, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.11595920473337173, "logits/rejected": -0.06495039165019989, "logps/chosen": -1.1702669858932495, "logps/rejected": -1.2472349405288696, "loss": 1.1703, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1702669858932495, "rewards/margins": 0.07696785777807236, "rewards/rejected": -1.2472349405288696, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 6.066603550649123, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.16079676151275635, "logits/rejected": -0.0731375589966774, "logps/chosen": -1.2128829956054688, "logps/rejected": -1.4128206968307495, "loss": 1.2129, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2128829956054688, "rewards/margins": 0.19993767142295837, "rewards/rejected": -1.4128206968307495, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 6.45928896382586, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.12357590347528458, "logits/rejected": -0.08573149144649506, "logps/chosen": -1.14985191822052, "logps/rejected": -1.3202584981918335, "loss": 1.1499, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.14985191822052, "rewards/margins": 0.17040665447711945, "rewards/rejected": -1.3202584981918335, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 6.2554297558420835, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.11936501413583755, "logits/rejected": -0.13869568705558777, "logps/chosen": -1.1874260902404785, "logps/rejected": -1.2899043560028076, "loss": 1.1874, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1874260902404785, "rewards/margins": 0.10247828811407089, "rewards/rejected": -1.2899043560028076, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 5.721521371135035, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.14275915920734406, "logits/rejected": -0.1434057652950287, "logps/chosen": -1.216410517692566, "logps/rejected": -1.3015367984771729, "loss": 1.2164, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.216410517692566, "rewards/margins": 0.08512608706951141, "rewards/rejected": -1.3015367984771729, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 4.725943984779547, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.2903233468532562, "logits/rejected": -0.16574633121490479, "logps/chosen": -1.1510311365127563, "logps/rejected": -1.3279762268066406, "loss": 1.151, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1510311365127563, "rewards/margins": 0.17694523930549622, "rewards/rejected": -1.3279762268066406, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 6.053561774816533, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.13502545654773712, "logits/rejected": -0.17218217253684998, "logps/chosen": -1.2667639255523682, "logps/rejected": -1.3898870944976807, "loss": 1.2668, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2667639255523682, "rewards/margins": 0.1231231540441513, "rewards/rejected": -1.3898870944976807, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 6.661783431975508, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.23956885933876038, "logits/rejected": -0.17370496690273285, "logps/chosen": -1.179540991783142, "logps/rejected": -1.3099632263183594, "loss": 1.1795, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.179540991783142, "rewards/margins": 0.13042223453521729, "rewards/rejected": -1.3099632263183594, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 5.351825516333248, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.17420150339603424, "logits/rejected": -0.030449409037828445, "logps/chosen": -1.20005202293396, "logps/rejected": -1.357026219367981, "loss": 1.2001, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.20005202293396, "rewards/margins": 0.15697413682937622, "rewards/rejected": -1.357026219367981, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 5.8153855212871335, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.2501664161682129, "logits/rejected": -0.1371104121208191, "logps/chosen": -1.1959173679351807, "logps/rejected": -1.3678150177001953, "loss": 1.1959, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1959173679351807, "rewards/margins": 0.17189760506153107, "rewards/rejected": -1.3678150177001953, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 7.054387843681963, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.11576829105615616, "logits/rejected": -0.09689415991306305, "logps/chosen": -1.167954683303833, "logps/rejected": -1.2550865411758423, "loss": 1.168, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.167954683303833, "rewards/margins": 0.08713182061910629, "rewards/rejected": -1.2550865411758423, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 5.7816162397028315, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.17414376139640808, "logits/rejected": -0.09634820371866226, "logps/chosen": -1.143467903137207, "logps/rejected": -1.3563830852508545, "loss": 1.1435, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.143467903137207, "rewards/margins": 0.21291527152061462, "rewards/rejected": -1.3563830852508545, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 4.546004400708268, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.2004753053188324, "logits/rejected": -0.03905041515827179, "logps/chosen": -1.3214300870895386, "logps/rejected": -1.424565076828003, "loss": 1.3214, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3214300870895386, "rewards/margins": 0.10313482582569122, "rewards/rejected": -1.424565076828003, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 5.188981927496121, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.13796240091323853, "logits/rejected": -0.08505464345216751, "logps/chosen": -1.1717417240142822, "logps/rejected": -1.2888731956481934, "loss": 1.1717, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1717417240142822, "rewards/margins": 0.11713135242462158, "rewards/rejected": -1.2888731956481934, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 4.700803540478304, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.1567961722612381, "logits/rejected": -0.0846087709069252, "logps/chosen": -1.2194883823394775, "logps/rejected": -1.327146291732788, "loss": 1.2195, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2194883823394775, "rewards/margins": 0.10765796899795532, "rewards/rejected": -1.327146291732788, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 4.805887298997, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.32884281873703003, "logits/rejected": -0.18300950527191162, "logps/chosen": -1.1449711322784424, "logps/rejected": -1.3250820636749268, "loss": 1.145, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1449711322784424, "rewards/margins": 0.1801108568906784, "rewards/rejected": -1.3250820636749268, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 5.636864164467745, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.21470797061920166, "logits/rejected": -0.016922790557146072, "logps/chosen": -1.1984671354293823, "logps/rejected": -1.5167427062988281, "loss": 1.1985, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1984671354293823, "rewards/margins": 0.318275511264801, "rewards/rejected": -1.5167427062988281, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 7.007513437521976, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.1337025910615921, "logits/rejected": -0.11909003555774689, "logps/chosen": -1.2382549047470093, "logps/rejected": -1.425720453262329, "loss": 1.2383, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2382549047470093, "rewards/margins": 0.18746568262577057, "rewards/rejected": -1.425720453262329, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 4.805743239331045, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.1261594295501709, "logits/rejected": -0.03386545181274414, "logps/chosen": -1.1845440864562988, "logps/rejected": -1.4290525913238525, "loss": 1.1845, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1845440864562988, "rewards/margins": 0.24450846016407013, "rewards/rejected": -1.4290525913238525, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 4.047836226772145, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.21231523156166077, "logits/rejected": -0.08358360826969147, "logps/chosen": -1.2118909358978271, "logps/rejected": -1.357332468032837, "loss": 1.2119, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2118909358978271, "rewards/margins": 0.14544148743152618, "rewards/rejected": -1.357332468032837, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 4.5955063130256955, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.16086669266223907, "logits/rejected": -0.11136847734451294, "logps/chosen": -1.1189218759536743, "logps/rejected": -1.27243173122406, "loss": 1.1189, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1189218759536743, "rewards/margins": 0.15350982546806335, "rewards/rejected": -1.27243173122406, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 7.305255717494073, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.17626658082008362, "logits/rejected": -0.09517689794301987, "logps/chosen": -1.2194832563400269, "logps/rejected": -1.3916879892349243, "loss": 1.2195, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2194832563400269, "rewards/margins": 0.17220473289489746, "rewards/rejected": -1.3916879892349243, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 5.1060249559169755, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.18313951790332794, "logits/rejected": -0.0557086355984211, "logps/chosen": -1.168881893157959, "logps/rejected": -1.4216772317886353, "loss": 1.1689, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.168881893157959, "rewards/margins": 0.2527953088283539, "rewards/rejected": -1.4216772317886353, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 7.097244532317615, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.2802087366580963, "logits/rejected": -0.19921636581420898, "logps/chosen": -1.1662838459014893, "logps/rejected": -1.425647497177124, "loss": 1.1663, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1662838459014893, "rewards/margins": 0.25936368107795715, "rewards/rejected": -1.425647497177124, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 6.710371793764067, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.22454483807086945, "logits/rejected": -0.14197295904159546, "logps/chosen": -1.1773316860198975, "logps/rejected": -1.4285647869110107, "loss": 1.1773, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1773316860198975, "rewards/margins": 0.2512332797050476, "rewards/rejected": -1.4285647869110107, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 5.221630906317376, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.030957262963056564, "logits/rejected": -0.03260520473122597, "logps/chosen": -1.1479628086090088, "logps/rejected": -1.4127689599990845, "loss": 1.148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1479628086090088, "rewards/margins": 0.2648060917854309, "rewards/rejected": -1.4127689599990845, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 5.169938825187927, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.12257808446884155, "logits/rejected": -0.042447496205568314, "logps/chosen": -1.2065999507904053, "logps/rejected": -1.431854248046875, "loss": 1.2066, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2065999507904053, "rewards/margins": 0.2252543419599533, "rewards/rejected": -1.431854248046875, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 4.972797365159548, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.19540204107761383, "logits/rejected": -0.05711943656206131, "logps/chosen": -1.17696213722229, "logps/rejected": -1.2600338459014893, "loss": 1.177, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.17696213722229, "rewards/margins": 0.08307181298732758, "rewards/rejected": -1.2600338459014893, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 5.991875964741956, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.18677929043769836, "logits/rejected": -0.12056591361761093, "logps/chosen": -1.1988188028335571, "logps/rejected": -1.3707642555236816, "loss": 1.1988, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1988188028335571, "rewards/margins": 0.17194542288780212, "rewards/rejected": -1.3707642555236816, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 5.988125103195068, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.27014705538749695, "logits/rejected": -0.13436944782733917, "logps/chosen": -1.2429203987121582, "logps/rejected": -1.3457458019256592, "loss": 1.2429, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2429203987121582, "rewards/margins": 0.10282555967569351, "rewards/rejected": -1.3457458019256592, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 6.059820512857346, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.14478006958961487, "logits/rejected": -0.07959442585706711, "logps/chosen": -1.1232699155807495, "logps/rejected": -1.293100118637085, "loss": 1.1233, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1232699155807495, "rewards/margins": 0.1698302924633026, "rewards/rejected": -1.293100118637085, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 5.699536290185374, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.09018460661172867, "logits/rejected": -0.13560739159584045, "logps/chosen": -1.0961625576019287, "logps/rejected": -1.2582213878631592, "loss": 1.0962, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0961625576019287, "rewards/margins": 0.16205887496471405, "rewards/rejected": -1.2582213878631592, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 5.048481814094035, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.1620696485042572, "logits/rejected": -0.08843860030174255, "logps/chosen": -1.1989589929580688, "logps/rejected": -1.3083572387695312, "loss": 1.199, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1989589929580688, "rewards/margins": 0.1093982607126236, "rewards/rejected": -1.3083572387695312, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 5.081425677818655, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.102336086332798, "logits/rejected": 0.02499265968799591, "logps/chosen": -1.1737130880355835, "logps/rejected": -1.3140063285827637, "loss": 1.1737, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1737130880355835, "rewards/margins": 0.14029324054718018, "rewards/rejected": -1.3140063285827637, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 5.63036827431107, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.17824600636959076, "logits/rejected": -0.16234485805034637, "logps/chosen": -1.1772719621658325, "logps/rejected": -1.2218278646469116, "loss": 1.1773, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1772719621658325, "rewards/margins": 0.0445559024810791, "rewards/rejected": -1.2218278646469116, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 5.334032079703629, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.23164120316505432, "logits/rejected": -0.06480072438716888, "logps/chosen": -1.1477172374725342, "logps/rejected": -1.360404372215271, "loss": 1.1477, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1477172374725342, "rewards/margins": 0.21268704533576965, "rewards/rejected": -1.360404372215271, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 5.414553485118997, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.18340924382209778, "logits/rejected": -0.09524872899055481, "logps/chosen": -1.2074484825134277, "logps/rejected": -1.2486203908920288, "loss": 1.2074, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2074484825134277, "rewards/margins": 0.04117199033498764, "rewards/rejected": -1.2486203908920288, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 4.938472396048951, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.11460240185260773, "logits/rejected": -0.1258409172296524, "logps/chosen": -1.1924388408660889, "logps/rejected": -1.389146089553833, "loss": 1.1924, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1924388408660889, "rewards/margins": 0.19670720398426056, "rewards/rejected": -1.389146089553833, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 7.0593956529487025, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.17169317603111267, "logits/rejected": -0.09910265356302261, "logps/chosen": -1.189296841621399, "logps/rejected": -1.383187174797058, "loss": 1.1893, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.189296841621399, "rewards/margins": 0.19389040768146515, "rewards/rejected": -1.383187174797058, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 6.306096548224462, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.25733932852745056, "logits/rejected": -0.15879106521606445, "logps/chosen": -1.142138957977295, "logps/rejected": -1.4023430347442627, "loss": 1.1421, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.142138957977295, "rewards/margins": 0.26020392775535583, "rewards/rejected": -1.4023430347442627, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 5.2245208526061475, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.15947526693344116, "logits/rejected": -0.1280960589647293, "logps/chosen": -1.132433295249939, "logps/rejected": -1.2917569875717163, "loss": 1.1324, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.132433295249939, "rewards/margins": 0.15932366251945496, "rewards/rejected": -1.2917569875717163, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 4.8845675982591885, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.18961724638938904, "logits/rejected": -0.10165412724018097, "logps/chosen": -1.1694127321243286, "logps/rejected": -1.3842968940734863, "loss": 1.1694, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1694127321243286, "rewards/margins": 0.2148841917514801, "rewards/rejected": -1.3842968940734863, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 5.927320060366725, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.21957989037036896, "logits/rejected": -0.0486355796456337, "logps/chosen": -1.2726962566375732, "logps/rejected": -1.3402175903320312, "loss": 1.2727, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2726962566375732, "rewards/margins": 0.06752129644155502, "rewards/rejected": -1.3402175903320312, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 6.179901872108258, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.19367170333862305, "logits/rejected": -0.173887699842453, "logps/chosen": -1.1592867374420166, "logps/rejected": -1.2169601917266846, "loss": 1.1593, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1592867374420166, "rewards/margins": 0.05767344310879707, "rewards/rejected": -1.2169601917266846, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 5.649962105323477, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.2903217375278473, "logits/rejected": -0.16514264047145844, "logps/chosen": -1.2345366477966309, "logps/rejected": -1.3107110261917114, "loss": 1.2345, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2345366477966309, "rewards/margins": 0.07617439329624176, "rewards/rejected": -1.3107110261917114, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 5.25900892952436, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.1744503676891327, "logits/rejected": -0.10387680679559708, "logps/chosen": -1.1194565296173096, "logps/rejected": -1.2610795497894287, "loss": 1.1195, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1194565296173096, "rewards/margins": 0.14162297546863556, "rewards/rejected": -1.2610795497894287, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 5.878717348612424, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.09973793476819992, "logits/rejected": -0.010059304535388947, "logps/chosen": -1.2519961595535278, "logps/rejected": -1.36223566532135, "loss": 1.252, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2519961595535278, "rewards/margins": 0.11023961007595062, "rewards/rejected": -1.36223566532135, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 7.066439412779741, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.1823541820049286, "logits/rejected": -0.01570945791900158, "logps/chosen": -1.2813233137130737, "logps/rejected": -1.3808047771453857, "loss": 1.2813, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2813233137130737, "rewards/margins": 0.09948162734508514, "rewards/rejected": -1.3808047771453857, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 5.436741331324082, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.13146591186523438, "logits/rejected": -0.14218859374523163, "logps/chosen": -1.2036951780319214, "logps/rejected": -1.239797830581665, "loss": 1.2037, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2036951780319214, "rewards/margins": 0.036102842539548874, "rewards/rejected": -1.239797830581665, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 6.205202011661644, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.21282640099525452, "logits/rejected": -0.19570297002792358, "logps/chosen": -1.1811414957046509, "logps/rejected": -1.2538477182388306, "loss": 1.1811, "rewards/accuracies": 0.5, "rewards/chosen": -1.1811414957046509, "rewards/margins": 0.07270626723766327, "rewards/rejected": -1.2538477182388306, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 6.391006508564717, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.11988387256860733, "logits/rejected": -0.02862481400370598, "logps/chosen": -1.1413817405700684, "logps/rejected": -1.3782150745391846, "loss": 1.1414, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1413817405700684, "rewards/margins": 0.23683352768421173, "rewards/rejected": -1.3782150745391846, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 5.229654235568468, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.1764468103647232, "logits/rejected": -0.06735340505838394, "logps/chosen": -1.1675987243652344, "logps/rejected": -1.3620374202728271, "loss": 1.1676, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1675987243652344, "rewards/margins": 0.19443872570991516, "rewards/rejected": -1.3620374202728271, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 6.6333488994859335, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.2435917854309082, "logits/rejected": -0.05402897670865059, "logps/chosen": -1.0960800647735596, "logps/rejected": -1.3582054376602173, "loss": 1.0961, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0960800647735596, "rewards/margins": 0.2621254622936249, "rewards/rejected": -1.3582054376602173, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 5.169992910536041, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.2246054708957672, "logits/rejected": -0.0953652635216713, "logps/chosen": -1.214552879333496, "logps/rejected": -1.3581045866012573, "loss": 1.2146, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.214552879333496, "rewards/margins": 0.14355173707008362, "rewards/rejected": -1.3581045866012573, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 5.5844907752387325, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.1663421243429184, "logits/rejected": -0.13721568882465363, "logps/chosen": -1.2463988065719604, "logps/rejected": -1.3094186782836914, "loss": 1.2464, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2463988065719604, "rewards/margins": 0.06301993876695633, "rewards/rejected": -1.3094186782836914, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 4.881802016529063, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.16230200231075287, "logits/rejected": -0.07590650767087936, "logps/chosen": -1.1158279180526733, "logps/rejected": -1.300206184387207, "loss": 1.1158, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1158279180526733, "rewards/margins": 0.1843782216310501, "rewards/rejected": -1.300206184387207, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 5.747783362793702, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.16551540791988373, "logits/rejected": -0.10585223138332367, "logps/chosen": -1.1655288934707642, "logps/rejected": -1.3628321886062622, "loss": 1.1655, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1655288934707642, "rewards/margins": 0.19730353355407715, "rewards/rejected": -1.3628321886062622, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.07597178965806961, "eval_logits/rejected": 0.1431158185005188, "eval_logps/chosen": -1.2622076272964478, "eval_logps/rejected": -1.386366844177246, "eval_loss": 1.2624659538269043, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2622076272964478, "eval_rewards/margins": 0.12415922433137894, "eval_rewards/rejected": -1.386366844177246, "eval_runtime": 40.3873, "eval_samples_per_second": 33.303, "eval_steps_per_second": 8.344, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 1.2315016932184573, "train_runtime": 30313.5413, "train_samples_per_second": 5.917, "train_steps_per_second": 0.185 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }