{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992652461425422, "eval_steps": 100, "global_step": 765, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001306229079924892, "grad_norm": 5.53417315287533, "learning_rate": 6.493506493506494e-09, "logits/chosen": -0.7533285021781921, "logits/rejected": -0.8020980358123779, "logps/chosen": -306.6681823730469, "logps/rejected": -328.7090148925781, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.002612458159849784, "grad_norm": 5.368357576960736, "learning_rate": 1.2987012987012988e-08, "logits/chosen": -0.9330071210861206, "logits/rejected": -0.9560132026672363, "logps/chosen": -330.1785888671875, "logps/rejected": -321.1015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0039186872397746755, "grad_norm": 6.432629734255936, "learning_rate": 1.9480519480519478e-08, "logits/chosen": -0.7450475692749023, "logits/rejected": -0.7653542757034302, "logps/chosen": -332.4261779785156, "logps/rejected": -324.4579772949219, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0014441871317103505, "rewards/margins": 0.0007401347393169999, "rewards/rejected": -0.0021843216381967068, "step": 3 }, { "epoch": 0.005224916319699568, "grad_norm": 7.0047837477379895, "learning_rate": 2.5974025974025976e-08, "logits/chosen": -0.7317397594451904, "logits/rejected": -0.7434061169624329, "logps/chosen": -350.59619140625, "logps/rejected": -409.7134704589844, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0004230784543324262, "rewards/margins": 0.00048499589320272207, "rewards/rejected": -6.191732245497406e-05, "step": 4 }, { "epoch": 0.006531145399624459, "grad_norm": 6.366176631380578, "learning_rate": 3.246753246753246e-08, "logits/chosen": -0.680822491645813, "logits/rejected": -0.6706495881080627, "logps/chosen": -346.29913330078125, "logps/rejected": -405.939453125, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": 3.0529568903148174e-05, "rewards/margins": -0.0002498173853382468, "rewards/rejected": 0.00028034677961841226, "step": 5 }, { "epoch": 0.007837374479549351, "grad_norm": 5.6539667591600695, "learning_rate": 3.8961038961038956e-08, "logits/chosen": -0.8521619439125061, "logits/rejected": -0.8027850985527039, "logps/chosen": -337.5583190917969, "logps/rejected": -355.25732421875, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0013673019129782915, "rewards/margins": 0.0007385491626337171, "rewards/rejected": 0.0006287526921369135, "step": 6 }, { "epoch": 0.009143603559474243, "grad_norm": 5.560604956235579, "learning_rate": 4.545454545454545e-08, "logits/chosen": -0.7255852818489075, "logits/rejected": -0.7422863245010376, "logps/chosen": -319.12176513671875, "logps/rejected": -319.749267578125, "loss": 0.6933, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0008336210157722235, "rewards/margins": 0.001907060039229691, "rewards/rejected": -0.0010734390234574676, "step": 7 }, { "epoch": 0.010449832639399135, "grad_norm": 5.955487253622147, "learning_rate": 5.194805194805195e-08, "logits/chosen": -0.7017946243286133, "logits/rejected": -0.7843577265739441, "logps/chosen": -278.2752380371094, "logps/rejected": -350.0848388671875, "loss": 0.6932, "rewards/accuracies": 0.5625, "rewards/chosen": -0.001695701852440834, "rewards/margins": 0.000880406005308032, "rewards/rejected": -0.002576107857748866, "step": 8 }, { "epoch": 0.011756061719324026, "grad_norm": 6.006797093418932, "learning_rate": 5.844155844155844e-08, "logits/chosen": -0.6909061074256897, "logits/rejected": -0.6938877105712891, "logps/chosen": -325.1270751953125, "logps/rejected": -367.7559814453125, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0005253530107438564, "rewards/margins": -8.34012171253562e-05, "rewards/rejected": -0.00044195164809934795, "step": 9 }, { "epoch": 0.013062290799248918, "grad_norm": 5.748997143664564, "learning_rate": 6.493506493506492e-08, "logits/chosen": -0.8251830339431763, "logits/rejected": -0.8110605478286743, "logps/chosen": -347.2880859375, "logps/rejected": -325.6681823730469, "loss": 0.6927, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0017610812792554498, "rewards/margins": 0.0014071224723011255, "rewards/rejected": 0.0003539586905390024, "step": 10 }, { "epoch": 0.01436851987917381, "grad_norm": 5.904174312860514, "learning_rate": 7.142857142857142e-08, "logits/chosen": -0.9508811235427856, "logits/rejected": -0.9571268558502197, "logps/chosen": -372.4806213378906, "logps/rejected": -368.0173645019531, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": 0.001803419436328113, "rewards/margins": 0.0010235118679702282, "rewards/rejected": 0.0007799076847732067, "step": 11 }, { "epoch": 0.015674748959098702, "grad_norm": 5.508795346436636, "learning_rate": 7.792207792207791e-08, "logits/chosen": -0.825182318687439, "logits/rejected": -0.843744158744812, "logps/chosen": -314.36944580078125, "logps/rejected": -337.3401794433594, "loss": 0.6936, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00026215065736323595, "rewards/margins": -0.002879395382478833, "rewards/rejected": 0.0026172446087002754, "step": 12 }, { "epoch": 0.016980978039023594, "grad_norm": 5.203593985403037, "learning_rate": 8.441558441558441e-08, "logits/chosen": -0.8466988801956177, "logits/rejected": -0.8560088276863098, "logps/chosen": -377.123046875, "logps/rejected": -355.28460693359375, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -6.914167897775769e-06, "rewards/margins": -0.00111465435475111, "rewards/rejected": 0.0011077403323724866, "step": 13 }, { "epoch": 0.018287207118948486, "grad_norm": 4.714653883292506, "learning_rate": 9.09090909090909e-08, "logits/chosen": -0.6836709976196289, "logits/rejected": -0.6261293888092041, "logps/chosen": -336.0246276855469, "logps/rejected": -342.3045349121094, "loss": 0.6929, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00016624919953756034, "rewards/margins": 0.0006826830212958157, "rewards/rejected": -0.0008489321917295456, "step": 14 }, { "epoch": 0.01959343619887338, "grad_norm": 6.248100770125027, "learning_rate": 9.74025974025974e-08, "logits/chosen": -0.8154281973838806, "logits/rejected": -0.8204558491706848, "logps/chosen": -365.2437744140625, "logps/rejected": -361.426025390625, "loss": 0.693, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0007468414260074496, "rewards/margins": 0.0015008283080533147, "rewards/rejected": -0.0022476697340607643, "step": 15 }, { "epoch": 0.02089966527879827, "grad_norm": 5.297998460944868, "learning_rate": 1.038961038961039e-07, "logits/chosen": -0.817639172077179, "logits/rejected": -0.8031401634216309, "logps/chosen": -360.5960388183594, "logps/rejected": -339.3817138671875, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": -0.0016389607917517424, "rewards/margins": 0.001264180988073349, "rewards/rejected": -0.002903142012655735, "step": 16 }, { "epoch": 0.022205894358723163, "grad_norm": 6.012963273370913, "learning_rate": 1.1038961038961038e-07, "logits/chosen": -0.8308616876602173, "logits/rejected": -0.8376795053482056, "logps/chosen": -368.58013916015625, "logps/rejected": -363.4211730957031, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 5.0950038712471724e-05, "rewards/margins": 0.0007496691541746259, "rewards/rejected": -0.0006987190572544932, "step": 17 }, { "epoch": 0.02351212343864805, "grad_norm": 5.25011457146126, "learning_rate": 1.1688311688311688e-07, "logits/chosen": -0.8814838528633118, "logits/rejected": -0.9372345209121704, "logps/chosen": -336.8822937011719, "logps/rejected": -394.75213623046875, "loss": 0.6926, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0012537740403786302, "rewards/margins": 0.00014532334171235561, "rewards/rejected": -0.0013990971492603421, "step": 18 }, { "epoch": 0.024818352518572943, "grad_norm": 6.447271557198901, "learning_rate": 1.2337662337662337e-07, "logits/chosen": -0.8274356126785278, "logits/rejected": -0.8105829358100891, "logps/chosen": -400.1302490234375, "logps/rejected": -395.794677734375, "loss": 0.693, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0009712266619317234, "rewards/margins": -0.000335605232976377, "rewards/rejected": -0.0006356216035783291, "step": 19 }, { "epoch": 0.026124581598497836, "grad_norm": 5.223075966265418, "learning_rate": 1.2987012987012984e-07, "logits/chosen": -0.8048669099807739, "logits/rejected": -0.7924157381057739, "logps/chosen": -364.3175964355469, "logps/rejected": -345.0390625, "loss": 0.6931, "rewards/accuracies": 0.34375, "rewards/chosen": -0.002713401336222887, "rewards/margins": -0.0027847718447446823, "rewards/rejected": 7.137066859286278e-05, "step": 20 }, { "epoch": 0.027430810678422728, "grad_norm": 5.073866807536054, "learning_rate": 1.3636363636363635e-07, "logits/chosen": -0.710981011390686, "logits/rejected": -0.7615970373153687, "logps/chosen": -309.5833435058594, "logps/rejected": -340.5343322753906, "loss": 0.6935, "rewards/accuracies": 0.4375, "rewards/chosen": -0.002394826617091894, "rewards/margins": -0.0006454420508816838, "rewards/rejected": -0.0017493844497948885, "step": 21 }, { "epoch": 0.02873703975834762, "grad_norm": 5.890261968335633, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -0.861196756362915, "logits/rejected": -0.839537501335144, "logps/chosen": -328.5140075683594, "logps/rejected": -330.8149108886719, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.002065961482003331, "rewards/margins": 0.001079261302947998, "rewards/rejected": -0.0031452225521206856, "step": 22 }, { "epoch": 0.030043268838272512, "grad_norm": 6.2911479429566, "learning_rate": 1.4935064935064935e-07, "logits/chosen": -0.8220208883285522, "logits/rejected": -0.7974289655685425, "logps/chosen": -359.7161560058594, "logps/rejected": -353.8252868652344, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.002723624696955085, "rewards/margins": -0.0005458975210785866, "rewards/rejected": -0.0021777271758764982, "step": 23 }, { "epoch": 0.031349497918197404, "grad_norm": 5.635570219614167, "learning_rate": 1.5584415584415582e-07, "logits/chosen": -0.8430062532424927, "logits/rejected": -0.8447110652923584, "logps/chosen": -374.71612548828125, "logps/rejected": -394.64373779296875, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0024116707500070333, "rewards/margins": 0.001287098159082234, "rewards/rejected": -0.0036987685598433018, "step": 24 }, { "epoch": 0.032655726998122296, "grad_norm": 6.5390462612025635, "learning_rate": 1.6233766233766232e-07, "logits/chosen": -0.8247517943382263, "logits/rejected": -0.7864022254943848, "logps/chosen": -328.7938537597656, "logps/rejected": -388.9433288574219, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.002558019245043397, "rewards/margins": 0.0008082650601863861, "rewards/rejected": -0.0033662840723991394, "step": 25 }, { "epoch": 0.03396195607804719, "grad_norm": 5.607040791661334, "learning_rate": 1.6883116883116883e-07, "logits/chosen": -0.7527787685394287, "logits/rejected": -0.788528561592102, "logps/chosen": -371.24884033203125, "logps/rejected": -365.76025390625, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.002985129365697503, "rewards/margins": 0.0011370348511263728, "rewards/rejected": -0.004122164100408554, "step": 26 }, { "epoch": 0.03526818515797208, "grad_norm": 6.015200706987167, "learning_rate": 1.7532467532467533e-07, "logits/chosen": -0.7967982888221741, "logits/rejected": -0.7809445858001709, "logps/chosen": -316.6277160644531, "logps/rejected": -321.981201171875, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -0.0029974528588354588, "rewards/margins": 0.0033523414749652147, "rewards/rejected": -0.006349794566631317, "step": 27 }, { "epoch": 0.03657441423789697, "grad_norm": 5.313916762315249, "learning_rate": 1.818181818181818e-07, "logits/chosen": -0.7769748568534851, "logits/rejected": -0.7578511238098145, "logps/chosen": -289.66497802734375, "logps/rejected": -307.5926208496094, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": -0.0026655124966055155, "rewards/margins": 0.0017671298701316118, "rewards/rejected": -0.004432642366737127, "step": 28 }, { "epoch": 0.037880643317821865, "grad_norm": 6.03004406279561, "learning_rate": 1.883116883116883e-07, "logits/chosen": -0.7647740840911865, "logits/rejected": -0.7874799966812134, "logps/chosen": -355.2894592285156, "logps/rejected": -344.6125183105469, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00052436109399423, "rewards/margins": 0.0032959890086203814, "rewards/rejected": -0.0038203501608222723, "step": 29 }, { "epoch": 0.03918687239774676, "grad_norm": 5.689135947815352, "learning_rate": 1.948051948051948e-07, "logits/chosen": -0.8144919872283936, "logits/rejected": -0.790741503238678, "logps/chosen": -331.15130615234375, "logps/rejected": -396.5157470703125, "loss": 0.6919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0018128703813999891, "rewards/margins": 0.0031230710446834564, "rewards/rejected": -0.004935941658914089, "step": 30 }, { "epoch": 0.04049310147767165, "grad_norm": 5.393155495079252, "learning_rate": 2.012987012987013e-07, "logits/chosen": -0.7429512739181519, "logits/rejected": -0.7122764587402344, "logps/chosen": -338.0105895996094, "logps/rejected": -338.2264404296875, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": -0.003354718443006277, "rewards/margins": 0.004183335229754448, "rewards/rejected": -0.007538053207099438, "step": 31 }, { "epoch": 0.04179933055759654, "grad_norm": 5.668901019910231, "learning_rate": 2.077922077922078e-07, "logits/chosen": -0.8369336128234863, "logits/rejected": -0.8760527968406677, "logps/chosen": -320.0756530761719, "logps/rejected": -340.4056701660156, "loss": 0.6928, "rewards/accuracies": 0.46875, "rewards/chosen": -0.003008706495165825, "rewards/margins": -0.00027210236294195056, "rewards/rejected": -0.0027366040740162134, "step": 32 }, { "epoch": 0.04310555963752143, "grad_norm": 5.826830283729323, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -0.8723607063293457, "logits/rejected": -0.8672237396240234, "logps/chosen": -382.5314636230469, "logps/rejected": -384.20733642578125, "loss": 0.6915, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0007879233453422785, "rewards/margins": 0.004684963263571262, "rewards/rejected": -0.005472886376082897, "step": 33 }, { "epoch": 0.044411788717446325, "grad_norm": 5.158846671276125, "learning_rate": 2.2077922077922076e-07, "logits/chosen": -0.9048483967781067, "logits/rejected": -0.9139261841773987, "logps/chosen": -373.8707275390625, "logps/rejected": -382.60357666015625, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": -0.008929147385060787, "rewards/margins": 0.004028785042464733, "rewards/rejected": -0.01295793242752552, "step": 34 }, { "epoch": 0.04571801779737122, "grad_norm": 5.559939318000692, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -0.7409634590148926, "logits/rejected": -0.7191226482391357, "logps/chosen": -348.6598205566406, "logps/rejected": -328.2349548339844, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007357947528362274, "rewards/margins": 0.0007142568356357515, "rewards/rejected": -0.008072204887866974, "step": 35 }, { "epoch": 0.0470242468772961, "grad_norm": 9.154842170611309, "learning_rate": 2.3376623376623376e-07, "logits/chosen": -0.8227452635765076, "logits/rejected": -0.8269484639167786, "logps/chosen": -357.90045166015625, "logps/rejected": -402.8288269042969, "loss": 0.6908, "rewards/accuracies": 0.59375, "rewards/chosen": -0.009749974124133587, "rewards/margins": 0.0010247562313452363, "rewards/rejected": -0.010774729773402214, "step": 36 }, { "epoch": 0.048330475957220995, "grad_norm": 5.7031636677945015, "learning_rate": 2.4025974025974024e-07, "logits/chosen": -0.7661442756652832, "logits/rejected": -0.7485809326171875, "logps/chosen": -342.1812438964844, "logps/rejected": -372.6727294921875, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.008897833526134491, "rewards/margins": 0.0031514859292656183, "rewards/rejected": -0.01204932015389204, "step": 37 }, { "epoch": 0.04963670503714589, "grad_norm": 4.892465982370716, "learning_rate": 2.4675324675324674e-07, "logits/chosen": -0.7579087018966675, "logits/rejected": -0.7431960105895996, "logps/chosen": -325.8722229003906, "logps/rejected": -308.6358642578125, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": -0.004845607094466686, "rewards/margins": -5.3153024055063725e-05, "rewards/rejected": -0.004792453721165657, "step": 38 }, { "epoch": 0.05094293411707078, "grad_norm": 6.881014592563438, "learning_rate": 2.532467532467532e-07, "logits/chosen": -0.8480383157730103, "logits/rejected": -0.8337075710296631, "logps/chosen": -359.5284423828125, "logps/rejected": -349.55572509765625, "loss": 0.691, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007051127031445503, "rewards/margins": 0.004876017104834318, "rewards/rejected": -0.011927143670618534, "step": 39 }, { "epoch": 0.05224916319699567, "grad_norm": 5.493754973788018, "learning_rate": 2.597402597402597e-07, "logits/chosen": -0.8774313926696777, "logits/rejected": -0.8068287372589111, "logps/chosen": -361.03448486328125, "logps/rejected": -361.7033386230469, "loss": 0.6909, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0067117949947714806, "rewards/margins": 0.005268721375614405, "rewards/rejected": -0.011980515904724598, "step": 40 }, { "epoch": 0.05355539227692056, "grad_norm": 6.457786146530932, "learning_rate": 2.662337662337662e-07, "logits/chosen": -0.8075283765792847, "logits/rejected": -0.8117259740829468, "logps/chosen": -326.5600891113281, "logps/rejected": -381.46124267578125, "loss": 0.6901, "rewards/accuracies": 0.5625, "rewards/chosen": -0.010759281925857067, "rewards/margins": 0.00531811686232686, "rewards/rejected": -0.016077399253845215, "step": 41 }, { "epoch": 0.054861621356845455, "grad_norm": 5.567861196809157, "learning_rate": 2.727272727272727e-07, "logits/chosen": -0.8133203387260437, "logits/rejected": -0.7968176603317261, "logps/chosen": -356.2418212890625, "logps/rejected": -365.59423828125, "loss": 0.691, "rewards/accuracies": 0.625, "rewards/chosen": -0.007873620837926865, "rewards/margins": 0.004118986427783966, "rewards/rejected": -0.011992606334388256, "step": 42 }, { "epoch": 0.05616785043677035, "grad_norm": 7.195302776766147, "learning_rate": 2.792207792207792e-07, "logits/chosen": -0.6616829633712769, "logits/rejected": -0.7046526670455933, "logps/chosen": -355.0251770019531, "logps/rejected": -368.4796142578125, "loss": 0.6901, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005020341835916042, "rewards/margins": 0.0022294616792351007, "rewards/rejected": -0.007249803282320499, "step": 43 }, { "epoch": 0.05747407951669524, "grad_norm": 5.570002780795404, "learning_rate": 2.857142857142857e-07, "logits/chosen": -0.8667637705802917, "logits/rejected": -0.8768635988235474, "logps/chosen": -373.51416015625, "logps/rejected": -368.0703430175781, "loss": 0.6904, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017643045634031296, "rewards/margins": 0.008519397117197514, "rewards/rejected": -0.026162443682551384, "step": 44 }, { "epoch": 0.05878030859662013, "grad_norm": 6.762630790620927, "learning_rate": 2.922077922077922e-07, "logits/chosen": -0.8118594884872437, "logits/rejected": -0.8025345206260681, "logps/chosen": -397.09771728515625, "logps/rejected": -412.2569274902344, "loss": 0.6872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0086355684325099, "rewards/margins": 0.010753954760730267, "rewards/rejected": -0.019389525055885315, "step": 45 }, { "epoch": 0.060086537676545024, "grad_norm": 6.109227844723321, "learning_rate": 2.987012987012987e-07, "logits/chosen": -0.6671664714813232, "logits/rejected": -0.6598337888717651, "logps/chosen": -353.7411193847656, "logps/rejected": -359.293701171875, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014533436857163906, "rewards/margins": 0.0012778714299201965, "rewards/rejected": -0.015811307355761528, "step": 46 }, { "epoch": 0.061392766756469916, "grad_norm": 6.70995116363929, "learning_rate": 3.0519480519480515e-07, "logits/chosen": -0.7817015647888184, "logits/rejected": -0.7327536344528198, "logps/chosen": -389.1075744628906, "logps/rejected": -378.82916259765625, "loss": 0.6882, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011939141899347305, "rewards/margins": 0.013087021186947823, "rewards/rejected": -0.025026164948940277, "step": 47 }, { "epoch": 0.06269899583639481, "grad_norm": 5.945912831172837, "learning_rate": 3.1168831168831165e-07, "logits/chosen": -0.7871267795562744, "logits/rejected": -0.8593096733093262, "logps/chosen": -368.50732421875, "logps/rejected": -399.8733825683594, "loss": 0.6879, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016629451885819435, "rewards/margins": 0.009660568088293076, "rewards/rejected": -0.02629002183675766, "step": 48 }, { "epoch": 0.0640052249163197, "grad_norm": 5.761148234193876, "learning_rate": 3.1818181818181815e-07, "logits/chosen": -0.8592261075973511, "logits/rejected": -0.8339124917984009, "logps/chosen": -342.5096435546875, "logps/rejected": -364.2750244140625, "loss": 0.6882, "rewards/accuracies": 0.71875, "rewards/chosen": -0.012762600556015968, "rewards/margins": 0.007272784598171711, "rewards/rejected": -0.020035386085510254, "step": 49 }, { "epoch": 0.06531145399624459, "grad_norm": 6.025184839765024, "learning_rate": 3.2467532467532465e-07, "logits/chosen": -0.8174738883972168, "logits/rejected": -0.8106712102890015, "logps/chosen": -361.9267578125, "logps/rejected": -387.04803466796875, "loss": 0.6863, "rewards/accuracies": 0.65625, "rewards/chosen": -0.014780733734369278, "rewards/margins": 0.008798524737358093, "rewards/rejected": -0.02357925847172737, "step": 50 }, { "epoch": 0.06661768307616948, "grad_norm": 6.1254838917404975, "learning_rate": 3.3116883116883115e-07, "logits/chosen": -0.8324989676475525, "logits/rejected": -0.858439564704895, "logps/chosen": -397.22735595703125, "logps/rejected": -415.16827392578125, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": -0.022905468940734863, "rewards/margins": 0.007425696589052677, "rewards/rejected": -0.030331166461110115, "step": 51 }, { "epoch": 0.06792391215609438, "grad_norm": 5.1791059625450835, "learning_rate": 3.3766233766233765e-07, "logits/chosen": -0.7436783909797668, "logits/rejected": -0.6987237930297852, "logps/chosen": -337.4371032714844, "logps/rejected": -393.50238037109375, "loss": 0.6894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.015715861693024635, "rewards/margins": 0.009158104658126831, "rewards/rejected": -0.024873966351151466, "step": 52 }, { "epoch": 0.06923014123601927, "grad_norm": 5.593768325173036, "learning_rate": 3.4415584415584415e-07, "logits/chosen": -0.8788232207298279, "logits/rejected": -0.8599483966827393, "logps/chosen": -354.47039794921875, "logps/rejected": -345.22979736328125, "loss": 0.6863, "rewards/accuracies": 0.625, "rewards/chosen": -0.02064862847328186, "rewards/margins": 0.010548464953899384, "rewards/rejected": -0.031197093427181244, "step": 53 }, { "epoch": 0.07053637031594416, "grad_norm": 6.190297216841535, "learning_rate": 3.5064935064935066e-07, "logits/chosen": -0.8456141948699951, "logits/rejected": -0.8253046870231628, "logps/chosen": -354.0450439453125, "logps/rejected": -398.3966369628906, "loss": 0.6877, "rewards/accuracies": 0.71875, "rewards/chosen": -0.022301586344838142, "rewards/margins": 0.011066760867834091, "rewards/rejected": -0.033368345350027084, "step": 54 }, { "epoch": 0.07184259939586905, "grad_norm": 6.081492793618372, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -0.8616874814033508, "logits/rejected": -0.8373209834098816, "logps/chosen": -374.4599914550781, "logps/rejected": -398.0825500488281, "loss": 0.6842, "rewards/accuracies": 0.71875, "rewards/chosen": -0.028480036184191704, "rewards/margins": 0.02368302457034588, "rewards/rejected": -0.05216306075453758, "step": 55 }, { "epoch": 0.07314882847579394, "grad_norm": 5.829629571122562, "learning_rate": 3.636363636363636e-07, "logits/chosen": -0.7987987399101257, "logits/rejected": -0.7420557737350464, "logps/chosen": -336.6307067871094, "logps/rejected": -336.77435302734375, "loss": 0.685, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0241280198097229, "rewards/margins": 0.024407856166362762, "rewards/rejected": -0.04853588342666626, "step": 56 }, { "epoch": 0.07445505755571884, "grad_norm": 7.817816498473776, "learning_rate": 3.701298701298701e-07, "logits/chosen": -0.7835097312927246, "logits/rejected": -0.8163594007492065, "logps/chosen": -323.8734436035156, "logps/rejected": -351.6636047363281, "loss": 0.682, "rewards/accuracies": 0.78125, "rewards/chosen": -0.013543693348765373, "rewards/margins": 0.026210352778434753, "rewards/rejected": -0.03975404426455498, "step": 57 }, { "epoch": 0.07576128663564373, "grad_norm": 5.328210622720547, "learning_rate": 3.766233766233766e-07, "logits/chosen": -0.864203691482544, "logits/rejected": -0.8562058806419373, "logps/chosen": -342.7728271484375, "logps/rejected": -363.4197692871094, "loss": 0.6852, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0332660973072052, "rewards/margins": 0.01524202898144722, "rewards/rejected": -0.04850813001394272, "step": 58 }, { "epoch": 0.07706751571556862, "grad_norm": 5.834247719910111, "learning_rate": 3.831168831168831e-07, "logits/chosen": -0.958035945892334, "logits/rejected": -0.9945738315582275, "logps/chosen": -376.7722473144531, "logps/rejected": -405.8808288574219, "loss": 0.6835, "rewards/accuracies": 0.65625, "rewards/chosen": -0.033054254949092865, "rewards/margins": 0.016461899504065514, "rewards/rejected": -0.04951614886522293, "step": 59 }, { "epoch": 0.07837374479549351, "grad_norm": 5.941158721215624, "learning_rate": 3.896103896103896e-07, "logits/chosen": -0.8537582159042358, "logits/rejected": -0.8429221510887146, "logps/chosen": -367.89715576171875, "logps/rejected": -388.77142333984375, "loss": 0.6804, "rewards/accuracies": 0.75, "rewards/chosen": -0.024989962577819824, "rewards/margins": 0.03262363001704216, "rewards/rejected": -0.05761359632015228, "step": 60 }, { "epoch": 0.0796799738754184, "grad_norm": 5.888918338304251, "learning_rate": 3.961038961038961e-07, "logits/chosen": -0.8765754699707031, "logits/rejected": -0.8787387609481812, "logps/chosen": -344.7409973144531, "logps/rejected": -372.68890380859375, "loss": 0.6814, "rewards/accuracies": 0.71875, "rewards/chosen": -0.040865350514650345, "rewards/margins": 0.035499751567840576, "rewards/rejected": -0.07636509835720062, "step": 61 }, { "epoch": 0.0809862029553433, "grad_norm": 6.390786373294418, "learning_rate": 4.025974025974026e-07, "logits/chosen": -0.8274524211883545, "logits/rejected": -0.7997250556945801, "logps/chosen": -365.7718200683594, "logps/rejected": -375.0115661621094, "loss": 0.6842, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04738666117191315, "rewards/margins": 0.021708115935325623, "rewards/rejected": -0.06909477710723877, "step": 62 }, { "epoch": 0.08229243203526819, "grad_norm": 6.591255688502251, "learning_rate": 4.090909090909091e-07, "logits/chosen": -0.9231469035148621, "logits/rejected": -0.9118146300315857, "logps/chosen": -400.7498779296875, "logps/rejected": -383.37457275390625, "loss": 0.6779, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0491667315363884, "rewards/margins": 0.02765682339668274, "rewards/rejected": -0.07682356238365173, "step": 63 }, { "epoch": 0.08359866111519308, "grad_norm": 6.116625419008146, "learning_rate": 4.155844155844156e-07, "logits/chosen": -0.8560450077056885, "logits/rejected": -0.8375284075737, "logps/chosen": -407.3421630859375, "logps/rejected": -427.3675231933594, "loss": 0.6778, "rewards/accuracies": 0.75, "rewards/chosen": -0.06571590155363083, "rewards/margins": 0.03016790747642517, "rewards/rejected": -0.095883809030056, "step": 64 }, { "epoch": 0.08490489019511797, "grad_norm": 6.560476520141482, "learning_rate": 4.22077922077922e-07, "logits/chosen": -0.6509677767753601, "logits/rejected": -0.6574783325195312, "logps/chosen": -342.76953125, "logps/rejected": -382.87091064453125, "loss": 0.6751, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06003749370574951, "rewards/margins": 0.036659955978393555, "rewards/rejected": -0.09669744968414307, "step": 65 }, { "epoch": 0.08621111927504287, "grad_norm": 7.017591808922279, "learning_rate": 4.285714285714285e-07, "logits/chosen": -0.8858566284179688, "logits/rejected": -0.9486796855926514, "logps/chosen": -410.815673828125, "logps/rejected": -477.1343078613281, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": -0.048973143100738525, "rewards/margins": 0.07083471864461899, "rewards/rejected": -0.11980785429477692, "step": 66 }, { "epoch": 0.08751734835496776, "grad_norm": 6.342251911458394, "learning_rate": 4.35064935064935e-07, "logits/chosen": -0.7848328948020935, "logits/rejected": -0.8182150721549988, "logps/chosen": -309.9764099121094, "logps/rejected": -375.5473937988281, "loss": 0.671, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06594257056713104, "rewards/margins": 0.06243371590971947, "rewards/rejected": -0.1283762902021408, "step": 67 }, { "epoch": 0.08882357743489265, "grad_norm": 8.15248197984531, "learning_rate": 4.415584415584415e-07, "logits/chosen": -0.8365911245346069, "logits/rejected": -0.8488500118255615, "logps/chosen": -359.4619140625, "logps/rejected": -402.00555419921875, "loss": 0.6739, "rewards/accuracies": 0.75, "rewards/chosen": -0.07775245606899261, "rewards/margins": 0.059413328766822815, "rewards/rejected": -0.13716578483581543, "step": 68 }, { "epoch": 0.09012980651481754, "grad_norm": 6.142817657239046, "learning_rate": 4.48051948051948e-07, "logits/chosen": -0.6473894119262695, "logits/rejected": -0.6304734349250793, "logps/chosen": -354.01153564453125, "logps/rejected": -414.6365661621094, "loss": 0.6733, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07209914922714233, "rewards/margins": 0.0502709299325943, "rewards/rejected": -0.12237009406089783, "step": 69 }, { "epoch": 0.09143603559474243, "grad_norm": 6.086995660494217, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.9957101941108704, "logits/rejected": -0.9663684368133545, "logps/chosen": -380.725341796875, "logps/rejected": -373.6209716796875, "loss": 0.6734, "rewards/accuracies": 0.75, "rewards/chosen": -0.12014191597700119, "rewards/margins": 0.0520767904818058, "rewards/rejected": -0.1722187101840973, "step": 70 }, { "epoch": 0.09274226467466731, "grad_norm": 5.584922482955073, "learning_rate": 4.61038961038961e-07, "logits/chosen": -0.9089574813842773, "logits/rejected": -0.8867455720901489, "logps/chosen": -350.91229248046875, "logps/rejected": -343.4925842285156, "loss": 0.6709, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11282657086849213, "rewards/margins": 0.05241067707538605, "rewards/rejected": -0.16523723304271698, "step": 71 }, { "epoch": 0.0940484937545922, "grad_norm": 6.927477029011778, "learning_rate": 4.675324675324675e-07, "logits/chosen": -0.7093459963798523, "logits/rejected": -0.7499104142189026, "logps/chosen": -345.0081787109375, "logps/rejected": -417.9425048828125, "loss": 0.6599, "rewards/accuracies": 0.84375, "rewards/chosen": -0.09345806390047073, "rewards/margins": 0.07576846331357956, "rewards/rejected": -0.1692265421152115, "step": 72 }, { "epoch": 0.0953547228345171, "grad_norm": 6.702250099967103, "learning_rate": 4.7402597402597397e-07, "logits/chosen": -0.818958044052124, "logits/rejected": -0.8174691200256348, "logps/chosen": -404.3327941894531, "logps/rejected": -415.90478515625, "loss": 0.6667, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12455743551254272, "rewards/margins": 0.05297121778130531, "rewards/rejected": -0.17752866446971893, "step": 73 }, { "epoch": 0.09666095191444199, "grad_norm": 6.092144279530064, "learning_rate": 4.805194805194805e-07, "logits/chosen": -0.7657275795936584, "logits/rejected": -0.7777887582778931, "logps/chosen": -379.3691711425781, "logps/rejected": -428.69580078125, "loss": 0.6674, "rewards/accuracies": 0.75, "rewards/chosen": -0.1482219696044922, "rewards/margins": 0.06344159692525864, "rewards/rejected": -0.21166357398033142, "step": 74 }, { "epoch": 0.09796718099436688, "grad_norm": 6.813649848962726, "learning_rate": 4.87012987012987e-07, "logits/chosen": -0.7140544652938843, "logits/rejected": -0.7011992931365967, "logps/chosen": -348.525390625, "logps/rejected": -352.27520751953125, "loss": 0.6522, "rewards/accuracies": 0.75, "rewards/chosen": -0.16398543119430542, "rewards/margins": 0.07341369986534119, "rewards/rejected": -0.23739910125732422, "step": 75 }, { "epoch": 0.09927341007429177, "grad_norm": 5.924445401824014, "learning_rate": 4.935064935064935e-07, "logits/chosen": -0.860354483127594, "logits/rejected": -0.8541557788848877, "logps/chosen": -377.2205810546875, "logps/rejected": -426.15020751953125, "loss": 0.6664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19610106945037842, "rewards/margins": 0.06874527782201767, "rewards/rejected": -0.2648463547229767, "step": 76 }, { "epoch": 0.10057963915421667, "grad_norm": 6.276805637384467, "learning_rate": 5e-07, "logits/chosen": -0.7837976217269897, "logits/rejected": -0.8085183501243591, "logps/chosen": -337.7972106933594, "logps/rejected": -410.7466735839844, "loss": 0.6488, "rewards/accuracies": 0.75, "rewards/chosen": -0.18771220743656158, "rewards/margins": 0.13911239802837372, "rewards/rejected": -0.3268246054649353, "step": 77 }, { "epoch": 0.10188586823414156, "grad_norm": 7.1132775197267835, "learning_rate": 4.999973936536504e-07, "logits/chosen": -0.8305960297584534, "logits/rejected": -0.870134711265564, "logps/chosen": -373.0445556640625, "logps/rejected": -421.777587890625, "loss": 0.6519, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20363560318946838, "rewards/margins": 0.08339838683605194, "rewards/rejected": -0.2870340049266815, "step": 78 }, { "epoch": 0.10319209731406645, "grad_norm": 7.439532614597457, "learning_rate": 4.99989574668946e-07, "logits/chosen": -0.66015625, "logits/rejected": -0.6870557069778442, "logps/chosen": -347.6210632324219, "logps/rejected": -369.9632568359375, "loss": 0.6488, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2359056919813156, "rewards/margins": 0.1219453290104866, "rewards/rejected": -0.3578509986400604, "step": 79 }, { "epoch": 0.10449832639399134, "grad_norm": 7.430436150693382, "learning_rate": 4.999765432089186e-07, "logits/chosen": -0.9603585600852966, "logits/rejected": -0.9237443208694458, "logps/chosen": -396.7093200683594, "logps/rejected": -388.0702819824219, "loss": 0.6565, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2579508423805237, "rewards/margins": 0.1478283703327179, "rewards/rejected": -0.4057792127132416, "step": 80 }, { "epoch": 0.10580455547391623, "grad_norm": 7.07421240398173, "learning_rate": 4.999582995452841e-07, "logits/chosen": -0.9165019989013672, "logits/rejected": -0.918879508972168, "logps/chosen": -442.6611328125, "logps/rejected": -442.1038513183594, "loss": 0.6375, "rewards/accuracies": 0.53125, "rewards/chosen": -0.3471214771270752, "rewards/margins": 0.2095186412334442, "rewards/rejected": -0.5566401481628418, "step": 81 }, { "epoch": 0.10711078455384113, "grad_norm": 7.473916645229775, "learning_rate": 4.999348440584371e-07, "logits/chosen": -0.831704318523407, "logits/rejected": -0.8036379814147949, "logps/chosen": -387.5152282714844, "logps/rejected": -406.57977294921875, "loss": 0.6279, "rewards/accuracies": 0.625, "rewards/chosen": -0.3682708144187927, "rewards/margins": 0.06527732312679291, "rewards/rejected": -0.43354812264442444, "step": 82 }, { "epoch": 0.10841701363376602, "grad_norm": 6.758897060359514, "learning_rate": 4.999061772374425e-07, "logits/chosen": -0.8267980217933655, "logits/rejected": -0.8135631084442139, "logps/chosen": -360.9224853515625, "logps/rejected": -410.9493103027344, "loss": 0.6362, "rewards/accuracies": 0.65625, "rewards/chosen": -0.31843888759613037, "rewards/margins": 0.1723370999097824, "rewards/rejected": -0.490776002407074, "step": 83 }, { "epoch": 0.10972324271369091, "grad_norm": 7.324188272112449, "learning_rate": 4.998722996800258e-07, "logits/chosen": -0.893416166305542, "logits/rejected": -0.8873583674430847, "logps/chosen": -422.30242919921875, "logps/rejected": -506.13873291015625, "loss": 0.6302, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37821486592292786, "rewards/margins": 0.3631640672683716, "rewards/rejected": -0.7413789629936218, "step": 84 }, { "epoch": 0.1110294717936158, "grad_norm": 7.071494771750178, "learning_rate": 4.998332120925598e-07, "logits/chosen": -0.8114030957221985, "logits/rejected": -0.8294497132301331, "logps/chosen": -377.10308837890625, "logps/rejected": -420.3814392089844, "loss": 0.6317, "rewards/accuracies": 0.75, "rewards/chosen": -0.3377629518508911, "rewards/margins": 0.20530584454536438, "rewards/rejected": -0.5430688261985779, "step": 85 }, { "epoch": 0.1123357008735407, "grad_norm": 7.702160752771187, "learning_rate": 4.997889152900512e-07, "logits/chosen": -0.8865103125572205, "logits/rejected": -0.8853594660758972, "logps/chosen": -379.38360595703125, "logps/rejected": -412.70294189453125, "loss": 0.6422, "rewards/accuracies": 0.75, "rewards/chosen": -0.43963319063186646, "rewards/margins": 0.18820373713970184, "rewards/rejected": -0.6278368830680847, "step": 86 }, { "epoch": 0.11364192995346559, "grad_norm": 7.899877403482623, "learning_rate": 4.997394101961223e-07, "logits/chosen": -0.738101601600647, "logits/rejected": -0.7562313079833984, "logps/chosen": -445.0351257324219, "logps/rejected": -553.4995727539062, "loss": 0.5974, "rewards/accuracies": 0.75, "rewards/chosen": -0.5740631222724915, "rewards/margins": 0.38850170373916626, "rewards/rejected": -0.9625648856163025, "step": 87 }, { "epoch": 0.11494815903339048, "grad_norm": 7.056492791536779, "learning_rate": 4.996846978429924e-07, "logits/chosen": -0.8737019896507263, "logits/rejected": -0.9100275039672852, "logps/chosen": -349.7705383300781, "logps/rejected": -389.4776611328125, "loss": 0.6083, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39956849813461304, "rewards/margins": 0.17830373346805573, "rewards/rejected": -0.5778722167015076, "step": 88 }, { "epoch": 0.11625438811331537, "grad_norm": 7.42785869199897, "learning_rate": 4.996247793714564e-07, "logits/chosen": -0.8866239786148071, "logits/rejected": -0.8441641330718994, "logps/chosen": -401.234130859375, "logps/rejected": -407.03680419921875, "loss": 0.6002, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5768677592277527, "rewards/margins": 0.22373563051223755, "rewards/rejected": -0.8006033897399902, "step": 89 }, { "epoch": 0.11756061719324026, "grad_norm": 7.132596880162971, "learning_rate": 4.995596560308606e-07, "logits/chosen": -0.9549114108085632, "logits/rejected": -0.9279428720474243, "logps/chosen": -373.385009765625, "logps/rejected": -369.7181091308594, "loss": 0.6257, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5690935850143433, "rewards/margins": 0.09207704663276672, "rewards/rejected": -0.6611706614494324, "step": 90 }, { "epoch": 0.11886684627316516, "grad_norm": 8.167268203168634, "learning_rate": 4.994893291790767e-07, "logits/chosen": -0.8127573132514954, "logits/rejected": -0.857043981552124, "logps/chosen": -386.78863525390625, "logps/rejected": -473.54925537109375, "loss": 0.6402, "rewards/accuracies": 0.625, "rewards/chosen": -0.6578102707862854, "rewards/margins": 0.35495108366012573, "rewards/rejected": -1.0127613544464111, "step": 91 }, { "epoch": 0.12017307535309005, "grad_norm": 8.536222808457058, "learning_rate": 4.99413800282474e-07, "logits/chosen": -0.6337046027183533, "logits/rejected": -0.6360065937042236, "logps/chosen": -335.1719055175781, "logps/rejected": -412.2095031738281, "loss": 0.6204, "rewards/accuracies": 0.78125, "rewards/chosen": -0.596889078617096, "rewards/margins": 0.350045770406723, "rewards/rejected": -0.9469348192214966, "step": 92 }, { "epoch": 0.12147930443301494, "grad_norm": 7.4400186544457485, "learning_rate": 4.993330709158879e-07, "logits/chosen": -0.9430676698684692, "logits/rejected": -0.9217436909675598, "logps/chosen": -443.1399230957031, "logps/rejected": -472.9926452636719, "loss": 0.6013, "rewards/accuracies": 0.8125, "rewards/chosen": -0.693689227104187, "rewards/margins": 0.39686375856399536, "rewards/rejected": -1.0905530452728271, "step": 93 }, { "epoch": 0.12278553351293983, "grad_norm": 7.040196818071325, "learning_rate": 4.992471427625881e-07, "logits/chosen": -0.9236155152320862, "logits/rejected": -0.9282536506652832, "logps/chosen": -437.7912292480469, "logps/rejected": -467.1312255859375, "loss": 0.5769, "rewards/accuracies": 0.75, "rewards/chosen": -0.8054531216621399, "rewards/margins": 0.33481860160827637, "rewards/rejected": -1.1402716636657715, "step": 94 }, { "epoch": 0.12409176259286472, "grad_norm": 7.456887694852602, "learning_rate": 4.99156017614243e-07, "logits/chosen": -0.8159753680229187, "logits/rejected": -0.8799095749855042, "logps/chosen": -410.9903259277344, "logps/rejected": -498.1451416015625, "loss": 0.5744, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7265411019325256, "rewards/margins": 0.4974522292613983, "rewards/rejected": -1.2239933013916016, "step": 95 }, { "epoch": 0.12539799167278962, "grad_norm": 9.486122843338816, "learning_rate": 4.990596973708818e-07, "logits/chosen": -0.9695833325386047, "logits/rejected": -0.9918654561042786, "logps/chosen": -425.7803649902344, "logps/rejected": -433.4793701171875, "loss": 0.601, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8133587837219238, "rewards/margins": 0.20483535528182983, "rewards/rejected": -1.0181939601898193, "step": 96 }, { "epoch": 0.1267042207527145, "grad_norm": 7.446348571195673, "learning_rate": 4.989581840408562e-07, "logits/chosen": -0.8435705900192261, "logits/rejected": -0.8851275444030762, "logps/chosen": -423.5701904296875, "logps/rejected": -482.48553466796875, "loss": 0.5832, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8618156909942627, "rewards/margins": 0.42404988408088684, "rewards/rejected": -1.2858655452728271, "step": 97 }, { "epoch": 0.1280104498326394, "grad_norm": 7.488734645813898, "learning_rate": 4.988514797407971e-07, "logits/chosen": -0.8705964088439941, "logits/rejected": -0.8357868790626526, "logps/chosen": -428.7393798828125, "logps/rejected": -447.38116455078125, "loss": 0.584, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8960607647895813, "rewards/margins": 0.2598303556442261, "rewards/rejected": -1.1558911800384521, "step": 98 }, { "epoch": 0.1293166789125643, "grad_norm": 8.76021914794472, "learning_rate": 4.987395866955715e-07, "logits/chosen": -0.8333250284194946, "logits/rejected": -0.7638567686080933, "logps/chosen": -388.22039794921875, "logps/rejected": -438.1500244140625, "loss": 0.6623, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0235893726348877, "rewards/margins": 0.34661492705345154, "rewards/rejected": -1.3702044486999512, "step": 99 }, { "epoch": 0.13062290799248918, "grad_norm": 7.857957421669487, "learning_rate": 4.986225072382356e-07, "logits/chosen": -0.8902202844619751, "logits/rejected": -0.9488434791564941, "logps/chosen": -463.49908447265625, "logps/rejected": -563.2487182617188, "loss": 0.5603, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0943914651870728, "rewards/margins": 0.6192384958267212, "rewards/rejected": -1.713630199432373, "step": 100 }, { "epoch": 0.13062290799248918, "eval_logits/chosen": -0.72847580909729, "eval_logits/rejected": -0.7241373062133789, "eval_logps/chosen": -452.3144836425781, "eval_logps/rejected": -505.3885192871094, "eval_loss": 0.5761846303939819, "eval_rewards/accuracies": 0.7620000243186951, "eval_rewards/chosen": -1.0828338861465454, "eval_rewards/margins": 0.4697641432285309, "eval_rewards/rejected": -1.5525977611541748, "eval_runtime": 304.0817, "eval_samples_per_second": 6.577, "eval_steps_per_second": 0.411, "step": 100 }, { "epoch": 0.13192913707241408, "grad_norm": 7.635265083517575, "learning_rate": 4.985002438099865e-07, "logits/chosen": -0.9624214768409729, "logits/rejected": -0.8964027166366577, "logps/chosen": -556.7987060546875, "logps/rejected": -576.9541015625, "loss": 0.5888, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1731356382369995, "rewards/margins": 0.6622936725616455, "rewards/rejected": -1.8354291915893555, "step": 101 }, { "epoch": 0.13323536615233897, "grad_norm": 7.247392624675164, "learning_rate": 4.983727989601106e-07, "logits/chosen": -1.0717601776123047, "logits/rejected": -1.0006539821624756, "logps/chosen": -481.7669677734375, "logps/rejected": -495.4006042480469, "loss": 0.5706, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2065238952636719, "rewards/margins": 0.4556655287742615, "rewards/rejected": -1.6621893644332886, "step": 102 }, { "epoch": 0.13454159523226386, "grad_norm": 7.552359919662666, "learning_rate": 4.982401753459316e-07, "logits/chosen": -0.8172162175178528, "logits/rejected": -0.8234974145889282, "logps/chosen": -443.9198303222656, "logps/rejected": -477.24822998046875, "loss": 0.5737, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0775035619735718, "rewards/margins": 0.36101657152175903, "rewards/rejected": -1.4385201930999756, "step": 103 }, { "epoch": 0.13584782431218875, "grad_norm": 7.922818559159025, "learning_rate": 4.981023757327539e-07, "logits/chosen": -0.8530606031417847, "logits/rejected": -0.9096869230270386, "logps/chosen": -455.50677490234375, "logps/rejected": -595.2265625, "loss": 0.5405, "rewards/accuracies": 0.75, "rewards/chosen": -1.1649234294891357, "rewards/margins": 0.7843038439750671, "rewards/rejected": -1.9492273330688477, "step": 104 }, { "epoch": 0.13715405339211365, "grad_norm": 8.198566814604895, "learning_rate": 4.979594029938057e-07, "logits/chosen": -0.9811791181564331, "logits/rejected": -0.9687495231628418, "logps/chosen": -421.53704833984375, "logps/rejected": -469.3525390625, "loss": 0.5213, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1149628162384033, "rewards/margins": 0.4379804730415344, "rewards/rejected": -1.552943229675293, "step": 105 }, { "epoch": 0.13846028247203854, "grad_norm": 9.074632820163128, "learning_rate": 4.978112601101787e-07, "logits/chosen": -0.8650058507919312, "logits/rejected": -0.7921556234359741, "logps/chosen": -453.47271728515625, "logps/rejected": -440.36822509765625, "loss": 0.5685, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2446175813674927, "rewards/margins": 0.22569072246551514, "rewards/rejected": -1.4703084230422974, "step": 106 }, { "epoch": 0.13976651155196343, "grad_norm": 8.97067518042141, "learning_rate": 4.976579501707664e-07, "logits/chosen": -0.9297839403152466, "logits/rejected": -0.9460043907165527, "logps/chosen": -449.9891662597656, "logps/rejected": -488.8280334472656, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": -1.3405613899230957, "rewards/margins": 0.4242904484272003, "rewards/rejected": -1.7648519277572632, "step": 107 }, { "epoch": 0.14107274063188832, "grad_norm": 9.517296734066722, "learning_rate": 4.97499476372199e-07, "logits/chosen": -0.889796257019043, "logits/rejected": -0.9328855276107788, "logps/chosen": -496.8541564941406, "logps/rejected": -586.541015625, "loss": 0.5356, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3333938121795654, "rewards/margins": 0.5474119186401367, "rewards/rejected": -1.8808057308197021, "step": 108 }, { "epoch": 0.1423789697118132, "grad_norm": 10.357543065107334, "learning_rate": 4.973358420187775e-07, "logits/chosen": -0.9294272065162659, "logits/rejected": -0.9437241554260254, "logps/chosen": -470.52130126953125, "logps/rejected": -586.981689453125, "loss": 0.5847, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3664886951446533, "rewards/margins": 0.8229946494102478, "rewards/rejected": -2.189483165740967, "step": 109 }, { "epoch": 0.1436851987917381, "grad_norm": 9.267300039036453, "learning_rate": 4.971670505224043e-07, "logits/chosen": -0.6836638450622559, "logits/rejected": -0.7107027173042297, "logps/chosen": -498.22015380859375, "logps/rejected": -625.4920043945312, "loss": 0.5078, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5284080505371094, "rewards/margins": 0.881400465965271, "rewards/rejected": -2.409808397293091, "step": 110 }, { "epoch": 0.144991427871663, "grad_norm": 9.927126856410316, "learning_rate": 4.969931054025121e-07, "logits/chosen": -0.8554806709289551, "logits/rejected": -0.8764801025390625, "logps/chosen": -453.3316345214844, "logps/rejected": -546.2518310546875, "loss": 0.581, "rewards/accuracies": 0.71875, "rewards/chosen": -1.422972559928894, "rewards/margins": 0.7001107335090637, "rewards/rejected": -2.1230833530426025, "step": 111 }, { "epoch": 0.1462976569515879, "grad_norm": 8.326539836612254, "learning_rate": 4.968140102859908e-07, "logits/chosen": -0.8901181817054749, "logits/rejected": -0.8997009992599487, "logps/chosen": -451.3870849609375, "logps/rejected": -563.731201171875, "loss": 0.5226, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4622441530227661, "rewards/margins": 0.8325024843215942, "rewards/rejected": -2.2947468757629395, "step": 112 }, { "epoch": 0.14760388603151278, "grad_norm": 12.633047907372701, "learning_rate": 4.966297689071116e-07, "logits/chosen": -0.8476269841194153, "logits/rejected": -0.9035442471504211, "logps/chosen": -531.0331420898438, "logps/rejected": -632.9279174804688, "loss": 0.5371, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5847188234329224, "rewards/margins": 0.6562604904174805, "rewards/rejected": -2.2409791946411133, "step": 113 }, { "epoch": 0.14891011511143767, "grad_norm": 9.252739358001087, "learning_rate": 4.964403851074493e-07, "logits/chosen": -0.9159296154975891, "logits/rejected": -0.9023479223251343, "logps/chosen": -538.9677124023438, "logps/rejected": -565.4589233398438, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": -1.9100488424301147, "rewards/margins": 0.4137873351573944, "rewards/rejected": -2.323836088180542, "step": 114 }, { "epoch": 0.15021634419136257, "grad_norm": 8.76761591596254, "learning_rate": 4.962458628358021e-07, "logits/chosen": -0.758056640625, "logits/rejected": -0.795865535736084, "logps/chosen": -458.5084228515625, "logps/rejected": -566.1270141601562, "loss": 0.5715, "rewards/accuracies": 0.75, "rewards/chosen": -1.5470147132873535, "rewards/margins": 0.7053811550140381, "rewards/rejected": -2.2523956298828125, "step": 115 }, { "epoch": 0.15152257327128746, "grad_norm": 8.875080902313666, "learning_rate": 4.960462061481092e-07, "logits/chosen": -0.8292801976203918, "logits/rejected": -0.8678185939788818, "logps/chosen": -449.1661376953125, "logps/rejected": -524.7813110351562, "loss": 0.5424, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5658118724822998, "rewards/margins": 0.7005196809768677, "rewards/rejected": -2.266331434249878, "step": 116 }, { "epoch": 0.15282880235121235, "grad_norm": 9.014607302875838, "learning_rate": 4.958414192073665e-07, "logits/chosen": -0.9082063436508179, "logits/rejected": -0.8753992915153503, "logps/chosen": -563.1096801757812, "logps/rejected": -673.3067016601562, "loss": 0.5354, "rewards/accuracies": 0.75, "rewards/chosen": -1.8120057582855225, "rewards/margins": 0.3947896659374237, "rewards/rejected": -2.2067954540252686, "step": 117 }, { "epoch": 0.15413503143113724, "grad_norm": 11.06100420906947, "learning_rate": 4.956315062835396e-07, "logits/chosen": -0.8366618156433105, "logits/rejected": -0.8464934229850769, "logps/chosen": -440.1483154296875, "logps/rejected": -477.4801025390625, "loss": 0.5435, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5700161457061768, "rewards/margins": 0.3956853747367859, "rewards/rejected": -1.9657015800476074, "step": 118 }, { "epoch": 0.15544126051106213, "grad_norm": 12.513622831753462, "learning_rate": 4.954164717534748e-07, "logits/chosen": -0.8261559009552002, "logits/rejected": -0.7612857222557068, "logps/chosen": -460.90472412109375, "logps/rejected": -490.856201171875, "loss": 0.5343, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5897612571716309, "rewards/margins": 0.3713700473308563, "rewards/rejected": -1.9611313343048096, "step": 119 }, { "epoch": 0.15674748959098703, "grad_norm": 8.160394952244957, "learning_rate": 4.951963201008075e-07, "logits/chosen": -0.8434547185897827, "logits/rejected": -0.8719228506088257, "logps/chosen": -502.63629150390625, "logps/rejected": -562.45654296875, "loss": 0.5379, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5950428247451782, "rewards/margins": 0.6352701187133789, "rewards/rejected": -2.2303130626678467, "step": 120 }, { "epoch": 0.15805371867091192, "grad_norm": 8.672085140532355, "learning_rate": 4.949710559158699e-07, "logits/chosen": -0.9285653829574585, "logits/rejected": -0.9329188466072083, "logps/chosen": -584.3574829101562, "logps/rejected": -701.239013671875, "loss": 0.5285, "rewards/accuracies": 0.75, "rewards/chosen": -2.002269744873047, "rewards/margins": 0.9513869285583496, "rewards/rejected": -2.9536566734313965, "step": 121 }, { "epoch": 0.1593599477508368, "grad_norm": 9.635911922607669, "learning_rate": 4.947406838955933e-07, "logits/chosen": -0.7806838750839233, "logits/rejected": -0.7795370221138, "logps/chosen": -526.0626831054688, "logps/rejected": -689.8364868164062, "loss": 0.5524, "rewards/accuracies": 0.75, "rewards/chosen": -1.6684224605560303, "rewards/margins": 1.2676262855529785, "rewards/rejected": -2.936048746109009, "step": 122 }, { "epoch": 0.1606661768307617, "grad_norm": 8.522293868309866, "learning_rate": 4.945052088434123e-07, "logits/chosen": -1.0120658874511719, "logits/rejected": -1.0085734128952026, "logps/chosen": -587.6144409179688, "logps/rejected": -654.2259521484375, "loss": 0.5153, "rewards/accuracies": 0.75, "rewards/chosen": -1.9400118589401245, "rewards/margins": 0.6181426048278809, "rewards/rejected": -2.558154582977295, "step": 123 }, { "epoch": 0.1619724059106866, "grad_norm": 9.48236040451038, "learning_rate": 4.942646356691631e-07, "logits/chosen": -0.7166386246681213, "logits/rejected": -0.7875579595565796, "logps/chosen": -539.0728759765625, "logps/rejected": -672.5732421875, "loss": 0.5045, "rewards/accuracies": 0.6875, "rewards/chosen": -2.182424545288086, "rewards/margins": 0.7280668616294861, "rewards/rejected": -2.910491466522217, "step": 124 }, { "epoch": 0.1632786349906115, "grad_norm": 8.526936686673514, "learning_rate": 4.940189693889818e-07, "logits/chosen": -0.988465428352356, "logits/rejected": -1.012081265449524, "logps/chosen": -544.8677978515625, "logps/rejected": -642.533447265625, "loss": 0.4646, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7634834051132202, "rewards/margins": 0.7361648678779602, "rewards/rejected": -2.499648332595825, "step": 125 }, { "epoch": 0.16458486407053638, "grad_norm": 12.786714115254995, "learning_rate": 4.937682151251997e-07, "logits/chosen": -0.9385434985160828, "logits/rejected": -0.8883792161941528, "logps/chosen": -572.0708618164062, "logps/rejected": -620.1557006835938, "loss": 0.5329, "rewards/accuracies": 0.75, "rewards/chosen": -2.0728726387023926, "rewards/margins": 0.634572446346283, "rewards/rejected": -2.7074453830718994, "step": 126 }, { "epoch": 0.16589109315046127, "grad_norm": 9.282917945488935, "learning_rate": 4.935123781062365e-07, "logits/chosen": -0.9921805262565613, "logits/rejected": -0.9778479337692261, "logps/chosen": -545.0863647460938, "logps/rejected": -655.0972290039062, "loss": 0.5103, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6734533309936523, "rewards/margins": 1.0550148487091064, "rewards/rejected": -2.7284679412841797, "step": 127 }, { "epoch": 0.16719732223038616, "grad_norm": 9.229465913733174, "learning_rate": 4.932514636664913e-07, "logits/chosen": -0.9057982563972473, "logits/rejected": -0.8632020950317383, "logps/chosen": -469.0540771484375, "logps/rejected": -525.9947509765625, "loss": 0.4629, "rewards/accuracies": 0.875, "rewards/chosen": -1.562605619430542, "rewards/margins": 0.8607323169708252, "rewards/rejected": -2.423337936401367, "step": 128 }, { "epoch": 0.16850355131031106, "grad_norm": 8.962747801495956, "learning_rate": 4.929854772462311e-07, "logits/chosen": -0.8092485070228577, "logits/rejected": -0.7825321555137634, "logps/chosen": -504.3341064453125, "logps/rejected": -535.06396484375, "loss": 0.5647, "rewards/accuracies": 0.71875, "rewards/chosen": -1.720110535621643, "rewards/margins": 0.45117801427841187, "rewards/rejected": -2.1712887287139893, "step": 129 }, { "epoch": 0.16980978039023595, "grad_norm": 9.67145433756404, "learning_rate": 4.927144243914781e-07, "logits/chosen": -0.804796040058136, "logits/rejected": -0.8286614418029785, "logps/chosen": -493.044677734375, "logps/rejected": -635.99560546875, "loss": 0.4706, "rewards/accuracies": 0.75, "rewards/chosen": -2.03037691116333, "rewards/margins": 1.352684497833252, "rewards/rejected": -3.383061170578003, "step": 130 }, { "epoch": 0.17111600947016084, "grad_norm": 8.961180482268233, "learning_rate": 4.924383107538929e-07, "logits/chosen": -0.9036099910736084, "logits/rejected": -0.9011116027832031, "logps/chosen": -551.7821655273438, "logps/rejected": -679.7416381835938, "loss": 0.5407, "rewards/accuracies": 0.75, "rewards/chosen": -2.1092605590820312, "rewards/margins": 0.9267193675041199, "rewards/rejected": -3.035979747772217, "step": 131 }, { "epoch": 0.17242223855008573, "grad_norm": 11.220325409910862, "learning_rate": 4.921571420906578e-07, "logits/chosen": -0.872887372970581, "logits/rejected": -0.8407180309295654, "logps/chosen": -550.0153198242188, "logps/rejected": -656.0322875976562, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": -2.172218084335327, "rewards/margins": 0.5464658141136169, "rewards/rejected": -2.718684196472168, "step": 132 }, { "epoch": 0.17372846763001062, "grad_norm": 9.18653059128084, "learning_rate": 4.918709242643563e-07, "logits/chosen": -0.8891811370849609, "logits/rejected": -0.9292304515838623, "logps/chosen": -517.5161743164062, "logps/rejected": -619.11328125, "loss": 0.4595, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8330395221710205, "rewards/margins": 0.7570334076881409, "rewards/rejected": -2.5900731086730957, "step": 133 }, { "epoch": 0.17503469670993552, "grad_norm": 9.18457403544812, "learning_rate": 4.915796632428505e-07, "logits/chosen": -0.9480959177017212, "logits/rejected": -0.9999913573265076, "logps/chosen": -586.74169921875, "logps/rejected": -696.614013671875, "loss": 0.4629, "rewards/accuracies": 0.84375, "rewards/chosen": -2.009488105773926, "rewards/margins": 0.9081063866615295, "rewards/rejected": -2.9175941944122314, "step": 134 }, { "epoch": 0.1763409257898604, "grad_norm": 11.727197619914456, "learning_rate": 4.912833650991573e-07, "logits/chosen": -0.8458810448646545, "logits/rejected": -0.8598436713218689, "logps/chosen": -524.3538818359375, "logps/rejected": -614.8167724609375, "loss": 0.5901, "rewards/accuracies": 0.90625, "rewards/chosen": -1.777543544769287, "rewards/margins": 1.0405200719833374, "rewards/rejected": -2.818063735961914, "step": 135 }, { "epoch": 0.1776471548697853, "grad_norm": 9.191709773975337, "learning_rate": 4.909820360113213e-07, "logits/chosen": -0.9621294736862183, "logits/rejected": -1.0132098197937012, "logps/chosen": -567.4052734375, "logps/rejected": -681.6702880859375, "loss": 0.4652, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1079905033111572, "rewards/margins": 0.9598952531814575, "rewards/rejected": -3.0678858757019043, "step": 136 }, { "epoch": 0.1789533839497102, "grad_norm": 9.346469893179457, "learning_rate": 4.906756822622864e-07, "logits/chosen": -0.8825656175613403, "logits/rejected": -0.9310536980628967, "logps/chosen": -524.7208862304688, "logps/rejected": -718.4395141601562, "loss": 0.4943, "rewards/accuracies": 0.75, "rewards/chosen": -1.8902442455291748, "rewards/margins": 1.395514726638794, "rewards/rejected": -3.2857589721679688, "step": 137 }, { "epoch": 0.18025961302963509, "grad_norm": 8.885243669741756, "learning_rate": 4.903643102397643e-07, "logits/chosen": -0.8142070174217224, "logits/rejected": -0.8311284780502319, "logps/chosen": -557.0999145507812, "logps/rejected": -668.8630981445312, "loss": 0.4997, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0316572189331055, "rewards/margins": 0.9219380617141724, "rewards/rejected": -2.9535951614379883, "step": 138 }, { "epoch": 0.18156584210955998, "grad_norm": 10.223180789734753, "learning_rate": 4.900479264361018e-07, "logits/chosen": -0.9322543740272522, "logits/rejected": -0.9504646062850952, "logps/chosen": -603.5999755859375, "logps/rejected": -702.2202758789062, "loss": 0.4762, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2028281688690186, "rewards/margins": 0.9659155011177063, "rewards/rejected": -3.16874361038208, "step": 139 }, { "epoch": 0.18287207118948487, "grad_norm": 9.20081752161527, "learning_rate": 4.897265374481447e-07, "logits/chosen": -0.8732661008834839, "logits/rejected": -0.9553130269050598, "logps/chosen": -511.3672790527344, "logps/rejected": -646.978759765625, "loss": 0.468, "rewards/accuracies": 0.75, "rewards/chosen": -1.8701015710830688, "rewards/margins": 0.940824031829834, "rewards/rejected": -2.8109254837036133, "step": 140 }, { "epoch": 0.18417830026940976, "grad_norm": 9.432685979588154, "learning_rate": 4.894001499771015e-07, "logits/chosen": -0.9715933799743652, "logits/rejected": -0.8695637583732605, "logps/chosen": -649.057373046875, "logps/rejected": -643.0631103515625, "loss": 0.4503, "rewards/accuracies": 0.75, "rewards/chosen": -2.4942514896392822, "rewards/margins": 0.5338544249534607, "rewards/rejected": -3.0281057357788086, "step": 141 }, { "epoch": 0.18548452934933463, "grad_norm": 13.104777321859196, "learning_rate": 4.890687708284024e-07, "logits/chosen": -0.9079119563102722, "logits/rejected": -0.9527072906494141, "logps/chosen": -589.8755493164062, "logps/rejected": -756.660400390625, "loss": 0.4617, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2556536197662354, "rewards/margins": 1.3938080072402954, "rewards/rejected": -3.649461507797241, "step": 142 }, { "epoch": 0.18679075842925952, "grad_norm": 10.528385278196993, "learning_rate": 4.887324069115581e-07, "logits/chosen": -0.9083209037780762, "logits/rejected": -0.8825032711029053, "logps/chosen": -600.8740234375, "logps/rejected": -660.4830322265625, "loss": 0.5546, "rewards/accuracies": 0.78125, "rewards/chosen": -2.31591796875, "rewards/margins": 0.6303241848945618, "rewards/rejected": -2.946242332458496, "step": 143 }, { "epoch": 0.1880969875091844, "grad_norm": 13.644717006371213, "learning_rate": 4.883910652400155e-07, "logits/chosen": -0.8000559210777283, "logits/rejected": -0.8278550505638123, "logps/chosen": -562.575927734375, "logps/rejected": -690.1162109375, "loss": 0.4827, "rewards/accuracies": 0.78125, "rewards/chosen": -2.350778102874756, "rewards/margins": 0.9358606338500977, "rewards/rejected": -3.2866384983062744, "step": 144 }, { "epoch": 0.1894032165891093, "grad_norm": 11.442706123817796, "learning_rate": 4.880447529310118e-07, "logits/chosen": -0.7975379824638367, "logits/rejected": -0.8202475309371948, "logps/chosen": -589.868408203125, "logps/rejected": -746.5616455078125, "loss": 0.5009, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4633235931396484, "rewards/margins": 1.171708106994629, "rewards/rejected": -3.6350317001342773, "step": 145 }, { "epoch": 0.1907094456690342, "grad_norm": 10.565974328419516, "learning_rate": 4.876934772054251e-07, "logits/chosen": -0.7488982081413269, "logits/rejected": -0.810793399810791, "logps/chosen": -613.113525390625, "logps/rejected": -894.685302734375, "loss": 0.4131, "rewards/accuracies": 0.875, "rewards/chosen": -2.57572865486145, "rewards/margins": 2.021328926086426, "rewards/rejected": -4.597057342529297, "step": 146 }, { "epoch": 0.1920156747489591, "grad_norm": 11.457022715404966, "learning_rate": 4.873372453876254e-07, "logits/chosen": -0.8528122305870056, "logits/rejected": -0.9192670583724976, "logps/chosen": -590.0909423828125, "logps/rejected": -798.7685546875, "loss": 0.4564, "rewards/accuracies": 0.875, "rewards/chosen": -2.5091986656188965, "rewards/margins": 1.6112761497497559, "rewards/rejected": -4.120474815368652, "step": 147 }, { "epoch": 0.19332190382888398, "grad_norm": 13.441936360102783, "learning_rate": 4.869760649053207e-07, "logits/chosen": -0.8818020820617676, "logits/rejected": -0.8745623230934143, "logps/chosen": -628.12109375, "logps/rejected": -758.0457763671875, "loss": 0.5465, "rewards/accuracies": 0.78125, "rewards/chosen": -2.561213493347168, "rewards/margins": 1.085404872894287, "rewards/rejected": -3.646618366241455, "step": 148 }, { "epoch": 0.19462813290880887, "grad_norm": 11.44437228846766, "learning_rate": 4.866099432894024e-07, "logits/chosen": -0.7465147376060486, "logits/rejected": -0.7887036800384521, "logps/chosen": -581.6749267578125, "logps/rejected": -763.8986206054688, "loss": 0.4528, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4309072494506836, "rewards/margins": 1.333343505859375, "rewards/rejected": -3.7642509937286377, "step": 149 }, { "epoch": 0.19593436198873376, "grad_norm": 10.261817592185366, "learning_rate": 4.862388881737881e-07, "logits/chosen": -0.7357074022293091, "logits/rejected": -0.7889531850814819, "logps/chosen": -592.6793823242188, "logps/rejected": -763.1603393554688, "loss": 0.4495, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7371535301208496, "rewards/margins": 1.1889464855194092, "rewards/rejected": -3.926100254058838, "step": 150 }, { "epoch": 0.19724059106865865, "grad_norm": 9.944126517301331, "learning_rate": 4.858629072952634e-07, "logits/chosen": -0.7918027639389038, "logits/rejected": -0.8364720344543457, "logps/chosen": -590.2254028320312, "logps/rejected": -745.180908203125, "loss": 0.4616, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5804126262664795, "rewards/margins": 0.9752347469329834, "rewards/rejected": -3.555647134780884, "step": 151 }, { "epoch": 0.19854682014858355, "grad_norm": 8.866243575571254, "learning_rate": 4.854820084933192e-07, "logits/chosen": -0.8944368958473206, "logits/rejected": -0.9187857508659363, "logps/chosen": -660.8124389648438, "logps/rejected": -761.919677734375, "loss": 0.4311, "rewards/accuracies": 0.75, "rewards/chosen": -2.6248433589935303, "rewards/margins": 1.121117115020752, "rewards/rejected": -3.745960235595703, "step": 152 }, { "epoch": 0.19985304922850844, "grad_norm": 14.410936928898042, "learning_rate": 4.850961997099892e-07, "logits/chosen": -0.9639002680778503, "logits/rejected": -0.9395177364349365, "logps/chosen": -597.547607421875, "logps/rejected": -680.5858764648438, "loss": 0.4692, "rewards/accuracies": 0.625, "rewards/chosen": -2.559708595275879, "rewards/margins": 0.8148014545440674, "rewards/rejected": -3.374509811401367, "step": 153 }, { "epoch": 0.20115927830843333, "grad_norm": 11.24933528745637, "learning_rate": 4.847054889896838e-07, "logits/chosen": -0.8532655239105225, "logits/rejected": -0.9007124900817871, "logps/chosen": -621.040771484375, "logps/rejected": -810.769287109375, "loss": 0.4895, "rewards/accuracies": 0.90625, "rewards/chosen": -2.8787829875946045, "rewards/margins": 1.3338385820388794, "rewards/rejected": -4.212621212005615, "step": 154 }, { "epoch": 0.20246550738835822, "grad_norm": 17.235667735611003, "learning_rate": 4.843098844790228e-07, "logits/chosen": -0.8356879353523254, "logits/rejected": -0.7729538083076477, "logps/chosen": -607.823486328125, "logps/rejected": -642.9387817382812, "loss": 0.5611, "rewards/accuracies": 0.625, "rewards/chosen": -2.6873245239257812, "rewards/margins": 0.5770570039749146, "rewards/rejected": -3.2643816471099854, "step": 155 }, { "epoch": 0.20377173646828312, "grad_norm": 15.801541834876094, "learning_rate": 4.83909394426665e-07, "logits/chosen": -0.8892329931259155, "logits/rejected": -0.8549962043762207, "logps/chosen": -718.0670166015625, "logps/rejected": -855.0306396484375, "loss": 0.5509, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1425247192382812, "rewards/margins": 1.2918095588684082, "rewards/rejected": -4.434334754943848, "step": 156 }, { "epoch": 0.205077965548208, "grad_norm": 22.929329377097073, "learning_rate": 4.83504027183137e-07, "logits/chosen": -0.9463051557540894, "logits/rejected": -0.9906477332115173, "logps/chosen": -636.51904296875, "logps/rejected": -810.24169921875, "loss": 0.4337, "rewards/accuracies": 0.78125, "rewards/chosen": -2.714447498321533, "rewards/margins": 1.3694745302200317, "rewards/rejected": -4.083921432495117, "step": 157 }, { "epoch": 0.2063841946281329, "grad_norm": 11.639690209880769, "learning_rate": 4.83093791200658e-07, "logits/chosen": -0.7416931390762329, "logits/rejected": -0.7359007596969604, "logps/chosen": -532.9208984375, "logps/rejected": -670.9202880859375, "loss": 0.4256, "rewards/accuracies": 0.875, "rewards/chosen": -2.3252341747283936, "rewards/margins": 1.2705352306365967, "rewards/rejected": -3.595769166946411, "step": 158 }, { "epoch": 0.2076904237080578, "grad_norm": 15.270391743947712, "learning_rate": 4.826786950329646e-07, "logits/chosen": -0.8616015315055847, "logits/rejected": -0.8714800477027893, "logps/chosen": -603.2241821289062, "logps/rejected": -741.3072509765625, "loss": 0.5139, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4090378284454346, "rewards/margins": 1.1060690879821777, "rewards/rejected": -3.5151071548461914, "step": 159 }, { "epoch": 0.20899665278798268, "grad_norm": 8.63825492932275, "learning_rate": 4.822587473351316e-07, "logits/chosen": -0.8719635009765625, "logits/rejected": -0.9493328928947449, "logps/chosen": -539.3067016601562, "logps/rejected": -690.457275390625, "loss": 0.423, "rewards/accuracies": 0.875, "rewards/chosen": -2.1986799240112305, "rewards/margins": 1.2904539108276367, "rewards/rejected": -3.489133834838867, "step": 160 }, { "epoch": 0.21030288186790758, "grad_norm": 14.424579182139063, "learning_rate": 4.818339568633926e-07, "logits/chosen": -0.940915048122406, "logits/rejected": -0.9314707517623901, "logps/chosen": -586.869384765625, "logps/rejected": -641.973876953125, "loss": 0.4428, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6642723083496094, "rewards/margins": 0.6295632123947144, "rewards/rejected": -3.2938356399536133, "step": 161 }, { "epoch": 0.21160911094783247, "grad_norm": 10.626168680350675, "learning_rate": 4.81404332474956e-07, "logits/chosen": -0.7765993475914001, "logits/rejected": -0.783936619758606, "logps/chosen": -682.2221069335938, "logps/rejected": -807.446533203125, "loss": 0.4454, "rewards/accuracies": 0.75, "rewards/chosen": -2.9070258140563965, "rewards/margins": 1.1761747598648071, "rewards/rejected": -4.083200454711914, "step": 162 }, { "epoch": 0.21291534002775736, "grad_norm": 13.762353067778275, "learning_rate": 4.809698831278217e-07, "logits/chosen": -0.9006916284561157, "logits/rejected": -0.882185697555542, "logps/chosen": -582.73046875, "logps/rejected": -688.5364379882812, "loss": 0.4343, "rewards/accuracies": 0.78125, "rewards/chosen": -2.400038242340088, "rewards/margins": 1.1299452781677246, "rewards/rejected": -3.5299835205078125, "step": 163 }, { "epoch": 0.21422156910768225, "grad_norm": 15.59615888833518, "learning_rate": 4.805306178805933e-07, "logits/chosen": -0.8027355074882507, "logits/rejected": -0.7851669788360596, "logps/chosen": -634.9528198242188, "logps/rejected": -750.8024291992188, "loss": 0.5245, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8227155208587646, "rewards/margins": 1.1176807880401611, "rewards/rejected": -3.940396308898926, "step": 164 }, { "epoch": 0.21552779818760714, "grad_norm": 10.146904391466114, "learning_rate": 4.800865458922898e-07, "logits/chosen": -0.8721917867660522, "logits/rejected": -0.9246646165847778, "logps/chosen": -511.4060974121094, "logps/rejected": -643.2660522460938, "loss": 0.4669, "rewards/accuracies": 0.84375, "rewards/chosen": -2.18168568611145, "rewards/margins": 0.9130581617355347, "rewards/rejected": -3.0947437286376953, "step": 165 }, { "epoch": 0.21683402726753204, "grad_norm": 13.069612323257635, "learning_rate": 4.796376764221546e-07, "logits/chosen": -1.075876235961914, "logits/rejected": -1.0736372470855713, "logps/chosen": -720.34033203125, "logps/rejected": -769.7064819335938, "loss": 0.4463, "rewards/accuracies": 0.875, "rewards/chosen": -2.6499907970428467, "rewards/margins": 0.8865436911582947, "rewards/rejected": -3.536534309387207, "step": 166 }, { "epoch": 0.21814025634745693, "grad_norm": 13.680715613764924, "learning_rate": 4.791840188294619e-07, "logits/chosen": -0.9063047766685486, "logits/rejected": -0.8700284361839294, "logps/chosen": -595.5171508789062, "logps/rejected": -717.4872436523438, "loss": 0.4196, "rewards/accuracies": 0.6875, "rewards/chosen": -2.565876007080078, "rewards/margins": 1.157697081565857, "rewards/rejected": -3.7235727310180664, "step": 167 }, { "epoch": 0.21944648542738182, "grad_norm": 26.410003114025304, "learning_rate": 4.787255825733224e-07, "logits/chosen": -0.8095680475234985, "logits/rejected": -0.8170436024665833, "logps/chosen": -659.5059204101562, "logps/rejected": -702.8510131835938, "loss": 0.4668, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0381665229797363, "rewards/margins": 0.7416744828224182, "rewards/rejected": -3.7798409461975098, "step": 168 }, { "epoch": 0.2207527145073067, "grad_norm": 12.27297441753684, "learning_rate": 4.782623772124855e-07, "logits/chosen": -0.7508028149604797, "logits/rejected": -0.845954954624176, "logps/chosen": -651.4632568359375, "logps/rejected": -898.3553466796875, "loss": 0.4654, "rewards/accuracies": 0.875, "rewards/chosen": -3.235539674758911, "rewards/margins": 1.770397424697876, "rewards/rejected": -5.005937099456787, "step": 169 }, { "epoch": 0.2220589435872316, "grad_norm": 11.56139650007245, "learning_rate": 4.777944124051395e-07, "logits/chosen": -0.8815621137619019, "logits/rejected": -0.8710463047027588, "logps/chosen": -595.5137329101562, "logps/rejected": -665.1507568359375, "loss": 0.4484, "rewards/accuracies": 0.75, "rewards/chosen": -2.6465539932250977, "rewards/margins": 0.8060583472251892, "rewards/rejected": -3.4526124000549316, "step": 170 }, { "epoch": 0.2233651726671565, "grad_norm": 16.94182109300071, "learning_rate": 4.773216979087119e-07, "logits/chosen": -0.8682816028594971, "logits/rejected": -0.9051592946052551, "logps/chosen": -668.440185546875, "logps/rejected": -770.4189453125, "loss": 0.5038, "rewards/accuracies": 0.75, "rewards/chosen": -3.168666124343872, "rewards/margins": 0.9612530469894409, "rewards/rejected": -4.129919528961182, "step": 171 }, { "epoch": 0.2246714017470814, "grad_norm": 13.911281202814001, "learning_rate": 4.768442435796639e-07, "logits/chosen": -0.9317069053649902, "logits/rejected": -0.9124513864517212, "logps/chosen": -647.5184936523438, "logps/rejected": -706.0861206054688, "loss": 0.4405, "rewards/accuracies": 0.75, "rewards/chosen": -2.8290905952453613, "rewards/margins": 0.7692254781723022, "rewards/rejected": -3.598315954208374, "step": 172 }, { "epoch": 0.22597763082700628, "grad_norm": 15.550040420673614, "learning_rate": 4.7636205937328664e-07, "logits/chosen": -0.7879063487052917, "logits/rejected": -0.6880006790161133, "logps/chosen": -639.0218505859375, "logps/rejected": -654.2172241210938, "loss": 0.5298, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9527385234832764, "rewards/margins": 0.6208396553993225, "rewards/rejected": -3.573578357696533, "step": 173 }, { "epoch": 0.22728385990693117, "grad_norm": 15.622889611331605, "learning_rate": 4.758751553434922e-07, "logits/chosen": -0.8385047316551208, "logits/rejected": -0.8385977745056152, "logps/chosen": -650.1749877929688, "logps/rejected": -773.8004760742188, "loss": 0.4689, "rewards/accuracies": 0.8125, "rewards/chosen": -2.798471450805664, "rewards/margins": 1.2958422899246216, "rewards/rejected": -4.094313621520996, "step": 174 }, { "epoch": 0.22859008898685607, "grad_norm": 10.630697667093651, "learning_rate": 4.753835416426051e-07, "logits/chosen": -0.7726784348487854, "logits/rejected": -0.7915071845054626, "logps/chosen": -616.429443359375, "logps/rejected": -742.1764526367188, "loss": 0.4527, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6264476776123047, "rewards/margins": 1.346327304840088, "rewards/rejected": -3.9727747440338135, "step": 175 }, { "epoch": 0.22989631806678096, "grad_norm": 21.621277056488225, "learning_rate": 4.748872285211498e-07, "logits/chosen": -0.766167163848877, "logits/rejected": -0.7375807762145996, "logps/chosen": -612.8403930664062, "logps/rejected": -690.181884765625, "loss": 0.495, "rewards/accuracies": 0.75, "rewards/chosen": -2.520641326904297, "rewards/margins": 0.8460010886192322, "rewards/rejected": -3.3666419982910156, "step": 176 }, { "epoch": 0.23120254714670585, "grad_norm": 21.05225240395737, "learning_rate": 4.743862263276376e-07, "logits/chosen": -0.7831792831420898, "logits/rejected": -0.7660663723945618, "logps/chosen": -684.431640625, "logps/rejected": -809.43798828125, "loss": 0.4497, "rewards/accuracies": 0.8125, "rewards/chosen": -3.232773542404175, "rewards/margins": 1.158035159111023, "rewards/rejected": -4.390808582305908, "step": 177 }, { "epoch": 0.23250877622663074, "grad_norm": 15.628810528649288, "learning_rate": 4.738805455083502e-07, "logits/chosen": -0.8534025549888611, "logits/rejected": -0.8425536155700684, "logps/chosen": -617.8245239257812, "logps/rejected": -676.0670166015625, "loss": 0.5264, "rewards/accuracies": 0.75, "rewards/chosen": -2.6181697845458984, "rewards/margins": 0.8310192823410034, "rewards/rejected": -3.4491891860961914, "step": 178 }, { "epoch": 0.23381500530655563, "grad_norm": 12.752693104616785, "learning_rate": 4.7337019660712254e-07, "logits/chosen": -0.8026778101921082, "logits/rejected": -0.8057217001914978, "logps/chosen": -564.225830078125, "logps/rejected": -793.5078125, "loss": 0.471, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4254250526428223, "rewards/margins": 1.4830188751220703, "rewards/rejected": -3.9084439277648926, "step": 179 }, { "epoch": 0.23512123438648053, "grad_norm": 14.774928040537697, "learning_rate": 4.7285519026512267e-07, "logits/chosen": -0.8348312377929688, "logits/rejected": -0.8021372556686401, "logps/chosen": -633.8662109375, "logps/rejected": -700.7567138671875, "loss": 0.4983, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7143614292144775, "rewards/margins": 0.7620492577552795, "rewards/rejected": -3.4764111042022705, "step": 180 }, { "epoch": 0.23642746346640542, "grad_norm": 11.622812039115109, "learning_rate": 4.723355372206297e-07, "logits/chosen": -0.8993362188339233, "logits/rejected": -0.9162436127662659, "logps/chosen": -591.3616333007812, "logps/rejected": -703.9219970703125, "loss": 0.4848, "rewards/accuracies": 0.75, "rewards/chosen": -2.3152692317962646, "rewards/margins": 0.9252865314483643, "rewards/rejected": -3.240555763244629, "step": 181 }, { "epoch": 0.2377336925463303, "grad_norm": 15.982569238831562, "learning_rate": 4.718112483088102e-07, "logits/chosen": -0.695371150970459, "logits/rejected": -0.7330318093299866, "logps/chosen": -648.0274047851562, "logps/rejected": -778.1810302734375, "loss": 0.5135, "rewards/accuracies": 0.78125, "rewards/chosen": -3.06677508354187, "rewards/margins": 1.2116183042526245, "rewards/rejected": -4.278393745422363, "step": 182 }, { "epoch": 0.2390399216262552, "grad_norm": 14.477017877144629, "learning_rate": 4.7128233446149205e-07, "logits/chosen": -0.8352495431900024, "logits/rejected": -0.7390860915184021, "logps/chosen": -579.0858154296875, "logps/rejected": -652.7052001953125, "loss": 0.4737, "rewards/accuracies": 0.78125, "rewards/chosen": -2.529910087585449, "rewards/margins": 0.8141626119613647, "rewards/rejected": -3.3440728187561035, "step": 183 }, { "epoch": 0.2403461507061801, "grad_norm": 14.969475270165255, "learning_rate": 4.7074880670693673e-07, "logits/chosen": -0.8044037818908691, "logits/rejected": -0.7950783967971802, "logps/chosen": -594.140869140625, "logps/rejected": -658.424560546875, "loss": 0.509, "rewards/accuracies": 0.6875, "rewards/chosen": -2.541053295135498, "rewards/margins": 0.5490034818649292, "rewards/rejected": -3.090056896209717, "step": 184 }, { "epoch": 0.241652379786105, "grad_norm": 9.781489605816779, "learning_rate": 4.702106761696091e-07, "logits/chosen": -0.701477587223053, "logits/rejected": -0.785467267036438, "logps/chosen": -580.6058349609375, "logps/rejected": -789.3968505859375, "loss": 0.4721, "rewards/accuracies": 0.96875, "rewards/chosen": -2.574538230895996, "rewards/margins": 1.637567162513733, "rewards/rejected": -4.2121052742004395, "step": 185 }, { "epoch": 0.24295860886602988, "grad_norm": 12.354874815334579, "learning_rate": 4.6966795406994564e-07, "logits/chosen": -0.8601551055908203, "logits/rejected": -0.8328098654747009, "logps/chosen": -630.858642578125, "logps/rejected": -707.3577880859375, "loss": 0.4537, "rewards/accuracies": 0.75, "rewards/chosen": -2.7662644386291504, "rewards/margins": 0.8900072574615479, "rewards/rejected": -3.6562719345092773, "step": 186 }, { "epoch": 0.24426483794595477, "grad_norm": 17.448659280928165, "learning_rate": 4.6912065172412046e-07, "logits/chosen": -0.9235214591026306, "logits/rejected": -0.8834742307662964, "logps/chosen": -673.1686401367188, "logps/rejected": -738.9641723632812, "loss": 0.4663, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8677878379821777, "rewards/margins": 0.9876227378845215, "rewards/rejected": -3.8554108142852783, "step": 187 }, { "epoch": 0.24557106702587966, "grad_norm": 12.47966408616588, "learning_rate": 4.685687805438094e-07, "logits/chosen": -0.8535528182983398, "logits/rejected": -0.8522934317588806, "logps/chosen": -629.5390625, "logps/rejected": -729.8989868164062, "loss": 0.4595, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8061907291412354, "rewards/margins": 0.9717488884925842, "rewards/rejected": -3.7779393196105957, "step": 188 }, { "epoch": 0.24687729610580456, "grad_norm": 19.228380348984462, "learning_rate": 4.680123520359519e-07, "logits/chosen": -0.7098379731178284, "logits/rejected": -0.6918249130249023, "logps/chosen": -640.8948364257812, "logps/rejected": -736.1873779296875, "loss": 0.4534, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9255459308624268, "rewards/margins": 1.0353283882141113, "rewards/rejected": -3.960874080657959, "step": 189 }, { "epoch": 0.24818352518572945, "grad_norm": 13.78796204485732, "learning_rate": 4.674513778025112e-07, "logits/chosen": -0.7351849675178528, "logits/rejected": -0.7797478437423706, "logps/chosen": -515.9529418945312, "logps/rejected": -650.210693359375, "loss": 0.4358, "rewards/accuracies": 0.875, "rewards/chosen": -2.3447022438049316, "rewards/margins": 1.2216882705688477, "rewards/rejected": -3.566390037536621, "step": 190 }, { "epoch": 0.24948975426565434, "grad_norm": 16.198151345068265, "learning_rate": 4.6688586954023255e-07, "logits/chosen": -0.649688184261322, "logits/rejected": -0.7320199012756348, "logps/chosen": -569.9924926757812, "logps/rejected": -789.763916015625, "loss": 0.3996, "rewards/accuracies": 0.90625, "rewards/chosen": -2.4391114711761475, "rewards/margins": 1.7697255611419678, "rewards/rejected": -4.208836555480957, "step": 191 }, { "epoch": 0.25079598334557923, "grad_norm": 11.119564441609715, "learning_rate": 4.663158390403991e-07, "logits/chosen": -0.8746720552444458, "logits/rejected": -0.8720070123672485, "logps/chosen": -556.7640991210938, "logps/rejected": -648.1425170898438, "loss": 0.4835, "rewards/accuracies": 0.84375, "rewards/chosen": -2.4596023559570312, "rewards/margins": 0.8872272968292236, "rewards/rejected": -3.346829652786255, "step": 192 }, { "epoch": 0.2521022124255041, "grad_norm": 16.9682810475937, "learning_rate": 4.657412981885861e-07, "logits/chosen": -0.8757528066635132, "logits/rejected": -0.8742597699165344, "logps/chosen": -655.8448486328125, "logps/rejected": -716.5648803710938, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": -2.7153611183166504, "rewards/margins": 0.9040799140930176, "rewards/rejected": -3.619441032409668, "step": 193 }, { "epoch": 0.253408441505429, "grad_norm": 12.940046755041541, "learning_rate": 4.651622589644132e-07, "logits/chosen": -0.853613555431366, "logits/rejected": -0.9235856533050537, "logps/chosen": -631.1458740234375, "logps/rejected": -784.771240234375, "loss": 0.4331, "rewards/accuracies": 0.78125, "rewards/chosen": -2.815965175628662, "rewards/margins": 1.0641415119171143, "rewards/rejected": -3.8801069259643555, "step": 194 }, { "epoch": 0.2547146705853539, "grad_norm": 15.053926223340259, "learning_rate": 4.6457873344129443e-07, "logits/chosen": -0.8052849769592285, "logits/rejected": -0.867551863193512, "logps/chosen": -657.406494140625, "logps/rejected": -894.9517211914062, "loss": 0.473, "rewards/accuracies": 0.90625, "rewards/chosen": -3.0834543704986572, "rewards/margins": 1.7164838314056396, "rewards/rejected": -4.799938201904297, "step": 195 }, { "epoch": 0.2560208996652788, "grad_norm": 14.927847637838813, "learning_rate": 4.639907337861869e-07, "logits/chosen": -0.8298658728599548, "logits/rejected": -0.8566642999649048, "logps/chosen": -635.9237060546875, "logps/rejected": -793.529541015625, "loss": 0.4053, "rewards/accuracies": 0.9375, "rewards/chosen": -2.755927562713623, "rewards/margins": 1.3652548789978027, "rewards/rejected": -4.121182441711426, "step": 196 }, { "epoch": 0.2573271287452037, "grad_norm": 12.418504870497374, "learning_rate": 4.6339827225933657e-07, "logits/chosen": -0.8944472670555115, "logits/rejected": -0.8577872514724731, "logps/chosen": -650.7933959960938, "logps/rejected": -742.0443725585938, "loss": 0.4086, "rewards/accuracies": 0.78125, "rewards/chosen": -3.070491313934326, "rewards/margins": 1.0094940662384033, "rewards/rejected": -4.079985618591309, "step": 197 }, { "epoch": 0.2586333578251286, "grad_norm": 12.08101650781618, "learning_rate": 4.62801361214023e-07, "logits/chosen": -0.8086925745010376, "logits/rejected": -0.856459379196167, "logps/chosen": -634.4454956054688, "logps/rejected": -836.5366821289062, "loss": 0.4246, "rewards/accuracies": 0.84375, "rewards/chosen": -2.9828410148620605, "rewards/margins": 1.5875557661056519, "rewards/rejected": -4.570396900177002, "step": 198 }, { "epoch": 0.2599395869050535, "grad_norm": 19.136812624206406, "learning_rate": 4.622000130963014e-07, "logits/chosen": -0.7772096395492554, "logits/rejected": -0.8084428310394287, "logps/chosen": -703.0916748046875, "logps/rejected": -838.975830078125, "loss": 0.4351, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5541677474975586, "rewards/margins": 1.2203888893127441, "rewards/rejected": -4.7745561599731445, "step": 199 }, { "epoch": 0.26124581598497837, "grad_norm": 30.844882622332946, "learning_rate": 4.6159424044474383e-07, "logits/chosen": -0.783661425113678, "logits/rejected": -0.7943572402000427, "logps/chosen": -686.8040161132812, "logps/rejected": -793.8935546875, "loss": 0.5441, "rewards/accuracies": 0.78125, "rewards/chosen": -3.420450210571289, "rewards/margins": 1.065230369567871, "rewards/rejected": -4.485680103302002, "step": 200 }, { "epoch": 0.26124581598497837, "eval_logits/chosen": -0.6964080929756165, "eval_logits/rejected": -0.6965639591217041, "eval_logps/chosen": -685.1904907226562, "eval_logps/rejected": -860.1481323242188, "eval_loss": 0.44446808099746704, "eval_rewards/accuracies": 0.8360000252723694, "eval_rewards/chosen": -3.4115946292877197, "eval_rewards/margins": 1.6885993480682373, "eval_rewards/rejected": -5.100193500518799, "eval_runtime": 305.4451, "eval_samples_per_second": 6.548, "eval_steps_per_second": 0.409, "step": 200 }, { "epoch": 0.26255204506490326, "grad_norm": 17.760792922849937, "learning_rate": 4.6098405589017676e-07, "logits/chosen": -0.8347344398498535, "logits/rejected": -0.8454535603523254, "logps/chosen": -638.2152099609375, "logps/rejected": -787.5073852539062, "loss": 0.429, "rewards/accuracies": 0.75, "rewards/chosen": -3.0217223167419434, "rewards/margins": 1.3569830656051636, "rewards/rejected": -4.3787055015563965, "step": 201 }, { "epoch": 0.26385827414482815, "grad_norm": 18.25367163242615, "learning_rate": 4.6036947215541856e-07, "logits/chosen": -0.7643561363220215, "logits/rejected": -0.839961051940918, "logps/chosen": -753.9393310546875, "logps/rejected": -926.6439208984375, "loss": 0.5115, "rewards/accuracies": 0.84375, "rewards/chosen": -4.067323207855225, "rewards/margins": 1.2652134895324707, "rewards/rejected": -5.332536697387695, "step": 202 }, { "epoch": 0.26516450322475305, "grad_norm": 16.18688044045776, "learning_rate": 4.597505020550138e-07, "logits/chosen": -0.8352375030517578, "logits/rejected": -0.7859711647033691, "logps/chosen": -640.655517578125, "logps/rejected": -761.5785522460938, "loss": 0.499, "rewards/accuracies": 0.75, "rewards/chosen": -3.453089714050293, "rewards/margins": 1.1171209812164307, "rewards/rejected": -4.570209980010986, "step": 203 }, { "epoch": 0.26647073230467794, "grad_norm": 21.13978640832832, "learning_rate": 4.591271584949662e-07, "logits/chosen": -0.8258641958236694, "logits/rejected": -0.7898882627487183, "logps/chosen": -699.2488403320312, "logps/rejected": -853.1764526367188, "loss": 0.4811, "rewards/accuracies": 0.875, "rewards/chosen": -3.18013858795166, "rewards/margins": 1.710571050643921, "rewards/rejected": -4.89070987701416, "step": 204 }, { "epoch": 0.26777696138460283, "grad_norm": 13.936241022062031, "learning_rate": 4.584994544724695e-07, "logits/chosen": -0.9291080236434937, "logits/rejected": -0.8747403621673584, "logps/chosen": -621.3516235351562, "logps/rejected": -753.9003295898438, "loss": 0.3705, "rewards/accuracies": 0.96875, "rewards/chosen": -2.6660590171813965, "rewards/margins": 1.369396448135376, "rewards/rejected": -4.035455703735352, "step": 205 }, { "epoch": 0.2690831904645277, "grad_norm": 24.239952793904404, "learning_rate": 4.578674030756363e-07, "logits/chosen": -0.8430845141410828, "logits/rejected": -0.874817430973053, "logps/chosen": -669.2354736328125, "logps/rejected": -831.9165649414062, "loss": 0.3822, "rewards/accuracies": 0.875, "rewards/chosen": -3.265414237976074, "rewards/margins": 1.3706064224243164, "rewards/rejected": -4.636020660400391, "step": 206 }, { "epoch": 0.2703894195444526, "grad_norm": 16.98616197060073, "learning_rate": 4.572310174832255e-07, "logits/chosen": -0.7017495036125183, "logits/rejected": -0.6992954015731812, "logps/chosen": -630.5468139648438, "logps/rejected": -817.4756469726562, "loss": 0.3992, "rewards/accuracies": 0.84375, "rewards/chosen": -3.040794849395752, "rewards/margins": 1.788942813873291, "rewards/rejected": -4.829737663269043, "step": 207 }, { "epoch": 0.2716956486243775, "grad_norm": 15.180321612991502, "learning_rate": 4.565903109643672e-07, "logits/chosen": -0.9050486087799072, "logits/rejected": -0.8393621444702148, "logps/chosen": -680.2135009765625, "logps/rejected": -749.2343139648438, "loss": 0.4198, "rewards/accuracies": 0.78125, "rewards/chosen": -2.655306339263916, "rewards/margins": 1.145632028579712, "rewards/rejected": -3.800938367843628, "step": 208 }, { "epoch": 0.2730018777043024, "grad_norm": 25.61838659954735, "learning_rate": 4.5594529687828607e-07, "logits/chosen": -0.678799033164978, "logits/rejected": -0.6235646605491638, "logps/chosen": -606.0216674804688, "logps/rejected": -865.0751953125, "loss": 0.4466, "rewards/accuracies": 0.84375, "rewards/chosen": -3.032256841659546, "rewards/margins": 2.4543349742889404, "rewards/rejected": -5.486591815948486, "step": 209 }, { "epoch": 0.2743081067842273, "grad_norm": 16.73237051103879, "learning_rate": 4.5529598867402314e-07, "logits/chosen": -0.7634575963020325, "logits/rejected": -0.7480574250221252, "logps/chosen": -672.0314331054688, "logps/rejected": -834.9887084960938, "loss": 0.3801, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0546722412109375, "rewards/margins": 1.4492563009262085, "rewards/rejected": -4.5039286613464355, "step": 210 }, { "epoch": 0.2756143358641522, "grad_norm": 14.802819908395904, "learning_rate": 4.5464239989015483e-07, "logits/chosen": -0.8287503123283386, "logits/rejected": -0.8498775959014893, "logps/chosen": -639.4356689453125, "logps/rejected": -721.876708984375, "loss": 0.4012, "rewards/accuracies": 0.6875, "rewards/chosen": -3.190981388092041, "rewards/margins": 0.6815940141677856, "rewards/rejected": -3.872575521469116, "step": 211 }, { "epoch": 0.2769205649440771, "grad_norm": 15.793263099422985, "learning_rate": 4.5398454415451126e-07, "logits/chosen": -0.8768157362937927, "logits/rejected": -0.9070101976394653, "logps/chosen": -717.0108032226562, "logps/rejected": -846.1328125, "loss": 0.4161, "rewards/accuracies": 0.75, "rewards/chosen": -3.556455612182617, "rewards/margins": 1.2665857076644897, "rewards/rejected": -4.823040962219238, "step": 212 }, { "epoch": 0.27822679402400197, "grad_norm": 17.10594111843166, "learning_rate": 4.5332243518389136e-07, "logits/chosen": -0.7659977078437805, "logits/rejected": -0.8257431983947754, "logps/chosen": -705.0125732421875, "logps/rejected": -928.66259765625, "loss": 0.3759, "rewards/accuracies": 0.875, "rewards/chosen": -3.354266405105591, "rewards/margins": 1.808074712753296, "rewards/rejected": -5.162341117858887, "step": 213 }, { "epoch": 0.27953302310392686, "grad_norm": 24.144228395118713, "learning_rate": 4.526560867837776e-07, "logits/chosen": -0.8346929550170898, "logits/rejected": -0.8240299820899963, "logps/chosen": -769.806396484375, "logps/rejected": -960.2931518554688, "loss": 0.4983, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8731882572174072, "rewards/margins": 1.9701744318008423, "rewards/rejected": -5.843362331390381, "step": 214 }, { "epoch": 0.28083925218385175, "grad_norm": 16.56926567578079, "learning_rate": 4.5198551284804773e-07, "logits/chosen": -0.8138917088508606, "logits/rejected": -0.7641476392745972, "logps/chosen": -751.6607666015625, "logps/rejected": -855.57763671875, "loss": 0.4315, "rewards/accuracies": 0.875, "rewards/chosen": -4.096513271331787, "rewards/margins": 1.1268495321273804, "rewards/rejected": -5.223363399505615, "step": 215 }, { "epoch": 0.28214548126377664, "grad_norm": 20.387481706576544, "learning_rate": 4.5131072735868523e-07, "logits/chosen": -0.8751257061958313, "logits/rejected": -0.8823789358139038, "logps/chosen": -729.88818359375, "logps/rejected": -880.8683471679688, "loss": 0.4153, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5861120223999023, "rewards/margins": 1.625636339187622, "rewards/rejected": -5.211748123168945, "step": 216 }, { "epoch": 0.28345171034370154, "grad_norm": 22.915151414898517, "learning_rate": 4.506317443854877e-07, "logits/chosen": -0.6338347792625427, "logits/rejected": -0.6184822916984558, "logps/chosen": -762.0576782226562, "logps/rejected": -895.22607421875, "loss": 0.4508, "rewards/accuracies": 0.71875, "rewards/chosen": -4.526608943939209, "rewards/margins": 1.1892971992492676, "rewards/rejected": -5.715906143188477, "step": 217 }, { "epoch": 0.2847579394236264, "grad_norm": 13.159897885289423, "learning_rate": 4.4994857808577337e-07, "logits/chosen": -0.6982518434524536, "logits/rejected": -0.7118569612503052, "logps/chosen": -781.4166259765625, "logps/rejected": -1003.0726928710938, "loss": 0.308, "rewards/accuracies": 0.90625, "rewards/chosen": -4.0865607261657715, "rewards/margins": 2.2177541255950928, "rewards/rejected": -6.304314613342285, "step": 218 }, { "epoch": 0.2860641685035513, "grad_norm": 27.302561844802497, "learning_rate": 4.492612427040863e-07, "logits/chosen": -0.8502603769302368, "logits/rejected": -0.8385042548179626, "logps/chosen": -834.5120849609375, "logps/rejected": -1042.2188720703125, "loss": 0.4398, "rewards/accuracies": 0.75, "rewards/chosen": -4.505267143249512, "rewards/margins": 2.019343852996826, "rewards/rejected": -6.524610996246338, "step": 219 }, { "epoch": 0.2873703975834762, "grad_norm": 16.41161061540287, "learning_rate": 4.4856975257189896e-07, "logits/chosen": -0.7951454520225525, "logits/rejected": -0.8055264949798584, "logps/chosen": -727.2957153320312, "logps/rejected": -966.2659301757812, "loss": 0.4247, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9063920974731445, "rewards/margins": 2.326056718826294, "rewards/rejected": -6.232448577880859, "step": 220 }, { "epoch": 0.2886766266634011, "grad_norm": 15.073743400941652, "learning_rate": 4.478741221073135e-07, "logits/chosen": -0.8535463809967041, "logits/rejected": -0.850567638874054, "logps/chosen": -794.3897705078125, "logps/rejected": -891.7565307617188, "loss": 0.3925, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9899892807006836, "rewards/margins": 1.1424330472946167, "rewards/rejected": -5.132421970367432, "step": 221 }, { "epoch": 0.289982855743326, "grad_norm": 15.717093144659811, "learning_rate": 4.471743658147614e-07, "logits/chosen": -0.805902361869812, "logits/rejected": -0.7969594597816467, "logps/chosen": -721.6943969726562, "logps/rejected": -866.4282836914062, "loss": 0.4412, "rewards/accuracies": 0.6875, "rewards/chosen": -3.9946467876434326, "rewards/margins": 1.2327009439468384, "rewards/rejected": -5.2273478507995605, "step": 222 }, { "epoch": 0.2912890848232509, "grad_norm": 14.694800440693287, "learning_rate": 4.4647049828470075e-07, "logits/chosen": -0.7446881532669067, "logits/rejected": -0.7209632396697998, "logps/chosen": -654.7391357421875, "logps/rejected": -803.6948852539062, "loss": 0.3623, "rewards/accuracies": 0.84375, "rewards/chosen": -3.3331480026245117, "rewards/margins": 1.561418056488037, "rewards/rejected": -4.894566059112549, "step": 223 }, { "epoch": 0.2925953139031758, "grad_norm": 15.526264148405597, "learning_rate": 4.4576253419331205e-07, "logits/chosen": -0.7109897136688232, "logits/rejected": -0.7546270489692688, "logps/chosen": -685.828857421875, "logps/rejected": -840.8103637695312, "loss": 0.4788, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5382773876190186, "rewards/margins": 1.3709080219268799, "rewards/rejected": -4.909185409545898, "step": 224 }, { "epoch": 0.29390154298310067, "grad_norm": 22.88069996920225, "learning_rate": 4.450504883021923e-07, "logits/chosen": -1.0191212892532349, "logits/rejected": -0.9649794101715088, "logps/chosen": -757.391845703125, "logps/rejected": -843.6371459960938, "loss": 0.4204, "rewards/accuracies": 0.875, "rewards/chosen": -3.7928903102874756, "rewards/margins": 1.3336069583892822, "rewards/rejected": -5.126497268676758, "step": 225 }, { "epoch": 0.29520777206302556, "grad_norm": 13.434275589157789, "learning_rate": 4.4433437545804715e-07, "logits/chosen": -0.8132909536361694, "logits/rejected": -0.7702129483222961, "logps/chosen": -753.9683227539062, "logps/rejected": -931.2380981445312, "loss": 0.4057, "rewards/accuracies": 0.84375, "rewards/chosen": -3.648000955581665, "rewards/margins": 1.8615243434906006, "rewards/rejected": -5.509525299072266, "step": 226 }, { "epoch": 0.29651400114295046, "grad_norm": 12.235894599708347, "learning_rate": 4.436142105923814e-07, "logits/chosen": -0.9691725969314575, "logits/rejected": -1.0009491443634033, "logps/chosen": -696.4452514648438, "logps/rejected": -795.2962036132812, "loss": 0.3845, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3394429683685303, "rewards/margins": 1.029392123222351, "rewards/rejected": -4.368834972381592, "step": 227 }, { "epoch": 0.29782023022287535, "grad_norm": 22.271476941430596, "learning_rate": 4.4289000872118767e-07, "logits/chosen": -0.8251463174819946, "logits/rejected": -0.8388761281967163, "logps/chosen": -622.3071899414062, "logps/rejected": -791.090087890625, "loss": 0.3786, "rewards/accuracies": 0.84375, "rewards/chosen": -3.2112112045288086, "rewards/margins": 1.293255090713501, "rewards/rejected": -4.5044660568237305, "step": 228 }, { "epoch": 0.29912645930280024, "grad_norm": 18.189840903303466, "learning_rate": 4.4216178494463295e-07, "logits/chosen": -0.9312427043914795, "logits/rejected": -0.9146935343742371, "logps/chosen": -785.6533203125, "logps/rejected": -880.157958984375, "loss": 0.3356, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6715898513793945, "rewards/margins": 1.162749171257019, "rewards/rejected": -4.834339141845703, "step": 229 }, { "epoch": 0.30043268838272513, "grad_norm": 26.36376504008489, "learning_rate": 4.4142955444674463e-07, "logits/chosen": -0.9649460315704346, "logits/rejected": -0.9716936349868774, "logps/chosen": -664.906005859375, "logps/rejected": -912.1724853515625, "loss": 0.4067, "rewards/accuracies": 0.9375, "rewards/chosen": -3.310150384902954, "rewards/margins": 1.8418421745300293, "rewards/rejected": -5.1519927978515625, "step": 230 }, { "epoch": 0.30173891746265, "grad_norm": 19.94846396440649, "learning_rate": 4.406933324950928e-07, "logits/chosen": -0.9337835311889648, "logits/rejected": -0.9488065242767334, "logps/chosen": -800.484375, "logps/rejected": -963.2352294921875, "loss": 0.375, "rewards/accuracies": 0.78125, "rewards/chosen": -3.947709798812866, "rewards/margins": 1.5597035884857178, "rewards/rejected": -5.507413387298584, "step": 231 }, { "epoch": 0.3030451465425749, "grad_norm": 16.356495739331635, "learning_rate": 4.3995313444047254e-07, "logits/chosen": -0.9251805543899536, "logits/rejected": -0.8757014274597168, "logps/chosen": -750.8514404296875, "logps/rejected": -866.1437377929688, "loss": 0.4655, "rewards/accuracies": 0.75, "rewards/chosen": -3.803636074066162, "rewards/margins": 1.2165488004684448, "rewards/rejected": -5.020185470581055, "step": 232 }, { "epoch": 0.3043513756224998, "grad_norm": 21.24285096426969, "learning_rate": 4.3920897571658406e-07, "logits/chosen": -0.8091636896133423, "logits/rejected": -0.8993147611618042, "logps/chosen": -768.8638916015625, "logps/rejected": -1060.4085693359375, "loss": 0.4114, "rewards/accuracies": 0.78125, "rewards/chosen": -4.014678478240967, "rewards/margins": 2.306403875350952, "rewards/rejected": -6.321082592010498, "step": 233 }, { "epoch": 0.3056576047024247, "grad_norm": 17.134518173916593, "learning_rate": 4.384608718397102e-07, "logits/chosen": -0.7329398393630981, "logits/rejected": -0.7744626402854919, "logps/chosen": -693.506103515625, "logps/rejected": -891.974609375, "loss": 0.4762, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8318393230438232, "rewards/margins": 1.836269736289978, "rewards/rejected": -5.668108940124512, "step": 234 }, { "epoch": 0.3069638337823496, "grad_norm": 16.555347617715572, "learning_rate": 4.377088384083935e-07, "logits/chosen": -0.8090454339981079, "logits/rejected": -0.8010983467102051, "logps/chosen": -734.9536743164062, "logps/rejected": -961.3272705078125, "loss": 0.3279, "rewards/accuracies": 0.96875, "rewards/chosen": -3.9544529914855957, "rewards/margins": 2.0194091796875, "rewards/rejected": -5.9738616943359375, "step": 235 }, { "epoch": 0.3082700628622745, "grad_norm": 21.692512442920936, "learning_rate": 4.369528911031105e-07, "logits/chosen": -0.6334437727928162, "logits/rejected": -0.7008833289146423, "logps/chosen": -765.7869873046875, "logps/rejected": -925.2149047851562, "loss": 0.3784, "rewards/accuracies": 0.84375, "rewards/chosen": -4.122602939605713, "rewards/margins": 1.3997082710266113, "rewards/rejected": -5.522310733795166, "step": 236 }, { "epoch": 0.3095762919421994, "grad_norm": 19.148346287583163, "learning_rate": 4.3619304568594546e-07, "logits/chosen": -0.878132700920105, "logits/rejected": -0.9558409452438354, "logps/chosen": -673.8487548828125, "logps/rejected": -857.96240234375, "loss": 0.4497, "rewards/accuracies": 0.875, "rewards/chosen": -3.3650078773498535, "rewards/margins": 1.5477778911590576, "rewards/rejected": -4.912785530090332, "step": 237 }, { "epoch": 0.31088252102212427, "grad_norm": 15.517792339257628, "learning_rate": 4.354293180002608e-07, "logits/chosen": -0.7494323253631592, "logits/rejected": -0.7071195840835571, "logps/chosen": -731.3680419921875, "logps/rejected": -777.686767578125, "loss": 0.3642, "rewards/accuracies": 0.71875, "rewards/chosen": -3.9493746757507324, "rewards/margins": 0.709790050983429, "rewards/rejected": -4.6591644287109375, "step": 238 }, { "epoch": 0.31218875010204916, "grad_norm": 19.81836418744902, "learning_rate": 4.346617239703676e-07, "logits/chosen": -0.7905477285385132, "logits/rejected": -0.7663273811340332, "logps/chosen": -812.0971069335938, "logps/rejected": -957.8253173828125, "loss": 0.3362, "rewards/accuracies": 0.84375, "rewards/chosen": -4.33413553237915, "rewards/margins": 1.6475704908370972, "rewards/rejected": -5.981706142425537, "step": 239 }, { "epoch": 0.31349497918197405, "grad_norm": 21.50199670306285, "learning_rate": 4.338902796011929e-07, "logits/chosen": -0.8615242838859558, "logits/rejected": -0.8263786435127258, "logps/chosen": -781.7965698242188, "logps/rejected": -923.489990234375, "loss": 0.3566, "rewards/accuracies": 0.96875, "rewards/chosen": -3.6246654987335205, "rewards/margins": 1.7381658554077148, "rewards/rejected": -5.362831115722656, "step": 240 }, { "epoch": 0.31480120826189895, "grad_norm": 18.40707890032469, "learning_rate": 4.331150009779465e-07, "logits/chosen": -0.8664236068725586, "logits/rejected": -0.8414658904075623, "logps/chosen": -730.3052368164062, "logps/rejected": -858.5838623046875, "loss": 0.429, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8383097648620605, "rewards/margins": 1.3162572383880615, "rewards/rejected": -5.154566764831543, "step": 241 }, { "epoch": 0.31610743734182384, "grad_norm": 19.439482180801054, "learning_rate": 4.323359042657853e-07, "logits/chosen": -0.7379111051559448, "logits/rejected": -0.71939617395401, "logps/chosen": -810.045166015625, "logps/rejected": -912.1922607421875, "loss": 0.3901, "rewards/accuracies": 0.71875, "rewards/chosen": -4.5196075439453125, "rewards/margins": 0.9989256858825684, "rewards/rejected": -5.518532752990723, "step": 242 }, { "epoch": 0.31741366642174873, "grad_norm": 18.81095569934508, "learning_rate": 4.3155300570947624e-07, "logits/chosen": -0.8109769821166992, "logits/rejected": -0.7917392253875732, "logps/chosen": -820.1282958984375, "logps/rejected": -953.7001342773438, "loss": 0.376, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9693336486816406, "rewards/margins": 1.540510654449463, "rewards/rejected": -5.5098443031311035, "step": 243 }, { "epoch": 0.3187198955016736, "grad_norm": 18.232556176954414, "learning_rate": 4.307663216330577e-07, "logits/chosen": -0.824482262134552, "logits/rejected": -0.859273374080658, "logps/chosen": -725.1882934570312, "logps/rejected": -920.5584716796875, "loss": 0.4058, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7261176109313965, "rewards/margins": 1.9005296230316162, "rewards/rejected": -5.626646995544434, "step": 244 }, { "epoch": 0.3200261245815985, "grad_norm": 20.052726394978947, "learning_rate": 4.2997586843949896e-07, "logits/chosen": -0.8550601005554199, "logits/rejected": -0.872449517250061, "logps/chosen": -739.6688842773438, "logps/rejected": -893.7305908203125, "loss": 0.4682, "rewards/accuracies": 0.71875, "rewards/chosen": -3.840355157852173, "rewards/margins": 1.4003340005874634, "rewards/rejected": -5.240689277648926, "step": 245 }, { "epoch": 0.3213323536615234, "grad_norm": 22.120568271547505, "learning_rate": 4.2918166261035847e-07, "logits/chosen": -0.5994831323623657, "logits/rejected": -0.6662572622299194, "logps/chosen": -685.1893920898438, "logps/rejected": -880.3656005859375, "loss": 0.3879, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6331100463867188, "rewards/margins": 1.4999750852584839, "rewards/rejected": -5.133085250854492, "step": 246 }, { "epoch": 0.3226385827414483, "grad_norm": 25.04758767065529, "learning_rate": 4.283837207054399e-07, "logits/chosen": -0.6785750985145569, "logits/rejected": -0.69914710521698, "logps/chosen": -704.94091796875, "logps/rejected": -909.0919799804688, "loss": 0.4266, "rewards/accuracies": 0.75, "rewards/chosen": -3.776233673095703, "rewards/margins": 1.8772923946380615, "rewards/rejected": -5.6535258293151855, "step": 247 }, { "epoch": 0.3239448118213732, "grad_norm": 15.071461682352357, "learning_rate": 4.2758205936244706e-07, "logits/chosen": -0.7484068870544434, "logits/rejected": -0.7148491144180298, "logps/chosen": -770.5116577148438, "logps/rejected": -923.0064086914062, "loss": 0.3902, "rewards/accuracies": 0.8125, "rewards/chosen": -4.398849010467529, "rewards/margins": 1.488804817199707, "rewards/rejected": -5.8876543045043945, "step": 248 }, { "epoch": 0.3252510409012981, "grad_norm": 16.115715937528535, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.8120733499526978, "logits/rejected": -0.7861438393592834, "logps/chosen": -716.216064453125, "logps/rejected": -879.9732666015625, "loss": 0.344, "rewards/accuracies": 0.84375, "rewards/chosen": -3.722062587738037, "rewards/margins": 1.7555803060531616, "rewards/rejected": -5.477643013000488, "step": 249 }, { "epoch": 0.326557269981223, "grad_norm": 27.76120754393567, "learning_rate": 4.259676453004708e-07, "logits/chosen": -0.7561047077178955, "logits/rejected": -0.7340124845504761, "logps/chosen": -738.0452880859375, "logps/rejected": -907.1728515625, "loss": 0.4392, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9837238788604736, "rewards/margins": 1.5732449293136597, "rewards/rejected": -5.556968688964844, "step": 250 }, { "epoch": 0.32786349906114787, "grad_norm": 15.979247504106482, "learning_rate": 4.25154926243265e-07, "logits/chosen": -0.8759682178497314, "logits/rejected": -0.7738591432571411, "logps/chosen": -714.5944213867188, "logps/rejected": -795.8113403320312, "loss": 0.3328, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2611489295959473, "rewards/margins": 1.2338060140609741, "rewards/rejected": -4.494955539703369, "step": 251 }, { "epoch": 0.32916972814107276, "grad_norm": 21.82665055739408, "learning_rate": 4.2433855507083816e-07, "logits/chosen": -0.8485604524612427, "logits/rejected": -0.8467884659767151, "logps/chosen": -714.1653442382812, "logps/rejected": -866.3451538085938, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": -3.7272872924804688, "rewards/margins": 1.4408416748046875, "rewards/rejected": -5.168128967285156, "step": 252 }, { "epoch": 0.33047595722099765, "grad_norm": 21.215293194514476, "learning_rate": 4.235185488051585e-07, "logits/chosen": -0.706048846244812, "logits/rejected": -0.7180212140083313, "logps/chosen": -710.47314453125, "logps/rejected": -951.2412109375, "loss": 0.3518, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7834746837615967, "rewards/margins": 1.9697420597076416, "rewards/rejected": -5.75321626663208, "step": 253 }, { "epoch": 0.33178218630092254, "grad_norm": 30.62127400438521, "learning_rate": 4.226949245439887e-07, "logits/chosen": -0.7309754490852356, "logits/rejected": -0.7566152215003967, "logps/chosen": -622.594970703125, "logps/rejected": -773.9696044921875, "loss": 0.3909, "rewards/accuracies": 0.6875, "rewards/chosen": -3.568760871887207, "rewards/margins": 1.102934718132019, "rewards/rejected": -4.671695709228516, "step": 254 }, { "epoch": 0.33308841538084744, "grad_norm": 13.591190068285586, "learning_rate": 4.2186769946052945e-07, "logits/chosen": -0.8123141527175903, "logits/rejected": -0.783890426158905, "logps/chosen": -756.3726196289062, "logps/rejected": -836.5651245117188, "loss": 0.3886, "rewards/accuracies": 0.75, "rewards/chosen": -3.8985278606414795, "rewards/margins": 1.0888140201568604, "rewards/rejected": -4.98734188079834, "step": 255 }, { "epoch": 0.33439464446077233, "grad_norm": 16.629705733383222, "learning_rate": 4.210368908030614e-07, "logits/chosen": -0.9059042930603027, "logits/rejected": -0.9054913520812988, "logps/chosen": -769.614013671875, "logps/rejected": -895.432861328125, "loss": 0.394, "rewards/accuracies": 0.875, "rewards/chosen": -3.691370725631714, "rewards/margins": 1.422187328338623, "rewards/rejected": -5.113557815551758, "step": 256 }, { "epoch": 0.3357008735406972, "grad_norm": 14.971829751828695, "learning_rate": 4.202025158945855e-07, "logits/chosen": -0.8011924624443054, "logits/rejected": -0.8006505966186523, "logps/chosen": -684.6802368164062, "logps/rejected": -919.0767822265625, "loss": 0.366, "rewards/accuracies": 0.875, "rewards/chosen": -4.046587944030762, "rewards/margins": 1.879364252090454, "rewards/rejected": -5.925952434539795, "step": 257 }, { "epoch": 0.3370071026206221, "grad_norm": 19.668038427375784, "learning_rate": 4.1936459213246166e-07, "logits/chosen": -0.8572171926498413, "logits/rejected": -0.8683452010154724, "logps/chosen": -809.2364501953125, "logps/rejected": -1078.41845703125, "loss": 0.3368, "rewards/accuracies": 0.84375, "rewards/chosen": -4.206806182861328, "rewards/margins": 2.6465835571289062, "rewards/rejected": -6.853388786315918, "step": 258 }, { "epoch": 0.338313331700547, "grad_norm": 39.6303014381974, "learning_rate": 4.185231369880461e-07, "logits/chosen": -0.7904149293899536, "logits/rejected": -0.7628904581069946, "logps/chosen": -836.9259033203125, "logps/rejected": -1044.5078125, "loss": 0.6005, "rewards/accuracies": 0.65625, "rewards/chosen": -5.005545616149902, "rewards/margins": 1.8065614700317383, "rewards/rejected": -6.812107563018799, "step": 259 }, { "epoch": 0.3396195607804719, "grad_norm": 18.040502881336984, "learning_rate": 4.176781680063274e-07, "logits/chosen": -0.8526827692985535, "logits/rejected": -0.8415613174438477, "logps/chosen": -822.579833984375, "logps/rejected": -931.8614501953125, "loss": 0.3968, "rewards/accuracies": 0.84375, "rewards/chosen": -4.4847869873046875, "rewards/margins": 1.229368805885315, "rewards/rejected": -5.714155673980713, "step": 260 }, { "epoch": 0.3409257898603968, "grad_norm": 18.615431463139885, "learning_rate": 4.1682970280555987e-07, "logits/chosen": -0.7734108567237854, "logits/rejected": -0.7589735388755798, "logps/chosen": -727.7169189453125, "logps/rejected": -957.0603637695312, "loss": 0.376, "rewards/accuracies": 0.78125, "rewards/chosen": -4.171051025390625, "rewards/margins": 2.155472993850708, "rewards/rejected": -6.326523780822754, "step": 261 }, { "epoch": 0.3422320189403217, "grad_norm": 14.85047813726669, "learning_rate": 4.1597775907689706e-07, "logits/chosen": -0.9151461720466614, "logits/rejected": -0.8985196352005005, "logps/chosen": -783.2546997070312, "logps/rejected": -905.3399658203125, "loss": 0.4107, "rewards/accuracies": 0.90625, "rewards/chosen": -4.076227188110352, "rewards/margins": 1.4259974956512451, "rewards/rejected": -5.502224922180176, "step": 262 }, { "epoch": 0.3435382480202466, "grad_norm": 18.614910926586862, "learning_rate": 4.1512235458402243e-07, "logits/chosen": -0.8314370512962341, "logits/rejected": -0.8302509784698486, "logps/chosen": -783.2666625976562, "logps/rejected": -1022.0087890625, "loss": 0.4005, "rewards/accuracies": 0.875, "rewards/chosen": -4.511577606201172, "rewards/margins": 2.0172276496887207, "rewards/rejected": -6.528805732727051, "step": 263 }, { "epoch": 0.34484447710017146, "grad_norm": 17.858743835450216, "learning_rate": 4.142635071627789e-07, "logits/chosen": -0.7118708491325378, "logits/rejected": -0.7446046471595764, "logps/chosen": -711.8963012695312, "logps/rejected": -930.8863525390625, "loss": 0.3693, "rewards/accuracies": 0.875, "rewards/chosen": -3.9071197509765625, "rewards/margins": 1.9890848398208618, "rewards/rejected": -5.896204948425293, "step": 264 }, { "epoch": 0.34615070618009636, "grad_norm": 36.928248348050545, "learning_rate": 4.1340123472079736e-07, "logits/chosen": -0.783199667930603, "logits/rejected": -0.8067238330841064, "logps/chosen": -670.074951171875, "logps/rejected": -888.4215698242188, "loss": 0.3255, "rewards/accuracies": 0.96875, "rewards/chosen": -3.4405863285064697, "rewards/margins": 2.0715410709381104, "rewards/rejected": -5.512126922607422, "step": 265 }, { "epoch": 0.34745693526002125, "grad_norm": 16.839612613063686, "learning_rate": 4.125355552371226e-07, "logits/chosen": -0.8244245648384094, "logits/rejected": -0.7949088215827942, "logps/chosen": -735.464599609375, "logps/rejected": -927.4498901367188, "loss": 0.4041, "rewards/accuracies": 0.875, "rewards/chosen": -4.125042915344238, "rewards/margins": 1.8313933610916138, "rewards/rejected": -5.956435203552246, "step": 266 }, { "epoch": 0.34876316433994614, "grad_norm": 16.593790276306056, "learning_rate": 4.116664867618394e-07, "logits/chosen": -0.7892999649047852, "logits/rejected": -0.8079116940498352, "logps/chosen": -764.3323974609375, "logps/rejected": -880.1994018554688, "loss": 0.3958, "rewards/accuracies": 0.71875, "rewards/chosen": -4.445393085479736, "rewards/margins": 1.059816837310791, "rewards/rejected": -5.505209922790527, "step": 267 }, { "epoch": 0.35006939341987103, "grad_norm": 15.64984000116705, "learning_rate": 4.1079404741569513e-07, "logits/chosen": -0.743574857711792, "logits/rejected": -0.7336382269859314, "logps/chosen": -741.19677734375, "logps/rejected": -862.8565063476562, "loss": 0.3945, "rewards/accuracies": 0.78125, "rewards/chosen": -4.141275882720947, "rewards/margins": 1.4089856147766113, "rewards/rejected": -5.550261497497559, "step": 268 }, { "epoch": 0.3513756224997959, "grad_norm": 23.080229037991064, "learning_rate": 4.099182553897228e-07, "logits/chosen": -0.7218165397644043, "logits/rejected": -0.7307446599006653, "logps/chosen": -690.0428466796875, "logps/rejected": -1017.659423828125, "loss": 0.3222, "rewards/accuracies": 0.9375, "rewards/chosen": -3.46806263923645, "rewards/margins": 2.848270893096924, "rewards/rejected": -6.316333770751953, "step": 269 }, { "epoch": 0.3526818515797208, "grad_norm": 15.137850783640578, "learning_rate": 4.0903912894486115e-07, "logits/chosen": -0.8584780693054199, "logits/rejected": -0.8682304620742798, "logps/chosen": -786.788330078125, "logps/rejected": -964.2037963867188, "loss": 0.3801, "rewards/accuracies": 0.75, "rewards/chosen": -4.453627586364746, "rewards/margins": 1.5645616054534912, "rewards/rejected": -6.018187999725342, "step": 270 }, { "epoch": 0.3539880806596457, "grad_norm": 16.920756196804312, "learning_rate": 4.0815668641157407e-07, "logits/chosen": -0.720942497253418, "logits/rejected": -0.7908264398574829, "logps/chosen": -785.8327026367188, "logps/rejected": -1009.3907470703125, "loss": 0.3295, "rewards/accuracies": 0.875, "rewards/chosen": -4.186331748962402, "rewards/margins": 1.9950276613235474, "rewards/rejected": -6.181359767913818, "step": 271 }, { "epoch": 0.3552943097395706, "grad_norm": 19.264848572771506, "learning_rate": 4.072709461894687e-07, "logits/chosen": -0.7569836974143982, "logits/rejected": -0.7545579671859741, "logps/chosen": -749.757080078125, "logps/rejected": -933.4493408203125, "loss": 0.3748, "rewards/accuracies": 0.875, "rewards/chosen": -4.149528503417969, "rewards/margins": 1.973623275756836, "rewards/rejected": -6.123151779174805, "step": 272 }, { "epoch": 0.3566005388194955, "grad_norm": 16.611848455705292, "learning_rate": 4.063819267469113e-07, "logits/chosen": -0.7995561957359314, "logits/rejected": -0.8192765712738037, "logps/chosen": -752.6700439453125, "logps/rejected": -1020.6839599609375, "loss": 0.363, "rewards/accuracies": 0.9375, "rewards/chosen": -4.0336713790893555, "rewards/margins": 2.4031214714050293, "rewards/rejected": -6.436792850494385, "step": 273 }, { "epoch": 0.3579067678994204, "grad_norm": 28.426846813580198, "learning_rate": 4.054896466206426e-07, "logits/chosen": -0.7028381824493408, "logits/rejected": -0.68387770652771, "logps/chosen": -789.5866088867188, "logps/rejected": -966.2410888671875, "loss": 0.4124, "rewards/accuracies": 0.875, "rewards/chosen": -4.212189674377441, "rewards/margins": 1.875523328781128, "rewards/rejected": -6.087713241577148, "step": 274 }, { "epoch": 0.3592129969793453, "grad_norm": 21.783109948267267, "learning_rate": 4.0459412441539097e-07, "logits/chosen": -0.720589280128479, "logits/rejected": -0.7432644367218018, "logps/chosen": -799.8099365234375, "logps/rejected": -975.6666259765625, "loss": 0.3745, "rewards/accuracies": 0.78125, "rewards/chosen": -4.347837448120117, "rewards/margins": 1.7226536273956299, "rewards/rejected": -6.070490837097168, "step": 275 }, { "epoch": 0.36051922605927017, "grad_norm": 19.53557011898164, "learning_rate": 4.036953788034846e-07, "logits/chosen": -0.7636262774467468, "logits/rejected": -0.7430158257484436, "logps/chosen": -754.807861328125, "logps/rejected": -938.4634399414062, "loss": 0.4592, "rewards/accuracies": 0.78125, "rewards/chosen": -4.003164768218994, "rewards/margins": 1.8176988363265991, "rewards/rejected": -5.820863723754883, "step": 276 }, { "epoch": 0.36182545513919506, "grad_norm": 22.123611426990557, "learning_rate": 4.027934285244623e-07, "logits/chosen": -0.7706893682479858, "logits/rejected": -0.7902205586433411, "logps/chosen": -764.242919921875, "logps/rejected": -956.5457763671875, "loss": 0.303, "rewards/accuracies": 0.875, "rewards/chosen": -3.8078622817993164, "rewards/margins": 2.1033294200897217, "rewards/rejected": -5.911191940307617, "step": 277 }, { "epoch": 0.36313168421911995, "grad_norm": 18.678151028721693, "learning_rate": 4.0188829238468256e-07, "logits/chosen": -0.8828777074813843, "logits/rejected": -0.9020895957946777, "logps/chosen": -643.5447387695312, "logps/rejected": -933.9203491210938, "loss": 0.4152, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4640512466430664, "rewards/margins": 2.1585946083068848, "rewards/rejected": -5.622645378112793, "step": 278 }, { "epoch": 0.36443791329904485, "grad_norm": 20.975474287569014, "learning_rate": 4.0097998925693166e-07, "logits/chosen": -0.8540270924568176, "logits/rejected": -0.8402454853057861, "logps/chosen": -715.0292358398438, "logps/rejected": -820.9437866210938, "loss": 0.395, "rewards/accuracies": 0.8125, "rewards/chosen": -3.91107439994812, "rewards/margins": 1.1521496772766113, "rewards/rejected": -5.063223838806152, "step": 279 }, { "epoch": 0.36574414237896974, "grad_norm": 26.519174108929906, "learning_rate": 4.0006853808002984e-07, "logits/chosen": -0.7778640389442444, "logits/rejected": -0.792134702205658, "logps/chosen": -669.5437622070312, "logps/rejected": -876.0143432617188, "loss": 0.3501, "rewards/accuracies": 0.875, "rewards/chosen": -3.469069242477417, "rewards/margins": 1.8177546262741089, "rewards/rejected": -5.2868242263793945, "step": 280 }, { "epoch": 0.36705037145889463, "grad_norm": 26.476765515958245, "learning_rate": 3.9915395785843674e-07, "logits/chosen": -0.8218634128570557, "logits/rejected": -0.8803665637969971, "logps/chosen": -788.5381469726562, "logps/rejected": -1056.4686279296875, "loss": 0.3851, "rewards/accuracies": 0.8125, "rewards/chosen": -4.204514503479004, "rewards/margins": 2.040257692337036, "rewards/rejected": -6.244772434234619, "step": 281 }, { "epoch": 0.3683566005388195, "grad_norm": 18.11399655933868, "learning_rate": 3.9823626766185493e-07, "logits/chosen": -0.7796139121055603, "logits/rejected": -0.8175578117370605, "logps/chosen": -761.0985717773438, "logps/rejected": -1089.9530029296875, "loss": 0.3791, "rewards/accuracies": 0.90625, "rewards/chosen": -4.092470645904541, "rewards/margins": 2.7390975952148438, "rewards/rejected": -6.831568717956543, "step": 282 }, { "epoch": 0.36966282961874436, "grad_norm": 17.708912321185966, "learning_rate": 3.973154866248323e-07, "logits/chosen": -0.7699542045593262, "logits/rejected": -0.7915944457054138, "logps/chosen": -720.2528686523438, "logps/rejected": -922.4497680664062, "loss": 0.3909, "rewards/accuracies": 0.875, "rewards/chosen": -4.033339500427246, "rewards/margins": 1.768364667892456, "rewards/rejected": -5.801704406738281, "step": 283 }, { "epoch": 0.37096905869866925, "grad_norm": 24.59750532391127, "learning_rate": 3.963916339463632e-07, "logits/chosen": -0.8112433552742004, "logits/rejected": -0.8277667760848999, "logps/chosen": -811.3380126953125, "logps/rejected": -985.3678588867188, "loss": 0.4022, "rewards/accuracies": 0.75, "rewards/chosen": -4.430903434753418, "rewards/margins": 1.9277082681655884, "rewards/rejected": -6.358611106872559, "step": 284 }, { "epoch": 0.37227528777859414, "grad_norm": 17.110729368478943, "learning_rate": 3.954647288894882e-07, "logits/chosen": -0.7523562908172607, "logits/rejected": -0.7668542861938477, "logps/chosen": -688.250732421875, "logps/rejected": -911.3945922851562, "loss": 0.359, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7461307048797607, "rewards/margins": 2.113868474960327, "rewards/rejected": -5.859999179840088, "step": 285 }, { "epoch": 0.37358151685851904, "grad_norm": 25.899394843198557, "learning_rate": 3.9453479078089215e-07, "logits/chosen": -0.8517543077468872, "logits/rejected": -0.861216127872467, "logps/chosen": -755.353759765625, "logps/rejected": -947.4692993164062, "loss": 0.377, "rewards/accuracies": 0.90625, "rewards/chosen": -3.596562147140503, "rewards/margins": 1.961177110671997, "rewards/rejected": -5.5577392578125, "step": 286 }, { "epoch": 0.37488774593844393, "grad_norm": 19.089362499056715, "learning_rate": 3.936018390105013e-07, "logits/chosen": -0.7259124517440796, "logits/rejected": -0.7561787962913513, "logps/chosen": -753.0985717773438, "logps/rejected": -1026.896728515625, "loss": 0.3647, "rewards/accuracies": 0.875, "rewards/chosen": -3.9283385276794434, "rewards/margins": 2.4963464736938477, "rewards/rejected": -6.424685478210449, "step": 287 }, { "epoch": 0.3761939750183688, "grad_norm": 18.268500447333547, "learning_rate": 3.926658930310793e-07, "logits/chosen": -0.8416800498962402, "logits/rejected": -0.8493793606758118, "logps/chosen": -744.7431640625, "logps/rejected": -862.709228515625, "loss": 0.4318, "rewards/accuracies": 0.875, "rewards/chosen": -4.217617511749268, "rewards/margins": 1.3080933094024658, "rewards/rejected": -5.525710582733154, "step": 288 }, { "epoch": 0.3775002040982937, "grad_norm": 24.401828710049934, "learning_rate": 3.9172697235782113e-07, "logits/chosen": -0.8084543347358704, "logits/rejected": -0.8261023759841919, "logps/chosen": -656.1014404296875, "logps/rejected": -815.1503295898438, "loss": 0.3557, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5779876708984375, "rewards/margins": 1.3531869649887085, "rewards/rejected": -4.9311747550964355, "step": 289 }, { "epoch": 0.3788064331782186, "grad_norm": 26.87923966516802, "learning_rate": 3.907850965679467e-07, "logits/chosen": -0.9053621888160706, "logits/rejected": -0.8963493704795837, "logps/chosen": -796.131103515625, "logps/rejected": -926.56787109375, "loss": 0.4174, "rewards/accuracies": 0.78125, "rewards/chosen": -3.860959529876709, "rewards/margins": 1.4262628555297852, "rewards/rejected": -5.287222385406494, "step": 290 }, { "epoch": 0.3801126622581435, "grad_norm": 19.905113208319822, "learning_rate": 3.898402853002921e-07, "logits/chosen": -0.7205644845962524, "logits/rejected": -0.7387488484382629, "logps/chosen": -710.8943481445312, "logps/rejected": -1005.5984497070312, "loss": 0.3575, "rewards/accuracies": 0.875, "rewards/chosen": -3.6093287467956543, "rewards/margins": 2.4270055294036865, "rewards/rejected": -6.03633451461792, "step": 291 }, { "epoch": 0.3814188913380684, "grad_norm": 18.78689097302734, "learning_rate": 3.8889255825490053e-07, "logits/chosen": -0.8066189289093018, "logits/rejected": -0.8041746616363525, "logps/chosen": -678.9967041015625, "logps/rejected": -807.6049194335938, "loss": 0.4242, "rewards/accuracies": 0.84375, "rewards/chosen": -3.634415626525879, "rewards/margins": 1.2685359716415405, "rewards/rejected": -4.902951240539551, "step": 292 }, { "epoch": 0.3827251204179933, "grad_norm": 24.124003964607752, "learning_rate": 3.879419351926115e-07, "logits/chosen": -0.8516795039176941, "logits/rejected": -0.8285474181175232, "logps/chosen": -713.4191284179688, "logps/rejected": -836.4865112304688, "loss": 0.448, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8255889415740967, "rewards/margins": 1.2763196229934692, "rewards/rejected": -5.1019086837768555, "step": 293 }, { "epoch": 0.3840313494979182, "grad_norm": 24.512379687313626, "learning_rate": 3.8698843593464843e-07, "logits/chosen": -0.874547004699707, "logits/rejected": -0.8514626622200012, "logps/chosen": -662.3638916015625, "logps/rejected": -867.0076293945312, "loss": 0.382, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4917638301849365, "rewards/margins": 2.1748902797698975, "rewards/rejected": -5.66665506362915, "step": 294 }, { "epoch": 0.38533757857784307, "grad_norm": 23.28648570945252, "learning_rate": 3.860320803622059e-07, "logits/chosen": -0.8211836814880371, "logits/rejected": -0.7879142761230469, "logps/chosen": -709.470947265625, "logps/rejected": -909.6597290039062, "loss": 0.3984, "rewards/accuracies": 0.59375, "rewards/chosen": -3.751055955886841, "rewards/margins": 1.771988034248352, "rewards/rejected": -5.523044109344482, "step": 295 }, { "epoch": 0.38664380765776796, "grad_norm": 15.966272480655949, "learning_rate": 3.850728884160347e-07, "logits/chosen": -0.9212681651115417, "logits/rejected": -0.8988990783691406, "logps/chosen": -700.7277221679688, "logps/rejected": -839.85302734375, "loss": 0.4332, "rewards/accuracies": 0.71875, "rewards/chosen": -3.487891674041748, "rewards/margins": 1.4060091972351074, "rewards/rejected": -4.8939008712768555, "step": 296 }, { "epoch": 0.38795003673769285, "grad_norm": 16.62431980117085, "learning_rate": 3.841108800960264e-07, "logits/chosen": -0.8558894991874695, "logits/rejected": -0.8787685036659241, "logps/chosen": -702.3900756835938, "logps/rejected": -1032.5880126953125, "loss": 0.3388, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6224589347839355, "rewards/margins": 2.8511135578155518, "rewards/rejected": -6.473572731018066, "step": 297 }, { "epoch": 0.38925626581761774, "grad_norm": 20.505033729319532, "learning_rate": 3.831460754607958e-07, "logits/chosen": -0.8187196254730225, "logits/rejected": -0.8213932514190674, "logps/chosen": -654.1541748046875, "logps/rejected": -828.8375244140625, "loss": 0.449, "rewards/accuracies": 0.875, "rewards/chosen": -3.3246121406555176, "rewards/margins": 1.8316363096237183, "rewards/rejected": -5.156248092651367, "step": 298 }, { "epoch": 0.39056249489754263, "grad_norm": 24.663180868372333, "learning_rate": 3.821784946272633e-07, "logits/chosen": -0.7171926498413086, "logits/rejected": -0.6735442876815796, "logps/chosen": -678.7763671875, "logps/rejected": -844.7467041015625, "loss": 0.3789, "rewards/accuracies": 0.84375, "rewards/chosen": -3.276444911956787, "rewards/margins": 1.5180835723876953, "rewards/rejected": -4.794528007507324, "step": 299 }, { "epoch": 0.3918687239774675, "grad_norm": 17.49083772410938, "learning_rate": 3.8120815777023506e-07, "logits/chosen": -0.7320696115493774, "logits/rejected": -0.7526537179946899, "logps/chosen": -622.1019897460938, "logps/rejected": -824.1396484375, "loss": 0.3586, "rewards/accuracies": 0.84375, "rewards/chosen": -3.15362548828125, "rewards/margins": 1.5491368770599365, "rewards/rejected": -4.702762603759766, "step": 300 }, { "epoch": 0.3918687239774675, "eval_logits/chosen": -0.765253484249115, "eval_logits/rejected": -0.7676687240600586, "eval_logps/chosen": -685.0309448242188, "eval_logps/rejected": -878.11181640625, "eval_loss": 0.39491522312164307, "eval_rewards/accuracies": 0.871999979019165, "eval_rewards/chosen": -3.4099984169006348, "eval_rewards/margins": 1.8698322772979736, "eval_rewards/rejected": -5.2798309326171875, "eval_runtime": 306.3629, "eval_samples_per_second": 6.528, "eval_steps_per_second": 0.408, "step": 300 }, { "epoch": 0.3931749530573924, "grad_norm": 16.091751836838906, "learning_rate": 3.8023508512198257e-07, "logits/chosen": -0.868710994720459, "logits/rejected": -0.8930901288986206, "logps/chosen": -620.3997192382812, "logps/rejected": -734.6292114257812, "loss": 0.3495, "rewards/accuracies": 0.84375, "rewards/chosen": -2.891201972961426, "rewards/margins": 1.2318321466445923, "rewards/rejected": -4.1230340003967285, "step": 301 }, { "epoch": 0.3944811821373173, "grad_norm": 16.748258638354255, "learning_rate": 3.792592969718204e-07, "logits/chosen": -0.8345980644226074, "logits/rejected": -0.8519657850265503, "logps/chosen": -716.6856689453125, "logps/rejected": -984.464599609375, "loss": 0.3587, "rewards/accuracies": 0.875, "rewards/chosen": -3.8957595825195312, "rewards/margins": 2.130659580230713, "rewards/rejected": -6.026419162750244, "step": 302 }, { "epoch": 0.3957874112172422, "grad_norm": 16.663033219846664, "learning_rate": 3.7828081366568384e-07, "logits/chosen": -0.8669158220291138, "logits/rejected": -0.8916921615600586, "logps/chosen": -657.3521118164062, "logps/rejected": -780.7493896484375, "loss": 0.3705, "rewards/accuracies": 0.875, "rewards/chosen": -3.228039503097534, "rewards/margins": 1.194130301475525, "rewards/rejected": -4.4221696853637695, "step": 303 }, { "epoch": 0.3970936402971671, "grad_norm": 13.630350117251616, "learning_rate": 3.772996556057039e-07, "logits/chosen": -0.8360152840614319, "logits/rejected": -0.8600199222564697, "logps/chosen": -676.6741943359375, "logps/rejected": -839.611083984375, "loss": 0.355, "rewards/accuracies": 0.875, "rewards/chosen": -3.482692003250122, "rewards/margins": 1.5282418727874756, "rewards/rejected": -5.010933876037598, "step": 304 }, { "epoch": 0.398399869377092, "grad_norm": 13.096658139864836, "learning_rate": 3.763158432497823e-07, "logits/chosen": -0.7596051692962646, "logits/rejected": -0.7697958946228027, "logps/chosen": -722.1881103515625, "logps/rejected": -954.9688720703125, "loss": 0.3335, "rewards/accuracies": 0.9375, "rewards/chosen": -3.748467206954956, "rewards/margins": 1.9416289329528809, "rewards/rejected": -5.690095901489258, "step": 305 }, { "epoch": 0.3997060984570169, "grad_norm": 15.636359990948772, "learning_rate": 3.753293971111652e-07, "logits/chosen": -1.0189201831817627, "logits/rejected": -1.0032825469970703, "logps/chosen": -795.2416381835938, "logps/rejected": -1025.907958984375, "loss": 0.3618, "rewards/accuracies": 0.96875, "rewards/chosen": -4.307117938995361, "rewards/margins": 2.324575185775757, "rewards/rejected": -6.631693363189697, "step": 306 }, { "epoch": 0.40101232753694177, "grad_norm": 14.176580126233755, "learning_rate": 3.743403377580148e-07, "logits/chosen": -0.7651489973068237, "logits/rejected": -0.8138822317123413, "logps/chosen": -788.5110473632812, "logps/rejected": -976.249267578125, "loss": 0.3584, "rewards/accuracies": 0.78125, "rewards/chosen": -4.725651264190674, "rewards/margins": 1.478232502937317, "rewards/rejected": -6.203884124755859, "step": 307 }, { "epoch": 0.40231855661686666, "grad_norm": 18.74087147989152, "learning_rate": 3.7334868581298104e-07, "logits/chosen": -0.7951301336288452, "logits/rejected": -0.8030416965484619, "logps/chosen": -865.9967651367188, "logps/rejected": -1085.788330078125, "loss": 0.3421, "rewards/accuracies": 0.8125, "rewards/chosen": -4.567200183868408, "rewards/margins": 1.9825466871261597, "rewards/rejected": -6.549746513366699, "step": 308 }, { "epoch": 0.40362478569679155, "grad_norm": 36.18623439913893, "learning_rate": 3.7235446195277136e-07, "logits/chosen": -0.8499747514724731, "logits/rejected": -0.8248738646507263, "logps/chosen": -713.7739868164062, "logps/rejected": -1034.135009765625, "loss": 0.287, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9644153118133545, "rewards/margins": 2.9848814010620117, "rewards/rejected": -6.949296951293945, "step": 309 }, { "epoch": 0.40493101477671645, "grad_norm": 32.30699716936894, "learning_rate": 3.713576869077195e-07, "logits/chosen": -0.8620212078094482, "logits/rejected": -0.8468830585479736, "logps/chosen": -833.579833984375, "logps/rejected": -1144.938232421875, "loss": 0.4218, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6529130935668945, "rewards/margins": 3.0569937229156494, "rewards/rejected": -7.709907054901123, "step": 310 }, { "epoch": 0.40623724385664134, "grad_norm": 16.10176328410453, "learning_rate": 3.703583814613536e-07, "logits/chosen": -0.7625004649162292, "logits/rejected": -0.7636492252349854, "logps/chosen": -889.8790893554688, "logps/rejected": -1146.9530029296875, "loss": 0.3173, "rewards/accuracies": 0.90625, "rewards/chosen": -5.2467522621154785, "rewards/margins": 2.842636823654175, "rewards/rejected": -8.089387893676758, "step": 311 }, { "epoch": 0.40754347293656623, "grad_norm": 16.447526660324478, "learning_rate": 3.693565664499623e-07, "logits/chosen": -0.961453914642334, "logits/rejected": -0.9357421398162842, "logps/chosen": -755.412841796875, "logps/rejected": -965.876953125, "loss": 0.3669, "rewards/accuracies": 0.90625, "rewards/chosen": -4.1061601638793945, "rewards/margins": 2.250901937484741, "rewards/rejected": -6.357061862945557, "step": 312 }, { "epoch": 0.4088497020164911, "grad_norm": 23.371894242649518, "learning_rate": 3.683522627621608e-07, "logits/chosen": -0.854593813419342, "logits/rejected": -0.9100630879402161, "logps/chosen": -789.1729736328125, "logps/rejected": -1074.300048828125, "loss": 0.356, "rewards/accuracies": 0.84375, "rewards/chosen": -4.489620208740234, "rewards/margins": 2.253740072250366, "rewards/rejected": -6.7433600425720215, "step": 313 }, { "epoch": 0.410155931096416, "grad_norm": 32.176515119654795, "learning_rate": 3.6734549133845533e-07, "logits/chosen": -0.7982116341590881, "logits/rejected": -0.8008893132209778, "logps/chosen": -879.0695190429688, "logps/rejected": -1155.3798828125, "loss": 0.3758, "rewards/accuracies": 0.90625, "rewards/chosen": -5.085649013519287, "rewards/margins": 2.7232906818389893, "rewards/rejected": -7.8089399337768555, "step": 314 }, { "epoch": 0.4114621601763409, "grad_norm": 26.366438342209516, "learning_rate": 3.6633627317080585e-07, "logits/chosen": -0.9103017449378967, "logits/rejected": -0.841745138168335, "logps/chosen": -873.0088500976562, "logps/rejected": -1075.5986328125, "loss": 0.3927, "rewards/accuracies": 0.84375, "rewards/chosen": -5.0356855392456055, "rewards/margins": 2.1732540130615234, "rewards/rejected": -7.208939552307129, "step": 315 }, { "epoch": 0.4127683892562658, "grad_norm": 21.575409938356536, "learning_rate": 3.653246293021891e-07, "logits/chosen": -0.8488960266113281, "logits/rejected": -0.8882749080657959, "logps/chosen": -883.19287109375, "logps/rejected": -1041.235107421875, "loss": 0.4297, "rewards/accuracies": 0.75, "rewards/chosen": -5.056049346923828, "rewards/margins": 1.5542391538619995, "rewards/rejected": -6.610287666320801, "step": 316 }, { "epoch": 0.4140746183361907, "grad_norm": 35.21304677674138, "learning_rate": 3.643105808261596e-07, "logits/chosen": -0.8114643692970276, "logits/rejected": -0.8472386002540588, "logps/chosen": -771.3401489257812, "logps/rejected": -962.2191162109375, "loss": 0.4264, "rewards/accuracies": 0.8125, "rewards/chosen": -4.822422504425049, "rewards/margins": 1.6922321319580078, "rewards/rejected": -6.514655113220215, "step": 317 }, { "epoch": 0.4153808474161156, "grad_norm": 24.020179547425144, "learning_rate": 3.632941488864097e-07, "logits/chosen": -0.7898439168930054, "logits/rejected": -0.8224785923957825, "logps/chosen": -779.1013793945312, "logps/rejected": -1024.5921630859375, "loss": 0.3937, "rewards/accuracies": 0.78125, "rewards/chosen": -4.604560852050781, "rewards/margins": 2.217036247253418, "rewards/rejected": -6.821596622467041, "step": 318 }, { "epoch": 0.4166870764960405, "grad_norm": 15.576349233981643, "learning_rate": 3.6227535467632867e-07, "logits/chosen": -0.9034079313278198, "logits/rejected": -0.9171915054321289, "logps/chosen": -718.1586303710938, "logps/rejected": -1026.34228515625, "loss": 0.3409, "rewards/accuracies": 0.9375, "rewards/chosen": -3.507500171661377, "rewards/margins": 2.7573251724243164, "rewards/rejected": -6.264824867248535, "step": 319 }, { "epoch": 0.41799330557596537, "grad_norm": 23.449788537469892, "learning_rate": 3.6125421943856125e-07, "logits/chosen": -0.7588320970535278, "logits/rejected": -0.7619040012359619, "logps/chosen": -728.4111328125, "logps/rejected": -954.530029296875, "loss": 0.3494, "rewards/accuracies": 0.84375, "rewards/chosen": -4.0672736167907715, "rewards/margins": 2.065455913543701, "rewards/rejected": -6.132730007171631, "step": 320 }, { "epoch": 0.41929953465589026, "grad_norm": 18.658517623897286, "learning_rate": 3.602307644645641e-07, "logits/chosen": -0.9201053380966187, "logits/rejected": -0.9317169785499573, "logps/chosen": -665.0424194335938, "logps/rejected": -868.5579833984375, "loss": 0.2884, "rewards/accuracies": 0.96875, "rewards/chosen": -3.754164457321167, "rewards/margins": 1.7947040796279907, "rewards/rejected": -5.548868179321289, "step": 321 }, { "epoch": 0.42060576373581515, "grad_norm": 15.532608997201159, "learning_rate": 3.5920501109416233e-07, "logits/chosen": -0.977518618106842, "logits/rejected": -0.9513765573501587, "logps/chosen": -797.5126953125, "logps/rejected": -904.82470703125, "loss": 0.3584, "rewards/accuracies": 0.78125, "rewards/chosen": -3.851003646850586, "rewards/margins": 1.2614593505859375, "rewards/rejected": -5.112462997436523, "step": 322 }, { "epoch": 0.42191199281574004, "grad_norm": 21.182996168241864, "learning_rate": 3.581769807151044e-07, "logits/chosen": -0.8207509517669678, "logits/rejected": -0.8763917684555054, "logps/chosen": -709.5821533203125, "logps/rejected": -980.6561279296875, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -3.7620749473571777, "rewards/margins": 2.246896505355835, "rewards/rejected": -6.008971214294434, "step": 323 }, { "epoch": 0.42321822189566494, "grad_norm": 16.443088156761927, "learning_rate": 3.571466947626162e-07, "logits/chosen": -0.8285816311836243, "logits/rejected": -0.814357340335846, "logps/chosen": -751.146240234375, "logps/rejected": -968.7061767578125, "loss": 0.2996, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3344221115112305, "rewards/margins": 1.9379419088363647, "rewards/rejected": -6.272363662719727, "step": 324 }, { "epoch": 0.42452445097558983, "grad_norm": 19.619563047541227, "learning_rate": 3.5611417471895376e-07, "logits/chosen": -0.7870075106620789, "logits/rejected": -0.7580903172492981, "logps/chosen": -721.9214477539062, "logps/rejected": -890.1439208984375, "loss": 0.2708, "rewards/accuracies": 0.90625, "rewards/chosen": -4.170993804931641, "rewards/margins": 1.6512360572814941, "rewards/rejected": -5.822230339050293, "step": 325 }, { "epoch": 0.4258306800555147, "grad_norm": 18.263410236851353, "learning_rate": 3.5507944211295604e-07, "logits/chosen": -0.89299476146698, "logits/rejected": -0.8813902139663696, "logps/chosen": -851.031982421875, "logps/rejected": -1084.912841796875, "loss": 0.3693, "rewards/accuracies": 0.875, "rewards/chosen": -4.2356858253479, "rewards/margins": 2.128682851791382, "rewards/rejected": -6.364368915557861, "step": 326 }, { "epoch": 0.4271369091354396, "grad_norm": 24.469596399437656, "learning_rate": 3.540425185195953e-07, "logits/chosen": -0.7243574261665344, "logits/rejected": -0.7720382213592529, "logps/chosen": -872.6907348632812, "logps/rejected": -1353.14599609375, "loss": 0.408, "rewards/accuracies": 0.875, "rewards/chosen": -5.031065464019775, "rewards/margins": 3.7641468048095703, "rewards/rejected": -8.795212745666504, "step": 327 }, { "epoch": 0.4284431382153645, "grad_norm": 27.430992219157226, "learning_rate": 3.5300342555952787e-07, "logits/chosen": -0.7673375606536865, "logits/rejected": -0.8125264048576355, "logps/chosen": -761.0677490234375, "logps/rejected": -919.9730834960938, "loss": 0.4373, "rewards/accuracies": 0.78125, "rewards/chosen": -4.35603141784668, "rewards/margins": 1.2068922519683838, "rewards/rejected": -5.562923431396484, "step": 328 }, { "epoch": 0.4297493672952894, "grad_norm": 17.69204592084202, "learning_rate": 3.519621848986428e-07, "logits/chosen": -0.883451521396637, "logits/rejected": -0.9020692110061646, "logps/chosen": -846.7744140625, "logps/rejected": -1188.0194091796875, "loss": 0.3338, "rewards/accuracies": 0.9375, "rewards/chosen": -5.060979843139648, "rewards/margins": 2.9336440563201904, "rewards/rejected": -7.994624137878418, "step": 329 }, { "epoch": 0.4310555963752143, "grad_norm": 20.036121655806124, "learning_rate": 3.5091881824761046e-07, "logits/chosen": -0.8099783062934875, "logits/rejected": -0.8211954832077026, "logps/chosen": -773.2403564453125, "logps/rejected": -1035.19580078125, "loss": 0.3221, "rewards/accuracies": 0.9375, "rewards/chosen": -4.553930282592773, "rewards/margins": 2.4084632396698, "rewards/rejected": -6.962393283843994, "step": 330 }, { "epoch": 0.4323618254551392, "grad_norm": 29.84146239469266, "learning_rate": 3.4987334736142977e-07, "logits/chosen": -0.8917319774627686, "logits/rejected": -0.7940698862075806, "logps/chosen": -839.36376953125, "logps/rejected": -1059.7322998046875, "loss": 0.4586, "rewards/accuracies": 0.875, "rewards/chosen": -4.834168434143066, "rewards/margins": 2.5644521713256836, "rewards/rejected": -7.39862060546875, "step": 331 }, { "epoch": 0.4336680545350641, "grad_norm": 21.68856790635962, "learning_rate": 3.4882579403897455e-07, "logits/chosen": -0.7471975088119507, "logits/rejected": -0.8152725696563721, "logps/chosen": -822.88818359375, "logps/rejected": -1091.7850341796875, "loss": 0.3349, "rewards/accuracies": 0.84375, "rewards/chosen": -5.2126054763793945, "rewards/margins": 2.201547622680664, "rewards/rejected": -7.414153099060059, "step": 332 }, { "epoch": 0.43497428361498897, "grad_norm": 23.858340474183475, "learning_rate": 3.4777618012253895e-07, "logits/chosen": -0.9458297491073608, "logits/rejected": -0.9789966344833374, "logps/chosen": -846.14990234375, "logps/rejected": -1049.368408203125, "loss": 0.3866, "rewards/accuracies": 0.78125, "rewards/chosen": -4.42270565032959, "rewards/margins": 2.1395516395568848, "rewards/rejected": -6.562256336212158, "step": 333 }, { "epoch": 0.43628051269491386, "grad_norm": 20.135393062791458, "learning_rate": 3.4672452749738233e-07, "logits/chosen": -0.6803852319717407, "logits/rejected": -0.7442112565040588, "logps/chosen": -839.7537231445312, "logps/rejected": -1007.471923828125, "loss": 0.4086, "rewards/accuracies": 0.75, "rewards/chosen": -5.06041955947876, "rewards/margins": 1.4089503288269043, "rewards/rejected": -6.469369888305664, "step": 334 }, { "epoch": 0.43758674177483875, "grad_norm": 26.30301422269973, "learning_rate": 3.4567085809127245e-07, "logits/chosen": -0.927352249622345, "logits/rejected": -0.856540322303772, "logps/chosen": -872.8140869140625, "logps/rejected": -1098.021728515625, "loss": 0.4098, "rewards/accuracies": 0.78125, "rewards/chosen": -4.658236503601074, "rewards/margins": 2.292365312576294, "rewards/rejected": -6.950601577758789, "step": 335 }, { "epoch": 0.43889297085476364, "grad_norm": 23.07833880886756, "learning_rate": 3.446151938740285e-07, "logits/chosen": -0.8015051484107971, "logits/rejected": -0.8258290886878967, "logps/chosen": -785.102294921875, "logps/rejected": -1044.4542236328125, "loss": 0.4308, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3944268226623535, "rewards/margins": 2.6531472206115723, "rewards/rejected": -7.047574043273926, "step": 336 }, { "epoch": 0.44019919993468853, "grad_norm": 24.733070230983582, "learning_rate": 3.4355755685706326e-07, "logits/chosen": -0.8332501649856567, "logits/rejected": -0.8956983685493469, "logps/chosen": -841.2762451171875, "logps/rejected": -1415.9942626953125, "loss": 0.4168, "rewards/accuracies": 0.8125, "rewards/chosen": -4.841058731079102, "rewards/margins": 5.052248954772949, "rewards/rejected": -9.893306732177734, "step": 337 }, { "epoch": 0.4415054290146134, "grad_norm": 19.097211954684813, "learning_rate": 3.4249796909292374e-07, "logits/chosen": -0.8756859302520752, "logits/rejected": -0.8802890181541443, "logps/chosen": -720.0657958984375, "logps/rejected": -893.16748046875, "loss": 0.4286, "rewards/accuracies": 0.84375, "rewards/chosen": -3.8131256103515625, "rewards/margins": 1.430503249168396, "rewards/rejected": -5.24362850189209, "step": 338 }, { "epoch": 0.4428116580945383, "grad_norm": 16.02571558109891, "learning_rate": 3.4143645267483137e-07, "logits/chosen": -0.8234140276908875, "logits/rejected": -0.8196402788162231, "logps/chosen": -729.0518798828125, "logps/rejected": -888.3472900390625, "loss": 0.3755, "rewards/accuracies": 0.875, "rewards/chosen": -4.293505668640137, "rewards/margins": 1.6021445989608765, "rewards/rejected": -5.895649433135986, "step": 339 }, { "epoch": 0.4441178871744632, "grad_norm": 20.094294388002286, "learning_rate": 3.403730297362219e-07, "logits/chosen": -0.8382161855697632, "logits/rejected": -0.8710712194442749, "logps/chosen": -740.0045166015625, "logps/rejected": -877.9818115234375, "loss": 0.4001, "rewards/accuracies": 0.78125, "rewards/chosen": -4.060654163360596, "rewards/margins": 1.3980674743652344, "rewards/rejected": -5.458722114562988, "step": 340 }, { "epoch": 0.4454241162543881, "grad_norm": 15.094516807495772, "learning_rate": 3.3930772245028317e-07, "logits/chosen": -0.761610746383667, "logits/rejected": -0.8118141293525696, "logps/chosen": -753.0406494140625, "logps/rejected": -1056.1932373046875, "loss": 0.3754, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8587474822998047, "rewards/margins": 2.5299229621887207, "rewards/rejected": -6.388670921325684, "step": 341 }, { "epoch": 0.446730345334313, "grad_norm": 19.17347194519931, "learning_rate": 3.382405530294933e-07, "logits/chosen": -0.8792775869369507, "logits/rejected": -0.8162313103675842, "logps/chosen": -714.256103515625, "logps/rejected": -881.3817749023438, "loss": 0.3459, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3989529609680176, "rewards/margins": 1.8843724727630615, "rewards/rejected": -5.2833251953125, "step": 342 }, { "epoch": 0.4480365744142379, "grad_norm": 14.512362955820704, "learning_rate": 3.371715437251571e-07, "logits/chosen": -0.8337914943695068, "logits/rejected": -0.858870804309845, "logps/chosen": -704.8893432617188, "logps/rejected": -1020.823974609375, "loss": 0.353, "rewards/accuracies": 0.84375, "rewards/chosen": -3.540440082550049, "rewards/margins": 2.77099871635437, "rewards/rejected": -6.311439037322998, "step": 343 }, { "epoch": 0.4493428034941628, "grad_norm": 14.767671827120036, "learning_rate": 3.3610071682694286e-07, "logits/chosen": -0.9238380193710327, "logits/rejected": -0.8974184989929199, "logps/chosen": -788.702880859375, "logps/rejected": -924.5913696289062, "loss": 0.3612, "rewards/accuracies": 0.84375, "rewards/chosen": -3.856344699859619, "rewards/margins": 1.5991241931915283, "rewards/rejected": -5.455468654632568, "step": 344 }, { "epoch": 0.45064903257408767, "grad_norm": 13.867972850755454, "learning_rate": 3.3502809466241653e-07, "logits/chosen": -1.0090371370315552, "logits/rejected": -0.9271538257598877, "logps/chosen": -864.546875, "logps/rejected": -1048.196044921875, "loss": 0.3424, "rewards/accuracies": 0.84375, "rewards/chosen": -4.667964458465576, "rewards/margins": 2.3645546436309814, "rewards/rejected": -7.032519340515137, "step": 345 }, { "epoch": 0.45195526165401256, "grad_norm": 20.821023709654465, "learning_rate": 3.3395369959657713e-07, "logits/chosen": -0.8493632674217224, "logits/rejected": -0.8992824554443359, "logps/chosen": -694.7129516601562, "logps/rejected": -967.9573974609375, "loss": 0.3349, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8514938354492188, "rewards/margins": 2.331794261932373, "rewards/rejected": -6.183287620544434, "step": 346 }, { "epoch": 0.45326149073393746, "grad_norm": 20.674015602908195, "learning_rate": 3.3287755403139e-07, "logits/chosen": -0.829691469669342, "logits/rejected": -0.8424278497695923, "logps/chosen": -786.063232421875, "logps/rejected": -927.3167724609375, "loss": 0.3809, "rewards/accuracies": 0.84375, "rewards/chosen": -4.13981294631958, "rewards/margins": 1.3104791641235352, "rewards/rejected": -5.450291633605957, "step": 347 }, { "epoch": 0.45456771981386235, "grad_norm": 14.753612377748723, "learning_rate": 3.3179968040531945e-07, "logits/chosen": -0.7737857699394226, "logits/rejected": -0.8464547395706177, "logps/chosen": -736.8883056640625, "logps/rejected": -1053.45166015625, "loss": 0.3447, "rewards/accuracies": 0.90625, "rewards/chosen": -4.1345133781433105, "rewards/margins": 2.341533899307251, "rewards/rejected": -6.476047992706299, "step": 348 }, { "epoch": 0.45587394889378724, "grad_norm": 17.764757170781934, "learning_rate": 3.3072010119286155e-07, "logits/chosen": -0.8078272342681885, "logits/rejected": -0.8359349966049194, "logps/chosen": -783.0404663085938, "logps/rejected": -923.0152587890625, "loss": 0.346, "rewards/accuracies": 0.78125, "rewards/chosen": -4.535928249359131, "rewards/margins": 1.3883693218231201, "rewards/rejected": -5.92429780960083, "step": 349 }, { "epoch": 0.45718017797371213, "grad_norm": 15.68293110394188, "learning_rate": 3.2963883890407495e-07, "logits/chosen": -0.992402195930481, "logits/rejected": -0.9728306531906128, "logps/chosen": -829.182373046875, "logps/rejected": -916.9439086914062, "loss": 0.3691, "rewards/accuracies": 0.8125, "rewards/chosen": -4.468806266784668, "rewards/margins": 1.1914604902267456, "rewards/rejected": -5.660266876220703, "step": 350 }, { "epoch": 0.458486407053637, "grad_norm": 17.46952938405253, "learning_rate": 3.28555916084112e-07, "logits/chosen": -0.6900120973587036, "logits/rejected": -0.7500667572021484, "logps/chosen": -700.6179809570312, "logps/rejected": -979.9500122070312, "loss": 0.3445, "rewards/accuracies": 0.84375, "rewards/chosen": -4.12177848815918, "rewards/margins": 2.488147258758545, "rewards/rejected": -6.609926223754883, "step": 351 }, { "epoch": 0.4597926361335619, "grad_norm": 22.1790777909453, "learning_rate": 3.274713553127479e-07, "logits/chosen": -0.8638774156570435, "logits/rejected": -0.7936422824859619, "logps/chosen": -765.1031494140625, "logps/rejected": -930.421875, "loss": 0.3737, "rewards/accuracies": 0.84375, "rewards/chosen": -4.624390602111816, "rewards/margins": 1.6826943159103394, "rewards/rejected": -6.307085037231445, "step": 352 }, { "epoch": 0.4610988652134868, "grad_norm": 12.479143269697133, "learning_rate": 3.263851792039109e-07, "logits/chosen": -0.9091026782989502, "logits/rejected": -0.9592025279998779, "logps/chosen": -741.4254760742188, "logps/rejected": -963.5419921875, "loss": 0.3, "rewards/accuracies": 0.90625, "rewards/chosen": -3.949568748474121, "rewards/margins": 1.8915400505065918, "rewards/rejected": -5.841108322143555, "step": 353 }, { "epoch": 0.4624050942934117, "grad_norm": 18.630079829400888, "learning_rate": 3.252974104052101e-07, "logits/chosen": -0.7867209911346436, "logits/rejected": -0.7810537815093994, "logps/chosen": -818.9255981445312, "logps/rejected": -1026.5875244140625, "loss": 0.376, "rewards/accuracies": 0.78125, "rewards/chosen": -4.769900321960449, "rewards/margins": 2.1134841442108154, "rewards/rejected": -6.883384704589844, "step": 354 }, { "epoch": 0.4637113233733366, "grad_norm": 20.191982248042134, "learning_rate": 3.2420807159746327e-07, "logits/chosen": -0.9127646684646606, "logits/rejected": -0.8559162616729736, "logps/chosen": -896.9894409179688, "logps/rejected": -1082.9798583984375, "loss": 0.3528, "rewards/accuracies": 0.75, "rewards/chosen": -5.027846336364746, "rewards/margins": 2.029517889022827, "rewards/rejected": -7.057364463806152, "step": 355 }, { "epoch": 0.4650175524532615, "grad_norm": 34.75961013205875, "learning_rate": 3.2311718549422435e-07, "logits/chosen": -0.8626306056976318, "logits/rejected": -0.8488350510597229, "logps/chosen": -848.584716796875, "logps/rejected": -985.186767578125, "loss": 0.3659, "rewards/accuracies": 0.75, "rewards/chosen": -4.735561847686768, "rewards/margins": 1.573469877243042, "rewards/rejected": -6.309031963348389, "step": 356 }, { "epoch": 0.4663237815331864, "grad_norm": 16.615213616984725, "learning_rate": 3.220247748413094e-07, "logits/chosen": -0.9426769614219666, "logits/rejected": -0.9294852018356323, "logps/chosen": -804.0889282226562, "logps/rejected": -981.0759887695312, "loss": 0.2838, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3830060958862305, "rewards/margins": 1.6862914562225342, "rewards/rejected": -6.069297790527344, "step": 357 }, { "epoch": 0.46763001061311127, "grad_norm": 21.324802872506126, "learning_rate": 3.209308624163225e-07, "logits/chosen": -0.942538857460022, "logits/rejected": -0.951295018196106, "logps/chosen": -875.485107421875, "logps/rejected": -1008.2823486328125, "loss": 0.3164, "rewards/accuracies": 0.8125, "rewards/chosen": -4.925025463104248, "rewards/margins": 1.5497174263000488, "rewards/rejected": -6.474742889404297, "step": 358 }, { "epoch": 0.46893623969303616, "grad_norm": 24.770720365271597, "learning_rate": 3.1983547102818096e-07, "logits/chosen": -0.9059143662452698, "logits/rejected": -0.8871181607246399, "logps/chosen": -854.4793090820312, "logps/rejected": -1108.9869384765625, "loss": 0.3635, "rewards/accuracies": 0.8125, "rewards/chosen": -4.860678672790527, "rewards/margins": 2.5468215942382812, "rewards/rejected": -7.407499313354492, "step": 359 }, { "epoch": 0.47024246877296105, "grad_norm": 28.415307220726664, "learning_rate": 3.187386235166396e-07, "logits/chosen": -0.900255560874939, "logits/rejected": -0.8746569156646729, "logps/chosen": -831.3404541015625, "logps/rejected": -973.556396484375, "loss": 0.4562, "rewards/accuracies": 0.75, "rewards/chosen": -4.972175598144531, "rewards/margins": 1.6384382247924805, "rewards/rejected": -6.6106133460998535, "step": 360 }, { "epoch": 0.47154869785288595, "grad_norm": 22.30729021657225, "learning_rate": 3.176403427518143e-07, "logits/chosen": -0.8422956466674805, "logits/rejected": -0.8382501602172852, "logps/chosen": -790.753662109375, "logps/rejected": -1003.9214477539062, "loss": 0.3137, "rewards/accuracies": 0.90625, "rewards/chosen": -4.53585147857666, "rewards/margins": 2.458928346633911, "rewards/rejected": -6.99478006362915, "step": 361 }, { "epoch": 0.47285492693281084, "grad_norm": 20.89001631156089, "learning_rate": 3.165406516337057e-07, "logits/chosen": -0.9484958052635193, "logits/rejected": -0.8832159042358398, "logps/chosen": -944.7320556640625, "logps/rejected": -1150.0098876953125, "loss": 0.3452, "rewards/accuracies": 0.875, "rewards/chosen": -5.379519462585449, "rewards/margins": 2.542079448699951, "rewards/rejected": -7.9215989112854, "step": 362 }, { "epoch": 0.47416115601273573, "grad_norm": 25.18282457138755, "learning_rate": 3.154395730917213e-07, "logits/chosen": -0.9980475902557373, "logits/rejected": -0.9696142077445984, "logps/chosen": -971.07275390625, "logps/rejected": -1192.8839111328125, "loss": 0.3473, "rewards/accuracies": 0.9375, "rewards/chosen": -5.382442951202393, "rewards/margins": 2.5971856117248535, "rewards/rejected": -7.979628562927246, "step": 363 }, { "epoch": 0.4754673850926606, "grad_norm": 21.129084566980914, "learning_rate": 3.143371300841973e-07, "logits/chosen": -0.8516480922698975, "logits/rejected": -0.8902812004089355, "logps/chosen": -815.789306640625, "logps/rejected": -1200.0732421875, "loss": 0.3557, "rewards/accuracies": 0.90625, "rewards/chosen": -4.755250930786133, "rewards/margins": 3.439861297607422, "rewards/rejected": -8.195111274719238, "step": 364 }, { "epoch": 0.4767736141725855, "grad_norm": 33.46170211823593, "learning_rate": 3.1323334559792015e-07, "logits/chosen": -0.909258246421814, "logits/rejected": -0.9520514011383057, "logps/chosen": -816.0494995117188, "logps/rejected": -1048.5968017578125, "loss": 0.361, "rewards/accuracies": 0.875, "rewards/chosen": -4.6202073097229, "rewards/margins": 2.1964542865753174, "rewards/rejected": -6.816661834716797, "step": 365 }, { "epoch": 0.4780798432525104, "grad_norm": 26.43165680790808, "learning_rate": 3.1212824264764727e-07, "logits/chosen": -0.7765522003173828, "logits/rejected": -0.7952960133552551, "logps/chosen": -838.6836547851562, "logps/rejected": -1241.4041748046875, "loss": 0.3913, "rewards/accuracies": 0.84375, "rewards/chosen": -4.901317119598389, "rewards/margins": 3.617424726486206, "rewards/rejected": -8.5187406539917, "step": 366 }, { "epoch": 0.4793860723324353, "grad_norm": 19.76142450798905, "learning_rate": 3.1102184427562696e-07, "logits/chosen": -0.8111026883125305, "logits/rejected": -0.8469929695129395, "logps/chosen": -913.451416015625, "logps/rejected": -1214.3255615234375, "loss": 0.3589, "rewards/accuracies": 0.84375, "rewards/chosen": -5.217056751251221, "rewards/margins": 2.5425302982330322, "rewards/rejected": -7.759586334228516, "step": 367 }, { "epoch": 0.4806923014123602, "grad_norm": 28.26438879788685, "learning_rate": 3.0991417355111807e-07, "logits/chosen": -0.9394615888595581, "logits/rejected": -0.9448978900909424, "logps/chosen": -881.9717407226562, "logps/rejected": -1111.5242919921875, "loss": 0.3877, "rewards/accuracies": 0.71875, "rewards/chosen": -4.882044792175293, "rewards/margins": 2.1486172676086426, "rewards/rejected": -7.030661582946777, "step": 368 }, { "epoch": 0.4819985304922851, "grad_norm": 18.563211767851524, "learning_rate": 3.088052535699089e-07, "logits/chosen": -0.9249133467674255, "logits/rejected": -0.9536056518554688, "logps/chosen": -917.371337890625, "logps/rejected": -1216.37060546875, "loss": 0.3908, "rewards/accuracies": 0.96875, "rewards/chosen": -4.822209358215332, "rewards/margins": 2.9570131301879883, "rewards/rejected": -7.77922248840332, "step": 369 }, { "epoch": 0.48330475957221, "grad_norm": 44.299895456640826, "learning_rate": 3.07695107453836e-07, "logits/chosen": -0.9447470307350159, "logits/rejected": -0.8442981243133545, "logps/chosen": -790.659423828125, "logps/rejected": -901.15234375, "loss": 0.3445, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9590797424316406, "rewards/margins": 1.7083748579025269, "rewards/rejected": -5.667453765869141, "step": 370 }, { "epoch": 0.48461098865213487, "grad_norm": 25.707606867503262, "learning_rate": 3.0658375835030144e-07, "logits/chosen": -0.8817845582962036, "logits/rejected": -0.9164577722549438, "logps/chosen": -795.1084594726562, "logps/rejected": -1039.523681640625, "loss": 0.3583, "rewards/accuracies": 0.90625, "rewards/chosen": -4.440952301025391, "rewards/margins": 2.2085859775543213, "rewards/rejected": -6.649538993835449, "step": 371 }, { "epoch": 0.48591721773205976, "grad_norm": 16.939141948308613, "learning_rate": 3.0547122943179067e-07, "logits/chosen": -0.8850041031837463, "logits/rejected": -0.8968605399131775, "logps/chosen": -694.5146484375, "logps/rejected": -841.6837158203125, "loss": 0.4063, "rewards/accuracies": 0.875, "rewards/chosen": -3.9476029872894287, "rewards/margins": 1.4921026229858398, "rewards/rejected": -5.439705848693848, "step": 372 }, { "epoch": 0.48722344681198465, "grad_norm": 16.412461798089808, "learning_rate": 3.0435754389538925e-07, "logits/chosen": -0.8374086618423462, "logits/rejected": -0.8305915594100952, "logps/chosen": -679.0845336914062, "logps/rejected": -899.1082763671875, "loss": 0.3347, "rewards/accuracies": 0.875, "rewards/chosen": -3.579893112182617, "rewards/margins": 2.188535213470459, "rewards/rejected": -5.768428325653076, "step": 373 }, { "epoch": 0.48852967589190954, "grad_norm": 23.798205961863953, "learning_rate": 3.03242724962299e-07, "logits/chosen": -0.936191201210022, "logits/rejected": -0.8969914317131042, "logps/chosen": -755.7578125, "logps/rejected": -921.0574340820312, "loss": 0.3774, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9115138053894043, "rewards/margins": 1.8655014038085938, "rewards/rejected": -5.777015209197998, "step": 374 }, { "epoch": 0.48983590497183443, "grad_norm": 12.937570248162015, "learning_rate": 3.0212679587735396e-07, "logits/chosen": -0.7778648138046265, "logits/rejected": -0.8440149426460266, "logps/chosen": -737.39208984375, "logps/rejected": -1115.466064453125, "loss": 0.3147, "rewards/accuracies": 0.9375, "rewards/chosen": -3.710268020629883, "rewards/margins": 2.5861876010894775, "rewards/rejected": -6.296454906463623, "step": 375 }, { "epoch": 0.4911421340517593, "grad_norm": 20.473240179875067, "learning_rate": 3.0100977990853565e-07, "logits/chosen": -0.7876812815666199, "logits/rejected": -0.8208199143409729, "logps/chosen": -701.5858154296875, "logps/rejected": -897.10205078125, "loss": 0.3963, "rewards/accuracies": 0.8125, "rewards/chosen": -3.745605707168579, "rewards/margins": 1.8409405946731567, "rewards/rejected": -5.586545944213867, "step": 376 }, { "epoch": 0.4924483631316842, "grad_norm": 14.20579737409417, "learning_rate": 2.998917003464882e-07, "logits/chosen": -0.8073393106460571, "logits/rejected": -0.8390824794769287, "logps/chosen": -748.408203125, "logps/rejected": -955.57470703125, "loss": 0.364, "rewards/accuracies": 0.84375, "rewards/chosen": -3.998650074005127, "rewards/margins": 1.9089974164962769, "rewards/rejected": -5.907647132873535, "step": 377 }, { "epoch": 0.4937545922116091, "grad_norm": 16.24545057635828, "learning_rate": 2.987725805040321e-07, "logits/chosen": -0.8982130289077759, "logits/rejected": -0.8425903916358948, "logps/chosen": -711.0843505859375, "logps/rejected": -820.8436889648438, "loss": 0.409, "rewards/accuracies": 0.875, "rewards/chosen": -3.7347919940948486, "rewards/margins": 1.427825927734375, "rewards/rejected": -5.162618160247803, "step": 378 }, { "epoch": 0.495060821291534, "grad_norm": 18.894573729579022, "learning_rate": 2.976524437156787e-07, "logits/chosen": -0.9128229022026062, "logits/rejected": -0.9626132249832153, "logps/chosen": -747.045166015625, "logps/rejected": -967.6761474609375, "loss": 0.336, "rewards/accuracies": 0.875, "rewards/chosen": -3.7548229694366455, "rewards/margins": 1.727982521057129, "rewards/rejected": -5.482805252075195, "step": 379 }, { "epoch": 0.4963670503714589, "grad_norm": 16.23155664279542, "learning_rate": 2.9653131333714354e-07, "logits/chosen": -0.774066686630249, "logits/rejected": -0.7619376182556152, "logps/chosen": -673.14794921875, "logps/rejected": -844.1553955078125, "loss": 0.3158, "rewards/accuracies": 0.8125, "rewards/chosen": -3.664004325866699, "rewards/margins": 1.75593900680542, "rewards/rejected": -5.419942855834961, "step": 380 }, { "epoch": 0.4976732794513838, "grad_norm": 16.135457912708656, "learning_rate": 2.954092127448591e-07, "logits/chosen": -0.8101003766059875, "logits/rejected": -0.8355172276496887, "logps/chosen": -614.1039428710938, "logps/rejected": -779.637939453125, "loss": 0.4207, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3870277404785156, "rewards/margins": 1.5078352689743042, "rewards/rejected": -4.894863128662109, "step": 381 }, { "epoch": 0.4989795085313087, "grad_norm": 20.89194150920151, "learning_rate": 2.9428616533548766e-07, "logits/chosen": -0.9312489032745361, "logits/rejected": -0.9005659818649292, "logps/chosen": -723.7083740234375, "logps/rejected": -968.6890258789062, "loss": 0.3026, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5352654457092285, "rewards/margins": 2.6465823650360107, "rewards/rejected": -6.18184757232666, "step": 382 }, { "epoch": 0.5002857376112335, "grad_norm": 15.514193847292193, "learning_rate": 2.931621945254334e-07, "logits/chosen": -0.8936735391616821, "logits/rejected": -0.9682626724243164, "logps/chosen": -715.4161376953125, "logps/rejected": -1014.7509155273438, "loss": 0.3123, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6884326934814453, "rewards/margins": 1.9404077529907227, "rewards/rejected": -5.628840446472168, "step": 383 }, { "epoch": 0.5015919666911585, "grad_norm": 25.2144093190015, "learning_rate": 2.9203732375035387e-07, "logits/chosen": -0.9673444032669067, "logits/rejected": -0.9933240413665771, "logps/chosen": -827.9849243164062, "logps/rejected": -1104.3861083984375, "loss": 0.4075, "rewards/accuracies": 0.84375, "rewards/chosen": -4.7136616706848145, "rewards/margins": 2.6191678047180176, "rewards/rejected": -7.332829475402832, "step": 384 }, { "epoch": 0.5028981957710833, "grad_norm": 17.30717183828124, "learning_rate": 2.90911576464672e-07, "logits/chosen": -0.8663851022720337, "logits/rejected": -0.9097596406936646, "logps/chosen": -770.527099609375, "logps/rejected": -1005.6298217773438, "loss": 0.3697, "rewards/accuracies": 0.875, "rewards/chosen": -4.207431316375732, "rewards/margins": 2.028050184249878, "rewards/rejected": -6.235481262207031, "step": 385 }, { "epoch": 0.5042044248510082, "grad_norm": 19.970645583944684, "learning_rate": 2.8978497614108635e-07, "logits/chosen": -0.8607802391052246, "logits/rejected": -0.8875527381896973, "logps/chosen": -808.6739501953125, "logps/rejected": -1079.2705078125, "loss": 0.3704, "rewards/accuracies": 0.9375, "rewards/chosen": -4.122775077819824, "rewards/margins": 2.69840145111084, "rewards/rejected": -6.821176528930664, "step": 386 }, { "epoch": 0.5055106539309331, "grad_norm": 18.538857537509557, "learning_rate": 2.8865754627008205e-07, "logits/chosen": -0.9041465520858765, "logits/rejected": -0.8960049748420715, "logps/chosen": -820.70458984375, "logps/rejected": -1124.7659912109375, "loss": 0.3332, "rewards/accuracies": 0.875, "rewards/chosen": -4.593035697937012, "rewards/margins": 2.8649890422821045, "rewards/rejected": -7.458024978637695, "step": 387 }, { "epoch": 0.506816883010858, "grad_norm": 17.991315804239036, "learning_rate": 2.8752931035944083e-07, "logits/chosen": -0.8732789754867554, "logits/rejected": -0.9428113698959351, "logps/chosen": -741.2675170898438, "logps/rejected": -1010.6340942382812, "loss": 0.3097, "rewards/accuracies": 0.875, "rewards/chosen": -4.332613468170166, "rewards/margins": 2.5484251976013184, "rewards/rejected": -6.881038665771484, "step": 388 }, { "epoch": 0.5081231120907829, "grad_norm": 27.817321746324055, "learning_rate": 2.8640029193375125e-07, "logits/chosen": -0.8342300653457642, "logits/rejected": -0.8032709956169128, "logps/chosen": -769.5841064453125, "logps/rejected": -971.3031005859375, "loss": 0.4094, "rewards/accuracies": 0.8125, "rewards/chosen": -4.432335376739502, "rewards/margins": 2.4039621353149414, "rewards/rejected": -6.836297512054443, "step": 389 }, { "epoch": 0.5094293411707078, "grad_norm": 26.19481211137744, "learning_rate": 2.852705145339176e-07, "logits/chosen": -1.0258805751800537, "logits/rejected": -0.9810967445373535, "logps/chosen": -829.2476806640625, "logps/rejected": -1077.290771484375, "loss": 0.3769, "rewards/accuracies": 0.875, "rewards/chosen": -4.776191711425781, "rewards/margins": 2.748802661895752, "rewards/rejected": -7.524994850158691, "step": 390 }, { "epoch": 0.5107355702506327, "grad_norm": 19.01867691799226, "learning_rate": 2.8414000171666945e-07, "logits/chosen": -0.8570470809936523, "logits/rejected": -0.8079515099525452, "logps/chosen": -811.5380859375, "logps/rejected": -984.9368896484375, "loss": 0.3427, "rewards/accuracies": 0.90625, "rewards/chosen": -4.842589855194092, "rewards/margins": 2.0718741416931152, "rewards/rejected": -6.914463996887207, "step": 391 }, { "epoch": 0.5120417993305576, "grad_norm": 23.18645994627528, "learning_rate": 2.830087770540705e-07, "logits/chosen": -0.8229571580886841, "logits/rejected": -0.8696286678314209, "logps/chosen": -805.5714111328125, "logps/rejected": -1093.4774169921875, "loss": 0.3441, "rewards/accuracies": 0.875, "rewards/chosen": -4.443207740783691, "rewards/margins": 2.2599196434020996, "rewards/rejected": -6.703127861022949, "step": 392 }, { "epoch": 0.5133480284104824, "grad_norm": 28.281638147055716, "learning_rate": 2.81876864133027e-07, "logits/chosen": -0.817804753780365, "logits/rejected": -0.8166165351867676, "logps/chosen": -820.2814331054688, "logps/rejected": -1031.4049072265625, "loss": 0.3534, "rewards/accuracies": 0.78125, "rewards/chosen": -4.919102191925049, "rewards/margins": 1.825037956237793, "rewards/rejected": -6.744140625, "step": 393 }, { "epoch": 0.5146542574904074, "grad_norm": 24.167048796517168, "learning_rate": 2.807442865547957e-07, "logits/chosen": -0.9945539832115173, "logits/rejected": -0.9743906855583191, "logps/chosen": -915.3056030273438, "logps/rejected": -1188.7843017578125, "loss": 0.3163, "rewards/accuracies": 0.875, "rewards/chosen": -5.132131576538086, "rewards/margins": 2.8624014854431152, "rewards/rejected": -7.994532108306885, "step": 394 }, { "epoch": 0.5159604865703322, "grad_norm": 28.18102447139193, "learning_rate": 2.796110679344921e-07, "logits/chosen": -0.8455891013145447, "logits/rejected": -0.7778101563453674, "logps/chosen": -829.8065795898438, "logps/rejected": -882.5882568359375, "loss": 0.4049, "rewards/accuracies": 0.65625, "rewards/chosen": -4.8128581047058105, "rewards/margins": 1.0256637334823608, "rewards/rejected": -5.838521957397461, "step": 395 }, { "epoch": 0.5172667156502572, "grad_norm": 14.11300629365015, "learning_rate": 2.7847723190059794e-07, "logits/chosen": -0.7391270399093628, "logits/rejected": -0.7817518711090088, "logps/chosen": -692.3377075195312, "logps/rejected": -997.4937744140625, "loss": 0.3341, "rewards/accuracies": 0.875, "rewards/chosen": -4.137124061584473, "rewards/margins": 2.135505199432373, "rewards/rejected": -6.2726287841796875, "step": 396 }, { "epoch": 0.518572944730182, "grad_norm": 22.34972235720674, "learning_rate": 2.7734280209446865e-07, "logits/chosen": -0.8998773694038391, "logits/rejected": -0.9383985996246338, "logps/chosen": -799.13623046875, "logps/rejected": -1060.0535888671875, "loss": 0.308, "rewards/accuracies": 0.84375, "rewards/chosen": -4.834539413452148, "rewards/margins": 2.1194262504577637, "rewards/rejected": -6.95396614074707, "step": 397 }, { "epoch": 0.519879173810107, "grad_norm": 17.128724945944594, "learning_rate": 2.762078021698398e-07, "logits/chosen": -0.9537370800971985, "logits/rejected": -0.9668135643005371, "logps/chosen": -790.4427490234375, "logps/rejected": -1048.340576171875, "loss": 0.3238, "rewards/accuracies": 0.875, "rewards/chosen": -4.3831071853637695, "rewards/margins": 2.7197885513305664, "rewards/rejected": -7.102895736694336, "step": 398 }, { "epoch": 0.5211854028900318, "grad_norm": 29.151857918658536, "learning_rate": 2.7507225579233486e-07, "logits/chosen": -0.9291197061538696, "logits/rejected": -0.90833580493927, "logps/chosen": -777.3556518554688, "logps/rejected": -924.982421875, "loss": 0.3756, "rewards/accuracies": 0.78125, "rewards/chosen": -4.550766944885254, "rewards/margins": 1.6807010173797607, "rewards/rejected": -6.2314677238464355, "step": 399 }, { "epoch": 0.5224916319699567, "grad_norm": 21.66068267812314, "learning_rate": 2.7393618663897107e-07, "logits/chosen": -0.9538843035697937, "logits/rejected": -0.9232152700424194, "logps/chosen": -845.7114868164062, "logps/rejected": -1059.5697021484375, "loss": 0.3737, "rewards/accuracies": 0.875, "rewards/chosen": -4.844598770141602, "rewards/margins": 2.347217082977295, "rewards/rejected": -7.1918158531188965, "step": 400 }, { "epoch": 0.5224916319699567, "eval_logits/chosen": -0.7711201906204224, "eval_logits/rejected": -0.7776599526405334, "eval_logps/chosen": -779.8291015625, "eval_logps/rejected": -1017.5, "eval_loss": 0.3653486669063568, "eval_rewards/accuracies": 0.8759999871253967, "eval_rewards/chosen": -4.357980251312256, "eval_rewards/margins": 2.3157310485839844, "eval_rewards/rejected": -6.673711776733398, "eval_runtime": 306.4511, "eval_samples_per_second": 6.526, "eval_steps_per_second": 0.408, "step": 400 }, { "epoch": 0.5237978610498816, "grad_norm": 18.39108085144253, "learning_rate": 2.7279961839766587e-07, "logits/chosen": -0.9713696837425232, "logits/rejected": -0.9175607562065125, "logps/chosen": -842.3927001953125, "logps/rejected": -956.6342163085938, "loss": 0.3457, "rewards/accuracies": 0.84375, "rewards/chosen": -4.3701958656311035, "rewards/margins": 1.6710914373397827, "rewards/rejected": -6.041287422180176, "step": 401 }, { "epoch": 0.5251040901298065, "grad_norm": 15.697766908506615, "learning_rate": 2.716625747667432e-07, "logits/chosen": -0.8418725728988647, "logits/rejected": -0.8784732222557068, "logps/chosen": -694.2025756835938, "logps/rejected": -951.430908203125, "loss": 0.2994, "rewards/accuracies": 0.84375, "rewards/chosen": -4.085196495056152, "rewards/margins": 2.302374839782715, "rewards/rejected": -6.387571334838867, "step": 402 }, { "epoch": 0.5264103192097314, "grad_norm": 24.469256220077725, "learning_rate": 2.7052507945443923e-07, "logits/chosen": -0.9080750942230225, "logits/rejected": -0.8578637838363647, "logps/chosen": -704.4551391601562, "logps/rejected": -935.6524658203125, "loss": 0.3242, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9254515171051025, "rewards/margins": 2.3511059284210205, "rewards/rejected": -6.276557922363281, "step": 403 }, { "epoch": 0.5277165482896563, "grad_norm": 20.419010003114707, "learning_rate": 2.69387156178408e-07, "logits/chosen": -1.0022379159927368, "logits/rejected": -0.9890075922012329, "logps/chosen": -783.5850219726562, "logps/rejected": -908.2481689453125, "loss": 0.4126, "rewards/accuracies": 0.84375, "rewards/chosen": -4.0194783210754395, "rewards/margins": 1.3325276374816895, "rewards/rejected": -5.352005958557129, "step": 404 }, { "epoch": 0.5290227773695811, "grad_norm": 16.07011993586235, "learning_rate": 2.682488286652269e-07, "logits/chosen": -0.8653430342674255, "logits/rejected": -0.9178695678710938, "logps/chosen": -677.1983642578125, "logps/rejected": -872.1953735351562, "loss": 0.4308, "rewards/accuracies": 0.8125, "rewards/chosen": -3.747790575027466, "rewards/margins": 1.7173616886138916, "rewards/rejected": -5.465152263641357, "step": 405 }, { "epoch": 0.5303290064495061, "grad_norm": 21.462739199113052, "learning_rate": 2.6711012064990194e-07, "logits/chosen": -0.8753893375396729, "logits/rejected": -0.9124100804328918, "logps/chosen": -756.1655883789062, "logps/rejected": -949.3858032226562, "loss": 0.3056, "rewards/accuracies": 0.875, "rewards/chosen": -3.986269950866699, "rewards/margins": 1.6225194931030273, "rewards/rejected": -5.608789443969727, "step": 406 }, { "epoch": 0.5316352355294309, "grad_norm": 18.071302323354917, "learning_rate": 2.6597105587537304e-07, "logits/chosen": -0.9895930290222168, "logits/rejected": -1.024172067642212, "logps/chosen": -860.986083984375, "logps/rejected": -996.1123657226562, "loss": 0.3608, "rewards/accuracies": 0.78125, "rewards/chosen": -4.353708267211914, "rewards/margins": 1.0356533527374268, "rewards/rejected": -5.389361381530762, "step": 407 }, { "epoch": 0.5329414646093559, "grad_norm": 23.905515055395856, "learning_rate": 2.648316580920187e-07, "logits/chosen": -0.8011449575424194, "logits/rejected": -0.775729775428772, "logps/chosen": -715.1135864257812, "logps/rejected": -877.1203002929688, "loss": 0.4277, "rewards/accuracies": 0.78125, "rewards/chosen": -4.1487812995910645, "rewards/margins": 1.5813078880310059, "rewards/rejected": -5.730088710784912, "step": 408 }, { "epoch": 0.5342476936892807, "grad_norm": 20.577720180588294, "learning_rate": 2.6369195105716084e-07, "logits/chosen": -0.8263465762138367, "logits/rejected": -0.897908091545105, "logps/chosen": -739.7882080078125, "logps/rejected": -1073.1529541015625, "loss": 0.3552, "rewards/accuracies": 0.875, "rewards/chosen": -4.197357654571533, "rewards/margins": 2.525136947631836, "rewards/rejected": -6.722494125366211, "step": 409 }, { "epoch": 0.5355539227692057, "grad_norm": 20.93012715133535, "learning_rate": 2.625519585345699e-07, "logits/chosen": -0.9467390179634094, "logits/rejected": -0.9878973364830017, "logps/chosen": -832.529296875, "logps/rejected": -966.189453125, "loss": 0.3034, "rewards/accuracies": 0.8125, "rewards/chosen": -4.822567462921143, "rewards/margins": 1.0942929983139038, "rewards/rejected": -5.916861534118652, "step": 410 }, { "epoch": 0.5368601518491305, "grad_norm": 15.932987548690928, "learning_rate": 2.6141170429396845e-07, "logits/chosen": -1.0242911577224731, "logits/rejected": -1.0077592134475708, "logps/chosen": -784.8360595703125, "logps/rejected": -987.2750244140625, "loss": 0.3454, "rewards/accuracies": 0.78125, "rewards/chosen": -3.951958417892456, "rewards/margins": 2.033290386199951, "rewards/rejected": -5.985249042510986, "step": 411 }, { "epoch": 0.5381663809290554, "grad_norm": 18.290713742749784, "learning_rate": 2.602712121105363e-07, "logits/chosen": -1.0145286321640015, "logits/rejected": -0.9495306611061096, "logps/chosen": -766.391845703125, "logps/rejected": -897.2517700195312, "loss": 0.3763, "rewards/accuracies": 0.9375, "rewards/chosen": -3.971484899520874, "rewards/margins": 1.6331852674484253, "rewards/rejected": -5.60467004776001, "step": 412 }, { "epoch": 0.5394726100089803, "grad_norm": 17.085604641508972, "learning_rate": 2.5913050576441473e-07, "logits/chosen": -0.9171661138534546, "logits/rejected": -0.9550764560699463, "logps/chosen": -799.6805419921875, "logps/rejected": -1040.7950439453125, "loss": 0.3239, "rewards/accuracies": 0.8125, "rewards/chosen": -4.382745742797852, "rewards/margins": 1.7179137468338013, "rewards/rejected": -6.100659370422363, "step": 413 }, { "epoch": 0.5407788390889052, "grad_norm": 20.64630955775629, "learning_rate": 2.5798960904021014e-07, "logits/chosen": -0.7455999851226807, "logits/rejected": -0.7679321765899658, "logps/chosen": -785.1378784179688, "logps/rejected": -1103.63818359375, "loss": 0.3481, "rewards/accuracies": 0.875, "rewards/chosen": -4.42498254776001, "rewards/margins": 2.7151589393615723, "rewards/rejected": -7.140141487121582, "step": 414 }, { "epoch": 0.5420850681688301, "grad_norm": 16.728318148970214, "learning_rate": 2.568485457264987e-07, "logits/chosen": -0.9213064908981323, "logits/rejected": -0.9041643738746643, "logps/chosen": -843.6577758789062, "logps/rejected": -1016.4732666015625, "loss": 0.3542, "rewards/accuracies": 0.8125, "rewards/chosen": -4.517221450805664, "rewards/margins": 1.9062294960021973, "rewards/rejected": -6.4234514236450195, "step": 415 }, { "epoch": 0.543391297248755, "grad_norm": 22.588461926828998, "learning_rate": 2.5570733961533004e-07, "logits/chosen": -0.8473490476608276, "logits/rejected": -0.8507324457168579, "logps/chosen": -790.0494995117188, "logps/rejected": -1003.7606201171875, "loss": 0.4098, "rewards/accuracies": 0.84375, "rewards/chosen": -4.537339210510254, "rewards/margins": 1.8845421075820923, "rewards/rejected": -6.421881198883057, "step": 416 }, { "epoch": 0.5446975263286798, "grad_norm": 19.870408634037258, "learning_rate": 2.545660145017312e-07, "logits/chosen": -0.7989124655723572, "logits/rejected": -0.8354920744895935, "logps/chosen": -711.0654907226562, "logps/rejected": -959.1341552734375, "loss": 0.39, "rewards/accuracies": 0.75, "rewards/chosen": -4.016317367553711, "rewards/margins": 2.0849642753601074, "rewards/rejected": -6.101282119750977, "step": 417 }, { "epoch": 0.5460037554086048, "grad_norm": 24.60218776058898, "learning_rate": 2.5342459418321057e-07, "logits/chosen": -0.894109845161438, "logits/rejected": -0.8482675552368164, "logps/chosen": -770.8828125, "logps/rejected": -894.0445556640625, "loss": 0.4073, "rewards/accuracies": 0.90625, "rewards/chosen": -4.591938018798828, "rewards/margins": 1.472078561782837, "rewards/rejected": -6.064016819000244, "step": 418 }, { "epoch": 0.5473099844885296, "grad_norm": 25.871862402700973, "learning_rate": 2.5228310245926143e-07, "logits/chosen": -0.9959041476249695, "logits/rejected": -0.9126294255256653, "logps/chosen": -855.5810546875, "logps/rejected": -1101.9541015625, "loss": 0.3782, "rewards/accuracies": 0.84375, "rewards/chosen": -4.857247352600098, "rewards/margins": 2.5603063106536865, "rewards/rejected": -7.417553424835205, "step": 419 }, { "epoch": 0.5486162135684546, "grad_norm": 21.195374508414663, "learning_rate": 2.511415631308664e-07, "logits/chosen": -0.9022479057312012, "logits/rejected": -0.9067992568016052, "logps/chosen": -878.5272827148438, "logps/rejected": -1105.0299072265625, "loss": 0.3276, "rewards/accuracies": 0.8125, "rewards/chosen": -5.165306091308594, "rewards/margins": 2.0585012435913086, "rewards/rejected": -7.223807334899902, "step": 420 }, { "epoch": 0.5499224426483794, "grad_norm": 25.41666561923603, "learning_rate": 2.5e-07, "logits/chosen": -0.9776140451431274, "logits/rejected": -0.9376563429832458, "logps/chosen": -813.5545654296875, "logps/rejected": -894.2927856445312, "loss": 0.4017, "rewards/accuracies": 0.8125, "rewards/chosen": -4.246693134307861, "rewards/margins": 0.8625474572181702, "rewards/rejected": -5.1092400550842285, "step": 421 }, { "epoch": 0.5512286717283044, "grad_norm": 14.958726659038993, "learning_rate": 2.4885843686913364e-07, "logits/chosen": -0.8434813022613525, "logits/rejected": -0.8806734681129456, "logps/chosen": -736.0880126953125, "logps/rejected": -961.4481811523438, "loss": 0.3531, "rewards/accuracies": 0.84375, "rewards/chosen": -4.191902160644531, "rewards/margins": 1.9953703880310059, "rewards/rejected": -6.187272071838379, "step": 422 }, { "epoch": 0.5525349008082292, "grad_norm": 21.495028951673262, "learning_rate": 2.4771689754073855e-07, "logits/chosen": -0.8508874773979187, "logits/rejected": -0.8240814208984375, "logps/chosen": -768.284912109375, "logps/rejected": -955.7354736328125, "loss": 0.3225, "rewards/accuracies": 0.90625, "rewards/chosen": -3.972433567047119, "rewards/margins": 2.121138095855713, "rewards/rejected": -6.09357213973999, "step": 423 }, { "epoch": 0.5538411298881541, "grad_norm": 15.722999973999624, "learning_rate": 2.4657540581678946e-07, "logits/chosen": -0.9035851955413818, "logits/rejected": -0.9613392949104309, "logps/chosen": -798.9974365234375, "logps/rejected": -1164.1221923828125, "loss": 0.3057, "rewards/accuracies": 0.875, "rewards/chosen": -4.214138984680176, "rewards/margins": 3.4577198028564453, "rewards/rejected": -7.671858787536621, "step": 424 }, { "epoch": 0.555147358968079, "grad_norm": 18.247343995966652, "learning_rate": 2.4543398549826877e-07, "logits/chosen": -0.978046178817749, "logits/rejected": -0.9603185653686523, "logps/chosen": -746.07861328125, "logps/rejected": -954.4952392578125, "loss": 0.3439, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9832568168640137, "rewards/margins": 2.3160653114318848, "rewards/rejected": -6.29932165145874, "step": 425 }, { "epoch": 0.5564535880480039, "grad_norm": 21.983144847488614, "learning_rate": 2.4429266038467e-07, "logits/chosen": -0.8131458163261414, "logits/rejected": -0.8295997977256775, "logps/chosen": -758.4473266601562, "logps/rejected": -982.2353515625, "loss": 0.3388, "rewards/accuracies": 0.875, "rewards/chosen": -4.1805033683776855, "rewards/margins": 1.6914095878601074, "rewards/rejected": -5.871912956237793, "step": 426 }, { "epoch": 0.5577598171279288, "grad_norm": 18.619526651872878, "learning_rate": 2.4315145427350126e-07, "logits/chosen": -0.8365015387535095, "logits/rejected": -0.8766813278198242, "logps/chosen": -728.0346069335938, "logps/rejected": -953.6134033203125, "loss": 0.3586, "rewards/accuracies": 0.78125, "rewards/chosen": -4.375938415527344, "rewards/margins": 1.970406413078308, "rewards/rejected": -6.346344470977783, "step": 427 }, { "epoch": 0.5590660462078537, "grad_norm": 20.25987698353873, "learning_rate": 2.4201039095978983e-07, "logits/chosen": -0.9001325368881226, "logits/rejected": -0.8716564178466797, "logps/chosen": -811.824462890625, "logps/rejected": -1029.6591796875, "loss": 0.3611, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3254170417785645, "rewards/margins": 2.2741880416870117, "rewards/rejected": -6.599605560302734, "step": 428 }, { "epoch": 0.5603722752877786, "grad_norm": 18.373974900181594, "learning_rate": 2.4086949423558525e-07, "logits/chosen": -1.0318374633789062, "logits/rejected": -1.025630235671997, "logps/chosen": -753.2796630859375, "logps/rejected": -957.3681640625, "loss": 0.3139, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9497110843658447, "rewards/margins": 2.0519957542419434, "rewards/rejected": -6.001706600189209, "step": 429 }, { "epoch": 0.5616785043677035, "grad_norm": 17.74458197915287, "learning_rate": 2.3972878788946367e-07, "logits/chosen": -0.7888402342796326, "logits/rejected": -0.7724015712738037, "logps/chosen": -822.6279907226562, "logps/rejected": -1166.1044921875, "loss": 0.2831, "rewards/accuracies": 0.90625, "rewards/chosen": -4.497809410095215, "rewards/margins": 3.3227591514587402, "rewards/rejected": -7.820569038391113, "step": 430 }, { "epoch": 0.5629847334476283, "grad_norm": 15.667460951436432, "learning_rate": 2.3858829570603153e-07, "logits/chosen": -0.9419072270393372, "logits/rejected": -0.9077606797218323, "logps/chosen": -782.8739624023438, "logps/rejected": -954.84619140625, "loss": 0.3131, "rewards/accuracies": 0.875, "rewards/chosen": -4.551786422729492, "rewards/margins": 1.868566632270813, "rewards/rejected": -6.420353412628174, "step": 431 }, { "epoch": 0.5642909625275533, "grad_norm": 16.228404418354415, "learning_rate": 2.3744804146543003e-07, "logits/chosen": -1.0330311059951782, "logits/rejected": -1.0249035358428955, "logps/chosen": -830.9697265625, "logps/rejected": -999.312255859375, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -4.5674214363098145, "rewards/margins": 1.7816904783248901, "rewards/rejected": -6.349112033843994, "step": 432 }, { "epoch": 0.5655971916074781, "grad_norm": 33.93589918126513, "learning_rate": 2.3630804894283906e-07, "logits/chosen": -0.8753880858421326, "logits/rejected": -0.8946934342384338, "logps/chosen": -800.2118530273438, "logps/rejected": -994.9881591796875, "loss": 0.4265, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7165446281433105, "rewards/margins": 1.7866311073303223, "rewards/rejected": -6.503175735473633, "step": 433 }, { "epoch": 0.5669034206874031, "grad_norm": 21.60718764822029, "learning_rate": 2.3516834190798128e-07, "logits/chosen": -0.8376666903495789, "logits/rejected": -0.863196849822998, "logps/chosen": -910.37451171875, "logps/rejected": -1207.0146484375, "loss": 0.3037, "rewards/accuracies": 0.84375, "rewards/chosen": -5.631561279296875, "rewards/margins": 2.607675790786743, "rewards/rejected": -8.239237785339355, "step": 434 }, { "epoch": 0.5682096497673279, "grad_norm": 17.78741572283716, "learning_rate": 2.3402894412462691e-07, "logits/chosen": -0.8809553980827332, "logits/rejected": -0.8793525099754333, "logps/chosen": -816.9341430664062, "logps/rejected": -1160.713623046875, "loss": 0.3206, "rewards/accuracies": 0.875, "rewards/chosen": -4.884814262390137, "rewards/margins": 2.8916025161743164, "rewards/rejected": -7.776415824890137, "step": 435 }, { "epoch": 0.5695158788472529, "grad_norm": 19.299157629600934, "learning_rate": 2.3288987935009804e-07, "logits/chosen": -0.81032395362854, "logits/rejected": -0.8216681480407715, "logps/chosen": -795.78271484375, "logps/rejected": -1083.7606201171875, "loss": 0.3267, "rewards/accuracies": 0.71875, "rewards/chosen": -4.85698127746582, "rewards/margins": 2.928353786468506, "rewards/rejected": -7.785335540771484, "step": 436 }, { "epoch": 0.5708221079271777, "grad_norm": 18.934945354797385, "learning_rate": 2.317511713347731e-07, "logits/chosen": -0.8548052310943604, "logits/rejected": -0.8772594332695007, "logps/chosen": -776.690673828125, "logps/rejected": -983.8834838867188, "loss": 0.3146, "rewards/accuracies": 0.90625, "rewards/chosen": -4.52797794342041, "rewards/margins": 1.9513096809387207, "rewards/rejected": -6.479287147521973, "step": 437 }, { "epoch": 0.5721283370071026, "grad_norm": 27.50365285840327, "learning_rate": 2.3061284382159193e-07, "logits/chosen": -0.8831881880760193, "logits/rejected": -0.9110695719718933, "logps/chosen": -759.2625122070312, "logps/rejected": -981.3511962890625, "loss": 0.397, "rewards/accuracies": 0.71875, "rewards/chosen": -4.39530086517334, "rewards/margins": 2.060976982116699, "rewards/rejected": -6.456278324127197, "step": 438 }, { "epoch": 0.5734345660870275, "grad_norm": 28.807216329420136, "learning_rate": 2.2947492054556072e-07, "logits/chosen": -0.9537849426269531, "logits/rejected": -0.9700733423233032, "logps/chosen": -827.2046508789062, "logps/rejected": -1069.7030029296875, "loss": 0.3407, "rewards/accuracies": 0.875, "rewards/chosen": -4.6851348876953125, "rewards/margins": 1.8534389734268188, "rewards/rejected": -6.538573741912842, "step": 439 }, { "epoch": 0.5747407951669524, "grad_norm": 22.99328848252123, "learning_rate": 2.2833742523325675e-07, "logits/chosen": -0.9458773732185364, "logits/rejected": -0.9196736812591553, "logps/chosen": -804.439697265625, "logps/rejected": -1033.7572021484375, "loss": 0.3554, "rewards/accuracies": 0.8125, "rewards/chosen": -4.986883640289307, "rewards/margins": 2.2680232524871826, "rewards/rejected": -7.254906177520752, "step": 440 }, { "epoch": 0.5760470242468773, "grad_norm": 15.202510696978031, "learning_rate": 2.272003816023341e-07, "logits/chosen": -0.9241797924041748, "logits/rejected": -0.9261179566383362, "logps/chosen": -833.6920776367188, "logps/rejected": -1051.131591796875, "loss": 0.2392, "rewards/accuracies": 0.96875, "rewards/chosen": -4.833259582519531, "rewards/margins": 2.2068347930908203, "rewards/rejected": -7.040094375610352, "step": 441 }, { "epoch": 0.5773532533268022, "grad_norm": 26.410873459357198, "learning_rate": 2.2606381336102894e-07, "logits/chosen": -1.0969460010528564, "logits/rejected": -1.0607943534851074, "logps/chosen": -842.22314453125, "logps/rejected": -1066.9891357421875, "loss": 0.4235, "rewards/accuracies": 0.78125, "rewards/chosen": -4.723199367523193, "rewards/margins": 2.193436861038208, "rewards/rejected": -6.9166364669799805, "step": 442 }, { "epoch": 0.578659482406727, "grad_norm": 28.247981746947612, "learning_rate": 2.2492774420766517e-07, "logits/chosen": -0.9036411643028259, "logits/rejected": -0.915302038192749, "logps/chosen": -822.6942749023438, "logps/rejected": -1086.462890625, "loss": 0.3518, "rewards/accuracies": 0.8125, "rewards/chosen": -4.846108436584473, "rewards/margins": 2.3862531185150146, "rewards/rejected": -7.23236083984375, "step": 443 }, { "epoch": 0.579965711486652, "grad_norm": 21.11892539199568, "learning_rate": 2.2379219783016026e-07, "logits/chosen": -0.9433536529541016, "logits/rejected": -0.9617864489555359, "logps/chosen": -900.6919555664062, "logps/rejected": -1149.1103515625, "loss": 0.318, "rewards/accuracies": 0.78125, "rewards/chosen": -4.675385475158691, "rewards/margins": 2.2155497074127197, "rewards/rejected": -6.890934944152832, "step": 444 }, { "epoch": 0.5812719405665768, "grad_norm": 15.321566164302686, "learning_rate": 2.2265719790553146e-07, "logits/chosen": -0.9592891931533813, "logits/rejected": -0.9572802782058716, "logps/chosen": -730.5135498046875, "logps/rejected": -936.27880859375, "loss": 0.3303, "rewards/accuracies": 0.90625, "rewards/chosen": -4.321778297424316, "rewards/margins": 1.9940975904464722, "rewards/rejected": -6.31587553024292, "step": 445 }, { "epoch": 0.5825781696465018, "grad_norm": 14.856682076736497, "learning_rate": 2.2152276809940204e-07, "logits/chosen": -0.9369142055511475, "logits/rejected": -0.9434508681297302, "logps/chosen": -796.3789672851562, "logps/rejected": -976.5616455078125, "loss": 0.3146, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9632458686828613, "rewards/margins": 1.5912312269210815, "rewards/rejected": -5.554476737976074, "step": 446 }, { "epoch": 0.5838843987264266, "grad_norm": 17.00151895193613, "learning_rate": 2.2038893206550796e-07, "logits/chosen": -1.1397333145141602, "logits/rejected": -1.1071242094039917, "logps/chosen": -875.6116943359375, "logps/rejected": -1100.5426025390625, "loss": 0.3171, "rewards/accuracies": 0.84375, "rewards/chosen": -4.550100803375244, "rewards/margins": 2.633068323135376, "rewards/rejected": -7.183169364929199, "step": 447 }, { "epoch": 0.5851906278063516, "grad_norm": 17.642907713457248, "learning_rate": 2.192557134452044e-07, "logits/chosen": -1.0641119480133057, "logits/rejected": -0.9393628835678101, "logps/chosen": -889.4263916015625, "logps/rejected": -1003.4805297851562, "loss": 0.364, "rewards/accuracies": 0.78125, "rewards/chosen": -5.028580665588379, "rewards/margins": 1.7515006065368652, "rewards/rejected": -6.780081748962402, "step": 448 }, { "epoch": 0.5864968568862764, "grad_norm": 33.31921181235088, "learning_rate": 2.1812313586697307e-07, "logits/chosen": -0.9102932214736938, "logits/rejected": -0.9287086725234985, "logps/chosen": -752.0089721679688, "logps/rejected": -919.4461669921875, "loss": 0.3462, "rewards/accuracies": 0.8125, "rewards/chosen": -4.703210830688477, "rewards/margins": 1.3443269729614258, "rewards/rejected": -6.047537803649902, "step": 449 }, { "epoch": 0.5878030859662013, "grad_norm": 17.480381281796603, "learning_rate": 2.1699122294592955e-07, "logits/chosen": -0.8606734275817871, "logits/rejected": -0.896245539188385, "logps/chosen": -832.9607543945312, "logps/rejected": -1162.478515625, "loss": 0.2817, "rewards/accuracies": 0.84375, "rewards/chosen": -4.795577049255371, "rewards/margins": 3.199331521987915, "rewards/rejected": -7.994908332824707, "step": 450 }, { "epoch": 0.5891093150461262, "grad_norm": 17.202137644316416, "learning_rate": 2.1585999828333064e-07, "logits/chosen": -0.9806517362594604, "logits/rejected": -0.9195981621742249, "logps/chosen": -829.970947265625, "logps/rejected": -1054.5245361328125, "loss": 0.2772, "rewards/accuracies": 0.90625, "rewards/chosen": -4.403877258300781, "rewards/margins": 2.6019461154937744, "rewards/rejected": -7.005823135375977, "step": 451 }, { "epoch": 0.5904155441260511, "grad_norm": 19.779239028056754, "learning_rate": 2.147294854660825e-07, "logits/chosen": -0.904513955116272, "logits/rejected": -0.8860921859741211, "logps/chosen": -786.3958129882812, "logps/rejected": -968.3399658203125, "loss": 0.3156, "rewards/accuracies": 0.84375, "rewards/chosen": -4.6048383712768555, "rewards/margins": 2.0750350952148438, "rewards/rejected": -6.679873943328857, "step": 452 }, { "epoch": 0.591721773205976, "grad_norm": 20.448314043899785, "learning_rate": 2.1359970806624884e-07, "logits/chosen": -0.8353624939918518, "logits/rejected": -0.8903397917747498, "logps/chosen": -724.5891723632812, "logps/rejected": -1040.308349609375, "loss": 0.274, "rewards/accuracies": 0.96875, "rewards/chosen": -4.321156024932861, "rewards/margins": 2.924544334411621, "rewards/rejected": -7.245700836181641, "step": 453 }, { "epoch": 0.5930280022859009, "grad_norm": 26.60414849811314, "learning_rate": 2.1247068964055917e-07, "logits/chosen": -0.8759950399398804, "logits/rejected": -0.901042103767395, "logps/chosen": -852.2246704101562, "logps/rejected": -1044.4117431640625, "loss": 0.416, "rewards/accuracies": 0.8125, "rewards/chosen": -5.1181817054748535, "rewards/margins": 1.7556366920471191, "rewards/rejected": -6.873818874359131, "step": 454 }, { "epoch": 0.5943342313658257, "grad_norm": 38.18759204125818, "learning_rate": 2.1134245372991798e-07, "logits/chosen": -0.9119591116905212, "logits/rejected": -0.8581464290618896, "logps/chosen": -774.8963012695312, "logps/rejected": -992.7687377929688, "loss": 0.4143, "rewards/accuracies": 0.875, "rewards/chosen": -4.636556625366211, "rewards/margins": 2.3142166137695312, "rewards/rejected": -6.9507737159729, "step": 455 }, { "epoch": 0.5956404604457507, "grad_norm": 28.856531286525918, "learning_rate": 2.1021502385891368e-07, "logits/chosen": -0.7651792764663696, "logits/rejected": -0.7757113575935364, "logps/chosen": -770.0438232421875, "logps/rejected": -974.64404296875, "loss": 0.3727, "rewards/accuracies": 0.8125, "rewards/chosen": -4.925751209259033, "rewards/margins": 2.0303800106048584, "rewards/rejected": -6.956131458282471, "step": 456 }, { "epoch": 0.5969466895256755, "grad_norm": 16.780867878500203, "learning_rate": 2.09088423535328e-07, "logits/chosen": -0.9524345993995667, "logits/rejected": -0.9039627313613892, "logps/chosen": -815.7782592773438, "logps/rejected": -1012.5498046875, "loss": 0.369, "rewards/accuracies": 0.78125, "rewards/chosen": -4.924071311950684, "rewards/margins": 2.095405340194702, "rewards/rejected": -7.019476413726807, "step": 457 }, { "epoch": 0.5982529186056005, "grad_norm": 35.404568396113156, "learning_rate": 2.0796267624964608e-07, "logits/chosen": -0.914667010307312, "logits/rejected": -0.8787985444068909, "logps/chosen": -872.1204223632812, "logps/rejected": -1020.7347412109375, "loss": 0.3364, "rewards/accuracies": 0.8125, "rewards/chosen": -5.007497310638428, "rewards/margins": 1.6205580234527588, "rewards/rejected": -6.628055572509766, "step": 458 }, { "epoch": 0.5995591476855253, "grad_norm": 20.457701722422097, "learning_rate": 2.0683780547456664e-07, "logits/chosen": -0.8531094193458557, "logits/rejected": -0.8431046009063721, "logps/chosen": -772.6988525390625, "logps/rejected": -952.2398071289062, "loss": 0.3301, "rewards/accuracies": 0.875, "rewards/chosen": -4.533632278442383, "rewards/margins": 1.8784475326538086, "rewards/rejected": -6.41208028793335, "step": 459 }, { "epoch": 0.6008653767654503, "grad_norm": 15.471602821349602, "learning_rate": 2.0571383466451237e-07, "logits/chosen": -0.8806161284446716, "logits/rejected": -0.9237450361251831, "logps/chosen": -768.0919189453125, "logps/rejected": -962.9945678710938, "loss": 0.3217, "rewards/accuracies": 0.8125, "rewards/chosen": -4.675704002380371, "rewards/margins": 1.6774792671203613, "rewards/rejected": -6.353183269500732, "step": 460 }, { "epoch": 0.6021716058453751, "grad_norm": 16.21404011539825, "learning_rate": 2.0459078725514089e-07, "logits/chosen": -1.0201655626296997, "logits/rejected": -1.0077934265136719, "logps/chosen": -864.165283203125, "logps/rejected": -1056.125244140625, "loss": 0.3366, "rewards/accuracies": 0.90625, "rewards/chosen": -4.843500137329102, "rewards/margins": 1.9824084043502808, "rewards/rejected": -6.825908660888672, "step": 461 }, { "epoch": 0.6034778349253, "grad_norm": 21.870842872086143, "learning_rate": 2.0346868666285644e-07, "logits/chosen": -0.8475763201713562, "logits/rejected": -0.806708574295044, "logps/chosen": -788.37109375, "logps/rejected": -972.1688842773438, "loss": 0.283, "rewards/accuracies": 0.90625, "rewards/chosen": -4.456578254699707, "rewards/margins": 2.1057913303375244, "rewards/rejected": -6.5623698234558105, "step": 462 }, { "epoch": 0.6047840640052249, "grad_norm": 38.35605889815296, "learning_rate": 2.023475562843213e-07, "logits/chosen": -0.853971004486084, "logits/rejected": -0.918674647808075, "logps/chosen": -816.5822143554688, "logps/rejected": -1115.367431640625, "loss": 0.3566, "rewards/accuracies": 0.8125, "rewards/chosen": -4.566136837005615, "rewards/margins": 2.0404365062713623, "rewards/rejected": -6.606573104858398, "step": 463 }, { "epoch": 0.6060902930851498, "grad_norm": 15.736017698404957, "learning_rate": 2.0122741949596793e-07, "logits/chosen": -0.8547173738479614, "logits/rejected": -0.894489586353302, "logps/chosen": -788.0221557617188, "logps/rejected": -1043.7552490234375, "loss": 0.3182, "rewards/accuracies": 0.90625, "rewards/chosen": -4.1634907722473145, "rewards/margins": 2.7182223796844482, "rewards/rejected": -6.881713390350342, "step": 464 }, { "epoch": 0.6073965221650747, "grad_norm": 16.34767928777031, "learning_rate": 2.0010829965351184e-07, "logits/chosen": -0.8808002471923828, "logits/rejected": -0.8989526033401489, "logps/chosen": -765.3001098632812, "logps/rejected": -985.0426635742188, "loss": 0.3329, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9959161281585693, "rewards/margins": 2.0970518589019775, "rewards/rejected": -6.092967987060547, "step": 465 }, { "epoch": 0.6087027512449996, "grad_norm": 24.681668419314594, "learning_rate": 1.9899022009146435e-07, "logits/chosen": -1.0274349451065063, "logits/rejected": -1.0152531862258911, "logps/chosen": -913.58251953125, "logps/rejected": -1045.136962890625, "loss": 0.3916, "rewards/accuracies": 0.75, "rewards/chosen": -5.3043341636657715, "rewards/margins": 1.5377391576766968, "rewards/rejected": -6.842073440551758, "step": 466 }, { "epoch": 0.6100089803249245, "grad_norm": 20.13545605217525, "learning_rate": 1.9787320412264607e-07, "logits/chosen": -0.9427694082260132, "logits/rejected": -0.903557300567627, "logps/chosen": -828.9415283203125, "logps/rejected": -958.75048828125, "loss": 0.3119, "rewards/accuracies": 0.8125, "rewards/chosen": -4.989696979522705, "rewards/margins": 1.3408408164978027, "rewards/rejected": -6.330538272857666, "step": 467 }, { "epoch": 0.6113152094048494, "grad_norm": 20.553589332643803, "learning_rate": 1.96757275037701e-07, "logits/chosen": -0.9104101061820984, "logits/rejected": -0.9407252073287964, "logps/chosen": -752.98291015625, "logps/rejected": -1006.4957275390625, "loss": 0.4053, "rewards/accuracies": 0.875, "rewards/chosen": -4.013631343841553, "rewards/margins": 2.0689241886138916, "rewards/rejected": -6.082555770874023, "step": 468 }, { "epoch": 0.6126214384847742, "grad_norm": 15.86015294512485, "learning_rate": 1.9564245610461078e-07, "logits/chosen": -0.9154040217399597, "logits/rejected": -0.97034752368927, "logps/chosen": -686.4542236328125, "logps/rejected": -934.6543579101562, "loss": 0.3401, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8044779300689697, "rewards/margins": 2.325495481491089, "rewards/rejected": -6.129973411560059, "step": 469 }, { "epoch": 0.6139276675646992, "grad_norm": 16.003289738862154, "learning_rate": 1.945287705682093e-07, "logits/chosen": -0.8327289819717407, "logits/rejected": -0.8527270555496216, "logps/chosen": -717.9485473632812, "logps/rejected": -982.47216796875, "loss": 0.2847, "rewards/accuracies": 0.90625, "rewards/chosen": -4.025069236755371, "rewards/margins": 2.376330614089966, "rewards/rejected": -6.401400089263916, "step": 470 }, { "epoch": 0.615233896644624, "grad_norm": 18.621148191091866, "learning_rate": 1.9341624164969859e-07, "logits/chosen": -1.0173578262329102, "logits/rejected": -1.0285704135894775, "logps/chosen": -774.8818359375, "logps/rejected": -1054.2027587890625, "loss": 0.3245, "rewards/accuracies": 0.8125, "rewards/chosen": -4.084187984466553, "rewards/margins": 2.5124804973602295, "rewards/rejected": -6.596668720245361, "step": 471 }, { "epoch": 0.616540125724549, "grad_norm": 16.118635035544298, "learning_rate": 1.92304892546164e-07, "logits/chosen": -0.8497686386108398, "logits/rejected": -0.862455427646637, "logps/chosen": -812.705810546875, "logps/rejected": -1088.639892578125, "loss": 0.2711, "rewards/accuracies": 0.9375, "rewards/chosen": -4.906893730163574, "rewards/margins": 2.828660249710083, "rewards/rejected": -7.735554218292236, "step": 472 }, { "epoch": 0.6178463548044738, "grad_norm": 16.48765253489488, "learning_rate": 1.9119474643009108e-07, "logits/chosen": -0.9804749488830566, "logits/rejected": -0.9375219345092773, "logps/chosen": -757.4735107421875, "logps/rejected": -914.5074462890625, "loss": 0.301, "rewards/accuracies": 0.90625, "rewards/chosen": -4.477548122406006, "rewards/margins": 1.8111082315444946, "rewards/rejected": -6.288656234741211, "step": 473 }, { "epoch": 0.6191525838843988, "grad_norm": 15.195437398662849, "learning_rate": 1.9008582644888196e-07, "logits/chosen": -0.6733070611953735, "logits/rejected": -0.7402501106262207, "logps/chosen": -763.7427978515625, "logps/rejected": -1086.78857421875, "loss": 0.2907, "rewards/accuracies": 0.9375, "rewards/chosen": -4.562100410461426, "rewards/margins": 2.552955389022827, "rewards/rejected": -7.115055561065674, "step": 474 }, { "epoch": 0.6204588129643236, "grad_norm": 20.023388007330663, "learning_rate": 1.8897815572437301e-07, "logits/chosen": -0.8706028461456299, "logits/rejected": -0.9461374878883362, "logps/chosen": -732.7521362304688, "logps/rejected": -1005.2787475585938, "loss": 0.4367, "rewards/accuracies": 0.875, "rewards/chosen": -4.528969764709473, "rewards/margins": 2.5225186347961426, "rewards/rejected": -7.051488876342773, "step": 475 }, { "epoch": 0.6217650420442485, "grad_norm": 20.49346284139183, "learning_rate": 1.8787175735235273e-07, "logits/chosen": -0.8300421237945557, "logits/rejected": -0.9250699281692505, "logps/chosen": -778.4098510742188, "logps/rejected": -1078.0272216796875, "loss": 0.3242, "rewards/accuracies": 0.9375, "rewards/chosen": -4.306589603424072, "rewards/margins": 2.6981916427612305, "rewards/rejected": -7.004781246185303, "step": 476 }, { "epoch": 0.6230712711241734, "grad_norm": 18.35973579335231, "learning_rate": 1.8676665440207977e-07, "logits/chosen": -0.8798630237579346, "logits/rejected": -0.9428200721740723, "logps/chosen": -847.861083984375, "logps/rejected": -1115.469970703125, "loss": 0.3065, "rewards/accuracies": 0.875, "rewards/chosen": -4.708725452423096, "rewards/margins": 2.0331783294677734, "rewards/rejected": -6.741903781890869, "step": 477 }, { "epoch": 0.6243775002040983, "grad_norm": 21.457791070514528, "learning_rate": 1.8566286991580267e-07, "logits/chosen": -1.001512050628662, "logits/rejected": -0.9693081974983215, "logps/chosen": -736.879638671875, "logps/rejected": -926.0786743164062, "loss": 0.3584, "rewards/accuracies": 0.84375, "rewards/chosen": -4.08529806137085, "rewards/margins": 2.071559429168701, "rewards/rejected": -6.156857013702393, "step": 478 }, { "epoch": 0.6256837292840232, "grad_norm": 17.65870882480035, "learning_rate": 1.8456042690827866e-07, "logits/chosen": -0.9303890466690063, "logits/rejected": -0.9561671018600464, "logps/chosen": -872.1222534179688, "logps/rejected": -1080.7010498046875, "loss": 0.3102, "rewards/accuracies": 0.8125, "rewards/chosen": -5.10383415222168, "rewards/margins": 2.146256685256958, "rewards/rejected": -7.2500901222229, "step": 479 }, { "epoch": 0.6269899583639481, "grad_norm": 21.848843169526116, "learning_rate": 1.834593483662942e-07, "logits/chosen": -1.0235671997070312, "logits/rejected": -1.0022693872451782, "logps/chosen": -783.5675048828125, "logps/rejected": -991.603271484375, "loss": 0.3159, "rewards/accuracies": 0.875, "rewards/chosen": -4.475238800048828, "rewards/margins": 2.0906448364257812, "rewards/rejected": -6.565884113311768, "step": 480 }, { "epoch": 0.6282961874438729, "grad_norm": 20.285807323673964, "learning_rate": 1.823596572481856e-07, "logits/chosen": -0.8799965977668762, "logits/rejected": -0.865379273891449, "logps/chosen": -883.8758544921875, "logps/rejected": -1119.204345703125, "loss": 0.3373, "rewards/accuracies": 0.875, "rewards/chosen": -5.130545139312744, "rewards/margins": 2.4770002365112305, "rewards/rejected": -7.607545375823975, "step": 481 }, { "epoch": 0.6296024165237979, "grad_norm": 23.36396355797146, "learning_rate": 1.8126137648336042e-07, "logits/chosen": -0.882436215877533, "logits/rejected": -0.9245233535766602, "logps/chosen": -784.2247924804688, "logps/rejected": -1008.7647705078125, "loss": 0.3069, "rewards/accuracies": 0.90625, "rewards/chosen": -4.217325687408447, "rewards/margins": 2.155444622039795, "rewards/rejected": -6.372769832611084, "step": 482 }, { "epoch": 0.6309086456037227, "grad_norm": 16.44635250770276, "learning_rate": 1.8016452897181899e-07, "logits/chosen": -0.9179688692092896, "logits/rejected": -0.9435247778892517, "logps/chosen": -887.08056640625, "logps/rejected": -1131.097412109375, "loss": 0.2691, "rewards/accuracies": 0.90625, "rewards/chosen": -4.979192733764648, "rewards/margins": 2.308243751525879, "rewards/rejected": -7.2874369621276855, "step": 483 }, { "epoch": 0.6322148746836477, "grad_norm": 26.002503785556364, "learning_rate": 1.7906913758367743e-07, "logits/chosen": -0.8960538506507874, "logits/rejected": -0.9162447452545166, "logps/chosen": -847.0087280273438, "logps/rejected": -1033.681396484375, "loss": 0.3612, "rewards/accuracies": 0.875, "rewards/chosen": -4.641334533691406, "rewards/margins": 2.035878896713257, "rewards/rejected": -6.677213668823242, "step": 484 }, { "epoch": 0.6335211037635725, "grad_norm": 47.32281221829127, "learning_rate": 1.779752251586906e-07, "logits/chosen": -0.9276028871536255, "logits/rejected": -0.912039041519165, "logps/chosen": -842.8237915039062, "logps/rejected": -1087.3580322265625, "loss": 0.3949, "rewards/accuracies": 0.8125, "rewards/chosen": -4.849575996398926, "rewards/margins": 2.352696657180786, "rewards/rejected": -7.202272415161133, "step": 485 }, { "epoch": 0.6348273328434975, "grad_norm": 28.467391326181026, "learning_rate": 1.7688281450577565e-07, "logits/chosen": -0.9715847969055176, "logits/rejected": -0.99041748046875, "logps/chosen": -854.5145874023438, "logps/rejected": -1045.529541015625, "loss": 0.3979, "rewards/accuracies": 0.71875, "rewards/chosen": -4.9397478103637695, "rewards/margins": 1.85337233543396, "rewards/rejected": -6.79311990737915, "step": 486 }, { "epoch": 0.6361335619234223, "grad_norm": 17.19454286875019, "learning_rate": 1.7579192840253676e-07, "logits/chosen": -0.8678796291351318, "logits/rejected": -0.9223478436470032, "logps/chosen": -828.3589477539062, "logps/rejected": -1285.54638671875, "loss": 0.2783, "rewards/accuracies": 0.9375, "rewards/chosen": -4.885687828063965, "rewards/margins": 4.284566879272461, "rewards/rejected": -9.17025375366211, "step": 487 }, { "epoch": 0.6374397910033472, "grad_norm": 18.646551972970194, "learning_rate": 1.7470258959478997e-07, "logits/chosen": -0.9070701599121094, "logits/rejected": -0.9281150102615356, "logps/chosen": -845.6627807617188, "logps/rejected": -1099.25244140625, "loss": 0.2406, "rewards/accuracies": 0.8125, "rewards/chosen": -4.664644718170166, "rewards/margins": 2.3845531940460205, "rewards/rejected": -7.049198150634766, "step": 488 }, { "epoch": 0.6387460200832721, "grad_norm": 28.390001327673815, "learning_rate": 1.7361482079608912e-07, "logits/chosen": -0.8741916418075562, "logits/rejected": -0.9074406623840332, "logps/chosen": -759.44873046875, "logps/rejected": -991.1700439453125, "loss": 0.3513, "rewards/accuracies": 0.84375, "rewards/chosen": -4.629438877105713, "rewards/margins": 2.0953075885772705, "rewards/rejected": -6.724746227264404, "step": 489 }, { "epoch": 0.640052249163197, "grad_norm": 19.202983618249952, "learning_rate": 1.7252864468725217e-07, "logits/chosen": -0.9950762987136841, "logits/rejected": -0.9689397811889648, "logps/chosen": -817.3816528320312, "logps/rejected": -1081.819091796875, "loss": 0.3163, "rewards/accuracies": 0.875, "rewards/chosen": -4.948513507843018, "rewards/margins": 2.7807250022888184, "rewards/rejected": -7.729238033294678, "step": 490 }, { "epoch": 0.6413584782431219, "grad_norm": 47.54669959337329, "learning_rate": 1.7144408391588812e-07, "logits/chosen": -0.9382389783859253, "logits/rejected": -0.9015699625015259, "logps/chosen": -884.1466674804688, "logps/rejected": -1108.498291015625, "loss": 0.4699, "rewards/accuracies": 0.96875, "rewards/chosen": -5.405143737792969, "rewards/margins": 2.269254684448242, "rewards/rejected": -7.674398422241211, "step": 491 }, { "epoch": 0.6426647073230468, "grad_norm": 24.49787042714506, "learning_rate": 1.7036116109592503e-07, "logits/chosen": -0.9624199867248535, "logits/rejected": -0.9387903213500977, "logps/chosen": -866.46923828125, "logps/rejected": -1160.3365478515625, "loss": 0.4061, "rewards/accuracies": 0.84375, "rewards/chosen": -4.7858428955078125, "rewards/margins": 3.046083927154541, "rewards/rejected": -7.8319268226623535, "step": 492 }, { "epoch": 0.6439709364029716, "grad_norm": 23.6266600811225, "learning_rate": 1.692798988071385e-07, "logits/chosen": -0.9590986967086792, "logits/rejected": -0.9157624244689941, "logps/chosen": -707.189697265625, "logps/rejected": -848.1437377929688, "loss": 0.3058, "rewards/accuracies": 0.90625, "rewards/chosen": -3.7680490016937256, "rewards/margins": 1.6261415481567383, "rewards/rejected": -5.394190788269043, "step": 493 }, { "epoch": 0.6452771654828966, "grad_norm": 32.38766640788159, "learning_rate": 1.6820031959468058e-07, "logits/chosen": -0.8452016711235046, "logits/rejected": -0.8521511554718018, "logps/chosen": -761.6751708984375, "logps/rejected": -1109.984375, "loss": 0.3427, "rewards/accuracies": 0.8125, "rewards/chosen": -4.9624176025390625, "rewards/margins": 3.2722549438476562, "rewards/rejected": -8.234672546386719, "step": 494 }, { "epoch": 0.6465833945628214, "grad_norm": 27.974709752171293, "learning_rate": 1.6712244596861005e-07, "logits/chosen": -0.9747927784919739, "logits/rejected": -0.9465031027793884, "logps/chosen": -825.25439453125, "logps/rejected": -1000.7984008789062, "loss": 0.3166, "rewards/accuracies": 0.90625, "rewards/chosen": -4.469265937805176, "rewards/margins": 2.1552627086639404, "rewards/rejected": -6.624528884887695, "step": 495 }, { "epoch": 0.6478896236427464, "grad_norm": 20.810107529075697, "learning_rate": 1.6604630040342287e-07, "logits/chosen": -0.8734793663024902, "logits/rejected": -0.8113576173782349, "logps/chosen": -733.5122680664062, "logps/rejected": -941.5518188476562, "loss": 0.2349, "rewards/accuracies": 0.875, "rewards/chosen": -4.06757116317749, "rewards/margins": 2.2041683197021484, "rewards/rejected": -6.271739482879639, "step": 496 }, { "epoch": 0.6491958527226712, "grad_norm": 26.424860817633128, "learning_rate": 1.6497190533758347e-07, "logits/chosen": -0.8922603130340576, "logits/rejected": -0.9254794120788574, "logps/chosen": -755.695068359375, "logps/rejected": -1069.649658203125, "loss": 0.3252, "rewards/accuracies": 0.8125, "rewards/chosen": -4.36348819732666, "rewards/margins": 2.7446978092193604, "rewards/rejected": -7.108185768127441, "step": 497 }, { "epoch": 0.6505020818025962, "grad_norm": 33.756785960357675, "learning_rate": 1.6389928317305714e-07, "logits/chosen": -0.9473923444747925, "logits/rejected": -0.9284683465957642, "logps/chosen": -743.8952026367188, "logps/rejected": -949.4517211914062, "loss": 0.4077, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9511754512786865, "rewards/margins": 2.0509424209594727, "rewards/rejected": -6.00211763381958, "step": 498 }, { "epoch": 0.651808310882521, "grad_norm": 18.712023115705826, "learning_rate": 1.6282845627484286e-07, "logits/chosen": -0.9681456089019775, "logits/rejected": -1.0133285522460938, "logps/chosen": -914.3257446289062, "logps/rejected": -1230.6312255859375, "loss": 0.3075, "rewards/accuracies": 0.9375, "rewards/chosen": -5.505870819091797, "rewards/margins": 3.0627996921539307, "rewards/rejected": -8.568670272827148, "step": 499 }, { "epoch": 0.653114539962446, "grad_norm": 21.626442285320586, "learning_rate": 1.6175944697050676e-07, "logits/chosen": -0.7312138080596924, "logits/rejected": -0.8125101327896118, "logps/chosen": -764.114013671875, "logps/rejected": -1055.6309814453125, "loss": 0.2611, "rewards/accuracies": 0.78125, "rewards/chosen": -5.200470924377441, "rewards/margins": 2.1423423290252686, "rewards/rejected": -7.342813014984131, "step": 500 }, { "epoch": 0.653114539962446, "eval_logits/chosen": -0.8074228167533875, "eval_logits/rejected": -0.8136730790138245, "eval_logps/chosen": -834.2015380859375, "eval_logps/rejected": -1117.25146484375, "eval_loss": 0.3457300364971161, "eval_rewards/accuracies": 0.8859999775886536, "eval_rewards/chosen": -4.901705265045166, "eval_rewards/margins": 2.7695229053497314, "eval_rewards/rejected": -7.67122745513916, "eval_runtime": 303.8262, "eval_samples_per_second": 6.583, "eval_steps_per_second": 0.411, "step": 500 }, { "epoch": 0.6544207690423708, "grad_norm": 35.83155605863081, "learning_rate": 1.606922775497168e-07, "logits/chosen": -0.8929469585418701, "logits/rejected": -0.9043774604797363, "logps/chosen": -784.7332763671875, "logps/rejected": -1039.4635009765625, "loss": 0.3203, "rewards/accuracies": 0.84375, "rewards/chosen": -4.490346431732178, "rewards/margins": 2.162130117416382, "rewards/rejected": -6.6524763107299805, "step": 501 }, { "epoch": 0.6557269981222957, "grad_norm": 24.012110512122, "learning_rate": 1.5962697026377808e-07, "logits/chosen": -0.8780226707458496, "logits/rejected": -0.9489363431930542, "logps/chosen": -765.6043701171875, "logps/rejected": -1039.3092041015625, "loss": 0.3271, "rewards/accuracies": 0.875, "rewards/chosen": -4.720160484313965, "rewards/margins": 2.347363233566284, "rewards/rejected": -7.06752347946167, "step": 502 }, { "epoch": 0.6570332272022206, "grad_norm": 24.277681500510212, "learning_rate": 1.5856354732516863e-07, "logits/chosen": -0.9098899960517883, "logits/rejected": -0.926539957523346, "logps/chosen": -867.1011962890625, "logps/rejected": -1170.328369140625, "loss": 0.3981, "rewards/accuracies": 0.78125, "rewards/chosen": -4.973918914794922, "rewards/margins": 2.8096470832824707, "rewards/rejected": -7.783565044403076, "step": 503 }, { "epoch": 0.6583394562821455, "grad_norm": 26.363019101162333, "learning_rate": 1.575020309070763e-07, "logits/chosen": -1.0532864332199097, "logits/rejected": -1.0039781332015991, "logps/chosen": -903.8301391601562, "logps/rejected": -1048.2803955078125, "loss": 0.404, "rewards/accuracies": 0.78125, "rewards/chosen": -5.257813930511475, "rewards/margins": 1.9001696109771729, "rewards/rejected": -7.157983779907227, "step": 504 }, { "epoch": 0.6596456853620704, "grad_norm": 34.051380385152065, "learning_rate": 1.564424431429367e-07, "logits/chosen": -0.9329712390899658, "logits/rejected": -0.9821687340736389, "logps/chosen": -780.6395263671875, "logps/rejected": -1030.6109619140625, "loss": 0.4478, "rewards/accuracies": 0.84375, "rewards/chosen": -4.551027297973633, "rewards/margins": 2.0627126693725586, "rewards/rejected": -6.613739490509033, "step": 505 }, { "epoch": 0.6609519144419953, "grad_norm": 18.91185817042615, "learning_rate": 1.553848061259715e-07, "logits/chosen": -0.9905709624290466, "logits/rejected": -0.9948655366897583, "logps/chosen": -721.0912475585938, "logps/rejected": -946.108642578125, "loss": 0.3625, "rewards/accuracies": 0.90625, "rewards/chosen": -4.270671367645264, "rewards/margins": 2.0259008407592773, "rewards/rejected": -6.296573162078857, "step": 506 }, { "epoch": 0.6622581435219201, "grad_norm": 21.4133638867673, "learning_rate": 1.5432914190872756e-07, "logits/chosen": -0.9507661461830139, "logits/rejected": -0.9466673135757446, "logps/chosen": -741.1168823242188, "logps/rejected": -1028.7220458984375, "loss": 0.3489, "rewards/accuracies": 0.9375, "rewards/chosen": -4.295059680938721, "rewards/margins": 2.8194098472595215, "rewards/rejected": -7.114469528198242, "step": 507 }, { "epoch": 0.6635643726018451, "grad_norm": 16.850885406295667, "learning_rate": 1.5327547250261764e-07, "logits/chosen": -1.0218459367752075, "logits/rejected": -1.0179591178894043, "logps/chosen": -858.17333984375, "logps/rejected": -1121.5169677734375, "loss": 0.3068, "rewards/accuracies": 0.9375, "rewards/chosen": -4.725369930267334, "rewards/margins": 2.5118465423583984, "rewards/rejected": -7.237216472625732, "step": 508 }, { "epoch": 0.6648706016817699, "grad_norm": 18.433618884607107, "learning_rate": 1.5222381987746102e-07, "logits/chosen": -0.9506387114524841, "logits/rejected": -1.002458095550537, "logps/chosen": -795.8396606445312, "logps/rejected": -1047.6082763671875, "loss": 0.3417, "rewards/accuracies": 0.8125, "rewards/chosen": -4.511249542236328, "rewards/margins": 2.4017834663391113, "rewards/rejected": -6.9130330085754395, "step": 509 }, { "epoch": 0.6661768307616949, "grad_norm": 22.548196729659644, "learning_rate": 1.5117420596102548e-07, "logits/chosen": -0.8478186130523682, "logits/rejected": -0.9220375418663025, "logps/chosen": -742.0947265625, "logps/rejected": -1046.636474609375, "loss": 0.2665, "rewards/accuracies": 0.8125, "rewards/chosen": -4.394111633300781, "rewards/margins": 2.5048062801361084, "rewards/rejected": -6.8989176750183105, "step": 510 }, { "epoch": 0.6674830598416197, "grad_norm": 16.478239174352726, "learning_rate": 1.501266526385702e-07, "logits/chosen": -1.0730950832366943, "logits/rejected": -1.092372179031372, "logps/chosen": -767.8289184570312, "logps/rejected": -985.9404296875, "loss": 0.3245, "rewards/accuracies": 0.75, "rewards/chosen": -4.549228668212891, "rewards/margins": 2.1183018684387207, "rewards/rejected": -6.667530059814453, "step": 511 }, { "epoch": 0.6687892889215447, "grad_norm": 23.638655193847455, "learning_rate": 1.490811817523896e-07, "logits/chosen": -0.9937188029289246, "logits/rejected": -1.0091791152954102, "logps/chosen": -847.7296142578125, "logps/rejected": -1097.6788330078125, "loss": 0.3033, "rewards/accuracies": 0.8125, "rewards/chosen": -4.684628963470459, "rewards/margins": 2.2222049236297607, "rewards/rejected": -6.906834125518799, "step": 512 }, { "epoch": 0.6700955180014695, "grad_norm": 25.469595664561382, "learning_rate": 1.4803781510135722e-07, "logits/chosen": -0.8402674198150635, "logits/rejected": -0.8698500394821167, "logps/chosen": -856.8892822265625, "logps/rejected": -1082.5703125, "loss": 0.3428, "rewards/accuracies": 0.875, "rewards/chosen": -4.897946357727051, "rewards/margins": 2.5066497325897217, "rewards/rejected": -7.404595851898193, "step": 513 }, { "epoch": 0.6714017470813944, "grad_norm": 23.476193663048107, "learning_rate": 1.4699657444047213e-07, "logits/chosen": -0.9435025453567505, "logits/rejected": -0.8921396732330322, "logps/chosen": -774.4256591796875, "logps/rejected": -1036.7841796875, "loss": 0.3501, "rewards/accuracies": 0.9375, "rewards/chosen": -4.357728004455566, "rewards/margins": 2.8723790645599365, "rewards/rejected": -7.230106353759766, "step": 514 }, { "epoch": 0.6727079761613193, "grad_norm": 18.52575101879441, "learning_rate": 1.4595748148040465e-07, "logits/chosen": -0.9316481947898865, "logits/rejected": -0.871412456035614, "logps/chosen": -797.5765380859375, "logps/rejected": -989.5557861328125, "loss": 0.3503, "rewards/accuracies": 0.90625, "rewards/chosen": -4.4651994705200195, "rewards/margins": 2.085799217224121, "rewards/rejected": -6.550998210906982, "step": 515 }, { "epoch": 0.6740142052412442, "grad_norm": 23.95953824389779, "learning_rate": 1.4492055788704394e-07, "logits/chosen": -0.8964416980743408, "logits/rejected": -0.9257286190986633, "logps/chosen": -900.4140625, "logps/rejected": -1141.0982666015625, "loss": 0.2902, "rewards/accuracies": 0.875, "rewards/chosen": -5.290503025054932, "rewards/margins": 2.3838093280792236, "rewards/rejected": -7.674312591552734, "step": 516 }, { "epoch": 0.6753204343211691, "grad_norm": 20.124074172997577, "learning_rate": 1.4388582528104627e-07, "logits/chosen": -0.8705368041992188, "logits/rejected": -0.8657253980636597, "logps/chosen": -774.8510131835938, "logps/rejected": -1063.6641845703125, "loss": 0.3582, "rewards/accuracies": 0.8125, "rewards/chosen": -4.379948616027832, "rewards/margins": 2.763211250305176, "rewards/rejected": -7.143159866333008, "step": 517 }, { "epoch": 0.676626663401094, "grad_norm": 23.85216054356757, "learning_rate": 1.4285330523738385e-07, "logits/chosen": -0.8885217308998108, "logits/rejected": -0.9661125540733337, "logps/chosen": -780.2984619140625, "logps/rejected": -1220.000732421875, "loss": 0.2674, "rewards/accuracies": 0.875, "rewards/chosen": -4.0980353355407715, "rewards/margins": 3.5056042671203613, "rewards/rejected": -7.603639125823975, "step": 518 }, { "epoch": 0.6779328924810188, "grad_norm": 23.081844702273408, "learning_rate": 1.4182301928489554e-07, "logits/chosen": -0.9551963210105896, "logits/rejected": -1.0183593034744263, "logps/chosen": -836.30126953125, "logps/rejected": -1116.5960693359375, "loss": 0.2732, "rewards/accuracies": 0.875, "rewards/chosen": -4.809637069702148, "rewards/margins": 2.4394960403442383, "rewards/rejected": -7.2491326332092285, "step": 519 }, { "epoch": 0.6792391215609438, "grad_norm": 22.095622056118994, "learning_rate": 1.4079498890583762e-07, "logits/chosen": -0.9430717825889587, "logits/rejected": -0.9482989311218262, "logps/chosen": -835.6499633789062, "logps/rejected": -1055.31396484375, "loss": 0.3049, "rewards/accuracies": 0.8125, "rewards/chosen": -4.793646812438965, "rewards/margins": 2.296147346496582, "rewards/rejected": -7.089794158935547, "step": 520 }, { "epoch": 0.6805453506408686, "grad_norm": 20.717133805005673, "learning_rate": 1.3976923553543585e-07, "logits/chosen": -1.1042985916137695, "logits/rejected": -1.0969711542129517, "logps/chosen": -912.5108642578125, "logps/rejected": -1019.5659790039062, "loss": 0.4057, "rewards/accuracies": 0.71875, "rewards/chosen": -5.131659507751465, "rewards/margins": 1.0571504831314087, "rewards/rejected": -6.188809871673584, "step": 521 }, { "epoch": 0.6818515797207936, "grad_norm": 18.213109701212694, "learning_rate": 1.387457805614387e-07, "logits/chosen": -0.8132377862930298, "logits/rejected": -0.8710845708847046, "logps/chosen": -770.2957763671875, "logps/rejected": -1096.7030029296875, "loss": 0.3482, "rewards/accuracies": 0.875, "rewards/chosen": -4.406842231750488, "rewards/margins": 2.6769254207611084, "rewards/rejected": -7.083767890930176, "step": 522 }, { "epoch": 0.6831578088007184, "grad_norm": 18.85695986546673, "learning_rate": 1.3772464532367123e-07, "logits/chosen": -1.0416616201400757, "logits/rejected": -0.9788161516189575, "logps/chosen": -803.7244873046875, "logps/rejected": -996.6585083007812, "loss": 0.2956, "rewards/accuracies": 0.8125, "rewards/chosen": -4.829257965087891, "rewards/margins": 2.251364231109619, "rewards/rejected": -7.08062219619751, "step": 523 }, { "epoch": 0.6844640378806434, "grad_norm": 25.509597048268557, "learning_rate": 1.3670585111359034e-07, "logits/chosen": -1.050762414932251, "logits/rejected": -1.0215251445770264, "logps/chosen": -797.5689086914062, "logps/rejected": -912.2422485351562, "loss": 0.3768, "rewards/accuracies": 0.8125, "rewards/chosen": -4.425357818603516, "rewards/margins": 1.3879318237304688, "rewards/rejected": -5.813289642333984, "step": 524 }, { "epoch": 0.6857702669605682, "grad_norm": 31.852282652245847, "learning_rate": 1.3568941917384036e-07, "logits/chosen": -0.9219133853912354, "logits/rejected": -0.9097141027450562, "logps/chosen": -750.3033447265625, "logps/rejected": -955.424560546875, "loss": 0.3827, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9384779930114746, "rewards/margins": 1.7265440225601196, "rewards/rejected": -5.665022373199463, "step": 525 }, { "epoch": 0.6870764960404931, "grad_norm": 27.83642396092612, "learning_rate": 1.3467537069781083e-07, "logits/chosen": -0.8348858952522278, "logits/rejected": -0.912026584148407, "logps/chosen": -755.4243774414062, "logps/rejected": -1065.1910400390625, "loss": 0.3305, "rewards/accuracies": 0.875, "rewards/chosen": -4.685622692108154, "rewards/margins": 2.498351812362671, "rewards/rejected": -7.183974266052246, "step": 526 }, { "epoch": 0.688382725120418, "grad_norm": 20.154859857966876, "learning_rate": 1.3366372682919413e-07, "logits/chosen": -0.9479906558990479, "logits/rejected": -1.0296324491500854, "logps/chosen": -764.8931274414062, "logps/rejected": -997.9901123046875, "loss": 0.3715, "rewards/accuracies": 0.875, "rewards/chosen": -4.2235212326049805, "rewards/margins": 1.9003543853759766, "rewards/rejected": -6.123875617980957, "step": 527 }, { "epoch": 0.6896889542003429, "grad_norm": 43.509262990869395, "learning_rate": 1.3265450866154465e-07, "logits/chosen": -0.8818020820617676, "logits/rejected": -0.8966426849365234, "logps/chosen": -793.8504028320312, "logps/rejected": -989.6928100585938, "loss": 0.2804, "rewards/accuracies": 0.9375, "rewards/chosen": -4.28352165222168, "rewards/margins": 2.0015485286712646, "rewards/rejected": -6.285069465637207, "step": 528 }, { "epoch": 0.6909951832802678, "grad_norm": 22.370295801986263, "learning_rate": 1.3164773723783916e-07, "logits/chosen": -0.8573867082595825, "logits/rejected": -0.9925711154937744, "logps/chosen": -731.441650390625, "logps/rejected": -1192.9609375, "loss": 0.2617, "rewards/accuracies": 0.875, "rewards/chosen": -4.299685955047607, "rewards/margins": 3.543790578842163, "rewards/rejected": -7.843476295471191, "step": 529 }, { "epoch": 0.6923014123601927, "grad_norm": 30.337274628575507, "learning_rate": 1.3064343355003773e-07, "logits/chosen": -1.0423648357391357, "logits/rejected": -1.0021618604660034, "logps/chosen": -769.501220703125, "logps/rejected": -996.4588623046875, "loss": 0.2833, "rewards/accuracies": 0.96875, "rewards/chosen": -3.6555285453796387, "rewards/margins": 2.304563045501709, "rewards/rejected": -5.960091590881348, "step": 530 }, { "epoch": 0.6936076414401176, "grad_norm": 21.726072264869213, "learning_rate": 1.2964161853864652e-07, "logits/chosen": -0.9621329307556152, "logits/rejected": -0.9972319006919861, "logps/chosen": -807.4346923828125, "logps/rejected": -1011.5498046875, "loss": 0.3271, "rewards/accuracies": 0.78125, "rewards/chosen": -4.329762935638428, "rewards/margins": 2.0207858085632324, "rewards/rejected": -6.350549221038818, "step": 531 }, { "epoch": 0.6949138705200425, "grad_norm": 16.228236624062486, "learning_rate": 1.2864231309228055e-07, "logits/chosen": -0.9178920388221741, "logits/rejected": -0.9607722759246826, "logps/chosen": -800.65185546875, "logps/rejected": -1050.902587890625, "loss": 0.371, "rewards/accuracies": 0.90625, "rewards/chosen": -4.402060031890869, "rewards/margins": 2.257364511489868, "rewards/rejected": -6.659424304962158, "step": 532 }, { "epoch": 0.6962200995999673, "grad_norm": 17.11412863341009, "learning_rate": 1.2764553804722867e-07, "logits/chosen": -0.8288196325302124, "logits/rejected": -0.7849943041801453, "logps/chosen": -822.9666748046875, "logps/rejected": -1016.3614501953125, "loss": 0.3288, "rewards/accuracies": 0.875, "rewards/chosen": -4.726102828979492, "rewards/margins": 2.1954498291015625, "rewards/rejected": -6.9215521812438965, "step": 533 }, { "epoch": 0.6975263286798923, "grad_norm": 17.168886769292737, "learning_rate": 1.2665131418701896e-07, "logits/chosen": -0.9636877179145813, "logits/rejected": -0.8900684118270874, "logps/chosen": -831.2034912109375, "logps/rejected": -1021.8273315429688, "loss": 0.3186, "rewards/accuracies": 0.96875, "rewards/chosen": -4.344829559326172, "rewards/margins": 2.0918314456939697, "rewards/rejected": -6.436661720275879, "step": 534 }, { "epoch": 0.6988325577598171, "grad_norm": 21.860374335135038, "learning_rate": 1.2565966224198518e-07, "logits/chosen": -0.9835441708564758, "logits/rejected": -1.0193074941635132, "logps/chosen": -773.3865356445312, "logps/rejected": -947.406494140625, "loss": 0.2801, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3468403816223145, "rewards/margins": 1.6079427003860474, "rewards/rejected": -5.954782485961914, "step": 535 }, { "epoch": 0.7001387868397421, "grad_norm": 18.01057131898774, "learning_rate": 1.246706028888348e-07, "logits/chosen": -0.9090981483459473, "logits/rejected": -0.958797812461853, "logps/chosen": -801.5009155273438, "logps/rejected": -1012.5660400390625, "loss": 0.341, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3776373863220215, "rewards/margins": 1.7712655067443848, "rewards/rejected": -6.1489033699035645, "step": 536 }, { "epoch": 0.7014450159196669, "grad_norm": 25.088999958240304, "learning_rate": 1.2368415675021768e-07, "logits/chosen": -1.0301973819732666, "logits/rejected": -1.0415173768997192, "logps/chosen": -846.6824340820312, "logps/rejected": -1005.0170288085938, "loss": 0.3147, "rewards/accuracies": 0.84375, "rewards/chosen": -4.839428424835205, "rewards/margins": 1.6613373756408691, "rewards/rejected": -6.500766277313232, "step": 537 }, { "epoch": 0.7027512449995919, "grad_norm": 24.708438696063062, "learning_rate": 1.2270034439429623e-07, "logits/chosen": -0.9158374667167664, "logits/rejected": -0.9857248067855835, "logps/chosen": -898.869384765625, "logps/rejected": -1202.664306640625, "loss": 0.3081, "rewards/accuracies": 0.90625, "rewards/chosen": -5.038189888000488, "rewards/margins": 2.465287923812866, "rewards/rejected": -7.503477573394775, "step": 538 }, { "epoch": 0.7040574740795167, "grad_norm": 23.937697291164447, "learning_rate": 1.2171918633431622e-07, "logits/chosen": -0.9632992148399353, "logits/rejected": -0.9589728713035583, "logps/chosen": -894.701904296875, "logps/rejected": -1180.599609375, "loss": 0.2895, "rewards/accuracies": 0.84375, "rewards/chosen": -4.9801764488220215, "rewards/margins": 2.669917345046997, "rewards/rejected": -7.650093078613281, "step": 539 }, { "epoch": 0.7053637031594416, "grad_norm": 26.25664570501424, "learning_rate": 1.2074070302817959e-07, "logits/chosen": -1.0312018394470215, "logits/rejected": -1.0125337839126587, "logps/chosen": -776.10400390625, "logps/rejected": -995.2510986328125, "loss": 0.2686, "rewards/accuracies": 0.90625, "rewards/chosen": -4.570556163787842, "rewards/margins": 2.495591402053833, "rewards/rejected": -7.066147804260254, "step": 540 }, { "epoch": 0.7066699322393665, "grad_norm": 24.597234782614173, "learning_rate": 1.1976491487801746e-07, "logits/chosen": -0.7603006362915039, "logits/rejected": -0.8024336099624634, "logps/chosen": -772.5233154296875, "logps/rejected": -1199.2117919921875, "loss": 0.3024, "rewards/accuracies": 0.90625, "rewards/chosen": -4.72584867477417, "rewards/margins": 3.7751638889312744, "rewards/rejected": -8.501012802124023, "step": 541 }, { "epoch": 0.7079761613192914, "grad_norm": 35.3757532271724, "learning_rate": 1.1879184222976488e-07, "logits/chosen": -0.9676728248596191, "logits/rejected": -0.9333958625793457, "logps/chosen": -862.1279296875, "logps/rejected": -1095.59912109375, "loss": 0.3705, "rewards/accuracies": 0.875, "rewards/chosen": -4.918542861938477, "rewards/margins": 2.3512566089630127, "rewards/rejected": -7.26979923248291, "step": 542 }, { "epoch": 0.7092823903992163, "grad_norm": 24.764732292716715, "learning_rate": 1.1782150537273664e-07, "logits/chosen": -1.0130747556686401, "logits/rejected": -0.9589080810546875, "logps/chosen": -920.9356689453125, "logps/rejected": -1052.5150146484375, "loss": 0.3492, "rewards/accuracies": 0.8125, "rewards/chosen": -5.504761695861816, "rewards/margins": 1.7927732467651367, "rewards/rejected": -7.297534465789795, "step": 543 }, { "epoch": 0.7105886194791412, "grad_norm": 21.974575796244796, "learning_rate": 1.168539245392042e-07, "logits/chosen": -0.9473114609718323, "logits/rejected": -0.8918843865394592, "logps/chosen": -876.5565795898438, "logps/rejected": -1087.864990234375, "loss": 0.3665, "rewards/accuracies": 0.875, "rewards/chosen": -5.10919189453125, "rewards/margins": 2.627614736557007, "rewards/rejected": -7.736806869506836, "step": 544 }, { "epoch": 0.711894848559066, "grad_norm": 17.97992792815349, "learning_rate": 1.1588911990397362e-07, "logits/chosen": -0.9870251417160034, "logits/rejected": -1.0186010599136353, "logps/chosen": -835.75830078125, "logps/rejected": -1067.68505859375, "loss": 0.2964, "rewards/accuracies": 0.875, "rewards/chosen": -5.034979820251465, "rewards/margins": 2.080564498901367, "rewards/rejected": -7.11554479598999, "step": 545 }, { "epoch": 0.713201077638991, "grad_norm": 29.36410303717562, "learning_rate": 1.1492711158396523e-07, "logits/chosen": -0.9277634024620056, "logits/rejected": -0.9040770530700684, "logps/chosen": -778.7317504882812, "logps/rejected": -1031.9451904296875, "loss": 0.389, "rewards/accuracies": 0.875, "rewards/chosen": -4.545460224151611, "rewards/margins": 2.5905895233154297, "rewards/rejected": -7.136049270629883, "step": 546 }, { "epoch": 0.7145073067189158, "grad_norm": 17.973634079399982, "learning_rate": 1.1396791963779409e-07, "logits/chosen": -0.8808873891830444, "logits/rejected": -0.9185099601745605, "logps/chosen": -859.2599487304688, "logps/rejected": -1208.873046875, "loss": 0.2681, "rewards/accuracies": 0.96875, "rewards/chosen": -4.937643051147461, "rewards/margins": 2.997467041015625, "rewards/rejected": -7.935110092163086, "step": 547 }, { "epoch": 0.7158135357988408, "grad_norm": 25.10421767660218, "learning_rate": 1.1301156406535156e-07, "logits/chosen": -0.8444209694862366, "logits/rejected": -0.9109209775924683, "logps/chosen": -869.1829833984375, "logps/rejected": -1528.3863525390625, "loss": 0.2826, "rewards/accuracies": 0.84375, "rewards/chosen": -5.416234016418457, "rewards/margins": 5.921084403991699, "rewards/rejected": -11.337318420410156, "step": 548 }, { "epoch": 0.7171197648787656, "grad_norm": 26.637099440513637, "learning_rate": 1.120580648073885e-07, "logits/chosen": -1.0137161016464233, "logits/rejected": -0.9916458129882812, "logps/chosen": -847.381591796875, "logps/rejected": -1100.040283203125, "loss": 0.3916, "rewards/accuracies": 0.8125, "rewards/chosen": -4.878036975860596, "rewards/margins": 2.2340753078460693, "rewards/rejected": -7.112112045288086, "step": 549 }, { "epoch": 0.7184259939586906, "grad_norm": 17.61880269068818, "learning_rate": 1.1110744174509951e-07, "logits/chosen": -0.897437572479248, "logits/rejected": -0.8928896188735962, "logps/chosen": -867.4434204101562, "logps/rejected": -1089.23193359375, "loss": 0.3098, "rewards/accuracies": 0.9375, "rewards/chosen": -5.121048450469971, "rewards/margins": 2.1236259937286377, "rewards/rejected": -7.2446746826171875, "step": 550 }, { "epoch": 0.7197322230386154, "grad_norm": 15.95782153940809, "learning_rate": 1.1015971469970795e-07, "logits/chosen": -0.9469540119171143, "logits/rejected": -0.9789649844169617, "logps/chosen": -829.0692138671875, "logps/rejected": -1165.4215087890625, "loss": 0.2877, "rewards/accuracies": 0.90625, "rewards/chosen": -4.792716026306152, "rewards/margins": 3.257359027862549, "rewards/rejected": -8.05007553100586, "step": 551 }, { "epoch": 0.7210384521185403, "grad_norm": 23.122374666528035, "learning_rate": 1.0921490343205333e-07, "logits/chosen": -1.0128257274627686, "logits/rejected": -1.0176877975463867, "logps/chosen": -916.9769287109375, "logps/rejected": -1195.42578125, "loss": 0.3506, "rewards/accuracies": 0.90625, "rewards/chosen": -5.4742326736450195, "rewards/margins": 2.6680378913879395, "rewards/rejected": -8.1422700881958, "step": 552 }, { "epoch": 0.7223446811984652, "grad_norm": 26.270798765214032, "learning_rate": 1.0827302764217886e-07, "logits/chosen": -0.9295449256896973, "logits/rejected": -0.9562668800354004, "logps/chosen": -891.7932739257812, "logps/rejected": -1106.447509765625, "loss": 0.3752, "rewards/accuracies": 0.75, "rewards/chosen": -5.4281229972839355, "rewards/margins": 2.2159817218780518, "rewards/rejected": -7.644104957580566, "step": 553 }, { "epoch": 0.7236509102783901, "grad_norm": 23.32404619948412, "learning_rate": 1.0733410696892072e-07, "logits/chosen": -0.8652491569519043, "logits/rejected": -0.8973567485809326, "logps/chosen": -811.0621948242188, "logps/rejected": -1088.7923583984375, "loss": 0.3382, "rewards/accuracies": 0.875, "rewards/chosen": -4.997215270996094, "rewards/margins": 2.428532838821411, "rewards/rejected": -7.425748348236084, "step": 554 }, { "epoch": 0.724957139358315, "grad_norm": 21.973497977464827, "learning_rate": 1.063981609894987e-07, "logits/chosen": -0.8850119709968567, "logits/rejected": -0.8317074775695801, "logps/chosen": -895.1964721679688, "logps/rejected": -1150.310791015625, "loss": 0.3526, "rewards/accuracies": 0.875, "rewards/chosen": -5.471782207489014, "rewards/margins": 2.4418041706085205, "rewards/rejected": -7.913586616516113, "step": 555 }, { "epoch": 0.7262633684382399, "grad_norm": 35.91689983958402, "learning_rate": 1.0546520921910784e-07, "logits/chosen": -0.9072127342224121, "logits/rejected": -0.8466538190841675, "logps/chosen": -877.4606323242188, "logps/rejected": -1109.957763671875, "loss": 0.4265, "rewards/accuracies": 0.8125, "rewards/chosen": -5.126779079437256, "rewards/margins": 2.529775381088257, "rewards/rejected": -7.656554698944092, "step": 556 }, { "epoch": 0.7275695975181647, "grad_norm": 23.511498465014164, "learning_rate": 1.0453527111051183e-07, "logits/chosen": -0.9427922964096069, "logits/rejected": -0.939167320728302, "logps/chosen": -806.1412353515625, "logps/rejected": -1016.7940063476562, "loss": 0.3023, "rewards/accuracies": 0.8125, "rewards/chosen": -5.077123165130615, "rewards/margins": 2.114588499069214, "rewards/rejected": -7.191711902618408, "step": 557 }, { "epoch": 0.7288758265980897, "grad_norm": 24.44477376498831, "learning_rate": 1.0360836605363679e-07, "logits/chosen": -0.9482549428939819, "logits/rejected": -0.9721497893333435, "logps/chosen": -800.2005615234375, "logps/rejected": -1005.4768676757812, "loss": 0.355, "rewards/accuracies": 0.84375, "rewards/chosen": -4.509579181671143, "rewards/margins": 1.8559819459915161, "rewards/rejected": -6.365560531616211, "step": 558 }, { "epoch": 0.7301820556780145, "grad_norm": 21.167159308599608, "learning_rate": 1.0268451337516773e-07, "logits/chosen": -1.0456604957580566, "logits/rejected": -1.012890338897705, "logps/chosen": -903.2613525390625, "logps/rejected": -1080.0423583984375, "loss": 0.3808, "rewards/accuracies": 0.75, "rewards/chosen": -5.362766265869141, "rewards/margins": 2.1328072547912598, "rewards/rejected": -7.495573997497559, "step": 559 }, { "epoch": 0.7314882847579395, "grad_norm": 39.859419738396554, "learning_rate": 1.0176373233814509e-07, "logits/chosen": -0.9403766393661499, "logits/rejected": -0.9335325360298157, "logps/chosen": -843.5707397460938, "logps/rejected": -1124.05322265625, "loss": 0.3127, "rewards/accuracies": 0.90625, "rewards/chosen": -4.752773761749268, "rewards/margins": 2.5625851154327393, "rewards/rejected": -7.315358638763428, "step": 560 }, { "epoch": 0.7327945138378643, "grad_norm": 29.279533102730955, "learning_rate": 1.0084604214156322e-07, "logits/chosen": -0.967137336730957, "logits/rejected": -0.9893122315406799, "logps/chosen": -775.4454345703125, "logps/rejected": -981.1903686523438, "loss": 0.2862, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9357616901397705, "rewards/margins": 2.174323558807373, "rewards/rejected": -6.1100850105285645, "step": 561 }, { "epoch": 0.7341007429177893, "grad_norm": 20.628888387982226, "learning_rate": 9.99314619199701e-08, "logits/chosen": -0.8735795617103577, "logits/rejected": -0.9275645017623901, "logps/chosen": -814.8197021484375, "logps/rejected": -1053.7369384765625, "loss": 0.3197, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9540815353393555, "rewards/margins": 2.201838493347168, "rewards/rejected": -7.155920028686523, "step": 562 }, { "epoch": 0.7354069719977141, "grad_norm": 28.19914485268819, "learning_rate": 9.902001074306834e-08, "logits/chosen": -0.9230560064315796, "logits/rejected": -0.9692875146865845, "logps/chosen": -799.16162109375, "logps/rejected": -1046.1767578125, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": -4.5525031089782715, "rewards/margins": 2.366610527038574, "rewards/rejected": -6.919114112854004, "step": 563 }, { "epoch": 0.736713201077639, "grad_norm": 18.395440299034046, "learning_rate": 9.811170761531739e-08, "logits/chosen": -0.857429027557373, "logits/rejected": -0.9134597182273865, "logps/chosen": -754.7802734375, "logps/rejected": -1091.92919921875, "loss": 0.2861, "rewards/accuracies": 0.8125, "rewards/chosen": -4.486932754516602, "rewards/margins": 3.0629043579101562, "rewards/rejected": -7.549837112426758, "step": 564 }, { "epoch": 0.7380194301575639, "grad_norm": 16.713239429608148, "learning_rate": 9.720657147553767e-08, "logits/chosen": -0.874890923500061, "logits/rejected": -0.8897607922554016, "logps/chosen": -796.7207641601562, "logps/rejected": -1120.3206787109375, "loss": 0.2626, "rewards/accuracies": 0.84375, "rewards/chosen": -4.593537330627441, "rewards/margins": 2.665463447570801, "rewards/rejected": -7.259000778198242, "step": 565 }, { "epoch": 0.7393256592374887, "grad_norm": 20.5336851623424, "learning_rate": 9.630462119651537e-08, "logits/chosen": -0.8550716638565063, "logits/rejected": -0.8601816892623901, "logps/chosen": -947.548828125, "logps/rejected": -1085.7291259765625, "loss": 0.3776, "rewards/accuracies": 0.65625, "rewards/chosen": -5.3931732177734375, "rewards/margins": 1.4523639678955078, "rewards/rejected": -6.845536708831787, "step": 566 }, { "epoch": 0.7406318883174137, "grad_norm": 20.707084475190243, "learning_rate": 9.5405875584609e-08, "logits/chosen": -0.922339677810669, "logits/rejected": -0.9448087811470032, "logps/chosen": -823.510498046875, "logps/rejected": -1043.0206298828125, "loss": 0.2913, "rewards/accuracies": 0.875, "rewards/chosen": -4.648580551147461, "rewards/margins": 2.157121419906616, "rewards/rejected": -6.805701732635498, "step": 567 }, { "epoch": 0.7419381173973385, "grad_norm": 16.852464603891658, "learning_rate": 9.451035337935731e-08, "logits/chosen": -0.8741306066513062, "logits/rejected": -0.9167773723602295, "logps/chosen": -726.5533447265625, "logps/rejected": -1080.0703125, "loss": 0.2901, "rewards/accuracies": 0.96875, "rewards/chosen": -3.9515931606292725, "rewards/margins": 3.231985092163086, "rewards/rejected": -7.183577537536621, "step": 568 }, { "epoch": 0.7432443464772635, "grad_norm": 17.208831278317064, "learning_rate": 9.36180732530886e-08, "logits/chosen": -0.9364518523216248, "logits/rejected": -0.9597791433334351, "logps/chosen": -785.6038818359375, "logps/rejected": -1091.4915771484375, "loss": 0.3465, "rewards/accuracies": 0.875, "rewards/chosen": -4.260547161102295, "rewards/margins": 2.8101701736450195, "rewards/rejected": -7.070717811584473, "step": 569 }, { "epoch": 0.7445505755571883, "grad_norm": 20.57132249050924, "learning_rate": 9.272905381053131e-08, "logits/chosen": -0.9936679601669312, "logits/rejected": -0.9452012777328491, "logps/chosen": -783.473388671875, "logps/rejected": -977.3974609375, "loss": 0.4034, "rewards/accuracies": 0.71875, "rewards/chosen": -4.772213935852051, "rewards/margins": 2.122070550918579, "rewards/rejected": -6.894284248352051, "step": 570 }, { "epoch": 0.7458568046371132, "grad_norm": 24.488955872960492, "learning_rate": 9.184331358842592e-08, "logits/chosen": -1.1170083284378052, "logits/rejected": -1.1208350658416748, "logps/chosen": -913.275146484375, "logps/rejected": -1134.02734375, "loss": 0.3542, "rewards/accuracies": 0.90625, "rewards/chosen": -4.806280612945557, "rewards/margins": 2.3573532104492188, "rewards/rejected": -7.163634300231934, "step": 571 }, { "epoch": 0.7471630337170381, "grad_norm": 24.774291129440638, "learning_rate": 9.096087105513894e-08, "logits/chosen": -0.9611457586288452, "logits/rejected": -0.9745159149169922, "logps/chosen": -850.2299194335938, "logps/rejected": -1079.2684326171875, "loss": 0.3096, "rewards/accuracies": 0.84375, "rewards/chosen": -4.939201354980469, "rewards/margins": 1.692257046699524, "rewards/rejected": -6.631458282470703, "step": 572 }, { "epoch": 0.748469262796963, "grad_norm": 21.002407149688135, "learning_rate": 9.008174461027723e-08, "logits/chosen": -0.8872397541999817, "logits/rejected": -0.9736205339431763, "logps/chosen": -820.9461669921875, "logps/rejected": -1208.108642578125, "loss": 0.3514, "rewards/accuracies": 0.875, "rewards/chosen": -5.033115863800049, "rewards/margins": 3.3874659538269043, "rewards/rejected": -8.420580863952637, "step": 573 }, { "epoch": 0.7497754918768879, "grad_norm": 32.13967455164451, "learning_rate": 8.920595258430486e-08, "logits/chosen": -0.825206458568573, "logits/rejected": -0.8276383876800537, "logps/chosen": -889.4954833984375, "logps/rejected": -1139.07666015625, "loss": 0.38, "rewards/accuracies": 0.75, "rewards/chosen": -5.5783257484436035, "rewards/margins": 2.063443183898926, "rewards/rejected": -7.641768932342529, "step": 574 }, { "epoch": 0.7510817209568128, "grad_norm": 28.60867511694671, "learning_rate": 8.833351323816063e-08, "logits/chosen": -0.9238741397857666, "logits/rejected": -0.9454509019851685, "logps/chosen": -878.4554443359375, "logps/rejected": -1141.947265625, "loss": 0.3943, "rewards/accuracies": 0.75, "rewards/chosen": -5.250237941741943, "rewards/margins": 2.2067952156066895, "rewards/rejected": -7.457033157348633, "step": 575 }, { "epoch": 0.7523879500367376, "grad_norm": 16.26873259584617, "learning_rate": 8.746444476287737e-08, "logits/chosen": -1.0511510372161865, "logits/rejected": -1.057821273803711, "logps/chosen": -857.7843627929688, "logps/rejected": -1114.47607421875, "loss": 0.3172, "rewards/accuracies": 0.84375, "rewards/chosen": -4.964174747467041, "rewards/margins": 2.3061084747314453, "rewards/rejected": -7.2702836990356445, "step": 576 }, { "epoch": 0.7536941791166626, "grad_norm": 19.308146548606004, "learning_rate": 8.659876527920276e-08, "logits/chosen": -0.9624135494232178, "logits/rejected": -0.9660477638244629, "logps/chosen": -785.8853759765625, "logps/rejected": -972.9590454101562, "loss": 0.3061, "rewards/accuracies": 0.90625, "rewards/chosen": -4.485295295715332, "rewards/margins": 2.169145107269287, "rewards/rejected": -6.654440402984619, "step": 577 }, { "epoch": 0.7550004081965874, "grad_norm": 21.61560102039662, "learning_rate": 8.573649283722115e-08, "logits/chosen": -0.7940883040428162, "logits/rejected": -0.8307343125343323, "logps/chosen": -768.4230346679688, "logps/rejected": -1035.4759521484375, "loss": 0.3014, "rewards/accuracies": 0.84375, "rewards/chosen": -4.916931629180908, "rewards/margins": 2.457148551940918, "rewards/rejected": -7.374079704284668, "step": 578 }, { "epoch": 0.7563066372765124, "grad_norm": 22.88690338836935, "learning_rate": 8.487764541597764e-08, "logits/chosen": -0.9109131693840027, "logits/rejected": -0.9031786918640137, "logps/chosen": -765.204833984375, "logps/rejected": -1008.220703125, "loss": 0.3464, "rewards/accuracies": 0.84375, "rewards/chosen": -4.222220420837402, "rewards/margins": 2.112513303756714, "rewards/rejected": -6.3347344398498535, "step": 579 }, { "epoch": 0.7576128663564372, "grad_norm": 21.862872793760197, "learning_rate": 8.402224092310297e-08, "logits/chosen": -0.9250282049179077, "logits/rejected": -0.9852439165115356, "logps/chosen": -871.0116577148438, "logps/rejected": -1154.585693359375, "loss": 0.3678, "rewards/accuracies": 0.84375, "rewards/chosen": -5.070359230041504, "rewards/margins": 2.512484312057495, "rewards/rejected": -7.58284330368042, "step": 580 }, { "epoch": 0.7589190954363622, "grad_norm": 19.942880125656473, "learning_rate": 8.317029719444016e-08, "logits/chosen": -0.9003323316574097, "logits/rejected": -0.8879726529121399, "logps/chosen": -813.396484375, "logps/rejected": -1111.6331787109375, "loss": 0.3085, "rewards/accuracies": 0.9375, "rewards/chosen": -4.536358833312988, "rewards/margins": 2.7713277339935303, "rewards/rejected": -7.307686805725098, "step": 581 }, { "epoch": 0.760225324516287, "grad_norm": 45.19702149073493, "learning_rate": 8.232183199367265e-08, "logits/chosen": -0.8801093101501465, "logits/rejected": -0.9021286964416504, "logps/chosen": -898.3988037109375, "logps/rejected": -1257.00830078125, "loss": 0.3687, "rewards/accuracies": 0.84375, "rewards/chosen": -5.8340349197387695, "rewards/margins": 2.4283087253570557, "rewards/rejected": -8.262344360351562, "step": 582 }, { "epoch": 0.7615315535962119, "grad_norm": 20.347902273024776, "learning_rate": 8.147686301195383e-08, "logits/chosen": -0.8726892471313477, "logits/rejected": -0.9775661826133728, "logps/chosen": -773.6263427734375, "logps/rejected": -1147.288330078125, "loss": 0.2819, "rewards/accuracies": 0.8125, "rewards/chosen": -4.852556228637695, "rewards/margins": 2.879547357559204, "rewards/rejected": -7.73210334777832, "step": 583 }, { "epoch": 0.7628377826761368, "grad_norm": 30.931119659844114, "learning_rate": 8.063540786753842e-08, "logits/chosen": -0.9059603214263916, "logits/rejected": -0.936095654964447, "logps/chosen": -769.556884765625, "logps/rejected": -1065.5628662109375, "loss": 0.3303, "rewards/accuracies": 1.0, "rewards/chosen": -4.389358997344971, "rewards/margins": 3.1140201091766357, "rewards/rejected": -7.5033793449401855, "step": 584 }, { "epoch": 0.7641440117560617, "grad_norm": 60.40616710629875, "learning_rate": 7.979748410541451e-08, "logits/chosen": -0.9225594997406006, "logits/rejected": -0.9047824144363403, "logps/chosen": -866.5232543945312, "logps/rejected": -1200.9637451171875, "loss": 0.4029, "rewards/accuracies": 0.875, "rewards/chosen": -5.22835636138916, "rewards/margins": 3.382688283920288, "rewards/rejected": -8.611045837402344, "step": 585 }, { "epoch": 0.7654502408359866, "grad_norm": 31.555998473913466, "learning_rate": 7.896310919693858e-08, "logits/chosen": -0.8222339749336243, "logits/rejected": -0.8743284344673157, "logps/chosen": -867.50048828125, "logps/rejected": -1188.538818359375, "loss": 0.3465, "rewards/accuracies": 0.84375, "rewards/chosen": -5.528315544128418, "rewards/margins": 2.8255791664123535, "rewards/rejected": -8.35389518737793, "step": 586 }, { "epoch": 0.7667564699159115, "grad_norm": 21.69936273672222, "learning_rate": 7.813230053947054e-08, "logits/chosen": -0.9629799127578735, "logits/rejected": -0.9869503378868103, "logps/chosen": -853.8582763671875, "logps/rejected": -1051.514892578125, "loss": 0.3534, "rewards/accuracies": 0.90625, "rewards/chosen": -5.540807247161865, "rewards/margins": 1.6317949295043945, "rewards/rejected": -7.172601699829102, "step": 587 }, { "epoch": 0.7680626989958363, "grad_norm": 24.484278864691312, "learning_rate": 7.730507545601131e-08, "logits/chosen": -0.8617520332336426, "logits/rejected": -0.9498992562294006, "logps/chosen": -748.60546875, "logps/rejected": -1134.1103515625, "loss": 0.3334, "rewards/accuracies": 0.90625, "rewards/chosen": -4.2913970947265625, "rewards/margins": 3.1425302028656006, "rewards/rejected": -7.433926582336426, "step": 588 }, { "epoch": 0.7693689280757613, "grad_norm": 27.511342623503438, "learning_rate": 7.648145119484151e-08, "logits/chosen": -1.0093884468078613, "logits/rejected": -1.006633996963501, "logps/chosen": -758.9545288085938, "logps/rejected": -935.381103515625, "loss": 0.3744, "rewards/accuracies": 0.84375, "rewards/chosen": -4.51336145401001, "rewards/margins": 1.7640907764434814, "rewards/rejected": -6.27745246887207, "step": 589 }, { "epoch": 0.7706751571556861, "grad_norm": 22.414541733494126, "learning_rate": 7.566144492916191e-08, "logits/chosen": -0.8036502599716187, "logits/rejected": -0.8703159093856812, "logps/chosen": -794.802001953125, "logps/rejected": -1297.468994140625, "loss": 0.2328, "rewards/accuracies": 1.0, "rewards/chosen": -4.608358860015869, "rewards/margins": 4.21331262588501, "rewards/rejected": -8.821670532226562, "step": 590 }, { "epoch": 0.7719813862356111, "grad_norm": 26.029107586759444, "learning_rate": 7.484507375673505e-08, "logits/chosen": -0.9289277791976929, "logits/rejected": -0.8871047496795654, "logps/chosen": -740.1527709960938, "logps/rejected": -1045.8631591796875, "loss": 0.2845, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9896395206451416, "rewards/margins": 2.939906120300293, "rewards/rejected": -6.9295454025268555, "step": 591 }, { "epoch": 0.7732876153155359, "grad_norm": 30.580890149968727, "learning_rate": 7.40323546995292e-08, "logits/chosen": -0.9810222387313843, "logits/rejected": -0.9534710645675659, "logps/chosen": -907.60498046875, "logps/rejected": -1219.4244384765625, "loss": 0.2711, "rewards/accuracies": 0.9375, "rewards/chosen": -5.280550003051758, "rewards/margins": 3.0338191986083984, "rewards/rejected": -8.314369201660156, "step": 592 }, { "epoch": 0.7745938443954609, "grad_norm": 25.625976246905026, "learning_rate": 7.322330470336313e-08, "logits/chosen": -0.8802429437637329, "logits/rejected": -0.8881117105484009, "logps/chosen": -817.2022705078125, "logps/rejected": -982.9677124023438, "loss": 0.3506, "rewards/accuracies": 0.71875, "rewards/chosen": -4.862236022949219, "rewards/margins": 1.5472270250320435, "rewards/rejected": -6.409462928771973, "step": 593 }, { "epoch": 0.7759000734753857, "grad_norm": 20.818249995545727, "learning_rate": 7.241794063755291e-08, "logits/chosen": -0.9595385193824768, "logits/rejected": -1.021976113319397, "logps/chosen": -852.221923828125, "logps/rejected": -1156.3397216796875, "loss": 0.3032, "rewards/accuracies": 0.84375, "rewards/chosen": -5.065010070800781, "rewards/margins": 2.4911441802978516, "rewards/rejected": -7.556154251098633, "step": 594 }, { "epoch": 0.7772063025553106, "grad_norm": 26.077317678524693, "learning_rate": 7.161627929456004e-08, "logits/chosen": -0.8960578441619873, "logits/rejected": -0.826008677482605, "logps/chosen": -792.5365600585938, "logps/rejected": -893.3995361328125, "loss": 0.3794, "rewards/accuracies": 0.8125, "rewards/chosen": -4.322532653808594, "rewards/margins": 1.6258131265640259, "rewards/rejected": -5.94834566116333, "step": 595 }, { "epoch": 0.7785125316352355, "grad_norm": 27.584632338786395, "learning_rate": 7.081833738964149e-08, "logits/chosen": -1.0183488130569458, "logits/rejected": -0.9943074584007263, "logps/chosen": -820.5167846679688, "logps/rejected": -1048.6409912109375, "loss": 0.3894, "rewards/accuracies": 0.78125, "rewards/chosen": -4.430448055267334, "rewards/margins": 2.2347586154937744, "rewards/rejected": -6.665206432342529, "step": 596 }, { "epoch": 0.7798187607151604, "grad_norm": 24.080235978815235, "learning_rate": 7.002413156050108e-08, "logits/chosen": -0.8766260743141174, "logits/rejected": -0.8280268311500549, "logps/chosen": -820.6423950195312, "logps/rejected": -977.9401245117188, "loss": 0.3307, "rewards/accuracies": 0.8125, "rewards/chosen": -4.771195411682129, "rewards/margins": 1.9175888299942017, "rewards/rejected": -6.688784599304199, "step": 597 }, { "epoch": 0.7811249897950853, "grad_norm": 17.451309577317936, "learning_rate": 6.923367836694236e-08, "logits/chosen": -1.0711160898208618, "logits/rejected": -1.0157512426376343, "logps/chosen": -793.7659301757812, "logps/rejected": -992.5198364257812, "loss": 0.3032, "rewards/accuracies": 0.90625, "rewards/chosen": -4.318778991699219, "rewards/margins": 2.1037962436676025, "rewards/rejected": -6.422574996948242, "step": 598 }, { "epoch": 0.7824312188750102, "grad_norm": 32.13868420831174, "learning_rate": 6.844699429052375e-08, "logits/chosen": -0.9602109789848328, "logits/rejected": -0.9311934113502502, "logps/chosen": -840.842041015625, "logps/rejected": -1053.7744140625, "loss": 0.3717, "rewards/accuracies": 0.75, "rewards/chosen": -4.749244213104248, "rewards/margins": 2.2002172470092773, "rewards/rejected": -6.949460983276367, "step": 599 }, { "epoch": 0.783737447954935, "grad_norm": 19.259699976993907, "learning_rate": 6.766409573421466e-08, "logits/chosen": -0.9458956718444824, "logits/rejected": -1.0178898572921753, "logps/chosen": -805.6896362304688, "logps/rejected": -1252.30908203125, "loss": 0.3342, "rewards/accuracies": 0.9375, "rewards/chosen": -4.890609264373779, "rewards/margins": 4.166367053985596, "rewards/rejected": -9.056976318359375, "step": 600 }, { "epoch": 0.783737447954935, "eval_logits/chosen": -0.799916684627533, "eval_logits/rejected": -0.8080865740776062, "eval_logps/chosen": -814.440185546875, "eval_logps/rejected": -1083.55029296875, "eval_loss": 0.3353707194328308, "eval_rewards/accuracies": 0.8920000195503235, "eval_rewards/chosen": -4.7040910720825195, "eval_rewards/margins": 2.6301238536834717, "eval_rewards/rejected": -7.334214210510254, "eval_runtime": 305.2507, "eval_samples_per_second": 6.552, "eval_steps_per_second": 0.409, "step": 600 }, { "epoch": 0.78504367703486, "grad_norm": 20.361644602615105, "learning_rate": 6.688499902205345e-08, "logits/chosen": -0.8707839846611023, "logits/rejected": -0.9645876884460449, "logps/chosen": -891.9090576171875, "logps/rejected": -1203.7598876953125, "loss": 0.2809, "rewards/accuracies": 0.96875, "rewards/chosen": -5.256412982940674, "rewards/margins": 2.483121395111084, "rewards/rejected": -7.739534378051758, "step": 601 }, { "epoch": 0.7863499061147848, "grad_norm": 20.043709061394022, "learning_rate": 6.610972039880704e-08, "logits/chosen": -0.8824882507324219, "logits/rejected": -0.9258497953414917, "logps/chosen": -727.640625, "logps/rejected": -986.2440185546875, "loss": 0.2183, "rewards/accuracies": 0.875, "rewards/chosen": -4.061704158782959, "rewards/margins": 2.5300819873809814, "rewards/rejected": -6.591785907745361, "step": 602 }, { "epoch": 0.7876561351947098, "grad_norm": 25.028173721925533, "learning_rate": 6.533827602963244e-08, "logits/chosen": -0.9209650754928589, "logits/rejected": -0.9454638361930847, "logps/chosen": -781.026611328125, "logps/rejected": -1081.3594970703125, "loss": 0.3306, "rewards/accuracies": 0.96875, "rewards/chosen": -4.376533508300781, "rewards/margins": 2.9207139015197754, "rewards/rejected": -7.297247409820557, "step": 603 }, { "epoch": 0.7889623642746346, "grad_norm": 22.348702273934453, "learning_rate": 6.45706819997392e-08, "logits/chosen": -1.0147395133972168, "logits/rejected": -1.038183331489563, "logps/chosen": -824.8892822265625, "logps/rejected": -1079.4228515625, "loss": 0.3183, "rewards/accuracies": 0.875, "rewards/chosen": -4.653099060058594, "rewards/margins": 2.434006452560425, "rewards/rejected": -7.087105751037598, "step": 604 }, { "epoch": 0.7902685933545596, "grad_norm": 27.467778442681738, "learning_rate": 6.380695431405453e-08, "logits/chosen": -0.8424959778785706, "logits/rejected": -0.9284894466400146, "logps/chosen": -802.2348022460938, "logps/rejected": -1163.3359375, "loss": 0.3341, "rewards/accuracies": 0.8125, "rewards/chosen": -4.737122058868408, "rewards/margins": 2.743659734725952, "rewards/rejected": -7.480782508850098, "step": 605 }, { "epoch": 0.7915748224344844, "grad_norm": 19.34862528757651, "learning_rate": 6.304710889688944e-08, "logits/chosen": -0.9255674481391907, "logits/rejected": -0.9599024057388306, "logps/chosen": -789.0675048828125, "logps/rejected": -1047.88232421875, "loss": 0.2828, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71975040435791, "rewards/margins": 2.2590696811676025, "rewards/rejected": -6.978819847106934, "step": 606 }, { "epoch": 0.7928810515144094, "grad_norm": 63.39258546983016, "learning_rate": 6.229116159160652e-08, "logits/chosen": -1.0031625032424927, "logits/rejected": -1.0137825012207031, "logps/chosen": -832.7035522460938, "logps/rejected": -1037.726806640625, "loss": 0.3436, "rewards/accuracies": 0.84375, "rewards/chosen": -4.836849212646484, "rewards/margins": 2.296910285949707, "rewards/rejected": -7.133759498596191, "step": 607 }, { "epoch": 0.7941872805943342, "grad_norm": 18.22971639721024, "learning_rate": 6.153912816028976e-08, "logits/chosen": -0.919977068901062, "logits/rejected": -0.923467218875885, "logps/chosen": -871.0953369140625, "logps/rejected": -1129.403564453125, "loss": 0.2677, "rewards/accuracies": 0.9375, "rewards/chosen": -5.000254154205322, "rewards/margins": 2.288395881652832, "rewards/rejected": -7.288650035858154, "step": 608 }, { "epoch": 0.7954935096742591, "grad_norm": 26.306570079281883, "learning_rate": 6.079102428341587e-08, "logits/chosen": -0.9508645534515381, "logits/rejected": -0.955719530582428, "logps/chosen": -792.3866577148438, "logps/rejected": -1104.510009765625, "loss": 0.2807, "rewards/accuracies": 0.90625, "rewards/chosen": -4.013969898223877, "rewards/margins": 3.101562261581421, "rewards/rejected": -7.115531921386719, "step": 609 }, { "epoch": 0.796799738754184, "grad_norm": 27.27759663326369, "learning_rate": 6.004686555952742e-08, "logits/chosen": -0.9942679405212402, "logits/rejected": -0.9757542014122009, "logps/chosen": -863.2615966796875, "logps/rejected": -1148.544677734375, "loss": 0.3943, "rewards/accuracies": 0.78125, "rewards/chosen": -5.1844658851623535, "rewards/margins": 1.9732271432876587, "rewards/rejected": -7.157692909240723, "step": 610 }, { "epoch": 0.7981059678341089, "grad_norm": 22.309371027174624, "learning_rate": 5.9306667504907234e-08, "logits/chosen": -0.966468095779419, "logits/rejected": -0.9042306542396545, "logps/chosen": -819.4505615234375, "logps/rejected": -979.9266967773438, "loss": 0.3763, "rewards/accuracies": 0.875, "rewards/chosen": -4.6890740394592285, "rewards/margins": 2.081890344619751, "rewards/rejected": -6.770963668823242, "step": 611 }, { "epoch": 0.7994121969140338, "grad_norm": 35.47329473307664, "learning_rate": 5.857044555325535e-08, "logits/chosen": -1.0889376401901245, "logits/rejected": -1.0171784162521362, "logps/chosen": -853.3372802734375, "logps/rejected": -974.7836303710938, "loss": 0.3427, "rewards/accuracies": 0.875, "rewards/chosen": -4.811555862426758, "rewards/margins": 1.7559360265731812, "rewards/rejected": -6.56749153137207, "step": 612 }, { "epoch": 0.8007184259939587, "grad_norm": 36.349287307385815, "learning_rate": 5.7838215055366954e-08, "logits/chosen": -0.9624323844909668, "logits/rejected": -0.9520595669746399, "logps/chosen": -828.0532836914062, "logps/rejected": -1140.3701171875, "loss": 0.3107, "rewards/accuracies": 0.90625, "rewards/chosen": -4.769121170043945, "rewards/margins": 2.8237016201019287, "rewards/rejected": -7.592822551727295, "step": 613 }, { "epoch": 0.8020246550738835, "grad_norm": 35.09922416585137, "learning_rate": 5.710999127881233e-08, "logits/chosen": -1.011865496635437, "logits/rejected": -1.0389556884765625, "logps/chosen": -782.23486328125, "logps/rejected": -1016.12646484375, "loss": 0.3195, "rewards/accuracies": 0.8125, "rewards/chosen": -4.666965007781982, "rewards/margins": 2.2044434547424316, "rewards/rejected": -6.871408462524414, "step": 614 }, { "epoch": 0.8033308841538085, "grad_norm": 19.12905924237332, "learning_rate": 5.6385789407618593e-08, "logits/chosen": -0.9514075517654419, "logits/rejected": -0.9145126938819885, "logps/chosen": -811.7752685546875, "logps/rejected": -1043.508544921875, "loss": 0.338, "rewards/accuracies": 0.8125, "rewards/chosen": -4.971982955932617, "rewards/margins": 2.395125150680542, "rewards/rejected": -7.367108345031738, "step": 615 }, { "epoch": 0.8046371132337333, "grad_norm": 60.821253048729666, "learning_rate": 5.5665624541952865e-08, "logits/chosen": -1.0484111309051514, "logits/rejected": -1.0743966102600098, "logps/chosen": -804.5001831054688, "logps/rejected": -1006.0529174804688, "loss": 0.3242, "rewards/accuracies": 0.90625, "rewards/chosen": -4.520447254180908, "rewards/margins": 2.239243507385254, "rewards/rejected": -6.759690284729004, "step": 616 }, { "epoch": 0.8059433423136583, "grad_norm": 59.38789752379059, "learning_rate": 5.494951169780776e-08, "logits/chosen": -0.8696390390396118, "logits/rejected": -0.8830188512802124, "logps/chosen": -848.6278686523438, "logps/rejected": -1117.7974853515625, "loss": 0.3619, "rewards/accuracies": 0.78125, "rewards/chosen": -5.382977485656738, "rewards/margins": 2.234811544418335, "rewards/rejected": -7.617789268493652, "step": 617 }, { "epoch": 0.8072495713935831, "grad_norm": 20.31680378636864, "learning_rate": 5.4237465806688004e-08, "logits/chosen": -0.9961526989936829, "logits/rejected": -0.9020803570747375, "logps/chosen": -790.9029541015625, "logps/rejected": -987.162353515625, "loss": 0.2983, "rewards/accuracies": 0.84375, "rewards/chosen": -4.762269973754883, "rewards/margins": 2.2328810691833496, "rewards/rejected": -6.995151519775391, "step": 618 }, { "epoch": 0.8085558004735081, "grad_norm": 38.36725235605651, "learning_rate": 5.3529501715299266e-08, "logits/chosen": -0.9393517971038818, "logits/rejected": -1.0322364568710327, "logps/chosen": -831.10302734375, "logps/rejected": -1079.7054443359375, "loss": 0.3344, "rewards/accuracies": 0.90625, "rewards/chosen": -4.986616134643555, "rewards/margins": 2.054384469985962, "rewards/rejected": -7.041000843048096, "step": 619 }, { "epoch": 0.8098620295534329, "grad_norm": 23.713937668491724, "learning_rate": 5.2825634185238583e-08, "logits/chosen": -0.9017822742462158, "logits/rejected": -0.9380840063095093, "logps/chosen": -789.52734375, "logps/rejected": -969.8795166015625, "loss": 0.3775, "rewards/accuracies": 0.90625, "rewards/chosen": -4.724421501159668, "rewards/margins": 1.923351764678955, "rewards/rejected": -6.647772789001465, "step": 620 }, { "epoch": 0.8111682586333578, "grad_norm": 20.719618911102806, "learning_rate": 5.212587789268649e-08, "logits/chosen": -0.9668422937393188, "logits/rejected": -0.997907280921936, "logps/chosen": -849.0941772460938, "logps/rejected": -1135.4027099609375, "loss": 0.3439, "rewards/accuracies": 0.78125, "rewards/chosen": -4.6942877769470215, "rewards/margins": 2.6681554317474365, "rewards/rejected": -7.362443923950195, "step": 621 }, { "epoch": 0.8124744877132827, "grad_norm": 28.78342856793349, "learning_rate": 5.1430247428101067e-08, "logits/chosen": -0.8712970018386841, "logits/rejected": -0.8977018594741821, "logps/chosen": -872.6751098632812, "logps/rejected": -1110.70751953125, "loss": 0.283, "rewards/accuracies": 0.875, "rewards/chosen": -4.823261260986328, "rewards/margins": 2.413942337036133, "rewards/rejected": -7.237203121185303, "step": 622 }, { "epoch": 0.8137807167932076, "grad_norm": 20.643675704776587, "learning_rate": 5.0738757295913674e-08, "logits/chosen": -1.0791032314300537, "logits/rejected": -0.9943168759346008, "logps/chosen": -833.9264526367188, "logps/rejected": -957.189453125, "loss": 0.3851, "rewards/accuracies": 0.78125, "rewards/chosen": -4.792407512664795, "rewards/margins": 1.5870347023010254, "rewards/rejected": -6.37944221496582, "step": 623 }, { "epoch": 0.8150869458731325, "grad_norm": 18.136247401431493, "learning_rate": 5.005142191422665e-08, "logits/chosen": -0.9701358675956726, "logits/rejected": -0.9419609308242798, "logps/chosen": -746.4559326171875, "logps/rejected": -925.2149658203125, "loss": 0.4097, "rewards/accuracies": 0.875, "rewards/chosen": -4.279734134674072, "rewards/margins": 1.9801875352859497, "rewards/rejected": -6.259922027587891, "step": 624 }, { "epoch": 0.8163931749530574, "grad_norm": 26.072306298901207, "learning_rate": 4.936825561451235e-08, "logits/chosen": -1.0418720245361328, "logits/rejected": -0.9948733448982239, "logps/chosen": -899.941650390625, "logps/rejected": -1055.2972412109375, "loss": 0.341, "rewards/accuracies": 0.8125, "rewards/chosen": -5.106154441833496, "rewards/margins": 1.7844371795654297, "rewards/rejected": -6.890592575073242, "step": 625 }, { "epoch": 0.8176994040329822, "grad_norm": 23.034148794273474, "learning_rate": 4.868927264131476e-08, "logits/chosen": -0.9666779637336731, "logits/rejected": -1.0086778402328491, "logps/chosen": -884.3038940429688, "logps/rejected": -1230.3251953125, "loss": 0.3529, "rewards/accuracies": 0.78125, "rewards/chosen": -5.3153486251831055, "rewards/margins": 3.0668115615844727, "rewards/rejected": -8.382160186767578, "step": 626 }, { "epoch": 0.8190056331129072, "grad_norm": 23.650355623633093, "learning_rate": 4.801448715195227e-08, "logits/chosen": -1.0483986139297485, "logits/rejected": -1.0794347524642944, "logps/chosen": -864.2808227539062, "logps/rejected": -1053.04150390625, "loss": 0.3211, "rewards/accuracies": 0.875, "rewards/chosen": -4.786685943603516, "rewards/margins": 2.0903913974761963, "rewards/rejected": -6.877077102661133, "step": 627 }, { "epoch": 0.820311862192832, "grad_norm": 17.274048500959722, "learning_rate": 4.734391321622242e-08, "logits/chosen": -0.9791491627693176, "logits/rejected": -0.9082823395729065, "logps/chosen": -833.59619140625, "logps/rejected": -984.154052734375, "loss": 0.3178, "rewards/accuracies": 0.90625, "rewards/chosen": -4.538018226623535, "rewards/margins": 2.0235354900360107, "rewards/rejected": -6.561553001403809, "step": 628 }, { "epoch": 0.821618091272757, "grad_norm": 35.535109704827164, "learning_rate": 4.667756481610866e-08, "logits/chosen": -0.9152229428291321, "logits/rejected": -0.9227945804595947, "logps/chosen": -828.603271484375, "logps/rejected": -1044.041259765625, "loss": 0.365, "rewards/accuracies": 0.875, "rewards/chosen": -4.549627780914307, "rewards/margins": 1.8636504411697388, "rewards/rejected": -6.413278579711914, "step": 629 }, { "epoch": 0.8229243203526818, "grad_norm": 30.88845897386843, "learning_rate": 4.60154558454888e-08, "logits/chosen": -1.064465045928955, "logits/rejected": -1.0457439422607422, "logps/chosen": -889.4046020507812, "logps/rejected": -1094.5487060546875, "loss": 0.424, "rewards/accuracies": 0.84375, "rewards/chosen": -5.0392560958862305, "rewards/margins": 2.392564535140991, "rewards/rejected": -7.431820869445801, "step": 630 }, { "epoch": 0.8242305494326068, "grad_norm": 20.901329900516245, "learning_rate": 4.535760010984513e-08, "logits/chosen": -0.8499188423156738, "logits/rejected": -0.8458962440490723, "logps/chosen": -831.1834716796875, "logps/rejected": -1015.3555908203125, "loss": 0.2817, "rewards/accuracies": 0.78125, "rewards/chosen": -5.353586196899414, "rewards/margins": 1.9744594097137451, "rewards/rejected": -7.328044891357422, "step": 631 }, { "epoch": 0.8255367785125316, "grad_norm": 31.389449614869186, "learning_rate": 4.470401132597687e-08, "logits/chosen": -0.8895751237869263, "logits/rejected": -0.90300053358078, "logps/chosen": -820.342529296875, "logps/rejected": -1040.498779296875, "loss": 0.3141, "rewards/accuracies": 0.84375, "rewards/chosen": -4.610342025756836, "rewards/margins": 2.0794625282287598, "rewards/rejected": -6.689804553985596, "step": 632 }, { "epoch": 0.8268430075924565, "grad_norm": 22.440107167681898, "learning_rate": 4.405470312171392e-08, "logits/chosen": -0.9385513663291931, "logits/rejected": -0.9701187610626221, "logps/chosen": -816.0090942382812, "logps/rejected": -1153.3282470703125, "loss": 0.3333, "rewards/accuracies": 0.84375, "rewards/chosen": -4.639499664306641, "rewards/margins": 3.054454803466797, "rewards/rejected": -7.693953514099121, "step": 633 }, { "epoch": 0.8281492366723814, "grad_norm": 22.836475349274135, "learning_rate": 4.340968903563283e-08, "logits/chosen": -1.0385801792144775, "logits/rejected": -0.9340227842330933, "logps/chosen": -811.0709228515625, "logps/rejected": -988.4837036132812, "loss": 0.2933, "rewards/accuracies": 0.8125, "rewards/chosen": -4.569432258605957, "rewards/margins": 2.175413131713867, "rewards/rejected": -6.744844913482666, "step": 634 }, { "epoch": 0.8294554657523063, "grad_norm": 27.201658907499592, "learning_rate": 4.2768982516774495e-08, "logits/chosen": -0.9834165573120117, "logits/rejected": -1.0088675022125244, "logps/chosen": -799.6683959960938, "logps/rejected": -950.1758422851562, "loss": 0.346, "rewards/accuracies": 0.84375, "rewards/chosen": -4.314089775085449, "rewards/margins": 1.5654332637786865, "rewards/rejected": -5.879523277282715, "step": 635 }, { "epoch": 0.8307616948322312, "grad_norm": 21.147760460354423, "learning_rate": 4.213259692436366e-08, "logits/chosen": -0.9662840366363525, "logits/rejected": -1.0308414697647095, "logps/chosen": -726.1981201171875, "logps/rejected": -1077.3818359375, "loss": 0.3096, "rewards/accuracies": 1.0, "rewards/chosen": -3.9752042293548584, "rewards/margins": 2.9876835346221924, "rewards/rejected": -6.962887287139893, "step": 636 }, { "epoch": 0.8320679239121561, "grad_norm": 21.42406296057284, "learning_rate": 4.1500545527530544e-08, "logits/chosen": -0.9212138652801514, "logits/rejected": -0.926445722579956, "logps/chosen": -825.0010986328125, "logps/rejected": -1020.178955078125, "loss": 0.3342, "rewards/accuracies": 0.75, "rewards/chosen": -4.832255840301514, "rewards/margins": 2.0423848628997803, "rewards/rejected": -6.874640941619873, "step": 637 }, { "epoch": 0.833374152992081, "grad_norm": 22.437840301602577, "learning_rate": 4.087284150503381e-08, "logits/chosen": -0.8695635795593262, "logits/rejected": -0.929308295249939, "logps/chosen": -847.9984130859375, "logps/rejected": -1145.2864990234375, "loss": 0.3356, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7246928215026855, "rewards/margins": 2.388370990753174, "rewards/rejected": -7.113063812255859, "step": 638 }, { "epoch": 0.8346803820720059, "grad_norm": 30.94820697370598, "learning_rate": 4.024949794498622e-08, "logits/chosen": -1.0347599983215332, "logits/rejected": -1.0474956035614014, "logps/chosen": -764.6439208984375, "logps/rejected": -1023.9473876953125, "loss": 0.3739, "rewards/accuracies": 1.0, "rewards/chosen": -4.080129623413086, "rewards/margins": 2.6597049236297607, "rewards/rejected": -6.739834308624268, "step": 639 }, { "epoch": 0.8359866111519307, "grad_norm": 25.92041492366139, "learning_rate": 3.963052784458146e-08, "logits/chosen": -0.9137096405029297, "logits/rejected": -0.9526919722557068, "logps/chosen": -831.0147705078125, "logps/rejected": -1173.9879150390625, "loss": 0.2769, "rewards/accuracies": 0.96875, "rewards/chosen": -4.643778324127197, "rewards/margins": 3.2461047172546387, "rewards/rejected": -7.889883518218994, "step": 640 }, { "epoch": 0.8372928402318557, "grad_norm": 19.424694402625263, "learning_rate": 3.901594410982326e-08, "logits/chosen": -0.9314712285995483, "logits/rejected": -1.0186083316802979, "logps/chosen": -792.0285034179688, "logps/rejected": -1059.8748779296875, "loss": 0.3201, "rewards/accuracies": 0.8125, "rewards/chosen": -4.710291862487793, "rewards/margins": 2.3436036109924316, "rewards/rejected": -7.053895950317383, "step": 641 }, { "epoch": 0.8385990693117805, "grad_norm": 49.750792586062126, "learning_rate": 3.8405759555256156e-08, "logits/chosen": -1.0113201141357422, "logits/rejected": -1.0383992195129395, "logps/chosen": -765.2586059570312, "logps/rejected": -982.585205078125, "loss": 0.3595, "rewards/accuracies": 0.84375, "rewards/chosen": -4.2787184715271, "rewards/margins": 1.8915104866027832, "rewards/rejected": -6.170229434967041, "step": 642 }, { "epoch": 0.8399052983917055, "grad_norm": 32.60901590818127, "learning_rate": 3.779998690369857e-08, "logits/chosen": -0.9662805795669556, "logits/rejected": -1.001565933227539, "logps/chosen": -731.9835815429688, "logps/rejected": -1034.92578125, "loss": 0.3098, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9542222023010254, "rewards/margins": 2.9862449169158936, "rewards/rejected": -6.940467357635498, "step": 643 }, { "epoch": 0.8412115274716303, "grad_norm": 32.001776906392735, "learning_rate": 3.719863878597704e-08, "logits/chosen": -0.9169551134109497, "logits/rejected": -0.909413754940033, "logps/chosen": -867.5076293945312, "logps/rejected": -965.96044921875, "loss": 0.4021, "rewards/accuracies": 0.75, "rewards/chosen": -4.9577765464782715, "rewards/margins": 1.3180768489837646, "rewards/rejected": -6.275854110717773, "step": 644 }, { "epoch": 0.8425177565515553, "grad_norm": 26.775446426020242, "learning_rate": 3.660172774066339e-08, "logits/chosen": -0.9407652020454407, "logits/rejected": -0.8637470006942749, "logps/chosen": -742.7183837890625, "logps/rejected": -966.02978515625, "loss": 0.3015, "rewards/accuracies": 0.90625, "rewards/chosen": -3.942478895187378, "rewards/margins": 2.3000540733337402, "rewards/rejected": -6.242532730102539, "step": 645 }, { "epoch": 0.8438239856314801, "grad_norm": 18.452462326552563, "learning_rate": 3.600926621381306e-08, "logits/chosen": -0.9551993608474731, "logits/rejected": -0.9735285639762878, "logps/chosen": -796.7625732421875, "logps/rejected": -1045.31005859375, "loss": 0.2917, "rewards/accuracies": 0.875, "rewards/chosen": -4.553375720977783, "rewards/margins": 2.676002025604248, "rewards/rejected": -7.229377746582031, "step": 646 }, { "epoch": 0.845130214711405, "grad_norm": 21.2291726685947, "learning_rate": 3.54212665587055e-08, "logits/chosen": -0.9161040782928467, "logits/rejected": -0.8548312783241272, "logps/chosen": -755.5303955078125, "logps/rejected": -881.5941772460938, "loss": 0.3122, "rewards/accuracies": 0.84375, "rewards/chosen": -4.344512462615967, "rewards/margins": 1.6735119819641113, "rewards/rejected": -6.018024444580078, "step": 647 }, { "epoch": 0.8464364437913299, "grad_norm": 17.077302518351114, "learning_rate": 3.4837741035586816e-08, "logits/chosen": -0.9503481388092041, "logits/rejected": -1.0166294574737549, "logps/chosen": -806.5877685546875, "logps/rejected": -1030.3157958984375, "loss": 0.3193, "rewards/accuracies": 0.90625, "rewards/chosen": -4.656474590301514, "rewards/margins": 2.0568296909332275, "rewards/rejected": -6.71330451965332, "step": 648 }, { "epoch": 0.8477426728712548, "grad_norm": 21.127833153046492, "learning_rate": 3.425870181141394e-08, "logits/chosen": -0.8827044367790222, "logits/rejected": -0.9482054114341736, "logps/chosen": -739.4212036132812, "logps/rejected": -993.3704833984375, "loss": 0.3141, "rewards/accuracies": 0.875, "rewards/chosen": -4.291741371154785, "rewards/margins": 2.3153553009033203, "rewards/rejected": -6.607097625732422, "step": 649 }, { "epoch": 0.8490489019511797, "grad_norm": 18.112637343103085, "learning_rate": 3.3684160959600917e-08, "logits/chosen": -0.9638092517852783, "logits/rejected": -0.9645333290100098, "logps/chosen": -846.0302734375, "logps/rejected": -1121.314453125, "loss": 0.3411, "rewards/accuracies": 0.9375, "rewards/chosen": -5.2316718101501465, "rewards/margins": 2.5505781173706055, "rewards/rejected": -7.782249927520752, "step": 650 }, { "epoch": 0.8503551310311046, "grad_norm": 25.52167601536166, "learning_rate": 3.311413045976741e-08, "logits/chosen": -0.8650178909301758, "logits/rejected": -0.8249406218528748, "logps/chosen": -850.0011596679688, "logps/rejected": -1077.047119140625, "loss": 0.2862, "rewards/accuracies": 0.90625, "rewards/chosen": -4.9619140625, "rewards/margins": 2.3785924911499023, "rewards/rejected": -7.340506553649902, "step": 651 }, { "epoch": 0.8516613601110294, "grad_norm": 16.842773477083057, "learning_rate": 3.2548622197488744e-08, "logits/chosen": -0.9444965124130249, "logits/rejected": -0.9097779393196106, "logps/chosen": -765.1162109375, "logps/rejected": -952.4530029296875, "loss": 0.3226, "rewards/accuracies": 0.875, "rewards/chosen": -4.136512756347656, "rewards/margins": 2.2481629848480225, "rewards/rejected": -6.3846755027771, "step": 652 }, { "epoch": 0.8529675891909544, "grad_norm": 15.833969137148738, "learning_rate": 3.198764796404807e-08, "logits/chosen": -0.8982550501823425, "logits/rejected": -0.8870823383331299, "logps/chosen": -783.821044921875, "logps/rejected": -1065.482421875, "loss": 0.272, "rewards/accuracies": 0.875, "rewards/chosen": -4.595544338226318, "rewards/margins": 2.925309181213379, "rewards/rejected": -7.520853042602539, "step": 653 }, { "epoch": 0.8542738182708792, "grad_norm": 20.813983014515298, "learning_rate": 3.1431219456190563e-08, "logits/chosen": -0.7716184854507446, "logits/rejected": -0.7652902603149414, "logps/chosen": -788.2040405273438, "logps/rejected": -1052.087158203125, "loss": 0.3365, "rewards/accuracies": 0.90625, "rewards/chosen": -4.614079475402832, "rewards/margins": 2.3715004920959473, "rewards/rejected": -6.985579967498779, "step": 654 }, { "epoch": 0.8555800473508042, "grad_norm": 33.85912168424529, "learning_rate": 3.0879348275879484e-08, "logits/chosen": -0.9596065282821655, "logits/rejected": -0.9730892777442932, "logps/chosen": -832.7069091796875, "logps/rejected": -1055.71533203125, "loss": 0.4184, "rewards/accuracies": 0.84375, "rewards/chosen": -5.167130947113037, "rewards/margins": 2.034320116043091, "rewards/rejected": -7.201451301574707, "step": 655 }, { "epoch": 0.856886276430729, "grad_norm": 25.840604770233004, "learning_rate": 3.033204593005439e-08, "logits/chosen": -0.9529532194137573, "logits/rejected": -0.9935981631278992, "logps/chosen": -806.2481079101562, "logps/rejected": -1091.0167236328125, "loss": 0.3256, "rewards/accuracies": 0.875, "rewards/chosen": -4.538727760314941, "rewards/margins": 2.7176947593688965, "rewards/rejected": -7.256422519683838, "step": 656 }, { "epoch": 0.858192505510654, "grad_norm": 29.406599320225745, "learning_rate": 2.9789323830390927e-08, "logits/chosen": -1.0245752334594727, "logits/rejected": -0.9978625774383545, "logps/chosen": -851.211669921875, "logps/rejected": -1119.0037841796875, "loss": 0.3483, "rewards/accuracies": 0.9375, "rewards/chosen": -4.464425086975098, "rewards/margins": 2.873805522918701, "rewards/rejected": -7.338231086730957, "step": 657 }, { "epoch": 0.8594987345905788, "grad_norm": 19.04536365055149, "learning_rate": 2.925119329306333e-08, "logits/chosen": -0.8678416013717651, "logits/rejected": -0.8142892122268677, "logps/chosen": -808.4974365234375, "logps/rejected": -921.2274780273438, "loss": 0.3633, "rewards/accuracies": 0.8125, "rewards/chosen": -4.781768798828125, "rewards/margins": 1.3113124370574951, "rewards/rejected": -6.093081474304199, "step": 658 }, { "epoch": 0.8608049636705037, "grad_norm": 17.60296500618362, "learning_rate": 2.871766553850796e-08, "logits/chosen": -0.917082667350769, "logits/rejected": -0.9683120846748352, "logps/chosen": -802.0975952148438, "logps/rejected": -1129.573486328125, "loss": 0.2501, "rewards/accuracies": 0.8125, "rewards/chosen": -4.332143306732178, "rewards/margins": 2.751530885696411, "rewards/rejected": -7.083674907684326, "step": 659 }, { "epoch": 0.8621111927504286, "grad_norm": 21.15842626873544, "learning_rate": 2.818875169118981e-08, "logits/chosen": -0.9736424684524536, "logits/rejected": -0.971178412437439, "logps/chosen": -825.02001953125, "logps/rejected": -1002.52783203125, "loss": 0.2667, "rewards/accuracies": 0.90625, "rewards/chosen": -5.003799915313721, "rewards/margins": 1.8433326482772827, "rewards/rejected": -6.847132205963135, "step": 660 }, { "epoch": 0.8634174218303535, "grad_norm": 23.135441218561276, "learning_rate": 2.766446277937029e-08, "logits/chosen": -1.1651058197021484, "logits/rejected": -1.1745052337646484, "logps/chosen": -888.7207641601562, "logps/rejected": -1079.0030517578125, "loss": 0.3053, "rewards/accuracies": 0.8125, "rewards/chosen": -4.9126296043396, "rewards/margins": 2.0017378330230713, "rewards/rejected": -6.914368152618408, "step": 661 }, { "epoch": 0.8647236509102784, "grad_norm": 21.245688923606288, "learning_rate": 2.7144809734877316e-08, "logits/chosen": -0.9561842679977417, "logits/rejected": -0.9894252419471741, "logps/chosen": -791.2134399414062, "logps/rejected": -1127.950439453125, "loss": 0.2715, "rewards/accuracies": 0.90625, "rewards/chosen": -4.601735591888428, "rewards/margins": 3.0200610160827637, "rewards/rejected": -7.621796607971191, "step": 662 }, { "epoch": 0.8660298799902033, "grad_norm": 28.174818636350068, "learning_rate": 2.6629803392877486e-08, "logits/chosen": -0.9367572069168091, "logits/rejected": -1.0345512628555298, "logps/chosen": -845.8182983398438, "logps/rejected": -1210.5560302734375, "loss": 0.318, "rewards/accuracies": 0.96875, "rewards/chosen": -4.806478500366211, "rewards/margins": 3.0188956260681152, "rewards/rejected": -7.825375080108643, "step": 663 }, { "epoch": 0.8673361090701281, "grad_norm": 21.922625343186045, "learning_rate": 2.6119454491649845e-08, "logits/chosen": -0.8922625780105591, "logits/rejected": -0.9068939685821533, "logps/chosen": -841.5750122070312, "logps/rejected": -1101.30810546875, "loss": 0.3871, "rewards/accuracies": 0.8125, "rewards/chosen": -4.900118827819824, "rewards/margins": 2.467489719390869, "rewards/rejected": -7.367608070373535, "step": 664 }, { "epoch": 0.8686423381500531, "grad_norm": 31.04807995918663, "learning_rate": 2.5613773672362476e-08, "logits/chosen": -1.0141932964324951, "logits/rejected": -1.0032148361206055, "logps/chosen": -839.9802856445312, "logps/rejected": -1058.4979248046875, "loss": 0.3468, "rewards/accuracies": 0.875, "rewards/chosen": -4.662171363830566, "rewards/margins": 2.167586326599121, "rewards/rejected": -6.8297576904296875, "step": 665 }, { "epoch": 0.8699485672299779, "grad_norm": 28.627499460485247, "learning_rate": 2.5112771478850186e-08, "logits/chosen": -1.0315594673156738, "logits/rejected": -1.0283108949661255, "logps/chosen": -772.7771606445312, "logps/rejected": -975.7742919921875, "loss": 0.3636, "rewards/accuracies": 0.90625, "rewards/chosen": -4.486318111419678, "rewards/margins": 1.9306285381317139, "rewards/rejected": -6.4169464111328125, "step": 666 }, { "epoch": 0.8712547963099029, "grad_norm": 27.501666238776274, "learning_rate": 2.46164583573949e-08, "logits/chosen": -0.9367161989212036, "logits/rejected": -0.9585106372833252, "logps/chosen": -824.9557495117188, "logps/rejected": -1110.76123046875, "loss": 0.3143, "rewards/accuracies": 0.875, "rewards/chosen": -5.000235557556152, "rewards/margins": 2.6299326419830322, "rewards/rejected": -7.630168437957764, "step": 667 }, { "epoch": 0.8725610253898277, "grad_norm": 31.364682261474258, "learning_rate": 2.412484465650774e-08, "logits/chosen": -0.882140576839447, "logits/rejected": -0.9227953553199768, "logps/chosen": -795.4391479492188, "logps/rejected": -1053.7271728515625, "loss": 0.4831, "rewards/accuracies": 0.78125, "rewards/chosen": -4.803238868713379, "rewards/margins": 2.2624800205230713, "rewards/rejected": -7.065718650817871, "step": 668 }, { "epoch": 0.8738672544697527, "grad_norm": 23.304091321744853, "learning_rate": 2.3637940626713342e-08, "logits/chosen": -1.0385204553604126, "logits/rejected": -1.0561144351959229, "logps/chosen": -851.8370361328125, "logps/rejected": -1017.2864990234375, "loss": 0.3164, "rewards/accuracies": 0.84375, "rewards/chosen": -5.111514568328857, "rewards/margins": 1.816351056098938, "rewards/rejected": -6.927865505218506, "step": 669 }, { "epoch": 0.8751734835496775, "grad_norm": 23.269790245201648, "learning_rate": 2.315575642033604e-08, "logits/chosen": -1.024946928024292, "logits/rejected": -1.0573128461837769, "logps/chosen": -793.3582763671875, "logps/rejected": -1110.23876953125, "loss": 0.2668, "rewards/accuracies": 0.96875, "rewards/chosen": -4.664315223693848, "rewards/margins": 3.121136426925659, "rewards/rejected": -7.785451889038086, "step": 670 }, { "epoch": 0.8764797126296024, "grad_norm": 30.247532810977688, "learning_rate": 2.2678302091288155e-08, "logits/chosen": -1.0933949947357178, "logits/rejected": -1.0754190683364868, "logps/chosen": -805.4337768554688, "logps/rejected": -1031.197021484375, "loss": 0.3132, "rewards/accuracies": 0.8125, "rewards/chosen": -4.566261291503906, "rewards/margins": 2.2215776443481445, "rewards/rejected": -6.787838935852051, "step": 671 }, { "epoch": 0.8777859417095273, "grad_norm": 107.71053234226156, "learning_rate": 2.2205587594860463e-08, "logits/chosen": -0.8631647229194641, "logits/rejected": -0.8830554485321045, "logps/chosen": -810.8700561523438, "logps/rejected": -1053.4078369140625, "loss": 0.3621, "rewards/accuracies": 0.8125, "rewards/chosen": -5.119283676147461, "rewards/margins": 2.2090401649475098, "rewards/rejected": -7.328324317932129, "step": 672 }, { "epoch": 0.8790921707894522, "grad_norm": 30.991850825647404, "learning_rate": 2.1737622787514593e-08, "logits/chosen": -0.813284695148468, "logits/rejected": -0.8452885150909424, "logps/chosen": -796.1976318359375, "logps/rejected": -997.8446044921875, "loss": 0.4298, "rewards/accuracies": 0.78125, "rewards/chosen": -5.063453674316406, "rewards/margins": 1.775707721710205, "rewards/rejected": -6.8391618728637695, "step": 673 }, { "epoch": 0.8803983998693771, "grad_norm": 22.463846231847736, "learning_rate": 2.1274417426677514e-08, "logits/chosen": -0.8230746984481812, "logits/rejected": -0.8600046038627625, "logps/chosen": -820.2351684570312, "logps/rejected": -1084.288818359375, "loss": 0.3613, "rewards/accuracies": 0.84375, "rewards/chosen": -4.950501918792725, "rewards/margins": 2.2537648677825928, "rewards/rejected": -7.204266548156738, "step": 674 }, { "epoch": 0.881704628949302, "grad_norm": 21.4399416595423, "learning_rate": 2.081598117053801e-08, "logits/chosen": -0.9868246912956238, "logits/rejected": -1.0269767045974731, "logps/chosen": -844.5371704101562, "logps/rejected": -1028.14501953125, "loss": 0.3512, "rewards/accuracies": 0.84375, "rewards/chosen": -4.722963809967041, "rewards/margins": 1.6241395473480225, "rewards/rejected": -6.347103595733643, "step": 675 }, { "epoch": 0.8830108580292269, "grad_norm": 23.258634574631902, "learning_rate": 2.0362323577845424e-08, "logits/chosen": -0.9350968599319458, "logits/rejected": -0.9319708347320557, "logps/chosen": -831.6373291015625, "logps/rejected": -1064.093505859375, "loss": 0.3229, "rewards/accuracies": 0.90625, "rewards/chosen": -4.654766082763672, "rewards/margins": 2.459113836288452, "rewards/rejected": -7.113880157470703, "step": 676 }, { "epoch": 0.8843170871091518, "grad_norm": 17.971275755186788, "learning_rate": 1.991345410771017e-08, "logits/chosen": -0.9765560626983643, "logits/rejected": -1.001975655555725, "logps/chosen": -934.716796875, "logps/rejected": -1197.37109375, "loss": 0.3025, "rewards/accuracies": 0.84375, "rewards/chosen": -5.539895057678223, "rewards/margins": 2.42952823638916, "rewards/rejected": -7.969422817230225, "step": 677 }, { "epoch": 0.8856233161890766, "grad_norm": 28.491347745271142, "learning_rate": 1.9469382119406714e-08, "logits/chosen": -0.9596845507621765, "logits/rejected": -0.8702284097671509, "logps/chosen": -802.1336669921875, "logps/rejected": -1096.5390625, "loss": 0.2762, "rewards/accuracies": 0.9375, "rewards/chosen": -4.722399711608887, "rewards/margins": 3.0645108222961426, "rewards/rejected": -7.7869110107421875, "step": 678 }, { "epoch": 0.8869295452690016, "grad_norm": 32.96394591000798, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -0.9617183804512024, "logits/rejected": -1.048295021057129, "logps/chosen": -864.1490478515625, "logps/rejected": -1119.6983642578125, "loss": 0.4001, "rewards/accuracies": 0.75, "rewards/chosen": -5.045253753662109, "rewards/margins": 1.980230689048767, "rewards/rejected": -7.025485038757324, "step": 679 }, { "epoch": 0.8882357743489264, "grad_norm": 21.241499611321817, "learning_rate": 1.8595667525043963e-08, "logits/chosen": -0.9886503219604492, "logits/rejected": -0.9136042594909668, "logps/chosen": -812.7432861328125, "logps/rejected": -1024.8572998046875, "loss": 0.3109, "rewards/accuracies": 0.84375, "rewards/chosen": -4.500929832458496, "rewards/margins": 2.2964494228363037, "rewards/rejected": -6.797379016876221, "step": 680 }, { "epoch": 0.8895420034288514, "grad_norm": 23.122708193972542, "learning_rate": 1.816604313660741e-08, "logits/chosen": -1.0124708414077759, "logits/rejected": -1.0051053762435913, "logps/chosen": -804.603271484375, "logps/rejected": -1008.0371704101562, "loss": 0.3708, "rewards/accuracies": 0.90625, "rewards/chosen": -4.575305461883545, "rewards/margins": 2.1409244537353516, "rewards/rejected": -6.716229438781738, "step": 681 }, { "epoch": 0.8908482325087762, "grad_norm": 20.451259814838977, "learning_rate": 1.7741252664868312e-08, "logits/chosen": -0.8169146180152893, "logits/rejected": -0.8272385597229004, "logps/chosen": -828.5379638671875, "logps/rejected": -1150.41943359375, "loss": 0.34, "rewards/accuracies": 0.90625, "rewards/chosen": -5.057079315185547, "rewards/margins": 2.8700027465820312, "rewards/rejected": -7.927082061767578, "step": 682 }, { "epoch": 0.8921544615887012, "grad_norm": 30.63702941806789, "learning_rate": 1.7321304967035487e-08, "logits/chosen": -0.9371048808097839, "logits/rejected": -0.8836950063705444, "logps/chosen": -878.0043334960938, "logps/rejected": -1135.9581298828125, "loss": 0.325, "rewards/accuracies": 0.875, "rewards/chosen": -4.758838653564453, "rewards/margins": 2.4666881561279297, "rewards/rejected": -7.225526809692383, "step": 683 }, { "epoch": 0.893460690668626, "grad_norm": 19.813773854758352, "learning_rate": 1.6906208799342014e-08, "logits/chosen": -0.9349536895751953, "logits/rejected": -0.9600227475166321, "logps/chosen": -775.5115966796875, "logps/rejected": -1027.392578125, "loss": 0.3199, "rewards/accuracies": 0.90625, "rewards/chosen": -4.448516368865967, "rewards/margins": 2.41306734085083, "rewards/rejected": -6.861583709716797, "step": 684 }, { "epoch": 0.8947669197485509, "grad_norm": 29.17870420577452, "learning_rate": 1.649597281686302e-08, "logits/chosen": -0.8184996247291565, "logits/rejected": -0.9192086458206177, "logps/chosen": -812.7239990234375, "logps/rejected": -1163.47412109375, "loss": 0.3899, "rewards/accuracies": 0.71875, "rewards/chosen": -4.799856662750244, "rewards/margins": 2.99330997467041, "rewards/rejected": -7.793166637420654, "step": 685 }, { "epoch": 0.8960731488284758, "grad_norm": 20.560672395597233, "learning_rate": 1.6090605573334915e-08, "logits/chosen": -0.8407419919967651, "logits/rejected": -0.9089653491973877, "logps/chosen": -818.4083862304688, "logps/rejected": -1127.5633544921875, "loss": 0.2907, "rewards/accuracies": 0.875, "rewards/chosen": -4.807866096496582, "rewards/margins": 2.694227457046509, "rewards/rejected": -7.502093315124512, "step": 686 }, { "epoch": 0.8973793779084007, "grad_norm": 16.610603081015654, "learning_rate": 1.569011552097718e-08, "logits/chosen": -0.8607504963874817, "logits/rejected": -0.88968825340271, "logps/chosen": -860.8826293945312, "logps/rejected": -1173.7161865234375, "loss": 0.2899, "rewards/accuracies": 0.875, "rewards/chosen": -5.202151775360107, "rewards/margins": 2.443506956100464, "rewards/rejected": -7.645658493041992, "step": 687 }, { "epoch": 0.8986856069883256, "grad_norm": 30.987286097239437, "learning_rate": 1.5294511010316145e-08, "logits/chosen": -0.8558975458145142, "logits/rejected": -0.8481131792068481, "logps/chosen": -832.5034790039062, "logps/rejected": -972.0008544921875, "loss": 0.4205, "rewards/accuracies": 0.78125, "rewards/chosen": -4.9938507080078125, "rewards/margins": 1.4612228870391846, "rewards/rejected": -6.455073356628418, "step": 688 }, { "epoch": 0.8999918360682505, "grad_norm": 32.486020006493476, "learning_rate": 1.4903800290010815e-08, "logits/chosen": -0.9570955038070679, "logits/rejected": -0.9459174871444702, "logps/chosen": -849.517333984375, "logps/rejected": -1078.22314453125, "loss": 0.4155, "rewards/accuracies": 0.8125, "rewards/chosen": -4.753889083862305, "rewards/margins": 2.205451488494873, "rewards/rejected": -6.959341049194336, "step": 689 }, { "epoch": 0.9012980651481753, "grad_norm": 25.45059241483055, "learning_rate": 1.4517991506680761e-08, "logits/chosen": -0.9805968999862671, "logits/rejected": -1.0026856660842896, "logps/chosen": -796.7140502929688, "logps/rejected": -1002.83447265625, "loss": 0.3279, "rewards/accuracies": 0.78125, "rewards/chosen": -4.365528583526611, "rewards/margins": 2.011324167251587, "rewards/rejected": -6.376852512359619, "step": 690 }, { "epoch": 0.9026042942281003, "grad_norm": 20.08399102951379, "learning_rate": 1.4137092704736564e-08, "logits/chosen": -1.0272362232208252, "logits/rejected": -1.046217679977417, "logps/chosen": -896.72314453125, "logps/rejected": -1169.2354736328125, "loss": 0.2868, "rewards/accuracies": 0.875, "rewards/chosen": -5.155872344970703, "rewards/margins": 2.6793265342712402, "rewards/rejected": -7.835198402404785, "step": 691 }, { "epoch": 0.9039105233080251, "grad_norm": 19.737152866013854, "learning_rate": 1.3761111826211813e-08, "logits/chosen": -0.884811282157898, "logits/rejected": -0.93790203332901, "logps/chosen": -825.8671875, "logps/rejected": -1213.501220703125, "loss": 0.3309, "rewards/accuracies": 0.96875, "rewards/chosen": -4.409741401672363, "rewards/margins": 3.5079073905944824, "rewards/rejected": -7.917649269104004, "step": 692 }, { "epoch": 0.9052167523879501, "grad_norm": 24.701952490736144, "learning_rate": 1.3390056710597647e-08, "logits/chosen": -1.0386333465576172, "logits/rejected": -1.0525237321853638, "logps/chosen": -774.3814697265625, "logps/rejected": -959.9278564453125, "loss": 0.3527, "rewards/accuracies": 0.90625, "rewards/chosen": -4.583800315856934, "rewards/margins": 1.6484012603759766, "rewards/rejected": -6.23220157623291, "step": 693 }, { "epoch": 0.9065229814678749, "grad_norm": 25.601346480134133, "learning_rate": 1.302393509467925e-08, "logits/chosen": -1.007061243057251, "logits/rejected": -1.0328550338745117, "logps/chosen": -828.13525390625, "logps/rejected": -1115.3458251953125, "loss": 0.3754, "rewards/accuracies": 0.90625, "rewards/chosen": -5.077081680297852, "rewards/margins": 2.570059299468994, "rewards/rejected": -7.6471405029296875, "step": 694 }, { "epoch": 0.9078292105477999, "grad_norm": 22.751192382217827, "learning_rate": 1.2662754612374482e-08, "logits/chosen": -1.1416237354278564, "logits/rejected": -1.1116670370101929, "logps/chosen": -862.865966796875, "logps/rejected": -1017.9955444335938, "loss": 0.3109, "rewards/accuracies": 0.78125, "rewards/chosen": -4.581521511077881, "rewards/margins": 1.9770212173461914, "rewards/rejected": -6.5585432052612305, "step": 695 }, { "epoch": 0.9091354396277247, "grad_norm": 29.147918711392688, "learning_rate": 1.2306522794574864e-08, "logits/chosen": -0.9129850268363953, "logits/rejected": -0.9268524646759033, "logps/chosen": -758.1630249023438, "logps/rejected": -1032.794677734375, "loss": 0.3501, "rewards/accuracies": 0.9375, "rewards/chosen": -4.635600566864014, "rewards/margins": 2.712730646133423, "rewards/rejected": -7.348330974578857, "step": 696 }, { "epoch": 0.9104416687076496, "grad_norm": 18.5578048726823, "learning_rate": 1.195524706898826e-08, "logits/chosen": -0.959962010383606, "logits/rejected": -0.9326778650283813, "logps/chosen": -780.856689453125, "logps/rejected": -976.901611328125, "loss": 0.2673, "rewards/accuracies": 0.875, "rewards/chosen": -4.898529529571533, "rewards/margins": 2.022247076034546, "rewards/rejected": -6.9207763671875, "step": 697 }, { "epoch": 0.9117478977875745, "grad_norm": 19.744948826838282, "learning_rate": 1.1608934759984424e-08, "logits/chosen": -0.9286206960678101, "logits/rejected": -1.01764976978302, "logps/chosen": -881.43896484375, "logps/rejected": -1238.6363525390625, "loss": 0.3231, "rewards/accuracies": 0.71875, "rewards/chosen": -5.401637554168701, "rewards/margins": 2.623457908630371, "rewards/rejected": -8.025094985961914, "step": 698 }, { "epoch": 0.9130541268674994, "grad_norm": 32.792459109040685, "learning_rate": 1.1267593088441884e-08, "logits/chosen": -0.8476910591125488, "logits/rejected": -0.8974804282188416, "logps/chosen": -830.3240356445312, "logps/rejected": -1020.6620483398438, "loss": 0.3228, "rewards/accuracies": 0.78125, "rewards/chosen": -4.793473243713379, "rewards/margins": 2.098773956298828, "rewards/rejected": -6.892247200012207, "step": 699 }, { "epoch": 0.9143603559474243, "grad_norm": 16.358095159369643, "learning_rate": 1.0931229171597583e-08, "logits/chosen": -0.9544919729232788, "logits/rejected": -0.9292432069778442, "logps/chosen": -915.648681640625, "logps/rejected": -1094.630126953125, "loss": 0.3251, "rewards/accuracies": 0.71875, "rewards/chosen": -5.481724262237549, "rewards/margins": 2.09407377243042, "rewards/rejected": -7.575798034667969, "step": 700 }, { "epoch": 0.9143603559474243, "eval_logits/chosen": -0.8042228817939758, "eval_logits/rejected": -0.8119060397148132, "eval_logps/chosen": -827.6954345703125, "eval_logps/rejected": -1104.072998046875, "eval_loss": 0.33346831798553467, "eval_rewards/accuracies": 0.8880000114440918, "eval_rewards/chosen": -4.836644649505615, "eval_rewards/margins": 2.7027976512908936, "eval_rewards/rejected": -7.53944206237793, "eval_runtime": 304.3609, "eval_samples_per_second": 6.571, "eval_steps_per_second": 0.411, "step": 700 }, { "epoch": 0.9156665850273492, "grad_norm": 45.78976016140901, "learning_rate": 1.0599850022898537e-08, "logits/chosen": -0.7225776314735413, "logits/rejected": -0.6860483884811401, "logps/chosen": -801.6976318359375, "logps/rejected": -1225.3739013671875, "loss": 0.2817, "rewards/accuracies": 0.90625, "rewards/chosen": -4.951022624969482, "rewards/margins": 3.52390193939209, "rewards/rejected": -8.47492504119873, "step": 701 }, { "epoch": 0.916972814107274, "grad_norm": 25.229183577535213, "learning_rate": 1.0273462551855295e-08, "logits/chosen": -0.9126315712928772, "logits/rejected": -0.9276062250137329, "logps/chosen": -742.0897216796875, "logps/rejected": -975.28125, "loss": 0.365, "rewards/accuracies": 0.875, "rewards/chosen": -4.212459564208984, "rewards/margins": 2.135211229324341, "rewards/rejected": -6.347670555114746, "step": 702 }, { "epoch": 0.918279043187199, "grad_norm": 18.00129692277072, "learning_rate": 9.952073563898322e-09, "logits/chosen": -0.9886517524719238, "logits/rejected": -1.0185874700546265, "logps/chosen": -852.1192626953125, "logps/rejected": -1145.8603515625, "loss": 0.3242, "rewards/accuracies": 0.90625, "rewards/chosen": -5.017488479614258, "rewards/margins": 2.6713109016418457, "rewards/rejected": -7.688799858093262, "step": 703 }, { "epoch": 0.9195852722671238, "grad_norm": 31.544292267398593, "learning_rate": 9.635689760235682e-09, "logits/chosen": -0.9542191624641418, "logits/rejected": -0.9516808986663818, "logps/chosen": -867.5074462890625, "logps/rejected": -1126.504638671875, "loss": 0.4693, "rewards/accuracies": 0.78125, "rewards/chosen": -5.200738430023193, "rewards/margins": 2.536679983139038, "rewards/rejected": -7.7374186515808105, "step": 704 }, { "epoch": 0.9208915013470488, "grad_norm": 46.72515936026558, "learning_rate": 9.324317737713555e-09, "logits/chosen": -0.9351953864097595, "logits/rejected": -0.9409655928611755, "logps/chosen": -792.6817016601562, "logps/rejected": -1014.02685546875, "loss": 0.3214, "rewards/accuracies": 0.90625, "rewards/chosen": -4.412503242492676, "rewards/margins": 2.4800729751586914, "rewards/rejected": -6.892575740814209, "step": 705 }, { "epoch": 0.9221977304269736, "grad_norm": 23.61367143969372, "learning_rate": 9.017963988678601e-09, "logits/chosen": -1.03610360622406, "logits/rejected": -0.990273654460907, "logps/chosen": -841.663818359375, "logps/rejected": -1107.4656982421875, "loss": 0.3304, "rewards/accuracies": 0.90625, "rewards/chosen": -4.593728065490723, "rewards/margins": 2.6971254348754883, "rewards/rejected": -7.290853500366211, "step": 706 }, { "epoch": 0.9235039595068986, "grad_norm": 23.281354565557432, "learning_rate": 8.716634900842651e-09, "logits/chosen": -0.9835007786750793, "logits/rejected": -0.9869141578674316, "logps/chosen": -792.2289428710938, "logps/rejected": -1047.859375, "loss": 0.3497, "rewards/accuracies": 0.90625, "rewards/chosen": -4.580854892730713, "rewards/margins": 2.6027719974517822, "rewards/rejected": -7.183626174926758, "step": 707 }, { "epoch": 0.9248101885868234, "grad_norm": 24.611145520672583, "learning_rate": 8.420336757149454e-09, "logits/chosen": -1.1103384494781494, "logits/rejected": -1.1257877349853516, "logps/chosen": -846.7141723632812, "logps/rejected": -1046.1112060546875, "loss": 0.397, "rewards/accuracies": 0.84375, "rewards/chosen": -4.437961101531982, "rewards/margins": 2.0100278854370117, "rewards/rejected": -6.447989463806152, "step": 708 }, { "epoch": 0.9261164176667483, "grad_norm": 29.316192799394898, "learning_rate": 8.129075735643698e-09, "logits/chosen": -0.9531034827232361, "logits/rejected": -0.8710059523582458, "logps/chosen": -958.839599609375, "logps/rejected": -1179.770751953125, "loss": 0.3341, "rewards/accuracies": 0.84375, "rewards/chosen": -5.782157897949219, "rewards/margins": 2.180623769760132, "rewards/rejected": -7.96278190612793, "step": 709 }, { "epoch": 0.9274226467466732, "grad_norm": 67.50239470619628, "learning_rate": 7.842857909342165e-09, "logits/chosen": -0.8792818188667297, "logits/rejected": -0.8596967458724976, "logps/chosen": -762.2716064453125, "logps/rejected": -981.783447265625, "loss": 0.3203, "rewards/accuracies": 0.875, "rewards/chosen": -4.564942359924316, "rewards/margins": 2.2731096744537354, "rewards/rejected": -6.838052272796631, "step": 710 }, { "epoch": 0.9287288758265981, "grad_norm": 22.773314772539816, "learning_rate": 7.561689246107145e-09, "logits/chosen": -0.9148231744766235, "logits/rejected": -0.9793925881385803, "logps/chosen": -750.6596069335938, "logps/rejected": -1137.5262451171875, "loss": 0.3189, "rewards/accuracies": 0.875, "rewards/chosen": -4.167259216308594, "rewards/margins": 3.1219406127929688, "rewards/rejected": -7.2891998291015625, "step": 711 }, { "epoch": 0.930035104906523, "grad_norm": 22.30274318697143, "learning_rate": 7.2855756085219714e-09, "logits/chosen": -0.8467492461204529, "logits/rejected": -0.8688889741897583, "logps/chosen": -749.0939331054688, "logps/rejected": -948.42626953125, "loss": 0.3543, "rewards/accuracies": 0.875, "rewards/chosen": -4.155250072479248, "rewards/margins": 2.0990848541259766, "rewards/rejected": -6.254334449768066, "step": 712 }, { "epoch": 0.9313413339864479, "grad_norm": 22.012504399894368, "learning_rate": 7.014522753768848e-09, "logits/chosen": -0.977868914604187, "logits/rejected": -1.0833169221878052, "logps/chosen": -883.92529296875, "logps/rejected": -1315.0213623046875, "loss": 0.2862, "rewards/accuracies": 0.875, "rewards/chosen": -5.060840606689453, "rewards/margins": 3.245311975479126, "rewards/rejected": -8.306153297424316, "step": 713 }, { "epoch": 0.9326475630663728, "grad_norm": 18.190681387753177, "learning_rate": 6.7485363335087475e-09, "logits/chosen": -0.8860818147659302, "logits/rejected": -0.9461392164230347, "logps/chosen": -795.0383911132812, "logps/rejected": -1020.0634765625, "loss": 0.3417, "rewards/accuracies": 0.8125, "rewards/chosen": -5.095912933349609, "rewards/margins": 2.0163002014160156, "rewards/rejected": -7.112213134765625, "step": 714 }, { "epoch": 0.9339537921462977, "grad_norm": 26.781074891011134, "learning_rate": 6.4876218937634786e-09, "logits/chosen": -0.7991393208503723, "logits/rejected": -0.7921952605247498, "logps/chosen": -789.0760498046875, "logps/rejected": -1029.6826171875, "loss": 0.3409, "rewards/accuracies": 0.875, "rewards/chosen": -4.609364032745361, "rewards/margins": 2.325976610183716, "rewards/rejected": -6.93533992767334, "step": 715 }, { "epoch": 0.9352600212262225, "grad_norm": 23.807291079941198, "learning_rate": 6.231784874800306e-09, "logits/chosen": -0.9109543561935425, "logits/rejected": -0.9539563655853271, "logps/chosen": -866.422607421875, "logps/rejected": -1126.484130859375, "loss": 0.3146, "rewards/accuracies": 0.90625, "rewards/chosen": -4.959641456604004, "rewards/margins": 2.240410566329956, "rewards/rejected": -7.200052261352539, "step": 716 }, { "epoch": 0.9365662503061475, "grad_norm": 26.79365637604462, "learning_rate": 5.981030611018234e-09, "logits/chosen": -0.8974629640579224, "logits/rejected": -0.9451881647109985, "logps/chosen": -748.3617553710938, "logps/rejected": -979.998291015625, "loss": 0.2984, "rewards/accuracies": 0.8125, "rewards/chosen": -4.526296615600586, "rewards/margins": 1.7230241298675537, "rewards/rejected": -6.2493205070495605, "step": 717 }, { "epoch": 0.9378724793860723, "grad_norm": 24.570663697903854, "learning_rate": 5.735364330836906e-09, "logits/chosen": -0.8672972321510315, "logits/rejected": -0.9144647717475891, "logps/chosen": -848.5457763671875, "logps/rejected": -1264.940185546875, "loss": 0.2165, "rewards/accuracies": 1.0, "rewards/chosen": -5.055003643035889, "rewards/margins": 3.4090349674224854, "rewards/rejected": -8.464038848876953, "step": 718 }, { "epoch": 0.9391787084659973, "grad_norm": 15.934773179952451, "learning_rate": 5.494791156587686e-09, "logits/chosen": -1.0396963357925415, "logits/rejected": -1.020418405532837, "logps/chosen": -749.0263671875, "logps/rejected": -883.5203857421875, "loss": 0.339, "rewards/accuracies": 0.84375, "rewards/chosen": -4.272754669189453, "rewards/margins": 1.393311858177185, "rewards/rejected": -5.666066646575928, "step": 719 }, { "epoch": 0.9404849375459221, "grad_norm": 20.2536773624075, "learning_rate": 5.259316104406636e-09, "logits/chosen": -0.9552453756332397, "logits/rejected": -0.940865159034729, "logps/chosen": -834.688232421875, "logps/rejected": -1037.806396484375, "loss": 0.3436, "rewards/accuracies": 0.8125, "rewards/chosen": -4.794745445251465, "rewards/margins": 2.015627384185791, "rewards/rejected": -6.810372829437256, "step": 720 }, { "epoch": 0.941791166625847, "grad_norm": 53.84085219336435, "learning_rate": 5.028944084130155e-09, "logits/chosen": -0.914104163646698, "logits/rejected": -0.9181921482086182, "logps/chosen": -845.41455078125, "logps/rejected": -1107.7197265625, "loss": 0.3146, "rewards/accuracies": 0.90625, "rewards/chosen": -4.820213794708252, "rewards/margins": 2.7967965602874756, "rewards/rejected": -7.61700963973999, "step": 721 }, { "epoch": 0.9430973957057719, "grad_norm": 20.32358933591622, "learning_rate": 4.803679899192392e-09, "logits/chosen": -1.017437219619751, "logits/rejected": -0.9803633093833923, "logps/chosen": -998.7100830078125, "logps/rejected": -1228.1575927734375, "loss": 0.3148, "rewards/accuracies": 0.875, "rewards/chosen": -5.515434741973877, "rewards/margins": 2.422970771789551, "rewards/rejected": -7.938405513763428, "step": 722 }, { "epoch": 0.9444036247856968, "grad_norm": 26.226455627687997, "learning_rate": 4.5835282465252476e-09, "logits/chosen": -0.8738875985145569, "logits/rejected": -0.9390866756439209, "logps/chosen": -794.534423828125, "logps/rejected": -1080.243896484375, "loss": 0.2523, "rewards/accuracies": 0.875, "rewards/chosen": -4.412847518920898, "rewards/margins": 2.8086698055267334, "rewards/rejected": -7.221517562866211, "step": 723 }, { "epoch": 0.9457098538656217, "grad_norm": 33.10596601901346, "learning_rate": 4.368493716460392e-09, "logits/chosen": -0.9799577593803406, "logits/rejected": -0.9760726094245911, "logps/chosen": -836.212158203125, "logps/rejected": -1077.455322265625, "loss": 0.3313, "rewards/accuracies": 0.90625, "rewards/chosen": -4.865520477294922, "rewards/margins": 2.487382411956787, "rewards/rejected": -7.352903366088867, "step": 724 }, { "epoch": 0.9470160829455466, "grad_norm": 21.851274725696168, "learning_rate": 4.158580792633482e-09, "logits/chosen": -1.0691559314727783, "logits/rejected": -0.9570724964141846, "logps/chosen": -888.2042846679688, "logps/rejected": -1064.596923828125, "loss": 0.3111, "rewards/accuracies": 0.84375, "rewards/chosen": -4.672724723815918, "rewards/margins": 2.358022451400757, "rewards/rejected": -7.030746936798096, "step": 725 }, { "epoch": 0.9483223120254715, "grad_norm": 38.81636628127586, "learning_rate": 3.953793851890791e-09, "logits/chosen": -0.9587030410766602, "logits/rejected": -0.9567082524299622, "logps/chosen": -809.8760986328125, "logps/rejected": -1075.88916015625, "loss": 0.2958, "rewards/accuracies": 0.8125, "rewards/chosen": -4.480008602142334, "rewards/margins": 2.473860740661621, "rewards/rejected": -6.953869819641113, "step": 726 }, { "epoch": 0.9496285411053964, "grad_norm": 24.374217026089934, "learning_rate": 3.754137164197923e-09, "logits/chosen": -0.9956728219985962, "logits/rejected": -0.962954044342041, "logps/chosen": -934.9635009765625, "logps/rejected": -1057.8211669921875, "loss": 0.3842, "rewards/accuracies": 0.78125, "rewards/chosen": -5.475031852722168, "rewards/margins": 1.5260260105133057, "rewards/rejected": -7.0010576248168945, "step": 727 }, { "epoch": 0.9509347701853212, "grad_norm": 21.760779525811405, "learning_rate": 3.559614892550661e-09, "logits/chosen": -0.7743152976036072, "logits/rejected": -0.794499397277832, "logps/chosen": -774.2794799804688, "logps/rejected": -1053.2183837890625, "loss": 0.3013, "rewards/accuracies": 0.875, "rewards/chosen": -4.855001926422119, "rewards/margins": 2.7619917392730713, "rewards/rejected": -7.6169939041137695, "step": 728 }, { "epoch": 0.9522409992652462, "grad_norm": 27.51247655254993, "learning_rate": 3.370231092888365e-09, "logits/chosen": -0.9370498657226562, "logits/rejected": -0.9787413477897644, "logps/chosen": -823.4442749023438, "logps/rejected": -1096.9632568359375, "loss": 0.313, "rewards/accuracies": 0.875, "rewards/chosen": -4.691824913024902, "rewards/margins": 2.6661367416381836, "rewards/rejected": -7.357962131500244, "step": 729 }, { "epoch": 0.953547228345171, "grad_norm": 22.64921257852498, "learning_rate": 3.185989714009185e-09, "logits/chosen": -0.8871971368789673, "logits/rejected": -0.9147964715957642, "logps/chosen": -774.44921875, "logps/rejected": -1189.8350830078125, "loss": 0.2955, "rewards/accuracies": 0.90625, "rewards/chosen": -4.431604385375977, "rewards/margins": 3.59488582611084, "rewards/rejected": -8.026490211486816, "step": 730 }, { "epoch": 0.954853457425096, "grad_norm": 38.98489185084576, "learning_rate": 3.0068945974878744e-09, "logits/chosen": -0.9947443604469299, "logits/rejected": -0.9405471682548523, "logps/chosen": -875.498291015625, "logps/rejected": -1069.519287109375, "loss": 0.2867, "rewards/accuracies": 0.875, "rewards/chosen": -5.040494918823242, "rewards/margins": 2.050215005874634, "rewards/rejected": -7.090709686279297, "step": 731 }, { "epoch": 0.9561596865050208, "grad_norm": 24.62642376244049, "learning_rate": 2.8329494775956862e-09, "logits/chosen": -1.1279411315917969, "logits/rejected": -1.147200584411621, "logps/chosen": -838.68359375, "logps/rejected": -1042.97412109375, "loss": 0.2784, "rewards/accuracies": 0.84375, "rewards/chosen": -4.729671478271484, "rewards/margins": 2.1291439533233643, "rewards/rejected": -6.858816146850586, "step": 732 }, { "epoch": 0.9574659155849458, "grad_norm": 28.94900881890447, "learning_rate": 2.664157981222437e-09, "logits/chosen": -0.9128333330154419, "logits/rejected": -0.9717162847518921, "logps/chosen": -858.6765747070312, "logps/rejected": -1108.43115234375, "loss": 0.3688, "rewards/accuracies": 0.84375, "rewards/chosen": -5.196346759796143, "rewards/margins": 2.261138916015625, "rewards/rejected": -7.457486152648926, "step": 733 }, { "epoch": 0.9587721446648706, "grad_norm": 29.009131812279637, "learning_rate": 2.5005236278009546e-09, "logits/chosen": -0.973136305809021, "logits/rejected": -0.9964556694030762, "logps/chosen": -806.8096923828125, "logps/rejected": -1122.0406494140625, "loss": 0.3474, "rewards/accuracies": 0.90625, "rewards/chosen": -4.701882839202881, "rewards/margins": 2.8338756561279297, "rewards/rejected": -7.5357584953308105, "step": 734 }, { "epoch": 0.9600783737447955, "grad_norm": 19.15685039048966, "learning_rate": 2.342049829233611e-09, "logits/chosen": -0.8886818885803223, "logits/rejected": -0.9232064485549927, "logps/chosen": -770.3201293945312, "logps/rejected": -1065.265869140625, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": -4.26518440246582, "rewards/margins": 3.126837730407715, "rewards/rejected": -7.392023086547852, "step": 735 }, { "epoch": 0.9613846028247204, "grad_norm": 29.54789803184001, "learning_rate": 2.188739889821267e-09, "logits/chosen": -0.9659155607223511, "logits/rejected": -0.9455364942550659, "logps/chosen": -881.7896118164062, "logps/rejected": -1096.7828369140625, "loss": 0.3206, "rewards/accuracies": 0.9375, "rewards/chosen": -5.22705078125, "rewards/margins": 2.3798322677612305, "rewards/rejected": -7.606882095336914, "step": 736 }, { "epoch": 0.9626908319046453, "grad_norm": 28.345744890500455, "learning_rate": 2.0405970061943003e-09, "logits/chosen": -0.852986752986908, "logits/rejected": -0.8692293167114258, "logps/chosen": -796.1908569335938, "logps/rejected": -921.0655517578125, "loss": 0.3987, "rewards/accuracies": 0.75, "rewards/chosen": -4.667433738708496, "rewards/margins": 1.1772699356079102, "rewards/rejected": -5.844703674316406, "step": 737 }, { "epoch": 0.9639970609845702, "grad_norm": 22.909021102233012, "learning_rate": 1.897624267246073e-09, "logits/chosen": -0.9738621115684509, "logits/rejected": -1.0005155801773071, "logps/chosen": -783.131103515625, "logps/rejected": -1031.7144775390625, "loss": 0.2402, "rewards/accuracies": 0.90625, "rewards/chosen": -4.542294502258301, "rewards/margins": 2.2111833095550537, "rewards/rejected": -6.753478050231934, "step": 738 }, { "epoch": 0.9653032900644951, "grad_norm": 30.905899100885105, "learning_rate": 1.7598246540683481e-09, "logits/chosen": -0.9035788774490356, "logits/rejected": -0.891940712928772, "logps/chosen": -839.0357666015625, "logps/rejected": -1132.1015625, "loss": 0.3277, "rewards/accuracies": 0.8125, "rewards/chosen": -4.749287128448486, "rewards/margins": 3.2719829082489014, "rewards/rejected": -8.021270751953125, "step": 739 }, { "epoch": 0.96660951914442, "grad_norm": 20.962666279736965, "learning_rate": 1.6272010398893088e-09, "logits/chosen": -1.1005297899246216, "logits/rejected": -1.0309395790100098, "logps/chosen": -900.8092041015625, "logps/rejected": -1103.6551513671875, "loss": 0.2369, "rewards/accuracies": 0.9375, "rewards/chosen": -5.278766632080078, "rewards/margins": 2.1895079612731934, "rewards/rejected": -7.4682745933532715, "step": 740 }, { "epoch": 0.9679157482243449, "grad_norm": 29.86451006227898, "learning_rate": 1.4997561900135236e-09, "logits/chosen": -1.0119831562042236, "logits/rejected": -1.0092520713806152, "logps/chosen": -885.3237915039062, "logps/rejected": -1074.3250732421875, "loss": 0.3015, "rewards/accuracies": 0.875, "rewards/chosen": -5.477841377258301, "rewards/margins": 1.7145339250564575, "rewards/rejected": -7.192375183105469, "step": 741 }, { "epoch": 0.9692219773042697, "grad_norm": 22.580987418682184, "learning_rate": 1.377492761764354e-09, "logits/chosen": -0.9602839946746826, "logits/rejected": -0.9752902388572693, "logps/chosen": -784.1845703125, "logps/rejected": -1084.4151611328125, "loss": 0.2918, "rewards/accuracies": 0.96875, "rewards/chosen": -4.722711086273193, "rewards/margins": 2.7062840461730957, "rewards/rejected": -7.428995132446289, "step": 742 }, { "epoch": 0.9705282063841947, "grad_norm": 26.643058025664857, "learning_rate": 1.2604133044284982e-09, "logits/chosen": -0.8779647946357727, "logits/rejected": -0.9308136701583862, "logps/chosen": -818.1796264648438, "logps/rejected": -1074.2091064453125, "loss": 0.3351, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8115034103393555, "rewards/margins": 2.1788394451141357, "rewards/rejected": -6.990341663360596, "step": 743 }, { "epoch": 0.9718344354641195, "grad_norm": 23.47737766636236, "learning_rate": 1.148520259202923e-09, "logits/chosen": -1.0406291484832764, "logits/rejected": -1.0501799583435059, "logps/chosen": -862.27978515625, "logps/rejected": -1064.9189453125, "loss": 0.3525, "rewards/accuracies": 0.8125, "rewards/chosen": -5.038253307342529, "rewards/margins": 2.2377071380615234, "rewards/rejected": -7.275960922241211, "step": 744 }, { "epoch": 0.9731406645440445, "grad_norm": 18.114965193565368, "learning_rate": 1.0418159591438214e-09, "logits/chosen": -0.9138543009757996, "logits/rejected": -0.9569115042686462, "logps/chosen": -823.882568359375, "logps/rejected": -1185.687744140625, "loss": 0.3084, "rewards/accuracies": 0.84375, "rewards/chosen": -4.7956647872924805, "rewards/margins": 3.1947028636932373, "rewards/rejected": -7.990367412567139, "step": 745 }, { "epoch": 0.9744468936239693, "grad_norm": 18.817714111289238, "learning_rate": 9.403026291181505e-10, "logits/chosen": -0.9524965286254883, "logits/rejected": -0.9789271354675293, "logps/chosen": -869.812744140625, "logps/rejected": -1176.66943359375, "loss": 0.3283, "rewards/accuracies": 0.9375, "rewards/chosen": -4.999002933502197, "rewards/margins": 2.8612561225891113, "rewards/rejected": -7.860259056091309, "step": 746 }, { "epoch": 0.9757531227038942, "grad_norm": 24.90036779280411, "learning_rate": 8.439823857570305e-10, "logits/chosen": -0.8803153038024902, "logits/rejected": -0.9198551774024963, "logps/chosen": -919.7659912109375, "logps/rejected": -1150.556884765625, "loss": 0.39, "rewards/accuracies": 0.84375, "rewards/chosen": -5.701601028442383, "rewards/margins": 1.7264282703399658, "rewards/rejected": -7.4280290603637695, "step": 747 }, { "epoch": 0.9770593517838191, "grad_norm": 21.430458788071867, "learning_rate": 7.52857237411808e-10, "logits/chosen": -0.9212626814842224, "logits/rejected": -0.9755610227584839, "logps/chosen": -859.5005493164062, "logps/rejected": -1160.1336669921875, "loss": 0.298, "rewards/accuracies": 0.90625, "rewards/chosen": -4.9637250900268555, "rewards/margins": 2.754920482635498, "rewards/rejected": -7.718645095825195, "step": 748 }, { "epoch": 0.978365580863744, "grad_norm": 24.23282484362412, "learning_rate": 6.66929084112089e-10, "logits/chosen": -1.0224872827529907, "logits/rejected": -1.0391262769699097, "logps/chosen": -782.97998046875, "logps/rejected": -1147.94775390625, "loss": 0.2762, "rewards/accuracies": 0.90625, "rewards/chosen": -4.704994201660156, "rewards/margins": 3.344956874847412, "rewards/rejected": -8.04995059967041, "step": 749 }, { "epoch": 0.9796718099436689, "grad_norm": 46.74074760052989, "learning_rate": 5.861997175260758e-10, "logits/chosen": -0.9382998943328857, "logits/rejected": -0.8692302703857422, "logps/chosen": -863.0648193359375, "logps/rejected": -1199.2513427734375, "loss": 0.3445, "rewards/accuracies": 0.90625, "rewards/chosen": -4.992151260375977, "rewards/margins": 3.3170485496520996, "rewards/rejected": -8.309200286865234, "step": 750 }, { "epoch": 0.9809780390235938, "grad_norm": 45.54321160333492, "learning_rate": 5.106708209232647e-10, "logits/chosen": -0.9324420094490051, "logits/rejected": -0.957527220249176, "logps/chosen": -840.7574462890625, "logps/rejected": -1093.603271484375, "loss": 0.3618, "rewards/accuracies": 0.875, "rewards/chosen": -5.038885593414307, "rewards/margins": 2.237790107727051, "rewards/rejected": -7.276675224304199, "step": 751 }, { "epoch": 0.9822842681035187, "grad_norm": 22.458925326174544, "learning_rate": 4.4034396913941727e-10, "logits/chosen": -0.9433171153068542, "logits/rejected": -1.005143404006958, "logps/chosen": -894.079833984375, "logps/rejected": -1195.688720703125, "loss": 0.3837, "rewards/accuracies": 0.8125, "rewards/chosen": -5.210080146789551, "rewards/margins": 2.7397055625915527, "rewards/rejected": -7.949785232543945, "step": 752 }, { "epoch": 0.9835904971834436, "grad_norm": 22.408106908118054, "learning_rate": 3.7522062854355997e-10, "logits/chosen": -0.9450298547744751, "logits/rejected": -1.0143907070159912, "logps/chosen": -807.2323608398438, "logps/rejected": -1332.6236572265625, "loss": 0.2876, "rewards/accuracies": 0.90625, "rewards/chosen": -4.483209133148193, "rewards/margins": 4.754749298095703, "rewards/rejected": -9.237958908081055, "step": 753 }, { "epoch": 0.9848967262633684, "grad_norm": 19.418290612472866, "learning_rate": 3.1530215700756313e-10, "logits/chosen": -1.102285623550415, "logits/rejected": -1.0857857465744019, "logps/chosen": -883.404052734375, "logps/rejected": -1100.610595703125, "loss": 0.3042, "rewards/accuracies": 0.9375, "rewards/chosen": -5.034350872039795, "rewards/margins": 2.195341110229492, "rewards/rejected": -7.229691982269287, "step": 754 }, { "epoch": 0.9862029553432933, "grad_norm": 35.85551364174609, "learning_rate": 2.605898038777199e-10, "logits/chosen": -0.9625508189201355, "logits/rejected": -0.9833151698112488, "logps/chosen": -790.8330078125, "logps/rejected": -1025.868896484375, "loss": 0.3716, "rewards/accuracies": 0.96875, "rewards/chosen": -4.699563503265381, "rewards/margins": 2.502751350402832, "rewards/rejected": -7.202314376831055, "step": 755 }, { "epoch": 0.9875091844232182, "grad_norm": 28.53621501039099, "learning_rate": 2.110847099488222e-10, "logits/chosen": -0.94759202003479, "logits/rejected": -0.9781441688537598, "logps/chosen": -839.1832275390625, "logps/rejected": -1023.45361328125, "loss": 0.2928, "rewards/accuracies": 0.90625, "rewards/chosen": -4.758579254150391, "rewards/margins": 1.8572137355804443, "rewards/rejected": -6.615793228149414, "step": 756 }, { "epoch": 0.9888154135031431, "grad_norm": 21.22918791897412, "learning_rate": 1.6678790744015236e-10, "logits/chosen": -0.9919801354408264, "logits/rejected": -0.9901013374328613, "logps/chosen": -787.056396484375, "logps/rejected": -934.1174926757812, "loss": 0.3382, "rewards/accuracies": 0.90625, "rewards/chosen": -4.492656230926514, "rewards/margins": 1.5403560400009155, "rewards/rejected": -6.033012390136719, "step": 757 }, { "epoch": 0.990121642583068, "grad_norm": 16.615289037512497, "learning_rate": 1.277003199742499e-10, "logits/chosen": -0.981942892074585, "logits/rejected": -0.9660012125968933, "logps/chosen": -772.347900390625, "logps/rejected": -923.9476928710938, "loss": 0.2967, "rewards/accuracies": 0.78125, "rewards/chosen": -4.184555530548096, "rewards/margins": 2.0594732761383057, "rewards/rejected": -6.2440290451049805, "step": 758 }, { "epoch": 0.9914278716629928, "grad_norm": 19.158852761980633, "learning_rate": 9.382276255742727e-11, "logits/chosen": -1.071539044380188, "logits/rejected": -1.0275521278381348, "logps/chosen": -838.9002075195312, "logps/rejected": -1043.4322509765625, "loss": 0.2753, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9658560752868652, "rewards/margins": 2.511625289916992, "rewards/rejected": -6.477481842041016, "step": 759 }, { "epoch": 0.9927341007429178, "grad_norm": 22.168120567752247, "learning_rate": 6.515594156286663e-11, "logits/chosen": -0.9493943452835083, "logits/rejected": -0.910268247127533, "logps/chosen": -899.511962890625, "logps/rejected": -1148.454345703125, "loss": 0.293, "rewards/accuracies": 0.9375, "rewards/chosen": -5.537384510040283, "rewards/margins": 2.493018388748169, "rewards/rejected": -8.030403137207031, "step": 760 }, { "epoch": 0.9940403298228426, "grad_norm": 24.026839554437526, "learning_rate": 4.170045471588168e-11, "logits/chosen": -0.9243339896202087, "logits/rejected": -0.9904308319091797, "logps/chosen": -838.7510986328125, "logps/rejected": -1014.429931640625, "loss": 0.3245, "rewards/accuracies": 0.875, "rewards/chosen": -4.799351692199707, "rewards/margins": 1.7270526885986328, "rewards/rejected": -6.52640438079834, "step": 761 }, { "epoch": 0.9953465589027676, "grad_norm": 28.08092475083908, "learning_rate": 2.3456791081455375e-11, "logits/chosen": -0.8584170341491699, "logits/rejected": -0.915625810623169, "logps/chosen": -841.3638916015625, "logps/rejected": -1255.8558349609375, "loss": 0.3535, "rewards/accuracies": 0.90625, "rewards/chosen": -4.8940863609313965, "rewards/margins": 3.722672939300537, "rewards/rejected": -8.61676025390625, "step": 762 }, { "epoch": 0.9966527879826924, "grad_norm": 28.280188569328004, "learning_rate": 1.0425331054025876e-11, "logits/chosen": -0.9674179553985596, "logits/rejected": -0.9638044834136963, "logps/chosen": -806.9545288085938, "logps/rejected": -1109.9842529296875, "loss": 0.3833, "rewards/accuracies": 0.90625, "rewards/chosen": -4.446547985076904, "rewards/margins": 2.917111873626709, "rewards/rejected": -7.363659858703613, "step": 763 }, { "epoch": 0.9979590170626174, "grad_norm": 22.616993906433937, "learning_rate": 2.6063463495762384e-12, "logits/chosen": -0.9143103361129761, "logits/rejected": -0.9458924531936646, "logps/chosen": -783.2197875976562, "logps/rejected": -1015.7607421875, "loss": 0.3291, "rewards/accuracies": 0.96875, "rewards/chosen": -4.563847541809082, "rewards/margins": 2.473099708557129, "rewards/rejected": -7.036947250366211, "step": 764 }, { "epoch": 0.9992652461425422, "grad_norm": 15.41969150521287, "learning_rate": 0.0, "logits/chosen": -0.9288902282714844, "logits/rejected": -0.9434102773666382, "logps/chosen": -739.760986328125, "logps/rejected": -1050.915771484375, "loss": 0.2618, "rewards/accuracies": 0.875, "rewards/chosen": -4.296210289001465, "rewards/margins": 2.6045422554016113, "rewards/rejected": -6.900752544403076, "step": 765 }, { "epoch": 0.9992652461425422, "step": 765, "total_flos": 0.0, "train_loss": 0.41019999388775796, "train_runtime": 41758.0505, "train_samples_per_second": 2.347, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 765, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }