{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3841828274817964, "eval_steps": 100, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023069713791363275, "grad_norm": 70.983255945064, "learning_rate": 4e-09, "logits/chosen": -1.6907414197921753, "logits/rejected": -1.6978764533996582, "logps/chosen": -135.08778381347656, "logps/rejected": -140.00140380859375, "loss": 0.6978, "rewards/accuracies": 0.625, "rewards/chosen": 0.010493194684386253, "rewards/margins": 0.006632559932768345, "rewards/rejected": 0.003860633820295334, "step": 2 }, { "epoch": 0.004613942758272655, "grad_norm": 79.97063682582153, "learning_rate": 8e-09, "logits/chosen": -1.6330227851867676, "logits/rejected": -1.7231806516647339, "logps/chosen": -197.88365173339844, "logps/rejected": -218.62255859375, "loss": 0.6925, "rewards/accuracies": 0.46875, "rewards/chosen": 0.008352389559149742, "rewards/margins": -0.00352578517049551, "rewards/rejected": 0.011878175660967827, "step": 4 }, { "epoch": 0.006920914137408983, "grad_norm": 78.73277588414827, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -1.7628690004348755, "logits/rejected": -1.6921380758285522, "logps/chosen": -181.12741088867188, "logps/rejected": -177.64956665039062, "loss": 0.6919, "rewards/accuracies": 0.40625, "rewards/chosen": -0.005539673380553722, "rewards/margins": -0.012507464736700058, "rewards/rejected": 0.006967790424823761, "step": 6 }, { "epoch": 0.00922788551654531, "grad_norm": 84.94475101410946, "learning_rate": 1.6e-08, "logits/chosen": -1.6862337589263916, "logits/rejected": -1.6957104206085205, "logps/chosen": -229.57574462890625, "logps/rejected": -308.63421630859375, "loss": 0.6949, "rewards/accuracies": 0.59375, "rewards/chosen": 0.019691964611411095, "rewards/margins": 0.022208284586668015, "rewards/rejected": -0.0025163227692246437, "step": 8 }, { "epoch": 0.011534856895681638, "grad_norm": 78.53713189911603, "learning_rate": 2e-08, "logits/chosen": -1.72577965259552, "logits/rejected": -1.72530198097229, "logps/chosen": -182.21597290039062, "logps/rejected": -197.15383911132812, "loss": 0.6876, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00015587342204526067, "rewards/margins": -0.00128166563808918, "rewards/rejected": 0.0011257934384047985, "step": 10 }, { "epoch": 0.013841828274817966, "grad_norm": 60.639642502259356, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -1.5887395143508911, "logits/rejected": -1.7574589252471924, "logps/chosen": -121.71543884277344, "logps/rejected": -164.58782958984375, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004625019151717424, "rewards/margins": 0.00861622579395771, "rewards/rejected": -0.003991207107901573, "step": 12 }, { "epoch": 0.016148799653954292, "grad_norm": 86.96054524483631, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -1.5507514476776123, "logits/rejected": -1.5499210357666016, "logps/chosen": -147.94631958007812, "logps/rejected": -200.87417602539062, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.013226826675236225, "rewards/margins": 0.021001461893320084, "rewards/rejected": -0.00777463661506772, "step": 14 }, { "epoch": 0.01845577103309062, "grad_norm": 77.14253037311525, "learning_rate": 3.2e-08, "logits/chosen": -1.6721559762954712, "logits/rejected": -1.7068090438842773, "logps/chosen": -157.89622497558594, "logps/rejected": -199.2628631591797, "loss": 0.6951, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0034806535113602877, "rewards/margins": 0.006839222740381956, "rewards/rejected": -0.010319876484572887, "step": 16 }, { "epoch": 0.020762742412226948, "grad_norm": 78.17299743340894, "learning_rate": 3.6e-08, "logits/chosen": -1.6556451320648193, "logits/rejected": -1.7276983261108398, "logps/chosen": -135.32223510742188, "logps/rejected": -158.1257781982422, "loss": 0.6884, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01184301357716322, "rewards/margins": 0.023063620552420616, "rewards/rejected": -0.011220606043934822, "step": 18 }, { "epoch": 0.023069713791363276, "grad_norm": 77.30701284634611, "learning_rate": 4e-08, "logits/chosen": -1.735813856124878, "logits/rejected": -1.789333701133728, "logps/chosen": -157.4953155517578, "logps/rejected": -186.05862426757812, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.004054094199091196, "rewards/margins": 0.010992627590894699, "rewards/rejected": -0.006938533391803503, "step": 20 }, { "epoch": 0.025376685170499604, "grad_norm": 77.95778126726918, "learning_rate": 4.4e-08, "logits/chosen": -1.5402836799621582, "logits/rejected": -1.5884689092636108, "logps/chosen": -133.27554321289062, "logps/rejected": -170.48594665527344, "loss": 0.6944, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009935155510902405, "rewards/margins": 0.019674377515912056, "rewards/rejected": -0.009739222005009651, "step": 22 }, { "epoch": 0.02768365654963593, "grad_norm": 78.69027024475307, "learning_rate": 4.799999999999999e-08, "logits/chosen": -1.4641175270080566, "logits/rejected": -1.6491130590438843, "logps/chosen": -139.53375244140625, "logps/rejected": -193.21438598632812, "loss": 0.6945, "rewards/accuracies": 0.46875, "rewards/chosen": -0.010036014020442963, "rewards/margins": 0.005420446861535311, "rewards/rejected": -0.015456462278962135, "step": 24 }, { "epoch": 0.02999062792877226, "grad_norm": 82.14181039722058, "learning_rate": 5.2e-08, "logits/chosen": -1.7350623607635498, "logits/rejected": -1.6639574766159058, "logps/chosen": -161.62164306640625, "logps/rejected": -160.58840942382812, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0011872733011841774, "rewards/margins": -0.006157425232231617, "rewards/rejected": 0.0049701533280313015, "step": 26 }, { "epoch": 0.032297599307908584, "grad_norm": 73.66413843287592, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -1.4280009269714355, "logits/rejected": -1.6318210363388062, "logps/chosen": -131.8518524169922, "logps/rejected": -166.72335815429688, "loss": 0.6964, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01029287837445736, "rewards/margins": -0.014272996224462986, "rewards/rejected": 0.003980117850005627, "step": 28 }, { "epoch": 0.034604570687044915, "grad_norm": 86.40461346238433, "learning_rate": 6e-08, "logits/chosen": -1.6838908195495605, "logits/rejected": -1.7304034233093262, "logps/chosen": -124.49476623535156, "logps/rejected": -140.77841186523438, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0041793473064899445, "rewards/margins": -0.0030185298528522253, "rewards/rejected": 0.007197877857834101, "step": 30 }, { "epoch": 0.03691154206618124, "grad_norm": 79.23808836252253, "learning_rate": 6.4e-08, "logits/chosen": -1.5962588787078857, "logits/rejected": -1.549019455909729, "logps/chosen": -194.4256591796875, "logps/rejected": -237.36117553710938, "loss": 0.7003, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0013054789742454886, "rewards/margins": -0.018302934244275093, "rewards/rejected": 0.016997454687952995, "step": 32 }, { "epoch": 0.03921851344531757, "grad_norm": 74.59682312018288, "learning_rate": 6.8e-08, "logits/chosen": -1.740609884262085, "logits/rejected": -1.6540827751159668, "logps/chosen": -157.9348907470703, "logps/rejected": -158.6222686767578, "loss": 0.6889, "rewards/accuracies": 0.78125, "rewards/chosen": 0.01421279925853014, "rewards/margins": 0.03194922208786011, "rewards/rejected": -0.017736420035362244, "step": 34 }, { "epoch": 0.041525484824453895, "grad_norm": 79.71071945309694, "learning_rate": 7.2e-08, "logits/chosen": -1.5709240436553955, "logits/rejected": -1.6920911073684692, "logps/chosen": -173.94007873535156, "logps/rejected": -219.2288818359375, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": -0.011220686137676239, "rewards/margins": -0.008418139070272446, "rewards/rejected": -0.0028025463689118624, "step": 36 }, { "epoch": 0.04383245620359023, "grad_norm": 86.21753265595521, "learning_rate": 7.599999999999999e-08, "logits/chosen": -1.7426936626434326, "logits/rejected": -1.5694864988327026, "logps/chosen": -159.00730895996094, "logps/rejected": -146.18772888183594, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": -0.01584451086819172, "rewards/margins": -0.012521232478320599, "rewards/rejected": -0.003323277225717902, "step": 38 }, { "epoch": 0.04613942758272655, "grad_norm": 76.87714834723751, "learning_rate": 8e-08, "logits/chosen": -1.7129234075546265, "logits/rejected": -1.6838488578796387, "logps/chosen": -191.7474822998047, "logps/rejected": -162.81466674804688, "loss": 0.6986, "rewards/accuracies": 0.5, "rewards/chosen": 0.0020668436773121357, "rewards/margins": -0.008048251271247864, "rewards/rejected": 0.010115095414221287, "step": 40 }, { "epoch": 0.04844639896186288, "grad_norm": 71.62490665795943, "learning_rate": 8.4e-08, "logits/chosen": -1.7202041149139404, "logits/rejected": -1.7000675201416016, "logps/chosen": -157.0576934814453, "logps/rejected": -186.76138305664062, "loss": 0.6889, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0023848186247050762, "rewards/margins": 0.002543981885537505, "rewards/rejected": -0.00015916326083242893, "step": 42 }, { "epoch": 0.05075337034099921, "grad_norm": 81.75722048702342, "learning_rate": 8.8e-08, "logits/chosen": -1.5299909114837646, "logits/rejected": -1.5974533557891846, "logps/chosen": -158.53753662109375, "logps/rejected": -179.9112548828125, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": 0.006203922443091869, "rewards/margins": 0.02108878269791603, "rewards/rejected": -0.014884857460856438, "step": 44 }, { "epoch": 0.05306034172013553, "grad_norm": 78.9572151515279, "learning_rate": 9.2e-08, "logits/chosen": -1.6293164491653442, "logits/rejected": -1.6444960832595825, "logps/chosen": -152.41412353515625, "logps/rejected": -170.09869384765625, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.010927575640380383, "rewards/margins": 0.0014875519555062056, "rewards/rejected": 0.009440025314688683, "step": 46 }, { "epoch": 0.05536731309927186, "grad_norm": 81.84986800244135, "learning_rate": 9.599999999999999e-08, "logits/chosen": -1.7789497375488281, "logits/rejected": -1.7547317743301392, "logps/chosen": -214.71987915039062, "logps/rejected": -240.7999725341797, "loss": 0.6889, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0025361552834510803, "rewards/margins": 0.006769699975848198, "rewards/rejected": -0.004233543295413256, "step": 48 }, { "epoch": 0.05767428447840819, "grad_norm": 76.76668544313137, "learning_rate": 1e-07, "logits/chosen": -1.491020917892456, "logits/rejected": -1.5447454452514648, "logps/chosen": -201.1317901611328, "logps/rejected": -267.04248046875, "loss": 0.6974, "rewards/accuracies": 0.5, "rewards/chosen": -0.005572921596467495, "rewards/margins": 0.01456289179623127, "rewards/rejected": -0.02013581432402134, "step": 50 }, { "epoch": 0.05998125585754452, "grad_norm": 84.23078533480681, "learning_rate": 1.04e-07, "logits/chosen": -1.6120061874389648, "logits/rejected": -1.7649461030960083, "logps/chosen": -183.1412353515625, "logps/rejected": -282.4610290527344, "loss": 0.695, "rewards/accuracies": 0.3125, "rewards/chosen": -0.02553856186568737, "rewards/margins": -0.038517288863658905, "rewards/rejected": 0.012978724204003811, "step": 52 }, { "epoch": 0.06228822723668084, "grad_norm": 92.0173388072239, "learning_rate": 1.08e-07, "logits/chosen": -1.4607347249984741, "logits/rejected": -1.625195026397705, "logps/chosen": -152.8062744140625, "logps/rejected": -219.1183624267578, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": -0.012677345424890518, "rewards/margins": -0.032261885702610016, "rewards/rejected": 0.01958453841507435, "step": 54 }, { "epoch": 0.06459519861581717, "grad_norm": 79.32388836348366, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -1.6375060081481934, "logits/rejected": -1.5966099500656128, "logps/chosen": -223.85108947753906, "logps/rejected": -256.30072021484375, "loss": 0.699, "rewards/accuracies": 0.40625, "rewards/chosen": -0.023084305226802826, "rewards/margins": -0.02949170023202896, "rewards/rejected": 0.006407391745597124, "step": 56 }, { "epoch": 0.0669021699949535, "grad_norm": 69.72321013611293, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -1.5106103420257568, "logits/rejected": -1.596940517425537, "logps/chosen": -188.92218017578125, "logps/rejected": -275.0534973144531, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": 0.017192328348755836, "rewards/margins": 0.03240702301263809, "rewards/rejected": -0.015214694663882256, "step": 58 }, { "epoch": 0.06920914137408983, "grad_norm": 72.64787237644839, "learning_rate": 1.2e-07, "logits/chosen": -1.667373776435852, "logits/rejected": -1.6947064399719238, "logps/chosen": -103.05116271972656, "logps/rejected": -151.50807189941406, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00641946354880929, "rewards/margins": 0.032484300434589386, "rewards/rejected": -0.026064833626151085, "step": 60 }, { "epoch": 0.07151611275322615, "grad_norm": 75.24112563737103, "learning_rate": 1.24e-07, "logits/chosen": -1.5305185317993164, "logits/rejected": -1.6095894575119019, "logps/chosen": -154.5998077392578, "logps/rejected": -176.33970642089844, "loss": 0.6965, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00010838371235877275, "rewards/margins": 0.006697986274957657, "rewards/rejected": -0.006589602679014206, "step": 62 }, { "epoch": 0.07382308413236248, "grad_norm": 70.73819169103417, "learning_rate": 1.28e-07, "logits/chosen": -1.6058859825134277, "logits/rejected": -1.6498842239379883, "logps/chosen": -151.62286376953125, "logps/rejected": -169.5069122314453, "loss": 0.6927, "rewards/accuracies": 0.46875, "rewards/chosen": 0.003975578583776951, "rewards/margins": 0.006644865497946739, "rewards/rejected": -0.0026692876126617193, "step": 64 }, { "epoch": 0.0761300555114988, "grad_norm": 79.80905942384086, "learning_rate": 1.32e-07, "logits/chosen": -1.566676139831543, "logits/rejected": -1.635036587715149, "logps/chosen": -213.95590209960938, "logps/rejected": -268.84747314453125, "loss": 0.6878, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0030877357348799706, "rewards/margins": -0.004700601100921631, "rewards/rejected": 0.0016128652496263385, "step": 66 }, { "epoch": 0.07843702689063514, "grad_norm": 68.290418726697, "learning_rate": 1.36e-07, "logits/chosen": -1.625745177268982, "logits/rejected": -1.7014200687408447, "logps/chosen": -191.14842224121094, "logps/rejected": -222.51779174804688, "loss": 0.6954, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0007452260470017791, "rewards/margins": 0.00873138289898634, "rewards/rejected": -0.007986157201230526, "step": 68 }, { "epoch": 0.08074399826977147, "grad_norm": 79.51788387191074, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -1.5817363262176514, "logits/rejected": -1.6993348598480225, "logps/chosen": -131.7422637939453, "logps/rejected": -162.98304748535156, "loss": 0.6941, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0074032689444720745, "rewards/margins": -0.010667804628610611, "rewards/rejected": 0.003264536615461111, "step": 70 }, { "epoch": 0.08305096964890779, "grad_norm": 83.37639460390962, "learning_rate": 1.44e-07, "logits/chosen": -1.5910240411758423, "logits/rejected": -1.6661615371704102, "logps/chosen": -181.304931640625, "logps/rejected": -209.23526000976562, "loss": 0.6943, "rewards/accuracies": 0.375, "rewards/chosen": -0.0042681945487856865, "rewards/margins": -0.02247396856546402, "rewards/rejected": 0.018205774948000908, "step": 72 }, { "epoch": 0.08535794102804412, "grad_norm": 85.04605551266509, "learning_rate": 1.48e-07, "logits/chosen": -1.5876970291137695, "logits/rejected": -1.7304015159606934, "logps/chosen": -146.5454559326172, "logps/rejected": -182.179931640625, "loss": 0.6891, "rewards/accuracies": 0.46875, "rewards/chosen": -0.011930807493627071, "rewards/margins": -0.013042710721492767, "rewards/rejected": 0.0011119036935269833, "step": 74 }, { "epoch": 0.08766491240718045, "grad_norm": 71.97159401776734, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -1.6893718242645264, "logits/rejected": -1.6752575635910034, "logps/chosen": -163.57489013671875, "logps/rejected": -162.34803771972656, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0007063052617013454, "rewards/margins": 0.002567308023571968, "rewards/rejected": -0.0018610022962093353, "step": 76 }, { "epoch": 0.08997188378631678, "grad_norm": 80.09910611023234, "learning_rate": 1.56e-07, "logits/chosen": -1.612238883972168, "logits/rejected": -1.5296804904937744, "logps/chosen": -143.39723205566406, "logps/rejected": -165.65318298339844, "loss": 0.6871, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009035947732627392, "rewards/margins": 0.025550464168190956, "rewards/rejected": -0.01651451550424099, "step": 78 }, { "epoch": 0.0922788551654531, "grad_norm": 70.85997602518799, "learning_rate": 1.6e-07, "logits/chosen": -1.6167306900024414, "logits/rejected": -1.720908522605896, "logps/chosen": -137.2986602783203, "logps/rejected": -246.95404052734375, "loss": 0.693, "rewards/accuracies": 0.625, "rewards/chosen": 0.006022963672876358, "rewards/margins": 0.008436123840510845, "rewards/rejected": -0.0024131599348038435, "step": 80 }, { "epoch": 0.09458582654458943, "grad_norm": 73.43468397188529, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -1.7178070545196533, "logits/rejected": -1.7651526927947998, "logps/chosen": -154.39617919921875, "logps/rejected": -187.47491455078125, "loss": 0.6985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.014386245980858803, "rewards/margins": -0.011840756982564926, "rewards/rejected": -0.002545490860939026, "step": 82 }, { "epoch": 0.09689279792372577, "grad_norm": 82.20956953972079, "learning_rate": 1.68e-07, "logits/chosen": -1.5902974605560303, "logits/rejected": -1.604806900024414, "logps/chosen": -127.57711029052734, "logps/rejected": -146.8506317138672, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": -0.006446457467973232, "rewards/margins": 0.008786877617239952, "rewards/rejected": -0.015233333222568035, "step": 84 }, { "epoch": 0.09919976930286209, "grad_norm": 76.58342600254258, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -1.6294573545455933, "logits/rejected": -1.6458450555801392, "logps/chosen": -248.47845458984375, "logps/rejected": -246.7737579345703, "loss": 0.6895, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009375479072332382, "rewards/margins": 0.025924015790224075, "rewards/rejected": -0.03529949486255646, "step": 86 }, { "epoch": 0.10150674068199841, "grad_norm": 73.05738349044326, "learning_rate": 1.76e-07, "logits/chosen": -1.84279465675354, "logits/rejected": -1.7476646900177002, "logps/chosen": -153.08554077148438, "logps/rejected": -154.7803497314453, "loss": 0.6909, "rewards/accuracies": 0.53125, "rewards/chosen": -0.002370176836848259, "rewards/margins": -0.0033320121001452208, "rewards/rejected": 0.0009618350304663181, "step": 88 }, { "epoch": 0.10381371206113474, "grad_norm": 77.95992122604585, "learning_rate": 1.8e-07, "logits/chosen": -1.6355715990066528, "logits/rejected": -1.6984450817108154, "logps/chosen": -169.7340087890625, "logps/rejected": -185.59031677246094, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01826353184878826, "rewards/margins": -0.008648518472909927, "rewards/rejected": -0.009615011513233185, "step": 90 }, { "epoch": 0.10612068344027106, "grad_norm": 74.70608110655593, "learning_rate": 1.84e-07, "logits/chosen": -1.5153872966766357, "logits/rejected": -1.5389485359191895, "logps/chosen": -214.6402130126953, "logps/rejected": -224.4317626953125, "loss": 0.6877, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00619399081915617, "rewards/margins": 0.011179441586136818, "rewards/rejected": -0.004985451698303223, "step": 92 }, { "epoch": 0.1084276548194074, "grad_norm": 74.73028912491218, "learning_rate": 1.88e-07, "logits/chosen": -1.5655710697174072, "logits/rejected": -1.5568063259124756, "logps/chosen": -170.8540496826172, "logps/rejected": -195.4169158935547, "loss": 0.6887, "rewards/accuracies": 0.5625, "rewards/chosen": -0.016467105597257614, "rewards/margins": 0.011973596177995205, "rewards/rejected": -0.028440698981285095, "step": 94 }, { "epoch": 0.11073462619854373, "grad_norm": 81.65007402302682, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -1.7869484424591064, "logits/rejected": -1.7626042366027832, "logps/chosen": -208.46002197265625, "logps/rejected": -256.5970458984375, "loss": 0.6846, "rewards/accuracies": 0.5625, "rewards/chosen": -0.009895925410091877, "rewards/margins": 0.011812476441264153, "rewards/rejected": -0.021708402782678604, "step": 96 }, { "epoch": 0.11304159757768005, "grad_norm": 77.09604224965807, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -1.6421016454696655, "logits/rejected": -1.5873386859893799, "logps/chosen": -176.68853759765625, "logps/rejected": -188.64544677734375, "loss": 0.69, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01306343637406826, "rewards/margins": 0.021781262010335922, "rewards/rejected": -0.03484470024704933, "step": 98 }, { "epoch": 0.11534856895681637, "grad_norm": 77.37699461097638, "learning_rate": 2e-07, "logits/chosen": -1.5747566223144531, "logits/rejected": -1.5757074356079102, "logps/chosen": -146.1053009033203, "logps/rejected": -177.65733337402344, "loss": 0.6926, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01261211559176445, "rewards/margins": 0.005542438477277756, "rewards/rejected": -0.018154552206397057, "step": 100 }, { "epoch": 0.11534856895681637, "eval_logits/chosen": -1.592301368713379, "eval_logits/rejected": -1.4917248487472534, "eval_logps/chosen": -185.32534790039062, "eval_logps/rejected": -150.51693725585938, "eval_loss": 0.6938029527664185, "eval_rewards/accuracies": 0.4000000059604645, "eval_rewards/chosen": -0.014318165369331837, "eval_rewards/margins": -0.014164167456328869, "eval_rewards/rejected": -0.00015399709809571505, "eval_runtime": 22.8572, "eval_samples_per_second": 4.375, "eval_steps_per_second": 1.094, "step": 100 }, { "epoch": 0.11765554033595271, "grad_norm": 83.18929756458088, "learning_rate": 1.9999925887938156e-07, "logits/chosen": -1.553455114364624, "logits/rejected": -1.6009831428527832, "logps/chosen": -171.79664611816406, "logps/rejected": -223.1472930908203, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0011008224682882428, "rewards/margins": 0.010834511369466782, "rewards/rejected": -0.009733689948916435, "step": 102 }, { "epoch": 0.11996251171508904, "grad_norm": 73.85193862190509, "learning_rate": 1.9999703552851146e-07, "logits/chosen": -1.7583006620407104, "logits/rejected": -1.714582920074463, "logps/chosen": -209.88302612304688, "logps/rejected": -255.11888122558594, "loss": 0.6913, "rewards/accuracies": 0.4375, "rewards/chosen": -0.030152078717947006, "rewards/margins": -0.011741320602595806, "rewards/rejected": -0.018410757184028625, "step": 104 }, { "epoch": 0.12226948309422536, "grad_norm": 73.38727733625704, "learning_rate": 1.9999332998034512e-07, "logits/chosen": -1.6966747045516968, "logits/rejected": -1.6100220680236816, "logps/chosen": -160.12281799316406, "logps/rejected": -167.38145446777344, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": -0.008994976989924908, "rewards/margins": 0.011080076918005943, "rewards/rejected": -0.020075054839253426, "step": 106 }, { "epoch": 0.12457645447336169, "grad_norm": 79.80728796393491, "learning_rate": 1.9998814228980768e-07, "logits/chosen": -1.6656932830810547, "logits/rejected": -1.7435060739517212, "logps/chosen": -156.0963897705078, "logps/rejected": -208.7286376953125, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": -0.01158140879124403, "rewards/margins": -0.003068419173359871, "rewards/rejected": -0.008512990549206734, "step": 108 }, { "epoch": 0.126883425852498, "grad_norm": 71.572270187751, "learning_rate": 1.9998147253379324e-07, "logits/chosen": -1.7250394821166992, "logits/rejected": -1.720632791519165, "logps/chosen": -143.606201171875, "logps/rejected": -164.64126586914062, "loss": 0.6911, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00029725395143032074, "rewards/margins": 0.004560052417218685, "rewards/rejected": -0.004857306368649006, "step": 110 }, { "epoch": 0.12919039723163434, "grad_norm": 76.7363281665151, "learning_rate": 1.999733208111637e-07, "logits/chosen": -1.680725336074829, "logits/rejected": -1.7269576787948608, "logps/chosen": -141.92416381835938, "logps/rejected": -163.63902282714844, "loss": 0.6878, "rewards/accuracies": 0.625, "rewards/chosen": -0.013096440583467484, "rewards/margins": 0.013841900043189526, "rewards/rejected": -0.026938341557979584, "step": 112 }, { "epoch": 0.13149736861077066, "grad_norm": 71.23095597503095, "learning_rate": 1.9996368724274726e-07, "logits/chosen": -1.7746036052703857, "logits/rejected": -1.651440978050232, "logps/chosen": -201.5678253173828, "logps/rejected": -208.96060180664062, "loss": 0.6814, "rewards/accuracies": 0.46875, "rewards/chosen": -0.020835982635617256, "rewards/margins": 0.0012882971204817295, "rewards/rejected": -0.02212427742779255, "step": 114 }, { "epoch": 0.133804339989907, "grad_norm": 78.35820889872501, "learning_rate": 1.999525719713366e-07, "logits/chosen": -1.6184967756271362, "logits/rejected": -1.6276531219482422, "logps/chosen": -138.03579711914062, "logps/rejected": -156.24818420410156, "loss": 0.6893, "rewards/accuracies": 0.53125, "rewards/chosen": -0.024172522127628326, "rewards/margins": -0.007286247797310352, "rewards/rejected": -0.0168862733989954, "step": 116 }, { "epoch": 0.13611131136904334, "grad_norm": 73.30378065799947, "learning_rate": 1.9993997516168685e-07, "logits/chosen": -1.5095572471618652, "logits/rejected": -1.4317773580551147, "logps/chosen": -168.31259155273438, "logps/rejected": -181.01010131835938, "loss": 0.6946, "rewards/accuracies": 0.53125, "rewards/chosen": -0.02184070646762848, "rewards/margins": -0.0038399603217840195, "rewards/rejected": -0.01800074614584446, "step": 118 }, { "epoch": 0.13841828274817966, "grad_norm": 76.67682193401895, "learning_rate": 1.9992589700051315e-07, "logits/chosen": -1.6505416631698608, "logits/rejected": -1.6528055667877197, "logps/chosen": -163.4833221435547, "logps/rejected": -173.32627868652344, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": -0.037532731890678406, "rewards/margins": -0.001648992532864213, "rewards/rejected": -0.035883739590644836, "step": 120 }, { "epoch": 0.14072525412731599, "grad_norm": 79.25219710625622, "learning_rate": 1.9991033769648782e-07, "logits/chosen": -1.6732072830200195, "logits/rejected": -1.6914747953414917, "logps/chosen": -192.20941162109375, "logps/rejected": -249.57064819335938, "loss": 0.6785, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02337605319917202, "rewards/margins": 0.03376854211091995, "rewards/rejected": -0.05714459717273712, "step": 122 }, { "epoch": 0.1430322255064523, "grad_norm": 74.1794316582206, "learning_rate": 1.9989329748023723e-07, "logits/chosen": -1.6055612564086914, "logits/rejected": -1.6374058723449707, "logps/chosen": -150.5140838623047, "logps/rejected": -178.90463256835938, "loss": 0.6838, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03467298671603203, "rewards/margins": 0.017992481589317322, "rewards/rejected": -0.05266546458005905, "step": 124 }, { "epoch": 0.14533919688558863, "grad_norm": 76.65585990685855, "learning_rate": 1.9987477660433854e-07, "logits/chosen": -1.6969408988952637, "logits/rejected": -1.7563108205795288, "logps/chosen": -142.4885711669922, "logps/rejected": -210.4172821044922, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.024329736828804016, "rewards/margins": 0.004793995060026646, "rewards/rejected": -0.029123730957508087, "step": 126 }, { "epoch": 0.14764616826472496, "grad_norm": 78.42733649878224, "learning_rate": 1.998547753433158e-07, "logits/chosen": -1.6231815814971924, "logits/rejected": -1.4993540048599243, "logps/chosen": -248.5255584716797, "logps/rejected": -283.0628662109375, "loss": 0.6867, "rewards/accuracies": 0.65625, "rewards/chosen": -0.022915348410606384, "rewards/margins": 0.04015136882662773, "rewards/rejected": -0.06306671351194382, "step": 128 }, { "epoch": 0.14995313964386128, "grad_norm": 81.34489884736102, "learning_rate": 1.9983329399363594e-07, "logits/chosen": -1.696123719215393, "logits/rejected": -1.5894306898117065, "logps/chosen": -157.25205993652344, "logps/rejected": -169.28765869140625, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02339826337993145, "rewards/margins": 0.029402071610093117, "rewards/rejected": -0.05280033499002457, "step": 130 }, { "epoch": 0.1522601110229976, "grad_norm": 71.86675386697708, "learning_rate": 1.998103328737044e-07, "logits/chosen": -1.614111304283142, "logits/rejected": -1.668984055519104, "logps/chosen": -169.32870483398438, "logps/rejected": -184.28652954101562, "loss": 0.694, "rewards/accuracies": 0.53125, "rewards/chosen": -0.032653309404850006, "rewards/margins": -0.012304544448852539, "rewards/rejected": -0.020348764955997467, "step": 132 }, { "epoch": 0.15456708240213396, "grad_norm": 81.85224601265395, "learning_rate": 1.9978589232386034e-07, "logits/chosen": -1.715609073638916, "logits/rejected": -1.7786422967910767, "logps/chosen": -167.58688354492188, "logps/rejected": -199.66270446777344, "loss": 0.6864, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03811431676149368, "rewards/margins": 0.017543859779834747, "rewards/rejected": -0.05565817654132843, "step": 134 }, { "epoch": 0.15687405378127028, "grad_norm": 73.7622516118343, "learning_rate": 1.9975997270637168e-07, "logits/chosen": -1.6321560144424438, "logits/rejected": -1.7015608549118042, "logps/chosen": -159.351318359375, "logps/rejected": -176.723876953125, "loss": 0.6859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02714327722787857, "rewards/margins": 0.021138466894626617, "rewards/rejected": -0.04828174412250519, "step": 136 }, { "epoch": 0.1591810251604066, "grad_norm": 71.96760531715911, "learning_rate": 1.997325744054297e-07, "logits/chosen": -1.5530474185943604, "logits/rejected": -1.5373188257217407, "logps/chosen": -158.63812255859375, "logps/rejected": -204.0651092529297, "loss": 0.6845, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01808898150920868, "rewards/margins": 0.017602307721972466, "rewards/rejected": -0.035691291093826294, "step": 138 }, { "epoch": 0.16148799653954293, "grad_norm": 73.93273755038686, "learning_rate": 1.9970369782714328e-07, "logits/chosen": -1.522450566291809, "logits/rejected": -1.635149598121643, "logps/chosen": -142.74459838867188, "logps/rejected": -149.5770721435547, "loss": 0.6894, "rewards/accuracies": 0.4375, "rewards/chosen": -0.025994691997766495, "rewards/margins": -0.006413338705897331, "rewards/rejected": -0.019581351429224014, "step": 140 }, { "epoch": 0.16379496791867926, "grad_norm": 79.55861212361464, "learning_rate": 1.99673343399533e-07, "logits/chosen": -1.525217056274414, "logits/rejected": -1.5952740907669067, "logps/chosen": -116.86170959472656, "logps/rejected": -175.00779724121094, "loss": 0.6832, "rewards/accuracies": 0.59375, "rewards/chosen": -0.020393915474414825, "rewards/margins": 0.032093390822410583, "rewards/rejected": -0.05248731002211571, "step": 142 }, { "epoch": 0.16610193929781558, "grad_norm": 83.97535534931897, "learning_rate": 1.9964151157252466e-07, "logits/chosen": -1.6767423152923584, "logits/rejected": -1.6693997383117676, "logps/chosen": -207.50936889648438, "logps/rejected": -216.3134002685547, "loss": 0.6789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03636579588055611, "rewards/margins": 0.03607642278075218, "rewards/rejected": -0.07244221866130829, "step": 144 }, { "epoch": 0.1684089106769519, "grad_norm": 73.85111113100436, "learning_rate": 1.996082028179428e-07, "logits/chosen": -1.4807971715927124, "logits/rejected": -1.4092496633529663, "logps/chosen": -168.455078125, "logps/rejected": -172.4587860107422, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.06219344958662987, "rewards/margins": 0.011872416362166405, "rewards/rejected": -0.07406586408615112, "step": 146 }, { "epoch": 0.17071588205608823, "grad_norm": 80.66954314511047, "learning_rate": 1.9957341762950344e-07, "logits/chosen": -1.5618644952774048, "logits/rejected": -1.67661452293396, "logps/chosen": -114.58411407470703, "logps/rejected": -158.1700439453125, "loss": 0.6807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04578070342540741, "rewards/margins": 0.029419898986816406, "rewards/rejected": -0.07520060241222382, "step": 148 }, { "epoch": 0.17302285343522458, "grad_norm": 71.43623661516392, "learning_rate": 1.9953715652280706e-07, "logits/chosen": -1.6976016759872437, "logits/rejected": -1.6299835443496704, "logps/chosen": -228.1553497314453, "logps/rejected": -214.32037353515625, "loss": 0.6848, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06526876986026764, "rewards/margins": -0.002617661375552416, "rewards/rejected": -0.06265110522508621, "step": 150 }, { "epoch": 0.1753298248143609, "grad_norm": 85.31030453694517, "learning_rate": 1.9949942003533064e-07, "logits/chosen": -1.7211732864379883, "logits/rejected": -1.720245122909546, "logps/chosen": -138.57936096191406, "logps/rejected": -158.134521484375, "loss": 0.6829, "rewards/accuracies": 0.59375, "rewards/chosen": -0.042685676366090775, "rewards/margins": -0.006229763850569725, "rewards/rejected": -0.0364559069275856, "step": 152 }, { "epoch": 0.17763679619349723, "grad_norm": 75.37656195588123, "learning_rate": 1.9946020872642006e-07, "logits/chosen": -1.602712631225586, "logits/rejected": -1.5105926990509033, "logps/chosen": -152.95616149902344, "logps/rejected": -252.92359924316406, "loss": 0.6848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05936397612094879, "rewards/margins": 0.023188650608062744, "rewards/rejected": -0.08255261927843094, "step": 154 }, { "epoch": 0.17994376757263356, "grad_norm": 74.43256656918146, "learning_rate": 1.9941952317728147e-07, "logits/chosen": -1.6266837120056152, "logits/rejected": -1.5794254541397095, "logps/chosen": -154.70660400390625, "logps/rejected": -171.6692657470703, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": -0.0499938428401947, "rewards/margins": 0.022745870053768158, "rewards/rejected": -0.07273972034454346, "step": 156 }, { "epoch": 0.18225073895176988, "grad_norm": 75.91057680239186, "learning_rate": 1.993773639909728e-07, "logits/chosen": -1.49541437625885, "logits/rejected": -1.6966127157211304, "logps/chosen": -165.41343688964844, "logps/rejected": -208.544189453125, "loss": 0.6768, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02937229909002781, "rewards/margins": 0.061954449862241745, "rewards/rejected": -0.0913267433643341, "step": 158 }, { "epoch": 0.1845577103309062, "grad_norm": 79.53079693796676, "learning_rate": 1.99333731792395e-07, "logits/chosen": -1.5714216232299805, "logits/rejected": -1.543687343597412, "logps/chosen": -153.09767150878906, "logps/rejected": -177.41847229003906, "loss": 0.684, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0551866851747036, "rewards/margins": 0.03179415315389633, "rewards/rejected": -0.08698083460330963, "step": 160 }, { "epoch": 0.18686468171004253, "grad_norm": 73.91686194821585, "learning_rate": 1.9928862722828242e-07, "logits/chosen": -1.7037162780761719, "logits/rejected": -1.675144076347351, "logps/chosen": -153.01358032226562, "logps/rejected": -175.93673706054688, "loss": 0.6781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023145336657762527, "rewards/margins": 0.06261962652206421, "rewards/rejected": -0.08576496690511703, "step": 162 }, { "epoch": 0.18917165308917885, "grad_norm": 76.57698818122466, "learning_rate": 1.9924205096719357e-07, "logits/chosen": -1.5918736457824707, "logits/rejected": -1.4768625497817993, "logps/chosen": -196.1853485107422, "logps/rejected": -179.04530334472656, "loss": 0.6692, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04375737905502319, "rewards/margins": 0.0569818913936615, "rewards/rejected": -0.10073927044868469, "step": 164 }, { "epoch": 0.19147862446831518, "grad_norm": 77.81644602396882, "learning_rate": 1.9919400369950097e-07, "logits/chosen": -1.4722576141357422, "logits/rejected": -1.540255069732666, "logps/chosen": -205.6660614013672, "logps/rejected": -248.9489288330078, "loss": 0.6786, "rewards/accuracies": 0.71875, "rewards/chosen": -0.043852321803569794, "rewards/margins": 0.0406915545463562, "rewards/rejected": -0.084543876349926, "step": 166 }, { "epoch": 0.19378559584745153, "grad_norm": 75.44269350923808, "learning_rate": 1.9914448613738103e-07, "logits/chosen": -1.529039740562439, "logits/rejected": -1.5173804759979248, "logps/chosen": -202.2668914794922, "logps/rejected": -226.52684020996094, "loss": 0.6763, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07004385441541672, "rewards/margins": 0.021601226180791855, "rewards/rejected": -0.09164508432149887, "step": 168 }, { "epoch": 0.19609256722658785, "grad_norm": 76.79086636181425, "learning_rate": 1.9909349901480347e-07, "logits/chosen": -1.610205888748169, "logits/rejected": -1.622270107269287, "logps/chosen": -152.17263793945312, "logps/rejected": -153.09571838378906, "loss": 0.6826, "rewards/accuracies": 0.53125, "rewards/chosen": -0.049044616520404816, "rewards/margins": 0.020372100174427032, "rewards/rejected": -0.06941672414541245, "step": 170 }, { "epoch": 0.19839953860572418, "grad_norm": 65.6162753738054, "learning_rate": 1.990410430875205e-07, "logits/chosen": -1.6482963562011719, "logits/rejected": -1.6161506175994873, "logps/chosen": -131.635986328125, "logps/rejected": -142.7827606201172, "loss": 0.6691, "rewards/accuracies": 0.78125, "rewards/chosen": -0.028512008488178253, "rewards/margins": 0.06365156173706055, "rewards/rejected": -0.0921635702252388, "step": 172 }, { "epoch": 0.2007065099848605, "grad_norm": 90.14268267387868, "learning_rate": 1.9898711913305547e-07, "logits/chosen": -1.5566825866699219, "logits/rejected": -1.6173129081726074, "logps/chosen": -174.24017333984375, "logps/rejected": -181.01629638671875, "loss": 0.6766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05642353743314743, "rewards/margins": 0.03944730758666992, "rewards/rejected": -0.09587083756923676, "step": 174 }, { "epoch": 0.20301348136399683, "grad_norm": 76.41450508388954, "learning_rate": 1.9893172795069142e-07, "logits/chosen": -1.5998440980911255, "logits/rejected": -1.6545708179473877, "logps/chosen": -156.8663330078125, "logps/rejected": -159.1892547607422, "loss": 0.6915, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05183999985456467, "rewards/margins": 0.0032500806264579296, "rewards/rejected": -0.055090077221393585, "step": 176 }, { "epoch": 0.20532045274313315, "grad_norm": 87.56584383747318, "learning_rate": 1.988748703614594e-07, "logits/chosen": -1.6627997159957886, "logits/rejected": -1.6540586948394775, "logps/chosen": -155.84368896484375, "logps/rejected": -186.67495727539062, "loss": 0.6755, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03810984268784523, "rewards/margins": 0.029699210077524185, "rewards/rejected": -0.06780905276536942, "step": 178 }, { "epoch": 0.20762742412226948, "grad_norm": 70.4422512373765, "learning_rate": 1.9881654720812592e-07, "logits/chosen": -1.5361154079437256, "logits/rejected": -1.610466480255127, "logps/chosen": -115.56837463378906, "logps/rejected": -142.57766723632812, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": -0.02763887494802475, "rewards/margins": 0.0444108322262764, "rewards/rejected": -0.07204970717430115, "step": 180 }, { "epoch": 0.2099343955014058, "grad_norm": 71.9817063853092, "learning_rate": 1.9875675935518094e-07, "logits/chosen": -1.547518014907837, "logits/rejected": -1.5500645637512207, "logps/chosen": -226.55401611328125, "logps/rejected": -206.99913024902344, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.09524687379598618, "rewards/margins": 0.004950803238898516, "rewards/rejected": -0.1001976728439331, "step": 182 }, { "epoch": 0.21224136688054213, "grad_norm": 84.4720159255109, "learning_rate": 1.9869550768882454e-07, "logits/chosen": -1.5599523782730103, "logits/rejected": -1.5133187770843506, "logps/chosen": -182.1778564453125, "logps/rejected": -241.0075225830078, "loss": 0.6671, "rewards/accuracies": 0.6875, "rewards/chosen": -0.057929787784814835, "rewards/margins": 0.0736684501171112, "rewards/rejected": -0.13159823417663574, "step": 184 }, { "epoch": 0.21454833825967848, "grad_norm": 73.76638464490958, "learning_rate": 1.9863279311695428e-07, "logits/chosen": -1.4902362823486328, "logits/rejected": -1.55423903465271, "logps/chosen": -219.845703125, "logps/rejected": -273.73712158203125, "loss": 0.6773, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07643218338489532, "rewards/margins": 0.07768993079662323, "rewards/rejected": -0.15412212908267975, "step": 186 }, { "epoch": 0.2168553096388148, "grad_norm": 68.86254513199266, "learning_rate": 1.985686165691514e-07, "logits/chosen": -1.704699993133545, "logits/rejected": -1.6342915296554565, "logps/chosen": -120.14341735839844, "logps/rejected": -114.1607666015625, "loss": 0.6819, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03834118694067001, "rewards/margins": -0.00312834233045578, "rewards/rejected": -0.03521284461021423, "step": 188 }, { "epoch": 0.21916228101795113, "grad_norm": 77.39475970610044, "learning_rate": 1.9850297899666707e-07, "logits/chosen": -1.6166346073150635, "logits/rejected": -1.666224479675293, "logps/chosen": -138.47250366210938, "logps/rejected": -183.75860595703125, "loss": 0.6784, "rewards/accuracies": 0.5625, "rewards/chosen": -0.059065092355012894, "rewards/margins": 0.0348266139626503, "rewards/rejected": -0.09389171749353409, "step": 190 }, { "epoch": 0.22146925239708745, "grad_norm": 79.62622809714712, "learning_rate": 1.9843588137240855e-07, "logits/chosen": -1.4786595106124878, "logits/rejected": -1.5819900035858154, "logps/chosen": -156.80630493164062, "logps/rejected": -225.65887451171875, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": -0.06785481423139572, "rewards/margins": 0.046741731464862823, "rewards/rejected": -0.11459654569625854, "step": 192 }, { "epoch": 0.22377622377622378, "grad_norm": 71.61908905732174, "learning_rate": 1.9836732469092446e-07, "logits/chosen": -1.7382750511169434, "logits/rejected": -1.7238702774047852, "logps/chosen": -135.97625732421875, "logps/rejected": -134.9537353515625, "loss": 0.6751, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07399953901767731, "rewards/margins": -0.0150267593562603, "rewards/rejected": -0.05897277966141701, "step": 194 }, { "epoch": 0.2260831951553601, "grad_norm": 77.39998086488785, "learning_rate": 1.982973099683902e-07, "logits/chosen": -1.6806734800338745, "logits/rejected": -1.7225916385650635, "logps/chosen": -139.36279296875, "logps/rejected": -160.1461639404297, "loss": 0.6592, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05896836146712303, "rewards/margins": 0.05167616903781891, "rewards/rejected": -0.11064451932907104, "step": 196 }, { "epoch": 0.22839016653449642, "grad_norm": 71.42176978474515, "learning_rate": 1.982258382425928e-07, "logits/chosen": -1.53923499584198, "logits/rejected": -1.5509108304977417, "logps/chosen": -145.9883575439453, "logps/rejected": -173.95388793945312, "loss": 0.6819, "rewards/accuracies": 0.75, "rewards/chosen": -0.06814471632242203, "rewards/margins": 0.05520961806178093, "rewards/rejected": -0.12335430830717087, "step": 198 }, { "epoch": 0.23069713791363275, "grad_norm": 65.61388633290626, "learning_rate": 1.9815291057291578e-07, "logits/chosen": -1.5758477449417114, "logits/rejected": -1.6140058040618896, "logps/chosen": -105.85581970214844, "logps/rejected": -122.97176361083984, "loss": 0.679, "rewards/accuracies": 0.625, "rewards/chosen": -0.06785964965820312, "rewards/margins": 0.01952926628291607, "rewards/rejected": -0.08738891780376434, "step": 200 }, { "epoch": 0.23069713791363275, "eval_logits/chosen": -1.5610222816467285, "eval_logits/rejected": -1.462950587272644, "eval_logps/chosen": -186.2912139892578, "eval_logps/rejected": -151.6300048828125, "eval_loss": 0.6922155618667603, "eval_rewards/accuracies": 0.5600000023841858, "eval_rewards/chosen": -0.11090204119682312, "eval_rewards/margins": 0.0005596327828243375, "eval_rewards/rejected": -0.11146167665719986, "eval_runtime": 21.7555, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 200 }, { "epoch": 0.23300410929276907, "grad_norm": 69.78727065961301, "learning_rate": 1.9807852804032302e-07, "logits/chosen": -1.4734337329864502, "logits/rejected": -1.491389513015747, "logps/chosen": -154.9561767578125, "logps/rejected": -204.73797607421875, "loss": 0.6705, "rewards/accuracies": 0.71875, "rewards/chosen": -0.046838290989398956, "rewards/margins": 0.11147616803646088, "rewards/rejected": -0.15831446647644043, "step": 202 }, { "epoch": 0.23531108067190543, "grad_norm": 82.59595988703826, "learning_rate": 1.980026917473432e-07, "logits/chosen": -1.5889283418655396, "logits/rejected": -1.7049566507339478, "logps/chosen": -174.74598693847656, "logps/rejected": -223.11842346191406, "loss": 0.6737, "rewards/accuracies": 0.75, "rewards/chosen": -0.05181250721216202, "rewards/margins": 0.09021350741386414, "rewards/rejected": -0.14202602207660675, "step": 204 }, { "epoch": 0.23761805205104175, "grad_norm": 67.38723201323596, "learning_rate": 1.9792540281805298e-07, "logits/chosen": -1.4892499446868896, "logits/rejected": -1.517817497253418, "logps/chosen": -140.6378631591797, "logps/rejected": -161.47348022460938, "loss": 0.6682, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08649797737598419, "rewards/margins": 0.03692768141627312, "rewards/rejected": -0.12342565506696701, "step": 206 }, { "epoch": 0.23992502343017807, "grad_norm": 73.06374890842753, "learning_rate": 1.9784666239806089e-07, "logits/chosen": -1.5101206302642822, "logits/rejected": -1.5768136978149414, "logps/chosen": -164.27035522460938, "logps/rejected": -203.25975036621094, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": -0.06570874899625778, "rewards/margins": 0.09578941017389297, "rewards/rejected": -0.16149815917015076, "step": 208 }, { "epoch": 0.2422319948093144, "grad_norm": 75.60130708995507, "learning_rate": 1.9776647165448983e-07, "logits/chosen": -1.5699687004089355, "logits/rejected": -1.520723581314087, "logps/chosen": -188.6508331298828, "logps/rejected": -217.18365478515625, "loss": 0.6708, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08720759302377701, "rewards/margins": 0.02055392973124981, "rewards/rejected": -0.10776151716709137, "step": 210 }, { "epoch": 0.24453896618845072, "grad_norm": 76.08847895651103, "learning_rate": 1.9768483177596006e-07, "logits/chosen": -1.5900119543075562, "logits/rejected": -1.6237448453903198, "logps/chosen": -143.37664794921875, "logps/rejected": -166.52413940429688, "loss": 0.6689, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0496542751789093, "rewards/margins": 0.06798863410949707, "rewards/rejected": -0.11764290928840637, "step": 212 }, { "epoch": 0.24684593756758705, "grad_norm": 85.55002048517719, "learning_rate": 1.9760174397257153e-07, "logits/chosen": -1.5799341201782227, "logits/rejected": -1.5739325284957886, "logps/chosen": -187.42578125, "logps/rejected": -227.91946411132812, "loss": 0.6871, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11932831257581711, "rewards/margins": 0.00022871512919664383, "rewards/rejected": -0.11955701559782028, "step": 214 }, { "epoch": 0.24915290894672337, "grad_norm": 79.3908422125236, "learning_rate": 1.97517209475886e-07, "logits/chosen": -1.578735589981079, "logits/rejected": -1.7003827095031738, "logps/chosen": -147.41778564453125, "logps/rejected": -185.31602478027344, "loss": 0.6678, "rewards/accuracies": 0.78125, "rewards/chosen": -0.07496561855077744, "rewards/margins": 0.0857120081782341, "rewards/rejected": -0.16067762672901154, "step": 216 }, { "epoch": 0.2514598803258597, "grad_norm": 78.5386783375159, "learning_rate": 1.9743122953890854e-07, "logits/chosen": -1.5871162414550781, "logits/rejected": -1.5231672525405884, "logps/chosen": -174.3480682373047, "logps/rejected": -196.07998657226562, "loss": 0.6548, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07399033010005951, "rewards/margins": 0.05520808696746826, "rewards/rejected": -0.12919840216636658, "step": 218 }, { "epoch": 0.253766851704996, "grad_norm": 80.06682238716625, "learning_rate": 1.9734380543606927e-07, "logits/chosen": -1.643662452697754, "logits/rejected": -1.6430741548538208, "logps/chosen": -200.37826538085938, "logps/rejected": -207.41136169433594, "loss": 0.6815, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07183612138032913, "rewards/margins": 0.07066242396831512, "rewards/rejected": -0.14249853789806366, "step": 220 }, { "epoch": 0.25607382308413235, "grad_norm": 73.47957521005397, "learning_rate": 1.972549384632043e-07, "logits/chosen": -1.5852243900299072, "logits/rejected": -1.736151099205017, "logps/chosen": -167.52194213867188, "logps/rejected": -218.90122985839844, "loss": 0.6604, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0951998233795166, "rewards/margins": 0.027591748163104057, "rewards/rejected": -0.12279157340526581, "step": 222 }, { "epoch": 0.25838079446326867, "grad_norm": 79.05067580811021, "learning_rate": 1.9716462993753655e-07, "logits/chosen": -1.476207971572876, "logits/rejected": -1.5456207990646362, "logps/chosen": -288.57379150390625, "logps/rejected": -338.8498840332031, "loss": 0.6567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17371979355812073, "rewards/margins": 0.15009327232837677, "rewards/rejected": -0.3238130807876587, "step": 224 }, { "epoch": 0.260687765842405, "grad_norm": 67.9414989656304, "learning_rate": 1.9707288119765622e-07, "logits/chosen": -1.5781480073928833, "logits/rejected": -1.569219708442688, "logps/chosen": -124.80656433105469, "logps/rejected": -141.52476501464844, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": -0.12547817826271057, "rewards/margins": 0.03562304750084877, "rewards/rejected": -0.16110120713710785, "step": 226 }, { "epoch": 0.2629947372215413, "grad_norm": 78.11670530443735, "learning_rate": 1.9697969360350095e-07, "logits/chosen": -1.6346409320831299, "logits/rejected": -1.565224051475525, "logps/chosen": -178.9912109375, "logps/rejected": -190.82681274414062, "loss": 0.6661, "rewards/accuracies": 0.625, "rewards/chosen": -0.10180149972438812, "rewards/margins": 0.05022910237312317, "rewards/rejected": -0.1520306020975113, "step": 228 }, { "epoch": 0.2653017086006777, "grad_norm": 68.3329236115507, "learning_rate": 1.968850685363357e-07, "logits/chosen": -1.7000384330749512, "logits/rejected": -1.7287462949752808, "logps/chosen": -199.75430297851562, "logps/rejected": -241.5220947265625, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": -0.09640266001224518, "rewards/margins": 0.09287622570991516, "rewards/rejected": -0.18927887082099915, "step": 230 }, { "epoch": 0.267608679979814, "grad_norm": 82.26094651176318, "learning_rate": 1.9678900739873226e-07, "logits/chosen": -1.677142858505249, "logits/rejected": -1.6745737791061401, "logps/chosen": -170.59425354003906, "logps/rejected": -181.05661010742188, "loss": 0.6694, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1134408637881279, "rewards/margins": 0.036339692771434784, "rewards/rejected": -0.14978057146072388, "step": 232 }, { "epoch": 0.26991565135895035, "grad_norm": 78.36181831181821, "learning_rate": 1.966915116145484e-07, "logits/chosen": -1.4915921688079834, "logits/rejected": -1.523095726966858, "logps/chosen": -155.88290405273438, "logps/rejected": -164.45022583007812, "loss": 0.6567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0988837480545044, "rewards/margins": 0.09765380620956421, "rewards/rejected": -0.196537584066391, "step": 234 }, { "epoch": 0.2722226227380867, "grad_norm": 83.64468364737313, "learning_rate": 1.965925826289068e-07, "logits/chosen": -1.6482906341552734, "logits/rejected": -1.6469372510910034, "logps/chosen": -185.45001220703125, "logps/rejected": -208.0437469482422, "loss": 0.6708, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0873761773109436, "rewards/margins": 0.06748253107070923, "rewards/rejected": -0.15485870838165283, "step": 236 }, { "epoch": 0.274529594117223, "grad_norm": 74.16820675500993, "learning_rate": 1.964922219081738e-07, "logits/chosen": -1.764983057975769, "logits/rejected": -1.7145969867706299, "logps/chosen": -223.3017578125, "logps/rejected": -218.1916046142578, "loss": 0.6555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11543229222297668, "rewards/margins": 0.12305162101984024, "rewards/rejected": -0.23848390579223633, "step": 238 }, { "epoch": 0.2768365654963593, "grad_norm": 75.00121800473413, "learning_rate": 1.9639043093993727e-07, "logits/chosen": -1.5264173746109009, "logits/rejected": -1.4717910289764404, "logps/chosen": -178.43338012695312, "logps/rejected": -188.60101318359375, "loss": 0.6481, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09335748851299286, "rewards/margins": 0.026005972176790237, "rewards/rejected": -0.1193634569644928, "step": 240 }, { "epoch": 0.27914353687549565, "grad_norm": 64.46972140040022, "learning_rate": 1.9628721123298492e-07, "logits/chosen": -1.6837042570114136, "logits/rejected": -1.6980068683624268, "logps/chosen": -161.4723663330078, "logps/rejected": -171.20248413085938, "loss": 0.6609, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11237211525440216, "rewards/margins": 0.049555521458387375, "rewards/rejected": -0.16192764043807983, "step": 242 }, { "epoch": 0.28145050825463197, "grad_norm": 66.85001786944659, "learning_rate": 1.961825643172819e-07, "logits/chosen": -1.5771496295928955, "logits/rejected": -1.5039366483688354, "logps/chosen": -158.33685302734375, "logps/rejected": -160.24057006835938, "loss": 0.6701, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14522022008895874, "rewards/margins": 0.04340605437755585, "rewards/rejected": -0.1886262595653534, "step": 244 }, { "epoch": 0.2837574796337683, "grad_norm": 76.39303352760503, "learning_rate": 1.9607649174394787e-07, "logits/chosen": -1.4101349115371704, "logits/rejected": -1.4513871669769287, "logps/chosen": -147.43826293945312, "logps/rejected": -182.31005859375, "loss": 0.6596, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08130650967359543, "rewards/margins": 0.0953046903014183, "rewards/rejected": -0.17661119997501373, "step": 246 }, { "epoch": 0.2860644510129046, "grad_norm": 84.03579410656208, "learning_rate": 1.959689950852343e-07, "logits/chosen": -1.6520403623580933, "logits/rejected": -1.6739228963851929, "logps/chosen": -172.19305419921875, "logps/rejected": -184.803466796875, "loss": 0.6669, "rewards/accuracies": 0.53125, "rewards/chosen": -0.16027307510375977, "rewards/margins": 0.02343956008553505, "rewards/rejected": -0.1837126612663269, "step": 248 }, { "epoch": 0.28837142239204094, "grad_norm": 78.31280460476106, "learning_rate": 1.9586007593450095e-07, "logits/chosen": -1.568188190460205, "logits/rejected": -1.586582064628601, "logps/chosen": -169.95675659179688, "logps/rejected": -188.78858947753906, "loss": 0.6779, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1373595893383026, "rewards/margins": 0.02867070771753788, "rewards/rejected": -0.16603030264377594, "step": 250 }, { "epoch": 0.29067839377117727, "grad_norm": 77.82835801736759, "learning_rate": 1.957497359061924e-07, "logits/chosen": -1.5796047449111938, "logits/rejected": -1.5543608665466309, "logps/chosen": -191.53219604492188, "logps/rejected": -220.70361328125, "loss": 0.6393, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15552642941474915, "rewards/margins": 0.09192191064357758, "rewards/rejected": -0.24744835495948792, "step": 252 }, { "epoch": 0.2929853651503136, "grad_norm": 81.6835137198976, "learning_rate": 1.956379766358141e-07, "logits/chosen": -1.5779876708984375, "logits/rejected": -1.504298448562622, "logps/chosen": -218.59942626953125, "logps/rejected": -230.2102813720703, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": -0.14433127641677856, "rewards/margins": 0.08938172459602356, "rewards/rejected": -0.23371298611164093, "step": 254 }, { "epoch": 0.2952923365294499, "grad_norm": 74.03670184892391, "learning_rate": 1.9552479977990798e-07, "logits/chosen": -1.6765474081039429, "logits/rejected": -1.643741488456726, "logps/chosen": -185.69444274902344, "logps/rejected": -199.7008819580078, "loss": 0.676, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14419934153556824, "rewards/margins": 0.016861233860254288, "rewards/rejected": -0.16106057167053223, "step": 256 }, { "epoch": 0.29759930790858624, "grad_norm": 79.12914884219855, "learning_rate": 1.954102070160281e-07, "logits/chosen": -1.6632733345031738, "logits/rejected": -1.6073827743530273, "logps/chosen": -149.79641723632812, "logps/rejected": -174.7237091064453, "loss": 0.6638, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10197722166776657, "rewards/margins": 0.07854845374822617, "rewards/rejected": -0.18052567541599274, "step": 258 }, { "epoch": 0.29990627928772257, "grad_norm": 80.451175401901, "learning_rate": 1.9529420004271567e-07, "logits/chosen": -1.5313125848770142, "logits/rejected": -1.5560095310211182, "logps/chosen": -207.1497802734375, "logps/rejected": -222.1211395263672, "loss": 0.6407, "rewards/accuracies": 0.625, "rewards/chosen": -0.16675114631652832, "rewards/margins": 0.1034877672791481, "rewards/rejected": -0.2702389061450958, "step": 260 }, { "epoch": 0.3022132506668589, "grad_norm": 66.85531080853532, "learning_rate": 1.9517678057947382e-07, "logits/chosen": -1.6430004835128784, "logits/rejected": -1.597357153892517, "logps/chosen": -135.1138153076172, "logps/rejected": -132.63619995117188, "loss": 0.6618, "rewards/accuracies": 0.625, "rewards/chosen": -0.11752544343471527, "rewards/margins": 0.03515633940696716, "rewards/rejected": -0.15268178284168243, "step": 262 }, { "epoch": 0.3045202220459952, "grad_norm": 80.22448615221649, "learning_rate": 1.9505795036674232e-07, "logits/chosen": -1.6319184303283691, "logits/rejected": -1.4991899728775024, "logps/chosen": -217.16680908203125, "logps/rejected": -245.2107696533203, "loss": 0.6523, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18451935052871704, "rewards/margins": 0.11840308457612991, "rewards/rejected": -0.30292242765426636, "step": 264 }, { "epoch": 0.3068271934251316, "grad_norm": 69.95576974969472, "learning_rate": 1.9493771116587156e-07, "logits/chosen": -1.5522364377975464, "logits/rejected": -1.5948469638824463, "logps/chosen": -113.81831359863281, "logps/rejected": -155.99346923828125, "loss": 0.6551, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08541279286146164, "rewards/margins": 0.14459526538848877, "rewards/rejected": -0.230008065700531, "step": 266 }, { "epoch": 0.3091341648042679, "grad_norm": 75.25744246014891, "learning_rate": 1.9481606475909656e-07, "logits/chosen": -1.500025749206543, "logits/rejected": -1.5494239330291748, "logps/chosen": -125.84722900390625, "logps/rejected": -164.76669311523438, "loss": 0.6526, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08980143815279007, "rewards/margins": 0.17133310437202454, "rewards/rejected": -0.2611345648765564, "step": 268 }, { "epoch": 0.31144113618340424, "grad_norm": 77.53434606640313, "learning_rate": 1.9469301294951057e-07, "logits/chosen": -1.6267601251602173, "logits/rejected": -1.5587116479873657, "logps/chosen": -172.08139038085938, "logps/rejected": -181.32717895507812, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -0.1692037582397461, "rewards/margins": 0.051171936094760895, "rewards/rejected": -0.2203756868839264, "step": 270 }, { "epoch": 0.31374810756254057, "grad_norm": 74.84897771580975, "learning_rate": 1.9456855756103816e-07, "logits/chosen": -1.5624661445617676, "logits/rejected": -1.6530312299728394, "logps/chosen": -147.84597778320312, "logps/rejected": -174.6589813232422, "loss": 0.6707, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13743659853935242, "rewards/margins": 0.05220307409763336, "rewards/rejected": -0.18963965773582458, "step": 272 }, { "epoch": 0.3160550789416769, "grad_norm": 71.99044362933952, "learning_rate": 1.9444270043840852e-07, "logits/chosen": -1.6625701189041138, "logits/rejected": -1.5914949178695679, "logps/chosen": -147.29147338867188, "logps/rejected": -129.6570587158203, "loss": 0.6831, "rewards/accuracies": 0.46875, "rewards/chosen": -0.21893730759620667, "rewards/margins": -0.0283275805413723, "rewards/rejected": -0.19060972332954407, "step": 274 }, { "epoch": 0.3183620503208132, "grad_norm": 75.21160091684173, "learning_rate": 1.9431544344712772e-07, "logits/chosen": -1.4378788471221924, "logits/rejected": -1.3864963054656982, "logps/chosen": -147.2783660888672, "logps/rejected": -177.4646759033203, "loss": 0.6472, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11688334494829178, "rewards/margins": 0.11695411056280136, "rewards/rejected": -0.23383745551109314, "step": 276 }, { "epoch": 0.32066902169994954, "grad_norm": 72.07898599790276, "learning_rate": 1.9418678847345146e-07, "logits/chosen": -1.5210872888565063, "logits/rejected": -1.5768458843231201, "logps/chosen": -164.58419799804688, "logps/rejected": -213.6575469970703, "loss": 0.6664, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12316928803920746, "rewards/margins": 0.07401876151561737, "rewards/rejected": -0.19718804955482483, "step": 278 }, { "epoch": 0.32297599307908587, "grad_norm": 67.17404804695744, "learning_rate": 1.9405673742435676e-07, "logits/chosen": -1.5087511539459229, "logits/rejected": -1.5612874031066895, "logps/chosen": -142.5220947265625, "logps/rejected": -195.3551483154297, "loss": 0.6718, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14393991231918335, "rewards/margins": 0.11825156211853027, "rewards/rejected": -0.2621914744377136, "step": 280 }, { "epoch": 0.3252829644582222, "grad_norm": 81.75237089659649, "learning_rate": 1.939252922275139e-07, "logits/chosen": -1.6113684177398682, "logits/rejected": -1.520400047302246, "logps/chosen": -215.8910675048828, "logps/rejected": -227.26637268066406, "loss": 0.6556, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2654890716075897, "rewards/margins": 0.09542025625705719, "rewards/rejected": -0.3609093129634857, "step": 282 }, { "epoch": 0.3275899358373585, "grad_norm": 65.02297736065502, "learning_rate": 1.937924548312578e-07, "logits/chosen": -1.6812703609466553, "logits/rejected": -1.7281326055526733, "logps/chosen": -130.5011749267578, "logps/rejected": -195.49452209472656, "loss": 0.6431, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12183240056037903, "rewards/margins": 0.12844915688037872, "rewards/rejected": -0.25028154253959656, "step": 284 }, { "epoch": 0.32989690721649484, "grad_norm": 75.13263031113792, "learning_rate": 1.9365822720455912e-07, "logits/chosen": -1.4847445487976074, "logits/rejected": -1.4161133766174316, "logps/chosen": -154.5245361328125, "logps/rejected": -203.3861541748047, "loss": 0.6537, "rewards/accuracies": 0.53125, "rewards/chosen": -0.16228517889976501, "rewards/margins": 0.12002203613519669, "rewards/rejected": -0.2823072075843811, "step": 286 }, { "epoch": 0.33220387859563116, "grad_norm": 78.41024724428831, "learning_rate": 1.935226113369951e-07, "logits/chosen": -1.686346173286438, "logits/rejected": -1.6542606353759766, "logps/chosen": -172.25059509277344, "logps/rejected": -199.93182373046875, "loss": 0.6469, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12491661310195923, "rewards/margins": 0.12406705319881439, "rewards/rejected": -0.24898366630077362, "step": 288 }, { "epoch": 0.3345108499747675, "grad_norm": 74.32689822052723, "learning_rate": 1.9338560923872006e-07, "logits/chosen": -1.5119750499725342, "logits/rejected": -1.524541974067688, "logps/chosen": -159.21376037597656, "logps/rejected": -237.09561157226562, "loss": 0.6455, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1832338273525238, "rewards/margins": 0.2291288673877716, "rewards/rejected": -0.4123626947402954, "step": 290 }, { "epoch": 0.3368178213539038, "grad_norm": 77.04623177811982, "learning_rate": 1.9324722294043556e-07, "logits/chosen": -1.6212831735610962, "logits/rejected": -1.5947524309158325, "logps/chosen": -187.361572265625, "logps/rejected": -187.34519958496094, "loss": 0.6585, "rewards/accuracies": 0.59375, "rewards/chosen": -0.24990221858024597, "rewards/margins": 0.07834864407777786, "rewards/rejected": -0.3282508850097656, "step": 292 }, { "epoch": 0.33912479273304014, "grad_norm": 83.55231847560428, "learning_rate": 1.9310745449336044e-07, "logits/chosen": -1.58076012134552, "logits/rejected": -1.5445674657821655, "logps/chosen": -192.48617553710938, "logps/rejected": -215.64193725585938, "loss": 0.6418, "rewards/accuracies": 0.75, "rewards/chosen": -0.1813565194606781, "rewards/margins": 0.1242499127984047, "rewards/rejected": -0.3056064546108246, "step": 294 }, { "epoch": 0.34143176411217646, "grad_norm": 73.20320061572161, "learning_rate": 1.929663059692002e-07, "logits/chosen": -1.477115273475647, "logits/rejected": -1.5140092372894287, "logps/chosen": -154.4539794921875, "logps/rejected": -214.9960174560547, "loss": 0.6894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2464270293712616, "rewards/margins": 0.08608925342559814, "rewards/rejected": -0.33251628279685974, "step": 296 }, { "epoch": 0.3437387354913128, "grad_norm": 82.85464536249332, "learning_rate": 1.928237794601165e-07, "logits/chosen": -1.5687949657440186, "logits/rejected": -1.6849851608276367, "logps/chosen": -140.14784240722656, "logps/rejected": -234.17706298828125, "loss": 0.6525, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1264043152332306, "rewards/margins": 0.24397864937782288, "rewards/rejected": -0.3703829348087311, "step": 298 }, { "epoch": 0.34604570687044917, "grad_norm": 65.65777237412837, "learning_rate": 1.9267987707869604e-07, "logits/chosen": -1.4391192197799683, "logits/rejected": -1.4724018573760986, "logps/chosen": -153.69284057617188, "logps/rejected": -173.3372039794922, "loss": 0.6486, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16697266697883606, "rewards/margins": 0.13511566817760468, "rewards/rejected": -0.30208835005760193, "step": 300 }, { "epoch": 0.34604570687044917, "eval_logits/chosen": -1.5305781364440918, "eval_logits/rejected": -1.4347938299179077, "eval_logps/chosen": -187.96263122558594, "eval_logps/rejected": -153.34820556640625, "eval_loss": 0.678679347038269, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -0.278046578168869, "eval_rewards/margins": 0.005234198644757271, "eval_rewards/rejected": -0.28328076004981995, "eval_runtime": 21.7114, "eval_samples_per_second": 4.606, "eval_steps_per_second": 1.151, "step": 300 }, { "epoch": 0.3483526782495855, "grad_norm": 69.96196416042814, "learning_rate": 1.9253460095791922e-07, "logits/chosen": -1.5020473003387451, "logits/rejected": -1.4953689575195312, "logps/chosen": -106.53646087646484, "logps/rejected": -165.1669158935547, "loss": 0.6546, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15904603898525238, "rewards/margins": 0.06554871797561646, "rewards/rejected": -0.22459478676319122, "step": 302 }, { "epoch": 0.3506596496287218, "grad_norm": 74.69729400373957, "learning_rate": 1.9238795325112868e-07, "logits/chosen": -1.636529803276062, "logits/rejected": -1.6348826885223389, "logps/chosen": -140.86441040039062, "logps/rejected": -174.48370361328125, "loss": 0.6433, "rewards/accuracies": 0.84375, "rewards/chosen": -0.12615619599819183, "rewards/margins": 0.20733490586280823, "rewards/rejected": -0.3334910571575165, "step": 304 }, { "epoch": 0.35296662100785814, "grad_norm": 84.17293540044481, "learning_rate": 1.9223993613199713e-07, "logits/chosen": -1.6913816928863525, "logits/rejected": -1.6646835803985596, "logps/chosen": -152.25997924804688, "logps/rejected": -171.05575561523438, "loss": 0.6514, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11823489516973495, "rewards/margins": 0.18948128819465637, "rewards/rejected": -0.3077161908149719, "step": 306 }, { "epoch": 0.35527359238699446, "grad_norm": 83.6870493511653, "learning_rate": 1.9209055179449537e-07, "logits/chosen": -1.517793893814087, "logits/rejected": -1.6404225826263428, "logps/chosen": -91.36832427978516, "logps/rejected": -134.06529235839844, "loss": 0.6551, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10601670295000076, "rewards/margins": 0.14076808094978333, "rewards/rejected": -0.24678479135036469, "step": 308 }, { "epoch": 0.3575805637661308, "grad_norm": 64.57674968550867, "learning_rate": 1.9193980245285966e-07, "logits/chosen": -1.4689788818359375, "logits/rejected": -1.3954423666000366, "logps/chosen": -143.7101287841797, "logps/rejected": -169.8336181640625, "loss": 0.6402, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16834121942520142, "rewards/margins": 0.08874449878931046, "rewards/rejected": -0.25708574056625366, "step": 310 }, { "epoch": 0.3598875351452671, "grad_norm": 81.4185321584637, "learning_rate": 1.9178769034155887e-07, "logits/chosen": -1.6560229063034058, "logits/rejected": -1.7177590131759644, "logps/chosen": -144.23033142089844, "logps/rejected": -166.01162719726562, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19495287537574768, "rewards/margins": 0.08614547550678253, "rewards/rejected": -0.281098335981369, "step": 312 }, { "epoch": 0.36219450652440344, "grad_norm": 70.47869326950462, "learning_rate": 1.9163421771526151e-07, "logits/chosen": -1.5131672620773315, "logits/rejected": -1.548357367515564, "logps/chosen": -146.3427734375, "logps/rejected": -159.85092163085938, "loss": 0.6536, "rewards/accuracies": 0.75, "rewards/chosen": -0.1731819212436676, "rewards/margins": 0.1254611313343048, "rewards/rejected": -0.29864302277565, "step": 314 }, { "epoch": 0.36450147790353976, "grad_norm": 79.69549984021036, "learning_rate": 1.914793868488021e-07, "logits/chosen": -1.512197732925415, "logits/rejected": -1.4396047592163086, "logps/chosen": -97.64339447021484, "logps/rejected": -117.3057632446289, "loss": 0.6579, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1673259437084198, "rewards/margins": 0.045555103570222855, "rewards/rejected": -0.21288102865219116, "step": 316 }, { "epoch": 0.3668084492826761, "grad_norm": 82.99383875929993, "learning_rate": 1.9132320003714754e-07, "logits/chosen": -1.5376619100570679, "logits/rejected": -1.5551142692565918, "logps/chosen": -207.0707244873047, "logps/rejected": -242.56712341308594, "loss": 0.6439, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24572816491127014, "rewards/margins": 0.16944444179534912, "rewards/rejected": -0.41517263650894165, "step": 318 }, { "epoch": 0.3691154206618124, "grad_norm": 78.2099765504223, "learning_rate": 1.9116565959536327e-07, "logits/chosen": -1.4779236316680908, "logits/rejected": -1.4861027002334595, "logps/chosen": -193.60748291015625, "logps/rejected": -232.04690551757812, "loss": 0.6534, "rewards/accuracies": 0.625, "rewards/chosen": -0.16232052445411682, "rewards/margins": 0.13388732075691223, "rewards/rejected": -0.29620781540870667, "step": 320 }, { "epoch": 0.37142239204094873, "grad_norm": 74.80406821040707, "learning_rate": 1.9100676785857857e-07, "logits/chosen": -1.6256941556930542, "logits/rejected": -1.5659886598587036, "logps/chosen": -170.6388702392578, "logps/rejected": -198.07733154296875, "loss": 0.6395, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17732584476470947, "rewards/margins": 0.1462487280368805, "rewards/rejected": -0.32357457280158997, "step": 322 }, { "epoch": 0.37372936342008506, "grad_norm": 81.93843569632895, "learning_rate": 1.9084652718195236e-07, "logits/chosen": -1.5257925987243652, "logits/rejected": -1.4617056846618652, "logps/chosen": -208.795166015625, "logps/rejected": -243.7969970703125, "loss": 0.6648, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2373400181531906, "rewards/margins": 0.16046729683876038, "rewards/rejected": -0.3978073298931122, "step": 324 }, { "epoch": 0.3760363347992214, "grad_norm": 68.63199696676665, "learning_rate": 1.9068493994063798e-07, "logits/chosen": -1.4899076223373413, "logits/rejected": -1.5616645812988281, "logps/chosen": -133.66110229492188, "logps/rejected": -236.15924072265625, "loss": 0.6245, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15444569289684296, "rewards/margins": 0.2277567982673645, "rewards/rejected": -0.38220247626304626, "step": 326 }, { "epoch": 0.3783433061783577, "grad_norm": 77.96696778978115, "learning_rate": 1.905220085297482e-07, "logits/chosen": -1.5441091060638428, "logits/rejected": -1.6405153274536133, "logps/chosen": -204.56991577148438, "logps/rejected": -610.9658203125, "loss": 0.6369, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25125816464424133, "rewards/margins": 0.27758753299713135, "rewards/rejected": -0.5288456678390503, "step": 328 }, { "epoch": 0.38065027755749403, "grad_norm": 70.94819657566394, "learning_rate": 1.9035773536431955e-07, "logits/chosen": -1.5916917324066162, "logits/rejected": -1.529220461845398, "logps/chosen": -137.5714111328125, "logps/rejected": -160.11544799804688, "loss": 0.628, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20854628086090088, "rewards/margins": 0.11146115511655807, "rewards/rejected": -0.32000741362571716, "step": 330 }, { "epoch": 0.38295724893663036, "grad_norm": 74.31467840644032, "learning_rate": 1.901921228792766e-07, "logits/chosen": -1.5668599605560303, "logits/rejected": -1.6017038822174072, "logps/chosen": -253.0677947998047, "logps/rejected": -266.9024658203125, "loss": 0.6419, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2701232433319092, "rewards/margins": 0.1171327605843544, "rewards/rejected": -0.387255996465683, "step": 332 }, { "epoch": 0.3852642203157667, "grad_norm": 80.19418315617096, "learning_rate": 1.9002517352939596e-07, "logits/chosen": -1.538657784461975, "logits/rejected": -1.4902359247207642, "logps/chosen": -151.844482421875, "logps/rejected": -182.43423461914062, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": -0.20499791204929352, "rewards/margins": 0.14708584547042847, "rewards/rejected": -0.3520837724208832, "step": 334 }, { "epoch": 0.38757119169490306, "grad_norm": 78.45881437768317, "learning_rate": 1.898568897892697e-07, "logits/chosen": -1.502273440361023, "logits/rejected": -1.567176342010498, "logps/chosen": -149.17568969726562, "logps/rejected": -218.93869018554688, "loss": 0.6324, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21270516514778137, "rewards/margins": 0.24096481502056122, "rewards/rejected": -0.4536699950695038, "step": 336 }, { "epoch": 0.3898781630740394, "grad_norm": 69.72871536048268, "learning_rate": 1.8968727415326882e-07, "logits/chosen": -1.595134973526001, "logits/rejected": -1.6751508712768555, "logps/chosen": -112.13485717773438, "logps/rejected": -138.27838134765625, "loss": 0.6302, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11406655609607697, "rewards/margins": 0.13377144932746887, "rewards/rejected": -0.24783800542354584, "step": 338 }, { "epoch": 0.3921851344531757, "grad_norm": 66.47735099680594, "learning_rate": 1.8951632913550623e-07, "logits/chosen": -1.6112767457962036, "logits/rejected": -1.5350615978240967, "logps/chosen": -212.4505615234375, "logps/rejected": -239.0753173828125, "loss": 0.621, "rewards/accuracies": 0.625, "rewards/chosen": -0.12918683886528015, "rewards/margins": 0.254965603351593, "rewards/rejected": -0.3841524124145508, "step": 340 }, { "epoch": 0.39449210583231203, "grad_norm": 81.17863346925296, "learning_rate": 1.8934405726979945e-07, "logits/chosen": -1.4070253372192383, "logits/rejected": -1.4879088401794434, "logps/chosen": -166.3784942626953, "logps/rejected": -204.57489013671875, "loss": 0.6395, "rewards/accuracies": 0.65625, "rewards/chosen": -0.31329959630966187, "rewards/margins": 0.13568538427352905, "rewards/rejected": -0.4489849805831909, "step": 342 }, { "epoch": 0.39679907721144836, "grad_norm": 72.25844304700202, "learning_rate": 1.8917046110963314e-07, "logits/chosen": -1.6808464527130127, "logits/rejected": -1.6618741750717163, "logps/chosen": -184.7408905029297, "logps/rejected": -213.8212127685547, "loss": 0.6414, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1948232203722, "rewards/margins": 0.18943095207214355, "rewards/rejected": -0.3842541575431824, "step": 344 }, { "epoch": 0.3991060485905847, "grad_norm": 69.12287284056892, "learning_rate": 1.8899554322812116e-07, "logits/chosen": -1.677032470703125, "logits/rejected": -1.6319351196289062, "logps/chosen": -114.67143249511719, "logps/rejected": -125.2265625, "loss": 0.6256, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18165619671344757, "rewards/margins": 0.17791113257408142, "rewards/rejected": -0.3595673143863678, "step": 346 }, { "epoch": 0.401413019969721, "grad_norm": 68.82861341006546, "learning_rate": 1.8881930621796846e-07, "logits/chosen": -1.531043291091919, "logits/rejected": -1.4552069902420044, "logps/chosen": -172.90670776367188, "logps/rejected": -228.29833984375, "loss": 0.6321, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21518906950950623, "rewards/margins": 0.16281384229660034, "rewards/rejected": -0.37800291180610657, "step": 348 }, { "epoch": 0.40371999134885733, "grad_norm": 79.01675049183694, "learning_rate": 1.8864175269143273e-07, "logits/chosen": -1.628811001777649, "logits/rejected": -1.5073944330215454, "logps/chosen": -162.4159393310547, "logps/rejected": -173.65521240234375, "loss": 0.6361, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17217856645584106, "rewards/margins": 0.20255069434642792, "rewards/rejected": -0.3747292459011078, "step": 350 }, { "epoch": 0.40602696272799366, "grad_norm": 80.14358020089544, "learning_rate": 1.8846288528028552e-07, "logits/chosen": -1.2868863344192505, "logits/rejected": -1.4563894271850586, "logps/chosen": -176.4993438720703, "logps/rejected": -219.99745178222656, "loss": 0.6388, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34355729818344116, "rewards/margins": 0.19085751473903656, "rewards/rejected": -0.5344148278236389, "step": 352 }, { "epoch": 0.40833393410713, "grad_norm": 72.34750725400806, "learning_rate": 1.8828270663577336e-07, "logits/chosen": -1.5702780485153198, "logits/rejected": -1.6198755502700806, "logps/chosen": -135.76097106933594, "logps/rejected": -133.5688018798828, "loss": 0.6593, "rewards/accuracies": 0.625, "rewards/chosen": -0.28700345754623413, "rewards/margins": 0.014538988471031189, "rewards/rejected": -0.3015424311161041, "step": 354 }, { "epoch": 0.4106409054862663, "grad_norm": 71.70524840332104, "learning_rate": 1.8810121942857845e-07, "logits/chosen": -1.5310659408569336, "logits/rejected": -1.547040343284607, "logps/chosen": -137.63137817382812, "logps/rejected": -175.15028381347656, "loss": 0.6293, "rewards/accuracies": 0.75, "rewards/chosen": -0.1476406753063202, "rewards/margins": 0.20084424316883087, "rewards/rejected": -0.34848493337631226, "step": 356 }, { "epoch": 0.41294787686540263, "grad_norm": 77.60677795627835, "learning_rate": 1.8791842634877896e-07, "logits/chosen": -1.546626091003418, "logits/rejected": -1.6076010465621948, "logps/chosen": -136.61058044433594, "logps/rejected": -187.11056518554688, "loss": 0.6506, "rewards/accuracies": 0.625, "rewards/chosen": -0.2092825025320053, "rewards/margins": 0.11802927404642105, "rewards/rejected": -0.32731181383132935, "step": 358 }, { "epoch": 0.41525484824453895, "grad_norm": 76.22986147865214, "learning_rate": 1.8773433010580933e-07, "logits/chosen": -1.5016052722930908, "logits/rejected": -1.6018908023834229, "logps/chosen": -129.33348083496094, "logps/rejected": -151.12342834472656, "loss": 0.627, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1853492707014084, "rewards/margins": 0.10909079760313034, "rewards/rejected": -0.2944400906562805, "step": 360 }, { "epoch": 0.4175618196236753, "grad_norm": 71.86807271397895, "learning_rate": 1.8754893342842e-07, "logits/chosen": -1.5751183032989502, "logits/rejected": -1.4908232688903809, "logps/chosen": -187.5486602783203, "logps/rejected": -194.04296875, "loss": 0.6223, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27427998185157776, "rewards/margins": 0.1835474967956543, "rewards/rejected": -0.45782750844955444, "step": 362 }, { "epoch": 0.4198687910028116, "grad_norm": 70.36519300815779, "learning_rate": 1.8736223906463695e-07, "logits/chosen": -1.6419646739959717, "logits/rejected": -1.6212923526763916, "logps/chosen": -165.32421875, "logps/rejected": -171.27830505371094, "loss": 0.6154, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21126417815685272, "rewards/margins": 0.188466876745224, "rewards/rejected": -0.3997310400009155, "step": 364 }, { "epoch": 0.4221757623819479, "grad_norm": 70.09468918933095, "learning_rate": 1.8717424978172102e-07, "logits/chosen": -1.3921918869018555, "logits/rejected": -1.469792127609253, "logps/chosen": -167.81964111328125, "logps/rejected": -210.77825927734375, "loss": 0.6308, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2520577609539032, "rewards/margins": 0.21120049059391022, "rewards/rejected": -0.463258296251297, "step": 366 }, { "epoch": 0.42448273376108425, "grad_norm": 83.57733506311956, "learning_rate": 1.8698496836612691e-07, "logits/chosen": -1.494173288345337, "logits/rejected": -1.5522290468215942, "logps/chosen": -163.31491088867188, "logps/rejected": -189.11239624023438, "loss": 0.6605, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2657621204853058, "rewards/margins": 0.16207075119018555, "rewards/rejected": -0.42783284187316895, "step": 368 }, { "epoch": 0.4267897051402206, "grad_norm": 81.29498139829452, "learning_rate": 1.8679439762346184e-07, "logits/chosen": -1.5649724006652832, "logits/rejected": -1.6319153308868408, "logps/chosen": -208.2643585205078, "logps/rejected": -215.9363555908203, "loss": 0.6724, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27036455273628235, "rewards/margins": 0.1651400327682495, "rewards/rejected": -0.43550461530685425, "step": 370 }, { "epoch": 0.42909667651935696, "grad_norm": 76.18451864107462, "learning_rate": 1.8660254037844388e-07, "logits/chosen": -1.4427084922790527, "logits/rejected": -1.5188959836959839, "logps/chosen": -171.85968017578125, "logps/rejected": -233.1151580810547, "loss": 0.629, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27071186900138855, "rewards/margins": 0.2559873163700104, "rewards/rejected": -0.5266991853713989, "step": 372 }, { "epoch": 0.4314036478984933, "grad_norm": 82.63010621157098, "learning_rate": 1.8640939947486023e-07, "logits/chosen": -1.5887802839279175, "logits/rejected": -1.355837106704712, "logps/chosen": -242.5066375732422, "logps/rejected": -230.2034912109375, "loss": 0.6329, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3870730698108673, "rewards/margins": 0.15506887435913086, "rewards/rejected": -0.5421419143676758, "step": 374 }, { "epoch": 0.4337106192776296, "grad_norm": 59.14499379914714, "learning_rate": 1.8621497777552505e-07, "logits/chosen": -1.420657992362976, "logits/rejected": -1.4776450395584106, "logps/chosen": -127.46673583984375, "logps/rejected": -184.2600860595703, "loss": 0.5869, "rewards/accuracies": 0.875, "rewards/chosen": -0.15772147476673126, "rewards/margins": 0.3883221745491028, "rewards/rejected": -0.5460436344146729, "step": 376 }, { "epoch": 0.43601759065676593, "grad_norm": 76.51933767322383, "learning_rate": 1.8601927816223695e-07, "logits/chosen": -1.3575465679168701, "logits/rejected": -1.3156774044036865, "logps/chosen": -218.0836944580078, "logps/rejected": -228.03778076171875, "loss": 0.6557, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4051874279975891, "rewards/margins": 0.143568217754364, "rewards/rejected": -0.5487555861473083, "step": 378 }, { "epoch": 0.43832456203590225, "grad_norm": 61.424133205634206, "learning_rate": 1.8582230353573624e-07, "logits/chosen": -1.4618622064590454, "logits/rejected": -1.4945478439331055, "logps/chosen": -95.66145324707031, "logps/rejected": -135.7235870361328, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": -0.1784054934978485, "rewards/margins": 0.23733605444431305, "rewards/rejected": -0.415741503238678, "step": 380 }, { "epoch": 0.4406315334150386, "grad_norm": 64.92661329207279, "learning_rate": 1.8562405681566214e-07, "logits/chosen": -1.5636019706726074, "logits/rejected": -1.5756021738052368, "logps/chosen": -201.42442321777344, "logps/rejected": -188.35606384277344, "loss": 0.6289, "rewards/accuracies": 0.625, "rewards/chosen": -0.3109050691127777, "rewards/margins": 0.10487519204616547, "rewards/rejected": -0.415780246257782, "step": 382 }, { "epoch": 0.4429385047941749, "grad_norm": 83.39366061705226, "learning_rate": 1.854245409405092e-07, "logits/chosen": -1.6649830341339111, "logits/rejected": -1.5097665786743164, "logps/chosen": -217.35536193847656, "logps/rejected": -223.5187225341797, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2543387711048126, "rewards/margins": 0.2463696151971817, "rewards/rejected": -0.5007083415985107, "step": 384 }, { "epoch": 0.4452454761733112, "grad_norm": 74.49558456416251, "learning_rate": 1.852237588675841e-07, "logits/chosen": -1.582183599472046, "logits/rejected": -1.7068113088607788, "logps/chosen": -162.75521850585938, "logps/rejected": -220.6885986328125, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": -0.21387754380702972, "rewards/margins": 0.31847310066223145, "rewards/rejected": -0.5323505997657776, "step": 386 }, { "epoch": 0.44755244755244755, "grad_norm": 72.0795411450381, "learning_rate": 1.850217135729614e-07, "logits/chosen": -1.605985164642334, "logits/rejected": -1.5858122110366821, "logps/chosen": -196.78073120117188, "logps/rejected": -213.26580810546875, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": -0.44325143098831177, "rewards/margins": 0.07666480541229248, "rewards/rejected": -0.5199161767959595, "step": 388 }, { "epoch": 0.4498594189315839, "grad_norm": 72.48651390442274, "learning_rate": 1.8481840805143987e-07, "logits/chosen": -1.5632058382034302, "logits/rejected": -1.5244344472885132, "logps/chosen": -127.80747985839844, "logps/rejected": -152.81256103515625, "loss": 0.6163, "rewards/accuracies": 0.875, "rewards/chosen": -0.1298586130142212, "rewards/margins": 0.42240971326828003, "rewards/rejected": -0.5522683262825012, "step": 390 }, { "epoch": 0.4521663903107202, "grad_norm": 74.34299341635638, "learning_rate": 1.8461384531649773e-07, "logits/chosen": -1.4820444583892822, "logits/rejected": -1.605046033859253, "logps/chosen": -105.68638610839844, "logps/rejected": -156.26785278320312, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": -0.1893598437309265, "rewards/margins": 0.2589360773563385, "rewards/rejected": -0.4482958912849426, "step": 392 }, { "epoch": 0.4544733616898565, "grad_norm": 76.36773452235572, "learning_rate": 1.844080284002482e-07, "logits/chosen": -1.5065568685531616, "logits/rejected": -1.5656404495239258, "logps/chosen": -158.7242889404297, "logps/rejected": -228.84844970703125, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.251006543636322, "rewards/margins": 0.21102304756641388, "rewards/rejected": -0.46202951669692993, "step": 394 }, { "epoch": 0.45678033306899285, "grad_norm": 71.03674812873284, "learning_rate": 1.8420096035339452e-07, "logits/chosen": -1.5289005041122437, "logits/rejected": -1.527197003364563, "logps/chosen": -200.40029907226562, "logps/rejected": -212.3697967529297, "loss": 0.6187, "rewards/accuracies": 0.625, "rewards/chosen": -0.2883009612560272, "rewards/margins": 0.30317747592926025, "rewards/rejected": -0.5914784073829651, "step": 396 }, { "epoch": 0.4590873044481292, "grad_norm": 81.19707296013529, "learning_rate": 1.8399264424518465e-07, "logits/chosen": -1.494114875793457, "logits/rejected": -1.4553757905960083, "logps/chosen": -173.10043334960938, "logps/rejected": -222.2396240234375, "loss": 0.5955, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3878926932811737, "rewards/margins": 0.3027462959289551, "rewards/rejected": -0.6906389594078064, "step": 398 }, { "epoch": 0.4613942758272655, "grad_norm": 89.13135103863338, "learning_rate": 1.8378308316336582e-07, "logits/chosen": -1.618680715560913, "logits/rejected": -1.5578938722610474, "logps/chosen": -191.10128784179688, "logps/rejected": -280.5110778808594, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": -0.4683380126953125, "rewards/margins": 0.19769813120365143, "rewards/rejected": -0.6660361289978027, "step": 400 }, { "epoch": 0.4613942758272655, "eval_logits/chosen": -1.4853571653366089, "eval_logits/rejected": -1.3932629823684692, "eval_logps/chosen": -189.0384521484375, "eval_logps/rejected": -156.24160766601562, "eval_loss": 0.654194176197052, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.38562828302383423, "eval_rewards/margins": 0.18699264526367188, "eval_rewards/rejected": -0.5726209282875061, "eval_runtime": 26.5299, "eval_samples_per_second": 3.769, "eval_steps_per_second": 0.942, "step": 400 }, { "epoch": 0.4637012472064018, "grad_norm": 69.21606890792003, "learning_rate": 1.8357228021413883e-07, "logits/chosen": -1.5431230068206787, "logits/rejected": -1.7365866899490356, "logps/chosen": -147.3966827392578, "logps/rejected": -170.9712371826172, "loss": 0.6581, "rewards/accuracies": 0.59375, "rewards/chosen": -0.30663323402404785, "rewards/margins": 0.11269617080688477, "rewards/rejected": -0.4193294048309326, "step": 402 }, { "epoch": 0.46600821858553815, "grad_norm": 78.7990153253576, "learning_rate": 1.8336023852211194e-07, "logits/chosen": -1.5721492767333984, "logits/rejected": -1.4822769165039062, "logps/chosen": -148.9419403076172, "logps/rejected": -158.44668579101562, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.27455994486808777, "rewards/margins": 0.3990754187107086, "rewards/rejected": -0.6736353039741516, "step": 404 }, { "epoch": 0.4683151899646745, "grad_norm": 67.81492283153628, "learning_rate": 1.8314696123025453e-07, "logits/chosen": -1.6370363235473633, "logits/rejected": -1.5174671411514282, "logps/chosen": -145.17050170898438, "logps/rejected": -142.74551391601562, "loss": 0.6312, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28109437227249146, "rewards/margins": 0.2069387137889862, "rewards/rejected": -0.48803308606147766, "step": 406 }, { "epoch": 0.47062216134381085, "grad_norm": 78.2843593072173, "learning_rate": 1.8293245149985053e-07, "logits/chosen": -1.5488444566726685, "logits/rejected": -1.4798938035964966, "logps/chosen": -161.83570861816406, "logps/rejected": -162.7615509033203, "loss": 0.6484, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2718978822231293, "rewards/margins": 0.15639187395572662, "rewards/rejected": -0.4282897710800171, "step": 408 }, { "epoch": 0.4729291327229472, "grad_norm": 73.10449012391845, "learning_rate": 1.827167125104517e-07, "logits/chosen": -1.4978845119476318, "logits/rejected": -1.4839560985565186, "logps/chosen": -148.445556640625, "logps/rejected": -161.85986328125, "loss": 0.6481, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27761712670326233, "rewards/margins": 0.09577606618404388, "rewards/rejected": -0.3733932077884674, "step": 410 }, { "epoch": 0.4752361041020835, "grad_norm": 77.23312704566136, "learning_rate": 1.8249974745983021e-07, "logits/chosen": -1.4896149635314941, "logits/rejected": -1.4279950857162476, "logps/chosen": -136.3888397216797, "logps/rejected": -184.14625549316406, "loss": 0.6186, "rewards/accuracies": 0.75, "rewards/chosen": -0.3546374440193176, "rewards/margins": 0.3140718638896942, "rewards/rejected": -0.6687093377113342, "step": 412 }, { "epoch": 0.4775430754812198, "grad_norm": 65.58481770102698, "learning_rate": 1.822815595639316e-07, "logits/chosen": -1.4790016412734985, "logits/rejected": -1.525940179824829, "logps/chosen": -162.99288940429688, "logps/rejected": -190.2974853515625, "loss": 0.6112, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36069726943969727, "rewards/margins": 0.20576652884483337, "rewards/rejected": -0.5664637684822083, "step": 414 }, { "epoch": 0.47985004686035615, "grad_norm": 68.7972400850831, "learning_rate": 1.820621520568268e-07, "logits/chosen": -1.5574984550476074, "logits/rejected": -1.4820420742034912, "logps/chosen": -178.15878295898438, "logps/rejected": -191.66177368164062, "loss": 0.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.314214825630188, "rewards/margins": 0.32970941066741943, "rewards/rejected": -0.6439242362976074, "step": 416 }, { "epoch": 0.4821570182394925, "grad_norm": 77.22458475405976, "learning_rate": 1.8184152819066434e-07, "logits/chosen": -1.5454033613204956, "logits/rejected": -1.5681257247924805, "logps/chosen": -206.4539031982422, "logps/rejected": -221.17599487304688, "loss": 0.6395, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4876091778278351, "rewards/margins": 0.06031504273414612, "rewards/rejected": -0.5479242205619812, "step": 418 }, { "epoch": 0.4844639896186288, "grad_norm": 69.59230881656185, "learning_rate": 1.8161969123562217e-07, "logits/chosen": -1.54752516746521, "logits/rejected": -1.5821384191513062, "logps/chosen": -182.0235137939453, "logps/rejected": -163.29364013671875, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.3480142056941986, "rewards/margins": 0.3120378255844116, "rewards/rejected": -0.6600520610809326, "step": 420 }, { "epoch": 0.4867709609977651, "grad_norm": 68.29468448816121, "learning_rate": 1.813966444798591e-07, "logits/chosen": -1.513810634613037, "logits/rejected": -1.4666978120803833, "logps/chosen": -204.99462890625, "logps/rejected": -204.5595245361328, "loss": 0.6143, "rewards/accuracies": 0.75, "rewards/chosen": -0.3375055491924286, "rewards/margins": 0.3794183135032654, "rewards/rejected": -0.7169238328933716, "step": 422 }, { "epoch": 0.48907793237690145, "grad_norm": 73.69015362328696, "learning_rate": 1.8117239122946611e-07, "logits/chosen": -1.3477180004119873, "logits/rejected": -1.4509586095809937, "logps/chosen": -118.67777252197266, "logps/rejected": -176.48667907714844, "loss": 0.6192, "rewards/accuracies": 0.625, "rewards/chosen": -0.3034321069717407, "rewards/margins": 0.12479298561811447, "rewards/rejected": -0.4282251298427582, "step": 424 }, { "epoch": 0.49138490375603777, "grad_norm": 78.31541581493791, "learning_rate": 1.809469348084174e-07, "logits/chosen": -1.459653377532959, "logits/rejected": -1.5776402950286865, "logps/chosen": -159.45347595214844, "logps/rejected": -189.2720489501953, "loss": 0.6554, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37468722462654114, "rewards/margins": 0.1383470892906189, "rewards/rejected": -0.5130342841148376, "step": 426 }, { "epoch": 0.4936918751351741, "grad_norm": 130.5379676824635, "learning_rate": 1.8072027855852095e-07, "logits/chosen": -1.4528967142105103, "logits/rejected": -1.423844814300537, "logps/chosen": -172.85316467285156, "logps/rejected": -215.22189331054688, "loss": 0.6639, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41784724593162537, "rewards/margins": 0.3192124366760254, "rewards/rejected": -0.7370596528053284, "step": 428 }, { "epoch": 0.4959988465143104, "grad_norm": 63.21984381769687, "learning_rate": 1.8049242583936918e-07, "logits/chosen": -1.5084190368652344, "logits/rejected": -1.4574109315872192, "logps/chosen": -165.896484375, "logps/rejected": -227.423828125, "loss": 0.5893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25652381777763367, "rewards/margins": 0.47441697120666504, "rewards/rejected": -0.7309407591819763, "step": 430 }, { "epoch": 0.49830581789344675, "grad_norm": 71.69590925642426, "learning_rate": 1.802633800282891e-07, "logits/chosen": -1.516315221786499, "logits/rejected": -1.6526371240615845, "logps/chosen": -229.77777099609375, "logps/rejected": -292.7660827636719, "loss": 0.5979, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3780279755592346, "rewards/margins": 0.49888893961906433, "rewards/rejected": -0.8769169449806213, "step": 432 }, { "epoch": 0.5006127892725831, "grad_norm": 72.54608833334152, "learning_rate": 1.8003314452029213e-07, "logits/chosen": -1.5792149305343628, "logits/rejected": -1.550574779510498, "logps/chosen": -226.616455078125, "logps/rejected": -228.4210205078125, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5066580176353455, "rewards/margins": 0.34013134241104126, "rewards/rejected": -0.8467893600463867, "step": 434 }, { "epoch": 0.5029197606517194, "grad_norm": 73.04169645370872, "learning_rate": 1.7980172272802395e-07, "logits/chosen": -1.5109785795211792, "logits/rejected": -1.499125361442566, "logps/chosen": -154.92233276367188, "logps/rejected": -175.07643127441406, "loss": 0.5817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25602594017982483, "rewards/margins": 0.5013114809989929, "rewards/rejected": -0.7573373913764954, "step": 436 }, { "epoch": 0.5052267320308558, "grad_norm": 69.05059334922119, "learning_rate": 1.7956911808171373e-07, "logits/chosen": -1.561600923538208, "logits/rejected": -1.5301151275634766, "logps/chosen": -217.26930236816406, "logps/rejected": -240.7093048095703, "loss": 0.6151, "rewards/accuracies": 0.59375, "rewards/chosen": -0.46973368525505066, "rewards/margins": 0.2093038558959961, "rewards/rejected": -0.6790375113487244, "step": 438 }, { "epoch": 0.507533703409992, "grad_norm": 74.68873536524164, "learning_rate": 1.793353340291235e-07, "logits/chosen": -1.3198765516281128, "logits/rejected": -1.4805912971496582, "logps/chosen": -175.9479217529297, "logps/rejected": -226.83265686035156, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5549490451812744, "rewards/margins": 0.23202911019325256, "rewards/rejected": -0.7869781851768494, "step": 440 }, { "epoch": 0.5098406747891284, "grad_norm": 73.37532376774183, "learning_rate": 1.7910037403549692e-07, "logits/chosen": -1.4717934131622314, "logits/rejected": -1.5461549758911133, "logps/chosen": -159.91883850097656, "logps/rejected": -204.87376403808594, "loss": 0.6459, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4326345920562744, "rewards/margins": 0.22945694625377655, "rewards/rejected": -0.6620914936065674, "step": 442 }, { "epoch": 0.5121476461682647, "grad_norm": 69.28741446430803, "learning_rate": 1.7886424158350782e-07, "logits/chosen": -1.5604138374328613, "logits/rejected": -1.663907766342163, "logps/chosen": -158.54408264160156, "logps/rejected": -192.7698516845703, "loss": 0.5921, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3090921640396118, "rewards/margins": 0.3891502916812897, "rewards/rejected": -0.6982424855232239, "step": 444 }, { "epoch": 0.5144546175474011, "grad_norm": 77.66154968693108, "learning_rate": 1.7862694017320886e-07, "logits/chosen": -1.3435657024383545, "logits/rejected": -1.3843066692352295, "logps/chosen": -174.62672424316406, "logps/rejected": -288.0128173828125, "loss": 0.6145, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4857187271118164, "rewards/margins": 0.4818662703037262, "rewards/rejected": -0.9675850868225098, "step": 446 }, { "epoch": 0.5167615889265373, "grad_norm": 86.0701716220196, "learning_rate": 1.7838847332197937e-07, "logits/chosen": -1.4369436502456665, "logits/rejected": -1.5111709833145142, "logps/chosen": -193.0187225341797, "logps/rejected": -258.660400390625, "loss": 0.6179, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4460400640964508, "rewards/margins": 0.4027029871940613, "rewards/rejected": -0.8487430810928345, "step": 448 }, { "epoch": 0.5190685603056737, "grad_norm": 84.40844346826594, "learning_rate": 1.7814884456447335e-07, "logits/chosen": -1.5306761264801025, "logits/rejected": -1.4944154024124146, "logps/chosen": -195.49612426757812, "logps/rejected": -222.01425170898438, "loss": 0.6006, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2904947102069855, "rewards/margins": 0.5166550874710083, "rewards/rejected": -0.8071498870849609, "step": 450 }, { "epoch": 0.52137553168481, "grad_norm": 86.3712126774886, "learning_rate": 1.7790805745256703e-07, "logits/chosen": -1.3275847434997559, "logits/rejected": -1.38175630569458, "logps/chosen": -136.90707397460938, "logps/rejected": -184.36331176757812, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.44699156284332275, "rewards/margins": 0.12617343664169312, "rewards/rejected": -0.5731649398803711, "step": 452 }, { "epoch": 0.5236825030639464, "grad_norm": 66.61833278109548, "learning_rate": 1.7766611555530635e-07, "logits/chosen": -1.6141921281814575, "logits/rejected": -1.5151243209838867, "logps/chosen": -156.77407836914062, "logps/rejected": -154.7230682373047, "loss": 0.5733, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3759933114051819, "rewards/margins": 0.17464786767959595, "rewards/rejected": -0.5506411790847778, "step": 454 }, { "epoch": 0.5259894744430826, "grad_norm": 69.26758309677136, "learning_rate": 1.774230224588538e-07, "logits/chosen": -1.3204282522201538, "logits/rejected": -1.4286822080612183, "logps/chosen": -152.52542114257812, "logps/rejected": -232.16189575195312, "loss": 0.5494, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4260653853416443, "rewards/margins": 0.5102941989898682, "rewards/rejected": -0.9363595247268677, "step": 456 }, { "epoch": 0.528296445822219, "grad_norm": 81.07739462727531, "learning_rate": 1.771787817664356e-07, "logits/chosen": -1.508811116218567, "logits/rejected": -1.5395921468734741, "logps/chosen": -134.4735565185547, "logps/rejected": -166.41592407226562, "loss": 0.6351, "rewards/accuracies": 0.5, "rewards/chosen": -0.49481019377708435, "rewards/margins": 0.1262877732515335, "rewards/rejected": -0.6210979223251343, "step": 458 }, { "epoch": 0.5306034172013554, "grad_norm": 86.01343093557993, "learning_rate": 1.769333970982879e-07, "logits/chosen": -1.518664836883545, "logits/rejected": -1.3482635021209717, "logps/chosen": -173.78538513183594, "logps/rejected": -160.53573608398438, "loss": 0.5857, "rewards/accuracies": 0.625, "rewards/chosen": -0.49463319778442383, "rewards/margins": 0.202806293964386, "rewards/rejected": -0.6974395513534546, "step": 460 }, { "epoch": 0.5329103885804917, "grad_norm": 85.16027410016599, "learning_rate": 1.766868720916035e-07, "logits/chosen": -1.359481930732727, "logits/rejected": -1.3029265403747559, "logps/chosen": -134.05616760253906, "logps/rejected": -134.0654754638672, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": -0.4239296019077301, "rewards/margins": 0.03123108297586441, "rewards/rejected": -0.4551607072353363, "step": 462 }, { "epoch": 0.535217359959628, "grad_norm": 84.5629811685175, "learning_rate": 1.7643921040047766e-07, "logits/chosen": -1.6018937826156616, "logits/rejected": -1.6816954612731934, "logps/chosen": -237.3992919921875, "logps/rejected": -253.08688354492188, "loss": 0.597, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6288573741912842, "rewards/margins": 0.15610165894031525, "rewards/rejected": -0.7849590182304382, "step": 464 }, { "epoch": 0.5375243313387643, "grad_norm": 80.72916842158041, "learning_rate": 1.7619041569585418e-07, "logits/chosen": -1.4444328546524048, "logits/rejected": -1.4673030376434326, "logps/chosen": -170.2801971435547, "logps/rejected": -214.7718963623047, "loss": 0.6181, "rewards/accuracies": 0.75, "rewards/chosen": -0.564181923866272, "rewards/margins": 0.2302751988172531, "rewards/rejected": -0.7944571375846863, "step": 466 }, { "epoch": 0.5398313027179007, "grad_norm": 76.00828750498393, "learning_rate": 1.759404916654707e-07, "logits/chosen": -1.4668854475021362, "logits/rejected": -1.421462059020996, "logps/chosen": -360.7674560546875, "logps/rejected": -301.1515197753906, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.6432144641876221, "rewards/margins": 0.3255874514579773, "rewards/rejected": -0.9688019156455994, "step": 468 }, { "epoch": 0.542138274097037, "grad_norm": 75.00038820917719, "learning_rate": 1.756894420138043e-07, "logits/chosen": -1.5766559839248657, "logits/rejected": -1.656800627708435, "logps/chosen": -216.8627471923828, "logps/rejected": -270.90850830078125, "loss": 0.615, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4949862062931061, "rewards/margins": 0.4039486050605774, "rewards/rejected": -0.8989347815513611, "step": 470 }, { "epoch": 0.5444452454761733, "grad_norm": 86.17675092820859, "learning_rate": 1.754372704620164e-07, "logits/chosen": -1.4618090391159058, "logits/rejected": -1.5533053874969482, "logps/chosen": -202.59561157226562, "logps/rejected": -221.70413208007812, "loss": 0.6478, "rewards/accuracies": 0.75, "rewards/chosen": -0.44822004437446594, "rewards/margins": 0.28794264793395996, "rewards/rejected": -0.7361626625061035, "step": 472 }, { "epoch": 0.5467522168553096, "grad_norm": 72.36150215283246, "learning_rate": 1.7518398074789774e-07, "logits/chosen": -1.4804517030715942, "logits/rejected": -1.5212501287460327, "logps/chosen": -195.58935546875, "logps/rejected": -247.99276733398438, "loss": 0.553, "rewards/accuracies": 0.8125, "rewards/chosen": -0.44707149267196655, "rewards/margins": 0.6286894679069519, "rewards/rejected": -1.0757609605789185, "step": 474 }, { "epoch": 0.549059188234446, "grad_norm": 73.94947964279808, "learning_rate": 1.7492957662581294e-07, "logits/chosen": -1.3577089309692383, "logits/rejected": -1.4486963748931885, "logps/chosen": -133.3319091796875, "logps/rejected": -188.2812957763672, "loss": 0.6001, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34889039397239685, "rewards/margins": 0.3021068871021271, "rewards/rejected": -0.6509972214698792, "step": 476 }, { "epoch": 0.5513661596135823, "grad_norm": 74.0047644626624, "learning_rate": 1.7467406186664473e-07, "logits/chosen": -1.5747010707855225, "logits/rejected": -1.5058567523956299, "logps/chosen": -216.6630401611328, "logps/rejected": -223.66598510742188, "loss": 0.6345, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5371094346046448, "rewards/margins": 0.3996596932411194, "rewards/rejected": -0.9367691874504089, "step": 478 }, { "epoch": 0.5536731309927186, "grad_norm": 50.915761396824145, "learning_rate": 1.7441744025773834e-07, "logits/chosen": -1.4014126062393188, "logits/rejected": -1.569306492805481, "logps/chosen": -156.43629455566406, "logps/rejected": -228.84625244140625, "loss": 0.5975, "rewards/accuracies": 0.5625, "rewards/chosen": -0.399608850479126, "rewards/margins": 0.29513585567474365, "rewards/rejected": -0.6947447061538696, "step": 480 }, { "epoch": 0.5559801023718549, "grad_norm": 80.40246802194461, "learning_rate": 1.74159715602845e-07, "logits/chosen": -1.49760103225708, "logits/rejected": -1.4302232265472412, "logps/chosen": -152.4906005859375, "logps/rejected": -165.43942260742188, "loss": 0.6511, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4252295196056366, "rewards/margins": 0.12136977910995483, "rewards/rejected": -0.5465993285179138, "step": 482 }, { "epoch": 0.5582870737509913, "grad_norm": 70.56990492477674, "learning_rate": 1.739008917220659e-07, "logits/chosen": -1.4919289350509644, "logits/rejected": -1.5267033576965332, "logps/chosen": -187.85191345214844, "logps/rejected": -220.8524169921875, "loss": 0.5689, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5196070671081543, "rewards/margins": 0.3590528666973114, "rewards/rejected": -0.8786599636077881, "step": 484 }, { "epoch": 0.5605940451301276, "grad_norm": 78.98020718967784, "learning_rate": 1.7364097245179527e-07, "logits/chosen": -1.599880337715149, "logits/rejected": -1.5224246978759766, "logps/chosen": -196.72555541992188, "logps/rejected": -213.14309692382812, "loss": 0.5892, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5108906030654907, "rewards/margins": 0.1902090609073639, "rewards/rejected": -0.701099693775177, "step": 486 }, { "epoch": 0.5629010165092639, "grad_norm": 75.35371757401214, "learning_rate": 1.733799616446637e-07, "logits/chosen": -1.4978597164154053, "logits/rejected": -1.5102261304855347, "logps/chosen": -186.15167236328125, "logps/rejected": -226.00375366210938, "loss": 0.6112, "rewards/accuracies": 0.75, "rewards/chosen": -0.43081170320510864, "rewards/margins": 0.36774906516075134, "rewards/rejected": -0.7985607385635376, "step": 488 }, { "epoch": 0.5652079878884002, "grad_norm": 75.43303696622675, "learning_rate": 1.7311786316948108e-07, "logits/chosen": -1.418121337890625, "logits/rejected": -1.4920923709869385, "logps/chosen": -179.17889404296875, "logps/rejected": -229.40098571777344, "loss": 0.5938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6558996438980103, "rewards/margins": 0.27045130729675293, "rewards/rejected": -0.9263509511947632, "step": 490 }, { "epoch": 0.5675149592675366, "grad_norm": 71.0686050492484, "learning_rate": 1.7285468091117904e-07, "logits/chosen": -1.4989047050476074, "logits/rejected": -1.4156945943832397, "logps/chosen": -153.10214233398438, "logps/rejected": -172.13262939453125, "loss": 0.5901, "rewards/accuracies": 0.625, "rewards/chosen": -0.4824844300746918, "rewards/margins": 0.44079095125198364, "rewards/rejected": -0.9232754707336426, "step": 492 }, { "epoch": 0.569821930646673, "grad_norm": 67.99918941849218, "learning_rate": 1.7259041877075352e-07, "logits/chosen": -1.430630087852478, "logits/rejected": -1.3989218473434448, "logps/chosen": -209.73452758789062, "logps/rejected": -254.0313720703125, "loss": 0.5729, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5363369584083557, "rewards/margins": 0.5890082120895386, "rewards/rejected": -1.125345230102539, "step": 494 }, { "epoch": 0.5721289020258092, "grad_norm": 78.40754956054191, "learning_rate": 1.7232508066520698e-07, "logits/chosen": -1.5510261058807373, "logits/rejected": -1.5487847328186035, "logps/chosen": -211.16983032226562, "logps/rejected": -240.33824157714844, "loss": 0.5772, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4531714916229248, "rewards/margins": 0.2688879370689392, "rewards/rejected": -0.7220594882965088, "step": 496 }, { "epoch": 0.5744358734049456, "grad_norm": 61.990430466819326, "learning_rate": 1.7205867052749023e-07, "logits/chosen": -1.363396167755127, "logits/rejected": -1.3964465856552124, "logps/chosen": -147.12242126464844, "logps/rejected": -180.23667907714844, "loss": 0.6459, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5750865340232849, "rewards/margins": 0.11157172918319702, "rewards/rejected": -0.6866582632064819, "step": 498 }, { "epoch": 0.5767428447840819, "grad_norm": 76.0573953537264, "learning_rate": 1.717911923064442e-07, "logits/chosen": -1.5747530460357666, "logits/rejected": -1.4509817361831665, "logps/chosen": -181.61216735839844, "logps/rejected": -153.97573852539062, "loss": 0.6012, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5273740887641907, "rewards/margins": 0.1454104781150818, "rewards/rejected": -0.6727845668792725, "step": 500 }, { "epoch": 0.5767428447840819, "eval_logits/chosen": -1.440444827079773, "eval_logits/rejected": -1.3533989191055298, "eval_logps/chosen": -191.4648895263672, "eval_logps/rejected": -158.6099395751953, "eval_loss": 0.636239767074585, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.628268837928772, "eval_rewards/margins": 0.18118661642074585, "eval_rewards/rejected": -0.809455394744873, "eval_runtime": 37.9799, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.658, "step": 500 }, { "epoch": 0.5790498161632183, "grad_norm": 77.78220283215643, "learning_rate": 1.7152264996674135e-07, "logits/chosen": -1.4428610801696777, "logits/rejected": -1.2872042655944824, "logps/chosen": -184.39501953125, "logps/rejected": -238.38723754882812, "loss": 0.5953, "rewards/accuracies": 0.75, "rewards/chosen": -0.6475786566734314, "rewards/margins": 0.2779845893383026, "rewards/rejected": -0.9255632758140564, "step": 502 }, { "epoch": 0.5813567875423545, "grad_norm": 93.29916680291039, "learning_rate": 1.71253047488827e-07, "logits/chosen": -1.4898688793182373, "logits/rejected": -1.5620332956314087, "logps/chosen": -178.47802734375, "logps/rejected": -205.5224609375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": -0.5680350065231323, "rewards/margins": 0.18766377866268158, "rewards/rejected": -0.7556988000869751, "step": 504 }, { "epoch": 0.5836637589214909, "grad_norm": 77.19105499219319, "learning_rate": 1.7098238886886024e-07, "logits/chosen": -1.4835506677627563, "logits/rejected": -1.5302045345306396, "logps/chosen": -203.8736114501953, "logps/rejected": -228.69265747070312, "loss": 0.5951, "rewards/accuracies": 0.53125, "rewards/chosen": -0.47867119312286377, "rewards/margins": 0.22942683100700378, "rewards/rejected": -0.7080979943275452, "step": 506 }, { "epoch": 0.5859707303006272, "grad_norm": 67.4261860354, "learning_rate": 1.7071067811865473e-07, "logits/chosen": -1.4649958610534668, "logits/rejected": -1.4145183563232422, "logps/chosen": -199.42066955566406, "logps/rejected": -235.40292358398438, "loss": 0.5368, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4195340573787689, "rewards/margins": 0.551209032535553, "rewards/rejected": -0.9707430601119995, "step": 508 }, { "epoch": 0.5882777016797636, "grad_norm": 87.85240065033273, "learning_rate": 1.7043791926561932e-07, "logits/chosen": -1.5964919328689575, "logits/rejected": -1.561856746673584, "logps/chosen": -201.67276000976562, "logps/rejected": -234.04359436035156, "loss": 0.651, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6162290573120117, "rewards/margins": 0.4439167082309723, "rewards/rejected": -1.0601458549499512, "step": 510 }, { "epoch": 0.5905846730588998, "grad_norm": 62.42968300457303, "learning_rate": 1.7016411635269815e-07, "logits/chosen": -1.4615092277526855, "logits/rejected": -1.4488492012023926, "logps/chosen": -151.2560577392578, "logps/rejected": -176.4474334716797, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.33995571732521057, "rewards/margins": 0.2483442723751068, "rewards/rejected": -0.5882999897003174, "step": 512 }, { "epoch": 0.5928916444380362, "grad_norm": 74.39629379240114, "learning_rate": 1.6988927343831091e-07, "logits/chosen": -1.5747379064559937, "logits/rejected": -1.4773468971252441, "logps/chosen": -198.891845703125, "logps/rejected": -210.0729522705078, "loss": 0.61, "rewards/accuracies": 0.78125, "rewards/chosen": -0.47531554102897644, "rewards/margins": 0.47791624069213867, "rewards/rejected": -0.9532317519187927, "step": 514 }, { "epoch": 0.5951986158171725, "grad_norm": 70.19350216590036, "learning_rate": 1.6961339459629266e-07, "logits/chosen": -1.4481630325317383, "logits/rejected": -1.4714566469192505, "logps/chosen": -190.8370361328125, "logps/rejected": -242.71621704101562, "loss": 0.5872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5172877907752991, "rewards/margins": 0.48140281438827515, "rewards/rejected": -0.998690664768219, "step": 516 }, { "epoch": 0.5975055871963089, "grad_norm": 73.75535823993799, "learning_rate": 1.6933648391583328e-07, "logits/chosen": -1.531792163848877, "logits/rejected": -1.4680547714233398, "logps/chosen": -144.9717559814453, "logps/rejected": -172.87686157226562, "loss": 0.6006, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3757992386817932, "rewards/margins": 0.35130438208580017, "rewards/rejected": -0.7271036505699158, "step": 518 }, { "epoch": 0.5998125585754451, "grad_norm": 69.85303523035323, "learning_rate": 1.6905854550141714e-07, "logits/chosen": -1.5805073976516724, "logits/rejected": -1.5384862422943115, "logps/chosen": -171.9115753173828, "logps/rejected": -169.82862854003906, "loss": 0.5875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5081273317337036, "rewards/margins": 0.2863667607307434, "rewards/rejected": -0.794494092464447, "step": 520 }, { "epoch": 0.6021195299545815, "grad_norm": 69.03602758187714, "learning_rate": 1.6877958347276197e-07, "logits/chosen": -1.4844419956207275, "logits/rejected": -1.4906061887741089, "logps/chosen": -149.6005859375, "logps/rejected": -163.59097290039062, "loss": 0.6013, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42841285467147827, "rewards/margins": 0.30834630131721497, "rewards/rejected": -0.7367592453956604, "step": 522 }, { "epoch": 0.6044265013337178, "grad_norm": 80.75337933099041, "learning_rate": 1.6849960196475805e-07, "logits/chosen": -1.5245236158370972, "logits/rejected": -1.5345442295074463, "logps/chosen": -148.5638885498047, "logps/rejected": -178.37429809570312, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -0.3656730651855469, "rewards/margins": 0.3520704507827759, "rewards/rejected": -0.7177435159683228, "step": 524 }, { "epoch": 0.6067334727128542, "grad_norm": 79.6488573037571, "learning_rate": 1.682186051274067e-07, "logits/chosen": -1.4462357759475708, "logits/rejected": -1.4616801738739014, "logps/chosen": -144.83853149414062, "logps/rejected": -191.320556640625, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": -0.6087457537651062, "rewards/margins": 0.3239368498325348, "rewards/rejected": -0.9326826930046082, "step": 526 }, { "epoch": 0.6090404440919904, "grad_norm": 82.53815106903608, "learning_rate": 1.6793659712575895e-07, "logits/chosen": -1.5642480850219727, "logits/rejected": -1.4599685668945312, "logps/chosen": -215.29837036132812, "logps/rejected": -199.14767456054688, "loss": 0.5928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5695382356643677, "rewards/margins": 0.271673321723938, "rewards/rejected": -0.8412115573883057, "step": 528 }, { "epoch": 0.6113474154711268, "grad_norm": 86.53571512694035, "learning_rate": 1.676535821398537e-07, "logits/chosen": -1.3208836317062378, "logits/rejected": -1.3146097660064697, "logps/chosen": -189.41128540039062, "logps/rejected": -232.5477294921875, "loss": 0.6013, "rewards/accuracies": 0.65625, "rewards/chosen": -0.654186487197876, "rewards/margins": 0.4602148234844208, "rewards/rejected": -1.1144013404846191, "step": 530 }, { "epoch": 0.6136543868502632, "grad_norm": 70.64851504723866, "learning_rate": 1.6736956436465573e-07, "logits/chosen": -1.3590030670166016, "logits/rejected": -1.4608113765716553, "logps/chosen": -148.809326171875, "logps/rejected": -203.59759521484375, "loss": 0.5861, "rewards/accuracies": 0.71875, "rewards/chosen": -0.496415913105011, "rewards/margins": 0.31767329573631287, "rewards/rejected": -0.814089298248291, "step": 532 }, { "epoch": 0.6159613582293995, "grad_norm": 73.57136513502368, "learning_rate": 1.6708454800999366e-07, "logits/chosen": -1.4504910707473755, "logits/rejected": -1.4983229637145996, "logps/chosen": -166.2091522216797, "logps/rejected": -206.8488311767578, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": -0.49555644392967224, "rewards/margins": 0.3523869812488556, "rewards/rejected": -0.8479433655738831, "step": 534 }, { "epoch": 0.6182683296085358, "grad_norm": 67.83021038753246, "learning_rate": 1.667985373004974e-07, "logits/chosen": -1.4747323989868164, "logits/rejected": -1.3922568559646606, "logps/chosen": -159.47254943847656, "logps/rejected": -177.21884155273438, "loss": 0.5691, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2918567657470703, "rewards/margins": 0.5216075778007507, "rewards/rejected": -0.8134642839431763, "step": 536 }, { "epoch": 0.6205753009876721, "grad_norm": 75.55693314924734, "learning_rate": 1.6651153647553567e-07, "logits/chosen": -1.6021491289138794, "logits/rejected": -1.6126930713653564, "logps/chosen": -165.55172729492188, "logps/rejected": -197.1583251953125, "loss": 0.5986, "rewards/accuracies": 0.65625, "rewards/chosen": -0.505136251449585, "rewards/margins": 0.2592867612838745, "rewards/rejected": -0.7644230127334595, "step": 538 }, { "epoch": 0.6228822723668085, "grad_norm": 74.57237448077612, "learning_rate": 1.6622354978915304e-07, "logits/chosen": -1.3560292720794678, "logits/rejected": -1.4895740747451782, "logps/chosen": -152.60386657714844, "logps/rejected": -200.48497009277344, "loss": 0.5976, "rewards/accuracies": 0.75, "rewards/chosen": -0.450514554977417, "rewards/margins": 0.42979568243026733, "rewards/rejected": -0.8803102374076843, "step": 540 }, { "epoch": 0.6251892437459448, "grad_norm": 76.07758708375029, "learning_rate": 1.6593458151000687e-07, "logits/chosen": -1.418495535850525, "logits/rejected": -1.5285032987594604, "logps/chosen": -174.468017578125, "logps/rejected": -212.58534240722656, "loss": 0.6021, "rewards/accuracies": 0.625, "rewards/chosen": -0.4992409944534302, "rewards/margins": 0.357663631439209, "rewards/rejected": -0.8569046854972839, "step": 542 }, { "epoch": 0.6274962151250811, "grad_norm": 67.61668250943133, "learning_rate": 1.6564463592130426e-07, "logits/chosen": -1.6000475883483887, "logits/rejected": -1.5714551210403442, "logps/chosen": -129.46788024902344, "logps/rejected": -137.58729553222656, "loss": 0.6027, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4155838191509247, "rewards/margins": 0.31966376304626465, "rewards/rejected": -0.7352475523948669, "step": 544 }, { "epoch": 0.6298031865042174, "grad_norm": 67.37831547087359, "learning_rate": 1.6535371732073823e-07, "logits/chosen": -1.5627467632293701, "logits/rejected": -1.4833993911743164, "logps/chosen": -115.5599594116211, "logps/rejected": -121.90804290771484, "loss": 0.5859, "rewards/accuracies": 0.625, "rewards/chosen": -0.286516010761261, "rewards/margins": 0.36314332485198975, "rewards/rejected": -0.6496593356132507, "step": 546 }, { "epoch": 0.6321101578833538, "grad_norm": 79.67037148877638, "learning_rate": 1.650618300204242e-07, "logits/chosen": -1.4731521606445312, "logits/rejected": -1.5530614852905273, "logps/chosen": -218.06552124023438, "logps/rejected": -257.6269226074219, "loss": 0.6104, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7696484923362732, "rewards/margins": 0.28321802616119385, "rewards/rejected": -1.0528665781021118, "step": 548 }, { "epoch": 0.63441712926249, "grad_norm": 67.9423797863854, "learning_rate": 1.6476897834683618e-07, "logits/chosen": -1.4056189060211182, "logits/rejected": -1.4078246355056763, "logps/chosen": -147.92111206054688, "logps/rejected": -188.60968017578125, "loss": 0.6018, "rewards/accuracies": 0.75, "rewards/chosen": -0.5256268978118896, "rewards/margins": 0.4678364396095276, "rewards/rejected": -0.9934633374214172, "step": 550 }, { "epoch": 0.6367241006416264, "grad_norm": 68.15375283996126, "learning_rate": 1.644751666407424e-07, "logits/chosen": -1.2929272651672363, "logits/rejected": -1.3170608282089233, "logps/chosen": -207.3567352294922, "logps/rejected": -262.3974609375, "loss": 0.5823, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7716534733772278, "rewards/margins": 0.6446899771690369, "rewards/rejected": -1.4163434505462646, "step": 552 }, { "epoch": 0.6390310720207627, "grad_norm": 71.41650018580867, "learning_rate": 1.6418039925714115e-07, "logits/chosen": -1.3858839273452759, "logits/rejected": -1.3953114748001099, "logps/chosen": -160.35096740722656, "logps/rejected": -186.47933959960938, "loss": 0.5559, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5581396222114563, "rewards/margins": 0.3457927703857422, "rewards/rejected": -0.9039323329925537, "step": 554 }, { "epoch": 0.6413380433998991, "grad_norm": 76.78836475295354, "learning_rate": 1.6388468056519612e-07, "logits/chosen": -1.4668548107147217, "logits/rejected": -1.4067307710647583, "logps/chosen": -212.10546875, "logps/rejected": -193.7842254638672, "loss": 0.5721, "rewards/accuracies": 0.71875, "rewards/chosen": -0.618504524230957, "rewards/margins": 0.36426225304603577, "rewards/rejected": -0.9827668070793152, "step": 556 }, { "epoch": 0.6436450147790354, "grad_norm": 66.95864858123714, "learning_rate": 1.6358801494817172e-07, "logits/chosen": -1.4181556701660156, "logits/rejected": -1.409440279006958, "logps/chosen": -139.5923309326172, "logps/rejected": -183.9441375732422, "loss": 0.5663, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42550671100616455, "rewards/margins": 0.626122236251831, "rewards/rejected": -1.0516289472579956, "step": 558 }, { "epoch": 0.6459519861581717, "grad_norm": 88.18680458715171, "learning_rate": 1.6329040680336805e-07, "logits/chosen": -1.468677282333374, "logits/rejected": -1.5043675899505615, "logps/chosen": -161.72213745117188, "logps/rejected": -206.85214233398438, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5167573690414429, "rewards/margins": 0.36671191453933716, "rewards/rejected": -0.8834693431854248, "step": 560 }, { "epoch": 0.648258957537308, "grad_norm": 71.84112642036989, "learning_rate": 1.6299186054205575e-07, "logits/chosen": -1.5098912715911865, "logits/rejected": -1.4657700061798096, "logps/chosen": -177.00067138671875, "logps/rejected": -190.06985473632812, "loss": 0.5365, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3948441743850708, "rewards/margins": 0.5432202816009521, "rewards/rejected": -0.9380643963813782, "step": 562 }, { "epoch": 0.6505659289164444, "grad_norm": 77.21845596596229, "learning_rate": 1.6269238058941067e-07, "logits/chosen": -1.5354855060577393, "logits/rejected": -1.4872441291809082, "logps/chosen": -220.86279296875, "logps/rejected": -242.259765625, "loss": 0.6141, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5020161867141724, "rewards/margins": 0.3912605345249176, "rewards/rejected": -0.8932766914367676, "step": 564 }, { "epoch": 0.6528729002955808, "grad_norm": 77.14842839642075, "learning_rate": 1.6239197138444807e-07, "logits/chosen": -1.4313609600067139, "logits/rejected": -1.4305431842803955, "logps/chosen": -99.62786865234375, "logps/rejected": -128.8907928466797, "loss": 0.5895, "rewards/accuracies": 0.75, "rewards/chosen": -0.1888483613729477, "rewards/margins": 0.4503237307071686, "rewards/rejected": -0.6391721367835999, "step": 566 }, { "epoch": 0.655179871674717, "grad_norm": 62.79374975719681, "learning_rate": 1.6209063737995714e-07, "logits/chosen": -1.4637759923934937, "logits/rejected": -1.4549309015274048, "logps/chosen": -144.82948303222656, "logps/rejected": -185.9346466064453, "loss": 0.5515, "rewards/accuracies": 0.71875, "rewards/chosen": -0.44154876470565796, "rewards/margins": 0.37137869000434875, "rewards/rejected": -0.8129273653030396, "step": 568 }, { "epoch": 0.6574868430538534, "grad_norm": 77.33084496555169, "learning_rate": 1.6178838304243472e-07, "logits/chosen": -1.491298794746399, "logits/rejected": -1.5582300424575806, "logps/chosen": -193.7870635986328, "logps/rejected": -242.5855712890625, "loss": 0.5723, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5082133412361145, "rewards/margins": 0.6296249628067017, "rewards/rejected": -1.1378382444381714, "step": 570 }, { "epoch": 0.6597938144329897, "grad_norm": 67.02472308421605, "learning_rate": 1.6148521285201927e-07, "logits/chosen": -1.4817756414413452, "logits/rejected": -1.402366042137146, "logps/chosen": -154.45765686035156, "logps/rejected": -178.16561889648438, "loss": 0.5564, "rewards/accuracies": 0.75, "rewards/chosen": -0.3961385488510132, "rewards/margins": 0.5840703248977661, "rewards/rejected": -0.9802089333534241, "step": 572 }, { "epoch": 0.6621007858121261, "grad_norm": 73.0106659319347, "learning_rate": 1.6118113130242432e-07, "logits/chosen": -1.4550271034240723, "logits/rejected": -1.4115763902664185, "logps/chosen": -221.6585235595703, "logps/rejected": -195.1796417236328, "loss": 0.5774, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8219617009162903, "rewards/margins": 0.16280440986156464, "rewards/rejected": -0.9847662448883057, "step": 574 }, { "epoch": 0.6644077571912623, "grad_norm": 77.31259598468839, "learning_rate": 1.6087614290087206e-07, "logits/chosen": -1.4929287433624268, "logits/rejected": -1.4764537811279297, "logps/chosen": -230.29653930664062, "logps/rejected": -284.22412109375, "loss": 0.5818, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6301875114440918, "rewards/margins": 0.7476638555526733, "rewards/rejected": -1.3778512477874756, "step": 576 }, { "epoch": 0.6667147285703987, "grad_norm": 69.04855850678052, "learning_rate": 1.605702521680263e-07, "logits/chosen": -1.3067015409469604, "logits/rejected": -1.338529348373413, "logps/chosen": -147.36080932617188, "logps/rejected": -193.80665588378906, "loss": 0.5757, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6234080791473389, "rewards/margins": 0.39194294810295105, "rewards/rejected": -1.0153510570526123, "step": 578 }, { "epoch": 0.669021699949535, "grad_norm": 81.45402825293101, "learning_rate": 1.6026346363792565e-07, "logits/chosen": -1.4524238109588623, "logits/rejected": -1.3550243377685547, "logps/chosen": -187.0885772705078, "logps/rejected": -177.09780883789062, "loss": 0.6058, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7711231708526611, "rewards/margins": 0.17797166109085083, "rewards/rejected": -0.9490947127342224, "step": 580 }, { "epoch": 0.6713286713286714, "grad_norm": 65.47602685653504, "learning_rate": 1.5995578185791616e-07, "logits/chosen": -1.387951374053955, "logits/rejected": -1.3309695720672607, "logps/chosen": -158.39202880859375, "logps/rejected": -186.85105895996094, "loss": 0.5825, "rewards/accuracies": 0.75, "rewards/chosen": -0.48583418130874634, "rewards/margins": 0.503716230392456, "rewards/rejected": -0.9895503520965576, "step": 582 }, { "epoch": 0.6736356427078076, "grad_norm": 76.89288613284735, "learning_rate": 1.596472113885841e-07, "logits/chosen": -1.4493763446807861, "logits/rejected": -1.4876127243041992, "logps/chosen": -180.78541564941406, "logps/rejected": -220.08172607421875, "loss": 0.5822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5832819938659668, "rewards/margins": 0.494464248418808, "rewards/rejected": -1.0777461528778076, "step": 584 }, { "epoch": 0.675942614086944, "grad_norm": 82.2690699212878, "learning_rate": 1.5933775680368822e-07, "logits/chosen": -1.4559937715530396, "logits/rejected": -1.5102128982543945, "logps/chosen": -169.15960693359375, "logps/rejected": -176.64280700683594, "loss": 0.6272, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5040290355682373, "rewards/margins": 0.27444028854370117, "rewards/rejected": -0.7784693241119385, "step": 586 }, { "epoch": 0.6782495854660803, "grad_norm": 76.21062906880101, "learning_rate": 1.5902742269009194e-07, "logits/chosen": -1.348806381225586, "logits/rejected": -1.293540358543396, "logps/chosen": -135.5105438232422, "logps/rejected": -156.5147705078125, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5231513977050781, "rewards/margins": 0.4782097041606903, "rewards/rejected": -1.0013611316680908, "step": 588 }, { "epoch": 0.6805565568452167, "grad_norm": 75.50192821178838, "learning_rate": 1.5871621364769553e-07, "logits/chosen": -1.5168403387069702, "logits/rejected": -1.4424357414245605, "logps/chosen": -183.81605529785156, "logps/rejected": -171.45872497558594, "loss": 0.6035, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7719120979309082, "rewards/margins": 0.2601196765899658, "rewards/rejected": -1.0320318937301636, "step": 590 }, { "epoch": 0.6828635282243529, "grad_norm": 84.93892075040027, "learning_rate": 1.5840413428936766e-07, "logits/chosen": -1.3720101118087769, "logits/rejected": -1.391021490097046, "logps/chosen": -171.98031616210938, "logps/rejected": -176.23892211914062, "loss": 0.599, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7516859769821167, "rewards/margins": 0.21854539215564728, "rewards/rejected": -0.9702314138412476, "step": 592 }, { "epoch": 0.6851704996034893, "grad_norm": 66.70595859312724, "learning_rate": 1.5809118924087733e-07, "logits/chosen": -1.4547669887542725, "logits/rejected": -1.430787205696106, "logps/chosen": -177.32481384277344, "logps/rejected": -208.61553955078125, "loss": 0.6102, "rewards/accuracies": 0.625, "rewards/chosen": -0.5358410477638245, "rewards/margins": 0.26219645142555237, "rewards/rejected": -0.7980375289916992, "step": 594 }, { "epoch": 0.6874774709826256, "grad_norm": 82.62176636567787, "learning_rate": 1.5777738314082511e-07, "logits/chosen": -1.4137248992919922, "logits/rejected": -1.404469609260559, "logps/chosen": -164.01600646972656, "logps/rejected": -184.97645568847656, "loss": 0.6472, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5918564200401306, "rewards/margins": 0.21411672234535217, "rewards/rejected": -0.8059731721878052, "step": 596 }, { "epoch": 0.689784442361762, "grad_norm": 72.16505210857706, "learning_rate": 1.5746272064057439e-07, "logits/chosen": -1.3921738862991333, "logits/rejected": -1.3382896184921265, "logps/chosen": -199.48634338378906, "logps/rejected": -226.77871704101562, "loss": 0.5858, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5180130004882812, "rewards/margins": 0.4014572501182556, "rewards/rejected": -0.9194702506065369, "step": 598 }, { "epoch": 0.6920914137408983, "grad_norm": 78.66776375616931, "learning_rate": 1.5714720640418247e-07, "logits/chosen": -1.511127233505249, "logits/rejected": -1.5256671905517578, "logps/chosen": -182.10826110839844, "logps/rejected": -198.63510131835938, "loss": 0.618, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6393432021141052, "rewards/margins": 0.16456884145736694, "rewards/rejected": -0.8039120435714722, "step": 600 }, { "epoch": 0.6920914137408983, "eval_logits/chosen": -1.4086966514587402, "eval_logits/rejected": -1.3254387378692627, "eval_logps/chosen": -191.96621704101562, "eval_logps/rejected": -160.9102325439453, "eval_loss": 0.6056262850761414, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.6784057021141052, "eval_rewards/margins": 0.3610783815383911, "eval_rewards/rejected": -1.0394840240478516, "eval_runtime": 37.022, "eval_samples_per_second": 2.701, "eval_steps_per_second": 0.675, "step": 600 }, { "epoch": 0.6943983851200346, "grad_norm": 60.2099543993624, "learning_rate": 1.5683084510833155e-07, "logits/chosen": -1.506928563117981, "logits/rejected": -1.4527332782745361, "logps/chosen": -136.79698181152344, "logps/rejected": -171.7388916015625, "loss": 0.528, "rewards/accuracies": 0.75, "rewards/chosen": -0.40472638607025146, "rewards/margins": 0.5502471923828125, "rewards/rejected": -0.9549736380577087, "step": 602 }, { "epoch": 0.696705356499171, "grad_norm": 61.94420954124892, "learning_rate": 1.5651364144225918e-07, "logits/chosen": -1.385040521621704, "logits/rejected": -1.4811543226242065, "logps/chosen": -156.2510986328125, "logps/rejected": -228.10455322265625, "loss": 0.5886, "rewards/accuracies": 0.78125, "rewards/chosen": -0.45847296714782715, "rewards/margins": 0.805530309677124, "rewards/rejected": -1.2640032768249512, "step": 604 }, { "epoch": 0.6990123278783072, "grad_norm": 68.10520883639806, "learning_rate": 1.5619560010768892e-07, "logits/chosen": -1.429363489151001, "logits/rejected": -1.474833369255066, "logps/chosen": -105.5804672241211, "logps/rejected": -149.79159545898438, "loss": 0.5776, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4616313576698303, "rewards/margins": 0.11374694108963013, "rewards/rejected": -0.5753782987594604, "step": 606 }, { "epoch": 0.7013192992574436, "grad_norm": 80.98545740701343, "learning_rate": 1.558767258187605e-07, "logits/chosen": -1.3892110586166382, "logits/rejected": -1.3783142566680908, "logps/chosen": -192.46571350097656, "logps/rejected": -301.35589599609375, "loss": 0.5993, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7047985196113586, "rewards/margins": 0.531038761138916, "rewards/rejected": -1.2358373403549194, "step": 608 }, { "epoch": 0.7036262706365799, "grad_norm": 67.59649388895859, "learning_rate": 1.555570233019602e-07, "logits/chosen": -1.528752326965332, "logits/rejected": -1.4884089231491089, "logps/chosen": -149.33953857421875, "logps/rejected": -156.87937927246094, "loss": 0.5549, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5484266877174377, "rewards/margins": 0.2577958106994629, "rewards/rejected": -0.8062225580215454, "step": 610 }, { "epoch": 0.7059332420157163, "grad_norm": 70.41159723680883, "learning_rate": 1.5523649729605057e-07, "logits/chosen": -1.549803614616394, "logits/rejected": -1.5478185415267944, "logps/chosen": -202.39064025878906, "logps/rejected": -191.8920135498047, "loss": 0.5702, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7880931496620178, "rewards/margins": 0.3749513328075409, "rewards/rejected": -1.1630443334579468, "step": 612 }, { "epoch": 0.7082402133948525, "grad_norm": 84.62171550148092, "learning_rate": 1.5491515255200023e-07, "logits/chosen": -1.3567522764205933, "logits/rejected": -1.3661439418792725, "logps/chosen": -194.17117309570312, "logps/rejected": -233.16098022460938, "loss": 0.5773, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7068712711334229, "rewards/margins": 0.5711615085601807, "rewards/rejected": -1.2780327796936035, "step": 614 }, { "epoch": 0.7105471847739889, "grad_norm": 78.0462456849789, "learning_rate": 1.5459299383291345e-07, "logits/chosen": -1.4880882501602173, "logits/rejected": -1.5348231792449951, "logps/chosen": -165.6639404296875, "logps/rejected": -203.5524139404297, "loss": 0.5763, "rewards/accuracies": 0.75, "rewards/chosen": -0.4784421920776367, "rewards/margins": 0.41431570053100586, "rewards/rejected": -0.8927579522132874, "step": 616 }, { "epoch": 0.7128541561531252, "grad_norm": 77.7841366725459, "learning_rate": 1.5427002591395964e-07, "logits/chosen": -1.4515254497528076, "logits/rejected": -1.4519662857055664, "logps/chosen": -172.7522430419922, "logps/rejected": -327.67791748046875, "loss": 0.6045, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5820431709289551, "rewards/margins": 0.3262157440185547, "rewards/rejected": -0.908258855342865, "step": 618 }, { "epoch": 0.7151611275322616, "grad_norm": 62.11750703500811, "learning_rate": 1.539462535823025e-07, "logits/chosen": -1.3129442930221558, "logits/rejected": -1.4445109367370605, "logps/chosen": -101.69780731201172, "logps/rejected": -159.57347106933594, "loss": 0.5584, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33953019976615906, "rewards/margins": 0.539714515209198, "rewards/rejected": -0.8792447447776794, "step": 620 }, { "epoch": 0.7174680989113978, "grad_norm": 64.90210429350962, "learning_rate": 1.5362168163702897e-07, "logits/chosen": -1.4593255519866943, "logits/rejected": -1.497260570526123, "logps/chosen": -174.7530975341797, "logps/rejected": -190.82069396972656, "loss": 0.5744, "rewards/accuracies": 0.5, "rewards/chosen": -0.44477853178977966, "rewards/margins": 0.10030045360326767, "rewards/rejected": -0.5450789928436279, "step": 622 }, { "epoch": 0.7197750702905342, "grad_norm": 62.61220782799182, "learning_rate": 1.5329631488907834e-07, "logits/chosen": -1.3799552917480469, "logits/rejected": -1.3643782138824463, "logps/chosen": -106.81502532958984, "logps/rejected": -163.37252807617188, "loss": 0.5539, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45432302355766296, "rewards/margins": 0.5782679319381714, "rewards/rejected": -1.0325908660888672, "step": 624 }, { "epoch": 0.7220820416696705, "grad_norm": 64.30209297911543, "learning_rate": 1.529701581611707e-07, "logits/chosen": -1.3249229192733765, "logits/rejected": -1.312375783920288, "logps/chosen": -188.57989501953125, "logps/rejected": -259.47991943359375, "loss": 0.5864, "rewards/accuracies": 0.625, "rewards/chosen": -0.6669080257415771, "rewards/margins": 0.5166423916816711, "rewards/rejected": -1.1835503578186035, "step": 626 }, { "epoch": 0.7243890130488069, "grad_norm": 81.32626957899065, "learning_rate": 1.5264321628773557e-07, "logits/chosen": -1.5589840412139893, "logits/rejected": -1.6068872213363647, "logps/chosen": -152.81272888183594, "logps/rejected": -160.92356872558594, "loss": 0.5575, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5344883799552917, "rewards/margins": 0.2917326092720032, "rewards/rejected": -0.8262209296226501, "step": 628 }, { "epoch": 0.7266959844279431, "grad_norm": 63.10819469515795, "learning_rate": 1.5231549411484021e-07, "logits/chosen": -1.3950941562652588, "logits/rejected": -1.4998093843460083, "logps/chosen": -182.9403533935547, "logps/rejected": -533.6143798828125, "loss": 0.528, "rewards/accuracies": 0.75, "rewards/chosen": -0.6201209425926208, "rewards/margins": 0.9562212228775024, "rewards/rejected": -1.5763421058654785, "step": 630 }, { "epoch": 0.7290029558070795, "grad_norm": 74.17672200121388, "learning_rate": 1.5198699650011783e-07, "logits/chosen": -1.5746220350265503, "logits/rejected": -1.5152575969696045, "logps/chosen": -114.51097869873047, "logps/rejected": -142.73123168945312, "loss": 0.5682, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5173571705818176, "rewards/margins": 0.4040333032608032, "rewards/rejected": -0.9213904738426208, "step": 632 }, { "epoch": 0.7313099271862158, "grad_norm": 74.93724286416074, "learning_rate": 1.5165772831269546e-07, "logits/chosen": -1.4648115634918213, "logits/rejected": -1.353775978088379, "logps/chosen": -166.06561279296875, "logps/rejected": -161.02513122558594, "loss": 0.5914, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6317393779754639, "rewards/margins": 0.12096243351697922, "rewards/rejected": -0.7527018189430237, "step": 634 }, { "epoch": 0.7336168985653522, "grad_norm": 78.1768997076042, "learning_rate": 1.5132769443312206e-07, "logits/chosen": -1.3494369983673096, "logits/rejected": -1.3753654956817627, "logps/chosen": -170.8379669189453, "logps/rejected": -236.137939453125, "loss": 0.5601, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7240103483200073, "rewards/margins": 0.7221677303314209, "rewards/rejected": -1.4461781978607178, "step": 636 }, { "epoch": 0.7359238699444886, "grad_norm": 72.68675984518603, "learning_rate": 1.5099689975329582e-07, "logits/chosen": -1.454272747039795, "logits/rejected": -1.3425519466400146, "logps/chosen": -159.56729125976562, "logps/rejected": -176.78076171875, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": -0.6215909719467163, "rewards/margins": 0.44397681951522827, "rewards/rejected": -1.0655678510665894, "step": 638 }, { "epoch": 0.7382308413236248, "grad_norm": 103.38173736649145, "learning_rate": 1.5066534917639194e-07, "logits/chosen": -1.5096681118011475, "logits/rejected": -1.4664478302001953, "logps/chosen": -168.00894165039062, "logps/rejected": -172.64730834960938, "loss": 0.6387, "rewards/accuracies": 0.75, "rewards/chosen": -0.6311260461807251, "rewards/margins": 0.37282636761665344, "rewards/rejected": -1.0039525032043457, "step": 640 }, { "epoch": 0.7405378127027612, "grad_norm": 89.90439421362515, "learning_rate": 1.5033304761678974e-07, "logits/chosen": -1.4626061916351318, "logits/rejected": -1.3341164588928223, "logps/chosen": -205.5465087890625, "logps/rejected": -253.4654541015625, "loss": 0.694, "rewards/accuracies": 0.59375, "rewards/chosen": -0.898277997970581, "rewards/margins": 0.6167319416999817, "rewards/rejected": -1.5150099992752075, "step": 642 }, { "epoch": 0.7428447840818975, "grad_norm": 80.94685802233828, "learning_rate": 1.5e-07, "logits/chosen": -1.5373523235321045, "logits/rejected": -1.5548286437988281, "logps/chosen": -202.79559326171875, "logps/rejected": -259.5960998535156, "loss": 0.5766, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6312007308006287, "rewards/margins": 0.5061548948287964, "rewards/rejected": -1.1373556852340698, "step": 644 }, { "epoch": 0.7451517554610338, "grad_norm": 76.95960849605379, "learning_rate": 1.4966621126259182e-07, "logits/chosen": -1.3829816579818726, "logits/rejected": -1.437648892402649, "logps/chosen": -176.02261352539062, "logps/rejected": -231.1138916015625, "loss": 0.5749, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8245862126350403, "rewards/margins": 0.5100637674331665, "rewards/rejected": -1.3346500396728516, "step": 646 }, { "epoch": 0.7474587268401701, "grad_norm": 86.16535127809355, "learning_rate": 1.4933168635211954e-07, "logits/chosen": -1.3192147016525269, "logits/rejected": -1.2893555164337158, "logps/chosen": -180.10354614257812, "logps/rejected": -209.48648071289062, "loss": 0.6253, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6453790664672852, "rewards/margins": 0.4227368235588074, "rewards/rejected": -1.0681159496307373, "step": 648 }, { "epoch": 0.7497656982193065, "grad_norm": 76.07003275403453, "learning_rate": 1.489964302270493e-07, "logits/chosen": -1.3541210889816284, "logits/rejected": -1.4539867639541626, "logps/chosen": -152.72354125976562, "logps/rejected": -201.84646606445312, "loss": 0.5653, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6403982043266296, "rewards/margins": 0.3868240416049957, "rewards/rejected": -1.0272222757339478, "step": 650 }, { "epoch": 0.7520726695984428, "grad_norm": 57.518835795007355, "learning_rate": 1.4866044785668562e-07, "logits/chosen": -1.496565341949463, "logits/rejected": -1.5742026567459106, "logps/chosen": -184.71319580078125, "logps/rejected": -223.27279663085938, "loss": 0.5689, "rewards/accuracies": 0.59375, "rewards/chosen": -0.731714129447937, "rewards/margins": 0.3506891429424286, "rewards/rejected": -1.082403302192688, "step": 652 }, { "epoch": 0.7543796409775791, "grad_norm": 69.54916104252412, "learning_rate": 1.483237442210978e-07, "logits/chosen": -1.5298668146133423, "logits/rejected": -1.4372026920318604, "logps/chosen": -182.90591430664062, "logps/rejected": -185.94142150878906, "loss": 0.5684, "rewards/accuracies": 0.5625, "rewards/chosen": -0.652667224407196, "rewards/margins": 0.2604321837425232, "rewards/rejected": -0.9130994081497192, "step": 654 }, { "epoch": 0.7566866123567154, "grad_norm": 86.35756436296293, "learning_rate": 1.479863243110459e-07, "logits/chosen": -1.5615227222442627, "logits/rejected": -1.5152493715286255, "logps/chosen": -135.16763305664062, "logps/rejected": -151.04348754882812, "loss": 0.5864, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4842577576637268, "rewards/margins": 0.39588865637779236, "rewards/rejected": -0.8801463842391968, "step": 656 }, { "epoch": 0.7589935837358518, "grad_norm": 71.27708160399212, "learning_rate": 1.4764819312790704e-07, "logits/chosen": -1.3233333826065063, "logits/rejected": -1.3788268566131592, "logps/chosen": -167.7100067138672, "logps/rejected": -222.81932067871094, "loss": 0.5365, "rewards/accuracies": 0.75, "rewards/chosen": -0.4721766710281372, "rewards/margins": 0.8050169944763184, "rewards/rejected": -1.277193546295166, "step": 658 }, { "epoch": 0.7613005551149881, "grad_norm": 67.4040730071334, "learning_rate": 1.4730935568360101e-07, "logits/chosen": -1.4443995952606201, "logits/rejected": -1.4764584302902222, "logps/chosen": -132.14419555664062, "logps/rejected": -231.72938537597656, "loss": 0.574, "rewards/accuracies": 0.625, "rewards/chosen": -0.47360759973526, "rewards/margins": 0.66644686460495, "rewards/rejected": -1.14005446434021, "step": 660 }, { "epoch": 0.7636075264941244, "grad_norm": 72.31061103050068, "learning_rate": 1.4696981700051613e-07, "logits/chosen": -1.4308209419250488, "logits/rejected": -1.5072565078735352, "logps/chosen": -215.11119079589844, "logps/rejected": -308.4154968261719, "loss": 0.5539, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7280828952789307, "rewards/margins": 0.589575469493866, "rewards/rejected": -1.3176583051681519, "step": 662 }, { "epoch": 0.7659144978732607, "grad_norm": 78.75618194817389, "learning_rate": 1.4662958211143478e-07, "logits/chosen": -1.4515475034713745, "logits/rejected": -1.3340954780578613, "logps/chosen": -177.06072998046875, "logps/rejected": -191.52259826660156, "loss": 0.546, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6530216336250305, "rewards/margins": 0.680112361907959, "rewards/rejected": -1.3331340551376343, "step": 664 }, { "epoch": 0.7682214692523971, "grad_norm": 71.81422997168194, "learning_rate": 1.4628865605945884e-07, "logits/chosen": -1.4638550281524658, "logits/rejected": -1.562100887298584, "logps/chosen": -149.26190185546875, "logps/rejected": -181.56045532226562, "loss": 0.5891, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6122503876686096, "rewards/margins": 0.3339022099971771, "rewards/rejected": -0.9461526870727539, "step": 666 }, { "epoch": 0.7705284406315334, "grad_norm": 64.13905687965561, "learning_rate": 1.4594704389793476e-07, "logits/chosen": -1.342291235923767, "logits/rejected": -1.3401648998260498, "logps/chosen": -132.9333038330078, "logps/rejected": -151.05657958984375, "loss": 0.5292, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4934079647064209, "rewards/margins": 0.5540266036987305, "rewards/rejected": -1.0474345684051514, "step": 668 }, { "epoch": 0.7728354120106697, "grad_norm": 64.25891918041506, "learning_rate": 1.4560475069037895e-07, "logits/chosen": -1.4274932146072388, "logits/rejected": -1.320129156112671, "logps/chosen": -131.24008178710938, "logps/rejected": -158.29779052734375, "loss": 0.559, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47251197695732117, "rewards/margins": 0.6539871692657471, "rewards/rejected": -1.126499056816101, "step": 670 }, { "epoch": 0.7751423833898061, "grad_norm": 79.74116137345376, "learning_rate": 1.4526178151040238e-07, "logits/chosen": -1.4236022233963013, "logits/rejected": -1.4343180656433105, "logps/chosen": -216.5073699951172, "logps/rejected": -253.56356811523438, "loss": 0.6319, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8843379616737366, "rewards/margins": 0.40150371193885803, "rewards/rejected": -1.285841703414917, "step": 672 }, { "epoch": 0.7774493547689424, "grad_norm": 56.17449348305076, "learning_rate": 1.449181414416357e-07, "logits/chosen": -1.43699312210083, "logits/rejected": -1.4178071022033691, "logps/chosen": -157.00311279296875, "logps/rejected": -156.90017700195312, "loss": 0.5701, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6511672139167786, "rewards/margins": 0.3164646625518799, "rewards/rejected": -0.9676318168640137, "step": 674 }, { "epoch": 0.7797563261480788, "grad_norm": 66.63614533861005, "learning_rate": 1.4457383557765383e-07, "logits/chosen": -1.4309780597686768, "logits/rejected": -1.4221723079681396, "logps/chosen": -151.88272094726562, "logps/rejected": -181.7390594482422, "loss": 0.5585, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7592535614967346, "rewards/margins": 0.47323358058929443, "rewards/rejected": -1.2324870824813843, "step": 676 }, { "epoch": 0.782063297527215, "grad_norm": 72.3899064306705, "learning_rate": 1.4422886902190013e-07, "logits/chosen": -1.4335781335830688, "logits/rejected": -1.3959101438522339, "logps/chosen": -227.32432556152344, "logps/rejected": -237.228515625, "loss": 0.5979, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7621864676475525, "rewards/margins": 0.6047258377075195, "rewards/rejected": -1.3669122457504272, "step": 678 }, { "epoch": 0.7843702689063514, "grad_norm": 68.22407643194211, "learning_rate": 1.438832468876112e-07, "logits/chosen": -1.5670726299285889, "logits/rejected": -1.5206338167190552, "logps/chosen": -162.11500549316406, "logps/rejected": -189.17723083496094, "loss": 0.5806, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46744874119758606, "rewards/margins": 0.37639617919921875, "rewards/rejected": -0.843845009803772, "step": 680 }, { "epoch": 0.7866772402854877, "grad_norm": 76.17187863595697, "learning_rate": 1.435369742977408e-07, "logits/chosen": -1.3989527225494385, "logits/rejected": -1.331002116203308, "logps/chosen": -150.4923553466797, "logps/rejected": -152.4705810546875, "loss": 0.5466, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5739311575889587, "rewards/margins": 0.6111598610877991, "rewards/rejected": -1.1850910186767578, "step": 682 }, { "epoch": 0.7889842116646241, "grad_norm": 71.79810308029995, "learning_rate": 1.4319005638488411e-07, "logits/chosen": -1.4992091655731201, "logits/rejected": -1.5468194484710693, "logps/chosen": -139.28636169433594, "logps/rejected": -164.89332580566406, "loss": 0.6042, "rewards/accuracies": 0.75, "rewards/chosen": -0.5346908569335938, "rewards/margins": 0.4574483633041382, "rewards/rejected": -0.9921392202377319, "step": 684 }, { "epoch": 0.7912911830437603, "grad_norm": 84.61839383932663, "learning_rate": 1.4284249829120144e-07, "logits/chosen": -1.4420665502548218, "logits/rejected": -1.4561961889266968, "logps/chosen": -163.67910766601562, "logps/rejected": -212.11285400390625, "loss": 0.5338, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6722000241279602, "rewards/margins": 0.5183134078979492, "rewards/rejected": -1.1905133724212646, "step": 686 }, { "epoch": 0.7935981544228967, "grad_norm": 69.04050689616894, "learning_rate": 1.4249430516834219e-07, "logits/chosen": -1.427943468093872, "logits/rejected": -1.2925009727478027, "logps/chosen": -170.96783447265625, "logps/rejected": -154.1614227294922, "loss": 0.6101, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6926352977752686, "rewards/margins": 0.3386973738670349, "rewards/rejected": -1.0313327312469482, "step": 688 }, { "epoch": 0.795905125802033, "grad_norm": 79.30403180179479, "learning_rate": 1.4214548217736842e-07, "logits/chosen": -1.42530357837677, "logits/rejected": -1.4162826538085938, "logps/chosen": -159.53546142578125, "logps/rejected": -169.24526977539062, "loss": 0.5704, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5039569139480591, "rewards/margins": 0.28192925453186035, "rewards/rejected": -0.7858862280845642, "step": 690 }, { "epoch": 0.7982120971811694, "grad_norm": 73.96748010582954, "learning_rate": 1.4179603448867835e-07, "logits/chosen": -1.3817723989486694, "logits/rejected": -1.493046760559082, "logps/chosen": -159.07061767578125, "logps/rejected": -224.4259033203125, "loss": 0.5923, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6248303651809692, "rewards/margins": 0.5017886161804199, "rewards/rejected": -1.1266189813613892, "step": 692 }, { "epoch": 0.8005190685603056, "grad_norm": 84.16059294161194, "learning_rate": 1.414459672819297e-07, "logits/chosen": -1.3444278240203857, "logits/rejected": -1.3752013444900513, "logps/chosen": -148.96214294433594, "logps/rejected": -246.97567749023438, "loss": 0.6059, "rewards/accuracies": 0.75, "rewards/chosen": -0.723760187625885, "rewards/margins": 0.6408222913742065, "rewards/rejected": -1.3645824193954468, "step": 694 }, { "epoch": 0.802826039939442, "grad_norm": 81.81384074979675, "learning_rate": 1.41095285745963e-07, "logits/chosen": -1.3597787618637085, "logits/rejected": -1.422480821609497, "logps/chosen": -184.29083251953125, "logps/rejected": -324.6047668457031, "loss": 0.57, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7926025390625, "rewards/margins": 0.9075123071670532, "rewards/rejected": -1.7001147270202637, "step": 696 }, { "epoch": 0.8051330113185783, "grad_norm": 86.69046431842193, "learning_rate": 1.4074399507872455e-07, "logits/chosen": -1.46107017993927, "logits/rejected": -1.5106903314590454, "logps/chosen": -169.36878967285156, "logps/rejected": -217.03964233398438, "loss": 0.5998, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9666664004325867, "rewards/margins": 0.4315177798271179, "rewards/rejected": -1.3981841802597046, "step": 698 }, { "epoch": 0.8074399826977147, "grad_norm": 72.764039414317, "learning_rate": 1.4039210048718947e-07, "logits/chosen": -1.3857448101043701, "logits/rejected": -1.38966703414917, "logps/chosen": -236.34902954101562, "logps/rejected": -287.2010803222656, "loss": 0.5593, "rewards/accuracies": 0.625, "rewards/chosen": -1.1096588373184204, "rewards/margins": 0.5392670631408691, "rewards/rejected": -1.648926019668579, "step": 700 }, { "epoch": 0.8074399826977147, "eval_logits/chosen": -1.4024839401245117, "eval_logits/rejected": -1.3188287019729614, "eval_logps/chosen": -193.01976013183594, "eval_logps/rejected": -162.88392639160156, "eval_loss": 0.581591010093689, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.7837581038475037, "eval_rewards/margins": 0.45309457182884216, "eval_rewards/rejected": -1.2368526458740234, "eval_runtime": 28.9042, "eval_samples_per_second": 3.46, "eval_steps_per_second": 0.865, "step": 700 }, { "epoch": 0.8097469540768509, "grad_norm": 92.59632644873196, "learning_rate": 1.4003960718728458e-07, "logits/chosen": -1.434832215309143, "logits/rejected": -1.368481159210205, "logps/chosen": -188.60244750976562, "logps/rejected": -191.86891174316406, "loss": 0.5808, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7094125747680664, "rewards/margins": 0.36493563652038574, "rewards/rejected": -1.0743482112884521, "step": 702 }, { "epoch": 0.8120539254559873, "grad_norm": 88.02921453439615, "learning_rate": 1.3968652040381087e-07, "logits/chosen": -1.5649467706680298, "logits/rejected": -1.6187723875045776, "logps/chosen": -174.0366668701172, "logps/rejected": -194.51516723632812, "loss": 0.5795, "rewards/accuracies": 0.625, "rewards/chosen": -0.7656524777412415, "rewards/margins": 0.09420950710773468, "rewards/rejected": -0.8598620295524597, "step": 704 }, { "epoch": 0.8143608968351237, "grad_norm": 70.53446281652334, "learning_rate": 1.3933284537036626e-07, "logits/chosen": -1.3043855428695679, "logits/rejected": -1.4392000436782837, "logps/chosen": -184.783935546875, "logps/rejected": -343.3705139160156, "loss": 0.56, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8058075904846191, "rewards/margins": 0.9816871285438538, "rewards/rejected": -1.7874946594238281, "step": 706 }, { "epoch": 0.81666786821426, "grad_norm": 85.1907599000231, "learning_rate": 1.3897858732926794e-07, "logits/chosen": -1.5064363479614258, "logits/rejected": -1.4735333919525146, "logps/chosen": -206.25363159179688, "logps/rejected": -199.1461944580078, "loss": 0.6216, "rewards/accuracies": 0.625, "rewards/chosen": -0.7892999053001404, "rewards/margins": 0.3833533525466919, "rewards/rejected": -1.1726531982421875, "step": 708 }, { "epoch": 0.8189748395933963, "grad_norm": 73.09038090478488, "learning_rate": 1.3862375153147464e-07, "logits/chosen": -1.545288324356079, "logits/rejected": -1.5258712768554688, "logps/chosen": -154.7598419189453, "logps/rejected": -196.91165161132812, "loss": 0.5838, "rewards/accuracies": 0.71875, "rewards/chosen": -0.47864556312561035, "rewards/margins": 0.47909313440322876, "rewards/rejected": -0.9577386379241943, "step": 710 }, { "epoch": 0.8212818109725326, "grad_norm": 93.34170526258869, "learning_rate": 1.3826834323650897e-07, "logits/chosen": -1.4695608615875244, "logits/rejected": -1.4735374450683594, "logps/chosen": -188.8636474609375, "logps/rejected": -209.92041015625, "loss": 0.5469, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8135924935340881, "rewards/margins": 0.24801144003868103, "rewards/rejected": -1.0616040229797363, "step": 712 }, { "epoch": 0.823588782351669, "grad_norm": 74.73130142354223, "learning_rate": 1.3791236771237917e-07, "logits/chosen": -1.384385108947754, "logits/rejected": -1.5454891920089722, "logps/chosen": -167.55824279785156, "logps/rejected": -229.03851318359375, "loss": 0.5864, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6291375756263733, "rewards/margins": 0.5017825961112976, "rewards/rejected": -1.130920171737671, "step": 714 }, { "epoch": 0.8258957537308053, "grad_norm": 82.88128786393553, "learning_rate": 1.3755583023550127e-07, "logits/chosen": -1.4609194993972778, "logits/rejected": -1.5298848152160645, "logps/chosen": -197.1524658203125, "logps/rejected": -219.78167724609375, "loss": 0.5435, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7119307518005371, "rewards/margins": 0.5048573613166809, "rewards/rejected": -1.2167880535125732, "step": 716 }, { "epoch": 0.8282027251099416, "grad_norm": 81.32166792287437, "learning_rate": 1.3719873609062075e-07, "logits/chosen": -1.3578637838363647, "logits/rejected": -1.3476288318634033, "logps/chosen": -183.71852111816406, "logps/rejected": -201.42977905273438, "loss": 0.6069, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7198283076286316, "rewards/margins": 0.41808462142944336, "rewards/rejected": -1.1379129886627197, "step": 718 }, { "epoch": 0.8305096964890779, "grad_norm": 76.1412354910528, "learning_rate": 1.3684109057073433e-07, "logits/chosen": -1.5357989072799683, "logits/rejected": -1.5547668933868408, "logps/chosen": -158.44583129882812, "logps/rejected": -186.08767700195312, "loss": 0.5387, "rewards/accuracies": 0.78125, "rewards/chosen": -0.55253666639328, "rewards/margins": 0.6118079423904419, "rewards/rejected": -1.1643445491790771, "step": 720 }, { "epoch": 0.8328166678682143, "grad_norm": 76.88330992572241, "learning_rate": 1.3648289897701134e-07, "logits/chosen": -1.4407174587249756, "logits/rejected": -1.4180082082748413, "logps/chosen": -229.4559326171875, "logps/rejected": -275.36004638671875, "loss": 0.5708, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9139447808265686, "rewards/margins": 0.7458863258361816, "rewards/rejected": -1.6598312854766846, "step": 722 }, { "epoch": 0.8351236392473506, "grad_norm": 76.40403716300928, "learning_rate": 1.361241666187153e-07, "logits/chosen": -1.5596188306808472, "logits/rejected": -1.4953889846801758, "logps/chosen": -181.32513427734375, "logps/rejected": -228.9872589111328, "loss": 0.558, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6869115233421326, "rewards/margins": 0.6277297735214233, "rewards/rejected": -1.3146413564682007, "step": 724 }, { "epoch": 0.8374306106264869, "grad_norm": 88.00729362814612, "learning_rate": 1.3576489881312516e-07, "logits/chosen": -1.4083036184310913, "logits/rejected": -1.4413796663284302, "logps/chosen": -163.39768981933594, "logps/rejected": -190.90277099609375, "loss": 0.6253, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7067568898200989, "rewards/margins": 0.3021257817745209, "rewards/rejected": -1.0088826417922974, "step": 726 }, { "epoch": 0.8397375820056232, "grad_norm": 69.35704721685642, "learning_rate": 1.354051008854565e-07, "logits/chosen": -1.4755336046218872, "logits/rejected": -1.4025901556015015, "logps/chosen": -166.0547332763672, "logps/rejected": -179.10647583007812, "loss": 0.5798, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5479276776313782, "rewards/margins": 0.6779038310050964, "rewards/rejected": -1.2258315086364746, "step": 728 }, { "epoch": 0.8420445533847596, "grad_norm": 63.6003273966082, "learning_rate": 1.3504477816878258e-07, "logits/chosen": -1.4906251430511475, "logits/rejected": -1.524994134902954, "logps/chosen": -152.8936309814453, "logps/rejected": -154.26930236816406, "loss": 0.5613, "rewards/accuracies": 0.75, "rewards/chosen": -0.41379597783088684, "rewards/margins": 0.31041520833969116, "rewards/rejected": -0.7242112159729004, "step": 730 }, { "epoch": 0.8443515247638959, "grad_norm": 72.20930120779202, "learning_rate": 1.3468393600395524e-07, "logits/chosen": -1.4875296354293823, "logits/rejected": -1.4794172048568726, "logps/chosen": -180.05352783203125, "logps/rejected": -201.59454345703125, "loss": 0.5522, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7309121489524841, "rewards/margins": 0.2411407232284546, "rewards/rejected": -0.972052812576294, "step": 732 }, { "epoch": 0.8466584961430322, "grad_norm": 68.5642733850933, "learning_rate": 1.3432257973952592e-07, "logits/chosen": -1.4924743175506592, "logits/rejected": -1.450230598449707, "logps/chosen": -197.60543823242188, "logps/rejected": -195.38174438476562, "loss": 0.6159, "rewards/accuracies": 0.5, "rewards/chosen": -0.9123257994651794, "rewards/margins": 0.038229409605264664, "rewards/rejected": -0.9505552053451538, "step": 734 }, { "epoch": 0.8489654675221685, "grad_norm": 89.1573385492241, "learning_rate": 1.3396071473166612e-07, "logits/chosen": -1.4010210037231445, "logits/rejected": -1.3554859161376953, "logps/chosen": -166.287841796875, "logps/rejected": -207.92405700683594, "loss": 0.5817, "rewards/accuracies": 0.625, "rewards/chosen": -0.7026151418685913, "rewards/margins": 0.763970136642456, "rewards/rejected": -1.4665852785110474, "step": 736 }, { "epoch": 0.8512724389013049, "grad_norm": 102.9368979611397, "learning_rate": 1.3359834634408828e-07, "logits/chosen": -1.4197382926940918, "logits/rejected": -1.431471586227417, "logps/chosen": -194.96832275390625, "logps/rejected": -240.55545043945312, "loss": 0.5919, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6298823356628418, "rewards/margins": 0.6693976521492004, "rewards/rejected": -1.299280047416687, "step": 738 }, { "epoch": 0.8535794102804412, "grad_norm": 66.37386209857932, "learning_rate": 1.3323547994796595e-07, "logits/chosen": -1.3606467247009277, "logits/rejected": -1.399714708328247, "logps/chosen": -149.84036254882812, "logps/rejected": -184.58444213867188, "loss": 0.5977, "rewards/accuracies": 0.75, "rewards/chosen": -0.527281641960144, "rewards/margins": 0.46527186036109924, "rewards/rejected": -0.9925534725189209, "step": 740 }, { "epoch": 0.8558863816595775, "grad_norm": 64.67920251978872, "learning_rate": 1.3287212092185464e-07, "logits/chosen": -1.3525441884994507, "logits/rejected": -1.3670110702514648, "logps/chosen": -173.00009155273438, "logps/rejected": -217.89830017089844, "loss": 0.5148, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4874517023563385, "rewards/margins": 0.9208201766014099, "rewards/rejected": -1.4082717895507812, "step": 742 }, { "epoch": 0.8581933530387139, "grad_norm": 63.56760422068707, "learning_rate": 1.3250827465161151e-07, "logits/chosen": -1.5336980819702148, "logits/rejected": -1.560868501663208, "logps/chosen": -146.79489135742188, "logps/rejected": -155.26324462890625, "loss": 0.6079, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4935835003852844, "rewards/margins": 0.11392290890216827, "rewards/rejected": -0.6075063943862915, "step": 744 }, { "epoch": 0.8605003244178502, "grad_norm": 78.89522512382298, "learning_rate": 1.3214394653031614e-07, "logits/chosen": -1.4548529386520386, "logits/rejected": -1.3734767436981201, "logps/chosen": -234.0319061279297, "logps/rejected": -297.91796875, "loss": 0.5865, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8137804269790649, "rewards/margins": 0.6477915048599243, "rewards/rejected": -1.4615719318389893, "step": 746 }, { "epoch": 0.8628072957969866, "grad_norm": 65.37034003085697, "learning_rate": 1.3177914195819015e-07, "logits/chosen": -1.5144624710083008, "logits/rejected": -1.4655039310455322, "logps/chosen": -158.4036407470703, "logps/rejected": -191.18508911132812, "loss": 0.555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5954052209854126, "rewards/margins": 0.38705208897590637, "rewards/rejected": -0.9824572801589966, "step": 748 }, { "epoch": 0.8651142671761228, "grad_norm": 59.16704488192239, "learning_rate": 1.3141386634251734e-07, "logits/chosen": -1.3658645153045654, "logits/rejected": -1.4540934562683105, "logps/chosen": -150.2242431640625, "logps/rejected": -222.25180053710938, "loss": 0.5299, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5700072646141052, "rewards/margins": 0.9387847185134888, "rewards/rejected": -1.5087921619415283, "step": 750 }, { "epoch": 0.8674212385552592, "grad_norm": 67.80027199845135, "learning_rate": 1.3104812509756348e-07, "logits/chosen": -1.3619776964187622, "logits/rejected": -1.4681222438812256, "logps/chosen": -215.67242431640625, "logps/rejected": -251.7977294921875, "loss": 0.581, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6836814880371094, "rewards/margins": 0.4148728549480438, "rewards/rejected": -1.0985543727874756, "step": 752 }, { "epoch": 0.8697282099343955, "grad_norm": 78.14387370484089, "learning_rate": 1.3068192364449616e-07, "logits/chosen": -1.4636235237121582, "logits/rejected": -1.381880760192871, "logps/chosen": -200.4328155517578, "logps/rejected": -232.8759765625, "loss": 0.6182, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7163984179496765, "rewards/margins": 0.484641432762146, "rewards/rejected": -1.2010399103164673, "step": 754 }, { "epoch": 0.8720351813135319, "grad_norm": 81.47898503729805, "learning_rate": 1.3031526741130435e-07, "logits/chosen": -1.4324874877929688, "logits/rejected": -1.4492231607437134, "logps/chosen": -242.81385803222656, "logps/rejected": -297.9952697753906, "loss": 0.5814, "rewards/accuracies": 0.8125, "rewards/chosen": -0.759175717830658, "rewards/margins": 0.6992828845977783, "rewards/rejected": -1.458458423614502, "step": 756 }, { "epoch": 0.8743421526926681, "grad_norm": 68.36284032683393, "learning_rate": 1.2994816183271787e-07, "logits/chosen": -1.4733608961105347, "logits/rejected": -1.48493492603302, "logps/chosen": -170.6515350341797, "logps/rejected": -165.4122314453125, "loss": 0.6076, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7016798257827759, "rewards/margins": 0.40051549673080444, "rewards/rejected": -1.1021952629089355, "step": 758 }, { "epoch": 0.8766491240718045, "grad_norm": 61.599893474989166, "learning_rate": 1.2958061235012705e-07, "logits/chosen": -1.496275544166565, "logits/rejected": -1.460773229598999, "logps/chosen": -176.19471740722656, "logps/rejected": -198.0939178466797, "loss": 0.5757, "rewards/accuracies": 0.75, "rewards/chosen": -0.5562551021575928, "rewards/margins": 0.5198081135749817, "rewards/rejected": -1.0760632753372192, "step": 760 }, { "epoch": 0.8789560954509408, "grad_norm": 70.46057628732943, "learning_rate": 1.2921262441150183e-07, "logits/chosen": -1.3644537925720215, "logits/rejected": -1.445560097694397, "logps/chosen": -151.073974609375, "logps/rejected": -173.70281982421875, "loss": 0.5721, "rewards/accuracies": 0.75, "rewards/chosen": -0.5794097781181335, "rewards/margins": 0.4722179174423218, "rewards/rejected": -1.0516277551651, "step": 762 }, { "epoch": 0.8812630668300772, "grad_norm": 69.10199599212679, "learning_rate": 1.2884420347131121e-07, "logits/chosen": -1.4612513780593872, "logits/rejected": -1.3745832443237305, "logps/chosen": -196.2886962890625, "logps/rejected": -216.65379333496094, "loss": 0.5377, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7168931365013123, "rewards/margins": 0.5213975310325623, "rewards/rejected": -1.2382906675338745, "step": 764 }, { "epoch": 0.8835700382092134, "grad_norm": 90.51296519002845, "learning_rate": 1.284753549904423e-07, "logits/chosen": -1.4165701866149902, "logits/rejected": -1.3087973594665527, "logps/chosen": -173.92013549804688, "logps/rejected": -149.10337829589844, "loss": 0.6138, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6343285441398621, "rewards/margins": 0.3654475212097168, "rewards/rejected": -0.9997760653495789, "step": 766 }, { "epoch": 0.8858770095883498, "grad_norm": 65.98966257822153, "learning_rate": 1.281060844361194e-07, "logits/chosen": -1.4396263360977173, "logits/rejected": -1.4435733556747437, "logps/chosen": -174.42852783203125, "logps/rejected": -210.5076904296875, "loss": 0.5741, "rewards/accuracies": 0.625, "rewards/chosen": -0.8263449668884277, "rewards/margins": 0.2867377698421478, "rewards/rejected": -1.1130828857421875, "step": 768 }, { "epoch": 0.8881839809674861, "grad_norm": 74.46183309877627, "learning_rate": 1.277363972818229e-07, "logits/chosen": -1.4313443899154663, "logits/rejected": -1.3114702701568604, "logps/chosen": -224.059326171875, "logps/rejected": -208.44281005859375, "loss": 0.5699, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7303236126899719, "rewards/margins": 0.44990167021751404, "rewards/rejected": -1.1802253723144531, "step": 770 }, { "epoch": 0.8904909523466225, "grad_norm": 73.75893335166967, "learning_rate": 1.273662990072083e-07, "logits/chosen": -1.5675609111785889, "logits/rejected": -1.5368494987487793, "logps/chosen": -171.8189239501953, "logps/rejected": -178.9991912841797, "loss": 0.5579, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6912405490875244, "rewards/margins": 0.27446943521499634, "rewards/rejected": -0.965709924697876, "step": 772 }, { "epoch": 0.8927979237257587, "grad_norm": 74.45202982534715, "learning_rate": 1.2699579509802477e-07, "logits/chosen": -1.381720781326294, "logits/rejected": -1.378785252571106, "logps/chosen": -250.43006896972656, "logps/rejected": -286.6543884277344, "loss": 0.5975, "rewards/accuracies": 0.5, "rewards/chosen": -1.0578484535217285, "rewards/margins": 0.05342524126172066, "rewards/rejected": -1.1112737655639648, "step": 774 }, { "epoch": 0.8951048951048951, "grad_norm": 70.06032358406004, "learning_rate": 1.2662489104603408e-07, "logits/chosen": -1.3842642307281494, "logits/rejected": -1.447717547416687, "logps/chosen": -168.9984893798828, "logps/rejected": -207.52444458007812, "loss": 0.5499, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7638756632804871, "rewards/margins": 0.5212388038635254, "rewards/rejected": -1.2851144075393677, "step": 776 }, { "epoch": 0.8974118664840315, "grad_norm": 66.63104285153635, "learning_rate": 1.2625359234892904e-07, "logits/chosen": -1.494588851928711, "logits/rejected": -1.451404333114624, "logps/chosen": -174.24356079101562, "logps/rejected": -197.6580810546875, "loss": 0.5362, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6031234860420227, "rewards/margins": 0.3164597749710083, "rewards/rejected": -0.9195833206176758, "step": 778 }, { "epoch": 0.8997188378631678, "grad_norm": 79.81280336065728, "learning_rate": 1.2588190451025208e-07, "logits/chosen": -1.5040947198867798, "logits/rejected": -1.4643135070800781, "logps/chosen": -162.25146484375, "logps/rejected": -201.78736877441406, "loss": 0.5741, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6632722616195679, "rewards/margins": 0.5367907881736755, "rewards/rejected": -1.2000629901885986, "step": 780 }, { "epoch": 0.9020258092423041, "grad_norm": 52.09260056010907, "learning_rate": 1.2550983303931355e-07, "logits/chosen": -1.459741234779358, "logits/rejected": -1.5611631870269775, "logps/chosen": -142.47457885742188, "logps/rejected": -182.74696350097656, "loss": 0.527, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6918852925300598, "rewards/margins": 0.42247286438941956, "rewards/rejected": -1.1143580675125122, "step": 782 }, { "epoch": 0.9043327806214404, "grad_norm": 85.88356052811159, "learning_rate": 1.2513738345111027e-07, "logits/chosen": -1.4681496620178223, "logits/rejected": -1.5013573169708252, "logps/chosen": -177.7476806640625, "logps/rejected": -230.48324584960938, "loss": 0.5762, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7633711695671082, "rewards/margins": 0.5456692576408386, "rewards/rejected": -1.3090405464172363, "step": 784 }, { "epoch": 0.9066397520005768, "grad_norm": 65.36266724814506, "learning_rate": 1.2476456126624362e-07, "logits/chosen": -1.42805814743042, "logits/rejected": -1.410811185836792, "logps/chosen": -192.83355712890625, "logps/rejected": -199.4910430908203, "loss": 0.5464, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6136312484741211, "rewards/margins": 0.6339128613471985, "rewards/rejected": -1.2475441694259644, "step": 786 }, { "epoch": 0.908946723379713, "grad_norm": 70.49473006417183, "learning_rate": 1.2439137201083773e-07, "logits/chosen": -1.395878553390503, "logits/rejected": -1.4350143671035767, "logps/chosen": -168.0191650390625, "logps/rejected": -204.7823944091797, "loss": 0.5237, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6597675681114197, "rewards/margins": 0.4419274926185608, "rewards/rejected": -1.10169517993927, "step": 788 }, { "epoch": 0.9112536947588494, "grad_norm": 66.94572101908982, "learning_rate": 1.2401782121645766e-07, "logits/chosen": -1.4355082511901855, "logits/rejected": -1.3577499389648438, "logps/chosen": -182.4734344482422, "logps/rejected": -229.62594604492188, "loss": 0.5091, "rewards/accuracies": 0.75, "rewards/chosen": -0.931696891784668, "rewards/margins": 0.7762287259101868, "rewards/rejected": -1.70792555809021, "step": 790 }, { "epoch": 0.9135606661379857, "grad_norm": 61.38646622891244, "learning_rate": 1.236439144200273e-07, "logits/chosen": -1.4619312286376953, "logits/rejected": -1.400052547454834, "logps/chosen": -193.9796600341797, "logps/rejected": -239.8403778076172, "loss": 0.5438, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6676934957504272, "rewards/margins": 0.9647277593612671, "rewards/rejected": -1.6324213743209839, "step": 792 }, { "epoch": 0.9158676375171221, "grad_norm": 64.0363496745867, "learning_rate": 1.2326965716374745e-07, "logits/chosen": -1.377258062362671, "logits/rejected": -1.3179837465286255, "logps/chosen": -163.70166015625, "logps/rejected": -187.57347106933594, "loss": 0.5332, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6885131597518921, "rewards/margins": 0.731677770614624, "rewards/rejected": -1.4201909303665161, "step": 794 }, { "epoch": 0.9181746088962583, "grad_norm": 84.44705948583216, "learning_rate": 1.2289505499501342e-07, "logits/chosen": -1.402807593345642, "logits/rejected": -1.5039292573928833, "logps/chosen": -173.46568298339844, "logps/rejected": -235.6153564453125, "loss": 0.5977, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7485046982765198, "rewards/margins": 0.44616127014160156, "rewards/rejected": -1.1946659088134766, "step": 796 }, { "epoch": 0.9204815802753947, "grad_norm": 78.69447391068418, "learning_rate": 1.2252011346633304e-07, "logits/chosen": -1.4286327362060547, "logits/rejected": -1.5545501708984375, "logps/chosen": -181.63096618652344, "logps/rejected": -206.10789489746094, "loss": 0.5791, "rewards/accuracies": 0.75, "rewards/chosen": -0.7195813655853271, "rewards/margins": 0.3036889433860779, "rewards/rejected": -1.0232703685760498, "step": 798 }, { "epoch": 0.922788551654531, "grad_norm": 75.64682467171481, "learning_rate": 1.2214483813524428e-07, "logits/chosen": -1.4555621147155762, "logits/rejected": -1.455336332321167, "logps/chosen": -175.06056213378906, "logps/rejected": -196.7834930419922, "loss": 0.6186, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8022370934486389, "rewards/margins": 0.4286205768585205, "rewards/rejected": -1.2308577299118042, "step": 800 }, { "epoch": 0.922788551654531, "eval_logits/chosen": -1.3925344944000244, "eval_logits/rejected": -1.3117806911468506, "eval_logps/chosen": -194.27879333496094, "eval_logps/rejected": -164.40199279785156, "eval_loss": 0.5684230327606201, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.9096614718437195, "eval_rewards/margins": 0.47899794578552246, "eval_rewards/rejected": -1.3886592388153076, "eval_runtime": 27.1566, "eval_samples_per_second": 3.682, "eval_steps_per_second": 0.921, "step": 800 }, { "epoch": 0.9250955230336674, "grad_norm": 58.026864033274386, "learning_rate": 1.2176923456423282e-07, "logits/chosen": -1.4152424335479736, "logits/rejected": -1.4212266206741333, "logps/chosen": -213.00340270996094, "logps/rejected": -278.18646240234375, "loss": 0.5195, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8872989416122437, "rewards/margins": 0.8531396389007568, "rewards/rejected": -1.74043869972229, "step": 802 }, { "epoch": 0.9274024944128036, "grad_norm": 79.10112459107924, "learning_rate": 1.2139330832064973e-07, "logits/chosen": -1.3687729835510254, "logits/rejected": -1.3873934745788574, "logps/chosen": -171.1462860107422, "logps/rejected": -211.0199737548828, "loss": 0.6017, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9144918918609619, "rewards/margins": 0.24607527256011963, "rewards/rejected": -1.160567283630371, "step": 804 }, { "epoch": 0.92970946579194, "grad_norm": 76.51626942304289, "learning_rate": 1.2101706497662877e-07, "logits/chosen": -1.4041192531585693, "logits/rejected": -1.489201307296753, "logps/chosen": -116.04072570800781, "logps/rejected": -152.79568481445312, "loss": 0.6006, "rewards/accuracies": 0.75, "rewards/chosen": -0.623583197593689, "rewards/margins": 0.3285295367240906, "rewards/rejected": -0.9521127343177795, "step": 806 }, { "epoch": 0.9320164371710763, "grad_norm": 101.33844171967075, "learning_rate": 1.2064051010900395e-07, "logits/chosen": -1.5294184684753418, "logits/rejected": -1.4096076488494873, "logps/chosen": -190.36326599121094, "logps/rejected": -191.35450744628906, "loss": 0.5799, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8971662521362305, "rewards/margins": 0.6536959409713745, "rewards/rejected": -1.550862193107605, "step": 808 }, { "epoch": 0.9343234085502127, "grad_norm": 83.05378892487924, "learning_rate": 1.202636492992268e-07, "logits/chosen": -1.3589162826538086, "logits/rejected": -1.4270801544189453, "logps/chosen": -175.5372314453125, "logps/rejected": -191.54107666015625, "loss": 0.5364, "rewards/accuracies": 0.78125, "rewards/chosen": -0.812555730342865, "rewards/margins": 0.5130024552345276, "rewards/rejected": -1.325558066368103, "step": 810 }, { "epoch": 0.936630379929349, "grad_norm": 91.03023580750298, "learning_rate": 1.1988648813328367e-07, "logits/chosen": -1.458229660987854, "logits/rejected": -1.499216079711914, "logps/chosen": -161.9021759033203, "logps/rejected": -194.4558563232422, "loss": 0.6202, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8818404078483582, "rewards/margins": 0.5195723176002502, "rewards/rejected": -1.4014127254486084, "step": 812 }, { "epoch": 0.9389373513084853, "grad_norm": 66.03519272128433, "learning_rate": 1.1950903220161285e-07, "logits/chosen": -1.4822351932525635, "logits/rejected": -1.4912382364273071, "logps/chosen": -217.2351837158203, "logps/rejected": -242.7360076904297, "loss": 0.5274, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6898525357246399, "rewards/margins": 0.736096203327179, "rewards/rejected": -1.4259487390518188, "step": 814 }, { "epoch": 0.9412443226876217, "grad_norm": 75.29793707597936, "learning_rate": 1.1913128709902181e-07, "logits/chosen": -1.3654075860977173, "logits/rejected": -1.2488131523132324, "logps/chosen": -214.5912628173828, "logps/rejected": -244.13392639160156, "loss": 0.5761, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9588913321495056, "rewards/margins": 0.6373323202133179, "rewards/rejected": -1.5962237119674683, "step": 816 }, { "epoch": 0.943551294066758, "grad_norm": 68.17305476915286, "learning_rate": 1.1875325842460422e-07, "logits/chosen": -1.5225324630737305, "logits/rejected": -1.4224079847335815, "logps/chosen": -172.602294921875, "logps/rejected": -187.74075317382812, "loss": 0.5471, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5260685682296753, "rewards/margins": 0.7308987975120544, "rewards/rejected": -1.256967544555664, "step": 818 }, { "epoch": 0.9458582654458944, "grad_norm": 68.2006052745151, "learning_rate": 1.1837495178165705e-07, "logits/chosen": -1.3616454601287842, "logits/rejected": -1.3527371883392334, "logps/chosen": -141.04449462890625, "logps/rejected": -203.32545471191406, "loss": 0.5124, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4041934609413147, "rewards/margins": 0.8920172452926636, "rewards/rejected": -1.296210765838623, "step": 820 }, { "epoch": 0.9481652368250306, "grad_norm": 77.87367487197442, "learning_rate": 1.1799637277759728e-07, "logits/chosen": -1.3988251686096191, "logits/rejected": -1.43572998046875, "logps/chosen": -181.34315490722656, "logps/rejected": -251.28089904785156, "loss": 0.5431, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8347877264022827, "rewards/margins": 0.829275369644165, "rewards/rejected": -1.6640632152557373, "step": 822 }, { "epoch": 0.950472208204167, "grad_norm": 76.077749149788, "learning_rate": 1.1761752702387911e-07, "logits/chosen": -1.4185755252838135, "logits/rejected": -1.3347017765045166, "logps/chosen": -160.37074279785156, "logps/rejected": -183.9326171875, "loss": 0.5697, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8654420375823975, "rewards/margins": 0.6139148473739624, "rewards/rejected": -1.4793568849563599, "step": 824 }, { "epoch": 0.9527791795833033, "grad_norm": 75.00986364125768, "learning_rate": 1.1723842013591043e-07, "logits/chosen": -1.3774935007095337, "logits/rejected": -1.3088113069534302, "logps/chosen": -166.6764678955078, "logps/rejected": -186.21182250976562, "loss": 0.5373, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6048790812492371, "rewards/margins": 0.6833386421203613, "rewards/rejected": -1.2882177829742432, "step": 826 }, { "epoch": 0.9550861509624397, "grad_norm": 72.996726199162, "learning_rate": 1.1685905773296991e-07, "logits/chosen": -1.362646222114563, "logits/rejected": -1.376693606376648, "logps/chosen": -195.9502410888672, "logps/rejected": -225.694091796875, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": -0.9920042157173157, "rewards/margins": 0.47201159596443176, "rewards/rejected": -1.4640157222747803, "step": 828 }, { "epoch": 0.9573931223415759, "grad_norm": 75.95414823856552, "learning_rate": 1.1647944543812354e-07, "logits/chosen": -1.315481424331665, "logits/rejected": -1.3268320560455322, "logps/chosen": -131.01446533203125, "logps/rejected": -158.54641723632812, "loss": 0.5773, "rewards/accuracies": 0.625, "rewards/chosen": -0.684681236743927, "rewards/margins": 0.20931464433670044, "rewards/rejected": -0.8939959406852722, "step": 830 }, { "epoch": 0.9597000937207123, "grad_norm": 60.32263903162582, "learning_rate": 1.1609958887814128e-07, "logits/chosen": -1.5966579914093018, "logits/rejected": -1.4850637912750244, "logps/chosen": -175.0374755859375, "logps/rejected": -158.18453979492188, "loss": 0.5429, "rewards/accuracies": 0.625, "rewards/chosen": -0.6850875616073608, "rewards/margins": 0.4383313059806824, "rewards/rejected": -1.1234188079833984, "step": 832 }, { "epoch": 0.9620070650998486, "grad_norm": 73.91015165855379, "learning_rate": 1.1571949368341369e-07, "logits/chosen": -1.4515485763549805, "logits/rejected": -1.4335086345672607, "logps/chosen": -194.0410614013672, "logps/rejected": -221.74459838867188, "loss": 0.5637, "rewards/accuracies": 0.625, "rewards/chosen": -0.9065202474594116, "rewards/margins": 0.5008846521377563, "rewards/rejected": -1.407404899597168, "step": 834 }, { "epoch": 0.964314036478985, "grad_norm": 70.40231383528358, "learning_rate": 1.1533916548786855e-07, "logits/chosen": -1.3997491598129272, "logits/rejected": -1.412211537361145, "logps/chosen": -148.58609008789062, "logps/rejected": -177.541748046875, "loss": 0.5569, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6641438603401184, "rewards/margins": 0.4562597870826721, "rewards/rejected": -1.12040376663208, "step": 836 }, { "epoch": 0.9666210078581212, "grad_norm": 102.06203365904534, "learning_rate": 1.149586099288871e-07, "logits/chosen": -1.5500867366790771, "logits/rejected": -1.5184556245803833, "logps/chosen": -221.6972198486328, "logps/rejected": -220.07156372070312, "loss": 0.5965, "rewards/accuracies": 0.65625, "rewards/chosen": -0.777428388595581, "rewards/margins": 0.4735814929008484, "rewards/rejected": -1.2510098218917847, "step": 838 }, { "epoch": 0.9689279792372576, "grad_norm": 66.29447298049823, "learning_rate": 1.1457783264722085e-07, "logits/chosen": -1.4360487461090088, "logits/rejected": -1.462703824043274, "logps/chosen": -152.64370727539062, "logps/rejected": -184.56980895996094, "loss": 0.5717, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6025056838989258, "rewards/margins": 0.5723504424095154, "rewards/rejected": -1.1748559474945068, "step": 840 }, { "epoch": 0.9712349506163939, "grad_norm": 78.55117522201758, "learning_rate": 1.1419683928690765e-07, "logits/chosen": -1.4210811853408813, "logits/rejected": -1.3443081378936768, "logps/chosen": -170.15673828125, "logps/rejected": -172.01361083984375, "loss": 0.6309, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8064141869544983, "rewards/margins": 0.3521476089954376, "rewards/rejected": -1.1585618257522583, "step": 842 }, { "epoch": 0.9735419219955302, "grad_norm": 69.22598182098773, "learning_rate": 1.1381563549518822e-07, "logits/chosen": -1.49723482131958, "logits/rejected": -1.410536527633667, "logps/chosen": -210.95184326171875, "logps/rejected": -203.29696655273438, "loss": 0.5463, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6800968647003174, "rewards/margins": 0.37646064162254333, "rewards/rejected": -1.056557536125183, "step": 844 }, { "epoch": 0.9758488933746665, "grad_norm": 75.78361862598538, "learning_rate": 1.1343422692242233e-07, "logits/chosen": -1.6359862089157104, "logits/rejected": -1.5996215343475342, "logps/chosen": -228.96900939941406, "logps/rejected": -303.59930419921875, "loss": 0.4869, "rewards/accuracies": 0.875, "rewards/chosen": -0.8022640943527222, "rewards/margins": 0.9272950291633606, "rewards/rejected": -1.729559063911438, "step": 846 }, { "epoch": 0.9781558647538029, "grad_norm": 70.34054678321432, "learning_rate": 1.1305261922200517e-07, "logits/chosen": -1.516348123550415, "logits/rejected": -1.415407657623291, "logps/chosen": -143.39247131347656, "logps/rejected": -152.77947998046875, "loss": 0.5319, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4436328709125519, "rewards/margins": 0.6538999676704407, "rewards/rejected": -1.097532868385315, "step": 848 }, { "epoch": 0.9804628361329393, "grad_norm": 68.10947471952929, "learning_rate": 1.1267081805028337e-07, "logits/chosen": -1.435686469078064, "logits/rejected": -1.481810212135315, "logps/chosen": -241.48797607421875, "logps/rejected": -261.9681701660156, "loss": 0.484, "rewards/accuracies": 0.84375, "rewards/chosen": -0.825613260269165, "rewards/margins": 0.9319115281105042, "rewards/rejected": -1.7575247287750244, "step": 850 }, { "epoch": 0.9827698075120755, "grad_norm": 59.37071682800829, "learning_rate": 1.1228882906647141e-07, "logits/chosen": -1.4485299587249756, "logits/rejected": -1.4876492023468018, "logps/chosen": -106.88033294677734, "logps/rejected": -123.23741149902344, "loss": 0.5289, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5273531079292297, "rewards/margins": 0.2863643765449524, "rewards/rejected": -0.8137176036834717, "step": 852 }, { "epoch": 0.9850767788912119, "grad_norm": 70.05549851743753, "learning_rate": 1.1190665793256748e-07, "logits/chosen": -1.4265426397323608, "logits/rejected": -1.3960927724838257, "logps/chosen": -188.1573944091797, "logps/rejected": -207.80072021484375, "loss": 0.5884, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8124721050262451, "rewards/margins": 0.4456351101398468, "rewards/rejected": -1.2581071853637695, "step": 854 }, { "epoch": 0.9873837502703482, "grad_norm": 73.14550405861347, "learning_rate": 1.1152431031326976e-07, "logits/chosen": -1.4437755346298218, "logits/rejected": -1.4316751956939697, "logps/chosen": -165.9943084716797, "logps/rejected": -192.70034790039062, "loss": 0.5473, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4255121052265167, "rewards/margins": 0.6612747311592102, "rewards/rejected": -1.0867868661880493, "step": 856 }, { "epoch": 0.9896907216494846, "grad_norm": 85.66970631714595, "learning_rate": 1.1114179187589233e-07, "logits/chosen": -1.3930505514144897, "logits/rejected": -1.3498919010162354, "logps/chosen": -256.72113037109375, "logps/rejected": -278.62896728515625, "loss": 0.5699, "rewards/accuracies": 0.5625, "rewards/chosen": -1.049241542816162, "rewards/margins": 0.28325188159942627, "rewards/rejected": -1.332493543624878, "step": 858 }, { "epoch": 0.9919976930286208, "grad_norm": 71.05679162032278, "learning_rate": 1.1075910829028114e-07, "logits/chosen": -1.437722086906433, "logits/rejected": -1.4826639890670776, "logps/chosen": -202.7578887939453, "logps/rejected": -261.2666015625, "loss": 0.5684, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6350253820419312, "rewards/margins": 0.6514700651168823, "rewards/rejected": -1.286495566368103, "step": 860 }, { "epoch": 0.9943046644077572, "grad_norm": 70.36206426876662, "learning_rate": 1.1037626522873019e-07, "logits/chosen": -1.41848623752594, "logits/rejected": -1.4980326890945435, "logps/chosen": -144.5270233154297, "logps/rejected": -200.35751342773438, "loss": 0.5309, "rewards/accuracies": 0.78125, "rewards/chosen": -0.522635817527771, "rewards/margins": 0.7352780699729919, "rewards/rejected": -1.2579139471054077, "step": 862 }, { "epoch": 0.9966116357868935, "grad_norm": 72.67382936260628, "learning_rate": 1.0999326836589715e-07, "logits/chosen": -1.3685966730117798, "logits/rejected": -1.394378423690796, "logps/chosen": -166.87631225585938, "logps/rejected": -200.53883361816406, "loss": 0.5929, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7416751384735107, "rewards/margins": 0.5859281420707703, "rewards/rejected": -1.3276032209396362, "step": 864 }, { "epoch": 0.9989186071660299, "grad_norm": 84.36611725847108, "learning_rate": 1.0961012337871949e-07, "logits/chosen": -1.3915830850601196, "logits/rejected": -1.3707163333892822, "logps/chosen": -165.9635009765625, "logps/rejected": -182.629150390625, "loss": 0.5651, "rewards/accuracies": 0.625, "rewards/chosen": -0.719946026802063, "rewards/margins": 0.30040302872657776, "rewards/rejected": -1.0203490257263184, "step": 866 }, { "epoch": 1.0012255785451663, "grad_norm": 65.19893923782902, "learning_rate": 1.092268359463302e-07, "logits/chosen": -1.380025029182434, "logits/rejected": -1.3949352502822876, "logps/chosen": -170.98512268066406, "logps/rejected": -166.76988220214844, "loss": 0.4741, "rewards/accuracies": 0.8125, "rewards/chosen": -0.654694139957428, "rewards/margins": 0.7143049240112305, "rewards/rejected": -1.3689990043640137, "step": 868 }, { "epoch": 1.0035325499243024, "grad_norm": 48.4483138752335, "learning_rate": 1.0884341174997366e-07, "logits/chosen": -1.497901439666748, "logits/rejected": -1.5079542398452759, "logps/chosen": -142.2536163330078, "logps/rejected": -170.02557373046875, "loss": 0.4175, "rewards/accuracies": 0.84375, "rewards/chosen": -0.33470043540000916, "rewards/margins": 0.9799707531929016, "rewards/rejected": -1.3146711587905884, "step": 870 }, { "epoch": 1.0058395213034388, "grad_norm": 47.35534970375215, "learning_rate": 1.0845985647292139e-07, "logits/chosen": -1.383826494216919, "logits/rejected": -1.449791669845581, "logps/chosen": -158.0416717529297, "logps/rejected": -176.6147918701172, "loss": 0.4029, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4849981963634491, "rewards/margins": 0.8160945177078247, "rewards/rejected": -1.3010927438735962, "step": 872 }, { "epoch": 1.0081464926825752, "grad_norm": 43.77560135583366, "learning_rate": 1.0807617580038795e-07, "logits/chosen": -1.5267244577407837, "logits/rejected": -1.5721828937530518, "logps/chosen": -199.91258239746094, "logps/rejected": -248.39645385742188, "loss": 0.3876, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6073986291885376, "rewards/margins": 1.2279101610183716, "rewards/rejected": -1.8353086709976196, "step": 874 }, { "epoch": 1.0104534640617115, "grad_norm": 42.52904656646403, "learning_rate": 1.0769237541944638e-07, "logits/chosen": -1.4312025308609009, "logits/rejected": -1.4796687364578247, "logps/chosen": -145.2313232421875, "logps/rejected": -158.61622619628906, "loss": 0.4168, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3942721486091614, "rewards/margins": 0.6692065000534058, "rewards/rejected": -1.063478708267212, "step": 876 }, { "epoch": 1.0127604354408477, "grad_norm": 48.90185425829871, "learning_rate": 1.0730846101894426e-07, "logits/chosen": -1.5106614828109741, "logits/rejected": -1.4605977535247803, "logps/chosen": -138.33749389648438, "logps/rejected": -161.0500946044922, "loss": 0.4457, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4576440453529358, "rewards/margins": 0.9095314741134644, "rewards/rejected": -1.367175579071045, "step": 878 }, { "epoch": 1.015067406819984, "grad_norm": 43.02658405721309, "learning_rate": 1.0692443828941917e-07, "logits/chosen": -1.5405513048171997, "logits/rejected": -1.5392036437988281, "logps/chosen": -182.1168670654297, "logps/rejected": -233.71990966796875, "loss": 0.4028, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6404014825820923, "rewards/margins": 1.0966987609863281, "rewards/rejected": -1.73710036277771, "step": 880 }, { "epoch": 1.0173743781991205, "grad_norm": 48.32435717433864, "learning_rate": 1.0654031292301431e-07, "logits/chosen": -1.3465638160705566, "logits/rejected": -1.3601791858673096, "logps/chosen": -173.30892944335938, "logps/rejected": -222.39129638671875, "loss": 0.4313, "rewards/accuracies": 0.875, "rewards/chosen": -0.6509138345718384, "rewards/margins": 0.8590379357337952, "rewards/rejected": -1.5099515914916992, "step": 882 }, { "epoch": 1.0196813495782568, "grad_norm": 42.7359721693661, "learning_rate": 1.061560906133943e-07, "logits/chosen": -1.4033329486846924, "logits/rejected": -1.3475041389465332, "logps/chosen": -179.88003540039062, "logps/rejected": -126.02889251708984, "loss": 0.4241, "rewards/accuracies": 0.84375, "rewards/chosen": -0.39270564913749695, "rewards/margins": 0.8330918550491333, "rewards/rejected": -1.2257975339889526, "step": 884 }, { "epoch": 1.0219883209573932, "grad_norm": 58.308886811175576, "learning_rate": 1.057717770556606e-07, "logits/chosen": -1.3382686376571655, "logits/rejected": -1.354683756828308, "logps/chosen": -163.423828125, "logps/rejected": -212.0486602783203, "loss": 0.4546, "rewards/accuracies": 0.75, "rewards/chosen": -0.7182069420814514, "rewards/margins": 0.8601030111312866, "rewards/rejected": -1.5783098936080933, "step": 886 }, { "epoch": 1.0242952923365294, "grad_norm": 52.41648679332183, "learning_rate": 1.0538737794626732e-07, "logits/chosen": -1.4164263010025024, "logits/rejected": -1.4541833400726318, "logps/chosen": -183.5728759765625, "logps/rejected": -235.9500274658203, "loss": 0.375, "rewards/accuracies": 0.84375, "rewards/chosen": -0.634645938873291, "rewards/margins": 1.2829779386520386, "rewards/rejected": -1.9176236391067505, "step": 888 }, { "epoch": 1.0266022637156658, "grad_norm": 59.99002267055787, "learning_rate": 1.0500289898293653e-07, "logits/chosen": -1.2998998165130615, "logits/rejected": -1.2930588722229004, "logps/chosen": -127.99386596679688, "logps/rejected": -172.67710876464844, "loss": 0.4373, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6694655418395996, "rewards/margins": 1.0086742639541626, "rewards/rejected": -1.6781398057937622, "step": 890 }, { "epoch": 1.0289092350948021, "grad_norm": 59.24537451441084, "learning_rate": 1.0461834586457397e-07, "logits/chosen": -1.4053970575332642, "logits/rejected": -1.4072213172912598, "logps/chosen": -207.63168334960938, "logps/rejected": -327.9807434082031, "loss": 0.4149, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9034566879272461, "rewards/margins": 1.59257972240448, "rewards/rejected": -2.4960365295410156, "step": 892 }, { "epoch": 1.0312162064739385, "grad_norm": 46.427849200004886, "learning_rate": 1.0423372429118453e-07, "logits/chosen": -1.4245054721832275, "logits/rejected": -1.3622853755950928, "logps/chosen": -121.53076171875, "logps/rejected": -153.8699951171875, "loss": 0.4026, "rewards/accuracies": 0.875, "rewards/chosen": -0.4701806306838989, "rewards/margins": 1.028322458267212, "rewards/rejected": -1.4985029697418213, "step": 894 }, { "epoch": 1.0335231778530747, "grad_norm": 50.69335447767206, "learning_rate": 1.0384903996378782e-07, "logits/chosen": -1.3767621517181396, "logits/rejected": -1.2876626253128052, "logps/chosen": -129.5188751220703, "logps/rejected": -140.3291778564453, "loss": 0.4704, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6933719515800476, "rewards/margins": 0.5130415558815002, "rewards/rejected": -1.2064135074615479, "step": 896 }, { "epoch": 1.035830149232211, "grad_norm": 51.48813587010882, "learning_rate": 1.0346429858433352e-07, "logits/chosen": -1.316386342048645, "logits/rejected": -1.3732808828353882, "logps/chosen": -123.05658721923828, "logps/rejected": -217.4422607421875, "loss": 0.3842, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6196972727775574, "rewards/margins": 0.9186638593673706, "rewards/rejected": -1.5383610725402832, "step": 898 }, { "epoch": 1.0381371206113474, "grad_norm": 58.72057690388196, "learning_rate": 1.0307950585561704e-07, "logits/chosen": -1.3871917724609375, "logits/rejected": -1.3122071027755737, "logps/chosen": -149.32241821289062, "logps/rejected": -177.53614807128906, "loss": 0.435, "rewards/accuracies": 0.9375, "rewards/chosen": -0.593085765838623, "rewards/margins": 0.9729093909263611, "rewards/rejected": -1.565995216369629, "step": 900 }, { "epoch": 1.0381371206113474, "eval_logits/chosen": -1.368806004524231, "eval_logits/rejected": -1.2883577346801758, "eval_logps/chosen": -195.90835571289062, "eval_logps/rejected": -166.81427001953125, "eval_loss": 0.5445123314857483, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -1.072619080543518, "eval_rewards/margins": 0.5572704076766968, "eval_rewards/rejected": -1.6298894882202148, "eval_runtime": 25.9079, "eval_samples_per_second": 3.86, "eval_steps_per_second": 0.965, "step": 900 }, { "epoch": 1.0404440919904838, "grad_norm": 46.08264467438493, "learning_rate": 1.0269466748119488e-07, "logits/chosen": -1.3400801420211792, "logits/rejected": -1.3535000085830688, "logps/chosen": -231.73153686523438, "logps/rejected": -337.78509521484375, "loss": 0.4217, "rewards/accuracies": 0.9375, "rewards/chosen": -0.846785306930542, "rewards/margins": 1.6273854970932007, "rewards/rejected": -2.474170684814453, "step": 902 }, { "epoch": 1.04275106336962, "grad_norm": 47.478718630060946, "learning_rate": 1.023097891653001e-07, "logits/chosen": -1.4396333694458008, "logits/rejected": -1.3391939401626587, "logps/chosen": -192.85023498535156, "logps/rejected": -182.81593322753906, "loss": 0.4663, "rewards/accuracies": 0.625, "rewards/chosen": -0.5741289854049683, "rewards/margins": 0.8019107580184937, "rewards/rejected": -1.376039743423462, "step": 904 }, { "epoch": 1.0450580347487564, "grad_norm": 42.07373362101555, "learning_rate": 1.0192487661275784e-07, "logits/chosen": -1.3626071214675903, "logits/rejected": -1.3448035717010498, "logps/chosen": -198.46783447265625, "logps/rejected": -268.89617919921875, "loss": 0.3584, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7600739002227783, "rewards/margins": 1.7053577899932861, "rewards/rejected": -2.4654316902160645, "step": 906 }, { "epoch": 1.0473650061278927, "grad_norm": 64.80462314401966, "learning_rate": 1.0153993552890068e-07, "logits/chosen": -1.287585973739624, "logits/rejected": -1.2358753681182861, "logps/chosen": -201.56106567382812, "logps/rejected": -247.15313720703125, "loss": 0.4304, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9964087009429932, "rewards/margins": 1.5290757417678833, "rewards/rejected": -2.525484323501587, "step": 908 }, { "epoch": 1.0496719775070291, "grad_norm": 49.86586606129721, "learning_rate": 1.0115497161948408e-07, "logits/chosen": -1.4171031713485718, "logits/rejected": -1.4290010929107666, "logps/chosen": -183.05397033691406, "logps/rejected": -225.85943603515625, "loss": 0.3901, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8562214970588684, "rewards/margins": 1.18411123752594, "rewards/rejected": -2.040332794189453, "step": 910 }, { "epoch": 1.0519789488861653, "grad_norm": 53.027457701933855, "learning_rate": 1.0076999059060187e-07, "logits/chosen": -1.299911379814148, "logits/rejected": -1.2465267181396484, "logps/chosen": -120.93717193603516, "logps/rejected": -130.535400390625, "loss": 0.4346, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7218230962753296, "rewards/margins": 0.7116464376449585, "rewards/rejected": -1.4334694147109985, "step": 912 }, { "epoch": 1.0542859202653017, "grad_norm": 66.83942678385118, "learning_rate": 1.0038499814860157e-07, "logits/chosen": -1.4031901359558105, "logits/rejected": -1.3700772523880005, "logps/chosen": -198.3015899658203, "logps/rejected": -224.3856658935547, "loss": 0.4385, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9429548978805542, "rewards/margins": 0.9581265449523926, "rewards/rejected": -1.9010815620422363, "step": 914 }, { "epoch": 1.056592891644438, "grad_norm": 39.525727639140996, "learning_rate": 1e-07, "logits/chosen": -1.2172764539718628, "logits/rejected": -1.089323878288269, "logps/chosen": -138.57373046875, "logps/rejected": -164.6386260986328, "loss": 0.4169, "rewards/accuracies": 0.875, "rewards/chosen": -0.6064477562904358, "rewards/margins": 1.2073055505752563, "rewards/rejected": -1.813753366470337, "step": 916 }, { "epoch": 1.0588998630235744, "grad_norm": 38.99856681963842, "learning_rate": 9.961500185139842e-08, "logits/chosen": -1.3283095359802246, "logits/rejected": -1.3477778434753418, "logps/chosen": -203.667236328125, "logps/rejected": -270.05352783203125, "loss": 0.3598, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8687477707862854, "rewards/margins": 1.5828710794448853, "rewards/rejected": -2.4516186714172363, "step": 918 }, { "epoch": 1.0612068344027108, "grad_norm": 43.560042003470414, "learning_rate": 9.923000940939814e-08, "logits/chosen": -1.3665335178375244, "logits/rejected": -1.3108726739883423, "logps/chosen": -132.06370544433594, "logps/rejected": -142.1192626953125, "loss": 0.4212, "rewards/accuracies": 0.875, "rewards/chosen": -0.608191967010498, "rewards/margins": 0.7716971635818481, "rewards/rejected": -1.3798891305923462, "step": 920 }, { "epoch": 1.063513805781847, "grad_norm": 42.934097495699255, "learning_rate": 9.884502838051594e-08, "logits/chosen": -1.3540059328079224, "logits/rejected": -1.2924156188964844, "logps/chosen": -124.7350845336914, "logps/rejected": -100.70684814453125, "loss": 0.4639, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5759610533714294, "rewards/margins": 0.6847679615020752, "rewards/rejected": -1.2607290744781494, "step": 922 }, { "epoch": 1.0658207771609833, "grad_norm": 53.04637102765547, "learning_rate": 9.846006447109932e-08, "logits/chosen": -1.4333351850509644, "logits/rejected": -1.3867840766906738, "logps/chosen": -116.47769165039062, "logps/rejected": -108.24977111816406, "loss": 0.4403, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5576272010803223, "rewards/margins": 0.5203125476837158, "rewards/rejected": -1.077939748764038, "step": 924 }, { "epoch": 1.0681277485401197, "grad_norm": 47.85818098965212, "learning_rate": 9.807512338724216e-08, "logits/chosen": -1.435934066772461, "logits/rejected": -1.355026125907898, "logps/chosen": -135.8258056640625, "logps/rejected": -158.5557861328125, "loss": 0.4239, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7292733192443848, "rewards/margins": 1.0533298254013062, "rewards/rejected": -1.782603144645691, "step": 926 }, { "epoch": 1.070434719919256, "grad_norm": 72.70327780863528, "learning_rate": 9.769021083469989e-08, "logits/chosen": -1.4023041725158691, "logits/rejected": -1.3046897649765015, "logps/chosen": -141.689453125, "logps/rejected": -162.58108520507812, "loss": 0.408, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6851909756660461, "rewards/margins": 1.2052912712097168, "rewards/rejected": -1.8904823064804077, "step": 928 }, { "epoch": 1.0727416912983923, "grad_norm": 53.100154938901355, "learning_rate": 9.730533251880515e-08, "logits/chosen": -1.4096457958221436, "logits/rejected": -1.4489352703094482, "logps/chosen": -193.12330627441406, "logps/rejected": -209.87611389160156, "loss": 0.372, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7944599390029907, "rewards/margins": 1.150054931640625, "rewards/rejected": -1.9445148706436157, "step": 930 }, { "epoch": 1.0750486626775286, "grad_norm": 45.4444435369967, "learning_rate": 9.692049414438298e-08, "logits/chosen": -1.5730786323547363, "logits/rejected": -1.509131669998169, "logps/chosen": -170.0063934326172, "logps/rejected": -200.8542938232422, "loss": 0.4116, "rewards/accuracies": 0.875, "rewards/chosen": -0.8289997577667236, "rewards/margins": 1.3618550300598145, "rewards/rejected": -2.190855026245117, "step": 932 }, { "epoch": 1.077355634056665, "grad_norm": 54.74440505132641, "learning_rate": 9.653570141566652e-08, "logits/chosen": -1.3093923330307007, "logits/rejected": -1.4106438159942627, "logps/chosen": -144.6334686279297, "logps/rejected": -195.14810180664062, "loss": 0.4191, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6524811387062073, "rewards/margins": 1.1764483451843262, "rewards/rejected": -1.8289295434951782, "step": 934 }, { "epoch": 1.0796626054358014, "grad_norm": 60.39174139928238, "learning_rate": 9.61509600362122e-08, "logits/chosen": -1.405623435974121, "logits/rejected": -1.45893132686615, "logps/chosen": -167.4293670654297, "logps/rejected": -190.83563232421875, "loss": 0.4699, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5997176766395569, "rewards/margins": 0.7248048186302185, "rewards/rejected": -1.3245224952697754, "step": 936 }, { "epoch": 1.0819695768149376, "grad_norm": 57.80810696219112, "learning_rate": 9.576627570881549e-08, "logits/chosen": -1.3913531303405762, "logits/rejected": -1.3687852621078491, "logps/chosen": -196.201416015625, "logps/rejected": -240.41012573242188, "loss": 0.4425, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9165946245193481, "rewards/margins": 0.9089750051498413, "rewards/rejected": -1.8255695104599, "step": 938 }, { "epoch": 1.084276548194074, "grad_norm": 47.94477989169309, "learning_rate": 9.538165413542607e-08, "logits/chosen": -1.3398971557617188, "logits/rejected": -1.3986927270889282, "logps/chosen": -222.11538696289062, "logps/rejected": -260.91241455078125, "loss": 0.3776, "rewards/accuracies": 0.875, "rewards/chosen": -0.653100848197937, "rewards/margins": 1.5059380531311035, "rewards/rejected": -2.159038543701172, "step": 940 }, { "epoch": 1.0865835195732103, "grad_norm": 55.94671589153818, "learning_rate": 9.499710101706346e-08, "logits/chosen": -1.4237765073776245, "logits/rejected": -1.4448227882385254, "logps/chosen": -210.11026000976562, "logps/rejected": -244.41043090820312, "loss": 0.4016, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4846809208393097, "rewards/margins": 1.2182962894439697, "rewards/rejected": -1.702976942062378, "step": 942 }, { "epoch": 1.0888904909523467, "grad_norm": 51.245921498733374, "learning_rate": 9.461262205373268e-08, "logits/chosen": -1.3350170850753784, "logits/rejected": -1.3754874467849731, "logps/chosen": -167.61691284179688, "logps/rejected": -182.33018493652344, "loss": 0.3982, "rewards/accuracies": 0.875, "rewards/chosen": -0.8486964702606201, "rewards/margins": 1.005102276802063, "rewards/rejected": -1.8537986278533936, "step": 944 }, { "epoch": 1.0911974623314828, "grad_norm": 53.58036009588695, "learning_rate": 9.422822294433938e-08, "logits/chosen": -1.2717465162277222, "logits/rejected": -1.2129467725753784, "logps/chosen": -150.95550537109375, "logps/rejected": -160.45558166503906, "loss": 0.4122, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9215366840362549, "rewards/margins": 0.8769155740737915, "rewards/rejected": -1.7984521389007568, "step": 946 }, { "epoch": 1.0935044337106192, "grad_norm": 42.3521804722769, "learning_rate": 9.38439093866057e-08, "logits/chosen": -1.3253931999206543, "logits/rejected": -1.3314661979675293, "logps/chosen": -137.23956298828125, "logps/rejected": -147.44735717773438, "loss": 0.4522, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5181431770324707, "rewards/margins": 0.8911628723144531, "rewards/rejected": -1.4093059301376343, "step": 948 }, { "epoch": 1.0958114050897556, "grad_norm": 52.84833057426872, "learning_rate": 9.345968707698568e-08, "logits/chosen": -1.51572585105896, "logits/rejected": -1.5353983640670776, "logps/chosen": -162.04800415039062, "logps/rejected": -193.7029266357422, "loss": 0.3981, "rewards/accuracies": 0.75, "rewards/chosen": -0.5837193131446838, "rewards/margins": 0.8988637328147888, "rewards/rejected": -1.482582926750183, "step": 950 }, { "epoch": 1.098118376468892, "grad_norm": 51.08843143746294, "learning_rate": 9.307556171058084e-08, "logits/chosen": -1.3710289001464844, "logits/rejected": -1.501839280128479, "logps/chosen": -177.76930236816406, "logps/rejected": -265.036865234375, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": -0.6180956363677979, "rewards/margins": 1.6954776048660278, "rewards/rejected": -2.3135733604431152, "step": 952 }, { "epoch": 1.1004253478480281, "grad_norm": 39.48580147628757, "learning_rate": 9.269153898105571e-08, "logits/chosen": -1.3733117580413818, "logits/rejected": -1.370259404182434, "logps/chosen": -196.75180053710938, "logps/rejected": -220.63250732421875, "loss": 0.3896, "rewards/accuracies": 0.84375, "rewards/chosen": -0.897487223148346, "rewards/margins": 1.2682368755340576, "rewards/rejected": -2.165724039077759, "step": 954 }, { "epoch": 1.1027323192271645, "grad_norm": 40.58851230363069, "learning_rate": 9.230762458055362e-08, "logits/chosen": -1.3409641981124878, "logits/rejected": -1.5087528228759766, "logps/chosen": -162.93719482421875, "logps/rejected": -228.28309631347656, "loss": 0.4086, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7766309380531311, "rewards/margins": 1.0290602445602417, "rewards/rejected": -1.8056910037994385, "step": 956 }, { "epoch": 1.105039290606301, "grad_norm": 48.07092159296336, "learning_rate": 9.192382419961207e-08, "logits/chosen": -1.2765452861785889, "logits/rejected": -1.3672202825546265, "logps/chosen": -156.22647094726562, "logps/rejected": -200.10728454589844, "loss": 0.4416, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6270304918289185, "rewards/margins": 0.8705189824104309, "rewards/rejected": -1.4975494146347046, "step": 958 }, { "epoch": 1.1073462619854373, "grad_norm": 58.85426890701801, "learning_rate": 9.15401435270786e-08, "logits/chosen": -1.4625585079193115, "logits/rejected": -1.4515876770019531, "logps/chosen": -175.11221313476562, "logps/rejected": -179.67308044433594, "loss": 0.433, "rewards/accuracies": 0.75, "rewards/chosen": -0.8706232905387878, "rewards/margins": 0.8775316476821899, "rewards/rejected": -1.748154878616333, "step": 960 }, { "epoch": 1.1096532333645737, "grad_norm": 48.13444718101857, "learning_rate": 9.115658825002634e-08, "logits/chosen": -1.4449965953826904, "logits/rejected": -1.4944627285003662, "logps/chosen": -148.5018768310547, "logps/rejected": -192.43560791015625, "loss": 0.4478, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6375682353973389, "rewards/margins": 1.2029744386672974, "rewards/rejected": -1.8405426740646362, "step": 962 }, { "epoch": 1.1119602047437098, "grad_norm": 45.630310836419994, "learning_rate": 9.077316405366981e-08, "logits/chosen": -1.5309690237045288, "logits/rejected": -1.4158992767333984, "logps/chosen": -237.97152709960938, "logps/rejected": -307.7381286621094, "loss": 0.438, "rewards/accuracies": 0.875, "rewards/chosen": -0.753577470779419, "rewards/margins": 1.5821527242660522, "rewards/rejected": -2.3357303142547607, "step": 964 }, { "epoch": 1.1142671761228462, "grad_norm": 54.60688529032658, "learning_rate": 9.03898766212805e-08, "logits/chosen": -1.425299048423767, "logits/rejected": -1.4033348560333252, "logps/chosen": -216.15237426757812, "logps/rejected": -262.237548828125, "loss": 0.4143, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9375750422477722, "rewards/margins": 1.173370122909546, "rewards/rejected": -2.110945224761963, "step": 966 }, { "epoch": 1.1165741475019826, "grad_norm": 47.343958384978336, "learning_rate": 9.000673163410286e-08, "logits/chosen": -1.194286823272705, "logits/rejected": -1.2353427410125732, "logps/chosen": -137.03073120117188, "logps/rejected": -158.67300415039062, "loss": 0.4048, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8332533240318298, "rewards/margins": 0.733294665813446, "rewards/rejected": -1.5665481090545654, "step": 968 }, { "epoch": 1.118881118881119, "grad_norm": 48.41236984702601, "learning_rate": 8.962373477126982e-08, "logits/chosen": -1.3657209873199463, "logits/rejected": -1.4510833024978638, "logps/chosen": -112.81056213378906, "logps/rejected": -161.59786987304688, "loss": 0.4003, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6659589409828186, "rewards/margins": 0.8242032527923584, "rewards/rejected": -1.4901621341705322, "step": 970 }, { "epoch": 1.1211880902602551, "grad_norm": 47.3495191045043, "learning_rate": 8.924089170971887e-08, "logits/chosen": -1.4114599227905273, "logits/rejected": -1.4022752046585083, "logps/chosen": -142.42562866210938, "logps/rejected": -152.8037872314453, "loss": 0.3903, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6654877662658691, "rewards/margins": 0.8193020820617676, "rewards/rejected": -1.4847897291183472, "step": 972 }, { "epoch": 1.1234950616393915, "grad_norm": 56.44959316652502, "learning_rate": 8.885820812410769e-08, "logits/chosen": -1.2863538265228271, "logits/rejected": -1.3213986158370972, "logps/chosen": -161.03211975097656, "logps/rejected": -256.6575012207031, "loss": 0.4055, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6689996123313904, "rewards/margins": 1.2615712881088257, "rewards/rejected": -1.9305710792541504, "step": 974 }, { "epoch": 1.1258020330185279, "grad_norm": 48.64694104803259, "learning_rate": 8.847568968673024e-08, "logits/chosen": -1.2597205638885498, "logits/rejected": -1.2746870517730713, "logps/chosen": -167.2078399658203, "logps/rejected": -216.3764190673828, "loss": 0.4015, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7549591064453125, "rewards/margins": 1.0452518463134766, "rewards/rejected": -1.800210952758789, "step": 976 }, { "epoch": 1.1281090043976643, "grad_norm": 54.54545983114275, "learning_rate": 8.809334206743251e-08, "logits/chosen": -1.3839998245239258, "logits/rejected": -1.2565964460372925, "logps/chosen": -220.92575073242188, "logps/rejected": -232.58221435546875, "loss": 0.3953, "rewards/accuracies": 0.875, "rewards/chosen": -0.8072776794433594, "rewards/margins": 1.098924994468689, "rewards/rejected": -1.9062025547027588, "step": 978 }, { "epoch": 1.1304159757768004, "grad_norm": 50.669478447045044, "learning_rate": 8.77111709335286e-08, "logits/chosen": -1.3114320039749146, "logits/rejected": -1.3967440128326416, "logps/chosen": -183.023193359375, "logps/rejected": -242.5177764892578, "loss": 0.4064, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8590079545974731, "rewards/margins": 1.4726592302322388, "rewards/rejected": -2.331667184829712, "step": 980 }, { "epoch": 1.1327229471559368, "grad_norm": 45.01192442719795, "learning_rate": 8.732918194971663e-08, "logits/chosen": -1.4155831336975098, "logits/rejected": -1.362624168395996, "logps/chosen": -142.08741760253906, "logps/rejected": -175.91061401367188, "loss": 0.3832, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6290794014930725, "rewards/margins": 1.268273115158081, "rewards/rejected": -1.8973525762557983, "step": 982 }, { "epoch": 1.1350299185350732, "grad_norm": 52.363728579221444, "learning_rate": 8.694738077799486e-08, "logits/chosen": -1.3025627136230469, "logits/rejected": -1.4123029708862305, "logps/chosen": -118.02716827392578, "logps/rejected": -147.27581787109375, "loss": 0.4048, "rewards/accuracies": 0.84375, "rewards/chosen": -0.43484801054000854, "rewards/margins": 0.925700306892395, "rewards/rejected": -1.3605482578277588, "step": 984 }, { "epoch": 1.1373368899142096, "grad_norm": 48.92241704444725, "learning_rate": 8.656577307757766e-08, "logits/chosen": -1.397874355316162, "logits/rejected": -1.3083161115646362, "logps/chosen": -183.96243286132812, "logps/rejected": -178.71238708496094, "loss": 0.4146, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0279752016067505, "rewards/margins": 0.7176418304443359, "rewards/rejected": -1.7456170320510864, "step": 986 }, { "epoch": 1.139643861293346, "grad_norm": 47.66250134338094, "learning_rate": 8.618436450481181e-08, "logits/chosen": -1.4895576238632202, "logits/rejected": -1.5260589122772217, "logps/chosen": -224.04379272460938, "logps/rejected": -272.7261962890625, "loss": 0.3914, "rewards/accuracies": 0.78125, "rewards/chosen": -0.801969587802887, "rewards/margins": 1.2178035974502563, "rewards/rejected": -2.019773006439209, "step": 988 }, { "epoch": 1.141950832672482, "grad_norm": 65.26400202215471, "learning_rate": 8.580316071309234e-08, "logits/chosen": -1.4646153450012207, "logits/rejected": -1.486820936203003, "logps/chosen": -154.2437744140625, "logps/rejected": -184.03640747070312, "loss": 0.4269, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8698714971542358, "rewards/margins": 1.1085283756256104, "rewards/rejected": -1.9783998727798462, "step": 990 }, { "epoch": 1.1442578040516185, "grad_norm": 55.38310724678633, "learning_rate": 8.542216735277917e-08, "logits/chosen": -1.3877445459365845, "logits/rejected": -1.426324725151062, "logps/chosen": -226.8168182373047, "logps/rejected": -320.26861572265625, "loss": 0.406, "rewards/accuracies": 0.875, "rewards/chosen": -0.9063752293586731, "rewards/margins": 1.4490259885787964, "rewards/rejected": -2.3554012775421143, "step": 992 }, { "epoch": 1.1465647754307549, "grad_norm": 55.30320117524687, "learning_rate": 8.504139007111289e-08, "logits/chosen": -1.524666666984558, "logits/rejected": -1.4656498432159424, "logps/chosen": -180.05706787109375, "logps/rejected": -224.0005340576172, "loss": 0.4497, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8488324880599976, "rewards/margins": 1.1417200565338135, "rewards/rejected": -1.9905524253845215, "step": 994 }, { "epoch": 1.1488717468098912, "grad_norm": 59.50571311635345, "learning_rate": 8.466083451213145e-08, "logits/chosen": -1.3895516395568848, "logits/rejected": -1.3458820581436157, "logps/chosen": -222.20672607421875, "logps/rejected": -246.43609619140625, "loss": 0.3834, "rewards/accuracies": 0.875, "rewards/chosen": -0.9480006694793701, "rewards/margins": 1.4348657131195068, "rewards/rejected": -2.382866621017456, "step": 996 }, { "epoch": 1.1511787181890274, "grad_norm": 52.90540723372322, "learning_rate": 8.428050631658627e-08, "logits/chosen": -1.463295340538025, "logits/rejected": -1.527518630027771, "logps/chosen": -169.41342163085938, "logps/rejected": -211.99317932128906, "loss": 0.409, "rewards/accuracies": 0.75, "rewards/chosen": -0.7570784687995911, "rewards/margins": 1.2448766231536865, "rewards/rejected": -2.001955032348633, "step": 998 }, { "epoch": 1.1534856895681638, "grad_norm": 62.46156202928193, "learning_rate": 8.39004111218587e-08, "logits/chosen": -1.4075658321380615, "logits/rejected": -1.402864933013916, "logps/chosen": -174.88134765625, "logps/rejected": -177.6551055908203, "loss": 0.3574, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7704441547393799, "rewards/margins": 0.945233166217804, "rewards/rejected": -1.715677261352539, "step": 1000 }, { "epoch": 1.1534856895681638, "eval_logits/chosen": -1.3622280359268188, "eval_logits/rejected": -1.2871321439743042, "eval_logps/chosen": -197.57440185546875, "eval_logps/rejected": -168.7324676513672, "eval_loss": 0.5431498885154724, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -1.239221215248108, "eval_rewards/margins": 0.582485556602478, "eval_rewards/rejected": -1.8217066526412964, "eval_runtime": 23.3967, "eval_samples_per_second": 4.274, "eval_steps_per_second": 1.069, "step": 1000 }, { "epoch": 1.1557926609473002, "grad_norm": 44.181005655906645, "learning_rate": 8.352055456187644e-08, "logits/chosen": -1.3333595991134644, "logits/rejected": -1.2514859437942505, "logps/chosen": -184.87631225585938, "logps/rejected": -213.40603637695312, "loss": 0.3575, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8501152992248535, "rewards/margins": 1.3011317253112793, "rewards/rejected": -2.151247262954712, "step": 1002 }, { "epoch": 1.1580996323264365, "grad_norm": 42.72442121434693, "learning_rate": 8.314094226703007e-08, "logits/chosen": -1.4761807918548584, "logits/rejected": -1.5279865264892578, "logps/chosen": -138.0496826171875, "logps/rejected": -223.12356567382812, "loss": 0.4402, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5688883662223816, "rewards/margins": 1.421848177909851, "rewards/rejected": -1.990736722946167, "step": 1004 }, { "epoch": 1.1604066037055727, "grad_norm": 42.42576201307595, "learning_rate": 8.276157986408958e-08, "logits/chosen": -1.4122170209884644, "logits/rejected": -1.5143085718154907, "logps/chosen": -150.01748657226562, "logps/rejected": -205.741943359375, "loss": 0.408, "rewards/accuracies": 0.75, "rewards/chosen": -0.709044873714447, "rewards/margins": 1.0970107316970825, "rewards/rejected": -1.8060553073883057, "step": 1006 }, { "epoch": 1.162713575084709, "grad_norm": 46.98053894558455, "learning_rate": 8.238247297612091e-08, "logits/chosen": -1.4477453231811523, "logits/rejected": -1.456710696220398, "logps/chosen": -189.01307678222656, "logps/rejected": -224.03646850585938, "loss": 0.3941, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5644448399543762, "rewards/margins": 1.1727917194366455, "rewards/rejected": -1.737236738204956, "step": 1008 }, { "epoch": 1.1650205464638455, "grad_norm": 45.113641573334704, "learning_rate": 8.200362722240272e-08, "logits/chosen": -1.368615746498108, "logits/rejected": -1.452118992805481, "logps/chosen": -115.6017837524414, "logps/rejected": -158.69747924804688, "loss": 0.4656, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7957598567008972, "rewards/margins": 0.9295057654380798, "rewards/rejected": -1.725265622138977, "step": 1010 }, { "epoch": 1.1673275178429818, "grad_norm": 52.770081650362485, "learning_rate": 8.162504821834295e-08, "logits/chosen": -1.4406243562698364, "logits/rejected": -1.3740017414093018, "logps/chosen": -177.7287139892578, "logps/rejected": -185.05763244628906, "loss": 0.4228, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7840394973754883, "rewards/margins": 0.9432986378669739, "rewards/rejected": -1.7273380756378174, "step": 1012 }, { "epoch": 1.1696344892221182, "grad_norm": 40.23487524634427, "learning_rate": 8.124674157539577e-08, "logits/chosen": -1.3667372465133667, "logits/rejected": -1.3167593479156494, "logps/chosen": -205.84791564941406, "logps/rejected": -246.4522705078125, "loss": 0.3889, "rewards/accuracies": 0.875, "rewards/chosen": -0.8650726079940796, "rewards/margins": 1.2019354104995728, "rewards/rejected": -2.0670082569122314, "step": 1014 }, { "epoch": 1.1719414606012544, "grad_norm": 64.51013858621744, "learning_rate": 8.086871290097821e-08, "logits/chosen": -1.3617969751358032, "logits/rejected": -1.4326754808425903, "logps/chosen": -137.392578125, "logps/rejected": -219.85238647460938, "loss": 0.4504, "rewards/accuracies": 0.875, "rewards/chosen": -1.033658742904663, "rewards/margins": 0.985666811466217, "rewards/rejected": -2.0193252563476562, "step": 1016 }, { "epoch": 1.1742484319803907, "grad_norm": 40.94400015951729, "learning_rate": 8.049096779838717e-08, "logits/chosen": -1.4027369022369385, "logits/rejected": -1.3317725658416748, "logps/chosen": -178.0692138671875, "logps/rejected": -199.9445343017578, "loss": 0.3641, "rewards/accuracies": 0.78125, "rewards/chosen": -0.67464679479599, "rewards/margins": 1.4650667905807495, "rewards/rejected": -2.1397135257720947, "step": 1018 }, { "epoch": 1.1765554033595271, "grad_norm": 58.285176845680446, "learning_rate": 8.011351186671635e-08, "logits/chosen": -1.3443338871002197, "logits/rejected": -1.4246101379394531, "logps/chosen": -186.61431884765625, "logps/rejected": -230.87966918945312, "loss": 0.4095, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9058922529220581, "rewards/margins": 1.0936301946640015, "rewards/rejected": -1.9995224475860596, "step": 1020 }, { "epoch": 1.1788623747386633, "grad_norm": 40.74344848622249, "learning_rate": 7.973635070077318e-08, "logits/chosen": -1.3627556562423706, "logits/rejected": -1.2945423126220703, "logps/chosen": -169.3376922607422, "logps/rejected": -196.9662628173828, "loss": 0.3688, "rewards/accuracies": 0.875, "rewards/chosen": -0.8068114519119263, "rewards/margins": 1.142544150352478, "rewards/rejected": -1.9493556022644043, "step": 1022 }, { "epoch": 1.1811693461177997, "grad_norm": 52.371921447490024, "learning_rate": 7.935948989099605e-08, "logits/chosen": -1.3821394443511963, "logits/rejected": -1.385388970375061, "logps/chosen": -188.00381469726562, "logps/rejected": -241.38780212402344, "loss": 0.4173, "rewards/accuracies": 0.84375, "rewards/chosen": -1.226588487625122, "rewards/margins": 1.1497639417648315, "rewards/rejected": -2.376352310180664, "step": 1024 }, { "epoch": 1.183476317496936, "grad_norm": 47.37840376254983, "learning_rate": 7.898293502337122e-08, "logits/chosen": -1.3714951276779175, "logits/rejected": -1.283757209777832, "logps/chosen": -136.48751831054688, "logps/rejected": -152.93016052246094, "loss": 0.4274, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7810596227645874, "rewards/margins": 0.9014545679092407, "rewards/rejected": -1.6825141906738281, "step": 1026 }, { "epoch": 1.1857832888760724, "grad_norm": 50.908179365332344, "learning_rate": 7.860669167935028e-08, "logits/chosen": -1.3071757555007935, "logits/rejected": -1.287663221359253, "logps/chosen": -230.11318969726562, "logps/rejected": -298.1128234863281, "loss": 0.4266, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0475901365280151, "rewards/margins": 1.5074639320373535, "rewards/rejected": -2.555053949356079, "step": 1028 }, { "epoch": 1.1880902602552088, "grad_norm": 44.365837449973405, "learning_rate": 7.823076543576717e-08, "logits/chosen": -1.4471302032470703, "logits/rejected": -1.442579984664917, "logps/chosen": -164.46011352539062, "logps/rejected": -170.95596313476562, "loss": 0.3882, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5805782079696655, "rewards/margins": 0.9794151782989502, "rewards/rejected": -1.5599933862686157, "step": 1030 }, { "epoch": 1.190397231634345, "grad_norm": 49.23836352683586, "learning_rate": 7.785516186475574e-08, "logits/chosen": -1.2814298868179321, "logits/rejected": -1.3165457248687744, "logps/chosen": -154.7665557861328, "logps/rejected": -175.01173400878906, "loss": 0.399, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9837753176689148, "rewards/margins": 1.1613128185272217, "rewards/rejected": -2.145087957382202, "step": 1032 }, { "epoch": 1.1927042030134813, "grad_norm": 51.14719932201333, "learning_rate": 7.747988653366697e-08, "logits/chosen": -1.5139728784561157, "logits/rejected": -1.4376604557037354, "logps/chosen": -156.8878173828125, "logps/rejected": -158.37210083007812, "loss": 0.3949, "rewards/accuracies": 0.75, "rewards/chosen": -0.8450059294700623, "rewards/margins": 0.7959898710250854, "rewards/rejected": -1.6409958600997925, "step": 1034 }, { "epoch": 1.1950111743926177, "grad_norm": 49.56300160493831, "learning_rate": 7.710494500498662e-08, "logits/chosen": -1.4593157768249512, "logits/rejected": -1.3894437551498413, "logps/chosen": -106.61005401611328, "logps/rejected": -107.51506042480469, "loss": 0.3957, "rewards/accuracies": 0.875, "rewards/chosen": -0.8094179034233093, "rewards/margins": 0.61183100938797, "rewards/rejected": -1.4212487936019897, "step": 1036 }, { "epoch": 1.197318145771754, "grad_norm": 46.81170703509067, "learning_rate": 7.673034283625257e-08, "logits/chosen": -1.4491219520568848, "logits/rejected": -1.5339434146881104, "logps/chosen": -157.70791625976562, "logps/rejected": -199.98257446289062, "loss": 0.4217, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8838984966278076, "rewards/margins": 1.085174798965454, "rewards/rejected": -1.9690735340118408, "step": 1038 }, { "epoch": 1.1996251171508903, "grad_norm": 59.481661469725495, "learning_rate": 7.635608557997271e-08, "logits/chosen": -1.426401138305664, "logits/rejected": -1.4868569374084473, "logps/chosen": -175.43914794921875, "logps/rejected": -214.3563232421875, "loss": 0.4172, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8752603530883789, "rewards/margins": 1.1431567668914795, "rewards/rejected": -2.0184173583984375, "step": 1040 }, { "epoch": 1.2019320885300266, "grad_norm": 53.22583112987165, "learning_rate": 7.598217878354236e-08, "logits/chosen": -1.4433585405349731, "logits/rejected": -1.4538565874099731, "logps/chosen": -157.7897186279297, "logps/rejected": -229.7288055419922, "loss": 0.4108, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8286512494087219, "rewards/margins": 1.2663472890853882, "rewards/rejected": -2.094998598098755, "step": 1042 }, { "epoch": 1.204239059909163, "grad_norm": 49.60437729117302, "learning_rate": 7.560862798916228e-08, "logits/chosen": -1.4474220275878906, "logits/rejected": -1.4672783613204956, "logps/chosen": -159.66246032714844, "logps/rejected": -196.7609405517578, "loss": 0.4004, "rewards/accuracies": 0.90625, "rewards/chosen": -0.754623293876648, "rewards/margins": 1.0629355907440186, "rewards/rejected": -1.8175588846206665, "step": 1044 }, { "epoch": 1.2065460312882994, "grad_norm": 47.76416560805182, "learning_rate": 7.52354387337564e-08, "logits/chosen": -1.3766390085220337, "logits/rejected": -1.453482985496521, "logps/chosen": -127.11229705810547, "logps/rejected": -186.68060302734375, "loss": 0.4089, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5222644209861755, "rewards/margins": 1.1581171751022339, "rewards/rejected": -1.6803816556930542, "step": 1046 }, { "epoch": 1.2088530026674356, "grad_norm": 44.83004511877691, "learning_rate": 7.486261654888972e-08, "logits/chosen": -1.2795186042785645, "logits/rejected": -1.3099991083145142, "logps/chosen": -132.81536865234375, "logps/rejected": -216.51211547851562, "loss": 0.4054, "rewards/accuracies": 0.875, "rewards/chosen": -0.7403107285499573, "rewards/margins": 1.632882833480835, "rewards/rejected": -2.3731932640075684, "step": 1048 }, { "epoch": 1.211159974046572, "grad_norm": 46.48503074566374, "learning_rate": 7.449016696068645e-08, "logits/chosen": -1.3228113651275635, "logits/rejected": -1.3149969577789307, "logps/chosen": -153.4790496826172, "logps/rejected": -168.47897338867188, "loss": 0.3849, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8511451482772827, "rewards/margins": 0.9793115854263306, "rewards/rejected": -1.8304567337036133, "step": 1050 }, { "epoch": 1.2134669454257083, "grad_norm": 45.82113493125643, "learning_rate": 7.411809548974791e-08, "logits/chosen": -1.3215563297271729, "logits/rejected": -1.419891119003296, "logps/chosen": -229.16896057128906, "logps/rejected": -293.1673278808594, "loss": 0.364, "rewards/accuracies": 0.875, "rewards/chosen": -1.1792607307434082, "rewards/margins": 1.518965482711792, "rewards/rejected": -2.6982262134552, "step": 1052 }, { "epoch": 1.2157739168048447, "grad_norm": 49.08087412471043, "learning_rate": 7.374640765107095e-08, "logits/chosen": -1.4140170812606812, "logits/rejected": -1.4835015535354614, "logps/chosen": -223.50379943847656, "logps/rejected": -291.9320373535156, "loss": 0.4479, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0911102294921875, "rewards/margins": 1.4864944219589233, "rewards/rejected": -2.5776045322418213, "step": 1054 }, { "epoch": 1.218080888183981, "grad_norm": 56.10565437870734, "learning_rate": 7.337510895396591e-08, "logits/chosen": -1.2541605234146118, "logits/rejected": -1.3272130489349365, "logps/chosen": -124.03968048095703, "logps/rejected": -178.76068115234375, "loss": 0.3705, "rewards/accuracies": 1.0, "rewards/chosen": -0.6784568428993225, "rewards/margins": 1.2771177291870117, "rewards/rejected": -1.9555747509002686, "step": 1056 }, { "epoch": 1.2203878595631172, "grad_norm": 55.75804088847802, "learning_rate": 7.300420490197523e-08, "logits/chosen": -1.311789631843567, "logits/rejected": -1.3173766136169434, "logps/chosen": -174.1820526123047, "logps/rejected": -234.0801239013672, "loss": 0.4168, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9961414933204651, "rewards/margins": 1.0488609075546265, "rewards/rejected": -2.0450022220611572, "step": 1058 }, { "epoch": 1.2226948309422536, "grad_norm": 40.489341178222624, "learning_rate": 7.263370099279171e-08, "logits/chosen": -1.188340663909912, "logits/rejected": -1.1492575407028198, "logps/chosen": -163.66763305664062, "logps/rejected": -190.305908203125, "loss": 0.3879, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9333768486976624, "rewards/margins": 1.2128130197525024, "rewards/rejected": -2.1461899280548096, "step": 1060 }, { "epoch": 1.22500180232139, "grad_norm": 50.037739270264275, "learning_rate": 7.226360271817708e-08, "logits/chosen": -1.3506156206130981, "logits/rejected": -1.3269940614700317, "logps/chosen": -206.15762329101562, "logps/rejected": -231.0203094482422, "loss": 0.3727, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9508618712425232, "rewards/margins": 1.099410891532898, "rewards/rejected": -2.0502727031707764, "step": 1062 }, { "epoch": 1.2273087737005262, "grad_norm": 49.869486100814115, "learning_rate": 7.189391556388058e-08, "logits/chosen": -1.4331234693527222, "logits/rejected": -1.3977779150009155, "logps/chosen": -209.97015380859375, "logps/rejected": -260.2592468261719, "loss": 0.4164, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1883045434951782, "rewards/margins": 1.4966893196105957, "rewards/rejected": -2.6849939823150635, "step": 1064 }, { "epoch": 1.2296157450796625, "grad_norm": 49.45531524736339, "learning_rate": 7.152464500955768e-08, "logits/chosen": -1.3637323379516602, "logits/rejected": -1.3475160598754883, "logps/chosen": -194.37664794921875, "logps/rejected": -210.33851623535156, "loss": 0.3775, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1909425258636475, "rewards/margins": 1.4508752822875977, "rewards/rejected": -2.641818046569824, "step": 1066 }, { "epoch": 1.231922716458799, "grad_norm": 52.71504231247347, "learning_rate": 7.115579652868878e-08, "logits/chosen": -1.2816321849822998, "logits/rejected": -1.2634057998657227, "logps/chosen": -153.44662475585938, "logps/rejected": -207.44244384765625, "loss": 0.3713, "rewards/accuracies": 0.8125, "rewards/chosen": -0.833113431930542, "rewards/margins": 1.5628983974456787, "rewards/rejected": -2.3960118293762207, "step": 1068 }, { "epoch": 1.2342296878379353, "grad_norm": 56.75539650441376, "learning_rate": 7.078737558849818e-08, "logits/chosen": -1.3372987508773804, "logits/rejected": -1.3544334173202515, "logps/chosen": -152.42861938476562, "logps/rejected": -215.0513916015625, "loss": 0.4061, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9685344696044922, "rewards/margins": 1.3745617866516113, "rewards/rejected": -2.3430962562561035, "step": 1070 }, { "epoch": 1.2365366592170717, "grad_norm": 54.12178163249736, "learning_rate": 7.041938764987296e-08, "logits/chosen": -1.4193816184997559, "logits/rejected": -1.4047815799713135, "logps/chosen": -206.84002685546875, "logps/rejected": -220.5758819580078, "loss": 0.3785, "rewards/accuracies": 0.875, "rewards/chosen": -0.9423998594284058, "rewards/margins": 1.1570698022842407, "rewards/rejected": -2.0994696617126465, "step": 1072 }, { "epoch": 1.2388436305962078, "grad_norm": 41.420273500877144, "learning_rate": 7.005183816728213e-08, "logits/chosen": -1.4170185327529907, "logits/rejected": -1.4868308305740356, "logps/chosen": -205.6683807373047, "logps/rejected": -309.4522705078125, "loss": 0.3775, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0644909143447876, "rewards/margins": 1.8807792663574219, "rewards/rejected": -2.945270299911499, "step": 1074 }, { "epoch": 1.2411506019753442, "grad_norm": 54.934353878039, "learning_rate": 6.968473258869565e-08, "logits/chosen": -1.352636694908142, "logits/rejected": -1.449910283088684, "logps/chosen": -168.9800567626953, "logps/rejected": -243.92965698242188, "loss": 0.3853, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8759716749191284, "rewards/margins": 1.3743195533752441, "rewards/rejected": -2.250291585922241, "step": 1076 }, { "epoch": 1.2434575733544806, "grad_norm": 48.00874815986513, "learning_rate": 6.931807635550383e-08, "logits/chosen": -1.519219994544983, "logits/rejected": -1.5146820545196533, "logps/chosen": -205.1129913330078, "logps/rejected": -249.6070098876953, "loss": 0.3314, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9951453804969788, "rewards/margins": 1.2512072324752808, "rewards/rejected": -2.2463526725769043, "step": 1078 }, { "epoch": 1.245764544733617, "grad_norm": 50.334923119590876, "learning_rate": 6.89518749024365e-08, "logits/chosen": -1.4772987365722656, "logits/rejected": -1.4446271657943726, "logps/chosen": -154.9414520263672, "logps/rejected": -164.93331909179688, "loss": 0.3882, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6665012240409851, "rewards/margins": 0.8699983954429626, "rewards/rejected": -1.5364995002746582, "step": 1080 }, { "epoch": 1.2480715161127531, "grad_norm": 57.712585093330645, "learning_rate": 6.858613365748267e-08, "logits/chosen": -1.2888410091400146, "logits/rejected": -1.3315074443817139, "logps/chosen": -234.24783325195312, "logps/rejected": -301.7956848144531, "loss": 0.4517, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1589394807815552, "rewards/margins": 1.467663288116455, "rewards/rejected": -2.6266026496887207, "step": 1082 }, { "epoch": 1.2503784874918895, "grad_norm": 47.1157877073686, "learning_rate": 6.822085804180984e-08, "logits/chosen": -1.4558112621307373, "logits/rejected": -1.355411171913147, "logps/chosen": -205.1348876953125, "logps/rejected": -189.77523803710938, "loss": 0.3582, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7983089685440063, "rewards/margins": 1.0706913471221924, "rewards/rejected": -1.8690004348754883, "step": 1084 }, { "epoch": 1.252685458871026, "grad_norm": 52.21642067042654, "learning_rate": 6.785605346968386e-08, "logits/chosen": -1.4896191358566284, "logits/rejected": -1.5129355192184448, "logps/chosen": -217.2392120361328, "logps/rejected": -240.3934783935547, "loss": 0.4285, "rewards/accuracies": 0.84375, "rewards/chosen": -1.158021330833435, "rewards/margins": 1.2918341159820557, "rewards/rejected": -2.4498555660247803, "step": 1086 }, { "epoch": 1.2549924302501623, "grad_norm": 54.48719637145695, "learning_rate": 6.749172534838848e-08, "logits/chosen": -1.4208894968032837, "logits/rejected": -1.3335964679718018, "logps/chosen": -180.71263122558594, "logps/rejected": -172.00143432617188, "loss": 0.3962, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8040263056755066, "rewards/margins": 0.9753020405769348, "rewards/rejected": -1.7793283462524414, "step": 1088 }, { "epoch": 1.2572994016292984, "grad_norm": 44.86151315350074, "learning_rate": 6.712787907814541e-08, "logits/chosen": -1.3919479846954346, "logits/rejected": -1.3973674774169922, "logps/chosen": -234.341796875, "logps/rejected": -334.7096862792969, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": -1.0190818309783936, "rewards/margins": 2.0405924320220947, "rewards/rejected": -3.0596742630004883, "step": 1090 }, { "epoch": 1.2596063730084348, "grad_norm": 49.585862978376184, "learning_rate": 6.676452005203405e-08, "logits/chosen": -1.3598554134368896, "logits/rejected": -1.435031533241272, "logps/chosen": -164.2373809814453, "logps/rejected": -212.1796875, "loss": 0.4162, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9823134541511536, "rewards/margins": 1.0851922035217285, "rewards/rejected": -2.0675055980682373, "step": 1092 }, { "epoch": 1.2619133443875712, "grad_norm": 61.87058570329516, "learning_rate": 6.640165365591175e-08, "logits/chosen": -1.420425295829773, "logits/rejected": -1.3986551761627197, "logps/chosen": -179.66464233398438, "logps/rejected": -229.3308563232422, "loss": 0.4133, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8618680238723755, "rewards/margins": 1.1394115686416626, "rewards/rejected": -2.001279830932617, "step": 1094 }, { "epoch": 1.2642203157667076, "grad_norm": 38.55062311194488, "learning_rate": 6.603928526833386e-08, "logits/chosen": -1.4957396984100342, "logits/rejected": -1.484830617904663, "logps/chosen": -195.9167938232422, "logps/rejected": -239.69021606445312, "loss": 0.3935, "rewards/accuracies": 0.875, "rewards/chosen": -0.8891066312789917, "rewards/margins": 1.3260498046875, "rewards/rejected": -2.2151565551757812, "step": 1096 }, { "epoch": 1.266527287145844, "grad_norm": 47.6968545203006, "learning_rate": 6.567742026047405e-08, "logits/chosen": -1.3937017917633057, "logits/rejected": -1.4098970890045166, "logps/chosen": -198.03074645996094, "logps/rejected": -259.1086120605469, "loss": 0.4311, "rewards/accuracies": 0.9375, "rewards/chosen": -1.115444302558899, "rewards/margins": 1.3988248109817505, "rewards/rejected": -2.5142691135406494, "step": 1098 }, { "epoch": 1.26883425852498, "grad_norm": 53.01531141976359, "learning_rate": 6.531606399604472e-08, "logits/chosen": -1.3574663400650024, "logits/rejected": -1.3760308027267456, "logps/chosen": -195.03289794921875, "logps/rejected": -241.59120178222656, "loss": 0.3629, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0627293586730957, "rewards/margins": 1.5820095539093018, "rewards/rejected": -2.6447389125823975, "step": 1100 }, { "epoch": 1.26883425852498, "eval_logits/chosen": -1.346447467803955, "eval_logits/rejected": -1.2698445320129395, "eval_logps/chosen": -198.6750030517578, "eval_logps/rejected": -170.5380096435547, "eval_loss": 0.5291120409965515, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -1.3492814302444458, "eval_rewards/margins": 0.6529796719551086, "eval_rewards/rejected": -2.002261161804199, "eval_runtime": 23.3211, "eval_samples_per_second": 4.288, "eval_steps_per_second": 1.072, "step": 1100 }, { "epoch": 1.2711412299041165, "grad_norm": 54.927019109306244, "learning_rate": 6.49552218312174e-08, "logits/chosen": -1.49580717086792, "logits/rejected": -1.5456180572509766, "logps/chosen": -191.8947296142578, "logps/rejected": -221.13339233398438, "loss": 0.4337, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8337838649749756, "rewards/margins": 1.0560129880905151, "rewards/rejected": -1.8897968530654907, "step": 1102 }, { "epoch": 1.2734482012832529, "grad_norm": 57.11044799181979, "learning_rate": 6.459489911454348e-08, "logits/chosen": -1.2576816082000732, "logits/rejected": -1.3321532011032104, "logps/chosen": -153.68167114257812, "logps/rejected": -187.43959045410156, "loss": 0.4551, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0336787700653076, "rewards/margins": 0.5679832100868225, "rewards/rejected": -1.601662039756775, "step": 1104 }, { "epoch": 1.275755172662389, "grad_norm": 48.9077605697985, "learning_rate": 6.423510118687482e-08, "logits/chosen": -1.2373907566070557, "logits/rejected": -1.2709932327270508, "logps/chosen": -142.4616241455078, "logps/rejected": -163.26995849609375, "loss": 0.4101, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8073585033416748, "rewards/margins": 0.8551322817802429, "rewards/rejected": -1.662490725517273, "step": 1106 }, { "epoch": 1.2780621440415254, "grad_norm": 56.507420732209305, "learning_rate": 6.387583338128471e-08, "logits/chosen": -1.2857414484024048, "logits/rejected": -1.331521987915039, "logps/chosen": -150.70591735839844, "logps/rejected": -171.67141723632812, "loss": 0.4416, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8073572516441345, "rewards/margins": 0.8103399276733398, "rewards/rejected": -1.6176972389221191, "step": 1108 }, { "epoch": 1.2803691154206618, "grad_norm": 45.69922325435096, "learning_rate": 6.351710102298867e-08, "logits/chosen": -1.3792985677719116, "logits/rejected": -1.452039361000061, "logps/chosen": -207.93603515625, "logps/rejected": -253.76393127441406, "loss": 0.3717, "rewards/accuracies": 0.875, "rewards/chosen": -0.9924658536911011, "rewards/margins": 1.279854655265808, "rewards/rejected": -2.27232027053833, "step": 1110 }, { "epoch": 1.2826760867997982, "grad_norm": 46.67725956292131, "learning_rate": 6.31589094292657e-08, "logits/chosen": -1.3903954029083252, "logits/rejected": -1.4184892177581787, "logps/chosen": -219.1831512451172, "logps/rejected": -280.4320068359375, "loss": 0.4046, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0702323913574219, "rewards/margins": 1.5914549827575684, "rewards/rejected": -2.6616873741149902, "step": 1112 }, { "epoch": 1.2849830581789345, "grad_norm": 40.80571923993709, "learning_rate": 6.280126390937924e-08, "logits/chosen": -1.4774876832962036, "logits/rejected": -1.4684927463531494, "logps/chosen": -215.90835571289062, "logps/rejected": -221.57406616210938, "loss": 0.436, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9599077701568604, "rewards/margins": 0.8318207263946533, "rewards/rejected": -1.7917284965515137, "step": 1114 }, { "epoch": 1.2872900295580707, "grad_norm": 42.68311460025384, "learning_rate": 6.244416976449875e-08, "logits/chosen": -1.3221067190170288, "logits/rejected": -1.2649778127670288, "logps/chosen": -119.57810974121094, "logps/rejected": -151.20071411132812, "loss": 0.3855, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7455071806907654, "rewards/margins": 0.9170046448707581, "rewards/rejected": -1.6625118255615234, "step": 1116 }, { "epoch": 1.289597000937207, "grad_norm": 43.61686616170438, "learning_rate": 6.208763228762082e-08, "logits/chosen": -1.2151230573654175, "logits/rejected": -1.2297194004058838, "logps/chosen": -164.2678985595703, "logps/rejected": -242.52076721191406, "loss": 0.3943, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0782736539840698, "rewards/margins": 1.6307076215744019, "rewards/rejected": -2.708981513977051, "step": 1118 }, { "epoch": 1.2919039723163435, "grad_norm": 54.35054138906459, "learning_rate": 6.173165676349102e-08, "logits/chosen": -1.2725952863693237, "logits/rejected": -1.4444892406463623, "logps/chosen": -166.21847534179688, "logps/rejected": -293.0514831542969, "loss": 0.4145, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8468817472457886, "rewards/margins": 1.518345594406128, "rewards/rejected": -2.365227222442627, "step": 1120 }, { "epoch": 1.2942109436954798, "grad_norm": 44.69616309273757, "learning_rate": 6.137624846852535e-08, "logits/chosen": -1.2640550136566162, "logits/rejected": -1.1721779108047485, "logps/chosen": -126.40272521972656, "logps/rejected": -149.834228515625, "loss": 0.3984, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9074777364730835, "rewards/margins": 1.2042250633239746, "rewards/rejected": -2.1117029190063477, "step": 1122 }, { "epoch": 1.2965179150746162, "grad_norm": 44.46454668934902, "learning_rate": 6.102141267073207e-08, "logits/chosen": -1.3089536428451538, "logits/rejected": -1.4028515815734863, "logps/chosen": -139.8980255126953, "logps/rejected": -190.38258361816406, "loss": 0.3884, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9231299161911011, "rewards/margins": 1.0761425495147705, "rewards/rejected": -1.9992725849151611, "step": 1124 }, { "epoch": 1.2988248864537524, "grad_norm": 55.74989650178749, "learning_rate": 6.066715462963375e-08, "logits/chosen": -1.4593031406402588, "logits/rejected": -1.5641745328903198, "logps/chosen": -196.91680908203125, "logps/rejected": -257.47308349609375, "loss": 0.4039, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2544538974761963, "rewards/margins": 1.0345784425735474, "rewards/rejected": -2.289032220840454, "step": 1126 }, { "epoch": 1.3011318578328888, "grad_norm": 51.02973123233033, "learning_rate": 6.031347959618913e-08, "logits/chosen": -1.357025384902954, "logits/rejected": -1.4425618648529053, "logps/chosen": -173.85137939453125, "logps/rejected": -200.96022033691406, "loss": 0.4391, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8829444646835327, "rewards/margins": 1.008034110069275, "rewards/rejected": -1.8909785747528076, "step": 1128 }, { "epoch": 1.3034388292120251, "grad_norm": 41.84275287977337, "learning_rate": 5.996039281271543e-08, "logits/chosen": -1.2947957515716553, "logits/rejected": -1.2887905836105347, "logps/chosen": -139.00042724609375, "logps/rejected": -151.3178253173828, "loss": 0.4256, "rewards/accuracies": 0.75, "rewards/chosen": -0.8678557276725769, "rewards/margins": 0.6157978177070618, "rewards/rejected": -1.4836535453796387, "step": 1130 }, { "epoch": 1.3057458005911613, "grad_norm": 51.77131279496888, "learning_rate": 5.96078995128105e-08, "logits/chosen": -1.3757154941558838, "logits/rejected": -1.4178260564804077, "logps/chosen": -150.56947326660156, "logps/rejected": -227.6977996826172, "loss": 0.3699, "rewards/accuracies": 0.875, "rewards/chosen": -1.034728765487671, "rewards/margins": 1.7096223831176758, "rewards/rejected": -2.7443511486053467, "step": 1132 }, { "epoch": 1.3080527719702977, "grad_norm": 47.700143499181216, "learning_rate": 5.925600492127547e-08, "logits/chosen": -1.3131159543991089, "logits/rejected": -1.310932993888855, "logps/chosen": -141.53134155273438, "logps/rejected": -191.7972412109375, "loss": 0.3982, "rewards/accuracies": 0.6875, "rewards/chosen": -0.870035707950592, "rewards/margins": 1.2973705530166626, "rewards/rejected": -2.1674063205718994, "step": 1134 }, { "epoch": 1.310359743349434, "grad_norm": 53.07658826682632, "learning_rate": 5.8904714254037025e-08, "logits/chosen": -1.436853289604187, "logits/rejected": -1.5420252084732056, "logps/chosen": -137.68533325195312, "logps/rejected": -197.42489624023438, "loss": 0.4101, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8586965799331665, "rewards/margins": 1.3288297653198242, "rewards/rejected": -2.1875264644622803, "step": 1136 }, { "epoch": 1.3126667147285704, "grad_norm": 55.98499802869115, "learning_rate": 5.855403271807032e-08, "logits/chosen": -1.4441746473312378, "logits/rejected": -1.3740431070327759, "logps/chosen": -141.81146240234375, "logps/rejected": -157.10916137695312, "loss": 0.3657, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9153862595558167, "rewards/margins": 1.5047463178634644, "rewards/rejected": -2.420132637023926, "step": 1138 }, { "epoch": 1.3149736861077068, "grad_norm": 47.88409037346829, "learning_rate": 5.82039655113217e-08, "logits/chosen": -1.5311238765716553, "logits/rejected": -1.4044201374053955, "logps/chosen": -165.2921600341797, "logps/rejected": -202.717041015625, "loss": 0.3879, "rewards/accuracies": 0.84375, "rewards/chosen": -1.142822027206421, "rewards/margins": 1.5451483726501465, "rewards/rejected": -2.6879703998565674, "step": 1140 }, { "epoch": 1.317280657486843, "grad_norm": 63.28691341194904, "learning_rate": 5.785451782263161e-08, "logits/chosen": -1.3691322803497314, "logits/rejected": -1.3665211200714111, "logps/chosen": -185.71286010742188, "logps/rejected": -199.6193084716797, "loss": 0.4569, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1284613609313965, "rewards/margins": 0.9253698587417603, "rewards/rejected": -2.053831100463867, "step": 1142 }, { "epoch": 1.3195876288659794, "grad_norm": 47.39309353229901, "learning_rate": 5.750569483165784e-08, "logits/chosen": -1.4143561124801636, "logits/rejected": -1.307342767715454, "logps/chosen": -236.60182189941406, "logps/rejected": -257.746337890625, "loss": 0.4071, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2457427978515625, "rewards/margins": 1.2368875741958618, "rewards/rejected": -2.482630491256714, "step": 1144 }, { "epoch": 1.3218946002451157, "grad_norm": 47.09697442795077, "learning_rate": 5.7157501708798584e-08, "logits/chosen": -1.5032621622085571, "logits/rejected": -1.3734570741653442, "logps/chosen": -182.57345581054688, "logps/rejected": -169.54696655273438, "loss": 0.4129, "rewards/accuracies": 0.78125, "rewards/chosen": -0.912488579750061, "rewards/margins": 0.9649824500083923, "rewards/rejected": -1.8774710893630981, "step": 1146 }, { "epoch": 1.3242015716242521, "grad_norm": 57.0952473659576, "learning_rate": 5.6809943615115904e-08, "logits/chosen": -1.4295306205749512, "logits/rejected": -1.399375557899475, "logps/chosen": -137.4228057861328, "logps/rejected": -176.73545837402344, "loss": 0.3865, "rewards/accuracies": 0.875, "rewards/chosen": -0.7859193086624146, "rewards/margins": 1.589166283607483, "rewards/rejected": -2.3750853538513184, "step": 1148 }, { "epoch": 1.3265085430033885, "grad_norm": 57.54839610092353, "learning_rate": 5.646302570225918e-08, "logits/chosen": -1.2610923051834106, "logits/rejected": -1.1784062385559082, "logps/chosen": -173.5068359375, "logps/rejected": -212.621337890625, "loss": 0.4041, "rewards/accuracies": 0.875, "rewards/chosen": -0.9680019021034241, "rewards/margins": 1.6587209701538086, "rewards/rejected": -2.626722812652588, "step": 1150 }, { "epoch": 1.3288155143825247, "grad_norm": 48.13045472735045, "learning_rate": 5.6116753112388794e-08, "logits/chosen": -1.4241957664489746, "logits/rejected": -1.4249969720840454, "logps/chosen": -174.09970092773438, "logps/rejected": -180.14329528808594, "loss": 0.415, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8689387440681458, "rewards/margins": 0.9074426889419556, "rewards/rejected": -1.776381254196167, "step": 1152 }, { "epoch": 1.331122485761661, "grad_norm": 54.2150965617913, "learning_rate": 5.577113097809989e-08, "logits/chosen": -1.3543493747711182, "logits/rejected": -1.3709462881088257, "logps/chosen": -142.47586059570312, "logps/rejected": -179.7193145751953, "loss": 0.3837, "rewards/accuracies": 0.875, "rewards/chosen": -0.8821631073951721, "rewards/margins": 1.1474261283874512, "rewards/rejected": -2.0295891761779785, "step": 1154 }, { "epoch": 1.3334294571407974, "grad_norm": 52.86369603141297, "learning_rate": 5.542616442234618e-08, "logits/chosen": -1.365761637687683, "logits/rejected": -1.441951036453247, "logps/chosen": -169.7147979736328, "logps/rejected": -230.1123046875, "loss": 0.4563, "rewards/accuracies": 0.75, "rewards/chosen": -1.0090858936309814, "rewards/margins": 1.1310818195343018, "rewards/rejected": -2.140167713165283, "step": 1156 }, { "epoch": 1.3357364285199336, "grad_norm": 48.68345037696806, "learning_rate": 5.508185855836425e-08, "logits/chosen": -1.3327983617782593, "logits/rejected": -1.4392744302749634, "logps/chosen": -133.7701416015625, "logps/rejected": -191.17843627929688, "loss": 0.4077, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9508491158485413, "rewards/margins": 1.3994641304016113, "rewards/rejected": -2.350313186645508, "step": 1158 }, { "epoch": 1.33804339989907, "grad_norm": 42.64070787602433, "learning_rate": 5.473821848959761e-08, "logits/chosen": -1.2654824256896973, "logits/rejected": -1.3428267240524292, "logps/chosen": -177.55340576171875, "logps/rejected": -255.42984008789062, "loss": 0.3802, "rewards/accuracies": 0.875, "rewards/chosen": -0.9991359114646912, "rewards/margins": 1.5646129846572876, "rewards/rejected": -2.563748836517334, "step": 1160 }, { "epoch": 1.3403503712782063, "grad_norm": 50.28942241228028, "learning_rate": 5.4395249309621097e-08, "logits/chosen": -1.3695107698440552, "logits/rejected": -1.2706151008605957, "logps/chosen": -283.1875, "logps/rejected": -286.9220886230469, "loss": 0.38, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4288372993469238, "rewards/margins": 1.4134087562561035, "rewards/rejected": -2.8422460556030273, "step": 1162 }, { "epoch": 1.3426573426573427, "grad_norm": 61.971878969120056, "learning_rate": 5.405295610206524e-08, "logits/chosen": -1.3642442226409912, "logits/rejected": -1.3541247844696045, "logps/chosen": -174.163330078125, "logps/rejected": -198.29196166992188, "loss": 0.4006, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9269206523895264, "rewards/margins": 1.0355504751205444, "rewards/rejected": -1.9624711275100708, "step": 1164 }, { "epoch": 1.344964314036479, "grad_norm": 46.86768761320917, "learning_rate": 5.371134394054115e-08, "logits/chosen": -1.4383530616760254, "logits/rejected": -1.397727131843567, "logps/chosen": -225.30628967285156, "logps/rejected": -246.29644775390625, "loss": 0.4128, "rewards/accuracies": 0.75, "rewards/chosen": -0.9159881472587585, "rewards/margins": 1.293990135192871, "rewards/rejected": -2.2099781036376953, "step": 1166 }, { "epoch": 1.3472712854156152, "grad_norm": 53.19774662060618, "learning_rate": 5.337041788856518e-08, "logits/chosen": -1.3878076076507568, "logits/rejected": -1.3727601766586304, "logps/chosen": -161.39529418945312, "logps/rejected": -191.12062072753906, "loss": 0.407, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8259797096252441, "rewards/margins": 1.1816279888153076, "rewards/rejected": -2.0076074600219727, "step": 1168 }, { "epoch": 1.3495782567947516, "grad_norm": 60.247505085915364, "learning_rate": 5.303018299948389e-08, "logits/chosen": -1.3918073177337646, "logits/rejected": -1.3807213306427002, "logps/chosen": -133.32913208007812, "logps/rejected": -169.5771942138672, "loss": 0.3973, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6024202108383179, "rewards/margins": 1.2600902318954468, "rewards/rejected": -1.8625102043151855, "step": 1170 }, { "epoch": 1.351885228173888, "grad_norm": 47.80602272845366, "learning_rate": 5.2690644316399004e-08, "logits/chosen": -1.3061091899871826, "logits/rejected": -1.3677294254302979, "logps/chosen": -173.61183166503906, "logps/rejected": -228.7135772705078, "loss": 0.3373, "rewards/accuracies": 0.875, "rewards/chosen": -1.1685351133346558, "rewards/margins": 1.3943965435028076, "rewards/rejected": -2.562931537628174, "step": 1172 }, { "epoch": 1.3541921995530242, "grad_norm": 52.65074333120445, "learning_rate": 5.235180687209295e-08, "logits/chosen": -1.3143095970153809, "logits/rejected": -1.373111367225647, "logps/chosen": -237.4772491455078, "logps/rejected": -283.6535339355469, "loss": 0.4258, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1781258583068848, "rewards/margins": 1.684722900390625, "rewards/rejected": -2.8628485202789307, "step": 1174 }, { "epoch": 1.3564991709321605, "grad_norm": 56.602930524428096, "learning_rate": 5.201367568895407e-08, "logits/chosen": -1.2158329486846924, "logits/rejected": -1.2403156757354736, "logps/chosen": -219.708740234375, "logps/rejected": -289.25732421875, "loss": 0.3716, "rewards/accuracies": 0.90625, "rewards/chosen": -1.003746747970581, "rewards/margins": 2.028501272201538, "rewards/rejected": -3.03224778175354, "step": 1176 }, { "epoch": 1.358806142311297, "grad_norm": 42.6278186261498, "learning_rate": 5.167625577890222e-08, "logits/chosen": -1.4404326677322388, "logits/rejected": -1.4168187379837036, "logps/chosen": -189.86143493652344, "logps/rejected": -238.81646728515625, "loss": 0.3508, "rewards/accuracies": 0.875, "rewards/chosen": -0.7041274309158325, "rewards/margins": 1.4839988946914673, "rewards/rejected": -2.188126564025879, "step": 1178 }, { "epoch": 1.3611131136904333, "grad_norm": 46.23876462891979, "learning_rate": 5.133955214331438e-08, "logits/chosen": -1.359468698501587, "logits/rejected": -1.2669312953948975, "logps/chosen": -175.90753173828125, "logps/rejected": -184.07627868652344, "loss": 0.3872, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9324202537536621, "rewards/margins": 1.3268898725509644, "rewards/rejected": -2.259310007095337, "step": 1180 }, { "epoch": 1.3634200850695697, "grad_norm": 42.009432528251, "learning_rate": 5.1003569772950714e-08, "logits/chosen": -1.413282036781311, "logits/rejected": -1.3845648765563965, "logps/chosen": -162.90414428710938, "logps/rejected": -288.2789611816406, "loss": 0.3854, "rewards/accuracies": 0.8125, "rewards/chosen": -0.790794849395752, "rewards/margins": 1.7329788208007812, "rewards/rejected": -2.523773670196533, "step": 1182 }, { "epoch": 1.3657270564487058, "grad_norm": 59.51613798378988, "learning_rate": 5.0668313647880465e-08, "logits/chosen": -1.4198514223098755, "logits/rejected": -1.367343783378601, "logps/chosen": -153.51954650878906, "logps/rejected": -163.91555786132812, "loss": 0.4409, "rewards/accuracies": 0.875, "rewards/chosen": -0.9148359298706055, "rewards/margins": 0.9861717224121094, "rewards/rejected": -1.9010077714920044, "step": 1184 }, { "epoch": 1.3680340278278422, "grad_norm": 43.64496759806874, "learning_rate": 5.033378873740819e-08, "logits/chosen": -1.4157196283340454, "logits/rejected": -1.3928645849227905, "logps/chosen": -151.0714111328125, "logps/rejected": -140.967529296875, "loss": 0.3607, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9113080501556396, "rewards/margins": 0.7878793478012085, "rewards/rejected": -1.6991872787475586, "step": 1186 }, { "epoch": 1.3703409992069786, "grad_norm": 59.24746681628499, "learning_rate": 5.000000000000002e-08, "logits/chosen": -1.2240504026412964, "logits/rejected": -1.2518386840820312, "logps/chosen": -197.45103454589844, "logps/rejected": -240.2637176513672, "loss": 0.3703, "rewards/accuracies": 0.875, "rewards/chosen": -0.9717167615890503, "rewards/margins": 1.4621995687484741, "rewards/rejected": -2.4339160919189453, "step": 1188 }, { "epoch": 1.372647970586115, "grad_norm": 69.8619637568134, "learning_rate": 4.966695238321027e-08, "logits/chosen": -1.359329104423523, "logits/rejected": -1.3603085279464722, "logps/chosen": -226.86892700195312, "logps/rejected": -452.2628173828125, "loss": 0.4596, "rewards/accuracies": 0.875, "rewards/chosen": -1.174229621887207, "rewards/margins": 1.8406164646148682, "rewards/rejected": -3.014845848083496, "step": 1190 }, { "epoch": 1.3749549419652514, "grad_norm": 39.85416303865178, "learning_rate": 4.933465082360807e-08, "logits/chosen": -1.328731894493103, "logits/rejected": -1.313866138458252, "logps/chosen": -143.9809112548828, "logps/rejected": -182.53477478027344, "loss": 0.3694, "rewards/accuracies": 0.84375, "rewards/chosen": -0.59998619556427, "rewards/margins": 1.3576034307479858, "rewards/rejected": -1.9575895071029663, "step": 1192 }, { "epoch": 1.3772619133443875, "grad_norm": 50.47296010963747, "learning_rate": 4.90031002467042e-08, "logits/chosen": -1.4128611087799072, "logits/rejected": -1.3620035648345947, "logps/chosen": -213.6893310546875, "logps/rejected": -255.20004272460938, "loss": 0.3618, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9902510046958923, "rewards/margins": 1.6613683700561523, "rewards/rejected": -2.6516194343566895, "step": 1194 }, { "epoch": 1.379568884723524, "grad_norm": 44.05325481629088, "learning_rate": 4.867230556687796e-08, "logits/chosen": -1.2905137538909912, "logits/rejected": -1.2903690338134766, "logps/chosen": -130.3699951171875, "logps/rejected": -239.37860107421875, "loss": 0.3685, "rewards/accuracies": 0.75, "rewards/chosen": -0.9421904683113098, "rewards/margins": 1.2194817066192627, "rewards/rejected": -2.1616721153259277, "step": 1196 }, { "epoch": 1.3818758561026603, "grad_norm": 58.075075554298664, "learning_rate": 4.8342271687304504e-08, "logits/chosen": -1.3242213726043701, "logits/rejected": -1.311003565788269, "logps/chosen": -150.2444610595703, "logps/rejected": -197.29025268554688, "loss": 0.451, "rewards/accuracies": 0.875, "rewards/chosen": -0.9144178032875061, "rewards/margins": 1.1128352880477905, "rewards/rejected": -2.0272531509399414, "step": 1198 }, { "epoch": 1.3841828274817964, "grad_norm": 37.508119781233376, "learning_rate": 4.801300349988219e-08, "logits/chosen": -1.3149864673614502, "logits/rejected": -1.423133373260498, "logps/chosen": -187.515380859375, "logps/rejected": -250.7738037109375, "loss": 0.372, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6898128986358643, "rewards/margins": 1.6321946382522583, "rewards/rejected": -2.322007417678833, "step": 1200 }, { "epoch": 1.3841828274817964, "eval_logits/chosen": -1.3467315435409546, "eval_logits/rejected": -1.2710996866226196, "eval_logps/chosen": -199.28550720214844, "eval_logps/rejected": -170.8891143798828, "eval_loss": 0.5353700518608093, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -1.410333275794983, "eval_rewards/margins": 0.6270380616188049, "eval_rewards/rejected": -2.0373716354370117, "eval_runtime": 23.5786, "eval_samples_per_second": 4.241, "eval_steps_per_second": 1.06, "step": 1200 } ], "logging_steps": 2, "max_steps": 1732, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }