{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6920914137408983, "eval_steps": 100, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023069713791363275, "grad_norm": 70.983255945064, "learning_rate": 4e-09, "logits/chosen": -1.6907414197921753, "logits/rejected": -1.6978764533996582, "logps/chosen": -135.08778381347656, "logps/rejected": -140.00140380859375, "loss": 0.6978, "rewards/accuracies": 0.625, "rewards/chosen": 0.010493194684386253, "rewards/margins": 0.006632559932768345, "rewards/rejected": 0.003860633820295334, "step": 2 }, { "epoch": 0.004613942758272655, "grad_norm": 79.97063682582153, "learning_rate": 8e-09, "logits/chosen": -1.6330227851867676, "logits/rejected": -1.7231806516647339, "logps/chosen": -197.88365173339844, "logps/rejected": -218.62255859375, "loss": 0.6925, "rewards/accuracies": 0.46875, "rewards/chosen": 0.008352389559149742, "rewards/margins": -0.00352578517049551, "rewards/rejected": 0.011878175660967827, "step": 4 }, { "epoch": 0.006920914137408983, "grad_norm": 78.73277588414827, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -1.7628690004348755, "logits/rejected": -1.6921380758285522, "logps/chosen": -181.12741088867188, "logps/rejected": -177.64956665039062, "loss": 0.6919, "rewards/accuracies": 0.40625, "rewards/chosen": -0.005539673380553722, "rewards/margins": -0.012507464736700058, "rewards/rejected": 0.006967790424823761, "step": 6 }, { "epoch": 0.00922788551654531, "grad_norm": 84.94475101410946, "learning_rate": 1.6e-08, "logits/chosen": -1.6862337589263916, "logits/rejected": -1.6957104206085205, "logps/chosen": -229.57574462890625, "logps/rejected": -308.63421630859375, "loss": 0.6949, "rewards/accuracies": 0.59375, "rewards/chosen": 0.019691964611411095, "rewards/margins": 0.022208284586668015, "rewards/rejected": -0.0025163227692246437, "step": 8 }, { "epoch": 0.011534856895681638, "grad_norm": 78.53713189911603, "learning_rate": 2e-08, "logits/chosen": -1.72577965259552, "logits/rejected": -1.72530198097229, "logps/chosen": -182.21597290039062, "logps/rejected": -197.15383911132812, "loss": 0.6876, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00015587342204526067, "rewards/margins": -0.00128166563808918, "rewards/rejected": 0.0011257934384047985, "step": 10 }, { "epoch": 0.013841828274817966, "grad_norm": 60.639642502259356, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -1.5887395143508911, "logits/rejected": -1.7574589252471924, "logps/chosen": -121.71543884277344, "logps/rejected": -164.58782958984375, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004625019151717424, "rewards/margins": 0.00861622579395771, "rewards/rejected": -0.003991207107901573, "step": 12 }, { "epoch": 0.016148799653954292, "grad_norm": 86.96054524483631, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -1.5507514476776123, "logits/rejected": -1.5499210357666016, "logps/chosen": -147.94631958007812, "logps/rejected": -200.87417602539062, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.013226826675236225, "rewards/margins": 0.021001461893320084, "rewards/rejected": -0.00777463661506772, "step": 14 }, { "epoch": 0.01845577103309062, "grad_norm": 77.14253037311525, "learning_rate": 3.2e-08, "logits/chosen": -1.6721559762954712, "logits/rejected": -1.7068090438842773, "logps/chosen": -157.89622497558594, "logps/rejected": -199.2628631591797, "loss": 0.6951, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0034806535113602877, "rewards/margins": 0.006839222740381956, "rewards/rejected": -0.010319876484572887, "step": 16 }, { "epoch": 0.020762742412226948, "grad_norm": 78.17299743340894, "learning_rate": 3.6e-08, "logits/chosen": -1.6556451320648193, "logits/rejected": -1.7276983261108398, "logps/chosen": -135.32223510742188, "logps/rejected": -158.1257781982422, "loss": 0.6884, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01184301357716322, "rewards/margins": 0.023063620552420616, "rewards/rejected": -0.011220606043934822, "step": 18 }, { "epoch": 0.023069713791363276, "grad_norm": 77.30701284634611, "learning_rate": 4e-08, "logits/chosen": -1.735813856124878, "logits/rejected": -1.789333701133728, "logps/chosen": -157.4953155517578, "logps/rejected": -186.05862426757812, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.004054094199091196, "rewards/margins": 0.010992627590894699, "rewards/rejected": -0.006938533391803503, "step": 20 }, { "epoch": 0.025376685170499604, "grad_norm": 77.95778126726918, "learning_rate": 4.4e-08, "logits/chosen": -1.5402836799621582, "logits/rejected": -1.5884689092636108, "logps/chosen": -133.27554321289062, "logps/rejected": -170.48594665527344, "loss": 0.6944, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009935155510902405, "rewards/margins": 0.019674377515912056, "rewards/rejected": -0.009739222005009651, "step": 22 }, { "epoch": 0.02768365654963593, "grad_norm": 78.69027024475307, "learning_rate": 4.799999999999999e-08, "logits/chosen": -1.4641175270080566, "logits/rejected": -1.6491130590438843, "logps/chosen": -139.53375244140625, "logps/rejected": -193.21438598632812, "loss": 0.6945, "rewards/accuracies": 0.46875, "rewards/chosen": -0.010036014020442963, "rewards/margins": 0.005420446861535311, "rewards/rejected": -0.015456462278962135, "step": 24 }, { "epoch": 0.02999062792877226, "grad_norm": 82.14181039722058, "learning_rate": 5.2e-08, "logits/chosen": -1.7350623607635498, "logits/rejected": -1.6639574766159058, "logps/chosen": -161.62164306640625, "logps/rejected": -160.58840942382812, "loss": 0.6926, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0011872733011841774, "rewards/margins": -0.006157425232231617, "rewards/rejected": 0.0049701533280313015, "step": 26 }, { "epoch": 0.032297599307908584, "grad_norm": 73.66413843287592, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -1.4280009269714355, "logits/rejected": -1.6318210363388062, "logps/chosen": -131.8518524169922, "logps/rejected": -166.72335815429688, "loss": 0.6964, "rewards/accuracies": 0.3125, "rewards/chosen": -0.01029287837445736, "rewards/margins": -0.014272996224462986, "rewards/rejected": 0.003980117850005627, "step": 28 }, { "epoch": 0.034604570687044915, "grad_norm": 86.40461346238433, "learning_rate": 6e-08, "logits/chosen": -1.6838908195495605, "logits/rejected": -1.7304034233093262, "logps/chosen": -124.49476623535156, "logps/rejected": -140.77841186523438, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0041793473064899445, "rewards/margins": -0.0030185298528522253, "rewards/rejected": 0.007197877857834101, "step": 30 }, { "epoch": 0.03691154206618124, "grad_norm": 79.23808836252253, "learning_rate": 6.4e-08, "logits/chosen": -1.5962588787078857, "logits/rejected": -1.549019455909729, "logps/chosen": -194.4256591796875, "logps/rejected": -237.36117553710938, "loss": 0.7003, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0013054789742454886, "rewards/margins": -0.018302934244275093, "rewards/rejected": 0.016997454687952995, "step": 32 }, { "epoch": 0.03921851344531757, "grad_norm": 74.59682312018288, "learning_rate": 6.8e-08, "logits/chosen": -1.740609884262085, "logits/rejected": -1.6540827751159668, "logps/chosen": -157.9348907470703, "logps/rejected": -158.6222686767578, "loss": 0.6889, "rewards/accuracies": 0.78125, "rewards/chosen": 0.01421279925853014, "rewards/margins": 0.03194922208786011, "rewards/rejected": -0.017736420035362244, "step": 34 }, { "epoch": 0.041525484824453895, "grad_norm": 79.71071945309694, "learning_rate": 7.2e-08, "logits/chosen": -1.5709240436553955, "logits/rejected": -1.6920911073684692, "logps/chosen": -173.94007873535156, "logps/rejected": -219.2288818359375, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": -0.011220686137676239, "rewards/margins": -0.008418139070272446, "rewards/rejected": -0.0028025463689118624, "step": 36 }, { "epoch": 0.04383245620359023, "grad_norm": 86.21753265595521, "learning_rate": 7.599999999999999e-08, "logits/chosen": -1.7426936626434326, "logits/rejected": -1.5694864988327026, "logps/chosen": -159.00730895996094, "logps/rejected": -146.18772888183594, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": -0.01584451086819172, "rewards/margins": -0.012521232478320599, "rewards/rejected": -0.003323277225717902, "step": 38 }, { "epoch": 0.04613942758272655, "grad_norm": 76.87714834723751, "learning_rate": 8e-08, "logits/chosen": -1.7129234075546265, "logits/rejected": -1.6838488578796387, "logps/chosen": -191.7474822998047, "logps/rejected": -162.81466674804688, "loss": 0.6986, "rewards/accuracies": 0.5, "rewards/chosen": 0.0020668436773121357, "rewards/margins": -0.008048251271247864, "rewards/rejected": 0.010115095414221287, "step": 40 }, { "epoch": 0.04844639896186288, "grad_norm": 71.62490665795943, "learning_rate": 8.4e-08, "logits/chosen": -1.7202041149139404, "logits/rejected": -1.7000675201416016, "logps/chosen": -157.0576934814453, "logps/rejected": -186.76138305664062, "loss": 0.6889, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0023848186247050762, "rewards/margins": 0.002543981885537505, "rewards/rejected": -0.00015916326083242893, "step": 42 }, { "epoch": 0.05075337034099921, "grad_norm": 81.75722048702342, "learning_rate": 8.8e-08, "logits/chosen": -1.5299909114837646, "logits/rejected": -1.5974533557891846, "logps/chosen": -158.53753662109375, "logps/rejected": -179.9112548828125, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": 0.006203922443091869, "rewards/margins": 0.02108878269791603, "rewards/rejected": -0.014884857460856438, "step": 44 }, { "epoch": 0.05306034172013553, "grad_norm": 78.9572151515279, "learning_rate": 9.2e-08, "logits/chosen": -1.6293164491653442, "logits/rejected": -1.6444960832595825, "logps/chosen": -152.41412353515625, "logps/rejected": -170.09869384765625, "loss": 0.6925, "rewards/accuracies": 0.625, "rewards/chosen": 0.010927575640380383, "rewards/margins": 0.0014875519555062056, "rewards/rejected": 0.009440025314688683, "step": 46 }, { "epoch": 0.05536731309927186, "grad_norm": 81.84986800244135, "learning_rate": 9.599999999999999e-08, "logits/chosen": -1.7789497375488281, "logits/rejected": -1.7547317743301392, "logps/chosen": -214.71987915039062, "logps/rejected": -240.7999725341797, "loss": 0.6889, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0025361552834510803, "rewards/margins": 0.006769699975848198, "rewards/rejected": -0.004233543295413256, "step": 48 }, { "epoch": 0.05767428447840819, "grad_norm": 76.76668544313137, "learning_rate": 1e-07, "logits/chosen": -1.491020917892456, "logits/rejected": -1.5447454452514648, "logps/chosen": -201.1317901611328, "logps/rejected": -267.04248046875, "loss": 0.6974, "rewards/accuracies": 0.5, "rewards/chosen": -0.005572921596467495, "rewards/margins": 0.01456289179623127, "rewards/rejected": -0.02013581432402134, "step": 50 }, { "epoch": 0.05998125585754452, "grad_norm": 84.23078533480681, "learning_rate": 1.04e-07, "logits/chosen": -1.6120061874389648, "logits/rejected": -1.7649461030960083, "logps/chosen": -183.1412353515625, "logps/rejected": -282.4610290527344, "loss": 0.695, "rewards/accuracies": 0.3125, "rewards/chosen": -0.02553856186568737, "rewards/margins": -0.038517288863658905, "rewards/rejected": 0.012978724204003811, "step": 52 }, { "epoch": 0.06228822723668084, "grad_norm": 92.0173388072239, "learning_rate": 1.08e-07, "logits/chosen": -1.4607347249984741, "logits/rejected": -1.625195026397705, "logps/chosen": -152.8062744140625, "logps/rejected": -219.1183624267578, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": -0.012677345424890518, "rewards/margins": -0.032261885702610016, "rewards/rejected": 0.01958453841507435, "step": 54 }, { "epoch": 0.06459519861581717, "grad_norm": 79.32388836348366, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -1.6375060081481934, "logits/rejected": -1.5966099500656128, "logps/chosen": -223.85108947753906, "logps/rejected": -256.30072021484375, "loss": 0.699, "rewards/accuracies": 0.40625, "rewards/chosen": -0.023084305226802826, "rewards/margins": -0.02949170023202896, "rewards/rejected": 0.006407391745597124, "step": 56 }, { "epoch": 0.0669021699949535, "grad_norm": 69.72321013611293, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -1.5106103420257568, "logits/rejected": -1.596940517425537, "logps/chosen": -188.92218017578125, "logps/rejected": -275.0534973144531, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": 0.017192328348755836, "rewards/margins": 0.03240702301263809, "rewards/rejected": -0.015214694663882256, "step": 58 }, { "epoch": 0.06920914137408983, "grad_norm": 72.64787237644839, "learning_rate": 1.2e-07, "logits/chosen": -1.667373776435852, "logits/rejected": -1.6947064399719238, "logps/chosen": -103.05116271972656, "logps/rejected": -151.50807189941406, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00641946354880929, "rewards/margins": 0.032484300434589386, "rewards/rejected": -0.026064833626151085, "step": 60 }, { "epoch": 0.07151611275322615, "grad_norm": 75.24112563737103, "learning_rate": 1.24e-07, "logits/chosen": -1.5305185317993164, "logits/rejected": -1.6095894575119019, "logps/chosen": -154.5998077392578, "logps/rejected": -176.33970642089844, "loss": 0.6965, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00010838371235877275, "rewards/margins": 0.006697986274957657, "rewards/rejected": -0.006589602679014206, "step": 62 }, { "epoch": 0.07382308413236248, "grad_norm": 70.73819169103417, "learning_rate": 1.28e-07, "logits/chosen": -1.6058859825134277, "logits/rejected": -1.6498842239379883, "logps/chosen": -151.62286376953125, "logps/rejected": -169.5069122314453, "loss": 0.6927, "rewards/accuracies": 0.46875, "rewards/chosen": 0.003975578583776951, "rewards/margins": 0.006644865497946739, "rewards/rejected": -0.0026692876126617193, "step": 64 }, { "epoch": 0.0761300555114988, "grad_norm": 79.80905942384086, "learning_rate": 1.32e-07, "logits/chosen": -1.566676139831543, "logits/rejected": -1.635036587715149, "logps/chosen": -213.95590209960938, "logps/rejected": -268.84747314453125, "loss": 0.6878, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0030877357348799706, "rewards/margins": -0.004700601100921631, "rewards/rejected": 0.0016128652496263385, "step": 66 }, { "epoch": 0.07843702689063514, "grad_norm": 68.290418726697, "learning_rate": 1.36e-07, "logits/chosen": -1.625745177268982, "logits/rejected": -1.7014200687408447, "logps/chosen": -191.14842224121094, "logps/rejected": -222.51779174804688, "loss": 0.6954, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0007452260470017791, "rewards/margins": 0.00873138289898634, "rewards/rejected": -0.007986157201230526, "step": 68 }, { "epoch": 0.08074399826977147, "grad_norm": 79.51788387191074, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -1.5817363262176514, "logits/rejected": -1.6993348598480225, "logps/chosen": -131.7422637939453, "logps/rejected": -162.98304748535156, "loss": 0.6941, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0074032689444720745, "rewards/margins": -0.010667804628610611, "rewards/rejected": 0.003264536615461111, "step": 70 }, { "epoch": 0.08305096964890779, "grad_norm": 83.37639460390962, "learning_rate": 1.44e-07, "logits/chosen": -1.5910240411758423, "logits/rejected": -1.6661615371704102, "logps/chosen": -181.304931640625, "logps/rejected": -209.23526000976562, "loss": 0.6943, "rewards/accuracies": 0.375, "rewards/chosen": -0.0042681945487856865, "rewards/margins": -0.02247396856546402, "rewards/rejected": 0.018205774948000908, "step": 72 }, { "epoch": 0.08535794102804412, "grad_norm": 85.04605551266509, "learning_rate": 1.48e-07, "logits/chosen": -1.5876970291137695, "logits/rejected": -1.7304015159606934, "logps/chosen": -146.5454559326172, "logps/rejected": -182.179931640625, "loss": 0.6891, "rewards/accuracies": 0.46875, "rewards/chosen": -0.011930807493627071, "rewards/margins": -0.013042710721492767, "rewards/rejected": 0.0011119036935269833, "step": 74 }, { "epoch": 0.08766491240718045, "grad_norm": 71.97159401776734, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -1.6893718242645264, "logits/rejected": -1.6752575635910034, "logps/chosen": -163.57489013671875, "logps/rejected": -162.34803771972656, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0007063052617013454, "rewards/margins": 0.002567308023571968, "rewards/rejected": -0.0018610022962093353, "step": 76 }, { "epoch": 0.08997188378631678, "grad_norm": 80.09910611023234, "learning_rate": 1.56e-07, "logits/chosen": -1.612238883972168, "logits/rejected": -1.5296804904937744, "logps/chosen": -143.39723205566406, "logps/rejected": -165.65318298339844, "loss": 0.6871, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009035947732627392, "rewards/margins": 0.025550464168190956, "rewards/rejected": -0.01651451550424099, "step": 78 }, { "epoch": 0.0922788551654531, "grad_norm": 70.85997602518799, "learning_rate": 1.6e-07, "logits/chosen": -1.6167306900024414, "logits/rejected": -1.720908522605896, "logps/chosen": -137.2986602783203, "logps/rejected": -246.95404052734375, "loss": 0.693, "rewards/accuracies": 0.625, "rewards/chosen": 0.006022963672876358, "rewards/margins": 0.008436123840510845, "rewards/rejected": -0.0024131599348038435, "step": 80 }, { "epoch": 0.09458582654458943, "grad_norm": 73.43468397188529, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -1.7178070545196533, "logits/rejected": -1.7651526927947998, "logps/chosen": -154.39617919921875, "logps/rejected": -187.47491455078125, "loss": 0.6985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.014386245980858803, "rewards/margins": -0.011840756982564926, "rewards/rejected": -0.002545490860939026, "step": 82 }, { "epoch": 0.09689279792372577, "grad_norm": 82.20956953972079, "learning_rate": 1.68e-07, "logits/chosen": -1.5902974605560303, "logits/rejected": -1.604806900024414, "logps/chosen": -127.57711029052734, "logps/rejected": -146.8506317138672, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": -0.006446457467973232, "rewards/margins": 0.008786877617239952, "rewards/rejected": -0.015233333222568035, "step": 84 }, { "epoch": 0.09919976930286209, "grad_norm": 76.58342600254258, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -1.6294573545455933, "logits/rejected": -1.6458450555801392, "logps/chosen": -248.47845458984375, "logps/rejected": -246.7737579345703, "loss": 0.6895, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009375479072332382, "rewards/margins": 0.025924015790224075, "rewards/rejected": -0.03529949486255646, "step": 86 }, { "epoch": 0.10150674068199841, "grad_norm": 73.05738349044326, "learning_rate": 1.76e-07, "logits/chosen": -1.84279465675354, "logits/rejected": -1.7476646900177002, "logps/chosen": -153.08554077148438, "logps/rejected": -154.7803497314453, "loss": 0.6909, "rewards/accuracies": 0.53125, "rewards/chosen": -0.002370176836848259, "rewards/margins": -0.0033320121001452208, "rewards/rejected": 0.0009618350304663181, "step": 88 }, { "epoch": 0.10381371206113474, "grad_norm": 77.95992122604585, "learning_rate": 1.8e-07, "logits/chosen": -1.6355715990066528, "logits/rejected": -1.6984450817108154, "logps/chosen": -169.7340087890625, "logps/rejected": -185.59031677246094, "loss": 0.6929, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01826353184878826, "rewards/margins": -0.008648518472909927, "rewards/rejected": -0.009615011513233185, "step": 90 }, { "epoch": 0.10612068344027106, "grad_norm": 74.70608110655593, "learning_rate": 1.84e-07, "logits/chosen": -1.5153872966766357, "logits/rejected": -1.5389485359191895, "logps/chosen": -214.6402130126953, "logps/rejected": -224.4317626953125, "loss": 0.6877, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00619399081915617, "rewards/margins": 0.011179441586136818, "rewards/rejected": -0.004985451698303223, "step": 92 }, { "epoch": 0.1084276548194074, "grad_norm": 74.73028912491218, "learning_rate": 1.88e-07, "logits/chosen": -1.5655710697174072, "logits/rejected": -1.5568063259124756, "logps/chosen": -170.8540496826172, "logps/rejected": -195.4169158935547, "loss": 0.6887, "rewards/accuracies": 0.5625, "rewards/chosen": -0.016467105597257614, "rewards/margins": 0.011973596177995205, "rewards/rejected": -0.028440698981285095, "step": 94 }, { "epoch": 0.11073462619854373, "grad_norm": 81.65007402302682, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -1.7869484424591064, "logits/rejected": -1.7626042366027832, "logps/chosen": -208.46002197265625, "logps/rejected": -256.5970458984375, "loss": 0.6846, "rewards/accuracies": 0.5625, "rewards/chosen": -0.009895925410091877, "rewards/margins": 0.011812476441264153, "rewards/rejected": -0.021708402782678604, "step": 96 }, { "epoch": 0.11304159757768005, "grad_norm": 77.09604224965807, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -1.6421016454696655, "logits/rejected": -1.5873386859893799, "logps/chosen": -176.68853759765625, "logps/rejected": -188.64544677734375, "loss": 0.69, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01306343637406826, "rewards/margins": 0.021781262010335922, "rewards/rejected": -0.03484470024704933, "step": 98 }, { "epoch": 0.11534856895681637, "grad_norm": 77.37699461097638, "learning_rate": 2e-07, "logits/chosen": -1.5747566223144531, "logits/rejected": -1.5757074356079102, "logps/chosen": -146.1053009033203, "logps/rejected": -177.65733337402344, "loss": 0.6926, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01261211559176445, "rewards/margins": 0.005542438477277756, "rewards/rejected": -0.018154552206397057, "step": 100 }, { "epoch": 0.11534856895681637, "eval_logits/chosen": -1.592301368713379, "eval_logits/rejected": -1.4917248487472534, "eval_logps/chosen": -185.32534790039062, "eval_logps/rejected": -150.51693725585938, "eval_loss": 0.6938029527664185, "eval_rewards/accuracies": 0.4000000059604645, "eval_rewards/chosen": -0.014318165369331837, "eval_rewards/margins": -0.014164167456328869, "eval_rewards/rejected": -0.00015399709809571505, "eval_runtime": 22.8572, "eval_samples_per_second": 4.375, "eval_steps_per_second": 1.094, "step": 100 }, { "epoch": 0.11765554033595271, "grad_norm": 83.18929756458088, "learning_rate": 1.9999925887938156e-07, "logits/chosen": -1.553455114364624, "logits/rejected": -1.6009831428527832, "logps/chosen": -171.79664611816406, "logps/rejected": -223.1472930908203, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0011008224682882428, "rewards/margins": 0.010834511369466782, "rewards/rejected": -0.009733689948916435, "step": 102 }, { "epoch": 0.11996251171508904, "grad_norm": 73.85193862190509, "learning_rate": 1.9999703552851146e-07, "logits/chosen": -1.7583006620407104, "logits/rejected": -1.714582920074463, "logps/chosen": -209.88302612304688, "logps/rejected": -255.11888122558594, "loss": 0.6913, "rewards/accuracies": 0.4375, "rewards/chosen": -0.030152078717947006, "rewards/margins": -0.011741320602595806, "rewards/rejected": -0.018410757184028625, "step": 104 }, { "epoch": 0.12226948309422536, "grad_norm": 73.38727733625704, "learning_rate": 1.9999332998034512e-07, "logits/chosen": -1.6966747045516968, "logits/rejected": -1.6100220680236816, "logps/chosen": -160.12281799316406, "logps/rejected": -167.38145446777344, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": -0.008994976989924908, "rewards/margins": 0.011080076918005943, "rewards/rejected": -0.020075054839253426, "step": 106 }, { "epoch": 0.12457645447336169, "grad_norm": 79.80728796393491, "learning_rate": 1.9998814228980768e-07, "logits/chosen": -1.6656932830810547, "logits/rejected": -1.7435060739517212, "logps/chosen": -156.0963897705078, "logps/rejected": -208.7286376953125, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": -0.01158140879124403, "rewards/margins": -0.003068419173359871, "rewards/rejected": -0.008512990549206734, "step": 108 }, { "epoch": 0.126883425852498, "grad_norm": 71.572270187751, "learning_rate": 1.9998147253379324e-07, "logits/chosen": -1.7250394821166992, "logits/rejected": -1.720632791519165, "logps/chosen": -143.606201171875, "logps/rejected": -164.64126586914062, "loss": 0.6911, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00029725395143032074, "rewards/margins": 0.004560052417218685, "rewards/rejected": -0.004857306368649006, "step": 110 }, { "epoch": 0.12919039723163434, "grad_norm": 76.7363281665151, "learning_rate": 1.999733208111637e-07, "logits/chosen": -1.680725336074829, "logits/rejected": -1.7269576787948608, "logps/chosen": -141.92416381835938, "logps/rejected": -163.63902282714844, "loss": 0.6878, "rewards/accuracies": 0.625, "rewards/chosen": -0.013096440583467484, "rewards/margins": 0.013841900043189526, "rewards/rejected": -0.026938341557979584, "step": 112 }, { "epoch": 0.13149736861077066, "grad_norm": 71.23095597503095, "learning_rate": 1.9996368724274726e-07, "logits/chosen": -1.7746036052703857, "logits/rejected": -1.651440978050232, "logps/chosen": -201.5678253173828, "logps/rejected": -208.96060180664062, "loss": 0.6814, "rewards/accuracies": 0.46875, "rewards/chosen": -0.020835982635617256, "rewards/margins": 0.0012882971204817295, "rewards/rejected": -0.02212427742779255, "step": 114 }, { "epoch": 0.133804339989907, "grad_norm": 78.35820889872501, "learning_rate": 1.999525719713366e-07, "logits/chosen": -1.6184967756271362, "logits/rejected": -1.6276531219482422, "logps/chosen": -138.03579711914062, "logps/rejected": -156.24818420410156, "loss": 0.6893, "rewards/accuracies": 0.53125, "rewards/chosen": -0.024172522127628326, "rewards/margins": -0.007286247797310352, "rewards/rejected": -0.0168862733989954, "step": 116 }, { "epoch": 0.13611131136904334, "grad_norm": 73.30378065799947, "learning_rate": 1.9993997516168685e-07, "logits/chosen": -1.5095572471618652, "logits/rejected": -1.4317773580551147, "logps/chosen": -168.31259155273438, "logps/rejected": -181.01010131835938, "loss": 0.6946, "rewards/accuracies": 0.53125, "rewards/chosen": -0.02184070646762848, "rewards/margins": -0.0038399603217840195, "rewards/rejected": -0.01800074614584446, "step": 118 }, { "epoch": 0.13841828274817966, "grad_norm": 76.67682193401895, "learning_rate": 1.9992589700051315e-07, "logits/chosen": -1.6505416631698608, "logits/rejected": -1.6528055667877197, "logps/chosen": -163.4833221435547, "logps/rejected": -173.32627868652344, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": -0.037532731890678406, "rewards/margins": -0.001648992532864213, "rewards/rejected": -0.035883739590644836, "step": 120 }, { "epoch": 0.14072525412731599, "grad_norm": 79.25219710625622, "learning_rate": 1.9991033769648782e-07, "logits/chosen": -1.6732072830200195, "logits/rejected": -1.6914747953414917, "logps/chosen": -192.20941162109375, "logps/rejected": -249.57064819335938, "loss": 0.6785, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02337605319917202, "rewards/margins": 0.03376854211091995, "rewards/rejected": -0.05714459717273712, "step": 122 }, { "epoch": 0.1430322255064523, "grad_norm": 74.1794316582206, "learning_rate": 1.9989329748023723e-07, "logits/chosen": -1.6055612564086914, "logits/rejected": -1.6374058723449707, "logps/chosen": -150.5140838623047, "logps/rejected": -178.90463256835938, "loss": 0.6838, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03467298671603203, "rewards/margins": 0.017992481589317322, "rewards/rejected": -0.05266546458005905, "step": 124 }, { "epoch": 0.14533919688558863, "grad_norm": 76.65585990685855, "learning_rate": 1.9987477660433854e-07, "logits/chosen": -1.6969408988952637, "logits/rejected": -1.7563108205795288, "logps/chosen": -142.4885711669922, "logps/rejected": -210.4172821044922, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.024329736828804016, "rewards/margins": 0.004793995060026646, "rewards/rejected": -0.029123730957508087, "step": 126 }, { "epoch": 0.14764616826472496, "grad_norm": 78.42733649878224, "learning_rate": 1.998547753433158e-07, "logits/chosen": -1.6231815814971924, "logits/rejected": -1.4993540048599243, "logps/chosen": -248.5255584716797, "logps/rejected": -283.0628662109375, "loss": 0.6867, "rewards/accuracies": 0.65625, "rewards/chosen": -0.022915348410606384, "rewards/margins": 0.04015136882662773, "rewards/rejected": -0.06306671351194382, "step": 128 }, { "epoch": 0.14995313964386128, "grad_norm": 81.34489884736102, "learning_rate": 1.9983329399363594e-07, "logits/chosen": -1.696123719215393, "logits/rejected": -1.5894306898117065, "logps/chosen": -157.25205993652344, "logps/rejected": -169.28765869140625, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02339826337993145, "rewards/margins": 0.029402071610093117, "rewards/rejected": -0.05280033499002457, "step": 130 }, { "epoch": 0.1522601110229976, "grad_norm": 71.86675386697708, "learning_rate": 1.998103328737044e-07, "logits/chosen": -1.614111304283142, "logits/rejected": -1.668984055519104, "logps/chosen": -169.32870483398438, "logps/rejected": -184.28652954101562, "loss": 0.694, "rewards/accuracies": 0.53125, "rewards/chosen": -0.032653309404850006, "rewards/margins": -0.012304544448852539, "rewards/rejected": -0.020348764955997467, "step": 132 }, { "epoch": 0.15456708240213396, "grad_norm": 81.85224601265395, "learning_rate": 1.9978589232386034e-07, "logits/chosen": -1.715609073638916, "logits/rejected": -1.7786422967910767, "logps/chosen": -167.58688354492188, "logps/rejected": -199.66270446777344, "loss": 0.6864, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03811431676149368, "rewards/margins": 0.017543859779834747, "rewards/rejected": -0.05565817654132843, "step": 134 }, { "epoch": 0.15687405378127028, "grad_norm": 73.7622516118343, "learning_rate": 1.9975997270637168e-07, "logits/chosen": -1.6321560144424438, "logits/rejected": -1.7015608549118042, "logps/chosen": -159.351318359375, "logps/rejected": -176.723876953125, "loss": 0.6859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02714327722787857, "rewards/margins": 0.021138466894626617, "rewards/rejected": -0.04828174412250519, "step": 136 }, { "epoch": 0.1591810251604066, "grad_norm": 71.96760531715911, "learning_rate": 1.997325744054297e-07, "logits/chosen": -1.5530474185943604, "logits/rejected": -1.5373188257217407, "logps/chosen": -158.63812255859375, "logps/rejected": -204.0651092529297, "loss": 0.6845, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01808898150920868, "rewards/margins": 0.017602307721972466, "rewards/rejected": -0.035691291093826294, "step": 138 }, { "epoch": 0.16148799653954293, "grad_norm": 73.93273755038686, "learning_rate": 1.9970369782714328e-07, "logits/chosen": -1.522450566291809, "logits/rejected": -1.635149598121643, "logps/chosen": -142.74459838867188, "logps/rejected": -149.5770721435547, "loss": 0.6894, "rewards/accuracies": 0.4375, "rewards/chosen": -0.025994691997766495, "rewards/margins": -0.006413338705897331, "rewards/rejected": -0.019581351429224014, "step": 140 }, { "epoch": 0.16379496791867926, "grad_norm": 79.55861212361464, "learning_rate": 1.99673343399533e-07, "logits/chosen": -1.525217056274414, "logits/rejected": -1.5952740907669067, "logps/chosen": -116.86170959472656, "logps/rejected": -175.00779724121094, "loss": 0.6832, "rewards/accuracies": 0.59375, "rewards/chosen": -0.020393915474414825, "rewards/margins": 0.032093390822410583, "rewards/rejected": -0.05248731002211571, "step": 142 }, { "epoch": 0.16610193929781558, "grad_norm": 83.97535534931897, "learning_rate": 1.9964151157252466e-07, "logits/chosen": -1.6767423152923584, "logits/rejected": -1.6693997383117676, "logps/chosen": -207.50936889648438, "logps/rejected": -216.3134002685547, "loss": 0.6789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03636579588055611, "rewards/margins": 0.03607642278075218, "rewards/rejected": -0.07244221866130829, "step": 144 }, { "epoch": 0.1684089106769519, "grad_norm": 73.85111113100436, "learning_rate": 1.996082028179428e-07, "logits/chosen": -1.4807971715927124, "logits/rejected": -1.4092496633529663, "logps/chosen": -168.455078125, "logps/rejected": -172.4587860107422, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": -0.06219344958662987, "rewards/margins": 0.011872416362166405, "rewards/rejected": -0.07406586408615112, "step": 146 }, { "epoch": 0.17071588205608823, "grad_norm": 80.66954314511047, "learning_rate": 1.9957341762950344e-07, "logits/chosen": -1.5618644952774048, "logits/rejected": -1.67661452293396, "logps/chosen": -114.58411407470703, "logps/rejected": -158.1700439453125, "loss": 0.6807, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04578070342540741, "rewards/margins": 0.029419898986816406, "rewards/rejected": -0.07520060241222382, "step": 148 }, { "epoch": 0.17302285343522458, "grad_norm": 71.43623661516392, "learning_rate": 1.9953715652280706e-07, "logits/chosen": -1.6976016759872437, "logits/rejected": -1.6299835443496704, "logps/chosen": -228.1553497314453, "logps/rejected": -214.32037353515625, "loss": 0.6848, "rewards/accuracies": 0.53125, "rewards/chosen": -0.06526876986026764, "rewards/margins": -0.002617661375552416, "rewards/rejected": -0.06265110522508621, "step": 150 }, { "epoch": 0.1753298248143609, "grad_norm": 85.31030453694517, "learning_rate": 1.9949942003533064e-07, "logits/chosen": -1.7211732864379883, "logits/rejected": -1.720245122909546, "logps/chosen": -138.57936096191406, "logps/rejected": -158.134521484375, "loss": 0.6829, "rewards/accuracies": 0.59375, "rewards/chosen": -0.042685676366090775, "rewards/margins": -0.006229763850569725, "rewards/rejected": -0.0364559069275856, "step": 152 }, { "epoch": 0.17763679619349723, "grad_norm": 75.37656195588123, "learning_rate": 1.9946020872642006e-07, "logits/chosen": -1.602712631225586, "logits/rejected": -1.5105926990509033, "logps/chosen": -152.95616149902344, "logps/rejected": -252.92359924316406, "loss": 0.6848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05936397612094879, "rewards/margins": 0.023188650608062744, "rewards/rejected": -0.08255261927843094, "step": 154 }, { "epoch": 0.17994376757263356, "grad_norm": 74.43256656918146, "learning_rate": 1.9941952317728147e-07, "logits/chosen": -1.6266837120056152, "logits/rejected": -1.5794254541397095, "logps/chosen": -154.70660400390625, "logps/rejected": -171.6692657470703, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": -0.0499938428401947, "rewards/margins": 0.022745870053768158, "rewards/rejected": -0.07273972034454346, "step": 156 }, { "epoch": 0.18225073895176988, "grad_norm": 75.91057680239186, "learning_rate": 1.993773639909728e-07, "logits/chosen": -1.49541437625885, "logits/rejected": -1.6966127157211304, "logps/chosen": -165.41343688964844, "logps/rejected": -208.544189453125, "loss": 0.6768, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02937229909002781, "rewards/margins": 0.061954449862241745, "rewards/rejected": -0.0913267433643341, "step": 158 }, { "epoch": 0.1845577103309062, "grad_norm": 79.53079693796676, "learning_rate": 1.99333731792395e-07, "logits/chosen": -1.5714216232299805, "logits/rejected": -1.543687343597412, "logps/chosen": -153.09767150878906, "logps/rejected": -177.41847229003906, "loss": 0.684, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0551866851747036, "rewards/margins": 0.03179415315389633, "rewards/rejected": -0.08698083460330963, "step": 160 }, { "epoch": 0.18686468171004253, "grad_norm": 73.91686194821585, "learning_rate": 1.9928862722828242e-07, "logits/chosen": -1.7037162780761719, "logits/rejected": -1.675144076347351, "logps/chosen": -153.01358032226562, "logps/rejected": -175.93673706054688, "loss": 0.6781, "rewards/accuracies": 0.59375, "rewards/chosen": -0.023145336657762527, "rewards/margins": 0.06261962652206421, "rewards/rejected": -0.08576496690511703, "step": 162 }, { "epoch": 0.18917165308917885, "grad_norm": 76.57698818122466, "learning_rate": 1.9924205096719357e-07, "logits/chosen": -1.5918736457824707, "logits/rejected": -1.4768625497817993, "logps/chosen": -196.1853485107422, "logps/rejected": -179.04530334472656, "loss": 0.6692, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04375737905502319, "rewards/margins": 0.0569818913936615, "rewards/rejected": -0.10073927044868469, "step": 164 }, { "epoch": 0.19147862446831518, "grad_norm": 77.81644602396882, "learning_rate": 1.9919400369950097e-07, "logits/chosen": -1.4722576141357422, "logits/rejected": -1.540255069732666, "logps/chosen": -205.6660614013672, "logps/rejected": -248.9489288330078, "loss": 0.6786, "rewards/accuracies": 0.71875, "rewards/chosen": -0.043852321803569794, "rewards/margins": 0.0406915545463562, "rewards/rejected": -0.084543876349926, "step": 166 }, { "epoch": 0.19378559584745153, "grad_norm": 75.44269350923808, "learning_rate": 1.9914448613738103e-07, "logits/chosen": -1.529039740562439, "logits/rejected": -1.5173804759979248, "logps/chosen": -202.2668914794922, "logps/rejected": -226.52684020996094, "loss": 0.6763, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07004385441541672, "rewards/margins": 0.021601226180791855, "rewards/rejected": -0.09164508432149887, "step": 168 }, { "epoch": 0.19609256722658785, "grad_norm": 76.79086636181425, "learning_rate": 1.9909349901480347e-07, "logits/chosen": -1.610205888748169, "logits/rejected": -1.622270107269287, "logps/chosen": -152.17263793945312, "logps/rejected": -153.09571838378906, "loss": 0.6826, "rewards/accuracies": 0.53125, "rewards/chosen": -0.049044616520404816, "rewards/margins": 0.020372100174427032, "rewards/rejected": -0.06941672414541245, "step": 170 }, { "epoch": 0.19839953860572418, "grad_norm": 65.6162753738054, "learning_rate": 1.990410430875205e-07, "logits/chosen": -1.6482963562011719, "logits/rejected": -1.6161506175994873, "logps/chosen": -131.635986328125, "logps/rejected": -142.7827606201172, "loss": 0.6691, "rewards/accuracies": 0.78125, "rewards/chosen": -0.028512008488178253, "rewards/margins": 0.06365156173706055, "rewards/rejected": -0.0921635702252388, "step": 172 }, { "epoch": 0.2007065099848605, "grad_norm": 90.14268267387868, "learning_rate": 1.9898711913305547e-07, "logits/chosen": -1.5566825866699219, "logits/rejected": -1.6173129081726074, "logps/chosen": -174.24017333984375, "logps/rejected": -181.01629638671875, "loss": 0.6766, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05642353743314743, "rewards/margins": 0.03944730758666992, "rewards/rejected": -0.09587083756923676, "step": 174 }, { "epoch": 0.20301348136399683, "grad_norm": 76.41450508388954, "learning_rate": 1.9893172795069142e-07, "logits/chosen": -1.5998440980911255, "logits/rejected": -1.6545708179473877, "logps/chosen": -156.8663330078125, "logps/rejected": -159.1892547607422, "loss": 0.6915, "rewards/accuracies": 0.4375, "rewards/chosen": -0.05183999985456467, "rewards/margins": 0.0032500806264579296, "rewards/rejected": -0.055090077221393585, "step": 176 }, { "epoch": 0.20532045274313315, "grad_norm": 87.56584383747318, "learning_rate": 1.988748703614594e-07, "logits/chosen": -1.6627997159957886, "logits/rejected": -1.6540586948394775, "logps/chosen": -155.84368896484375, "logps/rejected": -186.67495727539062, "loss": 0.6755, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03810984268784523, "rewards/margins": 0.029699210077524185, "rewards/rejected": -0.06780905276536942, "step": 178 }, { "epoch": 0.20762742412226948, "grad_norm": 70.4422512373765, "learning_rate": 1.9881654720812592e-07, "logits/chosen": -1.5361154079437256, "logits/rejected": -1.610466480255127, "logps/chosen": -115.56837463378906, "logps/rejected": -142.57766723632812, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": -0.02763887494802475, "rewards/margins": 0.0444108322262764, "rewards/rejected": -0.07204970717430115, "step": 180 }, { "epoch": 0.2099343955014058, "grad_norm": 71.9817063853092, "learning_rate": 1.9875675935518094e-07, "logits/chosen": -1.547518014907837, "logits/rejected": -1.5500645637512207, "logps/chosen": -226.55401611328125, "logps/rejected": -206.99913024902344, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.09524687379598618, "rewards/margins": 0.004950803238898516, "rewards/rejected": -0.1001976728439331, "step": 182 }, { "epoch": 0.21224136688054213, "grad_norm": 84.4720159255109, "learning_rate": 1.9869550768882454e-07, "logits/chosen": -1.5599523782730103, "logits/rejected": -1.5133187770843506, "logps/chosen": -182.1778564453125, "logps/rejected": -241.0075225830078, "loss": 0.6671, "rewards/accuracies": 0.6875, "rewards/chosen": -0.057929787784814835, "rewards/margins": 0.0736684501171112, "rewards/rejected": -0.13159823417663574, "step": 184 }, { "epoch": 0.21454833825967848, "grad_norm": 73.76638464490958, "learning_rate": 1.9863279311695428e-07, "logits/chosen": -1.4902362823486328, "logits/rejected": -1.55423903465271, "logps/chosen": -219.845703125, "logps/rejected": -273.73712158203125, "loss": 0.6773, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07643218338489532, "rewards/margins": 0.07768993079662323, "rewards/rejected": -0.15412212908267975, "step": 186 }, { "epoch": 0.2168553096388148, "grad_norm": 68.86254513199266, "learning_rate": 1.985686165691514e-07, "logits/chosen": -1.704699993133545, "logits/rejected": -1.6342915296554565, "logps/chosen": -120.14341735839844, "logps/rejected": -114.1607666015625, "loss": 0.6819, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03834118694067001, "rewards/margins": -0.00312834233045578, "rewards/rejected": -0.03521284461021423, "step": 188 }, { "epoch": 0.21916228101795113, "grad_norm": 77.39475970610044, "learning_rate": 1.9850297899666707e-07, "logits/chosen": -1.6166346073150635, "logits/rejected": -1.666224479675293, "logps/chosen": -138.47250366210938, "logps/rejected": -183.75860595703125, "loss": 0.6784, "rewards/accuracies": 0.5625, "rewards/chosen": -0.059065092355012894, "rewards/margins": 0.0348266139626503, "rewards/rejected": -0.09389171749353409, "step": 190 }, { "epoch": 0.22146925239708745, "grad_norm": 79.62622809714712, "learning_rate": 1.9843588137240855e-07, "logits/chosen": -1.4786595106124878, "logits/rejected": -1.5819900035858154, "logps/chosen": -156.80630493164062, "logps/rejected": -225.65887451171875, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": -0.06785481423139572, "rewards/margins": 0.046741731464862823, "rewards/rejected": -0.11459654569625854, "step": 192 }, { "epoch": 0.22377622377622378, "grad_norm": 71.61908905732174, "learning_rate": 1.9836732469092446e-07, "logits/chosen": -1.7382750511169434, "logits/rejected": -1.7238702774047852, "logps/chosen": -135.97625732421875, "logps/rejected": -134.9537353515625, "loss": 0.6751, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07399953901767731, "rewards/margins": -0.0150267593562603, "rewards/rejected": -0.05897277966141701, "step": 194 }, { "epoch": 0.2260831951553601, "grad_norm": 77.39998086488785, "learning_rate": 1.982973099683902e-07, "logits/chosen": -1.6806734800338745, "logits/rejected": -1.7225916385650635, "logps/chosen": -139.36279296875, "logps/rejected": -160.1461639404297, "loss": 0.6592, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05896836146712303, "rewards/margins": 0.05167616903781891, "rewards/rejected": -0.11064451932907104, "step": 196 }, { "epoch": 0.22839016653449642, "grad_norm": 71.42176978474515, "learning_rate": 1.982258382425928e-07, "logits/chosen": -1.53923499584198, "logits/rejected": -1.5509108304977417, "logps/chosen": -145.9883575439453, "logps/rejected": -173.95388793945312, "loss": 0.6819, "rewards/accuracies": 0.75, "rewards/chosen": -0.06814471632242203, "rewards/margins": 0.05520961806178093, "rewards/rejected": -0.12335430830717087, "step": 198 }, { "epoch": 0.23069713791363275, "grad_norm": 65.61388633290626, "learning_rate": 1.9815291057291578e-07, "logits/chosen": -1.5758477449417114, "logits/rejected": -1.6140058040618896, "logps/chosen": -105.85581970214844, "logps/rejected": -122.97176361083984, "loss": 0.679, "rewards/accuracies": 0.625, "rewards/chosen": -0.06785964965820312, "rewards/margins": 0.01952926628291607, "rewards/rejected": -0.08738891780376434, "step": 200 }, { "epoch": 0.23069713791363275, "eval_logits/chosen": -1.5610222816467285, "eval_logits/rejected": -1.462950587272644, "eval_logps/chosen": -186.2912139892578, "eval_logps/rejected": -151.6300048828125, "eval_loss": 0.6922155618667603, "eval_rewards/accuracies": 0.5600000023841858, "eval_rewards/chosen": -0.11090204119682312, "eval_rewards/margins": 0.0005596327828243375, "eval_rewards/rejected": -0.11146167665719986, "eval_runtime": 21.7555, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 200 }, { "epoch": 0.23300410929276907, "grad_norm": 69.78727065961301, "learning_rate": 1.9807852804032302e-07, "logits/chosen": -1.4734337329864502, "logits/rejected": -1.491389513015747, "logps/chosen": -154.9561767578125, "logps/rejected": -204.73797607421875, "loss": 0.6705, "rewards/accuracies": 0.71875, "rewards/chosen": -0.046838290989398956, "rewards/margins": 0.11147616803646088, "rewards/rejected": -0.15831446647644043, "step": 202 }, { "epoch": 0.23531108067190543, "grad_norm": 82.59595988703826, "learning_rate": 1.980026917473432e-07, "logits/chosen": -1.5889283418655396, "logits/rejected": -1.7049566507339478, "logps/chosen": -174.74598693847656, "logps/rejected": -223.11842346191406, "loss": 0.6737, "rewards/accuracies": 0.75, "rewards/chosen": -0.05181250721216202, "rewards/margins": 0.09021350741386414, "rewards/rejected": -0.14202602207660675, "step": 204 }, { "epoch": 0.23761805205104175, "grad_norm": 67.38723201323596, "learning_rate": 1.9792540281805298e-07, "logits/chosen": -1.4892499446868896, "logits/rejected": -1.517817497253418, "logps/chosen": -140.6378631591797, "logps/rejected": -161.47348022460938, "loss": 0.6682, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08649797737598419, "rewards/margins": 0.03692768141627312, "rewards/rejected": -0.12342565506696701, "step": 206 }, { "epoch": 0.23992502343017807, "grad_norm": 73.06374890842753, "learning_rate": 1.9784666239806089e-07, "logits/chosen": -1.5101206302642822, "logits/rejected": -1.5768136978149414, "logps/chosen": -164.27035522460938, "logps/rejected": -203.25975036621094, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": -0.06570874899625778, "rewards/margins": 0.09578941017389297, "rewards/rejected": -0.16149815917015076, "step": 208 }, { "epoch": 0.2422319948093144, "grad_norm": 75.60130708995507, "learning_rate": 1.9776647165448983e-07, "logits/chosen": -1.5699687004089355, "logits/rejected": -1.520723581314087, "logps/chosen": -188.6508331298828, "logps/rejected": -217.18365478515625, "loss": 0.6708, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08720759302377701, "rewards/margins": 0.02055392973124981, "rewards/rejected": -0.10776151716709137, "step": 210 }, { "epoch": 0.24453896618845072, "grad_norm": 76.08847895651103, "learning_rate": 1.9768483177596006e-07, "logits/chosen": -1.5900119543075562, "logits/rejected": -1.6237448453903198, "logps/chosen": -143.37664794921875, "logps/rejected": -166.52413940429688, "loss": 0.6689, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0496542751789093, "rewards/margins": 0.06798863410949707, "rewards/rejected": -0.11764290928840637, "step": 212 }, { "epoch": 0.24684593756758705, "grad_norm": 85.55002048517719, "learning_rate": 1.9760174397257153e-07, "logits/chosen": -1.5799341201782227, "logits/rejected": -1.5739325284957886, "logps/chosen": -187.42578125, "logps/rejected": -227.91946411132812, "loss": 0.6871, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11932831257581711, "rewards/margins": 0.00022871512919664383, "rewards/rejected": -0.11955701559782028, "step": 214 }, { "epoch": 0.24915290894672337, "grad_norm": 79.3908422125236, "learning_rate": 1.97517209475886e-07, "logits/chosen": -1.578735589981079, "logits/rejected": -1.7003827095031738, "logps/chosen": -147.41778564453125, "logps/rejected": -185.31602478027344, "loss": 0.6678, "rewards/accuracies": 0.78125, "rewards/chosen": -0.07496561855077744, "rewards/margins": 0.0857120081782341, "rewards/rejected": -0.16067762672901154, "step": 216 }, { "epoch": 0.2514598803258597, "grad_norm": 78.5386783375159, "learning_rate": 1.9743122953890854e-07, "logits/chosen": -1.5871162414550781, "logits/rejected": -1.5231672525405884, "logps/chosen": -174.3480682373047, "logps/rejected": -196.07998657226562, "loss": 0.6548, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07399033010005951, "rewards/margins": 0.05520808696746826, "rewards/rejected": -0.12919840216636658, "step": 218 }, { "epoch": 0.253766851704996, "grad_norm": 80.06682238716625, "learning_rate": 1.9734380543606927e-07, "logits/chosen": -1.643662452697754, "logits/rejected": -1.6430741548538208, "logps/chosen": -200.37826538085938, "logps/rejected": -207.41136169433594, "loss": 0.6815, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07183612138032913, "rewards/margins": 0.07066242396831512, "rewards/rejected": -0.14249853789806366, "step": 220 }, { "epoch": 0.25607382308413235, "grad_norm": 73.47957521005397, "learning_rate": 1.972549384632043e-07, "logits/chosen": -1.5852243900299072, "logits/rejected": -1.736151099205017, "logps/chosen": -167.52194213867188, "logps/rejected": -218.90122985839844, "loss": 0.6604, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0951998233795166, "rewards/margins": 0.027591748163104057, "rewards/rejected": -0.12279157340526581, "step": 222 }, { "epoch": 0.25838079446326867, "grad_norm": 79.05067580811021, "learning_rate": 1.9716462993753655e-07, "logits/chosen": -1.476207971572876, "logits/rejected": -1.5456207990646362, "logps/chosen": -288.57379150390625, "logps/rejected": -338.8498840332031, "loss": 0.6567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17371979355812073, "rewards/margins": 0.15009327232837677, "rewards/rejected": -0.3238130807876587, "step": 224 }, { "epoch": 0.260687765842405, "grad_norm": 67.9414989656304, "learning_rate": 1.9707288119765622e-07, "logits/chosen": -1.5781480073928833, "logits/rejected": -1.569219708442688, "logps/chosen": -124.80656433105469, "logps/rejected": -141.52476501464844, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": -0.12547817826271057, "rewards/margins": 0.03562304750084877, "rewards/rejected": -0.16110120713710785, "step": 226 }, { "epoch": 0.2629947372215413, "grad_norm": 78.11670530443735, "learning_rate": 1.9697969360350095e-07, "logits/chosen": -1.6346409320831299, "logits/rejected": -1.565224051475525, "logps/chosen": -178.9912109375, "logps/rejected": -190.82681274414062, "loss": 0.6661, "rewards/accuracies": 0.625, "rewards/chosen": -0.10180149972438812, "rewards/margins": 0.05022910237312317, "rewards/rejected": -0.1520306020975113, "step": 228 }, { "epoch": 0.2653017086006777, "grad_norm": 68.3329236115507, "learning_rate": 1.968850685363357e-07, "logits/chosen": -1.7000384330749512, "logits/rejected": -1.7287462949752808, "logps/chosen": -199.75430297851562, "logps/rejected": -241.5220947265625, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": -0.09640266001224518, "rewards/margins": 0.09287622570991516, "rewards/rejected": -0.18927887082099915, "step": 230 }, { "epoch": 0.267608679979814, "grad_norm": 82.26094651176318, "learning_rate": 1.9678900739873226e-07, "logits/chosen": -1.677142858505249, "logits/rejected": -1.6745737791061401, "logps/chosen": -170.59425354003906, "logps/rejected": -181.05661010742188, "loss": 0.6694, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1134408637881279, "rewards/margins": 0.036339692771434784, "rewards/rejected": -0.14978057146072388, "step": 232 }, { "epoch": 0.26991565135895035, "grad_norm": 78.36181831181821, "learning_rate": 1.966915116145484e-07, "logits/chosen": -1.4915921688079834, "logits/rejected": -1.523095726966858, "logps/chosen": -155.88290405273438, "logps/rejected": -164.45022583007812, "loss": 0.6567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0988837480545044, "rewards/margins": 0.09765380620956421, "rewards/rejected": -0.196537584066391, "step": 234 }, { "epoch": 0.2722226227380867, "grad_norm": 83.64468364737313, "learning_rate": 1.965925826289068e-07, "logits/chosen": -1.6482906341552734, "logits/rejected": -1.6469372510910034, "logps/chosen": -185.45001220703125, "logps/rejected": -208.0437469482422, "loss": 0.6708, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0873761773109436, "rewards/margins": 0.06748253107070923, "rewards/rejected": -0.15485870838165283, "step": 236 }, { "epoch": 0.274529594117223, "grad_norm": 74.16820675500993, "learning_rate": 1.964922219081738e-07, "logits/chosen": -1.764983057975769, "logits/rejected": -1.7145969867706299, "logps/chosen": -223.3017578125, "logps/rejected": -218.1916046142578, "loss": 0.6555, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11543229222297668, "rewards/margins": 0.12305162101984024, "rewards/rejected": -0.23848390579223633, "step": 238 }, { "epoch": 0.2768365654963593, "grad_norm": 75.00121800473413, "learning_rate": 1.9639043093993727e-07, "logits/chosen": -1.5264173746109009, "logits/rejected": -1.4717910289764404, "logps/chosen": -178.43338012695312, "logps/rejected": -188.60101318359375, "loss": 0.6481, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09335748851299286, "rewards/margins": 0.026005972176790237, "rewards/rejected": -0.1193634569644928, "step": 240 }, { "epoch": 0.27914353687549565, "grad_norm": 64.46972140040022, "learning_rate": 1.9628721123298492e-07, "logits/chosen": -1.6837042570114136, "logits/rejected": -1.6980068683624268, "logps/chosen": -161.4723663330078, "logps/rejected": -171.20248413085938, "loss": 0.6609, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11237211525440216, "rewards/margins": 0.049555521458387375, "rewards/rejected": -0.16192764043807983, "step": 242 }, { "epoch": 0.28145050825463197, "grad_norm": 66.85001786944659, "learning_rate": 1.961825643172819e-07, "logits/chosen": -1.5771496295928955, "logits/rejected": -1.5039366483688354, "logps/chosen": -158.33685302734375, "logps/rejected": -160.24057006835938, "loss": 0.6701, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14522022008895874, "rewards/margins": 0.04340605437755585, "rewards/rejected": -0.1886262595653534, "step": 244 }, { "epoch": 0.2837574796337683, "grad_norm": 76.39303352760503, "learning_rate": 1.9607649174394787e-07, "logits/chosen": -1.4101349115371704, "logits/rejected": -1.4513871669769287, "logps/chosen": -147.43826293945312, "logps/rejected": -182.31005859375, "loss": 0.6596, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08130650967359543, "rewards/margins": 0.0953046903014183, "rewards/rejected": -0.17661119997501373, "step": 246 }, { "epoch": 0.2860644510129046, "grad_norm": 84.03579410656208, "learning_rate": 1.959689950852343e-07, "logits/chosen": -1.6520403623580933, "logits/rejected": -1.6739228963851929, "logps/chosen": -172.19305419921875, "logps/rejected": -184.803466796875, "loss": 0.6669, "rewards/accuracies": 0.53125, "rewards/chosen": -0.16027307510375977, "rewards/margins": 0.02343956008553505, "rewards/rejected": -0.1837126612663269, "step": 248 }, { "epoch": 0.28837142239204094, "grad_norm": 78.31280460476106, "learning_rate": 1.9586007593450095e-07, "logits/chosen": -1.568188190460205, "logits/rejected": -1.586582064628601, "logps/chosen": -169.95675659179688, "logps/rejected": -188.78858947753906, "loss": 0.6779, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1373595893383026, "rewards/margins": 0.02867070771753788, "rewards/rejected": -0.16603030264377594, "step": 250 }, { "epoch": 0.29067839377117727, "grad_norm": 77.82835801736759, "learning_rate": 1.957497359061924e-07, "logits/chosen": -1.5796047449111938, "logits/rejected": -1.5543608665466309, "logps/chosen": -191.53219604492188, "logps/rejected": -220.70361328125, "loss": 0.6393, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15552642941474915, "rewards/margins": 0.09192191064357758, "rewards/rejected": -0.24744835495948792, "step": 252 }, { "epoch": 0.2929853651503136, "grad_norm": 81.6835137198976, "learning_rate": 1.956379766358141e-07, "logits/chosen": -1.5779876708984375, "logits/rejected": -1.504298448562622, "logps/chosen": -218.59942626953125, "logps/rejected": -230.2102813720703, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": -0.14433127641677856, "rewards/margins": 0.08938172459602356, "rewards/rejected": -0.23371298611164093, "step": 254 }, { "epoch": 0.2952923365294499, "grad_norm": 74.03670184892391, "learning_rate": 1.9552479977990798e-07, "logits/chosen": -1.6765474081039429, "logits/rejected": -1.643741488456726, "logps/chosen": -185.69444274902344, "logps/rejected": -199.7008819580078, "loss": 0.676, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14419934153556824, "rewards/margins": 0.016861233860254288, "rewards/rejected": -0.16106057167053223, "step": 256 }, { "epoch": 0.29759930790858624, "grad_norm": 79.12914884219855, "learning_rate": 1.954102070160281e-07, "logits/chosen": -1.6632733345031738, "logits/rejected": -1.6073827743530273, "logps/chosen": -149.79641723632812, "logps/rejected": -174.7237091064453, "loss": 0.6638, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10197722166776657, "rewards/margins": 0.07854845374822617, "rewards/rejected": -0.18052567541599274, "step": 258 }, { "epoch": 0.29990627928772257, "grad_norm": 80.451175401901, "learning_rate": 1.9529420004271567e-07, "logits/chosen": -1.5313125848770142, "logits/rejected": -1.5560095310211182, "logps/chosen": -207.1497802734375, "logps/rejected": -222.1211395263672, "loss": 0.6407, "rewards/accuracies": 0.625, "rewards/chosen": -0.16675114631652832, "rewards/margins": 0.1034877672791481, "rewards/rejected": -0.2702389061450958, "step": 260 }, { "epoch": 0.3022132506668589, "grad_norm": 66.85531080853532, "learning_rate": 1.9517678057947382e-07, "logits/chosen": -1.6430004835128784, "logits/rejected": -1.597357153892517, "logps/chosen": -135.1138153076172, "logps/rejected": -132.63619995117188, "loss": 0.6618, "rewards/accuracies": 0.625, "rewards/chosen": -0.11752544343471527, "rewards/margins": 0.03515633940696716, "rewards/rejected": -0.15268178284168243, "step": 262 }, { "epoch": 0.3045202220459952, "grad_norm": 80.22448615221649, "learning_rate": 1.9505795036674232e-07, "logits/chosen": -1.6319184303283691, "logits/rejected": -1.4991899728775024, "logps/chosen": -217.16680908203125, "logps/rejected": -245.2107696533203, "loss": 0.6523, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18451935052871704, "rewards/margins": 0.11840308457612991, "rewards/rejected": -0.30292242765426636, "step": 264 }, { "epoch": 0.3068271934251316, "grad_norm": 69.95576974969472, "learning_rate": 1.9493771116587156e-07, "logits/chosen": -1.5522364377975464, "logits/rejected": -1.5948469638824463, "logps/chosen": -113.81831359863281, "logps/rejected": -155.99346923828125, "loss": 0.6551, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08541279286146164, "rewards/margins": 0.14459526538848877, "rewards/rejected": -0.230008065700531, "step": 266 }, { "epoch": 0.3091341648042679, "grad_norm": 75.25744246014891, "learning_rate": 1.9481606475909656e-07, "logits/chosen": -1.500025749206543, "logits/rejected": -1.5494239330291748, "logps/chosen": -125.84722900390625, "logps/rejected": -164.76669311523438, "loss": 0.6526, "rewards/accuracies": 0.84375, "rewards/chosen": -0.08980143815279007, "rewards/margins": 0.17133310437202454, "rewards/rejected": -0.2611345648765564, "step": 268 }, { "epoch": 0.31144113618340424, "grad_norm": 77.53434606640313, "learning_rate": 1.9469301294951057e-07, "logits/chosen": -1.6267601251602173, "logits/rejected": -1.5587116479873657, "logps/chosen": -172.08139038085938, "logps/rejected": -181.32717895507812, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -0.1692037582397461, "rewards/margins": 0.051171936094760895, "rewards/rejected": -0.2203756868839264, "step": 270 }, { "epoch": 0.31374810756254057, "grad_norm": 74.84897771580975, "learning_rate": 1.9456855756103816e-07, "logits/chosen": -1.5624661445617676, "logits/rejected": -1.6530312299728394, "logps/chosen": -147.84597778320312, "logps/rejected": -174.6589813232422, "loss": 0.6707, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13743659853935242, "rewards/margins": 0.05220307409763336, "rewards/rejected": -0.18963965773582458, "step": 272 }, { "epoch": 0.3160550789416769, "grad_norm": 71.99044362933952, "learning_rate": 1.9444270043840852e-07, "logits/chosen": -1.6625701189041138, "logits/rejected": -1.5914949178695679, "logps/chosen": -147.29147338867188, "logps/rejected": -129.6570587158203, "loss": 0.6831, "rewards/accuracies": 0.46875, "rewards/chosen": -0.21893730759620667, "rewards/margins": -0.0283275805413723, "rewards/rejected": -0.19060972332954407, "step": 274 }, { "epoch": 0.3183620503208132, "grad_norm": 75.21160091684173, "learning_rate": 1.9431544344712772e-07, "logits/chosen": -1.4378788471221924, "logits/rejected": -1.3864963054656982, "logps/chosen": -147.2783660888672, "logps/rejected": -177.4646759033203, "loss": 0.6472, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11688334494829178, "rewards/margins": 0.11695411056280136, "rewards/rejected": -0.23383745551109314, "step": 276 }, { "epoch": 0.32066902169994954, "grad_norm": 72.07898599790276, "learning_rate": 1.9418678847345146e-07, "logits/chosen": -1.5210872888565063, "logits/rejected": -1.5768458843231201, "logps/chosen": -164.58419799804688, "logps/rejected": -213.6575469970703, "loss": 0.6664, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12316928803920746, "rewards/margins": 0.07401876151561737, "rewards/rejected": -0.19718804955482483, "step": 278 }, { "epoch": 0.32297599307908587, "grad_norm": 67.17404804695744, "learning_rate": 1.9405673742435676e-07, "logits/chosen": -1.5087511539459229, "logits/rejected": -1.5612874031066895, "logps/chosen": -142.5220947265625, "logps/rejected": -195.3551483154297, "loss": 0.6718, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14393991231918335, "rewards/margins": 0.11825156211853027, "rewards/rejected": -0.2621914744377136, "step": 280 }, { "epoch": 0.3252829644582222, "grad_norm": 81.75237089659649, "learning_rate": 1.939252922275139e-07, "logits/chosen": -1.6113684177398682, "logits/rejected": -1.520400047302246, "logps/chosen": -215.8910675048828, "logps/rejected": -227.26637268066406, "loss": 0.6556, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2654890716075897, "rewards/margins": 0.09542025625705719, "rewards/rejected": -0.3609093129634857, "step": 282 }, { "epoch": 0.3275899358373585, "grad_norm": 65.02297736065502, "learning_rate": 1.937924548312578e-07, "logits/chosen": -1.6812703609466553, "logits/rejected": -1.7281326055526733, "logps/chosen": -130.5011749267578, "logps/rejected": -195.49452209472656, "loss": 0.6431, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12183240056037903, "rewards/margins": 0.12844915688037872, "rewards/rejected": -0.25028154253959656, "step": 284 }, { "epoch": 0.32989690721649484, "grad_norm": 75.13263031113792, "learning_rate": 1.9365822720455912e-07, "logits/chosen": -1.4847445487976074, "logits/rejected": -1.4161133766174316, "logps/chosen": -154.5245361328125, "logps/rejected": -203.3861541748047, "loss": 0.6537, "rewards/accuracies": 0.53125, "rewards/chosen": -0.16228517889976501, "rewards/margins": 0.12002203613519669, "rewards/rejected": -0.2823072075843811, "step": 286 }, { "epoch": 0.33220387859563116, "grad_norm": 78.41024724428831, "learning_rate": 1.935226113369951e-07, "logits/chosen": -1.686346173286438, "logits/rejected": -1.6542606353759766, "logps/chosen": -172.25059509277344, "logps/rejected": -199.93182373046875, "loss": 0.6469, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12491661310195923, "rewards/margins": 0.12406705319881439, "rewards/rejected": -0.24898366630077362, "step": 288 }, { "epoch": 0.3345108499747675, "grad_norm": 74.32689822052723, "learning_rate": 1.9338560923872006e-07, "logits/chosen": -1.5119750499725342, "logits/rejected": -1.524541974067688, "logps/chosen": -159.21376037597656, "logps/rejected": -237.09561157226562, "loss": 0.6455, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1832338273525238, "rewards/margins": 0.2291288673877716, "rewards/rejected": -0.4123626947402954, "step": 290 }, { "epoch": 0.3368178213539038, "grad_norm": 77.04623177811982, "learning_rate": 1.9324722294043556e-07, "logits/chosen": -1.6212831735610962, "logits/rejected": -1.5947524309158325, "logps/chosen": -187.361572265625, "logps/rejected": -187.34519958496094, "loss": 0.6585, "rewards/accuracies": 0.59375, "rewards/chosen": -0.24990221858024597, "rewards/margins": 0.07834864407777786, "rewards/rejected": -0.3282508850097656, "step": 292 }, { "epoch": 0.33912479273304014, "grad_norm": 83.55231847560428, "learning_rate": 1.9310745449336044e-07, "logits/chosen": -1.58076012134552, "logits/rejected": -1.5445674657821655, "logps/chosen": -192.48617553710938, "logps/rejected": -215.64193725585938, "loss": 0.6418, "rewards/accuracies": 0.75, "rewards/chosen": -0.1813565194606781, "rewards/margins": 0.1242499127984047, "rewards/rejected": -0.3056064546108246, "step": 294 }, { "epoch": 0.34143176411217646, "grad_norm": 73.20320061572161, "learning_rate": 1.929663059692002e-07, "logits/chosen": -1.477115273475647, "logits/rejected": -1.5140092372894287, "logps/chosen": -154.4539794921875, "logps/rejected": -214.9960174560547, "loss": 0.6894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2464270293712616, "rewards/margins": 0.08608925342559814, "rewards/rejected": -0.33251628279685974, "step": 296 }, { "epoch": 0.3437387354913128, "grad_norm": 82.85464536249332, "learning_rate": 1.928237794601165e-07, "logits/chosen": -1.5687949657440186, "logits/rejected": -1.6849851608276367, "logps/chosen": -140.14784240722656, "logps/rejected": -234.17706298828125, "loss": 0.6525, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1264043152332306, "rewards/margins": 0.24397864937782288, "rewards/rejected": -0.3703829348087311, "step": 298 }, { "epoch": 0.34604570687044917, "grad_norm": 65.65777237412837, "learning_rate": 1.9267987707869604e-07, "logits/chosen": -1.4391192197799683, "logits/rejected": -1.4724018573760986, "logps/chosen": -153.69284057617188, "logps/rejected": -173.3372039794922, "loss": 0.6486, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16697266697883606, "rewards/margins": 0.13511566817760468, "rewards/rejected": -0.30208835005760193, "step": 300 }, { "epoch": 0.34604570687044917, "eval_logits/chosen": -1.5305781364440918, "eval_logits/rejected": -1.4347938299179077, "eval_logps/chosen": -187.96263122558594, "eval_logps/rejected": -153.34820556640625, "eval_loss": 0.678679347038269, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -0.278046578168869, "eval_rewards/margins": 0.005234198644757271, "eval_rewards/rejected": -0.28328076004981995, "eval_runtime": 21.7114, "eval_samples_per_second": 4.606, "eval_steps_per_second": 1.151, "step": 300 }, { "epoch": 0.3483526782495855, "grad_norm": 69.96196416042814, "learning_rate": 1.9253460095791922e-07, "logits/chosen": -1.5020473003387451, "logits/rejected": -1.4953689575195312, "logps/chosen": -106.53646087646484, "logps/rejected": -165.1669158935547, "loss": 0.6546, "rewards/accuracies": 0.59375, "rewards/chosen": -0.15904603898525238, "rewards/margins": 0.06554871797561646, "rewards/rejected": -0.22459478676319122, "step": 302 }, { "epoch": 0.3506596496287218, "grad_norm": 74.69729400373957, "learning_rate": 1.9238795325112868e-07, "logits/chosen": -1.636529803276062, "logits/rejected": -1.6348826885223389, "logps/chosen": -140.86441040039062, "logps/rejected": -174.48370361328125, "loss": 0.6433, "rewards/accuracies": 0.84375, "rewards/chosen": -0.12615619599819183, "rewards/margins": 0.20733490586280823, "rewards/rejected": -0.3334910571575165, "step": 304 }, { "epoch": 0.35296662100785814, "grad_norm": 84.17293540044481, "learning_rate": 1.9223993613199713e-07, "logits/chosen": -1.6913816928863525, "logits/rejected": -1.6646835803985596, "logps/chosen": -152.25997924804688, "logps/rejected": -171.05575561523438, "loss": 0.6514, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11823489516973495, "rewards/margins": 0.18948128819465637, "rewards/rejected": -0.3077161908149719, "step": 306 }, { "epoch": 0.35527359238699446, "grad_norm": 83.6870493511653, "learning_rate": 1.9209055179449537e-07, "logits/chosen": -1.517793893814087, "logits/rejected": -1.6404225826263428, "logps/chosen": -91.36832427978516, "logps/rejected": -134.06529235839844, "loss": 0.6551, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10601670295000076, "rewards/margins": 0.14076808094978333, "rewards/rejected": -0.24678479135036469, "step": 308 }, { "epoch": 0.3575805637661308, "grad_norm": 64.57674968550867, "learning_rate": 1.9193980245285966e-07, "logits/chosen": -1.4689788818359375, "logits/rejected": -1.3954423666000366, "logps/chosen": -143.7101287841797, "logps/rejected": -169.8336181640625, "loss": 0.6402, "rewards/accuracies": 0.65625, "rewards/chosen": -0.16834121942520142, "rewards/margins": 0.08874449878931046, "rewards/rejected": -0.25708574056625366, "step": 310 }, { "epoch": 0.3598875351452671, "grad_norm": 81.4185321584637, "learning_rate": 1.9178769034155887e-07, "logits/chosen": -1.6560229063034058, "logits/rejected": -1.7177590131759644, "logps/chosen": -144.23033142089844, "logps/rejected": -166.01162719726562, "loss": 0.6303, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19495287537574768, "rewards/margins": 0.08614547550678253, "rewards/rejected": -0.281098335981369, "step": 312 }, { "epoch": 0.36219450652440344, "grad_norm": 70.47869326950462, "learning_rate": 1.9163421771526151e-07, "logits/chosen": -1.5131672620773315, "logits/rejected": -1.548357367515564, "logps/chosen": -146.3427734375, "logps/rejected": -159.85092163085938, "loss": 0.6536, "rewards/accuracies": 0.75, "rewards/chosen": -0.1731819212436676, "rewards/margins": 0.1254611313343048, "rewards/rejected": -0.29864302277565, "step": 314 }, { "epoch": 0.36450147790353976, "grad_norm": 79.69549984021036, "learning_rate": 1.914793868488021e-07, "logits/chosen": -1.512197732925415, "logits/rejected": -1.4396047592163086, "logps/chosen": -97.64339447021484, "logps/rejected": -117.3057632446289, "loss": 0.6579, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1673259437084198, "rewards/margins": 0.045555103570222855, "rewards/rejected": -0.21288102865219116, "step": 316 }, { "epoch": 0.3668084492826761, "grad_norm": 82.99383875929993, "learning_rate": 1.9132320003714754e-07, "logits/chosen": -1.5376619100570679, "logits/rejected": -1.5551142692565918, "logps/chosen": -207.0707244873047, "logps/rejected": -242.56712341308594, "loss": 0.6439, "rewards/accuracies": 0.65625, "rewards/chosen": -0.24572816491127014, "rewards/margins": 0.16944444179534912, "rewards/rejected": -0.41517263650894165, "step": 318 }, { "epoch": 0.3691154206618124, "grad_norm": 78.2099765504223, "learning_rate": 1.9116565959536327e-07, "logits/chosen": -1.4779236316680908, "logits/rejected": -1.4861027002334595, "logps/chosen": -193.60748291015625, "logps/rejected": -232.04690551757812, "loss": 0.6534, "rewards/accuracies": 0.625, "rewards/chosen": -0.16232052445411682, "rewards/margins": 0.13388732075691223, "rewards/rejected": -0.29620781540870667, "step": 320 }, { "epoch": 0.37142239204094873, "grad_norm": 74.80406821040707, "learning_rate": 1.9100676785857857e-07, "logits/chosen": -1.6256941556930542, "logits/rejected": -1.5659886598587036, "logps/chosen": -170.6388702392578, "logps/rejected": -198.07733154296875, "loss": 0.6395, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17732584476470947, "rewards/margins": 0.1462487280368805, "rewards/rejected": -0.32357457280158997, "step": 322 }, { "epoch": 0.37372936342008506, "grad_norm": 81.93843569632895, "learning_rate": 1.9084652718195236e-07, "logits/chosen": -1.5257925987243652, "logits/rejected": -1.4617056846618652, "logps/chosen": -208.795166015625, "logps/rejected": -243.7969970703125, "loss": 0.6648, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2373400181531906, "rewards/margins": 0.16046729683876038, "rewards/rejected": -0.3978073298931122, "step": 324 }, { "epoch": 0.3760363347992214, "grad_norm": 68.63199696676665, "learning_rate": 1.9068493994063798e-07, "logits/chosen": -1.4899076223373413, "logits/rejected": -1.5616645812988281, "logps/chosen": -133.66110229492188, "logps/rejected": -236.15924072265625, "loss": 0.6245, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15444569289684296, "rewards/margins": 0.2277567982673645, "rewards/rejected": -0.38220247626304626, "step": 326 }, { "epoch": 0.3783433061783577, "grad_norm": 77.96696778978115, "learning_rate": 1.905220085297482e-07, "logits/chosen": -1.5441091060638428, "logits/rejected": -1.6405153274536133, "logps/chosen": -204.56991577148438, "logps/rejected": -610.9658203125, "loss": 0.6369, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25125816464424133, "rewards/margins": 0.27758753299713135, "rewards/rejected": -0.5288456678390503, "step": 328 }, { "epoch": 0.38065027755749403, "grad_norm": 70.94819657566394, "learning_rate": 1.9035773536431955e-07, "logits/chosen": -1.5916917324066162, "logits/rejected": -1.529220461845398, "logps/chosen": -137.5714111328125, "logps/rejected": -160.11544799804688, "loss": 0.628, "rewards/accuracies": 0.65625, "rewards/chosen": -0.20854628086090088, "rewards/margins": 0.11146115511655807, "rewards/rejected": -0.32000741362571716, "step": 330 }, { "epoch": 0.38295724893663036, "grad_norm": 74.31467840644032, "learning_rate": 1.901921228792766e-07, "logits/chosen": -1.5668599605560303, "logits/rejected": -1.6017038822174072, "logps/chosen": -253.0677947998047, "logps/rejected": -266.9024658203125, "loss": 0.6419, "rewards/accuracies": 0.59375, "rewards/chosen": -0.2701232433319092, "rewards/margins": 0.1171327605843544, "rewards/rejected": -0.387255996465683, "step": 332 }, { "epoch": 0.3852642203157667, "grad_norm": 80.19418315617096, "learning_rate": 1.9002517352939596e-07, "logits/chosen": -1.538657784461975, "logits/rejected": -1.4902359247207642, "logps/chosen": -151.844482421875, "logps/rejected": -182.43423461914062, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": -0.20499791204929352, "rewards/margins": 0.14708584547042847, "rewards/rejected": -0.3520837724208832, "step": 334 }, { "epoch": 0.38757119169490306, "grad_norm": 78.45881437768317, "learning_rate": 1.898568897892697e-07, "logits/chosen": -1.502273440361023, "logits/rejected": -1.567176342010498, "logps/chosen": -149.17568969726562, "logps/rejected": -218.93869018554688, "loss": 0.6324, "rewards/accuracies": 0.71875, "rewards/chosen": -0.21270516514778137, "rewards/margins": 0.24096481502056122, "rewards/rejected": -0.4536699950695038, "step": 336 }, { "epoch": 0.3898781630740394, "grad_norm": 69.72871536048268, "learning_rate": 1.8968727415326882e-07, "logits/chosen": -1.595134973526001, "logits/rejected": -1.6751508712768555, "logps/chosen": -112.13485717773438, "logps/rejected": -138.27838134765625, "loss": 0.6302, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11406655609607697, "rewards/margins": 0.13377144932746887, "rewards/rejected": -0.24783800542354584, "step": 338 }, { "epoch": 0.3921851344531757, "grad_norm": 66.47735099680594, "learning_rate": 1.8951632913550623e-07, "logits/chosen": -1.6112767457962036, "logits/rejected": -1.5350615978240967, "logps/chosen": -212.4505615234375, "logps/rejected": -239.0753173828125, "loss": 0.621, "rewards/accuracies": 0.625, "rewards/chosen": -0.12918683886528015, "rewards/margins": 0.254965603351593, "rewards/rejected": -0.3841524124145508, "step": 340 }, { "epoch": 0.39449210583231203, "grad_norm": 81.17863346925296, "learning_rate": 1.8934405726979945e-07, "logits/chosen": -1.4070253372192383, "logits/rejected": -1.4879088401794434, "logps/chosen": -166.3784942626953, "logps/rejected": -204.57489013671875, "loss": 0.6395, "rewards/accuracies": 0.65625, "rewards/chosen": -0.31329959630966187, "rewards/margins": 0.13568538427352905, "rewards/rejected": -0.4489849805831909, "step": 342 }, { "epoch": 0.39679907721144836, "grad_norm": 72.25844304700202, "learning_rate": 1.8917046110963314e-07, "logits/chosen": -1.6808464527130127, "logits/rejected": -1.6618741750717163, "logps/chosen": -184.7408905029297, "logps/rejected": -213.8212127685547, "loss": 0.6414, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1948232203722, "rewards/margins": 0.18943095207214355, "rewards/rejected": -0.3842541575431824, "step": 344 }, { "epoch": 0.3991060485905847, "grad_norm": 69.12287284056892, "learning_rate": 1.8899554322812116e-07, "logits/chosen": -1.677032470703125, "logits/rejected": -1.6319351196289062, "logps/chosen": -114.67143249511719, "logps/rejected": -125.2265625, "loss": 0.6256, "rewards/accuracies": 0.78125, "rewards/chosen": -0.18165619671344757, "rewards/margins": 0.17791113257408142, "rewards/rejected": -0.3595673143863678, "step": 346 }, { "epoch": 0.401413019969721, "grad_norm": 68.82861341006546, "learning_rate": 1.8881930621796846e-07, "logits/chosen": -1.531043291091919, "logits/rejected": -1.4552069902420044, "logps/chosen": -172.90670776367188, "logps/rejected": -228.29833984375, "loss": 0.6321, "rewards/accuracies": 0.65625, "rewards/chosen": -0.21518906950950623, "rewards/margins": 0.16281384229660034, "rewards/rejected": -0.37800291180610657, "step": 348 }, { "epoch": 0.40371999134885733, "grad_norm": 79.01675049183694, "learning_rate": 1.8864175269143273e-07, "logits/chosen": -1.628811001777649, "logits/rejected": -1.5073944330215454, "logps/chosen": -162.4159393310547, "logps/rejected": -173.65521240234375, "loss": 0.6361, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17217856645584106, "rewards/margins": 0.20255069434642792, "rewards/rejected": -0.3747292459011078, "step": 350 }, { "epoch": 0.40602696272799366, "grad_norm": 80.14358020089544, "learning_rate": 1.8846288528028552e-07, "logits/chosen": -1.2868863344192505, "logits/rejected": -1.4563894271850586, "logps/chosen": -176.4993438720703, "logps/rejected": -219.99745178222656, "loss": 0.6388, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34355729818344116, "rewards/margins": 0.19085751473903656, "rewards/rejected": -0.5344148278236389, "step": 352 }, { "epoch": 0.40833393410713, "grad_norm": 72.34750725400806, "learning_rate": 1.8828270663577336e-07, "logits/chosen": -1.5702780485153198, "logits/rejected": -1.6198755502700806, "logps/chosen": -135.76097106933594, "logps/rejected": -133.5688018798828, "loss": 0.6593, "rewards/accuracies": 0.625, "rewards/chosen": -0.28700345754623413, "rewards/margins": 0.014538988471031189, "rewards/rejected": -0.3015424311161041, "step": 354 }, { "epoch": 0.4106409054862663, "grad_norm": 71.70524840332104, "learning_rate": 1.8810121942857845e-07, "logits/chosen": -1.5310659408569336, "logits/rejected": -1.547040343284607, "logps/chosen": -137.63137817382812, "logps/rejected": -175.15028381347656, "loss": 0.6293, "rewards/accuracies": 0.75, "rewards/chosen": -0.1476406753063202, "rewards/margins": 0.20084424316883087, "rewards/rejected": -0.34848493337631226, "step": 356 }, { "epoch": 0.41294787686540263, "grad_norm": 77.60677795627835, "learning_rate": 1.8791842634877896e-07, "logits/chosen": -1.546626091003418, "logits/rejected": -1.6076010465621948, "logps/chosen": -136.61058044433594, "logps/rejected": -187.11056518554688, "loss": 0.6506, "rewards/accuracies": 0.625, "rewards/chosen": -0.2092825025320053, "rewards/margins": 0.11802927404642105, "rewards/rejected": -0.32731181383132935, "step": 358 }, { "epoch": 0.41525484824453895, "grad_norm": 76.22986147865214, "learning_rate": 1.8773433010580933e-07, "logits/chosen": -1.5016052722930908, "logits/rejected": -1.6018908023834229, "logps/chosen": -129.33348083496094, "logps/rejected": -151.12342834472656, "loss": 0.627, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1853492707014084, "rewards/margins": 0.10909079760313034, "rewards/rejected": -0.2944400906562805, "step": 360 }, { "epoch": 0.4175618196236753, "grad_norm": 71.86807271397895, "learning_rate": 1.8754893342842e-07, "logits/chosen": -1.5751183032989502, "logits/rejected": -1.4908232688903809, "logps/chosen": -187.5486602783203, "logps/rejected": -194.04296875, "loss": 0.6223, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27427998185157776, "rewards/margins": 0.1835474967956543, "rewards/rejected": -0.45782750844955444, "step": 362 }, { "epoch": 0.4198687910028116, "grad_norm": 70.36519300815779, "learning_rate": 1.8736223906463695e-07, "logits/chosen": -1.6419646739959717, "logits/rejected": -1.6212923526763916, "logps/chosen": -165.32421875, "logps/rejected": -171.27830505371094, "loss": 0.6154, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21126417815685272, "rewards/margins": 0.188466876745224, "rewards/rejected": -0.3997310400009155, "step": 364 }, { "epoch": 0.4221757623819479, "grad_norm": 70.09468918933095, "learning_rate": 1.8717424978172102e-07, "logits/chosen": -1.3921918869018555, "logits/rejected": -1.469792127609253, "logps/chosen": -167.81964111328125, "logps/rejected": -210.77825927734375, "loss": 0.6308, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2520577609539032, "rewards/margins": 0.21120049059391022, "rewards/rejected": -0.463258296251297, "step": 366 }, { "epoch": 0.42448273376108425, "grad_norm": 83.57733506311956, "learning_rate": 1.8698496836612691e-07, "logits/chosen": -1.494173288345337, "logits/rejected": -1.5522290468215942, "logps/chosen": -163.31491088867188, "logps/rejected": -189.11239624023438, "loss": 0.6605, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2657621204853058, "rewards/margins": 0.16207075119018555, "rewards/rejected": -0.42783284187316895, "step": 368 }, { "epoch": 0.4267897051402206, "grad_norm": 81.29498139829452, "learning_rate": 1.8679439762346184e-07, "logits/chosen": -1.5649724006652832, "logits/rejected": -1.6319153308868408, "logps/chosen": -208.2643585205078, "logps/rejected": -215.9363555908203, "loss": 0.6724, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27036455273628235, "rewards/margins": 0.1651400327682495, "rewards/rejected": -0.43550461530685425, "step": 370 }, { "epoch": 0.42909667651935696, "grad_norm": 76.18451864107462, "learning_rate": 1.8660254037844388e-07, "logits/chosen": -1.4427084922790527, "logits/rejected": -1.5188959836959839, "logps/chosen": -171.85968017578125, "logps/rejected": -233.1151580810547, "loss": 0.629, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27071186900138855, "rewards/margins": 0.2559873163700104, "rewards/rejected": -0.5266991853713989, "step": 372 }, { "epoch": 0.4314036478984933, "grad_norm": 82.63010621157098, "learning_rate": 1.8640939947486023e-07, "logits/chosen": -1.5887802839279175, "logits/rejected": -1.355837106704712, "logps/chosen": -242.5066375732422, "logps/rejected": -230.2034912109375, "loss": 0.6329, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3870730698108673, "rewards/margins": 0.15506887435913086, "rewards/rejected": -0.5421419143676758, "step": 374 }, { "epoch": 0.4337106192776296, "grad_norm": 59.14499379914714, "learning_rate": 1.8621497777552505e-07, "logits/chosen": -1.420657992362976, "logits/rejected": -1.4776450395584106, "logps/chosen": -127.46673583984375, "logps/rejected": -184.2600860595703, "loss": 0.5869, "rewards/accuracies": 0.875, "rewards/chosen": -0.15772147476673126, "rewards/margins": 0.3883221745491028, "rewards/rejected": -0.5460436344146729, "step": 376 }, { "epoch": 0.43601759065676593, "grad_norm": 76.51933767322383, "learning_rate": 1.8601927816223695e-07, "logits/chosen": -1.3575465679168701, "logits/rejected": -1.3156774044036865, "logps/chosen": -218.0836944580078, "logps/rejected": -228.03778076171875, "loss": 0.6557, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4051874279975891, "rewards/margins": 0.143568217754364, "rewards/rejected": -0.5487555861473083, "step": 378 }, { "epoch": 0.43832456203590225, "grad_norm": 61.424133205634206, "learning_rate": 1.8582230353573624e-07, "logits/chosen": -1.4618622064590454, "logits/rejected": -1.4945478439331055, "logps/chosen": -95.66145324707031, "logps/rejected": -135.7235870361328, "loss": 0.6206, "rewards/accuracies": 0.75, "rewards/chosen": -0.1784054934978485, "rewards/margins": 0.23733605444431305, "rewards/rejected": -0.415741503238678, "step": 380 }, { "epoch": 0.4406315334150386, "grad_norm": 64.92661329207279, "learning_rate": 1.8562405681566214e-07, "logits/chosen": -1.5636019706726074, "logits/rejected": -1.5756021738052368, "logps/chosen": -201.42442321777344, "logps/rejected": -188.35606384277344, "loss": 0.6289, "rewards/accuracies": 0.625, "rewards/chosen": -0.3109050691127777, "rewards/margins": 0.10487519204616547, "rewards/rejected": -0.415780246257782, "step": 382 }, { "epoch": 0.4429385047941749, "grad_norm": 83.39366061705226, "learning_rate": 1.854245409405092e-07, "logits/chosen": -1.6649830341339111, "logits/rejected": -1.5097665786743164, "logps/chosen": -217.35536193847656, "logps/rejected": -223.5187225341797, "loss": 0.6113, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2543387711048126, "rewards/margins": 0.2463696151971817, "rewards/rejected": -0.5007083415985107, "step": 384 }, { "epoch": 0.4452454761733112, "grad_norm": 74.49558456416251, "learning_rate": 1.852237588675841e-07, "logits/chosen": -1.582183599472046, "logits/rejected": -1.7068113088607788, "logps/chosen": -162.75521850585938, "logps/rejected": -220.6885986328125, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": -0.21387754380702972, "rewards/margins": 0.31847310066223145, "rewards/rejected": -0.5323505997657776, "step": 386 }, { "epoch": 0.44755244755244755, "grad_norm": 72.0795411450381, "learning_rate": 1.850217135729614e-07, "logits/chosen": -1.605985164642334, "logits/rejected": -1.5858122110366821, "logps/chosen": -196.78073120117188, "logps/rejected": -213.26580810546875, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": -0.44325143098831177, "rewards/margins": 0.07666480541229248, "rewards/rejected": -0.5199161767959595, "step": 388 }, { "epoch": 0.4498594189315839, "grad_norm": 72.48651390442274, "learning_rate": 1.8481840805143987e-07, "logits/chosen": -1.5632058382034302, "logits/rejected": -1.5244344472885132, "logps/chosen": -127.80747985839844, "logps/rejected": -152.81256103515625, "loss": 0.6163, "rewards/accuracies": 0.875, "rewards/chosen": -0.1298586130142212, "rewards/margins": 0.42240971326828003, "rewards/rejected": -0.5522683262825012, "step": 390 }, { "epoch": 0.4521663903107202, "grad_norm": 74.34299341635638, "learning_rate": 1.8461384531649773e-07, "logits/chosen": -1.4820444583892822, "logits/rejected": -1.605046033859253, "logps/chosen": -105.68638610839844, "logps/rejected": -156.26785278320312, "loss": 0.6202, "rewards/accuracies": 0.75, "rewards/chosen": -0.1893598437309265, "rewards/margins": 0.2589360773563385, "rewards/rejected": -0.4482958912849426, "step": 392 }, { "epoch": 0.4544733616898565, "grad_norm": 76.36773452235572, "learning_rate": 1.844080284002482e-07, "logits/chosen": -1.5065568685531616, "logits/rejected": -1.5656404495239258, "logps/chosen": -158.7242889404297, "logps/rejected": -228.84844970703125, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.251006543636322, "rewards/margins": 0.21102304756641388, "rewards/rejected": -0.46202951669692993, "step": 394 }, { "epoch": 0.45678033306899285, "grad_norm": 71.03674812873284, "learning_rate": 1.8420096035339452e-07, "logits/chosen": -1.5289005041122437, "logits/rejected": -1.527197003364563, "logps/chosen": -200.40029907226562, "logps/rejected": -212.3697967529297, "loss": 0.6187, "rewards/accuracies": 0.625, "rewards/chosen": -0.2883009612560272, "rewards/margins": 0.30317747592926025, "rewards/rejected": -0.5914784073829651, "step": 396 }, { "epoch": 0.4590873044481292, "grad_norm": 81.19707296013529, "learning_rate": 1.8399264424518465e-07, "logits/chosen": -1.494114875793457, "logits/rejected": -1.4553757905960083, "logps/chosen": -173.10043334960938, "logps/rejected": -222.2396240234375, "loss": 0.5955, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3878926932811737, "rewards/margins": 0.3027462959289551, "rewards/rejected": -0.6906389594078064, "step": 398 }, { "epoch": 0.4613942758272655, "grad_norm": 89.13135103863338, "learning_rate": 1.8378308316336582e-07, "logits/chosen": -1.618680715560913, "logits/rejected": -1.5578938722610474, "logps/chosen": -191.10128784179688, "logps/rejected": -280.5110778808594, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": -0.4683380126953125, "rewards/margins": 0.19769813120365143, "rewards/rejected": -0.6660361289978027, "step": 400 }, { "epoch": 0.4613942758272655, "eval_logits/chosen": -1.4853571653366089, "eval_logits/rejected": -1.3932629823684692, "eval_logps/chosen": -189.0384521484375, "eval_logps/rejected": -156.24160766601562, "eval_loss": 0.654194176197052, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.38562828302383423, "eval_rewards/margins": 0.18699264526367188, "eval_rewards/rejected": -0.5726209282875061, "eval_runtime": 26.5299, "eval_samples_per_second": 3.769, "eval_steps_per_second": 0.942, "step": 400 }, { "epoch": 0.4637012472064018, "grad_norm": 69.21606890792003, "learning_rate": 1.8357228021413883e-07, "logits/chosen": -1.5431230068206787, "logits/rejected": -1.7365866899490356, "logps/chosen": -147.3966827392578, "logps/rejected": -170.9712371826172, "loss": 0.6581, "rewards/accuracies": 0.59375, "rewards/chosen": -0.30663323402404785, "rewards/margins": 0.11269617080688477, "rewards/rejected": -0.4193294048309326, "step": 402 }, { "epoch": 0.46600821858553815, "grad_norm": 78.7990153253576, "learning_rate": 1.8336023852211194e-07, "logits/chosen": -1.5721492767333984, "logits/rejected": -1.4822769165039062, "logps/chosen": -148.9419403076172, "logps/rejected": -158.44668579101562, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.27455994486808777, "rewards/margins": 0.3990754187107086, "rewards/rejected": -0.6736353039741516, "step": 404 }, { "epoch": 0.4683151899646745, "grad_norm": 67.81492283153628, "learning_rate": 1.8314696123025453e-07, "logits/chosen": -1.6370363235473633, "logits/rejected": -1.5174671411514282, "logps/chosen": -145.17050170898438, "logps/rejected": -142.74551391601562, "loss": 0.6312, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28109437227249146, "rewards/margins": 0.2069387137889862, "rewards/rejected": -0.48803308606147766, "step": 406 }, { "epoch": 0.47062216134381085, "grad_norm": 78.2843593072173, "learning_rate": 1.8293245149985053e-07, "logits/chosen": -1.5488444566726685, "logits/rejected": -1.4798938035964966, "logps/chosen": -161.83570861816406, "logps/rejected": -162.7615509033203, "loss": 0.6484, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2718978822231293, "rewards/margins": 0.15639187395572662, "rewards/rejected": -0.4282897710800171, "step": 408 }, { "epoch": 0.4729291327229472, "grad_norm": 73.10449012391845, "learning_rate": 1.827167125104517e-07, "logits/chosen": -1.4978845119476318, "logits/rejected": -1.4839560985565186, "logps/chosen": -148.445556640625, "logps/rejected": -161.85986328125, "loss": 0.6481, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27761712670326233, "rewards/margins": 0.09577606618404388, "rewards/rejected": -0.3733932077884674, "step": 410 }, { "epoch": 0.4752361041020835, "grad_norm": 77.23312704566136, "learning_rate": 1.8249974745983021e-07, "logits/chosen": -1.4896149635314941, "logits/rejected": -1.4279950857162476, "logps/chosen": -136.3888397216797, "logps/rejected": -184.14625549316406, "loss": 0.6186, "rewards/accuracies": 0.75, "rewards/chosen": -0.3546374440193176, "rewards/margins": 0.3140718638896942, "rewards/rejected": -0.6687093377113342, "step": 412 }, { "epoch": 0.4775430754812198, "grad_norm": 65.58481770102698, "learning_rate": 1.822815595639316e-07, "logits/chosen": -1.4790016412734985, "logits/rejected": -1.525940179824829, "logps/chosen": -162.99288940429688, "logps/rejected": -190.2974853515625, "loss": 0.6112, "rewards/accuracies": 0.6875, "rewards/chosen": -0.36069726943969727, "rewards/margins": 0.20576652884483337, "rewards/rejected": -0.5664637684822083, "step": 414 }, { "epoch": 0.47985004686035615, "grad_norm": 68.7972400850831, "learning_rate": 1.820621520568268e-07, "logits/chosen": -1.5574984550476074, "logits/rejected": -1.4820420742034912, "logps/chosen": -178.15878295898438, "logps/rejected": -191.66177368164062, "loss": 0.6, "rewards/accuracies": 0.75, "rewards/chosen": -0.314214825630188, "rewards/margins": 0.32970941066741943, "rewards/rejected": -0.6439242362976074, "step": 416 }, { "epoch": 0.4821570182394925, "grad_norm": 77.22458475405976, "learning_rate": 1.8184152819066434e-07, "logits/chosen": -1.5454033613204956, "logits/rejected": -1.5681257247924805, "logps/chosen": -206.4539031982422, "logps/rejected": -221.17599487304688, "loss": 0.6395, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4876091778278351, "rewards/margins": 0.06031504273414612, "rewards/rejected": -0.5479242205619812, "step": 418 }, { "epoch": 0.4844639896186288, "grad_norm": 69.59230881656185, "learning_rate": 1.8161969123562217e-07, "logits/chosen": -1.54752516746521, "logits/rejected": -1.5821384191513062, "logps/chosen": -182.0235137939453, "logps/rejected": -163.29364013671875, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.3480142056941986, "rewards/margins": 0.3120378255844116, "rewards/rejected": -0.6600520610809326, "step": 420 }, { "epoch": 0.4867709609977651, "grad_norm": 68.29468448816121, "learning_rate": 1.813966444798591e-07, "logits/chosen": -1.513810634613037, "logits/rejected": -1.4666978120803833, "logps/chosen": -204.99462890625, "logps/rejected": -204.5595245361328, "loss": 0.6143, "rewards/accuracies": 0.75, "rewards/chosen": -0.3375055491924286, "rewards/margins": 0.3794183135032654, "rewards/rejected": -0.7169238328933716, "step": 422 }, { "epoch": 0.48907793237690145, "grad_norm": 73.69015362328696, "learning_rate": 1.8117239122946611e-07, "logits/chosen": -1.3477180004119873, "logits/rejected": -1.4509586095809937, "logps/chosen": -118.67777252197266, "logps/rejected": -176.48667907714844, "loss": 0.6192, "rewards/accuracies": 0.625, "rewards/chosen": -0.3034321069717407, "rewards/margins": 0.12479298561811447, "rewards/rejected": -0.4282251298427582, "step": 424 }, { "epoch": 0.49138490375603777, "grad_norm": 78.31541581493791, "learning_rate": 1.809469348084174e-07, "logits/chosen": -1.459653377532959, "logits/rejected": -1.5776402950286865, "logps/chosen": -159.45347595214844, "logps/rejected": -189.2720489501953, "loss": 0.6554, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37468722462654114, "rewards/margins": 0.1383470892906189, "rewards/rejected": -0.5130342841148376, "step": 426 }, { "epoch": 0.4936918751351741, "grad_norm": 130.5379676824635, "learning_rate": 1.8072027855852095e-07, "logits/chosen": -1.4528967142105103, "logits/rejected": -1.423844814300537, "logps/chosen": -172.85316467285156, "logps/rejected": -215.22189331054688, "loss": 0.6639, "rewards/accuracies": 0.71875, "rewards/chosen": -0.41784724593162537, "rewards/margins": 0.3192124366760254, "rewards/rejected": -0.7370596528053284, "step": 428 }, { "epoch": 0.4959988465143104, "grad_norm": 63.21984381769687, "learning_rate": 1.8049242583936918e-07, "logits/chosen": -1.5084190368652344, "logits/rejected": -1.4574109315872192, "logps/chosen": -165.896484375, "logps/rejected": -227.423828125, "loss": 0.5893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25652381777763367, "rewards/margins": 0.47441697120666504, "rewards/rejected": -0.7309407591819763, "step": 430 }, { "epoch": 0.49830581789344675, "grad_norm": 71.69590925642426, "learning_rate": 1.802633800282891e-07, "logits/chosen": -1.516315221786499, "logits/rejected": -1.6526371240615845, "logps/chosen": -229.77777099609375, "logps/rejected": -292.7660827636719, "loss": 0.5979, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3780279755592346, "rewards/margins": 0.49888893961906433, "rewards/rejected": -0.8769169449806213, "step": 432 }, { "epoch": 0.5006127892725831, "grad_norm": 72.54608833334152, "learning_rate": 1.8003314452029213e-07, "logits/chosen": -1.5792149305343628, "logits/rejected": -1.550574779510498, "logps/chosen": -226.616455078125, "logps/rejected": -228.4210205078125, "loss": 0.6046, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5066580176353455, "rewards/margins": 0.34013134241104126, "rewards/rejected": -0.8467893600463867, "step": 434 }, { "epoch": 0.5029197606517194, "grad_norm": 73.04169645370872, "learning_rate": 1.7980172272802395e-07, "logits/chosen": -1.5109785795211792, "logits/rejected": -1.499125361442566, "logps/chosen": -154.92233276367188, "logps/rejected": -175.07643127441406, "loss": 0.5817, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25602594017982483, "rewards/margins": 0.5013114809989929, "rewards/rejected": -0.7573373913764954, "step": 436 }, { "epoch": 0.5052267320308558, "grad_norm": 69.05059334922119, "learning_rate": 1.7956911808171373e-07, "logits/chosen": -1.561600923538208, "logits/rejected": -1.5301151275634766, "logps/chosen": -217.26930236816406, "logps/rejected": -240.7093048095703, "loss": 0.6151, "rewards/accuracies": 0.59375, "rewards/chosen": -0.46973368525505066, "rewards/margins": 0.2093038558959961, "rewards/rejected": -0.6790375113487244, "step": 438 }, { "epoch": 0.507533703409992, "grad_norm": 74.68873536524164, "learning_rate": 1.793353340291235e-07, "logits/chosen": -1.3198765516281128, "logits/rejected": -1.4805912971496582, "logps/chosen": -175.9479217529297, "logps/rejected": -226.83265686035156, "loss": 0.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5549490451812744, "rewards/margins": 0.23202911019325256, "rewards/rejected": -0.7869781851768494, "step": 440 }, { "epoch": 0.5098406747891284, "grad_norm": 73.37532376774183, "learning_rate": 1.7910037403549692e-07, "logits/chosen": -1.4717934131622314, "logits/rejected": -1.5461549758911133, "logps/chosen": -159.91883850097656, "logps/rejected": -204.87376403808594, "loss": 0.6459, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4326345920562744, "rewards/margins": 0.22945694625377655, "rewards/rejected": -0.6620914936065674, "step": 442 }, { "epoch": 0.5121476461682647, "grad_norm": 69.28741446430803, "learning_rate": 1.7886424158350782e-07, "logits/chosen": -1.5604138374328613, "logits/rejected": -1.663907766342163, "logps/chosen": -158.54408264160156, "logps/rejected": -192.7698516845703, "loss": 0.5921, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3090921640396118, "rewards/margins": 0.3891502916812897, "rewards/rejected": -0.6982424855232239, "step": 444 }, { "epoch": 0.5144546175474011, "grad_norm": 77.66154968693108, "learning_rate": 1.7862694017320886e-07, "logits/chosen": -1.3435657024383545, "logits/rejected": -1.3843066692352295, "logps/chosen": -174.62672424316406, "logps/rejected": -288.0128173828125, "loss": 0.6145, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4857187271118164, "rewards/margins": 0.4818662703037262, "rewards/rejected": -0.9675850868225098, "step": 446 }, { "epoch": 0.5167615889265373, "grad_norm": 86.0701716220196, "learning_rate": 1.7838847332197937e-07, "logits/chosen": -1.4369436502456665, "logits/rejected": -1.5111709833145142, "logps/chosen": -193.0187225341797, "logps/rejected": -258.660400390625, "loss": 0.6179, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4460400640964508, "rewards/margins": 0.4027029871940613, "rewards/rejected": -0.8487430810928345, "step": 448 }, { "epoch": 0.5190685603056737, "grad_norm": 84.40844346826594, "learning_rate": 1.7814884456447335e-07, "logits/chosen": -1.5306761264801025, "logits/rejected": -1.4944154024124146, "logps/chosen": -195.49612426757812, "logps/rejected": -222.01425170898438, "loss": 0.6006, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2904947102069855, "rewards/margins": 0.5166550874710083, "rewards/rejected": -0.8071498870849609, "step": 450 }, { "epoch": 0.52137553168481, "grad_norm": 86.3712126774886, "learning_rate": 1.7790805745256703e-07, "logits/chosen": -1.3275847434997559, "logits/rejected": -1.38175630569458, "logps/chosen": -136.90707397460938, "logps/rejected": -184.36331176757812, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.44699156284332275, "rewards/margins": 0.12617343664169312, "rewards/rejected": -0.5731649398803711, "step": 452 }, { "epoch": 0.5236825030639464, "grad_norm": 66.61833278109548, "learning_rate": 1.7766611555530635e-07, "logits/chosen": -1.6141921281814575, "logits/rejected": -1.5151243209838867, "logps/chosen": -156.77407836914062, "logps/rejected": -154.7230682373047, "loss": 0.5733, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3759933114051819, "rewards/margins": 0.17464786767959595, "rewards/rejected": -0.5506411790847778, "step": 454 }, { "epoch": 0.5259894744430826, "grad_norm": 69.26758309677136, "learning_rate": 1.774230224588538e-07, "logits/chosen": -1.3204282522201538, "logits/rejected": -1.4286822080612183, "logps/chosen": -152.52542114257812, "logps/rejected": -232.16189575195312, "loss": 0.5494, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4260653853416443, "rewards/margins": 0.5102941989898682, "rewards/rejected": -0.9363595247268677, "step": 456 }, { "epoch": 0.528296445822219, "grad_norm": 81.07739462727531, "learning_rate": 1.771787817664356e-07, "logits/chosen": -1.508811116218567, "logits/rejected": -1.5395921468734741, "logps/chosen": -134.4735565185547, "logps/rejected": -166.41592407226562, "loss": 0.6351, "rewards/accuracies": 0.5, "rewards/chosen": -0.49481019377708435, "rewards/margins": 0.1262877732515335, "rewards/rejected": -0.6210979223251343, "step": 458 }, { "epoch": 0.5306034172013554, "grad_norm": 86.01343093557993, "learning_rate": 1.769333970982879e-07, "logits/chosen": -1.518664836883545, "logits/rejected": -1.3482635021209717, "logps/chosen": -173.78538513183594, "logps/rejected": -160.53573608398438, "loss": 0.5857, "rewards/accuracies": 0.625, "rewards/chosen": -0.49463319778442383, "rewards/margins": 0.202806293964386, "rewards/rejected": -0.6974395513534546, "step": 460 }, { "epoch": 0.5329103885804917, "grad_norm": 85.16027410016599, "learning_rate": 1.766868720916035e-07, "logits/chosen": -1.359481930732727, "logits/rejected": -1.3029265403747559, "logps/chosen": -134.05616760253906, "logps/rejected": -134.0654754638672, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": -0.4239296019077301, "rewards/margins": 0.03123108297586441, "rewards/rejected": -0.4551607072353363, "step": 462 }, { "epoch": 0.535217359959628, "grad_norm": 84.5629811685175, "learning_rate": 1.7643921040047766e-07, "logits/chosen": -1.6018937826156616, "logits/rejected": -1.6816954612731934, "logps/chosen": -237.3992919921875, "logps/rejected": -253.08688354492188, "loss": 0.597, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6288573741912842, "rewards/margins": 0.15610165894031525, "rewards/rejected": -0.7849590182304382, "step": 464 }, { "epoch": 0.5375243313387643, "grad_norm": 80.72916842158041, "learning_rate": 1.7619041569585418e-07, "logits/chosen": -1.4444328546524048, "logits/rejected": -1.4673030376434326, "logps/chosen": -170.2801971435547, "logps/rejected": -214.7718963623047, "loss": 0.6181, "rewards/accuracies": 0.75, "rewards/chosen": -0.564181923866272, "rewards/margins": 0.2302751988172531, "rewards/rejected": -0.7944571375846863, "step": 466 }, { "epoch": 0.5398313027179007, "grad_norm": 76.00828750498393, "learning_rate": 1.759404916654707e-07, "logits/chosen": -1.4668854475021362, "logits/rejected": -1.421462059020996, "logps/chosen": -360.7674560546875, "logps/rejected": -301.1515197753906, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.6432144641876221, "rewards/margins": 0.3255874514579773, "rewards/rejected": -0.9688019156455994, "step": 468 }, { "epoch": 0.542138274097037, "grad_norm": 75.00038820917719, "learning_rate": 1.756894420138043e-07, "logits/chosen": -1.5766559839248657, "logits/rejected": -1.656800627708435, "logps/chosen": -216.8627471923828, "logps/rejected": -270.90850830078125, "loss": 0.615, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4949862062931061, "rewards/margins": 0.4039486050605774, "rewards/rejected": -0.8989347815513611, "step": 470 }, { "epoch": 0.5444452454761733, "grad_norm": 86.17675092820859, "learning_rate": 1.754372704620164e-07, "logits/chosen": -1.4618090391159058, "logits/rejected": -1.5533053874969482, "logps/chosen": -202.59561157226562, "logps/rejected": -221.70413208007812, "loss": 0.6478, "rewards/accuracies": 0.75, "rewards/chosen": -0.44822004437446594, "rewards/margins": 0.28794264793395996, "rewards/rejected": -0.7361626625061035, "step": 472 }, { "epoch": 0.5467522168553096, "grad_norm": 72.36150215283246, "learning_rate": 1.7518398074789774e-07, "logits/chosen": -1.4804517030715942, "logits/rejected": -1.5212501287460327, "logps/chosen": -195.58935546875, "logps/rejected": -247.99276733398438, "loss": 0.553, "rewards/accuracies": 0.8125, "rewards/chosen": -0.44707149267196655, "rewards/margins": 0.6286894679069519, "rewards/rejected": -1.0757609605789185, "step": 474 }, { "epoch": 0.549059188234446, "grad_norm": 73.94947964279808, "learning_rate": 1.7492957662581294e-07, "logits/chosen": -1.3577089309692383, "logits/rejected": -1.4486963748931885, "logps/chosen": -133.3319091796875, "logps/rejected": -188.2812957763672, "loss": 0.6001, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34889039397239685, "rewards/margins": 0.3021068871021271, "rewards/rejected": -0.6509972214698792, "step": 476 }, { "epoch": 0.5513661596135823, "grad_norm": 74.0047644626624, "learning_rate": 1.7467406186664473e-07, "logits/chosen": -1.5747010707855225, "logits/rejected": -1.5058567523956299, "logps/chosen": -216.6630401611328, "logps/rejected": -223.66598510742188, "loss": 0.6345, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5371094346046448, "rewards/margins": 0.3996596932411194, "rewards/rejected": -0.9367691874504089, "step": 478 }, { "epoch": 0.5536731309927186, "grad_norm": 50.915761396824145, "learning_rate": 1.7441744025773834e-07, "logits/chosen": -1.4014126062393188, "logits/rejected": -1.569306492805481, "logps/chosen": -156.43629455566406, "logps/rejected": -228.84625244140625, "loss": 0.5975, "rewards/accuracies": 0.5625, "rewards/chosen": -0.399608850479126, "rewards/margins": 0.29513585567474365, "rewards/rejected": -0.6947447061538696, "step": 480 }, { "epoch": 0.5559801023718549, "grad_norm": 80.40246802194461, "learning_rate": 1.74159715602845e-07, "logits/chosen": -1.49760103225708, "logits/rejected": -1.4302232265472412, "logps/chosen": -152.4906005859375, "logps/rejected": -165.43942260742188, "loss": 0.6511, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4252295196056366, "rewards/margins": 0.12136977910995483, "rewards/rejected": -0.5465993285179138, "step": 482 }, { "epoch": 0.5582870737509913, "grad_norm": 70.56990492477674, "learning_rate": 1.739008917220659e-07, "logits/chosen": -1.4919289350509644, "logits/rejected": -1.5267033576965332, "logps/chosen": -187.85191345214844, "logps/rejected": -220.8524169921875, "loss": 0.5689, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5196070671081543, "rewards/margins": 0.3590528666973114, "rewards/rejected": -0.8786599636077881, "step": 484 }, { "epoch": 0.5605940451301276, "grad_norm": 78.98020718967784, "learning_rate": 1.7364097245179527e-07, "logits/chosen": -1.599880337715149, "logits/rejected": -1.5224246978759766, "logps/chosen": -196.72555541992188, "logps/rejected": -213.14309692382812, "loss": 0.5892, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5108906030654907, "rewards/margins": 0.1902090609073639, "rewards/rejected": -0.701099693775177, "step": 486 }, { "epoch": 0.5629010165092639, "grad_norm": 75.35371757401214, "learning_rate": 1.733799616446637e-07, "logits/chosen": -1.4978597164154053, "logits/rejected": -1.5102261304855347, "logps/chosen": -186.15167236328125, "logps/rejected": -226.00375366210938, "loss": 0.6112, "rewards/accuracies": 0.75, "rewards/chosen": -0.43081170320510864, "rewards/margins": 0.36774906516075134, "rewards/rejected": -0.7985607385635376, "step": 488 }, { "epoch": 0.5652079878884002, "grad_norm": 75.43303696622675, "learning_rate": 1.7311786316948108e-07, "logits/chosen": -1.418121337890625, "logits/rejected": -1.4920923709869385, "logps/chosen": -179.17889404296875, "logps/rejected": -229.40098571777344, "loss": 0.5938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6558996438980103, "rewards/margins": 0.27045130729675293, "rewards/rejected": -0.9263509511947632, "step": 490 }, { "epoch": 0.5675149592675366, "grad_norm": 71.0686050492484, "learning_rate": 1.7285468091117904e-07, "logits/chosen": -1.4989047050476074, "logits/rejected": -1.4156945943832397, "logps/chosen": -153.10214233398438, "logps/rejected": -172.13262939453125, "loss": 0.5901, "rewards/accuracies": 0.625, "rewards/chosen": -0.4824844300746918, "rewards/margins": 0.44079095125198364, "rewards/rejected": -0.9232754707336426, "step": 492 }, { "epoch": 0.569821930646673, "grad_norm": 67.99918941849218, "learning_rate": 1.7259041877075352e-07, "logits/chosen": -1.430630087852478, "logits/rejected": -1.3989218473434448, "logps/chosen": -209.73452758789062, "logps/rejected": -254.0313720703125, "loss": 0.5729, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5363369584083557, "rewards/margins": 0.5890082120895386, "rewards/rejected": -1.125345230102539, "step": 494 }, { "epoch": 0.5721289020258092, "grad_norm": 78.40754956054191, "learning_rate": 1.7232508066520698e-07, "logits/chosen": -1.5510261058807373, "logits/rejected": -1.5487847328186035, "logps/chosen": -211.16983032226562, "logps/rejected": -240.33824157714844, "loss": 0.5772, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4531714916229248, "rewards/margins": 0.2688879370689392, "rewards/rejected": -0.7220594882965088, "step": 496 }, { "epoch": 0.5744358734049456, "grad_norm": 61.990430466819326, "learning_rate": 1.7205867052749023e-07, "logits/chosen": -1.363396167755127, "logits/rejected": -1.3964465856552124, "logps/chosen": -147.12242126464844, "logps/rejected": -180.23667907714844, "loss": 0.6459, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5750865340232849, "rewards/margins": 0.11157172918319702, "rewards/rejected": -0.6866582632064819, "step": 498 }, { "epoch": 0.5767428447840819, "grad_norm": 76.0573953537264, "learning_rate": 1.717911923064442e-07, "logits/chosen": -1.5747530460357666, "logits/rejected": -1.4509817361831665, "logps/chosen": -181.61216735839844, "logps/rejected": -153.97573852539062, "loss": 0.6012, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5273740887641907, "rewards/margins": 0.1454104781150818, "rewards/rejected": -0.6727845668792725, "step": 500 }, { "epoch": 0.5767428447840819, "eval_logits/chosen": -1.440444827079773, "eval_logits/rejected": -1.3533989191055298, "eval_logps/chosen": -191.4648895263672, "eval_logps/rejected": -158.6099395751953, "eval_loss": 0.636239767074585, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.628268837928772, "eval_rewards/margins": 0.18118661642074585, "eval_rewards/rejected": -0.809455394744873, "eval_runtime": 37.9799, "eval_samples_per_second": 2.633, "eval_steps_per_second": 0.658, "step": 500 }, { "epoch": 0.5790498161632183, "grad_norm": 77.78220283215643, "learning_rate": 1.7152264996674135e-07, "logits/chosen": -1.4428610801696777, "logits/rejected": -1.2872042655944824, "logps/chosen": -184.39501953125, "logps/rejected": -238.38723754882812, "loss": 0.5953, "rewards/accuracies": 0.75, "rewards/chosen": -0.6475786566734314, "rewards/margins": 0.2779845893383026, "rewards/rejected": -0.9255632758140564, "step": 502 }, { "epoch": 0.5813567875423545, "grad_norm": 93.29916680291039, "learning_rate": 1.71253047488827e-07, "logits/chosen": -1.4898688793182373, "logits/rejected": -1.5620332956314087, "logps/chosen": -178.47802734375, "logps/rejected": -205.5224609375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": -0.5680350065231323, "rewards/margins": 0.18766377866268158, "rewards/rejected": -0.7556988000869751, "step": 504 }, { "epoch": 0.5836637589214909, "grad_norm": 77.19105499219319, "learning_rate": 1.7098238886886024e-07, "logits/chosen": -1.4835506677627563, "logits/rejected": -1.5302045345306396, "logps/chosen": -203.8736114501953, "logps/rejected": -228.69265747070312, "loss": 0.5951, "rewards/accuracies": 0.53125, "rewards/chosen": -0.47867119312286377, "rewards/margins": 0.22942683100700378, "rewards/rejected": -0.7080979943275452, "step": 506 }, { "epoch": 0.5859707303006272, "grad_norm": 67.4261860354, "learning_rate": 1.7071067811865473e-07, "logits/chosen": -1.4649958610534668, "logits/rejected": -1.4145183563232422, "logps/chosen": -199.42066955566406, "logps/rejected": -235.40292358398438, "loss": 0.5368, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4195340573787689, "rewards/margins": 0.551209032535553, "rewards/rejected": -0.9707430601119995, "step": 508 }, { "epoch": 0.5882777016797636, "grad_norm": 87.85240065033273, "learning_rate": 1.7043791926561932e-07, "logits/chosen": -1.5964919328689575, "logits/rejected": -1.561856746673584, "logps/chosen": -201.67276000976562, "logps/rejected": -234.04359436035156, "loss": 0.651, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6162290573120117, "rewards/margins": 0.4439167082309723, "rewards/rejected": -1.0601458549499512, "step": 510 }, { "epoch": 0.5905846730588998, "grad_norm": 62.42968300457303, "learning_rate": 1.7016411635269815e-07, "logits/chosen": -1.4615092277526855, "logits/rejected": -1.4488492012023926, "logps/chosen": -151.2560577392578, "logps/rejected": -176.4474334716797, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.33995571732521057, "rewards/margins": 0.2483442723751068, "rewards/rejected": -0.5882999897003174, "step": 512 }, { "epoch": 0.5928916444380362, "grad_norm": 74.39629379240114, "learning_rate": 1.6988927343831091e-07, "logits/chosen": -1.5747379064559937, "logits/rejected": -1.4773468971252441, "logps/chosen": -198.891845703125, "logps/rejected": -210.0729522705078, "loss": 0.61, "rewards/accuracies": 0.78125, "rewards/chosen": -0.47531554102897644, "rewards/margins": 0.47791624069213867, "rewards/rejected": -0.9532317519187927, "step": 514 }, { "epoch": 0.5951986158171725, "grad_norm": 70.19350216590036, "learning_rate": 1.6961339459629266e-07, "logits/chosen": -1.4481630325317383, "logits/rejected": -1.4714566469192505, "logps/chosen": -190.8370361328125, "logps/rejected": -242.71621704101562, "loss": 0.5872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5172877907752991, "rewards/margins": 0.48140281438827515, "rewards/rejected": -0.998690664768219, "step": 516 }, { "epoch": 0.5975055871963089, "grad_norm": 73.75535823993799, "learning_rate": 1.6933648391583328e-07, "logits/chosen": -1.531792163848877, "logits/rejected": -1.4680547714233398, "logps/chosen": -144.9717559814453, "logps/rejected": -172.87686157226562, "loss": 0.6006, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3757992386817932, "rewards/margins": 0.35130438208580017, "rewards/rejected": -0.7271036505699158, "step": 518 }, { "epoch": 0.5998125585754451, "grad_norm": 69.85303523035323, "learning_rate": 1.6905854550141714e-07, "logits/chosen": -1.5805073976516724, "logits/rejected": -1.5384862422943115, "logps/chosen": -171.9115753173828, "logps/rejected": -169.82862854003906, "loss": 0.5875, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5081273317337036, "rewards/margins": 0.2863667607307434, "rewards/rejected": -0.794494092464447, "step": 520 }, { "epoch": 0.6021195299545815, "grad_norm": 69.03602758187714, "learning_rate": 1.6877958347276197e-07, "logits/chosen": -1.4844419956207275, "logits/rejected": -1.4906061887741089, "logps/chosen": -149.6005859375, "logps/rejected": -163.59097290039062, "loss": 0.6013, "rewards/accuracies": 0.65625, "rewards/chosen": -0.42841285467147827, "rewards/margins": 0.30834630131721497, "rewards/rejected": -0.7367592453956604, "step": 522 }, { "epoch": 0.6044265013337178, "grad_norm": 80.75337933099041, "learning_rate": 1.6849960196475805e-07, "logits/chosen": -1.5245236158370972, "logits/rejected": -1.5345442295074463, "logps/chosen": -148.5638885498047, "logps/rejected": -178.37429809570312, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -0.3656730651855469, "rewards/margins": 0.3520704507827759, "rewards/rejected": -0.7177435159683228, "step": 524 }, { "epoch": 0.6067334727128542, "grad_norm": 79.6488573037571, "learning_rate": 1.682186051274067e-07, "logits/chosen": -1.4462357759475708, "logits/rejected": -1.4616801738739014, "logps/chosen": -144.83853149414062, "logps/rejected": -191.320556640625, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": -0.6087457537651062, "rewards/margins": 0.3239368498325348, "rewards/rejected": -0.9326826930046082, "step": 526 }, { "epoch": 0.6090404440919904, "grad_norm": 82.53815106903608, "learning_rate": 1.6793659712575895e-07, "logits/chosen": -1.5642480850219727, "logits/rejected": -1.4599685668945312, "logps/chosen": -215.29837036132812, "logps/rejected": -199.14767456054688, "loss": 0.5928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5695382356643677, "rewards/margins": 0.271673321723938, "rewards/rejected": -0.8412115573883057, "step": 528 }, { "epoch": 0.6113474154711268, "grad_norm": 86.53571512694035, "learning_rate": 1.676535821398537e-07, "logits/chosen": -1.3208836317062378, "logits/rejected": -1.3146097660064697, "logps/chosen": -189.41128540039062, "logps/rejected": -232.5477294921875, "loss": 0.6013, "rewards/accuracies": 0.65625, "rewards/chosen": -0.654186487197876, "rewards/margins": 0.4602148234844208, "rewards/rejected": -1.1144013404846191, "step": 530 }, { "epoch": 0.6136543868502632, "grad_norm": 70.64851504723866, "learning_rate": 1.6736956436465573e-07, "logits/chosen": -1.3590030670166016, "logits/rejected": -1.4608113765716553, "logps/chosen": -148.809326171875, "logps/rejected": -203.59759521484375, "loss": 0.5861, "rewards/accuracies": 0.71875, "rewards/chosen": -0.496415913105011, "rewards/margins": 0.31767329573631287, "rewards/rejected": -0.814089298248291, "step": 532 }, { "epoch": 0.6159613582293995, "grad_norm": 73.57136513502368, "learning_rate": 1.6708454800999366e-07, "logits/chosen": -1.4504910707473755, "logits/rejected": -1.4983229637145996, "logps/chosen": -166.2091522216797, "logps/rejected": -206.8488311767578, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": -0.49555644392967224, "rewards/margins": 0.3523869812488556, "rewards/rejected": -0.8479433655738831, "step": 534 }, { "epoch": 0.6182683296085358, "grad_norm": 67.83021038753246, "learning_rate": 1.667985373004974e-07, "logits/chosen": -1.4747323989868164, "logits/rejected": -1.3922568559646606, "logps/chosen": -159.47254943847656, "logps/rejected": -177.21884155273438, "loss": 0.5691, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2918567657470703, "rewards/margins": 0.5216075778007507, "rewards/rejected": -0.8134642839431763, "step": 536 }, { "epoch": 0.6205753009876721, "grad_norm": 75.55693314924734, "learning_rate": 1.6651153647553567e-07, "logits/chosen": -1.6021491289138794, "logits/rejected": -1.6126930713653564, "logps/chosen": -165.55172729492188, "logps/rejected": -197.1583251953125, "loss": 0.5986, "rewards/accuracies": 0.65625, "rewards/chosen": -0.505136251449585, "rewards/margins": 0.2592867612838745, "rewards/rejected": -0.7644230127334595, "step": 538 }, { "epoch": 0.6228822723668085, "grad_norm": 74.57237448077612, "learning_rate": 1.6622354978915304e-07, "logits/chosen": -1.3560292720794678, "logits/rejected": -1.4895740747451782, "logps/chosen": -152.60386657714844, "logps/rejected": -200.48497009277344, "loss": 0.5976, "rewards/accuracies": 0.75, "rewards/chosen": -0.450514554977417, "rewards/margins": 0.42979568243026733, "rewards/rejected": -0.8803102374076843, "step": 540 }, { "epoch": 0.6251892437459448, "grad_norm": 76.07758708375029, "learning_rate": 1.6593458151000687e-07, "logits/chosen": -1.418495535850525, "logits/rejected": -1.5285032987594604, "logps/chosen": -174.468017578125, "logps/rejected": -212.58534240722656, "loss": 0.6021, "rewards/accuracies": 0.625, "rewards/chosen": -0.4992409944534302, "rewards/margins": 0.357663631439209, "rewards/rejected": -0.8569046854972839, "step": 542 }, { "epoch": 0.6274962151250811, "grad_norm": 67.61668250943133, "learning_rate": 1.6564463592130426e-07, "logits/chosen": -1.6000475883483887, "logits/rejected": -1.5714551210403442, "logps/chosen": -129.46788024902344, "logps/rejected": -137.58729553222656, "loss": 0.6027, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4155838191509247, "rewards/margins": 0.31966376304626465, "rewards/rejected": -0.7352475523948669, "step": 544 }, { "epoch": 0.6298031865042174, "grad_norm": 67.37831547087359, "learning_rate": 1.6535371732073823e-07, "logits/chosen": -1.5627467632293701, "logits/rejected": -1.4833993911743164, "logps/chosen": -115.5599594116211, "logps/rejected": -121.90804290771484, "loss": 0.5859, "rewards/accuracies": 0.625, "rewards/chosen": -0.286516010761261, "rewards/margins": 0.36314332485198975, "rewards/rejected": -0.6496593356132507, "step": 546 }, { "epoch": 0.6321101578833538, "grad_norm": 79.67037148877638, "learning_rate": 1.650618300204242e-07, "logits/chosen": -1.4731521606445312, "logits/rejected": -1.5530614852905273, "logps/chosen": -218.06552124023438, "logps/rejected": -257.6269226074219, "loss": 0.6104, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7696484923362732, "rewards/margins": 0.28321802616119385, "rewards/rejected": -1.0528665781021118, "step": 548 }, { "epoch": 0.63441712926249, "grad_norm": 67.9423797863854, "learning_rate": 1.6476897834683618e-07, "logits/chosen": -1.4056189060211182, "logits/rejected": -1.4078246355056763, "logps/chosen": -147.92111206054688, "logps/rejected": -188.60968017578125, "loss": 0.6018, "rewards/accuracies": 0.75, "rewards/chosen": -0.5256268978118896, "rewards/margins": 0.4678364396095276, "rewards/rejected": -0.9934633374214172, "step": 550 }, { "epoch": 0.6367241006416264, "grad_norm": 68.15375283996126, "learning_rate": 1.644751666407424e-07, "logits/chosen": -1.2929272651672363, "logits/rejected": -1.3170608282089233, "logps/chosen": -207.3567352294922, "logps/rejected": -262.3974609375, "loss": 0.5823, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7716534733772278, "rewards/margins": 0.6446899771690369, "rewards/rejected": -1.4163434505462646, "step": 552 }, { "epoch": 0.6390310720207627, "grad_norm": 71.41650018580867, "learning_rate": 1.6418039925714115e-07, "logits/chosen": -1.3858839273452759, "logits/rejected": -1.3953114748001099, "logps/chosen": -160.35096740722656, "logps/rejected": -186.47933959960938, "loss": 0.5559, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5581396222114563, "rewards/margins": 0.3457927703857422, "rewards/rejected": -0.9039323329925537, "step": 554 }, { "epoch": 0.6413380433998991, "grad_norm": 76.78836475295354, "learning_rate": 1.6388468056519612e-07, "logits/chosen": -1.4668548107147217, "logits/rejected": -1.4067307710647583, "logps/chosen": -212.10546875, "logps/rejected": -193.7842254638672, "loss": 0.5721, "rewards/accuracies": 0.71875, "rewards/chosen": -0.618504524230957, "rewards/margins": 0.36426225304603577, "rewards/rejected": -0.9827668070793152, "step": 556 }, { "epoch": 0.6436450147790354, "grad_norm": 66.95864858123714, "learning_rate": 1.6358801494817172e-07, "logits/chosen": -1.4181556701660156, "logits/rejected": -1.409440279006958, "logps/chosen": -139.5923309326172, "logps/rejected": -183.9441375732422, "loss": 0.5663, "rewards/accuracies": 0.71875, "rewards/chosen": -0.42550671100616455, "rewards/margins": 0.626122236251831, "rewards/rejected": -1.0516289472579956, "step": 558 }, { "epoch": 0.6459519861581717, "grad_norm": 88.18680458715171, "learning_rate": 1.6329040680336805e-07, "logits/chosen": -1.468677282333374, "logits/rejected": -1.5043675899505615, "logps/chosen": -161.72213745117188, "logps/rejected": -206.85214233398438, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5167573690414429, "rewards/margins": 0.36671191453933716, "rewards/rejected": -0.8834693431854248, "step": 560 }, { "epoch": 0.648258957537308, "grad_norm": 71.84112642036989, "learning_rate": 1.6299186054205575e-07, "logits/chosen": -1.5098912715911865, "logits/rejected": -1.4657700061798096, "logps/chosen": -177.00067138671875, "logps/rejected": -190.06985473632812, "loss": 0.5365, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3948441743850708, "rewards/margins": 0.5432202816009521, "rewards/rejected": -0.9380643963813782, "step": 562 }, { "epoch": 0.6505659289164444, "grad_norm": 77.21845596596229, "learning_rate": 1.6269238058941067e-07, "logits/chosen": -1.5354855060577393, "logits/rejected": -1.4872441291809082, "logps/chosen": -220.86279296875, "logps/rejected": -242.259765625, "loss": 0.6141, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5020161867141724, "rewards/margins": 0.3912605345249176, "rewards/rejected": -0.8932766914367676, "step": 564 }, { "epoch": 0.6528729002955808, "grad_norm": 77.14842839642075, "learning_rate": 1.6239197138444807e-07, "logits/chosen": -1.4313609600067139, "logits/rejected": -1.4305431842803955, "logps/chosen": -99.62786865234375, "logps/rejected": -128.8907928466797, "loss": 0.5895, "rewards/accuracies": 0.75, "rewards/chosen": -0.1888483613729477, "rewards/margins": 0.4503237307071686, "rewards/rejected": -0.6391721367835999, "step": 566 }, { "epoch": 0.655179871674717, "grad_norm": 62.79374975719681, "learning_rate": 1.6209063737995714e-07, "logits/chosen": -1.4637759923934937, "logits/rejected": -1.4549309015274048, "logps/chosen": -144.82948303222656, "logps/rejected": -185.9346466064453, "loss": 0.5515, "rewards/accuracies": 0.71875, "rewards/chosen": -0.44154876470565796, "rewards/margins": 0.37137869000434875, "rewards/rejected": -0.8129273653030396, "step": 568 }, { "epoch": 0.6574868430538534, "grad_norm": 77.33084496555169, "learning_rate": 1.6178838304243472e-07, "logits/chosen": -1.491298794746399, "logits/rejected": -1.5582300424575806, "logps/chosen": -193.7870635986328, "logps/rejected": -242.5855712890625, "loss": 0.5723, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5082133412361145, "rewards/margins": 0.6296249628067017, "rewards/rejected": -1.1378382444381714, "step": 570 }, { "epoch": 0.6597938144329897, "grad_norm": 67.02472308421605, "learning_rate": 1.6148521285201927e-07, "logits/chosen": -1.4817756414413452, "logits/rejected": -1.402366042137146, "logps/chosen": -154.45765686035156, "logps/rejected": -178.16561889648438, "loss": 0.5564, "rewards/accuracies": 0.75, "rewards/chosen": -0.3961385488510132, "rewards/margins": 0.5840703248977661, "rewards/rejected": -0.9802089333534241, "step": 572 }, { "epoch": 0.6621007858121261, "grad_norm": 73.0106659319347, "learning_rate": 1.6118113130242432e-07, "logits/chosen": -1.4550271034240723, "logits/rejected": -1.4115763902664185, "logps/chosen": -221.6585235595703, "logps/rejected": -195.1796417236328, "loss": 0.5774, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8219617009162903, "rewards/margins": 0.16280440986156464, "rewards/rejected": -0.9847662448883057, "step": 574 }, { "epoch": 0.6644077571912623, "grad_norm": 77.31259598468839, "learning_rate": 1.6087614290087206e-07, "logits/chosen": -1.4929287433624268, "logits/rejected": -1.4764537811279297, "logps/chosen": -230.29653930664062, "logps/rejected": -284.22412109375, "loss": 0.5818, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6301875114440918, "rewards/margins": 0.7476638555526733, "rewards/rejected": -1.3778512477874756, "step": 576 }, { "epoch": 0.6667147285703987, "grad_norm": 69.04855850678052, "learning_rate": 1.605702521680263e-07, "logits/chosen": -1.3067015409469604, "logits/rejected": -1.338529348373413, "logps/chosen": -147.36080932617188, "logps/rejected": -193.80665588378906, "loss": 0.5757, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6234080791473389, "rewards/margins": 0.39194294810295105, "rewards/rejected": -1.0153510570526123, "step": 578 }, { "epoch": 0.669021699949535, "grad_norm": 81.45402825293101, "learning_rate": 1.6026346363792565e-07, "logits/chosen": -1.4524238109588623, "logits/rejected": -1.3550243377685547, "logps/chosen": -187.0885772705078, "logps/rejected": -177.09780883789062, "loss": 0.6058, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7711231708526611, "rewards/margins": 0.17797166109085083, "rewards/rejected": -0.9490947127342224, "step": 580 }, { "epoch": 0.6713286713286714, "grad_norm": 65.47602685653504, "learning_rate": 1.5995578185791616e-07, "logits/chosen": -1.387951374053955, "logits/rejected": -1.3309695720672607, "logps/chosen": -158.39202880859375, "logps/rejected": -186.85105895996094, "loss": 0.5825, "rewards/accuracies": 0.75, "rewards/chosen": -0.48583418130874634, "rewards/margins": 0.503716230392456, "rewards/rejected": -0.9895503520965576, "step": 582 }, { "epoch": 0.6736356427078076, "grad_norm": 76.89288613284735, "learning_rate": 1.596472113885841e-07, "logits/chosen": -1.4493763446807861, "logits/rejected": -1.4876127243041992, "logps/chosen": -180.78541564941406, "logps/rejected": -220.08172607421875, "loss": 0.5822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5832819938659668, "rewards/margins": 0.494464248418808, "rewards/rejected": -1.0777461528778076, "step": 584 }, { "epoch": 0.675942614086944, "grad_norm": 82.2690699212878, "learning_rate": 1.5933775680368822e-07, "logits/chosen": -1.4559937715530396, "logits/rejected": -1.5102128982543945, "logps/chosen": -169.15960693359375, "logps/rejected": -176.64280700683594, "loss": 0.6272, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5040290355682373, "rewards/margins": 0.27444028854370117, "rewards/rejected": -0.7784693241119385, "step": 586 }, { "epoch": 0.6782495854660803, "grad_norm": 76.21062906880101, "learning_rate": 1.5902742269009194e-07, "logits/chosen": -1.348806381225586, "logits/rejected": -1.293540358543396, "logps/chosen": -135.5105438232422, "logps/rejected": -156.5147705078125, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5231513977050781, "rewards/margins": 0.4782097041606903, "rewards/rejected": -1.0013611316680908, "step": 588 }, { "epoch": 0.6805565568452167, "grad_norm": 75.50192821178838, "learning_rate": 1.5871621364769553e-07, "logits/chosen": -1.5168403387069702, "logits/rejected": -1.4424357414245605, "logps/chosen": -183.81605529785156, "logps/rejected": -171.45872497558594, "loss": 0.6035, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7719120979309082, "rewards/margins": 0.2601196765899658, "rewards/rejected": -1.0320318937301636, "step": 590 }, { "epoch": 0.6828635282243529, "grad_norm": 84.93892075040027, "learning_rate": 1.5840413428936766e-07, "logits/chosen": -1.3720101118087769, "logits/rejected": -1.391021490097046, "logps/chosen": -171.98031616210938, "logps/rejected": -176.23892211914062, "loss": 0.599, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7516859769821167, "rewards/margins": 0.21854539215564728, "rewards/rejected": -0.9702314138412476, "step": 592 }, { "epoch": 0.6851704996034893, "grad_norm": 66.70595859312724, "learning_rate": 1.5809118924087733e-07, "logits/chosen": -1.4547669887542725, "logits/rejected": -1.430787205696106, "logps/chosen": -177.32481384277344, "logps/rejected": -208.61553955078125, "loss": 0.6102, "rewards/accuracies": 0.625, "rewards/chosen": -0.5358410477638245, "rewards/margins": 0.26219645142555237, "rewards/rejected": -0.7980375289916992, "step": 594 }, { "epoch": 0.6874774709826256, "grad_norm": 82.62176636567787, "learning_rate": 1.5777738314082511e-07, "logits/chosen": -1.4137248992919922, "logits/rejected": -1.404469609260559, "logps/chosen": -164.01600646972656, "logps/rejected": -184.97645568847656, "loss": 0.6472, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5918564200401306, "rewards/margins": 0.21411672234535217, "rewards/rejected": -0.8059731721878052, "step": 596 }, { "epoch": 0.689784442361762, "grad_norm": 72.16505210857706, "learning_rate": 1.5746272064057439e-07, "logits/chosen": -1.3921738862991333, "logits/rejected": -1.3382896184921265, "logps/chosen": -199.48634338378906, "logps/rejected": -226.77871704101562, "loss": 0.5858, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5180130004882812, "rewards/margins": 0.4014572501182556, "rewards/rejected": -0.9194702506065369, "step": 598 }, { "epoch": 0.6920914137408983, "grad_norm": 78.66776375616931, "learning_rate": 1.5714720640418247e-07, "logits/chosen": -1.511127233505249, "logits/rejected": -1.5256671905517578, "logps/chosen": -182.10826110839844, "logps/rejected": -198.63510131835938, "loss": 0.618, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6393432021141052, "rewards/margins": 0.16456884145736694, "rewards/rejected": -0.8039120435714722, "step": 600 }, { "epoch": 0.6920914137408983, "eval_logits/chosen": -1.4086966514587402, "eval_logits/rejected": -1.3254387378692627, "eval_logps/chosen": -191.96621704101562, "eval_logps/rejected": -160.9102325439453, "eval_loss": 0.6056262850761414, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.6784057021141052, "eval_rewards/margins": 0.3610783815383911, "eval_rewards/rejected": -1.0394840240478516, "eval_runtime": 37.022, "eval_samples_per_second": 2.701, "eval_steps_per_second": 0.675, "step": 600 } ], "logging_steps": 2, "max_steps": 1732, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }